Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,15 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s(
WORD32* __restrict__ p_bias =
(WORD32* __restrict__)bias.const_data_ptr<int32_t>();

WORD32 input_height = conv1d ? 1 : input.size(2);
WORD32 input_width = conv1d ? input.size(2) : input.size(3);
WORD32 input_channels = input.size(1);
WORD32 kernel_height = conv1d ? 1 : weight.size(2);
WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3);
// NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C]
WORD32 input_height = conv1d ? 1 : input.size(1);
WORD32 input_width = conv1d ? input.size(1) : input.size(2);
WORD32 input_channels = conv1d ? input.size(2) : input.size(3);
WORD32 kernel_height = conv1d ? 1 : weight.size(1);
WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2);
WORD32 out_channels = weight.size(0);
WORD32 out_height = conv1d ? 1 : out.size(2);
WORD32 out_width = conv1d ? out.size(2) : out.size(3);
WORD32 out_height = conv1d ? 1 : out.size(1);
WORD32 out_width = conv1d ? out.size(1) : out.size(2);
WORD32 batches = input.size(0);

WORD32 x_stride = stride[1];
Expand All @@ -79,6 +80,18 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s(
out_shift32[i] = 0;
}

// Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format) to
// [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0).
// For depthwise IC/G=1, so this is a transpose of [OC, KH*KW] to [KH*KW, OC].
WORD32 kernel_size = kernel_height * kernel_width;
WORD32 weight_size = out_channels * kernel_size;
WORD8* p_kernel_hwc = (WORD8*)kernels::allocate_temp_memory(ctx, weight_size);
for (int oc = 0; oc < out_channels; oc++) {
for (int k = 0; k < kernel_size; k++) {
p_kernel_hwc[k * out_channels + oc] = p_kernel[oc * kernel_size + k];
}
}

WORD32 scratch_size = xa_nn_conv2d_depthwise_getsize(
input_height,
input_width,
Expand Down Expand Up @@ -107,7 +120,7 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s(

xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
out_batch,
p_kernel,
p_kernel_hwc,
in_batch,
p_bias,
input_height,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,15 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u(
WORD32* __restrict__ p_bias =
(WORD32* __restrict__)bias.const_data_ptr<int32_t>();

WORD32 input_height = conv1d ? 1 : input.size(2);
WORD32 input_width = conv1d ? input.size(2) : input.size(3);
WORD32 input_channels = input.size(1);
WORD32 kernel_height = conv1d ? 1 : weight.size(2);
WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3);
// NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C]
WORD32 input_height = conv1d ? 1 : input.size(1);
WORD32 input_width = conv1d ? input.size(1) : input.size(2);
WORD32 input_channels = conv1d ? input.size(2) : input.size(3);
WORD32 kernel_height = conv1d ? 1 : weight.size(1);
WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2);
WORD32 out_channels = weight.size(0);
WORD32 out_height = conv1d ? 1 : out.size(2);
WORD32 out_width = conv1d ? out.size(2) : out.size(3);
WORD32 out_height = conv1d ? 1 : out.size(1);
WORD32 out_width = conv1d ? out.size(1) : out.size(2);
WORD32 batches = input.size(0);

WORD32 x_stride = stride[1];
Expand All @@ -79,6 +80,19 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u(
out_shift32[i] = 0;
}

// Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format) to
// [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0).
// For depthwise IC/G=1, so this is a transpose of [OC, KH*KW] to [KH*KW, OC].
WORD32 kernel_size = kernel_height * kernel_width;
WORD32 weight_size = out_channels * kernel_size;
UWORD8* p_kernel_hwc =
(UWORD8*)kernels::allocate_temp_memory(ctx, weight_size);
for (int oc = 0; oc < out_channels; oc++) {
for (int k = 0; k < kernel_size; k++) {
p_kernel_hwc[k * out_channels + oc] = p_kernel[oc * kernel_size + k];
}
}

WORD32 scratch_size = xa_nn_conv2d_depthwise_getsize(
input_height,
input_width,
Expand Down Expand Up @@ -107,7 +121,7 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u(

xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
(WORD8*)out_batch,
(WORD8*)p_kernel,
(WORD8*)p_kernel_hwc,
(WORD8*)in_batch,
p_bias,
input_height,
Expand Down
32 changes: 23 additions & 9 deletions backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,15 +176,16 @@ void xa_opt_quantized_conv2d_nhwc(
WORD32* __restrict__ p_bias =
(WORD32* __restrict__)bias.const_data_ptr<int32_t>();

WORD32 input_height = conv1d ? 1 : input.size(2);
WORD32 input_width = conv1d ? input.size(2) : input.size(3);
WORD32 input_channels = input.size(1);
WORD32 kernel_height = conv1d ? 1 : weight.size(2);
WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3);
WORD32 kernel_channels = weight.size(1);
// NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C]
WORD32 input_height = conv1d ? 1 : input.size(1);
WORD32 input_width = conv1d ? input.size(1) : input.size(2);
WORD32 input_channels = conv1d ? input.size(2) : input.size(3);
WORD32 kernel_height = conv1d ? 1 : weight.size(1);
WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2);
WORD32 kernel_channels = conv1d ? weight.size(2) : weight.size(3);
WORD32 out_channels = weight.size(0);
WORD32 out_height = conv1d ? 1 : out.size(2);
WORD32 out_width = conv1d ? out.size(2) : out.size(3);
WORD32 out_height = conv1d ? 1 : out.size(1);
WORD32 out_width = conv1d ? out.size(1) : out.size(2);
WORD32 batches = input.size(0);

WORD32 x_stride = stride[1];
Expand Down Expand Up @@ -285,6 +286,19 @@ void xa_opt_quantized_conv2d_nhwc(
if (groups == input_channels) {
WORD32 channels_multiplier = out_channels / input_channels;

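// Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format) to
// [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0).
// This branch is depthwise (groups == input_channels), so IC/G=1 and this is
// a transpose of [OC, KH*KW] to [KH*KW, OC].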
WORD32 kernel_size_dw = kernel_height * kernel_width;
WORD32 weight_size_dw = out_channels * kernel_size_dw;
WORD8* p_kernel_hwc =
(WORD8*)kernels::allocate_temp_memory(ctx, weight_size_dw);
for (int oc = 0; oc < out_channels; oc++) {
for (int k = 0; k < kernel_size_dw; k++) {
p_kernel_hwc[k * out_channels + oc] =
p_kernel[oc * kernel_size_dw + k];
}
}

scratch_size = xa_nn_conv2d_depthwise_getsize(
input_height,
input_width,
Expand Down Expand Up @@ -322,7 +336,7 @@ void xa_opt_quantized_conv2d_nhwc(

xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
out_batch,
p_kernel,
p_kernel_hwc,
in_batch,
p_bias,
input_height,
Expand Down
Loading