diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp
index 384ebbb4f48..ae54e906a21 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp
@@ -48,14 +48,15 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s(
   WORD32* __restrict__ p_bias =
       (WORD32* __restrict__)bias.const_data_ptr();
 
-  WORD32 input_height = conv1d ? 1 : input.size(2);
-  WORD32 input_width = conv1d ? input.size(2) : input.size(3);
-  WORD32 input_channels = input.size(1);
-  WORD32 kernel_height = conv1d ? 1 : weight.size(2);
-  WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3);
+  // NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C]
+  WORD32 input_height = conv1d ? 1 : input.size(1);
+  WORD32 input_width = conv1d ? input.size(1) : input.size(2);
+  WORD32 input_channels = conv1d ? input.size(2) : input.size(3);
+  WORD32 kernel_height = conv1d ? 1 : weight.size(1);
+  WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2);
   WORD32 out_channels = weight.size(0);
-  WORD32 out_height = conv1d ? 1 : out.size(2);
-  WORD32 out_width = conv1d ? out.size(2) : out.size(3);
+  WORD32 out_height = conv1d ? 1 : out.size(1);
+  WORD32 out_width = conv1d ? out.size(1) : out.size(2);
   WORD32 batches = input.size(0);
 
   WORD32 x_stride = stride[1];
@@ -79,6 +80,18 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s(
     out_shift32[i] = 0;
   }
 
+  // Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format) to
+  // [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0).
+  // For depthwise IC/G=1, so this is a transpose of [OC, KH*KW] to [KH*KW, OC].
+  WORD32 kernel_size = kernel_height * kernel_width;
+  WORD32 weight_size = out_channels * kernel_size;
+  WORD8* p_kernel_hwc = (WORD8*)kernels::allocate_temp_memory(ctx, weight_size);
+  for (int oc = 0; oc < out_channels; oc++) {
+    for (int k = 0; k < kernel_size; k++) {
+      p_kernel_hwc[k * out_channels + oc] = p_kernel[oc * kernel_size + k];
+    }
+  }
+
   WORD32 scratch_size = xa_nn_conv2d_depthwise_getsize(
       input_height,
       input_width,
@@ -107,7 +120,7 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s(
 
     xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
         out_batch,
-        p_kernel,
+        p_kernel_hwc,
        in_batch,
         p_bias,
         input_height,
diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp
index 07df1a416d7..2532489a7a0 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp
@@ -48,14 +48,15 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u(
   WORD32* __restrict__ p_bias =
       (WORD32* __restrict__)bias.const_data_ptr();
 
-  WORD32 input_height = conv1d ? 1 : input.size(2);
-  WORD32 input_width = conv1d ? input.size(2) : input.size(3);
-  WORD32 input_channels = input.size(1);
-  WORD32 kernel_height = conv1d ? 1 : weight.size(2);
-  WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3);
+  // NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C]
+  WORD32 input_height = conv1d ? 1 : input.size(1);
+  WORD32 input_width = conv1d ? input.size(1) : input.size(2);
+  WORD32 input_channels = conv1d ? input.size(2) : input.size(3);
+  WORD32 kernel_height = conv1d ? 1 : weight.size(1);
+  WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2);
   WORD32 out_channels = weight.size(0);
-  WORD32 out_height = conv1d ? 1 : out.size(2);
-  WORD32 out_width = conv1d ? out.size(2) : out.size(3);
+  WORD32 out_height = conv1d ? 1 : out.size(1);
+  WORD32 out_width = conv1d ? out.size(1) : out.size(2);
   WORD32 batches = input.size(0);
 
   WORD32 x_stride = stride[1];
@@ -79,6 +80,19 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u(
     out_shift32[i] = 0;
   }
 
+  // Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format) to
+  // [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0).
+  // For depthwise IC/G=1, so this is a transpose of [OC, KH*KW] to [KH*KW, OC].
+  WORD32 kernel_size = kernel_height * kernel_width;
+  WORD32 weight_size = out_channels * kernel_size;
+  UWORD8* p_kernel_hwc =
+      (UWORD8*)kernels::allocate_temp_memory(ctx, weight_size);
+  for (int oc = 0; oc < out_channels; oc++) {
+    for (int k = 0; k < kernel_size; k++) {
+      p_kernel_hwc[k * out_channels + oc] = p_kernel[oc * kernel_size + k];
+    }
+  }
+
   WORD32 scratch_size = xa_nn_conv2d_depthwise_getsize(
       input_height,
       input_width,
@@ -107,7 +121,7 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u(
 
     xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
         (WORD8*)out_batch,
-        (WORD8*)p_kernel,
+        (WORD8*)p_kernel_hwc,
         (WORD8*)in_batch,
         p_bias,
         input_height,
diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
index b2a7c341997..325cce828fe 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
@@ -176,15 +176,16 @@ void xa_opt_quantized_conv2d_nhwc(
   WORD32* __restrict__ p_bias =
       (WORD32* __restrict__)bias.const_data_ptr();
 
-  WORD32 input_height = conv1d ? 1 : input.size(2);
-  WORD32 input_width = conv1d ? input.size(2) : input.size(3);
-  WORD32 input_channels = input.size(1);
-  WORD32 kernel_height = conv1d ? 1 : weight.size(2);
-  WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3);
-  WORD32 kernel_channels = weight.size(1);
+  // NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C]
+  WORD32 input_height = conv1d ? 1 : input.size(1);
+  WORD32 input_width = conv1d ? input.size(1) : input.size(2);
+  WORD32 input_channels = conv1d ? input.size(2) : input.size(3);
+  WORD32 kernel_height = conv1d ? 1 : weight.size(1);
+  WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2);
+  WORD32 kernel_channels = conv1d ? weight.size(2) : weight.size(3);
   WORD32 out_channels = weight.size(0);
-  WORD32 out_height = conv1d ? 1 : out.size(2);
-  WORD32 out_width = conv1d ? out.size(2) : out.size(3);
+  WORD32 out_height = conv1d ? 1 : out.size(1);
+  WORD32 out_width = conv1d ? out.size(1) : out.size(2);
   WORD32 batches = input.size(0);
 
   WORD32 x_stride = stride[1];
@@ -285,6 +286,19 @@ void xa_opt_quantized_conv2d_nhwc(
   if (groups == input_channels) {
     WORD32 channels_multiplier = out_channels / input_channels;
 
+    // Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format) to
+    // [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0).
+    WORD32 kernel_size_dw = kernel_height * kernel_width;
+    WORD32 weight_size_dw = out_channels * kernel_size_dw;
+    WORD8* p_kernel_hwc =
+        (WORD8*)kernels::allocate_temp_memory(ctx, weight_size_dw);
+    for (int oc = 0; oc < out_channels; oc++) {
+      for (int k = 0; k < kernel_size_dw; k++) {
+        p_kernel_hwc[k * out_channels + oc] =
+            p_kernel[oc * kernel_size_dw + k];
+      }
+    }
+
     scratch_size = xa_nn_conv2d_depthwise_getsize(
         input_height,
         input_width,
@@ -322,7 +336,7 @@ void xa_opt_quantized_conv2d_nhwc(
 
       xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
           out_batch,
-          p_kernel,
+          p_kernel_hwc,
          in_batch,
           p_bias,
           input_height,
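For reference, below is a minimal standalone sketch of the weight rearrangement these hunks add. It is not part of the patch: ohwc_to_hwc_depthwise and the std::vector plumbing are illustrative stand-ins for the Tensor data pointers and kernels::allocate_temp_memory used by the actual operators; only the index mapping dst[k * OC + oc] = src[oc * (KH*KW) + k] mirrors the patched code.

// Standalone sketch of the depthwise weight rearrangement performed above:
// a [OC, KH, KW, IC/G] weight with IC/G == 1 is treated as a 2-D matrix
// [OC, KH*KW] and transposed to [KH*KW, OC] (the "HWC" order the NNLib
// depthwise kernel expects when inp_data_format is 0).
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int8_t> ohwc_to_hwc_depthwise(
    const std::vector<int8_t>& src, int out_channels, int kh, int kw) {
  const int kernel_size = kh * kw;
  std::vector<int8_t> dst(static_cast<size_t>(out_channels) * kernel_size);
  for (int oc = 0; oc < out_channels; oc++) {
    for (int k = 0; k < kernel_size; k++) {
      // src index: [oc][h][w][0] flattened; dst index: [h][w][oc] flattened.
      dst[k * out_channels + oc] = src[oc * kernel_size + k];
    }
  }
  return dst;
}

int main() {
  // 2 output channels, 2x2 kernel, IC/G = 1.
  // src layout [OC, KH, KW, 1]: oc0 = {1,2,3,4}, oc1 = {5,6,7,8}.
  std::vector<int8_t> src = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<int8_t> dst =
      ohwc_to_hwc_depthwise(src, /*out_channels=*/2, /*kh=*/2, /*kw=*/2);
  // dst layout [KH, KW, OC]: channel values interleaved per spatial position.
  std::vector<int8_t> expected = {1, 5, 2, 6, 3, 7, 4, 8};
  assert(dst == expected);
  return 0;
}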