From 4632ae27dab3d48409b403a874273d833d12308f Mon Sep 17 00:00:00 2001 From: Jake Stevens Date: Thu, 12 Feb 2026 12:16:17 -0800 Subject: [PATCH] Fix NHWC depthwise conv weight layout and dimension indexing for HiFi operators (#17423) Summary: The NHWC depthwise conv HiFi operators had two bugs that caused incorrect inference results on the DSP: 1. **Weight layout mismatch**: The operators passed weight tensors directly to NNLib's `xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s` with `inp_data_format=0` (NHWC), but the weight data from the ExecuTorch graph is in `[OC, KH, KW, IC/G]` format (output-channel first), while NNLib's NHWC path expects `[KH, KW, OC]` (spatial-first / HWC). This caused every depthwise conv to compute with scrambled kernel values. Fixed by rearranging the weight buffer from `[OC, KH*KW]` to `[KH*KW, OC]` before calling NNLib. The NCHW operators were unaffected because NNLib's NCHW path (`inp_data_format=1`) expects CHW kernel format which matches the graph's weight layout. 2. **NCHW dimension indexing in NHWC operators**: The operators extracted tensor dimensions using NCHW conventions (e.g., `input_channels = input.size(1)`) despite operating on NHWC-layout tensors. For 3D conv1d inputs `[N, W, C]`, `input.size(1)` is the spatial width, not channels. Fixed by adding a `conv1d` guard and using NHWC indexing: `input_channels = conv1d ? input.size(2) : input.size(3)`. Reviewed By: mcremon-meta Differential Revision: D93125592 --- ...ise_asym8sxsym8s_asym8s_per_tensor_out.cpp | 29 ++++++++++++----- ...ise_asym8uxsym8u_asym8u_per_tensor_out.cpp | 30 ++++++++++++----- .../op_quantized_conv2d_nhwc_out.cpp | 32 +++++++++++++------ 3 files changed, 66 insertions(+), 25 deletions(-) diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp index 384ebbb4f48..ae54e906a21 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -48,14 +48,15 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s( WORD32* __restrict__ p_bias = (WORD32* __restrict__)bias.const_data_ptr(); - WORD32 input_height = conv1d ? 1 : input.size(2); - WORD32 input_width = conv1d ? input.size(2) : input.size(3); - WORD32 input_channels = input.size(1); - WORD32 kernel_height = conv1d ? 1 : weight.size(2); - WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3); + // NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C] + WORD32 input_height = conv1d ? 1 : input.size(1); + WORD32 input_width = conv1d ? input.size(1) : input.size(2); + WORD32 input_channels = conv1d ? input.size(2) : input.size(3); + WORD32 kernel_height = conv1d ? 1 : weight.size(1); + WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2); WORD32 out_channels = weight.size(0); - WORD32 out_height = conv1d ? 1 : out.size(2); - WORD32 out_width = conv1d ? out.size(2) : out.size(3); + WORD32 out_height = conv1d ? 1 : out.size(1); + WORD32 out_width = conv1d ? out.size(1) : out.size(2); WORD32 batches = input.size(0); WORD32 x_stride = stride[1]; @@ -79,6 +80,18 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s( out_shift32[i] = 0; } + // Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format) to + // [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0). + // For depthwise IC/G=1, so this is a transpose of [OC, KH*KW] to [KH*KW, OC]. + WORD32 kernel_size = kernel_height * kernel_width; + WORD32 weight_size = out_channels * kernel_size; + WORD8* p_kernel_hwc = (WORD8*)kernels::allocate_temp_memory(ctx, weight_size); + for (int oc = 0; oc < out_channels; oc++) { + for (int k = 0; k < kernel_size; k++) { + p_kernel_hwc[k * out_channels + oc] = p_kernel[oc * kernel_size + k]; + } + } + WORD32 scratch_size = xa_nn_conv2d_depthwise_getsize( input_height, input_width, @@ -107,7 +120,7 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s( xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s( out_batch, - p_kernel, + p_kernel_hwc, in_batch, p_bias, input_height, diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp index 07df1a416d7..2532489a7a0 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -48,14 +48,15 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u( WORD32* __restrict__ p_bias = (WORD32* __restrict__)bias.const_data_ptr(); - WORD32 input_height = conv1d ? 1 : input.size(2); - WORD32 input_width = conv1d ? input.size(2) : input.size(3); - WORD32 input_channels = input.size(1); - WORD32 kernel_height = conv1d ? 1 : weight.size(2); - WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3); + // NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C] + WORD32 input_height = conv1d ? 1 : input.size(1); + WORD32 input_width = conv1d ? input.size(1) : input.size(2); + WORD32 input_channels = conv1d ? input.size(2) : input.size(3); + WORD32 kernel_height = conv1d ? 1 : weight.size(1); + WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2); WORD32 out_channels = weight.size(0); - WORD32 out_height = conv1d ? 1 : out.size(2); - WORD32 out_width = conv1d ? out.size(2) : out.size(3); + WORD32 out_height = conv1d ? 1 : out.size(1); + WORD32 out_width = conv1d ? out.size(1) : out.size(2); WORD32 batches = input.size(0); WORD32 x_stride = stride[1]; @@ -79,6 +80,19 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u( out_shift32[i] = 0; } + // Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format) to + // [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0). + // For depthwise IC/G=1, so this is a transpose of [OC, KH*KW] to [KH*KW, OC]. + WORD32 kernel_size = kernel_height * kernel_width; + WORD32 weight_size = out_channels * kernel_size; + UWORD8* p_kernel_hwc = + (UWORD8*)kernels::allocate_temp_memory(ctx, weight_size); + for (int oc = 0; oc < out_channels; oc++) { + for (int k = 0; k < kernel_size; k++) { + p_kernel_hwc[k * out_channels + oc] = p_kernel[oc * kernel_size + k]; + } + } + WORD32 scratch_size = xa_nn_conv2d_depthwise_getsize( input_height, input_width, @@ -107,7 +121,7 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u( xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s( (WORD8*)out_batch, - (WORD8*)p_kernel, + (WORD8*)p_kernel_hwc, (WORD8*)in_batch, p_bias, input_height, diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp index b2a7c341997..325cce828fe 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp @@ -176,15 +176,16 @@ void xa_opt_quantized_conv2d_nhwc( WORD32* __restrict__ p_bias = (WORD32* __restrict__)bias.const_data_ptr(); - WORD32 input_height = conv1d ? 1 : input.size(2); - WORD32 input_width = conv1d ? input.size(2) : input.size(3); - WORD32 input_channels = input.size(1); - WORD32 kernel_height = conv1d ? 1 : weight.size(2); - WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3); - WORD32 kernel_channels = weight.size(1); + // NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C] + WORD32 input_height = conv1d ? 1 : input.size(1); + WORD32 input_width = conv1d ? input.size(1) : input.size(2); + WORD32 input_channels = conv1d ? input.size(2) : input.size(3); + WORD32 kernel_height = conv1d ? 1 : weight.size(1); + WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2); + WORD32 kernel_channels = conv1d ? weight.size(2) : weight.size(3); WORD32 out_channels = weight.size(0); - WORD32 out_height = conv1d ? 1 : out.size(2); - WORD32 out_width = conv1d ? out.size(2) : out.size(3); + WORD32 out_height = conv1d ? 1 : out.size(1); + WORD32 out_width = conv1d ? out.size(1) : out.size(2); WORD32 batches = input.size(0); WORD32 x_stride = stride[1]; @@ -285,6 +286,19 @@ void xa_opt_quantized_conv2d_nhwc( if (groups == input_channels) { WORD32 channels_multiplier = out_channels / input_channels; + // Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format) to + // [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0). + WORD32 kernel_size_dw = kernel_height * kernel_width; + WORD32 weight_size_dw = out_channels * kernel_size_dw; + WORD8* p_kernel_hwc = + (WORD8*)kernels::allocate_temp_memory(ctx, weight_size_dw); + for (int oc = 0; oc < out_channels; oc++) { + for (int k = 0; k < kernel_size_dw; k++) { + p_kernel_hwc[k * out_channels + oc] = + p_kernel[oc * kernel_size_dw + k]; + } + } + scratch_size = xa_nn_conv2d_depthwise_getsize( input_height, input_width, @@ -322,7 +336,7 @@ void xa_opt_quantized_conv2d_nhwc( xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s( out_batch, - p_kernel, + p_kernel_hwc, in_batch, p_bias, input_height,