From 4632ae27dab3d48409b403a874273d833d12308f Mon Sep 17 00:00:00 2001
From: Jake Stevens <jrstevens@meta.com>
Date: Thu, 12 Feb 2026 12:16:17 -0800
Subject: [PATCH] Fix NHWC depthwise conv weight layout and dimension indexing
 for HiFi operators (#17423)

Summary:

 The NHWC depthwise conv HiFi operators had two bugs that caused incorrect inference results on the DSP:

  1. **Weight layout mismatch**: The operators passed weight tensors directly to NNLib's `xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s` with `inp_data_format=0` (NHWC), but the weight data from the ExecuTorch graph is in `[OC, KH, KW, IC/G]` format (output-channel first), while NNLib's NHWC path expects `[KH, KW, OC]` (spatial-first / HWC). This caused every depthwise conv to compute with scrambled kernel values. Fixed by rearranging the weight buffer from `[OC, KH*KW]` to `[KH*KW, OC]` before calling NNLib. The NCHW operators were unaffected because NNLib's NCHW path (`inp_data_format=1`) expects CHW kernel format which matches the graph's weight layout.

  2. **NCHW dimension indexing in NHWC operators**: The operators extracted tensor dimensions using NCHW conventions (e.g., `input_channels = input.size(1)`) despite operating on NHWC-layout tensors. For 3D conv1d inputs `[N, W, C]`, `input.size(1)` is the spatial width, not channels. Fixed by adding a `conv1d` guard and using NHWC indexing: `input_channels = conv1d ? input.size(2) : input.size(3)`.

Reviewed By: mcremon-meta

Differential Revision: D93125592
---
 ...ise_asym8sxsym8s_asym8s_per_tensor_out.cpp | 29 ++++++++++++-----
 ...ise_asym8uxsym8u_asym8u_per_tensor_out.cpp | 30 ++++++++++++-----
 .../op_quantized_conv2d_nhwc_out.cpp          | 32 +++++++++++++------
 3 files changed, 66 insertions(+), 25 deletions(-)

diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp
index 384ebbb4f48..ae54e906a21 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp
@@ -48,14 +48,15 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s(
   WORD32* __restrict__ p_bias =
       (WORD32* __restrict__)bias.const_data_ptr<int32_t>();
 
-  WORD32 input_height = conv1d ? 1 : input.size(2);
-  WORD32 input_width = conv1d ? input.size(2) : input.size(3);
-  WORD32 input_channels = input.size(1);
-  WORD32 kernel_height = conv1d ? 1 : weight.size(2);
-  WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3);
+  // NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C]
+  WORD32 input_height = conv1d ? 1 : input.size(1);
+  WORD32 input_width = conv1d ? input.size(1) : input.size(2);
+  WORD32 input_channels = conv1d ? input.size(2) : input.size(3);
+  WORD32 kernel_height = conv1d ? 1 : weight.size(1);
+  WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2);
   WORD32 out_channels = weight.size(0);
-  WORD32 out_height = conv1d ? 1 : out.size(2);
-  WORD32 out_width = conv1d ? out.size(2) : out.size(3);
+  WORD32 out_height = conv1d ? 1 : out.size(1);
+  WORD32 out_width = conv1d ? out.size(1) : out.size(2);
   WORD32 batches = input.size(0);
 
   WORD32 x_stride = stride[1];
@@ -79,6 +80,18 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s(
     out_shift32[i] = 0;
   }
 
+  // Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format) to
+  // [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0).
+  // For depthwise IC/G=1, so this is a transpose of [OC, KH*KW] to [KH*KW, OC].
+  WORD32 kernel_size = kernel_height * kernel_width;
+  WORD32 weight_size = out_channels * kernel_size;
+  WORD8* p_kernel_hwc = (WORD8*)kernels::allocate_temp_memory(ctx, weight_size);
+  for (int oc = 0; oc < out_channels; oc++) {
+    for (int k = 0; k < kernel_size; k++) {
+      p_kernel_hwc[k * out_channels + oc] = p_kernel[oc * kernel_size + k];
+    }
+  }
+
   WORD32 scratch_size = xa_nn_conv2d_depthwise_getsize(
       input_height,
       input_width,
@@ -107,7 +120,7 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s(
 
     xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
         out_batch,
-        p_kernel,
+        p_kernel_hwc,
         in_batch,
         p_bias,
         input_height,
diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp
index 07df1a416d7..2532489a7a0 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp
@@ -48,14 +48,15 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u(
   WORD32* __restrict__ p_bias =
       (WORD32* __restrict__)bias.const_data_ptr<int32_t>();
 
-  WORD32 input_height = conv1d ? 1 : input.size(2);
-  WORD32 input_width = conv1d ? input.size(2) : input.size(3);
-  WORD32 input_channels = input.size(1);
-  WORD32 kernel_height = conv1d ? 1 : weight.size(2);
-  WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3);
+  // NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C]
+  WORD32 input_height = conv1d ? 1 : input.size(1);
+  WORD32 input_width = conv1d ? input.size(1) : input.size(2);
+  WORD32 input_channels = conv1d ? input.size(2) : input.size(3);
+  WORD32 kernel_height = conv1d ? 1 : weight.size(1);
+  WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2);
   WORD32 out_channels = weight.size(0);
-  WORD32 out_height = conv1d ? 1 : out.size(2);
-  WORD32 out_width = conv1d ? out.size(2) : out.size(3);
+  WORD32 out_height = conv1d ? 1 : out.size(1);
+  WORD32 out_width = conv1d ? out.size(1) : out.size(2);
   WORD32 batches = input.size(0);
 
   WORD32 x_stride = stride[1];
@@ -79,6 +80,19 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u(
     out_shift32[i] = 0;
   }
 
+  // Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format) to
+  // [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0).
+  // For depthwise IC/G=1, so this is a transpose of [OC, KH*KW] to [KH*KW, OC].
+  WORD32 kernel_size = kernel_height * kernel_width;
+  WORD32 weight_size = out_channels * kernel_size;
+  UWORD8* p_kernel_hwc =
+      (UWORD8*)kernels::allocate_temp_memory(ctx, weight_size);
+  for (int oc = 0; oc < out_channels; oc++) {
+    for (int k = 0; k < kernel_size; k++) {
+      p_kernel_hwc[k * out_channels + oc] = p_kernel[oc * kernel_size + k];
+    }
+  }
+
   WORD32 scratch_size = xa_nn_conv2d_depthwise_getsize(
       input_height,
       input_width,
@@ -107,7 +121,7 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u(
 
     xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
         (WORD8*)out_batch,
-        (WORD8*)p_kernel,
+        (WORD8*)p_kernel_hwc,
         (WORD8*)in_batch,
         p_bias,
         input_height,
diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
index b2a7c341997..325cce828fe 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
@@ -176,15 +176,16 @@ void xa_opt_quantized_conv2d_nhwc(
     WORD32* __restrict__ p_bias =
         (WORD32* __restrict__)bias.const_data_ptr<int32_t>();
 
-    WORD32 input_height = conv1d ? 1 : input.size(2);
-    WORD32 input_width = conv1d ? input.size(2) : input.size(3);
-    WORD32 input_channels = input.size(1);
-    WORD32 kernel_height = conv1d ? 1 : weight.size(2);
-    WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3);
-    WORD32 kernel_channels = weight.size(1);
+    // NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C]
+    WORD32 input_height = conv1d ? 1 : input.size(1);
+    WORD32 input_width = conv1d ? input.size(1) : input.size(2);
+    WORD32 input_channels = conv1d ? input.size(2) : input.size(3);
+    WORD32 kernel_height = conv1d ? 1 : weight.size(1);
+    WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2);
+    WORD32 kernel_channels = conv1d ? weight.size(2) : weight.size(3);
     WORD32 out_channels = weight.size(0);
-    WORD32 out_height = conv1d ? 1 : out.size(2);
-    WORD32 out_width = conv1d ? out.size(2) : out.size(3);
+    WORD32 out_height = conv1d ? 1 : out.size(1);
+    WORD32 out_width = conv1d ? out.size(1) : out.size(2);
     WORD32 batches = input.size(0);
 
     WORD32 x_stride = stride[1];
@@ -285,6 +286,19 @@ void xa_opt_quantized_conv2d_nhwc(
     if (groups == input_channels) {
       WORD32 channels_multiplier = out_channels / input_channels;
 
+      // Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format) to
+      // [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0).
+      WORD32 kernel_size_dw = kernel_height * kernel_width;
+      WORD32 weight_size_dw = out_channels * kernel_size_dw;
+      WORD8* p_kernel_hwc =
+          (WORD8*)kernels::allocate_temp_memory(ctx, weight_size_dw);
+      for (int oc = 0; oc < out_channels; oc++) {
+        for (int k = 0; k < kernel_size_dw; k++) {
+          p_kernel_hwc[k * out_channels + oc] =
+              p_kernel[oc * kernel_size_dw + k];
+        }
+      }
+
       scratch_size = xa_nn_conv2d_depthwise_getsize(
           input_height,
           input_width,
@@ -322,7 +336,7 @@ void xa_opt_quantized_conv2d_nhwc(
 
         xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
             out_batch,
-            p_kernel,
+            p_kernel_hwc,
             in_batch,
             p_bias,
             input_height,