pytorch · JakeStevens · Feb 12, 2026
diff --git a/.../hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp b/.../hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp
@@ -48,14 +48,15 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s(
   WORD32* __restrict__ p_bias =
       (WORD32* __restrict__)bias.const_data_ptr<int32_t>();
 
-  WORD32 input_height = conv1d ? 1 : input.size(2);
-  WORD32 input_width = conv1d ? input.size(2) : input.size(3);
-  WORD32 input_channels = input.size(1);
-  WORD32 kernel_height = conv1d ? 1 : weight.size(2);
-  WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3);
+  // NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C]
+  WORD32 input_height = conv1d ? 1 : input.size(1);
+  WORD32 input_width = conv1d ? input.size(1) : input.size(2);
+  WORD32 input_channels = conv1d ? input.size(2) : input.size(3);
+  WORD32 kernel_height = conv1d ? 1 : weight.size(1);
+  WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2);
   WORD32 out_channels = weight.size(0);
-  WORD32 out_height = conv1d ? 1 : out.size(2);
-  WORD32 out_width = conv1d ? out.size(2) : out.size(3);
+  WORD32 out_height = conv1d ? 1 : out.size(1);
+  WORD32 out_width = conv1d ? out.size(1) : out.size(2);
   WORD32 batches = input.size(0);
 
   WORD32 x_stride = stride[1];
@@ -79,6 +80,18 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s(
     out_shift32[i] = 0;
   }
 
+  // Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format) to
+  // [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0).
+  // For depthwise IC/G=1, so this is a transpose of [OC, KH*KW] to [KH*KW, OC].
+  WORD32 kernel_size = kernel_height * kernel_width;
+  WORD32 weight_size = out_channels * kernel_size;
+  WORD8* p_kernel_hwc = (WORD8*)kernels::allocate_temp_memory(ctx, weight_size);
+  for (int oc = 0; oc < out_channels; oc++) {
+    for (int k = 0; k < kernel_size; k++) {
+      p_kernel_hwc[k * out_channels + oc] = p_kernel[oc * kernel_size + k];
+    }
+  }
+
   WORD32 scratch_size = xa_nn_conv2d_depthwise_getsize(
       input_height,
       input_width,
@@ -107,7 +120,7 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s(
 
     xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
         out_batch,
-        p_kernel,
+        p_kernel_hwc,
         in_batch,
         p_bias,
         input_height,

diff --git a/.../hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp b/.../hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp
@@ -48,14 +48,15 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u(
   WORD32* __restrict__ p_bias =
       (WORD32* __restrict__)bias.const_data_ptr<int32_t>();
 
-  WORD32 input_height = conv1d ? 1 : input.size(2);
-  WORD32 input_width = conv1d ? input.size(2) : input.size(3);
-  WORD32 input_channels = input.size(1);
-  WORD32 kernel_height = conv1d ? 1 : weight.size(2);
-  WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3);
+  // NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C]
+  WORD32 input_height = conv1d ? 1 : input.size(1);
+  WORD32 input_width = conv1d ? input.size(1) : input.size(2);
+  WORD32 input_channels = conv1d ? input.size(2) : input.size(3);
+  WORD32 kernel_height = conv1d ? 1 : weight.size(1);
+  WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2);
   WORD32 out_channels = weight.size(0);
-  WORD32 out_height = conv1d ? 1 : out.size(2);
-  WORD32 out_width = conv1d ? out.size(2) : out.size(3);
+  WORD32 out_height = conv1d ? 1 : out.size(1);
+  WORD32 out_width = conv1d ? out.size(1) : out.size(2);
   WORD32 batches = input.size(0);
 
   WORD32 x_stride = stride[1];
@@ -79,6 +80,19 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u(
     out_shift32[i] = 0;
   }
 
+  // Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format) to
+  // [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0).
+  // For depthwise IC/G=1, so this is a transpose of [OC, KH*KW] to [KH*KW, OC].
+  WORD32 kernel_size = kernel_height * kernel_width;
+  WORD32 weight_size = out_channels * kernel_size;
+  UWORD8* p_kernel_hwc =
+      (UWORD8*)kernels::allocate_temp_memory(ctx, weight_size);
+  for (int oc = 0; oc < out_channels; oc++) {
+    for (int k = 0; k < kernel_size; k++) {
+      p_kernel_hwc[k * out_channels + oc] = p_kernel[oc * kernel_size + k];
+    }
+  }
+
   WORD32 scratch_size = xa_nn_conv2d_depthwise_getsize(
       input_height,
       input_width,
@@ -107,7 +121,7 @@ void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u(
 
     xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
         (WORD8*)out_batch,
-        (WORD8*)p_kernel,
+        (WORD8*)p_kernel_hwc,
         (WORD8*)in_batch,
         p_bias,
         input_height,

diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
@@ -176,15 +176,16 @@ void xa_opt_quantized_conv2d_nhwc(
     WORD32* __restrict__ p_bias =
         (WORD32* __restrict__)bias.const_data_ptr<int32_t>();
 
-    WORD32 input_height = conv1d ? 1 : input.size(2);
-    WORD32 input_width = conv1d ? input.size(2) : input.size(3);
-    WORD32 input_channels = input.size(1);
-    WORD32 kernel_height = conv1d ? 1 : weight.size(2);
-    WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3);
-    WORD32 kernel_channels = weight.size(1);
+    // NHWC layout: 4D=[N,H,W,C], 3D=[N,W,C]
+    WORD32 input_height = conv1d ? 1 : input.size(1);
+    WORD32 input_width = conv1d ? input.size(1) : input.size(2);
+    WORD32 input_channels = conv1d ? input.size(2) : input.size(3);
+    WORD32 kernel_height = conv1d ? 1 : weight.size(1);
+    WORD32 kernel_width = conv1d ? weight.size(1) : weight.size(2);
+    WORD32 kernel_channels = conv1d ? weight.size(2) : weight.size(3);
     WORD32 out_channels = weight.size(0);
-    WORD32 out_height = conv1d ? 1 : out.size(2);
-    WORD32 out_width = conv1d ? out.size(2) : out.size(3);
+    WORD32 out_height = conv1d ? 1 : out.size(1);
+    WORD32 out_width = conv1d ? out.size(1) : out.size(2);
     WORD32 batches = input.size(0);
 
     WORD32 x_stride = stride[1];
@@ -285,6 +286,19 @@ void xa_opt_quantized_conv2d_nhwc(
     if (groups == input_channels) {
       WORD32 channels_multiplier = out_channels / input_channels;
 
+      // Rearrange weight from [OC, KH, KW, IC/G] (graph NHWC format; IC/G=1) to
+      // [KH, KW, OC] (NNLib HWC format expected for inp_data_format=0).
+      WORD32 kernel_size_dw = kernel_height * kernel_width;
+      WORD32 weight_size_dw = out_channels * kernel_size_dw;
+      WORD8* p_kernel_hwc =
+          (WORD8*)kernels::allocate_temp_memory(ctx, weight_size_dw);
+      for (int oc = 0; oc < out_channels; oc++) {
+        for (int k = 0; k < kernel_size_dw; k++) {
+          p_kernel_hwc[k * out_channels + oc] =
+              p_kernel[oc * kernel_size_dw + k];
+        }
+      }
+
       scratch_size = xa_nn_conv2d_depthwise_getsize(
           input_height,
           input_width,
@@ -322,7 +336,7 @@ void xa_opt_quantized_conv2d_nhwc(
 
         xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
             out_batch,
-            p_kernel,
+            p_kernel_hwc,
             in_batch,
             p_bias,
             input_height,