From a25bf1d8de15d98ac31944af3310cd4f987d3694 Mon Sep 17 00:00:00 2001
From: Tony Arcieri <bascule@gmail.com>
Date: Thu, 26 Feb 2026 23:05:36 -0700
Subject: [PATCH 1/2] keccak: convert ARMv8 ASM into intrinsics

Rewrites the inline assembly implementation as an equivalent (but not
identical) intrinsics implementation. Also exposes support for computing
two Keccak states in parallel: a comment in the ASM implementation noted
that this was possible, but it was never actually exposed. It is now
available as `p1600_armv8_sha3_times2` (though not yet in the public
API, see #110).

This is a little tricky due to high register pressure: this
implementation uses every vector register.

I started by rewriting the round loop and iterating over the round
constants, then broke the body apart into theta and everything else
(rho/pi/chi/iota), mapping the NEON registers onto a `[uint64x2_t; 25]`
state.

Theta was translated by hand, but the remaining steps were too tedious
to translate manually because every register has to be mapped to a slot
in the state array.
So I wrote a small program that operates over a representation of the
original assembly, doing all the bookkeeping for which registers map to
which slots in the state array, and outputs the equivalent intrinsics
code.

Godbolt links to the original `asm!` versus this translation:
- original: https://godbolt.org/z/G8Mf5vboE
- translated: https://godbolt.org/z/sszzbdexK

It uses nearly the same number of instructions, but the two versions do
differ, i.e. it isn't an identical recreation of the original assembly.
I'm not sure an identical recreation is possible or even preferable, but
the translation should be functionally equivalent.

Benchmarks (`sha3` crate):

- Pure software implementation:

test sha3_224_10    ... bench:          17.97 ns/iter (+/- 0.32) = 588 MB/s
test sha3_224_100   ... bench:         164.15 ns/iter (+/- 5.14) = 609 MB/s
test sha3_224_1000  ... bench:       1,646.07 ns/iter (+/- 139.45) = 607 MB/s
test sha3_224_10000 ... bench:      16,585.52 ns/iter (+/- 1,168.57) = 602 MB/s
test sha3_256_10    ... bench:          19.12 ns/iter (+/- 0.77) = 526 MB/s
test sha3_256_1000  ... bench:       1,694.21 ns/iter (+/- 41.20) = 590 MB/s
test sha3_256_10000 ... bench:      16,807.40 ns/iter (+/- 556.17) = 594 MB/s
test sha3_265_100   ... bench:         173.41 ns/iter (+/- 4.98) = 578 MB/s
test sha3_384_10    ... bench:          24.32 ns/iter (+/- 1.16) = 416 MB/s
test sha3_384_100   ... bench:         225.00 ns/iter (+/- 5.50) = 444 MB/s
test sha3_384_1000  ... bench:       2,224.49 ns/iter (+/- 47.86) = 449 MB/s
test sha3_384_10000 ... bench:      22,181.02 ns/iter (+/- 971.37) = 450 MB/s
test sha3_512_10    ... bench:          33.78 ns/iter (+/- 0.32) = 303 MB/s
test sha3_512_100   ... bench:         320.54 ns/iter (+/- 10.77) = 312 MB/s
test sha3_512_1000  ... bench:       3,174.62 ns/iter (+/- 80.98) = 315 MB/s
test sha3_512_10000 ... bench:      31,629.97 ns/iter (+/- 871.85) = 316 MB/s
test shake128_10    ... bench:          15.97 ns/iter (+/- 0.44) = 666 MB/s
test shake128_100   ... bench:         142.19 ns/iter (+/- 6.58) = 704 MB/s
test shake128_1000  ... bench:       1,390.27 ns/iter (+/- 56.14) = 719 MB/s
test shake128_10000 ... bench:      13,813.13 ns/iter (+/- 677.65) = 723 MB/s
test shake256_10    ... bench:          19.06 ns/iter (+/- 0.44) = 526 MB/s
test shake256_100   ... bench:         173.50 ns/iter (+/- 4.26) = 578 MB/s
test shake256_1000  ... bench:       1,695.05 ns/iter (+/- 87.19) = 589 MB/s
test shake256_10000 ... bench:      16,882.98 ns/iter (+/- 683.56) = 592 MB/s

- New intrinsics implementation:

test sha3_224_10    ... bench:          13.07 ns/iter (+/- 0.55) = 769 MB/s
test sha3_224_100   ... bench:         111.29 ns/iter (+/- 6.62) = 900 MB/s
test sha3_224_1000  ... bench:       1,113.87 ns/iter (+/- 29.88) = 898 MB/s
test sha3_224_10000 ... bench:      11,095.95 ns/iter (+/- 302.99) = 901 MB/s
test sha3_256_10    ... bench:          13.53 ns/iter (+/- 0.51) = 769 MB/s
test sha3_256_1000  ... bench:       1,173.40 ns/iter (+/- 33.72) = 852 MB/s
test sha3_256_10000 ... bench:      12,305.99 ns/iter (+/- 623.31) = 812 MB/s
test sha3_265_100   ... bench:         118.16 ns/iter (+/- 2.85) = 847 MB/s
test sha3_384_10    ... bench:          17.27 ns/iter (+/- 0.78) = 588 MB/s
test sha3_384_100   ... bench:         153.80 ns/iter (+/- 5.42) = 653 MB/s
test sha3_384_1000  ... bench:       1,529.35 ns/iter (+/- 18.99) = 654 MB/s
test sha3_384_10000 ... bench:      15,239.19 ns/iter (+/- 189.19) = 656 MB/s
test sha3_512_10    ... bench:          23.43 ns/iter (+/- 0.95) = 434 MB/s
test sha3_512_100   ... bench:         218.97 ns/iter (+/- 4.01) = 458 MB/s
test sha3_512_1000  ... bench:       2,193.58 ns/iter (+/- 37.98) = 455 MB/s
test sha3_512_10000 ... bench:      21,968.75 ns/iter (+/- 385.75) = 455 MB/s
test shake128_10    ... bench:          11.47 ns/iter (+/- 0.32) = 909 MB/s
test shake128_100   ... bench:          95.51 ns/iter (+/- 1.32) = 1052 MB/s
test shake128_1000  ... bench:         960.08 ns/iter (+/- 34.57) = 1041 MB/s
test shake128_10000 ... bench:       9,564.39 ns/iter (+/- 255.34) = 1045 MB/s
test shake256_10    ... bench:          13.61 ns/iter (+/- 0.53) = 769 MB/s
test shake256_100   ... bench:         116.77 ns/iter (+/- 1.94) = 862 MB/s
test shake256_1000  ... bench:       1,163.09 ns/iter (+/- 27.17) = 859 MB/s
test shake256_10000 ... bench:      11,750.47 ns/iter (+/- 250.38) = 851 MB/s

- Original assembly:

test sha3_224_10    ... bench:          12.54 ns/iter (+/- 0.43) = 833 MB/s
test sha3_224_100   ... bench:         109.49 ns/iter (+/- 2.54) = 917 MB/s
test sha3_224_1000  ... bench:       1,095.79 ns/iter (+/- 32.04) = 913 MB/s
test sha3_224_10000 ... bench:      10,953.02 ns/iter (+/- 157.49) = 912 MB/s
test sha3_256_10    ... bench:          13.05 ns/iter (+/- 0.25) = 769 MB/s
test sha3_256_1000  ... bench:       1,161.46 ns/iter (+/- 28.09) = 861 MB/s
test sha3_256_10000 ... bench:      11,609.98 ns/iter (+/- 148.88) = 861 MB/s
test sha3_265_100   ... bench:         118.17 ns/iter (+/- 7.42) = 847 MB/s
test sha3_384_10    ... bench:          17.07 ns/iter (+/- 2.80) = 588 MB/s
test sha3_384_100   ... bench:         151.93 ns/iter (+/- 4.39) = 662 MB/s
test sha3_384_1000  ... bench:       1,506.50 ns/iter (+/- 40.71) = 664 MB/s
test sha3_384_10000 ... bench:      15,119.04 ns/iter (+/- 495.59) = 661 MB/s
test sha3_512_10    ... bench:          22.93 ns/iter (+/- 0.53) = 454 MB/s
test sha3_512_100   ... bench:         216.77 ns/iter (+/- 7.42) = 462 MB/s
test sha3_512_1000  ... bench:       2,165.67 ns/iter (+/- 49.04) = 461 MB/s
test sha3_512_10000 ... bench:      21,666.71 ns/iter (+/- 651.02) = 461 MB/s
test shake128_10    ... bench:          11.30 ns/iter (+/- 0.14) = 909 MB/s
test shake128_100   ... bench:          94.75 ns/iter (+/- 3.86) = 1063 MB/s
test shake128_1000  ... bench:         961.72 ns/iter (+/- 81.88) = 1040 MB/s
test shake128_10000 ... bench:       9,573.39 ns/iter (+/- 311.05) = 1044 MB/s
test shake256_10    ... bench:          13.17 ns/iter (+/- 0.54) = 769 MB/s
test shake256_100   ... bench:         117.39 ns/iter (+/- 3.22) = 854 MB/s
test shake256_1000  ... bench:       1,174.65 ns/iter (+/- 45.62) = 851 MB/s
test shake256_10000 ... bench:      11,659.19 ns/iter (+/- 330.23) = 857 MB/s

The performance seems pretty close to the original assembly, maybe
just slightly slower.
---
 keccak/Cargo.toml     |   4 +-
 keccak/benches/mod.rs |   4 +-
 keccak/src/armv8.rs   | 244 +++++++++++++++++++++---------------------
 keccak/src/lib.rs     |  38 ++-----
 4 files changed, 136 insertions(+), 154 deletions(-)

diff --git a/keccak/Cargo.toml b/keccak/Cargo.toml
index 91930a4..014d313 100644
--- a/keccak/Cargo.toml
+++ b/keccak/Cargo.toml
@@ -16,7 +16,7 @@ readme = "README.md"
 edition = "2024"
 rust-version = "1.85"
 
-[target.'cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))'.dependencies]
+[target.'cfg(target_arch = "aarch64")'.dependencies]
 cpufeatures = "0.3"
 
 [lints.rust]
@@ -29,7 +29,7 @@ unused_qualifications = "warn"
 
 [lints.rust.unexpected_cfgs]
 level = "warn"
-check-cfg = ['cfg(keccak_backend, values("armv8_asm", "simd", "soft-compact"))']
+check-cfg = ['cfg(keccak_backend, values("simd", "soft-compact"))']
 
 [lints.clippy]
 borrow_as_ptr = "warn"
diff --git a/keccak/benches/mod.rs b/keccak/benches/mod.rs
index 34dd3d3..df73767 100644
--- a/keccak/benches/mod.rs
+++ b/keccak/benches/mod.rs
@@ -1,7 +1,7 @@
 //! keccak benchmarks
 
 #![feature(test)]
-#![cfg_attr(feature = "simd", feature(portable_simd))]
+#![cfg_attr(keccak_backend = "simd", feature(portable_simd))]
 
 extern crate keccak;
 extern crate test;
@@ -35,7 +35,7 @@ fn b_p1600_16(b: &mut test::Bencher) {
     b.iter(|| p1600(&mut data, 16));
 }
 
-#[cfg(feature = "simd")]
+#[cfg(keccak_backend = "simd")]
 mod simd {
     use keccak::simd::{f1600x2, f1600x4, f1600x8, u64x2, u64x4, u64x8};
 
diff --git a/keccak/src/armv8.rs b/keccak/src/armv8.rs
index cb230a7..5cec580 100644
--- a/keccak/src/armv8.rs
+++ b/keccak/src/armv8.rs
@@ -1,4 +1,7 @@
-use crate::PLEN;
+#![allow(unsafe_op_in_unsafe_fn)]
+
+use crate::{PLEN, RC};
+use core::{arch::aarch64::*, array};
 
 /// Keccak-p1600 on ARMv8.4-A with `FEAT_SHA3`.
 ///
@@ -6,137 +9,138 @@ use crate::PLEN;
 /// Adapted from the Keccak-f1600 implementation in the XKCP/K12.
 /// see <https://github.com/XKCP/K12/blob/df6a21e6d1f34c1aa36e8d702540899c97dba5a0/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S#L69>
 #[target_feature(enable = "sha3")]
-pub unsafe fn p1600_armv8_sha3_asm(state: &mut [u64; PLEN], round_count: usize) {
+pub unsafe fn p1600_armv8_sha3(state: &mut [u64; PLEN], round_count: usize) {
+    let mut s = [*state, Default::default()];
+    // SAFETY: both functions have the same safety invariants, namely they require the `sha3`
+    // target feature is available, and the caller is responsible for ensuring support
+    unsafe { p1600_armv8_sha3_times2(&mut s, round_count) };
+    *state = s[0];
+}
+
+/// Keccak-p1600 on ARMv8.4-A with `FEAT_SHA3` with support for 2 parallel states.
+///
+/// See K12.2.2 (p. 11,749) of the ARM Reference manual.
+/// Adapted from the Keccak-f1600 implementation in the XKCP/K12.
+///
+/// <https://github.com/XKCP/K12/blob/df6a21e/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S#L69>
+#[target_feature(enable = "sha3")]
+pub unsafe fn p1600_armv8_sha3_times2(state: &mut [[u64; PLEN]; 2], round_count: usize) {
     assert!(
         matches!(round_count, 1..=24),
         "invalid round count (must be 1-24): {}",
         round_count
     );
 
-    // SAFETY:
-    // - caller is responsible for ensuring that the target CPU is at least ARMv8.4-A with
-    //   `FEAT_SHA3` using runtime CPU feature detection
-    // - `round_count` is ensured to be in the range `1..=24` above
-    // - `state` is valid, aligned, and mutably borrowed as a Rust reference above
-    unsafe {
-        core::arch::asm!("
-            // Read state
-            ld1.1d {{ v0- v3}}, [x0], #32
-            ld1.1d {{ v4- v7}}, [x0], #32
-            ld1.1d {{ v8-v11}}, [x0], #32
-            ld1.1d {{v12-v15}}, [x0], #32
-            ld1.1d {{v16-v19}}, [x0], #32
-            ld1.1d {{v20-v23}}, [x0], #32
-            ld1.1d {{v24}},     [x0]
-            sub x0, x0, #192
-
-            // NOTE: This loop actually computes two f1600 functions in
-            // parallel, in both the lower and the upper 64-bit of the
-            // 128-bit registers v0-v24.
-            0:  sub	x8, x8, #1
-
-            // Theta Calculations
-            eor3.16b   v25, v20, v15, v10
-            eor3.16b   v26, v21, v16, v11
-            eor3.16b   v27, v22, v17, v12
-            eor3.16b   v28, v23, v18, v13
-            eor3.16b   v29, v24, v19, v14
-            eor3.16b   v25, v25,  v5,  v0
-            eor3.16b   v26, v26,  v6,  v1
-            eor3.16b   v27, v27,  v7,  v2
-            eor3.16b   v28, v28,  v8,  v3
-            eor3.16b   v29, v29,  v9,  v4
-            rax1.2d    v30, v25, v27
-            rax1.2d    v31, v26, v28
-            rax1.2d    v27, v27, v29
-            rax1.2d    v28, v28, v25
-            rax1.2d    v29, v29, v26
-
-            // Rho and Phi
-            eor.16b     v0,  v0, v29
-            xar.2d     v25,  v1, v30, #64 -  1
-            xar.2d      v1,  v6, v30, #64 - 44
-            xar.2d      v6,  v9, v28, #64 - 20
-            xar.2d      v9, v22, v31, #64 - 61
-            xar.2d     v22, v14, v28, #64 - 39
-            xar.2d     v14, v20, v29, #64 - 18
-            xar.2d     v26,  v2, v31, #64 - 62
-            xar.2d      v2, v12, v31, #64 - 43
-            xar.2d     v12, v13, v27, #64 - 25
-            xar.2d     v13, v19, v28, #64 -  8
-            xar.2d     v19, v23, v27, #64 - 56
-            xar.2d     v23, v15, v29, #64 - 41
-            xar.2d     v15,  v4, v28, #64 - 27
-            xar.2d     v28, v24, v28, #64 - 14
-            xar.2d     v24, v21, v30, #64 -  2
-            xar.2d      v8,  v8, v27, #64 - 55
-            xar.2d      v4, v16, v30, #64 - 45
-            xar.2d     v16,  v5, v29, #64 - 36
-            xar.2d      v5,  v3, v27, #64 - 28
-            xar.2d     v27, v18, v27, #64 - 21
-            xar.2d      v3, v17, v31, #64 - 15
-            xar.2d     v30, v11, v30, #64 - 10
-            xar.2d     v31,  v7, v31, #64 -  6
-            xar.2d     v29, v10, v29, #64 -  3
-
-            // Chi and Iota
-            bcax.16b   v20, v26, v22,  v8
-            bcax.16b   v21,  v8, v23, v22
-            bcax.16b   v22, v22, v24, v23
-            bcax.16b   v23, v23, v26, v24
-            bcax.16b   v24, v24,  v8, v26
+    let mut s: [uint64x2_t; PLEN] =
+        array::from_fn(|i| vcombine_u64(vcreate_u64(state[0][i]), vcreate_u64(state[1][i])));
 
-            ld1r.2d    {{v26}}, [x1], #8
-
-            bcax.16b   v17, v30, v19,  v3
-            bcax.16b   v18,  v3, v15, v19
-            bcax.16b   v19, v19, v16, v15
-            bcax.16b   v15, v15, v30, v16
-            bcax.16b   v16, v16,  v3, v30
-
-            bcax.16b   v10, v25, v12, v31
-            bcax.16b   v11, v31, v13, v12
-            bcax.16b   v12, v12, v14, v13
-            bcax.16b   v13, v13, v25, v14
-            bcax.16b   v14, v14, v31, v25
-
-            bcax.16b    v7, v29,  v9,  v4
-            bcax.16b    v8,  v4,  v5,  v9
-            bcax.16b    v9,  v9,  v6,  v5
-            bcax.16b    v5,  v5, v29,  v6
-            bcax.16b    v6,  v6,  v4, v29
+    for &rc in &RC[(24 - round_count)..] {
+        let (d0, d1, d2, d3, d4) = theta(&s);
+        let t = rho_pi(&s, d0, d1, d2, d3, d4);
+        s = chi_iota(&t, rc);
+    }
 
-            bcax.16b    v3, v27,  v0, v28
-            bcax.16b    v4, v28,  v1,  v0
-            bcax.16b    v0,  v0,  v2,  v1
-            bcax.16b    v1,  v1, v27,  v2
-            bcax.16b    v2,  v2, v28, v27
+    for i in 0..PLEN {
+        state[0][i] = vgetq_lane_u64::<0>(s[i]);
+        state[1][i] = vgetq_lane_u64::<1>(s[i]);
+    }
+}
 
-            eor.16b v0,v0,v26
+#[target_feature(enable = "sha3")]
+unsafe fn theta(
+    s: &[uint64x2_t; 25],
+) -> (uint64x2_t, uint64x2_t, uint64x2_t, uint64x2_t, uint64x2_t) {
+    let c0 = veor3q_u64(s[0], s[5], veor3q_u64(s[10], s[15], s[20]));
+    let c1 = veor3q_u64(s[1], s[6], veor3q_u64(s[11], s[16], s[21]));
+    let c2 = veor3q_u64(s[2], s[7], veor3q_u64(s[12], s[17], s[22]));
+    let c3 = veor3q_u64(s[3], s[8], veor3q_u64(s[13], s[18], s[23]));
+    let c4 = veor3q_u64(s[4], s[9], veor3q_u64(s[14], s[19], s[24]));
+
+    let d0 = vrax1q_u64(c4, c1);
+    let d1 = vrax1q_u64(c0, c2);
+    let d2 = vrax1q_u64(c1, c3);
+    let d3 = vrax1q_u64(c2, c4);
+    let d4 = vrax1q_u64(c3, c0);
+
+    (d0, d1, d2, d3, d4)
+}
 
-            // Rounds loop
-            cbnz    w8, 0b
+#[target_feature(enable = "sha3")]
+unsafe fn rho_pi(
+    s: &[uint64x2_t; 25],
+    d0: uint64x2_t,
+    d1: uint64x2_t,
+    d2: uint64x2_t,
+    d3: uint64x2_t,
+    d4: uint64x2_t,
+) -> [uint64x2_t; 25] {
+    let v0 = veorq_u64(s[0], d0);
+    let v25 = vxarq_u64::<63>(s[1], d1);
+    let v1 = vxarq_u64::<20>(s[6], d1);
+    let v6 = vxarq_u64::<44>(s[9], d4);
+    let v9 = vxarq_u64::<3>(s[22], d2);
+    let v22 = vxarq_u64::<25>(s[14], d4);
+    let v14 = vxarq_u64::<46>(s[20], d0);
+    let v26 = vxarq_u64::<2>(s[2], d2);
+    let v2 = vxarq_u64::<21>(s[12], d2);
+    let v12 = vxarq_u64::<39>(s[13], d3);
+    let v13 = vxarq_u64::<56>(s[19], d4);
+    let v19 = vxarq_u64::<8>(s[23], d3);
+    let v23 = vxarq_u64::<23>(s[15], d0);
+    let v15 = vxarq_u64::<37>(s[4], d4);
+    let v28 = vxarq_u64::<50>(s[24], d4);
+    let v24 = vxarq_u64::<62>(s[21], d1);
+    let v8 = vxarq_u64::<9>(s[8], d3);
+    let v4 = vxarq_u64::<19>(s[16], d1);
+    let v16 = vxarq_u64::<28>(s[5], d0);
+    let v5 = vxarq_u64::<36>(s[3], d3);
+    let v27 = vxarq_u64::<43>(s[18], d3);
+    let v3 = vxarq_u64::<49>(s[17], d2);
+    let v30 = vxarq_u64::<54>(s[11], d1);
+    let v31 = vxarq_u64::<58>(s[7], d2);
+    let v29 = vxarq_u64::<61>(s[10], d0);
+    [
+        v0, v25, v26, v5, v15, v16, v1, v31, v8, v6, v29, v30, v2, v12, v22, v23, v4, v3, v27, v13,
+        v14, v24, v9, v19, v28,
+    ]
+}
 
-            // Write state
-            st1.1d	{{ v0- v3}}, [x0], #32
-            st1.1d	{{ v4- v7}}, [x0], #32
-            st1.1d	{{ v8-v11}}, [x0], #32
-            st1.1d	{{v12-v15}}, [x0], #32
-            st1.1d	{{v16-v19}}, [x0], #32
-            st1.1d	{{v20-v23}}, [x0], #32
-            st1.1d	{{v24}},     [x0]
-        ",
-            inout("x0") state.as_mut_ptr() => _,
-            inout("x1") crate::RC[24-round_count..].as_ptr() => _,
-            inout("x8") round_count => _,
-            clobber_abi("C"),
-            options(nostack)
-        );
-    }
+#[target_feature(enable = "sha3")]
+unsafe fn chi_iota(t: &[uint64x2_t; 25], rc: u64) -> [uint64x2_t; 25] {
+    let rc_v = vdupq_n_u64(rc);
+    let v20 = vbcaxq_u64(t[2], t[14], t[8]);
+    let v21 = vbcaxq_u64(t[8], t[15], t[14]);
+    let v22 = vbcaxq_u64(t[14], t[21], t[15]);
+    let v23 = vbcaxq_u64(t[15], t[2], t[21]);
+    let v24 = vbcaxq_u64(t[21], t[8], t[2]);
+    let v17 = vbcaxq_u64(t[11], t[23], t[17]);
+    let v18 = vbcaxq_u64(t[17], t[4], t[23]);
+    let v19 = vbcaxq_u64(t[23], t[5], t[4]);
+    let v15 = vbcaxq_u64(t[4], t[11], t[5]);
+    let v16 = vbcaxq_u64(t[5], t[17], t[11]);
+    let v10 = vbcaxq_u64(t[1], t[13], t[7]);
+    let v11 = vbcaxq_u64(t[7], t[19], t[13]);
+    let v12 = vbcaxq_u64(t[13], t[20], t[19]);
+    let v13 = vbcaxq_u64(t[19], t[1], t[20]);
+    let v14 = vbcaxq_u64(t[20], t[7], t[1]);
+    let v7 = vbcaxq_u64(t[10], t[22], t[16]);
+    let v8 = vbcaxq_u64(t[16], t[3], t[22]);
+    let v9 = vbcaxq_u64(t[22], t[9], t[3]);
+    let v5 = vbcaxq_u64(t[3], t[10], t[9]);
+    let v6 = vbcaxq_u64(t[9], t[16], t[10]);
+    let v3 = vbcaxq_u64(t[18], t[0], t[24]);
+    let v4 = vbcaxq_u64(t[24], t[6], t[0]);
+    let v0 = vbcaxq_u64(t[0], t[12], t[6]);
+    let v1 = vbcaxq_u64(t[6], t[18], t[12]);
+    let v2 = vbcaxq_u64(t[12], t[24], t[18]);
+    let v0_iota = veorq_u64(v0, rc_v);
+    [
+        v0_iota, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
+        v19, v20, v21, v22, v23, v24,
+    ]
 }
 
 #[cfg(all(test, target_feature = "sha3"))]
-#[allow(clippy::undocumented_unsafe_blocks)]
 mod tests {
     use super::*;
 
@@ -200,9 +204,9 @@ mod tests {
         ];
 
         let mut state = [0u64; 25];
-        unsafe { p1600_armv8_sha3_asm(&mut state, 24) };
+        unsafe { p1600_armv8_sha3(&mut state, 24) };
         assert_eq!(state, state_first);
-        unsafe { p1600_armv8_sha3_asm(&mut state, 24) };
+        unsafe { p1600_armv8_sha3(&mut state, 24) };
         assert_eq!(state, state_second);
     }
 }
diff --git a/keccak/src/lib.rs b/keccak/src/lib.rs
index 2e19bba..e760227 100644
--- a/keccak/src/lib.rs
+++ b/keccak/src/lib.rs
@@ -38,28 +38,6 @@
 //!     0x20D06CD26A8FBF5C,
 //! ]);
 //! ```
-//!
-//! ## Intrinsics support
-//! ### ARMv8 `asm!`
-//! The [`KeccakP1600`] struct supports the use of optimized inline assembly implementations of
-//! specialized intrinsics for ARMv8 CPUs along with runtime CPU feature detection, but requires
-//! setting the `keccak_backend="armv8_asm"` configuration option via `RUSTFLAGS`, e.g. by setting
-//! an environment variable:
-//!
-//! ```console
-//! $ RUSTFLAGS='--cfg keccak_backend="armv8_asm"' cargo build --release
-//! ```
-//!
-//! Or you can persistently configure it for your project in `.cargo/config.toml`:
-//!
-//! ```toml
-//! # In .cargo/config.toml
-//! [build]
-//! rustflags = ['--cfg', 'keccak_backend="armv8_asm"']
-//! ```
-//!
-//! [1]: https://docs.rs/sha3
-//! [2]: https://docs.rs/tiny-keccak
 
 use core::{
     fmt::Debug,
@@ -70,10 +48,10 @@ use core::{
 #[rustfmt::skip]
 mod unroll;
 
-#[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))]
+#[cfg(target_arch = "aarch64")]
 mod armv8;
 
-#[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))]
+#[cfg(target_arch = "aarch64")]
 cpufeatures::new!(armv8_sha3_intrinsics, "sha3");
 
 const PLEN: usize = 25;
@@ -191,7 +169,7 @@ impl_keccak!(p1600, f1600, u64);
 #[derive(Clone, Debug)]
 pub struct KeccakP1600 {
     state: [u64; PLEN],
-    #[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))]
+    #[cfg(target_arch = "aarch64")]
     has_intrinsics: armv8_sha3_intrinsics::InitToken,
 }
 
@@ -202,17 +180,17 @@ impl KeccakP1600 {
     pub fn new(state: [u64; PLEN]) -> Self {
         Self {
             state,
-            #[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))]
+            #[cfg(target_arch = "aarch64")]
             has_intrinsics: armv8_sha3_intrinsics::init(),
         }
     }
 
     /// `Keccak-p[1600, rc]` permutation.
     pub fn p1600(&mut self, round_count: usize) {
-        #[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))]
+        #[cfg(target_arch = "aarch64")]
         if self.has_intrinsics.get() {
             // SAFETY: we just performed runtime CPU feature detection above
-            unsafe { armv8::p1600_armv8_sha3_asm(&mut self.state, round_count) }
+            unsafe { armv8::p1600_armv8_sha3(&mut self.state, round_count) }
             return;
         }
 
@@ -221,10 +199,10 @@ impl KeccakP1600 {
 
     /// `Keccak-f[1600]` permutation.
     pub fn f1600(&mut self) {
-        #[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))]
+        #[cfg(target_arch = "aarch64")]
         if self.has_intrinsics.get() {
             // SAFETY: we just performed runtime CPU feature detection above
-            unsafe { armv8::p1600_armv8_sha3_asm(&mut self.state, u64::KECCAK_F_ROUND_COUNT) }
+            unsafe { armv8::p1600_armv8_sha3(&mut self.state, u64::KECCAK_F_ROUND_COUNT) }
             return;
         }
 

From 70463c1b0d4b835097048bbff7719121bd36e390 Mon Sep 17 00:00:00 2001
From: Tony Arcieri <bascule@gmail.com>
Date: Fri, 27 Feb 2026 17:30:49 -0700
Subject: [PATCH 2/2] Add CHANGELOG entry and TODO for MSRV bump

---
 keccak/CHANGELOG.md | 4 ++++
 keccak/src/armv8.rs | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/keccak/CHANGELOG.md b/keccak/CHANGELOG.md
index 94cad8d..aafb9b9 100644
--- a/keccak/CHANGELOG.md
+++ b/keccak/CHANGELOG.md
@@ -9,10 +9,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 TODO: fill out rest of changelog
 
+### Changed
+- ARMv8 backend now on-by-default and written using intrinsics ([#112])
+
 ### Fixed
 - Use `doc_cfg` in place of removed `doc_auto_cfg` feature ([#91])
 
 [#91]: https://github.com/RustCrypto/sponges/pull/91
+[#112]: https://github.com/RustCrypto/sponges/pull/112
 
 ## 0.1.6 (2026-02-13)
 ### Fixed
diff --git a/keccak/src/armv8.rs b/keccak/src/armv8.rs
index 5cec580..e5c4ba6 100644
--- a/keccak/src/armv8.rs
+++ b/keccak/src/armv8.rs
@@ -1,3 +1,6 @@
+//! ARMv8 intrinsics-based backend.
+
+// TODO(tarcieri): remove this `allow` once MSRV is 1.87
 #![allow(unsafe_op_in_unsafe_fn)]
 
 use crate::{PLEN, RC};