From a25bf1d8de15d98ac31944af3310cd4f987d3694 Mon Sep 17 00:00:00 2001 From: Tony Arcieri Date: Thu, 26 Feb 2026 23:05:36 -0700 Subject: [PATCH 1/2] keccak: convert ARMv8 ASM into intrinsics Rewrites the inline assembly implementation using an equivalent (but not identical) intrinsics implementation. Also exposes support for computing two Keccak states in parallel which a previous comment in the ASM implementation noted was possible but wasn't actually exposed, and is now available as `p1600_armv8_sha3_times2` (though not yet in the public API, see #110). This is a little tricky due to high register pressure: this implementation uses every vector register. I started by rewriting the round loop and iterating over the round constants, then breaking apart the body into theta and everything else (rho/pi/chi/iota), mapping the NEON registers onto a `[uint64x2_t; 25]` state. Theta was translated by hand, but the rest of them were too tedious regarding a manual mapping of the registers to slots in the state array. So I wrote a small program that operates over a representation of the original assembly, doing all the bookkeeping for which registers map to which slots in the state array, and outputs the equivalent intrinsics code. Godbolt links to the original `asm!` versus this translation: - original: https://godbolt.org/z/G8Mf5vboE - translated: https://godbolt.org/z/sszzbdexK It's using nearly the same number of instructions, but there are differences between the two versions, i.e. it isn't an identical recreation of the original assembly, which I'm not sure is possible/preferable, but it should be functionally equivalent. Benchmarks (`sha3` crate): - Pure software implementation: test sha3_224_10 ... bench: 17.97 ns/iter (+/- 0.32) = 588 MB/s test sha3_224_100 ... bench: 164.15 ns/iter (+/- 5.14) = 609 MB/s test sha3_224_1000 ... bench: 1,646.07 ns/iter (+/- 139.45) = 607 MB/s test sha3_224_10000 ... 
bench: 16,585.52 ns/iter (+/- 1,168.57) = 602 MB/s test sha3_256_10 ... bench: 19.12 ns/iter (+/- 0.77) = 526 MB/s test sha3_256_1000 ... bench: 1,694.21 ns/iter (+/- 41.20) = 590 MB/s test sha3_256_10000 ... bench: 16,807.40 ns/iter (+/- 556.17) = 594 MB/s test sha3_265_100 ... bench: 173.41 ns/iter (+/- 4.98) = 578 MB/s test sha3_384_10 ... bench: 24.32 ns/iter (+/- 1.16) = 416 MB/s test sha3_384_100 ... bench: 225.00 ns/iter (+/- 5.50) = 444 MB/s test sha3_384_1000 ... bench: 2,224.49 ns/iter (+/- 47.86) = 449 MB/s test sha3_384_10000 ... bench: 22,181.02 ns/iter (+/- 971.37) = 450 MB/s test sha3_512_10 ... bench: 33.78 ns/iter (+/- 0.32) = 303 MB/s test sha3_512_100 ... bench: 320.54 ns/iter (+/- 10.77) = 312 MB/s test sha3_512_1000 ... bench: 3,174.62 ns/iter (+/- 80.98) = 315 MB/s test sha3_512_10000 ... bench: 31,629.97 ns/iter (+/- 871.85) = 316 MB/s test shake128_10 ... bench: 15.97 ns/iter (+/- 0.44) = 666 MB/s test shake128_100 ... bench: 142.19 ns/iter (+/- 6.58) = 704 MB/s test shake128_1000 ... bench: 1,390.27 ns/iter (+/- 56.14) = 719 MB/s test shake128_10000 ... bench: 13,813.13 ns/iter (+/- 677.65) = 723 MB/s test shake256_10 ... bench: 19.06 ns/iter (+/- 0.44) = 526 MB/s test shake256_100 ... bench: 173.50 ns/iter (+/- 4.26) = 578 MB/s test shake256_1000 ... bench: 1,695.05 ns/iter (+/- 87.19) = 589 MB/s test shake256_10000 ... bench: 16,882.98 ns/iter (+/- 683.56) = 592 MB/s - New intrinsics implementation: test sha3_224_10 ... bench: 13.07 ns/iter (+/- 0.55) = 769 MB/s test sha3_224_100 ... bench: 111.29 ns/iter (+/- 6.62) = 900 MB/s test sha3_224_1000 ... bench: 1,113.87 ns/iter (+/- 29.88) = 898 MB/s test sha3_224_10000 ... bench: 11,095.95 ns/iter (+/- 302.99) = 901 MB/s test sha3_256_10 ... bench: 13.53 ns/iter (+/- 0.51) = 769 MB/s test sha3_256_1000 ... bench: 1,173.40 ns/iter (+/- 33.72) = 852 MB/s test sha3_256_10000 ... bench: 12,305.99 ns/iter (+/- 623.31) = 812 MB/s test sha3_265_100 ... 
bench: 118.16 ns/iter (+/- 2.85) = 847 MB/s test sha3_384_10 ... bench: 17.27 ns/iter (+/- 0.78) = 588 MB/s test sha3_384_100 ... bench: 153.80 ns/iter (+/- 5.42) = 653 MB/s test sha3_384_1000 ... bench: 1,529.35 ns/iter (+/- 18.99) = 654 MB/s test sha3_384_10000 ... bench: 15,239.19 ns/iter (+/- 189.19) = 656 MB/s test sha3_512_10 ... bench: 23.43 ns/iter (+/- 0.95) = 434 MB/s test sha3_512_100 ... bench: 218.97 ns/iter (+/- 4.01) = 458 MB/s test sha3_512_1000 ... bench: 2,193.58 ns/iter (+/- 37.98) = 455 MB/s test sha3_512_10000 ... bench: 21,968.75 ns/iter (+/- 385.75) = 455 MB/s test shake128_10 ... bench: 11.47 ns/iter (+/- 0.32) = 909 MB/s test shake128_100 ... bench: 95.51 ns/iter (+/- 1.32) = 1052 MB/s test shake128_1000 ... bench: 960.08 ns/iter (+/- 34.57) = 1041 MB/s test shake128_10000 ... bench: 9,564.39 ns/iter (+/- 255.34) = 1045 MB/s test shake256_10 ... bench: 13.61 ns/iter (+/- 0.53) = 769 MB/s test shake256_100 ... bench: 116.77 ns/iter (+/- 1.94) = 862 MB/s test shake256_1000 ... bench: 1,163.09 ns/iter (+/- 27.17) = 859 MB/s test shake256_10000 ... bench: 11,750.47 ns/iter (+/- 250.38) = 851 MB/s - Original assembly: test sha3_224_10 ... bench: 12.54 ns/iter (+/- 0.43) = 833 MB/s test sha3_224_100 ... bench: 109.49 ns/iter (+/- 2.54) = 917 MB/s test sha3_224_1000 ... bench: 1,095.79 ns/iter (+/- 32.04) = 913 MB/s test sha3_224_10000 ... bench: 10,953.02 ns/iter (+/- 157.49) = 912 MB/s test sha3_256_10 ... bench: 13.05 ns/iter (+/- 0.25) = 769 MB/s test sha3_256_1000 ... bench: 1,161.46 ns/iter (+/- 28.09) = 861 MB/s test sha3_256_10000 ... bench: 11,609.98 ns/iter (+/- 148.88) = 861 MB/s test sha3_265_100 ... bench: 118.17 ns/iter (+/- 7.42) = 847 MB/s test sha3_384_10 ... bench: 17.07 ns/iter (+/- 2.80) = 588 MB/s test sha3_384_100 ... bench: 151.93 ns/iter (+/- 4.39) = 662 MB/s test sha3_384_1000 ... bench: 1,506.50 ns/iter (+/- 40.71) = 664 MB/s test sha3_384_10000 ... bench: 15,119.04 ns/iter (+/- 495.59) = 661 MB/s test sha3_512_10 ... 
bench: 22.93 ns/iter (+/- 0.53) = 454 MB/s test sha3_512_100 ... bench: 216.77 ns/iter (+/- 7.42) = 462 MB/s test sha3_512_1000 ... bench: 2,165.67 ns/iter (+/- 49.04) = 461 MB/s test sha3_512_10000 ... bench: 21,666.71 ns/iter (+/- 651.02) = 461 MB/s test shake128_10 ... bench: 11.30 ns/iter (+/- 0.14) = 909 MB/s test shake128_100 ... bench: 94.75 ns/iter (+/- 3.86) = 1063 MB/s test shake128_1000 ... bench: 961.72 ns/iter (+/- 81.88) = 1040 MB/s test shake128_10000 ... bench: 9,573.39 ns/iter (+/- 311.05) = 1044 MB/s test shake256_10 ... bench: 13.17 ns/iter (+/- 0.54) = 769 MB/s test shake256_100 ... bench: 117.39 ns/iter (+/- 3.22) = 854 MB/s test shake256_1000 ... bench: 1,174.65 ns/iter (+/- 45.62) = 851 MB/s test shake256_10000 ... bench: 11,659.19 ns/iter (+/- 330.23) = 857 MB/s The performance seems pretty close to the original assembly, maybe just slightly slower. --- keccak/Cargo.toml | 4 +- keccak/benches/mod.rs | 4 +- keccak/src/armv8.rs | 244 +++++++++++++++++++++--------------------- keccak/src/lib.rs | 38 ++----- 4 files changed, 136 insertions(+), 154 deletions(-) diff --git a/keccak/Cargo.toml b/keccak/Cargo.toml index 91930a4..014d313 100644 --- a/keccak/Cargo.toml +++ b/keccak/Cargo.toml @@ -16,7 +16,7 @@ readme = "README.md" edition = "2024" rust-version = "1.85" -[target.'cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))'.dependencies] +[target.'cfg(target_arch = "aarch64")'.dependencies] cpufeatures = "0.3" [lints.rust] @@ -29,7 +29,7 @@ unused_qualifications = "warn" [lints.rust.unexpected_cfgs] level = "warn" -check-cfg = ['cfg(keccak_backend, values("armv8_asm", "simd", "soft-compact"))'] +check-cfg = ['cfg(keccak_backend, values("simd", "soft-compact"))'] [lints.clippy] borrow_as_ptr = "warn" diff --git a/keccak/benches/mod.rs b/keccak/benches/mod.rs index 34dd3d3..df73767 100644 --- a/keccak/benches/mod.rs +++ b/keccak/benches/mod.rs @@ -1,7 +1,7 @@ //! 
keccak benchmarks #![feature(test)] -#![cfg_attr(feature = "simd", feature(portable_simd))] +#![cfg_attr(keccak_backend = "simd", feature(portable_simd))] extern crate keccak; extern crate test; @@ -35,7 +35,7 @@ fn b_p1600_16(b: &mut test::Bencher) { b.iter(|| p1600(&mut data, 16)); } -#[cfg(feature = "simd")] +#[cfg(keccak_backend = "simd")] mod simd { use keccak::simd::{f1600x2, f1600x4, f1600x8, u64x2, u64x4, u64x8}; diff --git a/keccak/src/armv8.rs b/keccak/src/armv8.rs index cb230a7..5cec580 100644 --- a/keccak/src/armv8.rs +++ b/keccak/src/armv8.rs @@ -1,4 +1,7 @@ -use crate::PLEN; +#![allow(unsafe_op_in_unsafe_fn)] + +use crate::{PLEN, RC}; +use core::{arch::aarch64::*, array}; /// Keccak-p1600 on ARMv8.4-A with `FEAT_SHA3`. /// @@ -6,137 +9,138 @@ use crate::PLEN; /// Adapted from the Keccak-f1600 implementation in the XKCP/K12. /// see #[target_feature(enable = "sha3")] -pub unsafe fn p1600_armv8_sha3_asm(state: &mut [u64; PLEN], round_count: usize) { +pub unsafe fn p1600_armv8_sha3(state: &mut [u64; PLEN], round_count: usize) { + let mut s = [*state, Default::default()]; + // SAFETY: both functions have the same safety invariants, namely they require the `sha3` + // target feature is available, and the caller is responsible for ensuring support + unsafe { p1600_armv8_sha3_times2(&mut s, round_count) }; + *state = s[0]; +} + +/// Keccak-p1600 on ARMv8.4-A with `FEAT_SHA3` with support for 2 parallel states. +/// +/// See p. K12.2.2 p. 11,749 of the ARM Reference manual. +/// Adapted from the Keccak-f1600 implementation in the XKCP/K12. 
+/// +/// +#[target_feature(enable = "sha3")] +pub unsafe fn p1600_armv8_sha3_times2(state: &mut [[u64; PLEN]; 2], round_count: usize) { assert!( matches!(round_count, 1..=24), "invalid round count (must be 1-24): {}", round_count ); - // SAFETY: - // - caller is responsible for ensuring that the target CPU is at least ARMv8.4-A with - // `FEAT_SHA3` using runtime CPU feature detection - // - `round_count` is ensured to be in the range `1..=24` above - // - `state` is valid, aligned, and mutably borrowed as a Rust reference above - unsafe { - core::arch::asm!(" - // Read state - ld1.1d {{ v0- v3}}, [x0], #32 - ld1.1d {{ v4- v7}}, [x0], #32 - ld1.1d {{ v8-v11}}, [x0], #32 - ld1.1d {{v12-v15}}, [x0], #32 - ld1.1d {{v16-v19}}, [x0], #32 - ld1.1d {{v20-v23}}, [x0], #32 - ld1.1d {{v24}}, [x0] - sub x0, x0, #192 - - // NOTE: This loop actually computes two f1600 functions in - // parallel, in both the lower and the upper 64-bit of the - // 128-bit registers v0-v24. - 0: sub x8, x8, #1 - - // Theta Calculations - eor3.16b v25, v20, v15, v10 - eor3.16b v26, v21, v16, v11 - eor3.16b v27, v22, v17, v12 - eor3.16b v28, v23, v18, v13 - eor3.16b v29, v24, v19, v14 - eor3.16b v25, v25, v5, v0 - eor3.16b v26, v26, v6, v1 - eor3.16b v27, v27, v7, v2 - eor3.16b v28, v28, v8, v3 - eor3.16b v29, v29, v9, v4 - rax1.2d v30, v25, v27 - rax1.2d v31, v26, v28 - rax1.2d v27, v27, v29 - rax1.2d v28, v28, v25 - rax1.2d v29, v29, v26 - - // Rho and Phi - eor.16b v0, v0, v29 - xar.2d v25, v1, v30, #64 - 1 - xar.2d v1, v6, v30, #64 - 44 - xar.2d v6, v9, v28, #64 - 20 - xar.2d v9, v22, v31, #64 - 61 - xar.2d v22, v14, v28, #64 - 39 - xar.2d v14, v20, v29, #64 - 18 - xar.2d v26, v2, v31, #64 - 62 - xar.2d v2, v12, v31, #64 - 43 - xar.2d v12, v13, v27, #64 - 25 - xar.2d v13, v19, v28, #64 - 8 - xar.2d v19, v23, v27, #64 - 56 - xar.2d v23, v15, v29, #64 - 41 - xar.2d v15, v4, v28, #64 - 27 - xar.2d v28, v24, v28, #64 - 14 - xar.2d v24, v21, v30, #64 - 2 - xar.2d v8, v8, v27, #64 - 55 - xar.2d v4, 
v16, v30, #64 - 45 - xar.2d v16, v5, v29, #64 - 36 - xar.2d v5, v3, v27, #64 - 28 - xar.2d v27, v18, v27, #64 - 21 - xar.2d v3, v17, v31, #64 - 15 - xar.2d v30, v11, v30, #64 - 10 - xar.2d v31, v7, v31, #64 - 6 - xar.2d v29, v10, v29, #64 - 3 - - // Chi and Iota - bcax.16b v20, v26, v22, v8 - bcax.16b v21, v8, v23, v22 - bcax.16b v22, v22, v24, v23 - bcax.16b v23, v23, v26, v24 - bcax.16b v24, v24, v8, v26 + let mut s: [uint64x2_t; PLEN] = + array::from_fn(|i| vcombine_u64(vcreate_u64(state[0][i]), vcreate_u64(state[1][i]))); - ld1r.2d {{v26}}, [x1], #8 - - bcax.16b v17, v30, v19, v3 - bcax.16b v18, v3, v15, v19 - bcax.16b v19, v19, v16, v15 - bcax.16b v15, v15, v30, v16 - bcax.16b v16, v16, v3, v30 - - bcax.16b v10, v25, v12, v31 - bcax.16b v11, v31, v13, v12 - bcax.16b v12, v12, v14, v13 - bcax.16b v13, v13, v25, v14 - bcax.16b v14, v14, v31, v25 - - bcax.16b v7, v29, v9, v4 - bcax.16b v8, v4, v5, v9 - bcax.16b v9, v9, v6, v5 - bcax.16b v5, v5, v29, v6 - bcax.16b v6, v6, v4, v29 + for &rc in &RC[(24 - round_count)..] 
{ + let (d0, d1, d2, d3, d4) = theta(&s); + let t = rho_pi(&s, d0, d1, d2, d3, d4); + s = chi_iota(&t, rc); + } - bcax.16b v3, v27, v0, v28 - bcax.16b v4, v28, v1, v0 - bcax.16b v0, v0, v2, v1 - bcax.16b v1, v1, v27, v2 - bcax.16b v2, v2, v28, v27 + for i in 0..PLEN { + state[0][i] = vgetq_lane_u64::<0>(s[i]); + state[1][i] = vgetq_lane_u64::<1>(s[i]); + } +} - eor.16b v0,v0,v26 +#[target_feature(enable = "sha3")] +unsafe fn theta( + s: &[uint64x2_t; 25], +) -> (uint64x2_t, uint64x2_t, uint64x2_t, uint64x2_t, uint64x2_t) { + let c0 = veor3q_u64(s[0], s[5], veor3q_u64(s[10], s[15], s[20])); + let c1 = veor3q_u64(s[1], s[6], veor3q_u64(s[11], s[16], s[21])); + let c2 = veor3q_u64(s[2], s[7], veor3q_u64(s[12], s[17], s[22])); + let c3 = veor3q_u64(s[3], s[8], veor3q_u64(s[13], s[18], s[23])); + let c4 = veor3q_u64(s[4], s[9], veor3q_u64(s[14], s[19], s[24])); + + let d0 = vrax1q_u64(c4, c1); + let d1 = vrax1q_u64(c0, c2); + let d2 = vrax1q_u64(c1, c3); + let d3 = vrax1q_u64(c2, c4); + let d4 = vrax1q_u64(c3, c0); + + (d0, d1, d2, d3, d4) +} - // Rounds loop - cbnz w8, 0b +#[target_feature(enable = "sha3")] +unsafe fn rho_pi( + s: &[uint64x2_t; 25], + d0: uint64x2_t, + d1: uint64x2_t, + d2: uint64x2_t, + d3: uint64x2_t, + d4: uint64x2_t, +) -> [uint64x2_t; 25] { + let v0 = veorq_u64(s[0], d0); + let v25 = vxarq_u64::<63>(s[1], d1); + let v1 = vxarq_u64::<20>(s[6], d1); + let v6 = vxarq_u64::<44>(s[9], d4); + let v9 = vxarq_u64::<3>(s[22], d2); + let v22 = vxarq_u64::<25>(s[14], d4); + let v14 = vxarq_u64::<46>(s[20], d0); + let v26 = vxarq_u64::<2>(s[2], d2); + let v2 = vxarq_u64::<21>(s[12], d2); + let v12 = vxarq_u64::<39>(s[13], d3); + let v13 = vxarq_u64::<56>(s[19], d4); + let v19 = vxarq_u64::<8>(s[23], d3); + let v23 = vxarq_u64::<23>(s[15], d0); + let v15 = vxarq_u64::<37>(s[4], d4); + let v28 = vxarq_u64::<50>(s[24], d4); + let v24 = vxarq_u64::<62>(s[21], d1); + let v8 = vxarq_u64::<9>(s[8], d3); + let v4 = vxarq_u64::<19>(s[16], d1); + let v16 = 
vxarq_u64::<28>(s[5], d0); + let v5 = vxarq_u64::<36>(s[3], d3); + let v27 = vxarq_u64::<43>(s[18], d3); + let v3 = vxarq_u64::<49>(s[17], d2); + let v30 = vxarq_u64::<54>(s[11], d1); + let v31 = vxarq_u64::<58>(s[7], d2); + let v29 = vxarq_u64::<61>(s[10], d0); + [ + v0, v25, v26, v5, v15, v16, v1, v31, v8, v6, v29, v30, v2, v12, v22, v23, v4, v3, v27, v13, + v14, v24, v9, v19, v28, + ] +} - // Write state - st1.1d {{ v0- v3}}, [x0], #32 - st1.1d {{ v4- v7}}, [x0], #32 - st1.1d {{ v8-v11}}, [x0], #32 - st1.1d {{v12-v15}}, [x0], #32 - st1.1d {{v16-v19}}, [x0], #32 - st1.1d {{v20-v23}}, [x0], #32 - st1.1d {{v24}}, [x0] - ", - inout("x0") state.as_mut_ptr() => _, - inout("x1") crate::RC[24-round_count..].as_ptr() => _, - inout("x8") round_count => _, - clobber_abi("C"), - options(nostack) - ); - } +#[target_feature(enable = "sha3")] +unsafe fn chi_iota(t: &[uint64x2_t; 25], rc: u64) -> [uint64x2_t; 25] { + let rc_v = vdupq_n_u64(rc); + let v20 = vbcaxq_u64(t[2], t[14], t[8]); + let v21 = vbcaxq_u64(t[8], t[15], t[14]); + let v22 = vbcaxq_u64(t[14], t[21], t[15]); + let v23 = vbcaxq_u64(t[15], t[2], t[21]); + let v24 = vbcaxq_u64(t[21], t[8], t[2]); + let v17 = vbcaxq_u64(t[11], t[23], t[17]); + let v18 = vbcaxq_u64(t[17], t[4], t[23]); + let v19 = vbcaxq_u64(t[23], t[5], t[4]); + let v15 = vbcaxq_u64(t[4], t[11], t[5]); + let v16 = vbcaxq_u64(t[5], t[17], t[11]); + let v10 = vbcaxq_u64(t[1], t[13], t[7]); + let v11 = vbcaxq_u64(t[7], t[19], t[13]); + let v12 = vbcaxq_u64(t[13], t[20], t[19]); + let v13 = vbcaxq_u64(t[19], t[1], t[20]); + let v14 = vbcaxq_u64(t[20], t[7], t[1]); + let v7 = vbcaxq_u64(t[10], t[22], t[16]); + let v8 = vbcaxq_u64(t[16], t[3], t[22]); + let v9 = vbcaxq_u64(t[22], t[9], t[3]); + let v5 = vbcaxq_u64(t[3], t[10], t[9]); + let v6 = vbcaxq_u64(t[9], t[16], t[10]); + let v3 = vbcaxq_u64(t[18], t[0], t[24]); + let v4 = vbcaxq_u64(t[24], t[6], t[0]); + let v0 = vbcaxq_u64(t[0], t[12], t[6]); + let v1 = vbcaxq_u64(t[6], t[18], t[12]); + let v2 = 
vbcaxq_u64(t[12], t[24], t[18]); + let v0_iota = veorq_u64(v0, rc_v); + [ + v0_iota, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, + v19, v20, v21, v22, v23, v24, + ] } #[cfg(all(test, target_feature = "sha3"))] -#[allow(clippy::undocumented_unsafe_blocks)] mod tests { use super::*; @@ -200,9 +204,9 @@ mod tests { ]; let mut state = [0u64; 25]; - unsafe { p1600_armv8_sha3_asm(&mut state, 24) }; + unsafe { p1600_armv8_sha3(&mut state, 24) }; assert_eq!(state, state_first); - unsafe { p1600_armv8_sha3_asm(&mut state, 24) }; + unsafe { p1600_armv8_sha3(&mut state, 24) }; assert_eq!(state, state_second); } } diff --git a/keccak/src/lib.rs b/keccak/src/lib.rs index 2e19bba..e760227 100644 --- a/keccak/src/lib.rs +++ b/keccak/src/lib.rs @@ -38,28 +38,6 @@ //! 0x20D06CD26A8FBF5C, //! ]); //! ``` -//! -//! ## Intrinsics support -//! ### ARMv8 `asm!` -//! The [`KeccakP1600`] struct supports the use of optimized inline assembly implementations of -//! specialized intrinsics for ARMv8 CPUs along with runtime CPU feature detection, but requires -//! setting the `keccak_backend="armv8_asm"` configuration option via `RUSTFLAGS`, e.g. by setting -//! an environment variable: -//! -//! ```console -//! $ RUSTFLAGS='--cfg keccak_backend="armv8_asm"' cargo build --release -//! ``` -//! -//! Or you can persistently configure it for your project in `.cargo/config.toml`: -//! -//! ```toml -//! # In .cargo/config.toml -//! [build] -//! rustflags = ['--cfg', 'keccak_backend="armv8_asm"'] -//! ``` -//! -//! [1]: https://docs.rs/sha3 -//! 
[2]: https://docs.rs/tiny-keccak use core::{ fmt::Debug, @@ -70,10 +48,10 @@ use core::{ #[rustfmt::skip] mod unroll; -#[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))] +#[cfg(target_arch = "aarch64")] mod armv8; -#[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))] +#[cfg(target_arch = "aarch64")] cpufeatures::new!(armv8_sha3_intrinsics, "sha3"); const PLEN: usize = 25; @@ -191,7 +169,7 @@ impl_keccak!(p1600, f1600, u64); #[derive(Clone, Debug)] pub struct KeccakP1600 { state: [u64; PLEN], - #[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))] + #[cfg(target_arch = "aarch64")] has_intrinsics: armv8_sha3_intrinsics::InitToken, } @@ -202,17 +180,17 @@ impl KeccakP1600 { pub fn new(state: [u64; PLEN]) -> Self { Self { state, - #[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))] + #[cfg(target_arch = "aarch64")] has_intrinsics: armv8_sha3_intrinsics::init(), } } /// `Keccak-p[1600, rc]` permutation. pub fn p1600(&mut self, round_count: usize) { - #[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))] + #[cfg(target_arch = "aarch64")] if self.has_intrinsics.get() { // SAFETY: we just performed runtime CPU feature detection above - unsafe { armv8::p1600_armv8_sha3_asm(&mut self.state, round_count) } + unsafe { armv8::p1600_armv8_sha3(&mut self.state, round_count) } return; } @@ -221,10 +199,10 @@ impl KeccakP1600 { /// `Keccak-f[1600]` permutation. 
pub fn f1600(&mut self) { - #[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))] + #[cfg(target_arch = "aarch64")] if self.has_intrinsics.get() { // SAFETY: we just performed runtime CPU feature detection above - unsafe { armv8::p1600_armv8_sha3_asm(&mut self.state, u64::KECCAK_F_ROUND_COUNT) } + unsafe { armv8::p1600_armv8_sha3(&mut self.state, u64::KECCAK_F_ROUND_COUNT) } return; } From 70463c1b0d4b835097048bbff7719121bd36e390 Mon Sep 17 00:00:00 2001 From: Tony Arcieri Date: Fri, 27 Feb 2026 17:30:49 -0700 Subject: [PATCH 2/2] Add CHANGELOG entry and TODO for MSRV bump --- keccak/CHANGELOG.md | 4 ++++ keccak/src/armv8.rs | 3 +++ 2 files changed, 7 insertions(+) diff --git a/keccak/CHANGELOG.md b/keccak/CHANGELOG.md index 94cad8d..aafb9b9 100644 --- a/keccak/CHANGELOG.md +++ b/keccak/CHANGELOG.md @@ -9,10 +9,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 TODO: fill out rest of changelog +### Changed +- ARMv8 backend now on-by-default and written using intrinsics ([#112]) + ### Fixed - Use `doc_cfg` in place of removed `doc_auto_cfg` feature ([#91]) [#91]: https://github.com/RustCrypto/sponges/pull/91 +[#112]: https://github.com/RustCrypto/sponges/pull/112 ## 0.1.6 (2026-02-13) ### Fixed diff --git a/keccak/src/armv8.rs b/keccak/src/armv8.rs index 5cec580..e5c4ba6 100644 --- a/keccak/src/armv8.rs +++ b/keccak/src/armv8.rs @@ -1,3 +1,6 @@ +//! ARMv8 intrinsics-based backend. + +// TODO(tarcieri): remove when MSRV is bumped to 1.87 #![allow(unsafe_op_in_unsafe_fn)] use crate::{PLEN, RC};