diff --git a/keccak/CHANGELOG.md b/keccak/CHANGELOG.md
index 94cad8d..aafb9b9 100644
--- a/keccak/CHANGELOG.md
+++ b/keccak/CHANGELOG.md
@@ -9,10 +9,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 TODO: fill out rest of changelog
 
+### Changed
+- ARMv8 backend is now enabled by default, replacing the `keccak_backend="armv8_asm"` opt-in, and is written using intrinsics ([#112])
+
 ### Fixed
 - Use `doc_cfg` in place of removed `doc_auto_cfg` feature ([#91])
 
 [#91]: https://github.com/RustCrypto/sponges/pull/91
+[#112]: https://github.com/RustCrypto/sponges/pull/112
 
 ## 0.1.6 (2026-02-13)
 ### Fixed
diff --git a/keccak/Cargo.toml b/keccak/Cargo.toml
index 91930a4..014d313 100644
--- a/keccak/Cargo.toml
+++ b/keccak/Cargo.toml
@@ -16,7 +16,7 @@ readme = "README.md"
 edition = "2024"
 rust-version = "1.85"
 
-[target.'cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))'.dependencies]
+[target.'cfg(target_arch = "aarch64")'.dependencies]
 cpufeatures = "0.3"
 
 [lints.rust]
@@ -29,7 +29,7 @@ unused_qualifications = "warn"
 
 [lints.rust.unexpected_cfgs]
 level = "warn"
-check-cfg = ['cfg(keccak_backend, values("armv8_asm", "simd", "soft-compact"))']
+check-cfg = ['cfg(keccak_backend, values("simd", "soft-compact"))']
 
 [lints.clippy]
 borrow_as_ptr = "warn"
diff --git a/keccak/benches/mod.rs b/keccak/benches/mod.rs
index 34dd3d3..df73767 100644
--- a/keccak/benches/mod.rs
+++ b/keccak/benches/mod.rs
@@ -1,7 +1,7 @@
 //! keccak benchmarks
 
 #![feature(test)]
-#![cfg_attr(feature = "simd", feature(portable_simd))]
+#![cfg_attr(keccak_backend = "simd", feature(portable_simd))]
 
 extern crate keccak;
 extern crate test;
@@ -35,7 +35,7 @@ fn b_p1600_16(b: &mut test::Bencher) {
     b.iter(|| p1600(&mut data, 16));
 }
 
-#[cfg(feature = "simd")]
+#[cfg(keccak_backend = "simd")]
 mod simd {
     use keccak::simd::{f1600x2, f1600x4, f1600x8, u64x2, u64x4, u64x8};
 
diff --git a/keccak/src/armv8.rs b/keccak/src/armv8.rs
index cb230a7..e5c4ba6 100644
--- a/keccak/src/armv8.rs
+++ b/keccak/src/armv8.rs
@@ -1,4 +1,10 @@
-use crate::PLEN;
+//! ARMv8 intrinsics-based backend.
+
+// TODO(tarcieri): remove once MSRV is 1.87 (intrinsics are then safe in `#[target_feature]` fns)
+#![allow(unsafe_op_in_unsafe_fn)]
+
+use crate::{PLEN, RC};
+use core::{arch::aarch64::*, array};
 
 /// Keccak-p1600 on ARMv8.4-A with `FEAT_SHA3`.
 ///
@@ -6,137 +12,138 @@ use crate::PLEN;
 /// Adapted from the Keccak-f1600 implementation in the XKCP/K12.
 /// see <https://github.com/XKCP/K12/blob/df6a21e6d1f34c1aa36e8d702540899c97dba5a0/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S#L69>
 #[target_feature(enable = "sha3")]
-pub unsafe fn p1600_armv8_sha3_asm(state: &mut [u64; PLEN], round_count: usize) {
+pub unsafe fn p1600_armv8_sha3(state: &mut [u64; PLEN], round_count: usize) {
+    let mut s = [*state, Default::default()]; // pair the caller's state with a zero-filled dummy
+    // SAFETY: both functions have the same safety invariants, namely they require the `sha3`
+    // target feature is available, and the caller is responsible for ensuring support
+    unsafe { p1600_armv8_sha3_times2(&mut s, round_count) }; // asserts round_count is in 1..=24
+    *state = s[0]; // only the first state is copied back; the dummy's result is dropped
+}
+
+/// Keccak-p1600 on ARMv8.4-A with `FEAT_SHA3` with support for 2 parallel states.
+///
+/// See section K12.2.2 (p. 11,749) of the Arm Architecture Reference Manual.
+/// Adapted from the Keccak-f1600 implementation in the XKCP/K12.
+///
+/// <https://github.com/XKCP/K12/blob/df6a21e/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S#L69>
+#[target_feature(enable = "sha3")]
+pub unsafe fn p1600_armv8_sha3_times2(state: &mut [[u64; PLEN]; 2], round_count: usize) {
     assert!(
         matches!(round_count, 1..=24),
         "invalid round count (must be 1-24): {}",
         round_count
     );
 
-    // SAFETY:
-    // - caller is responsible for ensuring that the target CPU is at least ARMv8.4-A with
-    //   `FEAT_SHA3` using runtime CPU feature detection
-    // - `round_count` is ensured to be in the range `1..=24` above
-    // - `state` is valid, aligned, and mutably borrowed as a Rust reference above
-    unsafe {
-        core::arch::asm!("
-            // Read state
-            ld1.1d {{ v0- v3}}, [x0], #32
-            ld1.1d {{ v4- v7}}, [x0], #32
-            ld1.1d {{ v8-v11}}, [x0], #32
-            ld1.1d {{v12-v15}}, [x0], #32
-            ld1.1d {{v16-v19}}, [x0], #32
-            ld1.1d {{v20-v23}}, [x0], #32
-            ld1.1d {{v24}},     [x0]
-            sub x0, x0, #192
-
-            // NOTE: This loop actually computes two f1600 functions in
-            // parallel, in both the lower and the upper 64-bit of the
-            // 128-bit registers v0-v24.
-            0:  sub	x8, x8, #1
-
-            // Theta Calculations
-            eor3.16b   v25, v20, v15, v10
-            eor3.16b   v26, v21, v16, v11
-            eor3.16b   v27, v22, v17, v12
-            eor3.16b   v28, v23, v18, v13
-            eor3.16b   v29, v24, v19, v14
-            eor3.16b   v25, v25,  v5,  v0
-            eor3.16b   v26, v26,  v6,  v1
-            eor3.16b   v27, v27,  v7,  v2
-            eor3.16b   v28, v28,  v8,  v3
-            eor3.16b   v29, v29,  v9,  v4
-            rax1.2d    v30, v25, v27
-            rax1.2d    v31, v26, v28
-            rax1.2d    v27, v27, v29
-            rax1.2d    v28, v28, v25
-            rax1.2d    v29, v29, v26
-
-            // Rho and Phi
-            eor.16b     v0,  v0, v29
-            xar.2d     v25,  v1, v30, #64 -  1
-            xar.2d      v1,  v6, v30, #64 - 44
-            xar.2d      v6,  v9, v28, #64 - 20
-            xar.2d      v9, v22, v31, #64 - 61
-            xar.2d     v22, v14, v28, #64 - 39
-            xar.2d     v14, v20, v29, #64 - 18
-            xar.2d     v26,  v2, v31, #64 - 62
-            xar.2d      v2, v12, v31, #64 - 43
-            xar.2d     v12, v13, v27, #64 - 25
-            xar.2d     v13, v19, v28, #64 -  8
-            xar.2d     v19, v23, v27, #64 - 56
-            xar.2d     v23, v15, v29, #64 - 41
-            xar.2d     v15,  v4, v28, #64 - 27
-            xar.2d     v28, v24, v28, #64 - 14
-            xar.2d     v24, v21, v30, #64 -  2
-            xar.2d      v8,  v8, v27, #64 - 55
-            xar.2d      v4, v16, v30, #64 - 45
-            xar.2d     v16,  v5, v29, #64 - 36
-            xar.2d      v5,  v3, v27, #64 - 28
-            xar.2d     v27, v18, v27, #64 - 21
-            xar.2d      v3, v17, v31, #64 - 15
-            xar.2d     v30, v11, v30, #64 - 10
-            xar.2d     v31,  v7, v31, #64 -  6
-            xar.2d     v29, v10, v29, #64 -  3
-
-            // Chi and Iota
-            bcax.16b   v20, v26, v22,  v8
-            bcax.16b   v21,  v8, v23, v22
-            bcax.16b   v22, v22, v24, v23
-            bcax.16b   v23, v23, v26, v24
-            bcax.16b   v24, v24,  v8, v26
+    let mut s: [uint64x2_t; PLEN] =
+        array::from_fn(|i| vcombine_u64(vcreate_u64(state[0][i]), vcreate_u64(state[1][i])));
 
-            ld1r.2d    {{v26}}, [x1], #8
-
-            bcax.16b   v17, v30, v19,  v3
-            bcax.16b   v18,  v3, v15, v19
-            bcax.16b   v19, v19, v16, v15
-            bcax.16b   v15, v15, v30, v16
-            bcax.16b   v16, v16,  v3, v30
-
-            bcax.16b   v10, v25, v12, v31
-            bcax.16b   v11, v31, v13, v12
-            bcax.16b   v12, v12, v14, v13
-            bcax.16b   v13, v13, v25, v14
-            bcax.16b   v14, v14, v31, v25
-
-            bcax.16b    v7, v29,  v9,  v4
-            bcax.16b    v8,  v4,  v5,  v9
-            bcax.16b    v9,  v9,  v6,  v5
-            bcax.16b    v5,  v5, v29,  v6
-            bcax.16b    v6,  v6,  v4, v29
+    for &rc in &RC[(24 - round_count)..] {
+        let (d0, d1, d2, d3, d4) = theta(&s);
+        let t = rho_pi(&s, d0, d1, d2, d3, d4);
+        s = chi_iota(&t, rc);
+    }
 
-            bcax.16b    v3, v27,  v0, v28
-            bcax.16b    v4, v28,  v1,  v0
-            bcax.16b    v0,  v0,  v2,  v1
-            bcax.16b    v1,  v1, v27,  v2
-            bcax.16b    v2,  v2, v28, v27
+    for i in 0..PLEN {
+        state[0][i] = vgetq_lane_u64::<0>(s[i]);
+        state[1][i] = vgetq_lane_u64::<1>(s[i]);
+    }
+}
 
-            eor.16b v0,v0,v26
+#[target_feature(enable = "sha3")] // Theta: column parities `c` and the `d` values built from them
+unsafe fn theta(
+    s: &[uint64x2_t; 25],
+) -> (uint64x2_t, uint64x2_t, uint64x2_t, uint64x2_t, uint64x2_t) {
+    let c0 = veor3q_u64(s[0], s[5], veor3q_u64(s[10], s[15], s[20])); // c[x] = XOR of s[x + 5*y]
+    let c1 = veor3q_u64(s[1], s[6], veor3q_u64(s[11], s[16], s[21]));
+    let c2 = veor3q_u64(s[2], s[7], veor3q_u64(s[12], s[17], s[22]));
+    let c3 = veor3q_u64(s[3], s[8], veor3q_u64(s[13], s[18], s[23]));
+    let c4 = veor3q_u64(s[4], s[9], veor3q_u64(s[14], s[19], s[24]));
+
+    let d0 = vrax1q_u64(c4, c1); // RAX1(a, b) = a ^ rotl(b, 1), so d[x] = c[x-1] ^ rotl(c[x+1], 1)
+    let d1 = vrax1q_u64(c0, c2);
+    let d2 = vrax1q_u64(c1, c3);
+    let d3 = vrax1q_u64(c2, c4);
+    let d4 = vrax1q_u64(c3, c0); // indices wrap modulo 5
+
+    (d0, d1, d2, d3, d4) // not applied to the state here; the caller hands these to `rho_pi`
+}
 
-            // Rounds loop
-            cbnz    w8, 0b
+#[target_feature(enable = "sha3")] // XORs each lane with its column's `d` value, then rotates it
+unsafe fn rho_pi(
+    s: &[uint64x2_t; 25],
+    d0: uint64x2_t,
+    d1: uint64x2_t,
+    d2: uint64x2_t,
+    d3: uint64x2_t,
+    d4: uint64x2_t,
+) -> [uint64x2_t; 25] {
+    let v0 = veorq_u64(s[0], d0); // lane 0 is not rotated, so a plain XOR is enough
+    let v25 = vxarq_u64::<63>(s[1], d1); // XAR rotates right: <63> equals a left-rotate by 1
+    let v1 = vxarq_u64::<20>(s[6], d1); // in general <N> is a left-rotate by 64 - N
+    let v6 = vxarq_u64::<44>(s[9], d4);
+    let v9 = vxarq_u64::<3>(s[22], d2);
+    let v22 = vxarq_u64::<25>(s[14], d4);
+    let v14 = vxarq_u64::<46>(s[20], d0);
+    let v26 = vxarq_u64::<2>(s[2], d2);
+    let v2 = vxarq_u64::<21>(s[12], d2);
+    let v12 = vxarq_u64::<39>(s[13], d3);
+    let v13 = vxarq_u64::<56>(s[19], d4);
+    let v19 = vxarq_u64::<8>(s[23], d3);
+    let v23 = vxarq_u64::<23>(s[15], d0);
+    let v15 = vxarq_u64::<37>(s[4], d4);
+    let v28 = vxarq_u64::<50>(s[24], d4);
+    let v24 = vxarq_u64::<62>(s[21], d1);
+    let v8 = vxarq_u64::<9>(s[8], d3);
+    let v4 = vxarq_u64::<19>(s[16], d1);
+    let v16 = vxarq_u64::<28>(s[5], d0);
+    let v5 = vxarq_u64::<36>(s[3], d3);
+    let v27 = vxarq_u64::<43>(s[18], d3);
+    let v3 = vxarq_u64::<49>(s[17], d2);
+    let v30 = vxarq_u64::<54>(s[11], d1);
+    let v31 = vxarq_u64::<58>(s[7], d2);
+    let v29 = vxarq_u64::<61>(s[10], d0); // every s[i] is paired with d[i % 5]
+    [
+        v0, v25, v26, v5, v15, v16, v1, v31, v8, v6, v29, v30, v2, v12, v22, v23, v4, v3, v27, v13,
+        v14, v24, v9, v19, v28,
+    ] // element i is derived from s[i]; NOTE(review): v* names presumably mirror the ported asm
+}
 
-            // Write state
-            st1.1d	{{ v0- v3}}, [x0], #32
-            st1.1d	{{ v4- v7}}, [x0], #32
-            st1.1d	{{ v8-v11}}, [x0], #32
-            st1.1d	{{v12-v15}}, [x0], #32
-            st1.1d	{{v16-v19}}, [x0], #32
-            st1.1d	{{v20-v23}}, [x0], #32
-            st1.1d	{{v24}},     [x0]
-        ",
-            inout("x0") state.as_mut_ptr() => _,
-            inout("x1") crate::RC[24-round_count..].as_ptr() => _,
-            inout("x8") round_count => _,
-            clobber_abi("C"),
-            options(nostack)
-        );
-    }
+#[target_feature(enable = "sha3")] // Chi over `t` (read through permuted indices), then Iota
+unsafe fn chi_iota(t: &[uint64x2_t; 25], rc: u64) -> [uint64x2_t; 25] {
+    let rc_v = vdupq_n_u64(rc); // same round constant in both 64-bit halves of the vector
+    let v20 = vbcaxq_u64(t[2], t[14], t[8]); // BCAX(a, b, c) = a ^ (b & !c)
+    let v21 = vbcaxq_u64(t[8], t[15], t[14]);
+    let v22 = vbcaxq_u64(t[14], t[21], t[15]);
+    let v23 = vbcaxq_u64(t[15], t[2], t[21]);
+    let v24 = vbcaxq_u64(t[21], t[8], t[2]);
+    let v17 = vbcaxq_u64(t[11], t[23], t[17]);
+    let v18 = vbcaxq_u64(t[17], t[4], t[23]);
+    let v19 = vbcaxq_u64(t[23], t[5], t[4]);
+    let v15 = vbcaxq_u64(t[4], t[11], t[5]);
+    let v16 = vbcaxq_u64(t[5], t[17], t[11]);
+    let v10 = vbcaxq_u64(t[1], t[13], t[7]);
+    let v11 = vbcaxq_u64(t[7], t[19], t[13]);
+    let v12 = vbcaxq_u64(t[13], t[20], t[19]);
+    let v13 = vbcaxq_u64(t[19], t[1], t[20]);
+    let v14 = vbcaxq_u64(t[20], t[7], t[1]);
+    let v7 = vbcaxq_u64(t[10], t[22], t[16]);
+    let v8 = vbcaxq_u64(t[16], t[3], t[22]);
+    let v9 = vbcaxq_u64(t[22], t[9], t[3]);
+    let v5 = vbcaxq_u64(t[3], t[10], t[9]);
+    let v6 = vbcaxq_u64(t[9], t[16], t[10]);
+    let v3 = vbcaxq_u64(t[18], t[0], t[24]); // outputs 0..=4 mix t[0], t[6], t[12], t[18], t[24]
+    let v4 = vbcaxq_u64(t[24], t[6], t[0]);
+    let v0 = vbcaxq_u64(t[0], t[12], t[6]);
+    let v1 = vbcaxq_u64(t[6], t[18], t[12]);
+    let v2 = vbcaxq_u64(t[12], t[24], t[18]);
+    let v0_iota = veorq_u64(v0, rc_v); // Iota: only lane 0 receives the round constant
+    [
+        v0_iota, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
+        v19, v20, v21, v22, v23, v24,
+    ]
 }
 
 #[cfg(all(test, target_feature = "sha3"))]
-#[allow(clippy::undocumented_unsafe_blocks)]
 mod tests {
     use super::*;
 
@@ -200,9 +207,9 @@ mod tests {
         ];
 
         let mut state = [0u64; 25];
-        unsafe { p1600_armv8_sha3_asm(&mut state, 24) };
+        unsafe { p1600_armv8_sha3(&mut state, 24) };
         assert_eq!(state, state_first);
-        unsafe { p1600_armv8_sha3_asm(&mut state, 24) };
+        unsafe { p1600_armv8_sha3(&mut state, 24) };
         assert_eq!(state, state_second);
     }
 }
diff --git a/keccak/src/lib.rs b/keccak/src/lib.rs
index 2e19bba..e760227 100644
--- a/keccak/src/lib.rs
+++ b/keccak/src/lib.rs
@@ -38,28 +38,6 @@
 //!     0x20D06CD26A8FBF5C,
 //! ]);
 //! ```
-//!
-//! ## Intrinsics support
-//! ### ARMv8 `asm!`
-//! The [`KeccakP1600`] struct supports the use of optimized inline assembly implementations of
-//! specialized intrinsics for ARMv8 CPUs along with runtime CPU feature detection, but requires
-//! setting the `keccak_backend="armv8_asm"` configuration option via `RUSTFLAGS`, e.g. by setting
-//! an environment variable:
-//!
-//! ```console
-//! $ RUSTFLAGS='--cfg keccak_backend="armv8_asm"' cargo build --release
-//! ```
-//!
-//! Or you can persistently configure it for your project in `.cargo/config.toml`:
-//!
-//! ```toml
-//! # In .cargo/config.toml
-//! [build]
-//! rustflags = ['--cfg', 'keccak_backend="armv8_asm"']
-//! ```
-//!
-//! [1]: https://docs.rs/sha3
-//! [2]: https://docs.rs/tiny-keccak
 
 use core::{
     fmt::Debug,
@@ -70,10 +48,10 @@ use core::{
 #[rustfmt::skip]
 mod unroll;
 
-#[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))]
+#[cfg(target_arch = "aarch64")]
 mod armv8;
 
-#[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))]
+#[cfg(target_arch = "aarch64")]
 cpufeatures::new!(armv8_sha3_intrinsics, "sha3");
 
 const PLEN: usize = 25;
@@ -191,7 +169,7 @@ impl_keccak!(p1600, f1600, u64);
 #[derive(Clone, Debug)]
 pub struct KeccakP1600 {
     state: [u64; PLEN],
-    #[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))]
+    #[cfg(target_arch = "aarch64")]
     has_intrinsics: armv8_sha3_intrinsics::InitToken,
 }
 
@@ -202,17 +180,17 @@ impl KeccakP1600 {
     pub fn new(state: [u64; PLEN]) -> Self {
         Self {
             state,
-            #[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))]
+            #[cfg(target_arch = "aarch64")]
             has_intrinsics: armv8_sha3_intrinsics::init(),
         }
     }
 
     /// `Keccak-p[1600, rc]` permutation.
     pub fn p1600(&mut self, round_count: usize) {
-        #[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))]
+        #[cfg(target_arch = "aarch64")]
         if self.has_intrinsics.get() {
             // SAFETY: we just performed runtime CPU feature detection above
-            unsafe { armv8::p1600_armv8_sha3_asm(&mut self.state, round_count) }
+            unsafe { armv8::p1600_armv8_sha3(&mut self.state, round_count) }
             return;
         }
 
@@ -221,10 +199,10 @@ impl KeccakP1600 {
 
     /// `Keccak-f[1600]` permutation.
     pub fn f1600(&mut self) {
-        #[cfg(all(target_arch = "aarch64", keccak_backend = "armv8_asm"))]
+        #[cfg(target_arch = "aarch64")]
         if self.has_intrinsics.get() {
             // SAFETY: we just performed runtime CPU feature detection above
-            unsafe { armv8::p1600_armv8_sha3_asm(&mut self.state, u64::KECCAK_F_ROUND_COUNT) }
+            unsafe { armv8::p1600_armv8_sha3(&mut self.state, u64::KECCAK_F_ROUND_COUNT) }
             return;
         }