Skip to main content

binius_field/arch/x86_64/
packed_ghash_128.rs

// Copyright 2024-2025 Irreducible Inc.
// Copyright 2026 The Binius Developers

//! PCLMULQDQ-accelerated implementation of GHASH for x86_64.
//!
//! This module provides optimized GHASH multiplication using the PCLMULQDQ instruction
//! available on modern x86_64 processors. The implementation follows the algorithm
//! described in the GHASH specification with polynomial x^128 + x^7 + x^2 + x + 1.
//!
//! When the `pclmulqdq` target feature is not enabled, the aliases defined here fall back to
//! the portable implementations instead.
9
10use super::m128::M128;
11#[cfg(not(target_feature = "pclmulqdq"))]
12use crate::arch::portable::univariate_mul_utils_128::{Underlier128bLanes, spread_bits_64};
13// Used by the CLMUL-accelerated `ClMulUnderlier` impl and the `GhashWideMul1x`/`GhashSquare1x`
14// aliases below.
15#[cfg(target_feature = "pclmulqdq")]
16use crate::arch::x86_64::arithmetic::ghash;
17
18/// Widening-multiply wrapper used by the GHASH packing: the reduction-deferring
19/// `GhashClMulWideMul` when PCLMULQDQ is available, otherwise the portable `GhashWideMul` which
20/// also defers reduction for deferred-reduction sum-of-products.
21#[cfg(target_feature = "pclmulqdq")]
22pub type GhashWideMul1x<T> = ghash::GhashClMulWideMul<T>;
23#[cfg(not(target_feature = "pclmulqdq"))]
24pub type GhashWideMul1x<T> = crate::arch::portable::arithmetic::ghash::GhashWideMul<T>;
25
26/// Square wrapper for the `PackedBinaryGhash1x128b` packing: the CLMUL square `GhashClMul` when
27/// PCLMULQDQ is available, otherwise the shared software square `GhashSoftMul`.
28#[cfg(target_feature = "pclmulqdq")]
29pub type GhashSquare1x<T> = ghash::GhashClMul<T>;
30#[cfg(not(target_feature = "pclmulqdq"))]
31pub type GhashSquare1x<T> = crate::arch::portable::arithmetic::ghash::GhashSoftMul<T>;
32
33/// Invert wrapper for the `PackedBinaryGhash1x128b` packing: the shared Itoh-Tsujii inversion
34/// (there is no CLMUL inverse).
35pub type GhashInvert1x<T> = crate::arch::portable::arithmetic::itoh_tsujii::GhashItohTsujii<T>;
36
37/// `Underlier128bLanes` for x86_64 `M128` — required for the portable `GhashWideMul`/`GhashSoftMul`
38/// fallbacks.
39///
40/// Delegates through `u128` (SSE2 load/store) since this path is only active on targets without
41/// PCLMULQDQ, where SIMD lane extraction intrinsics are not necessarily available.
42#[cfg(not(target_feature = "pclmulqdq"))]
43impl Underlier128bLanes for M128 {
44	type U64 = u64;
45
46	#[inline(always)]
47	fn split_hi_lo_64(self) -> (u64, u64) {
48		u128::from(self).split_hi_lo_64()
49	}
50
51	#[inline(always)]
52	fn join_u64s(high: u64, low: u64) -> Self {
53		Self::from(u128::join_u64s(high, low))
54	}
55
56	#[inline(always)]
57	fn broadcast_64(val: u64) -> Self {
58		Self::from(u128::broadcast_64(val))
59	}
60
61	#[inline(always)]
62	fn spread_bits_128(self) -> (Self, Self) {
63		let (hi, lo) = self.split_hi_lo_64();
64		(Self::from(spread_bits_64(hi)), Self::from(spread_bits_64(lo)))
65	}
66}
67
/// CLMUL primitives for `M128`, backed directly by x86_64 intrinsics.
///
/// Compiled only when the `pclmulqdq` target feature is enabled.
#[cfg(target_feature = "pclmulqdq")]
impl ghash::ClMulUnderlier for M128 {
	/// Carry-less multiplication of one 64-bit half of `a` by one 64-bit half of `b`.
	///
	/// `IMM8` is forwarded unchanged to `_mm_clmulepi64_si128`, which uses it to select the
	/// halves.
	#[inline]
	fn clmulepi64<const IMM8: i32>(a: Self, b: Self) -> Self {
		use std::arch::x86_64::_mm_clmulepi64_si128;

		// SAFETY: this impl only exists when the `pclmulqdq` target feature is enabled, which
		// is the sole requirement of `_mm_clmulepi64_si128`.
		let product = unsafe { _mm_clmulepi64_si128::<IMM8>(a.into(), b.into()) };
		product.into()
	}

	/// Shifts `a` left by 8 bytes, moving its low 64 bits into the high half and zeroing the
	/// low half.
	#[inline]
	fn move_64_to_hi(a: Self) -> Self {
		use std::arch::x86_64::_mm_slli_si128;

		// SAFETY: `_mm_slli_si128` requires only SSE2, which is part of the x86_64 baseline
		// feature set.
		let shifted = unsafe { _mm_slli_si128::<8>(a.into()) };
		shifted.into()
	}
}
79}