binius_field/arch/portable/arithmetic/
ghash.rs

1// Copyright 2023-2025 Irreducible Inc.
2// Copyright 2026 The Binius Developers
3
4//! Portable (software) implementation of GHASH field multiplication.
5
6use std::{
7	iter::Sum,
8	ops::{Add, AddAssign, Sub, SubAssign},
9};
10
11use bytemuck::TransparentWrapper;
12
13use super::super::univariate_mul_utils_128::{Underlier64bLanes, Underlier128bLanes, bmul64};
14use crate::{
15	BinaryField128bGhash as GhashB128, WideMul,
16	arch::PackedPrimitiveType,
17	arithmetic_traits::{MulXWide, Square},
18};
19
20/// Multiply two GHASH field elements using software implementation.
21///
22/// Method described at:
23/// * <https://www.bearssl.org/constanttime.html#ghash-for-gcm>
24/// * <https://crypto.stackexchange.com/questions/66448/how-does-bearssls-gcm-modular-reduction-work/66462#66462>
25///
26/// This code does not conform to the bit-endianness requirements of the GCM specification, but is
27/// a valid GHASH field multiplication with the modified representation.
28#[inline]
29pub fn ghash_mul<U: Underlier128bLanes>(x: U, y: U) -> U {
30	ghash_wide_mul(x, y).reduce()
31}
32
33/// Widening multiply: the schoolbook polynomial product of two GHASH field elements, without the
34/// modular reduction. The unreduced result can be accumulated by XOR and reduced once at the end
35/// via [`WideGhashProduct::reduce`].
36#[inline]
37pub fn ghash_wide_mul<U: Underlier128bLanes>(x: U, y: U) -> WideGhashProduct<U> {
38	// Convert to U64x2 representation
39	let (x1, x0) = U::split_hi_lo_64(x);
40	let (y1, y0) = U::split_hi_lo_64(y);
41
42	// Perform multiplication
43	let x0r = x0.reverse_bits_64();
44	let x1r = x1.reverse_bits_64();
45	let x2 = x0 ^ x1;
46	let x2r = x0r ^ x1r;
47
48	let y0r = y0.reverse_bits_64();
49	let y1r = y1.reverse_bits_64();
50	let y2 = y0 ^ y1;
51	let y2r = y0r ^ y1r;
52
53	let z0 = bmul64(y0, x0);
54	let z1 = bmul64(y1, x1);
55	let mut z2 = bmul64(y2, x2);
56
57	let mut z0h = bmul64(y0r, x0r);
58	let mut z1h = bmul64(y1r, x1r);
59	let mut z2h = bmul64(y2r, x2r);
60
61	z2 ^= z0 ^ z1;
62	z2h ^= z0h ^ z1h;
63	z0h = z0h.reverse_bits_64().shr_64(1);
64	z1h = z1h.reverse_bits_64().shr_64(1);
65	z2h = z2h.reverse_bits_64().shr_64(1);
66
67	WideGhashProduct {
68		v0: z0,
69		v1: z0h ^ z2,
70		v2: z1 ^ z2h,
71		v3: z1h,
72	}
73}
74
75#[inline]
76pub fn ghash_square<U: Underlier128bLanes>(x: U) -> U {
77	// Squared value in the polynomial basis is just a value with bits interleaved with zeroes.
78	let (hi, lo) = x.spread_bits_128();
79
80	let (v3, v2) = hi.split_hi_lo_64();
81	let (v1, v0) = lo.split_hi_lo_64();
82
83	reduce_64(v0, v1, v2, v3)
84}
85
86/// Reduce a 256-bit value represented as four 64-bit values by the GHASH polynomial.
87#[inline]
88fn reduce_64<U: Underlier128bLanes>(
89	mut v0: U::U64,
90	mut v1: U::U64,
91	mut v2: U::U64,
92	v3: U::U64,
93) -> U {
94	// Reduce modulo X^64 + X^7 + X^2 + X + 1.
95	v1 ^= v3 ^ v3.shl_64(1) ^ v3.shl_64(2) ^ v3.shl_64(7);
96	v2 ^= v3.shr_64(63) ^ v3.shr_64(62) ^ v3.shr_64(57);
97	v0 ^= v2 ^ v2.shl_64(1) ^ v2.shl_64(2) ^ v2.shl_64(7);
98	v1 ^= v2.shr_64(63) ^ v2.shr_64(62) ^ v2.shr_64(57);
99
100	// Convert back to 128-bit lanes
101	U::join_u64s(v1, v0)
102}
103
104/// An unreduced GHASH product, stored as the four 64-bit limbs `(v0, v1, v2, v3)` of the 256-bit
105/// schoolbook product. Values of this type can be summed by XOR and reduced once at the end via
106/// [`reduce`](WideGhashProduct::reduce).
107#[derive(Clone, Copy, Default, Debug)]
108pub struct WideGhashProduct<U: Underlier128bLanes> {
109	v0: U::U64,
110	v1: U::U64,
111	v2: U::U64,
112	v3: U::U64,
113}
114
115impl<U: Underlier128bLanes> WideGhashProduct<U> {
116	/// Reduce the accumulated wide product to a single GF(2^128) element.
117	#[inline]
118	pub fn reduce(self) -> U {
119		reduce_64(self.v0, self.v1, self.v2, self.v3)
120	}
121}
122
123impl<U: Underlier128bLanes> MulXWide for WideGhashProduct<U> {
124	/// Shifts the 256-bit schoolbook product left by one bit, carrying between the four 64-bit
125	/// limbs.
126	///
127	/// The product of two 128-bit polynomials has degree at most 254, and XOR-accumulating such
128	/// products preserves that, so the top bit of `v3` is always clear and nothing is shifted out.
129	#[inline]
130	fn mul_x_wide(self) -> Self {
131		Self {
132			v0: self.v0.shl_64(1),
133			v1: self.v1.shl_64(1) ^ self.v0.shr_64(63),
134			v2: self.v2.shl_64(1) ^ self.v1.shr_64(63),
135			v3: self.v3.shl_64(1) ^ self.v2.shr_64(63),
136		}
137	}
138}
139
140impl<U: Underlier128bLanes> Add for WideGhashProduct<U> {
141	type Output = Self;
142
143	#[inline]
144	fn add(self, rhs: Self) -> Self {
145		Self {
146			v0: self.v0 ^ rhs.v0,
147			v1: self.v1 ^ rhs.v1,
148			v2: self.v2 ^ rhs.v2,
149			v3: self.v3 ^ rhs.v3,
150		}
151	}
152}
153
154impl<U: Underlier128bLanes> AddAssign for WideGhashProduct<U> {
155	#[inline]
156	fn add_assign(&mut self, rhs: Self) {
157		self.v0 ^= rhs.v0;
158		self.v1 ^= rhs.v1;
159		self.v2 ^= rhs.v2;
160		self.v3 ^= rhs.v3;
161	}
162}
163
164impl<U: Underlier128bLanes> Sum for WideGhashProduct<U> {
165	#[inline]
166	fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
167		iter.fold(Self::default(), |acc, x| acc + x)
168	}
169}
170
171// In characteristic 2, subtraction is identical to addition (XOR).
172impl<U: Underlier128bLanes> Sub for WideGhashProduct<U> {
173	type Output = Self;
174
175	#[inline]
176	fn sub(self, rhs: Self) -> Self {
177		Self {
178			v0: self.v0 ^ rhs.v0,
179			v1: self.v1 ^ rhs.v1,
180			v2: self.v2 ^ rhs.v2,
181			v3: self.v3 ^ rhs.v3,
182		}
183	}
184}
185
186impl<U: Underlier128bLanes> SubAssign for WideGhashProduct<U> {
187	#[inline]
188	fn sub_assign(&mut self, rhs: Self) {
189		self.v0 ^= rhs.v0;
190		self.v1 ^= rhs.v1;
191		self.v2 ^= rhs.v2;
192		self.v3 ^= rhs.v3;
193	}
194}
195
196/// Widening-multiply wrapper for the portable GHASH packing.
197///
198/// [`wide_mul`](WideMul::wide_mul) computes the unreduced schoolbook product via
199/// [`ghash_wide_mul`], and [`reduce`](WideMul::reduce) performs the GHASH modular reduction. This
200/// defers the reduction so a sum of products is reduced only once.
201#[repr(transparent)]
202#[derive(bytemuck::TransparentWrapper)]
203pub struct GhashWideMul<T>(T);
204
205impl<U: Underlier128bLanes> WideMul for GhashWideMul<PackedPrimitiveType<U, GhashB128>> {
206	type Output = WideGhashProduct<U>;
207
208	#[inline]
209	fn wide_mul(a: Self, b: Self) -> Self::Output {
210		let a = PackedPrimitiveType::peel(Self::peel(a));
211		let b = PackedPrimitiveType::peel(Self::peel(b));
212		ghash_wide_mul(a, b)
213	}
214
215	#[inline]
216	fn reduce(wide: Self::Output) -> Self {
217		Self::wrap(PackedPrimitiveType::wrap(wide.reduce()))
218	}
219}
220
221/// Square strategy wrapper for the software GHASH implementation.
222///
223/// Shared by the portable and wasm32 packings and used by the x86_64 packing when CLMUL is
224/// unavailable. Squares via the bit-spread [`ghash_square`], which interleaves the input bits with
225/// zeroes and reduces — no carryless multiply required.
226#[repr(transparent)]
227#[derive(TransparentWrapper)]
228pub struct GhashSoftMul<T>(T);
229
230impl<U: Underlier128bLanes> Square for GhashSoftMul<PackedPrimitiveType<U, GhashB128>> {
231	#[inline]
232	fn square(self) -> Self {
233		Self::wrap(PackedPrimitiveType::wrap(ghash_square(PackedPrimitiveType::peel(Self::peel(
234			self,
235		)))))
236	}
237}
238
#[cfg(test)]
mod tests {
	use proptest::{prelude::any, proptest};

	use super::{super::super::m128::M128, MulXWide, ghash_mul, ghash_wide_mul};

	// These properties drive the deferred wide-mul building blocks (`ghash_wide_mul` and
	// `WideGhashProduct`) that `GhashWideMul` wraps, using the portable `M128` directly so they
	// run on every host. The portable `PackedBinaryGhash1x128b` is only a usable `PackedField`
	// on targets where it is the re-exported b128 type (covered there by the proptests in
	// `packed_ghash.rs`).
	proptest! {
		// Splitting the multiply must not change the answer: wide-multiply followed by reduce
		// equals the fused `ghash_mul`.
		#[test]
		fn wide_mul_then_reduce_matches_ghash_mul(a in any::<u128>(), b in any::<u128>()) {
			let lhs = M128::from(a);
			let rhs = M128::from(b);

			let deferred = ghash_wide_mul(lhs, rhs).reduce();
			let fused = ghash_mul(lhs, rhs);
			assert_eq!(deferred, fused);
		}

		// Two unreduced products are summed and reduced a single time.
		#[test]
		fn wide_mul_deferred_accumulation(
			a1 in any::<u128>(), b1 in any::<u128>(),
			a2 in any::<u128>(), b2 in any::<u128>(),
		) {
			let first = (M128::from(a1), M128::from(b1));
			let second = (M128::from(a2), M128::from(b2));

			let accumulated = ghash_wide_mul(first.0, first.1) + ghash_wide_mul(second.0, second.1);
			let expected = ghash_mul(first.0, first.1) ^ ghash_mul(second.0, second.1);
			assert_eq!(accumulated.reduce(), expected);
		}

		// Scaling by X commutes with reduction: scaling the unreduced product gives the same
		// result as multiplying the reduced product by X (the field element 2).
		#[test]
		fn mul_x_wide_commutes_with_reduce(a in any::<u128>(), b in any::<u128>()) {
			let product = ghash_wide_mul(M128::from(a), M128::from(b));

			let scaled_then_reduced = product.mul_x_wide().reduce();
			let reduced_then_scaled = ghash_mul(product.reduce(), M128::from(2u128));
			assert_eq!(scaled_then_reduced, reduced_then_scaled);
		}
	}
}
binius_field/arch/portable/arithmetic/ghash.rs

binius_field/arch/portable/arithmetic/
ghash.rs