poly1305/backend/avx2/helpers.rs
1//! AVX2 helpers for implementing Poly1305 using 26-bit limbs.
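//!
//! A 130-bit value `x` is stored in radix 2^26 as five limbs
//! `x = x_0 + 2^26·x_1 + 2^52·x_2 + 2^78·x_3 + 2^104·x_4`, one limb per 32-bit
//! lane (64-bit lanes for unreduced products). Keeping each limb below 2^26
//! leaves headroom so that limb products and their sums fit in 64 bits without
//! overflow.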
2
3use core::fmt;
4use core::ops::{Add, Mul};
5
6#[cfg(target_arch = "x86")]
7use core::arch::x86::*;
8#[cfg(target_arch = "x86_64")]
9use core::arch::x86_64::*;
10
11use super::ParBlocks;
12use crate::{Block, Key};
13
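/// Packs four 2-bit lane indices into an immediate in the same way as
/// `_MM_SHUFFLE`, for use with `_mm256_permute4x64_epi64` and friends.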
14const fn set02(x3: u8, x2: u8, x1: u8, x0: u8) -> i32 {
15 (((x3) << 6) | ((x2) << 4) | ((x1) << 2) | (x0)) as i32
16}
17
18/// Helper for Display impls of aligned values.
19fn write_130(f: &mut fmt::Formatter<'_>, limbs: [u32; 5]) -> fmt::Result {
20 let r0 = limbs[0] as u128;
21 let r1 = limbs[1] as u128;
22 let r2 = limbs[2] as u128;
23 let r3 = limbs[3] as u128;
24 let r4 = limbs[4] as u128;
25
    // Recombine the limbs into a 130-bit value held in two u128s.
27 let l0 = r0 + (r1 << 26) + (r2 << 52) + (r3 << 78);
28 let (l0, c) = l0.overflowing_add(r4 << 104);
29 let l1 = (r4 >> 24) + if c { 1 } else { 0 };
30
31 write!(f, "0x{:02x}{:032x}", l1, l0)
32}
33
34/// Helper for Display impls of unreduced values.
35fn write_130_wide(f: &mut fmt::Formatter<'_>, limbs: [u64; 5]) -> fmt::Result {
36 let r0 = limbs[0] as u128;
37 let r1 = limbs[1] as u128;
38 let r2 = limbs[2] as u128;
39 let r3 = limbs[3] as u128;
40 let r4 = limbs[4] as u128;
41
    // Recombine the limbs into a 130-bit value held in two u128s.
43 let l0 = r0 + (r1 << 26) + (r2 << 52);
44 let (l0, c1) = l0.overflowing_add(r3 << 78);
45 let (l0, c2) = l0.overflowing_add(r4 << 104);
46 let l1 = (r3 >> 50) + (r4 >> 24) + if c1 { 1 } else { 0 } + if c2 { 1 } else { 0 };
47
48 write!(f, "0x{:02x}{:032x}", l1, l0)
49}
50
51/// Derives the Poly1305 addition and polynomial keys.
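///
/// The addition key is spread one 32-bit word per 64-bit lane, while the
/// polynomial key `R` is clamped and split into 26-bit limbs.
///
/// # Example
///
/// A minimal usage sketch (assuming the caller has already verified AVX2 support
/// at runtime; `key` is a hypothetical `Key` value):
///
/// ```ignore
/// let (addition_key, r) = unsafe { prepare_keys(&key) };
/// ```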
52#[target_feature(enable = "avx2")]
53pub(super) unsafe fn prepare_keys(key: &Key) -> (AdditionKey, PrecomputedMultiplier) {
54 // [k7, k6, k5, k4, k3, k2, k1, k0]
55 let key = _mm256_loadu_si256(key.as_ptr() as *const _);
56
57 // Prepare addition key: [0, k7, 0, k6, 0, k5, 0, k4]
58 let k = AdditionKey(_mm256_and_si256(
59 _mm256_permutevar8x32_epi32(key, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)),
60 _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1),
61 ));
62
    // Prepare polynomial key R = k & 0x0ffffffc0ffffffc0ffffffc0fffffff:
64 let r = Aligned130::new(_mm256_and_si256(
65 key,
66 _mm256_set_epi32(0, 0, 0, 0, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc, 0x0fffffff),
67 ));
68
69 (k, r.into())
70}
71
72/// A 130-bit integer aligned across five 26-bit limbs.
73///
74/// The top three 32-bit words of the underlying 256-bit vector are ignored.
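///
/// Limb layout across 32-bit words: `[0, 0, 0, x_4, x_3, x_2, x_1, x_0]`, with
/// each limb normally below 2^26 (unreduced additions may exceed this slightly).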
75#[derive(Clone, Copy, Debug)]
76pub(super) struct Aligned130(pub(super) __m256i);
77
78impl fmt::Display for Aligned130 {
79 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
80 let mut v0 = [0u8; 32];
81 unsafe {
82 _mm256_storeu_si256(v0.as_mut_ptr() as *mut _, self.0);
83 }
84
85 write!(f, "Aligned130(")?;
86 write_130(
87 f,
88 [
89 u32::from_le_bytes(v0[0..4].try_into().unwrap()),
90 u32::from_le_bytes(v0[4..8].try_into().unwrap()),
91 u32::from_le_bytes(v0[8..12].try_into().unwrap()),
92 u32::from_le_bytes(v0[12..16].try_into().unwrap()),
93 u32::from_le_bytes(v0[16..20].try_into().unwrap()),
94 ],
95 )?;
96 write!(f, ")")
97 }
98}
99
100impl Aligned130 {
101 /// Aligns a 16-byte Poly1305 block at 26-bit boundaries within 32-bit words, and sets
102 /// the high bit.
103 #[target_feature(enable = "avx2")]
104 pub(super) unsafe fn from_block(block: &Block) -> Self {
105 Aligned130::new(_mm256_or_si256(
106 _mm256_and_si256(
107 // Load the 128-bit block into a 256-bit vector.
108 _mm256_castsi128_si256(_mm_loadu_si128(block.as_ptr() as *const _)),
109 // Mask off the upper 128 bits (undefined by _mm256_castsi128_si256).
110 _mm256_set_epi64x(0, 0, -1, -1),
111 ),
112 // Set the high bit.
113 _mm256_set_epi64x(0, 1, 0, 0),
114 ))
115 }
116
117 /// Aligns a partial Poly1305 block at 26-bit boundaries within 32-bit words.
118 ///
119 /// Assumes that the high bit is already correctly set for the partial block.
120 #[target_feature(enable = "avx2")]
121 pub(super) unsafe fn from_partial_block(block: &Block) -> Self {
122 Aligned130::new(_mm256_and_si256(
123 // Load the 128-bit block into a 256-bit vector.
124 _mm256_castsi128_si256(_mm_loadu_si128(block.as_ptr() as *const _)),
125 // Mask off the upper 128 bits (undefined by _mm256_castsi128_si256).
126 _mm256_set_epi64x(0, 0, -1, -1),
127 ))
128 }
129
130 /// Splits a 130-bit integer into five 26-bit limbs.
131 #[target_feature(enable = "avx2")]
132 unsafe fn new(x: __m256i) -> Self {
133 // Starting from a 130-bit integer split across 32-bit words:
134 // [0, 0, 0, [0; 30] || x4[2..0], x3, x2, x1, x0]
135
        // - Grab the low bits of each word:
        //   xl = [
138 // [0; 32],
139 // [0; 32],
140 // [0; 32],
141 // [0; 6] || x4[ 2..0] || [0; 24],
142 // x3[14..0] || [0; 18],
143 // x2[20..0] || [0; 12],
144 // x1[26..0] || [0; 6],
145 // x0,
146 // ]
147 let xl = _mm256_sllv_epi32(x, _mm256_set_epi32(32, 32, 32, 24, 18, 12, 6, 0));
148
        // - Grab the high bits of each word, rotated up by one word:
        //   xh = [
151 // [0; 32],
152 // [0; 32],
153 // [0; 32],
154 // [0; 8] || x3[32.. 8]
155 // [0; 14] || x2[32..14]
156 // [0; 20] || x1[32..20]
157 // [0; 26] || x0[32..26],
158 // [0; 32],
159 // ]
160 let xh = _mm256_permutevar8x32_epi32(
161 _mm256_srlv_epi32(x, _mm256_set_epi32(32, 32, 32, 2, 8, 14, 20, 26)),
162 _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7),
163 );
164
165 // - Combine the low and high bits:
166 // [
167 // [0; 32],
168 // [0; 32],
169 // [0; 32],
170 // [0; 6] || x4[ 2..0] || x3[32.. 8]
171 // x3[14..0] || x2[32..14]
172 // x2[20..0] || x1[32..20]
173 // x1[26..0] || x0[32..26],
174 // x0,
175 // ]
176 // - Mask to 26 bits:
177 // [
178 // [0; 32],
179 // [0; 32],
180 // [0; 32],
181 // [0; 6] || x4[ 2..0] || x3[32.. 8]
182 // [0; 6] || x3[ 8..0] || x2[32..14]
183 // [0; 6] || x2[14..0] || x1[32..20]
184 // [0; 6] || x1[20..0] || x0[32..26],
185 // [0; 6] || x0[26..0],
186 // ]
187 Aligned130(_mm256_and_si256(
188 _mm256_or_si256(xl, xh),
189 _mm256_set_epi32(
190 0, 0, 0, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff,
191 ),
192 ))
193 }
194}
195
196impl Add<Aligned130> for Aligned130 {
197 type Output = Aligned130;
198
199 fn add(self, other: Aligned130) -> Aligned130 {
200 // With 26-bit limbs inside 32-bit words, there is plenty of space for unreduced
201 // addition.
202 unsafe { Aligned130(_mm256_add_epi32(self.0, other.0)) }
203 }
204}
205
206/// A pre-computed multiplier.
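///
/// `a` holds the limbs `[5·r_4, 5·r_3, 5·r_2, r_4, r_3, r_2, r_1, r_0]` and
/// `a_5` broadcasts `5·r_1` to every lane, so the multiplication routines can
/// read both `r_i` and `5·r_i` factors directly from vector lanes.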
207#[derive(Clone, Copy, Debug)]
208pub(super) struct PrecomputedMultiplier {
209 pub(super) a: __m256i,
210 pub(super) a_5: __m256i,
211}
212
213impl From<Aligned130> for PrecomputedMultiplier {
214 fn from(r: Aligned130) -> Self {
215 unsafe {
216 // Precompute 5*R.
217 //
            // The 5-limb representation (r_4, r_3, r_2, r_1, r_0) of R and the
            // precomputed multiples (5·r_4, 5·r_3, 5·r_2, 5·r_1) are packed into
            // two 256-bit vectors as follows:
            //   a:   [5·r_4, 5·r_3, 5·r_2, r_4, r_3, r_2, r_1, r_0]
            //   a_5: [5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1]
223 let a_5 = _mm256_permutevar8x32_epi32(
224 _mm256_add_epi32(r.0, _mm256_slli_epi32(r.0, 2)),
225 _mm256_set_epi32(4, 3, 2, 1, 1, 1, 1, 1),
226 );
227 let a = _mm256_blend_epi32(r.0, a_5, 0b11100000);
228 let a_5 = _mm256_permute2x128_si256(a_5, a_5, 0);
229
230 PrecomputedMultiplier { a, a_5 }
231 }
232 }
233}
234
235impl Mul<PrecomputedMultiplier> for PrecomputedMultiplier {
236 type Output = Unreduced130;
237
238 fn mul(self, other: PrecomputedMultiplier) -> Unreduced130 {
239 // Pass through to `self.a` for multiplication.
240 Aligned130(self.a) * other
241 }
242}
243
244impl Mul<PrecomputedMultiplier> for Aligned130 {
245 type Output = Unreduced130;
246
    /// Multiplies two values using lazy reduction.
    ///
    /// Switches context from 32-bit to 64-bit limbs.
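    ///
    /// Limb products that would land at position 5 or above are folded back down
    /// multiplied by 5, using 2^130 ≡ 5 (mod 2^130 - 5); this is where the
    /// `5·r_i` terms come from.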
250 #[inline(always)]
251 fn mul(self, other: PrecomputedMultiplier) -> Unreduced130 {
252 unsafe {
253 // Starting with the following limb layout:
254 // x = [ 0, 0, 0, x_4, x_3, x_2, x_1, x_0]
255 // y = [5·r_4, 5·r_3, 5·r_2, r_4, r_3, r_2, r_1, r_0]
256 // z = [5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1]
257 let x = self.0;
258 let y = other.a;
259 let z = other.a_5;
260
261 // [ 0, x_4, 0, x_3, 0, x_2, 0, x_1] (32-bit words)
262 // * 5·r_4
263 // = [5·r_4·x_4, 5·r_4·x_3, 5·r_4·x_2, 5·r_4·x_1] (64-bit words)
264 let v0 = _mm256_mul_epu32(
265 _mm256_permutevar8x32_epi32(x, _mm256_set_epi64x(4, 3, 2, 1)),
266 _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(7, 7, 7, 7)),
267 );
268 // [ 0, x_3, 0, x_2, 0, x_1, 0, x_0] (32-bit words)
269 // * r_0
270 // = [ r_0·x_3, r_0·x_2, r_0·x_1, r_0·x_0] (64-bit words)
271 // + previous step
272 // = [
273 // r_0·x_3 + 5·r_4·x_4,
274 // r_0·x_2 + 5·r_4·x_3,
275 // r_0·x_1 + 5·r_4·x_2,
276 // r_0·x_0 + 5·r_4·x_1,
277 // ]
278 let v0 = _mm256_add_epi64(
279 v0,
280 _mm256_mul_epu32(
281 _mm256_permutevar8x32_epi32(x, _mm256_set_epi64x(3, 2, 1, 0)),
282 _mm256_broadcastd_epi32(_mm256_castsi256_si128(y)),
283 ),
284 );
285 // [ 0, x_1, 0, x_1, 0, x_3, 0, x_3]
286 // * [ 0, r_2, 0, r_1, 0, 5·r_3, 0, 5·r_2]
287 // = [r_2·x_1, r_1·x_1, 5·r_3·x_3, 5·r_2·x_3]
288 // + previous step
289 // = [
290 // r_0·x_3 + r_2·x_1 + 5·r_4·x_4,
291 // r_0·x_2 + r_1·x_1 + 5·r_4·x_3,
292 // r_0·x_1 + 5·r_3·x_3 + 5·r_4·x_2,
293 // r_0·x_0 + 5·r_2·x_3 + 5·r_4·x_1,
294 // ]
295 let v0 = _mm256_add_epi64(
296 v0,
297 _mm256_mul_epu32(
298 _mm256_permutevar8x32_epi32(x, _mm256_set_epi64x(1, 1, 3, 3)),
299 _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(2, 1, 6, 5)),
300 ),
301 );
302 // [x_3, x_2, x_1, x_0, x_1, x_0, 0, x_4]
303 // * [ 0, r_1, 0, r_2, 0, r_1, 5·r_1, 5·r_1]
304 // = [ r_1·x_2, r_2·x_0, r_1·x_0, 5·r_1·x_4]
305 // + previous step
306 // = [
307 // r_0·x_3 + r_1·x_2 + r_2·x_1 + 5·r_4·x_4,
308 // r_0·x_2 + r_1·x_1 + r_2·x_0 + 5·r_4·x_3,
309 // r_0·x_1 + r_1·x_0 + 5·r_3·x_3 + 5·r_4·x_2,
310 // r_0·x_0 + 5·r_1·x_4 + 5·r_2·x_3 + 5·r_4·x_1,
311 // ]
312 let v0 = _mm256_add_epi64(
313 v0,
314 _mm256_mul_epu32(
315 _mm256_permute4x64_epi64(x, set02(1, 0, 0, 2)),
316 _mm256_blend_epi32(
317 _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(1, 2, 1, 1)),
318 z,
319 0x03,
320 ),
321 ),
322 );
323 // [x_1, x_0, 0, x_4, 0, x_4, x_3, x_2]
324 // * [ 0, r_3, 0, 5·r_3, 0, 5·r_2, 0, 5·r_3]
325 // = [ r_3·x_0, 5·r_3·x_4, 5·r_2·x_4, 5·r_3·x_2]
326 // + previous step
327 // v0 = [
328 // r_0·x_3 + r_1·x_2 + r_2·x_1 + r_3·x_0 + 5·r_4·x_4,
329 // r_0·x_2 + r_1·x_1 + r_2·x_0 + 5·r_3·x_4 + 5·r_4·x_3,
330 // r_0·x_1 + r_1·x_0 + 5·r_2·x_4 + 5·r_3·x_3 + 5·r_4·x_2,
331 // r_0·x_0 + 5·r_1·x_4 + 5·r_2·x_3 + 5·r_3·x_2 + 5·r_4·x_1,
332 // ]
333 let v0 = _mm256_add_epi64(
334 v0,
335 _mm256_mul_epu32(
336 _mm256_permute4x64_epi64(x, set02(0, 2, 2, 1)),
337 _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(3, 6, 5, 6)),
338 ),
339 );
340
341 // [ 0, x_3, 0, x_2, 0, x_1, 0, x_0]
342 // * [ 0, r_1, 0, r_2, 0, r_3, 0, r_4]
343 // = [r_1·x_3, r_2·x_2, r_3·x_1, r_4·x_0]
344 let v1 = _mm256_mul_epu32(
345 _mm256_permutevar8x32_epi32(x, _mm256_set_epi64x(3, 2, 1, 0)),
346 _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(1, 2, 3, 4)),
347 );
348 // [r_3·x_1, r_4·x_0, r_1·x_3, r_2·x_2]
349 // + previous step
350 // = [
351 // r_1·x_3 + r_3·x_1,
352 // r_2·x_2 + r_4·x_0,
353 // r_1·x_3 + r_3·x_1,
354 // r_2·x_2 + r_4·x_0,
355 // ]
356 let v1 = _mm256_add_epi64(v1, _mm256_permute4x64_epi64(v1, set02(1, 0, 3, 2)));
357 // [
358 // r_2·x_2 + r_4·x_0,
359 // r_2·x_2 + r_4·x_0,
360 // r_2·x_2 + r_4·x_0,
361 // r_1·x_3 + r_3·x_1,
362 // ]
363 // + previous step
364 // = [
365 // r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
366 // 2·r_2·x_2 + 2·r_4·x_0,
367 // r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
368 // r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
369 // ]
370 let v1 = _mm256_add_epi64(v1, _mm256_permute4x64_epi64(v1, set02(0, 0, 0, 1)));
371 // [ x_1, x_0, x_1, x_0, x_1, x_0, 0, x_4]
372 // * [5·r_4, 5·r_3, 5·r_2, r_4, r_3, r_2, r_1, r_0]
373 // = [ 5·r_3·x_0, r_4·x_0, r_2·x_0, r_0·x_4]
374 // + previous step
375 // v1 = [
376 // 5·r_3·x_0 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
377 // 2·r_2·x_2 + 3·r_4·x_0,
378 // r_2·x_0 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
379 // r_0·x_4 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
380 // ]
381 let v1 = _mm256_add_epi64(
382 v1,
383 _mm256_mul_epu32(_mm256_permute4x64_epi64(x, set02(0, 0, 0, 2)), y),
384 );
385
386 // The result:
387 // v1 = [
388 // 5·r_3·x_0 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
389 // 2·r_2·x_2 + 3·r_4·x_0,
390 // r_2·x_0 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
391 // r_0·x_4 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
392 // ]
393 // v0 = [
394 // r_0·x_3 + r_1·x_2 + r_2·x_1 + r_3·x_0 + 5·r_4·x_4,
395 // r_0·x_2 + r_1·x_1 + r_2·x_0 + 5·r_3·x_4 + 5·r_4·x_3,
396 // r_0·x_1 + r_1·x_0 + 5·r_2·x_4 + 5·r_3·x_3 + 5·r_4·x_2,
397 // r_0·x_0 + 5·r_1·x_4 + 5·r_2·x_3 + 5·r_3·x_2 + 5·r_4·x_1,
398 // ]
            // This corresponds to (3) in Goll & Gueron 2015:
400 // v1 = [ _, _, _, t_4]
401 // v0 = [t_3, t_2, t_1, t_0]
402 Unreduced130 { v0, v1 }
403 }
404 }
405}
406
407/// The unreduced output of an `Aligned130` multiplication.
408///
409/// Represented internally with 64-bit limbs.
410#[derive(Copy, Clone, Debug)]
411pub(super) struct Unreduced130 {
412 v0: __m256i,
413 v1: __m256i,
414}
415
416impl fmt::Display for Unreduced130 {
417 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
418 let mut v0 = [0u8; 32];
419 let mut v1 = [0u8; 32];
420 unsafe {
421 _mm256_storeu_si256(v0.as_mut_ptr() as *mut _, self.v0);
422 _mm256_storeu_si256(v1.as_mut_ptr() as *mut _, self.v1);
423 }
424
425 write!(f, "Unreduced130(")?;
426 write_130_wide(
427 f,
428 [
429 u64::from_le_bytes(v0[0..8].try_into().unwrap()),
430 u64::from_le_bytes(v0[8..16].try_into().unwrap()),
431 u64::from_le_bytes(v0[16..24].try_into().unwrap()),
432 u64::from_le_bytes(v0[24..32].try_into().unwrap()),
433 u64::from_le_bytes(v1[0..8].try_into().unwrap()),
434 ],
435 )?;
436 write!(f, ")")
437 }
438}
439
440impl Unreduced130 {
    /// Reduces x modulo 2^130 - 5.
    ///
    /// Switches context from 64-bit back to 32-bit limbs.
444 #[inline(always)]
445 pub(super) fn reduce(self) -> Aligned130 {
446 unsafe {
447 // Starting with the following limb layout:
448 // self.v1 = [ _, _, _, t_4]
449 // self.v0 = [t_3, t_2, t_1, t_0]
450 let (red_1, red_0) = adc(self.v1, self.v0);
451 let (red_1, red_0) = red(red_1, red_0);
452 let (red_1, red_0) = adc(red_1, red_0);
453
454 // - Switch context from 64-bit limbs to 32-bit limbs:
455 Aligned130(_mm256_blend_epi32(
456 _mm256_permutevar8x32_epi32(red_0, _mm256_set_epi32(0, 6, 4, 0, 6, 4, 2, 0)),
457 _mm256_permutevar8x32_epi32(red_1, _mm256_set_epi32(0, 6, 4, 0, 6, 4, 2, 0)),
458 0x90,
459 ))
460 }
461 }
462}
463
/// Carry chain: propagates each limb's overflow above 26 bits into the next limb.
465#[inline(always)]
466unsafe fn adc(v1: __m256i, v0: __m256i) -> (__m256i, __m256i) {
467 // [t_3, t_2 % 2^26, t_1 % 2^26, t_0 % 2^26]
468 // + [t_2 >> 26, t_1 >> 26, t_0 >> 26, 0 ]
469 // = [
470 // t_3 + t_2 >> 26,
471 // t_2 % 2^26 + t_1 >> 26,
472 // t_1 % 2^26 + t_0 >> 26,
473 // t_0 % 2^26,
474 // ]
475 let v0 = _mm256_add_epi64(
476 _mm256_and_si256(v0, _mm256_set_epi64x(-1, 0x3ffffff, 0x3ffffff, 0x3ffffff)),
477 _mm256_permute4x64_epi64(
478 _mm256_srlv_epi64(v0, _mm256_set_epi64x(64, 26, 26, 26)),
479 set02(2, 1, 0, 3),
480 ),
481 );
482 // [_, _, _, t_4]
483 // + [
484 // (t_2 % 2^26 + t_1 >> 26) >> 26,
485 // (t_1 % 2^26 + t_0 >> 26) >> 26,
486 // (t_0 % 2^26 ) >> 26,
487 // (t_3 + t_2 >> 26) >> 26,
488 // ]
489 // = [_, _, _, t_4 + (t_3 + t_2 >> 26) >> 26]
490 let v1 = _mm256_add_epi64(
491 v1,
492 _mm256_permute4x64_epi64(_mm256_srli_epi64(v0, 26), set02(2, 1, 0, 3)),
493 );
494 // [
495 // (t_3 + t_2 >> 26) % 2^26,
496 // t_2 % 2^26 + t_1 >> 26,
497 // t_1 % 2^26 + t_0 >> 26,
498 // t_0 % 2^26,
499 // ]
500 let chain = _mm256_and_si256(v0, _mm256_set_epi64x(0x3ffffff, -1, -1, -1));
501
502 (v1, chain)
503}
504
/// Partial reduction modulo 2^130 - 5.
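///
/// The bits of `t_4` above 2^26 are folded back into `t_0` as `5·(t_4 >> 26)`,
/// using 2^130 ≡ 5 (mod 2^130 - 5).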
506#[inline(always)]
507unsafe fn red(v1: __m256i, v0: __m256i) -> (__m256i, __m256i) {
508 // t = [0, 0, 0, t_4 >> 26]
509 let t = _mm256_srlv_epi64(v1, _mm256_set_epi64x(64, 64, 64, 26));
510 // v0 + 5·t = [t_3, t_2, t_1, t_0 + 5·(t_4 >> 26)]
511 let red_0 = _mm256_add_epi64(_mm256_add_epi64(v0, t), _mm256_slli_epi64(t, 2));
512 // [0, 0, 0, t_4 % 2^26]
513 let red_1 = _mm256_and_si256(v1, _mm256_set_epi64x(0, 0, 0, 0x3ffffff));
514 (red_1, red_0)
515}
516
517/// A pair of `Aligned130`s.
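///
/// Used to process two Poly1305 blocks per iteration: `mul_and_sum` multiplies
/// the two halves by two separate multipliers and sums the products.
///
/// A minimal sketch of the intended call pattern (assuming `blocks`, `r1`, and
/// `r2` are already prepared, and AVX2 support has been verified):
///
/// ```ignore
/// let acc = unsafe { Aligned2x130::from_blocks(&blocks) }
///     .mul_and_sum(r1, r2)
///     .reduce();
/// ```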
518#[derive(Clone, Debug)]
519pub(super) struct Aligned2x130 {
520 v0: Aligned130,
521 v1: Aligned130,
522}
523
524impl fmt::Display for Aligned2x130 {
525 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
526 writeln!(f, "Aligned2x130([")?;
527 writeln!(f, " {},", self.v0)?;
528 writeln!(f, " {},", self.v1)?;
529 write!(f, "])")
530 }
531}
532
533impl Aligned2x130 {
534 /// Aligns two 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and
535 /// sets the high bit for each block.
    ///
    /// The number of blocks is enforced by the `&[Block; 2]` argument type.
540 #[target_feature(enable = "avx2")]
541 pub(super) unsafe fn from_blocks(src: &[Block; 2]) -> Self {
542 Aligned2x130 {
543 v0: Aligned130::from_block(&src[0]),
544 v1: Aligned130::from_block(&src[1]),
545 }
546 }
547
    /// Multiplies the two halves by their respective multipliers and sums both
    /// products, using lazy reduction (`self.v0·r2 + self.v1·r1`).
    ///
    /// Switches context from 32-bit to 64-bit limbs.
551 #[inline(always)]
552 pub(super) fn mul_and_sum(
553 self,
554 r1: PrecomputedMultiplier,
555 r2: PrecomputedMultiplier,
556 ) -> Unreduced130 {
557 unsafe {
558 // Starting with the following limb layout:
559 // x.v1 = [ 0, 0, 0, x1_4, x1_3, x1_2, x1_1, x1_0]
560 // x.v0 = [ 0, 0, 0, x0_4, x0_3, x0_2, x0_1, x0_0]
561 // r1 = [5·r1_4, 5·r1_3, 5·r1_2, r1_4, r1_3, r1_2, r1_1, r1_0]
562 // r15 = [5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1]
563 // r2 = [5·r2_4, 5·r2_3, 5·r2_2, r2_4, r2_3, r2_2, r2_1, r2_0]
564 // r25 = [5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1]
565 let x = self;
566 let r15 = r1.a_5;
567 let r25 = r2.a_5;
568 let r1 = r1.a;
569 let r2 = r2.a;
570
571 // v0 = [
572 // 5·x0_4·r2_4,
573 // 5·x0_3·r2_4,
574 // 5·x0_2·r2_4,
575 // 5·x0_1·r2_4,
576 // ]
577 let mut v0 = _mm256_mul_epu32(
578 // [_, x0_4, _, x0_3, _, x0_2, _, x0_1]
579 _mm256_permutevar8x32_epi32(x.v0.0, _mm256_set_epi64x(4, 3, 2, 1)),
580 // [_, 5·r2_4, _, 5·r2_4, _, 5·r2_4, _, 5·r2_4]
581 _mm256_permutevar8x32_epi32(r2, _mm256_set1_epi64x(7)),
582 );
583 // v1 = [
584 // 5·x1_4·r1_4,
585 // 5·x1_3·r1_4,
586 // 5·x1_2·r1_4,
587 // 5·x1_1·r1_4,
588 // ]
589 let mut v1 = _mm256_mul_epu32(
590 // [_, x1_4, _, x1_3, _, x1_2, _, x1_1]
591 _mm256_permutevar8x32_epi32(x.v1.0, _mm256_set_epi64x(4, 3, 2, 1)),
592 // [_, 5·r1_4, _, 5·r1_4, _, 5·r1_4, _, 5·r1_4]
593 _mm256_permutevar8x32_epi32(r1, _mm256_set1_epi64x(7)),
594 );
595
596 // v0 = [
597 // `x0_0·r2_3`+ 5·x0_4·r2_4,
598 // `5·x0_4·r2_3`+ 5·x0_3·r2_4,
599 // `5·x0_4·r2_2` + 5·x0_2·r2_4,
600 // `5·x0_2·r2_3`+ 5·x0_1·r2_4,
601 // ]
602 v0 = _mm256_add_epi64(
603 v0,
604 _mm256_mul_epu32(
605 // [_, x0_0, _, x0_4, _, x0_4, _, x0_2]
606 _mm256_permute4x64_epi64(x.v0.0, set02(0, 2, 2, 1)),
607 // [_, r2_3, _, 5·r2_3, _, 5·r2_2, _, 5·r2_3]
608 _mm256_permutevar8x32_epi32(r2, _mm256_set_epi64x(3, 6, 5, 6)),
609 ),
610 );
611 // v1 = [
612 // `x1_0·r1_3`+ 5·x1_4·r1_4,
613 // `5·x1_4·r1_3`+ 5·x1_3·r1_4,
614 // `5·x1_4·r1_2` + 5·x1_2·r1_4,
615 // `5·x1_2·r1_3`+ 5·x1_1·r1_4,
616 // ]
617 v1 = _mm256_add_epi64(
618 v1,
619 _mm256_mul_epu32(
620 // [_, x1_0, _, x1_4, _, x1_4, _, x1_2]
621 _mm256_permute4x64_epi64(x.v1.0, set02(0, 2, 2, 1)),
622 // [_, r1_3, _, 5·r1_3, _, 5·r1_2, _, 5·r1_3]
623 _mm256_permutevar8x32_epi32(r1, _mm256_set_epi64x(3, 6, 5, 6)),
624 ),
625 );
626 // v0 = [
627 // `x0_1·r2_2`+ x0_0·r2_3 + 5·x0_4·r2_4,
628 // `x0_1·r2_1` + 5·x0_4·r2_3 + 5·x0_3·r2_4,
629 // 5·x0_4·r2_2 +`5·x0_3·r2_3`+ 5·x0_2·r2_4,
630 // `5·x0_3·r2_2`+ 5·x0_2·r2_3 + 5·x0_1·r2_4,
631 // ]
632 v0 = _mm256_add_epi64(
633 v0,
634 _mm256_mul_epu32(
635 // [_, x0_1, _, x0_1, _, x0_3, _, x0_3]
636 _mm256_permutevar8x32_epi32(x.v0.0, _mm256_set_epi64x(1, 1, 3, 3)),
637 // [_, r2_2, _, r2_1, _, 5·r2_3, _, 5·r2_2]
638 _mm256_permutevar8x32_epi32(r2, _mm256_set_epi64x(2, 1, 6, 5)),
639 ),
640 );
641 // v1 = [
642 // `x1_1·r1_2`+ x1_0·r1_3 + 5·x1_4·r1_4,
643 // `x1_1·r1_1` + 5·x1_4·r1_3 + 5·x1_3·r1_4,
644 // 5·x1_4·r1_2 +`5·x1_3·r1_3`+ 5·x1_2·r1_4,
645 // `5·x1_3·r1_2`+ 5·x1_2·r1_3 + 5·x1_1·r1_4,
646 // ]
647 v1 = _mm256_add_epi64(
648 v1,
649 _mm256_mul_epu32(
650 // [_, x1_1, _, x1_1, _, x1_3, _, x1_3]
651 _mm256_permutevar8x32_epi32(x.v1.0, _mm256_set_epi64x(1, 1, 3, 3)),
652 // [_, r1_2, _, r1_1, _, 5·r1_3, _, 5·r1_2]
653 _mm256_permutevar8x32_epi32(r1, _mm256_set_epi64x(2, 1, 6, 5)),
654 ),
655 );
656 // v0 = [
657 // `x0_3·r2_0` + x0_1·r2_2 + x0_0·r2_3 + 5·x0_4·r2_4,
658 // `x0_2·r2_0`+ x0_1·r2_1 + 5·x0_4·r2_3 + 5·x0_3·r2_4,
659 // `x0_1·r2_0` + 5·x0_4·r2_2 + 5·x0_3·r2_3 + 5·x0_2·r2_4,
660 // `x0_0·r2_0` + 5·x0_3·r2_2 + 5·x0_2·r2_3 + 5·x0_1·r2_4,
661 // ]
662 v0 = _mm256_add_epi64(
663 v0,
664 _mm256_mul_epu32(
665 // [_, x0_3, _, x0_2, _, x0_1, _, x0_0]
666 _mm256_permutevar8x32_epi32(x.v0.0, _mm256_set_epi64x(3, 2, 1, 0)),
667 // [_, r2_0, _, r2_0, _, r2_0, _, r2_0]
668 _mm256_broadcastd_epi32(_mm256_castsi256_si128(r2)),
669 ),
670 );
671 // v1 = [
672 // `x1_3·r1_0` + x1_1·r1_2 + x1_0·r1_3 + 5·x1_4·r1_4,
673 // `x1_2·r1_0`+ x1_1·r1_1 + 5·x1_4·r1_3 + 5·x1_3·r1_4,
674 // `x1_1·r1_0` + 5·x1_4·r1_2 + 5·x1_3·r1_3 + 5·x1_2·r1_4,
675 // `x1_0·r1_0` + 5·x1_3·r1_2 + 5·x1_2·r1_3 + 5·x1_1·r1_4,
676 // ]
677 v1 = _mm256_add_epi64(
678 v1,
679 _mm256_mul_epu32(
680 // [_, x1_3, _, x1_2, _, x1_1, _, x1_0]
681 _mm256_permutevar8x32_epi32(x.v1.0, _mm256_set_epi64x(3, 2, 1, 0)),
682 // [_, r1_0, _, r1_0, _, r1_0, _, r1_0]
683 _mm256_broadcastd_epi32(_mm256_castsi256_si128(r1)),
684 ),
685 );
686
687 // t0 = [x0_3, x0_2, x0_1, x0_0, x0_1, x0_0, 0, x0_4]
688 // t1 = [x1_3, x1_2, x1_1, x1_0, x1_1, x1_0, 0, x1_4]
689 let mut t0 = _mm256_permute4x64_epi64(x.v0.0, set02(1, 0, 0, 2));
690 let mut t1 = _mm256_permute4x64_epi64(x.v1.0, set02(1, 0, 0, 2));
691
692 // v0 = [
693 // x0_3·r2_0 + `x0_2·r2_1`+ x0_1·r2_2 + x0_0·r2_3 + 5·x0_4·r2_4,
694 // x0_2·r2_0 + x0_1·r2_1 + `x0_0·r2_2`+ 5·x0_4·r2_3 + 5·x0_3·r2_4,
695 // x0_1·r2_0 + `x0_0·r2_1`+ 5·x0_4·r2_2 + 5·x0_3·r2_3 + 5·x0_2·r2_4,
696 // x0_0·r2_0 +`5·x0_4·r2_1`+ 5·x0_3·r2_2 + 5·x0_2·r2_3 + 5·x0_1·r2_4,
697 // ]
698 v0 = _mm256_add_epi64(
699 v0,
700 _mm256_mul_epu32(
701 // [_, x0_2, _, x0_0, _, x0_0, _, x0_4]
702 t0,
703 // [_, r2_1, _, r2_2, _, r2_1, _, 5·r2_1]
704 _mm256_blend_epi32(
705 // [r2_0, r2_1, r2_0, r2_2, r2_0, r2_1, r2_0, r2_1]
706 _mm256_permutevar8x32_epi32(r2, _mm256_set_epi64x(1, 2, 1, 1)),
707 r25,
708 0b00000011,
709 ),
710 ),
711 );
712 // v1 = [
713 // x1_3·r1_0 + `x1_2·r1_1`+ x1_1·r1_2 + x1_0·r1_3 + 5·x1_4·r1_4,
714 // x1_2·r1_0 + x1_1·r1_1 + `x1_0·r1_2`+ 5·x1_4·r1_3 + 5·x1_3·r1_4,
715 // x1_1·r1_0 + `x1_0·r1_1`+ 5·x1_4·r1_2 + 5·x1_3·r1_3 + 5·x1_2·r1_4,
716 // x1_0·r1_0 +`5·x1_4·r1_1`+ 5·x1_3·r1_2 + 5·x1_2·r1_3 + 5·x1_1·r1_4,
717 // ]
718 v1 = _mm256_add_epi64(
719 v1,
720 _mm256_mul_epu32(
721 // [_, x1_2, _, x1_0, _, x1_0, _, x1_4]
722 t1,
723 // [_, r1_1, _, r1_2, _, r1_1, _, 5·r1_1]
724 _mm256_blend_epi32(
725 // [r1_0, r1_1, r1_0, r1_2, r1_0, r1_1, r1_0, r1_1]
726 _mm256_permutevar8x32_epi32(r1, _mm256_set_epi64x(1, 2, 1, 1)),
727 r15,
728 0b00000011,
729 ),
730 ),
731 );
732 // v0 = [
733 // x0_3·r2_0 + x0_2·r2_1 + x0_1·r2_2 + x0_0·r2_3 + 5·x0_4·r2_4 + x1_3·r1_0 + x1_2·r1_1 + x1_1·r1_2 + x1_0·r1_3 + 5·x1_4·r1_4,
734 // x0_2·r2_0 + x0_1·r2_1 + x0_0·r2_2 + 5·x0_4·r2_3 + 5·x0_3·r2_4 + x1_2·r1_0 + x1_1·r1_1 + x1_0·r1_2 + 5·x1_4·r1_3 + 5·x1_3·r1_4,
735 // x0_1·r2_0 + x0_0·r2_1 + 5·x0_4·r2_2 + 5·x0_3·r2_3 + 5·x0_2·r2_4 + x1_1·r1_0 + x1_0·r1_1 + 5·x1_4·r1_2 + 5·x1_3·r1_3 + 5·x1_2·r1_4,
736 // x0_0·r2_0 + 5·x0_4·r2_1 + 5·x0_3·r2_2 + 5·x0_2·r2_3 + 5·x0_1·r2_4 + x1_0·r1_0 + 5·x1_4·r1_1 + 5·x1_3·r1_2 + 5·x1_2·r1_3 + 5·x1_1·r1_4,
737 // ]
738 v0 = _mm256_add_epi64(v0, v1);
739
740 // t0 = [
741 // 5·x0_2·r2_3,
742 // x0_0·r2_4,
743 // x0_0·r2_2,
744 // x0_4·r2_0,
745 // ]
746 // t1 = [
747 // 5·x1_2·r1_3,
748 // x1_0·r1_4,
749 // x1_0·r1_2,
750 // x1_4·r1_0,
751 // ]
752 t0 = _mm256_mul_epu32(t0, r2);
753 t1 = _mm256_mul_epu32(t1, r1);
754
755 // v1 = [
756 // 5·x0_2·r2_3 + 5·x1_2·r1_3,
757 // x0_0·r2_4 + x1_0·r1_4,
758 // x0_0·r2_2 + x1_0·r1_2,
759 // x0_4·r2_0 + x1_4·r1_0,
760 // ]
761 v1 = _mm256_add_epi64(t0, t1);
762
763 // t0 = [
764 // x0_3·r2_1,
765 // x0_2·r2_2,
766 // x0_1·r2_3,
767 // x0_0·r2_4,
768 // ]
769 t0 = _mm256_mul_epu32(
770 // [_, x0_3, _, x0_2, _, x0_1, _, x0_0]
771 _mm256_permutevar8x32_epi32(x.v0.0, _mm256_set_epi64x(3, 2, 1, 0)),
772 // [_, r2_1, _, r2_2, _, r2_3, _, r2_4]
773 _mm256_permutevar8x32_epi32(r2, _mm256_set_epi64x(1, 2, 3, 4)),
774 );
775 // t1 = [
776 // x1_3·r1_1,
777 // x1_2·r1_2,
778 // x1_1·r1_3,
779 // x1_0·r1_4,
780 // ]
781 t1 = _mm256_mul_epu32(
782 // [_, x1_3, _, x1_2, _, x1_1, _, x1_0]
783 _mm256_permutevar8x32_epi32(x.v1.0, _mm256_set_epi64x(3, 2, 1, 0)),
784 // [_, r1_1, _, r1_2, _, r1_3, _, r1_4]
785 _mm256_permutevar8x32_epi32(r1, _mm256_set_epi64x(1, 2, 3, 4)),
786 );
787 // t0 = [
788 // x0_3·r2_1 + x1_3·r1_1,
789 // x0_2·r2_2 + x1_2·r1_2,
790 // x0_1·r2_3 + x1_1·r1_3,
791 // x0_0·r2_4 + x1_0·r1_4,
792 // ]
793 t0 = _mm256_add_epi64(t0, t1);
794 // t0 = [
795 // x0_3·r2_1 + x0_1·r2_3 + x1_3·r1_1 + x1_1·r1_3,
796 // x0_2·r2_2 + x0_0·r2_4 + x1_2·r1_2 + x1_0·r1_4,
797 // x0_3·r2_1 + x0_1·r2_3 + x1_3·r1_1 + x1_1·r1_3,
798 // x0_2·r2_2 + x0_0·r2_4 + x1_2·r1_2 + x1_0·r1_4,
799 // ]
800 t0 = _mm256_add_epi64(t0, _mm256_permute4x64_epi64(t0, set02(1, 0, 3, 2)));
801 // t0 = [
802 // x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
803 // x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
804 // x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
805 // x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
806 // ]
807 t0 = _mm256_add_epi64(t0, _mm256_permute4x64_epi64(t0, set02(2, 3, 0, 1)));
808
809 // v1 = [
810 // 5·x0_2·r2_3 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + 5·x1_2·r1_3 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
811 // x0_0·r2_4 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_0·r1_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
812 // x0_0·r2_2 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_0·r1_2 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
813 // x0_4·r2_0 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_4·r1_0 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
814 // ]
815 v1 = _mm256_add_epi64(v1, t0);
816
817 // The result:
818 // v1 = [
819 // _, _, _,
820 // x0_4·r2_0 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_4·r1_0 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
821 // ]
822 // v0 = [
823 // x0_3·r2_0 + x0_2·r2_1 + x0_1·r2_2 + x0_0·r2_3 + 5·x0_4·r2_4 + x1_3·r1_0 + x1_2·r1_1 + x1_1·r1_2 + x1_0·r1_3 + 5·x1_4·r1_4,
824 // x0_2·r2_0 + x0_1·r2_1 + x0_0·r2_2 + 5·x0_4·r2_3 + 5·x0_3·r2_4 + x1_2·r1_0 + x1_1·r1_1 + x1_0·r1_2 + 5·x1_4·r1_3 + 5·x1_3·r1_4,
825 // x0_1·r2_0 + x0_0·r2_1 + 5·x0_4·r2_2 + 5·x0_3·r2_3 + 5·x0_2·r2_4 + x1_1·r1_0 + x1_0·r1_1 + 5·x1_4·r1_2 + 5·x1_3·r1_3 + 5·x1_2·r1_4,
826 // x0_0·r2_0 + 5·x0_4·r2_1 + 5·x0_3·r2_2 + 5·x0_2·r2_3 + 5·x0_1·r2_4 + x1_0·r1_0 + 5·x1_4·r1_1 + 5·x1_3·r1_2 + 5·x1_2·r1_3 + 5·x1_1·r1_4,
827 // ]
828 Unreduced130 { v0, v1 }
829 }
830 }
831}
832
833impl Add<Aligned130> for Aligned2x130 {
834 type Output = Aligned2x130;
835
836 /// Adds `other` into the lower integer of `self`.
837 fn add(self, other: Aligned130) -> Aligned2x130 {
838 Aligned2x130 {
839 v0: self.v0 + other,
840 v1: self.v1,
841 }
842 }
843}
844
845/// A multiplier that takes 130-bit integers `(x3, x2, x1, x0)` and computes
846/// `(x3·R^4, x2·R^3, x1·R^2, x0·R) mod 2^130 - 5`.
847#[derive(Copy, Clone, Debug)]
848pub(super) struct SpacedMultiplier4x130 {
849 v0: __m256i,
850 v1: __m256i,
851 r1: PrecomputedMultiplier,
852}
853
854impl SpacedMultiplier4x130 {
    /// Returns `(multiplier, R^4)` given `(R^1, R^2)`.
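    ///
    /// Internally computes `R^3 = R^2·R` and `R^4 = R^2·R^2`, then packs the
    /// powers so that four 130-bit integers can each be multiplied by a
    /// different power in a single pass.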
856 #[target_feature(enable = "avx2")]
857 pub(super) unsafe fn new(
858 r1: PrecomputedMultiplier,
859 r2: PrecomputedMultiplier,
860 ) -> (Self, PrecomputedMultiplier) {
861 let r3 = (r2 * r1).reduce();
862 let r4 = (r2 * r2).reduce();
863
864 // v0 = [r2_4, r2_3, r2_1, r3_4, r3_3, r3_2, r3_1, r3_0]
865 let v0 = _mm256_blend_epi32(
866 r3.0,
867 _mm256_permutevar8x32_epi32(r2.a, _mm256_set_epi32(4, 3, 1, 0, 0, 0, 0, 0)),
868 0b11100000,
869 );
870
871 // v1 = [r2_4, r2_2, r2_0, r4_4, r4_3, r4_2, r4_1, r4_0]
872 let v1 = _mm256_blend_epi32(
873 r4.0,
874 _mm256_permutevar8x32_epi32(r2.a, _mm256_set_epi32(4, 2, 0, 0, 0, 0, 0, 0)),
875 0b11100000,
876 );
877
878 let m = SpacedMultiplier4x130 { v0, v1, r1 };
879
880 (m, r4.into())
881 }
882}
883
884/// Four 130-bit integers aligned across five 26-bit limbs each.
885///
886/// Unlike `Aligned2x130` which wraps two `Aligned130`s, this struct represents the four
887/// integers as 20 limbs spread across three 256-bit vectors.
888#[derive(Copy, Clone, Debug)]
889pub(super) struct Aligned4x130 {
890 v0: __m256i,
891 v1: __m256i,
892 v2: __m256i,
893}
894
895impl fmt::Display for Aligned4x130 {
896 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
897 let mut v0 = [0u8; 32];
898 let mut v1 = [0u8; 32];
899 let mut v2 = [0u8; 32];
900 unsafe {
901 _mm256_storeu_si256(v0.as_mut_ptr() as *mut _, self.v0);
902 _mm256_storeu_si256(v1.as_mut_ptr() as *mut _, self.v1);
903 _mm256_storeu_si256(v2.as_mut_ptr() as *mut _, self.v2);
904 }
905
906 writeln!(f, "Aligned4x130([")?;
907 write!(f, " ")?;
908 write_130(
909 f,
910 [
911 u32::from_le_bytes(v0[0..4].try_into().unwrap()),
912 u32::from_le_bytes(v1[0..4].try_into().unwrap()),
913 u32::from_le_bytes(v0[4..8].try_into().unwrap()),
914 u32::from_le_bytes(v1[4..8].try_into().unwrap()),
915 u32::from_le_bytes(v2[0..4].try_into().unwrap()),
916 ],
917 )?;
918 writeln!(f, ",")?;
919 write!(f, " ")?;
920 write_130(
921 f,
922 [
923 u32::from_le_bytes(v0[8..12].try_into().unwrap()),
924 u32::from_le_bytes(v1[8..12].try_into().unwrap()),
925 u32::from_le_bytes(v0[12..16].try_into().unwrap()),
926 u32::from_le_bytes(v1[12..16].try_into().unwrap()),
927 u32::from_le_bytes(v2[8..12].try_into().unwrap()),
928 ],
929 )?;
930 writeln!(f, ",")?;
931 write!(f, " ")?;
932 write_130(
933 f,
934 [
935 u32::from_le_bytes(v0[16..20].try_into().unwrap()),
936 u32::from_le_bytes(v1[16..20].try_into().unwrap()),
937 u32::from_le_bytes(v0[20..24].try_into().unwrap()),
938 u32::from_le_bytes(v1[20..24].try_into().unwrap()),
939 u32::from_le_bytes(v2[16..20].try_into().unwrap()),
940 ],
941 )?;
942 writeln!(f, ",")?;
943 write!(f, " ")?;
944 write_130(
945 f,
946 [
947 u32::from_le_bytes(v0[24..28].try_into().unwrap()),
948 u32::from_le_bytes(v1[24..28].try_into().unwrap()),
949 u32::from_le_bytes(v0[28..32].try_into().unwrap()),
950 u32::from_le_bytes(v1[28..32].try_into().unwrap()),
951 u32::from_le_bytes(v2[24..28].try_into().unwrap()),
952 ],
953 )?;
954 writeln!(f, ",")?;
955 write!(f, "])")
956 }
957}
958
959impl Aligned4x130 {
960 /// Aligns four 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and
961 /// sets the high bit for each block.
    ///
    /// The number of blocks is enforced by the `&[Block; 4]` argument type.
966 #[target_feature(enable = "avx2")]
967 pub(super) unsafe fn from_blocks(src: &[Block; 4]) -> Self {
968 let (lo, hi) = src.split_at(2);
969 let blocks_23 = _mm256_loadu_si256(hi.as_ptr() as *const _);
970 let blocks_01 = _mm256_loadu_si256(lo.as_ptr() as *const _);
971
972 Self::from_loaded_blocks(blocks_01, blocks_23)
973 }
974
975 /// Aligns four 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and
976 /// sets the high bit for each block.
977 #[target_feature(enable = "avx2")]
978 pub(super) unsafe fn from_par_blocks(src: &ParBlocks) -> Self {
979 let (lo, hi) = src.split_at(2);
980 let blocks_23 = _mm256_loadu_si256(hi.as_ptr() as *const _);
981 let blocks_01 = _mm256_loadu_si256(lo.as_ptr() as *const _);
982
983 Self::from_loaded_blocks(blocks_01, blocks_23)
984 }
985
986 /// Aligns four 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and
987 /// sets the high bit for each block.
988 ///
989 /// The four blocks must be in the following 32-bit word layout:
990 /// [b33, b32, b31, b30, b23, b22, b21, b20]
991 /// [b13, b12, b11, b10, b03, b02, b01, b00]
992 #[target_feature(enable = "avx2")]
993 unsafe fn from_loaded_blocks(blocks_01: __m256i, blocks_23: __m256i) -> Self {
994 // 26-bit mask on each 32-bit word.
995 let mask_26 = _mm256_set1_epi32(0x3ffffff);
996 // Sets bit 24 of each 32-bit word.
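        // (The Poly1305 padding bit, bit 128 of each padded block, lands at bit
        // 24 of the fifth 26-bit limb, since 128 = 104 + 24.)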
997 let set_hibit = _mm256_set1_epi32(1 << 24);
998
999 // - Unpack the upper and lower 64 bits:
1000 // [b33, b32, b13, b12, b23, b22, b03, b02]
1001 // [b31, b30, b11, b10, b21, b20, b01, b00]
1002 //
1003 // - Swap the middle two 64-bit words:
1004 // a0 = [b33, b32, b23, b22, b13, b12, b03, b02]
1005 // a1 = [b31, b30, b21, b20, b11, b10, b01, b00]
1006 let a0 = _mm256_permute4x64_epi64(
1007 _mm256_unpackhi_epi64(blocks_01, blocks_23),
1008 set02(3, 1, 2, 0),
1009 );
1010 let a1 = _mm256_permute4x64_epi64(
1011 _mm256_unpacklo_epi64(blocks_01, blocks_23),
1012 set02(3, 1, 2, 0),
1013 );
1014
1015 // - Take the upper 24 bits of each 64-bit word in a0, and set the high bits:
1016 // v2 = [
1017 // [0; 7] || 1 || [0; 31] || 1 || b33[32..8],
1018 // [0; 7] || 1 || [0; 31] || 1 || b23[32..8],
1019 // [0; 7] || 1 || [0; 31] || 1 || b13[32..8],
1020 // [0; 7] || 1 || [0; 31] || 1 || b03[32..8],
1021 // ]
1022 let v2 = _mm256_or_si256(_mm256_srli_epi64(a0, 40), set_hibit);
1023
1024 // - Combine the lower 46 bits of each 64-bit word in a0 with the upper 18
1025 // bits of each 64-bit word in a1:
1026 // a2 = [
1027 // b33[14..0] || b32 || b31[32..14],
1028 // b23[14..0] || b22 || b21[32..14],
1029 // b13[14..0] || b12 || b11[32..14],
1030 // b03[14..0] || b02 || b01[32..14],
1031 // ]
1032 let a2 = _mm256_or_si256(_mm256_srli_epi64(a1, 46), _mm256_slli_epi64(a0, 18));
1033
1034 // - Take the upper 38 bits of each 64-bit word in a1:
1035 // [
1036 // [0; 26] || b31 || b30[32..26],
1037 // [0; 26] || b21 || b20[32..26],
1038 // [0; 26] || b11 || b10[32..26],
1039 // [0; 26] || b01 || b00[32..26],
1040 // ]
1041 // - Blend in a2 on 32-bit words with alternating [a2 a1 ..] control pattern:
1042 // [
1043 // b33[14..0] || b32[32..14] || b31[26..0] || b30[32..26],
1044 // b23[14..0] || b22[32..14] || b21[26..0] || b20[32..26],
1045 // b13[14..0] || b12[32..14] || b11[26..0] || b10[32..26],
1046 // b03[14..0] || b02[32..14] || b01[26..0] || b00[32..26],
1047 // ]
1048 // - Apply the 26-bit mask to each 32-bit word:
1049 // v1 = [
1050 // [0; 6] || b33[8..0] || b32[32..14] || [0; 6] || b31[20..0] || b30[32..26],
1051 // [0; 6] || b23[8..0] || b22[32..14] || [0; 6] || b21[20..0] || b20[32..26],
1052 // [0; 6] || b13[8..0] || b12[32..14] || [0; 6] || b11[20..0] || b10[32..26],
1053 // [0; 6] || b03[8..0] || b02[32..14] || [0; 6] || b01[20..0] || b00[32..26],
1054 // ]
1055 let v1 = _mm256_and_si256(
1056 _mm256_blend_epi32(_mm256_srli_epi64(a1, 26), a2, 0xAA),
1057 mask_26,
1058 );
1059
1060 // - Take the lower 38 bits of each 64-bit word in a2:
1061 // [
1062 // b32[20..0] || b31[32..14] || [0; 26],
1063 // b22[20..0] || b21[32..14] || [0; 26],
1064 // b12[20..0] || b11[32..14] || [0; 26],
1065 // b02[20..0] || b01[32..14] || [0; 26],
1066 // ]
1067 // - Blend in a1 on 32-bit words with alternating [a2 a1 ..] control pattern:
1068 // [
1069 // b32[20..0] || b31[32..20] || b30,
1070 // b22[20..0] || b21[32..20] || b20,
1071 // b12[20..0] || b11[32..20] || b10,
1072 // b02[20..0] || b01[32..20] || b00,
1073 // ]
1074 // - Apply the 26-bit mask to each 32-bit word:
1075 // v0 = [
1076 // [0; 6] || b32[14..0] || b31[32..20] || [0; 6] || b30[26..0],
1077 // [0; 6] || b22[14..0] || b21[32..20] || [0; 6] || b20[26..0],
1078 // [0; 6] || b12[14..0] || b11[32..20] || [0; 6] || b10[26..0],
1079 // [0; 6] || b02[14..0] || b01[32..20] || [0; 6] || b00[26..0],
1080 // ]
1081 let v0 = _mm256_and_si256(
1082 _mm256_blend_epi32(a1, _mm256_slli_epi64(a2, 26), 0xAA),
1083 mask_26,
1084 );
1085
1086 // The result:
1087 // v2 = [ v1 = [ v0 = [
1088 // [0; 7] || 1 || [0; 24], [0; 6] || b33[ 8..0] || b32[32..14], [0; 6] || b32[14..0] || b31[32..20],
1089 // [0; 7] || 1 || b33[32..8], [0; 6] || b31[20..0] || b30[32..26], [0; 6] || b30[26..0],
1090 // [0; 7] || 1 || [0; 24], [0; 6] || b23[ 8..0] || b22[32..14], [0; 6] || b22[14..0] || b21[32..20],
1091 // [0; 7] || 1 || b23[32..8], [0; 6] || b21[20..0] || b20[32..26], [0; 6] || b20[26..0],
1092 // [0; 7] || 1 || [0; 24], [0; 6] || b13[ 8..0] || b12[32..14], [0; 6] || b12[14..0] || b11[32..20],
1093 // [0; 7] || 1 || b13[32..8], [0; 6] || b11[20..0] || b10[32..26], [0; 6] || b10[26..0],
1094 // [0; 7] || 1 || [0; 24], [0; 6] || b03[ 8..0] || b02[32..14], [0; 6] || b02[14..0] || b01[32..20],
1095 // [0; 7] || 1 || b03[32..8], [0; 6] || b01[20..0] || b00[32..26], [0; 6] || b00[26..0],
1096 // ] ] ]
1097 Aligned4x130 { v0, v1, v2 }
1098 }
1099}
1100
1101impl Add<Aligned4x130> for Aligned4x130 {
1102 type Output = Aligned4x130;
1103
1104 #[inline(always)]
1105 fn add(self, other: Aligned4x130) -> Aligned4x130 {
1106 // With 26-bit limbs inside 32-bit words, there is plenty of space for unreduced
1107 // addition.
1108 unsafe {
1109 Aligned4x130 {
1110 v0: _mm256_add_epi32(self.v0, other.v0),
1111 v1: _mm256_add_epi32(self.v1, other.v1),
1112 v2: _mm256_add_epi32(self.v2, other.v2),
1113 }
1114 }
1115 }
1116}
1117
1118impl Mul<PrecomputedMultiplier> for &Aligned4x130 {
1119 type Output = Unreduced4x130;
1120
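    /// Multiplies each of the four 130-bit integers by the same multiplier,
    /// producing four unreduced products (lazy reduction; switches from 32-bit
    /// to 64-bit limbs).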
1121 #[inline(always)]
1122 fn mul(self, other: PrecomputedMultiplier) -> Unreduced4x130 {
1123 unsafe {
1124 // Starting with the following limb layout:
1125 // x.v2 = [ _, x34, _, x24, _, x14, _, x04]
1126 // x.v1 = [ x33, x31, x23, x21, x13, x11, x03, x01]
1127 // x.v0 = [ x32, x30, x22, x20, x12, x10, x02, x00]
1128 // y = [5·r_4, 5·r_3, 5·r_2, r_4, r_3, r_2, r_1, r_0]
1129 // z = [5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1]
1130 let mut x = *self;
1131 let y = other.a;
1132 let z = other.a_5;
1133
1134 // Prepare a permutation that swaps the two limbs within a 64-bit window.
1135 let ord = _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1);
1136
1137 // t0 = [r_1, r_0, r_1, r_0, r_1, r_0, r_1, r_0] -> ·r_0
1138 // t1 = [r_3, r_2, r_3, r_2, r_3, r_2, r_3, r_2] -> ·r_2
1139 let mut t0 = _mm256_permute4x64_epi64(y, set02(0, 0, 0, 0));
1140 let mut t1 = _mm256_permute4x64_epi64(y, set02(1, 1, 1, 1));
1141
1142 // v0 = [x30·r_0, x20·r_0, x10·r_0, x00·r_0]
1143 // v1 = [x31·r_0, x21·r_0, x11·r_0, x01·r_0]
1144 // v4 = [x34·r_0, x24·r_0, x14·r_0, x04·r_0]
1145 // v2 = [x30·r_2, x20·r_2, x10·r_2, x00·r_2]
1146 // v3 = [x31·r_2, x21·r_2, x11·r_2, x01·r_2]
1147 let mut v0 = _mm256_mul_epu32(x.v0, t0); // xN0·r_0
1148 let mut v1 = _mm256_mul_epu32(x.v1, t0); // xN1·r_0
1149 let mut v4 = _mm256_mul_epu32(x.v2, t0); // xN4·r_0
1150 let mut v2 = _mm256_mul_epu32(x.v0, t1); // xN0·r_2
1151 let mut v3 = _mm256_mul_epu32(x.v1, t1); // xN1·r_2
1152
1153 // t0 = [r_0, r_1, r_0, r_1, r_0, r_1, r_0, r_1] -> ·r_1
1154 // t1 = [r_2, r_3, r_2, r_3, r_2, r_3, r_2, r_3] -> ·r_3
1155 t0 = _mm256_permutevar8x32_epi32(t0, ord);
1156 t1 = _mm256_permutevar8x32_epi32(t1, ord);
1157
1158 // v1 = [x31·r_0 + x30·r_1, x21·r_0 + x20·r_1, x11·r_0 + x10·r_1, x01·r_0 + x00·r_1]
1159 // v2 = [x31·r_1 + x30·r_2, x21·r_1 + x20·r_2, x11·r_1 + x10·r_2, x01·r_1 + x00·r_2]
1160 // v3 = [x31·r_2 + x30·r_3, x21·r_2 + x20·r_3, x11·r_2 + x10·r_3, x01·r_2 + x00·r_3]
1161 // v4 = [x34·r_0 + x31·r_3, x24·r_0 + x21·r_3, x14·r_0 + x11·r_3, x04·r_0 + x01·r_3]
1162 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v0, t0)); // + xN0·r_1
1163 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v1, t0)); // + xN1·r_1
1164 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v0, t1)); // + xN0·r_3
1165 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v1, t1)); // + xN1·r_3
1166
1167 // t2 = [5·r_2, r_4, 5·r_2, r_4, 5·r_2, r_4, 5·r_2, r_4] -> ·r_4
1168 let mut t2 = _mm256_permute4x64_epi64(y, set02(2, 2, 2, 2));
1169
1170 // v4 = [
1171 // x34·r_0 + x31·r_3 + x30·r_4,
1172 // x24·r_0 + x21·r_3 + x20·r_4,
1173 // x14·r_0 + x11·r_3 + x10·r_4,
1174 // x04·r_0 + x01·r_3 + x00·r_4,
1175 // ]
1176 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v0, t2)); // + xN0·r_4
1177
1178 // x.v0 = [x30, x32, x20, x22, x10, x12, x00, x02]
1179 // x.v1 = [x31, x33, x21, x23, x11, x13, x01, x03]
1180 // t2 = [r_4, 5·r_2, r_4, 5·r_2, r_4, 5·r_2, r_4, 5·r_2] -> ·5·r_2
1181 x.v0 = _mm256_permutevar8x32_epi32(x.v0, ord);
1182 x.v1 = _mm256_permutevar8x32_epi32(x.v1, ord);
1183 t2 = _mm256_permutevar8x32_epi32(t2, ord);
1184
1185 // v0 = [
1186 // x30·r_0 + 5·x33·r_2,
1187 // x20·r_0 + 5·x23·r_2,
1188 // x10·r_0 + 5·x13·r_2,
1189 // x00·r_0 + 5·x03·r_2,
1190 // ]
1191 // v1 = [
1192 // x31·r_0 + x30·r_1 + 5·x34·r_2,
1193 // x21·r_0 + x20·r_1 + 5·x24·r_2,
1194 // x11·r_0 + x10·r_1 + 5·x14·r_2,
1195 // x01·r_0 + x00·r_1 + 5·x04·r_2,
1196 // ]
1197 // v3 = [
1198 // x32·r_1 + x31·r_2 + x30·r_3,
1199 // x22·r_1 + x21·r_2 + x20·r_3,
1200 // x12·r_1 + x11·r_2 + x10·r_3,
1201 // x02·r_1 + x01·r_2 + x00·r_3,
1202 // ]
1203 // v4 = [
1204 // x34·r_0 + x33·r_1 + x31·r_3 + x30·r_4,
1205 // x24·r_0 + x23·r_1 + x21·r_3 + x20·r_4,
1206 // x14·r_0 + x13·r_1 + x11·r_3 + x10·r_4,
1207 // x04·r_0 + x03·r_1 + x01·r_3 + x00·r_4,
1208 // ]
1209 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v1, t2)); // + 5·xN3·r_2
1210 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v2, t2)); // + 5·xN4·r_2
1211 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v0, t0)); // + xN2·r_1
1212 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v1, t0)); // + xN3·r_1
1213
1214 // t0 = [r_1, r_0, r_1, r_0, r_1, r_0, r_1, r_0] -> ·r_0
1215 // t1 = [r_3, r_2, r_3, r_2, r_3, r_2, r_3, r_2] -> ·r_2
1216 t0 = _mm256_permutevar8x32_epi32(t0, ord);
1217 t1 = _mm256_permutevar8x32_epi32(t1, ord);
1218
1219 // v2 = [
1220 // x32·r_0 + x31·r_1 + x30·r_2,
1221 // x22·r_0 + x21·r_1 + x20·r_2,
1222 // x12·r_0 + x11·r_1 + x10·r_2,
1223 // x02·r_0 + x01·r_1 + x00·r_2,
1224 // ]
1225 // v3 = [
1226 // x33·r_0 + x32·r_1 + x31·r_2 + x30·r_3,
1227 // x23·r_0 + x22·r_1 + x21·r_2 + x20·r_3,
1228 // x13·r_0 + x12·r_1 + x11·r_2 + x10·r_3,
1229 // x03·r_0 + x02·r_1 + x01·r_2 + x00·r_3,
1230 // ]
1231 // v4 = [
1232 // x34·r_0 + x33·r_1 + x32·r_2 + x31·r_3 + x30·r_4,
1233 // x24·r_0 + x23·r_1 + x22·r_2 + x21·r_3 + x20·r_4,
1234 // x14·r_0 + x13·r_1 + x12·r_2 + x11·r_3 + x10·r_4,
1235 // x04·r_0 + x03·r_1 + x02·r_2 + x01·r_3 + x00·r_4,
1236 // ]
1237 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v0, t0)); // + xN2·r_0
1238 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v1, t0)); // + xN3·r_0
1239 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v0, t1)); // + xN2·r_2
1240
1241 // t0 = [5·r_4, 5·r_3, 5·r_4, 5·r_3, 5·r_4, 5·r_3, 5·r_4, 5·r_3] -> ·5·r_3
1242 t0 = _mm256_permute4x64_epi64(y, set02(3, 3, 3, 3));
1243
1244 // v0 = [
1245 // x30·r_0 + 5·x33·r_2 + 5·x32·r_3,
1246 // x20·r_0 + 5·x23·r_2 + 5·x22·r_3,
1247 // x10·r_0 + 5·x13·r_2 + 5·x12·r_3,
1248 // x00·r_0 + 5·x03·r_2 + 5·x02·r_3,
1249 // ]
1250 // v1 = [
1251 // x31·r_0 + x30·r_1 + 5·x34·r_2 + 5·x33·r_3,
1252 // x21·r_0 + x20·r_1 + 5·x24·r_2 + 5·x23·r_3,
1253 // x11·r_0 + x10·r_1 + 5·x14·r_2 + 5·x13·r_3,
1254 // x01·r_0 + x00·r_1 + 5·x04·r_2 + 5·x03·r_3,
1255 // ]
1256 // v2 = [
1257 // x32·r_0 + x31·r_1 + x30·r_2 + 5·x34·r_3,
1258 // x22·r_0 + x21·r_1 + x20·r_2 + 5·x24·r_3,
1259 // x12·r_0 + x11·r_1 + x10·r_2 + 5·x14·r_3,
1260 // x02·r_0 + x01·r_1 + x00·r_2 + 5·x04·r_3,
1261 // ]
1262 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v0, t0)); // + 5·xN2·r_3
1263 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v1, t0)); // + 5·xN3·r_3
1264 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v2, t0)); // + 5·xN4·r_3
1265
1266 // t0 = [5·r_3, 5·r_4, 5·r_3, 5·r_4, 5·r_3, 5·r_4, 5·r_3, 5·r_4] -> ·5·r_4
1267 t0 = _mm256_permutevar8x32_epi32(t0, ord);
1268
1269 // v1 = [
1270 // x31·r_0 + x30·r_1 + 5·x34·r_2 + 5·x33·r_3 + 5·x32·r_4,
1271 // x21·r_0 + x20·r_1 + 5·x24·r_2 + 5·x23·r_3 + 5·x22·r_4,
1272 // x11·r_0 + x10·r_1 + 5·x14·r_2 + 5·x13·r_3 + 5·x12·r_4,
1273 // x01·r_0 + x00·r_1 + 5·x04·r_2 + 5·x03·r_3 + 5·x02·r_4,
1274 // ]
1275 // v2 = [
1276 // x32·r_0 + x31·r_1 + x30·r_2 + 5·x34·r_3 + 5·x33·r_4,
1277 // x22·r_0 + x21·r_1 + x20·r_2 + 5·x24·r_3 + 5·x23·r_4,
1278 // x12·r_0 + x11·r_1 + x10·r_2 + 5·x14·r_3 + 5·x13·r_4,
1279 // x02·r_0 + x01·r_1 + x00·r_2 + 5·x04·r_3 + 5·x03·r_4,
1280 // ]
1281 // v3 = [
1282 // x33·r_0 + x32·r_1 + x31·r_2 + x30·r_3 + 5·x34·r_4,
1283 // x23·r_0 + x22·r_1 + x21·r_2 + x20·r_3 + 5·x24·r_4,
1284 // x13·r_0 + x12·r_1 + x11·r_2 + x10·r_3 + 5·x14·r_4,
1285 // x03·r_0 + x02·r_1 + x01·r_2 + x00·r_3 + 5·x04·r_4,
1286 // ]
1287 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v0, t0)); // + 5·xN2·r_4
1288 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v1, t0)); // + 5·xN3·r_4
1289 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v2, t0)); // + 5·xN4·r_4
1290
1291 // x.v1 = [x33, x31, x23, x21, x13, x11, x03, x01]
1292 x.v1 = _mm256_permutevar8x32_epi32(x.v1, ord);
1293
1294 // v0 = [
1295 // x30·r_0 + 5·x34·r_1 + 5·x33·r_2 + 5·x32·r_3 + 5·x31·r_4,
1296 // x20·r_0 + 5·x24·r_1 + 5·x23·r_2 + 5·x22·r_3 + 5·x21·r_4,
1297 // x10·r_0 + 5·x14·r_1 + 5·x13·r_2 + 5·x12·r_3 + 5·x11·r_4,
1298 // x00·r_0 + 5·x04·r_1 + 5·x03·r_2 + 5·x02·r_3 + 5·x01·r_4,
1299 // ]
1300 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v1, t0)); // + 5·xN1·r_4
1301 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v2, z)); // + 5·xN4·r_1
1302
1303 // The result:
1304 // v4 = [
1305 // x34·r_0 + x33·r_1 + x32·r_2 + x31·r_3 + x30·r_4,
1306 // x24·r_0 + x23·r_1 + x22·r_2 + x21·r_3 + x20·r_4,
1307 // x14·r_0 + x13·r_1 + x12·r_2 + x11·r_3 + x10·r_4,
1308 // x04·r_0 + x03·r_1 + x02·r_2 + x01·r_3 + x00·r_4,
1309 // ]
1310 // v3 = [
1311 // x33·r_0 + x32·r_1 + x31·r_2 + x30·r_3 + 5·x34·r_4,
1312 // x23·r_0 + x22·r_1 + x21·r_2 + x20·r_3 + 5·x24·r_4,
1313 // x13·r_0 + x12·r_1 + x11·r_2 + x10·r_3 + 5·x14·r_4,
1314 // x03·r_0 + x02·r_1 + x01·r_2 + x00·r_3 + 5·x04·r_4,
1315 // ]
1316 // v2 = [
1317 // x32·r_0 + x31·r_1 + x30·r_2 + 5·x34·r_3 + 5·x33·r_4,
1318 // x22·r_0 + x21·r_1 + x20·r_2 + 5·x24·r_3 + 5·x23·r_4,
1319 // x12·r_0 + x11·r_1 + x10·r_2 + 5·x14·r_3 + 5·x13·r_4,
1320 // x02·r_0 + x01·r_1 + x00·r_2 + 5·x04·r_3 + 5·x03·r_4,
1321 // ]
1322 // v1 = [
1323 // x31·r_0 + x30·r_1 + 5·x34·r_2 + 5·x33·r_3 + 5·x32·r_4,
1324 // x21·r_0 + x20·r_1 + 5·x24·r_2 + 5·x23·r_3 + 5·x22·r_4,
1325 // x11·r_0 + x10·r_1 + 5·x14·r_2 + 5·x13·r_3 + 5·x12·r_4,
1326 // x01·r_0 + x00·r_1 + 5·x04·r_2 + 5·x03·r_3 + 5·x02·r_4,
1327 // ]
1328 // v0 = [
1329 // x30·r_0 + 5·x34·r_1 + 5·x33·r_2 + 5·x32·r_3 + 5·x31·r_4,
1330 // x20·r_0 + 5·x24·r_1 + 5·x23·r_2 + 5·x22·r_3 + 5·x21·r_4,
1331 // x10·r_0 + 5·x14·r_1 + 5·x13·r_2 + 5·x12·r_3 + 5·x11·r_4,
1332 // x00·r_0 + 5·x04·r_1 + 5·x03·r_2 + 5·x02·r_3 + 5·x01·r_4,
1333 // ]
1334 Unreduced4x130 { v0, v1, v2, v3, v4 }
1335 }
1336 }
1337}
1338
1339impl Mul<SpacedMultiplier4x130> for Aligned4x130 {
1340 type Output = Unreduced4x130;
1341
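    /// Multiplies the four 130-bit integers by four different key powers (one
    /// per 64-bit lane), producing four unreduced products.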
1342 #[inline(always)]
1343 fn mul(self, m: SpacedMultiplier4x130) -> Unreduced4x130 {
1344 unsafe {
1345 // Starting with the following limb layout:
1346 // x.v2 = [ _, x34, _, x24, _, x14, _, x04]
1347 // x.v1 = [ x33, x31, x23, x21, x13, x11, x03, x01]
1348 // x.v0 = [ x32, x30, x22, x20, x12, x10, x02, x00]
1349 // m.v1 = [ r2_4, r2_2, r2_0, r4_4, r4_3, r4_2, r4_1, r4_0]
1350 // m.v0 = [ r2_4, r2_3, r2_1, r3_4, r3_3, r3_2, r3_1, r3_0]
1351 // r1 = [5·r1_4, 5·r1_3, 5·r1_2, r1_4, r1_3, r1_2, r1_1, r1_0]
1352 let mut x = self;
1353 let r1 = m.r1.a;
1354
1355 // v0 = [r2_0, r2_1, r4_4, r3_4, r4_1, r3_1, r4_0, r3_0]
1356 // v1 = [r2_4, r2_4, r2_2, r2_3, r4_3, r3_3, r4_2, r3_2]
1357 let v0 = _mm256_unpacklo_epi32(m.v0, m.v1);
1358 let v1 = _mm256_unpackhi_epi32(m.v0, m.v1);
1359
1360 // m_r_0 = [r1_1, r1_0, r2_1, r2_0, r3_1, r3_0, r4_1, r4_0] -> ·rN_0
1361 // m_r_2 = [r1_3, r1_2, r2_3, r2_2, r3_3, r3_2, r4_3, r4_2] -> ·rN_2
1362 // m_r_4 = [r1_1, r1_4, r2_1, r2_4, r3_1, r3_4, r4_1, r4_4] -> ·rN_4
1363 let ord = _mm256_set_epi32(1, 0, 6, 7, 2, 0, 3, 1);
1364 let m_r_0 = _mm256_blend_epi32(
1365 _mm256_permutevar8x32_epi32(r1, ord),
1366 _mm256_permutevar8x32_epi32(v0, ord),
1367 0b00111111,
1368 );
1369 let ord = _mm256_set_epi32(3, 2, 4, 5, 2, 0, 3, 1);
1370 let m_r_2 = _mm256_blend_epi32(
1371 _mm256_permutevar8x32_epi32(r1, ord),
1372 _mm256_permutevar8x32_epi32(v1, ord),
1373 0b00111111,
1374 );
1375 let ord = _mm256_set_epi32(1, 4, 6, 6, 2, 4, 3, 5);
1376 let m_r_4 = _mm256_blend_epi32(
1377 _mm256_blend_epi32(
1378 _mm256_permutevar8x32_epi32(r1, ord),
1379 _mm256_permutevar8x32_epi32(v1, ord),
1380 0b00010000,
1381 ),
1382 _mm256_permutevar8x32_epi32(v0, ord),
1383 0b00101111,
1384 );
1385
1386 // v0 = [x30·r1_0, x20·r2_0, x10·r3_0, x00·r4_0]
1387 // v1 = [x31·r1_0, x21·r2_0, x11·r3_0, x01·r4_0]
1388 // v2 = [x30·r1_2, x20·r2_2, x10·r3_2, x00·r4_2]
1389 // v3 = [x31·r1_2, x21·r2_2, x11·r3_2, x01·r4_2]
1390 // v4 = [x30·r1_4, x20·r2_4, x10·r3_4, x00·r4_4]
1391 let mut v0 = _mm256_mul_epu32(x.v0, m_r_0); // xM0·rN_0
1392 let mut v1 = _mm256_mul_epu32(x.v1, m_r_0); // xM1·rN_0
1393 let mut v2 = _mm256_mul_epu32(x.v0, m_r_2); // xM0·rN_2
1394 let mut v3 = _mm256_mul_epu32(x.v1, m_r_2); // xM1·rN_2
1395 let mut v4 = _mm256_mul_epu32(x.v0, m_r_4); // xM0·rN_4
1396
1397 // m_r_1 = [r1_0, r1_1, r2_0, r2_1, r3_0, r3_1, r4_0, r4_1] -> ·rN_1
1398 // m_r_3 = [r1_2, r1_3, r2_2, r2_3, r3_2, r3_3, r4_2, r4_3] -> ·rN_3
1399 let ord = _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1);
1400 let m_r_1 = _mm256_permutevar8x32_epi32(m_r_0, ord);
1401 let m_r_3 = _mm256_permutevar8x32_epi32(m_r_2, ord);
1402
1403 // v1 = [
1404 // x31·r1_0 + x30·r1_1,
1405 // x21·r2_0 + x20·r2_1,
1406 // x11·r3_0 + x10·r3_1,
1407 // x01·r4_0 + x00·r4_1,
1408 // ]
1409 // v2 = [
1410 // x31·r1_1 + x30·r1_2,
1411 // x21·r2_1 + x20·r2_2,
1412 // x11·r3_1 + x10·r3_2,
1413 // x01·r4_1 + x00·r4_2,
1414 // ]
1415 // v3 = [
1416 // x31·r1_2 + x30·r1_3,
1417 // x21·r2_2 + x20·r2_3,
1418 // x11·r3_2 + x10·r3_3,
1419 // x01·r4_2 + x00·r4_3,
1420 // ]
1421 // v4 = [
1422 // x34·r1_0 + x31·r1_3 + x30·r1_4,
1423 // x24·r2_0 + x21·r2_3 + x20·r2_4,
1424 // x14·r3_0 + x11·r3_3 + x10·r3_4,
1425 // x04·r4_0 + x01·r4_3 + x00·r4_4,
1426 // ]
1427 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v0, m_r_1)); // + xM0·rN_1
1428 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v1, m_r_1)); // + xM1·rN_1
1429 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v0, m_r_3)); // + xM0·rN_3
1430 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v1, m_r_3)); // + xM1·rN_3
1431 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v2, m_r_0)); // + xM4·rN_0
1432
1433 // x.v0 = [x30, x32, x20, x22, x10, x12, x00, x02]
1434 x.v0 = _mm256_permutevar8x32_epi32(x.v0, ord);
1435
1436 // v2 = [
1437 // x32·r1_0 + x31·r1_1 + x30·r1_2,
1438 // x22·r2_0 + x21·r2_1 + x20·r2_2,
1439 // x12·r3_0 + x11·r3_1 + x10·r3_2,
1440 // x02·r4_0 + x01·r4_1 + x00·r4_2,
1441 // ]
1442 // v3 = [
1443 // x32·r1_1 + x31·r1_2 + x30·r1_3,
1444 // x22·r2_1 + x21·r2_2 + x20·r2_3,
1445 // x12·r3_1 + x11·r3_2 + x10·r3_3,
1446 // x02·r4_1 + x01·r4_2 + x00·r4_3,
1447 // ]
1448 // v4 = [
1449 // x34·r1_0 + x32·r1_2 + x31·r1_3 + x30·r1_4,
1450 // x24·r2_0 + x22·r2_2 + x21·r2_3 + x20·r2_4,
1451 // x14·r3_0 + x12·r3_2 + x11·r3_3 + x10·r3_4,
1452 // x04·r4_0 + x02·r4_2 + x01·r4_3 + x00·r4_4,
1453 // ]
1454 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v0, m_r_0)); // + xM2·rN_0
1455 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v0, m_r_1)); // + xM2·rN_1
1456 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v0, m_r_2)); // + xM2·rN_2
1457
1458 // m_5r_3 = [5·r1_2, 5·r1_3, 5·r2_2, 5·r2_3, 5·r3_2, 5·r3_3, 5·r4_2, 5·r4_3] -> ·5·rN_3
1459 // m_5r_4 = [5·r1_1, 5·r1_4, 5·r2_1, 5·r2_4, 5·r3_1, 5·r3_4, 5·r4_1, 5·r4_4] -> ·5·rN_4
1460 let m_5r_3 = _mm256_add_epi32(m_r_3, _mm256_slli_epi32(m_r_3, 2));
1461 let m_5r_4 = _mm256_add_epi32(m_r_4, _mm256_slli_epi32(m_r_4, 2));
1462
1463 // v0 = [
1464 // x30·r1_0 + 5·x32·r1_3 + 5·x31·r1_4,
1465 // x20·r2_0 + 5·x22·r2_3 + 5·x21·r2_4,
1466 // x10·r3_0 + 5·x12·r3_3 + 5·x11·r3_4,
1467 // x00·r4_0 + 5·x02·r4_3 + 5·x01·r4_4,
1468 // ]
1469 // v1 = [
1470 // x31·r1_0 + x30·r1_1 + 5·x32·r1_4,
1471 // x21·r2_0 + x20·r2_1 + 5·x22·r2_4,
1472 // x11·r3_0 + x10·r3_1 + 5·x12·r3_4,
1473 // x01·r4_0 + x00·r4_1 + 5·x02·r4_4,
1474 // ]
1475 // v2 = [
1476 // x32·r1_0 + x31·r1_1 + x30·r1_2 + 5·x34·r1_3,
1477 // x22·r2_0 + x21·r2_1 + x20·r2_2 + 5·x24·r2_3,
1478 // x12·r3_0 + x11·r3_1 + x10·r3_2 + 5·x14·r3_3,
1479 // x02·r4_0 + x01·r4_1 + x00·r4_2 + 5·x04·r4_3,
1480 // ]
1481 // v3 = [
1482 // x32·r1_1 + x31·r1_2 + x30·r1_3 + 5·x34·r1_4,
1483 // x22·r2_1 + x21·r2_2 + x20·r2_3 + 5·x24·r2_4,
1484 // x12·r3_1 + x11·r3_2 + x10·r3_3 + 5·x14·r3_4,
1485 // x02·r4_1 + x01·r4_2 + x00·r4_3 + 5·x04·r4_4,
1486 // ]
1487 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v0, m_5r_3)); // + 5·xM2·rN_3
1488 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v1, m_5r_4)); // + 5·xM1·rN_4
1489 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v0, m_5r_4)); // + 5·xM2·rN_4
1490 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v2, m_5r_3)); // + 5·xM4·rN_3
1491 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v2, m_5r_4)); // + 5·xM4·rN_4
1492
1493 // x.v1 = [x31, x33, x21, x23, x11, x13, x01, x03]
1494 x.v1 = _mm256_permutevar8x32_epi32(x.v1, ord);
1495
1496 // v1 = [
1497 // x31·r1_0 + x30·r1_1 + 5·x33·r1_3 + 5·x32·r1_4,
1498 // x21·r2_0 + x20·r2_1 + 5·x23·r2_3 + 5·x22·r2_4,
1499 // x11·r3_0 + x10·r3_1 + 5·x13·r3_3 + 5·x12·r3_4,
1500 // x01·r4_0 + x00·r4_1 + 5·x03·r4_3 + 5·x02·r4_4,
1501 // ]
1502 // v2 = [
1503 // x32·r1_0 + x31·r1_1 + x30·r1_2 + 5·x34·r1_3 + 5·x33·r1_4,
1504 // x22·r2_0 + x21·r2_1 + x20·r2_2 + 5·x24·r2_3 + 5·x23·r2_4,
1505 // x12·r3_0 + x11·r3_1 + x10·r3_2 + 5·x14·r3_3 + 5·x13·r3_4,
1506 // x02·r4_0 + x01·r4_1 + x00·r4_2 + 5·x04·r4_3 + 5·x03·r4_4,
1507 // ]
1508 // v3 = [
1509 // x33·r1_0 + x32·r1_1 + x31·r1_2 + x30·r1_3 + 5·x34·r1_4,
1510 // x23·r2_0 + x22·r2_1 + x21·r2_2 + x20·r2_3 + 5·x24·r2_4,
1511 // x13·r3_0 + x12·r3_1 + x11·r3_2 + x10·r3_3 + 5·x14·r3_4,
1512 // x03·r4_0 + x02·r4_1 + x01·r4_2 + x00·r4_3 + 5·x04·r4_4,
1513 // ]
1514 // v4 = [
1515 // x34·r1_0 + x33·r1_1 + x32·r1_2 + x31·r1_3 + x30·r1_4,
1516 // x24·r2_0 + x23·r2_1 + x22·r2_2 + x21·r2_3 + x20·r2_4,
1517 // x14·r3_0 + x13·r3_1 + x12·r3_2 + x11·r3_3 + x10·r3_4,
1518 // x04·r4_0 + x03·r4_1 + x02·r4_2 + x01·r4_3 + x00·r4_4,
1519 // ]
1520 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v1, m_5r_3)); // + 5·xM3·rN_3
1521 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v1, m_5r_4)); // + 5·xM3·rN_4
1522 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v1, m_r_0)); // + xM3·rN_0
1523 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v1, m_r_1)); // + xM3·rN_1
1524
1525 // m_5r_1 = [5·r1_4, 5·r1_1, 5·r2_4, 5·r2_1, 5·r3_4, 5·r3_1, 5·r4_4, 5·r4_1] -> ·5·rN_1
1526 // m_5r_2 = [5·r1_3, 5·r1_2, 5·r2_3, 5·r2_2, 5·r3_3, 5·r3_2, 5·r4_3, 5·r4_2] -> ·5·rN_2
1527 let m_5r_1 = _mm256_permutevar8x32_epi32(m_5r_4, ord);
1528 let m_5r_2 = _mm256_permutevar8x32_epi32(m_5r_3, ord);
1529
1530 // v0 = [
1531 // x30·r1_0 + 5·x34·r1_1 + 5·x33·r1_2 + 5·x32·r1_3 + 5·x31·r1_4,
1532 // x20·r2_0 + 5·x24·r2_1 + 5·x23·r2_2 + 5·x22·r2_3 + 5·x21·r2_4,
1533 // x10·r3_0 + 5·x14·r3_1 + 5·x13·r3_2 + 5·x12·r3_3 + 5·x11·r3_4,
1534 // x00·r4_0 + 5·x04·r4_1 + 5·x03·r4_2 + 5·x02·r4_3 + 5·x01·r4_4,
1535 // ]
1536 // v1 = [
1537 // x31·r1_0 + x30·r1_1 + 5·x34·r1_2 + 5·x33·r1_3 + 5·x32·r1_4,
1538 // x21·r2_0 + x20·r2_1 + 5·x24·r2_2 + 5·x23·r2_3 + 5·x22·r2_4,
1539 // x11·r3_0 + x10·r3_1 + 5·x14·r3_2 + 5·x13·r3_3 + 5·x12·r3_4,
1540 // x01·r4_0 + x00·r4_1 + 5·x04·r4_2 + 5·x03·r4_3 + 5·x02·r4_4,
1541 // ]
1542 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v1, m_5r_2)); // + 5·xM3·rN_2
1543 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v2, m_5r_1)); // + 5·xM4·rN_1
1544 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v2, m_5r_2)); // + 5·xM4·rN_2
1545
1546 // The result:
1547 // v4 = [
1548 // x34·r1_0 + x33·r1_1 + x32·r1_2 + x31·r1_3 + x30·r1_4,
1549 // x24·r2_0 + x23·r2_1 + x22·r2_2 + x21·r2_3 + x20·r2_4,
1550 // x14·r3_0 + x13·r3_1 + x12·r3_2 + x11·r3_3 + x10·r3_4,
1551 // x04·r4_0 + x03·r4_1 + x02·r4_2 + x01·r4_3 + x00·r4_4,
1552 // ]
1553 // v3 = [
1554 // x33·r1_0 + x32·r1_1 + x31·r1_2 + x30·r1_3 + 5·x34·r1_4,
1555 // x23·r2_0 + x22·r2_1 + x21·r2_2 + x20·r2_3 + 5·x24·r2_4,
1556 // x13·r3_0 + x12·r3_1 + x11·r3_2 + x10·r3_3 + 5·x14·r3_4,
1557 // x03·r4_0 + x02·r4_1 + x01·r4_2 + x00·r4_3 + 5·x04·r4_4,
1558 // ]
1559 // v2 = [
1560 // x32·r1_0 + x31·r1_1 + x30·r1_2 + 5·x34·r1_3 + 5·x33·r1_4,
1561 // x22·r2_0 + x21·r2_1 + x20·r2_2 + 5·x24·r2_3 + 5·x23·r2_4,
1562 // x12·r3_0 + x11·r3_1 + x10·r3_2 + 5·x14·r3_3 + 5·x13·r3_4,
1563 // x02·r4_0 + x01·r4_1 + x00·r4_2 + 5·x04·r4_3 + 5·x03·r4_4,
1564 // ]
1565 // v1 = [
1566 // x31·r1_0 + x30·r1_1 + 5·x34·r1_2 + 5·x33·r1_3 + 5·x32·r1_4,
1567 // x21·r2_0 + x20·r2_1 + 5·x24·r2_2 + 5·x23·r2_3 + 5·x22·r2_4,
1568 // x11·r3_0 + x10·r3_1 + 5·x14·r3_2 + 5·x13·r3_3 + 5·x12·r3_4,
1569 // x01·r4_0 + x00·r4_1 + 5·x04·r4_2 + 5·x03·r4_3 + 5·x02·r4_4,
1570 // ]
1571 // v0 = [
1572 // x30·r1_0 + 5·x34·r1_1 + 5·x33·r1_2 + 5·x32·r1_3 + 5·x31·r1_4,
1573 // x20·r2_0 + 5·x24·r2_1 + 5·x23·r2_2 + 5·x22·r2_3 + 5·x21·r2_4,
1574 // x10·r3_0 + 5·x14·r3_1 + 5·x13·r3_2 + 5·x12·r3_3 + 5·x11·r3_4,
1575 // x00·r4_0 + 5·x04·r4_1 + 5·x03·r4_2 + 5·x02·r4_3 + 5·x01·r4_4,
1576 // ]
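            // Each 64-bit lane now holds an unreduced sum of limb products; the
            // carries are propagated later (see Unreduced4x130::reduce).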
1577 Unreduced4x130 { v0, v1, v2, v3, v4 }
1578 }
1579 }
1580}
1581
1582/// The unreduced output of an Aligned4x130 multiplication.
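///
/// Each of the five vectors corresponds to one limb position of the 130-bit result;
/// its four 64-bit lanes are the (still uncarried) accumulators for the four blocks
/// processed in parallel.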
1583#[derive(Clone, Debug)]
1584pub(super) struct Unreduced4x130 {
1585 v0: __m256i,
1586 v1: __m256i,
1587 v2: __m256i,
1588 v3: __m256i,
1589 v4: __m256i,
1590}
1591
1592impl fmt::Display for Unreduced4x130 {
1593 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1594 let mut v0 = [0u8; 32];
1595 let mut v1 = [0u8; 32];
1596 let mut v2 = [0u8; 32];
1597 let mut v3 = [0u8; 32];
1598 let mut v4 = [0u8; 32];
1599 unsafe {
1600 _mm256_storeu_si256(v0.as_mut_ptr() as *mut _, self.v0);
1601 _mm256_storeu_si256(v1.as_mut_ptr() as *mut _, self.v1);
1602 _mm256_storeu_si256(v2.as_mut_ptr() as *mut _, self.v2);
1603 _mm256_storeu_si256(v3.as_mut_ptr() as *mut _, self.v3);
1604 _mm256_storeu_si256(v4.as_mut_ptr() as *mut _, self.v4);
1605 }
1606
1607 writeln!(f, "Unreduced4x130([")?;
1608 write!(f, " ")?;
1609 write_130_wide(
1610 f,
1611 [
1612 u64::from_le_bytes(v0[0..8].try_into().unwrap()),
1613 u64::from_le_bytes(v1[0..8].try_into().unwrap()),
1614 u64::from_le_bytes(v2[0..8].try_into().unwrap()),
1615 u64::from_le_bytes(v3[0..8].try_into().unwrap()),
1616 u64::from_le_bytes(v4[0..8].try_into().unwrap()),
1617 ],
1618 )?;
1619 writeln!(f, ",")?;
1620 write!(f, " ")?;
1621 write_130_wide(
1622 f,
1623 [
1624 u64::from_le_bytes(v0[8..16].try_into().unwrap()),
1625 u64::from_le_bytes(v1[8..16].try_into().unwrap()),
1626 u64::from_le_bytes(v2[8..16].try_into().unwrap()),
1627 u64::from_le_bytes(v3[8..16].try_into().unwrap()),
1628 u64::from_le_bytes(v4[8..16].try_into().unwrap()),
1629 ],
1630 )?;
1631 writeln!(f, ",")?;
1632 write!(f, " ")?;
1633 write_130_wide(
1634 f,
1635 [
1636 u64::from_le_bytes(v0[16..24].try_into().unwrap()),
1637 u64::from_le_bytes(v1[16..24].try_into().unwrap()),
1638 u64::from_le_bytes(v2[16..24].try_into().unwrap()),
1639 u64::from_le_bytes(v3[16..24].try_into().unwrap()),
1640 u64::from_le_bytes(v4[16..24].try_into().unwrap()),
1641 ],
1642 )?;
1643 writeln!(f, ",")?;
1644 write!(f, " ")?;
1645 write_130_wide(
1646 f,
1647 [
1648 u64::from_le_bytes(v0[24..32].try_into().unwrap()),
1649 u64::from_le_bytes(v1[24..32].try_into().unwrap()),
1650 u64::from_le_bytes(v2[24..32].try_into().unwrap()),
1651 u64::from_le_bytes(v3[24..32].try_into().unwrap()),
1652 u64::from_le_bytes(v4[24..32].try_into().unwrap()),
1653 ],
1654 )?;
1655 writeln!(f, ",")?;
1656 write!(f, "])")
1657 }
1658}
1659
1660impl Unreduced4x130 {
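    /// Carries the four products back down to 26-bit limbs, folding anything above
    /// 2^130 back in via 2^130 ≡ 5 (mod 2^130 - 5).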
1661 #[inline(always)]
1662 pub(super) fn reduce(self) -> Aligned4x130 {
1663 unsafe {
1664 // Starting with the following limb layout across 64-bit words:
1665 // x.v4 = [u34, u24, u14, u04]
1666 // x.v3 = [u33, u23, u13, u03]
1667 // x.v2 = [u32, u22, u12, u02]
1668 // x.v1 = [u31, u21, u11, u01]
1669 // x.v0 = [u30, u20, u10, u00]
1670 let x = self;
1671
1672 // 26-bit mask on each 64-bit word.
1673 let mask_26 = _mm256_set1_epi64x(0x3ffffff);
1674
1675 // Carry from x0 up into x1, returning their new values.
1676 let adc = |x1: __m256i, x0: __m256i| -> (__m256i, __m256i) {
1677 let y1 = _mm256_add_epi64(x1, _mm256_srli_epi64(x0, 26));
1678 let y0 = _mm256_and_si256(x0, mask_26);
1679 (y1, y0)
1680 };
1681
1682 // Reduce modulo 2^130 - 5 from x4 down into x0, returning their new values.
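            // (2^130 ≡ 5 mod 2^130 - 5, so the bits of x4 above its low 26 can be
            // multiplied by 5 and added into x0 instead of being carried out the top.)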
1683 let red = |x4: __m256i, x0: __m256i| -> (__m256i, __m256i) {
1684 let y0 = _mm256_add_epi64(
1685 x0,
1686 _mm256_mul_epu32(_mm256_srli_epi64(x4, 26), _mm256_set1_epi64x(5)),
1687 );
1688 let y4 = _mm256_and_si256(x4, mask_26);
1689 (y4, y0)
1690 };
1691
1692 // Reduce the four integers in parallel to below 2^130.
1693 let (red_1, red_0) = adc(x.v1, x.v0);
1694 let (red_4, red_3) = adc(x.v4, x.v3);
1695 let (red_2, red_1) = adc(x.v2, red_1);
1696 let (red_4, red_0) = red(red_4, red_0);
1697 let (red_3, red_2) = adc(red_3, red_2);
1698 let (red_1, red_0) = adc(red_1, red_0);
1699 let (red_4, red_3) = adc(red_4, red_3);
1700
1701 // At this point, all limbs are contained within the lower 32 bits of each
1702 // 64-bit word. The upper limb of each integer (in red_4) is positioned
1703 // correctly for Aligned4x130, but the other limbs need to be blended
1704 // together:
1705 // - v0 contains limbs 0 and 2.
1706 // - v1 contains limbs 1 and 3.
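            // Shifting red_2 / red_3 left by 32 bits moves those limbs into the odd
            // 32-bit words of each lane, so the 0b10101010 blends interleave them with
            // red_0 / red_1 in the even words.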
1707 Aligned4x130 {
1708 v0: _mm256_blend_epi32(red_0, _mm256_slli_epi64(red_2, 32), 0b10101010),
1709 v1: _mm256_blend_epi32(red_1, _mm256_slli_epi64(red_3, 32), 0b10101010),
1710 v2: red_4,
1711 }
1712 }
1713 }
1714
1715 /// Returns the unreduced sum of the four 130-bit integers.
1716 #[inline(always)]
1717 pub(super) fn sum(self) -> Unreduced130 {
1718 unsafe {
1719 // Starting with the following limb layout across 64-bit words:
1720 // x.v4 = [u34, u24, u14, u04]
1721 // x.v3 = [u33, u23, u13, u03]
1722 // x.v2 = [u32, u22, u12, u02]
1723 // x.v1 = [u31, u21, u11, u01]
1724 // x.v0 = [u30, u20, u10, u00]
1725 let x = self;
1726
1727 // v0 = [
1728 // u31 + u21,
1729 // u30 + u20,
1730 // u11 + u01,
1731 // u10 + u00,
1732 // ]
1733 let v0 = _mm256_add_epi64(
1734 _mm256_unpackhi_epi64(x.v0, x.v1),
1735 _mm256_unpacklo_epi64(x.v0, x.v1),
1736 );
1737
1738 // v1 = [
1739 // u33 + u23,
1740 // u32 + u22,
1741 // u13 + u03,
1742 // u12 + u02,
1743 // ]
1744 let v1 = _mm256_add_epi64(
1745 _mm256_unpackhi_epi64(x.v2, x.v3),
1746 _mm256_unpacklo_epi64(x.v2, x.v3),
1747 );
1748
1749 // v0 = [
1750 // u33 + u23 + u13 + u03,
1751 // u32 + u22 + u12 + u02,
1752 // u31 + u21 + u11 + u01,
1753 // u30 + u20 + u10 + u00,
1754 // ]
1755 let v0 = _mm256_add_epi64(
1756 _mm256_inserti128_si256(v0, _mm256_castsi256_si128(v1), 1),
1757 _mm256_inserti128_si256(v1, _mm256_extractf128_si256(v0, 1), 0),
1758 );
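            // (The insert/extract pair regroups the 128-bit halves so that the partial
            // sums for the same limb positions land in the same lanes before adding.)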
1759
1760 // v1 = [
1761 // u34 + u14,
1762 // u24 + u04,
1763 // u14 + u34,
1764 // u04 + u24,
1765 // ]
1766 let v1 = _mm256_add_epi64(x.v4, _mm256_permute4x64_epi64(x.v4, set02(1, 0, 3, 2)));
1767
1768 // v1 = [
1769 // u34 + u24 + u14 + u04,
1770 // u24 + u24 + u04 + u04,
1771 // u34 + u24 + u14 + u04,
1772 // u34 + u24 + u14 + u04,
1773 // ]
1774 let v1 = _mm256_add_epi64(v1, _mm256_permute4x64_epi64(v1, set02(0, 0, 0, 1)));
1775
1776 // The result:
1777 // v1 = [
1778 // u34 + u24 + u14 + u04,
1779 // u24 + u24 + u04 + u04,
1780 // u34 + u24 + u14 + u04,
1781 // u34 + u24 + u14 + u04,
1782 // ]
1783 // v0 = [
1784 // u33 + u23 + u13 + u03,
1785 // u32 + u22 + u12 + u02,
1786 // u31 + u21 + u11 + u01,
1787 // u30 + u20 + u10 + u00,
1788 // ]
1789 // This corresponds to:
1790 // v1 = [ _, _, _, t_4]
1791 // v0 = [t_3, t_2, t_1, t_0]
1792 Unreduced130 { v0, v1 }
1793 }
1794 }
1795}
1796
1797#[derive(Clone, Copy, Debug)]
1798pub(super) struct AdditionKey(__m256i);
1799
1800impl Add<Aligned130> for AdditionKey {
1801 type Output = IntegerTag;
1802
1803 /// Computes x + k mod 2^128
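    ///
    /// This is the Poly1305 finalization step: the accumulator is reduced modulo
    /// 2^130 - 5, added to the addition key, and truncated to 128 bits to form the tag.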
1804 #[inline(always)]
1805 fn add(self, x: Aligned130) -> IntegerTag {
1806 unsafe {
1807 // Starting with the following limb layout:
1808 // x = [0, _, _, x4, x3, x2, x1, x0]
1809 // k = [0, k7, 0, k6, 0, k5, 0, k4]
1810 let mut x = _mm256_and_si256(x.0, _mm256_set_epi32(0, 0, 0, -1, -1, -1, -1, -1));
1811 let k = self.0;
1812
1813            /// One pass of carry propagation, folding the top limb's carry back into limb 0 (multiplied by 5).
1814 unsafe fn propagate_carry(x: __m256i) -> __m256i {
1815 // t = [
1816 // 0,
1817 // 0,
1818 // 0,
1819 // x3 >> 26,
1820 // x2 >> 26,
1821 // x1 >> 26,
1822 // x0 >> 26,
1823 // x4 >> 26,
1824 // ];
1825 let t = _mm256_permutevar8x32_epi32(
1826 _mm256_srli_epi32(x, 26),
1827 _mm256_set_epi32(7, 7, 7, 3, 2, 1, 0, 4),
1828 );
1829
1830 // [
1831 // 0,
1832 // 0,
1833 // 0,
1834 // x4 % 2^26,
1835 // x3 % 2^26,
1836 // x2 % 2^26,
1837 // x1 % 2^26,
1838 // x0 % 2^26,
1839 // ]
1840 // + t + [0, 0, 0, 0, 0, 0, 0, 4·(x4 >> 26)]
1841 // = [
1842 // 0,
1843 // 0,
1844 // 0,
1845 // x4 % 2^26 + x3 >> 26,
1846 // x3 % 2^26 + x2 >> 26,
1847 // x2 % 2^26 + x1 >> 26,
1848 // x1 % 2^26 + x0 >> 26,
1849 // x0 % 2^26 + 5·(x4 >> 26),
1850 // ] => [0, 0, 0, x4, x3, x2, x1, x0]
1851 _mm256_add_epi32(
1852 _mm256_add_epi32(
1853 _mm256_and_si256(
1854 x,
1855 _mm256_set_epi32(
1856 0, 0, 0, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff,
1857 ),
1858 ),
1859 t,
1860 ),
1861 _mm256_permutevar8x32_epi32(
1862 _mm256_slli_epi32(t, 2),
1863 _mm256_set_epi32(7, 7, 7, 7, 7, 7, 7, 0),
1864 ),
1865 )
1866 }
1867
1868            // Reduce modulo 2^130 - 5:
1869 // - Reduce to an integer below 2^130:
1870 // TODO: Is it more efficient to unpack the limbs for this?
1871 for _ in 0..5 {
1872 x = propagate_carry(x);
1873 }
1874
1875 // - Compute x + -p by adding 5 and carrying up to the top limb:
1876 // g = [0, 0, 0, g4, g3, g2, g1, g0]
1877 let mut g = _mm256_add_epi32(x, _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 5));
1878 // TODO: Is it more efficient to unpack the limbs for this?
1879 for _ in 0..4 {
1880 g = propagate_carry(g);
1881 }
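            // Subtracting 2^26 from the top limb subtracts 2^130 overall, so g is now
            // x + 5 - 2^130 = x - (2^130 - 5) = x - p.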
1882 let g = _mm256_sub_epi32(g, _mm256_set_epi32(0, 0, 0, 1 << 26, 0, 0, 0, 0));
1883
1884            // - Check whether the subtraction borrowed (g4 went negative, i.e. x < p):
1885 let mask = _mm256_permutevar8x32_epi32(
1886 _mm256_sub_epi32(_mm256_srli_epi32(g, 32 - 1), _mm256_set1_epi32(1)),
1887 _mm256_set1_epi32(4),
1888 );
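            // mask is all-ones in every lane when g4 is non-negative (no borrow, so
            // x >= p), and all-zeroes when it went negative (x < p).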
1889
1890            // - Select x if the subtraction borrowed (x < p), otherwise g:
1891 let x = _mm256_or_si256(
1892 _mm256_and_si256(x, _mm256_xor_si256(mask, _mm256_set1_epi32(-1))),
1893 _mm256_and_si256(g, mask),
1894 );
1895
1896 // Align back to 32 bits per digit. We drop the top two bits of the top limb,
1897 // because we only care about the lower 128 bits from here onward, and don't
1898 // need to track overflow or reduce.
1899 // [
1900 // 0,
1901 // 0,
1902 // 0,
1903 // 0,
1904 // x4[24..0] || x3[26..18],
1905 // x3[18..0] || x2[26..12],
1906 // x2[12..0] || x1[26.. 6],
1907 // x1[ 6..0] || x0[26.. 0],
1908 // ]
1909 let x = _mm256_or_si256(
1910 _mm256_srlv_epi32(x, _mm256_set_epi32(32, 32, 32, 32, 18, 12, 6, 0)),
1911 _mm256_permutevar8x32_epi32(
1912 _mm256_sllv_epi32(x, _mm256_set_epi32(32, 32, 32, 8, 14, 20, 26, 32)),
1913 _mm256_set_epi32(7, 7, 7, 7, 4, 3, 2, 1),
1914 ),
1915 );
1916
1917 // Add key
1918 // [
1919 // (x4[24..0] || x3[26..18]) + k7,
1920 // (x3[18..0] || x2[26..12]) + k6,
1921 // (x2[12..0] || x1[26.. 6]) + k5,
1922 // (x1[ 6..0] || x0[26.. 0]) + k4,
1923 // ]
1924 let mut x = _mm256_add_epi64(
1925 _mm256_permutevar8x32_epi32(x, _mm256_set_epi32(7, 3, 7, 2, 7, 1, 7, 0)),
1926 k,
1927 );
1928
1929 // Ensure that all carries are handled
1930 unsafe fn propagate_carry_32(x: __m256i) -> __m256i {
1931 // [
1932 // (l4 % 2^32) + (l3 >> 32),
1933 // (l3 % 2^32) + (l2 >> 32),
1934 // (l2 % 2^32) + (l1 >> 32),
1935 // (l1 % 2^32),
1936 // ]
1937 _mm256_add_epi64(
1938 _mm256_and_si256(x, _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1)),
1939 _mm256_permute4x64_epi64(
1940 _mm256_and_si256(
1941 _mm256_srli_epi64(x, 32),
1942 _mm256_set_epi64x(0, -1, -1, -1),
1943 ),
1944 set02(2, 1, 0, 3),
1945 ),
1946 )
1947 }
1948 for _ in 0..3 {
1949 x = propagate_carry_32(x);
1950 }
1951
1952 // Now that all limbs are at most 32 bits, realign from 64- to 32-bit limbs.
1953 // [
1954 // 0,
1955 // 0,
1956 // 0,
1957 // 0,
1958 // ((x4[24..0] || x3[26..18]) + k7) % 2^32 + ((x3[18..0] || x2[26..12]) + k6) >> 32,
1959 // ((x3[18..0] || x2[26..12]) + k6) % 2^32 + ((x2[12..0] || x1[26.. 6]) + k5) >> 32,
1960 // ((x2[12..0] || x1[26.. 6]) + k5) % 2^32 + ((x1[ 6..0] || x0[26.. 0]) + k4) >> 32,
1961 // ((x1[ 6..0] || x0[26.. 0]) + k4) % 2^32,
1962 // ]
1963 let x = _mm256_permutevar8x32_epi32(x, _mm256_set_epi32(7, 7, 7, 7, 6, 4, 2, 0));
1964
1965            // Reduce modulo 2^128 by keeping only the lower 128 bits.
1966 IntegerTag(_mm256_castsi256_si128(x))
1967 }
1968 }
1969}
1970
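/// A Poly1305 tag, held as a 128-bit little-endian integer in an SSE register.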
1971pub(super) struct IntegerTag(__m128i);
1972
1973impl From<AdditionKey> for IntegerTag {
1974 fn from(k: AdditionKey) -> Self {
1975 unsafe {
1976 // There was no polynomial to add.
1977 IntegerTag(_mm256_castsi256_si128(_mm256_permutevar8x32_epi32(
1978 k.0,
1979 _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0),
1980 )))
1981 }
1982 }
1983}
1984
1985impl IntegerTag {
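    /// Writes the 16-byte tag to the start of `tag`.
    ///
    /// `tag` must be at least 16 bytes long.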
1986 pub(super) fn write(self, tag: &mut [u8]) {
1987 unsafe {
1988 _mm_storeu_si128(tag.as_mut_ptr() as *mut _, self.0);
1989 }
1990 }
1991}
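
// A minimal test sketch, added here as an illustration rather than taken from the
// upstream backend: it only checks that `IntegerTag::write` stores the register's
// 16 bytes in little-endian order. The module and test names are new; only baseline
// SSE2 intrinsics are used, so no runtime AVX2 detection is required.
#[cfg(test)]
mod tests {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::_mm_set_epi32;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::_mm_set_epi32;

    use super::IntegerTag;

    #[test]
    fn integer_tag_write_layout() {
        // Lanes are given from most- to least-significant 32-bit word; the store is
        // little-endian, so byte 0 comes from the low word and byte 15 from the high.
        let tag = IntegerTag(unsafe {
            _mm_set_epi32(0x0d0c0b0a, 0x09080706, 0x05040302, 0x01000000)
        });
        let mut out = [0u8; 16];
        tag.write(&mut out);
        assert_eq!(out[0], 0x00);
        assert_eq!(out[3], 0x01);
        assert_eq!(out[12], 0x0a);
        assert_eq!(out[15], 0x0d);
    }
}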