1use super::*;
2
// x86-64-v3 capability token. Holding a `V3` value is proof that every
// feature listed below (SSE..SSE4.2, AVX, AVX2, FMA, BMI1/2, POPCNT, LZCNT)
// is available, so the wrapped intrinsics exposed through its fields are
// safe to call. The `simd_type!` macro generates the struct together with
// its detection/construction machinery.
simd_type!({
    #[allow(missing_docs)]
    pub struct V3 {
        pub sse: f!("sse"),
        pub sse2: f!("sse2"),
        pub fxsr: f!("fxsr"),
        pub sse3: f!("sse3"),
        pub ssse3: f!("ssse3"),
        pub sse4_1: f!("sse4.1"),
        pub sse4_2: f!("sse4.2"),
        pub popcnt: f!("popcnt"),
        pub avx: f!("avx"),
        pub avx2: f!("avx2"),
        pub bmi1: f!("bmi1"),
        pub bmi2: f!("bmi2"),
        pub fma: f!("fma"),
        pub lzcnt: f!("lzcnt"),
    }
});
29
// Emulates a full 32-byte table lookup ("cross-lane pshufb"): for each byte
// index i in `idxs`, the result byte is `bytes[i]` when 0 <= i < 32 and 0
// otherwise. `_mm256_shuffle_epi8` only shuffles within each 128-bit lane,
// so the table is broadcast as hi|hi and lo|lo and the two partial shuffles
// are blended using signed byte compares against 16 and 32.
#[inline(always)]
fn avx2_pshufb(simd: V3, bytes: __m256i, idxs: __m256i) -> __m256i {
    let mid = simd.avx._mm256_set1_epi8(16i8);
    let high = simd.avx._mm256_set1_epi8(32i8);
    // High 128-bit half of `bytes` duplicated into both lanes.
    let hihi = simd.avx2._mm256_permute2x128_si256::<0x11>(bytes, bytes);
    let hi_shuf = simd.avx2._mm256_shuffle_epi8(
        hihi, idxs, );
    // Keep the hi-table result only where idx < 32 (signed); zero elsewhere.
    let compose = simd.avx2._mm256_blendv_epi8(
        simd.avx._mm256_set1_epi8(0),
        hi_shuf,
        simd.avx2._mm256_cmpgt_epi8(high, idxs),
    );
    // Low 128-bit half duplicated into both lanes; wins where idx < 16.
    let lolo = simd.avx2._mm256_permute2x128_si256::<0x00>(bytes, bytes);
    let lo_shuf = simd.avx2._mm256_shuffle_epi8(lolo, idxs);
    simd.avx2
        ._mm256_blendv_epi8(compose, lo_shuf, simd.avx2._mm256_cmpgt_epi8(mid, idxs))
}
58
// `AVX2_ROTATE_IDX[k]` holds the byte-shuffle indices that, when fed to
// `avx2_pshufb`, rotate a 32-byte vector right by `k` bytes (destination
// byte i reads source byte (i + 32 - k) % 32). The `rotate_right_*` methods
// index this table by `element_size_in_bytes * (amount % lane_count)`.
static AVX2_ROTATE_IDX: [u8x32; 32] = [
    u8x32(
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
        25, 26, 27, 28, 29, 30, 31,
    ),
    u8x32(
        31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
        24, 25, 26, 27, 28, 29, 30,
    ),
    u8x32(
        30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
        23, 24, 25, 26, 27, 28, 29,
    ),
    u8x32(
        29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
        22, 23, 24, 25, 26, 27, 28,
    ),
    u8x32(
        28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
        21, 22, 23, 24, 25, 26, 27,
    ),
    u8x32(
        27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26,
    ),
    u8x32(
        26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25,
    ),
    u8x32(
        25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24,
    ),
    u8x32(
        24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23,
    ),
    u8x32(
        23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22,
    ),
    u8x32(
        22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21,
    ),
    u8x32(
        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
        14, 15, 16, 17, 18, 19, 20,
    ),
    u8x32(
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
        13, 14, 15, 16, 17, 18, 19,
    ),
    u8x32(
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
        12, 13, 14, 15, 16, 17, 18,
    ),
    u8x32(
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
        11, 12, 13, 14, 15, 16, 17,
    ),
    u8x32(
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
        10, 11, 12, 13, 14, 15, 16,
    ),
    u8x32(
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8,
        9, 10, 11, 12, 13, 14, 15,
    ),
    u8x32(
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7,
        8, 9, 10, 11, 12, 13, 14,
    ),
    u8x32(
        14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5,
        6, 7, 8, 9, 10, 11, 12, 13,
    ),
    u8x32(
        13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4,
        5, 6, 7, 8, 9, 10, 11, 12,
    ),
    u8x32(
        12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3,
        4, 5, 6, 7, 8, 9, 10, 11,
    ),
    u8x32(
        11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1,
        2, 3, 4, 5, 6, 7, 8, 9, 10,
    ),
    u8x32(
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,
        1, 2, 3, 4, 5, 6, 7, 8, 9,
    ),
    u8x32(
        9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        0, 1, 2, 3, 4, 5, 6, 7, 8,
    ),
    u8x32(
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
        31, 0, 1, 2, 3, 4, 5, 6, 7,
    ),
    u8x32(
        7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
        30, 31, 0, 1, 2, 3, 4, 5, 6,
    ),
    u8x32(
        6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
        30, 31, 0, 1, 2, 3, 4, 5,
    ),
    u8x32(
        5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
        29, 30, 31, 0, 1, 2, 3, 4,
    ),
    u8x32(
        4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
        28, 29, 30, 31, 0, 1, 2, 3,
    ),
    u8x32(
        3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
        27, 28, 29, 30, 31, 0, 1, 2,
    ),
    u8x32(
        2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
        27, 28, 29, 30, 31, 0, 1,
    ),
    u8x32(
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        26, 27, 28, 29, 30, 31, 0,
    ),
];
189
// `AVX2_128_ROTATE_IDX[k]` rotates a 16-byte vector right by `k` bytes when
// used as shuffle indices (destination byte i reads source byte
// (i + 16 - k) % 16). NOTE(review): not referenced in this chunk —
// presumably used by 128-bit half-vector rotations elsewhere in the module.
static AVX2_128_ROTATE_IDX: [u8x16; 16] = [
    u8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
    u8x16(15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14),
    u8x16(14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13),
    u8x16(13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
    u8x16(12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
    u8x16(11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
    u8x16(10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
    u8x16(9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8),
    u8x16(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7),
    u8x16(7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6),
    u8x16(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5),
    u8x16(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4),
    u8x16(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3),
    u8x16(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2),
    u8x16(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1),
    u8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0),
];
208
// `V3_U32_MASKS[k]`: prefix mask with the first `k` of 8 u32 lanes all-ones.
static V3_U32_MASKS: [u32x8; 9] = [
    u32x8(0, 0, 0, 0, 0, 0, 0, 0),
    u32x8(!0, 0, 0, 0, 0, 0, 0, 0),
    u32x8(!0, !0, 0, 0, 0, 0, 0, 0),
    u32x8(!0, !0, !0, 0, 0, 0, 0, 0),
    u32x8(!0, !0, !0, !0, 0, 0, 0, 0),
    u32x8(!0, !0, !0, !0, !0, 0, 0, 0),
    u32x8(!0, !0, !0, !0, !0, !0, 0, 0),
    u32x8(!0, !0, !0, !0, !0, !0, !0, 0),
    u32x8(!0, !0, !0, !0, !0, !0, !0, !0),
];
// `V3_U32_LAST_MASKS[k]`: suffix mask with the last `k` of 8 u32 lanes
// all-ones. `mask_between_m32s` combines `V3_U32_LAST_MASKS[8 - start]`
// with `V3_U32_MASKS[end]` to select lanes `start..end`.
static V3_U32_LAST_MASKS: [u32x8; 9] = [
    u32x8(0, 0, 0, 0, 0, 0, 0, 0),
    u32x8(0, 0, 0, 0, 0, 0, 0, !0),
    u32x8(0, 0, 0, 0, 0, 0, !0, !0),
    u32x8(0, 0, 0, 0, 0, !0, !0, !0),
    u32x8(0, 0, 0, 0, !0, !0, !0, !0),
    u32x8(0, 0, 0, !0, !0, !0, !0, !0),
    u32x8(0, 0, !0, !0, !0, !0, !0, !0),
    u32x8(0, !0, !0, !0, !0, !0, !0, !0),
    u32x8(!0, !0, !0, !0, !0, !0, !0, !0),
];
231
// Seal the `Simd` implementations so downstream crates cannot implement the
// trait for their own types.
impl Seal for V3 {}
impl Seal for V3_Scalar {}
234
/// Transparent newtype over [`V3`]: carries the same feature proof but
/// selects a different `Simd` implementation (presumably the scalar-lane
/// variant — NOTE(review): its impl is not visible in this chunk).
#[derive(Copy, Clone, Debug)]
#[repr(transparent)]
pub struct V3_Scalar(pub V3);
238
239#[inline(always)]
240pub(super) fn avx_load_u32s(simd: Avx2, slice: &[u32]) -> u32x8 {
241 _ = simd;
242 unsafe { avx_ld_u32s(slice.as_ptr(), LD_ST[2 * (16 * slice.len().min(8))]) }
243}
244
245#[inline(always)]
246pub(super) fn avx_store_u32s(simd: Avx2, slice: &mut [u32], value: u32x8) {
247 _ = simd;
248 unsafe {
249 avx_st_u32s(
250 slice.as_mut_ptr(),
251 value,
252 LD_ST[2 * (16 * slice.len().min(8)) + 1],
253 );
254 }
255}
256
// `V3` is a superset of `V2` (SSE..SSE4.2 + POPCNT), so deref lets `V3`
// reuse all `V2` methods (e.g. the 128-bit `reduce_*` helpers below).
impl core::ops::Deref for V3 {
    type Target = V2;

    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        // Build a `V2` token from this token's sub-features and return a
        // reference to it. Handing out a reference to this temporary is
        // sound only because the feature tokens carry no data (presumably
        // ZSTs); `to_ref` encapsulates that — NOTE(review): confirm in its
        // definition.
        V2 {
            sse: self.sse,
            sse2: self.sse2,
            fxsr: self.fxsr,
            sse3: self.sse3,
            ssse3: self.ssse3,
            sse4_1: self.sse4_1,
            sse4_2: self.sse4_2,
            popcnt: self.popcnt,
        }
        .to_ref()
    }
}
275
276impl Simd for V3 {
277 type c32s = f32x8;
278 type c64s = f64x4;
279 type f32s = f32x8;
280 type f64s = f64x4;
281 type i32s = i32x8;
282 type i64s = i64x4;
283 type m32s = m32x8;
284 type m64s = m64x4;
285 type u32s = u32x8;
286 type u64s = u64x4;
287
288 const REGISTER_COUNT: usize = 16;
289
    /// Squared modulus of each complex number, broadcast to both the re and
    /// im slots: (re² + im², re² + im²).
    #[inline(always)]
    fn abs2_c32s(self, a: Self::c32s) -> Self::c32s {
        let sqr = self.mul_f32s(a, a);
        // Swap each (re, im) pair and add, so both slots hold re² + im².
        let sqr_rev = self
            .avx
            ._mm256_shuffle_ps::<0b10_11_00_01>(cast!(sqr), cast!(sqr));
        self.add_f32s(sqr, cast!(sqr_rev))
    }

    /// Squared modulus of each `c64`, broadcast to both the re and im slots.
    #[inline(always)]
    fn abs2_c64s(self, a: Self::c64s) -> Self::c64s {
        let sqr = self.mul_f64s(a, a);
        // 0b0101 swaps the two f64s within each 128-bit pair.
        let sqr_rev = self.avx._mm256_shuffle_pd::<0b0101>(cast!(sqr), cast!(sqr));
        self.add_f64s(sqr, cast!(sqr_rev))
    }
305
306 #[inline(always)]
307 fn abs_max_c32s(self, a: Self::c32s) -> Self::c32s {
308 let max = self.abs_f32s(a);
309 let max_rev = self
310 .avx
311 ._mm256_shuffle_ps::<0b10_11_00_01>(cast!(a), cast!(a));
312 self.max_f32s(max, cast!(max_rev))
313 }
314
    /// Per complex number, max(|re|, |im|) broadcast to both slots.
    #[inline(always)]
    fn abs_max_c64s(self, a: Self::c64s) -> Self::c64s {
        let max = self.abs_f64s(a);
        // Swap each (|re|, |im|) pair, then take the elementwise max.
        let max_rev = self.avx._mm256_shuffle_pd::<0b0101>(cast!(max), cast!(max));
        self.max_f64s(max, cast!(max_rev))
    }
321
    /// Complex addition is plain elementwise f32 addition.
    #[inline(always)]
    fn add_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
        self.add_f32s(a, b)
    }

    /// Complex addition is plain elementwise f64 addition.
    #[inline(always)]
    fn add_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
        self.add_f64s(a, b)
    }

    /// Lanewise f32 addition.
    #[inline(always)]
    fn add_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
        cast!(self.avx._mm256_add_ps(cast!(a), cast!(b)))
    }

    /// Lanewise f64 addition.
    #[inline(always)]
    fn add_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
        cast!(self.avx._mm256_add_pd(cast!(a), cast!(b)))
    }

    /// Lanewise u32 addition (wrapping, as per the underlying intrinsic).
    #[inline(always)]
    fn add_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
        cast!(self.avx2._mm256_add_epi32(cast!(a), cast!(b)))
    }

    /// Lanewise u64 addition (wrapping, as per the underlying intrinsic).
    #[inline(always)]
    fn add_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
        cast!(self.avx2._mm256_add_epi64(cast!(a), cast!(b)))
    }
351
    /// Bitwise AND. The `_pd` form is used for all widths since AND is
    /// type-agnostic on raw bits.
    #[inline(always)]
    fn and_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
        cast!(self.avx._mm256_and_pd(cast!(a), cast!(b)))
    }

    /// Bitwise AND of 64-bit masks.
    #[inline(always)]
    fn and_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
        cast!(self.avx._mm256_and_pd(cast!(a), cast!(b)))
    }

    /// Bitwise AND of u32 lanes.
    #[inline(always)]
    fn and_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
        cast!(self.avx._mm256_and_pd(cast!(a), cast!(b)))
    }

    /// Bitwise AND of u64 lanes.
    #[inline(always)]
    fn and_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
        cast!(self.avx._mm256_and_pd(cast!(a), cast!(b)))
    }
371
    /// Complex conjugate: flip the sign bit of every imaginary slot by
    /// XOR-ing with (0.0, -0.0).
    #[inline(always)]
    fn conj_c32s(self, a: Self::c32s) -> Self::c32s {
        self.xor_f32s(a, self.splat_c32s(c32 { re: 0.0, im: -0.0 }))
    }

    /// Complex conjugate for c64 lanes (sign-flip of the imaginary slots).
    #[inline(always)]
    fn conj_c64s(self, a: Self::c64s) -> Self::c64s {
        self.xor_f64s(a, self.splat_c64s(c64 { re: 0.0, im: -0.0 }))
    }
381
    /// conj(a) * b + c, using the standard alternating-FMA complex-product
    /// decomposition (re(a)/im(a) broadcast + swapped b).
    #[inline(always)]
    fn conj_mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
        let ab = cast!(a);
        let xy = cast!(b);

        // b with each (re, im) pair swapped.
        let yx = self.avx._mm256_permute_ps::<0b10_11_00_01>(xy);
        // Real parts of `a` duplicated into both slots.
        let aa = self.avx._mm256_moveldup_ps(ab);
        // Imaginary parts of `a` duplicated into both slots.
        let bb = self.avx._mm256_movehdup_ps(ab);

        cast!(
            self.fma
                ._mm256_fmsubadd_ps(aa, xy, self.fma._mm256_fmsubadd_ps(bb, yx, cast!(c)))
        )
    }

    /// conj(a) * b + c for c64 lanes (same decomposition, f64 flavors).
    #[inline(always)]
    fn conj_mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
        let ab = cast!(a);
        let xy = cast!(b);

        let yx = self.avx._mm256_permute_pd::<0b0101>(xy);
        let aa = self.avx._mm256_unpacklo_pd(ab, ab);
        let bb = self.avx._mm256_unpackhi_pd(ab, ab);

        cast!(
            self.fma
                ._mm256_fmsubadd_pd(aa, xy, self.fma._mm256_fmsubadd_pd(bb, yx, cast!(c)))
        )
    }

    /// conj(a) * b (no accumulator; plain multiply feeds the inner term).
    #[inline(always)]
    fn conj_mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
        let ab = cast!(a);
        let xy = cast!(b);

        let yx = self.avx._mm256_permute_ps::<0b10_11_00_01>(xy);
        let aa = self.avx._mm256_moveldup_ps(ab);
        let bb = self.avx._mm256_movehdup_ps(ab);

        cast!(
            self.fma
                ._mm256_fmsubadd_ps(aa, xy, self.avx._mm256_mul_ps(bb, yx))
        )
    }

    /// conj(a) * b for c64 lanes.
    #[inline(always)]
    fn conj_mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
        let ab = cast!(a);
        let xy = cast!(b);

        let yx = self.avx._mm256_permute_pd::<0b0101>(xy);
        let aa = self.avx._mm256_unpacklo_pd(ab, ab);
        let bb = self.avx._mm256_unpackhi_pd(ab, ab);

        cast!(
            self.fma
                ._mm256_fmsubadd_pd(aa, xy, self.avx._mm256_mul_pd(bb, yx))
        )
    }
441
    /// De-interleaves 2 or 4 f32 vectors (AoS -> SoA) using unpack shuffle
    /// networks; any other size falls back to the scalar implementation.
    #[inline(always)]
    fn deinterleave_shfl_f32s<T: Interleave>(self, values: T) -> T {
        let avx = self.avx;

        if try_const! { core::mem::size_of::<T>() == 2 * core::mem::size_of::<Self::f32s>() } {
            // 2-vector case: a pair of ps unpacks followed by a pair of pd
            // unpacks realizes the 2x8 transpose.
            let values: [__m256d; 2] = unsafe { core::mem::transmute_copy(&values) };
            let values = [
                cast!(avx._mm256_unpacklo_ps(cast!(values[0]), cast!(values[1]))),
                cast!(avx._mm256_unpackhi_ps(cast!(values[0]), cast!(values[1]))),
            ];

            let values = [
                avx._mm256_unpacklo_pd(values[0], values[1]),
                avx._mm256_unpackhi_pd(values[0], values[1]),
            ];

            unsafe { core::mem::transmute_copy(&values) }
        } else if try_const! { core::mem::size_of::<T>() == 4 * core::mem::size_of::<Self::f32s>() }
        {
            // 4-vector case: three rounds of pairwise unpacks.
            let values: [__m256d; 4] = unsafe { core::mem::transmute_copy(&values) };

            let values = [
                cast!(avx._mm256_unpacklo_ps(cast!(values[0]), cast!(values[1]))),
                cast!(avx._mm256_unpackhi_ps(cast!(values[0]), cast!(values[1]))),
                cast!(avx._mm256_unpacklo_ps(cast!(values[2]), cast!(values[3]))),
                cast!(avx._mm256_unpackhi_ps(cast!(values[2]), cast!(values[3]))),
            ];

            let values = [
                avx._mm256_unpacklo_pd(values[0], values[1]),
                avx._mm256_unpackhi_pd(values[0], values[1]),
                avx._mm256_unpacklo_pd(values[2], values[3]),
                avx._mm256_unpackhi_pd(values[2], values[3]),
            ];

            let values = [
                avx._mm256_unpacklo_pd(values[0], values[2]),
                avx._mm256_unpacklo_pd(values[1], values[3]),
                avx._mm256_unpackhi_pd(values[0], values[2]),
                avx._mm256_unpackhi_pd(values[1], values[3]),
            ];

            unsafe { core::mem::transmute_copy(&values) }
        } else {
            // SAFETY: `T: Interleave` guarantees the layout the fallback needs.
            unsafe { deinterleave_fallback::<f32, Self::f32s, T>(values) }
        }
    }
508
    /// De-interleaves 2 or 4 f64 vectors (AoS -> SoA); other sizes fall back
    /// to the scalar implementation.
    #[inline(always)]
    fn deinterleave_shfl_f64s<T: Interleave>(self, values: T) -> T {
        let avx = self.avx;

        if try_const! { core::mem::size_of::<T>() == 2 * core::mem::size_of::<Self::f64s>() } {
            // 2-vector case: a single round of pd unpacks.
            let values: [__m256d; 2] = unsafe { core::mem::transmute_copy(&values) };
            let values = [
                avx._mm256_unpacklo_pd(values[0], values[1]),
                avx._mm256_unpackhi_pd(values[0], values[1]),
            ];
            unsafe { core::mem::transmute_copy(&values) }
        } else if try_const! { core::mem::size_of::<T>() == 4 * core::mem::size_of::<Self::f64s>() }
        {
            let values: [__m256d; 4] = unsafe { core::mem::transmute_copy(&values) };

            // Round 1: pairwise unpacks within 128-bit lanes.
            let values: [__m256d; 4] = [
                avx._mm256_unpacklo_pd(values[0], values[1]),
                avx._mm256_unpackhi_pd(values[0], values[1]),
                avx._mm256_unpacklo_pd(values[2], values[3]),
                avx._mm256_unpackhi_pd(values[2], values[3]),
            ];

            // Round 2: gather low/high 128-bit lanes across vector pairs.
            let values = [
                avx._mm256_permute2f128_pd::<0b0010_0000>(values[0], values[2]),
                avx._mm256_permute2f128_pd::<0b0010_0000>(values[1], values[3]),
                avx._mm256_permute2f128_pd::<0b0011_0001>(values[0], values[2]),
                avx._mm256_permute2f128_pd::<0b0011_0001>(values[1], values[3]),
            ];

            unsafe { core::mem::transmute_copy(&values) }
        } else {
            // SAFETY: `T: Interleave` guarantees the layout the fallback needs.
            unsafe { deinterleave_fallback::<f64, Self::f64s, T>(values) }
        }
    }
556
    /// Lanewise f32 division.
    #[inline(always)]
    fn div_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
        cast!(self.avx._mm256_div_ps(cast!(a), cast!(b)))
    }

    /// Lanewise f64 division.
    #[inline(always)]
    fn div_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
        cast!(self.avx._mm256_div_pd(cast!(a), cast!(b)))
    }

    /// Lanewise f32 equality (ordered, quiet: NaN compares false).
    #[inline(always)]
    fn equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
        cast!(self.avx._mm256_cmp_ps::<_CMP_EQ_OQ>(cast!(a), cast!(b)))
    }

    /// Lanewise f64 equality (ordered, quiet: NaN compares false).
    #[inline(always)]
    fn equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
        cast!(self.avx._mm256_cmp_pd::<_CMP_EQ_OQ>(cast!(a), cast!(b)))
    }
576
    /// Lanewise unsigned `a >= b` (delegates to the unsigned-compare helper).
    #[inline(always)]
    fn greater_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
        self.cmp_ge_u32x8(a, b)
    }

    /// Lanewise unsigned `a >= b` on u64 lanes.
    #[inline(always)]
    fn greater_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
        self.cmp_ge_u64x4(a, b)
    }

    /// Lanewise unsigned `a > b` on u32 lanes.
    #[inline(always)]
    fn greater_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
        self.cmp_gt_u32x8(a, b)
    }

    /// Lanewise unsigned `a > b` on u64 lanes.
    #[inline(always)]
    fn greater_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
        self.cmp_gt_u64x4(a, b)
    }
596
    /// Interleaves 2 or 4 f32 vectors (SoA -> AoS). Delegates to the
    /// deinterleave routine — presumably sound because the shuffle network
    /// used there is its own inverse for these sizes (TODO confirm).
    #[inline(always)]
    fn interleave_shfl_f32s<T: Interleave>(self, values: T) -> T {
        if try_const! {
            (core::mem::size_of::<T>() == 2 * core::mem::size_of::<Self::f32s>())
                || (core::mem::size_of::<T>() == 4 * core::mem::size_of::<Self::f32s>())
        } {
            self.deinterleave_shfl_f32s(values)
        } else {
            // SAFETY: `T: Interleave` guarantees the layout the fallback needs.
            unsafe { interleave_fallback::<f32, Self::f32s, T>(values) }
        }
    }

    /// Interleaves 2 or 4 f64 vectors (SoA -> AoS); see `interleave_shfl_f32s`.
    #[inline(always)]
    fn interleave_shfl_f64s<T: Interleave>(self, values: T) -> T {
        if try_const! {
            (core::mem::size_of::<T>() == 2 * core::mem::size_of::<Self::f64s>())
                || (core::mem::size_of::<T>() == 4 * core::mem::size_of::<Self::f64s>())
        } {
            self.deinterleave_shfl_f64s(values)
        } else {
            // SAFETY: `T: Interleave` guarantees the layout the fallback needs.
            unsafe { interleave_fallback::<f64, Self::f64s, T>(values) }
        }
    }
622
    /// Lanewise f32 `a < b` (ordered, quiet: NaN compares false).
    #[inline(always)]
    fn less_than_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
        cast!(self.avx._mm256_cmp_ps::<_CMP_LT_OQ>(cast!(a), cast!(b)))
    }

    /// Lanewise f64 `a < b` (ordered, quiet).
    #[inline(always)]
    fn less_than_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
        cast!(self.avx._mm256_cmp_pd::<_CMP_LT_OQ>(cast!(a), cast!(b)))
    }

    /// Lanewise f32 `a <= b` (ordered, quiet).
    #[inline(always)]
    fn less_than_or_equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
        cast!(self.avx._mm256_cmp_ps::<_CMP_LE_OQ>(cast!(a), cast!(b)))
    }

    /// Lanewise f64 `a <= b` (ordered, quiet).
    #[inline(always)]
    fn less_than_or_equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
        cast!(self.avx._mm256_cmp_pd::<_CMP_LE_OQ>(cast!(a), cast!(b)))
    }

    /// Lanewise unsigned `a <= b` on u32 lanes.
    #[inline(always)]
    fn less_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
        self.cmp_le_u32x8(a, b)
    }

    /// Lanewise unsigned `a <= b` on u64 lanes.
    #[inline(always)]
    fn less_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
        self.cmp_le_u64x4(a, b)
    }

    /// Lanewise unsigned `a < b` on u32 lanes.
    #[inline(always)]
    fn less_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
        self.cmp_lt_u32x8(a, b)
    }

    /// Lanewise unsigned `a < b` on u64 lanes.
    #[inline(always)]
    fn less_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
        self.cmp_lt_u64x4(a, b)
    }
662
    /// Builds a mask selecting u32 lanes `start..end` (both clamped to 8)
    /// by intersecting a suffix mask with a prefix mask, along with the
    /// matching precomputed partial load/store routines from `LD_ST`.
    #[inline(always)]
    fn mask_between_m32s(self, start: u32, end: u32) -> MemMask<Self::m32s> {
        let start = start.min(8) as usize;
        let end = end.min(8) as usize;
        MemMask {
            mask: self.and_m32s(
                cast!(V3_U32_LAST_MASKS[8 - start]),
                cast!(V3_U32_MASKS[end]),
            ),
            // LD_ST is indexed by (end, start) pairs: even = load, odd = store.
            load: Some(LD_ST[2 * (16 * end + start) + 0]),
            store: Some(LD_ST[2 * (16 * end + start) + 1]),
        }
    }

    /// Same as `mask_between_m32s` but for u64 lanes: each u64 lane maps to
    /// two u32 lanes, hence the doubling of `start` and `end`.
    #[inline(always)]
    fn mask_between_m64s(self, start: u64, end: u64) -> MemMask<Self::m64s> {
        let start = (2 * start.min(4)) as usize;
        let end = (2 * end.min(4)) as usize;
        MemMask {
            mask: self.and_m64s(
                cast!(V3_U32_LAST_MASKS[8 - start]),
                cast!(V3_U32_MASKS[end]),
            ),
            load: Some(LD_ST[2 * (16 * end + start) + 0]),
            store: Some(LD_ST[2 * (16 * end + start) + 1]),
        }
    }
690
    /// Masked load of c32 lanes; bit-identical to the u32 masked load.
    ///
    /// # Safety
    /// Lanes selected by `mask` must be valid for reads from `ptr`.
    #[inline(always)]
    unsafe fn mask_load_ptr_c32s(self, mask: MemMask<Self::m32s>, ptr: *const c32) -> Self::c32s {
        cast!(self.mask_load_ptr_u32s(mask, ptr as _))
    }

    /// Masked load of c64 lanes; bit-identical to the u64 masked load.
    ///
    /// # Safety
    /// Lanes selected by `mask` must be valid for reads from `ptr`.
    #[inline(always)]
    unsafe fn mask_load_ptr_c64s(self, mask: MemMask<Self::m64s>, ptr: *const c64) -> Self::c64s {
        cast!(self.mask_load_ptr_u64s(mask, ptr as _))
    }

    /// Masked load of u32 lanes: uses the precomputed partial-load routine
    /// when available, otherwise an AVX2 masked load.
    ///
    /// # Safety
    /// Lanes selected by `mask` must be valid for reads from `ptr`.
    #[inline(always)]
    unsafe fn mask_load_ptr_u32s(self, mask: MemMask<Self::m32s>, ptr: *const u32) -> Self::u32s {
        match mask.load {
            Some(load) => avx_ld_u32s(ptr, load),
            None => cast!(self.avx2._mm256_maskload_epi32(ptr as _, cast!(mask.mask))),
        }
    }

    /// Masked load of u64 lanes: reinterprets the mask (each u64 lane is two
    /// u32 lanes) and delegates to the u32 masked load.
    ///
    /// # Safety
    /// Lanes selected by `mask` must be valid for reads from `ptr`.
    #[inline(always)]
    unsafe fn mask_load_ptr_u64s(self, mask: MemMask<Self::m64s>, ptr: *const u64) -> Self::u64s {
        cast!(self.mask_load_ptr_u32s(
            MemMask {
                mask: cast!(mask.mask),
                load: mask.load,
                store: mask.store
            },
            ptr as _
        ))
    }
732
    /// Masked store of c32 lanes; bit-identical to the u32 masked store.
    ///
    /// # Safety
    /// Lanes selected by `mask` must be valid for writes to `ptr`.
    #[inline(always)]
    unsafe fn mask_store_ptr_c32s(
        self,
        mask: MemMask<Self::m32s>,
        ptr: *mut c32,
        values: Self::c32s,
    ) {
        self.mask_store_ptr_u32s(mask, ptr as _, cast!(values))
    }

    /// Masked store of c64 lanes; bit-identical to the u64 masked store.
    ///
    /// # Safety
    /// Lanes selected by `mask` must be valid for writes to `ptr`.
    #[inline(always)]
    unsafe fn mask_store_ptr_c64s(
        self,
        mask: MemMask<Self::m64s>,
        ptr: *mut c64,
        values: Self::c64s,
    ) {
        self.mask_store_ptr_u64s(mask, ptr as _, cast!(values))
    }
758
759 #[inline(always)]
763 unsafe fn mask_store_ptr_u32s(
764 self,
765 mask: MemMask<Self::m32s>,
766 ptr: *mut u32,
767 values: Self::u32s,
768 ) {
769 match mask.store {
770 Some(store) => avx_st_u32s(ptr, values, store),
771 None => _mm256_maskstore_epi32(ptr as *mut i32, cast!(mask.mask), cast!(values)),
772 }
773 }
774
    /// Masked store of u64 lanes: reinterprets the mask (each u64 lane is
    /// two u32 lanes) and delegates to the u32 masked store.
    ///
    /// # Safety
    /// Lanes selected by `mask` must be valid for writes to `ptr`.
    #[inline(always)]
    unsafe fn mask_store_ptr_u64s(
        self,
        mask: MemMask<Self::m64s>,
        ptr: *mut u64,
        values: Self::u64s,
    ) {
        self.mask_store_ptr_u32s(
            MemMask {
                mask: cast!(mask.mask),
                load: mask.load,
                store: mask.store,
            },
            ptr as _,
            cast!(values),
        )
    }
795
    /// Lanewise f32 maximum (intrinsic semantics: returns the second operand
    /// on NaN).
    #[inline(always)]
    fn max_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
        cast!(self.avx._mm256_max_ps(cast!(a), cast!(b)))
    }

    /// Lanewise f64 maximum.
    #[inline(always)]
    fn max_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
        cast!(self.avx._mm256_max_pd(cast!(a), cast!(b)))
    }

    /// Lanewise f32 minimum.
    #[inline(always)]
    fn min_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
        cast!(self.avx._mm256_min_ps(cast!(a), cast!(b)))
    }

    /// Lanewise f64 minimum.
    #[inline(always)]
    fn min_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
        cast!(self.avx._mm256_min_pd(cast!(a), cast!(b)))
    }
815
    /// a * b + c for complex lanes, via the alternating-FMA decomposition
    /// (cf. `conj_mul_add_c32s`, which uses the fmsubadd variants).
    #[inline(always)]
    fn mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
        let ab = cast!(a);
        let xy = cast!(b);

        // b with each (re, im) pair swapped.
        let yx = self.avx._mm256_permute_ps::<0b10_11_00_01>(xy);
        // Real / imaginary parts of `a`, each duplicated into both slots.
        let aa = self.avx._mm256_moveldup_ps(ab);
        let bb = self.avx._mm256_movehdup_ps(ab);

        cast!(
            self.fma
                ._mm256_fmaddsub_ps(aa, xy, self.fma._mm256_fmaddsub_ps(bb, yx, cast!(c)))
        )
    }

    /// a * b + c for c64 lanes (same decomposition, f64 flavors).
    #[inline(always)]
    fn mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
        let ab = cast!(a);
        let xy = cast!(b);

        let yx = self.avx._mm256_permute_pd::<0b0101>(xy);
        let aa = self.avx._mm256_unpacklo_pd(ab, ab);
        let bb = self.avx._mm256_unpackhi_pd(ab, ab);

        cast!(
            self.fma
                ._mm256_fmaddsub_pd(aa, xy, self.fma._mm256_fmaddsub_pd(bb, yx, cast!(c)))
        )
    }

    /// "Either" fused multiply-add: on this backend always the fused form.
    #[inline(always)]
    fn mul_add_e_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
        self.mul_add_f32s(a, b, c)
    }

    /// "Either" fused multiply-add: on this backend always the fused form.
    #[inline(always)]
    fn mul_add_e_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
        self.mul_add_f64s(a, b, c)
    }

    /// Fused a * b + c on f32 lanes (single rounding).
    #[inline(always)]
    fn mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
        cast!(self.fma._mm256_fmadd_ps(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused a * b + c on f64 lanes (single rounding).
    #[inline(always)]
    fn mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
        cast!(self.fma._mm256_fmadd_pd(cast!(a), cast!(b), cast!(c)))
    }
865
    /// Complex product a * b via the alternating-FMA decomposition.
    #[inline(always)]
    fn mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
        let ab = cast!(a);
        let xy = cast!(b);

        // b with each (re, im) pair swapped.
        let yx = self.avx._mm256_permute_ps::<0b10_11_00_01>(xy);
        // Real / imaginary parts of `a`, each duplicated into both slots.
        let aa = self.avx._mm256_moveldup_ps(ab);
        let bb = self.avx._mm256_movehdup_ps(ab);

        cast!(
            self.fma
                ._mm256_fmaddsub_ps(aa, xy, self.avx._mm256_mul_ps(bb, yx))
        )
    }

    /// Complex product a * b for c64 lanes.
    #[inline(always)]
    fn mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
        let ab = cast!(a);
        let xy = cast!(b);

        let yx = self.avx._mm256_permute_pd::<0b0101>(xy);
        let aa = self.avx._mm256_unpacklo_pd(ab, ab);
        let bb = self.avx._mm256_unpackhi_pd(ab, ab);

        cast!(
            self.fma
                ._mm256_fmaddsub_pd(aa, xy, self.avx._mm256_mul_pd(bb, yx))
        )
    }

    /// Lanewise f32 multiplication.
    #[inline(always)]
    fn mul_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
        cast!(self.avx._mm256_mul_ps(cast!(a), cast!(b)))
    }

    /// Lanewise f64 multiplication.
    #[inline(always)]
    fn mul_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
        cast!(self.avx._mm256_mul_pd(cast!(a), cast!(b)))
    }
905
    /// Complex negation: flip the sign bit of every f32 slot via XOR.
    #[inline(always)]
    fn neg_c32s(self, a: Self::c32s) -> Self::c32s {
        self.xor_f32s(a, self.splat_f32s(-0.0))
    }

    /// Complex negation: flip the sign bit of every f64 slot via XOR.
    #[inline(always)]
    fn neg_c64s(self, a: Self::c64s) -> Self::c64s {
        self.xor_f64s(a, self.splat_f64s(-0.0))
    }
915
    /// Bitwise NOT, implemented as XOR against an all-ones vector.
    #[inline(always)]
    fn not_m32s(self, a: Self::m32s) -> Self::m32s {
        cast!(
            self.avx
                ._mm256_xor_pd(cast!(self.avx._mm256_set1_epi32(-1)), cast!(a),)
        )
    }

    /// Bitwise NOT of 64-bit masks.
    #[inline(always)]
    fn not_m64s(self, a: Self::m64s) -> Self::m64s {
        cast!(
            self.avx
                ._mm256_xor_pd(cast!(self.avx._mm256_set1_epi32(-1)), cast!(a),)
        )
    }

    /// Bitwise NOT of u32 lanes.
    #[inline(always)]
    fn not_u32s(self, a: Self::u32s) -> Self::u32s {
        cast!(
            self.avx
                ._mm256_xor_pd(cast!(self.avx._mm256_set1_epi32(-1)), cast!(a),)
        )
    }

    /// Bitwise NOT of u64 lanes.
    #[inline(always)]
    fn not_u64s(self, a: Self::u64s) -> Self::u64s {
        cast!(
            self.avx
                ._mm256_xor_pd(cast!(self.avx._mm256_set1_epi32(-1)), cast!(a),)
        )
    }
947
    /// Bitwise OR (type-agnostic on raw bits, hence the `_pd` form).
    #[inline(always)]
    fn or_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
        cast!(self.avx._mm256_or_pd(cast!(a), cast!(b)))
    }

    /// Bitwise OR of 64-bit masks.
    #[inline(always)]
    fn or_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
        cast!(self.avx._mm256_or_pd(cast!(a), cast!(b)))
    }

    /// Bitwise OR of u32 lanes.
    #[inline(always)]
    fn or_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
        cast!(self.avx._mm256_or_pd(cast!(a), cast!(b)))
    }

    /// Bitwise OR of u64 lanes.
    #[inline(always)]
    fn or_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
        cast!(self.avx._mm256_or_pd(cast!(a), cast!(b)))
    }
967
    /// Loads up to 8 u32s from a possibly-short slice.
    #[inline(always)]
    fn partial_load_u32s(self, slice: &[u32]) -> Self::u32s {
        avx_load_u32s(self.avx2, slice)
    }

    /// Loads up to 4 u64s by reinterpreting the slice as u32s.
    #[inline(always)]
    fn partial_load_u64s(self, slice: &[u64]) -> Self::u64s {
        cast!(self.partial_load_u32s(bytemuck::cast_slice(slice)))
    }

    /// Stores up to 8 u32s into a possibly-short slice.
    #[inline(always)]
    fn partial_store_u32s(self, slice: &mut [u32], values: Self::u32s) {
        avx_store_u32s(self.avx2, slice, values)
    }

    /// Stores up to 4 u64s by reinterpreting the slice as u32s.
    #[inline(always)]
    fn partial_store_u64s(self, slice: &mut [u64], values: Self::u64s) {
        self.partial_store_u32s(bytemuck::cast_slice_mut(slice), cast!(values))
    }
987
    /// Horizontal max over complex lanes: fold the two 128-bit halves with
    /// `max`, then finish via the 128-bit (`V2`, through `Deref`) reduction.
    #[inline(always)]
    fn reduce_max_c32s(self, a: Self::c32s) -> c32 {
        let a: __m256 = cast!(a);
        let r = self.sse._mm_max_ps(
            self.avx._mm256_castps256_ps128(a),
            self.avx._mm256_extractf128_ps::<1>(a),
        );
        (*self).reduce_max_c32x2(cast!(r))
    }

    /// Horizontal max over c64 lanes (fold halves, then 128-bit reduction).
    #[inline(always)]
    fn reduce_max_c64s(self, a: Self::c64s) -> c64 {
        let a: __m256d = cast!(a);
        let r = self.sse2._mm_max_pd(
            self.avx._mm256_castpd256_pd128(a),
            self.avx._mm256_extractf128_pd::<1>(a),
        );
        (*self).reduce_max_c64x1(cast!(r))
    }

    /// Horizontal max over f32 lanes.
    #[inline(always)]
    fn reduce_max_f32s(self, a: Self::f32s) -> f32 {
        let a: __m256 = cast!(a);
        let r = self.sse._mm_max_ps(
            self.avx._mm256_castps256_ps128(a),
            self.avx._mm256_extractf128_ps::<1>(a),
        );
        (*self).reduce_max_f32x4(cast!(r))
    }

    /// Horizontal max over f64 lanes.
    #[inline(always)]
    fn reduce_max_f64s(self, a: Self::f64s) -> f64 {
        let a: __m256d = cast!(a);
        let r = self.sse2._mm_max_pd(
            self.avx._mm256_castpd256_pd128(a),
            self.avx._mm256_extractf128_pd::<1>(a),
        );
        (*self).reduce_max_f64x2(cast!(r))
    }

    /// Horizontal min over complex lanes.
    #[inline(always)]
    fn reduce_min_c32s(self, a: Self::c32s) -> c32 {
        let a: __m256 = cast!(a);
        let r = self.sse._mm_min_ps(
            self.avx._mm256_castps256_ps128(a),
            self.avx._mm256_extractf128_ps::<1>(a),
        );
        (*self).reduce_min_c32x2(cast!(r))
    }

    /// Horizontal min over c64 lanes.
    #[inline(always)]
    fn reduce_min_c64s(self, a: Self::c64s) -> c64 {
        let a: __m256d = cast!(a);
        let r = self.sse2._mm_min_pd(
            self.avx._mm256_castpd256_pd128(a),
            self.avx._mm256_extractf128_pd::<1>(a),
        );
        (*self).reduce_min_c64x1(cast!(r))
    }

    /// Horizontal min over f32 lanes.
    #[inline(always)]
    fn reduce_min_f32s(self, a: Self::f32s) -> f32 {
        let a: __m256 = cast!(a);
        let r = self.sse._mm_min_ps(
            self.avx._mm256_castps256_ps128(a),
            self.avx._mm256_extractf128_ps::<1>(a),
        );
        (*self).reduce_min_f32x4(cast!(r))
    }

    /// Horizontal min over f64 lanes.
    #[inline(always)]
    fn reduce_min_f64s(self, a: Self::f64s) -> f64 {
        let a: __m256d = cast!(a);
        let r = self.sse2._mm_min_pd(
            self.avx._mm256_castpd256_pd128(a),
            self.avx._mm256_extractf128_pd::<1>(a),
        );
        (*self).reduce_min_f64x2(cast!(r))
    }
1067
    /// Horizontal product over f32 lanes: fold the two 128-bit halves with a
    /// multiply, then finish via the 128-bit (`V2`, through `Deref`) path.
    #[inline(always)]
    fn reduce_product_f32s(self, a: Self::f32s) -> f32 {
        let a: __m256 = cast!(a);
        let r = self.sse._mm_mul_ps(
            self.avx._mm256_castps256_ps128(a),
            self.avx._mm256_extractf128_ps::<1>(a),
        );
        (*self).reduce_product_f32x4(cast!(r))
    }

    /// Horizontal product over f64 lanes.
    #[inline(always)]
    fn reduce_product_f64s(self, a: Self::f64s) -> f64 {
        let a: __m256d = cast!(a);
        let r = self.sse2._mm_mul_pd(
            self.avx._mm256_castpd256_pd128(a),
            self.avx._mm256_extractf128_pd::<1>(a),
        );
        (*self).reduce_product_f64x2(cast!(r))
    }

    /// Horizontal sum over complex lanes (fold halves, then 128-bit sum).
    #[inline(always)]
    fn reduce_sum_c32s(self, a: Self::c32s) -> c32 {
        let a: __m256 = cast!(a);
        let r = self.sse._mm_add_ps(
            self.avx._mm256_castps256_ps128(a),
            self.avx._mm256_extractf128_ps::<1>(a),
        );
        (*self).reduce_sum_c32x2(cast!(r))
    }

    /// Horizontal sum over c64 lanes.
    #[inline(always)]
    fn reduce_sum_c64s(self, a: Self::c64s) -> c64 {
        let a: __m256d = cast!(a);
        let r = self.sse2._mm_add_pd(
            self.avx._mm256_castpd256_pd128(a),
            self.avx._mm256_extractf128_pd::<1>(a),
        );
        (*self).reduce_sum_c64x1(cast!(r))
    }

    /// Horizontal sum over f32 lanes.
    #[inline(always)]
    fn reduce_sum_f32s(self, a: Self::f32s) -> f32 {
        let a: __m256 = cast!(a);
        let r = self.sse._mm_add_ps(
            self.avx._mm256_castps256_ps128(a),
            self.avx._mm256_extractf128_ps::<1>(a),
        );
        (*self).reduce_sum_f32x4(cast!(r))
    }

    /// Horizontal sum over f64 lanes.
    #[inline(always)]
    fn reduce_sum_f64s(self, a: Self::f64s) -> f64 {
        let a: __m256d = cast!(a);
        let r = self.sse2._mm_add_pd(
            self.avx._mm256_castpd256_pd128(a),
            self.avx._mm256_extractf128_pd::<1>(a),
        );
        (*self).reduce_sum_f64x2(cast!(r))
    }
1127
    #[inline(always)]
    fn rotate_right_c32s(self, a: Self::c32s, amount: usize) -> Self::c32s {
        // Lane rotation is implemented as a byte shuffle: `AVX2_ROTATE_IDX[k]`
        // holds the index pattern that rotates the register right by `k` bytes
        // (see the table at the top of the file). A c32 lane is 8 bytes and the
        // 256-bit register holds 4 of them, hence the index 8 * (amount % 4).
        cast!(avx2_pshufb(
            self,
            cast!(a),
            cast!(AVX2_ROTATE_IDX[8 * (amount % 4)]),
        ))
    }

    #[inline(always)]
    fn rotate_right_c64s(self, a: Self::c64s, amount: usize) -> Self::c64s {
        // A c64 lane is 16 bytes; 2 lanes per 256-bit register.
        cast!(avx2_pshufb(
            self,
            cast!(a),
            cast!(AVX2_ROTATE_IDX[16 * (amount % 2)]),
        ))
    }

    #[inline(always)]
    fn rotate_right_u32s(self, a: Self::u32s, amount: usize) -> Self::u32s {
        // A u32 lane is 4 bytes; 8 lanes per 256-bit register.
        cast!(avx2_pshufb(
            self,
            cast!(a),
            cast!(AVX2_ROTATE_IDX[4 * (amount % 8)]),
        ))
    }

    #[inline(always)]
    fn rotate_right_u64s(self, a: Self::u64s, amount: usize) -> Self::u64s {
        // A u64 lane is 8 bytes; 4 lanes per 256-bit register.
        cast!(avx2_pshufb(
            self,
            cast!(a),
            cast!(AVX2_ROTATE_IDX[8 * (amount % 4)]),
        ))
    }
1163
1164 #[inline(always)]
1165 fn select_u32s_m32s(
1166 self,
1167 mask: Self::m32s,
1168 if_true: Self::u32s,
1169 if_false: Self::u32s,
1170 ) -> Self::u32s {
1171 let mask: __m256 = cast!(mask);
1172 let if_true: __m256 = cast!(if_true);
1173 let if_false: __m256 = cast!(if_false);
1174
1175 cast!(self.avx._mm256_blendv_ps(if_false, if_true, mask))
1176 }
1177
1178 #[inline(always)]
1179 fn select_u64s_m64s(
1180 self,
1181 mask: Self::m64s,
1182 if_true: Self::u64s,
1183 if_false: Self::u64s,
1184 ) -> Self::u64s {
1185 let mask: __m256d = cast!(mask);
1186 let if_true: __m256d = cast!(if_true);
1187 let if_false: __m256d = cast!(if_false);
1188
1189 cast!(self.avx._mm256_blendv_pd(if_false, if_true, mask))
1190 }
1191
    #[inline(always)]
    fn splat_c32s(self, value: c32) -> Self::c32s {
        // A c32 (re, im) pair is 8 bytes — the same size as an f64 — so the
        // broadcast is done with a single f64 splat and a bit-cast.
        cast!(self.splat_f64s(cast!(value)))
    }

    #[inline(always)]
    fn splat_c64s(self, value: c64) -> Self::c64s {
        // Broadcast the 128-bit (re, im) pair to both halves of the register.
        cast!(self.avx._mm256_broadcast_pd(&cast!(value)))
    }

    #[inline(always)]
    fn splat_f32s(self, value: f32) -> Self::f32s {
        cast!(self.avx._mm256_set1_ps(value))
    }

    #[inline(always)]
    fn splat_f64s(self, value: f64) -> Self::f64s {
        cast!(self.avx._mm256_set1_pd(value))
    }

    #[inline(always)]
    fn splat_u32s(self, value: u32) -> Self::u32s {
        // `set1_epi32` takes a signed argument; `as i32` is a bit-reinterpretation.
        cast!(self.avx._mm256_set1_epi32(value as i32))
    }

    #[inline(always)]
    fn splat_u64s(self, value: u64) -> Self::u64s {
        cast!(self.avx._mm256_set1_epi64x(value as i64))
    }
1221
    #[inline(always)]
    fn sub_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
        // Complex subtraction is component-wise, so it is exactly a float sub.
        self.sub_f32s(a, b)
    }

    #[inline(always)]
    fn sub_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
        self.sub_f64s(a, b)
    }

    #[inline(always)]
    fn sub_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
        cast!(self.avx._mm256_sub_ps(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn sub_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
        cast!(self.avx._mm256_sub_pd(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn sub_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
        // Integer SIMD subtraction wraps on overflow.
        cast!(self.avx2._mm256_sub_epi32(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn sub_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
        cast!(self.avx2._mm256_sub_epi64(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn swap_re_im_c32s(self, a: Self::c32s) -> Self::c32s {
        // Permute control 0b10_11_00_01 swaps each adjacent pair of f32 lanes,
        // i.e. (re, im) -> (im, re) in every complex lane.
        cast!(self.avx._mm256_permute_ps::<0b10_11_00_01>(cast!(a)))
    }

    #[inline(always)]
    fn swap_re_im_c64s(self, a: Self::c64s) -> Self::c64s {
        // 0b0101 swaps the two f64 lanes inside each 128-bit half.
        cast!(self.avx._mm256_permute_pd::<0b0101>(cast!(a)))
    }
1261
    #[inline(always)]
    fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output {
        // Adapter turning the `WithSimd` op into a nullary callable that hands
        // it this `V3` token when invoked.
        struct Impl<Op> {
            this: V3,
            op: Op,
        }
        impl<Op: WithSimd> crate::NullaryFnOnce for Impl<Op> {
            type Output = Op::Output;

            #[inline(always)]
            fn call(self) -> Self::Output {
                self.op.with_simd(self.this)
            }
        }
        // NOTE(review): this call resolves to `V3`'s *inherent* `vectorize`
        // (inherent methods take precedence over trait methods in Rust method
        // resolution), presumably generated by `simd_type!` to enable the
        // target features around the call — it is not a recursive call to this
        // trait method. Confirm against the macro's expansion.
        self.vectorize(Impl { this: self, op })
    }
1278
    #[inline(always)]
    fn widening_mul_u32s(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s) {
        // Returns the (low, high) 32-bit halves of the full 64-bit products.
        self.widening_mul_u32x8(a, b)
    }

    #[inline(always)]
    fn wrapping_dyn_shl_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
        // Mask the per-lane shift count to 0..=31 so it always stays in range
        // (wrapping shift semantics).
        self.shl_dyn_u32x8(a, self.and_u32x8(amount, self.splat_u32x8(32 - 1)))
    }

    #[inline(always)]
    fn wrapping_dyn_shr_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
        self.shr_dyn_u32x8(a, self.and_u32x8(amount, self.splat_u32x8(32 - 1)))
    }

    // All four xor variants below lower to the same bitwise `xorpd`; the lane
    // width (32 vs 64 bits) is irrelevant for a pure bitwise operation.
    #[inline(always)]
    fn xor_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
        cast!(self.avx._mm256_xor_pd(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn xor_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
        cast!(self.avx._mm256_xor_pd(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn xor_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
        cast!(self.avx._mm256_xor_pd(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn xor_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
        cast!(self.avx._mm256_xor_pd(cast!(a), cast!(b)))
    }
1313
    // Signed integer comparisons: thin delegations to the lane-width-explicit
    // primitives (x8 for 32-bit lanes, x4 for 64-bit lanes).
    #[inline(always)]
    fn greater_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
        self.cmp_ge_i32x8(a, b)
    }

    #[inline(always)]
    fn greater_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
        self.cmp_ge_i64x4(a, b)
    }

    #[inline(always)]
    fn greater_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
        self.cmp_gt_i32x8(a, b)
    }

    #[inline(always)]
    fn greater_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
        self.cmp_gt_i64x4(a, b)
    }

    #[inline(always)]
    fn less_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
        self.cmp_le_i32x8(a, b)
    }

    #[inline(always)]
    fn less_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
        self.cmp_le_i64x4(a, b)
    }

    #[inline(always)]
    fn less_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
        self.cmp_lt_i32x8(a, b)
    }

    #[inline(always)]
    fn less_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
        self.cmp_lt_i64x4(a, b)
    }
1353}
1354
/// [`V3`] wrapper whose `Simd` implementation uses 128-bit wide vector types
/// (`f32x4`, `f64x2`, ...).
#[derive(Copy, Clone, Debug)]
#[repr(transparent)]
pub struct V3_128b(pub V3);

/// [`V3`] wrapper whose `Simd` implementation uses the full 256-bit wide
/// vector types (`f32x8`, `f64x4`, ...).
#[derive(Copy, Clone, Debug)]
#[repr(transparent)]
pub struct V3_256b(pub V3);

/// [`V3`] wrapper for 512-bit logical vectors.
// NOTE(review): the corresponding `Simd` impl is not in this chunk —
// presumably 512-bit vectors are emulated with pairs of 256-bit registers;
// confirm against the rest of the file.
#[derive(Copy, Clone, Debug)]
#[repr(transparent)]
pub struct V3_512b(pub V3);
1366
// Deref to the inner `V3` so the width-restricted wrappers can call all of
// `V3`'s intrinsic wrappers (`self.sse`, `self.avx`, ...) directly.
impl core::ops::Deref for V3_128b {
    type Target = V3;

    #[inline]
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl core::ops::Deref for V3_256b {
    type Target = V3;

    #[inline]
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl core::ops::Deref for V3_512b {
    type Target = V3;

    #[inline]
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
1393
// Marker impls for the crate's sealing trait, allowing these types to
// implement the (sealed) `Simd` trait.
impl Seal for V3_128b {}
impl Seal for V3_256b {}
impl Seal for V3_512b {}
1397
1398impl Simd for V3_128b {
1399 type c32s = f32x4;
1400 type c64s = f64x2;
1401 type f32s = f32x4;
1402 type f64s = f64x2;
1403 type i32s = i32x4;
1404 type i64s = i64x2;
1405 type m32s = m32x4;
1406 type m64s = m64x2;
1407 type u32s = u32x4;
1408 type u64s = u64x2;
1409
1410 const REGISTER_COUNT: usize = 16;
1411
    #[inline(always)]
    fn abs2_c32s(self, a: Self::c32s) -> Self::c32s {
        // Square every component, then add the (re, im)-swapped copy: each
        // complex lane ends up holding re² + im² in both of its slots.
        let sqr = self.mul_f32s(a, a);
        let sqr_rev = self
            .sse
            ._mm_shuffle_ps::<0b10_11_00_01>(cast!(sqr), cast!(sqr));
        self.add_f32s(sqr, cast!(sqr_rev))
    }

    #[inline(always)]
    fn abs2_c64s(self, a: Self::c64s) -> Self::c64s {
        // Same scheme as `abs2_c32s` for the single f64 complex lane.
        let sqr = self.mul_f64s(a, a);
        let sqr_rev = self.sse2._mm_shuffle_pd::<0b01>(cast!(sqr), cast!(sqr));
        self.add_f64s(sqr, cast!(sqr_rev))
    }

    #[inline(always)]
    fn abs_max_c32s(self, a: Self::c32s) -> Self::c32s {
        // max(|re|, |im|) broadcast to both slots of each complex lane.
        let sqr = self.abs_f32s(a);
        let sqr_rev = self
            .sse
            ._mm_shuffle_ps::<0b10_11_00_01>(cast!(sqr), cast!(sqr));
        self.max_f32s(sqr, cast!(sqr_rev))
    }

    #[inline(always)]
    fn abs_max_c64s(self, a: Self::c64s) -> Self::c64s {
        let sqr = self.abs_f64s(a);
        let sqr_rev = self.sse2._mm_shuffle_pd::<0b01>(cast!(sqr), cast!(sqr));
        self.max_f64s(sqr, cast!(sqr_rev))
    }
1443
    #[inline(always)]
    fn add_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
        // Complex addition is component-wise, so it is exactly a float add.
        self.add_f32s(a, b)
    }

    #[inline(always)]
    fn add_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
        self.add_f64s(a, b)
    }

    #[inline(always)]
    fn add_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
        self.add_f32x4(a, b)
    }

    #[inline(always)]
    fn add_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
        self.add_f64x2(a, b)
    }

    #[inline(always)]
    fn add_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
        // Integer addition wraps on overflow.
        self.wrapping_add_u32x4(a, b)
    }

    #[inline(always)]
    fn add_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
        self.wrapping_add_u64x2(a, b)
    }

    // Bitwise AND: `andpd` works for every lane width since the operation is
    // purely bitwise.
    #[inline(always)]
    fn and_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
        cast!(self.sse2._mm_and_pd(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn and_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
        cast!(self.sse2._mm_and_pd(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn and_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
        cast!(self.sse2._mm_and_pd(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn and_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
        cast!(self.sse2._mm_and_pd(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn conj_c32s(self, a: Self::c32s) -> Self::c32s {
        // Conjugation flips only the imaginary sign bit: xor with (0.0, -0.0).
        self.xor_f32s(a, self.splat_c32s(c32 { re: 0.0, im: -0.0 }))
    }

    #[inline(always)]
    fn conj_c64s(self, a: Self::c64s) -> Self::c64s {
        self.xor_f64s(a, self.splat_c64s(c64 { re: 0.0, im: -0.0 }))
    }
1503
    #[inline(always)]
    fn conj_mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
        // Standard SSE complex-multiply decomposition: duplicate a's real parts
        // (`aa`) and imaginary parts (`bb`), multiply against b and its
        // re/im-swapped copy (`yx`), and combine with alternating add/sub FMAs.
        // Using the `fmsubadd` variant (instead of `fmaddsub` in `mul_add_c32s`)
        // flips the signs so the result is conj(a) * b + c.
        let ab = cast!(a);
        let xy = cast!(b);

        let yx = self.avx._mm_permute_ps::<0b10_11_00_01>(xy);
        let aa = self.sse3._mm_moveldup_ps(ab);
        let bb = self.sse3._mm_movehdup_ps(ab);

        cast!(
            self.fma
                ._mm_fmsubadd_ps(aa, xy, self.fma._mm_fmsubadd_ps(bb, yx, cast!(c)))
        )
    }

    #[inline(always)]
    fn conj_mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
        // f64 version of `conj_mul_add_c32s`; unpacklo/unpackhi duplicate a's
        // real and imaginary parts.
        let ab = cast!(a);
        let xy = cast!(b);

        let yx = self.avx._mm_permute_pd::<0b01>(xy);
        let aa = self.sse2._mm_unpacklo_pd(ab, ab);
        let bb = self.sse2._mm_unpackhi_pd(ab, ab);

        cast!(
            self.fma
                ._mm_fmsubadd_pd(aa, xy, self.fma._mm_fmsubadd_pd(bb, yx, cast!(c)))
        )
    }

    #[inline(always)]
    fn conj_mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
        // Same as `conj_mul_add_c32s` without the accumulator: the innermost
        // term is a plain multiply instead of an FMA against c.
        let ab = cast!(a);
        let xy = cast!(b);

        let yx = self.avx._mm_permute_ps::<0b10_11_00_01>(xy);
        let aa = self.sse3._mm_moveldup_ps(ab);
        let bb = self.sse3._mm_movehdup_ps(ab);

        cast!(
            self.fma
                ._mm_fmsubadd_ps(aa, xy, self.sse._mm_mul_ps(bb, yx))
        )
    }

    #[inline(always)]
    fn conj_mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
        let ab = cast!(a);
        let xy = cast!(b);

        let yx = self.avx._mm_permute_pd::<0b01>(xy);
        let aa = self.sse2._mm_unpacklo_pd(ab, ab);
        let bb = self.sse2._mm_unpackhi_pd(ab, ab);

        cast!(
            self.fma
                ._mm_fmsubadd_pd(aa, xy, self.sse2._mm_mul_pd(bb, yx))
        )
    }
1563
1564 #[inline(always)]
1565 fn div_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
1566 self.div_f32x4(a, b)
1567 }
1568
1569 #[inline(always)]
1570 fn div_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
1571 self.div_f64x2(a, b)
1572 }
1573
1574 #[inline(always)]
1575 fn equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
1576 self.cmp_eq_f32x4(a, b)
1577 }
1578
1579 #[inline(always)]
1580 fn equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
1581 self.cmp_eq_f64x2(a, b)
1582 }
1583
1584 #[inline(always)]
1585 fn greater_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
1586 self.cmp_ge_u32x4(a, b)
1587 }
1588
1589 #[inline(always)]
1590 fn greater_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
1591 self.cmp_ge_u64x2(a, b)
1592 }
1593
1594 #[inline(always)]
1595 fn greater_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
1596 self.cmp_gt_u32x4(a, b)
1597 }
1598
1599 #[inline(always)]
1600 fn greater_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
1601 self.cmp_gt_u64x2(a, b)
1602 }
1603
1604 #[inline(always)]
1605 fn less_than_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
1606 self.cmp_lt_f32x4(a, b)
1607 }
1608
1609 #[inline(always)]
1610 fn less_than_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
1611 self.cmp_lt_f64x2(a, b)
1612 }
1613
1614 #[inline(always)]
1615 fn less_than_or_equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
1616 self.cmp_le_f32x4(a, b)
1617 }
1618
1619 #[inline(always)]
1620 fn less_than_or_equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
1621 self.cmp_le_f64x2(a, b)
1622 }
1623
1624 #[inline(always)]
1625 fn less_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
1626 self.cmp_le_u32x4(a, b)
1627 }
1628
1629 #[inline(always)]
1630 fn less_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
1631 self.cmp_le_u64x2(a, b)
1632 }
1633
1634 #[inline(always)]
1635 fn less_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
1636 self.cmp_lt_u32x4(a, b)
1637 }
1638
1639 #[inline(always)]
1640 fn less_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
1641 self.cmp_lt_u64x2(a, b)
1642 }
1643
    #[inline(always)]
    unsafe fn mask_load_ptr_c32s(self, mask: MemMask<Self::m32s>, ptr: *const c32) -> Self::c32s {
        // A c32 is two u32s, so the masked u32 load covers it bit-for-bit.
        cast!(self.mask_load_ptr_u32s(mask, ptr as _))
    }

    #[inline(always)]
    unsafe fn mask_load_ptr_c64s(self, mask: MemMask<Self::m64s>, ptr: *const c64) -> Self::c64s {
        cast!(self.mask_load_ptr_u64s(mask, ptr as _))
    }

    #[inline(always)]
    unsafe fn mask_load_ptr_u32s(self, mask: MemMask<Self::m32s>, ptr: *const u32) -> Self::u32s {
        match mask.load {
            // A custom load routine was provided with the mask; use it.
            // NOTE(review): `avx_ld_u32s` appears to produce a wider value that
            // is truncated to 128 bits by `cast_lossy` — confirm against its
            // definition elsewhere in the file.
            Some(load) => cast_lossy(avx_ld_u32s(ptr, load)),
            // Otherwise fall back to the hardware masked load, which only
            // touches memory for lanes whose mask bit is set.
            None => cast!(self.avx2._mm_maskload_epi32(ptr as _, cast!(mask.mask))),
        }
    }

    #[inline(always)]
    unsafe fn mask_load_ptr_u64s(self, mask: MemMask<Self::m64s>, ptr: *const u64) -> Self::u64s {
        // Reinterpret the 64-bit mask as a 32-bit mask: each all-ones/all-zeros
        // 64-bit lane maps to two 32-bit lanes with the same bits, so the u32
        // masked load performs the identical memory accesses.
        cast!(self.mask_load_ptr_u32s(
            MemMask {
                mask: cast!(mask.mask),
                load: mask.load,
                store: mask.store
            },
            ptr as _
        ))
    }

    #[inline(always)]
    unsafe fn mask_store_ptr_c32s(
        self,
        mask: MemMask<Self::m32s>,
        ptr: *mut c32,
        values: Self::c32s,
    ) {
        self.mask_store_ptr_u32s(mask, ptr as _, cast!(values));
    }

    #[inline(always)]
    unsafe fn mask_store_ptr_c64s(
        self,
        mask: MemMask<Self::m64s>,
        ptr: *mut c64,
        values: Self::c64s,
    ) {
        self.mask_store_ptr_u64s(mask, ptr as _, cast!(values))
    }

    #[inline(always)]
    unsafe fn mask_store_ptr_u32s(
        self,
        mask: MemMask<Self::m32s>,
        ptr: *mut u32,
        values: Self::u32s,
    ) {
        match mask.store {
            // Custom store routine: widen the 128-bit value with a zeroed upper
            // half to match the helper's expected width.
            Some(store) => avx_st_u32s(ptr, cast!([values, self.splat_u32s(0)]), store),
            // Hardware masked store: writes only the lanes whose mask bit is set.
            None => self
                .avx2
                ._mm_maskstore_epi32(ptr as _, cast!(mask.mask), cast!(values)),
        }
    }

    #[inline(always)]
    unsafe fn mask_store_ptr_u64s(
        self,
        mask: MemMask<Self::m64s>,
        ptr: *mut u64,
        values: Self::u64s,
    ) {
        // Same mask-reinterpretation trick as `mask_load_ptr_u64s`.
        self.mask_store_ptr_u32s(
            MemMask {
                mask: cast!(mask.mask),
                load: mask.load,
                store: mask.store,
            },
            ptr as _,
            cast!(values),
        )
    }
1726
1727 #[inline(always)]
1728 fn max_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
1729 self.max_f32x4(a, b)
1730 }
1731
1732 #[inline(always)]
1733 fn max_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
1734 self.max_f64x2(a, b)
1735 }
1736
1737 #[inline(always)]
1738 fn min_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
1739 self.min_f32x4(a, b)
1740 }
1741
1742 #[inline(always)]
1743 fn min_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
1744 self.min_f64x2(a, b)
1745 }
1746
    #[inline(always)]
    fn mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
        // Standard SSE complex multiply-add: duplicate a's real parts (`aa`)
        // and imaginary parts (`bb`), multiply against b and its re/im-swapped
        // copy (`yx`), and combine with alternating sub/add FMAs to produce
        // a * b + c.
        let ab = cast!(a);
        let xy = cast!(b);

        let yx = self.avx._mm_permute_ps::<0b10_11_00_01>(xy);
        let aa = self.sse3._mm_moveldup_ps(ab);
        let bb = self.sse3._mm_movehdup_ps(ab);

        cast!(
            self.fma
                ._mm_fmaddsub_ps(aa, xy, self.fma._mm_fmaddsub_ps(bb, yx, cast!(c)))
        )
    }

    #[inline(always)]
    fn mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
        // f64 version: unpacklo/unpackhi duplicate a's real and imaginary parts.
        let ab = cast!(a);
        let xy = cast!(b);

        let yx = self.avx._mm_permute_pd::<0b01>(xy);
        let aa = self.sse2._mm_unpacklo_pd(ab, ab);
        let bb = self.sse2._mm_unpackhi_pd(ab, ab);

        cast!(
            self.fma
                ._mm_fmaddsub_pd(aa, xy, self.fma._mm_fmaddsub_pd(bb, yx, cast!(c)))
        )
    }

    #[inline(always)]
    fn mul_add_e_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
        // The "e" (effect-free rounding) variant is the same as the fused one here.
        self.mul_add_f32s(a, b, c)
    }

    #[inline(always)]
    fn mul_add_e_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
        self.mul_add_f64s(a, b, c)
    }

    #[inline(always)]
    fn mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
        self.mul_add_f32x4(a, b, c)
    }

    #[inline(always)]
    fn mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
        self.mul_add_f64x2(a, b, c)
    }

    #[inline(always)]
    fn mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
        // Same as `mul_add_c32s` without the accumulator: the innermost term is
        // a plain multiply.
        let ab = cast!(a);
        let xy = cast!(b);

        let yx = self.avx._mm_permute_ps::<0b10_11_00_01>(xy);
        let aa = self.sse3._mm_moveldup_ps(ab);
        let bb = self.sse3._mm_movehdup_ps(ab);

        cast!(
            self.fma
                ._mm_fmaddsub_ps(aa, xy, self.sse._mm_mul_ps(bb, yx))
        )
    }

    #[inline(always)]
    fn mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
        let ab = cast!(a);
        let xy = cast!(b);

        let yx = self.avx._mm_permute_pd::<0b01>(xy);
        let aa = self.sse2._mm_unpacklo_pd(ab, ab);
        let bb = self.sse2._mm_unpackhi_pd(ab, ab);

        cast!(
            self.fma
                ._mm_fmaddsub_pd(aa, xy, self.sse2._mm_mul_pd(bb, yx))
        )
    }

    #[inline(always)]
    fn mul_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
        self.mul_f32x4(a, b)
    }

    #[inline(always)]
    fn mul_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
        self.mul_f64x2(a, b)
    }
1836
    #[inline(always)]
    fn neg_c32s(self, a: Self::c32s) -> Self::c32s {
        // Flip the sign bit of every component (both re and im).
        self.xor_f32s(a, self.splat_f32s(-0.0))
    }

    #[inline(always)]
    fn neg_c64s(self, a: Self::c64s) -> Self::c64s {
        self.xor_f64s(a, self.splat_f64s(-0.0))
    }

    // Bitwise NOT: xor against an all-ones register. `xorpd` and
    // `set1_epi32(-1)` are lane-width agnostic since the operation is bitwise.
    #[inline(always)]
    fn not_m32s(self, a: Self::m32s) -> Self::m32s {
        cast!(
            self.sse2
                ._mm_xor_pd(cast!(self.sse2._mm_set1_epi32(-1)), cast!(a),)
        )
    }

    #[inline(always)]
    fn not_m64s(self, a: Self::m64s) -> Self::m64s {
        cast!(
            self.sse2
                ._mm_xor_pd(cast!(self.sse2._mm_set1_epi32(-1)), cast!(a),)
        )
    }

    #[inline(always)]
    fn not_u32s(self, a: Self::u32s) -> Self::u32s {
        cast!(
            self.sse2
                ._mm_xor_pd(cast!(self.sse2._mm_set1_epi32(-1)), cast!(a),)
        )
    }

    #[inline(always)]
    fn not_u64s(self, a: Self::u64s) -> Self::u64s {
        cast!(
            self.sse2
                ._mm_xor_pd(cast!(self.sse2._mm_set1_epi32(-1)), cast!(a),)
        )
    }

    // Bitwise OR, lane-width agnostic for the same reason.
    #[inline(always)]
    fn or_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
        cast!(self.sse2._mm_or_pd(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn or_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
        cast!(self.sse2._mm_or_pd(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn or_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
        cast!(self.sse2._mm_or_pd(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn or_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
        cast!(self.sse2._mm_or_pd(cast!(a), cast!(b)))
    }
1898
    #[inline(always)]
    fn partial_load_u32s(self, slice: &[u32]) -> Self::u32s {
        // Delegates to the shared AVX partial-load helper and truncates its
        // wider result to the 128-bit vector via `cast_lossy`.
        // NOTE(review): assumes the helper only reads up to the slice length —
        // confirm against `avx_load_u32s`'s definition elsewhere in the file.
        cast_lossy(avx_load_u32s(self.avx2, slice))
    }

    #[inline(always)]
    fn partial_load_u64s(self, slice: &[u64]) -> Self::u64s {
        // A u64 slice is bit-compatible with a u32 slice of twice the length.
        cast!(self.partial_load_u32s(bytemuck::cast_slice(slice)))
    }

    #[inline(always)]
    fn partial_store_u32s(self, slice: &mut [u32], values: Self::u32s) {
        // Widen with a zeroed upper half to match the 256-bit store helper.
        avx_store_u32s(self.avx2, slice, cast!([values, self.splat_u32s(0)]))
    }

    #[inline(always)]
    fn partial_store_u64s(self, slice: &mut [u64], values: Self::u64s) {
        self.partial_store_u32s(bytemuck::cast_slice_mut(slice), cast!(values))
    }
1918
    #[inline(always)]
    fn reduce_max_c32s(self, a: Self::c32s) -> c32 {
        // `movehl` brings the upper complex lane down; `maxps` then takes the
        // component-wise max of the two complex numbers. The `cvtsd_f64` cast
        // extracts the low 64 bits (one c32) without touching the values.
        let a: __m128 = cast!(a);
        let hi = self.sse._mm_movehl_ps(a, a);
        let r0 = self.sse._mm_max_ps(a, hi);
        cast!(self.sse2._mm_cvtsd_f64(cast!(r0)))
    }

    #[inline(always)]
    fn reduce_max_c64s(self, a: Self::c64s) -> c64 {
        // A single c64 fits the whole 128-bit register: reduction is identity.
        cast!(a)
    }

    #[inline(always)]
    fn reduce_max_f32s(self, a: Self::f32s) -> f32 {
        // Fold lanes {2,3} onto {0,1}, then bring lane 1 into position 0
        // (shuffle immediate 0b0001) for the final scalar max.
        let a: __m128 = cast!(a);
        let hi = self.sse._mm_movehl_ps(a, a);
        let r0 = self.sse._mm_max_ps(a, hi);
        let r0_shuffled = self.sse._mm_shuffle_ps::<0b0001>(r0, r0);
        let r = self.sse._mm_max_ss(r0, r0_shuffled);
        self.sse._mm_cvtss_f32(r)
    }

    #[inline(always)]
    fn reduce_max_f64s(self, a: Self::f64s) -> f64 {
        // Only two lanes: fold the high lane down and take the scalar max.
        let a: __m128d = cast!(a);
        let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
        let r = self.sse2._mm_max_sd(a, hi);
        self.sse2._mm_cvtsd_f64(r)
    }

    #[inline(always)]
    fn reduce_min_c32s(self, a: Self::c32s) -> c32 {
        // Component-wise min of the two complex lanes (see `reduce_max_c32s`).
        let a: __m128 = cast!(a);
        let hi = self.sse._mm_movehl_ps(a, a);
        let r0 = self.sse._mm_min_ps(a, hi);
        cast!(self.sse2._mm_cvtsd_f64(cast!(r0)))
    }

    #[inline(always)]
    fn reduce_min_c64s(self, a: Self::c64s) -> c64 {
        cast!(a)
    }

    #[inline(always)]
    fn reduce_min_f32s(self, a: Self::f32s) -> f32 {
        let a: __m128 = cast!(a);
        let hi = self.sse._mm_movehl_ps(a, a);
        let r0 = self.sse._mm_min_ps(a, hi);
        let r0_shuffled = self.sse._mm_shuffle_ps::<0b0001>(r0, r0);
        let r = self.sse._mm_min_ss(r0, r0_shuffled);
        self.sse._mm_cvtss_f32(r)
    }

    #[inline(always)]
    fn reduce_min_f64s(self, a: Self::f64s) -> f64 {
        let a: __m128d = cast!(a);
        let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
        let r = self.sse2._mm_min_sd(a, hi);
        self.sse2._mm_cvtsd_f64(r)
    }

    #[inline(always)]
    fn reduce_product_f32s(self, a: Self::f32s) -> f32 {
        let a: __m128 = cast!(a);
        let hi = self.sse._mm_movehl_ps(a, a);
        let r0 = self.sse._mm_mul_ps(a, hi);
        let r0_shuffled = self.sse._mm_shuffle_ps::<0b0001>(r0, r0);
        let r = self.sse._mm_mul_ss(r0, r0_shuffled);
        self.sse._mm_cvtss_f32(r)
    }

    #[inline(always)]
    fn reduce_product_f64s(self, a: Self::f64s) -> f64 {
        let a: __m128d = cast!(a);
        let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
        let r = self.sse2._mm_mul_sd(a, hi);
        self.sse2._mm_cvtsd_f64(r)
    }

    #[inline(always)]
    fn reduce_sum_c32s(self, a: Self::c32s) -> c32 {
        // Component-wise sum of the two complex lanes, low c32 extracted.
        let a: __m128 = cast!(a);
        let hi = self.sse._mm_movehl_ps(a, a);

        let r0 = self.sse._mm_add_ps(a, hi);

        cast!(self.sse2._mm_cvtsd_f64(cast!(r0)))
    }

    #[inline(always)]
    fn reduce_sum_c64s(self, a: Self::c64s) -> c64 {
        cast!(a)
    }

    #[inline(always)]
    fn reduce_sum_f32s(self, a: Self::f32s) -> f32 {
        let a: __m128 = cast!(a);
        let hi = self.sse._mm_movehl_ps(a, a);
        let r0 = self.sse._mm_add_ps(a, hi);
        let r0_shuffled = self.sse._mm_shuffle_ps::<0b0001>(r0, r0);
        let r = self.sse._mm_add_ss(r0, r0_shuffled);
        self.sse._mm_cvtss_f32(r)
    }

    #[inline(always)]
    fn reduce_sum_f64s(self, a: Self::f64s) -> f64 {
        let a: __m128d = cast!(a);
        let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
        let r = self.sse2._mm_add_sd(a, hi);
        self.sse2._mm_cvtsd_f64(r)
    }
2034
    #[inline(always)]
    fn rotate_right_c32s(self, a: Self::c32s, amount: usize) -> Self::c32s {
        // Byte-shuffle rotation via the 128-bit index table: a c32 lane is
        // 8 bytes and the register holds 2 of them, hence 8 * (amount % 2).
        cast!(
            self.ssse3
                ._mm_shuffle_epi8(cast!(a), cast!(AVX2_128_ROTATE_IDX[8 * (amount % 2)]))
        )
    }

    #[inline(always)]
    fn rotate_right_c64s(self, a: Self::c64s, amount: usize) -> Self::c64s {
        // A single c64 lane: any rotation is the identity.
        _ = amount;
        a
    }

    #[inline(always)]
    fn rotate_right_u32s(self, a: Self::u32s, amount: usize) -> Self::u32s {
        // A u32 lane is 4 bytes; 4 lanes per 128-bit register.
        cast!(
            self.ssse3
                ._mm_shuffle_epi8(cast!(a), cast!(AVX2_128_ROTATE_IDX[4 * (amount % 4)]))
        )
    }

    #[inline(always)]
    fn rotate_right_u64s(self, a: Self::u64s, amount: usize) -> Self::u64s {
        // A u64 lane is 8 bytes; 2 lanes per 128-bit register.
        cast!(
            self.ssse3
                ._mm_shuffle_epi8(cast!(a), cast!(AVX2_128_ROTATE_IDX[8 * (amount % 2)]))
        )
    }
2064
    #[inline(always)]
    fn select_u32s_m32s(
        self,
        mask: Self::m32s,
        if_true: Self::u32s,
        if_false: Self::u32s,
    ) -> Self::u32s {
        // Lane-wise select delegated to the 128-bit primitive.
        self.select_u32x4(mask, if_true, if_false)
    }

    #[inline(always)]
    fn select_u64s_m64s(
        self,
        mask: Self::m64s,
        if_true: Self::u64s,
        if_false: Self::u64s,
    ) -> Self::u64s {
        self.select_u64x2(mask, if_true, if_false)
    }

    #[inline(always)]
    fn splat_c32s(self, value: c32) -> Self::c32s {
        // A c32 (re, im) pair is 8 bytes — the same size as an f64 — so the
        // broadcast is done with a single f64 splat and a bit-cast.
        cast!(self.splat_f64x2(cast!(value)))
    }

    #[inline(always)]
    fn splat_c64s(self, value: c64) -> Self::c64s {
        // One c64 fills the whole 128-bit vector: splatting is a bit-cast.
        cast!(value)
    }

    #[inline(always)]
    fn splat_f32s(self, value: f32) -> Self::f32s {
        self.splat_f32x4(value)
    }

    #[inline(always)]
    fn splat_f64s(self, value: f64) -> Self::f64s {
        self.splat_f64x2(value)
    }

    #[inline(always)]
    fn splat_u32s(self, value: u32) -> Self::u32s {
        self.splat_u32x4(value)
    }

    #[inline(always)]
    fn splat_u64s(self, value: u64) -> Self::u64s {
        self.splat_u64x2(value)
    }
2114
    #[inline(always)]
    fn sub_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
        // Complex subtraction is component-wise, so it is exactly a float sub.
        self.sub_f32x4(a, b)
    }

    #[inline(always)]
    fn sub_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
        self.sub_f64x2(a, b)
    }

    #[inline(always)]
    fn sub_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
        self.sub_f32x4(a, b)
    }

    #[inline(always)]
    fn sub_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
        self.sub_f64x2(a, b)
    }

    #[inline(always)]
    fn sub_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
        // Integer subtraction wraps on overflow.
        self.wrapping_sub_u32x4(a, b)
    }

    #[inline(always)]
    fn sub_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
        self.wrapping_sub_u64x2(a, b)
    }

    #[inline(always)]
    fn swap_re_im_c32s(self, a: Self::c32s) -> Self::c32s {
        // Permute control 0b10_11_00_01 swaps each adjacent pair of f32 lanes,
        // i.e. (re, im) -> (im, re) in every complex lane.
        cast!(self.avx._mm_permute_ps::<0b10_11_00_01>(cast!(a)))
    }

    #[inline(always)]
    fn swap_re_im_c64s(self, a: Self::c64s) -> Self::c64s {
        // 0b01 swaps the two f64 lanes: (re, im) -> (im, re).
        cast!(self.avx._mm_permute_pd::<0b01>(cast!(a)))
    }
2154
    #[inline(always)]
    fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output {
        // Delegate to the inner `V3`'s `Simd::vectorize`. The UFCS form makes
        // the target explicit instead of going through `Deref`.
        Simd::vectorize(self.0, op)
    }

    #[inline(always)]
    fn widening_mul_u32s(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s) {
        // Returns the (low, high) 32-bit halves of the full 64-bit products.
        self.widening_mul_u32x4(a, b)
    }

    #[inline(always)]
    fn wrapping_dyn_shl_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
        // Mask the per-lane shift count to 0..=31 so it always stays in range
        // (wrapping shift semantics).
        self.shl_dyn_u32x4(a, self.and_u32x4(amount, self.splat_u32x4(32 - 1)))
    }

    #[inline(always)]
    fn wrapping_dyn_shr_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
        self.shr_dyn_u32x4(a, self.and_u32x4(amount, self.splat_u32x4(32 - 1)))
    }
2174
    // Bitwise XOR: `xorpd` works for every lane width since the operation is
    // purely bitwise.
    #[inline(always)]
    fn xor_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
        cast!(self.sse2._mm_xor_pd(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn xor_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
        cast!(self.sse2._mm_xor_pd(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn xor_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
        cast!(self.sse2._mm_xor_pd(cast!(a), cast!(b)))
    }

    #[inline(always)]
    fn xor_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
        cast!(self.sse2._mm_xor_pd(cast!(a), cast!(b)))
    }

    // Signed integer comparisons, delegated to the lane-width-explicit
    // primitives (x4 for 32-bit lanes, x2 for 64-bit lanes).
    #[inline(always)]
    fn greater_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
        self.cmp_ge_i32x4(a, b)
    }

    #[inline(always)]
    fn greater_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
        self.cmp_ge_i64x2(a, b)
    }

    #[inline(always)]
    fn greater_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
        self.cmp_gt_i32x4(a, b)
    }

    #[inline(always)]
    fn greater_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
        self.cmp_gt_i64x2(a, b)
    }

    #[inline(always)]
    fn less_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
        self.cmp_le_i32x4(a, b)
    }

    #[inline(always)]
    fn less_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
        self.cmp_le_i64x2(a, b)
    }

    #[inline(always)]
    fn less_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
        self.cmp_lt_i32x4(a, b)
    }

    #[inline(always)]
    fn less_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
        self.cmp_lt_i64x2(a, b)
    }
2234}
2235
2236impl Simd for V3_256b {
2237 type c32s = f32x8;
2238 type c64s = f64x4;
2239 type f32s = f32x8;
2240 type f64s = f64x4;
2241 type i32s = i32x8;
2242 type i64s = i64x4;
2243 type m32s = m32x8;
2244 type m64s = m64x4;
2245 type u32s = u32x8;
2246 type u64s = u64x4;
2247
2248 const REGISTER_COUNT: usize = 16;
2249
2250 inherit!({
2251 fn abs2_c32s(self, a: Self::c32s) -> Self::c32s;
2252 fn abs2_c64s(self, a: Self::c64s) -> Self::c64s;
2253 fn abs_max_c32s(self, a: Self::c32s) -> Self::c32s;
2254 fn abs_max_c64s(self, a: Self::c64s) -> Self::c64s;
2255 fn add_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
2256 fn add_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
2257 fn add_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2258 fn add_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2259 fn add_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2260 fn add_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2261 fn and_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
2262 fn and_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
2263 fn and_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2264 fn and_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2265 fn conj_c32s(self, a: Self::c32s) -> Self::c32s;
2266 fn conj_c64s(self, a: Self::c64s) -> Self::c64s;
2267 fn conj_mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s;
2268 fn conj_mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s;
2269 fn conj_mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
2270 fn conj_mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
2271 fn div_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2272 fn div_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2273 fn equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
2274 fn equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
2275 fn greater_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
2276 fn greater_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
2277 fn greater_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
2278 fn greater_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
2279 fn less_than_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
2280 fn less_than_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
2281 fn less_than_or_equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
2282 fn less_than_or_equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
2283 fn less_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
2284 fn less_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
2285 fn less_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
2286 fn less_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
2287 fn mask_between_m32s(self, start: u32, end: u32) -> MemMask<Self::m32s>;
2288 fn mask_between_m64s(self, start: u64, end: u64) -> MemMask<Self::m64s>;
2289 unsafe fn mask_load_ptr_c32s(
2293 self,
2294 mask: MemMask<Self::m32s>,
2295 ptr: *const c32,
2296 ) -> Self::c32s;
2297 unsafe fn mask_load_ptr_c64s(
2301 self,
2302 mask: MemMask<Self::m64s>,
2303 ptr: *const c64,
2304 ) -> Self::c64s;
2305 unsafe fn mask_load_ptr_u32s(
2309 self,
2310 mask: MemMask<Self::m32s>,
2311 ptr: *const u32,
2312 ) -> Self::u32s;
2313 unsafe fn mask_load_ptr_u64s(
2317 self,
2318 mask: MemMask<Self::m64s>,
2319 ptr: *const u64,
2320 ) -> Self::u64s;
2321 unsafe fn mask_store_ptr_c32s(
2325 self,
2326 mask: MemMask<Self::m32s>,
2327 ptr: *mut c32,
2328 values: Self::c32s,
2329 );
2330 unsafe fn mask_store_ptr_c64s(
2334 self,
2335 mask: MemMask<Self::m64s>,
2336 ptr: *mut c64,
2337 values: Self::c64s,
2338 );
2339 unsafe fn mask_store_ptr_u32s(
2343 self,
2344 mask: MemMask<Self::m32s>,
2345 ptr: *mut u32,
2346 values: Self::u32s,
2347 );
2348 unsafe fn mask_store_ptr_u64s(
2352 self,
2353 mask: MemMask<Self::m64s>,
2354 ptr: *mut u64,
2355 values: Self::u64s,
2356 );
2357 fn max_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2358 fn max_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2359 fn min_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2360 fn min_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2361 fn mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s;
2362 fn mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s;
2363 fn mul_add_e_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s;
2364 fn mul_add_e_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s;
2365 fn mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s;
2366 fn mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s;
2367 fn mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
2368 fn mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
2369 fn mul_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2370 fn mul_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2371 fn neg_c32s(self, a: Self::c32s) -> Self::c32s;
2372 fn neg_c64s(self, a: Self::c64s) -> Self::c64s;
2373 fn not_m32s(self, a: Self::m32s) -> Self::m32s;
2374 fn not_m64s(self, a: Self::m64s) -> Self::m64s;
2375 fn not_u32s(self, a: Self::u32s) -> Self::u32s;
2376 fn not_u64s(self, a: Self::u64s) -> Self::u64s;
2377 fn or_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
2378 fn or_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
2379 fn or_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2380 fn or_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2381 fn partial_load_u32s(self, slice: &[u32]) -> Self::u32s;
2382 fn partial_load_u64s(self, slice: &[u64]) -> Self::u64s;
2383 fn partial_store_u32s(self, slice: &mut [u32], values: Self::u32s);
2384 fn partial_store_u64s(self, slice: &mut [u64], values: Self::u64s);
2385 fn reduce_max_c32s(self, a: Self::c32s) -> c32;
2386 fn reduce_max_c64s(self, a: Self::c64s) -> c64;
2387 fn reduce_max_f32s(self, a: Self::f32s) -> f32;
2388 fn reduce_max_f64s(self, a: Self::f64s) -> f64;
2389 fn reduce_min_c32s(self, a: Self::c32s) -> c32;
2390 fn reduce_min_c64s(self, a: Self::c64s) -> c64;
2391 fn reduce_min_f32s(self, a: Self::f32s) -> f32;
2392 fn reduce_min_f64s(self, a: Self::f64s) -> f64;
2393 fn reduce_product_f32s(self, a: Self::f32s) -> f32;
2394 fn reduce_product_f64s(self, a: Self::f64s) -> f64;
2395 fn reduce_sum_c32s(self, a: Self::c32s) -> c32;
2396 fn reduce_sum_c64s(self, a: Self::c64s) -> c64;
2397 fn reduce_sum_f32s(self, a: Self::f32s) -> f32;
2398 fn reduce_sum_f64s(self, a: Self::f64s) -> f64;
2399 fn rotate_right_c32s(self, a: Self::c32s, amount: usize) -> Self::c32s;
2400 fn rotate_right_c64s(self, a: Self::c64s, amount: usize) -> Self::c64s;
2401 fn rotate_right_u32s(self, a: Self::u32s, amount: usize) -> Self::u32s;
2402 fn rotate_right_u64s(self, a: Self::u64s, amount: usize) -> Self::u64s;
2403 fn select_u32s_m32s(
2404 self,
2405 mask: Self::m32s,
2406 if_true: Self::u32s,
2407 if_false: Self::u32s,
2408 ) -> Self::u32s;
2409 fn select_u64s_m64s(
2410 self,
2411 mask: Self::m64s,
2412 if_true: Self::u64s,
2413 if_false: Self::u64s,
2414 ) -> Self::u64s;
2415 fn splat_c32s(self, value: c32) -> Self::c32s;
2416 fn splat_c64s(self, value: c64) -> Self::c64s;
2417 fn splat_f32s(self, value: f32) -> Self::f32s;
2418 fn splat_f64s(self, value: f64) -> Self::f64s;
2419 fn splat_u32s(self, value: u32) -> Self::u32s;
2420 fn splat_u64s(self, value: u64) -> Self::u64s;
2421 fn sub_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
2422 fn sub_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
2423 fn sub_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2424 fn sub_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2425 fn sub_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2426 fn sub_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2427 fn swap_re_im_c32s(self, a: Self::c32s) -> Self::c32s;
2428 fn swap_re_im_c64s(self, a: Self::c64s) -> Self::c64s;
2429 fn widening_mul_u32s(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s);
2430 fn wrapping_dyn_shl_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s;
2431 fn wrapping_dyn_shr_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s;
2432 fn xor_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
2433 fn xor_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
2434 fn xor_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2435 fn xor_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2436 fn greater_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
2437 fn greater_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
2438 fn greater_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
2439 fn greater_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
2440 fn less_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
2441 fn less_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
2442 fn less_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
2443 fn less_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
2444 });
2445
    #[inline(always)]
    /// Executes `op` under the feature set carried by the inner `V3` token.
    fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output {
        // NOTE(review): fully-qualified call — presumably to avoid ambiguity
        // with an inherent `vectorize` on the wrapped token; confirm.
        Simd::vectorize(self.0, op)
    }
2450}
2451
2452impl Simd for V3_512b {
2453 type c32s = f32x16;
2454 type c64s = f64x8;
2455 type f32s = f32x16;
2456 type f64s = f64x8;
2457 type i32s = i32x16;
2458 type i64s = i64x8;
2459 type m32s = m32x16;
2460 type m64s = m64x8;
2461 type u32s = u32x16;
2462 type u64s = u64x8;
2463
2464 const REGISTER_COUNT: usize = 8;
2465
2466 inherit_x2!(V3_256b(*self), {
2467 fn abs2_c32s(self, a: Self::c32s) -> Self::c32s;
2468 fn abs2_c64s(self, a: Self::c64s) -> Self::c64s;
2469 fn abs_max_c32s(self, a: Self::c32s) -> Self::c32s;
2470 fn abs_max_c64s(self, a: Self::c64s) -> Self::c64s;
2471 fn add_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
2472 fn add_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
2473 fn add_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2474 fn add_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2475 fn add_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2476 fn add_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2477 fn and_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
2478 fn and_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
2479 fn and_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2480 fn and_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2481 fn conj_c32s(self, a: Self::c32s) -> Self::c32s;
2482 fn conj_c64s(self, a: Self::c64s) -> Self::c64s;
2483 fn conj_mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s;
2484 fn conj_mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s;
2485 fn conj_mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
2486 fn conj_mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
2487 fn div_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2488 fn div_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2489 fn equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
2490 fn equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
2491 fn greater_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
2492 fn greater_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
2493 fn greater_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
2494 fn greater_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
2495 fn less_than_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
2496 fn less_than_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
2497 fn less_than_or_equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
2498 fn less_than_or_equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
2499 fn less_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
2500 fn less_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
2501 fn less_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
2502 fn less_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
2503 fn max_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2504 fn max_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2505 fn min_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2506 fn min_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2507 fn mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s;
2508 fn mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s;
2509 fn mul_add_e_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s;
2510 fn mul_add_e_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s;
2511 fn mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s;
2512 fn mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s;
2513 fn mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
2514 fn mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
2515 fn mul_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2516 fn mul_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2517 fn neg_c32s(self, a: Self::c32s) -> Self::c32s;
2518 fn neg_c64s(self, a: Self::c64s) -> Self::c64s;
2519 fn not_m32s(self, a: Self::m32s) -> Self::m32s;
2520 fn not_m64s(self, a: Self::m64s) -> Self::m64s;
2521 fn not_u32s(self, a: Self::u32s) -> Self::u32s;
2522 fn not_u64s(self, a: Self::u64s) -> Self::u64s;
2523 fn or_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
2524 fn or_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
2525 fn or_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2526 fn or_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2527 fn select_u32s_m32s(
2528 self,
2529 mask: Self::m32s,
2530 if_true: Self::u32s,
2531 if_false: Self::u32s,
2532 ) -> Self::u32s;
2533 fn select_u64s_m64s(
2534 self,
2535 mask: Self::m64s,
2536 if_true: Self::u64s,
2537 if_false: Self::u64s,
2538 ) -> Self::u64s;
2539 fn sub_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
2540 fn sub_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
2541 fn sub_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2542 fn sub_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2543 fn sub_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2544 fn sub_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2545 fn swap_re_im_c32s(self, a: Self::c32s) -> Self::c32s;
2546 fn swap_re_im_c64s(self, a: Self::c64s) -> Self::c64s;
2547 fn wrapping_dyn_shl_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s;
2548 fn wrapping_dyn_shr_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s;
2549 fn xor_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
2550 fn xor_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
2551 fn xor_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2552 fn xor_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2553 fn greater_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
2554 fn greater_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
2555 fn greater_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
2556 fn greater_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
2557 fn less_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
2558 fn less_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
2559 fn less_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
2560 fn less_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
2561 });
2562
2563 inherit_x2!(V3_256b(*self), splat, {
2564 fn splat_c32s(self, value: c32) -> Self::c32s;
2565 fn splat_c64s(self, value: c64) -> Self::c64s;
2566 fn splat_f32s(self, value: f32) -> Self::f32s;
2567 fn splat_f64s(self, value: f64) -> Self::f64s;
2568 fn splat_u32s(self, value: u32) -> Self::u32s;
2569 fn splat_u64s(self, value: u64) -> Self::u64s;
2570 });
2571
2572 inherit_x2!(V3_256b(*self), wide, {
2573 fn widening_mul_u32s(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s);
2574 });
2575
2576 #[inline(always)]
2577 fn rotate_right_c32s(self, a: Self::c32s, amount: usize) -> Self::c32s {
2578 let simd = V3_256b(*self);
2579 let amount = amount % Self::C32_LANES;
2580 let [mut a0, mut a1]: [_; 2] = cast!(a);
2581 if amount >= Self::C32_LANES / 2 {
2582 core::mem::swap(&mut a0, &mut a1);
2583 }
2584 let amount = amount % (Self::C32_LANES / 2);
2585 let mask = simd.mask_between_m32s(0, amount as _).mask();
2586 let a0 = simd.rotate_right_c32s(a0, amount);
2587 let a1 = simd.rotate_right_c32s(a1, amount);
2588
2589 cast!([
2590 simd.select_f32s_m32s(mask, a1, a0),
2591 simd.select_f32s_m32s(mask, a0, a1),
2592 ])
2593 }
2594
2595 #[inline(always)]
2596 fn rotate_right_c64s(self, a: Self::c64s, amount: usize) -> Self::c64s {
2597 let simd = V3_256b(*self);
2598 let amount = amount % Self::C64_LANES;
2599 let [mut a0, mut a1]: [_; 2] = cast!(a);
2600 if amount >= Self::C64_LANES / 2 {
2601 core::mem::swap(&mut a0, &mut a1);
2602 }
2603 let amount = amount % (Self::C64_LANES / 2);
2604 let mask = simd.mask_between_m64s(0, amount as _).mask();
2605 let a0 = simd.rotate_right_c64s(a0, amount);
2606 let a1 = simd.rotate_right_c64s(a1, amount);
2607
2608 cast!([
2609 simd.select_f64s_m64s(mask, a1, a0),
2610 simd.select_f64s_m64s(mask, a0, a1),
2611 ])
2612 }
2613
2614 #[inline(always)]
2615 fn rotate_right_u32s(self, a: Self::u32s, amount: usize) -> Self::u32s {
2616 let simd = V3_256b(*self);
2617 let amount = amount % Self::U32_LANES;
2618 let [mut a0, mut a1]: [_; 2] = cast!(a);
2619 if amount >= Self::U32_LANES / 2 {
2620 core::mem::swap(&mut a0, &mut a1);
2621 }
2622 let amount = amount % (Self::U32_LANES / 2);
2623 let mask = simd.mask_between_m32s(0, amount as _).mask();
2624 let a0 = simd.rotate_right_u32s(a0, amount);
2625 let a1 = simd.rotate_right_u32s(a1, amount);
2626
2627 cast!([
2628 simd.select_u32s_m32s(mask, a1, a0),
2629 simd.select_u32s_m32s(mask, a0, a1),
2630 ])
2631 }
2632
    #[inline(always)]
    /// Rotates the `u64` lanes of `a` right by `amount` positions, by rotating
    /// the two 256-bit halves and blending them back together.
    fn rotate_right_u64s(self, a: Self::u64s, amount: usize) -> Self::u64s {
        let simd = V3_256b(*self);
        let amount = amount % Self::U64_LANES;
        // Reinterpret the 512-bit value as two 256-bit halves.
        let [mut a0, mut a1]: [_; 2] = cast!(a);
        // A rotation by at least half the lanes is a half-swap plus a smaller
        // rotation.
        if amount >= Self::U64_LANES / 2 {
            core::mem::swap(&mut a0, &mut a1);
        }
        let amount = amount % (Self::U64_LANES / 2);
        // Mask of the lanes that wrap over from the opposite half.
        let mask = simd.mask_between_m64s(0, amount as _).mask();
        let a0 = simd.rotate_right_u64s(a0, amount);
        let a1 = simd.rotate_right_u64s(a1, amount);

        cast!([
            simd.select_u64s_m64s(mask, a1, a0),
            simd.select_u64s_m64s(mask, a0, a1),
        ])
    }
2651
2652 #[inline(always)]
2656 unsafe fn mask_load_ptr_c32s(self, mask: MemMask<Self::m32s>, ptr: *const c32) -> Self::c32s {
2657 let simd = V3_256b(*self);
2658 let mask: [_; 2] = cast!(mask.mask());
2659 cast!([
2660 simd.mask_load_ptr_c32s(MemMask::new(mask[0]), ptr.wrapping_add(0)),
2661 simd.mask_load_ptr_c32s(MemMask::new(mask[1]), ptr.wrapping_add(Self::C32_LANES)),
2662 ])
2663 }
2664
2665 #[inline(always)]
2669 unsafe fn mask_load_ptr_c64s(self, mask: MemMask<Self::m64s>, ptr: *const c64) -> Self::c64s {
2670 let simd = V3_256b(*self);
2671 let mask: [_; 2] = cast!(mask.mask());
2672 cast!([
2673 simd.mask_load_ptr_c64s(MemMask::new(mask[0]), ptr.wrapping_add(0)),
2674 simd.mask_load_ptr_c64s(MemMask::new(mask[1]), ptr.wrapping_add(Self::C64_LANES)),
2675 ])
2676 }
2677
2678 #[inline(always)]
2682 unsafe fn mask_load_ptr_u32s(self, mask: MemMask<Self::m32s>, ptr: *const u32) -> Self::u32s {
2683 let simd = V3_256b(*self);
2684 let mask: [_; 2] = cast!(mask.mask());
2685 cast!([
2686 simd.mask_load_ptr_u32s(MemMask::new(mask[0]), ptr.wrapping_add(0)),
2687 simd.mask_load_ptr_u32s(MemMask::new(mask[1]), ptr.wrapping_add(Self::U32_LANES)),
2688 ])
2689 }
2690
2691 #[inline(always)]
2695 unsafe fn mask_load_ptr_u64s(self, mask: MemMask<Self::m64s>, ptr: *const u64) -> Self::u64s {
2696 let simd = V3_256b(*self);
2697 let mask: [_; 2] = cast!(mask.mask());
2698 cast!([
2699 simd.mask_load_ptr_u64s(MemMask::new(mask[0]), ptr.wrapping_add(0)),
2700 simd.mask_load_ptr_u64s(MemMask::new(mask[1]), ptr.wrapping_add(Self::U64_LANES)),
2701 ])
2702 }
2703
2704 #[inline(always)]
2708 unsafe fn mask_store_ptr_c32s(
2709 self,
2710 mask: MemMask<Self::m32s>,
2711 ptr: *mut c32,
2712 values: Self::c32s,
2713 ) {
2714 let simd = V3_256b(*self);
2715 let mask: [_; 2] = cast!(mask.mask());
2716 let values: [_; 2] = cast!(values);
2717 cast!([
2718 simd.mask_store_ptr_c32s(MemMask::new(mask[0]), ptr.wrapping_add(0), values[0]),
2719 simd.mask_store_ptr_c32s(
2720 MemMask::new(mask[1]),
2721 ptr.wrapping_add(Self::C32_LANES),
2722 values[1]
2723 ),
2724 ])
2725 }
2726
2727 #[inline(always)]
2731 unsafe fn mask_store_ptr_c64s(
2732 self,
2733 mask: MemMask<Self::m64s>,
2734 ptr: *mut c64,
2735 values: Self::c64s,
2736 ) {
2737 let simd = V3_256b(*self);
2738 let mask: [_; 2] = cast!(mask.mask());
2739 let values: [_; 2] = cast!(values);
2740 cast!([
2741 simd.mask_store_ptr_c64s(MemMask::new(mask[0]), ptr.wrapping_add(0), values[0]),
2742 simd.mask_store_ptr_c64s(
2743 MemMask::new(mask[1]),
2744 ptr.wrapping_add(Self::C64_LANES),
2745 values[1]
2746 ),
2747 ])
2748 }
2749
2750 #[inline(always)]
2754 unsafe fn mask_store_ptr_u32s(
2755 self,
2756 mask: MemMask<Self::m32s>,
2757 ptr: *mut u32,
2758 values: Self::u32s,
2759 ) {
2760 let simd = V3_256b(*self);
2761 let mask: [_; 2] = cast!(mask.mask());
2762 let values: [_; 2] = cast!(values);
2763 cast!([
2764 simd.mask_store_ptr_u32s(MemMask::new(mask[0]), ptr.wrapping_add(0), values[0]),
2765 simd.mask_store_ptr_u32s(
2766 MemMask::new(mask[1]),
2767 ptr.wrapping_add(Self::U32_LANES),
2768 values[1]
2769 ),
2770 ])
2771 }
2772
2773 #[inline(always)]
2777 unsafe fn mask_store_ptr_u64s(
2778 self,
2779 mask: MemMask<Self::m64s>,
2780 ptr: *mut u64,
2781 values: Self::u64s,
2782 ) {
2783 let simd = V3_256b(*self);
2784 let mask: [_; 2] = cast!(mask.mask());
2785 let values: [_; 2] = cast!(values);
2786 cast!([
2787 simd.mask_store_ptr_u64s(MemMask::new(mask[0]), ptr.wrapping_add(0), values[0]),
2788 simd.mask_store_ptr_u64s(
2789 MemMask::new(mask[1]),
2790 ptr.wrapping_add(Self::U64_LANES),
2791 values[1]
2792 ),
2793 ])
2794 }
2795
    #[inline(always)]
    /// Horizontal max over `c32` data: folds the two 256-bit halves with a
    /// lane-wise `f32` max, then reduces the surviving half.
    fn reduce_max_c32s(self, a: Self::c32s) -> c32 {
        let simd = V3_256b(*self);
        let a: [_; 2] = cast!(a);
        simd.reduce_max_c32s(simd.max_f32s(a[0], a[1]))
    }
2802
    #[inline(always)]
    /// Horizontal max over `c64` data: folds the two 256-bit halves with a
    /// lane-wise `f64` max, then reduces the surviving half.
    fn reduce_max_c64s(self, a: Self::c64s) -> c64 {
        let simd = V3_256b(*self);
        let a: [_; 2] = cast!(a);
        simd.reduce_max_c64s(simd.max_f64s(a[0], a[1]))
    }
2809
    #[inline(always)]
    /// Horizontal max over `f32` lanes: folds the two 256-bit halves with a
    /// lane-wise max, then reduces the surviving half.
    fn reduce_max_f32s(self, a: Self::f32s) -> f32 {
        let simd = V3_256b(*self);
        let a: [_; 2] = cast!(a);
        simd.reduce_max_f32s(simd.max_f32s(a[0], a[1]))
    }
2816
    #[inline(always)]
    /// Horizontal max over `f64` lanes: folds the two 256-bit halves with a
    /// lane-wise max, then reduces the surviving half.
    fn reduce_max_f64s(self, a: Self::f64s) -> f64 {
        let simd = V3_256b(*self);
        let a: [_; 2] = cast!(a);
        simd.reduce_max_f64s(simd.max_f64s(a[0], a[1]))
    }
2823
    #[inline(always)]
    /// Horizontal min over `c32` data: folds the two 256-bit halves with a
    /// lane-wise `f32` min, then reduces the surviving half.
    fn reduce_min_c32s(self, a: Self::c32s) -> c32 {
        let simd = V3_256b(*self);
        let a: [_; 2] = cast!(a);
        simd.reduce_min_c32s(simd.min_f32s(a[0], a[1]))
    }
2830
    #[inline(always)]
    /// Horizontal min over `c64` data: folds the two 256-bit halves with a
    /// lane-wise `f64` min, then reduces the surviving half.
    fn reduce_min_c64s(self, a: Self::c64s) -> c64 {
        let simd = V3_256b(*self);
        let a: [_; 2] = cast!(a);
        simd.reduce_min_c64s(simd.min_f64s(a[0], a[1]))
    }
2837
    #[inline(always)]
    /// Horizontal min over `f32` lanes: folds the two 256-bit halves with a
    /// lane-wise min, then reduces the surviving half.
    fn reduce_min_f32s(self, a: Self::f32s) -> f32 {
        let simd = V3_256b(*self);
        let a: [_; 2] = cast!(a);
        simd.reduce_min_f32s(simd.min_f32s(a[0], a[1]))
    }
2844
    #[inline(always)]
    /// Horizontal min over `f64` lanes: folds the two 256-bit halves with a
    /// lane-wise min, then reduces the surviving half.
    fn reduce_min_f64s(self, a: Self::f64s) -> f64 {
        let simd = V3_256b(*self);
        let a: [_; 2] = cast!(a);
        simd.reduce_min_f64s(simd.min_f64s(a[0], a[1]))
    }
2851
    #[inline(always)]
    /// Horizontal product over `f32` lanes: multiplies the two 256-bit halves
    /// lane-wise, then reduces the surviving half.
    fn reduce_product_f32s(self, a: Self::f32s) -> f32 {
        let simd = V3_256b(*self);
        let a: [_; 2] = cast!(a);
        simd.reduce_product_f32s(simd.mul_f32s(a[0], a[1]))
    }
2858
    #[inline(always)]
    /// Horizontal product over `f64` lanes: multiplies the two 256-bit halves
    /// lane-wise, then reduces the surviving half.
    fn reduce_product_f64s(self, a: Self::f64s) -> f64 {
        let simd = V3_256b(*self);
        let a: [_; 2] = cast!(a);
        simd.reduce_product_f64s(simd.mul_f64s(a[0], a[1]))
    }
2865
    #[inline(always)]
    /// Horizontal sum over `c32` lanes: adds the two 256-bit halves, then
    /// reduces the surviving half.
    fn reduce_sum_c32s(self, a: Self::c32s) -> c32 {
        let simd = V3_256b(*self);
        let a: [_; 2] = cast!(a);
        simd.reduce_sum_c32s(simd.add_c32s(a[0], a[1]))
    }
2872
    #[inline(always)]
    /// Horizontal sum over `c64` lanes: adds the two 256-bit halves, then
    /// reduces the surviving half.
    fn reduce_sum_c64s(self, a: Self::c64s) -> c64 {
        let simd = V3_256b(*self);
        let a: [_; 2] = cast!(a);
        simd.reduce_sum_c64s(simd.add_c64s(a[0], a[1]))
    }
2879
    #[inline(always)]
    /// Horizontal sum over `f32` lanes: adds the two 256-bit halves, then
    /// reduces the surviving half.
    fn reduce_sum_f32s(self, a: Self::f32s) -> f32 {
        let simd = V3_256b(*self);
        let a: [_; 2] = cast!(a);
        simd.reduce_sum_f32s(simd.add_f32s(a[0], a[1]))
    }
2886
    #[inline(always)]
    /// Horizontal sum over `f64` lanes: adds the two 256-bit halves, then
    /// reduces the surviving half.
    fn reduce_sum_f64s(self, a: Self::f64s) -> f64 {
        let simd = V3_256b(*self);
        let a: [_; 2] = cast!(a);
        simd.reduce_sum_f64s(simd.add_f64s(a[0], a[1]))
    }
2893
    #[inline(always)]
    /// Executes `op` under the feature set carried by the inner `V3` token.
    fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output {
        // NOTE(review): fully-qualified call — presumably to avoid ambiguity
        // with an inherent `vectorize` on the wrapped token; confirm.
        Simd::vectorize(self.0, op)
    }
2898}
2899
2900impl V3 {
2901 #[inline(always)]
2903 pub fn abs_f32x8(self, a: f32x8) -> f32x8 {
2904 self.and_f32x8(a, cast!(self.splat_u32x8((1 << 31) - 1)))
2905 }
2906
2907 #[inline(always)]
2909 pub fn abs_f64x4(self, a: f64x4) -> f64x4 {
2910 self.and_f64x4(a, cast!(self.splat_u64x4((1 << 63) - 1)))
2911 }
2912
    /// Adds the corresponding `f32` lanes of `a` and `b`.
    #[inline(always)]
    pub fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
        cast!(self.avx._mm256_add_ps(cast!(a), cast!(b)))
    }
2918
    /// Adds the corresponding `f64` lanes of `a` and `b`.
    #[inline(always)]
    pub fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
        cast!(self.avx._mm256_add_pd(cast!(a), cast!(b)))
    }
2924
    /// Bitwise AND of the raw bits of `a` and `b`.
    #[inline(always)]
    pub fn and_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
        cast!(self.avx._mm256_and_ps(cast!(a), cast!(b)))
    }
2930
    /// Bitwise AND of the raw bits of `a` and `b`.
    #[inline(always)]
    pub fn and_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
        cast!(self.avx._mm256_and_pd(cast!(a), cast!(b)))
    }
2936
    /// Bitwise AND of `a` and `b`.
    #[inline(always)]
    pub fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
    }
2942
    /// Bitwise AND of `a` and `b`.
    #[inline(always)]
    pub fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
    }
2948
    /// Bitwise AND of `a` and `b`.
    #[inline(always)]
    pub fn and_i64x4(self, a: i64x4, b: i64x4) -> i64x4 {
        cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
    }
2954
    /// Bitwise AND of `a` and `b`.
    #[inline(always)]
    pub fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
        cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
    }
2960
    /// Bitwise AND of the masks `a` and `b`.
    #[inline(always)]
    pub fn and_m16x16(self, a: m16x16, b: m16x16) -> m16x16 {
        cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
    }
2966
    /// Bitwise AND of the masks `a` and `b`.
    #[inline(always)]
    pub fn and_m32x8(self, a: m32x8, b: m32x8) -> m32x8 {
        cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
    }
2972
    /// Bitwise AND of the masks `a` and `b`.
    #[inline(always)]
    pub fn and_m64x4(self, a: m64x4, b: m64x4) -> m64x4 {
        cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
    }
2978
    /// Bitwise AND of the masks `a` and `b`.
    #[inline(always)]
    pub fn and_m8x32(self, a: m8x32, b: m8x32) -> m8x32 {
        cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
    }
2984
    /// Bitwise AND of `a` and `b`.
    #[inline(always)]
    pub fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
        cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
    }
2990
    /// Bitwise AND of `a` and `b`.
    #[inline(always)]
    pub fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
        cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
    }
2996
    /// Bitwise AND of `a` and `b`.
    #[inline(always)]
    pub fn and_u64x4(self, a: u64x4, b: u64x4) -> u64x4 {
        cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
    }
3002
    /// Bitwise AND of `a` and `b`.
    #[inline(always)]
    pub fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
        cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
    }
3008
    /// Returns `!a & b` over the raw bits (note: the underlying `andnot`
    /// intrinsic negates its *first* operand).
    #[inline(always)]
    pub fn andnot_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
        cast!(self.avx._mm256_andnot_ps(cast!(a), cast!(b)))
    }
3014
    /// Returns `!a & b` over the raw bits.
    #[inline(always)]
    pub fn andnot_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
        cast!(self.avx._mm256_andnot_pd(cast!(a), cast!(b)))
    }
3020
    /// Returns `!a & b`, bitwise.
    #[inline(always)]
    pub fn andnot_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
    }
3026
    /// Returns `!a & b`, bitwise.
    #[inline(always)]
    pub fn andnot_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
    }
3032
    /// Returns `!a & b`, bitwise.
    #[inline(always)]
    pub fn andnot_i64x4(self, a: i64x4, b: i64x4) -> i64x4 {
        cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
    }
3038
    /// Returns `!a & b`, bitwise.
    #[inline(always)]
    pub fn andnot_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
        cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
    }
3044
    /// Returns `!a & b` for the masks, bitwise.
    #[inline(always)]
    pub fn andnot_m16x16(self, a: m16x16, b: m16x16) -> m16x16 {
        cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
    }
3050
    /// Returns `!a & b` for the masks, bitwise.
    #[inline(always)]
    pub fn andnot_m32x8(self, a: m32x8, b: m32x8) -> m32x8 {
        cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
    }
3056
    /// Returns `!a & b` for the masks, bitwise.
    #[inline(always)]
    pub fn andnot_m64x4(self, a: m64x4, b: m64x4) -> m64x4 {
        cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
    }
3062
    /// Returns `!a & b` for the masks, bitwise.
    #[inline(always)]
    pub fn andnot_m8x32(self, a: m8x32, b: m8x32) -> m8x32 {
        cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
    }
3068
    /// Returns `!a & b`, bitwise.
    #[inline(always)]
    pub fn andnot_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
        cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
    }
3074
    /// Returns `!a & b`, bitwise.
    #[inline(always)]
    pub fn andnot_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
        cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
    }
3080
    /// Returns `!a & b`, bitwise.
    #[inline(always)]
    pub fn andnot_u64x4(self, a: u64x4, b: u64x4) -> u64x4 {
        cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
    }
3086
    /// Returns `!a & b`, bitwise.
    #[inline(always)]
    pub fn andnot_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
        cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
    }
3092
    /// Per lane: negates `a` where `sign` is negative, zeroes it where `sign`
    /// is zero, and leaves it unchanged where `sign` is positive.
    /// (Note the argument swap into the intrinsic, which takes `(a, sign)`.)
    #[inline(always)]
    pub fn apply_sign_i16x16(self, sign: i16x16, a: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_sign_epi16(cast!(a), cast!(sign)))
    }
3101
    /// Per lane: negates `a` where `sign` is negative, zeroes it where `sign`
    /// is zero, and leaves it unchanged where `sign` is positive.
    #[inline(always)]
    pub fn apply_sign_i32x8(self, sign: i32x8, a: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_sign_epi32(cast!(a), cast!(sign)))
    }
3110
    /// Per lane: negates `a` where `sign` is negative, zeroes it where `sign`
    /// is zero, and leaves it unchanged where `sign` is positive.
    #[inline(always)]
    pub fn apply_sign_i8x32(self, sign: i8x32, a: i8x32) -> i8x32 {
        cast!(self.avx2._mm256_sign_epi8(cast!(a), cast!(sign)))
    }
3119
    /// Approximate reciprocal `1.0 / a` per `f32` lane (low-precision `rcp`
    /// intrinsic; roughly 12 bits of relative accuracy).
    #[inline(always)]
    pub fn approx_reciprocal_f32x8(self, a: f32x8) -> f32x8 {
        cast!(self.avx._mm256_rcp_ps(cast!(a)))
    }
3125
    /// Approximate reciprocal square root `1.0 / sqrt(a)` per `f32` lane
    /// (low-precision `rsqrt` intrinsic).
    #[inline(always)]
    pub fn approx_reciprocal_sqrt_f32x8(self, a: f32x8) -> f32x8 {
        cast!(self.avx._mm256_rsqrt_ps(cast!(a)))
    }
3131
    /// Rounding average per lane: `(a + b + 1) >> 1` without overflow.
    #[inline(always)]
    pub fn average_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
        cast!(self.avx2._mm256_avg_epu16(cast!(a), cast!(b)))
    }
3137
    /// Rounding average per lane: `(a + b + 1) >> 1` without overflow.
    #[inline(always)]
    pub fn average_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
        cast!(self.avx2._mm256_avg_epu8(cast!(a), cast!(b)))
    }
3143
    /// Rounds each `f32` lane up toward positive infinity.
    #[inline(always)]
    pub fn ceil_f32x8(self, a: f32x8) -> f32x8 {
        cast!(self.avx._mm256_ceil_ps(cast!(a)))
    }
3149
    /// Rounds each `f64` lane up toward positive infinity.
    #[inline(always)]
    pub fn ceil_f64x4(self, a: f64x4) -> f64x4 {
        cast!(self.avx._mm256_ceil_pd(cast!(a)))
    }
3155
    /// Lane-wise `a == b` mask; ordered-quiet predicate, so lanes containing
    /// NaN compare false.
    #[inline(always)]
    pub fn cmp_eq_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
        cast!(self.avx._mm256_cmp_ps::<_CMP_EQ_OQ>(cast!(a), cast!(b)))
    }
3161
    /// Lane-wise `a == b` mask; ordered-quiet predicate, so lanes containing
    /// NaN compare false.
    #[inline(always)]
    pub fn cmp_eq_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
        cast!(self.avx._mm256_cmp_pd::<_CMP_EQ_OQ>(cast!(a), cast!(b)))
    }
3167
    /// Lane-wise `a == b` mask for 16-bit lanes.
    #[inline(always)]
    pub fn cmp_eq_i16x16(self, a: i16x16, b: i16x16) -> m16x16 {
        cast!(self.avx2._mm256_cmpeq_epi16(cast!(a), cast!(b)))
    }
3173
    /// Lane-wise `a == b` mask for 32-bit lanes.
    #[inline(always)]
    pub fn cmp_eq_i32x8(self, a: i32x8, b: i32x8) -> m32x8 {
        cast!(self.avx2._mm256_cmpeq_epi32(cast!(a), cast!(b)))
    }
3179
    /// Lane-wise `a == b` mask for 64-bit lanes.
    #[inline(always)]
    pub fn cmp_eq_i64x4(self, a: i64x4, b: i64x4) -> m64x4 {
        cast!(self.avx2._mm256_cmpeq_epi64(cast!(a), cast!(b)))
    }
3185
    /// Lane-wise `a == b` mask for 8-bit lanes.
    #[inline(always)]
    pub fn cmp_eq_i8x32(self, a: i8x32, b: i8x32) -> m8x32 {
        cast!(self.avx2._mm256_cmpeq_epi8(cast!(a), cast!(b)))
    }
3191
3192 #[inline(always)]
3194 pub fn cmp_eq_u16x16(self, a: u16x16, b: u16x16) -> m16x16 {
3195 cast!(self.avx2._mm256_cmpeq_epi16(cast!(a), cast!(b)))
3196 }
3197
3198 #[inline(always)]
3200 pub fn cmp_eq_u32x8(self, a: u32x8, b: u32x8) -> m32x8 {
3201 cast!(self.avx2._mm256_cmpeq_epi32(cast!(a), cast!(b)))
3202 }
3203
3204 #[inline(always)]
3206 pub fn cmp_eq_u64x4(self, a: u64x4, b: u64x4) -> m64x4 {
3207 cast!(self.avx2._mm256_cmpeq_epi64(cast!(a), cast!(b)))
3208 }
3209
3210 #[inline(always)]
3212 pub fn cmp_eq_u8x32(self, a: u8x32, b: u8x32) -> m8x32 {
3213 cast!(self.avx2._mm256_cmpeq_epi8(cast!(a), cast!(b)))
3214 }
3215
    /// Lane-wise `a >= b` (ordered, quiet: a lane is false if either input is NaN).
    #[inline(always)]
    pub fn cmp_ge_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
        cast!(self.avx._mm256_cmp_ps::<_CMP_GE_OQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise `a >= b` (ordered, quiet: a lane is false if either input is NaN).
    #[inline(always)]
    pub fn cmp_ge_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
        cast!(self.avx._mm256_cmp_pd::<_CMP_GE_OQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise signed `a >= b`, computed as `!(a < b)` since AVX2 has no
    /// direct greater-or-equal integer compare.
    #[inline(always)]
    pub fn cmp_ge_i16x16(self, a: i16x16, b: i16x16) -> m16x16 {
        self.not_m16x16(self.cmp_lt_i16x16(a, b))
    }

    /// Lane-wise signed `a >= b`, computed as `!(a < b)`.
    #[inline(always)]
    pub fn cmp_ge_i32x8(self, a: i32x8, b: i32x8) -> m32x8 {
        self.not_m32x8(self.cmp_lt_i32x8(a, b))
    }

    /// Lane-wise signed `a >= b`, computed as `!(a < b)`.
    #[inline(always)]
    pub fn cmp_ge_i64x4(self, a: i64x4, b: i64x4) -> m64x4 {
        self.not_m64x4(self.cmp_lt_i64x4(a, b))
    }

    /// Lane-wise signed `a >= b`, computed as `!(a < b)`.
    #[inline(always)]
    pub fn cmp_ge_i8x32(self, a: i8x32, b: i8x32) -> m8x32 {
        self.not_m8x32(self.cmp_lt_i8x32(a, b))
    }

    /// Lane-wise unsigned `a >= b`, computed as `!(a < b)`.
    #[inline(always)]
    pub fn cmp_ge_u16x16(self, a: u16x16, b: u16x16) -> m16x16 {
        self.not_m16x16(self.cmp_lt_u16x16(a, b))
    }

    /// Lane-wise unsigned `a >= b`, computed as `!(a < b)`.
    #[inline(always)]
    pub fn cmp_ge_u32x8(self, a: u32x8, b: u32x8) -> m32x8 {
        self.not_m32x8(self.cmp_lt_u32x8(a, b))
    }

    /// Lane-wise unsigned `a >= b`, computed as `!(a < b)`.
    #[inline(always)]
    pub fn cmp_ge_u64x4(self, a: u64x4, b: u64x4) -> m64x4 {
        self.not_m64x4(self.cmp_lt_u64x4(a, b))
    }

    /// Lane-wise unsigned `a >= b`, computed as `!(a < b)`.
    #[inline(always)]
    pub fn cmp_ge_u8x32(self, a: u8x32, b: u8x32) -> m8x32 {
        self.not_m8x32(self.cmp_lt_u8x32(a, b))
    }
3275
    /// Lane-wise `a > b` (ordered, quiet: a lane is false if either input is NaN).
    #[inline(always)]
    pub fn cmp_gt_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
        cast!(self.avx._mm256_cmp_ps::<_CMP_GT_OQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise `a > b` (ordered, quiet: a lane is false if either input is NaN).
    #[inline(always)]
    pub fn cmp_gt_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
        cast!(self.avx._mm256_cmp_pd::<_CMP_GT_OQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise signed `a > b` for 16-bit lanes.
    #[inline(always)]
    pub fn cmp_gt_i16x16(self, a: i16x16, b: i16x16) -> m16x16 {
        cast!(self.avx2._mm256_cmpgt_epi16(cast!(a), cast!(b)))
    }

    /// Lane-wise signed `a > b` for 32-bit lanes.
    #[inline(always)]
    pub fn cmp_gt_i32x8(self, a: i32x8, b: i32x8) -> m32x8 {
        cast!(self.avx2._mm256_cmpgt_epi32(cast!(a), cast!(b)))
    }

    /// Lane-wise signed `a > b` for 64-bit lanes.
    #[inline(always)]
    pub fn cmp_gt_i64x4(self, a: i64x4, b: i64x4) -> m64x4 {
        cast!(self.avx2._mm256_cmpgt_epi64(cast!(a), cast!(b)))
    }

    /// Lane-wise signed `a > b` for 8-bit lanes.
    #[inline(always)]
    pub fn cmp_gt_i8x32(self, a: i8x32, b: i8x32) -> m8x32 {
        cast!(self.avx2._mm256_cmpgt_epi8(cast!(a), cast!(b)))
    }
3311
3312 #[inline(always)]
3314 pub fn cmp_gt_u16x16(self, a: u16x16, b: u16x16) -> m16x16 {
3315 let k = self.splat_u16x16(0x8000);
3316 self.cmp_gt_i16x16(cast!(self.xor_u16x16(a, k)), cast!(self.xor_u16x16(b, k)))
3317 }
3318
3319 #[inline(always)]
3321 pub fn cmp_gt_u32x8(self, a: u32x8, b: u32x8) -> m32x8 {
3322 let k = self.splat_u32x8(0x80000000);
3323 self.cmp_gt_i32x8(cast!(self.xor_u32x8(a, k)), cast!(self.xor_u32x8(b, k)))
3324 }
3325
3326 #[inline(always)]
3328 pub fn cmp_gt_u64x4(self, a: u64x4, b: u64x4) -> m64x4 {
3329 let k = self.splat_u64x4(0x8000000000000000);
3330 self.cmp_gt_i64x4(cast!(self.xor_u64x4(a, k)), cast!(self.xor_u64x4(b, k)))
3331 }
3332
3333 #[inline(always)]
3335 pub fn cmp_gt_u8x32(self, a: u8x32, b: u8x32) -> m8x32 {
3336 let k = self.splat_u8x32(0x80);
3337 self.cmp_gt_i8x32(cast!(self.xor_u8x32(a, k)), cast!(self.xor_u8x32(b, k)))
3338 }
3339
    /// Lane-wise `a <= b` (ordered, quiet: a lane is false if either input is NaN).
    #[inline(always)]
    pub fn cmp_le_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
        cast!(self.avx._mm256_cmp_ps::<_CMP_LE_OQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise `a <= b` (ordered, quiet: a lane is false if either input is NaN).
    #[inline(always)]
    pub fn cmp_le_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
        cast!(self.avx._mm256_cmp_pd::<_CMP_LE_OQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise signed `a <= b`, computed as `!(a > b)` since AVX2 has no
    /// direct less-or-equal integer compare.
    #[inline(always)]
    pub fn cmp_le_i16x16(self, a: i16x16, b: i16x16) -> m16x16 {
        self.not_m16x16(self.cmp_gt_i16x16(a, b))
    }

    /// Lane-wise signed `a <= b`, computed as `!(a > b)`.
    #[inline(always)]
    pub fn cmp_le_i32x8(self, a: i32x8, b: i32x8) -> m32x8 {
        self.not_m32x8(self.cmp_gt_i32x8(a, b))
    }

    /// Lane-wise signed `a <= b`, computed as `!(a > b)`.
    #[inline(always)]
    pub fn cmp_le_i64x4(self, a: i64x4, b: i64x4) -> m64x4 {
        self.not_m64x4(self.cmp_gt_i64x4(a, b))
    }

    /// Lane-wise signed `a <= b`, computed as `!(a > b)`.
    #[inline(always)]
    pub fn cmp_le_i8x32(self, a: i8x32, b: i8x32) -> m8x32 {
        self.not_m8x32(self.cmp_gt_i8x32(a, b))
    }

    /// Lane-wise unsigned `a <= b`, computed as `!(a > b)`.
    #[inline(always)]
    pub fn cmp_le_u16x16(self, a: u16x16, b: u16x16) -> m16x16 {
        self.not_m16x16(self.cmp_gt_u16x16(a, b))
    }

    /// Lane-wise unsigned `a <= b`, computed as `!(a > b)`.
    #[inline(always)]
    pub fn cmp_le_u32x8(self, a: u32x8, b: u32x8) -> m32x8 {
        self.not_m32x8(self.cmp_gt_u32x8(a, b))
    }

    /// Lane-wise unsigned `a <= b`, computed as `!(a > b)`.
    #[inline(always)]
    pub fn cmp_le_u64x4(self, a: u64x4, b: u64x4) -> m64x4 {
        self.not_m64x4(self.cmp_gt_u64x4(a, b))
    }

    /// Lane-wise unsigned `a <= b`, computed as `!(a > b)`.
    #[inline(always)]
    pub fn cmp_le_u8x32(self, a: u8x32, b: u8x32) -> m8x32 {
        self.not_m8x32(self.cmp_gt_u8x32(a, b))
    }
3399
    /// Lane-wise `a < b` (ordered, quiet: a lane is false if either input is NaN).
    #[inline(always)]
    pub fn cmp_lt_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
        cast!(self.avx._mm256_cmp_ps::<_CMP_LT_OQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise `a < b` (ordered, quiet: a lane is false if either input is NaN).
    #[inline(always)]
    pub fn cmp_lt_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
        cast!(self.avx._mm256_cmp_pd::<_CMP_LT_OQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise signed `a < b`, computed as `b > a` with swapped operands.
    #[inline(always)]
    pub fn cmp_lt_i16x16(self, a: i16x16, b: i16x16) -> m16x16 {
        cast!(self.avx2._mm256_cmpgt_epi16(cast!(b), cast!(a)))
    }

    /// Lane-wise signed `a < b`, computed as `b > a` with swapped operands.
    #[inline(always)]
    pub fn cmp_lt_i32x8(self, a: i32x8, b: i32x8) -> m32x8 {
        cast!(self.avx2._mm256_cmpgt_epi32(cast!(b), cast!(a)))
    }

    /// Lane-wise signed `a < b`, computed as `b > a` with swapped operands.
    #[inline(always)]
    pub fn cmp_lt_i64x4(self, a: i64x4, b: i64x4) -> m64x4 {
        cast!(self.avx2._mm256_cmpgt_epi64(cast!(b), cast!(a)))
    }

    /// Lane-wise signed `a < b`, computed as `b > a` with swapped operands.
    #[inline(always)]
    pub fn cmp_lt_i8x32(self, a: i8x32, b: i8x32) -> m8x32 {
        cast!(self.avx2._mm256_cmpgt_epi8(cast!(b), cast!(a)))
    }
3435
3436 #[inline(always)]
3438 pub fn cmp_lt_u16x16(self, a: u16x16, b: u16x16) -> m16x16 {
3439 let k = self.splat_u16x16(0x8000);
3440 self.cmp_lt_i16x16(cast!(self.xor_u16x16(a, k)), cast!(self.xor_u16x16(b, k)))
3441 }
3442
3443 #[inline(always)]
3445 pub fn cmp_lt_u32x8(self, a: u32x8, b: u32x8) -> m32x8 {
3446 let k = self.splat_u32x8(0x80000000);
3447 self.cmp_lt_i32x8(cast!(self.xor_u32x8(a, k)), cast!(self.xor_u32x8(b, k)))
3448 }
3449
3450 #[inline(always)]
3452 pub fn cmp_lt_u64x4(self, a: u64x4, b: u64x4) -> m64x4 {
3453 let k = self.splat_u64x4(0x8000000000000000);
3454 self.cmp_lt_i64x4(cast!(self.xor_u64x4(a, k)), cast!(self.xor_u64x4(b, k)))
3455 }
3456
3457 #[inline(always)]
3459 pub fn cmp_lt_u8x32(self, a: u8x32, b: u8x32) -> m8x32 {
3460 let k = self.splat_u8x32(0x80);
3461 self.cmp_lt_i8x32(cast!(self.xor_u8x32(a, k)), cast!(self.xor_u8x32(b, k)))
3462 }
3463
    /// Lane-wise `!(a == b)` (unordered, quiet: a lane is true if either input is NaN).
    #[inline(always)]
    pub fn cmp_not_eq_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
        cast!(self.avx._mm256_cmp_ps::<_CMP_NEQ_UQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise `!(a == b)` (unordered, quiet: a lane is true if either input is NaN).
    #[inline(always)]
    pub fn cmp_not_eq_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
        cast!(self.avx._mm256_cmp_pd::<_CMP_NEQ_UQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise `!(a >= b)` (unordered, quiet: a lane is true if either input is NaN).
    #[inline(always)]
    pub fn cmp_not_ge_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
        cast!(self.avx._mm256_cmp_ps::<_CMP_NGE_UQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise `!(a >= b)` (unordered, quiet: a lane is true if either input is NaN).
    #[inline(always)]
    pub fn cmp_not_ge_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
        cast!(self.avx._mm256_cmp_pd::<_CMP_NGE_UQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise `!(a > b)` (unordered, quiet: a lane is true if either input is NaN).
    #[inline(always)]
    pub fn cmp_not_gt_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
        cast!(self.avx._mm256_cmp_ps::<_CMP_NGT_UQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise `!(a > b)` (unordered, quiet: a lane is true if either input is NaN).
    #[inline(always)]
    pub fn cmp_not_gt_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
        cast!(self.avx._mm256_cmp_pd::<_CMP_NGT_UQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise `!(a <= b)` (unordered, quiet: a lane is true if either input is NaN).
    #[inline(always)]
    pub fn cmp_not_le_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
        cast!(self.avx._mm256_cmp_ps::<_CMP_NLE_UQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise `!(a <= b)` (unordered, quiet: a lane is true if either input is NaN).
    #[inline(always)]
    pub fn cmp_not_le_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
        cast!(self.avx._mm256_cmp_pd::<_CMP_NLE_UQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise `!(a < b)` (unordered, quiet: a lane is true if either input is NaN).
    #[inline(always)]
    pub fn cmp_not_lt_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
        cast!(self.avx._mm256_cmp_ps::<_CMP_NLT_UQ>(cast!(a), cast!(b)))
    }

    /// Lane-wise `!(a < b)` (unordered, quiet: a lane is true if either input is NaN).
    #[inline(always)]
    pub fn cmp_not_lt_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
        cast!(self.avx._mm256_cmp_pd::<_CMP_NLT_UQ>(cast!(a), cast!(b)))
    }
3523
    /// Widens each `f32` lane to `f64` (exact, no rounding).
    #[inline(always)]
    pub fn convert_f32x4_to_f64x4(self, a: f32x4) -> f64x4 {
        cast!(self.avx._mm256_cvtps_pd(cast!(a)))
    }

    /// Converts each `f32` lane to `i32` with truncation toward zero
    /// (`vcvttps2dq`).
    #[inline(always)]
    pub fn convert_f32x8_to_i32x8(self, a: f32x8) -> i32x8 {
        cast!(self.avx._mm256_cvttps_epi32(cast!(a)))
    }

    /// Narrows each `f64` lane to `f32`, rounding per the current mode.
    #[inline(always)]
    pub fn convert_f64x4_to_f32x4(self, a: f64x4) -> f32x4 {
        cast!(self.avx._mm256_cvtpd_ps(cast!(a)))
    }

    /// Converts each `f64` lane to `i32` with truncation toward zero
    /// (`vcvttpd2dq`).
    #[inline(always)]
    pub fn convert_f64x4_to_i32x4(self, a: f64x4) -> i32x4 {
        cast!(self.avx._mm256_cvttpd_epi32(cast!(a)))
    }
3547
    /// Reinterprets signed 16-bit lanes as unsigned (bitwise no-op).
    #[inline(always)]
    pub fn convert_i16x16_to_u16x16(self, a: i16x16) -> u16x16 {
        cast!(a)
    }

    /// Sign-extends each 16-bit lane to 32 bits.
    #[inline(always)]
    pub fn convert_i16x8_to_i32x8(self, a: i16x8) -> i32x8 {
        cast!(self.avx2._mm256_cvtepi16_epi32(cast!(a)))
    }

    /// Sign-extends the low four 16-bit lanes to 64 bits.
    #[inline(always)]
    pub fn convert_i16x8_to_i64x4(self, a: i16x8) -> i64x4 {
        cast!(self.avx2._mm256_cvtepi16_epi64(cast!(a)))
    }

    /// Sign-extends each 16-bit lane to 32 bits, reinterpreted as unsigned.
    #[inline(always)]
    pub fn convert_i16x8_to_u32x8(self, a: i16x8) -> u32x8 {
        cast!(self.avx2._mm256_cvtepi16_epi32(cast!(a)))
    }

    /// Sign-extends the low four 16-bit lanes to 64 bits, reinterpreted as unsigned.
    #[inline(always)]
    pub fn convert_i16x8_to_u64x4(self, a: i16x8) -> u64x4 {
        cast!(self.avx2._mm256_cvtepi16_epi64(cast!(a)))
    }

    /// Converts each `i32` lane to `f64` (exact, since f64 represents all i32).
    #[inline(always)]
    pub fn convert_i32x4_to_f64x4(self, a: i32x4) -> f64x4 {
        cast!(self.avx._mm256_cvtepi32_pd(cast!(a)))
    }

    /// Sign-extends each 32-bit lane to 64 bits.
    #[inline(always)]
    pub fn convert_i32x4_to_i64x4(self, a: i32x4) -> i64x4 {
        cast!(self.avx2._mm256_cvtepi32_epi64(cast!(a)))
    }

    /// Sign-extends each 32-bit lane to 64 bits, reinterpreted as unsigned.
    #[inline(always)]
    pub fn convert_i32x4_to_u64x4(self, a: i32x4) -> u64x4 {
        cast!(self.avx2._mm256_cvtepi32_epi64(cast!(a)))
    }

    /// Converts each `i32` lane to `f32`, rounding per the current mode.
    #[inline(always)]
    pub fn convert_i32x8_to_f32x8(self, a: i32x8) -> f32x8 {
        cast!(self.avx._mm256_cvtepi32_ps(cast!(a)))
    }

    /// Reinterprets signed 32-bit lanes as unsigned (bitwise no-op).
    #[inline(always)]
    pub fn convert_i32x8_to_u32x8(self, a: i32x8) -> u32x8 {
        cast!(a)
    }

    /// Sign-extends each 8-bit lane to 16 bits.
    #[inline(always)]
    pub fn convert_i8x16_to_i16x16(self, a: i8x16) -> i16x16 {
        cast!(self.avx2._mm256_cvtepi8_epi16(cast!(a)))
    }

    /// Sign-extends the low eight 8-bit lanes to 32 bits.
    #[inline(always)]
    pub fn convert_i8x16_to_i32x8(self, a: i8x16) -> i32x8 {
        cast!(self.avx2._mm256_cvtepi8_epi32(cast!(a)))
    }

    /// Sign-extends the low four 8-bit lanes to 64 bits.
    #[inline(always)]
    pub fn convert_i8x16_to_i64x4(self, a: i8x16) -> i64x4 {
        cast!(self.avx2._mm256_cvtepi8_epi64(cast!(a)))
    }

    /// Sign-extends each 8-bit lane to 16 bits, reinterpreted as unsigned.
    #[inline(always)]
    pub fn convert_i8x16_to_u16x16(self, a: i8x16) -> u16x16 {
        cast!(self.avx2._mm256_cvtepi8_epi16(cast!(a)))
    }

    /// Sign-extends the low eight 8-bit lanes to 32 bits, reinterpreted as unsigned.
    #[inline(always)]
    pub fn convert_i8x16_to_u32x8(self, a: i8x16) -> u32x8 {
        cast!(self.avx2._mm256_cvtepi8_epi32(cast!(a)))
    }

    /// Sign-extends the low four 8-bit lanes to 64 bits, reinterpreted as unsigned.
    #[inline(always)]
    pub fn convert_i8x16_to_u64x4(self, a: i8x16) -> u64x4 {
        cast!(self.avx2._mm256_cvtepi8_epi64(cast!(a)))
    }

    /// Reinterprets signed 8-bit lanes as unsigned (bitwise no-op).
    #[inline(always)]
    pub fn convert_i8x32_to_u8x32(self, a: i8x32) -> u8x32 {
        cast!(a)
    }
3649
    /// Reinterprets unsigned 16-bit lanes as signed (bitwise no-op).
    #[inline(always)]
    pub fn convert_u16x16_to_i16x16(self, a: u16x16) -> i16x16 {
        cast!(a)
    }

    /// Zero-extends each 16-bit lane to 32 bits.
    #[inline(always)]
    pub fn convert_u16x8_to_i32x8(self, a: u16x8) -> i32x8 {
        cast!(self.avx2._mm256_cvtepu16_epi32(cast!(a)))
    }

    /// Zero-extends the low four 16-bit lanes to 64 bits.
    #[inline(always)]
    pub fn convert_u16x8_to_i64x4(self, a: u16x8) -> i64x4 {
        cast!(self.avx2._mm256_cvtepu16_epi64(cast!(a)))
    }

    /// Zero-extends each 16-bit lane to 32 bits.
    #[inline(always)]
    pub fn convert_u16x8_to_u32x8(self, a: u16x8) -> u32x8 {
        cast!(self.avx2._mm256_cvtepu16_epi32(cast!(a)))
    }

    /// Zero-extends the low four 16-bit lanes to 64 bits.
    #[inline(always)]
    pub fn convert_u16x8_to_u64x4(self, a: u16x8) -> u64x4 {
        cast!(self.avx2._mm256_cvtepu16_epi64(cast!(a)))
    }

    /// Zero-extends each 32-bit lane to 64 bits.
    #[inline(always)]
    pub fn convert_u32x4_to_i64x4(self, a: u32x4) -> i64x4 {
        cast!(self.avx2._mm256_cvtepu32_epi64(cast!(a)))
    }

    /// Zero-extends each 32-bit lane to 64 bits.
    #[inline(always)]
    pub fn convert_u32x4_to_u64x4(self, a: u32x4) -> u64x4 {
        cast!(self.avx2._mm256_cvtepu32_epi64(cast!(a)))
    }

    /// Reinterprets unsigned 32-bit lanes as signed (bitwise no-op).
    #[inline(always)]
    pub fn convert_u32x8_to_i32x8(self, a: u32x8) -> i32x8 {
        cast!(a)
    }

    /// Zero-extends each 8-bit lane to 16 bits.
    #[inline(always)]
    pub fn convert_u8x16_to_i16x16(self, a: u8x16) -> i16x16 {
        cast!(self.avx2._mm256_cvtepu8_epi16(cast!(a)))
    }

    /// Zero-extends the low eight 8-bit lanes to 32 bits.
    #[inline(always)]
    pub fn convert_u8x16_to_i32x8(self, a: u8x16) -> i32x8 {
        cast!(self.avx2._mm256_cvtepu8_epi32(cast!(a)))
    }

    /// Zero-extends the low four 8-bit lanes to 64 bits.
    #[inline(always)]
    pub fn convert_u8x16_to_i64x4(self, a: u8x16) -> i64x4 {
        cast!(self.avx2._mm256_cvtepu8_epi64(cast!(a)))
    }

    /// Zero-extends each 8-bit lane to 16 bits.
    #[inline(always)]
    pub fn convert_u8x16_to_u16x16(self, a: u8x16) -> u16x16 {
        cast!(self.avx2._mm256_cvtepu8_epi16(cast!(a)))
    }

    /// Zero-extends the low eight 8-bit lanes to 32 bits.
    #[inline(always)]
    pub fn convert_u8x16_to_u32x8(self, a: u8x16) -> u32x8 {
        cast!(self.avx2._mm256_cvtepu8_epi32(cast!(a)))
    }

    /// Zero-extends the low four 8-bit lanes to 64 bits.
    #[inline(always)]
    pub fn convert_u8x16_to_u64x4(self, a: u8x16) -> u64x4 {
        cast!(self.avx2._mm256_cvtepu8_epi64(cast!(a)))
    }

    /// Reinterprets unsigned 8-bit lanes as signed (bitwise no-op).
    #[inline(always)]
    pub fn convert_u8x32_to_i8x32(self, a: u8x32) -> i8x32 {
        cast!(a)
    }
3739
    /// Lane-wise IEEE division `a / b` for `f32` lanes.
    #[inline(always)]
    pub fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
        cast!(self.avx._mm256_div_ps(cast!(a), cast!(b)))
    }

    /// Lane-wise IEEE division `a / b` for `f64` lanes.
    #[inline(always)]
    pub fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
        cast!(self.avx._mm256_div_pd(cast!(a), cast!(b)))
    }
3751
    /// Rounds each `f32` lane down toward negative infinity.
    #[inline(always)]
    pub fn floor_f32x8(self, a: f32x8) -> f32x8 {
        cast!(self.avx._mm256_floor_ps(cast!(a)))
    }

    /// Rounds each `f64` lane down toward negative infinity.
    #[inline(always)]
    pub fn floor_f64x4(self, a: f64x4) -> f64x4 {
        cast!(self.avx._mm256_floor_pd(cast!(a)))
    }
3763
    /// Adds adjacent lane pairs within `a` and within `b` and packs the sums
    /// into one vector (`vhaddps`; pairing is per 128-bit half).
    #[inline(always)]
    pub fn horizontal_add_pack_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
        cast!(self.avx._mm256_hadd_ps(cast!(a), cast!(b)))
    }

    /// Adds adjacent lane pairs within `a` and within `b` and packs the sums
    /// into one vector (`vhaddpd`).
    #[inline(always)]
    pub fn horizontal_add_pack_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
        cast!(self.avx._mm256_hadd_pd(cast!(a), cast!(b)))
    }

    /// Adds adjacent 16-bit lane pairs within `a` and within `b` and packs
    /// the (wrapping) sums into one vector.
    #[inline(always)]
    pub fn horizontal_add_pack_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_hadd_epi16(cast!(a), cast!(b)))
    }

    /// Adds adjacent 32-bit lane pairs within `a` and within `b` and packs
    /// the (wrapping) sums into one vector.
    #[inline(always)]
    pub fn horizontal_add_pack_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_hadd_epi32(cast!(a), cast!(b)))
    }

    /// Adds adjacent 16-bit lane pairs with signed saturation and packs the
    /// sums into one vector.
    #[inline(always)]
    pub fn horizontal_saturating_add_pack_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_hadds_epi16(cast!(a), cast!(b)))
    }

    /// Subtracts adjacent 16-bit lane pairs with signed saturation and packs
    /// the differences into one vector.
    #[inline(always)]
    pub fn horizontal_saturating_sub_pack_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_hsubs_epi16(cast!(a), cast!(b)))
    }

    /// Subtracts adjacent lane pairs within `a` and within `b` and packs the
    /// differences into one vector (`vhsubps`).
    #[inline(always)]
    pub fn horizontal_sub_pack_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
        cast!(self.avx._mm256_hsub_ps(cast!(a), cast!(b)))
    }

    /// Subtracts adjacent lane pairs within `a` and within `b` and packs the
    /// differences into one vector (`vhsubpd`).
    #[inline(always)]
    pub fn horizontal_sub_pack_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
        cast!(self.avx._mm256_hsub_pd(cast!(a), cast!(b)))
    }

    /// Subtracts adjacent 16-bit lane pairs (wrapping) and packs the
    /// differences into one vector.
    #[inline(always)]
    pub fn horizontal_sub_pack_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_hsub_epi16(cast!(a), cast!(b)))
    }

    /// Subtracts adjacent 32-bit lane pairs (wrapping) and packs the
    /// differences into one vector.
    #[inline(always)]
    pub fn horizontal_sub_pack_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_hsub_epi32(cast!(a), cast!(b)))
    }
3843
    /// Lane-wise NaN test: true where the lane is unordered with itself,
    /// which holds exactly when it is NaN.
    #[inline(always)]
    pub fn is_nan_f32x8(self, a: f32x8) -> m32x8 {
        cast!(self.avx._mm256_cmp_ps::<_CMP_UNORD_Q>(cast!(a), cast!(a)))
    }

    /// Lane-wise NaN test: true where the lane is NaN.
    #[inline(always)]
    pub fn is_nan_f64x4(self, a: f64x4) -> m64x4 {
        cast!(self.avx._mm256_cmp_pd::<_CMP_UNORD_Q>(cast!(a), cast!(a)))
    }

    /// Lane-wise non-NaN test: true where the lane is ordered with itself,
    /// i.e. not NaN.
    #[inline(always)]
    pub fn is_not_nan_f32x8(self, a: f32x8) -> m32x8 {
        cast!(self.avx._mm256_cmp_ps::<_CMP_ORD_Q>(cast!(a), cast!(a)))
    }

    /// Lane-wise non-NaN test: true where the lane is not NaN.
    #[inline(always)]
    pub fn is_not_nan_f64x4(self, a: f64x4) -> m64x4 {
        cast!(self.avx._mm256_cmp_pd::<_CMP_ORD_Q>(cast!(a), cast!(a)))
    }
3867
    /// Lane-wise maximum of `f32` lanes (`vmaxps`; NaN handling follows the
    /// intrinsic: if either operand is NaN, the second operand is returned).
    #[inline(always)]
    pub fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
        cast!(self.avx._mm256_max_ps(cast!(a), cast!(b)))
    }

    /// Lane-wise maximum of `f64` lanes (`vmaxpd`; same NaN caveat as `max_f32x8`).
    #[inline(always)]
    pub fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
        cast!(self.avx._mm256_max_pd(cast!(a), cast!(b)))
    }

    /// Lane-wise signed maximum of 16-bit lanes.
    #[inline(always)]
    pub fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_max_epi16(cast!(a), cast!(b)))
    }

    /// Lane-wise signed maximum of 32-bit lanes.
    #[inline(always)]
    pub fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_max_epi32(cast!(a), cast!(b)))
    }

    /// Lane-wise signed maximum of 8-bit lanes.
    #[inline(always)]
    pub fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
        cast!(self.avx2._mm256_max_epi8(cast!(a), cast!(b)))
    }

    /// Lane-wise unsigned maximum of 16-bit lanes.
    #[inline(always)]
    pub fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
        cast!(self.avx2._mm256_max_epu16(cast!(a), cast!(b)))
    }

    /// Lane-wise unsigned maximum of 32-bit lanes.
    #[inline(always)]
    pub fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
        cast!(self.avx2._mm256_max_epu32(cast!(a), cast!(b)))
    }

    /// Lane-wise unsigned maximum of 8-bit lanes.
    #[inline(always)]
    pub fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
        cast!(self.avx2._mm256_max_epu8(cast!(a), cast!(b)))
    }
3915
    /// Lane-wise minimum of `f32` lanes (`vminps`; NaN handling follows the
    /// intrinsic: if either operand is NaN, the second operand is returned).
    #[inline(always)]
    pub fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
        cast!(self.avx._mm256_min_ps(cast!(a), cast!(b)))
    }

    /// Lane-wise minimum of `f64` lanes (`vminpd`; same NaN caveat as `min_f32x8`).
    #[inline(always)]
    pub fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
        cast!(self.avx._mm256_min_pd(cast!(a), cast!(b)))
    }

    /// Lane-wise signed minimum of 16-bit lanes.
    #[inline(always)]
    pub fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_min_epi16(cast!(a), cast!(b)))
    }

    /// Lane-wise signed minimum of 32-bit lanes.
    #[inline(always)]
    pub fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_min_epi32(cast!(a), cast!(b)))
    }

    /// Lane-wise signed minimum of 8-bit lanes.
    #[inline(always)]
    pub fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
        cast!(self.avx2._mm256_min_epi8(cast!(a), cast!(b)))
    }

    /// Lane-wise unsigned minimum of 16-bit lanes.
    #[inline(always)]
    pub fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
        cast!(self.avx2._mm256_min_epu16(cast!(a), cast!(b)))
    }

    /// Lane-wise unsigned minimum of 32-bit lanes.
    #[inline(always)]
    pub fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
        cast!(self.avx2._mm256_min_epu32(cast!(a), cast!(b)))
    }

    /// Lane-wise unsigned minimum of 8-bit lanes.
    #[inline(always)]
    pub fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
        cast!(self.avx2._mm256_min_epu8(cast!(a), cast!(b)))
    }
3963
    /// Fused multiply-add `a * b + c` per `f32` lane (single rounding, FMA3).
    #[inline(always)]
    pub fn mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 {
        cast!(self.fma._mm_fmadd_ps(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused multiply-add `a * b + c` per `f32` lane (single rounding, FMA3).
    #[inline(always)]
    pub fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 {
        cast!(self.fma._mm256_fmadd_ps(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused multiply-add `a * b + c` per `f64` lane (single rounding, FMA3).
    #[inline(always)]
    pub fn mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 {
        cast!(self.fma._mm_fmadd_pd(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused multiply-add `a * b + c` per `f64` lane (single rounding, FMA3).
    #[inline(always)]
    pub fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 {
        cast!(self.fma._mm256_fmadd_pd(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused `a * b` with `c` alternately added/subtracted per lane
    /// (`vfmsubadd*`: even-index lanes get `a*b - c`, odd get `a*b + c`).
    #[inline(always)]
    pub fn mul_addsub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 {
        cast!(self.fma._mm_fmsubadd_ps(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused `a * b` with `c` alternately added/subtracted per lane
    /// (`vfmsubadd*`: even-index lanes get `a*b - c`, odd get `a*b + c`).
    #[inline(always)]
    pub fn mul_addsub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 {
        cast!(self.fma._mm256_fmsubadd_ps(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused `a * b` with `c` alternately added/subtracted per lane.
    #[inline(always)]
    pub fn mul_addsub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 {
        cast!(self.fma._mm_fmsubadd_pd(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused `a * b` with `c` alternately added/subtracted per lane.
    #[inline(always)]
    pub fn mul_addsub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 {
        cast!(self.fma._mm256_fmsubadd_pd(cast!(a), cast!(b), cast!(c)))
    }
4019
    /// Lane-wise IEEE multiplication `a * b` for `f32` lanes.
    #[inline(always)]
    pub fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
        cast!(self.avx._mm256_mul_ps(cast!(a), cast!(b)))
    }

    /// Lane-wise IEEE multiplication `a * b` for `f64` lanes.
    #[inline(always)]
    pub fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
        cast!(self.avx._mm256_mul_pd(cast!(a), cast!(b)))
    }
4031
    /// Fused multiply-subtract `a * b - c` per `f32` lane (single rounding).
    #[inline(always)]
    pub fn mul_sub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 {
        cast!(self.fma._mm_fmsub_ps(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused multiply-subtract `a * b - c` per `f32` lane (single rounding).
    #[inline(always)]
    pub fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 {
        cast!(self.fma._mm256_fmsub_ps(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused multiply-subtract `a * b - c` per `f64` lane (single rounding).
    #[inline(always)]
    pub fn mul_sub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 {
        cast!(self.fma._mm_fmsub_pd(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused multiply-subtract `a * b - c` per `f64` lane (single rounding).
    #[inline(always)]
    pub fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 {
        cast!(self.fma._mm256_fmsub_pd(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused `a * b` with `c` alternately subtracted/added per lane
    /// (`vfmaddsub*`: even-index lanes get `a*b + c`, odd get `a*b - c`).
    #[inline(always)]
    pub fn mul_subadd_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 {
        cast!(self.fma._mm_fmaddsub_ps(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused `a * b` with `c` alternately subtracted/added per lane
    /// (`vfmaddsub*`: even-index lanes get `a*b + c`, odd get `a*b - c`).
    #[inline(always)]
    pub fn mul_subadd_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 {
        cast!(self.fma._mm256_fmaddsub_ps(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused `a * b` with `c` alternately subtracted/added per lane.
    #[inline(always)]
    pub fn mul_subadd_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 {
        cast!(self.fma._mm_fmaddsub_pd(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused `a * b` with `c` alternately subtracted/added per lane.
    #[inline(always)]
    pub fn mul_subadd_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 {
        cast!(self.fma._mm256_fmaddsub_pd(cast!(a), cast!(b), cast!(c)))
    }
4087
    /// Multiplies adjacent 8-bit lane pairs and adds them with signed
    /// saturation into 16-bit lanes (`vpmaddubsw`).
    ///
    /// NOTE(review): the underlying intrinsic treats its first operand (`a`
    /// here) as *unsigned* bytes and the second (`b`) as signed bytes; the
    /// `i8x32` type of `a` is a bit reinterpretation — confirm callers expect
    /// unsigned semantics for `a`.
    #[inline(always)]
    pub fn multiply_saturating_add_adjacent_i8x32(self, a: i8x32, b: i8x32) -> i16x16 {
        cast!(self.avx2._mm256_maddubs_epi16(cast!(a), cast!(b)))
    }

    /// Multiplies adjacent 16-bit lane pairs and adds the 32-bit products
    /// (wrapping) into 32-bit lanes (`vpmaddwd`).
    #[inline(always)]
    pub fn multiply_wrapping_add_adjacent_i16x16(self, a: i16x16, b: i16x16) -> i32x8 {
        cast!(self.avx2._mm256_madd_epi16(cast!(a), cast!(b)))
    }

    /// Sums of absolute differences over 4-byte groups (`vmpsadbw`); the
    /// `OFFSETS` immediate selects the block offsets within each 128-bit half.
    #[inline(always)]
    pub fn multisum_of_absolute_differences_u8x32<const OFFSETS: i32>(
        self,
        a: u8x32,
        b: u8x32,
    ) -> u16x16 {
        cast!(self.avx2._mm256_mpsadbw_epu8::<OFFSETS>(cast!(a), cast!(b)))
    }
4115
    /// Fused negated multiply-add `-(a * b) + c` per `f32` lane.
    #[inline(always)]
    pub fn negate_mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 {
        cast!(self.fma._mm_fnmadd_ps(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused negated multiply-add `-(a * b) + c` per `f32` lane.
    #[inline(always)]
    pub fn negate_mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 {
        cast!(self.fma._mm256_fnmadd_ps(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused negated multiply-add `-(a * b) + c` per `f64` lane.
    #[inline(always)]
    pub fn negate_mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 {
        cast!(self.fma._mm_fnmadd_pd(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused negated multiply-add `-(a * b) + c` per `f64` lane.
    #[inline(always)]
    pub fn negate_mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 {
        cast!(self.fma._mm256_fnmadd_pd(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused negated multiply-subtract `-(a * b) - c` per `f32` lane.
    #[inline(always)]
    pub fn negate_mul_sub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 {
        cast!(self.fma._mm_fnmsub_ps(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused negated multiply-subtract `-(a * b) - c` per `f32` lane.
    #[inline(always)]
    pub fn negate_mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 {
        cast!(self.fma._mm256_fnmsub_ps(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused negated multiply-subtract `-(a * b) - c` per `f64` lane.
    #[inline(always)]
    pub fn negate_mul_sub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 {
        cast!(self.fma._mm_fnmsub_pd(cast!(a), cast!(b), cast!(c)))
    }

    /// Fused negated multiply-subtract `-(a * b) - c` per `f64` lane.
    #[inline(always)]
    pub fn negate_mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 {
        cast!(self.fma._mm256_fnmsub_pd(cast!(a), cast!(b), cast!(c)))
    }
4171
4172 #[inline(always)]
4174 pub fn not_i16x16(self, a: i16x16) -> i16x16 {
4175 self.xor_i16x16(a, self.splat_i16x16(!0))
4176 }
4177
4178 #[inline(always)]
4180 pub fn not_i32x8(self, a: i32x8) -> i32x8 {
4181 self.xor_i32x8(a, self.splat_i32x8(!0))
4182 }
4183
4184 #[inline(always)]
4186 pub fn not_i64x4(self, a: i64x4) -> i64x4 {
4187 self.xor_i64x4(a, self.splat_i64x4(!0))
4188 }
4189
4190 #[inline(always)]
4192 pub fn not_i8x32(self, a: i8x32) -> i8x32 {
4193 self.xor_i8x32(a, self.splat_i8x32(!0))
4194 }
4195
4196 #[inline(always)]
4198 pub fn not_m16x16(self, a: m16x16) -> m16x16 {
4199 self.xor_m16x16(a, self.splat_m16x16(m16::new(true)))
4200 }
4201
4202 #[inline(always)]
4204 pub fn not_m32x8(self, a: m32x8) -> m32x8 {
4205 self.xor_m32x8(a, self.splat_m32x8(m32::new(true)))
4206 }
4207
4208 #[inline(always)]
4210 pub fn not_m64x4(self, a: m64x4) -> m64x4 {
4211 self.xor_m64x4(a, self.splat_m64x4(m64::new(true)))
4212 }
4213
4214 #[inline(always)]
4216 pub fn not_m8x32(self, a: m8x32) -> m8x32 {
4217 self.xor_m8x32(a, self.splat_m8x32(m8::new(true)))
4218 }
4219
4220 #[inline(always)]
4222 pub fn not_u16x16(self, a: u16x16) -> u16x16 {
4223 self.xor_u16x16(a, self.splat_u16x16(!0))
4224 }
4225
4226 #[inline(always)]
4228 pub fn not_u32x8(self, a: u32x8) -> u32x8 {
4229 self.xor_u32x8(a, self.splat_u32x8(!0))
4230 }
4231
4232 #[inline(always)]
4234 pub fn not_u64x4(self, a: u64x4) -> u64x4 {
4235 self.xor_u64x4(a, self.splat_u64x4(!0))
4236 }
4237
4238 #[inline(always)]
4240 pub fn not_u8x32(self, a: u8x32) -> u8x32 {
4241 self.xor_u8x32(a, self.splat_u8x32(!0))
4242 }
4243
    /// Bitwise OR of the raw bits of two `f32x8` vectors.
    #[inline(always)]
    pub fn or_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
        cast!(self.avx._mm256_or_ps(cast!(a), cast!(b)))
    }

    /// Bitwise OR of the raw bits of two `f64x4` vectors.
    #[inline(always)]
    pub fn or_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
        cast!(self.avx._mm256_or_pd(cast!(a), cast!(b)))
    }

    /// Bitwise OR (the 256-bit integer OR is lane-width agnostic).
    #[inline(always)]
    pub fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
    }

    /// Bitwise OR.
    #[inline(always)]
    pub fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
    }

    /// Bitwise OR.
    #[inline(always)]
    pub fn or_i64x4(self, a: i64x4, b: i64x4) -> i64x4 {
        cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
    }

    /// Bitwise OR.
    #[inline(always)]
    pub fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
        cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
    }

    /// Lane-wise logical OR of mask vectors.
    #[inline(always)]
    pub fn or_m16x16(self, a: m16x16, b: m16x16) -> m16x16 {
        cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
    }

    /// Lane-wise logical OR of mask vectors.
    #[inline(always)]
    pub fn or_m32x8(self, a: m32x8, b: m32x8) -> m32x8 {
        cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
    }

    /// Lane-wise logical OR of mask vectors.
    #[inline(always)]
    pub fn or_m64x4(self, a: m64x4, b: m64x4) -> m64x4 {
        cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
    }

    /// Lane-wise logical OR of mask vectors.
    #[inline(always)]
    pub fn or_m8x32(self, a: m8x32, b: m8x32) -> m8x32 {
        cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
    }

    /// Bitwise OR.
    #[inline(always)]
    pub fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
        cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
    }

    /// Bitwise OR.
    #[inline(always)]
    pub fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
        cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
    }

    /// Bitwise OR.
    #[inline(always)]
    pub fn or_u64x4(self, a: u64x4, b: u64x4) -> u64x4 {
        cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
    }

    /// Bitwise OR.
    #[inline(always)]
    pub fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
        cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
    }
4327
    /// Narrows 16-bit lanes of `a` and `b` to 8-bit lanes with signed
    /// saturation and interleaves the results per 128-bit half (`vpacksswb`).
    #[inline(always)]
    pub fn pack_with_signed_saturation_i16x16(self, a: i16x16, b: i16x16) -> i8x32 {
        cast!(self.avx2._mm256_packs_epi16(cast!(a), cast!(b)))
    }

    /// Narrows 32-bit lanes to 16-bit lanes with signed saturation
    /// (`vpackssdw`).
    #[inline(always)]
    pub fn pack_with_signed_saturation_i32x8(self, a: i32x8, b: i32x8) -> i16x16 {
        cast!(self.avx2._mm256_packs_epi32(cast!(a), cast!(b)))
    }

    /// Narrows 16-bit lanes to 8-bit lanes with unsigned saturation
    /// (`vpackuswb`).
    #[inline(always)]
    pub fn pack_with_unsigned_saturation_i16x16(self, a: i16x16, b: i16x16) -> u8x32 {
        cast!(self.avx2._mm256_packus_epi16(cast!(a), cast!(b)))
    }

    /// Narrows 32-bit lanes to 16-bit lanes with unsigned saturation
    /// (`vpackusdw`).
    #[inline(always)]
    pub fn pack_with_unsigned_saturation_i32x8(self, a: i32x8, b: i32x8) -> u16x16 {
        cast!(self.avx2._mm256_packus_epi32(cast!(a), cast!(b)))
    }
4359
4360 #[inline(always)]
4363 pub fn round_f32x8(self, a: f32x8) -> f32x8 {
4364 const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
4365 cast!(self.avx._mm256_round_ps::<ROUNDING>(cast!(a)))
4366 }
4367
4368 #[inline(always)]
4371 pub fn round_f64x4(self, a: f64x4) -> f64x4 {
4372 const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
4373 cast!(self.avx._mm256_round_pd::<ROUNDING>(cast!(a)))
4374 }
4375
    /// Lane-wise saturating signed addition of 16-bit lanes.
    #[inline(always)]
    pub fn saturating_add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_adds_epi16(cast!(a), cast!(b)))
    }

    /// Lane-wise saturating signed addition of 8-bit lanes.
    #[inline(always)]
    pub fn saturating_add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
        cast!(self.avx2._mm256_adds_epi8(cast!(a), cast!(b)))
    }

    /// Lane-wise saturating unsigned addition of 16-bit lanes.
    #[inline(always)]
    pub fn saturating_add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
        cast!(self.avx2._mm256_adds_epu16(cast!(a), cast!(b)))
    }

    /// Lane-wise saturating unsigned addition of 8-bit lanes.
    #[inline(always)]
    pub fn saturating_add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
        cast!(self.avx2._mm256_adds_epu8(cast!(a), cast!(b)))
    }

    /// Lane-wise saturating signed subtraction of 16-bit lanes.
    #[inline(always)]
    pub fn saturating_sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_subs_epi16(cast!(a), cast!(b)))
    }

    /// Lane-wise saturating signed subtraction of 8-bit lanes.
    #[inline(always)]
    pub fn saturating_sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
        cast!(self.avx2._mm256_subs_epi8(cast!(a), cast!(b)))
    }

    /// Lane-wise saturating unsigned subtraction of 16-bit lanes (floors at 0).
    #[inline(always)]
    pub fn saturating_sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
        cast!(self.avx2._mm256_subs_epu16(cast!(a), cast!(b)))
    }

    /// Lane-wise saturating unsigned subtraction of 8-bit lanes (floors at 0).
    #[inline(always)]
    pub fn saturating_sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
        cast!(self.avx2._mm256_subs_epu8(cast!(a), cast!(b)))
    }
4423
    /// Compile-time lane select: bit `i` of `MASK8` picks `if_true` lane `i`,
    /// otherwise `if_false` lane `i` (`if_false` is the blend's first operand,
    /// so a set immediate bit selects `if_true`).
    #[inline(always)]
    pub fn select_const_f32x8<const MASK8: i32>(self, if_true: f32x8, if_false: f32x8) -> f32x8 {
        cast!(
            self.avx
                ._mm256_blend_ps::<MASK8>(cast!(if_false), cast!(if_true)),
        )
    }

    /// Compile-time lane select for `f64x4`; delegates to the `u64x4` version
    /// since the blend operates on raw bits.
    #[inline(always)]
    pub fn select_const_f64x4<const MASK4: i32>(self, if_true: f64x4, if_false: f64x4) -> f64x4 {
        cast!(self.select_const_u64x4::<MASK4>(cast!(if_true), cast!(if_false)))
    }

    /// Compile-time lane select for `i32x8`; delegates to the `u32x8` version.
    #[inline(always)]
    pub fn select_const_i32x8<const MASK8: i32>(self, if_true: i32x8, if_false: i32x8) -> i32x8 {
        cast!(self.select_const_u32x8::<MASK8>(cast!(if_true), cast!(if_false)))
    }

    /// Compile-time lane select for `i64x4`; delegates to the `u64x4` version.
    #[inline(always)]
    pub fn select_const_i64x4<const MASK4: i32>(self, if_true: i64x4, if_false: i64x4) -> i64x4 {
        cast!(self.select_const_u64x4::<MASK4>(cast!(if_true), cast!(if_false)))
    }

    /// Compile-time lane select for `u32x8` via `vpblendd`.
    #[inline(always)]
    pub fn select_const_u32x8<const MASK8: i32>(self, if_true: u32x8, if_false: u32x8) -> u32x8 {
        cast!(
            self.avx2
                ._mm256_blend_epi32::<MASK8>(cast!(if_false), cast!(if_true)),
        )
    }

    /// Compile-time lane select for `u64x4` via `vblendpd` (bitwise select,
    /// so reusing the floating-point blend on integer bits is sound).
    #[inline(always)]
    pub fn select_const_u64x4<const MASK4: i32>(self, if_true: u64x4, if_false: u64x4) -> u64x4 {
        cast!(
            self.avx
                ._mm256_blend_pd::<MASK4>(cast!(if_false), cast!(if_true)),
        )
    }
4474
    /// Combines `if_true` and `if_false`, selecting each lane from `if_true`
    /// where the corresponding lane of `mask` is set, and from `if_false`
    /// otherwise.
    #[inline(always)]
    pub fn select_f32x8(self, mask: m32x8, if_true: f32x8, if_false: f32x8) -> f32x8 {
        cast!(
            self.avx
                ._mm256_blendv_ps(cast!(if_false), cast!(if_true), cast!(mask)),
        )
    }

    /// Combines `if_true` and `if_false`, selecting each lane from `if_true`
    /// where the corresponding lane of `mask` is set, and from `if_false`
    /// otherwise.
    #[inline(always)]
    pub fn select_f64x4(self, mask: m64x4, if_true: f64x4, if_false: f64x4) -> f64x4 {
        cast!(
            self.avx
                ._mm256_blendv_pd(cast!(if_false), cast!(if_true), cast!(mask)),
        )
    }

    /// Combines `if_true` and `if_false`, selecting each lane from `if_true`
    /// where the corresponding lane of `mask` is set, and from `if_false`
    /// otherwise.
    #[inline(always)]
    pub fn select_i16x16(self, mask: m16x16, if_true: i16x16, if_false: i16x16) -> i16x16 {
        cast!(self.select_u16x16(mask, cast!(if_true), cast!(if_false)))
    }

    /// Combines `if_true` and `if_false`, selecting each lane from `if_true`
    /// where the corresponding lane of `mask` is set, and from `if_false`
    /// otherwise.
    #[inline(always)]
    pub fn select_i32x8(self, mask: m32x8, if_true: i32x8, if_false: i32x8) -> i32x8 {
        cast!(self.select_u32x8(mask, cast!(if_true), cast!(if_false)))
    }

    /// Combines `if_true` and `if_false`, selecting each lane from `if_true`
    /// where the corresponding lane of `mask` is set, and from `if_false`
    /// otherwise.
    #[inline(always)]
    pub fn select_i64x4(self, mask: m64x4, if_true: i64x4, if_false: i64x4) -> i64x4 {
        cast!(self.select_u64x4(mask, cast!(if_true), cast!(if_false)))
    }

    /// Combines `if_true` and `if_false`, selecting each lane from `if_true`
    /// where the corresponding lane of `mask` is set, and from `if_false`
    /// otherwise.
    #[inline(always)]
    pub fn select_i8x32(self, mask: m8x32, if_true: i8x32, if_false: i8x32) -> i8x32 {
        cast!(self.select_u8x32(mask, cast!(if_true), cast!(if_false)))
    }

    /// Combines `if_true` and `if_false`, selecting each lane from `if_true`
    /// where the corresponding lane of `mask` is set, and from `if_false`
    /// otherwise.
    ///
    /// The blend is byte-granular; since every lane of the `m` mask types is
    /// all-ones or all-zeros, whole 16-bit lanes are selected.
    #[inline(always)]
    pub fn select_u16x16(self, mask: m16x16, if_true: u16x16, if_false: u16x16) -> u16x16 {
        cast!(
            self.avx2
                ._mm256_blendv_epi8(cast!(if_false), cast!(if_true), cast!(mask)),
        )
    }

    /// Combines `if_true` and `if_false`, selecting each lane from `if_true`
    /// where the corresponding lane of `mask` is set, and from `if_false`
    /// otherwise.
    ///
    /// Byte-granular blend; mask lanes are all-ones/all-zeros, so whole 32-bit
    /// lanes are selected.
    #[inline(always)]
    pub fn select_u32x8(self, mask: m32x8, if_true: u32x8, if_false: u32x8) -> u32x8 {
        cast!(
            self.avx2
                ._mm256_blendv_epi8(cast!(if_false), cast!(if_true), cast!(mask)),
        )
    }

    /// Combines `if_true` and `if_false`, selecting each lane from `if_true`
    /// where the corresponding lane of `mask` is set, and from `if_false`
    /// otherwise.
    ///
    /// Byte-granular blend; mask lanes are all-ones/all-zeros, so whole 64-bit
    /// lanes are selected.
    #[inline(always)]
    pub fn select_u64x4(self, mask: m64x4, if_true: u64x4, if_false: u64x4) -> u64x4 {
        cast!(
            self.avx2
                ._mm256_blendv_epi8(cast!(if_false), cast!(if_true), cast!(mask)),
        )
    }

    /// Combines `if_true` and `if_false`, selecting each lane from `if_true`
    /// where the corresponding lane of `mask` is set, and from `if_false`
    /// otherwise.
    #[inline(always)]
    pub fn select_u8x32(self, mask: m8x32, if_true: u8x32, if_false: u8x32) -> u8x32 {
        cast!(
            self.avx2
                ._mm256_blendv_epi8(cast!(if_false), cast!(if_true), cast!(mask)),
        )
    }
4562
    /// Shifts each lane of `a` left by `AMOUNT` bits, filling with zeros.
    #[inline(always)]
    pub fn shl_const_i16x16<const AMOUNT: i32>(self, a: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_slli_epi16::<AMOUNT>(cast!(a)))
    }

    /// Shifts each lane of `a` left by `AMOUNT` bits, filling with zeros.
    #[inline(always)]
    pub fn shl_const_i32x8<const AMOUNT: i32>(self, a: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_slli_epi32::<AMOUNT>(cast!(a)))
    }

    /// Shifts each lane of `a` left by `AMOUNT` bits, filling with zeros.
    #[inline(always)]
    pub fn shl_const_i64x4<const AMOUNT: i32>(self, a: i64x4) -> i64x4 {
        cast!(self.avx2._mm256_slli_epi64::<AMOUNT>(cast!(a)))
    }

    /// Shifts each lane of `a` left by `AMOUNT` bits, filling with zeros.
    #[inline(always)]
    pub fn shl_const_u16x16<const AMOUNT: i32>(self, a: u16x16) -> u16x16 {
        cast!(self.avx2._mm256_slli_epi16::<AMOUNT>(cast!(a)))
    }

    /// Shifts each lane of `a` left by `AMOUNT` bits, filling with zeros.
    #[inline(always)]
    pub fn shl_const_u32x8<const AMOUNT: i32>(self, a: u32x8) -> u32x8 {
        cast!(self.avx2._mm256_slli_epi32::<AMOUNT>(cast!(a)))
    }

    /// Shifts each lane of `a` left by `AMOUNT` bits, filling with zeros.
    #[inline(always)]
    pub fn shl_const_u64x4<const AMOUNT: i32>(self, a: u64x4) -> u64x4 {
        cast!(self.avx2._mm256_slli_epi64::<AMOUNT>(cast!(a)))
    }
4604
    /// Shifts each lane of `a` left by the amount in the corresponding lane of
    /// `amount`, filling with zeros.
    #[inline(always)]
    pub fn shl_dyn_i32x4(self, a: i32x4, amount: u32x4) -> i32x4 {
        cast!(self.avx2._mm_sllv_epi32(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` left by the amount in the corresponding lane of
    /// `amount`, filling with zeros.
    #[inline(always)]
    pub fn shl_dyn_i32x8(self, a: i32x8, amount: u32x8) -> i32x8 {
        cast!(self.avx2._mm256_sllv_epi32(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` left by the amount in the corresponding lane of
    /// `amount`, filling with zeros.
    #[inline(always)]
    pub fn shl_dyn_i64x2(self, a: i64x2, amount: u64x2) -> i64x2 {
        cast!(self.avx2._mm_sllv_epi64(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` left by the amount in the corresponding lane of
    /// `amount`, filling with zeros.
    #[inline(always)]
    pub fn shl_dyn_i64x4(self, a: i64x4, amount: u64x4) -> i64x4 {
        cast!(self.avx2._mm256_sllv_epi64(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` left by the amount in the corresponding lane of
    /// `amount`, filling with zeros.
    #[inline(always)]
    pub fn shl_dyn_u32x4(self, a: u32x4, amount: u32x4) -> u32x4 {
        cast!(self.avx2._mm_sllv_epi32(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` left by the amount in the corresponding lane of
    /// `amount`, filling with zeros.
    #[inline(always)]
    pub fn shl_dyn_u32x8(self, a: u32x8, amount: u32x8) -> u32x8 {
        cast!(self.avx2._mm256_sllv_epi32(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` left by the amount in the corresponding lane of
    /// `amount`, filling with zeros.
    #[inline(always)]
    pub fn shl_dyn_u64x2(self, a: u64x2, amount: u64x2) -> u64x2 {
        cast!(self.avx2._mm_sllv_epi64(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` left by the amount in the corresponding lane of
    /// `amount`, filling with zeros.
    #[inline(always)]
    pub fn shl_dyn_u64x4(self, a: u64x4, amount: u64x4) -> u64x4 {
        cast!(self.avx2._mm256_sllv_epi64(cast!(a), cast!(amount)))
    }
4668
    /// Shifts each lane of `a` left by the amount in the first element of
    /// `amount`, filling with zeros.
    #[inline(always)]
    pub fn shl_i16x16(self, a: i16x16, amount: u64x2) -> i16x16 {
        cast!(self.avx2._mm256_sll_epi16(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` left by the amount in the first element of
    /// `amount`, filling with zeros.
    #[inline(always)]
    pub fn shl_i32x8(self, a: i32x8, amount: u64x2) -> i32x8 {
        cast!(self.avx2._mm256_sll_epi32(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` left by the amount in the first element of
    /// `amount`, filling with zeros.
    #[inline(always)]
    pub fn shl_i64x4(self, a: i64x4, amount: u64x2) -> i64x4 {
        cast!(self.avx2._mm256_sll_epi64(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` left by the amount in the first element of
    /// `amount`, filling with zeros.
    #[inline(always)]
    pub fn shl_u16x16(self, a: u16x16, amount: u64x2) -> u16x16 {
        cast!(self.avx2._mm256_sll_epi16(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` left by the amount in the first element of
    /// `amount`, filling with zeros.
    #[inline(always)]
    pub fn shl_u32x8(self, a: u32x8, amount: u64x2) -> u32x8 {
        cast!(self.avx2._mm256_sll_epi32(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` left by the amount in the first element of
    /// `amount`, filling with zeros.
    #[inline(always)]
    pub fn shl_u64x4(self, a: u64x4, amount: u64x2) -> u64x4 {
        cast!(self.avx2._mm256_sll_epi64(cast!(a), cast!(amount)))
    }
4716
    /// Shifts each lane of `a` right by `AMOUNT` bits, replicating the sign
    /// bit (arithmetic shift).
    #[inline(always)]
    pub fn shr_const_i16x16<const AMOUNT: i32>(self, a: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_srai_epi16::<AMOUNT>(cast!(a)))
    }

    /// Shifts each lane of `a` right by `AMOUNT` bits, replicating the sign
    /// bit (arithmetic shift).
    #[inline(always)]
    pub fn shr_const_i32x8<const AMOUNT: i32>(self, a: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_srai_epi32::<AMOUNT>(cast!(a)))
    }

    /// Shifts each lane of `a` right by `AMOUNT` bits, filling with zeros
    /// (logical shift).
    #[inline(always)]
    pub fn shr_const_u16x16<const AMOUNT: i32>(self, a: u16x16) -> u16x16 {
        cast!(self.avx2._mm256_srli_epi16::<AMOUNT>(cast!(a)))
    }

    /// Shifts each lane of `a` right by `AMOUNT` bits, filling with zeros
    /// (logical shift).
    #[inline(always)]
    pub fn shr_const_u32x8<const AMOUNT: i32>(self, a: u32x8) -> u32x8 {
        cast!(self.avx2._mm256_srli_epi32::<AMOUNT>(cast!(a)))
    }

    /// Shifts each lane of `a` right by `AMOUNT` bits, filling with zeros
    /// (logical shift).
    #[inline(always)]
    pub fn shr_const_u64x4<const AMOUNT: i32>(self, a: u64x4) -> u64x4 {
        cast!(self.avx2._mm256_srli_epi64::<AMOUNT>(cast!(a)))
    }
4753
    /// Shifts each lane of `a` right by the amount in the corresponding lane
    /// of `amount`, replicating the sign bit (arithmetic shift).
    #[inline(always)]
    pub fn shr_dyn_i32x4(self, a: i32x4, amount: i32x4) -> i32x4 {
        cast!(self.avx2._mm_srav_epi32(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` right by the amount in the corresponding lane
    /// of `amount`, replicating the sign bit (arithmetic shift).
    #[inline(always)]
    pub fn shr_dyn_i32x8(self, a: i32x8, amount: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_srav_epi32(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` right by the amount in the corresponding lane
    /// of `amount`, filling with zeros (logical shift).
    #[inline(always)]
    pub fn shr_dyn_u32x4(self, a: u32x4, amount: u32x4) -> u32x4 {
        cast!(self.avx2._mm_srlv_epi32(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` right by the amount in the corresponding lane
    /// of `amount`, filling with zeros (logical shift).
    #[inline(always)]
    pub fn shr_dyn_u32x8(self, a: u32x8, amount: u32x8) -> u32x8 {
        cast!(self.avx2._mm256_srlv_epi32(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` right by the amount in the corresponding lane
    /// of `amount`, filling with zeros (logical shift).
    #[inline(always)]
    pub fn shr_dyn_u64x2(self, a: u64x2, amount: u64x2) -> u64x2 {
        cast!(self.avx2._mm_srlv_epi64(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` right by the amount in the corresponding lane
    /// of `amount`, filling with zeros (logical shift).
    #[inline(always)]
    pub fn shr_dyn_u64x4(self, a: u64x4, amount: u64x4) -> u64x4 {
        cast!(self.avx2._mm256_srlv_epi64(cast!(a), cast!(amount)))
    }
4803
    /// Shifts each lane of `a` right by the amount in the first element of
    /// `amount`, replicating the sign bit (arithmetic shift).
    #[inline(always)]
    pub fn shr_i16x16(self, a: i16x16, amount: u64x2) -> i16x16 {
        cast!(self.avx2._mm256_sra_epi16(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` right by the amount in the first element of
    /// `amount`, replicating the sign bit (arithmetic shift).
    #[inline(always)]
    pub fn shr_i32x8(self, a: i32x8, amount: u64x2) -> i32x8 {
        cast!(self.avx2._mm256_sra_epi32(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` right by the amount in the first element of
    /// `amount`, filling with zeros (logical shift).
    #[inline(always)]
    pub fn shr_u16x16(self, a: u16x16, amount: u64x2) -> u16x16 {
        cast!(self.avx2._mm256_srl_epi16(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` right by the amount in the first element of
    /// `amount`, filling with zeros (logical shift).
    #[inline(always)]
    pub fn shr_u32x8(self, a: u32x8, amount: u64x2) -> u32x8 {
        cast!(self.avx2._mm256_srl_epi32(cast!(a), cast!(amount)))
    }

    /// Shifts each lane of `a` right by the amount in the first element of
    /// `amount`, filling with zeros (logical shift).
    #[inline(always)]
    pub fn shr_u64x4(self, a: u64x4, amount: u64x2) -> u64x4 {
        cast!(self.avx2._mm256_srl_epi64(cast!(a), cast!(amount)))
    }
4845
    /// Broadcasts `value` to every lane of the returned vector.
    #[inline(always)]
    pub fn splat_f32x8(self, value: f32) -> f32x8 {
        cast!(self.avx._mm256_set1_ps(value))
    }

    /// Broadcasts `value` to every lane of the returned vector.
    #[inline(always)]
    pub fn splat_f64x4(self, value: f64) -> f64x4 {
        cast!(self.avx._mm256_set1_pd(value))
    }

    /// Broadcasts `value` to every lane of the returned vector.
    #[inline(always)]
    pub fn splat_i16x16(self, value: i16) -> i16x16 {
        cast!(self.avx._mm256_set1_epi16(value))
    }

    /// Broadcasts `value` to every lane of the returned vector.
    #[inline(always)]
    pub fn splat_i32x8(self, value: i32) -> i32x8 {
        cast!(self.avx._mm256_set1_epi32(value))
    }

    /// Broadcasts `value` to every lane of the returned vector.
    #[inline(always)]
    pub fn splat_i64x4(self, value: i64) -> i64x4 {
        cast!(self.avx._mm256_set1_epi64x(value))
    }

    /// Broadcasts `value` to every lane of the returned vector.
    #[inline(always)]
    pub fn splat_i8x32(self, value: i8) -> i8x32 {
        cast!(self.avx._mm256_set1_epi8(value))
    }

    /// Broadcasts the mask `value` to every lane of the returned mask vector.
    #[inline(always)]
    pub fn splat_m16x16(self, value: m16) -> m16x16 {
        cast!(self.avx._mm256_set1_epi16(value.0 as i16))
    }

    /// Broadcasts the mask `value` to every lane of the returned mask vector.
    #[inline(always)]
    pub fn splat_m32x8(self, value: m32) -> m32x8 {
        cast!(self.avx._mm256_set1_epi32(value.0 as i32))
    }

    /// Broadcasts the mask `value` to every lane of the returned mask vector.
    #[inline(always)]
    pub fn splat_m64x4(self, value: m64) -> m64x4 {
        cast!(self.avx._mm256_set1_epi64x(value.0 as i64))
    }

    /// Broadcasts the mask `value` to every lane of the returned mask vector.
    #[inline(always)]
    pub fn splat_m8x32(self, value: m8) -> m8x32 {
        cast!(self.avx._mm256_set1_epi8(value.0 as i8))
    }

    /// Broadcasts `value` to every lane of the returned vector.
    #[inline(always)]
    pub fn splat_u16x16(self, value: u16) -> u16x16 {
        cast!(self.avx._mm256_set1_epi16(value as i16))
    }

    /// Broadcasts `value` to every lane of the returned vector.
    #[inline(always)]
    pub fn splat_u32x8(self, value: u32) -> u32x8 {
        cast!(self.avx._mm256_set1_epi32(value as i32))
    }

    /// Broadcasts `value` to every lane of the returned vector.
    #[inline(always)]
    pub fn splat_u64x4(self, value: u64) -> u64x4 {
        cast!(self.avx._mm256_set1_epi64x(value as i64))
    }

    /// Broadcasts `value` to every lane of the returned vector.
    #[inline(always)]
    pub fn splat_u8x32(self, value: u8) -> u8x32 {
        cast!(self.avx._mm256_set1_epi8(value as i8))
    }
4929
    /// Computes the square root of each lane of `a`.
    #[inline(always)]
    pub fn sqrt_f32x8(self, a: f32x8) -> f32x8 {
        cast!(self.avx._mm256_sqrt_ps(cast!(a)))
    }

    /// Computes the square root of each lane of `a`.
    #[inline(always)]
    pub fn sqrt_f64x4(self, a: f64x4) -> f64x4 {
        cast!(self.avx._mm256_sqrt_pd(cast!(a)))
    }
4941
    /// Subtracts each lane of `b` from `a`.
    #[inline(always)]
    pub fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
        cast!(self.avx._mm256_sub_ps(cast!(a), cast!(b)))
    }

    /// Subtracts each lane of `b` from `a`.
    #[inline(always)]
    pub fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
        cast!(self.avx._mm256_sub_pd(cast!(a), cast!(b)))
    }

    /// Alternately subtracts and adds the lanes of `a` and `b`: even-indexed
    /// lanes hold `a - b`, odd-indexed lanes hold `a + b`.
    #[inline(always)]
    pub fn subadd_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
        cast!(self.avx._mm256_addsub_ps(cast!(a), cast!(b)))
    }

    /// Alternately subtracts and adds the lanes of `a` and `b`: even-indexed
    /// lanes hold `a - b`, odd-indexed lanes hold `a + b`.
    #[inline(always)]
    pub fn subadd_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
        cast!(self.avx._mm256_addsub_pd(cast!(a), cast!(b)))
    }
4965
    /// Computes the absolute difference of corresponding bytes of `a` and `b`,
    /// then sums each consecutive group of eight differences into one 64-bit
    /// lane of the result.
    #[inline(always)]
    pub fn sum_of_absolute_differences_u8x32(self, a: u8x32, b: u8x32) -> u64x4 {
        cast!(self.avx2._mm256_sad_epu8(cast!(a), cast!(b)))
    }
4973
4974 #[inline(always)]
4976 pub fn truncate_f32x8(self, a: f32x8) -> f32x8 {
4977 const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
4978 cast!(self.avx._mm256_round_ps::<ROUNDING>(cast!(a)))
4979 }
4980
4981 #[inline(always)]
4983 pub fn truncate_f64x4(self, a: f64x4) -> f64x4 {
4984 const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
4985 cast!(self.avx._mm256_round_pd::<ROUNDING>(cast!(a)))
4986 }
4987
    /// Computes the absolute value of each lane of `a`, returned as unsigned.
    /// The unsigned result type means `i16::MIN` maps to `32768` correctly
    /// rather than overflowing.
    #[inline(always)]
    pub fn unsigned_abs_i16x16(self, a: i16x16) -> u16x16 {
        cast!(self.avx2._mm256_abs_epi16(cast!(a)))
    }

    /// Computes the absolute value of each lane of `a`, returned as unsigned.
    /// The unsigned result type means `i32::MIN` maps to `2147483648`
    /// correctly rather than overflowing.
    #[inline(always)]
    pub fn unsigned_abs_i32x8(self, a: i32x8) -> u32x8 {
        cast!(self.avx2._mm256_abs_epi32(cast!(a)))
    }

    /// Computes the absolute value of each lane of `a`, returned as unsigned.
    /// The unsigned result type means `i8::MIN` maps to `128` correctly rather
    /// than overflowing.
    #[inline(always)]
    pub fn unsigned_abs_i8x32(self, a: i8x32) -> u8x32 {
        cast!(self.avx2._mm256_abs_epi8(cast!(a)))
    }
5005
    /// Multiplies each lane of `a` and `b`, returning the low and high 16-bit
    /// halves of every full 32-bit product as `(lo, hi)`.
    #[inline(always)]
    pub fn widening_mul_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) {
        (
            cast!(self.avx2._mm256_mullo_epi16(cast!(a), cast!(b))),
            cast!(self.avx2._mm256_mulhi_epi16(cast!(a), cast!(b))),
        )
    }
5015
5016 #[inline(always)]
5019 pub fn widening_mul_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) {
5020 let a = cast!(a);
5021 let b = cast!(b);
5022 let avx2 = self.avx2;
5023
5024 let ab_evens = self.avx2._mm256_mul_epi32(a, b);
5026 let ab_odds = self.avx2._mm256_mul_epi32(
5028 avx2._mm256_srli_epi64::<32>(a),
5029 avx2._mm256_srli_epi64::<32>(b),
5030 );
5031
5032 let ab_lo = self.avx2._mm256_blend_epi32::<0b10101010>(
5033 ab_evens,
5035 avx2._mm256_slli_epi64::<32>(ab_odds),
5037 );
5038 let ab_hi = self.avx2._mm256_blend_epi32::<0b10101010>(
5039 avx2._mm256_srli_epi64::<32>(ab_evens),
5041 ab_odds,
5043 );
5044
5045 (cast!(ab_lo), cast!(ab_hi))
5046 }
5047
    /// Multiplies each lane of `a` and `b`, returning the low and high 16-bit
    /// halves of every full 32-bit product as `(lo, hi)`.
    #[inline(always)]
    pub fn widening_mul_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) {
        (
            cast!(self.avx2._mm256_mullo_epi16(cast!(a), cast!(b))),
            cast!(self.avx2._mm256_mulhi_epu16(cast!(a), cast!(b))),
        )
    }
5057
5058 #[inline(always)]
5061 pub fn widening_mul_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) {
5062 let a = cast!(a);
5063 let b = cast!(b);
5064 let avx2 = self.avx2;
5065
5066 let ab_evens = avx2._mm256_mul_epu32(a, b);
5068 let ab_odds = avx2._mm256_mul_epu32(
5070 avx2._mm256_srli_epi64::<32>(a),
5071 avx2._mm256_srli_epi64::<32>(b),
5072 );
5073
5074 let ab_lo = self.avx2._mm256_blend_epi32::<0b10101010>(
5075 ab_evens,
5077 avx2._mm256_slli_epi64::<32>(ab_odds),
5079 );
5080 let ab_hi = self.avx2._mm256_blend_epi32::<0b10101010>(
5081 avx2._mm256_srli_epi64::<32>(ab_evens),
5083 ab_odds,
5085 );
5086
5087 (cast!(ab_lo), cast!(ab_hi))
5088 }
5089
    /// Adds each lane of `a` and `b`, wrapping around on overflow.
    #[inline(always)]
    pub fn wrapping_add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_add_epi16(cast!(a), cast!(b)))
    }

    /// Adds each lane of `a` and `b`, wrapping around on overflow.
    #[inline(always)]
    pub fn wrapping_add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_add_epi32(cast!(a), cast!(b)))
    }

    /// Adds each lane of `a` and `b`, wrapping around on overflow.
    #[inline(always)]
    pub fn wrapping_add_i64x4(self, a: i64x4, b: i64x4) -> i64x4 {
        cast!(self.avx2._mm256_add_epi64(cast!(a), cast!(b)))
    }

    /// Adds each lane of `a` and `b`, wrapping around on overflow.
    #[inline(always)]
    pub fn wrapping_add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
        cast!(self.avx2._mm256_add_epi8(cast!(a), cast!(b)))
    }

    /// Adds each lane of `a` and `b`, wrapping around on overflow.
    #[inline(always)]
    pub fn wrapping_add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
        cast!(self.avx2._mm256_add_epi16(cast!(a), cast!(b)))
    }

    /// Adds each lane of `a` and `b`, wrapping around on overflow.
    #[inline(always)]
    pub fn wrapping_add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
        cast!(self.avx2._mm256_add_epi32(cast!(a), cast!(b)))
    }

    /// Adds each lane of `a` and `b`, wrapping around on overflow.
    #[inline(always)]
    pub fn wrapping_add_u64x4(self, a: u64x4, b: u64x4) -> u64x4 {
        cast!(self.avx2._mm256_add_epi64(cast!(a), cast!(b)))
    }

    /// Adds each lane of `a` and `b`, wrapping around on overflow.
    #[inline(always)]
    pub fn wrapping_add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
        cast!(self.avx2._mm256_add_epi8(cast!(a), cast!(b)))
    }
5137
    /// Multiplies each lane of `a` and `b`, keeping only the low 16 bits of
    /// each product (i.e. wrapping on overflow).
    #[inline(always)]
    pub fn wrapping_mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_mullo_epi16(cast!(a), cast!(b)))
    }

    /// Multiplies each lane of `a` and `b`, keeping only the low 32 bits of
    /// each product (i.e. wrapping on overflow).
    #[inline(always)]
    pub fn wrapping_mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_mullo_epi32(cast!(a), cast!(b)))
    }

    /// Multiplies each lane of `a` and `b`, keeping only the low 16 bits of
    /// each product (i.e. wrapping on overflow).
    #[inline(always)]
    pub fn wrapping_mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
        cast!(self.avx2._mm256_mullo_epi16(cast!(a), cast!(b)))
    }

    /// Multiplies each lane of `a` and `b`, keeping only the low 32 bits of
    /// each product (i.e. wrapping on overflow).
    #[inline(always)]
    pub fn wrapping_mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
        cast!(self.avx2._mm256_mullo_epi32(cast!(a), cast!(b)))
    }
5161
    /// Subtracts each lane of `b` from `a`, wrapping around on overflow.
    #[inline(always)]
    pub fn wrapping_sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_sub_epi16(cast!(a), cast!(b)))
    }

    /// Subtracts each lane of `b` from `a`, wrapping around on overflow.
    #[inline(always)]
    pub fn wrapping_sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_sub_epi32(cast!(a), cast!(b)))
    }

    /// Subtracts each lane of `b` from `a`, wrapping around on overflow.
    #[inline(always)]
    pub fn wrapping_sub_i64x4(self, a: i64x4, b: i64x4) -> i64x4 {
        cast!(self.avx2._mm256_sub_epi64(cast!(a), cast!(b)))
    }

    /// Subtracts each lane of `b` from `a`, wrapping around on overflow.
    #[inline(always)]
    pub fn wrapping_sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
        cast!(self.avx2._mm256_sub_epi8(cast!(a), cast!(b)))
    }

    /// Subtracts each lane of `b` from `a`, wrapping around on underflow.
    #[inline(always)]
    pub fn wrapping_sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
        cast!(self.avx2._mm256_sub_epi16(cast!(a), cast!(b)))
    }

    /// Subtracts each lane of `b` from `a`, wrapping around on underflow.
    #[inline(always)]
    pub fn wrapping_sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
        cast!(self.avx2._mm256_sub_epi32(cast!(a), cast!(b)))
    }

    /// Subtracts each lane of `b` from `a`, wrapping around on underflow.
    #[inline(always)]
    pub fn wrapping_sub_u64x4(self, a: u64x4, b: u64x4) -> u64x4 {
        cast!(self.avx2._mm256_sub_epi64(cast!(a), cast!(b)))
    }

    /// Subtracts each lane of `b` from `a`, wrapping around on underflow.
    #[inline(always)]
    pub fn wrapping_sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
        cast!(self.avx2._mm256_sub_epi8(cast!(a), cast!(b)))
    }
5209
    /// Returns the bitwise XOR of `a` and `b`.
    #[inline(always)]
    pub fn xor_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
        cast!(self.avx._mm256_xor_ps(cast!(a), cast!(b)))
    }

    /// Returns the bitwise XOR of `a` and `b`.
    #[inline(always)]
    pub fn xor_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
        cast!(self.avx._mm256_xor_pd(cast!(a), cast!(b)))
    }

    /// Returns the bitwise XOR of `a` and `b`.
    #[inline(always)]
    pub fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
        cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
    }

    /// Returns the bitwise XOR of `a` and `b`.
    #[inline(always)]
    pub fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
        cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
    }

    /// Returns the bitwise XOR of `a` and `b`.
    #[inline(always)]
    pub fn xor_i64x4(self, a: i64x4, b: i64x4) -> i64x4 {
        cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
    }

    /// Returns the bitwise XOR of `a` and `b`.
    #[inline(always)]
    pub fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
        cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
    }

    /// Returns the lane-wise XOR of the masks `a` and `b`.
    #[inline(always)]
    pub fn xor_m16x16(self, a: m16x16, b: m16x16) -> m16x16 {
        cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
    }

    /// Returns the lane-wise XOR of the masks `a` and `b`.
    #[inline(always)]
    pub fn xor_m32x8(self, a: m32x8, b: m32x8) -> m32x8 {
        cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
    }

    /// Returns the lane-wise XOR of the masks `a` and `b`.
    #[inline(always)]
    pub fn xor_m64x4(self, a: m64x4, b: m64x4) -> m64x4 {
        cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
    }

    /// Returns the lane-wise XOR of the masks `a` and `b`.
    #[inline(always)]
    pub fn xor_m8x32(self, a: m8x32, b: m8x32) -> m8x32 {
        cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
    }

    /// Returns the bitwise XOR of `a` and `b`.
    #[inline(always)]
    pub fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
        cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
    }

    /// Returns the bitwise XOR of `a` and `b`.
    #[inline(always)]
    pub fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
        cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
    }

    /// Returns the bitwise XOR of `a` and `b`.
    #[inline(always)]
    pub fn xor_u64x4(self, a: u64x4, b: u64x4) -> u64x4 {
        cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
    }

    /// Returns the bitwise XOR of `a` and `b`.
    #[inline(always)]
    pub fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
        cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
    }
5293}