// pulp/x86/v3.rs

1use super::*;
2
3// https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels
simd_type!({
	/// AVX instruction set.
	///
	/// Notable additions over [`V2`] include:
	///  - Instructions operating on 256-bit SIMD vectors.
	///  - Shift functions with a separate shift per lane, such as [`V3::shl_dyn_u32x4`].
	///  - Fused multiply-accumulate instructions, such as [`V3::mul_add_f32x4`].
	// Each field is a feature token produced by the `f!` macro — holding a `V3`
	// is proof that every listed target feature was detected, so the intrinsic
	// wrappers reached through these fields can be called safely.
	#[allow(missing_docs)]
	pub struct V3 {
		pub sse: f!("sse"),
		pub sse2: f!("sse2"),
		pub fxsr: f!("fxsr"),
		pub sse3: f!("sse3"),
		pub ssse3: f!("ssse3"),
		pub sse4_1: f!("sse4.1"),
		pub sse4_2: f!("sse4.2"),
		pub popcnt: f!("popcnt"),
		pub avx: f!("avx"),
		pub avx2: f!("avx2"),
		pub bmi1: f!("bmi1"),
		pub bmi2: f!("bmi2"),
		pub fma: f!("fma"),
		pub lzcnt: f!("lzcnt"),
	}
});
29
30// copied from the standard library
// copied from the standard library
/// Full 32-lane byte shuffle with out-of-bounds-is-zero semantics.
///
/// `vpshufb` (`_mm256_shuffle_epi8`) only shuffles within each 128-bit half;
/// this routine composes two half-duplicated shuffles and two blends so that
/// each output byte is `bytes[idxs[i]]` for `idxs[i] < 32`, and `0` otherwise
/// (indices 32..=127 are zeroed by the blend against the zero vector; indices
/// with the high bit set are zeroed by `pshufb` itself).
#[inline(always)]
fn avx2_pshufb(simd: V3, bytes: __m256i, idxs: __m256i) -> __m256i {
	let mid = simd.avx._mm256_set1_epi8(16i8);
	let high = simd.avx._mm256_set1_epi8(32i8);
	// This is ordering sensitive, and LLVM will order these how you put them.
	// Most AVX2 impls use ~5 "ports", and only 1 or 2 are capable of permutes.
	// But the "compose" step will lower to ops that can also use at least 1 other port.
	// So this tries to break up permutes so composition flows through "open" ports.
	// Comparative benches should be done on multiple AVX2 CPUs before reordering this

	let hihi = simd.avx2._mm256_permute2x128_si256::<0x11>(bytes, bytes);
	let hi_shuf = simd.avx2._mm256_shuffle_epi8(
		hihi, // duplicate the vector's top half
		idxs, // so that using only 4 bits of an index still picks bytes 16-31
	);
	// A zero-fill during the compose step gives the "all-Neon-like" OOB-is-0 semantics
	let compose = simd.avx2._mm256_blendv_epi8(
		simd.avx._mm256_set1_epi8(0),
		hi_shuf,
		simd.avx2._mm256_cmpgt_epi8(high, idxs),
	);
	let lolo = simd.avx2._mm256_permute2x128_si256::<0x00>(bytes, bytes);
	let lo_shuf = simd.avx2._mm256_shuffle_epi8(lolo, idxs);
	// Repeat, then pick indices < 16, overwriting indices 0-15 from previous compose step
	simd.avx2
		._mm256_blendv_epi8(compose, lo_shuf, simd.avx2._mm256_cmpgt_epi8(mid, idxs))
}
58
/// Byte-shuffle index tables for rotating a 32-byte vector.
///
/// `AVX2_ROTATE_IDX[k]` rotates lanes upward by `k` positions when used as
/// shuffle indices: `out[i] = in[(i + 32 - k) % 32]`. Row 0 is the identity
/// permutation; row `k` starts at `32 - k` and wraps around.
static AVX2_ROTATE_IDX: [u8x32; 32] = [
	u8x32(
		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
		25, 26, 27, 28, 29, 30, 31,
	),
	u8x32(
		31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
		24, 25, 26, 27, 28, 29, 30,
	),
	u8x32(
		30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
		23, 24, 25, 26, 27, 28, 29,
	),
	u8x32(
		29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
		22, 23, 24, 25, 26, 27, 28,
	),
	u8x32(
		28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
		21, 22, 23, 24, 25, 26, 27,
	),
	u8x32(
		27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
		20, 21, 22, 23, 24, 25, 26,
	),
	u8x32(
		26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
		19, 20, 21, 22, 23, 24, 25,
	),
	u8x32(
		25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
		18, 19, 20, 21, 22, 23, 24,
	),
	u8x32(
		24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
		17, 18, 19, 20, 21, 22, 23,
	),
	u8x32(
		23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
		16, 17, 18, 19, 20, 21, 22,
	),
	u8x32(
		22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
		15, 16, 17, 18, 19, 20, 21,
	),
	u8x32(
		21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
		14, 15, 16, 17, 18, 19, 20,
	),
	u8x32(
		20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
		13, 14, 15, 16, 17, 18, 19,
	),
	u8x32(
		19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
		12, 13, 14, 15, 16, 17, 18,
	),
	u8x32(
		18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
		11, 12, 13, 14, 15, 16, 17,
	),
	u8x32(
		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
		10, 11, 12, 13, 14, 15, 16,
	),
	u8x32(
		16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8,
		9, 10, 11, 12, 13, 14, 15,
	),
	u8x32(
		15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14,
	),
	u8x32(
		14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5,
		6, 7, 8, 9, 10, 11, 12, 13,
	),
	u8x32(
		13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4,
		5, 6, 7, 8, 9, 10, 11, 12,
	),
	u8x32(
		12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3,
		4, 5, 6, 7, 8, 9, 10, 11,
	),
	u8x32(
		11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1,
		2, 3, 4, 5, 6, 7, 8, 9, 10,
	),
	u8x32(
		10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,
		1, 2, 3, 4, 5, 6, 7, 8, 9,
	),
	u8x32(
		9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
		0, 1, 2, 3, 4, 5, 6, 7, 8,
	),
	u8x32(
		8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
		31, 0, 1, 2, 3, 4, 5, 6, 7,
	),
	u8x32(
		7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
		30, 31, 0, 1, 2, 3, 4, 5, 6,
	),
	u8x32(
		6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
		30, 31, 0, 1, 2, 3, 4, 5,
	),
	u8x32(
		5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
		29, 30, 31, 0, 1, 2, 3, 4,
	),
	u8x32(
		4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
		28, 29, 30, 31, 0, 1, 2, 3,
	),
	u8x32(
		3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
		27, 28, 29, 30, 31, 0, 1, 2,
	),
	u8x32(
		2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
		27, 28, 29, 30, 31, 0, 1,
	),
	u8x32(
		1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
		26, 27, 28, 29, 30, 31, 0,
	),
];
189
/// 16-byte analogue of [`AVX2_ROTATE_IDX`]: `AVX2_128_ROTATE_IDX[k]` rotates
/// the lanes of a 128-bit vector upward by `k` (`out[i] = in[(i + 16 - k) % 16]`).
static AVX2_128_ROTATE_IDX: [u8x16; 16] = [
	u8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
	u8x16(15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14),
	u8x16(14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13),
	u8x16(13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
	u8x16(12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
	u8x16(11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
	u8x16(10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
	u8x16(9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8),
	u8x16(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7),
	u8x16(7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6),
	u8x16(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5),
	u8x16(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4),
	u8x16(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3),
	u8x16(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2),
	u8x16(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1),
	u8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0),
];
208
/// `V3_U32_MASKS[n]` has the *first* `n` of 8 lanes all-ones (active) and the
/// rest zero. Used to mask the head of a partial 8×u32 load/store.
static V3_U32_MASKS: [u32x8; 9] = [
	u32x8(0, 0, 0, 0, 0, 0, 0, 0),
	u32x8(!0, 0, 0, 0, 0, 0, 0, 0),
	u32x8(!0, !0, 0, 0, 0, 0, 0, 0),
	u32x8(!0, !0, !0, 0, 0, 0, 0, 0),
	u32x8(!0, !0, !0, !0, 0, 0, 0, 0),
	u32x8(!0, !0, !0, !0, !0, 0, 0, 0),
	u32x8(!0, !0, !0, !0, !0, !0, 0, 0),
	u32x8(!0, !0, !0, !0, !0, !0, !0, 0),
	u32x8(!0, !0, !0, !0, !0, !0, !0, !0),
];
/// `V3_U32_LAST_MASKS[n]` has the *last* `n` of 8 lanes all-ones; indexing
/// with `8 - start` yields a mask of every lane at index >= `start`.
static V3_U32_LAST_MASKS: [u32x8; 9] = [
	u32x8(0, 0, 0, 0, 0, 0, 0, 0),
	u32x8(0, 0, 0, 0, 0, 0, 0, !0),
	u32x8(0, 0, 0, 0, 0, 0, !0, !0),
	u32x8(0, 0, 0, 0, 0, !0, !0, !0),
	u32x8(0, 0, 0, 0, !0, !0, !0, !0),
	u32x8(0, 0, 0, !0, !0, !0, !0, !0),
	u32x8(0, 0, !0, !0, !0, !0, !0, !0),
	u32x8(0, !0, !0, !0, !0, !0, !0, !0),
	u32x8(!0, !0, !0, !0, !0, !0, !0, !0),
];
231
// Opt both token types into the crate's sealed-trait pattern so downstream
// crates cannot implement `Simd` themselves.
impl Seal for V3 {}
impl Seal for V3_Scalar {}

/// Transparent wrapper around a [`V3`] token.
// NOTE(review): presumably the scalar (one-element-at-a-time) execution
// variant of `V3` — confirm against the corresponding `Simd` impl, which is
// not visible in this chunk.
#[derive(Copy, Clone, Debug)]
#[repr(transparent)]
pub struct V3_Scalar(pub V3);
238
239#[inline(always)]
240pub(super) fn avx_load_u32s(simd: Avx2, slice: &[u32]) -> u32x8 {
241	_ = simd;
242	unsafe { avx_ld_u32s(slice.as_ptr(), LD_ST[2 * (16 * slice.len().min(8))]) }
243}
244
245#[inline(always)]
246pub(super) fn avx_store_u32s(simd: Avx2, slice: &mut [u32], value: u32x8) {
247	_ = simd;
248	unsafe {
249		avx_st_u32s(
250			slice.as_mut_ptr(),
251			value,
252			LD_ST[2 * (16 * slice.len().min(8)) + 1],
253		);
254	}
255}
256
257impl core::ops::Deref for V3 {
258	type Target = V2;
259
260	#[inline(always)]
261	fn deref(&self) -> &Self::Target {
262		V2 {
263			sse: self.sse,
264			sse2: self.sse2,
265			fxsr: self.fxsr,
266			sse3: self.sse3,
267			ssse3: self.ssse3,
268			sse4_1: self.sse4_1,
269			sse4_2: self.sse4_2,
270			popcnt: self.popcnt,
271		}
272		.to_ref()
273	}
274}
275
276impl Simd for V3 {
277	type c32s = f32x8;
278	type c64s = f64x4;
279	type f32s = f32x8;
280	type f64s = f64x4;
281	type i32s = i32x8;
282	type i64s = i64x4;
283	type m32s = m32x8;
284	type m64s = m64x4;
285	type u32s = u32x8;
286	type u64s = u64x4;
287
288	const REGISTER_COUNT: usize = 16;
289
	/// Lane-wise `re*re + im*im`, written into *both* slots of each complex
	/// number: squaring, then adding the (re, im)-swapped squares, puts the
	/// same sum in the real and imaginary lane.
	#[inline(always)]
	fn abs2_c32s(self, a: Self::c32s) -> Self::c32s {
		let sqr = self.mul_f32s(a, a);
		// 0b10_11_00_01 swaps each adjacent (re, im) pair within the halves.
		let sqr_rev = self
			.avx
			._mm256_shuffle_ps::<0b10_11_00_01>(cast!(sqr), cast!(sqr));
		self.add_f32s(sqr, cast!(sqr_rev))
	}

	/// `f64` analogue of [`Self::abs2_c32s`]; 0b0101 swaps each (re, im) pair.
	#[inline(always)]
	fn abs2_c64s(self, a: Self::c64s) -> Self::c64s {
		let sqr = self.mul_f64s(a, a);
		let sqr_rev = self.avx._mm256_shuffle_pd::<0b0101>(cast!(sqr), cast!(sqr));
		self.add_f64s(sqr, cast!(sqr_rev))
	}
305
306	#[inline(always)]
307	fn abs_max_c32s(self, a: Self::c32s) -> Self::c32s {
308		let max = self.abs_f32s(a);
309		let max_rev = self
310			.avx
311			._mm256_shuffle_ps::<0b10_11_00_01>(cast!(a), cast!(a));
312		self.max_f32s(max, cast!(max_rev))
313	}
314
	/// Lane-wise `max(|re|, |im|)`, written into both slots of each complex
	/// number: take absolute values, swap each (re, im) pair, take the max.
	#[inline(always)]
	fn abs_max_c64s(self, a: Self::c64s) -> Self::c64s {
		let max = self.abs_f64s(a);
		let max_rev = self.avx._mm256_shuffle_pd::<0b0101>(cast!(max), cast!(max));
		self.max_f64s(max, cast!(max_rev))
	}
321
	/// Complex addition is plain element-wise float addition.
	#[inline(always)]
	fn add_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
		self.add_f32s(a, b)
	}

	/// Complex addition is plain element-wise float addition.
	#[inline(always)]
	fn add_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
		self.add_f64s(a, b)
	}

	/// Element-wise `f32` addition (`vaddps`).
	#[inline(always)]
	fn add_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
		cast!(self.avx._mm256_add_ps(cast!(a), cast!(b)))
	}

	/// Element-wise `f64` addition (`vaddpd`).
	#[inline(always)]
	fn add_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
		cast!(self.avx._mm256_add_pd(cast!(a), cast!(b)))
	}

	/// Element-wise wrapping `u32` addition.
	#[inline(always)]
	fn add_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
		cast!(self.avx2._mm256_add_epi32(cast!(a), cast!(b)))
	}

	/// Element-wise wrapping `u64` addition.
	#[inline(always)]
	fn add_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
		cast!(self.avx2._mm256_add_epi64(cast!(a), cast!(b)))
	}
351
	// Bitwise AND is type-agnostic on the raw bits, so all four variants
	// reuse the same `vandpd` instruction through `cast!`.

	#[inline(always)]
	fn and_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
		cast!(self.avx._mm256_and_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn and_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
		cast!(self.avx._mm256_and_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn and_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
		cast!(self.avx._mm256_and_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn and_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
		cast!(self.avx._mm256_and_pd(cast!(a), cast!(b)))
	}
371
	/// Complex conjugate: XOR with (+0.0, -0.0) flips the sign bit of every
	/// imaginary lane and leaves the real lanes untouched.
	#[inline(always)]
	fn conj_c32s(self, a: Self::c32s) -> Self::c32s {
		self.xor_f32s(a, self.splat_c32s(c32 { re: 0.0, im: -0.0 }))
	}

	/// Complex conjugate; see [`Self::conj_c32s`].
	#[inline(always)]
	fn conj_c64s(self, a: Self::c64s) -> Self::c64s {
		self.xor_f64s(a, self.splat_c64s(c64 { re: 0.0, im: -0.0 }))
	}
381
	/// Computes `conj(a) * b + c` element-wise over packed complex numbers.
	///
	/// Standard interleaved complex-multiply kernel: broadcast the real parts
	/// (`aa`) and imaginary parts (`bb`) of `a` within each pair, swap the
	/// (re, im) pairs of `b` (`yx`), then combine with fused multiply-add
	/// instructions whose alternating add/subtract signs produce the
	/// conjugated product (`fmsubadd` here vs. `fmaddsub` in
	/// [`Self::mul_add_c32s`]).
	#[inline(always)]
	fn conj_mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm256_permute_ps::<0b10_11_00_01>(xy);
		let aa = self.avx._mm256_moveldup_ps(ab);
		let bb = self.avx._mm256_movehdup_ps(ab);

		cast!(
			self.fma
				._mm256_fmsubadd_ps(aa, xy, self.fma._mm256_fmsubadd_ps(bb, yx, cast!(c)))
		)
	}

	/// `f64` analogue of [`Self::conj_mul_add_c32s`]: `conj(a) * b + c`.
	#[inline(always)]
	fn conj_mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm256_permute_pd::<0b0101>(xy);
		let aa = self.avx._mm256_unpacklo_pd(ab, ab);
		let bb = self.avx._mm256_unpackhi_pd(ab, ab);

		cast!(
			self.fma
				._mm256_fmsubadd_pd(aa, xy, self.fma._mm256_fmsubadd_pd(bb, yx, cast!(c)))
		)
	}

	/// Computes `conj(a) * b` element-wise; same kernel as
	/// [`Self::conj_mul_add_c32s`] with a plain multiply in the inner step.
	#[inline(always)]
	fn conj_mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm256_permute_ps::<0b10_11_00_01>(xy);
		let aa = self.avx._mm256_moveldup_ps(ab);
		let bb = self.avx._mm256_movehdup_ps(ab);

		cast!(
			self.fma
				._mm256_fmsubadd_ps(aa, xy, self.avx._mm256_mul_ps(bb, yx))
		)
	}

	/// Computes `conj(a) * b` element-wise; `f64` analogue of
	/// [`Self::conj_mul_c32s`].
	#[inline(always)]
	fn conj_mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm256_permute_pd::<0b0101>(xy);
		let aa = self.avx._mm256_unpacklo_pd(ab, ab);
		let bb = self.avx._mm256_unpackhi_pd(ab, ab);

		cast!(
			self.fma
				._mm256_fmsubadd_pd(aa, xy, self.avx._mm256_mul_pd(bb, yx))
		)
	}
441
	/// Separates interleaved `f32` fields into per-field vectors, in a
	/// *shuffled* lane order (hence `_shfl`): as the step comments show, each
	/// output vector holds one field's values but not in sequential index
	/// order. [`Self::interleave_shfl_f32s`] applies the same permutation,
	/// which is its own inverse. Specialized for `T` = 2 or 4 vectors; any
	/// other size takes the generic fallback.
	#[inline(always)]
	fn deinterleave_shfl_f32s<T: Interleave>(self, values: T) -> T {
		let avx = self.avx;

		if try_const! { core::mem::size_of::<T>() == 2 * core::mem::size_of::<Self::f32s>() } {
			let values: [__m256d; 2] = unsafe { core::mem::transmute_copy(&values) };
			// a0 b0 a1 b1 a2 b2 a3 b3
			// a4 b4 a5 b5 a6 b6 a7 b7

			// a0 a4 b0 b4 a2 a6 b2 b6
			// a1 a5 b1 b5 a3 a7 b3 b7
			let values = [
				cast!(avx._mm256_unpacklo_ps(cast!(values[0]), cast!(values[1]))),
				cast!(avx._mm256_unpackhi_ps(cast!(values[0]), cast!(values[1]))),
			];

			// a0 a4 a1 a5 a2 a6 a3 a7
			// b0 b4 b1 b5 b2 b6 b3 b7
			let values = [
				avx._mm256_unpacklo_pd(values[0], values[1]),
				avx._mm256_unpackhi_pd(values[0], values[1]),
			];

			unsafe { core::mem::transmute_copy(&values) }
		} else if try_const! { core::mem::size_of::<T>() == 4 * core::mem::size_of::<Self::f32s>() }
		{
			// a0 b0 c0 d0 a1 b1 c1 d1
			// a2 b2 c2 d2 a3 b3 c3 d3
			// a4 b4 c4 d4 a5 b5 c5 d5
			// a6 b6 c6 d6 a7 b7 c7 d7
			let values: [__m256d; 4] = unsafe { core::mem::transmute_copy(&values) };

			// a0 a2 c0 c2 a1 a3 c1 c3
			// b0 b2 d0 d2 b1 b3 d1 d3
			// a4 a6 c4 c6 a5 a7 c5 c7
			// b4 b6 d4 d6 b5 b7 d5 d7
			let values = [
				cast!(avx._mm256_unpacklo_ps(cast!(values[0]), cast!(values[1]))),
				cast!(avx._mm256_unpackhi_ps(cast!(values[0]), cast!(values[1]))),
				cast!(avx._mm256_unpacklo_ps(cast!(values[2]), cast!(values[3]))),
				cast!(avx._mm256_unpackhi_ps(cast!(values[2]), cast!(values[3]))),
			];

			let values = [
				avx._mm256_unpacklo_pd(values[0], values[1]),
				avx._mm256_unpackhi_pd(values[0], values[1]),
				avx._mm256_unpacklo_pd(values[2], values[3]),
				avx._mm256_unpackhi_pd(values[2], values[3]),
			];

			// a0 a2 a4 a6 a1 a3 a5 a7
			// b0 b2 b4 b6 b1 b3 b5 b7
			// c0 c2 c4 c6 c1 c3 c5 c7
			// d0 d2 d4 d6 d1 d3 d5 d7
			let values = [
				avx._mm256_unpacklo_pd(values[0], values[2]),
				avx._mm256_unpacklo_pd(values[1], values[3]),
				avx._mm256_unpackhi_pd(values[0], values[2]),
				avx._mm256_unpackhi_pd(values[1], values[3]),
			];

			unsafe { core::mem::transmute_copy(&values) }
		} else {
			unsafe { deinterleave_fallback::<f32, Self::f32s, T>(values) }
		}
	}
508
	/// `f64` analogue of [`Self::deinterleave_shfl_f32s`]. The 4-vector case
	/// fully restores sequential order (see the step comments); the 2-vector
	/// case leaves lanes in `unpacklo`/`unpackhi` order.
	#[inline(always)]
	fn deinterleave_shfl_f64s<T: Interleave>(self, values: T) -> T {
		let avx = self.avx;

		if try_const! { core::mem::size_of::<T>() == 2 * core::mem::size_of::<Self::f64s>() } {
			let values: [__m256d; 2] = unsafe { core::mem::transmute_copy(&values) };
			let values = [
				avx._mm256_unpacklo_pd(values[0], values[1]),
				avx._mm256_unpackhi_pd(values[0], values[1]),
			];
			unsafe { core::mem::transmute_copy(&values) }
		} else if try_const! { core::mem::size_of::<T>() == 4 * core::mem::size_of::<Self::f64s>() }
		{
			let values: [__m256d; 4] = unsafe { core::mem::transmute_copy(&values) };

			// a0 b0 c0 d0
			// a1 b1 c1 d1
			// a2 b2 c2 d2
			// a3 b3 c3 d3

			// a0 a1 c0 c1
			// b0 b1 d0 d1
			// a2 a3 c2 c3
			// b2 b3 d2 d3
			let values: [__m256d; 4] = [
				avx._mm256_unpacklo_pd(values[0], values[1]),
				avx._mm256_unpackhi_pd(values[0], values[1]),
				avx._mm256_unpacklo_pd(values[2], values[3]),
				avx._mm256_unpackhi_pd(values[2], values[3]),
			];

			// a0 a1 a2 a3
			// b0 b1 b2 b3
			// c0 c1 c2 c3
			// d0 d1 d2 d3
			let values = [
				avx._mm256_permute2f128_pd::<0b0010_0000>(values[0], values[2]),
				avx._mm256_permute2f128_pd::<0b0010_0000>(values[1], values[3]),
				avx._mm256_permute2f128_pd::<0b0011_0001>(values[0], values[2]),
				avx._mm256_permute2f128_pd::<0b0011_0001>(values[1], values[3]),
			];

			unsafe { core::mem::transmute_copy(&values) }
		} else {
			unsafe { deinterleave_fallback::<f64, Self::f64s, T>(values) }
		}
	}
556
	/// Element-wise `f32` division (`vdivps`).
	#[inline(always)]
	fn div_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
		cast!(self.avx._mm256_div_ps(cast!(a), cast!(b)))
	}

	/// Element-wise `f64` division (`vdivpd`).
	#[inline(always)]
	fn div_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
		cast!(self.avx._mm256_div_pd(cast!(a), cast!(b)))
	}

	/// Element-wise `a == b`; `_CMP_EQ_OQ` is an ordered, non-signaling
	/// compare, so NaN operands produce a false lane.
	#[inline(always)]
	fn equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
		cast!(self.avx._mm256_cmp_ps::<_CMP_EQ_OQ>(cast!(a), cast!(b)))
	}

	/// Element-wise `a == b`; see [`Self::equal_f32s`] for NaN behavior.
	#[inline(always)]
	fn equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
		cast!(self.avx._mm256_cmp_pd::<_CMP_EQ_OQ>(cast!(a), cast!(b)))
	}
576
	// Unsigned integer comparisons delegate to the V3-level fixed-width
	// helpers (`cmp_ge_u32x8` etc.), which handle the signed-compare
	// adjustment required by AVX2.

	#[inline(always)]
	fn greater_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
		self.cmp_ge_u32x8(a, b)
	}

	#[inline(always)]
	fn greater_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
		self.cmp_ge_u64x4(a, b)
	}

	#[inline(always)]
	fn greater_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
		self.cmp_gt_u32x8(a, b)
	}

	#[inline(always)]
	fn greater_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
		self.cmp_gt_u64x4(a, b)
	}
596
	/// Inverse of [`Self::deinterleave_shfl_f32s`]. For the specialized 2x
	/// and 4x sizes the shuffle permutation is an involution, so the same
	/// routine serves both directions; other sizes use the generic fallback.
	#[inline(always)]
	fn interleave_shfl_f32s<T: Interleave>(self, values: T) -> T {
		if try_const! {
			(core::mem::size_of::<T>() == 2 * core::mem::size_of::<Self::f32s>())
				|| (core::mem::size_of::<T>() == 4 * core::mem::size_of::<Self::f32s>())
		} {
			// permutation is inverse of itself in this case
			self.deinterleave_shfl_f32s(values)
		} else {
			unsafe { interleave_fallback::<f32, Self::f32s, T>(values) }
		}
	}

	/// Inverse of [`Self::deinterleave_shfl_f64s`]; see
	/// [`Self::interleave_shfl_f32s`].
	#[inline(always)]
	fn interleave_shfl_f64s<T: Interleave>(self, values: T) -> T {
		if try_const! {
			(core::mem::size_of::<T>() == 2 * core::mem::size_of::<Self::f64s>())
				|| (core::mem::size_of::<T>() == 4 * core::mem::size_of::<Self::f64s>())
		} {
			// permutation is inverse of itself in this case
			self.deinterleave_shfl_f64s(values)
		} else {
			unsafe { interleave_fallback::<f64, Self::f64s, T>(values) }
		}
	}
622
	// Float comparisons use ordered, non-signaling predicates (`_OQ`): any
	// NaN operand yields a false lane. Unsigned integer comparisons delegate
	// to the V3-level fixed-width helpers.

	#[inline(always)]
	fn less_than_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
		cast!(self.avx._mm256_cmp_ps::<_CMP_LT_OQ>(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn less_than_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
		cast!(self.avx._mm256_cmp_pd::<_CMP_LT_OQ>(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn less_than_or_equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
		cast!(self.avx._mm256_cmp_ps::<_CMP_LE_OQ>(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn less_than_or_equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
		cast!(self.avx._mm256_cmp_pd::<_CMP_LE_OQ>(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn less_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
		self.cmp_le_u32x8(a, b)
	}

	#[inline(always)]
	fn less_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
		self.cmp_le_u64x4(a, b)
	}

	#[inline(always)]
	fn less_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
		self.cmp_lt_u32x8(a, b)
	}

	#[inline(always)]
	fn less_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
		self.cmp_lt_u64x4(a, b)
	}
662
	/// Builds a memory mask whose active 32-bit lanes are `start..end`
	/// (clamped to 8): `V3_U32_LAST_MASKS[8 - start]` covers lanes `>= start`,
	/// ANDed with `V3_U32_MASKS[end]` covering lanes `< end`. Also records the
	/// specialized load/store routines from `LD_ST`, indexed by
	/// `2 * (16 * end + start)` (+0 load, +1 store).
	#[inline(always)]
	fn mask_between_m32s(self, start: u32, end: u32) -> MemMask<Self::m32s> {
		let start = start.min(8) as usize;
		let end = end.min(8) as usize;
		MemMask {
			mask: self.and_m32s(
				cast!(V3_U32_LAST_MASKS[8 - start]),
				cast!(V3_U32_MASKS[end]),
			),
			load: Some(LD_ST[2 * (16 * end + start) + 0]),
			store: Some(LD_ST[2 * (16 * end + start) + 1]),
		}
	}

	/// 64-bit variant of [`Self::mask_between_m32s`]: each 64-bit lane spans
	/// two 32-bit lanes, so the indices are doubled and the same u32-lane
	/// mask tables are reused.
	#[inline(always)]
	fn mask_between_m64s(self, start: u64, end: u64) -> MemMask<Self::m64s> {
		let start = (2 * start.min(4)) as usize;
		let end = (2 * end.min(4)) as usize;
		MemMask {
			mask: self.and_m64s(
				cast!(V3_U32_LAST_MASKS[8 - start]),
				cast!(V3_U32_MASKS[end]),
			),
			load: Some(LD_ST[2 * (16 * end + start) + 0]),
			store: Some(LD_ST[2 * (16 * end + start) + 1]),
		}
	}
690
	/// # Safety
	///
	/// See the trait-level safety documentation.
	// Complex loads reinterpret the pointer and defer to the u32 routine.
	#[inline(always)]
	unsafe fn mask_load_ptr_c32s(self, mask: MemMask<Self::m32s>, ptr: *const c32) -> Self::c32s {
		cast!(self.mask_load_ptr_u32s(mask, ptr as _))
	}

	/// # Safety
	///
	/// See the trait-level safety documentation.
	#[inline(always)]
	unsafe fn mask_load_ptr_c64s(self, mask: MemMask<Self::m64s>, ptr: *const c64) -> Self::c64s {
		cast!(self.mask_load_ptr_u64s(mask, ptr as _))
	}

	/// # Safety
	///
	/// See the trait-level safety documentation.
	#[inline(always)]
	unsafe fn mask_load_ptr_u32s(self, mask: MemMask<Self::m32s>, ptr: *const u32) -> Self::u32s {
		match mask.load {
			// Specialized routine recorded by `mask_between_m32s`.
			Some(load) => avx_ld_u32s(ptr, load),
			// Arbitrary mask: fall back to the AVX2 masked load.
			None => cast!(self.avx2._mm256_maskload_epi32(ptr as _, cast!(mask.mask))),
		}
	}

	/// # Safety
	///
	/// See the trait-level safety documentation.
	// A 64-bit masked load is the 32-bit one with the mask reinterpreted as
	// u32 lanes (each u64 lane corresponds to two identical u32 mask lanes).
	#[inline(always)]
	unsafe fn mask_load_ptr_u64s(self, mask: MemMask<Self::m64s>, ptr: *const u64) -> Self::u64s {
		cast!(self.mask_load_ptr_u32s(
			MemMask {
				mask: cast!(mask.mask),
				load: mask.load,
				store: mask.store
			},
			ptr as _
		))
	}
732
	/// # Safety
	///
	/// See the trait-level safety documentation.
	// Complex stores reinterpret the pointer and defer to the u32/u64 routine.
	#[inline(always)]
	unsafe fn mask_store_ptr_c32s(
		self,
		mask: MemMask<Self::m32s>,
		ptr: *mut c32,
		values: Self::c32s,
	) {
		self.mask_store_ptr_u32s(mask, ptr as _, cast!(values))
	}

	/// # Safety
	///
	/// See the trait-level safety documentation.
	#[inline(always)]
	unsafe fn mask_store_ptr_c64s(
		self,
		mask: MemMask<Self::m64s>,
		ptr: *mut c64,
		values: Self::c64s,
	) {
		self.mask_store_ptr_u64s(mask, ptr as _, cast!(values))
	}
758
759	/// # Safety
760	///
761	/// See the trait-level safety documentation.
762	#[inline(always)]
763	unsafe fn mask_store_ptr_u32s(
764		self,
765		mask: MemMask<Self::m32s>,
766		ptr: *mut u32,
767		values: Self::u32s,
768	) {
769		match mask.store {
770			Some(store) => avx_st_u32s(ptr, values, store),
771			None => _mm256_maskstore_epi32(ptr as *mut i32, cast!(mask.mask), cast!(values)),
772		}
773	}
774
	/// # Safety
	///
	/// See the trait-level safety documentation.
	// A 64-bit masked store is the 32-bit one with the mask reinterpreted as
	// u32 lanes; see `mask_load_ptr_u64s` for the same trick on loads.
	#[inline(always)]
	unsafe fn mask_store_ptr_u64s(
		self,
		mask: MemMask<Self::m64s>,
		ptr: *mut u64,
		values: Self::u64s,
	) {
		self.mask_store_ptr_u32s(
			MemMask {
				mask: cast!(mask.mask),
				load: mask.load,
				store: mask.store,
			},
			ptr as _,
			cast!(values),
		)
	}
795
	// Element-wise float min/max via `vmaxps`/`vminps`/`vmaxpd`/`vminpd`.
	// NOTE(review): these intrinsics return the second operand when a lane
	// compares unordered (NaN) — callers relying on NaN propagation should
	// confirm that matches the trait contract.

	#[inline(always)]
	fn max_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
		cast!(self.avx._mm256_max_ps(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn max_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
		cast!(self.avx._mm256_max_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn min_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
		cast!(self.avx._mm256_min_ps(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn min_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
		cast!(self.avx._mm256_min_pd(cast!(a), cast!(b)))
	}
815
	/// Computes `a * b + c` element-wise over packed complex numbers, using
	/// the same broadcast/swap kernel as [`Self::conj_mul_add_c32s`] but with
	/// `fmaddsub`, whose alternating subtract/add signs produce the plain
	/// (non-conjugated) complex product.
	#[inline(always)]
	fn mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm256_permute_ps::<0b10_11_00_01>(xy);
		let aa = self.avx._mm256_moveldup_ps(ab);
		let bb = self.avx._mm256_movehdup_ps(ab);

		cast!(
			self.fma
				._mm256_fmaddsub_ps(aa, xy, self.fma._mm256_fmaddsub_ps(bb, yx, cast!(c)))
		)
	}

	/// `f64` analogue of [`Self::mul_add_c32s`]: `a * b + c`.
	#[inline(always)]
	fn mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm256_permute_pd::<0b0101>(xy);
		let aa = self.avx._mm256_unpacklo_pd(ab, ab);
		let bb = self.avx._mm256_unpackhi_pd(ab, ab);

		cast!(
			self.fma
				._mm256_fmaddsub_pd(aa, xy, self.fma._mm256_fmaddsub_pd(bb, yx, cast!(c)))
		)
	}

	// The "effortless" fma variants coincide with the fused ones on this
	// backend, since FMA hardware is guaranteed by the V3 feature set.

	#[inline(always)]
	fn mul_add_e_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
		self.mul_add_f32s(a, b, c)
	}

	#[inline(always)]
	fn mul_add_e_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
		self.mul_add_f64s(a, b, c)
	}

	/// Fused `a * b + c` (single rounding, `vfmadd`).
	#[inline(always)]
	fn mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
		cast!(self.fma._mm256_fmadd_ps(cast!(a), cast!(b), cast!(c)))
	}

	/// Fused `a * b + c` (single rounding, `vfmadd`).
	#[inline(always)]
	fn mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
		cast!(self.fma._mm256_fmadd_pd(cast!(a), cast!(b), cast!(c)))
	}
865
	/// Element-wise complex product `a * b`; same kernel as
	/// [`Self::mul_add_c32s`] with a plain multiply in the inner step.
	#[inline(always)]
	fn mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm256_permute_ps::<0b10_11_00_01>(xy);
		let aa = self.avx._mm256_moveldup_ps(ab);
		let bb = self.avx._mm256_movehdup_ps(ab);

		cast!(
			self.fma
				._mm256_fmaddsub_ps(aa, xy, self.avx._mm256_mul_ps(bb, yx))
		)
	}

	/// Element-wise complex product `a * b`; `f64` analogue of
	/// [`Self::mul_c32s`].
	#[inline(always)]
	fn mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm256_permute_pd::<0b0101>(xy);
		let aa = self.avx._mm256_unpacklo_pd(ab, ab);
		let bb = self.avx._mm256_unpackhi_pd(ab, ab);

		cast!(
			self.fma
				._mm256_fmaddsub_pd(aa, xy, self.avx._mm256_mul_pd(bb, yx))
		)
	}

	/// Element-wise `f32` multiplication (`vmulps`).
	#[inline(always)]
	fn mul_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
		cast!(self.avx._mm256_mul_ps(cast!(a), cast!(b)))
	}

	/// Element-wise `f64` multiplication (`vmulpd`).
	#[inline(always)]
	fn mul_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
		cast!(self.avx._mm256_mul_pd(cast!(a), cast!(b)))
	}
905
	/// Complex negation: XOR with `-0.0` in every lane flips the sign bit of
	/// both the real and imaginary components.
	#[inline(always)]
	fn neg_c32s(self, a: Self::c32s) -> Self::c32s {
		self.xor_f32s(a, self.splat_f32s(-0.0))
	}

	/// Complex negation; see [`Self::neg_c32s`].
	#[inline(always)]
	fn neg_c64s(self, a: Self::c64s) -> Self::c64s {
		self.xor_f64s(a, self.splat_f64s(-0.0))
	}
915
	// Bitwise NOT implemented as XOR against an all-ones vector
	// (`_mm256_set1_epi32(-1)`); like AND/OR, the same bit-level operation
	// serves every lane type through `cast!`.

	#[inline(always)]
	fn not_m32s(self, a: Self::m32s) -> Self::m32s {
		cast!(
			self.avx
				._mm256_xor_pd(cast!(self.avx._mm256_set1_epi32(-1)), cast!(a),)
		)
	}

	#[inline(always)]
	fn not_m64s(self, a: Self::m64s) -> Self::m64s {
		cast!(
			self.avx
				._mm256_xor_pd(cast!(self.avx._mm256_set1_epi32(-1)), cast!(a),)
		)
	}

	#[inline(always)]
	fn not_u32s(self, a: Self::u32s) -> Self::u32s {
		cast!(
			self.avx
				._mm256_xor_pd(cast!(self.avx._mm256_set1_epi32(-1)), cast!(a),)
		)
	}

	#[inline(always)]
	fn not_u64s(self, a: Self::u64s) -> Self::u64s {
		cast!(
			self.avx
				._mm256_xor_pd(cast!(self.avx._mm256_set1_epi32(-1)), cast!(a),)
		)
	}
947
	// Bitwise OR is type-agnostic on the raw bits, so all four variants
	// reuse the same `vorpd` instruction through `cast!`.

	#[inline(always)]
	fn or_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
		cast!(self.avx._mm256_or_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn or_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
		cast!(self.avx._mm256_or_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn or_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
		cast!(self.avx._mm256_or_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn or_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
		cast!(self.avx._mm256_or_pd(cast!(a), cast!(b)))
	}
967
	/// Partial load of up to 8 `u32`s; see [`avx_load_u32s`].
	#[inline(always)]
	fn partial_load_u32s(self, slice: &[u32]) -> Self::u32s {
		avx_load_u32s(self.avx2, slice)
	}

	// The u64 variants reinterpret the slice as twice as many u32s and reuse
	// the u32 routines; lane layout makes the two views equivalent.

	#[inline(always)]
	fn partial_load_u64s(self, slice: &[u64]) -> Self::u64s {
		cast!(self.partial_load_u32s(bytemuck::cast_slice(slice)))
	}

	/// Partial store of up to 8 `u32`s; see [`avx_store_u32s`].
	#[inline(always)]
	fn partial_store_u32s(self, slice: &mut [u32], values: Self::u32s) {
		avx_store_u32s(self.avx2, slice, values)
	}

	#[inline(always)]
	fn partial_store_u64s(self, slice: &mut [u64], values: Self::u64s) {
		self.partial_store_u32s(bytemuck::cast_slice_mut(slice), cast!(values))
	}
987
	// Horizontal max reductions: fold the 256-bit vector into 128 bits by
	// taking the max of its two halves, then finish with the 128-bit
	// reduction from the `V2` level, reached through `Deref` via `(*self)`.

	#[inline(always)]
	fn reduce_max_c32s(self, a: Self::c32s) -> c32 {
		let a: __m256 = cast!(a);
		let r = self.sse._mm_max_ps(
			self.avx._mm256_castps256_ps128(a),
			self.avx._mm256_extractf128_ps::<1>(a),
		);
		(*self).reduce_max_c32x2(cast!(r))
	}

	#[inline(always)]
	fn reduce_max_c64s(self, a: Self::c64s) -> c64 {
		let a: __m256d = cast!(a);
		let r = self.sse2._mm_max_pd(
			self.avx._mm256_castpd256_pd128(a),
			self.avx._mm256_extractf128_pd::<1>(a),
		);
		(*self).reduce_max_c64x1(cast!(r))
	}

	#[inline(always)]
	fn reduce_max_f32s(self, a: Self::f32s) -> f32 {
		let a: __m256 = cast!(a);
		let r = self.sse._mm_max_ps(
			self.avx._mm256_castps256_ps128(a),
			self.avx._mm256_extractf128_ps::<1>(a),
		);
		(*self).reduce_max_f32x4(cast!(r))
	}

	#[inline(always)]
	fn reduce_max_f64s(self, a: Self::f64s) -> f64 {
		let a: __m256d = cast!(a);
		let r = self.sse2._mm_max_pd(
			self.avx._mm256_castpd256_pd128(a),
			self.avx._mm256_extractf128_pd::<1>(a),
		);
		(*self).reduce_max_f64x2(cast!(r))
	}
1027
1028	#[inline(always)]
1029	fn reduce_min_c32s(self, a: Self::c32s) -> c32 {
1030		let a: __m256 = cast!(a);
1031		let r = self.sse._mm_min_ps(
1032			self.avx._mm256_castps256_ps128(a),
1033			self.avx._mm256_extractf128_ps::<1>(a),
1034		);
1035		(*self).reduce_min_c32x2(cast!(r))
1036	}
1037
1038	#[inline(always)]
1039	fn reduce_min_c64s(self, a: Self::c64s) -> c64 {
1040		let a: __m256d = cast!(a);
1041		let r = self.sse2._mm_min_pd(
1042			self.avx._mm256_castpd256_pd128(a),
1043			self.avx._mm256_extractf128_pd::<1>(a),
1044		);
1045		(*self).reduce_min_c64x1(cast!(r))
1046	}
1047
1048	#[inline(always)]
1049	fn reduce_min_f32s(self, a: Self::f32s) -> f32 {
1050		let a: __m256 = cast!(a);
1051		let r = self.sse._mm_min_ps(
1052			self.avx._mm256_castps256_ps128(a),
1053			self.avx._mm256_extractf128_ps::<1>(a),
1054		);
1055		(*self).reduce_min_f32x4(cast!(r))
1056	}
1057
1058	#[inline(always)]
1059	fn reduce_min_f64s(self, a: Self::f64s) -> f64 {
1060		let a: __m256d = cast!(a);
1061		let r = self.sse2._mm_min_pd(
1062			self.avx._mm256_castpd256_pd128(a),
1063			self.avx._mm256_extractf128_pd::<1>(a),
1064		);
1065		(*self).reduce_min_f64x2(cast!(r))
1066	}
1067
1068	#[inline(always)]
1069	fn reduce_product_f32s(self, a: Self::f32s) -> f32 {
1070		let a: __m256 = cast!(a);
1071		let r = self.sse._mm_mul_ps(
1072			self.avx._mm256_castps256_ps128(a),
1073			self.avx._mm256_extractf128_ps::<1>(a),
1074		);
1075		(*self).reduce_product_f32x4(cast!(r))
1076	}
1077
1078	#[inline(always)]
1079	fn reduce_product_f64s(self, a: Self::f64s) -> f64 {
1080		let a: __m256d = cast!(a);
1081		let r = self.sse2._mm_mul_pd(
1082			self.avx._mm256_castpd256_pd128(a),
1083			self.avx._mm256_extractf128_pd::<1>(a),
1084		);
1085		(*self).reduce_product_f64x2(cast!(r))
1086	}
1087
1088	#[inline(always)]
1089	fn reduce_sum_c32s(self, a: Self::c32s) -> c32 {
1090		let a: __m256 = cast!(a);
1091		let r = self.sse._mm_add_ps(
1092			self.avx._mm256_castps256_ps128(a),
1093			self.avx._mm256_extractf128_ps::<1>(a),
1094		);
1095		(*self).reduce_sum_c32x2(cast!(r))
1096	}
1097
1098	#[inline(always)]
1099	fn reduce_sum_c64s(self, a: Self::c64s) -> c64 {
1100		let a: __m256d = cast!(a);
1101		let r = self.sse2._mm_add_pd(
1102			self.avx._mm256_castpd256_pd128(a),
1103			self.avx._mm256_extractf128_pd::<1>(a),
1104		);
1105		(*self).reduce_sum_c64x1(cast!(r))
1106	}
1107
1108	#[inline(always)]
1109	fn reduce_sum_f32s(self, a: Self::f32s) -> f32 {
1110		let a: __m256 = cast!(a);
1111		let r = self.sse._mm_add_ps(
1112			self.avx._mm256_castps256_ps128(a),
1113			self.avx._mm256_extractf128_ps::<1>(a),
1114		);
1115		(*self).reduce_sum_f32x4(cast!(r))
1116	}
1117
1118	#[inline(always)]
1119	fn reduce_sum_f64s(self, a: Self::f64s) -> f64 {
1120		let a: __m256d = cast!(a);
1121		let r = self.sse2._mm_add_pd(
1122			self.avx._mm256_castpd256_pd128(a),
1123			self.avx._mm256_extractf128_pd::<1>(a),
1124		);
1125		(*self).reduce_sum_f64x2(cast!(r))
1126	}
1127
	// Lane rotations implemented as a full-vector byte shuffle. The index into
	// `AVX2_ROTATE_IDX` is `element_size_in_bytes * (amount % lane_count)`, i.e.
	// the byte offset of the rotation; `avx2_pshufb` (see top of file) emulates a
	// cross-lane `pshufb` on the 256-bit vector.
	#[inline(always)]
	fn rotate_right_c32s(self, a: Self::c32s, amount: usize) -> Self::c32s {
		cast!(avx2_pshufb(
			self,
			cast!(a),
			cast!(AVX2_ROTATE_IDX[8 * (amount % 4)]),
		))
	}

	#[inline(always)]
	fn rotate_right_c64s(self, a: Self::c64s, amount: usize) -> Self::c64s {
		cast!(avx2_pshufb(
			self,
			cast!(a),
			cast!(AVX2_ROTATE_IDX[16 * (amount % 2)]),
		))
	}

	#[inline(always)]
	fn rotate_right_u32s(self, a: Self::u32s, amount: usize) -> Self::u32s {
		cast!(avx2_pshufb(
			self,
			cast!(a),
			cast!(AVX2_ROTATE_IDX[4 * (amount % 8)]),
		))
	}

	#[inline(always)]
	fn rotate_right_u64s(self, a: Self::u64s, amount: usize) -> Self::u64s {
		cast!(avx2_pshufb(
			self,
			cast!(a),
			cast!(AVX2_ROTATE_IDX[8 * (amount % 4)]),
		))
	}
1163
1164	#[inline(always)]
1165	fn select_u32s_m32s(
1166		self,
1167		mask: Self::m32s,
1168		if_true: Self::u32s,
1169		if_false: Self::u32s,
1170	) -> Self::u32s {
1171		let mask: __m256 = cast!(mask);
1172		let if_true: __m256 = cast!(if_true);
1173		let if_false: __m256 = cast!(if_false);
1174
1175		cast!(self.avx._mm256_blendv_ps(if_false, if_true, mask))
1176	}
1177
1178	#[inline(always)]
1179	fn select_u64s_m64s(
1180		self,
1181		mask: Self::m64s,
1182		if_true: Self::u64s,
1183		if_false: Self::u64s,
1184	) -> Self::u64s {
1185		let mask: __m256d = cast!(mask);
1186		let if_true: __m256d = cast!(if_true);
1187		let if_false: __m256d = cast!(if_false);
1188
1189		cast!(self.avx._mm256_blendv_pd(if_false, if_true, mask))
1190	}
1191
	// Broadcast a scalar into every lane of a 256-bit vector. A `c32` is two
	// `f32`s (8 bytes), so it is splat as if it were a single `f64`; a `c64` is
	// 16 bytes and is broadcast as a whole 128-bit lane.
	#[inline(always)]
	fn splat_c32s(self, value: c32) -> Self::c32s {
		cast!(self.splat_f64s(cast!(value)))
	}

	#[inline(always)]
	fn splat_c64s(self, value: c64) -> Self::c64s {
		cast!(self.avx._mm256_broadcast_pd(&cast!(value)))
	}

	#[inline(always)]
	fn splat_f32s(self, value: f32) -> Self::f32s {
		cast!(self.avx._mm256_set1_ps(value))
	}

	#[inline(always)]
	fn splat_f64s(self, value: f64) -> Self::f64s {
		cast!(self.avx._mm256_set1_pd(value))
	}

	#[inline(always)]
	fn splat_u32s(self, value: u32) -> Self::u32s {
		cast!(self.avx._mm256_set1_epi32(value as i32))
	}

	#[inline(always)]
	fn splat_u64s(self, value: u64) -> Self::u64s {
		cast!(self.avx._mm256_set1_epi64x(value as i64))
	}
1221
	// Lane-wise subtraction. Complex subtraction is component-wise, so it reuses
	// the float paths; integer subtraction is wrapping (two's complement).
	#[inline(always)]
	fn sub_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
		self.sub_f32s(a, b)
	}

	#[inline(always)]
	fn sub_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
		self.sub_f64s(a, b)
	}

	#[inline(always)]
	fn sub_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
		cast!(self.avx._mm256_sub_ps(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn sub_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
		cast!(self.avx._mm256_sub_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn sub_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
		cast!(self.avx2._mm256_sub_epi32(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn sub_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
		cast!(self.avx2._mm256_sub_epi64(cast!(a), cast!(b)))
	}
1251
	// Swap the real and imaginary parts within each complex lane. The permute
	// immediates (0b10_11_00_01 for f32 pairs, 0b0101 for f64 pairs) exchange
	// adjacent elements without crossing 128-bit lanes.
	#[inline(always)]
	fn swap_re_im_c32s(self, a: Self::c32s) -> Self::c32s {
		cast!(self.avx._mm256_permute_ps::<0b10_11_00_01>(cast!(a)))
	}

	#[inline(always)]
	fn swap_re_im_c64s(self, a: Self::c64s) -> Self::c64s {
		cast!(self.avx._mm256_permute_pd::<0b0101>(cast!(a)))
	}
1261
	#[inline(always)]
	fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output {
		// Adapter turning the `WithSimd` op into a `NullaryFnOnce`, capturing the
		// SIMD token so it can be handed back to the op when called.
		struct Impl<Op> {
			this: V3,
			op: Op,
		}
		impl<Op: WithSimd> crate::NullaryFnOnce for Impl<Op> {
			type Output = Op::Output;

			#[inline(always)]
			fn call(self) -> Self::Output {
				self.op.with_simd(self.this)
			}
		}
		// NOTE(review): the argument is a `NullaryFnOnce`, not a `WithSimd`, so
		// this should resolve to an inherent `vectorize` (presumably generated by
		// `simd_type!` with the matching `#[target_feature]`s) rather than recurse
		// into this trait method — confirm against the macro expansion.
		self.vectorize(Impl { this: self, op })
	}
1278
	// Widening multiply and wrapping dynamic shifts. The `amount & 31` mask gives
	// the shifts wrapping semantics (amounts are taken modulo the bit width)
	// instead of the intrinsic's zero-on-overflow behavior.
	#[inline(always)]
	fn widening_mul_u32s(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s) {
		self.widening_mul_u32x8(a, b)
	}

	#[inline(always)]
	fn wrapping_dyn_shl_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
		self.shl_dyn_u32x8(a, self.and_u32x8(amount, self.splat_u32x8(32 - 1)))
	}

	#[inline(always)]
	fn wrapping_dyn_shr_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
		self.shr_dyn_u32x8(a, self.and_u32x8(amount, self.splat_u32x8(32 - 1)))
	}
1293
	// Bitwise XOR for 256-bit masks and unsigned integers; like the OR family,
	// the float-domain `_mm256_xor_pd` is bit-exact for any element type.
	#[inline(always)]
	fn xor_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
		cast!(self.avx._mm256_xor_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn xor_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
		cast!(self.avx._mm256_xor_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn xor_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
		cast!(self.avx._mm256_xor_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn xor_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
		cast!(self.avx._mm256_xor_pd(cast!(a), cast!(b)))
	}
1313
	// Signed lane-wise comparisons, delegating to the width-suffixed helpers.
	#[inline(always)]
	fn greater_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
		self.cmp_ge_i32x8(a, b)
	}

	#[inline(always)]
	fn greater_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
		self.cmp_ge_i64x4(a, b)
	}

	#[inline(always)]
	fn greater_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
		self.cmp_gt_i32x8(a, b)
	}

	#[inline(always)]
	fn greater_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
		self.cmp_gt_i64x4(a, b)
	}

	#[inline(always)]
	fn less_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
		self.cmp_le_i32x8(a, b)
	}

	#[inline(always)]
	fn less_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
		self.cmp_le_i64x4(a, b)
	}

	#[inline(always)]
	fn less_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
		self.cmp_lt_i32x8(a, b)
	}

	#[inline(always)]
	fn less_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
		self.cmp_lt_i64x4(a, b)
	}
1353}
1354
/// [`V3`] viewed through a 128-bit register width (its `Simd` impl uses the
/// `x4`/`x2` vector types).
#[derive(Copy, Clone, Debug)]
#[repr(transparent)]
pub struct V3_128b(pub V3);

/// [`V3`] viewed through a 256-bit register width.
#[derive(Copy, Clone, Debug)]
#[repr(transparent)]
pub struct V3_256b(pub V3);

/// [`V3`] viewed through a 512-bit logical width.
// NOTE(review): presumably emulated with pairs of 256-bit registers — the impl
// is not visible in this chunk; confirm there.
#[derive(Copy, Clone, Debug)]
#[repr(transparent)]
pub struct V3_512b(pub V3);
1366
// The width wrappers are `repr(transparent)` newtypes over `V3`; `Deref` lets
// them use all of `V3`'s inherent intrinsic accessors (`self.sse`, `self.avx`,
// ...) directly.
impl core::ops::Deref for V3_128b {
	type Target = V3;

	#[inline]
	fn deref(&self) -> &Self::Target {
		&self.0
	}
}

impl core::ops::Deref for V3_256b {
	type Target = V3;

	#[inline]
	fn deref(&self) -> &Self::Target {
		&self.0
	}
}

impl core::ops::Deref for V3_512b {
	type Target = V3;

	#[inline]
	fn deref(&self) -> &Self::Target {
		&self.0
	}
}
1393
// Sealed-trait markers: permits implementing the crate's sealed traits for the
// width wrappers while keeping them unimplementable downstream.
impl Seal for V3_128b {}
impl Seal for V3_256b {}
impl Seal for V3_512b {}
1397
impl Simd for V3_128b {
	// All associated vector types are the 128-bit (`x4`/`x2`) variants.
	type c32s = f32x4;
	type c64s = f64x2;
	type f32s = f32x4;
	type f64s = f64x2;
	type i32s = i32x4;
	type i64s = i64x2;
	type m32s = m32x4;
	type m64s = m64x2;
	type u32s = u32x4;
	type u64s = u64x2;

	// x86-64 exposes 16 xmm/ymm architectural registers.
	const REGISTER_COUNT: usize = 16;
1411
	// abs2: squared magnitude of each complex lane. Square component-wise, swap
	// re/im within each pair, and add — both halves of each lane end up holding
	// re² + im². abs_max follows the same shape with |.| and max instead.
	#[inline(always)]
	fn abs2_c32s(self, a: Self::c32s) -> Self::c32s {
		let sqr = self.mul_f32s(a, a);
		// 0b10_11_00_01 swaps adjacent elements, i.e. re <-> im per complex
		let sqr_rev = self
			.sse
			._mm_shuffle_ps::<0b10_11_00_01>(cast!(sqr), cast!(sqr));
		self.add_f32s(sqr, cast!(sqr_rev))
	}

	#[inline(always)]
	fn abs2_c64s(self, a: Self::c64s) -> Self::c64s {
		let sqr = self.mul_f64s(a, a);
		let sqr_rev = self.sse2._mm_shuffle_pd::<0b01>(cast!(sqr), cast!(sqr));
		self.add_f64s(sqr, cast!(sqr_rev))
	}

	#[inline(always)]
	fn abs_max_c32s(self, a: Self::c32s) -> Self::c32s {
		let sqr = self.abs_f32s(a);
		let sqr_rev = self
			.sse
			._mm_shuffle_ps::<0b10_11_00_01>(cast!(sqr), cast!(sqr));
		self.max_f32s(sqr, cast!(sqr_rev))
	}

	#[inline(always)]
	fn abs_max_c64s(self, a: Self::c64s) -> Self::c64s {
		let sqr = self.abs_f64s(a);
		let sqr_rev = self.sse2._mm_shuffle_pd::<0b01>(cast!(sqr), cast!(sqr));
		self.max_f64s(sqr, cast!(sqr_rev))
	}
1443
	// Lane-wise addition; complex addition is component-wise, integer addition
	// wraps.
	#[inline(always)]
	fn add_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
		self.add_f32s(a, b)
	}

	#[inline(always)]
	fn add_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
		self.add_f64s(a, b)
	}

	#[inline(always)]
	fn add_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
		self.add_f32x4(a, b)
	}

	#[inline(always)]
	fn add_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
		self.add_f64x2(a, b)
	}

	#[inline(always)]
	fn add_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
		self.wrapping_add_u32x4(a, b)
	}

	#[inline(always)]
	fn add_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
		self.wrapping_add_u64x2(a, b)
	}
1473
	// Bitwise AND for 128-bit masks and unsigned integers; `_mm_and_pd` is a pure
	// bitwise op, so it is bit-exact for any element type.
	#[inline(always)]
	fn and_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
		cast!(self.sse2._mm_and_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn and_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
		cast!(self.sse2._mm_and_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn and_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
		cast!(self.sse2._mm_and_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn and_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
		cast!(self.sse2._mm_and_pd(cast!(a), cast!(b)))
	}
1493
	// Complex conjugate: flip the sign bit of the imaginary component only, via
	// XOR with a (0.0, -0.0) pattern.
	#[inline(always)]
	fn conj_c32s(self, a: Self::c32s) -> Self::c32s {
		self.xor_f32s(a, self.splat_c32s(c32 { re: 0.0, im: -0.0 }))
	}

	#[inline(always)]
	fn conj_c64s(self, a: Self::c64s) -> Self::c64s {
		self.xor_f64s(a, self.splat_c64s(c64 { re: 0.0, im: -0.0 }))
	}
1503
	// Conjugate complex multiply (optionally fused with an accumulator):
	// conj(a) * b [+ c]. Per complex lane: yx = b with re/im swapped,
	// aa = re(a) duplicated into both slots, bb = im(a) duplicated; the nested
	// fmsubadd chain then produces (ar*br + ai*bi, ar*bi - ai*br).
	#[inline(always)]
	fn conj_mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm_permute_ps::<0b10_11_00_01>(xy);
		let aa = self.sse3._mm_moveldup_ps(ab);
		let bb = self.sse3._mm_movehdup_ps(ab);

		cast!(
			self.fma
				._mm_fmsubadd_ps(aa, xy, self.fma._mm_fmsubadd_ps(bb, yx, cast!(c)))
		)
	}

	#[inline(always)]
	fn conj_mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm_permute_pd::<0b01>(xy);
		// unpacklo/unpackhi duplicate the low (re) / high (im) double
		let aa = self.sse2._mm_unpacklo_pd(ab, ab);
		let bb = self.sse2._mm_unpackhi_pd(ab, ab);

		cast!(
			self.fma
				._mm_fmsubadd_pd(aa, xy, self.fma._mm_fmsubadd_pd(bb, yx, cast!(c)))
		)
	}

	// Same as conj_mul_add but with a plain multiply in the inner step (no
	// accumulator).
	#[inline(always)]
	fn conj_mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm_permute_ps::<0b10_11_00_01>(xy);
		let aa = self.sse3._mm_moveldup_ps(ab);
		let bb = self.sse3._mm_movehdup_ps(ab);

		cast!(
			self.fma
				._mm_fmsubadd_ps(aa, xy, self.sse._mm_mul_ps(bb, yx))
		)
	}

	#[inline(always)]
	fn conj_mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm_permute_pd::<0b01>(xy);
		let aa = self.sse2._mm_unpacklo_pd(ab, ab);
		let bb = self.sse2._mm_unpackhi_pd(ab, ab);

		cast!(
			self.fma
				._mm_fmsubadd_pd(aa, xy, self.sse2._mm_mul_pd(bb, yx))
		)
	}
1563
	// Lane-wise division and float equality, delegating to the width-suffixed
	// helpers.
	#[inline(always)]
	fn div_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
		self.div_f32x4(a, b)
	}

	#[inline(always)]
	fn div_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
		self.div_f64x2(a, b)
	}

	#[inline(always)]
	fn equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
		self.cmp_eq_f32x4(a, b)
	}

	#[inline(always)]
	fn equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
		self.cmp_eq_f64x2(a, b)
	}
1583
	// Unsigned-integer and float lane-wise comparisons, delegating to the
	// width-suffixed helpers.
	#[inline(always)]
	fn greater_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
		self.cmp_ge_u32x4(a, b)
	}

	#[inline(always)]
	fn greater_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
		self.cmp_ge_u64x2(a, b)
	}

	#[inline(always)]
	fn greater_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
		self.cmp_gt_u32x4(a, b)
	}

	#[inline(always)]
	fn greater_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
		self.cmp_gt_u64x2(a, b)
	}

	#[inline(always)]
	fn less_than_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
		self.cmp_lt_f32x4(a, b)
	}

	#[inline(always)]
	fn less_than_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
		self.cmp_lt_f64x2(a, b)
	}

	#[inline(always)]
	fn less_than_or_equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
		self.cmp_le_f32x4(a, b)
	}

	#[inline(always)]
	fn less_than_or_equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
		self.cmp_le_f64x2(a, b)
	}

	#[inline(always)]
	fn less_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
		self.cmp_le_u32x4(a, b)
	}

	#[inline(always)]
	fn less_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
		self.cmp_le_u64x2(a, b)
	}

	#[inline(always)]
	fn less_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
		self.cmp_lt_u32x4(a, b)
	}

	#[inline(always)]
	fn less_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
		self.cmp_lt_u64x2(a, b)
	}
1643
	// Masked loads. The `MemMask::load` override (when present) routes through the
	// scalar-ish `avx_ld_u32s` helper; otherwise AVX2's `maskload` is used, which
	// zeroes the lanes whose mask bit is clear. The c32/c64/u64 variants
	// reinterpret down to the u32 path.
	#[inline(always)]
	unsafe fn mask_load_ptr_c32s(self, mask: MemMask<Self::m32s>, ptr: *const c32) -> Self::c32s {
		cast!(self.mask_load_ptr_u32s(mask, ptr as _))
	}

	#[inline(always)]
	unsafe fn mask_load_ptr_c64s(self, mask: MemMask<Self::m64s>, ptr: *const c64) -> Self::c64s {
		cast!(self.mask_load_ptr_u64s(mask, ptr as _))
	}

	#[inline(always)]
	unsafe fn mask_load_ptr_u32s(self, mask: MemMask<Self::m32s>, ptr: *const u32) -> Self::u32s {
		match mask.load {
			Some(load) => cast_lossy(avx_ld_u32s(ptr, load)),
			None => cast!(self.avx2._mm_maskload_epi32(ptr as _, cast!(mask.mask))),
		}
	}

	#[inline(always)]
	unsafe fn mask_load_ptr_u64s(self, mask: MemMask<Self::m64s>, ptr: *const u64) -> Self::u64s {
		cast!(self.mask_load_ptr_u32s(
			MemMask {
				mask: cast!(mask.mask),
				load: mask.load,
				store: mask.store
			},
			ptr as _
		))
	}
1673
	// Masked stores, mirroring the masked loads above: an explicit
	// `MemMask::store` routes through `avx_st_u32s`, otherwise AVX2 `maskstore`
	// writes only the lanes whose mask bit is set.
	#[inline(always)]
	unsafe fn mask_store_ptr_c32s(
		self,
		mask: MemMask<Self::m32s>,
		ptr: *mut c32,
		values: Self::c32s,
	) {
		self.mask_store_ptr_u32s(mask, ptr as _, cast!(values));
	}

	#[inline(always)]
	unsafe fn mask_store_ptr_c64s(
		self,
		mask: MemMask<Self::m64s>,
		ptr: *mut c64,
		values: Self::c64s,
	) {
		self.mask_store_ptr_u64s(mask, ptr as _, cast!(values))
	}

	#[inline(always)]
	unsafe fn mask_store_ptr_u32s(
		self,
		mask: MemMask<Self::m32s>,
		ptr: *mut u32,
		values: Self::u32s,
	) {
		match mask.store {
			// NOTE(review): `avx_st_u32s` appears to take a 256-bit payload, so the
			// 128-bit `values` is widened with a zeroed upper half — confirm against
			// the helper's definition.
			Some(store) => avx_st_u32s(ptr, cast!([values, self.splat_u32s(0)]), store),
			None => self
				.avx2
				._mm_maskstore_epi32(ptr as _, cast!(mask.mask), cast!(values)),
		}
	}

	#[inline(always)]
	unsafe fn mask_store_ptr_u64s(
		self,
		mask: MemMask<Self::m64s>,
		ptr: *mut u64,
		values: Self::u64s,
	) {
		self.mask_store_ptr_u32s(
			MemMask {
				mask: cast!(mask.mask),
				load: mask.load,
				store: mask.store,
			},
			ptr as _,
			cast!(values),
		)
	}
1726
	// Lane-wise float min/max, delegating to the width-suffixed helpers.
	#[inline(always)]
	fn max_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
		self.max_f32x4(a, b)
	}

	#[inline(always)]
	fn max_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
		self.max_f64x2(a, b)
	}

	#[inline(always)]
	fn min_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
		self.min_f32x4(a, b)
	}

	#[inline(always)]
	fn min_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
		self.min_f64x2(a, b)
	}
1746
	// Fused complex multiply-add: a * b + c per lane, using the same shuffle
	// decomposition as conj_mul_add but with fmaddsub so the cross term is
	// subtracted from the real part and added to the imaginary part.
	#[inline(always)]
	fn mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm_permute_ps::<0b10_11_00_01>(xy);
		let aa = self.sse3._mm_moveldup_ps(ab);
		let bb = self.sse3._mm_movehdup_ps(ab);

		cast!(
			self.fma
				._mm_fmaddsub_ps(aa, xy, self.fma._mm_fmaddsub_ps(bb, yx, cast!(c)))
		)
	}

	#[inline(always)]
	fn mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm_permute_pd::<0b01>(xy);
		let aa = self.sse2._mm_unpacklo_pd(ab, ab);
		let bb = self.sse2._mm_unpackhi_pd(ab, ab);

		cast!(
			self.fma
				._mm_fmaddsub_pd(aa, xy, self.fma._mm_fmaddsub_pd(bb, yx, cast!(c)))
		)
	}

	// The `_e` ("either") variants allow but do not require fusion; here they
	// just reuse the fused implementations.
	#[inline(always)]
	fn mul_add_e_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
		self.mul_add_f32s(a, b, c)
	}

	#[inline(always)]
	fn mul_add_e_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
		self.mul_add_f64s(a, b, c)
	}

	#[inline(always)]
	fn mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
		self.mul_add_f32x4(a, b, c)
	}

	#[inline(always)]
	fn mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
		self.mul_add_f64x2(a, b, c)
	}
1796
	// Complex multiply a * b: like mul_add but the innermost step is a plain
	// multiply (no accumulator).
	#[inline(always)]
	fn mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm_permute_ps::<0b10_11_00_01>(xy);
		let aa = self.sse3._mm_moveldup_ps(ab);
		let bb = self.sse3._mm_movehdup_ps(ab);

		cast!(
			self.fma
				._mm_fmaddsub_ps(aa, xy, self.sse._mm_mul_ps(bb, yx))
		)
	}

	#[inline(always)]
	fn mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
		let ab = cast!(a);
		let xy = cast!(b);

		let yx = self.avx._mm_permute_pd::<0b01>(xy);
		let aa = self.sse2._mm_unpacklo_pd(ab, ab);
		let bb = self.sse2._mm_unpackhi_pd(ab, ab);

		cast!(
			self.fma
				._mm_fmaddsub_pd(aa, xy, self.sse2._mm_mul_pd(bb, yx))
		)
	}

	#[inline(always)]
	fn mul_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
		self.mul_f32x4(a, b)
	}

	#[inline(always)]
	fn mul_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
		self.mul_f64x2(a, b)
	}
1836
	// Negation of both components via sign-bit flip (XOR with -0.0 in every
	// lane).
	#[inline(always)]
	fn neg_c32s(self, a: Self::c32s) -> Self::c32s {
		self.xor_f32s(a, self.splat_f32s(-0.0))
	}

	#[inline(always)]
	fn neg_c64s(self, a: Self::c64s) -> Self::c64s {
		self.xor_f64s(a, self.splat_f64s(-0.0))
	}
1846
1847	#[inline(always)]
1848	fn not_m32s(self, a: Self::m32s) -> Self::m32s {
1849		cast!(
1850			self.sse2
1851				._mm_xor_pd(cast!(self.sse2._mm_set1_epi32(-1)), cast!(a),)
1852		)
1853	}
1854
1855	#[inline(always)]
1856	fn not_m64s(self, a: Self::m64s) -> Self::m64s {
1857		cast!(
1858			self.sse2
1859				._mm_xor_pd(cast!(self.sse2._mm_set1_epi32(-1)), cast!(a),)
1860		)
1861	}
1862
1863	#[inline(always)]
1864	fn not_u32s(self, a: Self::u32s) -> Self::u32s {
1865		cast!(
1866			self.sse2
1867				._mm_xor_pd(cast!(self.sse2._mm_set1_epi32(-1)), cast!(a),)
1868		)
1869	}
1870
1871	#[inline(always)]
1872	fn not_u64s(self, a: Self::u64s) -> Self::u64s {
1873		cast!(
1874			self.sse2
1875				._mm_xor_pd(cast!(self.sse2._mm_set1_epi32(-1)), cast!(a),)
1876		)
1877	}
1878
	// Bitwise OR for 128-bit masks and unsigned integers; bit-exact for any
	// element type.
	#[inline(always)]
	fn or_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
		cast!(self.sse2._mm_or_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn or_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
		cast!(self.sse2._mm_or_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn or_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
		cast!(self.sse2._mm_or_pd(cast!(a), cast!(b)))
	}

	#[inline(always)]
	fn or_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
		cast!(self.sse2._mm_or_pd(cast!(a), cast!(b)))
	}
1898
	// Partial loads/stores for the 128-bit view, reusing the 256-bit helpers:
	// loads truncate the helper's result (`cast_lossy`), stores widen `values`
	// with a zeroed upper half.
	#[inline(always)]
	fn partial_load_u32s(self, slice: &[u32]) -> Self::u32s {
		cast_lossy(avx_load_u32s(self.avx2, slice))
	}

	#[inline(always)]
	fn partial_load_u64s(self, slice: &[u64]) -> Self::u64s {
		cast!(self.partial_load_u32s(bytemuck::cast_slice(slice)))
	}

	#[inline(always)]
	fn partial_store_u32s(self, slice: &mut [u32], values: Self::u32s) {
		avx_store_u32s(self.avx2, slice, cast!([values, self.splat_u32s(0)]))
	}

	#[inline(always)]
	fn partial_store_u64s(self, slice: &mut [u64], values: Self::u64s) {
		self.partial_store_u32s(bytemuck::cast_slice_mut(slice), cast!(values))
	}
1918
	// 128-bit horizontal reductions. Complex reductions combine component-wise
	// (re with re, im with im) and return the low complex; a single c64 lane
	// reduces to itself. Float reductions fold high half onto low, then the
	// second element onto the first via a scalar op.
	#[inline(always)]
	fn reduce_max_c32s(self, a: Self::c32s) -> c32 {
		let a: __m128 = cast!(a);
		// hi = the upper complex duplicated into the lower half
		let hi = self.sse._mm_movehl_ps(a, a);
		let r0 = self.sse._mm_max_ps(a, hi);
		// low 64 bits = the reduced (re, im) pair
		cast!(self.sse2._mm_cvtsd_f64(cast!(r0)))
	}

	#[inline(always)]
	fn reduce_max_c64s(self, a: Self::c64s) -> c64 {
		// only one c64 lane in 128 bits: identity
		cast!(a)
	}

	#[inline(always)]
	fn reduce_max_f32s(self, a: Self::f32s) -> f32 {
		let a: __m128 = cast!(a);
		let hi = self.sse._mm_movehl_ps(a, a);
		let r0 = self.sse._mm_max_ps(a, hi);
		// bring element 1 down to position 0, then combine with a scalar max
		let r0_shuffled = self.sse._mm_shuffle_ps::<0b0001>(r0, r0);
		let r = self.sse._mm_max_ss(r0, r0_shuffled);
		self.sse._mm_cvtss_f32(r)
	}

	#[inline(always)]
	fn reduce_max_f64s(self, a: Self::f64s) -> f64 {
		let a: __m128d = cast!(a);
		let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
		let r = self.sse2._mm_max_sd(a, hi);
		self.sse2._mm_cvtsd_f64(r)
	}

	#[inline(always)]
	fn reduce_min_c32s(self, a: Self::c32s) -> c32 {
		let a: __m128 = cast!(a);
		let hi = self.sse._mm_movehl_ps(a, a);
		let r0 = self.sse._mm_min_ps(a, hi);
		cast!(self.sse2._mm_cvtsd_f64(cast!(r0)))
	}

	#[inline(always)]
	fn reduce_min_c64s(self, a: Self::c64s) -> c64 {
		cast!(a)
	}

	#[inline(always)]
	fn reduce_min_f32s(self, a: Self::f32s) -> f32 {
		let a: __m128 = cast!(a);
		let hi = self.sse._mm_movehl_ps(a, a);
		let r0 = self.sse._mm_min_ps(a, hi);
		let r0_shuffled = self.sse._mm_shuffle_ps::<0b0001>(r0, r0);
		let r = self.sse._mm_min_ss(r0, r0_shuffled);
		self.sse._mm_cvtss_f32(r)
	}

	#[inline(always)]
	fn reduce_min_f64s(self, a: Self::f64s) -> f64 {
		let a: __m128d = cast!(a);
		let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
		let r = self.sse2._mm_min_sd(a, hi);
		self.sse2._mm_cvtsd_f64(r)
	}

	#[inline(always)]
	fn reduce_product_f32s(self, a: Self::f32s) -> f32 {
		let a: __m128 = cast!(a);
		let hi = self.sse._mm_movehl_ps(a, a);
		let r0 = self.sse._mm_mul_ps(a, hi);
		let r0_shuffled = self.sse._mm_shuffle_ps::<0b0001>(r0, r0);
		let r = self.sse._mm_mul_ss(r0, r0_shuffled);
		self.sse._mm_cvtss_f32(r)
	}

	#[inline(always)]
	fn reduce_product_f64s(self, a: Self::f64s) -> f64 {
		let a: __m128d = cast!(a);
		let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
		let r = self.sse2._mm_mul_sd(a, hi);
		self.sse2._mm_cvtsd_f64(r)
	}

	#[inline(always)]
	fn reduce_sum_c32s(self, a: Self::c32s) -> c32 {
		// a0 a1 a2 a3
		let a: __m128 = cast!(a);
		// a2 a3 a2 a3
		let hi = self.sse._mm_movehl_ps(a, a);

		// a0+a2 a1+a3 _ _
		let r0 = self.sse._mm_add_ps(a, hi);

		cast!(self.sse2._mm_cvtsd_f64(cast!(r0)))
	}

	#[inline(always)]
	fn reduce_sum_c64s(self, a: Self::c64s) -> c64 {
		cast!(a)
	}

	#[inline(always)]
	fn reduce_sum_f32s(self, a: Self::f32s) -> f32 {
		let a: __m128 = cast!(a);
		let hi = self.sse._mm_movehl_ps(a, a);
		let r0 = self.sse._mm_add_ps(a, hi);
		let r0_shuffled = self.sse._mm_shuffle_ps::<0b0001>(r0, r0);
		let r = self.sse._mm_add_ss(r0, r0_shuffled);
		self.sse._mm_cvtss_f32(r)
	}

	#[inline(always)]
	fn reduce_sum_f64s(self, a: Self::f64s) -> f64 {
		let a: __m128d = cast!(a);
		let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
		let r = self.sse2._mm_add_sd(a, hi);
		self.sse2._mm_cvtsd_f64(r)
	}
2034
	// 128-bit lane rotations: a single in-lane `pshufb` with a precomputed index
	// table (byte offset = element size * (amount % lane_count)). A single c64
	// lane rotates to itself.
	#[inline(always)]
	fn rotate_right_c32s(self, a: Self::c32s, amount: usize) -> Self::c32s {
		cast!(
			self.ssse3
				._mm_shuffle_epi8(cast!(a), cast!(AVX2_128_ROTATE_IDX[8 * (amount % 2)]))
		)
	}

	#[inline(always)]
	fn rotate_right_c64s(self, a: Self::c64s, amount: usize) -> Self::c64s {
		_ = amount;
		a
	}

	#[inline(always)]
	fn rotate_right_u32s(self, a: Self::u32s, amount: usize) -> Self::u32s {
		cast!(
			self.ssse3
				._mm_shuffle_epi8(cast!(a), cast!(AVX2_128_ROTATE_IDX[4 * (amount % 4)]))
		)
	}

	#[inline(always)]
	fn rotate_right_u64s(self, a: Self::u64s, amount: usize) -> Self::u64s {
		cast!(
			self.ssse3
				._mm_shuffle_epi8(cast!(a), cast!(AVX2_128_ROTATE_IDX[8 * (amount % 2)]))
		)
	}
2064
	// Per-lane select, delegating to the width-suffixed helpers.
	#[inline(always)]
	fn select_u32s_m32s(
		self,
		mask: Self::m32s,
		if_true: Self::u32s,
		if_false: Self::u32s,
	) -> Self::u32s {
		self.select_u32x4(mask, if_true, if_false)
	}

	#[inline(always)]
	fn select_u64s_m64s(
		self,
		mask: Self::m64s,
		if_true: Self::u64s,
		if_false: Self::u64s,
	) -> Self::u64s {
		self.select_u64x2(mask, if_true, if_false)
	}
2084
	// Scalar broadcasts. A `c32` (8 bytes) is splat as one `f64`; a `c64` fills
	// the whole 128-bit vector, so splatting it is just a bit-cast.
	#[inline(always)]
	fn splat_c32s(self, value: c32) -> Self::c32s {
		cast!(self.splat_f64x2(cast!(value)))
	}

	#[inline(always)]
	fn splat_c64s(self, value: c64) -> Self::c64s {
		cast!(value)
	}

	#[inline(always)]
	fn splat_f32s(self, value: f32) -> Self::f32s {
		self.splat_f32x4(value)
	}

	#[inline(always)]
	fn splat_f64s(self, value: f64) -> Self::f64s {
		self.splat_f64x2(value)
	}

	#[inline(always)]
	fn splat_u32s(self, value: u32) -> Self::u32s {
		self.splat_u32x4(value)
	}

	#[inline(always)]
	fn splat_u64s(self, value: u64) -> Self::u64s {
		self.splat_u64x2(value)
	}
2114
2115	#[inline(always)]
2116	fn sub_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
2117		self.sub_f32x4(a, b)
2118	}
2119
2120	#[inline(always)]
2121	fn sub_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
2122		self.sub_f64x2(a, b)
2123	}
2124
2125	#[inline(always)]
2126	fn sub_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
2127		self.sub_f32x4(a, b)
2128	}
2129
2130	#[inline(always)]
2131	fn sub_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
2132		self.sub_f64x2(a, b)
2133	}
2134
	// Lane-wise u32 subtraction with two's-complement wrap-around on underflow.
	#[inline(always)]
	fn sub_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
		self.wrapping_sub_u32x4(a, b)
	}
2139
	// Lane-wise u64 subtraction with two's-complement wrap-around on underflow.
	#[inline(always)]
	fn sub_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
		self.wrapping_sub_u64x2(a, b)
	}
2144
	// Swaps adjacent f32 lanes (re <-> im within each complex number): the permute
	// immediate 0b10_11_00_01 maps source lanes (0, 1, 2, 3) to outputs (1, 0, 3, 2).
	#[inline(always)]
	fn swap_re_im_c32s(self, a: Self::c32s) -> Self::c32s {
		cast!(self.avx._mm_permute_ps::<0b10_11_00_01>(cast!(a)))
	}
2149
	// Swaps the two f64 lanes (re <-> im of the single complex number): immediate
	// 0b01 selects the high lane for output 0 and the low lane for output 1.
	#[inline(always)]
	fn swap_re_im_c64s(self, a: Self::c64s) -> Self::c64s {
		cast!(self.avx._mm_permute_pd::<0b01>(cast!(a)))
	}
2154
	// Dispatches through the wrapped inner token (`self.0`); see `Simd::vectorize`.
	#[inline(always)]
	fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output {
		Simd::vectorize(self.0, op)
	}
2159
	// Full 32x32 -> 64-bit lane products, returned as a pair of vectors in the
	// order produced by `widening_mul_u32x4` (defined elsewhere in this module).
	#[inline(always)]
	fn widening_mul_u32s(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s) {
		self.widening_mul_u32x4(a, b)
	}
2164
2165	#[inline(always)]
2166	fn wrapping_dyn_shl_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
2167		self.shl_dyn_u32x4(a, self.and_u32x4(amount, self.splat_u32x4(32 - 1)))
2168	}
2169
2170	#[inline(always)]
2171	fn wrapping_dyn_shr_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
2172		self.shr_dyn_u32x4(a, self.and_u32x4(amount, self.splat_u32x4(32 - 1)))
2173	}
2174
	// Bitwise XOR of the masks; `xorpd` acts on all 128 bits, so the lane type is
	// irrelevant for a pure bit operation.
	#[inline(always)]
	fn xor_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
		cast!(self.sse2._mm_xor_pd(cast!(a), cast!(b)))
	}
2179
	// Bitwise XOR of the masks; `xorpd` acts on all 128 bits regardless of lane type.
	#[inline(always)]
	fn xor_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
		cast!(self.sse2._mm_xor_pd(cast!(a), cast!(b)))
	}
2184
	// Bitwise XOR; the float-domain `xorpd` is used, but XOR is purely bitwise so
	// the result is identical for integer payloads.
	#[inline(always)]
	fn xor_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
		cast!(self.sse2._mm_xor_pd(cast!(a), cast!(b)))
	}
2189
	// Bitwise XOR; the float-domain `xorpd` is used, but XOR is purely bitwise so
	// the result is identical for integer payloads.
	#[inline(always)]
	fn xor_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
		cast!(self.sse2._mm_xor_pd(cast!(a), cast!(b)))
	}
2194
	// Lane-wise signed `>=` comparison, producing a per-lane mask.
	#[inline(always)]
	fn greater_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
		self.cmp_ge_i32x4(a, b)
	}
2199
	// Lane-wise signed `>=` comparison, producing a per-lane mask.
	#[inline(always)]
	fn greater_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
		self.cmp_ge_i64x2(a, b)
	}
2204
	// Lane-wise signed `>` comparison, producing a per-lane mask.
	#[inline(always)]
	fn greater_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
		self.cmp_gt_i32x4(a, b)
	}
2209
	// Lane-wise signed `>` comparison, producing a per-lane mask.
	#[inline(always)]
	fn greater_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
		self.cmp_gt_i64x2(a, b)
	}
2214
	// Lane-wise signed `<=` comparison, producing a per-lane mask.
	#[inline(always)]
	fn less_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
		self.cmp_le_i32x4(a, b)
	}
2219
	// Lane-wise signed `<=` comparison, producing a per-lane mask.
	#[inline(always)]
	fn less_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
		self.cmp_le_i64x2(a, b)
	}
2224
	// Lane-wise signed `<` comparison, producing a per-lane mask.
	#[inline(always)]
	fn less_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
		self.cmp_lt_i32x4(a, b)
	}
2229
	// Lane-wise signed `<` comparison, producing a per-lane mask.
	#[inline(always)]
	fn less_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
		self.cmp_lt_i64x2(a, b)
	}
2234}
2235
// 256-bit backend: each vector type maps onto one full AVX/AVX2 (YMM) register
// (8 x 32-bit or 4 x 64-bit lanes).
impl Simd for V3_256b {
	type c32s = f32x8;
	type c64s = f64x4;
	type f32s = f32x8;
	type f64s = f64x4;
	type i32s = i32x8;
	type i64s = i64x4;
	type m32s = m32x8;
	type m64s = m64x4;
	type u32s = u32x8;
	type u64s = u64x4;

	// x86-64 exposes 16 architectural 256-bit (YMM) registers.
	const REGISTER_COUNT: usize = 16;

	// Forward the bulk of the `Simd` surface to same-named inherent methods.
	// NOTE(review): the `inherit!` macro is defined elsewhere; assumed to generate
	// a trivial forwarding body for each listed signature.
	inherit!({
		fn abs2_c32s(self, a: Self::c32s) -> Self::c32s;
		fn abs2_c64s(self, a: Self::c64s) -> Self::c64s;
		fn abs_max_c32s(self, a: Self::c32s) -> Self::c32s;
		fn abs_max_c64s(self, a: Self::c64s) -> Self::c64s;
		fn add_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
		fn add_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
		fn add_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
		fn add_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
		fn add_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
		fn add_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
		fn and_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
		fn and_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
		fn and_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
		fn and_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
		fn conj_c32s(self, a: Self::c32s) -> Self::c32s;
		fn conj_c64s(self, a: Self::c64s) -> Self::c64s;
		fn conj_mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s;
		fn conj_mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s;
		fn conj_mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
		fn conj_mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
		fn div_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
		fn div_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
		fn equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
		fn equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
		fn greater_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
		fn greater_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
		fn greater_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
		fn greater_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
		fn less_than_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
		fn less_than_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
		fn less_than_or_equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
		fn less_than_or_equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
		fn less_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
		fn less_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
		fn less_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
		fn less_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
		fn mask_between_m32s(self, start: u32, end: u32) -> MemMask<Self::m32s>;
		fn mask_between_m64s(self, start: u64, end: u64) -> MemMask<Self::m64s>;
		/// # Safety
		///
		/// See the trait-level safety documentation.
		unsafe fn mask_load_ptr_c32s(
			self,
			mask: MemMask<Self::m32s>,
			ptr: *const c32,
		) -> Self::c32s;
		/// # Safety
		///
		/// See the trait-level safety documentation.
		unsafe fn mask_load_ptr_c64s(
			self,
			mask: MemMask<Self::m64s>,
			ptr: *const c64,
		) -> Self::c64s;
		/// # Safety
		///
		/// See the trait-level safety documentation.
		unsafe fn mask_load_ptr_u32s(
			self,
			mask: MemMask<Self::m32s>,
			ptr: *const u32,
		) -> Self::u32s;
		/// # Safety
		///
		/// See the trait-level safety documentation.
		unsafe fn mask_load_ptr_u64s(
			self,
			mask: MemMask<Self::m64s>,
			ptr: *const u64,
		) -> Self::u64s;
		/// # Safety
		///
		/// See the trait-level safety documentation.
		unsafe fn mask_store_ptr_c32s(
			self,
			mask: MemMask<Self::m32s>,
			ptr: *mut c32,
			values: Self::c32s,
		);
		/// # Safety
		///
		/// See the trait-level safety documentation.
		unsafe fn mask_store_ptr_c64s(
			self,
			mask: MemMask<Self::m64s>,
			ptr: *mut c64,
			values: Self::c64s,
		);
		/// # Safety
		///
		/// See the trait-level safety documentation.
		unsafe fn mask_store_ptr_u32s(
			self,
			mask: MemMask<Self::m32s>,
			ptr: *mut u32,
			values: Self::u32s,
		);
		/// # Safety
		///
		/// See the trait-level safety documentation.
		unsafe fn mask_store_ptr_u64s(
			self,
			mask: MemMask<Self::m64s>,
			ptr: *mut u64,
			values: Self::u64s,
		);
		fn max_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
		fn max_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
		fn min_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
		fn min_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
		fn mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s;
		fn mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s;
		fn mul_add_e_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s;
		fn mul_add_e_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s;
		fn mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s;
		fn mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s;
		fn mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
		fn mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
		fn mul_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
		fn mul_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
		fn neg_c32s(self, a: Self::c32s) -> Self::c32s;
		fn neg_c64s(self, a: Self::c64s) -> Self::c64s;
		fn not_m32s(self, a: Self::m32s) -> Self::m32s;
		fn not_m64s(self, a: Self::m64s) -> Self::m64s;
		fn not_u32s(self, a: Self::u32s) -> Self::u32s;
		fn not_u64s(self, a: Self::u64s) -> Self::u64s;
		fn or_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
		fn or_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
		fn or_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
		fn or_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
		fn partial_load_u32s(self, slice: &[u32]) -> Self::u32s;
		fn partial_load_u64s(self, slice: &[u64]) -> Self::u64s;
		fn partial_store_u32s(self, slice: &mut [u32], values: Self::u32s);
		fn partial_store_u64s(self, slice: &mut [u64], values: Self::u64s);
		fn reduce_max_c32s(self, a: Self::c32s) -> c32;
		fn reduce_max_c64s(self, a: Self::c64s) -> c64;
		fn reduce_max_f32s(self, a: Self::f32s) -> f32;
		fn reduce_max_f64s(self, a: Self::f64s) -> f64;
		fn reduce_min_c32s(self, a: Self::c32s) -> c32;
		fn reduce_min_c64s(self, a: Self::c64s) -> c64;
		fn reduce_min_f32s(self, a: Self::f32s) -> f32;
		fn reduce_min_f64s(self, a: Self::f64s) -> f64;
		fn reduce_product_f32s(self, a: Self::f32s) -> f32;
		fn reduce_product_f64s(self, a: Self::f64s) -> f64;
		fn reduce_sum_c32s(self, a: Self::c32s) -> c32;
		fn reduce_sum_c64s(self, a: Self::c64s) -> c64;
		fn reduce_sum_f32s(self, a: Self::f32s) -> f32;
		fn reduce_sum_f64s(self, a: Self::f64s) -> f64;
		fn rotate_right_c32s(self, a: Self::c32s, amount: usize) -> Self::c32s;
		fn rotate_right_c64s(self, a: Self::c64s, amount: usize) -> Self::c64s;
		fn rotate_right_u32s(self, a: Self::u32s, amount: usize) -> Self::u32s;
		fn rotate_right_u64s(self, a: Self::u64s, amount: usize) -> Self::u64s;
		fn select_u32s_m32s(
			self,
			mask: Self::m32s,
			if_true: Self::u32s,
			if_false: Self::u32s,
		) -> Self::u32s;
		fn select_u64s_m64s(
			self,
			mask: Self::m64s,
			if_true: Self::u64s,
			if_false: Self::u64s,
		) -> Self::u64s;
		fn splat_c32s(self, value: c32) -> Self::c32s;
		fn splat_c64s(self, value: c64) -> Self::c64s;
		fn splat_f32s(self, value: f32) -> Self::f32s;
		fn splat_f64s(self, value: f64) -> Self::f64s;
		fn splat_u32s(self, value: u32) -> Self::u32s;
		fn splat_u64s(self, value: u64) -> Self::u64s;
		fn sub_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
		fn sub_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
		fn sub_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
		fn sub_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
		fn sub_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
		fn sub_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
		fn swap_re_im_c32s(self, a: Self::c32s) -> Self::c32s;
		fn swap_re_im_c64s(self, a: Self::c64s) -> Self::c64s;
		fn widening_mul_u32s(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s);
		fn wrapping_dyn_shl_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s;
		fn wrapping_dyn_shr_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s;
		fn xor_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
		fn xor_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
		fn xor_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
		fn xor_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
		fn greater_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
		fn greater_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
		fn greater_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
		fn greater_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
		fn less_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
		fn less_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
		fn less_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
		fn less_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
	});

	// Dispatches through the wrapped inner token (`self.0`); see `Simd::vectorize`.
	#[inline(always)]
	fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output {
		Simd::vectorize(self.0, op)
	}
}
2451
2452impl Simd for V3_512b {
2453	type c32s = f32x16;
2454	type c64s = f64x8;
2455	type f32s = f32x16;
2456	type f64s = f64x8;
2457	type i32s = i32x16;
2458	type i64s = i64x8;
2459	type m32s = m32x16;
2460	type m64s = m64x8;
2461	type u32s = u32x16;
2462	type u64s = u64x8;
2463
2464	const REGISTER_COUNT: usize = 8;
2465
2466	inherit_x2!(V3_256b(*self), {
2467		fn abs2_c32s(self, a: Self::c32s) -> Self::c32s;
2468		fn abs2_c64s(self, a: Self::c64s) -> Self::c64s;
2469		fn abs_max_c32s(self, a: Self::c32s) -> Self::c32s;
2470		fn abs_max_c64s(self, a: Self::c64s) -> Self::c64s;
2471		fn add_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
2472		fn add_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
2473		fn add_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2474		fn add_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2475		fn add_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2476		fn add_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2477		fn and_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
2478		fn and_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
2479		fn and_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2480		fn and_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2481		fn conj_c32s(self, a: Self::c32s) -> Self::c32s;
2482		fn conj_c64s(self, a: Self::c64s) -> Self::c64s;
2483		fn conj_mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s;
2484		fn conj_mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s;
2485		fn conj_mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
2486		fn conj_mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
2487		fn div_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2488		fn div_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2489		fn equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
2490		fn equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
2491		fn greater_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
2492		fn greater_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
2493		fn greater_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
2494		fn greater_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
2495		fn less_than_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
2496		fn less_than_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
2497		fn less_than_or_equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
2498		fn less_than_or_equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
2499		fn less_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
2500		fn less_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
2501		fn less_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
2502		fn less_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
2503		fn max_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2504		fn max_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2505		fn min_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2506		fn min_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2507		fn mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s;
2508		fn mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s;
2509		fn mul_add_e_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s;
2510		fn mul_add_e_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s;
2511		fn mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s;
2512		fn mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s;
2513		fn mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
2514		fn mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
2515		fn mul_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2516		fn mul_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2517		fn neg_c32s(self, a: Self::c32s) -> Self::c32s;
2518		fn neg_c64s(self, a: Self::c64s) -> Self::c64s;
2519		fn not_m32s(self, a: Self::m32s) -> Self::m32s;
2520		fn not_m64s(self, a: Self::m64s) -> Self::m64s;
2521		fn not_u32s(self, a: Self::u32s) -> Self::u32s;
2522		fn not_u64s(self, a: Self::u64s) -> Self::u64s;
2523		fn or_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
2524		fn or_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
2525		fn or_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2526		fn or_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2527		fn select_u32s_m32s(
2528			self,
2529			mask: Self::m32s,
2530			if_true: Self::u32s,
2531			if_false: Self::u32s,
2532		) -> Self::u32s;
2533		fn select_u64s_m64s(
2534			self,
2535			mask: Self::m64s,
2536			if_true: Self::u64s,
2537			if_false: Self::u64s,
2538		) -> Self::u64s;
2539		fn sub_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
2540		fn sub_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
2541		fn sub_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
2542		fn sub_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
2543		fn sub_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2544		fn sub_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2545		fn swap_re_im_c32s(self, a: Self::c32s) -> Self::c32s;
2546		fn swap_re_im_c64s(self, a: Self::c64s) -> Self::c64s;
2547		fn wrapping_dyn_shl_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s;
2548		fn wrapping_dyn_shr_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s;
2549		fn xor_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
2550		fn xor_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
2551		fn xor_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
2552		fn xor_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
2553		fn greater_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
2554		fn greater_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
2555		fn greater_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
2556		fn greater_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
2557		fn less_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
2558		fn less_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
2559		fn less_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
2560		fn less_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
2561	});
2562
2563	inherit_x2!(V3_256b(*self), splat, {
2564		fn splat_c32s(self, value: c32) -> Self::c32s;
2565		fn splat_c64s(self, value: c64) -> Self::c64s;
2566		fn splat_f32s(self, value: f32) -> Self::f32s;
2567		fn splat_f64s(self, value: f64) -> Self::f64s;
2568		fn splat_u32s(self, value: u32) -> Self::u32s;
2569		fn splat_u64s(self, value: u64) -> Self::u64s;
2570	});
2571
2572	inherit_x2!(V3_256b(*self), wide, {
2573		fn widening_mul_u32s(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s);
2574	});
2575
2576	#[inline(always)]
2577	fn rotate_right_c32s(self, a: Self::c32s, amount: usize) -> Self::c32s {
2578		let simd = V3_256b(*self);
2579		let amount = amount % Self::C32_LANES;
2580		let [mut a0, mut a1]: [_; 2] = cast!(a);
2581		if amount >= Self::C32_LANES / 2 {
2582			core::mem::swap(&mut a0, &mut a1);
2583		}
2584		let amount = amount % (Self::C32_LANES / 2);
2585		let mask = simd.mask_between_m32s(0, amount as _).mask();
2586		let a0 = simd.rotate_right_c32s(a0, amount);
2587		let a1 = simd.rotate_right_c32s(a1, amount);
2588
2589		cast!([
2590			simd.select_f32s_m32s(mask, a1, a0),
2591			simd.select_f32s_m32s(mask, a0, a1),
2592		])
2593	}
2594
2595	#[inline(always)]
2596	fn rotate_right_c64s(self, a: Self::c64s, amount: usize) -> Self::c64s {
2597		let simd = V3_256b(*self);
2598		let amount = amount % Self::C64_LANES;
2599		let [mut a0, mut a1]: [_; 2] = cast!(a);
2600		if amount >= Self::C64_LANES / 2 {
2601			core::mem::swap(&mut a0, &mut a1);
2602		}
2603		let amount = amount % (Self::C64_LANES / 2);
2604		let mask = simd.mask_between_m64s(0, amount as _).mask();
2605		let a0 = simd.rotate_right_c64s(a0, amount);
2606		let a1 = simd.rotate_right_c64s(a1, amount);
2607
2608		cast!([
2609			simd.select_f64s_m64s(mask, a1, a0),
2610			simd.select_f64s_m64s(mask, a0, a1),
2611		])
2612	}
2613
2614	#[inline(always)]
2615	fn rotate_right_u32s(self, a: Self::u32s, amount: usize) -> Self::u32s {
2616		let simd = V3_256b(*self);
2617		let amount = amount % Self::U32_LANES;
2618		let [mut a0, mut a1]: [_; 2] = cast!(a);
2619		if amount >= Self::U32_LANES / 2 {
2620			core::mem::swap(&mut a0, &mut a1);
2621		}
2622		let amount = amount % (Self::U32_LANES / 2);
2623		let mask = simd.mask_between_m32s(0, amount as _).mask();
2624		let a0 = simd.rotate_right_u32s(a0, amount);
2625		let a1 = simd.rotate_right_u32s(a1, amount);
2626
2627		cast!([
2628			simd.select_u32s_m32s(mask, a1, a0),
2629			simd.select_u32s_m32s(mask, a0, a1),
2630		])
2631	}
2632
2633	#[inline(always)]
2634	fn rotate_right_u64s(self, a: Self::u64s, amount: usize) -> Self::u64s {
2635		let simd = V3_256b(*self);
2636		let amount = amount % Self::U64_LANES;
2637		let [mut a0, mut a1]: [_; 2] = cast!(a);
2638		if amount >= Self::U64_LANES / 2 {
2639			core::mem::swap(&mut a0, &mut a1);
2640		}
2641		let amount = amount % (Self::U64_LANES / 2);
2642		let mask = simd.mask_between_m64s(0, amount as _).mask();
2643		let a0 = simd.rotate_right_u64s(a0, amount);
2644		let a1 = simd.rotate_right_u64s(a1, amount);
2645
2646		cast!([
2647			simd.select_u64s_m64s(mask, a1, a0),
2648			simd.select_u64s_m64s(mask, a0, a1),
2649		])
2650	}
2651
2652	/// # Safety
2653	///
2654	/// See the trait-level safety documentation.
2655	#[inline(always)]
2656	unsafe fn mask_load_ptr_c32s(self, mask: MemMask<Self::m32s>, ptr: *const c32) -> Self::c32s {
2657		let simd = V3_256b(*self);
2658		let mask: [_; 2] = cast!(mask.mask());
2659		cast!([
2660			simd.mask_load_ptr_c32s(MemMask::new(mask[0]), ptr.wrapping_add(0)),
2661			simd.mask_load_ptr_c32s(MemMask::new(mask[1]), ptr.wrapping_add(Self::C32_LANES)),
2662		])
2663	}
2664
2665	/// # Safety
2666	///
2667	/// See the trait-level safety documentation.
2668	#[inline(always)]
2669	unsafe fn mask_load_ptr_c64s(self, mask: MemMask<Self::m64s>, ptr: *const c64) -> Self::c64s {
2670		let simd = V3_256b(*self);
2671		let mask: [_; 2] = cast!(mask.mask());
2672		cast!([
2673			simd.mask_load_ptr_c64s(MemMask::new(mask[0]), ptr.wrapping_add(0)),
2674			simd.mask_load_ptr_c64s(MemMask::new(mask[1]), ptr.wrapping_add(Self::C64_LANES)),
2675		])
2676	}
2677
2678	/// # Safety
2679	///
2680	/// See the trait-level safety documentation.
2681	#[inline(always)]
2682	unsafe fn mask_load_ptr_u32s(self, mask: MemMask<Self::m32s>, ptr: *const u32) -> Self::u32s {
2683		let simd = V3_256b(*self);
2684		let mask: [_; 2] = cast!(mask.mask());
2685		cast!([
2686			simd.mask_load_ptr_u32s(MemMask::new(mask[0]), ptr.wrapping_add(0)),
2687			simd.mask_load_ptr_u32s(MemMask::new(mask[1]), ptr.wrapping_add(Self::U32_LANES)),
2688		])
2689	}
2690
2691	/// # Safety
2692	///
2693	/// See the trait-level safety documentation.
2694	#[inline(always)]
2695	unsafe fn mask_load_ptr_u64s(self, mask: MemMask<Self::m64s>, ptr: *const u64) -> Self::u64s {
2696		let simd = V3_256b(*self);
2697		let mask: [_; 2] = cast!(mask.mask());
2698		cast!([
2699			simd.mask_load_ptr_u64s(MemMask::new(mask[0]), ptr.wrapping_add(0)),
2700			simd.mask_load_ptr_u64s(MemMask::new(mask[1]), ptr.wrapping_add(Self::U64_LANES)),
2701		])
2702	}
2703
2704	/// # Safety
2705	///
2706	/// See the trait-level safety documentation.
2707	#[inline(always)]
2708	unsafe fn mask_store_ptr_c32s(
2709		self,
2710		mask: MemMask<Self::m32s>,
2711		ptr: *mut c32,
2712		values: Self::c32s,
2713	) {
2714		let simd = V3_256b(*self);
2715		let mask: [_; 2] = cast!(mask.mask());
2716		let values: [_; 2] = cast!(values);
2717		cast!([
2718			simd.mask_store_ptr_c32s(MemMask::new(mask[0]), ptr.wrapping_add(0), values[0]),
2719			simd.mask_store_ptr_c32s(
2720				MemMask::new(mask[1]),
2721				ptr.wrapping_add(Self::C32_LANES),
2722				values[1]
2723			),
2724		])
2725	}
2726
2727	/// # Safety
2728	///
2729	/// See the trait-level safety documentation.
2730	#[inline(always)]
2731	unsafe fn mask_store_ptr_c64s(
2732		self,
2733		mask: MemMask<Self::m64s>,
2734		ptr: *mut c64,
2735		values: Self::c64s,
2736	) {
2737		let simd = V3_256b(*self);
2738		let mask: [_; 2] = cast!(mask.mask());
2739		let values: [_; 2] = cast!(values);
2740		cast!([
2741			simd.mask_store_ptr_c64s(MemMask::new(mask[0]), ptr.wrapping_add(0), values[0]),
2742			simd.mask_store_ptr_c64s(
2743				MemMask::new(mask[1]),
2744				ptr.wrapping_add(Self::C64_LANES),
2745				values[1]
2746			),
2747		])
2748	}
2749
2750	/// # Safety
2751	///
2752	/// See the trait-level safety documentation.
2753	#[inline(always)]
2754	unsafe fn mask_store_ptr_u32s(
2755		self,
2756		mask: MemMask<Self::m32s>,
2757		ptr: *mut u32,
2758		values: Self::u32s,
2759	) {
2760		let simd = V3_256b(*self);
2761		let mask: [_; 2] = cast!(mask.mask());
2762		let values: [_; 2] = cast!(values);
2763		cast!([
2764			simd.mask_store_ptr_u32s(MemMask::new(mask[0]), ptr.wrapping_add(0), values[0]),
2765			simd.mask_store_ptr_u32s(
2766				MemMask::new(mask[1]),
2767				ptr.wrapping_add(Self::U32_LANES),
2768				values[1]
2769			),
2770		])
2771	}
2772
2773	/// # Safety
2774	///
2775	/// See the trait-level safety documentation.
2776	#[inline(always)]
2777	unsafe fn mask_store_ptr_u64s(
2778		self,
2779		mask: MemMask<Self::m64s>,
2780		ptr: *mut u64,
2781		values: Self::u64s,
2782	) {
2783		let simd = V3_256b(*self);
2784		let mask: [_; 2] = cast!(mask.mask());
2785		let values: [_; 2] = cast!(values);
2786		cast!([
2787			simd.mask_store_ptr_u64s(MemMask::new(mask[0]), ptr.wrapping_add(0), values[0]),
2788			simd.mask_store_ptr_u64s(
2789				MemMask::new(mask[1]),
2790				ptr.wrapping_add(Self::U64_LANES),
2791				values[1]
2792			),
2793		])
2794	}
2795
2796	#[inline(always)]
2797	fn reduce_max_c32s(self, a: Self::c32s) -> c32 {
2798		let simd = V3_256b(*self);
2799		let a: [_; 2] = cast!(a);
2800		simd.reduce_max_c32s(simd.max_f32s(a[0], a[1]))
2801	}
2802
2803	#[inline(always)]
2804	fn reduce_max_c64s(self, a: Self::c64s) -> c64 {
2805		let simd = V3_256b(*self);
2806		let a: [_; 2] = cast!(a);
2807		simd.reduce_max_c64s(simd.max_f64s(a[0], a[1]))
2808	}
2809
2810	#[inline(always)]
2811	fn reduce_max_f32s(self, a: Self::f32s) -> f32 {
2812		let simd = V3_256b(*self);
2813		let a: [_; 2] = cast!(a);
2814		simd.reduce_max_f32s(simd.max_f32s(a[0], a[1]))
2815	}
2816
2817	#[inline(always)]
2818	fn reduce_max_f64s(self, a: Self::f64s) -> f64 {
2819		let simd = V3_256b(*self);
2820		let a: [_; 2] = cast!(a);
2821		simd.reduce_max_f64s(simd.max_f64s(a[0], a[1]))
2822	}
2823
2824	#[inline(always)]
2825	fn reduce_min_c32s(self, a: Self::c32s) -> c32 {
2826		let simd = V3_256b(*self);
2827		let a: [_; 2] = cast!(a);
2828		simd.reduce_min_c32s(simd.min_f32s(a[0], a[1]))
2829	}
2830
2831	#[inline(always)]
2832	fn reduce_min_c64s(self, a: Self::c64s) -> c64 {
2833		let simd = V3_256b(*self);
2834		let a: [_; 2] = cast!(a);
2835		simd.reduce_min_c64s(simd.min_f64s(a[0], a[1]))
2836	}
2837
2838	#[inline(always)]
2839	fn reduce_min_f32s(self, a: Self::f32s) -> f32 {
2840		let simd = V3_256b(*self);
2841		let a: [_; 2] = cast!(a);
2842		simd.reduce_min_f32s(simd.min_f32s(a[0], a[1]))
2843	}
2844
2845	#[inline(always)]
2846	fn reduce_min_f64s(self, a: Self::f64s) -> f64 {
2847		let simd = V3_256b(*self);
2848		let a: [_; 2] = cast!(a);
2849		simd.reduce_min_f64s(simd.min_f64s(a[0], a[1]))
2850	}
2851
2852	#[inline(always)]
2853	fn reduce_product_f32s(self, a: Self::f32s) -> f32 {
2854		let simd = V3_256b(*self);
2855		let a: [_; 2] = cast!(a);
2856		simd.reduce_product_f32s(simd.mul_f32s(a[0], a[1]))
2857	}
2858
2859	#[inline(always)]
2860	fn reduce_product_f64s(self, a: Self::f64s) -> f64 {
2861		let simd = V3_256b(*self);
2862		let a: [_; 2] = cast!(a);
2863		simd.reduce_product_f64s(simd.mul_f64s(a[0], a[1]))
2864	}
2865
2866	#[inline(always)]
2867	fn reduce_sum_c32s(self, a: Self::c32s) -> c32 {
2868		let simd = V3_256b(*self);
2869		let a: [_; 2] = cast!(a);
2870		simd.reduce_sum_c32s(simd.add_c32s(a[0], a[1]))
2871	}
2872
2873	#[inline(always)]
2874	fn reduce_sum_c64s(self, a: Self::c64s) -> c64 {
2875		let simd = V3_256b(*self);
2876		let a: [_; 2] = cast!(a);
2877		simd.reduce_sum_c64s(simd.add_c64s(a[0], a[1]))
2878	}
2879
2880	#[inline(always)]
2881	fn reduce_sum_f32s(self, a: Self::f32s) -> f32 {
2882		let simd = V3_256b(*self);
2883		let a: [_; 2] = cast!(a);
2884		simd.reduce_sum_f32s(simd.add_f32s(a[0], a[1]))
2885	}
2886
2887	#[inline(always)]
2888	fn reduce_sum_f64s(self, a: Self::f64s) -> f64 {
2889		let simd = V3_256b(*self);
2890		let a: [_; 2] = cast!(a);
2891		simd.reduce_sum_f64s(simd.add_f64s(a[0], a[1]))
2892	}
2893
2894	#[inline(always)]
2895	fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output {
2896		Simd::vectorize(self.0, op)
2897	}
2898}
2899
2900impl V3 {
2901	/// Computes `abs(a)` for each lane of `a`.
2902	#[inline(always)]
2903	pub fn abs_f32x8(self, a: f32x8) -> f32x8 {
2904		self.and_f32x8(a, cast!(self.splat_u32x8((1 << 31) - 1)))
2905	}
2906
2907	/// Computes `abs(a)` for each lane of `a`.
2908	#[inline(always)]
2909	pub fn abs_f64x4(self, a: f64x4) -> f64x4 {
2910		self.and_f64x4(a, cast!(self.splat_u64x4((1 << 63) - 1)))
2911	}
2912
2913	/// Computes `a + b` for each lane of `a` and `b`.
2914	#[inline(always)]
2915	pub fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
2916		cast!(self.avx._mm256_add_ps(cast!(a), cast!(b)))
2917	}
2918
2919	/// Computes `a + b` for each lane of `a` and `b`.
2920	#[inline(always)]
2921	pub fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
2922		cast!(self.avx._mm256_add_pd(cast!(a), cast!(b)))
2923	}
2924
	// Bitwise-AND wrappers. All integer, unsigned, and mask variants lower to the
	// same `_mm256_and_si256` (`vpand`); the float variants use the dedicated
	// `_mm256_and_ps` / `_mm256_and_pd` forms.
	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
		cast!(self.avx._mm256_and_ps(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
		cast!(self.avx._mm256_and_pd(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
		cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
		cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_i64x4(self, a: i64x4, b: i64x4) -> i64x4 {
		cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
		cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_m16x16(self, a: m16x16, b: m16x16) -> m16x16 {
		cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_m32x8(self, a: m32x8, b: m32x8) -> m32x8 {
		cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_m64x4(self, a: m64x4, b: m64x4) -> m64x4 {
		cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_m8x32(self, a: m8x32, b: m8x32) -> m8x32 {
		cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
		cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
		cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_u64x4(self, a: u64x4, b: u64x4) -> u64x4 {
		cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
		cast!(self.avx2._mm256_and_si256(cast!(a), cast!(b)))
	}
3008
	// AND-NOT wrappers. Note the operand order: the FIRST argument is the one
	// that gets inverted (`!a & b`, not `a & !b`), matching the underlying
	// `vandnps` / `vandnpd` / `vpandn` intrinsics.
	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
		cast!(self.avx._mm256_andnot_ps(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
		cast!(self.avx._mm256_andnot_pd(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
		cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
		cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_i64x4(self, a: i64x4, b: i64x4) -> i64x4 {
		cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
		cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_m16x16(self, a: m16x16, b: m16x16) -> m16x16 {
		cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_m32x8(self, a: m32x8, b: m32x8) -> m32x8 {
		cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_m64x4(self, a: m64x4, b: m64x4) -> m64x4 {
		cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_m8x32(self, a: m8x32, b: m8x32) -> m8x32 {
		cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
		cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
		cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_u64x4(self, a: u64x4, b: u64x4) -> u64x4 {
		cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
		cast!(self.avx2._mm256_andnot_si256(cast!(a), cast!(b)))
	}
3092
	/// Applies the sign of each element of `sign` to the corresponding lane in `a`.
	/// - If `sign` is zero, the corresponding element is zeroed.
	/// - If `sign` is positive, the corresponding element is returned as is.
	/// - If `sign` is negative, the corresponding element is negated.
	#[inline(always)]
	pub fn apply_sign_i16x16(self, sign: i16x16, a: i16x16) -> i16x16 {
		// `_mm256_sign_epi16(x, y)` transforms `x` based on the sign of `y`, so
		// `a` goes first and `sign` second — reversed from this wrapper's
		// parameter order.
		cast!(self.avx2._mm256_sign_epi16(cast!(a), cast!(sign)))
	}

	/// Applies the sign of each element of `sign` to the corresponding lane in `a`.
	/// - If `sign` is zero, the corresponding element is zeroed.
	/// - If `sign` is positive, the corresponding element is returned as is.
	/// - If `sign` is negative, the corresponding element is negated.
	#[inline(always)]
	pub fn apply_sign_i32x8(self, sign: i32x8, a: i32x8) -> i32x8 {
		cast!(self.avx2._mm256_sign_epi32(cast!(a), cast!(sign)))
	}

	/// Applies the sign of each element of `sign` to the corresponding lane in `a`.
	/// - If `sign` is zero, the corresponding element is zeroed.
	/// - If `sign` is positive, the corresponding element is returned as is.
	/// - If `sign` is negative, the corresponding element is negated.
	#[inline(always)]
	pub fn apply_sign_i8x32(self, sign: i8x32, a: i8x32) -> i8x32 {
		cast!(self.avx2._mm256_sign_epi8(cast!(a), cast!(sign)))
	}
3119
	/// Computes the approximate reciprocal of the elements of each lane of `a`.
	///
	/// NOTE(review): `vrcpps` is only an approximation (Intel documents a maximum
	/// relative error of about `1.5 * 2^-12` — confirm against the intrinsics
	/// guide); follow up with a Newton–Raphson step if more precision is needed.
	#[inline(always)]
	pub fn approx_reciprocal_f32x8(self, a: f32x8) -> f32x8 {
		cast!(self.avx._mm256_rcp_ps(cast!(a)))
	}

	/// Computes the approximate reciprocal of the square roots of the elements of each lane of `a`.
	///
	/// NOTE(review): `vrsqrtps` has the same limited precision as `vrcpps` above.
	#[inline(always)]
	pub fn approx_reciprocal_sqrt_f32x8(self, a: f32x8) -> f32x8 {
		cast!(self.avx._mm256_rsqrt_ps(cast!(a)))
	}
3131
	/// Computes `average(a, b)` for each lane of `a` and `b`.
	///
	/// Per the `vpavgw` definition this is `(a + b + 1) >> 1` computed without
	/// overflow, i.e. halves are rounded up — TODO confirm against the intrinsics
	/// guide if exact rounding behavior matters to the caller.
	#[inline(always)]
	pub fn average_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
		cast!(self.avx2._mm256_avg_epu16(cast!(a), cast!(b)))
	}

	/// Computes `average(a, b)` for each lane of `a` and `b`.
	///
	/// Same rounding behavior as [`V3::average_u16x16`], on 8-bit lanes.
	#[inline(always)]
	pub fn average_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
		cast!(self.avx2._mm256_avg_epu8(cast!(a), cast!(b)))
	}
3143
3144	/// Returns `ceil(a)` for each lane of `a`, rounding towards positive infinity.
3145	#[inline(always)]
3146	pub fn ceil_f32x8(self, a: f32x8) -> f32x8 {
3147		cast!(self.avx._mm256_ceil_ps(cast!(a)))
3148	}
3149
3150	/// Returns `ceil(a)` for each lane of `a`, rounding towards positive infinity.
3151	#[inline(always)]
3152	pub fn ceil_f64x4(self, a: f64x4) -> f64x4 {
3153		cast!(self.avx._mm256_ceil_pd(cast!(a)))
3154	}
3155
	/// Compares the elements in each lane of `a` and `b` for equality.
	///
	/// `_CMP_EQ_OQ` is an ordered predicate: a NaN in either operand makes the
	/// lane compare not-equal (mask bits cleared).
	#[inline(always)]
	pub fn cmp_eq_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
		cast!(self.avx._mm256_cmp_ps::<_CMP_EQ_OQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	///
	/// NaN lanes compare not-equal; see [`V3::cmp_eq_f32x8`].
	#[inline(always)]
	pub fn cmp_eq_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
		cast!(self.avx._mm256_cmp_pd::<_CMP_EQ_OQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_i16x16(self, a: i16x16, b: i16x16) -> m16x16 {
		cast!(self.avx2._mm256_cmpeq_epi16(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_i32x8(self, a: i32x8, b: i32x8) -> m32x8 {
		cast!(self.avx2._mm256_cmpeq_epi32(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_i64x4(self, a: i64x4, b: i64x4) -> m64x4 {
		cast!(self.avx2._mm256_cmpeq_epi64(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_i8x32(self, a: i8x32, b: i8x32) -> m8x32 {
		cast!(self.avx2._mm256_cmpeq_epi8(cast!(a), cast!(b)))
	}

	// Unsigned equality is bit-for-bit identical to signed equality, so the
	// unsigned variants reuse the same `vpcmpeq*` intrinsics.
	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_u16x16(self, a: u16x16, b: u16x16) -> m16x16 {
		cast!(self.avx2._mm256_cmpeq_epi16(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_u32x8(self, a: u32x8, b: u32x8) -> m32x8 {
		cast!(self.avx2._mm256_cmpeq_epi32(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_u64x4(self, a: u64x4, b: u64x4) -> m64x4 {
		cast!(self.avx2._mm256_cmpeq_epi64(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_u8x32(self, a: u8x32, b: u8x32) -> m8x32 {
		cast!(self.avx2._mm256_cmpeq_epi8(cast!(a), cast!(b)))
	}
3215
	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
		cast!(self.avx._mm256_cmp_ps::<_CMP_GE_OQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
		cast!(self.avx._mm256_cmp_pd::<_CMP_GE_OQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_i16x16(self, a: i16x16, b: i16x16) -> m16x16 {
		// AVX2 only has eq/gt integer compares, so `a >= b` is derived as
		// `!(a < b)`. The same pattern is used for all integer variants below.
		self.not_m16x16(self.cmp_lt_i16x16(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_i32x8(self, a: i32x8, b: i32x8) -> m32x8 {
		self.not_m32x8(self.cmp_lt_i32x8(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_i64x4(self, a: i64x4, b: i64x4) -> m64x4 {
		self.not_m64x4(self.cmp_lt_i64x4(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_i8x32(self, a: i8x32, b: i8x32) -> m8x32 {
		self.not_m8x32(self.cmp_lt_i8x32(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_u16x16(self, a: u16x16, b: u16x16) -> m16x16 {
		self.not_m16x16(self.cmp_lt_u16x16(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_u32x8(self, a: u32x8, b: u32x8) -> m32x8 {
		self.not_m32x8(self.cmp_lt_u32x8(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_u64x4(self, a: u64x4, b: u64x4) -> m64x4 {
		self.not_m64x4(self.cmp_lt_u64x4(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_u8x32(self, a: u8x32, b: u8x32) -> m8x32 {
		self.not_m8x32(self.cmp_lt_u8x32(a, b))
	}
3275
	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
		cast!(self.avx._mm256_cmp_ps::<_CMP_GT_OQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
		cast!(self.avx._mm256_cmp_pd::<_CMP_GT_OQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_i16x16(self, a: i16x16, b: i16x16) -> m16x16 {
		cast!(self.avx2._mm256_cmpgt_epi16(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_i32x8(self, a: i32x8, b: i32x8) -> m32x8 {
		cast!(self.avx2._mm256_cmpgt_epi32(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_i64x4(self, a: i64x4, b: i64x4) -> m64x4 {
		cast!(self.avx2._mm256_cmpgt_epi64(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_i8x32(self, a: i8x32, b: i8x32) -> m8x32 {
		cast!(self.avx2._mm256_cmpgt_epi8(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_u16x16(self, a: u16x16, b: u16x16) -> m16x16 {
		// AVX2 has no unsigned compare: XOR-ing the sign bit of both operands
		// maps the unsigned ordering onto the signed one, so the signed compare
		// can be reused. Same trick for the other unsigned widths below.
		let k = self.splat_u16x16(0x8000);
		self.cmp_gt_i16x16(cast!(self.xor_u16x16(a, k)), cast!(self.xor_u16x16(b, k)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_u32x8(self, a: u32x8, b: u32x8) -> m32x8 {
		let k = self.splat_u32x8(0x80000000);
		self.cmp_gt_i32x8(cast!(self.xor_u32x8(a, k)), cast!(self.xor_u32x8(b, k)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_u64x4(self, a: u64x4, b: u64x4) -> m64x4 {
		let k = self.splat_u64x4(0x8000000000000000);
		self.cmp_gt_i64x4(cast!(self.xor_u64x4(a, k)), cast!(self.xor_u64x4(b, k)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_u8x32(self, a: u8x32, b: u8x32) -> m8x32 {
		let k = self.splat_u8x32(0x80);
		self.cmp_gt_i8x32(cast!(self.xor_u8x32(a, k)), cast!(self.xor_u8x32(b, k)))
	}
3339
	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
		cast!(self.avx._mm256_cmp_ps::<_CMP_LE_OQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
		cast!(self.avx._mm256_cmp_pd::<_CMP_LE_OQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_i16x16(self, a: i16x16, b: i16x16) -> m16x16 {
		// Integer `a <= b` is derived as `!(a > b)`, since AVX2 only provides
		// eq/gt integer compares. Same pattern for every variant below.
		self.not_m16x16(self.cmp_gt_i16x16(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_i32x8(self, a: i32x8, b: i32x8) -> m32x8 {
		self.not_m32x8(self.cmp_gt_i32x8(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_i64x4(self, a: i64x4, b: i64x4) -> m64x4 {
		self.not_m64x4(self.cmp_gt_i64x4(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_i8x32(self, a: i8x32, b: i8x32) -> m8x32 {
		self.not_m8x32(self.cmp_gt_i8x32(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_u16x16(self, a: u16x16, b: u16x16) -> m16x16 {
		self.not_m16x16(self.cmp_gt_u16x16(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_u32x8(self, a: u32x8, b: u32x8) -> m32x8 {
		self.not_m32x8(self.cmp_gt_u32x8(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_u64x4(self, a: u64x4, b: u64x4) -> m64x4 {
		self.not_m64x4(self.cmp_gt_u64x4(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_u8x32(self, a: u8x32, b: u8x32) -> m8x32 {
		self.not_m8x32(self.cmp_gt_u8x32(a, b))
	}
3399
	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
		cast!(self.avx._mm256_cmp_ps::<_CMP_LT_OQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
		cast!(self.avx._mm256_cmp_pd::<_CMP_LT_OQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_i16x16(self, a: i16x16, b: i16x16) -> m16x16 {
		// `a < b` is `b > a` with the operands swapped into the gt intrinsic.
		cast!(self.avx2._mm256_cmpgt_epi16(cast!(b), cast!(a)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_i32x8(self, a: i32x8, b: i32x8) -> m32x8 {
		cast!(self.avx2._mm256_cmpgt_epi32(cast!(b), cast!(a)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_i64x4(self, a: i64x4, b: i64x4) -> m64x4 {
		cast!(self.avx2._mm256_cmpgt_epi64(cast!(b), cast!(a)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_i8x32(self, a: i8x32, b: i8x32) -> m8x32 {
		cast!(self.avx2._mm256_cmpgt_epi8(cast!(b), cast!(a)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_u16x16(self, a: u16x16, b: u16x16) -> m16x16 {
		// Flip the sign bit of both operands so the signed compare implements
		// the unsigned ordering (see `cmp_gt_u16x16` for the same trick).
		let k = self.splat_u16x16(0x8000);
		self.cmp_lt_i16x16(cast!(self.xor_u16x16(a, k)), cast!(self.xor_u16x16(b, k)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_u32x8(self, a: u32x8, b: u32x8) -> m32x8 {
		let k = self.splat_u32x8(0x80000000);
		self.cmp_lt_i32x8(cast!(self.xor_u32x8(a, k)), cast!(self.xor_u32x8(b, k)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_u64x4(self, a: u64x4, b: u64x4) -> m64x4 {
		let k = self.splat_u64x4(0x8000000000000000);
		self.cmp_lt_i64x4(cast!(self.xor_u64x4(a, k)), cast!(self.xor_u64x4(b, k)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_u8x32(self, a: u8x32, b: u8x32) -> m8x32 {
		let k = self.splat_u8x32(0x80);
		self.cmp_lt_i8x32(cast!(self.xor_u8x32(a, k)), cast!(self.xor_u8x32(b, k)))
	}
3463
	// All `cmp_not_*` predicates use the unordered-quiet (`_UQ`) forms: a NaN in
	// either operand makes the lane TRUE, which is what distinguishes e.g.
	// `cmp_not_lt` from `cmp_ge` (ordered, NaN => false).
	/// Compares the elements in each lane of `a` and `b` for inequality.
	#[inline(always)]
	pub fn cmp_not_eq_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
		cast!(self.avx._mm256_cmp_ps::<_CMP_NEQ_UQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for inequality.
	#[inline(always)]
	pub fn cmp_not_eq_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
		cast!(self.avx._mm256_cmp_pd::<_CMP_NEQ_UQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-greater-than-or-equal.
	#[inline(always)]
	pub fn cmp_not_ge_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
		cast!(self.avx._mm256_cmp_ps::<_CMP_NGE_UQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-greater-than-or-equal.
	#[inline(always)]
	pub fn cmp_not_ge_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
		cast!(self.avx._mm256_cmp_pd::<_CMP_NGE_UQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-greater-than.
	#[inline(always)]
	pub fn cmp_not_gt_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
		cast!(self.avx._mm256_cmp_ps::<_CMP_NGT_UQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-greater-than.
	#[inline(always)]
	pub fn cmp_not_gt_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
		cast!(self.avx._mm256_cmp_pd::<_CMP_NGT_UQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-less-than-or-equal.
	#[inline(always)]
	pub fn cmp_not_le_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
		cast!(self.avx._mm256_cmp_ps::<_CMP_NLE_UQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-less-than-or-equal.
	#[inline(always)]
	pub fn cmp_not_le_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
		cast!(self.avx._mm256_cmp_pd::<_CMP_NLE_UQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-less-than.
	#[inline(always)]
	pub fn cmp_not_lt_f32x8(self, a: f32x8, b: f32x8) -> m32x8 {
		cast!(self.avx._mm256_cmp_ps::<_CMP_NLT_UQ>(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-less-than.
	#[inline(always)]
	pub fn cmp_not_lt_f64x4(self, a: f64x4, b: f64x4) -> m64x4 {
		cast!(self.avx._mm256_cmp_pd::<_CMP_NLT_UQ>(cast!(a), cast!(b)))
	}
3523
	/// Converts a `f32x4` to `f64x4`, elementwise.
	#[inline(always)]
	pub fn convert_f32x4_to_f64x4(self, a: f32x4) -> f64x4 {
		cast!(self.avx._mm256_cvtps_pd(cast!(a)))
	}

	/// Converts a `f32x8` to `i32x8`, elementwise.
	///
	/// Rounds towards zero (`cvtt` = truncating conversion).
	#[inline(always)]
	pub fn convert_f32x8_to_i32x8(self, a: f32x8) -> i32x8 {
		cast!(self.avx._mm256_cvttps_epi32(cast!(a)))
	}

	/// Converts a `f64x4` to `f32x4`, elementwise.
	#[inline(always)]
	pub fn convert_f64x4_to_f32x4(self, a: f64x4) -> f32x4 {
		cast!(self.avx._mm256_cvtpd_ps(cast!(a)))
	}

	/// Converts a `f64x4` to `i32x4`, elementwise.
	///
	/// Rounds towards zero (`cvtt` = truncating conversion).
	#[inline(always)]
	pub fn convert_f64x4_to_i32x4(self, a: f64x4) -> i32x4 {
		cast!(self.avx._mm256_cvttpd_epi32(cast!(a)))
	}

	/// Converts a `i16x16` to `u16x16`, elementwise.
	///
	/// Pure bit reinterpretation; negative values wrap to large unsigned values.
	#[inline(always)]
	pub fn convert_i16x16_to_u16x16(self, a: i16x16) -> u16x16 {
		cast!(a)
	}

	/// Converts a `i16x8` to `i32x8`, elementwise.
	#[inline(always)]
	pub fn convert_i16x8_to_i32x8(self, a: i16x8) -> i32x8 {
		cast!(self.avx2._mm256_cvtepi16_epi32(cast!(a)))
	}

	/// Converts a `i16x8` to `i64x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i16x8_to_i64x4(self, a: i16x8) -> i64x4 {
		cast!(self.avx2._mm256_cvtepi16_epi64(cast!(a)))
	}

	/// Converts a `i16x8` to `u32x8`, elementwise.
	///
	/// Sign-extends (`cvtepi*`) then reinterprets as unsigned, so negative inputs
	/// become large unsigned values.
	#[inline(always)]
	pub fn convert_i16x8_to_u32x8(self, a: i16x8) -> u32x8 {
		cast!(self.avx2._mm256_cvtepi16_epi32(cast!(a)))
	}

	/// Converts a `i16x8` to `u64x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i16x8_to_u64x4(self, a: i16x8) -> u64x4 {
		cast!(self.avx2._mm256_cvtepi16_epi64(cast!(a)))
	}

	/// Converts a `i32x4` to `f64x4`, elementwise.
	#[inline(always)]
	pub fn convert_i32x4_to_f64x4(self, a: i32x4) -> f64x4 {
		cast!(self.avx._mm256_cvtepi32_pd(cast!(a)))
	}

	/// Converts a `i32x4` to `i64x4`, elementwise.
	#[inline(always)]
	pub fn convert_i32x4_to_i64x4(self, a: i32x4) -> i64x4 {
		cast!(self.avx2._mm256_cvtepi32_epi64(cast!(a)))
	}

	/// Converts a `i32x4` to `u64x4`, elementwise.
	#[inline(always)]
	pub fn convert_i32x4_to_u64x4(self, a: i32x4) -> u64x4 {
		cast!(self.avx2._mm256_cvtepi32_epi64(cast!(a)))
	}

	/// Converts a `i32x8` to `f32x8`, elementwise.
	#[inline(always)]
	pub fn convert_i32x8_to_f32x8(self, a: i32x8) -> f32x8 {
		cast!(self.avx._mm256_cvtepi32_ps(cast!(a)))
	}

	/// Converts a `i32x8` to `u32x8`, elementwise.
	///
	/// Pure bit reinterpretation.
	#[inline(always)]
	pub fn convert_i32x8_to_u32x8(self, a: i32x8) -> u32x8 {
		cast!(a)
	}

	/// Converts a `i8x16` to `i16x16`, elementwise.
	#[inline(always)]
	pub fn convert_i8x16_to_i16x16(self, a: i8x16) -> i16x16 {
		cast!(self.avx2._mm256_cvtepi8_epi16(cast!(a)))
	}

	/// Converts a `i8x16` to `i32x8`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i8x16_to_i32x8(self, a: i8x16) -> i32x8 {
		cast!(self.avx2._mm256_cvtepi8_epi32(cast!(a)))
	}

	/// Converts a `i8x16` to `i64x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i8x16_to_i64x4(self, a: i8x16) -> i64x4 {
		cast!(self.avx2._mm256_cvtepi8_epi64(cast!(a)))
	}

	/// Converts a `i8x16` to `u16x16`, elementwise.
	#[inline(always)]
	pub fn convert_i8x16_to_u16x16(self, a: i8x16) -> u16x16 {
		cast!(self.avx2._mm256_cvtepi8_epi16(cast!(a)))
	}

	/// Converts a `i8x16` to `u32x8`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i8x16_to_u32x8(self, a: i8x16) -> u32x8 {
		cast!(self.avx2._mm256_cvtepi8_epi32(cast!(a)))
	}

	/// Converts a `i8x16` to `u64x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i8x16_to_u64x4(self, a: i8x16) -> u64x4 {
		cast!(self.avx2._mm256_cvtepi8_epi64(cast!(a)))
	}

	/// Converts a `i8x32` to `u8x32`, elementwise.
	///
	/// Pure bit reinterpretation.
	#[inline(always)]
	pub fn convert_i8x32_to_u8x32(self, a: i8x32) -> u8x32 {
		cast!(a)
	}
3649
	// Widening conversions from unsigned sources use the `cvtepu*` intrinsics,
	// which zero-extend; the same-width conversions are pure bit
	// reinterpretations (`cast!` only).
	/// Converts a `u16x16` to `i16x16`, elementwise.
	#[inline(always)]
	pub fn convert_u16x16_to_i16x16(self, a: u16x16) -> i16x16 {
		cast!(a)
	}

	/// Converts a `u16x8` to `i32x8`, elementwise.
	#[inline(always)]
	pub fn convert_u16x8_to_i32x8(self, a: u16x8) -> i32x8 {
		cast!(self.avx2._mm256_cvtepu16_epi32(cast!(a)))
	}

	/// Converts a `u16x8` to `i64x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u16x8_to_i64x4(self, a: u16x8) -> i64x4 {
		cast!(self.avx2._mm256_cvtepu16_epi64(cast!(a)))
	}

	/// Converts a `u16x8` to `u32x8`, elementwise.
	#[inline(always)]
	pub fn convert_u16x8_to_u32x8(self, a: u16x8) -> u32x8 {
		cast!(self.avx2._mm256_cvtepu16_epi32(cast!(a)))
	}

	/// Converts a `u16x8` to `u64x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u16x8_to_u64x4(self, a: u16x8) -> u64x4 {
		cast!(self.avx2._mm256_cvtepu16_epi64(cast!(a)))
	}

	/// Converts a `u32x4` to `i64x4`, elementwise.
	#[inline(always)]
	pub fn convert_u32x4_to_i64x4(self, a: u32x4) -> i64x4 {
		cast!(self.avx2._mm256_cvtepu32_epi64(cast!(a)))
	}

	/// Converts a `u32x4` to `u64x4`, elementwise.
	#[inline(always)]
	pub fn convert_u32x4_to_u64x4(self, a: u32x4) -> u64x4 {
		cast!(self.avx2._mm256_cvtepu32_epi64(cast!(a)))
	}

	/// Converts a `u32x8` to `i32x8`, elementwise.
	#[inline(always)]
	pub fn convert_u32x8_to_i32x8(self, a: u32x8) -> i32x8 {
		cast!(a)
	}

	/// Converts a `u8x16` to `i16x16`, elementwise.
	#[inline(always)]
	pub fn convert_u8x16_to_i16x16(self, a: u8x16) -> i16x16 {
		cast!(self.avx2._mm256_cvtepu8_epi16(cast!(a)))
	}

	/// Converts a `u8x16` to `i32x8`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u8x16_to_i32x8(self, a: u8x16) -> i32x8 {
		cast!(self.avx2._mm256_cvtepu8_epi32(cast!(a)))
	}

	/// Converts a `u8x16` to `i64x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u8x16_to_i64x4(self, a: u8x16) -> i64x4 {
		cast!(self.avx2._mm256_cvtepu8_epi64(cast!(a)))
	}

	/// Converts a `u8x16` to `u16x16`, elementwise.
	#[inline(always)]
	pub fn convert_u8x16_to_u16x16(self, a: u8x16) -> u16x16 {
		cast!(self.avx2._mm256_cvtepu8_epi16(cast!(a)))
	}

	/// Converts a `u8x16` to `u32x8`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u8x16_to_u32x8(self, a: u8x16) -> u32x8 {
		cast!(self.avx2._mm256_cvtepu8_epi32(cast!(a)))
	}

	/// Converts a `u8x16` to `u64x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u8x16_to_u64x4(self, a: u8x16) -> u64x4 {
		cast!(self.avx2._mm256_cvtepu8_epi64(cast!(a)))
	}

	/// Converts a `u8x32` to `i8x32`, elementwise.
	#[inline(always)]
	pub fn convert_u8x32_to_i8x32(self, a: u8x32) -> i8x32 {
		cast!(a)
	}
3739
3740	/// Divides the elements of each lane of `a` and `b`.
3741	#[inline(always)]
3742	pub fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
3743		cast!(self.avx._mm256_div_ps(cast!(a), cast!(b)))
3744	}
3745
3746	/// Divides the elements of each lane of `a` and `b`.
3747	#[inline(always)]
3748	pub fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
3749		cast!(self.avx._mm256_div_pd(cast!(a), cast!(b)))
3750	}
3751
3752	/// Rounds the elements of each lane of `a` to the nearest integer towards negative infinity.
3753	#[inline(always)]
3754	pub fn floor_f32x8(self, a: f32x8) -> f32x8 {
3755		cast!(self.avx._mm256_floor_ps(cast!(a)))
3756	}
3757
3758	/// Rounds the elements of each lane of `a` to the nearest integer towards negative infinity.
3759	#[inline(always)]
3760	pub fn floor_f64x4(self, a: f64x4) -> f64x4 {
3761		cast!(self.avx._mm256_floor_pd(cast!(a)))
3762	}
3763
	/// See [_mm256_hadd_ps].
	///
	/// [_mm256_hadd_ps]: core::arch::x86_64::_mm256_hadd_ps
	// Fixed doc link: this wrapper calls the 256-bit `_mm256_hadd_ps`, not the
	// 128-bit `_mm_hadd_ps` the doc previously pointed at.
	#[inline(always)]
	pub fn horizontal_add_pack_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
		cast!(self.avx._mm256_hadd_ps(cast!(a), cast!(b)))
	}

	/// See [_mm256_hadd_pd].
	///
	/// [_mm256_hadd_pd]: core::arch::x86_64::_mm256_hadd_pd
	#[inline(always)]
	pub fn horizontal_add_pack_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
		cast!(self.avx._mm256_hadd_pd(cast!(a), cast!(b)))
	}
3779
	/// See [_mm256_hadd_epi16].
	///
	/// [_mm256_hadd_epi16]: core::arch::x86_64::_mm256_hadd_epi16
	// Fixed doc link: the code calls the 256-bit `_mm256_hadd_epi16`, not the
	// 128-bit `_mm_hadd_epi16` the doc previously pointed at.
	#[inline(always)]
	pub fn horizontal_add_pack_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
		cast!(self.avx2._mm256_hadd_epi16(cast!(a), cast!(b)))
	}

	/// See [_mm256_hadd_epi32].
	///
	/// [_mm256_hadd_epi32]: core::arch::x86_64::_mm256_hadd_epi32
	#[inline(always)]
	pub fn horizontal_add_pack_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
		cast!(self.avx2._mm256_hadd_epi32(cast!(a), cast!(b)))
	}
3795
3796	/// See [_mm_hadds_epi16].
3797	///
3798	/// [_mm_hadds_epi16]: core::arch::x86_64::_mm_hadds_epi16
3799	#[inline(always)]
3800	pub fn horizontal_saturating_add_pack_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
3801		cast!(self.avx2._mm256_hadds_epi16(cast!(a), cast!(b)))
3802	}
3803
3804	/// See [_mm_hsubs_epi16].
3805	///
3806	/// [_mm_hsubs_epi16]: core::arch::x86_64::_mm_hsubs_epi16
3807	#[inline(always)]
3808	pub fn horizontal_saturating_sub_pack_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
3809		cast!(self.avx2._mm256_hsubs_epi16(cast!(a), cast!(b)))
3810	}
3811
3812	/// See [_mm256_hsub_ps].
3813	///
3814	/// [_mm256_hsub_ps]: core::arch::x86_64::_mm256_hsub_ps
3815	#[inline(always)]
3816	pub fn horizontal_sub_pack_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
3817		cast!(self.avx._mm256_hsub_ps(cast!(a), cast!(b)))
3818	}
3819
3820	/// See [_mm256_hsub_pd].
3821	///
3822	/// [_mm256_hsub_pd]: core::arch::x86_64::_mm256_hsub_pd
3823	#[inline(always)]
3824	pub fn horizontal_sub_pack_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
3825		cast!(self.avx._mm256_hsub_pd(cast!(a), cast!(b)))
3826	}
3827
3828	/// See [_mm256_hsub_epi16].
3829	///
3830	/// [_mm256_hsub_epi16]: core::arch::x86_64::_mm256_hsub_epi16
3831	#[inline(always)]
3832	pub fn horizontal_sub_pack_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
3833		cast!(self.avx2._mm256_hsub_epi16(cast!(a), cast!(b)))
3834	}
3835
3836	/// See [_mm256_hsub_epi32].
3837	///
3838	/// [_mm256_hsub_epi32]: core::arch::x86_64::_mm256_hsub_epi32
3839	#[inline(always)]
3840	pub fn horizontal_sub_pack_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
3841		cast!(self.avx2._mm256_hsub_epi32(cast!(a), cast!(b)))
3842	}
3843
3844	/// Checks if the elements in each lane of `a` are NaN.
3845	#[inline(always)]
3846	pub fn is_nan_f32x8(self, a: f32x8) -> m32x8 {
3847		cast!(self.avx._mm256_cmp_ps::<_CMP_UNORD_Q>(cast!(a), cast!(a)))
3848	}
3849
3850	/// Checks if the elements in each lane of `a` are NaN.
3851	#[inline(always)]
3852	pub fn is_nan_f64x4(self, a: f64x4) -> m64x4 {
3853		cast!(self.avx._mm256_cmp_pd::<_CMP_UNORD_Q>(cast!(a), cast!(a)))
3854	}
3855
3856	/// Checks if the elements in each lane of `a` are not NaN.
3857	#[inline(always)]
3858	pub fn is_not_nan_f32x8(self, a: f32x8) -> m32x8 {
3859		cast!(self.avx._mm256_cmp_ps::<_CMP_ORD_Q>(cast!(a), cast!(a)))
3860	}
3861
3862	/// Checks if the elements in each lane of `a` are not NaN.
3863	#[inline(always)]
3864	pub fn is_not_nan_f64x4(self, a: f64x4) -> m64x4 {
3865		cast!(self.avx._mm256_cmp_pd::<_CMP_ORD_Q>(cast!(a), cast!(a)))
3866	}
3867
3868	/// Computes `max(a, b)`. for each lane in `a` and `b`.
3869	#[inline(always)]
3870	pub fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
3871		cast!(self.avx._mm256_max_ps(cast!(a), cast!(b)))
3872	}
3873
3874	/// Computes `max(a, b)`. for each lane in `a` and `b`.
3875	#[inline(always)]
3876	pub fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
3877		cast!(self.avx._mm256_max_pd(cast!(a), cast!(b)))
3878	}
3879
3880	/// Computes `max(a, b)`. for each lane in `a` and `b`.
3881	#[inline(always)]
3882	pub fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
3883		cast!(self.avx2._mm256_max_epi16(cast!(a), cast!(b)))
3884	}
3885
3886	/// Computes `max(a, b)`. for each lane in `a` and `b`.
3887	#[inline(always)]
3888	pub fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
3889		cast!(self.avx2._mm256_max_epi32(cast!(a), cast!(b)))
3890	}
3891
3892	/// Computes `max(a, b)`. for each lane in `a` and `b`.
3893	#[inline(always)]
3894	pub fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
3895		cast!(self.avx2._mm256_max_epi8(cast!(a), cast!(b)))
3896	}
3897
3898	/// Computes `max(a, b)`. for each lane in `a` and `b`.
3899	#[inline(always)]
3900	pub fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
3901		cast!(self.avx2._mm256_max_epu16(cast!(a), cast!(b)))
3902	}
3903
3904	/// Computes `max(a, b)`. for each lane in `a` and `b`.
3905	#[inline(always)]
3906	pub fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
3907		cast!(self.avx2._mm256_max_epu32(cast!(a), cast!(b)))
3908	}
3909
3910	/// Computes `max(a, b)`. for each lane in `a` and `b`.
3911	#[inline(always)]
3912	pub fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
3913		cast!(self.avx2._mm256_max_epu8(cast!(a), cast!(b)))
3914	}
3915
3916	/// Computes `min(a, b)`. for each lane in `a` and `b`.
3917	#[inline(always)]
3918	pub fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
3919		cast!(self.avx._mm256_min_ps(cast!(a), cast!(b)))
3920	}
3921
3922	/// Computes `min(a, b)`. for each lane in `a` and `b`.
3923	#[inline(always)]
3924	pub fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
3925		cast!(self.avx._mm256_min_pd(cast!(a), cast!(b)))
3926	}
3927
3928	/// Computes `min(a, b)`. for each lane in `a` and `b`.
3929	#[inline(always)]
3930	pub fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
3931		cast!(self.avx2._mm256_min_epi16(cast!(a), cast!(b)))
3932	}
3933
3934	/// Computes `min(a, b)`. for each lane in `a` and `b`.
3935	#[inline(always)]
3936	pub fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
3937		cast!(self.avx2._mm256_min_epi32(cast!(a), cast!(b)))
3938	}
3939
3940	/// Computes `min(a, b)`. for each lane in `a` and `b`.
3941	#[inline(always)]
3942	pub fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
3943		cast!(self.avx2._mm256_min_epi8(cast!(a), cast!(b)))
3944	}
3945
3946	/// Computes `min(a, b)`. for each lane in `a` and `b`.
3947	#[inline(always)]
3948	pub fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
3949		cast!(self.avx2._mm256_min_epu16(cast!(a), cast!(b)))
3950	}
3951
3952	/// Computes `min(a, b)`. for each lane in `a` and `b`.
3953	#[inline(always)]
3954	pub fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
3955		cast!(self.avx2._mm256_min_epu32(cast!(a), cast!(b)))
3956	}
3957
3958	/// Computes `min(a, b)`. for each lane in `a` and `b`.
3959	#[inline(always)]
3960	pub fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
3961		cast!(self.avx2._mm256_min_epu8(cast!(a), cast!(b)))
3962	}
3963
3964	/// Multiplies the elements in each lane of `a` and `b`, and adds the results to each lane of
3965	/// `c`.
3966	#[inline(always)]
3967	pub fn mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 {
3968		cast!(self.fma._mm_fmadd_ps(cast!(a), cast!(b), cast!(c)))
3969	}
3970
3971	/// Multiplies the elements in each lane of `a` and `b`, and adds the results to each lane of
3972	/// `c`.
3973	#[inline(always)]
3974	pub fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 {
3975		cast!(self.fma._mm256_fmadd_ps(cast!(a), cast!(b), cast!(c)))
3976	}
3977
3978	/// Multiplies the elements in each lane of `a` and `b`, and adds the results to each lane of
3979	/// `c`.
3980	#[inline(always)]
3981	pub fn mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 {
3982		cast!(self.fma._mm_fmadd_pd(cast!(a), cast!(b), cast!(c)))
3983	}
3984
3985	/// Multiplies the elements in each lane of `a` and `b`, and adds the results to each lane of
3986	/// `c`.
3987	#[inline(always)]
3988	pub fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 {
3989		cast!(self.fma._mm256_fmadd_pd(cast!(a), cast!(b), cast!(c)))
3990	}
3991
3992	/// Multiplies the elements in each lane of `a` and `b`, and alternatively adds/subtracts 'c'
3993	/// to/from the results.
3994	#[inline(always)]
3995	pub fn mul_addsub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 {
3996		cast!(self.fma._mm_fmsubadd_ps(cast!(a), cast!(b), cast!(c)))
3997	}
3998
3999	/// Multiplies the elements in each lane of `a` and `b`, and alternatively adds/subtracts 'c'
4000	/// to/from the results.
4001	#[inline(always)]
4002	pub fn mul_addsub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 {
4003		cast!(self.fma._mm256_fmsubadd_ps(cast!(a), cast!(b), cast!(c)))
4004	}
4005
4006	/// Multiplies the elements in each lane of `a` and `b`, and alternatively adds/subtracts 'c'
4007	/// to/from the results.
4008	#[inline(always)]
4009	pub fn mul_addsub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 {
4010		cast!(self.fma._mm_fmsubadd_pd(cast!(a), cast!(b), cast!(c)))
4011	}
4012
4013	/// Multiplies the elements in each lane of `a` and `b`, and alternatively adds/subtracts 'c'
4014	/// to/from the results.
4015	#[inline(always)]
4016	pub fn mul_addsub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 {
4017		cast!(self.fma._mm256_fmsubadd_pd(cast!(a), cast!(b), cast!(c)))
4018	}
4019
4020	/// Computes `a * b` for each lane in `a` and `b`.
4021	#[inline(always)]
4022	pub fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
4023		cast!(self.avx._mm256_mul_ps(cast!(a), cast!(b)))
4024	}
4025
4026	/// Computes `a * b` for each lane in `a` and `b`.
4027	#[inline(always)]
4028	pub fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
4029		cast!(self.avx._mm256_mul_pd(cast!(a), cast!(b)))
4030	}
4031
4032	/// Multiplies the elements in each lane of `a` and `b`, and subtracts each lane of `c` from
4033	/// the results.
4034	#[inline(always)]
4035	pub fn mul_sub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 {
4036		cast!(self.fma._mm_fmsub_ps(cast!(a), cast!(b), cast!(c)))
4037	}
4038
4039	/// Multiplies the elements in each lane of `a` and `b`, and subtracts each lane of `c` from
4040	/// the results.
4041	#[inline(always)]
4042	pub fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 {
4043		cast!(self.fma._mm256_fmsub_ps(cast!(a), cast!(b), cast!(c)))
4044	}
4045
4046	/// Multiplies the elements in each lane of `a` and `b`, and subtracts each lane of `c` from
4047	/// the results.
4048	#[inline(always)]
4049	pub fn mul_sub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 {
4050		cast!(self.fma._mm_fmsub_pd(cast!(a), cast!(b), cast!(c)))
4051	}
4052
4053	/// Multiplies the elements in each lane of `a` and `b`, and subtracts each lane of `c` from
4054	/// the results.
4055	#[inline(always)]
4056	pub fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 {
4057		cast!(self.fma._mm256_fmsub_pd(cast!(a), cast!(b), cast!(c)))
4058	}
4059
4060	/// Multiplies the elements in each lane of `a` and `b`, and alternatively subtracts/adds 'c'
4061	/// to/from the results.
4062	#[inline(always)]
4063	pub fn mul_subadd_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 {
4064		cast!(self.fma._mm_fmaddsub_ps(cast!(a), cast!(b), cast!(c)))
4065	}
4066
4067	/// Multiplies the elements in each lane of `a` and `b`, and alternatively subtracts/adds 'c'
4068	/// to/from the results.
4069	#[inline(always)]
4070	pub fn mul_subadd_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 {
4071		cast!(self.fma._mm256_fmaddsub_ps(cast!(a), cast!(b), cast!(c)))
4072	}
4073
4074	/// Multiplies the elements in each lane of `a` and `b`, and alternatively subtracts/adds 'c'
4075	/// to/from the results.
4076	#[inline(always)]
4077	pub fn mul_subadd_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 {
4078		cast!(self.fma._mm_fmaddsub_pd(cast!(a), cast!(b), cast!(c)))
4079	}
4080
4081	/// Multiplies the elements in each lane of `a` and `b`, and alternatively subtracts/adds 'c'
4082	/// to/from the results.
4083	#[inline(always)]
4084	pub fn mul_subadd_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 {
4085		cast!(self.fma._mm256_fmaddsub_pd(cast!(a), cast!(b), cast!(c)))
4086	}
4087
4088	/// See [_mm256_maddubs_epi16].
4089	///
4090	/// [_mm256_maddubs_epi16]: core::arch::x86_64::_mm256_maddubs_epi16
4091	#[inline(always)]
4092	pub fn multiply_saturating_add_adjacent_i8x32(self, a: i8x32, b: i8x32) -> i16x16 {
4093		cast!(self.avx2._mm256_maddubs_epi16(cast!(a), cast!(b)))
4094	}
4095
4096	/// See [_mm256_madd_epi16].
4097	///
4098	/// [_mm256_madd_epi16]: core::arch::x86_64::_mm256_madd_epi16
4099	#[inline(always)]
4100	pub fn multiply_wrapping_add_adjacent_i16x16(self, a: i16x16, b: i16x16) -> i32x8 {
4101		cast!(self.avx2._mm256_madd_epi16(cast!(a), cast!(b)))
4102	}
4103
4104	/// See [_mm256_mpsadbw_epu8].
4105	///
4106	/// [_mm256_mpsadbw_epu8]: core::arch::x86_64::_mm256_mpsadbw_epu8
4107	#[inline(always)]
4108	pub fn multisum_of_absolute_differences_u8x32<const OFFSETS: i32>(
4109		self,
4110		a: u8x32,
4111		b: u8x32,
4112	) -> u16x16 {
4113		cast!(self.avx2._mm256_mpsadbw_epu8::<OFFSETS>(cast!(a), cast!(b)))
4114	}
4115
4116	/// Multiplies the elements in each lane of `a` and `b`, negates the results, and adds them to
4117	/// each lane of `c`.
4118	#[inline(always)]
4119	pub fn negate_mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 {
4120		cast!(self.fma._mm_fnmadd_ps(cast!(a), cast!(b), cast!(c)))
4121	}
4122
4123	/// Multiplies the elements in each lane of `a` and `b`, negates the results, and adds them to
4124	/// each lane of `c`.
4125	#[inline(always)]
4126	pub fn negate_mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 {
4127		cast!(self.fma._mm256_fnmadd_ps(cast!(a), cast!(b), cast!(c)))
4128	}
4129
4130	/// Multiplies the elements in each lane of `a` and `b`, negates the results, and adds them to
4131	/// each lane of `c`.
4132	#[inline(always)]
4133	pub fn negate_mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 {
4134		cast!(self.fma._mm_fnmadd_pd(cast!(a), cast!(b), cast!(c)))
4135	}
4136
4137	/// Multiplies the elements in each lane of `a` and `b`, negates the results, and adds them to
4138	/// each lane of `c`.
4139	#[inline(always)]
4140	pub fn negate_mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 {
4141		cast!(self.fma._mm256_fnmadd_pd(cast!(a), cast!(b), cast!(c)))
4142	}
4143
4144	/// Multiplies the elements in each lane of `a` and `b`, and subtracts each lane of `c` from
4145	/// the negation of the results.
4146	#[inline(always)]
4147	pub fn negate_mul_sub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 {
4148		cast!(self.fma._mm_fnmsub_ps(cast!(a), cast!(b), cast!(c)))
4149	}
4150
4151	/// Multiplies the elements in each lane of `a` and `b`, and subtracts each lane of `c` from
4152	/// the negation of the results.
4153	#[inline(always)]
4154	pub fn negate_mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 {
4155		cast!(self.fma._mm256_fnmsub_ps(cast!(a), cast!(b), cast!(c)))
4156	}
4157
4158	/// Multiplies the elements in each lane of `a` and `b`, and subtracts each lane of `c` from
4159	/// the negation of the results.
4160	#[inline(always)]
4161	pub fn negate_mul_sub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 {
4162		cast!(self.fma._mm_fnmsub_pd(cast!(a), cast!(b), cast!(c)))
4163	}
4164
4165	/// Multiplies the elements in each lane of `a` and `b`, and subtracts each lane of `c` from
4166	/// the negation of the results.
4167	#[inline(always)]
4168	pub fn negate_mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 {
4169		cast!(self.fma._mm256_fnmsub_pd(cast!(a), cast!(b), cast!(c)))
4170	}
4171
4172	/// Returns `!a` for each bit in a.
4173	#[inline(always)]
4174	pub fn not_i16x16(self, a: i16x16) -> i16x16 {
4175		self.xor_i16x16(a, self.splat_i16x16(!0))
4176	}
4177
4178	/// Returns `!a` for each bit in a.
4179	#[inline(always)]
4180	pub fn not_i32x8(self, a: i32x8) -> i32x8 {
4181		self.xor_i32x8(a, self.splat_i32x8(!0))
4182	}
4183
4184	/// Returns `!a` for each bit in a.
4185	#[inline(always)]
4186	pub fn not_i64x4(self, a: i64x4) -> i64x4 {
4187		self.xor_i64x4(a, self.splat_i64x4(!0))
4188	}
4189
4190	/// Returns `!a` for each bit in a.
4191	#[inline(always)]
4192	pub fn not_i8x32(self, a: i8x32) -> i8x32 {
4193		self.xor_i8x32(a, self.splat_i8x32(!0))
4194	}
4195
4196	/// Returns `!a` for each bit in a.
4197	#[inline(always)]
4198	pub fn not_m16x16(self, a: m16x16) -> m16x16 {
4199		self.xor_m16x16(a, self.splat_m16x16(m16::new(true)))
4200	}
4201
4202	/// Returns `!a` for each bit in a.
4203	#[inline(always)]
4204	pub fn not_m32x8(self, a: m32x8) -> m32x8 {
4205		self.xor_m32x8(a, self.splat_m32x8(m32::new(true)))
4206	}
4207
4208	/// Returns `!a` for each bit in a.
4209	#[inline(always)]
4210	pub fn not_m64x4(self, a: m64x4) -> m64x4 {
4211		self.xor_m64x4(a, self.splat_m64x4(m64::new(true)))
4212	}
4213
4214	/// Returns `!a` for each bit in a.
4215	#[inline(always)]
4216	pub fn not_m8x32(self, a: m8x32) -> m8x32 {
4217		self.xor_m8x32(a, self.splat_m8x32(m8::new(true)))
4218	}
4219
4220	/// Returns `!a` for each bit in a.
4221	#[inline(always)]
4222	pub fn not_u16x16(self, a: u16x16) -> u16x16 {
4223		self.xor_u16x16(a, self.splat_u16x16(!0))
4224	}
4225
4226	/// Returns `!a` for each bit in a.
4227	#[inline(always)]
4228	pub fn not_u32x8(self, a: u32x8) -> u32x8 {
4229		self.xor_u32x8(a, self.splat_u32x8(!0))
4230	}
4231
4232	/// Returns `!a` for each bit in a.
4233	#[inline(always)]
4234	pub fn not_u64x4(self, a: u64x4) -> u64x4 {
4235		self.xor_u64x4(a, self.splat_u64x4(!0))
4236	}
4237
4238	/// Returns `!a` for each bit in a.
4239	#[inline(always)]
4240	pub fn not_u8x32(self, a: u8x32) -> u8x32 {
4241		self.xor_u8x32(a, self.splat_u8x32(!0))
4242	}
4243
4244	/// Returns `a | b` for each bit in `a` and `b`.
4245	#[inline(always)]
4246	pub fn or_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
4247		cast!(self.avx._mm256_or_ps(cast!(a), cast!(b)))
4248	}
4249
4250	/// Returns `a | b` for each bit in `a` and `b`.
4251	#[inline(always)]
4252	pub fn or_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
4253		cast!(self.avx._mm256_or_pd(cast!(a), cast!(b)))
4254	}
4255
4256	/// Returns `a | b` for each bit in `a` and `b`.
4257	#[inline(always)]
4258	pub fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
4259		cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
4260	}
4261
4262	/// Returns `a | b` for each bit in `a` and `b`.
4263	#[inline(always)]
4264	pub fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
4265		cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
4266	}
4267
4268	/// Returns `a | b` for each bit in `a` and `b`.
4269	#[inline(always)]
4270	pub fn or_i64x4(self, a: i64x4, b: i64x4) -> i64x4 {
4271		cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
4272	}
4273
4274	/// Returns `a | b` for each bit in `a` and `b`.
4275	#[inline(always)]
4276	pub fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
4277		cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
4278	}
4279
4280	/// Returns `a | b` for each bit in `a` and `b`.
4281	#[inline(always)]
4282	pub fn or_m16x16(self, a: m16x16, b: m16x16) -> m16x16 {
4283		cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
4284	}
4285
4286	/// Returns `a | b` for each bit in `a` and `b`.
4287	#[inline(always)]
4288	pub fn or_m32x8(self, a: m32x8, b: m32x8) -> m32x8 {
4289		cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
4290	}
4291
4292	/// Returns `a | b` for each bit in `a` and `b`.
4293	#[inline(always)]
4294	pub fn or_m64x4(self, a: m64x4, b: m64x4) -> m64x4 {
4295		cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
4296	}
4297
4298	/// Returns `a | b` for each bit in `a` and `b`.
4299	#[inline(always)]
4300	pub fn or_m8x32(self, a: m8x32, b: m8x32) -> m8x32 {
4301		cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
4302	}
4303
4304	/// Returns `a | b` for each bit in `a` and `b`.
4305	#[inline(always)]
4306	pub fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
4307		cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
4308	}
4309
4310	/// Returns `a | b` for each bit in `a` and `b`.
4311	#[inline(always)]
4312	pub fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
4313		cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
4314	}
4315
4316	/// Returns `a | b` for each bit in `a` and `b`.
4317	#[inline(always)]
4318	pub fn or_u64x4(self, a: u64x4, b: u64x4) -> u64x4 {
4319		cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
4320	}
4321
4322	/// Returns `a | b` for each bit in `a` and `b`.
4323	#[inline(always)]
4324	pub fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
4325		cast!(self.avx2._mm256_or_si256(cast!(a), cast!(b)))
4326	}
4327
4328	/// See [_mm256_packs_epi16].
4329	///
4330	/// [_mm256_packs_epi16]: core::arch::x86_64::_mm256_packs_epi16
4331	#[inline(always)]
4332	pub fn pack_with_signed_saturation_i16x16(self, a: i16x16, b: i16x16) -> i8x32 {
4333		cast!(self.avx2._mm256_packs_epi16(cast!(a), cast!(b)))
4334	}
4335
4336	/// See [_mm256_packs_epi32].
4337	///
4338	/// [_mm256_packs_epi32]: core::arch::x86_64::_mm256_packs_epi32
4339	#[inline(always)]
4340	pub fn pack_with_signed_saturation_i32x8(self, a: i32x8, b: i32x8) -> i16x16 {
4341		cast!(self.avx2._mm256_packs_epi32(cast!(a), cast!(b)))
4342	}
4343
4344	/// See [_mm256_packus_epi16].
4345	///
4346	/// [_mm256_packus_epi16]: core::arch::x86_64::_mm256_packus_epi16
4347	#[inline(always)]
4348	pub fn pack_with_unsigned_saturation_i16x16(self, a: i16x16, b: i16x16) -> u8x32 {
4349		cast!(self.avx2._mm256_packus_epi16(cast!(a), cast!(b)))
4350	}
4351
4352	/// See [_mm256_packus_epi32].
4353	///
4354	/// [_mm256_packus_epi32]: core::arch::x86_64::_mm256_packus_epi32
4355	#[inline(always)]
4356	pub fn pack_with_unsigned_saturation_i32x8(self, a: i32x8, b: i32x8) -> u16x16 {
4357		cast!(self.avx2._mm256_packus_epi32(cast!(a), cast!(b)))
4358	}
4359
4360	/// Rounds the elements of each lane of `a` to the nearest integer. If two values are equally
4361	/// close, the even value is returned.
4362	#[inline(always)]
4363	pub fn round_f32x8(self, a: f32x8) -> f32x8 {
4364		const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
4365		cast!(self.avx._mm256_round_ps::<ROUNDING>(cast!(a)))
4366	}
4367
4368	/// Rounds the elements of each lane of `a` to the nearest integer. If two values are equally
4369	/// close, the even value is returned.
4370	#[inline(always)]
4371	pub fn round_f64x4(self, a: f64x4) -> f64x4 {
4372		const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
4373		cast!(self.avx._mm256_round_pd::<ROUNDING>(cast!(a)))
4374	}
4375
4376	/// Adds the elements of each lane of `a` and `b`, with saturation.
4377	#[inline(always)]
4378	pub fn saturating_add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
4379		cast!(self.avx2._mm256_adds_epi16(cast!(a), cast!(b)))
4380	}
4381
4382	/// Adds the elements of each lane of `a` and `b`, with saturation.
4383	#[inline(always)]
4384	pub fn saturating_add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
4385		cast!(self.avx2._mm256_adds_epi8(cast!(a), cast!(b)))
4386	}
4387
4388	/// Adds the elements of each lane of `a` and `b`, with saturation.
4389	#[inline(always)]
4390	pub fn saturating_add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
4391		cast!(self.avx2._mm256_adds_epu16(cast!(a), cast!(b)))
4392	}
4393
4394	/// Adds the elements of each lane of `a` and `b`, with saturation.
4395	#[inline(always)]
4396	pub fn saturating_add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
4397		cast!(self.avx2._mm256_adds_epu8(cast!(a), cast!(b)))
4398	}
4399
4400	/// Subtracts the elements of each lane of `a` and `b`, with saturation.
4401	#[inline(always)]
4402	pub fn saturating_sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
4403		cast!(self.avx2._mm256_subs_epi16(cast!(a), cast!(b)))
4404	}
4405
4406	/// Subtracts the elements of each lane of `a` and `b`, with saturation.
4407	#[inline(always)]
4408	pub fn saturating_sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
4409		cast!(self.avx2._mm256_subs_epi8(cast!(a), cast!(b)))
4410	}
4411
4412	/// Subtracts the elements of each lane of `a` and `b`, with saturation.
4413	#[inline(always)]
4414	pub fn saturating_sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
4415		cast!(self.avx2._mm256_subs_epu16(cast!(a), cast!(b)))
4416	}
4417
4418	/// Subtracts the elements of each lane of `a` and `b`, with saturation.
4419	#[inline(always)]
4420	pub fn saturating_sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
4421		cast!(self.avx2._mm256_subs_epu8(cast!(a), cast!(b)))
4422	}
4423
4424	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4425	/// bit in the mask is set, otherwise selecting elements from `if_false`.
4426	#[inline(always)]
4427	pub fn select_const_f32x8<const MASK8: i32>(self, if_true: f32x8, if_false: f32x8) -> f32x8 {
4428		cast!(
4429			self.avx
4430				._mm256_blend_ps::<MASK8>(cast!(if_false), cast!(if_true)),
4431		)
4432	}
4433
4434	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4435	/// bit in the mask is set, otherwise selecting elements from `if_false`.
4436	#[inline(always)]
4437	pub fn select_const_f64x4<const MASK4: i32>(self, if_true: f64x4, if_false: f64x4) -> f64x4 {
4438		cast!(self.select_const_u64x4::<MASK4>(cast!(if_true), cast!(if_false)))
4439	}
4440
4441	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4442	/// bit in the mask is set, otherwise selecting elements from `if_false`.
4443	#[inline(always)]
4444	pub fn select_const_i32x8<const MASK8: i32>(self, if_true: i32x8, if_false: i32x8) -> i32x8 {
4445		cast!(self.select_const_u32x8::<MASK8>(cast!(if_true), cast!(if_false)))
4446	}
4447
4448	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4449	/// bit in the mask is set, otherwise selecting elements from `if_false`.
4450	#[inline(always)]
4451	pub fn select_const_i64x4<const MASK4: i32>(self, if_true: i64x4, if_false: i64x4) -> i64x4 {
4452		cast!(self.select_const_u64x4::<MASK4>(cast!(if_true), cast!(if_false)))
4453	}
4454
4455	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4456	/// bit in the mask is set, otherwise selecting elements from `if_false`.
4457	#[inline(always)]
4458	pub fn select_const_u32x8<const MASK8: i32>(self, if_true: u32x8, if_false: u32x8) -> u32x8 {
4459		cast!(
4460			self.avx2
4461				._mm256_blend_epi32::<MASK8>(cast!(if_false), cast!(if_true)),
4462		)
4463	}
4464
4465	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4466	/// bit in the mask is set, otherwise selecting elements from `if_false`.
4467	#[inline(always)]
4468	pub fn select_const_u64x4<const MASK4: i32>(self, if_true: u64x4, if_false: u64x4) -> u64x4 {
4469		cast!(
4470			self.avx
4471				._mm256_blend_pd::<MASK4>(cast!(if_false), cast!(if_true)),
4472		)
4473	}
4474
4475	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4476	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
4477	#[inline(always)]
4478	pub fn select_f32x8(self, mask: m32x8, if_true: f32x8, if_false: f32x8) -> f32x8 {
4479		cast!(
4480			self.avx
4481				._mm256_blendv_ps(cast!(if_false), cast!(if_true), cast!(mask)),
4482		)
4483	}
4484
4485	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4486	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
4487	#[inline(always)]
4488	pub fn select_f64x4(self, mask: m64x4, if_true: f64x4, if_false: f64x4) -> f64x4 {
4489		cast!(
4490			self.avx
4491				._mm256_blendv_pd(cast!(if_false), cast!(if_true), cast!(mask)),
4492		)
4493	}
4494
4495	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4496	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
4497	#[inline(always)]
4498	pub fn select_i16x16(self, mask: m16x16, if_true: i16x16, if_false: i16x16) -> i16x16 {
4499		cast!(self.select_u16x16(mask, cast!(if_true), cast!(if_false)))
4500	}
4501
4502	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4503	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
4504	#[inline(always)]
4505	pub fn select_i32x8(self, mask: m32x8, if_true: i32x8, if_false: i32x8) -> i32x8 {
4506		cast!(self.select_u32x8(mask, cast!(if_true), cast!(if_false)))
4507	}
4508
4509	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4510	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
4511	#[inline(always)]
4512	pub fn select_i64x4(self, mask: m64x4, if_true: i64x4, if_false: i64x4) -> i64x4 {
4513		cast!(self.select_u64x4(mask, cast!(if_true), cast!(if_false)))
4514	}
4515
4516	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4517	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
4518	#[inline(always)]
4519	pub fn select_i8x32(self, mask: m8x32, if_true: i8x32, if_false: i8x32) -> i8x32 {
4520		cast!(self.select_u8x32(mask, cast!(if_true), cast!(if_false)))
4521	}
4522
4523	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4524	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
4525	#[inline(always)]
4526	pub fn select_u16x16(self, mask: m16x16, if_true: u16x16, if_false: u16x16) -> u16x16 {
4527		cast!(
4528			self.avx2
4529				._mm256_blendv_epi8(cast!(if_false), cast!(if_true), cast!(mask)),
4530		)
4531	}
4532
4533	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4534	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
4535	#[inline(always)]
4536	pub fn select_u32x8(self, mask: m32x8, if_true: u32x8, if_false: u32x8) -> u32x8 {
4537		cast!(
4538			self.avx2
4539				._mm256_blendv_epi8(cast!(if_false), cast!(if_true), cast!(mask)),
4540		)
4541	}
4542
4543	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4544	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
4545	#[inline(always)]
4546	pub fn select_u64x4(self, mask: m64x4, if_true: u64x4, if_false: u64x4) -> u64x4 {
4547		cast!(
4548			self.avx2
4549				._mm256_blendv_epi8(cast!(if_false), cast!(if_true), cast!(mask)),
4550		)
4551	}
4552
4553	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
4554	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
4555	#[inline(always)]
4556	pub fn select_u8x32(self, mask: m8x32, if_true: u8x32, if_false: u8x32) -> u8x32 {
4557		cast!(
4558			self.avx2
4559				._mm256_blendv_epi8(cast!(if_false), cast!(if_true), cast!(mask)),
4560		)
4561	}
4562
4563	/// Shift the bits of each lane of `a` to the left by `AMOUNT`, while shifting in zeros.
4564	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4565	#[inline(always)]
4566	pub fn shl_const_i16x16<const AMOUNT: i32>(self, a: i16x16) -> i16x16 {
4567		cast!(self.avx2._mm256_slli_epi16::<AMOUNT>(cast!(a)))
4568	}
4569
4570	/// Shift the bits of each lane of `a` to the left by `AMOUNT`, while shifting in zeros.
4571	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4572	#[inline(always)]
4573	pub fn shl_const_i32x8<const AMOUNT: i32>(self, a: i32x8) -> i32x8 {
4574		cast!(self.avx2._mm256_slli_epi32::<AMOUNT>(cast!(a)))
4575	}
4576
4577	/// Shift the bits of each lane of `a` to the left by `AMOUNT`, while shifting in zeros.
4578	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4579	#[inline(always)]
4580	pub fn shl_const_i64x4<const AMOUNT: i32>(self, a: i64x4) -> i64x4 {
4581		cast!(self.avx2._mm256_slli_epi64::<AMOUNT>(cast!(a)))
4582	}
4583
4584	/// Shift the bits of each lane of `a` to the left by `AMOUNT`, while shifting in zeros.
4585	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4586	#[inline(always)]
4587	pub fn shl_const_u16x16<const AMOUNT: i32>(self, a: u16x16) -> u16x16 {
4588		cast!(self.avx2._mm256_slli_epi16::<AMOUNT>(cast!(a)))
4589	}
4590
4591	/// Shift the bits of each lane of `a` to the left by `AMOUNT`, while shifting in zeros.
4592	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4593	#[inline(always)]
4594	pub fn shl_const_u32x8<const AMOUNT: i32>(self, a: u32x8) -> u32x8 {
4595		cast!(self.avx2._mm256_slli_epi32::<AMOUNT>(cast!(a)))
4596	}
4597
4598	/// Shift the bits of each lane of `a` to the left by `AMOUNT`, while shifting in zeros.
4599	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4600	#[inline(always)]
4601	pub fn shl_const_u64x4<const AMOUNT: i32>(self, a: u64x4) -> u64x4 {
4602		cast!(self.avx2._mm256_slli_epi64::<AMOUNT>(cast!(a)))
4603	}
4604
4605	/// Shift the bits of each lane of `a` to the left by the element in the corresponding lane in
4606	/// `amount`, while shifting in zeros.
4607	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4608	#[inline(always)]
4609	pub fn shl_dyn_i32x4(self, a: i32x4, amount: u32x4) -> i32x4 {
4610		cast!(self.avx2._mm_sllv_epi32(cast!(a), cast!(amount)))
4611	}
4612
4613	/// Shift the bits of each lane of `a` to the left by the element in the corresponding lane in
4614	/// `amount`, while shifting in zeros.
4615	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4616	#[inline(always)]
4617	pub fn shl_dyn_i32x8(self, a: i32x8, amount: u32x8) -> i32x8 {
4618		cast!(self.avx2._mm256_sllv_epi32(cast!(a), cast!(amount)))
4619	}
4620
4621	/// Shift the bits of each lane of `a` to the left by the element in the corresponding lane in
4622	/// `amount`, while shifting in zeros.
4623	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4624	#[inline(always)]
4625	pub fn shl_dyn_i64x2(self, a: i64x2, amount: u64x2) -> i64x2 {
4626		cast!(self.avx2._mm_sllv_epi64(cast!(a), cast!(amount)))
4627	}
4628
4629	/// Shift the bits of each lane of `a` to the left by the element in the corresponding lane in
4630	/// `amount`, while shifting in zeros.
4631	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4632	#[inline(always)]
4633	pub fn shl_dyn_i64x4(self, a: i64x4, amount: u64x4) -> i64x4 {
4634		cast!(self.avx2._mm256_sllv_epi64(cast!(a), cast!(amount)))
4635	}
4636
4637	/// Shift the bits of each lane of `a` to the left by the element in the corresponding lane in
4638	/// `amount`, while shifting in zeros.
4639	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4640	#[inline(always)]
4641	pub fn shl_dyn_u32x4(self, a: u32x4, amount: u32x4) -> u32x4 {
4642		cast!(self.avx2._mm_sllv_epi32(cast!(a), cast!(amount)))
4643	}
4644
4645	/// Shift the bits of each lane of `a` to the left by the element in the corresponding lane in
4646	/// `amount`, while shifting in zeros.
4647	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4648	#[inline(always)]
4649	pub fn shl_dyn_u32x8(self, a: u32x8, amount: u32x8) -> u32x8 {
4650		cast!(self.avx2._mm256_sllv_epi32(cast!(a), cast!(amount)))
4651	}
4652
4653	/// Shift the bits of each lane of `a` to the left by the element in the corresponding lane in
4654	/// `amount`, while shifting in zeros.
4655	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4656	#[inline(always)]
4657	pub fn shl_dyn_u64x2(self, a: u64x2, amount: u64x2) -> u64x2 {
4658		cast!(self.avx2._mm_sllv_epi64(cast!(a), cast!(amount)))
4659	}
4660
4661	/// Shift the bits of each lane of `a` to the left by the element in the corresponding lane in
4662	/// `amount`, while shifting in zeros.
4663	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4664	#[inline(always)]
4665	pub fn shl_dyn_u64x4(self, a: u64x4, amount: u64x4) -> u64x4 {
4666		cast!(self.avx2._mm256_sllv_epi64(cast!(a), cast!(amount)))
4667	}
4668
4669	/// Shift the bits of each lane of `a` to the left by the first element in `amount`, while
4670	/// shifting in zeros.
4671	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4672	#[inline(always)]
4673	pub fn shl_i16x16(self, a: i16x16, amount: u64x2) -> i16x16 {
4674		cast!(self.avx2._mm256_sll_epi16(cast!(a), cast!(amount)))
4675	}
4676
4677	/// Shift the bits of each lane of `a` to the left by the first element in `amount`, while
4678	/// shifting in zeros.
4679	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4680	#[inline(always)]
4681	pub fn shl_i32x8(self, a: i32x8, amount: u64x2) -> i32x8 {
4682		cast!(self.avx2._mm256_sll_epi32(cast!(a), cast!(amount)))
4683	}
4684
4685	/// Shift the bits of each lane of `a` to the left by the first element in `amount`, while
4686	/// shifting in zeros.
4687	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4688	#[inline(always)]
4689	pub fn shl_i64x4(self, a: i64x4, amount: u64x2) -> i64x4 {
4690		cast!(self.avx2._mm256_sll_epi64(cast!(a), cast!(amount)))
4691	}
4692
4693	/// Shift the bits of each lane of `a` to the left by the first element in `amount`, while
4694	/// shifting in zeros.
4695	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4696	#[inline(always)]
4697	pub fn shl_u16x16(self, a: u16x16, amount: u64x2) -> u16x16 {
4698		cast!(self.avx2._mm256_sll_epi16(cast!(a), cast!(amount)))
4699	}
4700
4701	/// Shift the bits of each lane of `a` to the left by the first element in `amount`, while
4702	/// shifting in zeros.
4703	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4704	#[inline(always)]
4705	pub fn shl_u32x8(self, a: u32x8, amount: u64x2) -> u32x8 {
4706		cast!(self.avx2._mm256_sll_epi32(cast!(a), cast!(amount)))
4707	}
4708
4709	/// Shift the bits of each lane of `a` to the left by the first element in `amount`, while
4710	/// shifting in zeros.
4711	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4712	#[inline(always)]
4713	pub fn shl_u64x4(self, a: u64x4, amount: u64x2) -> u64x4 {
4714		cast!(self.avx2._mm256_sll_epi64(cast!(a), cast!(amount)))
4715	}
4716
4717	/// Shift the bits of each lane of `a` to the right by `AMOUNT`, while shifting in sign bits.
4718	/// Shifting by a value greater than the bit width of the type sets the result to zero if the
4719	/// sign bit is not set, and to `-1` if the sign bit is set.
4720	#[inline(always)]
4721	pub fn shr_const_i16x16<const AMOUNT: i32>(self, a: i16x16) -> i16x16 {
4722		cast!(self.avx2._mm256_srai_epi16::<AMOUNT>(cast!(a)))
4723	}
4724
4725	/// Shift the bits of each lane of `a` to the right by `AMOUNT`, while shifting in sign bits.
4726	/// Shifting by a value greater than the bit width of the type sets the result to zero if the
4727	/// sign bit is not set, and to `-1` if the sign bit is set.
4728	#[inline(always)]
4729	pub fn shr_const_i32x8<const AMOUNT: i32>(self, a: i32x8) -> i32x8 {
4730		cast!(self.avx2._mm256_srai_epi32::<AMOUNT>(cast!(a)))
4731	}
4732
4733	/// Shift the bits of each lane of `a` to the right by `AMOUNT`, while shifting in zeros.
4734	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4735	#[inline(always)]
4736	pub fn shr_const_u16x16<const AMOUNT: i32>(self, a: u16x16) -> u16x16 {
4737		cast!(self.avx2._mm256_srli_epi16::<AMOUNT>(cast!(a)))
4738	}
4739
4740	/// Shift the bits of each lane of `a` to the right by `AMOUNT`, while shifting in zeros.
4741	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4742	#[inline(always)]
4743	pub fn shr_const_u32x8<const AMOUNT: i32>(self, a: u32x8) -> u32x8 {
4744		cast!(self.avx2._mm256_srli_epi32::<AMOUNT>(cast!(a)))
4745	}
4746
4747	/// Shift the bits of each lane of `a` to the right by `AMOUNT`, while shifting in zeros.
4748	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4749	#[inline(always)]
4750	pub fn shr_const_u64x4<const AMOUNT: i32>(self, a: u64x4) -> u64x4 {
4751		cast!(self.avx2._mm256_srli_epi64::<AMOUNT>(cast!(a)))
4752	}
4753
4754	/// Shift the bits of each lane of `a` to the right by the element in the corresponding lane in
4755	/// `amount`, while shifting in sign bits.
4756	/// Shifting by a value greater than the bit width of the type sets the result to zero if the
4757	/// sign bit is not set, and to `-1` if the sign bit is set.
4758	#[inline(always)]
4759	pub fn shr_dyn_i32x4(self, a: i32x4, amount: i32x4) -> i32x4 {
4760		cast!(self.avx2._mm_srav_epi32(cast!(a), cast!(amount)))
4761	}
4762
4763	/// Shift the bits of each lane of `a` to the right by the element in the corresponding lane in
4764	/// `amount`, while shifting in sign bits.
4765	/// Shifting by a value greater than the bit width of the type sets the result to zero if the
4766	/// sign bit is not set, and to `-1` if the sign bit is set.
4767	#[inline(always)]
4768	pub fn shr_dyn_i32x8(self, a: i32x8, amount: i32x8) -> i32x8 {
4769		cast!(self.avx2._mm256_srav_epi32(cast!(a), cast!(amount)))
4770	}
4771
4772	/// Shift the bits of each lane of `a` to the right by the element in the corresponding lane in
4773	/// `amount`, while shifting in zeros.
4774	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4775	#[inline(always)]
4776	pub fn shr_dyn_u32x4(self, a: u32x4, amount: u32x4) -> u32x4 {
4777		cast!(self.avx2._mm_srlv_epi32(cast!(a), cast!(amount)))
4778	}
4779
4780	/// Shift the bits of each lane of `a` to the right by the element in the corresponding lane in
4781	/// `amount`, while shifting in zeros.
4782	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4783	#[inline(always)]
4784	pub fn shr_dyn_u32x8(self, a: u32x8, amount: u32x8) -> u32x8 {
4785		cast!(self.avx2._mm256_srlv_epi32(cast!(a), cast!(amount)))
4786	}
4787
4788	/// Shift the bits of each lane of `a` to the right by the element in the corresponding lane in
4789	/// `amount`, while shifting in zeros.
4790	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4791	#[inline(always)]
4792	pub fn shr_dyn_u64x2(self, a: u64x2, amount: u64x2) -> u64x2 {
4793		cast!(self.avx2._mm_srlv_epi64(cast!(a), cast!(amount)))
4794	}
4795
4796	/// Shift the bits of each lane of `a` to the right by the element in the corresponding lane in
4797	/// `amount`, while shifting in zeros.
4798	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4799	#[inline(always)]
4800	pub fn shr_dyn_u64x4(self, a: u64x4, amount: u64x4) -> u64x4 {
4801		cast!(self.avx2._mm256_srlv_epi64(cast!(a), cast!(amount)))
4802	}
4803
4804	/// Shift the bits of each lane of `a` to the right by the first element in `amount`, while
4805	/// shifting in zeros.
4806	/// Shifting by a value greater than the bit width of the type sets the result to zero if the
4807	/// sign bit is not set, and to `-1` if the sign bit is set.
4808	#[inline(always)]
4809	pub fn shr_i16x16(self, a: i16x16, amount: u64x2) -> i16x16 {
4810		cast!(self.avx2._mm256_sra_epi16(cast!(a), cast!(amount)))
4811	}
4812
4813	/// Shift the bits of each lane of `a` to the right by the first element in `amount`, while
4814	/// shifting in zeros.
4815	/// Shifting by a value greater than the bit width of the type sets the result to zero if the
4816	/// sign bit is not set, and to `-1` if the sign bit is set.
4817	#[inline(always)]
4818	pub fn shr_i32x8(self, a: i32x8, amount: u64x2) -> i32x8 {
4819		cast!(self.avx2._mm256_sra_epi32(cast!(a), cast!(amount)))
4820	}
4821
4822	/// Shift the bits of each lane of `a` to the right by the first element in `amount`, while
4823	/// shifting in zeros.
4824	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4825	#[inline(always)]
4826	pub fn shr_u16x16(self, a: u16x16, amount: u64x2) -> u16x16 {
4827		cast!(self.avx2._mm256_srl_epi16(cast!(a), cast!(amount)))
4828	}
4829
4830	/// Shift the bits of each lane of `a` to the right by the first element in `amount`, while
4831	/// shifting in zeros.
4832	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4833	#[inline(always)]
4834	pub fn shr_u32x8(self, a: u32x8, amount: u64x2) -> u32x8 {
4835		cast!(self.avx2._mm256_srl_epi32(cast!(a), cast!(amount)))
4836	}
4837
4838	/// Shift the bits of each lane of `a` to the right by the first element in `amount`, while
4839	/// shifting in zeros.
4840	/// Shifting by a value greater than the bit width of the type sets the result to zero.
4841	#[inline(always)]
4842	pub fn shr_u64x4(self, a: u64x4, amount: u64x2) -> u64x4 {
4843		cast!(self.avx2._mm256_srl_epi64(cast!(a), cast!(amount)))
4844	}
4845
4846	/// Returns a SIMD vector with all lanes set to the given value.
4847	#[inline(always)]
4848	pub fn splat_f32x8(self, value: f32) -> f32x8 {
4849		cast!(self.avx._mm256_set1_ps(value))
4850	}
4851
4852	/// Returns a SIMD vector with all lanes set to the given value.
4853	#[inline(always)]
4854	pub fn splat_f64x4(self, value: f64) -> f64x4 {
4855		cast!(self.avx._mm256_set1_pd(value))
4856	}
4857
4858	/// Returns a SIMD vector with all lanes set to the given value.
4859	#[inline(always)]
4860	pub fn splat_i16x16(self, value: i16) -> i16x16 {
4861		cast!(self.avx._mm256_set1_epi16(value))
4862	}
4863
4864	/// Returns a SIMD vector with all lanes set to the given value.
4865	#[inline(always)]
4866	pub fn splat_i32x8(self, value: i32) -> i32x8 {
4867		cast!(self.avx._mm256_set1_epi32(value))
4868	}
4869
4870	/// Returns a SIMD vector with all lanes set to the given value.
4871	#[inline(always)]
4872	pub fn splat_i64x4(self, value: i64) -> i64x4 {
4873		cast!(self.avx._mm256_set1_epi64x(value))
4874	}
4875
4876	/// Returns a SIMD vector with all lanes set to the given value.
4877	#[inline(always)]
4878	pub fn splat_i8x32(self, value: i8) -> i8x32 {
4879		cast!(self.avx._mm256_set1_epi8(value))
4880	}
4881
4882	/// Returns a SIMD vector with all lanes set to the given value.
4883	#[inline(always)]
4884	pub fn splat_m16x16(self, value: m16) -> m16x16 {
4885		cast!(self.avx._mm256_set1_epi16(value.0 as i16))
4886	}
4887
4888	/// Returns a SIMD vector with all lanes set to the given value.
4889	#[inline(always)]
4890	pub fn splat_m32x8(self, value: m32) -> m32x8 {
4891		cast!(self.avx._mm256_set1_epi32(value.0 as i32))
4892	}
4893
4894	/// Returns a SIMD vector with all lanes set to the given value.
4895	#[inline(always)]
4896	pub fn splat_m64x4(self, value: m64) -> m64x4 {
4897		cast!(self.avx._mm256_set1_epi64x(value.0 as i64))
4898	}
4899
4900	/// Returns a SIMD vector with all lanes set to the given value.
4901	#[inline(always)]
4902	pub fn splat_m8x32(self, value: m8) -> m8x32 {
4903		cast!(self.avx._mm256_set1_epi8(value.0 as i8))
4904	}
4905
4906	/// Returns a SIMD vector with all lanes set to the given value.
4907	#[inline(always)]
4908	pub fn splat_u16x16(self, value: u16) -> u16x16 {
4909		cast!(self.avx._mm256_set1_epi16(value as i16))
4910	}
4911
4912	/// Returns a SIMD vector with all lanes set to the given value.
4913	#[inline(always)]
4914	pub fn splat_u32x8(self, value: u32) -> u32x8 {
4915		cast!(self.avx._mm256_set1_epi32(value as i32))
4916	}
4917
4918	/// Returns a SIMD vector with all lanes set to the given value.
4919	#[inline(always)]
4920	pub fn splat_u64x4(self, value: u64) -> u64x4 {
4921		cast!(self.avx._mm256_set1_epi64x(value as i64))
4922	}
4923
4924	/// Returns a SIMD vector with all lanes set to the given value.
4925	#[inline(always)]
4926	pub fn splat_u8x32(self, value: u8) -> u8x32 {
4927		cast!(self.avx._mm256_set1_epi8(value as i8))
4928	}
4929
4930	/// Computes the square roots of the elements of each lane of `a`.
4931	#[inline(always)]
4932	pub fn sqrt_f32x8(self, a: f32x8) -> f32x8 {
4933		cast!(self.avx._mm256_sqrt_ps(cast!(a)))
4934	}
4935
4936	/// Computes the square roots of the elements of each lane of `a`.
4937	#[inline(always)]
4938	pub fn sqrt_f64x4(self, a: f64x4) -> f64x4 {
4939		cast!(self.avx._mm256_sqrt_pd(cast!(a)))
4940	}
4941
4942	/// Calculates `a - b` for each lane in `a` and `b`.
4943	#[inline(always)]
4944	pub fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
4945		cast!(self.avx._mm256_sub_ps(cast!(a), cast!(b)))
4946	}
4947
4948	/// Calculates `a - b` for each lane in `a` and `b`.
4949	#[inline(always)]
4950	pub fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
4951		cast!(self.avx._mm256_sub_pd(cast!(a), cast!(b)))
4952	}
4953
4954	/// Alternatively subtracts and adds the elements of each lane of `a` and `b`.
4955	#[inline(always)]
4956	pub fn subadd_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
4957		cast!(self.avx._mm256_addsub_ps(cast!(a), cast!(b)))
4958	}
4959
4960	/// Alternatively subtracts and adds the elements of each lane of `a` and `b`.
4961	#[inline(always)]
4962	pub fn subadd_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
4963		cast!(self.avx._mm256_addsub_pd(cast!(a), cast!(b)))
4964	}
4965
4966	/// See [_mm256_sad_epu8].
4967	///
4968	/// [_mm256_sad_epu8]: core::arch::x86_64::_mm256_sad_epu8
4969	#[inline(always)]
4970	pub fn sum_of_absolute_differences_u8x32(self, a: u8x32, b: u8x32) -> u64x4 {
4971		cast!(self.avx2._mm256_sad_epu8(cast!(a), cast!(b)))
4972	}
4973
4974	/// Rounds the elements of each lane of `a` to the nearest integer towards zero.
4975	#[inline(always)]
4976	pub fn truncate_f32x8(self, a: f32x8) -> f32x8 {
4977		const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
4978		cast!(self.avx._mm256_round_ps::<ROUNDING>(cast!(a)))
4979	}
4980
4981	/// Rounds the elements of each lane of `a` to the nearest integer towards zero.
4982	#[inline(always)]
4983	pub fn truncate_f64x4(self, a: f64x4) -> f64x4 {
4984		const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
4985		cast!(self.avx._mm256_round_pd::<ROUNDING>(cast!(a)))
4986	}
4987
4988	/// Computes the unsigned absolute value of the elements of each lane of `a`.
4989	#[inline(always)]
4990	pub fn unsigned_abs_i16x16(self, a: i16x16) -> u16x16 {
4991		cast!(self.avx2._mm256_abs_epi16(cast!(a)))
4992	}
4993
4994	/// Computes the unsigned absolute value of the elements of each lane of `a`.
4995	#[inline(always)]
4996	pub fn unsigned_abs_i32x8(self, a: i32x8) -> u32x8 {
4997		cast!(self.avx2._mm256_abs_epi32(cast!(a)))
4998	}
4999
5000	/// Computes the unsigned absolute value of the elements of each lane of `a`.
5001	#[inline(always)]
5002	pub fn unsigned_abs_i8x32(self, a: i8x32) -> u8x32 {
5003		cast!(self.avx2._mm256_abs_epi8(cast!(a)))
5004	}
5005
5006	/// Multiplies the elements of each lane of `a` and `b`, and returns separately the low and
5007	/// high bits of the result.
5008	#[inline(always)]
5009	pub fn widening_mul_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) {
5010		(
5011			cast!(self.avx2._mm256_mullo_epi16(cast!(a), cast!(b))),
5012			cast!(self.avx2._mm256_mulhi_epi16(cast!(a), cast!(b))),
5013		)
5014	}
5015
5016	/// Multiplies the elements of each lane of `a` and `b`, and returns separately the low and
5017	/// high bits of the result.
5018	#[inline(always)]
5019	pub fn widening_mul_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) {
5020		let a = cast!(a);
5021		let b = cast!(b);
5022		let avx2 = self.avx2;
5023
5024		// a0b0_lo a0b0_hi a2b2_lo a2b2_hi
5025		let ab_evens = self.avx2._mm256_mul_epi32(a, b);
5026		// a1b1_lo a1b1_hi a3b3_lo a3b3_hi
5027		let ab_odds = self.avx2._mm256_mul_epi32(
5028			avx2._mm256_srli_epi64::<32>(a),
5029			avx2._mm256_srli_epi64::<32>(b),
5030		);
5031
5032		let ab_lo = self.avx2._mm256_blend_epi32::<0b10101010>(
5033			// a0b0_lo xxxxxxx a2b2_lo xxxxxxx
5034			ab_evens,
5035			// xxxxxxx a1b1_lo xxxxxxx a3b3_lo
5036			avx2._mm256_slli_epi64::<32>(ab_odds),
5037		);
5038		let ab_hi = self.avx2._mm256_blend_epi32::<0b10101010>(
5039			// a0b0_hi xxxxxxx a2b2_hi xxxxxxx
5040			avx2._mm256_srli_epi64::<32>(ab_evens),
5041			// xxxxxxx a1b1_hi xxxxxxx a3b3_hi
5042			ab_odds,
5043		);
5044
5045		(cast!(ab_lo), cast!(ab_hi))
5046	}
5047
5048	/// Multiplies the elements of each lane of `a` and `b`, and returns separately the low and
5049	/// high bits of the result.
5050	#[inline(always)]
5051	pub fn widening_mul_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) {
5052		(
5053			cast!(self.avx2._mm256_mullo_epi16(cast!(a), cast!(b))),
5054			cast!(self.avx2._mm256_mulhi_epu16(cast!(a), cast!(b))),
5055		)
5056	}
5057
5058	/// Multiplies the elements of each lane of `a` and `b`, and returns separately the low and
5059	/// high bits of the result.
5060	#[inline(always)]
5061	pub fn widening_mul_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) {
5062		let a = cast!(a);
5063		let b = cast!(b);
5064		let avx2 = self.avx2;
5065
5066		// a0b0_lo a0b0_hi a2b2_lo a2b2_hi
5067		let ab_evens = avx2._mm256_mul_epu32(a, b);
5068		// a1b1_lo a1b1_hi a3b3_lo a3b3_hi
5069		let ab_odds = avx2._mm256_mul_epu32(
5070			avx2._mm256_srli_epi64::<32>(a),
5071			avx2._mm256_srli_epi64::<32>(b),
5072		);
5073
5074		let ab_lo = self.avx2._mm256_blend_epi32::<0b10101010>(
5075			// a0b0_lo xxxxxxx a2b2_lo xxxxxxx
5076			ab_evens,
5077			// xxxxxxx a1b1_lo xxxxxxx a3b3_lo
5078			avx2._mm256_slli_epi64::<32>(ab_odds),
5079		);
5080		let ab_hi = self.avx2._mm256_blend_epi32::<0b10101010>(
5081			// a0b0_hi xxxxxxx a2b2_hi xxxxxxx
5082			avx2._mm256_srli_epi64::<32>(ab_evens),
5083			// xxxxxxx a1b1_hi xxxxxxx a3b3_hi
5084			ab_odds,
5085		);
5086
5087		(cast!(ab_lo), cast!(ab_hi))
5088	}
5089
5090	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
5091	#[inline(always)]
5092	pub fn wrapping_add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
5093		cast!(self.avx2._mm256_add_epi16(cast!(a), cast!(b)))
5094	}
5095
5096	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
5097	#[inline(always)]
5098	pub fn wrapping_add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
5099		cast!(self.avx2._mm256_add_epi32(cast!(a), cast!(b)))
5100	}
5101
5102	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
5103	#[inline(always)]
5104	pub fn wrapping_add_i64x4(self, a: i64x4, b: i64x4) -> i64x4 {
5105		cast!(self.avx2._mm256_add_epi64(cast!(a), cast!(b)))
5106	}
5107
5108	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
5109	#[inline(always)]
5110	pub fn wrapping_add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
5111		cast!(self.avx2._mm256_add_epi8(cast!(a), cast!(b)))
5112	}
5113
5114	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
5115	#[inline(always)]
5116	pub fn wrapping_add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
5117		cast!(self.avx2._mm256_add_epi16(cast!(a), cast!(b)))
5118	}
5119
5120	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
5121	#[inline(always)]
5122	pub fn wrapping_add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
5123		cast!(self.avx2._mm256_add_epi32(cast!(a), cast!(b)))
5124	}
5125
5126	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
5127	#[inline(always)]
5128	pub fn wrapping_add_u64x4(self, a: u64x4, b: u64x4) -> u64x4 {
5129		cast!(self.avx2._mm256_add_epi64(cast!(a), cast!(b)))
5130	}
5131
5132	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
5133	#[inline(always)]
5134	pub fn wrapping_add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
5135		cast!(self.avx2._mm256_add_epi8(cast!(a), cast!(b)))
5136	}
5137
5138	/// Multiplies the elements of each lane of `a` and `b`, with wrapping on overflow.
5139	#[inline(always)]
5140	pub fn wrapping_mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
5141		cast!(self.avx2._mm256_mullo_epi16(cast!(a), cast!(b)))
5142	}
5143
5144	/// Multiplies the elements of each lane of `a` and `b`, with wrapping on overflow.
5145	#[inline(always)]
5146	pub fn wrapping_mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
5147		cast!(self.avx2._mm256_mullo_epi32(cast!(a), cast!(b)))
5148	}
5149
5150	/// Multiplies the elements of each lane of `a` and `b`, with wrapping on overflow.
5151	#[inline(always)]
5152	pub fn wrapping_mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
5153		cast!(self.avx2._mm256_mullo_epi16(cast!(a), cast!(b)))
5154	}
5155
5156	/// Multiplies the elements of each lane of `a` and `b`, with wrapping on overflow.
5157	#[inline(always)]
5158	pub fn wrapping_mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
5159		cast!(self.avx2._mm256_mullo_epi32(cast!(a), cast!(b)))
5160	}
5161
5162	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
5163	#[inline(always)]
5164	pub fn wrapping_sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
5165		cast!(self.avx2._mm256_sub_epi16(cast!(a), cast!(b)))
5166	}
5167
5168	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
5169	#[inline(always)]
5170	pub fn wrapping_sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
5171		cast!(self.avx2._mm256_sub_epi32(cast!(a), cast!(b)))
5172	}
5173
5174	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
5175	#[inline(always)]
5176	pub fn wrapping_sub_i64x4(self, a: i64x4, b: i64x4) -> i64x4 {
5177		cast!(self.avx2._mm256_sub_epi64(cast!(a), cast!(b)))
5178	}
5179
5180	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
5181	#[inline(always)]
5182	pub fn wrapping_sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
5183		cast!(self.avx2._mm256_sub_epi8(cast!(a), cast!(b)))
5184	}
5185
5186	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
5187	#[inline(always)]
5188	pub fn wrapping_sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
5189		cast!(self.avx2._mm256_sub_epi16(cast!(a), cast!(b)))
5190	}
5191
5192	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
5193	#[inline(always)]
5194	pub fn wrapping_sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
5195		cast!(self.avx2._mm256_sub_epi32(cast!(a), cast!(b)))
5196	}
5197
5198	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
5199	#[inline(always)]
5200	pub fn wrapping_sub_u64x4(self, a: u64x4, b: u64x4) -> u64x4 {
5201		cast!(self.avx2._mm256_sub_epi64(cast!(a), cast!(b)))
5202	}
5203
5204	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
5205	#[inline(always)]
5206	pub fn wrapping_sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
5207		cast!(self.avx2._mm256_sub_epi8(cast!(a), cast!(b)))
5208	}
5209
5210	/// Returns `a ^ b` for each bit in `a` and `b`.
5211	#[inline(always)]
5212	pub fn xor_f32x8(self, a: f32x8, b: f32x8) -> f32x8 {
5213		cast!(self.avx._mm256_xor_ps(cast!(a), cast!(b)))
5214	}
5215
5216	/// Returns `a ^ b` for each bit in `a` and `b`.
5217	#[inline(always)]
5218	pub fn xor_f64x4(self, a: f64x4, b: f64x4) -> f64x4 {
5219		cast!(self.avx._mm256_xor_pd(cast!(a), cast!(b)))
5220	}
5221
5222	/// Returns `a ^ b` for each bit in `a` and `b`.
5223	#[inline(always)]
5224	pub fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 {
5225		cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
5226	}
5227
5228	/// Returns `a ^ b` for each bit in `a` and `b`.
5229	#[inline(always)]
5230	pub fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 {
5231		cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
5232	}
5233
5234	/// Returns `a ^ b` for each bit in `a` and `b`.
5235	#[inline(always)]
5236	pub fn xor_i64x4(self, a: i64x4, b: i64x4) -> i64x4 {
5237		cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
5238	}
5239
5240	/// Returns `a ^ b` for each bit in `a` and `b`.
5241	#[inline(always)]
5242	pub fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 {
5243		cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
5244	}
5245
5246	/// Returns `a ^ b` for each bit in `a` and `b`.
5247	#[inline(always)]
5248	pub fn xor_m16x16(self, a: m16x16, b: m16x16) -> m16x16 {
5249		cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
5250	}
5251
5252	/// Returns `a ^ b` for each bit in `a` and `b`.
5253	#[inline(always)]
5254	pub fn xor_m32x8(self, a: m32x8, b: m32x8) -> m32x8 {
5255		cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
5256	}
5257
5258	/// Returns `a ^ b` for each bit in `a` and `b`.
5259	#[inline(always)]
5260	pub fn xor_m64x4(self, a: m64x4, b: m64x4) -> m64x4 {
5261		cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
5262	}
5263
5264	/// Returns `a ^ b` for each bit in `a` and `b`.
5265	#[inline(always)]
5266	pub fn xor_m8x32(self, a: m8x32, b: m8x32) -> m8x32 {
5267		cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
5268	}
5269
5270	/// Returns `a ^ b` for each bit in `a` and `b`.
5271	#[inline(always)]
5272	pub fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 {
5273		cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
5274	}
5275
5276	/// Returns `a ^ b` for each bit in `a` and `b`.
5277	#[inline(always)]
5278	pub fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 {
5279		cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
5280	}
5281
5282	/// Returns `a ^ b` for each bit in `a` and `b`.
5283	#[inline(always)]
5284	pub fn xor_u64x4(self, a: u64x4, b: u64x4) -> u64x4 {
5285		cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
5286	}
5287
5288	/// Returns `a ^ b` for each bit in `a` and `b`.
5289	#[inline(always)]
5290	pub fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 {
5291		cast!(self.avx2._mm256_xor_si256(cast!(a), cast!(b)))
5292	}
5293}