pulp/x86/
v2.rs

1use super::*;
2
3// https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels
simd_type!({
	/// x86-64-v2 instruction sets: SSE, SSE2, FXSR, SSE3, SSSE3, SSE4.1,
	/// SSE4.2 and POPCNT (see the microarchitecture-levels link above).
	#[allow(missing_docs)]
	pub struct V2 {
		pub sse: f!("sse"),
		pub sse2: f!("sse2"),
		pub fxsr: f!("fxsr"),
		pub sse3: f!("sse3"),
		pub ssse3: f!("ssse3"),
		pub sse4_1: f!("sse4.1"),
		pub sse4_2: f!("sse4.2"),
		pub popcnt: f!("popcnt"),
	}
});
18
// Marker impl for the `Seal` trait (sealed-trait pattern — presumably
// crate-private, defined in `super`; it gates the public SIMD traits).
impl Seal for V2 {}
20
21impl V2 {
22	/// Computes `abs(a)` for each lane of `a`.
23	#[inline(always)]
24	pub fn abs_f32x4(self, a: f32x4) -> f32x4 {
25		self.and_f32x4(a, cast!(self.splat_u32x4((1 << 31) - 1)))
26	}
27
28	/// Computes `abs(a)` for each lane of `a`.
29	#[inline(always)]
30	pub fn abs_f64x2(self, a: f64x2) -> f64x2 {
31		self.and_f64x2(a, cast!(self.splat_u64x2((1 << 63) - 1)))
32	}
33
34	/// Computes `a + b` for each lane of `a` and `b`.
35	#[inline(always)]
36	pub fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
37		cast!(self.sse._mm_add_ps(cast!(a), cast!(b)))
38	}
39
40	/// Computes `a + b` for each lane of `a` and `b`.
41	#[inline(always)]
42	pub fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 {
43		cast!(self.sse2._mm_add_pd(cast!(a), cast!(b)))
44	}
45
	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
		cast!(self.sse._mm_and_ps(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_f64x2(self, a: f64x2, b: f64x2) -> f64x2 {
		cast!(self.sse2._mm_and_pd(cast!(a), cast!(b)))
	}

	// Every integer and mask variant below lowers to the same 128-bit
	// bitwise AND (`_mm_and_si128`); only the Rust-level lane types differ.

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
		cast!(self.sse2._mm_and_si128(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 {
		cast!(self.sse2._mm_and_si128(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_i64x2(self, a: i64x2, b: i64x2) -> i64x2 {
		cast!(self.sse2._mm_and_si128(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16 {
		cast!(self.sse2._mm_and_si128(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_m16x8(self, a: m16x8, b: m16x8) -> m16x8 {
		cast!(self.sse2._mm_and_si128(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_m32x4(self, a: m32x4, b: m32x4) -> m32x4 {
		cast!(self.sse2._mm_and_si128(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_m64x2(self, a: m64x2, b: m64x2) -> m64x2 {
		cast!(self.sse2._mm_and_si128(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_m8x16(self, a: m8x16, b: m8x16) -> m8x16 {
		cast!(self.sse2._mm_and_si128(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
		cast!(self.sse2._mm_and_si128(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 {
		cast!(self.sse2._mm_and_si128(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_u64x2(self, a: u64x2, b: u64x2) -> u64x2 {
		cast!(self.sse2._mm_and_si128(cast!(a), cast!(b)))
	}

	/// Returns `a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
		cast!(self.sse2._mm_and_si128(cast!(a), cast!(b)))
	}
129
	// `andnot` follows the hardware convention: the FIRST operand is the one
	// complemented (`!a & b`), matching `_mm_andnot_{ps,pd,si128}`.

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
		cast!(self.sse._mm_andnot_ps(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_f64x2(self, a: f64x2, b: f64x2) -> f64x2 {
		cast!(self.sse2._mm_andnot_pd(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
		cast!(self.sse2._mm_andnot_si128(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_i32x4(self, a: i32x4, b: i32x4) -> i32x4 {
		cast!(self.sse2._mm_andnot_si128(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_i64x2(self, a: i64x2, b: i64x2) -> i64x2 {
		cast!(self.sse2._mm_andnot_si128(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_i8x16(self, a: i8x16, b: i8x16) -> i8x16 {
		cast!(self.sse2._mm_andnot_si128(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_m16x8(self, a: m16x8, b: m16x8) -> m16x8 {
		cast!(self.sse2._mm_andnot_si128(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_m32x4(self, a: m32x4, b: m32x4) -> m32x4 {
		cast!(self.sse2._mm_andnot_si128(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_m64x2(self, a: m64x2, b: m64x2) -> m64x2 {
		cast!(self.sse2._mm_andnot_si128(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_m8x16(self, a: m8x16, b: m8x16) -> m8x16 {
		cast!(self.sse2._mm_andnot_si128(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
		cast!(self.sse2._mm_andnot_si128(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_u32x4(self, a: u32x4, b: u32x4) -> u32x4 {
		cast!(self.sse2._mm_andnot_si128(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_u64x2(self, a: u64x2, b: u64x2) -> u64x2 {
		cast!(self.sse2._mm_andnot_si128(cast!(a), cast!(b)))
	}

	/// Returns `!a & b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn andnot_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
		cast!(self.sse2._mm_andnot_si128(cast!(a), cast!(b)))
	}
213
	// Note the operand order below: the SSSE3 intrinsic is
	// `_mm_sign_*(data, sign)`, while this API takes `(sign, data)`.

	/// Applies the sign of each element of `sign` to the corresponding lane in `a`.
	/// - If `sign` is zero, the corresponding element is zeroed.
	/// - If `sign` is positive, the corresponding element is returned as is.
	/// - If `sign` is negative, the corresponding element is negated.
	#[inline(always)]
	pub fn apply_sign_i16x8(self, sign: i16x8, a: i16x8) -> i16x8 {
		cast!(self.ssse3._mm_sign_epi16(cast!(a), cast!(sign)))
	}

	/// Applies the sign of each element of `sign` to the corresponding lane in `a`.
	/// - If `sign` is zero, the corresponding element is zeroed.
	/// - If `sign` is positive, the corresponding element is returned as is.
	/// - If `sign` is negative, the corresponding element is negated.
	#[inline(always)]
	pub fn apply_sign_i32x4(self, sign: i32x4, a: i32x4) -> i32x4 {
		cast!(self.ssse3._mm_sign_epi32(cast!(a), cast!(sign)))
	}

	/// Applies the sign of each element of `sign` to the corresponding lane in `a`.
	/// - If `sign` is zero, the corresponding element is zeroed.
	/// - If `sign` is positive, the corresponding element is returned as is.
	/// - If `sign` is negative, the corresponding element is negated.
	#[inline(always)]
	pub fn apply_sign_i8x16(self, sign: i8x16, a: i8x16) -> i8x16 {
		cast!(self.ssse3._mm_sign_epi8(cast!(a), cast!(sign)))
	}
240
241	/// Computes the approximate reciprocal of the elements of each lane of `a`.
242	#[inline(always)]
243	pub fn approx_reciprocal_f32x4(self, a: f32x4) -> f32x4 {
244		cast!(self.sse._mm_rcp_ps(cast!(a)))
245	}
246
247	/// Computes the approximate reciprocal of the square roots of the elements of each lane of `a`.
248	#[inline(always)]
249	pub fn approx_reciprocal_sqrt_f32x4(self, a: f32x4) -> f32x4 {
250		cast!(self.sse._mm_rsqrt_ps(cast!(a)))
251	}
252
253	/// Computes `average(a, b)` for each lane of `a` and `b`.
254	#[inline(always)]
255	pub fn average_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
256		cast!(self.sse2._mm_avg_epu16(cast!(a), cast!(b)))
257	}
258
259	/// Computes `average(a, b)` for each lane of `a` and `b`.
260	#[inline(always)]
261	pub fn average_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
262		cast!(self.sse2._mm_avg_epu8(cast!(a), cast!(b)))
263	}
264
265	/// Returns `ceil(a)` for each lane of `a`, rounding towards positive infinity.
266	#[inline(always)]
267	pub fn ceil_f32x4(self, a: f32x4) -> f32x4 {
268		cast!(self.sse4_1._mm_ceil_ps(cast!(a)))
269	}
270
271	/// Returns `ceil(a)` for each lane of `a`, rounding towards positive infinity.
272	#[inline(always)]
273	pub fn ceil_f64x2(self, a: f64x2) -> f64x2 {
274		cast!(self.sse4_1._mm_ceil_pd(cast!(a)))
275	}
276
	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_f32x4(self, a: f32x4, b: f32x4) -> m32x4 {
		cast!(self.sse._mm_cmpeq_ps(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_f64x2(self, a: f64x2, b: f64x2) -> m64x2 {
		cast!(self.sse2._mm_cmpeq_pd(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_i16x8(self, a: i16x8, b: i16x8) -> m16x8 {
		cast!(self.sse2._mm_cmpeq_epi16(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_i32x4(self, a: i32x4, b: i32x4) -> m32x4 {
		cast!(self.sse2._mm_cmpeq_epi32(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_i64x2(self, a: i64x2, b: i64x2) -> m64x2 {
		// 64-bit lane equality is the one integer compare that needs SSE4.1.
		cast!(self.sse4_1._mm_cmpeq_epi64(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_i8x16(self, a: i8x16, b: i8x16) -> m8x16 {
		cast!(self.sse2._mm_cmpeq_epi8(cast!(a), cast!(b)))
	}

	// Equality is sign-agnostic, so the unsigned variants reuse the signed
	// intrinsics on the same bit patterns.

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_u16x8(self, a: u16x8, b: u16x8) -> m16x8 {
		cast!(self.sse2._mm_cmpeq_epi16(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_u32x4(self, a: u32x4, b: u32x4) -> m32x4 {
		cast!(self.sse2._mm_cmpeq_epi32(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_u64x2(self, a: u64x2, b: u64x2) -> m64x2 {
		cast!(self.sse4_1._mm_cmpeq_epi64(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for equality.
	#[inline(always)]
	pub fn cmp_eq_u8x16(self, a: u8x16, b: u8x16) -> m8x16 {
		cast!(self.sse2._mm_cmpeq_epi8(cast!(a), cast!(b)))
	}
336
	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_f32x4(self, a: f32x4, b: f32x4) -> m32x4 {
		cast!(self.sse._mm_cmpge_ps(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_f64x2(self, a: f64x2, b: f64x2) -> m64x2 {
		cast!(self.sse2._mm_cmpge_pd(cast!(a), cast!(b)))
	}

	// There is no integer `>=` instruction; it is synthesized as `!(a < b)`.

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_i16x8(self, a: i16x8, b: i16x8) -> m16x8 {
		self.not_m16x8(self.cmp_lt_i16x8(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_i32x4(self, a: i32x4, b: i32x4) -> m32x4 {
		self.not_m32x4(self.cmp_lt_i32x4(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_i64x2(self, a: i64x2, b: i64x2) -> m64x2 {
		self.not_m64x2(self.cmp_lt_i64x2(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_i8x16(self, a: i8x16, b: i8x16) -> m8x16 {
		self.not_m8x16(self.cmp_lt_i8x16(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_u16x8(self, a: u16x8, b: u16x8) -> m16x8 {
		self.not_m16x8(self.cmp_lt_u16x8(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_u32x4(self, a: u32x4, b: u32x4) -> m32x4 {
		self.not_m32x4(self.cmp_lt_u32x4(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_u64x2(self, a: u64x2, b: u64x2) -> m64x2 {
		self.not_m64x2(self.cmp_lt_u64x2(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_ge_u8x16(self, a: u8x16, b: u8x16) -> m8x16 {
		self.not_m8x16(self.cmp_lt_u8x16(a, b))
	}
396
	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_f32x4(self, a: f32x4, b: f32x4) -> m32x4 {
		cast!(self.sse._mm_cmpgt_ps(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_f64x2(self, a: f64x2, b: f64x2) -> m64x2 {
		cast!(self.sse2._mm_cmpgt_pd(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_i16x8(self, a: i16x8, b: i16x8) -> m16x8 {
		cast!(self.sse2._mm_cmpgt_epi16(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_i32x4(self, a: i32x4, b: i32x4) -> m32x4 {
		cast!(self.sse2._mm_cmpgt_epi32(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_i64x2(self, a: i64x2, b: i64x2) -> m64x2 {
		// 64-bit signed greater-than only exists starting with SSE4.2.
		cast!(self.sse4_2._mm_cmpgt_epi64(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_i8x16(self, a: i8x16, b: i8x16) -> m8x16 {
		cast!(self.sse2._mm_cmpgt_epi8(cast!(a), cast!(b)))
	}

	// Hardware only has signed ordered compares; the unsigned variants flip
	// the sign bit of both operands (XOR with the minimum signed value) so
	// that a signed compare yields the unsigned ordering.

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_u16x8(self, a: u16x8, b: u16x8) -> m16x8 {
		let k = self.splat_u16x8(0x8000);
		self.cmp_gt_i16x8(cast!(self.xor_u16x8(a, k)), cast!(self.xor_u16x8(b, k)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_u32x4(self, a: u32x4, b: u32x4) -> m32x4 {
		let k = self.splat_u32x4(0x80000000);
		self.cmp_gt_i32x4(cast!(self.xor_u32x4(a, k)), cast!(self.xor_u32x4(b, k)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_u64x2(self, a: u64x2, b: u64x2) -> m64x2 {
		let k = self.splat_u64x2(0x8000000000000000);
		self.cmp_gt_i64x2(cast!(self.xor_u64x2(a, k)), cast!(self.xor_u64x2(b, k)))
	}

	/// Compares the elements in each lane of `a` and `b` for greater-than.
	#[inline(always)]
	pub fn cmp_gt_u8x16(self, a: u8x16, b: u8x16) -> m8x16 {
		let k = self.splat_u8x16(0x80);
		self.cmp_gt_i8x16(cast!(self.xor_u8x16(a, k)), cast!(self.xor_u8x16(b, k)))
	}
460
	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_f32x4(self, a: f32x4, b: f32x4) -> m32x4 {
		cast!(self.sse._mm_cmple_ps(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_f64x2(self, a: f64x2, b: f64x2) -> m64x2 {
		cast!(self.sse2._mm_cmple_pd(cast!(a), cast!(b)))
	}

	// There is no integer `<=` instruction; it is synthesized as `!(a > b)`.

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_i16x8(self, a: i16x8, b: i16x8) -> m16x8 {
		self.not_m16x8(self.cmp_gt_i16x8(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_i32x4(self, a: i32x4, b: i32x4) -> m32x4 {
		self.not_m32x4(self.cmp_gt_i32x4(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_i64x2(self, a: i64x2, b: i64x2) -> m64x2 {
		self.not_m64x2(self.cmp_gt_i64x2(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_i8x16(self, a: i8x16, b: i8x16) -> m8x16 {
		self.not_m8x16(self.cmp_gt_i8x16(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_u16x8(self, a: u16x8, b: u16x8) -> m16x8 {
		self.not_m16x8(self.cmp_gt_u16x8(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_u32x4(self, a: u32x4, b: u32x4) -> m32x4 {
		self.not_m32x4(self.cmp_gt_u32x4(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_u64x2(self, a: u64x2, b: u64x2) -> m64x2 {
		self.not_m64x2(self.cmp_gt_u64x2(a, b))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.
	#[inline(always)]
	pub fn cmp_le_u8x16(self, a: u8x16, b: u8x16) -> m8x16 {
		self.not_m8x16(self.cmp_gt_u8x16(a, b))
	}
520
	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_f32x4(self, a: f32x4, b: f32x4) -> m32x4 {
		cast!(self.sse._mm_cmplt_ps(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_f64x2(self, a: f64x2, b: f64x2) -> m64x2 {
		cast!(self.sse2._mm_cmplt_pd(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_i16x8(self, a: i16x8, b: i16x8) -> m16x8 {
		cast!(self.sse2._mm_cmplt_epi16(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_i32x4(self, a: i32x4, b: i32x4) -> m32x4 {
		cast!(self.sse2._mm_cmplt_epi32(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_i64x2(self, a: i64x2, b: i64x2) -> m64x2 {
		// SSE4.2 only provides 64-bit greater-than; `a < b` is `b > a`.
		cast!(self.sse4_2._mm_cmpgt_epi64(cast!(b), cast!(a)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_i8x16(self, a: i8x16, b: i8x16) -> m8x16 {
		cast!(self.sse2._mm_cmplt_epi8(cast!(a), cast!(b)))
	}

	// Unsigned less-than uses the same sign-bit-flip trick as `cmp_gt_u*`:
	// XOR both operands with the minimum signed value, compare as signed.

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_u16x8(self, a: u16x8, b: u16x8) -> m16x8 {
		let k = self.splat_u16x8(0x8000);
		self.cmp_lt_i16x8(cast!(self.xor_u16x8(a, k)), cast!(self.xor_u16x8(b, k)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_u32x4(self, a: u32x4, b: u32x4) -> m32x4 {
		let k = self.splat_u32x4(0x80000000);
		self.cmp_lt_i32x4(cast!(self.xor_u32x4(a, k)), cast!(self.xor_u32x4(b, k)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_u64x2(self, a: u64x2, b: u64x2) -> m64x2 {
		let k = self.splat_u64x2(0x8000000000000000);
		self.cmp_lt_i64x2(cast!(self.xor_u64x2(a, k)), cast!(self.xor_u64x2(b, k)))
	}

	/// Compares the elements in each lane of `a` and `b` for less-than.
	#[inline(always)]
	pub fn cmp_lt_u8x16(self, a: u8x16, b: u8x16) -> m8x16 {
		let k = self.splat_u8x16(0x80);
		self.cmp_lt_i8x16(cast!(self.xor_u8x16(a, k)), cast!(self.xor_u8x16(b, k)))
	}
584
	// NOTE(review): these wrap the SSE "not"-predicate compares
	// (`_mm_cmpn??_{ps,pd}`), which are unordered compares — for NaN inputs
	// they appear to return all-ones; confirm against the Intel intrinsics
	// guide before relying on the NaN semantics.

	/// Compares the elements in each lane of `a` and `b` for inequality.
	#[inline(always)]
	pub fn cmp_not_eq_f32x4(self, a: f32x4, b: f32x4) -> m32x4 {
		cast!(self.sse._mm_cmpneq_ps(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for inequality.
	#[inline(always)]
	pub fn cmp_not_eq_f64x2(self, a: f64x2, b: f64x2) -> m64x2 {
		cast!(self.sse2._mm_cmpneq_pd(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-greater-than-or-equal.
	#[inline(always)]
	pub fn cmp_not_ge_f32x4(self, a: f32x4, b: f32x4) -> m32x4 {
		cast!(self.sse._mm_cmpnge_ps(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-greater-than-or-equal.
	#[inline(always)]
	pub fn cmp_not_ge_f64x2(self, a: f64x2, b: f64x2) -> m64x2 {
		cast!(self.sse2._mm_cmpnge_pd(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-greater-than.
	#[inline(always)]
	pub fn cmp_not_gt_f32x4(self, a: f32x4, b: f32x4) -> m32x4 {
		cast!(self.sse._mm_cmpngt_ps(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-greater-than.
	#[inline(always)]
	pub fn cmp_not_gt_f64x2(self, a: f64x2, b: f64x2) -> m64x2 {
		cast!(self.sse2._mm_cmpngt_pd(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-less-than-or-equal.
	#[inline(always)]
	pub fn cmp_not_le_f32x4(self, a: f32x4, b: f32x4) -> m32x4 {
		cast!(self.sse._mm_cmpnle_ps(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-less-than-or-equal.
	#[inline(always)]
	pub fn cmp_not_le_f64x2(self, a: f64x2, b: f64x2) -> m64x2 {
		cast!(self.sse2._mm_cmpnle_pd(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-less-than.
	#[inline(always)]
	pub fn cmp_not_lt_f32x4(self, a: f32x4, b: f32x4) -> m32x4 {
		cast!(self.sse._mm_cmpnlt_ps(cast!(a), cast!(b)))
	}

	/// Compares the elements in each lane of `a` and `b` for not-less-than.
	#[inline(always)]
	pub fn cmp_not_lt_f64x2(self, a: f64x2, b: f64x2) -> m64x2 {
		cast!(self.sse2._mm_cmpnlt_pd(cast!(a), cast!(b)))
	}
644
	/// Converts a `f32x4` to `f64x2`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_f32x4_to_f64x2(self, a: f32x4) -> f64x2 {
		// Only the low two `f32` lanes are widened; the rest are dropped.
		cast!(self.sse2._mm_cvtps_pd(cast!(a)))
	}

	/// Converts a `f32x4` to `i32x4`, elementwise.
	#[inline(always)]
	pub fn convert_f32x4_to_i32x4(self, a: f32x4) -> i32x4 {
		// `_mm_cvttps_epi32` (note the extra `t`) truncates toward zero.
		cast!(self.sse2._mm_cvttps_epi32(cast!(a)))
	}

	/// Converts a `f64x2` to `f32x4`, elementwise, filling the remaining elements with zeros.
	#[inline(always)]
	pub fn convert_f64x2_to_f32x4(self, a: f64x2) -> f32x4 {
		cast!(self.sse2._mm_cvtpd_ps(cast!(a)))
	}

	/// Converts a `f64x2` to `i32x4`, elementwise.
	#[inline(always)]
	pub fn convert_f64x2_to_i32x4(self, a: f64x2) -> i32x4 {
		// Truncating conversion; per Intel docs the upper two `i32` lanes
		// of the result are zeroed.
		cast!(self.sse2._mm_cvttpd_epi32(cast!(a)))
	}
668
	/// Converts a `i16x8` to `i32x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i16x8_to_i32x4(self, a: i16x8) -> i32x4 {
		// Sign-extends the low four lanes.
		cast!(self.sse4_1._mm_cvtepi16_epi32(cast!(a)))
	}

	/// Converts a `i16x8` to `i64x2`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i16x8_to_i64x2(self, a: i16x8) -> i64x2 {
		// Sign-extends the low two lanes.
		cast!(self.sse4_1._mm_cvtepi16_epi64(cast!(a)))
	}

	/// Converts a `i16x8` to `u16x8`, elementwise.
	#[inline(always)]
	pub fn convert_i16x8_to_u16x8(self, a: i16x8) -> u16x8 {
		// Same lane width: a pure bit reinterpretation, no instruction.
		cast!(a)
	}

	/// Converts a `i16x8` to `u32x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i16x8_to_u32x4(self, a: i16x8) -> u32x4 {
		// Sign-extends, then reinterprets the widened lanes as unsigned.
		cast!(self.sse4_1._mm_cvtepi16_epi32(cast!(a)))
	}

	/// Converts a `i16x8` to `u64x2`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i16x8_to_u64x2(self, a: i16x8) -> u64x2 {
		cast!(self.sse4_1._mm_cvtepi16_epi64(cast!(a)))
	}
698
	/// Converts a `i32x4` to `f32x4`, elementwise.
	#[inline(always)]
	pub fn convert_i32x4_to_f32x4(self, a: i32x4) -> f32x4 {
		cast!(self.sse2._mm_cvtepi32_ps(cast!(a)))
	}

	/// Converts a `i32x4` to `f64x2`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i32x4_to_f64x2(self, a: i32x4) -> f64x2 {
		// Only the low two `i32` lanes are converted.
		cast!(self.sse2._mm_cvtepi32_pd(cast!(a)))
	}

	/// Converts a `i32x4` to `i64x2`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i32x4_to_i64x2(self, a: i32x4) -> i64x2 {
		// Sign-extends the low two lanes.
		cast!(self.sse4_1._mm_cvtepi32_epi64(cast!(a)))
	}

	/// Converts a `i32x4` to `u32x4`, elementwise.
	#[inline(always)]
	pub fn convert_i32x4_to_u32x4(self, a: i32x4) -> u32x4 {
		// Same lane width: a pure bit reinterpretation, no instruction.
		cast!(a)
	}

	/// Converts a `i32x4` to `u64x2`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i32x4_to_u64x2(self, a: i32x4) -> u64x2 {
		// Sign-extends, then reinterprets the widened lanes as unsigned.
		cast!(self.sse4_1._mm_cvtepi32_epi64(cast!(a)))
	}
728
	/// Converts a `i8x16` to `i16x8`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i8x16_to_i16x8(self, a: i8x16) -> i16x8 {
		// Sign-extends the low eight lanes.
		cast!(self.sse4_1._mm_cvtepi8_epi16(cast!(a)))
	}

	/// Converts a `i8x16` to `i32x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i8x16_to_i32x4(self, a: i8x16) -> i32x4 {
		cast!(self.sse4_1._mm_cvtepi8_epi32(cast!(a)))
	}

	/// Converts a `i8x16` to `i64x2`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i8x16_to_i64x2(self, a: i8x16) -> i64x2 {
		cast!(self.sse4_1._mm_cvtepi8_epi64(cast!(a)))
	}

	/// Converts a `i8x16` to `u16x8`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i8x16_to_u16x8(self, a: i8x16) -> u16x8 {
		// Sign-extends, then reinterprets the widened lanes as unsigned.
		cast!(self.sse4_1._mm_cvtepi8_epi16(cast!(a)))
	}

	/// Converts a `i8x16` to `u32x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i8x16_to_u32x4(self, a: i8x16) -> u32x4 {
		cast!(self.sse4_1._mm_cvtepi8_epi32(cast!(a)))
	}

	/// Converts a `i8x16` to `u64x2`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_i8x16_to_u64x2(self, a: i8x16) -> u64x2 {
		cast!(self.sse4_1._mm_cvtepi8_epi64(cast!(a)))
	}

	/// Converts a `i8x16` to `u8x16`, elementwise.
	#[inline(always)]
	pub fn convert_i8x16_to_u8x16(self, a: i8x16) -> u8x16 {
		// Same lane width: a pure bit reinterpretation, no instruction.
		cast!(a)
	}
770
	/// Converts a `u16x8` to `i16x8`, elementwise.
	#[inline(always)]
	pub fn convert_u16x8_to_i16x8(self, a: u16x8) -> i16x8 {
		// Same lane width: a pure bit reinterpretation, no instruction.
		cast!(a)
	}

	/// Converts a `u16x8` to `i32x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u16x8_to_i32x4(self, a: u16x8) -> i32x4 {
		// `cvtepu*` zero-extends (unsigned widening) the low four lanes.
		cast!(self.sse4_1._mm_cvtepu16_epi32(cast!(a)))
	}

	/// Converts a `u16x8` to `i64x2`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u16x8_to_i64x2(self, a: u16x8) -> i64x2 {
		cast!(self.sse4_1._mm_cvtepu16_epi64(cast!(a)))
	}

	/// Converts a `u16x8` to `u32x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u16x8_to_u32x4(self, a: u16x8) -> u32x4 {
		cast!(self.sse4_1._mm_cvtepu16_epi32(cast!(a)))
	}

	/// Converts a `u16x8` to `u64x2`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u16x8_to_u64x2(self, a: u16x8) -> u64x2 {
		cast!(self.sse4_1._mm_cvtepu16_epi64(cast!(a)))
	}
800
	/// Converts a `u32x4` to `i32x4`, elementwise.
	#[inline(always)]
	pub fn convert_u32x4_to_i32x4(self, a: u32x4) -> i32x4 {
		// Same lane width: a pure bit reinterpretation, no instruction.
		cast!(a)
	}

	/// Converts a `u32x4` to `i64x2`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u32x4_to_i64x2(self, a: u32x4) -> i64x2 {
		// Zero-extends the low two lanes.
		cast!(self.sse4_1._mm_cvtepu32_epi64(cast!(a)))
	}

	/// Converts a `u32x4` to `u64x2`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u32x4_to_u64x2(self, a: u32x4) -> u64x2 {
		cast!(self.sse4_1._mm_cvtepu32_epi64(cast!(a)))
	}
818
	/// Converts a `u8x16` to `i16x8`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u8x16_to_i16x8(self, a: u8x16) -> i16x8 {
		// `cvtepu8_*` zero-extends (unsigned widening) the low lanes.
		cast!(self.sse4_1._mm_cvtepu8_epi16(cast!(a)))
	}

	/// Converts a `u8x16` to `i32x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u8x16_to_i32x4(self, a: u8x16) -> i32x4 {
		cast!(self.sse4_1._mm_cvtepu8_epi32(cast!(a)))
	}

	/// Converts a `u8x16` to `i64x2`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u8x16_to_i64x2(self, a: u8x16) -> i64x2 {
		cast!(self.sse4_1._mm_cvtepu8_epi64(cast!(a)))
	}

	/// Converts a `u8x16` to `i8x16`, elementwise.
	#[inline(always)]
	pub fn convert_u8x16_to_i8x16(self, a: u8x16) -> i8x16 {
		// Same lane width: a pure bit reinterpretation, no instruction.
		cast!(a)
	}

	/// Converts a `u8x16` to `u16x8`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u8x16_to_u16x8(self, a: u8x16) -> u16x8 {
		cast!(self.sse4_1._mm_cvtepu8_epi16(cast!(a)))
	}

	/// Converts a `u8x16` to `u32x4`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u8x16_to_u32x4(self, a: u8x16) -> u32x4 {
		cast!(self.sse4_1._mm_cvtepu8_epi32(cast!(a)))
	}

	/// Converts a `u8x16` to `u64x2`, elementwise, while truncating the extra elements.
	#[inline(always)]
	pub fn convert_u8x16_to_u64x2(self, a: u8x16) -> u64x2 {
		cast!(self.sse4_1._mm_cvtepu8_epi64(cast!(a)))
	}
860
	/// Computes `a / b` for each lane of `a` and `b`.
	#[inline(always)]
	pub fn div_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
		cast!(self.sse._mm_div_ps(cast!(a), cast!(b)))
	}

	/// Computes `a / b` for each lane of `a` and `b`.
	#[inline(always)]
	pub fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 {
		cast!(self.sse2._mm_div_pd(cast!(a), cast!(b)))
	}

	/// Rounds the elements of each lane of `a` down to the nearest integer (towards negative
	/// infinity).
	#[inline(always)]
	pub fn floor_f32x4(self, a: f32x4) -> f32x4 {
		cast!(self.sse4_1._mm_floor_ps(cast!(a)))
	}

	/// Rounds the elements of each lane of `a` down to the nearest integer (towards negative
	/// infinity).
	#[inline(always)]
	pub fn floor_f64x2(self, a: f64x2) -> f64x2 {
		cast!(self.sse4_1._mm_floor_pd(cast!(a)))
	}
884
	/// Adds adjacent pairs of lanes: the low half of the result holds the pairwise sums from
	/// `a`, the high half those from `b`.
	///
	/// See [_mm_hadd_ps].
	///
	/// [_mm_hadd_ps]: core::arch::x86_64::_mm_hadd_ps
	#[inline(always)]
	pub fn horizontal_add_pack_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
		cast!(self.sse3._mm_hadd_ps(cast!(a), cast!(b)))
	}

	/// Adds adjacent pairs of lanes: the low lane of the result holds the pairwise sum from
	/// `a`, the high lane that from `b`.
	///
	/// See [_mm_hadd_pd].
	///
	/// [_mm_hadd_pd]: core::arch::x86_64::_mm_hadd_pd
	#[inline(always)]
	pub fn horizontal_add_pack_f64x2(self, a: f64x2, b: f64x2) -> f64x2 {
		cast!(self.sse3._mm_hadd_pd(cast!(a), cast!(b)))
	}

	/// Adds adjacent pairs of lanes with wrapping on overflow: the low half of the result
	/// holds the pairwise sums from `a`, the high half those from `b`.
	///
	/// See [_mm_hadd_epi16].
	///
	/// [_mm_hadd_epi16]: core::arch::x86_64::_mm_hadd_epi16
	#[inline(always)]
	pub fn horizontal_add_pack_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
		cast!(self.ssse3._mm_hadd_epi16(cast!(a), cast!(b)))
	}

	/// Adds adjacent pairs of lanes with wrapping on overflow: the low half of the result
	/// holds the pairwise sums from `a`, the high half those from `b`.
	///
	/// See [_mm_hadd_epi32].
	///
	/// [_mm_hadd_epi32]: core::arch::x86_64::_mm_hadd_epi32
	#[inline(always)]
	pub fn horizontal_add_pack_i32x4(self, a: i32x4, b: i32x4) -> i32x4 {
		cast!(self.ssse3._mm_hadd_epi32(cast!(a), cast!(b)))
	}

	/// Adds adjacent pairs of lanes with signed saturation: the low half of the result holds
	/// the pairwise sums from `a`, the high half those from `b`.
	///
	/// See [_mm_hadds_epi16].
	///
	/// [_mm_hadds_epi16]: core::arch::x86_64::_mm_hadds_epi16
	#[inline(always)]
	pub fn horizontal_saturating_add_pack_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
		cast!(self.ssse3._mm_hadds_epi16(cast!(a), cast!(b)))
	}

	/// Subtracts adjacent pairs of lanes (even-indexed lane minus the following odd-indexed
	/// lane) with signed saturation: the low half of the result comes from `a`, the high
	/// half from `b`.
	///
	/// See [_mm_hsubs_epi16].
	///
	/// [_mm_hsubs_epi16]: core::arch::x86_64::_mm_hsubs_epi16
	#[inline(always)]
	pub fn horizontal_saturating_sub_pack_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
		cast!(self.ssse3._mm_hsubs_epi16(cast!(a), cast!(b)))
	}

	/// Subtracts adjacent pairs of lanes (even-indexed lane minus the following odd-indexed
	/// lane): the low half of the result comes from `a`, the high half from `b`.
	///
	/// See [_mm_hsub_ps].
	///
	/// [_mm_hsub_ps]: core::arch::x86_64::_mm_hsub_ps
	#[inline(always)]
	pub fn horizontal_sub_pack_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
		cast!(self.sse3._mm_hsub_ps(cast!(a), cast!(b)))
	}

	/// Subtracts adjacent pairs of lanes (low lane minus high lane): the low lane of the
	/// result comes from `a`, the high lane from `b`.
	///
	/// See [_mm_hsub_pd].
	///
	/// [_mm_hsub_pd]: core::arch::x86_64::_mm_hsub_pd
	#[inline(always)]
	pub fn horizontal_sub_pack_f64x2(self, a: f64x2, b: f64x2) -> f64x2 {
		cast!(self.sse3._mm_hsub_pd(cast!(a), cast!(b)))
	}

	/// Subtracts adjacent pairs of lanes (even-indexed lane minus the following odd-indexed
	/// lane) with wrapping on overflow: the low half of the result comes from `a`, the high
	/// half from `b`.
	///
	/// See [_mm_hsub_epi16].
	///
	/// [_mm_hsub_epi16]: core::arch::x86_64::_mm_hsub_epi16
	#[inline(always)]
	pub fn horizontal_sub_pack_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
		cast!(self.ssse3._mm_hsub_epi16(cast!(a), cast!(b)))
	}

	/// Subtracts adjacent pairs of lanes (even-indexed lane minus the following odd-indexed
	/// lane) with wrapping on overflow: the low half of the result comes from `a`, the high
	/// half from `b`.
	///
	/// See [_mm_hsub_epi32].
	///
	/// [_mm_hsub_epi32]: core::arch::x86_64::_mm_hsub_epi32
	#[inline(always)]
	pub fn horizontal_sub_pack_i32x4(self, a: i32x4, b: i32x4) -> i32x4 {
		cast!(self.ssse3._mm_hsub_epi32(cast!(a), cast!(b)))
	}
964
	/// Checks if the elements in each lane of `a` are NaN.
	#[inline(always)]
	pub fn is_nan_f32x4(self, a: f32x4) -> m32x4 {
		// A comparison is "unordered" iff at least one operand is NaN, so comparing `a`
		// with itself yields true exactly in the NaN lanes.
		cast!(self.sse._mm_cmpunord_ps(cast!(a), cast!(a)))
	}

	/// Checks if the elements in each lane of `a` are NaN.
	#[inline(always)]
	pub fn is_nan_f64x2(self, a: f64x2) -> m64x2 {
		// Unordered self-comparison: true exactly in the NaN lanes.
		cast!(self.sse2._mm_cmpunord_pd(cast!(a), cast!(a)))
	}

	/// Checks if the elements in each lane of `a` are not NaN.
	#[inline(always)]
	pub fn is_not_nan_f32x4(self, a: f32x4) -> m32x4 {
		// An "ordered" self-comparison is true exactly when the lane is not NaN.
		cast!(self.sse._mm_cmpord_ps(cast!(a), cast!(a)))
	}

	/// Checks if the elements in each lane of `a` are not NaN.
	#[inline(always)]
	pub fn is_not_nan_f64x2(self, a: f64x2) -> m64x2 {
		// Ordered self-comparison: true exactly in the non-NaN lanes.
		cast!(self.sse2._mm_cmpord_pd(cast!(a), cast!(a)))
	}
988
	/// Computes `max(a, b)` for each lane in `a` and `b`.
	///
	/// NOTE(review): follows x86 `maxps` semantics — if either operand of a lane is NaN, the
	/// second operand (`b`) is returned for that lane; confirm against the intrinsic docs if
	/// NaN propagation matters to the caller.
	#[inline(always)]
	pub fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
		cast!(self.sse._mm_max_ps(cast!(a), cast!(b)))
	}

	/// Computes `max(a, b)` for each lane in `a` and `b`.
	///
	/// NOTE(review): follows x86 `maxpd` semantics — if either operand of a lane is NaN, the
	/// second operand (`b`) is returned for that lane; confirm against the intrinsic docs if
	/// NaN propagation matters to the caller.
	#[inline(always)]
	pub fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 {
		cast!(self.sse2._mm_max_pd(cast!(a), cast!(b)))
	}

	/// Computes `max(a, b)` for each lane in `a` and `b`.
	#[inline(always)]
	pub fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
		cast!(self.sse2._mm_max_epi16(cast!(a), cast!(b)))
	}

	/// Computes `max(a, b)` for each lane in `a` and `b`.
	#[inline(always)]
	pub fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 {
		cast!(self.sse4_1._mm_max_epi32(cast!(a), cast!(b)))
	}

	/// Computes `max(a, b)` for each lane in `a` and `b`.
	#[inline(always)]
	pub fn max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 {
		cast!(self.sse4_1._mm_max_epi8(cast!(a), cast!(b)))
	}

	/// Computes `max(a, b)` for each lane in `a` and `b`.
	#[inline(always)]
	pub fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
		cast!(self.sse4_1._mm_max_epu16(cast!(a), cast!(b)))
	}

	/// Computes `max(a, b)` for each lane in `a` and `b`.
	#[inline(always)]
	pub fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 {
		cast!(self.sse4_1._mm_max_epu32(cast!(a), cast!(b)))
	}

	/// Computes `max(a, b)` for each lane in `a` and `b`.
	#[inline(always)]
	pub fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
		cast!(self.sse2._mm_max_epu8(cast!(a), cast!(b)))
	}
1036
	/// Computes `min(a, b)` for each lane in `a` and `b`.
	///
	/// NOTE(review): follows x86 `minps` semantics — if either operand of a lane is NaN, the
	/// second operand (`b`) is returned for that lane; confirm against the intrinsic docs if
	/// NaN propagation matters to the caller.
	#[inline(always)]
	pub fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
		cast!(self.sse._mm_min_ps(cast!(a), cast!(b)))
	}

	/// Computes `min(a, b)` for each lane in `a` and `b`.
	///
	/// NOTE(review): follows x86 `minpd` semantics — if either operand of a lane is NaN, the
	/// second operand (`b`) is returned for that lane; confirm against the intrinsic docs if
	/// NaN propagation matters to the caller.
	#[inline(always)]
	pub fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 {
		cast!(self.sse2._mm_min_pd(cast!(a), cast!(b)))
	}

	/// Computes `min(a, b)` for each lane in `a` and `b`.
	#[inline(always)]
	pub fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
		cast!(self.sse2._mm_min_epi16(cast!(a), cast!(b)))
	}

	/// Computes `min(a, b)` for each lane in `a` and `b`.
	#[inline(always)]
	pub fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 {
		cast!(self.sse4_1._mm_min_epi32(cast!(a), cast!(b)))
	}

	/// Computes `min(a, b)` for each lane in `a` and `b`.
	#[inline(always)]
	pub fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 {
		cast!(self.sse4_1._mm_min_epi8(cast!(a), cast!(b)))
	}

	/// Computes `min(a, b)` for each lane in `a` and `b`.
	#[inline(always)]
	pub fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
		cast!(self.sse4_1._mm_min_epu16(cast!(a), cast!(b)))
	}

	/// Computes `min(a, b)` for each lane in `a` and `b`.
	#[inline(always)]
	pub fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 {
		cast!(self.sse4_1._mm_min_epu32(cast!(a), cast!(b)))
	}

	/// Computes `min(a, b)` for each lane in `a` and `b`.
	#[inline(always)]
	pub fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
		cast!(self.sse2._mm_min_epu8(cast!(a), cast!(b)))
	}
1084
	/// Computes `a * b` for each lane in `a` and `b`.
	#[inline(always)]
	pub fn mul_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
		cast!(self.sse._mm_mul_ps(cast!(a), cast!(b)))
	}

	/// Computes `a * b` for each lane in `a` and `b`.
	#[inline(always)]
	pub fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 {
		cast!(self.sse2._mm_mul_pd(cast!(a), cast!(b)))
	}

	/// Multiplies corresponding byte lanes, then adds adjacent pairs of products with signed
	/// 16-bit saturation.
	///
	/// NOTE(review): per the intrinsic's documentation, `_mm_maddubs_epi16` interprets its
	/// first operand as *unsigned* bytes and its second as signed bytes, even though both
	/// parameters here are typed `i8x16` — confirm the intended signedness of `a`.
	///
	/// See [_mm_maddubs_epi16].
	///
	/// [_mm_maddubs_epi16]: core::arch::x86_64::_mm_maddubs_epi16
	#[inline(always)]
	pub fn multiply_saturating_add_adjacent_i8x16(self, a: i8x16, b: i8x16) -> i16x8 {
		cast!(self.ssse3._mm_maddubs_epi16(cast!(a), cast!(b)))
	}

	/// Multiplies corresponding 16-bit lanes into 32-bit intermediates, then adds adjacent
	/// pairs of intermediates to produce four 32-bit results.
	///
	/// See [_mm_madd_epi16].
	///
	/// [_mm_madd_epi16]: core::arch::x86_64::_mm_madd_epi16
	#[inline(always)]
	pub fn multiply_wrapping_add_adjacent_i16x8(self, a: i16x8, b: i16x8) -> i32x4 {
		cast!(self.sse2._mm_madd_epi16(cast!(a), cast!(b)))
	}

	/// Computes sums of absolute differences between groups of bytes of `a` and `b`; the
	/// `OFFSETS` immediate selects the starting positions within each operand (see the
	/// intrinsic's documentation for the exact encoding).
	///
	/// See [_mm_mpsadbw_epu8].
	///
	/// [_mm_mpsadbw_epu8]: core::arch::x86_64::_mm_mpsadbw_epu8
	#[inline(always)]
	pub fn multisum_of_absolute_differences_u8x16<const OFFSETS: i32>(
		self,
		a: u8x16,
		b: u8x16,
	) -> u16x8 {
		cast!(self.sse4_1._mm_mpsadbw_epu8::<OFFSETS>(cast!(a), cast!(b)))
	}
1124
	// Bitwise NOT helpers, implemented as XOR against an all-ones vector since SSE has no
	// dedicated NOT instruction.

	/// Returns `!a` for each bit in `a`.
	#[inline(always)]
	pub fn not_i16x8(self, a: i16x8) -> i16x8 {
		self.xor_i16x8(a, self.splat_i16x8(!0))
	}

	/// Returns `!a` for each bit in `a`.
	#[inline(always)]
	pub fn not_i32x4(self, a: i32x4) -> i32x4 {
		self.xor_i32x4(a, self.splat_i32x4(!0))
	}

	/// Returns `!a` for each bit in `a`.
	#[inline(always)]
	pub fn not_i64x2(self, a: i64x2) -> i64x2 {
		self.xor_i64x2(a, self.splat_i64x2(!0))
	}

	/// Returns `!a` for each bit in `a`.
	#[inline(always)]
	pub fn not_i8x16(self, a: i8x16) -> i8x16 {
		self.xor_i8x16(a, self.splat_i8x16(!0))
	}

	/// Returns `!a` for each bit in `a`.
	#[inline(always)]
	pub fn not_m16x8(self, a: m16x8) -> m16x8 {
		self.xor_m16x8(a, self.splat_m16x8(m16::new(true)))
	}

	/// Returns `!a` for each bit in `a`.
	#[inline(always)]
	pub fn not_m32x4(self, a: m32x4) -> m32x4 {
		self.xor_m32x4(a, self.splat_m32x4(m32::new(true)))
	}

	/// Returns `!a` for each bit in `a`.
	#[inline(always)]
	pub fn not_m64x2(self, a: m64x2) -> m64x2 {
		self.xor_m64x2(a, self.splat_m64x2(m64::new(true)))
	}

	/// Returns `!a` for each bit in `a`.
	#[inline(always)]
	pub fn not_m8x16(self, a: m8x16) -> m8x16 {
		self.xor_m8x16(a, self.splat_m8x16(m8::new(true)))
	}

	/// Returns `!a` for each bit in `a`.
	#[inline(always)]
	pub fn not_u16x8(self, a: u16x8) -> u16x8 {
		self.xor_u16x8(a, self.splat_u16x8(!0))
	}

	/// Returns `!a` for each bit in `a`.
	#[inline(always)]
	pub fn not_u32x4(self, a: u32x4) -> u32x4 {
		self.xor_u32x4(a, self.splat_u32x4(!0))
	}

	/// Returns `!a` for each bit in `a`.
	#[inline(always)]
	pub fn not_u64x2(self, a: u64x2) -> u64x2 {
		self.xor_u64x2(a, self.splat_u64x2(!0))
	}

	/// Returns `!a` for each bit in `a`.
	#[inline(always)]
	pub fn not_u8x16(self, a: u8x16) -> u8x16 {
		self.xor_u8x16(a, self.splat_u8x16(!0))
	}
1196
	// Bitwise OR helpers. The `f32`/`f64` variants use the SSE/SSE2 floating-point OR ops;
	// every integer and mask variant lowers to the same `_mm_or_si128`, since OR is
	// type-agnostic at the bit level.

	/// Returns `a | b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn or_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
		cast!(self.sse._mm_or_ps(cast!(a), cast!(b)))
	}

	/// Returns `a | b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn or_f64x2(self, a: f64x2, b: f64x2) -> f64x2 {
		cast!(self.sse2._mm_or_pd(cast!(a), cast!(b)))
	}

	/// Returns `a | b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
		cast!(self.sse2._mm_or_si128(cast!(a), cast!(b)))
	}

	/// Returns `a | b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 {
		cast!(self.sse2._mm_or_si128(cast!(a), cast!(b)))
	}

	/// Returns `a | b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn or_i64x2(self, a: i64x2, b: i64x2) -> i64x2 {
		cast!(self.sse2._mm_or_si128(cast!(a), cast!(b)))
	}

	/// Returns `a | b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16 {
		cast!(self.sse2._mm_or_si128(cast!(a), cast!(b)))
	}

	/// Returns `a | b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn or_m16x8(self, a: m16x8, b: m16x8) -> m16x8 {
		cast!(self.sse2._mm_or_si128(cast!(a), cast!(b)))
	}

	/// Returns `a | b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn or_m32x4(self, a: m32x4, b: m32x4) -> m32x4 {
		cast!(self.sse2._mm_or_si128(cast!(a), cast!(b)))
	}

	/// Returns `a | b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn or_m64x2(self, a: m64x2, b: m64x2) -> m64x2 {
		cast!(self.sse2._mm_or_si128(cast!(a), cast!(b)))
	}

	/// Returns `a | b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn or_m8x16(self, a: m8x16, b: m8x16) -> m8x16 {
		cast!(self.sse2._mm_or_si128(cast!(a), cast!(b)))
	}

	/// Returns `a | b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
		cast!(self.sse2._mm_or_si128(cast!(a), cast!(b)))
	}

	/// Returns `a | b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 {
		cast!(self.sse2._mm_or_si128(cast!(a), cast!(b)))
	}

	/// Returns `a | b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn or_u64x2(self, a: u64x2, b: u64x2) -> u64x2 {
		cast!(self.sse2._mm_or_si128(cast!(a), cast!(b)))
	}

	/// Returns `a | b` for each bit in `a` and `b`.
	#[inline(always)]
	pub fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
		cast!(self.sse2._mm_or_si128(cast!(a), cast!(b)))
	}
1280
	/// Narrows the 16-bit lanes of `a` and `b` to 8-bit lanes with signed saturation; lanes
	/// from `a` occupy the low half of the result, lanes from `b` the high half.
	///
	/// See [_mm_packs_epi16].
	///
	/// [_mm_packs_epi16]: core::arch::x86_64::_mm_packs_epi16
	#[inline(always)]
	pub fn pack_with_signed_saturation_i16x8(self, a: i16x8, b: i16x8) -> i8x16 {
		cast!(self.sse2._mm_packs_epi16(cast!(a), cast!(b)))
	}

	/// Narrows the 32-bit lanes of `a` and `b` to 16-bit lanes with signed saturation; lanes
	/// from `a` occupy the low half of the result, lanes from `b` the high half.
	///
	/// See [_mm_packs_epi32].
	///
	/// [_mm_packs_epi32]: core::arch::x86_64::_mm_packs_epi32
	#[inline(always)]
	pub fn pack_with_signed_saturation_i32x4(self, a: i32x4, b: i32x4) -> i16x8 {
		cast!(self.sse2._mm_packs_epi32(cast!(a), cast!(b)))
	}

	/// Narrows the signed 16-bit lanes of `a` and `b` to 8-bit lanes with unsigned
	/// saturation; lanes from `a` occupy the low half of the result, lanes from `b` the
	/// high half.
	///
	/// See [_mm_packus_epi16].
	///
	/// [_mm_packus_epi16]: core::arch::x86_64::_mm_packus_epi16
	#[inline(always)]
	pub fn pack_with_unsigned_saturation_i16x8(self, a: i16x8, b: i16x8) -> u8x16 {
		cast!(self.sse2._mm_packus_epi16(cast!(a), cast!(b)))
	}

	/// Narrows the signed 32-bit lanes of `a` and `b` to 16-bit lanes with unsigned
	/// saturation; lanes from `a` occupy the low half of the result, lanes from `b` the
	/// high half.
	///
	/// See [_mm_packus_epi32].
	///
	/// [_mm_packus_epi32]: core::arch::x86_64::_mm_packus_epi32
	#[inline(always)]
	pub fn pack_with_unsigned_saturation_i32x4(self, a: i32x4, b: i32x4) -> u16x8 {
		cast!(self.sse4_1._mm_packus_epi32(cast!(a), cast!(b)))
	}
1312
	/// Returns the componentwise maximum of the two complex (`c32`) lanes of `a`; real and
	/// imaginary parts are reduced independently.
	#[inline(always)]
	pub fn reduce_max_c32x2(self, a: f32x4) -> c32 {
		// re0 im0 re1 im1
		let a: __m128 = cast!(a);
		// re1 im1 re1 im1
		let hi = self.sse._mm_movehl_ps(a, a);

		// max(re0, re1) max(im0, im1) _ _
		let r0 = self.sse._mm_max_ps(a, hi);

		// Extract the low 64 bits (one complex number) and reinterpret them as a `c32`.
		cast!(self.sse2._mm_cvtsd_f64(cast!(r0)))
	}

	/// Returns the single `c64` lane of `a`; the reduction of one element is the identity.
	#[inline(always)]
	pub fn reduce_max_c64x1(self, a: f64x2) -> c64 {
		cast!(a)
	}

	/// Returns the maximum of the four lanes of `a`, reduced in tree order with `maxps`/
	/// `maxss` semantics.
	#[inline(always)]
	pub fn reduce_max_f32x4(self, a: f32x4) -> f32 {
		// a0 a1 a2 a3
		let a: __m128 = cast!(a);
		// a2 a3 a2 a3
		let hi = self.sse._mm_movehl_ps(a, a);
		// max(a0, a2) max(a1, a3) _ _
		let r0 = self.sse._mm_max_ps(a, hi);
		// Move lane 1 of `r0` into lane 0.
		let r0_shuffled = self.sse._mm_shuffle_ps::<0b0001>(r0, r0);
		let r = self.sse._mm_max_ss(r0, r0_shuffled);
		self.sse._mm_cvtss_f32(r)
	}

	/// Returns the maximum of the two lanes of `a`, with `maxsd` semantics.
	#[inline(always)]
	pub fn reduce_max_f64x2(self, a: f64x2) -> f64 {
		let a: __m128d = cast!(a);
		// Broadcast the high lane into the low lane (movehl on the raw bits).
		let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
		let r = self.sse2._mm_max_sd(a, hi);
		self.sse2._mm_cvtsd_f64(r)
	}

	/// Returns the componentwise minimum of the two complex (`c32`) lanes of `a`; real and
	/// imaginary parts are reduced independently.
	#[inline(always)]
	pub fn reduce_min_c32x2(self, a: f32x4) -> c32 {
		// re0 im0 re1 im1
		let a: __m128 = cast!(a);
		// re1 im1 re1 im1
		let hi = self.sse._mm_movehl_ps(a, a);

		// min(re0, re1) min(im0, im1) _ _
		let r0 = self.sse._mm_min_ps(a, hi);

		// Extract the low 64 bits (one complex number) and reinterpret them as a `c32`.
		cast!(self.sse2._mm_cvtsd_f64(cast!(r0)))
	}

	/// Returns the single `c64` lane of `a`; the reduction of one element is the identity.
	#[inline(always)]
	pub fn reduce_min_c64x1(self, a: f64x2) -> c64 {
		cast!(a)
	}

	/// Returns the minimum of the four lanes of `a`, reduced in tree order with `minps`/
	/// `minss` semantics.
	#[inline(always)]
	pub fn reduce_min_f32x4(self, a: f32x4) -> f32 {
		// a0 a1 a2 a3
		let a: __m128 = cast!(a);
		// a2 a3 a2 a3
		let hi = self.sse._mm_movehl_ps(a, a);
		// min(a0, a2) min(a1, a3) _ _
		let r0 = self.sse._mm_min_ps(a, hi);
		// Move lane 1 of `r0` into lane 0.
		let r0_shuffled = self.sse._mm_shuffle_ps::<0b0001>(r0, r0);
		let r = self.sse._mm_min_ss(r0, r0_shuffled);
		self.sse._mm_cvtss_f32(r)
	}

	/// Returns the minimum of the two lanes of `a`, with `minsd` semantics.
	#[inline(always)]
	pub fn reduce_min_f64x2(self, a: f64x2) -> f64 {
		let a: __m128d = cast!(a);
		// Broadcast the high lane into the low lane (movehl on the raw bits).
		let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
		let r = self.sse2._mm_min_sd(a, hi);
		self.sse2._mm_cvtsd_f64(r)
	}

	/// Returns the product of the four lanes of `a`, computed in tree order
	/// `(a0*a2) * (a1*a3)`, which may differ in the last bit from sequential
	/// multiplication.
	#[inline(always)]
	pub fn reduce_product_f32x4(self, a: f32x4) -> f32 {
		// a0 a1 a2 a3
		let a: __m128 = cast!(a);
		// a2 a3 a2 a3
		let hi = self.sse._mm_movehl_ps(a, a);
		// a0*a2 a1*a3 _ _
		let r0 = self.sse._mm_mul_ps(a, hi);
		// Move lane 1 of `r0` into lane 0.
		let r0_shuffled = self.sse._mm_shuffle_ps::<0b0001>(r0, r0);
		let r = self.sse._mm_mul_ss(r0, r0_shuffled);
		self.sse._mm_cvtss_f32(r)
	}

	/// Returns the product of the two lanes of `a`.
	#[inline(always)]
	pub fn reduce_product_f64x2(self, a: f64x2) -> f64 {
		let a: __m128d = cast!(a);
		// Broadcast the high lane into the low lane (movehl on the raw bits).
		let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
		let r = self.sse2._mm_mul_sd(a, hi);
		self.sse2._mm_cvtsd_f64(r)
	}

	/// Returns the componentwise sum of the two complex (`c32`) lanes of `a`; real and
	/// imaginary parts are summed independently.
	#[inline(always)]
	pub fn reduce_sum_c32x2(self, a: f32x4) -> c32 {
		// re0 im0 re1 im1
		let a: __m128 = cast!(a);
		// re1 im1 re1 im1
		let hi = self.sse._mm_movehl_ps(a, a);

		// re0+re1 im0+im1 _ _
		let r0 = self.sse._mm_add_ps(a, hi);

		// Extract the low 64 bits (one complex number) and reinterpret them as a `c32`.
		cast!(self.sse2._mm_cvtsd_f64(cast!(r0)))
	}

	/// Returns the single `c64` lane of `a`; the reduction of one element is the identity.
	#[inline(always)]
	pub fn reduce_sum_c64x1(self, a: f64x2) -> c64 {
		cast!(a)
	}

	/// Returns the sum of the four lanes of `a`, computed in tree order
	/// `(a0+a2) + (a1+a3)`, which may differ in the last bit from sequential addition.
	#[inline(always)]
	pub fn reduce_sum_f32x4(self, a: f32x4) -> f32 {
		// a0 a1 a2 a3
		let a: __m128 = cast!(a);
		// a2 a3 a2 a3
		let hi = self.sse._mm_movehl_ps(a, a);

		// a0+a2 a1+a3 _ _
		let r0 = self.sse._mm_add_ps(a, hi);
		// a1+a3 a0+a2 _ _ (lane 1 moved into lane 0)
		let r0_shuffled = self.sse._mm_shuffle_ps::<0b0001>(r0, r0);

		let r = self.sse._mm_add_ss(r0, r0_shuffled);

		self.sse._mm_cvtss_f32(r)
	}

	/// Returns the sum of the two lanes of `a`.
	#[inline(always)]
	pub fn reduce_sum_f64x2(self, a: f64x2) -> f64 {
		let a: __m128d = cast!(a);
		// Broadcast the high lane into the low lane (movehl on the raw bits).
		let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
		let r = self.sse2._mm_add_sd(a, hi);
		self.sse2._mm_cvtsd_f64(r)
	}
1445
	/// Rounds the elements of each lane of `a` to the nearest integer. If two values are equally
	/// close, the even value is returned.
	#[inline(always)]
	pub fn round_f32x4(self, a: f32x4) -> f32x4 {
		// Round-to-nearest-even; `NO_EXC` suppresses floating-point exception signaling.
		const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
		cast!(self.sse4_1._mm_round_ps::<ROUNDING>(cast!(a)))
	}

	/// Rounds the elements of each lane of `a` to the nearest integer. If two values are equally
	/// close, the even value is returned.
	#[inline(always)]
	pub fn round_f64x2(self, a: f64x2) -> f64x2 {
		// Round-to-nearest-even; `NO_EXC` suppresses floating-point exception signaling.
		const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
		cast!(self.sse4_1._mm_round_pd::<ROUNDING>(cast!(a)))
	}
1461
	/// Adds the elements of each lane of `a` and `b`, with saturation.
	#[inline(always)]
	pub fn saturating_add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
		cast!(self.sse2._mm_adds_epi16(cast!(a), cast!(b)))
	}

	/// Adds the elements of each lane of `a` and `b`, with saturation.
	#[inline(always)]
	pub fn saturating_add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 {
		cast!(self.sse2._mm_adds_epi8(cast!(a), cast!(b)))
	}

	/// Adds the elements of each lane of `a` and `b`, with saturation.
	#[inline(always)]
	pub fn saturating_add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
		cast!(self.sse2._mm_adds_epu16(cast!(a), cast!(b)))
	}

	/// Adds the elements of each lane of `a` and `b`, with saturation.
	#[inline(always)]
	pub fn saturating_add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
		cast!(self.sse2._mm_adds_epu8(cast!(a), cast!(b)))
	}

	/// Computes `a - b` for each lane of `a` and `b`, with saturation.
	#[inline(always)]
	pub fn saturating_sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
		cast!(self.sse2._mm_subs_epi16(cast!(a), cast!(b)))
	}

	/// Computes `a - b` for each lane of `a` and `b`, with saturation.
	#[inline(always)]
	pub fn saturating_sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 {
		cast!(self.sse2._mm_subs_epi8(cast!(a), cast!(b)))
	}

	/// Computes `a - b` for each lane of `a` and `b`, with saturation (clamping at zero).
	#[inline(always)]
	pub fn saturating_sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
		cast!(self.sse2._mm_subs_epu16(cast!(a), cast!(b)))
	}

	/// Computes `a - b` for each lane of `a` and `b`, with saturation (clamping at zero).
	#[inline(always)]
	pub fn saturating_sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
		cast!(self.sse2._mm_subs_epu8(cast!(a), cast!(b)))
	}
1509
	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// bit in the mask is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_const_f32x4<const MASK4: i32>(self, if_true: f32x4, if_false: f32x4) -> f32x4 {
		cast!(self.select_const_u32x4::<MASK4>(cast!(if_true), cast!(if_false)))
	}

	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// bit in the mask is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_const_f64x2<const MASK2: i32>(self, if_true: f64x2, if_false: f64x2) -> f64x2 {
		cast!(self.select_const_u64x2::<MASK2>(cast!(if_true), cast!(if_false)))
	}

	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// bit in the mask is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_const_i32x4<const MASK4: i32>(self, if_true: i32x4, if_false: i32x4) -> i32x4 {
		cast!(self.select_const_u32x4::<MASK4>(cast!(if_true), cast!(if_false)))
	}

	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// bit in the mask is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_const_i64x2<const MASK2: i32>(self, if_true: i64x2, if_false: i64x2) -> i64x2 {
		cast!(self.select_const_u64x2::<MASK2>(cast!(if_true), cast!(if_false)))
	}

	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// bit in the mask is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_const_u32x4<const MASK4: i32>(self, if_true: u32x4, if_false: u32x4) -> u32x4 {
		// `_mm_blend_ps` takes from its *second* operand where a mask bit is set, hence
		// `if_false` is passed first.
		cast!(
			self.sse4_1
				._mm_blend_ps::<MASK4>(cast!(if_false), cast!(if_true)),
		)
	}

	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// bit in the mask is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_const_u64x2<const MASK2: i32>(self, if_true: u64x2, if_false: u64x2) -> u64x2 {
		// `_mm_blend_pd` takes from its *second* operand where a mask bit is set, hence
		// `if_false` is passed first.
		cast!(
			self.sse4_1
				._mm_blend_pd::<MASK2>(cast!(if_false), cast!(if_true)),
		)
	}
1557
	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_f32x4(self, mask: m32x4, if_true: f32x4, if_false: f32x4) -> f32x4 {
		// `_mm_blendv_ps` selects from its *second* operand where the top bit of the
		// corresponding mask lane is set, hence `if_false` is passed first.
		cast!(
			self.sse4_1
				._mm_blendv_ps(cast!(if_false), cast!(if_true), cast!(mask)),
		)
	}

	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_f64x2(self, mask: m64x2, if_true: f64x2, if_false: f64x2) -> f64x2 {
		// `_mm_blendv_pd` selects from its *second* operand where the top bit of the
		// corresponding mask lane is set, hence `if_false` is passed first.
		cast!(
			self.sse4_1
				._mm_blendv_pd(cast!(if_false), cast!(if_true), cast!(mask)),
		)
	}

	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_i16x8(self, mask: m16x8, if_true: i16x8, if_false: i16x8) -> i16x8 {
		cast!(self.select_u16x8(mask, cast!(if_true), cast!(if_false)))
	}

	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_i32x4(self, mask: m32x4, if_true: i32x4, if_false: i32x4) -> i32x4 {
		cast!(self.select_u32x4(mask, cast!(if_true), cast!(if_false)))
	}

	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_i64x2(self, mask: m64x2, if_true: i64x2, if_false: i64x2) -> i64x2 {
		cast!(self.select_u64x2(mask, cast!(if_true), cast!(if_false)))
	}

	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_i8x16(self, mask: m8x16, if_true: i8x16, if_false: i8x16) -> i8x16 {
		cast!(self.select_u8x16(mask, cast!(if_true), cast!(if_false)))
	}

	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_u16x8(self, mask: m16x8, if_true: u16x8, if_false: u16x8) -> u16x8 {
		// `_mm_blendv_epi8` selects per *byte* on each byte's top bit; this matches per-lane
		// selection as long as each mask lane is all-ones or all-zeros — presumably the
		// `m16` invariant; NOTE(review): confirm against the mask type's definition.
		cast!(
			self.sse4_1
				._mm_blendv_epi8(cast!(if_false), cast!(if_true), cast!(mask)),
		)
	}

	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_u32x4(self, mask: m32x4, if_true: u32x4, if_false: u32x4) -> u32x4 {
		// Per-byte blend; correct for 32-bit lanes when mask lanes are all-ones/all-zeros.
		cast!(
			self.sse4_1
				._mm_blendv_epi8(cast!(if_false), cast!(if_true), cast!(mask)),
		)
	}

	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_u64x2(self, mask: m64x2, if_true: u64x2, if_false: u64x2) -> u64x2 {
		// Per-byte blend; correct for 64-bit lanes when mask lanes are all-ones/all-zeros.
		cast!(
			self.sse4_1
				._mm_blendv_epi8(cast!(if_false), cast!(if_true), cast!(mask)),
		)
	}

	/// Combines `if_true` and `if_false`, selecting elements from `if_true` if the corresponding
	/// mask in `mask` is set, otherwise selecting elements from `if_false`.
	#[inline(always)]
	pub fn select_u8x16(self, mask: m8x16, if_true: u8x16, if_false: u8x16) -> u8x16 {
		// Per-byte blend keyed on each mask byte's top bit.
		cast!(
			self.sse4_1
				._mm_blendv_epi8(cast!(if_false), cast!(if_true), cast!(mask)),
		)
	}
1645
	/// Shift the bits of each lane of `a` to the left by `AMOUNT`, while shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	#[inline(always)]
	pub fn shl_const_i16x8<const AMOUNT: i32>(self, a: i16x8) -> i16x8 {
		cast!(self.sse2._mm_slli_epi16::<AMOUNT>(cast!(a)))
	}

	/// Shift the bits of each lane of `a` to the left by `AMOUNT`, while shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	#[inline(always)]
	pub fn shl_const_i32x4<const AMOUNT: i32>(self, a: i32x4) -> i32x4 {
		cast!(self.sse2._mm_slli_epi32::<AMOUNT>(cast!(a)))
	}

	/// Shift the bits of each lane of `a` to the left by `AMOUNT`, while shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	#[inline(always)]
	pub fn shl_const_i64x2<const AMOUNT: i32>(self, a: i64x2) -> i64x2 {
		cast!(self.sse2._mm_slli_epi64::<AMOUNT>(cast!(a)))
	}

	/// Shift the bits of each lane of `a` to the left by `AMOUNT`, while shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	#[inline(always)]
	pub fn shl_const_u16x8<const AMOUNT: i32>(self, a: u16x8) -> u16x8 {
		// Logical left shift is sign-agnostic, so the same intrinsic serves signed and
		// unsigned lanes.
		cast!(self.sse2._mm_slli_epi16::<AMOUNT>(cast!(a)))
	}

	/// Shift the bits of each lane of `a` to the left by `AMOUNT`, while shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	#[inline(always)]
	pub fn shl_const_u32x4<const AMOUNT: i32>(self, a: u32x4) -> u32x4 {
		cast!(self.sse2._mm_slli_epi32::<AMOUNT>(cast!(a)))
	}

	/// Shift the bits of each lane of `a` to the left by `AMOUNT`, while shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	#[inline(always)]
	pub fn shl_const_u64x2<const AMOUNT: i32>(self, a: u64x2) -> u64x2 {
		cast!(self.sse2._mm_slli_epi64::<AMOUNT>(cast!(a)))
	}
1687
	/// Shift the bits of each lane of `a` to the left by the first element in `amount`, while
	/// shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	#[inline(always)]
	pub fn shl_i16x8(self, a: i16x8, amount: u64x2) -> i16x8 {
		cast!(self.sse2._mm_sll_epi16(cast!(a), cast!(amount)))
	}

	/// Shift the bits of each lane of `a` to the left by the first element in `amount`, while
	/// shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	#[inline(always)]
	pub fn shl_i32x4(self, a: i32x4, amount: u64x2) -> i32x4 {
		cast!(self.sse2._mm_sll_epi32(cast!(a), cast!(amount)))
	}

	/// Shift the bits of each lane of `a` to the left by the first element in `amount`, while
	/// shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	///
	/// NOTE(review): this returns `u64x2`, unlike its siblings which return the input type
	/// (`i64x2` would be expected here). Changing the return type would break existing
	/// callers, so it is left as is — confirm upstream whether this is intentional.
	#[inline(always)]
	pub fn shl_i64x2(self, a: i64x2, amount: u64x2) -> u64x2 {
		cast!(self.sse2._mm_sll_epi64(cast!(a), cast!(amount)))
	}

	/// Shift the bits of each lane of `a` to the left by the first element in `amount`, while
	/// shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	#[inline(always)]
	pub fn shl_u16x8(self, a: u16x8, amount: u64x2) -> u16x8 {
		cast!(self.sse2._mm_sll_epi16(cast!(a), cast!(amount)))
	}

	/// Shift the bits of each lane of `a` to the left by the first element in `amount`, while
	/// shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	#[inline(always)]
	pub fn shl_u32x4(self, a: u32x4, amount: u64x2) -> u32x4 {
		cast!(self.sse2._mm_sll_epi32(cast!(a), cast!(amount)))
	}

	/// Shift the bits of each lane of `a` to the left by the first element in `amount`, while
	/// shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	#[inline(always)]
	pub fn shl_u64x2(self, a: u64x2, amount: u64x2) -> u64x2 {
		cast!(self.sse2._mm_sll_epi64(cast!(a), cast!(amount)))
	}
1735
	/// Shift the bits of each lane of `a` to the right by `AMOUNT`, while shifting in sign bits.
	/// Shifting by a value greater than the bit width of the type sets the result to zero if the
	/// sign bit is not set, and to `-1` if the sign bit is set.
	#[inline(always)]
	pub fn shr_const_i16x8<const AMOUNT: i32>(self, a: i16x8) -> i16x8 {
		// Arithmetic shift: vacated high bits are filled with copies of the sign bit.
		cast!(self.sse2._mm_srai_epi16::<AMOUNT>(cast!(a)))
	}

	/// Shift the bits of each lane of `a` to the right by `AMOUNT`, while shifting in sign bits.
	/// Shifting by a value greater than the bit width of the type sets the result to zero if the
	/// sign bit is not set, and to `-1` if the sign bit is set.
	#[inline(always)]
	pub fn shr_const_i32x4<const AMOUNT: i32>(self, a: i32x4) -> i32x4 {
		// Arithmetic shift: vacated high bits are filled with copies of the sign bit.
		cast!(self.sse2._mm_srai_epi32::<AMOUNT>(cast!(a)))
	}

	/// Shift the bits of each lane of `a` to the right by `AMOUNT`, while shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	#[inline(always)]
	pub fn shr_const_u16x8<const AMOUNT: i32>(self, a: u16x8) -> u16x8 {
		// Logical shift: vacated high bits are zero.
		cast!(self.sse2._mm_srli_epi16::<AMOUNT>(cast!(a)))
	}

	/// Shift the bits of each lane of `a` to the right by `AMOUNT`, while shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	#[inline(always)]
	pub fn shr_const_u32x4<const AMOUNT: i32>(self, a: u32x4) -> u32x4 {
		cast!(self.sse2._mm_srli_epi32::<AMOUNT>(cast!(a)))
	}

	/// Shift the bits of each lane of `a` to the right by `AMOUNT`, while shifting in zeros.
	/// Shifting by a value greater than the bit width of the type sets the result to zero.
	#[inline(always)]
	pub fn shr_const_u64x2<const AMOUNT: i32>(self, a: u64x2) -> u64x2 {
		cast!(self.sse2._mm_srli_epi64::<AMOUNT>(cast!(a)))
	}
1772
	/// Shift the bits of each lane of `a` to the right by the first element in `amount`, while
	/// shifting in sign bits.
	/// Shifting by a value greater than the bit width of the type sets the result to zero if the
	/// sign bit is not set, and to `-1` if the sign bit is set.
	#[inline(always)]
	pub fn shr_i16x8(self, a: i16x8, amount: u64x2) -> i16x8 {
		cast!(self.sse2._mm_sra_epi16(cast!(a), cast!(amount)))
	}
1781
	/// Shift the bits of each lane of `a` to the right by the first element in `amount`, while
	/// shifting in sign bits.
	/// Shifting by a value greater than the bit width of the type sets the result to zero if the
	/// sign bit is not set, and to `-1` if the sign bit is set.
	#[inline(always)]
	pub fn shr_i32x4(self, a: i32x4, amount: u64x2) -> i32x4 {
		cast!(self.sse2._mm_sra_epi32(cast!(a), cast!(amount)))
	}
1790
1791	/// Shift the bits of each lane of `a` to the right by the first element in `amount`, while
1792	/// shifting in zeros.
1793	/// Shifting by a value greater than the bit width of the type sets the result to zero.
1794	#[inline(always)]
1795	pub fn shr_u16x8(self, a: u16x8, amount: u64x2) -> u16x8 {
1796		cast!(self.sse2._mm_srl_epi16(cast!(a), cast!(amount)))
1797	}
1798
1799	/// Shift the bits of each lane of `a` to the right by the first element in `amount`, while
1800	/// shifting in zeros.
1801	/// Shifting by a value greater than the bit width of the type sets the result to zero.
1802	#[inline(always)]
1803	pub fn shr_u32x4(self, a: u32x4, amount: u64x2) -> u32x4 {
1804		cast!(self.sse2._mm_srl_epi32(cast!(a), cast!(amount)))
1805	}
1806
1807	/// Shift the bits of each lane of `a` to the right by the first element in `amount`, while
1808	/// shifting in zeros.
1809	/// Shifting by a value greater than the bit width of the type sets the result to zero.
1810	#[inline(always)]
1811	pub fn shr_u64x2(self, a: u64x2, amount: u64x2) -> u64x2 {
1812		cast!(self.sse2._mm_srl_epi64(cast!(a), cast!(amount)))
1813	}
1814
1815	/// Returns a SIMD vector with all lanes set to the given value.
1816	#[inline(always)]
1817	pub fn splat_f32x4(self, value: f32) -> f32x4 {
1818		cast!(self.sse._mm_set1_ps(value))
1819	}
1820
1821	/// Returns a SIMD vector with all lanes set to the given value.
1822	#[inline(always)]
1823	pub fn splat_f64x2(self, value: f64) -> f64x2 {
1824		cast!(self.sse2._mm_set1_pd(value))
1825	}
1826
1827	/// Returns a SIMD vector with all lanes set to the given value.
1828	#[inline(always)]
1829	pub fn splat_i16x8(self, value: i16) -> i16x8 {
1830		cast!(self.sse2._mm_set1_epi16(value))
1831	}
1832
1833	/// Returns a SIMD vector with all lanes set to the given value.
1834	#[inline(always)]
1835	pub fn splat_i32x4(self, value: i32) -> i32x4 {
1836		cast!(self.sse2._mm_set1_epi32(value))
1837	}
1838
1839	/// Returns a SIMD vector with all lanes set to the given value.
1840	#[inline(always)]
1841	pub fn splat_i64x2(self, value: i64) -> i64x2 {
1842		cast!(self.sse2._mm_set1_epi64x(value))
1843	}
1844
1845	/// Returns a SIMD vector with all lanes set to the given value.
1846	#[inline(always)]
1847	pub fn splat_i8x16(self, value: i8) -> i8x16 {
1848		cast!(self.sse2._mm_set1_epi8(value))
1849	}
1850
1851	/// Returns a SIMD vector with all lanes set to the given value.
1852	#[inline(always)]
1853	pub fn splat_m16x8(self, value: m16) -> m16x8 {
1854		cast!(self.sse2._mm_set1_epi16(value.0 as i16))
1855	}
1856
1857	/// Returns a SIMD vector with all lanes set to the given value.
1858	#[inline(always)]
1859	pub fn splat_m32x4(self, value: m32) -> m32x4 {
1860		cast!(self.sse2._mm_set1_epi32(value.0 as i32))
1861	}
1862
1863	/// Returns a SIMD vector with all lanes set to the given value.
1864	#[inline(always)]
1865	pub fn splat_m64x2(self, value: m64) -> m64x2 {
1866		cast!(self.sse2._mm_set1_epi64x(value.0 as i64))
1867	}
1868
1869	/// Returns a SIMD vector with all lanes set to the given value.
1870	#[inline(always)]
1871	pub fn splat_m8x16(self, value: m8) -> m8x16 {
1872		cast!(self.sse2._mm_set1_epi8(value.0 as i8))
1873	}
1874
1875	/// Returns a SIMD vector with all lanes set to the given value.
1876	#[inline(always)]
1877	pub fn splat_u16x8(self, value: u16) -> u16x8 {
1878		cast!(self.sse2._mm_set1_epi16(value as i16))
1879	}
1880
1881	/// Returns a SIMD vector with all lanes set to the given value.
1882	#[inline(always)]
1883	pub fn splat_u32x4(self, value: u32) -> u32x4 {
1884		cast!(self.sse2._mm_set1_epi32(value as i32))
1885	}
1886
1887	/// Returns a SIMD vector with all lanes set to the given value.
1888	#[inline(always)]
1889	pub fn splat_u64x2(self, value: u64) -> u64x2 {
1890		cast!(self.sse2._mm_set1_epi64x(value as i64))
1891	}
1892
1893	/// Returns a SIMD vector with all lanes set to the given value.
1894	#[inline(always)]
1895	pub fn splat_u8x16(self, value: u8) -> u8x16 {
1896		cast!(self.sse2._mm_set1_epi8(value as i8))
1897	}
1898
1899	/// Computes the square roots of the elements of each lane of `a`.
1900	#[inline(always)]
1901	pub fn sqrt_f32x4(self, a: f32x4) -> f32x4 {
1902		cast!(self.sse._mm_sqrt_ps(cast!(a)))
1903	}
1904
1905	/// Computes the square roots of the elements of each lane of `a`.
1906	#[inline(always)]
1907	pub fn sqrt_f64x2(self, a: f64x2) -> f64x2 {
1908		cast!(self.sse2._mm_sqrt_pd(cast!(a)))
1909	}
1910
1911	/// Calculates `a - b` for each lane in `a` and `b`.
1912	#[inline(always)]
1913	pub fn sub_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
1914		cast!(self.sse._mm_sub_ps(cast!(a), cast!(b)))
1915	}
1916
1917	/// Calculates `a - b` for each lane in `a` and `b`.
1918	#[inline(always)]
1919	pub fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 {
1920		cast!(self.sse2._mm_sub_pd(cast!(a), cast!(b)))
1921	}
1922
1923	/// Alternatively subtracts and adds the elements of each lane of `a` and `b`.
1924	#[inline(always)]
1925	pub fn subadd_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
1926		cast!(self.sse3._mm_addsub_ps(cast!(a), cast!(b)))
1927	}
1928
1929	/// Alternatively subtracts and adds the elements of each lane of `a` and `b`.
1930	#[inline(always)]
1931	pub fn subadd_f64x2(self, a: f64x2, b: f64x2) -> f64x2 {
1932		cast!(self.sse3._mm_addsub_pd(cast!(a), cast!(b)))
1933	}
1934
1935	/// See [_mm_sad_epu8].
1936	///
1937	/// [_mm_sad_epu8]: core::arch::x86_64::_mm_sad_epu8
1938	#[inline(always)]
1939	pub fn sum_of_absolute_differences_u8x16(self, a: u8x16, b: u8x16) -> u64x2 {
1940		cast!(self.sse2._mm_sad_epu8(cast!(a), cast!(b)))
1941	}
1942
1943	/// Rounds the elements of each lane of `a` to the nearest integer towards zero.
1944	#[inline(always)]
1945	pub fn truncate_f32x4(self, a: f32x4) -> f32x4 {
1946		const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
1947		cast!(self.sse4_1._mm_round_ps::<ROUNDING>(cast!(a)))
1948	}
1949
1950	/// Rounds the elements of each lane of `a` to the nearest integer towards zero.
1951	#[inline(always)]
1952	pub fn truncate_f64x2(self, a: f64x2) -> f64x2 {
1953		const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
1954		cast!(self.sse4_1._mm_round_pd::<ROUNDING>(cast!(a)))
1955	}
1956
1957	/// Computes the unsigned absolute value of the elements of each lane of `a`.
1958	#[inline(always)]
1959	pub fn unsigned_abs_i16x8(self, a: i16x8) -> u16x8 {
1960		cast!(self.ssse3._mm_abs_epi16(cast!(a)))
1961	}
1962
1963	/// Computes the unsigned absolute value of the elements of each lane of `a`.
1964	#[inline(always)]
1965	pub fn unsigned_abs_i32x4(self, a: i32x4) -> u32x4 {
1966		cast!(self.ssse3._mm_abs_epi32(cast!(a)))
1967	}
1968
1969	/// Computes the unsigned absolute value of the elements of each lane of `a`.
1970	#[inline(always)]
1971	pub fn unsigned_abs_i8x16(self, a: i8x16) -> u8x16 {
1972		cast!(self.ssse3._mm_abs_epi8(cast!(a)))
1973	}
1974
1975	/// Multiplies the elements of each lane of `a` and `b`, and returns separately the low and
1976	/// high bits of the result.
1977	#[inline(always)]
1978	pub fn widening_mul_i16x8(self, a: i16x8, b: i16x8) -> (i16x8, i16x8) {
1979		(
1980			cast!(self.sse2._mm_mullo_epi16(cast!(a), cast!(b))),
1981			cast!(self.sse2._mm_mulhi_epi16(cast!(a), cast!(b))),
1982		)
1983	}
1984
	/// Multiplies the elements of each lane of `a` and `b`, and returns separately the low and
	/// high bits of the result.
	#[inline(always)]
	pub fn widening_mul_i32x4(self, a: i32x4, b: i32x4) -> (i32x4, i32x4) {
		let a = cast!(a);
		let b = cast!(b);
		let sse = self.sse2;

		// `_mm_mul_epi32` multiplies only the even (0 and 2) 32-bit lanes into 64-bit products.
		// a0b0_lo a0b0_hi a2b2_lo a2b2_hi
		let ab_evens = self.sse4_1._mm_mul_epi32(a, b);
		// Shifting each 64-bit half right by 32 moves the odd lanes into the even positions,
		// so the same intrinsic yields the odd-lane products. The logical shift is fine here
		// because `_mm_mul_epi32` sign-extends from the low 32 bits of each half.
		// a1b1_lo a1b1_hi a3b3_lo a3b3_hi
		let ab_odds = self
			.sse4_1
			._mm_mul_epi32(sse._mm_srli_epi64::<32>(a), sse._mm_srli_epi64::<32>(b));

		// Blend mask 0b1010 keeps 32-bit lanes 0 and 2 from the first operand and takes
		// lanes 1 and 3 from the second. The float blend is used purely bitwise; integer
		// bit patterns pass through unchanged.
		let ab_lo = self.sse4_1._mm_blend_ps::<0b1010>(
			// a0b0_lo xxxxxxx a2b2_lo xxxxxxx
			cast!(ab_evens),
			// xxxxxxx a1b1_lo xxxxxxx a3b3_lo
			cast!(sse._mm_slli_epi64::<32>(ab_odds)),
		);
		let ab_hi = self.sse4_1._mm_blend_ps::<0b1010>(
			// a0b0_hi xxxxxxx a2b2_hi xxxxxxx
			cast!(sse._mm_srli_epi64::<32>(ab_evens)),
			// xxxxxxx a1b1_hi xxxxxxx a3b3_hi
			cast!(ab_odds),
		);

		(cast!(ab_lo), cast!(ab_hi))
	}
2015
2016	/// Multiplies the elements of each lane of `a` and `b`, and returns separately the low and
2017	/// high bits of the result.
2018	#[inline(always)]
2019	pub fn widening_mul_u16x8(self, a: u16x8, b: u16x8) -> (u16x8, u16x8) {
2020		(
2021			cast!(self.sse2._mm_mullo_epi16(cast!(a), cast!(b))),
2022			cast!(self.sse2._mm_mulhi_epu16(cast!(a), cast!(b))),
2023		)
2024	}
2025
	/// Multiplies the elements of each lane of `a` and `b`, and returns separately the low and
	/// high bits of the result.
	#[inline(always)]
	pub fn widening_mul_u32x4(self, a: u32x4, b: u32x4) -> (u32x4, u32x4) {
		let a = cast!(a);
		let b = cast!(b);
		let sse = self.sse2;

		// `_mm_mul_epu32` multiplies only the even (0 and 2) 32-bit lanes into 64-bit products.
		// a0b0_lo a0b0_hi a2b2_lo a2b2_hi
		let ab_evens = sse._mm_mul_epu32(a, b);
		// Shifting each 64-bit half right by 32 moves the odd lanes into the even positions,
		// so the same intrinsic yields the odd-lane products.
		// a1b1_lo a1b1_hi a3b3_lo a3b3_hi
		let ab_odds = sse._mm_mul_epu32(sse._mm_srli_epi64::<32>(a), sse._mm_srli_epi64::<32>(b));

		// Blend mask 0b1010 keeps 32-bit lanes 0 and 2 from the first operand and takes
		// lanes 1 and 3 from the second. The float blend is used purely bitwise; integer
		// bit patterns pass through unchanged.
		let ab_lo = self.sse4_1._mm_blend_ps::<0b1010>(
			// a0b0_lo xxxxxxx a2b2_lo xxxxxxx
			cast!(ab_evens),
			// xxxxxxx a1b1_lo xxxxxxx a3b3_lo
			cast!(sse._mm_slli_epi64::<32>(ab_odds)),
		);
		let ab_hi = self.sse4_1._mm_blend_ps::<0b1010>(
			// a0b0_hi xxxxxxx a2b2_hi xxxxxxx
			cast!(sse._mm_srli_epi64::<32>(ab_evens)),
			// xxxxxxx a1b1_hi xxxxxxx a3b3_hi
			cast!(ab_odds),
		);

		(cast!(ab_lo), cast!(ab_hi))
	}
2054
2055	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
2056	#[inline(always)]
2057	pub fn wrapping_add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
2058		cast!(self.sse2._mm_add_epi16(cast!(a), cast!(b)))
2059	}
2060
2061	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
2062	#[inline(always)]
2063	pub fn wrapping_add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 {
2064		cast!(self.sse2._mm_add_epi32(cast!(a), cast!(b)))
2065	}
2066
2067	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
2068	#[inline(always)]
2069	pub fn wrapping_add_i64x2(self, a: i64x2, b: i64x2) -> i64x2 {
2070		cast!(self.sse2._mm_add_epi64(cast!(a), cast!(b)))
2071	}
2072
2073	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
2074	#[inline(always)]
2075	pub fn wrapping_add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 {
2076		cast!(self.sse2._mm_add_epi8(cast!(a), cast!(b)))
2077	}
2078
2079	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
2080	#[inline(always)]
2081	pub fn wrapping_add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
2082		cast!(self.sse2._mm_add_epi16(cast!(a), cast!(b)))
2083	}
2084
2085	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
2086	#[inline(always)]
2087	pub fn wrapping_add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 {
2088		cast!(self.sse2._mm_add_epi32(cast!(a), cast!(b)))
2089	}
2090
2091	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
2092	#[inline(always)]
2093	pub fn wrapping_add_u64x2(self, a: u64x2, b: u64x2) -> u64x2 {
2094		cast!(self.sse2._mm_add_epi64(cast!(a), cast!(b)))
2095	}
2096
2097	/// Adds the elements of each lane of `a` and `b`, with wrapping on overflow.
2098	#[inline(always)]
2099	pub fn wrapping_add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
2100		cast!(self.sse2._mm_add_epi8(cast!(a), cast!(b)))
2101	}
2102
2103	/// Multiplies the elements of each lane of `a` and `b`, with wrapping on overflow.
2104	#[inline(always)]
2105	pub fn wrapping_mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
2106		cast!(self.sse2._mm_mullo_epi16(cast!(a), cast!(b)))
2107	}
2108
2109	/// Multiplies the elements of each lane of `a` and `b`, with wrapping on overflow.
2110	#[inline(always)]
2111	pub fn wrapping_mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 {
2112		cast!(self.sse4_1._mm_mullo_epi32(cast!(a), cast!(b)))
2113	}
2114
2115	/// Multiplies the elements of each lane of `a` and `b`, with wrapping on overflow.
2116	#[inline(always)]
2117	pub fn wrapping_mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
2118		cast!(self.sse2._mm_mullo_epi16(cast!(a), cast!(b)))
2119	}
2120
2121	/// Multiplies the elements of each lane of `a` and `b`, with wrapping on overflow.
2122	#[inline(always)]
2123	pub fn wrapping_mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 {
2124		cast!(self.sse4_1._mm_mullo_epi32(cast!(a), cast!(b)))
2125	}
2126
2127	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
2128	#[inline(always)]
2129	pub fn wrapping_sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
2130		cast!(self.sse2._mm_sub_epi16(cast!(a), cast!(b)))
2131	}
2132
2133	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
2134	#[inline(always)]
2135	pub fn wrapping_sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 {
2136		cast!(self.sse2._mm_sub_epi32(cast!(a), cast!(b)))
2137	}
2138
2139	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
2140	#[inline(always)]
2141	pub fn wrapping_sub_i64x2(self, a: i64x2, b: i64x2) -> i64x2 {
2142		cast!(self.sse2._mm_sub_epi64(cast!(a), cast!(b)))
2143	}
2144
2145	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
2146	#[inline(always)]
2147	pub fn wrapping_sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 {
2148		cast!(self.sse2._mm_sub_epi8(cast!(a), cast!(b)))
2149	}
2150
2151	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
2152	#[inline(always)]
2153	pub fn wrapping_sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
2154		cast!(self.sse2._mm_sub_epi16(cast!(a), cast!(b)))
2155	}
2156
2157	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
2158	#[inline(always)]
2159	pub fn wrapping_sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 {
2160		cast!(self.sse2._mm_sub_epi32(cast!(a), cast!(b)))
2161	}
2162
2163	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
2164	#[inline(always)]
2165	pub fn wrapping_sub_u64x2(self, a: u64x2, b: u64x2) -> u64x2 {
2166		cast!(self.sse2._mm_sub_epi64(cast!(a), cast!(b)))
2167	}
2168
2169	/// Subtracts the elements of each lane of `a` and `b`, with wrapping on overflow.
2170	#[inline(always)]
2171	pub fn wrapping_sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
2172		cast!(self.sse2._mm_sub_epi8(cast!(a), cast!(b)))
2173	}
2174
2175	/// Returns `a ^ b` for each bit in `a` and `b`.
2176	#[inline(always)]
2177	pub fn xor_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
2178		cast!(self.sse._mm_xor_ps(cast!(a), cast!(b)))
2179	}
2180
2181	/// Returns `a ^ b` for each bit in `a` and `b`.
2182	#[inline(always)]
2183	pub fn xor_f64x2(self, a: f64x2, b: f64x2) -> f64x2 {
2184		cast!(self.sse2._mm_xor_pd(cast!(a), cast!(b)))
2185	}
2186
2187	/// Returns `a ^ b` for each bit in `a` and `b`.
2188	#[inline(always)]
2189	pub fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
2190		cast!(self.sse2._mm_xor_si128(cast!(a), cast!(b)))
2191	}
2192
2193	/// Returns `a ^ b` for each bit in `a` and `b`.
2194	#[inline(always)]
2195	pub fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 {
2196		cast!(self.sse2._mm_xor_si128(cast!(a), cast!(b)))
2197	}
2198
2199	/// Returns `a ^ b` for each bit in `a` and `b`.
2200	#[inline(always)]
2201	pub fn xor_i64x2(self, a: i64x2, b: i64x2) -> i64x2 {
2202		cast!(self.sse2._mm_xor_si128(cast!(a), cast!(b)))
2203	}
2204
2205	/// Returns `a ^ b` for each bit in `a` and `b`.
2206	#[inline(always)]
2207	pub fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16 {
2208		cast!(self.sse2._mm_xor_si128(cast!(a), cast!(b)))
2209	}
2210
2211	/// Returns `a ^ b` for each bit in `a` and `b`.
2212	#[inline(always)]
2213	pub fn xor_m16x8(self, a: m16x8, b: m16x8) -> m16x8 {
2214		cast!(self.sse2._mm_xor_si128(cast!(a), cast!(b)))
2215	}
2216
2217	/// Returns `a ^ b` for each bit in `a` and `b`.
2218	#[inline(always)]
2219	pub fn xor_m32x4(self, a: m32x4, b: m32x4) -> m32x4 {
2220		cast!(self.sse2._mm_xor_si128(cast!(a), cast!(b)))
2221	}
2222
2223	/// Returns `a ^ b` for each bit in `a` and `b`.
2224	#[inline(always)]
2225	pub fn xor_m64x2(self, a: m64x2, b: m64x2) -> m64x2 {
2226		cast!(self.sse2._mm_xor_si128(cast!(a), cast!(b)))
2227	}
2228
2229	/// Returns `a ^ b` for each bit in `a` and `b`.
2230	#[inline(always)]
2231	pub fn xor_m8x16(self, a: m8x16, b: m8x16) -> m8x16 {
2232		cast!(self.sse2._mm_xor_si128(cast!(a), cast!(b)))
2233	}
2234
2235	/// Returns `a ^ b` for each bit in `a` and `b`.
2236	#[inline(always)]
2237	pub fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
2238		cast!(self.sse2._mm_xor_si128(cast!(a), cast!(b)))
2239	}
2240
2241	/// Returns `a ^ b` for each bit in `a` and `b`.
2242	#[inline(always)]
2243	pub fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 {
2244		cast!(self.sse2._mm_xor_si128(cast!(a), cast!(b)))
2245	}
2246
2247	/// Returns `a ^ b` for each bit in `a` and `b`.
2248	#[inline(always)]
2249	pub fn xor_u64x2(self, a: u64x2, b: u64x2) -> u64x2 {
2250		cast!(self.sse2._mm_xor_si128(cast!(a), cast!(b)))
2251	}
2252
2253	/// Returns `a ^ b` for each bit in `a` and `b`.
2254	#[inline(always)]
2255	pub fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
2256		cast!(self.sse2._mm_xor_si128(cast!(a), cast!(b)))
2257	}
2258}