pulp/
lib.rs

1//! `pulp` is a safe abstraction over SIMD instructions, that allows you to write a function once
2//! and dispatch to equivalent vectorized versions based on the features detected at runtime.
3//!
4//! # Autovectorization example
5//!
6//! ```
7//! use pulp::Arch;
8//!
9//! let mut v = (0..1000).map(|i| i as f64).collect::<Vec<_>>();
10//! let arch = Arch::new();
11//!
12//! arch.dispatch(|| {
13//! 	for x in &mut v {
14//! 		*x *= 2.0;
15//! 	}
16//! });
17//!
18//! for (i, x) in v.into_iter().enumerate() {
19//! 	assert_eq!(x, 2.0 * i as f64);
20//! }
21//! ```
22//!
23//! # Manual vectorization example
24//!
25//! ```
26//! use pulp::{Arch, Simd, WithSimd};
27//!
28//! struct TimesThree<'a>(&'a mut [f64]);
29//! impl<'a> WithSimd for TimesThree<'a> {
30//! 	type Output = ();
31//!
32//! 	#[inline(always)]
33//! 	fn with_simd<S: Simd>(self, simd: S) -> Self::Output {
34//! 		let v = self.0;
35//! 		let (head, tail) = S::as_mut_simd_f64s(v);
36//!
37//! 		let three = simd.splat_f64s(3.0);
38//! 		for x in head {
39//! 			*x = simd.mul_f64s(three, *x);
40//! 		}
41//!
42//! 		for x in tail {
43//! 			*x = *x * 3.0;
44//! 		}
45//! 	}
46//! }
47//!
48//! let mut v = (0..1000).map(|i| i as f64).collect::<Vec<_>>();
49//! let arch = Arch::new();
50//!
51//! arch.dispatch(TimesThree(&mut v));
52//!
53//! for (i, x) in v.into_iter().enumerate() {
54//! 	assert_eq!(x, 3.0 * i as f64);
55//! }
56//! ```
57
58// FIXME: replace x86 non-ieee min/max functions to propagate nans instead
59
60#![allow(
61	non_camel_case_types,
62	unknown_lints,
63	clippy::zero_prefixed_literal,
64	clippy::identity_op,
65	clippy::too_many_arguments,
66	clippy::type_complexity,
67	clippy::missing_transmute_annotations,
68	clippy::tabs_in_doc_comments,
69	clippy::modulo_one
70)]
71#![cfg_attr(
72	all(feature = "nightly", any(target_arch = "x86", target_arch = "x86_64")),
73	feature(stdarch_x86_avx512),
74	feature(avx512_target_feature)
75)]
76#![cfg_attr(
77	all(feature = "nightly", any(target_arch = "aarch64")),
78	feature(stdarch_neon_i8mm),
79	feature(stdarch_neon_sm4),
80	feature(stdarch_neon_ftts),
81	feature(stdarch_neon_fcma),
82	feature(stdarch_neon_dotprod)
83)]
84#![cfg_attr(not(feature = "std"), no_std)]
85#![cfg_attr(docsrs, feature(doc_cfg))]
86
/// Evaluates `$e` inside an inline `const` block, forcing compile-time
/// evaluation. Only compiled when the `libpulp_const` cfg is set
/// (presumably emitted by a build-script toolchain probe — confirm).
#[cfg(libpulp_const)]
#[macro_export]
macro_rules! try_const {
	($e: expr) => {
		const { $e }
	};
}
94
/// Fallback for toolchains without inline `const` blocks: evaluates `$e`
/// as a plain block expression (the optimizer will usually still fold
/// constant expressions).
#[cfg(not(libpulp_const))]
#[macro_export]
macro_rules! try_const {
	($e: expr) => {{ $e }};
}
100
// `match`-like dispatch over `cfg` conditions.
//
// Two entry forms:
// - `match_cfg!(item, match cfg!() { const { CFG } => { items… }, _ => { … }, })`
//   emits the items of the first branch whose cfg is active;
// - `match_cfg!(match cfg!() { const { CFG } => expr, _ => expr, })`
//   evaluates to the expression of the first active branch.
//
// Each emitted branch is guarded by its own cfg *and* the negation of every
// preceding branch's cfg, so at most one branch survives compilation.
macro_rules! match_cfg {
    (item, match cfg!() {
        $(
            const { $i_meta:meta } => { $( $i_tokens:tt )* },
        )*
        $(_ => { $( $e_tokens:tt )* },)?
    }) => {
        $crate::match_cfg! {
            @__items () ;
            $(
                (( $i_meta ) ( $( $i_tokens )* )) ,
            )*
            $((() ( $( $e_tokens )* )),)?
        }
    };

    (match cfg!() {
        $(
            const { $i_meta:meta } => $i_expr: expr,
        )*
        $(_ => $e_expr: expr,)?
    }) => {
        $crate::match_cfg! {
            @ __result @ __exprs ();
            $(
                (( $i_meta ) ( $i_expr  )) ,
            )*
            $((() ( $e_expr  )),)?
        }
    };

    // Internal and recursive macro to emit all the items
    //
    // Collects all the previous cfgs in a list at the beginning, so they can be
    // negated. After the semicolon are all the remaining items.
    (@__items ( $( $_:meta , )* ) ; ) => {};
    (
        @__items ( $( $no:meta , )* ) ;
        (( $( $yes:meta )? ) ( $( $tokens:tt )* )) ,
        $( $rest:tt , )*
    ) => {
        // Emit all items within one block, applying an appropriate [cfg]. The
        // [cfg] will require all `$yes` matchers specified and must also negate
        // all previous matchers.
        #[cfg(all(
            $( $yes , )?
            not(any( $( $no ),* ))
        ))]
        $crate::match_cfg! { @__identity $( $tokens )* }

        // Recurse to emit all other items in `$rest`, and when we do so add all
        // our `$yes` matchers to the list of `$no` matchers as future emissions
        // will have to negate everything we just matched as well.
        $crate::match_cfg! {
            @__items ( $( $no , )* $( $yes , )? ) ;
            $( $rest , )*
        }
    };

    // Internal and recursive macro to emit all the exprs
    //
    // Collects all the previous cfgs in a list at the beginning, so they can be
    // negated. After the semicolon are all the remaining exprs.
    // Base case: all arms consumed; the surviving cfg-gated `let` bound `$ret`.
    (@ $ret: ident @ __exprs ( $( $_:meta , )* ) ; ) => {
    	$ret
    };

    (
        @ $ret: ident @__exprs ( $( $no:meta , )* ) ;
        (( $( $yes:meta )? ) ( $( $tokens:tt )* )) ,
        $( $rest:tt , )*
    ) => {{
        // Emit all exprs within one block, applying an appropriate [cfg]. The
        // [cfg] will require all `$yes` matchers specified and must also negate
        // all previous matchers.
        #[cfg(all(
            $( $yes , )?
            not(any( $( $no ),* ))
        ))]
        let $ret = $crate::match_cfg! { @__identity $( $tokens )* };

        // Recurse to emit all other exprs in `$rest`, and when we do so add all
        // our `$yes` matchers to the list of `$no` matchers as future emissions
        // will have to negate everything we just matched as well.
        $crate::match_cfg! {
            @ $ret @ __exprs ( $( $no , )* $( $yes , )? ) ;
            $( $rest , )*
        }
    }};

    // Internal macro to make __apply work out right for different match types,
    // because of how macros match/expand stuff.
    (@__identity $( $tokens:tt )* ) => {
        $( $tokens )*
    };
}
197
198use match_cfg;
199
/// Safe transmute macro.
///
/// This function asserts at compile time that the two types have the same size.
#[macro_export]
macro_rules! cast {
	($val: expr $(,)?) => {{
		let __val = $val;
		// The `false` branch is never taken at runtime, but it still
		// type-checks the call to `$crate::cast` (presumably the size-checked
		// cast function — confirm), so the size constraint is enforced at
		// compile time. The live branch is then a plain transmute.
		if $crate::try_const! { false } {
			// checks type constraints
			$crate::cast(__val)
		} else {
			#[allow(unused_unsafe, clippy::missing_transmute_annotations)]
			unsafe {
				::core::mem::transmute(__val)
			}
		}
	}};
}
218
219use bytemuck::{AnyBitPattern, NoUninit, Pod, Zeroable};
220use core::fmt::Debug;
221use core::marker::PhantomData;
222use core::mem::MaybeUninit;
223use core::slice::{from_raw_parts, from_raw_parts_mut};
224use num_complex::Complex;
225use seal::Seal;
226
227/// Requires the first non-lifetime generic parameter, as well as the function's
228/// first input parameter to be the SIMD type.
229/// Also currently requires that all the lifetimes be explicitly specified.
230#[cfg(feature = "macro")]
231#[cfg_attr(docsrs, doc(cfg(feature = "macro")))]
232pub use pulp_macro::with_simd;
233
234pub use {bytemuck, num_complex};
235
236pub type c32 = Complex<f32>;
237pub type c64 = Complex<f64>;
238
// A lane mask paired with optional memory-access metadata, with a layout that
// depends on the target architecture.
match_cfg!(item, match cfg!() {
	const { any(target_arch = "x86_64", target_arch = "x86") } => {
		// On x86 the mask additionally carries two optional type-erased
		// `unsafe extern "C"` function pointers, presumably used by the
		// x86 masked load/store implementations — confirm at the use sites.
		// `None` selects the default masked memory path.
		#[derive(Debug, Copy, Clone)]
		pub struct MemMask<T> {
			mask: T,
			load: Option<unsafe extern "C" fn()>,
			store: Option<unsafe extern "C" fn()>,
		}

		impl<T> MemMask<T> {
			#[inline]
			pub fn new(mask: T) -> Self {
				Self {
					mask,
					load: None,
					store: None,
				}
			}
		}

		impl<T> From<T> for MemMask<T> {
			#[inline]
			fn from(value: T) -> Self {
				Self {
					mask: value,
					load: None,
					store: None,
				}
			}
		}
	},

	_ => {
		// On every other architecture the mask is stored alone.
		#[derive(Debug, Copy, Clone)]
		pub struct MemMask<T> {
			mask: T,
		}

		impl<T> MemMask<T> {
			#[inline]
			pub fn new(mask: T) -> Self {
				Self { mask }
			}
		}

		impl<T> From<T> for MemMask<T> {
			#[inline]
			fn from(value: T) -> Self {
				Self { mask: value }
			}
		}
	},
});
292
293impl<T: Copy> MemMask<T> {
294	#[inline]
295	pub fn mask(self) -> T {
296		self.mask
297	}
298}
299
// Sealed-trait pattern: `Simd: Seal` below means the `Simd` trait can only be
// implemented inside this crate, since `Seal` is not publicly reachable.
mod seal {
	pub trait Seal {}
}
303
/// A callable taking no arguments, consumed when invoked.
///
/// Blanket-implemented for every `FnOnce() -> R`, so plain closures can be
/// dispatched without further ceremony.
pub trait NullaryFnOnce {
	type Output;

	fn call(self) -> Self::Output;
}

impl<R, F> NullaryFnOnce for F
where
	F: FnOnce() -> R,
{
	type Output = R;

	#[inline(always)]
	fn call(self) -> Self::Output {
		(self)()
	}
}
318
/// Entry point for SIMD-generic code: the dispatcher hands the selected
/// [`Simd`] backend to [`WithSimd::with_simd`].
pub trait WithSimd {
	/// Value produced by [`WithSimd::with_simd`].
	type Output;

	fn with_simd<S: Simd>(self, simd: S) -> Self::Output;
}
324
325impl<F: NullaryFnOnce> WithSimd for F {
326	type Output = F::Output;
327
328	#[inline(always)]
329	fn with_simd<S: Simd>(self, simd: S) -> Self::Output {
330		let _simd = &simd;
331		self.call()
332	}
333}
334
/// Fused multiply-add `a * b + c` for `f32`, computed with a single rounding.
///
/// Uses `f32::mul_add` when the `std` feature is enabled, and `libm`'s
/// implementation in `no_std` builds.
#[inline(always)]
fn fma_f32(a: f32, b: f32, c: f32) -> f32 {
	match_cfg!(match cfg!() {
		const { feature = "std" } => f32::mul_add(a, b, c),
		_ => libm::fmaf(a, b, c),
	})
}
342
/// Fused multiply-add `a * b + c` for `f64`, computed with a single rounding.
///
/// Uses `f64::mul_add` when the `std` feature is enabled, and `libm`'s
/// implementation in `no_std` builds.
#[inline(always)]
fn fma_f64(a: f64, b: f64, c: f64) -> f64 {
	match_cfg!(match cfg!() {
		const { feature = "std" } => f64::mul_add(a, b, c),
		_ => libm::fma(a, b, c),
	})
}
350
// a0,0 ... a0,m-1
// ...
// an-1,0 ... an-1,m-1
/// Generic (element-by-element) interleave: treats `x` as `n` registers of
/// `m` `Unit` lanes each (an `n × m` matrix, see diagram above) and
/// transposes it, so element `(i, j)` moves to flat position `(j, i)`.
///
/// # Safety
/// `AosReg` must be sound to duplicate bitwise; callers pass [`Interleave`]
/// values, whose contract requires them to be `Pod`.
#[inline(always)]
unsafe fn interleave_fallback<Unit: Pod, Reg: Pod, AosReg>(x: AosReg) -> AosReg {
	// `AosReg` must be a whole number of `Reg`s, each a whole number of `Unit`s.
	assert!(core::mem::size_of::<AosReg>() % core::mem::size_of::<Reg>() == 0);
	assert!(core::mem::size_of::<Reg>() % core::mem::size_of::<Unit>() == 0);
	assert!(!core::mem::needs_drop::<AosReg>());

	if try_const! { core::mem::size_of::<AosReg>() == core::mem::size_of::<Reg>() } {
		// Single register (n == 1): the transpose is the identity.
		x
	} else {
		// Bitwise duplicate of `x`; sound because it was asserted above not
		// to need drop.
		let mut y = core::ptr::read(&x);

		// n = register count, m = lanes per register.
		let n = try_const! { core::mem::size_of::<AosReg>() / core::mem::size_of::<Reg>() };
		let m = try_const! { core::mem::size_of::<Reg>() / core::mem::size_of::<Unit>() };

		unsafe {
			let y = (&mut y) as *mut _ as *mut Unit;
			let x = (&x) as *const _ as *const Unit;
			// Transpose: input element (i, j) lands at output index (j, i).
			for j in 0..m {
				for i in 0..n {
					*y.add(i + n * j) = *x.add(j + i * m);
				}
			}
		}

		y
	}
}
381
/// Generic (element-by-element) deinterleave: exact inverse of
/// `interleave_fallback`, scattering flat position `(j, i)` back to element
/// `(i, j)` of the `n × m` register matrix.
///
/// # Safety
/// `SoaReg` must be sound to duplicate bitwise; callers pass [`Interleave`]
/// values, whose contract requires them to be `Pod`.
#[inline(always)]
unsafe fn deinterleave_fallback<Unit: Pod, Reg: Pod, SoaReg>(y: SoaReg) -> SoaReg {
	// `SoaReg` must be a whole number of `Reg`s, each a whole number of `Unit`s.
	assert!(core::mem::size_of::<SoaReg>() % core::mem::size_of::<Reg>() == 0);
	assert!(core::mem::size_of::<Reg>() % core::mem::size_of::<Unit>() == 0);
	assert!(!core::mem::needs_drop::<SoaReg>());

	if try_const! { core::mem::size_of::<SoaReg>() == core::mem::size_of::<Reg>() } {
		// Single register (n == 1): the transpose is the identity.
		y
	} else {
		// Bitwise duplicate of `y`; sound because it was asserted above not
		// to need drop.
		let mut x = core::ptr::read(&y);

		// n = register count, m = lanes per register.
		let n = try_const! { core::mem::size_of::<SoaReg>() / core::mem::size_of::<Reg>() };
		let m = try_const! { core::mem::size_of::<Reg>() / core::mem::size_of::<Unit>() };

		unsafe {
			let y = (&y) as *const _ as *const Unit;
			let x = (&mut x) as *mut _ as *mut Unit;
			// Inverse transpose of `interleave_fallback`.
			for j in 0..m {
				for i in 0..n {
					*x.add(j + i * m) = *y.add(i + n * j);
				}
			}
		}

		x
	}
}
409
/// Types that allow \[de\]interleaving.
///
/// # Safety
/// Instances of this type passed to simd \[de\]interleave functions must be `Pod`.
pub unsafe trait Interleave {}
// Blanket impl: every `Pod` type trivially satisfies the contract.
unsafe impl<T: Pod> Interleave for T {}
416
pub trait Simd: Seal + Debug + Copy + Send + Sync + 'static {
	// NOTE(review): presumably overridden to `true` by the scalar fallback
	// backend whose "registers" hold a single lane — confirm there.
	const IS_SCALAR: bool = false;

	// Lane counts, derived from the register sizes below.
	/// Number of `u64` lanes in one [`Self::u64s`] register.
	const U64_LANES: usize = core::mem::size_of::<Self::u64s>() / core::mem::size_of::<u64>();
	/// Number of `i64` lanes in one [`Self::i64s`] register.
	const I64_LANES: usize = core::mem::size_of::<Self::i64s>() / core::mem::size_of::<i64>();
	/// Number of `f64` lanes in one [`Self::f64s`] register.
	const F64_LANES: usize = core::mem::size_of::<Self::f64s>() / core::mem::size_of::<f64>();
	/// Number of `c64` lanes in one [`Self::c64s`] register.
	const C64_LANES: usize = core::mem::size_of::<Self::c64s>() / core::mem::size_of::<c64>();

	/// Number of `u32` lanes in one [`Self::u32s`] register.
	const U32_LANES: usize = core::mem::size_of::<Self::u32s>() / core::mem::size_of::<u32>();
	/// Number of `i32` lanes in one [`Self::i32s`] register.
	const I32_LANES: usize = core::mem::size_of::<Self::i32s>() / core::mem::size_of::<i32>();
	/// Number of `f32` lanes in one [`Self::f32s`] register.
	const F32_LANES: usize = core::mem::size_of::<Self::f32s>() / core::mem::size_of::<f32>();
	/// Number of `c32` lanes in one [`Self::c32s`] register.
	const C32_LANES: usize = core::mem::size_of::<Self::c32s>() / core::mem::size_of::<c32>();

	// NOTE(review): no default — each backend supplies a value; presumably
	// the number of architectural SIMD registers, confirm at the impls.
	const REGISTER_COUNT: usize;

	/// Mask type produced by 32-bit lane comparisons (see e.g. `equal_f32s`).
	type m32s: Debug + Copy + Send + Sync + Zeroable + NoUninit + 'static;
	/// Register of `f32` lanes.
	type f32s: Debug + Copy + Send + Sync + Pod + 'static;
	/// Register of `c32` (complex `f32`) lanes.
	type c32s: Debug + Copy + Send + Sync + Pod + 'static;
	/// Register of `i32` lanes.
	type i32s: Debug + Copy + Send + Sync + Pod + 'static;
	/// Register of `u32` lanes.
	type u32s: Debug + Copy + Send + Sync + Pod + 'static;

	/// Mask type produced by 64-bit lane comparisons (see e.g. `equal_f64s`).
	type m64s: Debug + Copy + Send + Sync + Zeroable + NoUninit + 'static;
	/// Register of `f64` lanes.
	type f64s: Debug + Copy + Send + Sync + Pod + 'static;
	/// Register of `c64` (complex `f64`) lanes.
	type c64s: Debug + Copy + Send + Sync + Pod + 'static;
	/// Register of `i64` lanes.
	type i64s: Debug + Copy + Send + Sync + Pod + 'static;
	/// Register of `u64` lanes.
	type u64s: Debug + Copy + Send + Sync + Pod + 'static;
443
	/// Contains the square of the norm in both the real and imaginary components.
	fn abs2_c32s(self, a: Self::c32s) -> Self::c32s;

	/// Contains the square of the norm in both the real and imaginary components.
	fn abs2_c64s(self, a: Self::c64s) -> Self::c64s;
	#[inline]
	fn abs_f32s(self, a: Self::f32s) -> Self::f32s {
		// `-0.0` is a mask of just the sign bit; AND-ing with its complement
		// clears the sign of every lane.
		self.and_f32s(self.not_f32s(self.splat_f32s(-0.0)), a)
	}
	#[inline]
	fn abs_f64s(self, a: Self::f64s) -> Self::f64s {
		// Same sign-bit-clearing trick as `abs_f32s`.
		self.and_f64s(self.not_f64s(self.splat_f64s(-0.0)), a)
	}
	/// Contains the max norm in both the real and imaginary components.
	fn abs_max_c32s(self, a: Self::c32s) -> Self::c32s;
	/// Contains the max norm in both the real and imaginary components.
	fn abs_max_c64s(self, a: Self::c64s) -> Self::c64s;
	// Lane-wise addition.
	fn add_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
	fn add_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
	fn add_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
	fn add_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
	#[inline]
	fn add_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::i32s {
		// Signed addition reuses the unsigned op: in two's complement the
		// result bits are identical.
		self.transmute_i32s_u32s(
			self.add_u32s(self.transmute_u32s_i32s(a), self.transmute_u32s_i32s(b)),
		)
	}
	#[inline]
	fn add_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::i64s {
		self.transmute_i64s_u64s(
			self.add_u64s(self.transmute_u64s_i64s(a), self.transmute_u64s_i64s(b)),
		)
	}
	fn add_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
	fn add_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
	#[inline]
	fn and_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
		// Bitwise AND of the float lanes via the integer op.
		self.transmute_f32s_u32s(
			self.and_u32s(self.transmute_u32s_f32s(a), self.transmute_u32s_f32s(b)),
		)
	}
	#[inline]
	fn and_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
		self.transmute_f64s_u64s(
			self.and_u64s(self.transmute_u64s_f64s(a), self.transmute_u64s_f64s(b)),
		)
	}
	#[inline]
	fn and_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::i32s {
		self.transmute_i32s_u32s(
			self.and_u32s(self.transmute_u32s_i32s(a), self.transmute_u32s_i32s(b)),
		)
	}
	#[inline]
	fn and_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::i64s {
		self.transmute_i64s_u64s(
			self.and_u64s(self.transmute_u64s_i64s(a), self.transmute_u64s_i64s(b)),
		)
	}
	// Bitwise AND; the mask/unsigned variants are the backend primitives.
	fn and_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
	fn and_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
	fn and_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
	fn and_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
	// Slice-splitting helpers. Judging by the signatures, `as_[mut_]simd_*`
	// return the SIMD-register-typed part first and the scalar remainder
	// second, while the `rsimd` variants return them in the opposite order
	// (scalar part first). All delegate to the `(r)split_*` helpers defined
	// elsewhere in this crate; the soundness of the pointer casts is
	// presumably established there — confirm at those definitions.
	#[inline(always)]
	fn as_mut_rsimd_c32s(slice: &mut [c32]) -> (&mut [c32], &mut [Self::c32s]) {
		unsafe { rsplit_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_mut_rsimd_c64s(slice: &mut [c64]) -> (&mut [c64], &mut [Self::c64s]) {
		unsafe { rsplit_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_mut_rsimd_f32s(slice: &mut [f32]) -> (&mut [f32], &mut [Self::f32s]) {
		unsafe { rsplit_mut_slice(slice) }
	}

	#[inline(always)]
	fn as_mut_rsimd_f64s(slice: &mut [f64]) -> (&mut [f64], &mut [Self::f64s]) {
		unsafe { rsplit_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_mut_rsimd_i32s(slice: &mut [i32]) -> (&mut [i32], &mut [Self::i32s]) {
		unsafe { rsplit_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_mut_rsimd_i64s(slice: &mut [i64]) -> (&mut [i64], &mut [Self::i64s]) {
		unsafe { rsplit_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_mut_rsimd_u32s(slice: &mut [u32]) -> (&mut [u32], &mut [Self::u32s]) {
		unsafe { rsplit_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_mut_rsimd_u64s(slice: &mut [u64]) -> (&mut [u64], &mut [Self::u64s]) {
		unsafe { rsplit_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_mut_simd_c32s(slice: &mut [c32]) -> (&mut [Self::c32s], &mut [c32]) {
		unsafe { split_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_mut_simd_c64s(slice: &mut [c64]) -> (&mut [Self::c64s], &mut [c64]) {
		unsafe { split_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_mut_simd_f32s(slice: &mut [f32]) -> (&mut [Self::f32s], &mut [f32]) {
		unsafe { split_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_mut_simd_f64s(slice: &mut [f64]) -> (&mut [Self::f64s], &mut [f64]) {
		unsafe { split_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_mut_simd_i32s(slice: &mut [i32]) -> (&mut [Self::i32s], &mut [i32]) {
		unsafe { split_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_mut_simd_i64s(slice: &mut [i64]) -> (&mut [Self::i64s], &mut [i64]) {
		unsafe { split_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_mut_simd_u32s(slice: &mut [u32]) -> (&mut [Self::u32s], &mut [u32]) {
		unsafe { split_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_mut_simd_u64s(slice: &mut [u64]) -> (&mut [Self::u64s], &mut [u64]) {
		unsafe { split_mut_slice(slice) }
	}
	// Shared-slice versions of the splitters above.
	#[inline(always)]
	fn as_rsimd_c32s(slice: &[c32]) -> (&[c32], &[Self::c32s]) {
		unsafe { rsplit_slice(slice) }
	}
	#[inline(always)]
	fn as_rsimd_c64s(slice: &[c64]) -> (&[c64], &[Self::c64s]) {
		unsafe { rsplit_slice(slice) }
	}
	#[inline(always)]
	fn as_rsimd_f32s(slice: &[f32]) -> (&[f32], &[Self::f32s]) {
		unsafe { rsplit_slice(slice) }
	}
	#[inline(always)]
	fn as_rsimd_f64s(slice: &[f64]) -> (&[f64], &[Self::f64s]) {
		unsafe { rsplit_slice(slice) }
	}
	#[inline(always)]
	fn as_rsimd_i32s(slice: &[i32]) -> (&[i32], &[Self::i32s]) {
		unsafe { rsplit_slice(slice) }
	}
	#[inline(always)]
	fn as_rsimd_i64s(slice: &[i64]) -> (&[i64], &[Self::i64s]) {
		unsafe { rsplit_slice(slice) }
	}
	#[inline(always)]
	fn as_rsimd_u32s(slice: &[u32]) -> (&[u32], &[Self::u32s]) {
		unsafe { rsplit_slice(slice) }
	}
	#[inline(always)]
	fn as_rsimd_u64s(slice: &[u64]) -> (&[u64], &[Self::u64s]) {
		unsafe { rsplit_slice(slice) }
	}
	#[inline(always)]
	fn as_simd_c32s(slice: &[c32]) -> (&[Self::c32s], &[c32]) {
		unsafe { split_slice(slice) }
	}
	#[inline(always)]
	fn as_simd_c64s(slice: &[c64]) -> (&[Self::c64s], &[c64]) {
		unsafe { split_slice(slice) }
	}
	#[inline(always)]
	fn as_simd_f32s(slice: &[f32]) -> (&[Self::f32s], &[f32]) {
		unsafe { split_slice(slice) }
	}

	#[inline(always)]
	fn as_simd_f64s(slice: &[f64]) -> (&[Self::f64s], &[f64]) {
		unsafe { split_slice(slice) }
	}
	#[inline(always)]
	fn as_simd_i32s(slice: &[i32]) -> (&[Self::i32s], &[i32]) {
		unsafe { split_slice(slice) }
	}
	#[inline(always)]
	fn as_simd_i64s(slice: &[i64]) -> (&[Self::i64s], &[i64]) {
		unsafe { split_slice(slice) }
	}
	#[inline(always)]
	fn as_simd_u32s(slice: &[u32]) -> (&[Self::u32s], &[u32]) {
		unsafe { split_slice(slice) }
	}
	#[inline(always)]
	fn as_simd_u64s(slice: &[u64]) -> (&[Self::u64s], &[u64]) {
		unsafe { split_slice(slice) }
	}
	// The same splitters, for possibly-uninitialized slices.
	#[inline(always)]
	fn as_uninit_mut_rsimd_c32s(
		slice: &mut [MaybeUninit<c32>],
	) -> (&mut [MaybeUninit<c32>], &mut [MaybeUninit<Self::c32s>]) {
		unsafe { rsplit_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_uninit_mut_rsimd_c64s(
		slice: &mut [MaybeUninit<c64>],
	) -> (&mut [MaybeUninit<c64>], &mut [MaybeUninit<Self::c64s>]) {
		unsafe { rsplit_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_uninit_mut_rsimd_f32s(
		slice: &mut [MaybeUninit<f32>],
	) -> (&mut [MaybeUninit<f32>], &mut [MaybeUninit<Self::f32s>]) {
		unsafe { rsplit_mut_slice(slice) }
	}

	#[inline(always)]
	fn as_uninit_mut_rsimd_f64s(
		slice: &mut [MaybeUninit<f64>],
	) -> (&mut [MaybeUninit<f64>], &mut [MaybeUninit<Self::f64s>]) {
		unsafe { rsplit_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_uninit_mut_rsimd_i32s(
		slice: &mut [MaybeUninit<i32>],
	) -> (&mut [MaybeUninit<i32>], &mut [MaybeUninit<Self::i32s>]) {
		unsafe { rsplit_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_uninit_mut_rsimd_i64s(
		slice: &mut [MaybeUninit<i64>],
	) -> (&mut [MaybeUninit<i64>], &mut [MaybeUninit<Self::i64s>]) {
		unsafe { rsplit_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_uninit_mut_rsimd_u32s(
		slice: &mut [MaybeUninit<u32>],
	) -> (&mut [MaybeUninit<u32>], &mut [MaybeUninit<Self::u32s>]) {
		unsafe { rsplit_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_uninit_mut_rsimd_u64s(
		slice: &mut [MaybeUninit<u64>],
	) -> (&mut [MaybeUninit<u64>], &mut [MaybeUninit<Self::u64s>]) {
		unsafe { rsplit_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_uninit_mut_simd_c32s(
		slice: &mut [MaybeUninit<c32>],
	) -> (&mut [MaybeUninit<Self::c32s>], &mut [MaybeUninit<c32>]) {
		unsafe { split_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_uninit_mut_simd_c64s(
		slice: &mut [MaybeUninit<c64>],
	) -> (&mut [MaybeUninit<Self::c64s>], &mut [MaybeUninit<c64>]) {
		unsafe { split_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_uninit_mut_simd_f32s(
		slice: &mut [MaybeUninit<f32>],
	) -> (&mut [MaybeUninit<Self::f32s>], &mut [MaybeUninit<f32>]) {
		unsafe { split_mut_slice(slice) }
	}

	#[inline(always)]
	fn as_uninit_mut_simd_f64s(
		slice: &mut [MaybeUninit<f64>],
	) -> (&mut [MaybeUninit<Self::f64s>], &mut [MaybeUninit<f64>]) {
		unsafe { split_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_uninit_mut_simd_i32s(
		slice: &mut [MaybeUninit<i32>],
	) -> (&mut [MaybeUninit<Self::i32s>], &mut [MaybeUninit<i32>]) {
		unsafe { split_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_uninit_mut_simd_i64s(
		slice: &mut [MaybeUninit<i64>],
	) -> (&mut [MaybeUninit<Self::i64s>], &mut [MaybeUninit<i64>]) {
		unsafe { split_mut_slice(slice) }
	}
	#[inline(always)]
	fn as_uninit_mut_simd_u32s(
		slice: &mut [MaybeUninit<u32>],
	) -> (&mut [MaybeUninit<Self::u32s>], &mut [MaybeUninit<u32>]) {
		unsafe { split_mut_slice(slice) }
	}

	#[inline(always)]
	fn as_uninit_mut_simd_u64s(
		slice: &mut [MaybeUninit<u64>],
	) -> (&mut [MaybeUninit<Self::u64s>], &mut [MaybeUninit<u64>]) {
		unsafe { split_mut_slice(slice) }
	}
	// Complex conjugate and conjugate-multiply primitives; the `_e` variants
	// below document the computed expression and default to these.
	fn conj_c32s(self, a: Self::c32s) -> Self::c32s;
	fn conj_c64s(self, a: Self::c64s) -> Self::c64s;
	fn conj_mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s;

	fn conj_mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s;
	/// Computes `conj(a) * b + c`
	#[inline]
	fn conj_mul_add_e_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
		self.conj_mul_add_c32s(a, b, c)
	}
	/// Computes `conj(a) * b + c`
	#[inline]
	fn conj_mul_add_e_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
		self.conj_mul_add_c64s(a, b, c)
	}
	fn conj_mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;

	fn conj_mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
	/// Computes `conj(a) * b`
	#[inline]
	fn conj_mul_e_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
		self.conj_mul_c32s(a, b)
	}
	/// Computes `conj(a) * b`
	#[inline]
	fn conj_mul_e_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
		self.conj_mul_c64s(a, b)
	}
	// Default deinterleave: scalar fallback transpose; backends may override
	// with shuffle instructions.
	#[inline(always)]
	fn deinterleave_shfl_f32s<T: Interleave>(self, values: T) -> T {
		unsafe { deinterleave_fallback::<f32, Self::f32s, T>(values) }
	}

	#[inline(always)]
	fn deinterleave_shfl_f64s<T: Interleave>(self, values: T) -> T {
		unsafe { deinterleave_fallback::<f64, Self::f64s, T>(values) }
	}
	// Lane-wise division and equality comparison (result is a lane mask).
	fn div_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
	fn div_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
	fn equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;

	fn equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
778	#[inline(always)]
779	fn first_true_m32s(self, mask: Self::m32s) -> usize {
780		if try_const! { core::mem::size_of::<Self::m32s>() == core::mem::size_of::<Self::u32s>() } {
781			let mask: Self::u32s = bytemuck::cast(mask);
782			let slice = bytemuck::cast_slice::<Self::u32s, u32>(core::slice::from_ref(&mask));
783			let mut i = 0;
784			for &x in slice.iter() {
785				if x != 0 {
786					break;
787				}
788				i += 1;
789			}
790			i
791		} else if try_const! { core::mem::size_of::<Self::m32s>() == core::mem::size_of::<u8>() } {
792			let mask: u8 = bytemuck::cast(mask);
793			mask.leading_zeros() as usize
794		} else if try_const! { core::mem::size_of::<Self::m32s>() == core::mem::size_of::<u16>() } {
795			let mask: u16 = bytemuck::cast(mask);
796			mask.leading_zeros() as usize
797		} else {
798			panic!()
799		}
800	}
801
802	#[inline(always)]
803	fn first_true_m64s(self, mask: Self::m64s) -> usize {
804		if try_const! { core::mem::size_of::<Self::m64s>() == core::mem::size_of::<Self::u64s>() } {
805			let mask: Self::u64s = bytemuck::cast(mask);
806			let slice = bytemuck::cast_slice::<Self::u64s, u64>(core::slice::from_ref(&mask));
807			let mut i = 0;
808			for &x in slice.iter() {
809				if x != 0 {
810					break;
811				}
812				i += 1;
813			}
814			i
815		} else if try_const! { core::mem::size_of::<Self::m64s>() == core::mem::size_of::<u8>() } {
816			let mask: u8 = bytemuck::cast(mask);
817			mask.leading_zeros() as usize
818		} else if try_const! { core::mem::size_of::<Self::m64s>() == core::mem::size_of::<u16>() } {
819			let mask: u16 = bytemuck::cast(mask);
820			mask.leading_zeros() as usize
821		} else {
822			panic!()
823		}
824	}
825
	// `greater_than*` defaults are `less_than*` with the operands swapped.
	#[inline]
	fn greater_than_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
		self.less_than_f32s(b, a)
	}

	#[inline]
	fn greater_than_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
		self.less_than_f64s(b, a)
	}
	#[inline]
	fn greater_than_or_equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
		self.less_than_or_equal_f32s(b, a)
	}
	#[inline]
	fn greater_than_or_equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
		self.less_than_or_equal_f64s(b, a)
	}

	fn greater_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
	fn greater_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
	fn greater_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
	fn greater_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
	fn greater_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
	fn greater_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
	fn greater_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
	fn greater_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;

	// Default interleave: scalar fallback transpose; backends may override
	// with shuffle instructions.
	#[inline(always)]
	fn interleave_shfl_f32s<T: Interleave>(self, values: T) -> T {
		unsafe { interleave_fallback::<f32, Self::f32s, T>(values) }
	}

	#[inline(always)]
	fn interleave_shfl_f64s<T: Interleave>(self, values: T) -> T {
		unsafe { interleave_fallback::<f64, Self::f64s, T>(values) }
	}

	// Lane-wise ordered comparisons; these are the backend primitives that
	// the `greater_than*` float defaults above build on.
	fn less_than_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
	fn less_than_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
	fn less_than_or_equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
	fn less_than_or_equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;

	fn less_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
	fn less_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
	fn less_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s;
	fn less_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s;
	fn less_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
	fn less_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;
	fn less_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
	fn less_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s;

	// Builds a mask enabling exactly the lanes in `start..end`, by comparing
	// a lane-index vector against the two bounds. `iota_32`/`iota_64` are
	// defined elsewhere in this crate — presumably the [0, 1, 2, …] index
	// vector; confirm there.
	#[inline(always)]
	fn mask_between_m32s(self, start: u32, end: u32) -> MemMask<Self::m32s> {
		let iota: Self::u32s =
			try_const! { unsafe { core::mem::transmute_copy(&iota_32::<u32>()) } };
		self.and_m32s(
			self.greater_than_or_equal_u32s(iota, self.splat_u32s(start)),
			self.less_than_u32s(iota, self.splat_u32s(end)),
		)
		.into()
	}

	#[inline(always)]
	fn mask_between_m64s(self, start: u64, end: u64) -> MemMask<Self::m64s> {
		let iota: Self::u64s =
			try_const! { unsafe { core::mem::transmute_copy(&iota_64::<u64>()) } };
		self.and_m64s(
			self.greater_than_or_equal_u64s(iota, self.splat_u64s(start)),
			self.less_than_u64s(iota, self.splat_u64s(end)),
		)
		.into()
	}
898	/// # Safety
899	///
900	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
901	/// [`core::ptr::read`].
902	unsafe fn mask_load_ptr_c32s(self, mask: MemMask<Self::m32s>, ptr: *const c32) -> Self::c32s;
903	/// # Safety
904	///
905	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
906	/// [`core::ptr::read`].
907	unsafe fn mask_load_ptr_c64s(self, mask: MemMask<Self::m64s>, ptr: *const c64) -> Self::c64s;
908	/// # Safety
909	///
910	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
911	/// [`core::ptr::read`].
912	#[inline(always)]
913	unsafe fn mask_load_ptr_f32s(self, mask: MemMask<Self::m32s>, ptr: *const f32) -> Self::f32s {
914		self.transmute_f32s_u32s(self.mask_load_ptr_u32s(mask, ptr as *const u32))
915	}
916
917	/// # Safety
918	///
919	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
920	/// [`core::ptr::read`].
921	#[inline(always)]
922	unsafe fn mask_load_ptr_f64s(self, mask: MemMask<Self::m64s>, ptr: *const f64) -> Self::f64s {
923		self.transmute_f64s_u64s(self.mask_load_ptr_u64s(mask, ptr as *const u64))
924	}
925	/// # Safety
926	///
927	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
928	/// [`core::ptr::read`].
929	#[inline(always)]
930	unsafe fn mask_load_ptr_i32s(self, mask: MemMask<Self::m32s>, ptr: *const i32) -> Self::i32s {
931		self.transmute_i32s_u32s(self.mask_load_ptr_u32s(mask, ptr as *const u32))
932	}
933	/// # Safety
934	///
935	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
936	/// [`core::ptr::read`].
937	#[inline(always)]
938	unsafe fn mask_load_ptr_i64s(self, mask: MemMask<Self::m64s>, ptr: *const i64) -> Self::i64s {
939		self.transmute_i64s_u64s(self.mask_load_ptr_u64s(mask, ptr as *const u64))
940	}
	/// Loads `u32` lanes from `ptr` at the positions enabled in `mask`.
	///
	/// # Safety
	///
	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
	/// [`core::ptr::read`].
	unsafe fn mask_load_ptr_u32s(self, mask: MemMask<Self::m32s>, ptr: *const u32) -> Self::u32s;

	/// Loads `u64` lanes from `ptr` at the positions enabled in `mask`.
	///
	/// # Safety
	///
	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
	/// [`core::ptr::read`].
	unsafe fn mask_load_ptr_u64s(self, mask: MemMask<Self::m64s>, ptr: *const u64) -> Self::u64s;
	/// Stores the enabled `c32` lanes of `values` to `ptr`; disabled lanes leave memory
	/// untouched.
	///
	/// # Safety
	///
	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
	/// [`core::ptr::write`].
	unsafe fn mask_store_ptr_c32s(
		self,
		mask: MemMask<Self::m32s>,
		ptr: *mut c32,
		values: Self::c32s,
	);
	/// Stores the enabled `c64` lanes of `values` to `ptr`; disabled lanes leave memory
	/// untouched.
	///
	/// # Safety
	///
	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
	/// [`core::ptr::write`].
	unsafe fn mask_store_ptr_c64s(
		self,
		mask: MemMask<Self::m64s>,
		ptr: *mut c64,
		values: Self::c64s,
	);
972	/// # Safety
973	///
974	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
975	/// [`core::ptr::write`].
976	#[inline(always)]
977	unsafe fn mask_store_ptr_f32s(
978		self,
979		mask: MemMask<Self::m32s>,
980		ptr: *mut f32,
981		values: Self::f32s,
982	) {
983		self.mask_store_ptr_u32s(mask, ptr as *mut u32, self.transmute_u32s_f32s(values));
984	}
985
986	/// # Safety
987	///
988	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
989	/// [`core::ptr::write`].
990	#[inline(always)]
991	unsafe fn mask_store_ptr_f64s(
992		self,
993		mask: MemMask<Self::m64s>,
994		ptr: *mut f64,
995		values: Self::f64s,
996	) {
997		self.mask_store_ptr_u64s(mask, ptr as *mut u64, self.transmute_u64s_f64s(values));
998	}
999	/// # Safety
1000	///
1001	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
1002	/// [`core::ptr::write`].
1003	#[inline(always)]
1004	unsafe fn mask_store_ptr_i32s(
1005		self,
1006		mask: MemMask<Self::m32s>,
1007		ptr: *mut i32,
1008		values: Self::i32s,
1009	) {
1010		self.mask_store_ptr_u32s(mask, ptr as *mut u32, self.transmute_u32s_i32s(values));
1011	}
1012	/// # Safety
1013	///
1014	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
1015	/// [`core::ptr::write`].
1016	#[inline(always)]
1017	unsafe fn mask_store_ptr_i64s(
1018		self,
1019		mask: MemMask<Self::m64s>,
1020		ptr: *mut i64,
1021		values: Self::i64s,
1022	) {
1023		self.mask_store_ptr_u64s(mask, ptr as *mut u64, self.transmute_u64s_i64s(values));
1024	}
	/// Stores the enabled `u32` lanes of `values` to `ptr`; disabled lanes leave memory
	/// untouched.
	///
	/// # Safety
	///
	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
	/// [`core::ptr::write`].
	unsafe fn mask_store_ptr_u32s(
		self,
		mask: MemMask<Self::m32s>,
		ptr: *mut u32,
		values: Self::u32s,
	);

	/// Stores the enabled `u64` lanes of `values` to `ptr`; disabled lanes leave memory
	/// untouched.
	///
	/// # Safety
	///
	/// Addresses corresponding to enabled lanes in the mask have the same restrictions as
	/// [`core::ptr::write`].
	unsafe fn mask_store_ptr_u64s(
		self,
		mask: MemMask<Self::m64s>,
		ptr: *mut u64,
		values: Self::u64s,
	);
	/// Lane-wise maximum of `a` and `b`.
	///
	/// NOTE(review): per the file-level FIXME, x86 backends may use the non-IEEE
	/// min/max intrinsics, which do not propagate NaNs — confirm before relying on NaN
	/// semantics.
	fn max_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
	/// Lane-wise maximum of `a` and `b` (see the NaN note on [`Simd::max_f32s`]).
	fn max_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
	/// Lane-wise minimum of `a` and `b` (see the NaN note on [`Simd::max_f32s`]).
	fn min_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
	/// Lane-wise minimum of `a` and `b` (see the NaN note on [`Simd::max_f32s`]).
	fn min_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;

	/// Computes `a * b + c` on complex lanes.
	fn mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s;
	/// Computes `a * b + c` on complex lanes.
	fn mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s;
	/// Computes `a * b + c`
	///
	/// NOTE(review): the default simply delegates to [`Simd::mul_add_c32s`]; presumably
	/// the `_e` variant permits a relaxed (possibly non-fused) evaluation — confirm the
	/// intended contract against the other `_e` methods.
	#[inline]
	fn mul_add_e_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
		self.mul_add_c32s(a, b, c)
	}
	/// Computes `a * b + c`
	///
	/// Default delegates to [`Simd::mul_add_c64s`].
	#[inline]
	fn mul_add_e_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
		self.mul_add_c64s(a, b, c)
	}
	/// Computes `a * b + c` (`_e` variant).
	fn mul_add_e_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s;
	/// Computes `a * b + c` (`_e` variant).
	fn mul_add_e_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s;
	/// Computes `a * b + c`.
	fn mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s;
	/// Computes `a * b + c`.
	fn mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s;
	/// Computes `a * b` on complex lanes.
	fn mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;

	/// Computes `a * b` on complex lanes.
	fn mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
1070	/// Computes `a * b`
1071	#[inline]
1072	fn mul_e_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
1073		self.mul_c32s(a, b)
1074	}
1075	/// Computes `a * b`
1076	fn mul_e_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
1077		self.mul_c64s(a, b)
1078	}
	/// Lane-wise product of `a` and `b`.
	fn mul_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
	/// Lane-wise product of `a` and `b`.
	fn mul_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
	/// Negates each complex lane.
	fn neg_c32s(self, a: Self::c32s) -> Self::c32s;
	/// Negates each complex lane.
	fn neg_c64s(self, a: Self::c64s) -> Self::c64s;
1083
1084	#[inline]
1085	fn neg_f32s(self, a: Self::f32s) -> Self::f32s {
1086		self.xor_f32s(self.splat_f32s(-0.0), a)
1087	}
1088	#[inline]
1089	fn neg_f64s(self, a: Self::f64s) -> Self::f64s {
1090		self.xor_f64s(a, self.splat_f64s(-0.0))
1091	}
1092	#[inline]
1093	fn not_f32s(self, a: Self::f32s) -> Self::f32s {
1094		self.transmute_f32s_u32s(self.not_u32s(self.transmute_u32s_f32s(a)))
1095	}
1096
1097	#[inline]
1098	fn not_f64s(self, a: Self::f64s) -> Self::f64s {
1099		self.transmute_f64s_u64s(self.not_u64s(self.transmute_u64s_f64s(a)))
1100	}
1101	#[inline]
1102	fn not_i32s(self, a: Self::i32s) -> Self::i32s {
1103		self.transmute_i32s_u32s(self.not_u32s(self.transmute_u32s_i32s(a)))
1104	}
1105	#[inline]
1106	fn not_i64s(self, a: Self::i64s) -> Self::i64s {
1107		self.transmute_i64s_u64s(self.not_u64s(self.transmute_u64s_i64s(a)))
1108	}
1109
	/// Bitwise NOT of each 32-bit mask lane.
	fn not_m32s(self, a: Self::m32s) -> Self::m32s;
	/// Bitwise NOT of each 64-bit mask lane.
	fn not_m64s(self, a: Self::m64s) -> Self::m64s;
	/// Bitwise NOT of each `u32` lane.
	fn not_u32s(self, a: Self::u32s) -> Self::u32s;
	/// Bitwise NOT of each `u64` lane.
	fn not_u64s(self, a: Self::u64s) -> Self::u64s;
1114	#[inline]
1115	fn or_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
1116		self.transmute_f32s_u32s(
1117			self.or_u32s(self.transmute_u32s_f32s(a), self.transmute_u32s_f32s(b)),
1118		)
1119	}
1120	#[inline]
1121	fn or_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
1122		self.transmute_f64s_u64s(
1123			self.or_u64s(self.transmute_u64s_f64s(a), self.transmute_u64s_f64s(b)),
1124		)
1125	}
1126	#[inline]
1127	fn or_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::i32s {
1128		self.transmute_i32s_u32s(
1129			self.or_u32s(self.transmute_u32s_i32s(a), self.transmute_u32s_i32s(b)),
1130		)
1131	}
1132	#[inline]
1133	fn or_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::i64s {
1134		self.transmute_i64s_u64s(
1135			self.or_u64s(self.transmute_u64s_i64s(a), self.transmute_u64s_i64s(b)),
1136		)
1137	}
	/// Bitwise OR of each 32-bit mask lane.
	fn or_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;

	/// Bitwise OR of each 64-bit mask lane.
	fn or_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
	/// Bitwise OR of each `u32` lane.
	fn or_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
	/// Bitwise OR of each `u64` lane.
	fn or_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
1143	#[inline(always)]
1144	fn partial_load_c32s(self, slice: &[c32]) -> Self::c32s {
1145		cast(self.partial_load_f64s(bytemuck::cast_slice(slice)))
1146	}
1147	#[inline(always)]
1148	fn partial_load_c64s(self, slice: &[c64]) -> Self::c64s {
1149		cast(self.partial_load_f64s(bytemuck::cast_slice(slice)))
1150	}
1151	#[inline(always)]
1152	fn partial_load_f32s(self, slice: &[f32]) -> Self::f32s {
1153		cast(self.partial_load_u32s(bytemuck::cast_slice(slice)))
1154	}
1155	#[inline(always)]
1156	fn partial_load_f64s(self, slice: &[f64]) -> Self::f64s {
1157		cast(self.partial_load_u64s(bytemuck::cast_slice(slice)))
1158	}
1159	#[inline(always)]
1160	fn partial_load_i32s(self, slice: &[i32]) -> Self::i32s {
1161		cast(self.partial_load_u32s(bytemuck::cast_slice(slice)))
1162	}
1163	#[inline(always)]
1164	fn partial_load_i64s(self, slice: &[i64]) -> Self::i64s {
1165		cast(self.partial_load_u64s(bytemuck::cast_slice(slice)))
1166	}
1167	#[inline(always)]
1168	fn partial_load_u32s(self, slice: &[u32]) -> Self::u32s {
1169		unsafe {
1170			self.mask_load_ptr_u32s(
1171				self.mask_between_m32s(0, slice.len() as u32),
1172				slice.as_ptr(),
1173			)
1174		}
1175	}
1176	#[inline(always)]
1177	fn partial_load_u64s(self, slice: &[u64]) -> Self::u64s {
1178		unsafe {
1179			self.mask_load_ptr_u64s(
1180				self.mask_between_m64s(0, slice.len() as u64),
1181				slice.as_ptr(),
1182			)
1183		}
1184	}
1185
1186	#[inline(always)]
1187	fn partial_store_c32s(self, slice: &mut [c32], values: Self::c32s) {
1188		self.partial_store_f64s(bytemuck::cast_slice_mut(slice), cast(values))
1189	}
1190	#[inline(always)]
1191	fn partial_store_c64s(self, slice: &mut [c64], values: Self::c64s) {
1192		self.partial_store_f64s(bytemuck::cast_slice_mut(slice), cast(values))
1193	}
1194
1195	#[inline(always)]
1196	fn partial_store_f32s(self, slice: &mut [f32], values: Self::f32s) {
1197		self.partial_store_u32s(bytemuck::cast_slice_mut(slice), cast(values))
1198	}
1199	#[inline(always)]
1200	fn partial_store_f64s(self, slice: &mut [f64], values: Self::f64s) {
1201		self.partial_store_u64s(bytemuck::cast_slice_mut(slice), cast(values))
1202	}
1203	#[inline(always)]
1204	fn partial_store_i32s(self, slice: &mut [i32], values: Self::i32s) {
1205		self.partial_store_u32s(bytemuck::cast_slice_mut(slice), cast(values))
1206	}
1207	#[inline(always)]
1208	fn partial_store_i64s(self, slice: &mut [i64], values: Self::i64s) {
1209		self.partial_store_u64s(bytemuck::cast_slice_mut(slice), cast(values))
1210	}
1211	#[inline(always)]
1212	fn partial_store_u32s(self, slice: &mut [u32], values: Self::u32s) {
1213		unsafe {
1214			self.mask_store_ptr_u32s(
1215				self.mask_between_m32s(0, slice.len() as u32),
1216				slice.as_mut_ptr(),
1217				values,
1218			)
1219		}
1220	}
1221	#[inline(always)]
1222	fn partial_store_u64s(self, slice: &mut [u64], values: Self::u64s) {
1223		unsafe {
1224			self.mask_store_ptr_u64s(
1225				self.mask_between_m64s(0, slice.len() as u64),
1226				slice.as_mut_ptr(),
1227				values,
1228			)
1229		}
1230	}
	/// Horizontal max over the complex lanes.
	///
	/// NOTE(review): the ordering used to compare complex values is not visible here —
	/// confirm against the implementations.
	fn reduce_max_c32s(self, a: Self::c32s) -> c32;
	/// Horizontal max over the complex lanes (see the note on [`Simd::reduce_max_c32s`]).
	fn reduce_max_c64s(self, a: Self::c64s) -> c64;
	/// Horizontal maximum of the `f32` lanes.
	fn reduce_max_f32s(self, a: Self::f32s) -> f32;
	/// Horizontal maximum of the `f64` lanes.
	fn reduce_max_f64s(self, a: Self::f64s) -> f64;
	/// Horizontal min over the complex lanes (see the note on [`Simd::reduce_max_c32s`]).
	fn reduce_min_c32s(self, a: Self::c32s) -> c32;
	/// Horizontal min over the complex lanes (see the note on [`Simd::reduce_max_c32s`]).
	fn reduce_min_c64s(self, a: Self::c64s) -> c64;
	/// Horizontal minimum of the `f32` lanes.
	fn reduce_min_f32s(self, a: Self::f32s) -> f32;
	/// Horizontal minimum of the `f64` lanes.
	fn reduce_min_f64s(self, a: Self::f64s) -> f64;

	/// Horizontal product of the `f32` lanes.
	fn reduce_product_f32s(self, a: Self::f32s) -> f32;
	/// Horizontal product of the `f64` lanes.
	fn reduce_product_f64s(self, a: Self::f64s) -> f64;
	/// Horizontal sum of the complex lanes.
	fn reduce_sum_c32s(self, a: Self::c32s) -> c32;
	/// Horizontal sum of the complex lanes.
	fn reduce_sum_c64s(self, a: Self::c64s) -> c64;

	/// Horizontal sum of the `f32` lanes.
	fn reduce_sum_f32s(self, a: Self::f32s) -> f32;
	/// Horizontal sum of the `f64` lanes.
	fn reduce_sum_f64s(self, a: Self::f64s) -> f64;
1247	#[inline(always)]
1248	fn rotate_left_c32s(self, a: Self::c32s, amount: usize) -> Self::c32s {
1249		self.rotate_right_c32s(a, amount.wrapping_neg())
1250	}
1251	#[inline(always)]
1252	fn rotate_left_c64s(self, a: Self::c64s, amount: usize) -> Self::c64s {
1253		self.rotate_right_c64s(a, amount.wrapping_neg())
1254	}
1255
1256	#[inline(always)]
1257	fn rotate_left_f32s(self, a: Self::f32s, amount: usize) -> Self::f32s {
1258		cast(self.rotate_left_u32s(cast(a), amount))
1259	}
1260	#[inline(always)]
1261	fn rotate_left_f64s(self, a: Self::f64s, amount: usize) -> Self::f64s {
1262		cast(self.rotate_left_u64s(cast(a), amount))
1263	}
1264	#[inline(always)]
1265	fn rotate_left_i32s(self, a: Self::i32s, amount: usize) -> Self::i32s {
1266		cast(self.rotate_left_u32s(cast(a), amount))
1267	}
1268
1269	#[inline(always)]
1270	fn rotate_left_i64s(self, a: Self::i64s, amount: usize) -> Self::i64s {
1271		cast(self.rotate_left_u64s(cast(a), amount))
1272	}
1273
1274	#[inline(always)]
1275	fn rotate_left_u32s(self, a: Self::u32s, amount: usize) -> Self::u32s {
1276		self.rotate_right_u32s(a, amount.wrapping_neg())
1277	}
1278	#[inline(always)]
1279	fn rotate_left_u64s(self, a: Self::u64s, amount: usize) -> Self::u64s {
1280		self.rotate_right_u64s(a, amount.wrapping_neg())
1281	}
1282	fn rotate_right_c32s(self, a: Self::c32s, amount: usize) -> Self::c32s;
1283	fn rotate_right_c64s(self, a: Self::c64s, amount: usize) -> Self::c64s;
1284	#[inline(always)]
1285	fn rotate_right_f32s(self, a: Self::f32s, amount: usize) -> Self::f32s {
1286		cast(self.rotate_right_u32s(cast(a), amount))
1287	}
1288	#[inline(always)]
1289	fn rotate_right_f64s(self, a: Self::f64s, amount: usize) -> Self::f64s {
1290		cast(self.rotate_right_u64s(cast(a), amount))
1291	}
1292	#[inline(always)]
1293	fn rotate_right_i32s(self, a: Self::i32s, amount: usize) -> Self::i32s {
1294		cast(self.rotate_right_u32s(cast(a), amount))
1295	}
1296	#[inline(always)]
1297	fn rotate_right_i64s(self, a: Self::i64s, amount: usize) -> Self::i64s {
1298		cast(self.rotate_right_u64s(cast(a), amount))
1299	}
1300	fn rotate_right_u32s(self, a: Self::u32s, amount: usize) -> Self::u32s;
1301	fn rotate_right_u64s(self, a: Self::u64s, amount: usize) -> Self::u64s;
1302
1303	#[inline]
1304	fn select_f32s_m32s(
1305		self,
1306		mask: Self::m32s,
1307		if_true: Self::f32s,
1308		if_false: Self::f32s,
1309	) -> Self::f32s {
1310		self.transmute_f32s_u32s(self.select_u32s_m32s(
1311			mask,
1312			self.transmute_u32s_f32s(if_true),
1313			self.transmute_u32s_f32s(if_false),
1314		))
1315	}
1316	#[inline]
1317	fn select_f64s_m64s(
1318		self,
1319		mask: Self::m64s,
1320		if_true: Self::f64s,
1321		if_false: Self::f64s,
1322	) -> Self::f64s {
1323		self.transmute_f64s_u64s(self.select_u64s_m64s(
1324			mask,
1325			self.transmute_u64s_f64s(if_true),
1326			self.transmute_u64s_f64s(if_false),
1327		))
1328	}
1329	#[inline]
1330	fn select_i32s_m32s(
1331		self,
1332		mask: Self::m32s,
1333		if_true: Self::i32s,
1334		if_false: Self::i32s,
1335	) -> Self::i32s {
1336		self.transmute_i32s_u32s(self.select_u32s_m32s(
1337			mask,
1338			self.transmute_u32s_i32s(if_true),
1339			self.transmute_u32s_i32s(if_false),
1340		))
1341	}
1342	#[inline]
1343	fn select_i64s_m64s(
1344		self,
1345		mask: Self::m64s,
1346		if_true: Self::i64s,
1347		if_false: Self::i64s,
1348	) -> Self::i64s {
1349		self.transmute_i64s_u64s(self.select_u64s_m64s(
1350			mask,
1351			self.transmute_u64s_i64s(if_true),
1352			self.transmute_u64s_i64s(if_false),
1353		))
1354	}
	/// For each `u32` lane, picks `if_true` where `mask` is set, `if_false` otherwise.
	fn select_u32s_m32s(
		self,
		mask: Self::m32s,
		if_true: Self::u32s,
		if_false: Self::u32s,
	) -> Self::u32s;
	/// For each `u64` lane, picks `if_true` where `mask` is set, `if_false` otherwise.
	fn select_u64s_m64s(
		self,
		mask: Self::m64s,
		if_true: Self::u64s,
		if_false: Self::u64s,
	) -> Self::u64s;
	/// Broadcasts `value` to all complex lanes.
	fn splat_c32s(self, value: c32) -> Self::c32s;
	/// Broadcasts `value` to all complex lanes.
	fn splat_c64s(self, value: c64) -> Self::c64s;
	/// Broadcasts `value` to all `f32` lanes.
	fn splat_f32s(self, value: f32) -> Self::f32s;
	/// Broadcasts `value` to all `f64` lanes.
	fn splat_f64s(self, value: f64) -> Self::f64s;

	/// Broadcasts `value` to all `i32` lanes (bit-identical to the `u32` splat).
	#[inline]
	fn splat_i32s(self, value: i32) -> Self::i32s {
		self.transmute_i32s_u32s(self.splat_u32s(value as u32))
	}
	/// Broadcasts `value` to all `i64` lanes (bit-identical to the `u64` splat).
	#[inline]
	fn splat_i64s(self, value: i64) -> Self::i64s {
		self.transmute_i64s_u64s(self.splat_u64s(value as u64))
	}
	/// Broadcasts `value` to all `u32` lanes.
	fn splat_u32s(self, value: u32) -> Self::u32s;
	/// Broadcasts `value` to all `u64` lanes.
	fn splat_u64s(self, value: u64) -> Self::u64s;
1382
1383	fn sub_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
1384	fn sub_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
1385	fn sub_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
1386	fn sub_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
1387
1388	#[inline]
1389	fn sub_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::i32s {
1390		self.transmute_i32s_u32s(
1391			self.sub_u32s(self.transmute_u32s_i32s(a), self.transmute_u32s_i32s(b)),
1392		)
1393	}
1394	#[inline]
1395	fn sub_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::i64s {
1396		self.transmute_i64s_u64s(
1397			self.sub_u64s(self.transmute_u64s_i64s(a), self.transmute_u64s_i64s(b)),
1398		)
1399	}
1400
1401	fn sub_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
1402	fn sub_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
	/// Swaps the real and imaginary parts of each complex lane.
	fn swap_re_im_c32s(self, a: Self::c32s) -> Self::c32s;
	/// Swaps the real and imaginary parts of each complex lane.
	fn swap_re_im_c64s(self, a: Self::c64s) -> Self::c64s;
	// The `transmute_*` family below bit-casts between same-width lane types; no bits
	// are modified.
	#[inline]
	fn transmute_f32s_i32s(self, a: Self::i32s) -> Self::f32s {
		cast(a)
	}
	#[inline]
	fn transmute_f32s_u32s(self, a: Self::u32s) -> Self::f32s {
		cast(a)
	}

	#[inline]
	fn transmute_f64s_i64s(self, a: Self::i64s) -> Self::f64s {
		cast(a)
	}
	#[inline]
	fn transmute_f64s_u64s(self, a: Self::u64s) -> Self::f64s {
		cast(a)
	}
	#[inline]
	fn transmute_i32s_f32s(self, a: Self::f32s) -> Self::i32s {
		cast(a)
	}
	#[inline]
	fn transmute_i32s_u32s(self, a: Self::u32s) -> Self::i32s {
		cast(a)
	}
	#[inline]
	fn transmute_i64s_f64s(self, a: Self::f64s) -> Self::i64s {
		cast(a)
	}
	#[inline]
	fn transmute_i64s_u64s(self, a: Self::u64s) -> Self::i64s {
		cast(a)
	}

	#[inline]
	fn transmute_u32s_f32s(self, a: Self::f32s) -> Self::u32s {
		cast(a)
	}
	#[inline]
	fn transmute_u32s_i32s(self, a: Self::i32s) -> Self::u32s {
		cast(a)
	}
	#[inline]
	fn transmute_u64s_f64s(self, a: Self::f64s) -> Self::u64s {
		cast(a)
	}
	#[inline]
	fn transmute_u64s_i64s(self, a: Self::i64s) -> Self::u64s {
		cast(a)
	}

	/// Executes `op` with this SIMD token (the scalar backends simply call
	/// `op.with_simd(self)`).
	fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output;
	/// Widening 32×32 → 64-bit multiply, returned as two registers holding the low and
	/// high halves — NOTE(review): confirm which element of the pair is which against
	/// the implementations.
	fn widening_mul_u32s(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s);
	/// Lane-wise left shift by a per-lane `amount`; "wrapping" presumably means the
	/// shift amount wraps modulo the bit width — TODO confirm.
	fn wrapping_dyn_shl_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s;
	/// Lane-wise right shift by a per-lane `amount`; see `wrapping_dyn_shl_u32s`.
	fn wrapping_dyn_shr_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s;
1460
1461	#[inline]
1462	fn xor_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
1463		self.transmute_f32s_u32s(
1464			self.xor_u32s(self.transmute_u32s_f32s(a), self.transmute_u32s_f32s(b)),
1465		)
1466	}
1467	#[inline]
1468	fn xor_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
1469		self.transmute_f64s_u64s(
1470			self.xor_u64s(self.transmute_u64s_f64s(a), self.transmute_u64s_f64s(b)),
1471		)
1472	}
1473	#[inline]
1474	fn xor_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::i32s {
1475		self.transmute_i32s_u32s(
1476			self.xor_u32s(self.transmute_u32s_i32s(a), self.transmute_u32s_i32s(b)),
1477		)
1478	}
1479	#[inline]
1480	fn xor_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::i64s {
1481		self.transmute_i64s_u64s(
1482			self.xor_u64s(self.transmute_u64s_i64s(a), self.transmute_u64s_i64s(b)),
1483		)
1484	}
1485
	/// Bitwise XOR of each 32-bit mask lane.
	fn xor_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
	/// Bitwise XOR of each 64-bit mask lane.
	fn xor_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
	/// Bitwise XOR of each `u32` lane.
	fn xor_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
	/// Bitwise XOR of each `u64` lane.
	fn xor_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
1490}
1491
/// Marker trait for the architecture-independent SIMD tokens; implemented only for the
/// scalar emulation backends below.
pub trait PortableSimd: Simd {}

impl PortableSimd for Scalar {}
impl PortableSimd for Scalar128b {}
impl PortableSimd for Scalar256b {}
impl PortableSimd for Scalar512b {}
1498
/// Scalar (non-vectorized) SIMD token; always available on every architecture.
#[derive(Debug, Copy, Clone)]
pub struct Scalar;

/// Scalar SIMD token — the `128b` suffix presumably denotes emulation of 128-bit-wide
/// registers; NOTE(review): confirm lane counts at the `scalar_simd!` invocations.
#[derive(Debug, Copy, Clone)]
pub struct Scalar128b;
/// Scalar SIMD token, presumably emulating 256-bit-wide registers (see note above
/// `Scalar128b`).
#[derive(Debug, Copy, Clone)]
pub struct Scalar256b;
/// Scalar SIMD token, presumably emulating 512-bit-wide registers (see note above
/// `Scalar128b`).
#[derive(Debug, Copy, Clone)]
pub struct Scalar512b;
1508
1509macro_rules! scalar_simd {
1510	($ty: ty, $register_count: expr, $m32s: ty, $f32s: ty, $i32s: ty, $u32s: ty, $m64s: ty, $f64s: ty, $i64s: ty, $u64s: ty $(,)?) => {
1511		impl Seal for $ty {}
1512		impl Simd for $ty {
1513			type c32s = $f32s;
1514			type c64s = $f64s;
1515			type f32s = $f32s;
1516			type f64s = $f64s;
1517			type i32s = $i32s;
1518			type i64s = $i64s;
1519			type m32s = $m32s;
1520			type m64s = $m64s;
1521			type u32s = $u32s;
1522			type u64s = $u64s;
1523
1524			const REGISTER_COUNT: usize = $register_count;
1525
1526			#[inline]
1527			fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output {
1528				op.with_simd(self)
1529			}
1530
1531			#[inline]
1532			unsafe fn mask_load_ptr_u32s(
1533				self,
1534				mask: MemMask<Self::m32s>,
1535				ptr: *const u32,
1536			) -> Self::u32s {
1537				let mut values = [0u32; Self::F32_LANES];
1538				let mask: [m32; Self::F32_LANES] = cast(mask.mask());
1539				for i in 0..Self::F32_LANES {
1540					if mask[i].is_set() {
1541						values[i] = *ptr.add(i);
1542					}
1543				}
1544				cast(values)
1545			}
1546
1547			#[inline]
1548			unsafe fn mask_load_ptr_c32s(
1549				self,
1550				mask: MemMask<Self::m32s>,
1551				ptr: *const c32,
1552			) -> Self::c32s {
1553				cast(self.mask_load_ptr_u32s(mask, ptr as *const u32))
1554			}
1555
1556			#[inline]
1557			unsafe fn mask_store_ptr_u32s(
1558				self,
1559				mask: MemMask<Self::m32s>,
1560				ptr: *mut u32,
1561				values: Self::u32s,
1562			) {
1563				let mask: [m32; Self::F32_LANES] = cast(mask.mask());
1564				let values: [u32; Self::F32_LANES] = cast(values);
1565				for i in 0..Self::F32_LANES {
1566					if mask[i].is_set() {
1567						*ptr.add(i) = values[i];
1568					}
1569				}
1570			}
1571
1572			#[inline]
1573			unsafe fn mask_store_ptr_c32s(
1574				self,
1575				mask: MemMask<Self::m32s>,
1576				ptr: *mut c32,
1577				values: Self::c32s,
1578			) {
1579				self.mask_store_ptr_u32s(mask, ptr as *mut u32, cast(values))
1580			}
1581
1582			#[inline]
1583			unsafe fn mask_load_ptr_u64s(
1584				self,
1585				mask: MemMask<Self::m64s>,
1586				ptr: *const u64,
1587			) -> Self::u64s {
1588				let mut values = [0u64; Self::F64_LANES];
1589				let mask: [m64; Self::F64_LANES] = cast(mask.mask());
1590				for i in 0..Self::F64_LANES {
1591					if mask[i].is_set() {
1592						values[i] = *ptr.add(i);
1593					}
1594				}
1595				cast(values)
1596			}
1597
1598			#[inline]
1599			unsafe fn mask_load_ptr_c64s(
1600				self,
1601				mask: MemMask<Self::m64s>,
1602				ptr: *const c64,
1603			) -> Self::c64s {
1604				cast(self.mask_load_ptr_u64s(mask, ptr as *const u64))
1605			}
1606
1607			#[inline]
1608			unsafe fn mask_store_ptr_u64s(
1609				self,
1610				mask: MemMask<Self::m64s>,
1611				ptr: *mut u64,
1612				values: Self::u64s,
1613			) {
1614				let mask: [m64; Self::F64_LANES] = cast(mask.mask());
1615				let values: [u64; Self::F64_LANES] = cast(values);
1616				for i in 0..Self::F64_LANES {
1617					if mask[i].is_set() {
1618						*ptr.add(i) = values[i];
1619					}
1620				}
1621			}
1622
1623			#[inline]
1624			unsafe fn mask_store_ptr_c64s(
1625				self,
1626				mask: MemMask<Self::m64s>,
1627				ptr: *mut c64,
1628				values: Self::c64s,
1629			) {
1630				self.mask_store_ptr_u64s(mask, ptr as *mut u64, cast(values))
1631			}
1632
1633			#[inline]
1634			fn partial_load_u32s(self, slice: &[u32]) -> Self::u32s {
1635				let mut values = [0u32; Self::F32_LANES];
1636				for i in 0..Ord::min(values.len(), slice.len()) {
1637					values[i] = slice[i];
1638				}
1639				cast(values)
1640			}
1641
1642			#[inline]
1643			fn partial_store_u32s(self, slice: &mut [u32], values: Self::u32s) {
1644				let values: [u32; Self::F32_LANES] = cast(values);
1645				for i in 0..Ord::min(values.len(), slice.len()) {
1646					slice[i] = values[i];
1647				}
1648			}
1649
1650			#[inline]
1651			fn partial_load_u64s(self, slice: &[u64]) -> Self::u64s {
1652				let mut values = [0u64; Self::F64_LANES];
1653				for i in 0..Ord::min(values.len(), slice.len()) {
1654					values[i] = slice[i];
1655				}
1656				cast(values)
1657			}
1658
1659			#[inline]
1660			fn partial_store_u64s(self, slice: &mut [u64], values: Self::u64s) {
1661				let values: [u64; Self::F64_LANES] = cast(values);
1662				for i in 0..Ord::min(values.len(), slice.len()) {
1663					slice[i] = values[i];
1664				}
1665			}
1666
1667			#[inline]
1668			fn not_m32s(self, a: Self::m32s) -> Self::m32s {
1669				let mut out = [m32::new(false); Self::F32_LANES];
1670				let a: [m32; Self::F32_LANES] = cast(a);
1671				for i in 0..Self::F32_LANES {
1672					out[i] = !a[i];
1673				}
1674				cast(out)
1675			}
1676
1677			#[inline]
1678			fn and_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
1679				let mut out = [m32::new(false); Self::F32_LANES];
1680				let a: [m32; Self::F32_LANES] = cast(a);
1681				let b: [m32; Self::F32_LANES] = cast(b);
1682				for i in 0..Self::F32_LANES {
1683					out[i] = a[i] & b[i];
1684				}
1685				cast(out)
1686			}
1687
1688			#[inline]
1689			fn or_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
1690				let mut out = [m32::new(false); Self::F32_LANES];
1691				let a: [m32; Self::F32_LANES] = cast(a);
1692				let b: [m32; Self::F32_LANES] = cast(b);
1693				for i in 0..Self::F32_LANES {
1694					out[i] = a[i] | b[i];
1695				}
1696				cast(out)
1697			}
1698
1699			#[inline]
1700			fn xor_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
1701				let mut out = [m32::new(false); Self::F32_LANES];
1702				let a: [m32; Self::F32_LANES] = cast(a);
1703				let b: [m32; Self::F32_LANES] = cast(b);
1704				for i in 0..Self::F32_LANES {
1705					out[i] = a[i] ^ b[i];
1706				}
1707				cast(out)
1708			}
1709
1710			#[inline]
1711			fn not_m64s(self, a: Self::m64s) -> Self::m64s {
1712				let mut out = [m64::new(false); Self::F64_LANES];
1713				let a: [m64; Self::F64_LANES] = cast(a);
1714				for i in 0..Self::F64_LANES {
1715					out[i] = !a[i];
1716				}
1717				cast(out)
1718			}
1719
1720			#[inline]
1721			fn and_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
1722				let mut out = [m64::new(false); Self::F64_LANES];
1723				let a: [m64; Self::F64_LANES] = cast(a);
1724				let b: [m64; Self::F64_LANES] = cast(b);
1725				for i in 0..Self::F64_LANES {
1726					out[i] = a[i] & b[i];
1727				}
1728				cast(out)
1729			}
1730
1731			#[inline]
1732			fn or_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
1733				let mut out = [m64::new(false); Self::F64_LANES];
1734				let a: [m64; Self::F64_LANES] = cast(a);
1735				let b: [m64; Self::F64_LANES] = cast(b);
1736				for i in 0..Self::F64_LANES {
1737					out[i] = a[i] | b[i];
1738				}
1739				cast(out)
1740			}
1741
1742			#[inline]
1743			fn xor_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
1744				let mut out = [m64::new(false); Self::F64_LANES];
1745				let a: [m64; Self::F64_LANES] = cast(a);
1746				let b: [m64; Self::F64_LANES] = cast(b);
1747				for i in 0..Self::F64_LANES {
1748					out[i] = a[i] ^ b[i];
1749				}
1750				cast(out)
1751			}
1752
1753			#[inline]
1754			fn not_u32s(self, a: Self::u32s) -> Self::u32s {
1755				let mut out = [0u32; Self::F32_LANES];
1756				let a: [u32; Self::F32_LANES] = cast(a);
1757				for i in 0..Self::F32_LANES {
1758					out[i] = !a[i];
1759				}
1760				cast(out)
1761			}
1762
1763			#[inline]
1764			fn and_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
1765				let mut out = [0u32; Self::F32_LANES];
1766				let a: [u32; Self::F32_LANES] = cast(a);
1767				let b: [u32; Self::F32_LANES] = cast(b);
1768				for i in 0..Self::F32_LANES {
1769					out[i] = a[i] & b[i];
1770				}
1771				cast(out)
1772			}
1773
1774			#[inline]
1775			fn or_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
1776				let mut out = [0u32; Self::F32_LANES];
1777				let a: [u32; Self::F32_LANES] = cast(a);
1778				let b: [u32; Self::F32_LANES] = cast(b);
1779				for i in 0..Self::F32_LANES {
1780					out[i] = a[i] | b[i];
1781				}
1782				cast(out)
1783			}
1784
1785			#[inline]
1786			fn xor_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
1787				let mut out = [0u32; Self::F32_LANES];
1788				let a: [u32; Self::F32_LANES] = cast(a);
1789				let b: [u32; Self::F32_LANES] = cast(b);
1790				for i in 0..Self::F32_LANES {
1791					out[i] = a[i] ^ b[i];
1792				}
1793				cast(out)
1794			}
1795
1796			#[inline]
1797			fn not_u64s(self, a: Self::u64s) -> Self::u64s {
1798				let mut out = [0u64; Self::F64_LANES];
1799				let a: [u64; Self::F64_LANES] = cast(a);
1800				for i in 0..Self::F64_LANES {
1801					out[i] = !a[i];
1802				}
1803				cast(out)
1804			}
1805
1806			#[inline]
1807			fn and_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
1808				let mut out = [0u64; Self::F64_LANES];
1809				let a: [u64; Self::F64_LANES] = cast(a);
1810				let b: [u64; Self::F64_LANES] = cast(b);
1811				for i in 0..Self::F64_LANES {
1812					out[i] = a[i] & b[i];
1813				}
1814				cast(out)
1815			}
1816
1817			#[inline]
1818			fn or_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
1819				let mut out = [0u64; Self::F64_LANES];
1820				let a: [u64; Self::F64_LANES] = cast(a);
1821				let b: [u64; Self::F64_LANES] = cast(b);
1822				for i in 0..Self::F64_LANES {
1823					out[i] = a[i] | b[i];
1824				}
1825				cast(out)
1826			}
1827
1828			#[inline]
1829			fn xor_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
1830				let mut out = [0u64; Self::F64_LANES];
1831				let a: [u64; Self::F64_LANES] = cast(a);
1832				let b: [u64; Self::F64_LANES] = cast(b);
1833				for i in 0..Self::F64_LANES {
1834					out[i] = a[i] ^ b[i];
1835				}
1836				cast(out)
1837			}
1838
1839			#[inline]
1840			fn select_u32s_m32s(
1841				self,
1842				mask: Self::m32s,
1843				if_true: Self::u32s,
1844				if_false: Self::u32s,
1845			) -> Self::u32s {
1846				let mut out = [0u32; Self::F32_LANES];
1847				let mask: [m32; Self::F32_LANES] = cast(mask);
1848				let if_true: [u32; Self::F32_LANES] = cast(if_true);
1849				let if_false: [u32; Self::F32_LANES] = cast(if_false);
1850
1851				for i in 0..Self::F32_LANES {
1852					out[i] = if mask[i].is_set() {
1853						if_true[i]
1854					} else {
1855						if_false[i]
1856					};
1857				}
1858
1859				cast(out)
1860			}
1861
1862			#[inline]
1863			fn select_u64s_m64s(
1864				self,
1865				mask: Self::m64s,
1866				if_true: Self::u64s,
1867				if_false: Self::u64s,
1868			) -> Self::u64s {
1869				let mut out = [0u64; Self::F64_LANES];
1870				let mask: [m64; Self::F64_LANES] = cast(mask);
1871				let if_true: [u64; Self::F64_LANES] = cast(if_true);
1872				let if_false: [u64; Self::F64_LANES] = cast(if_false);
1873
1874				for i in 0..Self::F64_LANES {
1875					out[i] = if mask[i].is_set() {
1876						if_true[i]
1877					} else {
1878						if_false[i]
1879					};
1880				}
1881
1882				cast(out)
1883			}
1884
			// --- 32-bit integer lanes ---
			// `Self::F32_LANES` doubles as the u32/i32 lane count: the integer
			// lanes are the same width as the f32 lanes.

			// Broadcast `value` to every 32-bit lane.
			#[inline]
			fn splat_u32s(self, value: u32) -> Self::u32s {
				cast([value; Self::F32_LANES])
			}

			// Lanewise u32 addition; wraps on overflow, matching hardware SIMD semantics.
			#[inline]
			fn add_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
				let mut out = [0u32; Self::F32_LANES];
				let a: [u32; Self::F32_LANES] = cast(a);
				let b: [u32; Self::F32_LANES] = cast(b);
				for i in 0..Self::F32_LANES {
					out[i] = a[i].wrapping_add(b[i]);
				}
				cast(out)
			}

			// Lanewise u32 subtraction; wraps on underflow.
			#[inline]
			fn sub_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
				let mut out = [0u32; Self::F32_LANES];
				let a: [u32; Self::F32_LANES] = cast(a);
				let b: [u32; Self::F32_LANES] = cast(b);
				for i in 0..Self::F32_LANES {
					out[i] = a[i].wrapping_sub(b[i]);
				}
				cast(out)
			}

			// Lanewise unsigned `<`, producing a 32-bit mask per lane.
			#[inline]
			fn less_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
				let mut out = [m32::new(false); Self::F32_LANES];
				let a: [u32; Self::F32_LANES] = cast(a);
				let b: [u32; Self::F32_LANES] = cast(b);
				for i in 0..Self::F32_LANES {
					out[i] = m32::new(a[i] < b[i]);
				}
				cast(out)
			}

			// Lanewise unsigned `>`.
			#[inline]
			fn greater_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
				let mut out = [m32::new(false); Self::F32_LANES];
				let a: [u32; Self::F32_LANES] = cast(a);
				let b: [u32; Self::F32_LANES] = cast(b);
				for i in 0..Self::F32_LANES {
					out[i] = m32::new(a[i] > b[i]);
				}
				cast(out)
			}

			// Lanewise unsigned `<=`.
			#[inline]
			fn less_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
				let mut out = [m32::new(false); Self::F32_LANES];
				let a: [u32; Self::F32_LANES] = cast(a);
				let b: [u32; Self::F32_LANES] = cast(b);
				for i in 0..Self::F32_LANES {
					out[i] = m32::new(a[i] <= b[i]);
				}
				cast(out)
			}

			// Lanewise unsigned `>=`.
			#[inline]
			fn greater_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
				let mut out = [m32::new(false); Self::F32_LANES];
				let a: [u32; Self::F32_LANES] = cast(a);
				let b: [u32; Self::F32_LANES] = cast(b);
				for i in 0..Self::F32_LANES {
					out[i] = m32::new(a[i] >= b[i]);
				}
				cast(out)
			}

			// Lanewise signed `<`.
			#[inline]
			fn less_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
				let mut out = [m32::new(false); Self::F32_LANES];
				let a: [i32; Self::F32_LANES] = cast(a);
				let b: [i32; Self::F32_LANES] = cast(b);
				for i in 0..Self::F32_LANES {
					out[i] = m32::new(a[i] < b[i]);
				}
				cast(out)
			}

			// Lanewise signed `>`.
			#[inline]
			fn greater_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
				let mut out = [m32::new(false); Self::F32_LANES];
				let a: [i32; Self::F32_LANES] = cast(a);
				let b: [i32; Self::F32_LANES] = cast(b);
				for i in 0..Self::F32_LANES {
					out[i] = m32::new(a[i] > b[i]);
				}
				cast(out)
			}

			// Lanewise signed `<=`.
			#[inline]
			fn less_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
				let mut out = [m32::new(false); Self::F32_LANES];
				let a: [i32; Self::F32_LANES] = cast(a);
				let b: [i32; Self::F32_LANES] = cast(b);
				for i in 0..Self::F32_LANES {
					out[i] = m32::new(a[i] <= b[i]);
				}
				cast(out)
			}

			// Lanewise signed `>=`.
			#[inline]
			fn greater_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
				let mut out = [m32::new(false); Self::F32_LANES];
				let a: [i32; Self::F32_LANES] = cast(a);
				let b: [i32; Self::F32_LANES] = cast(b);
				for i in 0..Self::F32_LANES {
					out[i] = m32::new(a[i] >= b[i]);
				}
				cast(out)
			}

			// Lanewise left shift with a per-lane amount; `wrapping_shl` masks the
			// amount mod 32 instead of panicking on oversized shifts.
			#[inline]
			fn wrapping_dyn_shl_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
				let mut out = [0u32; Self::F32_LANES];
				let a: [u32; Self::F32_LANES] = cast(a);
				let b: [u32; Self::F32_LANES] = cast(amount);
				for i in 0..Self::F32_LANES {
					out[i] = a[i].wrapping_shl(b[i]);
				}
				cast(out)
			}

			// Lanewise logical right shift; amount is masked mod 32 (see above).
			#[inline]
			fn wrapping_dyn_shr_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
				let mut out = [0u32; Self::F32_LANES];
				let a: [u32; Self::F32_LANES] = cast(a);
				let b: [u32; Self::F32_LANES] = cast(amount);
				for i in 0..Self::F32_LANES {
					out[i] = a[i].wrapping_shr(b[i]);
				}
				cast(out)
			}

			// Lanewise full 32x32 -> 64-bit product, returned as (low halves, high halves).
			#[inline]
			fn widening_mul_u32s(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s) {
				let mut lo = [0u32; Self::F32_LANES];
				let mut hi = [0u32; Self::F32_LANES];
				let a: [u32; Self::F32_LANES] = cast(a);
				let b: [u32; Self::F32_LANES] = cast(b);
				for i in 0..Self::F32_LANES {
					// u64 product cannot overflow: both operands fit in 32 bits.
					let m = a[i] as u64 * b[i] as u64;

					(lo[i], hi[i]) = (m as u32, (m >> 32) as u32);
				}
				(cast(lo), cast(hi))
			}
2035
			// --- 64-bit integer lanes ---
			// `Self::F64_LANES` doubles as the u64/i64 lane count.

			// Broadcast `value` to every 64-bit lane.
			#[inline]
			fn splat_u64s(self, value: u64) -> Self::u64s {
				cast([value; Self::F64_LANES])
			}

			// Lanewise u64 addition; wraps on overflow.
			#[inline]
			fn add_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
				let mut out = [0u64; Self::F64_LANES];
				let a: [u64; Self::F64_LANES] = cast(a);
				let b: [u64; Self::F64_LANES] = cast(b);
				for i in 0..Self::F64_LANES {
					out[i] = a[i].wrapping_add(b[i]);
				}
				cast(out)
			}

			// Lanewise u64 subtraction; wraps on underflow.
			#[inline]
			fn sub_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
				let mut out = [0u64; Self::F64_LANES];
				let a: [u64; Self::F64_LANES] = cast(a);
				let b: [u64; Self::F64_LANES] = cast(b);
				for i in 0..Self::F64_LANES {
					out[i] = a[i].wrapping_sub(b[i]);
				}
				cast(out)
			}

			// Lanewise unsigned `<`, producing a 64-bit mask per lane.
			#[inline]
			fn less_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
				let mut out = [m64::new(false); Self::F64_LANES];
				let a: [u64; Self::F64_LANES] = cast(a);
				let b: [u64; Self::F64_LANES] = cast(b);
				for i in 0..Self::F64_LANES {
					out[i] = m64::new(a[i] < b[i]);
				}
				cast(out)
			}

			// Lanewise unsigned `>`.
			#[inline]
			fn greater_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
				let mut out = [m64::new(false); Self::F64_LANES];
				let a: [u64; Self::F64_LANES] = cast(a);
				let b: [u64; Self::F64_LANES] = cast(b);
				for i in 0..Self::F64_LANES {
					out[i] = m64::new(a[i] > b[i]);
				}
				cast(out)
			}

			// Lanewise unsigned `<=`.
			#[inline]
			fn less_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
				let mut out = [m64::new(false); Self::F64_LANES];
				let a: [u64; Self::F64_LANES] = cast(a);
				let b: [u64; Self::F64_LANES] = cast(b);
				for i in 0..Self::F64_LANES {
					out[i] = m64::new(a[i] <= b[i]);
				}
				cast(out)
			}

			// Lanewise unsigned `>=`.
			#[inline]
			fn greater_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
				let mut out = [m64::new(false); Self::F64_LANES];
				let a: [u64; Self::F64_LANES] = cast(a);
				let b: [u64; Self::F64_LANES] = cast(b);
				for i in 0..Self::F64_LANES {
					out[i] = m64::new(a[i] >= b[i]);
				}
				cast(out)
			}

			// Lanewise signed `<`.
			#[inline]
			fn less_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
				let mut out = [m64::new(false); Self::F64_LANES];
				let a: [i64; Self::F64_LANES] = cast(a);
				let b: [i64; Self::F64_LANES] = cast(b);
				for i in 0..Self::F64_LANES {
					out[i] = m64::new(a[i] < b[i]);
				}
				cast(out)
			}

			// Lanewise signed `>`.
			#[inline]
			fn greater_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
				let mut out = [m64::new(false); Self::F64_LANES];
				let a: [i64; Self::F64_LANES] = cast(a);
				let b: [i64; Self::F64_LANES] = cast(b);
				for i in 0..Self::F64_LANES {
					out[i] = m64::new(a[i] > b[i]);
				}
				cast(out)
			}

			// Lanewise signed `<=`.
			#[inline]
			fn less_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
				let mut out = [m64::new(false); Self::F64_LANES];
				let a: [i64; Self::F64_LANES] = cast(a);
				let b: [i64; Self::F64_LANES] = cast(b);
				for i in 0..Self::F64_LANES {
					out[i] = m64::new(a[i] <= b[i]);
				}
				cast(out)
			}

			// Lanewise signed `>=`.
			#[inline]
			fn greater_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
				let mut out = [m64::new(false); Self::F64_LANES];
				let a: [i64; Self::F64_LANES] = cast(a);
				let b: [i64; Self::F64_LANES] = cast(b);
				for i in 0..Self::F64_LANES {
					out[i] = m64::new(a[i] >= b[i]);
				}
				cast(out)
			}
2150
			// --- f32 lanes ---

			// Broadcast `value` to every f32 lane.
			#[inline]
			fn splat_f32s(self, value: f32) -> Self::f32s {
				cast([value; Self::F32_LANES])
			}

			// Lanewise f32 addition.
			#[inline]
			fn add_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
				let mut out = [0.0f32; Self::F32_LANES];
				let a: [f32; Self::F32_LANES] = cast(a);
				let b: [f32; Self::F32_LANES] = cast(b);

				for i in 0..Self::F32_LANES {
					out[i] = a[i] + b[i];
				}

				cast(out)
			}

			// Lanewise f32 subtraction.
			#[inline]
			fn sub_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
				let mut out = [0.0f32; Self::F32_LANES];
				let a: [f32; Self::F32_LANES] = cast(a);
				let b: [f32; Self::F32_LANES] = cast(b);

				for i in 0..Self::F32_LANES {
					out[i] = a[i] - b[i];
				}

				cast(out)
			}

			// Lanewise f32 multiplication.
			#[inline]
			fn mul_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
				let mut out = [0.0f32; Self::F32_LANES];
				let a: [f32; Self::F32_LANES] = cast(a);
				let b: [f32; Self::F32_LANES] = cast(b);

				for i in 0..Self::F32_LANES {
					out[i] = a[i] * b[i];
				}

				cast(out)
			}

			// Lanewise f32 division.
			#[inline]
			fn div_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
				let mut out = [0.0f32; Self::F32_LANES];
				let a: [f32; Self::F32_LANES] = cast(a);
				let b: [f32; Self::F32_LANES] = cast(b);

				for i in 0..Self::F32_LANES {
					out[i] = a[i] / b[i];
				}

				cast(out)
			}

			// Lanewise fused multiply-add: a * b + c with a single rounding
			// (via the crate's `fma_f32` helper).
			#[inline]
			fn mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
				let mut out = [0.0f32; Self::F32_LANES];
				let a: [f32; Self::F32_LANES] = cast(a);
				let b: [f32; Self::F32_LANES] = cast(b);
				let c: [f32; Self::F32_LANES] = cast(c);

				for i in 0..Self::F32_LANES {
					out[i] = fma_f32(a[i], b[i], c[i]);
				}

				cast(out)
			}

			// Lanewise `==`; IEEE semantics, so NaN lanes compare false.
			#[inline]
			fn equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
				let mut out = [m32::new(false); Self::F32_LANES];
				let a: [f32; Self::F32_LANES] = cast(a);
				let b: [f32; Self::F32_LANES] = cast(b);

				for i in 0..Self::F32_LANES {
					out[i] = m32::new(a[i] == b[i]);
				}

				cast(out)
			}

			// Lanewise `<`; NaN lanes compare false.
			#[inline]
			fn less_than_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
				let mut out = [m32::new(false); Self::F32_LANES];
				let a: [f32; Self::F32_LANES] = cast(a);
				let b: [f32; Self::F32_LANES] = cast(b);

				for i in 0..Self::F32_LANES {
					out[i] = m32::new(a[i] < b[i]);
				}

				cast(out)
			}

			// Lanewise `<=`; NaN lanes compare false.
			#[inline]
			fn less_than_or_equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
				let mut out = [m32::new(false); Self::F32_LANES];
				let a: [f32; Self::F32_LANES] = cast(a);
				let b: [f32; Self::F32_LANES] = cast(b);

				for i in 0..Self::F32_LANES {
					out[i] = m32::new(a[i] <= b[i]);
				}

				cast(out)
			}

			// Lanewise min. `f32::min` returns the non-NaN operand when exactly
			// one input is NaN (cf. the NaN-propagation FIXME at the top of the file).
			#[inline]
			fn min_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
				let mut out = [0.0f32; Self::F32_LANES];
				let a: [f32; Self::F32_LANES] = cast(a);
				let b: [f32; Self::F32_LANES] = cast(b);

				for i in 0..Self::F32_LANES {
					out[i] = f32::min(a[i], b[i]);
				}

				cast(out)
			}

			// Lanewise max; same NaN behavior as `min_f32s` above.
			#[inline]
			fn max_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
				let mut out = [0.0f32; Self::F32_LANES];
				let a: [f32; Self::F32_LANES] = cast(a);
				let b: [f32; Self::F32_LANES] = cast(b);

				for i in 0..Self::F32_LANES {
					out[i] = f32::max(a[i], b[i]);
				}

				cast(out)
			}

			// Horizontal sum via pairwise (tree) reduction: halves the active
			// length each pass. Relies on the lane count being a power of two
			// (all instantiations use x4/x8/x16). Note the rounding differs from
			// a left-to-right sequential sum.
			#[inline]
			fn reduce_sum_f32s(self, a: Self::f32s) -> f32 {
				let mut a: [f32; Self::F32_LANES] = cast(a);

				let mut n = Self::F32_LANES;
				while n > 1 {
					n /= 2;
					for i in 0..n {
						a[i] += a[i + n];
					}
				}

				a[0]
			}

			// Horizontal product, same pairwise scheme as `reduce_sum_f32s`.
			#[inline]
			fn reduce_product_f32s(self, a: Self::f32s) -> f32 {
				let mut a: [f32; Self::F32_LANES] = cast(a);

				let mut n = Self::F32_LANES;
				while n > 1 {
					n /= 2;
					for i in 0..n {
						a[i] *= a[i + n];
					}
				}

				a[0]
			}

			// Horizontal min, pairwise; inherits `f32::min`'s NaN behavior.
			#[inline]
			fn reduce_min_f32s(self, a: Self::f32s) -> f32 {
				let mut a: [f32; Self::F32_LANES] = cast(a);

				let mut n = Self::F32_LANES;
				while n > 1 {
					n /= 2;
					for i in 0..n {
						a[i] = f32::min(a[i], a[i + n]);
					}
				}

				a[0]
			}

			// Horizontal max, pairwise; inherits `f32::max`'s NaN behavior.
			#[inline]
			fn reduce_max_f32s(self, a: Self::f32s) -> f32 {
				let mut a: [f32; Self::F32_LANES] = cast(a);

				let mut n = Self::F32_LANES;
				while n > 1 {
					n /= 2;
					for i in 0..n {
						a[i] = f32::max(a[i], a[i + n]);
					}
				}

				a[0]
			}
2346
			// --- c32 (complex f32) lanes ---

			// Broadcast a complex value to every c32 lane.
			#[inline]
			fn splat_c32s(self, value: c32) -> Self::c32s {
				cast([value; Self::C32_LANES])
			}

			// Lanewise complex conjugate: (re, im) -> (re, -im).
			#[inline]
			fn conj_c32s(self, a: Self::c32s) -> Self::c32s {
				let mut out = [c32::ZERO; Self::C32_LANES];
				let a: [c32; Self::C32_LANES] = cast(a);

				for i in 0..Self::C32_LANES {
					out[i] = c32::new(a[i].re, -a[i].im);
				}

				cast(out)
			}

			// Lanewise negation of both components.
			#[inline]
			fn neg_c32s(self, a: Self::c32s) -> Self::c32s {
				let mut out = [c32::ZERO; Self::C32_LANES];
				let a: [c32; Self::C32_LANES] = cast(a);

				for i in 0..Self::C32_LANES {
					out[i] = c32::new(-a[i].re, -a[i].im);
				}

				cast(out)
			}

			// Lanewise swap of real and imaginary parts.
			#[inline]
			fn swap_re_im_c32s(self, a: Self::c32s) -> Self::c32s {
				let mut out = [c32::ZERO; Self::C32_LANES];
				let a: [c32; Self::C32_LANES] = cast(a);

				for i in 0..Self::C32_LANES {
					out[i] = c32::new(a[i].im, a[i].re);
				}

				cast(out)
			}

			// Lanewise complex addition.
			#[inline]
			fn add_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
				let mut out = [c32::ZERO; Self::C32_LANES];
				let a: [c32; Self::C32_LANES] = cast(a);
				let b: [c32; Self::C32_LANES] = cast(b);

				for i in 0..Self::C32_LANES {
					out[i] = c32::new(a[i].re + b[i].re, a[i].im + b[i].im);
				}

				cast(out)
			}

			// Lanewise complex subtraction.
			#[inline]
			fn sub_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
				let mut out = [c32::ZERO; Self::C32_LANES];
				let a: [c32; Self::C32_LANES] = cast(a);
				let b: [c32; Self::C32_LANES] = cast(b);

				for i in 0..Self::C32_LANES {
					out[i] = c32::new(a[i].re - b[i].re, a[i].im - b[i].im);
				}

				cast(out)
			}

			// Lanewise complex multiply:
			// re = a.re*b.re - a.im*b.im, im = a.re*b.im + a.im*b.re,
			// with the outer product fused via `fma_f32`.
			#[inline]
			fn mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
				let mut out = [c32::ZERO; Self::C32_LANES];
				let a: [c32; Self::C32_LANES] = cast(a);
				let b: [c32; Self::C32_LANES] = cast(b);

				for i in 0..Self::C32_LANES {
					out[i].re = fma_f32(a[i].re, b[i].re, -(a[i].im * b[i].im));
					out[i].im = fma_f32(a[i].re, b[i].im, a[i].im * b[i].re);
				}

				cast(out)
			}

			// Lanewise conj(a) * b:
			// re = a.re*b.re + a.im*b.im, im = a.re*b.im - a.im*b.re.
			#[inline]
			fn conj_mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
				let mut out = [c32::ZERO; Self::C32_LANES];
				let a: [c32; Self::C32_LANES] = cast(a);
				let b: [c32; Self::C32_LANES] = cast(b);

				for i in 0..Self::C32_LANES {
					out[i].re = fma_f32(a[i].re, b[i].re, a[i].im * b[i].im);
					out[i].im = fma_f32(a[i].re, b[i].im, -(a[i].im * b[i].re));
				}

				cast(out)
			}

			// Lanewise a * b + c, expressed as nested FMAs:
			// re = a.re*b.re - (a.im*b.im - c.re), im = a.re*b.im + (a.im*b.re + c.im).
			#[inline]
			fn mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
				let mut out = [c32::ZERO; Self::C32_LANES];
				let a: [c32; Self::C32_LANES] = cast(a);
				let b: [c32; Self::C32_LANES] = cast(b);
				let c: [c32; Self::C32_LANES] = cast(c);

				for i in 0..Self::C32_LANES {
					out[i].re = fma_f32(a[i].re, b[i].re, -fma_f32(a[i].im, b[i].im, -c[i].re));
					out[i].im = fma_f32(a[i].re, b[i].im, fma_f32(a[i].im, b[i].re, c[i].im));
				}

				cast(out)
			}

			// Lanewise conj(a) * b + c, nested FMAs:
			// re = a.re*b.re + a.im*b.im + c.re, im = a.re*b.im - (a.im*b.re - c.im).
			#[inline]
			fn conj_mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
				let mut out = [c32::ZERO; Self::C32_LANES];
				let a: [c32; Self::C32_LANES] = cast(a);
				let b: [c32; Self::C32_LANES] = cast(b);
				let c: [c32; Self::C32_LANES] = cast(c);

				for i in 0..Self::C32_LANES {
					out[i].re = fma_f32(a[i].re, b[i].re, fma_f32(a[i].im, b[i].im, c[i].re));
					out[i].im = fma_f32(a[i].re, b[i].im, -fma_f32(a[i].im, b[i].re, -c[i].im));
				}

				cast(out)
			}

			// Lanewise squared modulus |a|^2 = re^2 + im^2, broadcast to both
			// components of the output lane.
			#[inline]
			fn abs2_c32s(self, a: Self::c32s) -> Self::c32s {
				let mut out = [c32::ZERO; Self::C32_LANES];
				let a: [c32; Self::C32_LANES] = cast(a);

				for i in 0..Self::C32_LANES {
					let x = a[i].re * a[i].re + a[i].im * a[i].im;
					out[i].re = x;
					out[i].im = x;
				}

				cast(out)
			}

			// Lanewise max(|re|, |im|), broadcast to both components.
			// Note the absolute values are taken first via `abs_f32s` over the
			// f32 view of the vector.
			#[inline]
			fn abs_max_c32s(self, a: Self::c32s) -> Self::c32s {
				let mut out = [c32::ZERO; Self::C32_LANES];
				let a: [c32; Self::C32_LANES] = cast(self.abs_f32s(a));

				for i in 0..Self::C32_LANES {
					let x = f32::max(a[i].re, a[i].im);
					out[i].re = x;
					out[i].im = x;
				}

				cast(out)
			}

			// Horizontal complex sum; componentwise pairwise reduction.
			#[inline]
			fn reduce_sum_c32s(self, a: Self::c32s) -> c32 {
				let mut a: [c32; Self::C32_LANES] = cast(a);

				let mut n = Self::C32_LANES;
				while n > 1 {
					n /= 2;
					for i in 0..n {
						a[i].re += a[i + n].re;
						a[i].im += a[i + n].im;
					}
				}

				a[0]
			}

			// Horizontal min taken independently on the real parts and on the
			// imaginary parts (componentwise, not any complex ordering).
			#[inline]
			fn reduce_min_c32s(self, a: Self::c32s) -> c32 {
				let mut a: [c32; Self::C32_LANES] = cast(a);

				let mut n = Self::C32_LANES;
				while n > 1 {
					n /= 2;
					for i in 0..n {
						a[i].re = f32::min(a[i].re, a[i + n].re);
						a[i].im = f32::min(a[i].im, a[i + n].im);
					}
				}

				a[0]
			}

			// Horizontal max, componentwise (see `reduce_min_c32s`).
			#[inline]
			fn reduce_max_c32s(self, a: Self::c32s) -> c32 {
				let mut a: [c32; Self::C32_LANES] = cast(a);

				let mut n = Self::C32_LANES;
				while n > 1 {
					n /= 2;
					for i in 0..n {
						a[i].re = f32::max(a[i].re, a[i + n].re);
						a[i].im = f32::max(a[i].im, a[i + n].im);
					}
				}

				a[0]
			}

			// Rotate the u32 lanes right by `amount` positions (mod lane count).
			#[inline]
			fn rotate_right_u32s(self, a: Self::u32s, amount: usize) -> Self::u32s {
				let mut a: [u32; Self::F32_LANES] = cast(a);
				let amount = amount % Self::F32_LANES;
				a.rotate_right(amount);
				cast(a)
			}

			// Rotate the c32 lanes right by `amount` positions (mod lane count);
			// whole complex elements move together.
			#[inline]
			fn rotate_right_c32s(self, a: Self::c32s, amount: usize) -> Self::c32s {
				let mut a: [c32; Self::C32_LANES] = cast(a);
				let amount = amount % Self::C32_LANES;
				a.rotate_right(amount);
				cast(a)
			}
2563
			// --- f64 lanes ---

			// Broadcast `value` to every f64 lane.
			#[inline]
			fn splat_f64s(self, value: f64) -> Self::f64s {
				cast([value; Self::F64_LANES])
			}

			// Lanewise f64 addition.
			#[inline]
			fn add_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
				let mut out = [0.0f64; Self::F64_LANES];
				let a: [f64; Self::F64_LANES] = cast(a);
				let b: [f64; Self::F64_LANES] = cast(b);

				for i in 0..Self::F64_LANES {
					out[i] = a[i] + b[i];
				}

				cast(out)
			}

			// Lanewise f64 subtraction.
			#[inline]
			fn sub_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
				let mut out = [0.0f64; Self::F64_LANES];
				let a: [f64; Self::F64_LANES] = cast(a);
				let b: [f64; Self::F64_LANES] = cast(b);

				for i in 0..Self::F64_LANES {
					out[i] = a[i] - b[i];
				}

				cast(out)
			}

			// Lanewise f64 multiplication.
			#[inline]
			fn mul_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
				let mut out = [0.0f64; Self::F64_LANES];
				let a: [f64; Self::F64_LANES] = cast(a);
				let b: [f64; Self::F64_LANES] = cast(b);

				for i in 0..Self::F64_LANES {
					out[i] = a[i] * b[i];
				}

				cast(out)
			}

			// Lanewise f64 division.
			#[inline]
			fn div_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
				let mut out = [0.0f64; Self::F64_LANES];
				let a: [f64; Self::F64_LANES] = cast(a);
				let b: [f64; Self::F64_LANES] = cast(b);

				for i in 0..Self::F64_LANES {
					out[i] = a[i] / b[i];
				}

				cast(out)
			}

			// Lanewise fused multiply-add: a * b + c with a single rounding.
			#[inline]
			fn mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
				let mut out = [0.0f64; Self::F64_LANES];
				let a: [f64; Self::F64_LANES] = cast(a);
				let b: [f64; Self::F64_LANES] = cast(b);
				let c: [f64; Self::F64_LANES] = cast(c);

				for i in 0..Self::F64_LANES {
					out[i] = fma_f64(a[i], b[i], c[i]);
				}

				cast(out)
			}

			// Lanewise `==`; NaN lanes compare false.
			#[inline]
			fn equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
				let mut out = [m64::new(false); Self::F64_LANES];
				let a: [f64; Self::F64_LANES] = cast(a);
				let b: [f64; Self::F64_LANES] = cast(b);

				for i in 0..Self::F64_LANES {
					out[i] = m64::new(a[i] == b[i]);
				}

				cast(out)
			}

			// Lanewise `<`; NaN lanes compare false.
			#[inline]
			fn less_than_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
				let mut out = [m64::new(false); Self::F64_LANES];
				let a: [f64; Self::F64_LANES] = cast(a);
				let b: [f64; Self::F64_LANES] = cast(b);

				for i in 0..Self::F64_LANES {
					out[i] = m64::new(a[i] < b[i]);
				}

				cast(out)
			}

			// Lanewise `<=`; NaN lanes compare false.
			#[inline]
			fn less_than_or_equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
				let mut out = [m64::new(false); Self::F64_LANES];
				let a: [f64; Self::F64_LANES] = cast(a);
				let b: [f64; Self::F64_LANES] = cast(b);

				for i in 0..Self::F64_LANES {
					out[i] = m64::new(a[i] <= b[i]);
				}

				cast(out)
			}

			// Lanewise min; `f64::min` returns the non-NaN operand when exactly
			// one input is NaN.
			#[inline]
			fn min_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
				let mut out = [0.0f64; Self::F64_LANES];
				let a: [f64; Self::F64_LANES] = cast(a);
				let b: [f64; Self::F64_LANES] = cast(b);

				for i in 0..Self::F64_LANES {
					out[i] = f64::min(a[i], b[i]);
				}

				cast(out)
			}

			// Lanewise max; same NaN behavior as `min_f64s`.
			#[inline]
			fn max_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
				let mut out = [0.0f64; Self::F64_LANES];
				let a: [f64; Self::F64_LANES] = cast(a);
				let b: [f64; Self::F64_LANES] = cast(b);

				for i in 0..Self::F64_LANES {
					out[i] = f64::max(a[i], b[i]);
				}

				cast(out)
			}

			// Horizontal sum via pairwise (tree) reduction; assumes a
			// power-of-two lane count (all instantiations use x2/x4/x8).
			#[inline]
			fn reduce_sum_f64s(self, a: Self::f64s) -> f64 {
				let mut a: [f64; Self::F64_LANES] = cast(a);

				let mut n = Self::F64_LANES;
				while n > 1 {
					n /= 2;
					for i in 0..n {
						a[i] += a[i + n];
					}
				}

				a[0]
			}

			// Horizontal product, same pairwise scheme.
			#[inline]
			fn reduce_product_f64s(self, a: Self::f64s) -> f64 {
				let mut a: [f64; Self::F64_LANES] = cast(a);

				let mut n = Self::F64_LANES;
				while n > 1 {
					n /= 2;
					for i in 0..n {
						a[i] *= a[i + n];
					}
				}

				a[0]
			}

			// Horizontal min, pairwise; inherits `f64::min`'s NaN behavior.
			#[inline]
			fn reduce_min_f64s(self, a: Self::f64s) -> f64 {
				let mut a: [f64; Self::F64_LANES] = cast(a);

				let mut n = Self::F64_LANES;
				while n > 1 {
					n /= 2;
					for i in 0..n {
						a[i] = f64::min(a[i], a[i + n]);
					}
				}

				a[0]
			}

			// Horizontal max, pairwise; inherits `f64::max`'s NaN behavior.
			#[inline]
			fn reduce_max_f64s(self, a: Self::f64s) -> f64 {
				let mut a: [f64; Self::F64_LANES] = cast(a);

				let mut n = Self::F64_LANES;
				while n > 1 {
					n /= 2;
					for i in 0..n {
						a[i] = f64::max(a[i], a[i + n]);
					}
				}

				a[0]
			}
2759
			// --- c64 (complex f64) lanes ---
			// Mirrors the c32 implementations above, in f64 precision.

			// Broadcast a complex value to every c64 lane.
			#[inline]
			fn splat_c64s(self, value: c64) -> Self::c64s {
				cast([value; Self::C64_LANES])
			}

			// Lanewise complex conjugate: (re, im) -> (re, -im).
			#[inline]
			fn conj_c64s(self, a: Self::c64s) -> Self::c64s {
				let mut out = [c64::ZERO; Self::C64_LANES];
				let a: [c64; Self::C64_LANES] = cast(a);

				for i in 0..Self::C64_LANES {
					out[i] = c64::new(a[i].re, -a[i].im);
				}

				cast(out)
			}

			// Lanewise negation of both components.
			#[inline]
			fn neg_c64s(self, a: Self::c64s) -> Self::c64s {
				let mut out = [c64::ZERO; Self::C64_LANES];
				let a: [c64; Self::C64_LANES] = cast(a);

				for i in 0..Self::C64_LANES {
					out[i] = c64::new(-a[i].re, -a[i].im);
				}

				cast(out)
			}

			// Lanewise swap of real and imaginary parts.
			#[inline]
			fn swap_re_im_c64s(self, a: Self::c64s) -> Self::c64s {
				let mut out = [c64::ZERO; Self::C64_LANES];
				let a: [c64; Self::C64_LANES] = cast(a);

				for i in 0..Self::C64_LANES {
					out[i] = c64::new(a[i].im, a[i].re);
				}

				cast(out)
			}

			// Lanewise complex addition.
			#[inline]
			fn add_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
				let mut out = [c64::ZERO; Self::C64_LANES];
				let a: [c64; Self::C64_LANES] = cast(a);
				let b: [c64; Self::C64_LANES] = cast(b);

				for i in 0..Self::C64_LANES {
					out[i] = c64::new(a[i].re + b[i].re, a[i].im + b[i].im);
				}

				cast(out)
			}

			// Lanewise complex subtraction.
			#[inline]
			fn sub_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
				let mut out = [c64::ZERO; Self::C64_LANES];
				let a: [c64; Self::C64_LANES] = cast(a);
				let b: [c64; Self::C64_LANES] = cast(b);

				for i in 0..Self::C64_LANES {
					out[i] = c64::new(a[i].re - b[i].re, a[i].im - b[i].im);
				}

				cast(out)
			}

			// Lanewise complex multiply, fused via `fma_f64`:
			// re = a.re*b.re - a.im*b.im, im = a.re*b.im + a.im*b.re.
			#[inline]
			fn mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
				let mut out = [c64::ZERO; Self::C64_LANES];
				let a: [c64; Self::C64_LANES] = cast(a);
				let b: [c64; Self::C64_LANES] = cast(b);

				for i in 0..Self::C64_LANES {
					out[i].re = fma_f64(a[i].re, b[i].re, -(a[i].im * b[i].im));
					out[i].im = fma_f64(a[i].re, b[i].im, a[i].im * b[i].re);
				}

				cast(out)
			}

			// Lanewise conj(a) * b:
			// re = a.re*b.re + a.im*b.im, im = a.re*b.im - a.im*b.re.
			#[inline]
			fn conj_mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
				let mut out = [c64::ZERO; Self::C64_LANES];
				let a: [c64; Self::C64_LANES] = cast(a);
				let b: [c64; Self::C64_LANES] = cast(b);

				for i in 0..Self::C64_LANES {
					out[i].re = fma_f64(a[i].re, b[i].re, a[i].im * b[i].im);
					out[i].im = fma_f64(a[i].re, b[i].im, -(a[i].im * b[i].re));
				}

				cast(out)
			}

			// Lanewise a * b + c as nested FMAs.
			#[inline]
			fn mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
				let mut out = [c64::ZERO; Self::C64_LANES];
				let a: [c64; Self::C64_LANES] = cast(a);
				let b: [c64; Self::C64_LANES] = cast(b);
				let c: [c64; Self::C64_LANES] = cast(c);

				for i in 0..Self::C64_LANES {
					out[i].re = fma_f64(a[i].re, b[i].re, -fma_f64(a[i].im, b[i].im, -c[i].re));
					out[i].im = fma_f64(a[i].re, b[i].im, fma_f64(a[i].im, b[i].re, c[i].im));
				}

				cast(out)
			}

			// Lanewise conj(a) * b + c as nested FMAs.
			#[inline]
			fn conj_mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
				let mut out = [c64::ZERO; Self::C64_LANES];
				let a: [c64; Self::C64_LANES] = cast(a);
				let b: [c64; Self::C64_LANES] = cast(b);
				let c: [c64; Self::C64_LANES] = cast(c);

				for i in 0..Self::C64_LANES {
					out[i].re = fma_f64(a[i].re, b[i].re, fma_f64(a[i].im, b[i].im, c[i].re));
					out[i].im = fma_f64(a[i].re, b[i].im, -fma_f64(a[i].im, b[i].re, -c[i].im));
				}

				cast(out)
			}

			// Lanewise squared modulus |a|^2, broadcast to both components.
			#[inline]
			fn abs2_c64s(self, a: Self::c64s) -> Self::c64s {
				let mut out = [c64::ZERO; Self::C64_LANES];
				let a: [c64; Self::C64_LANES] = cast(a);

				for i in 0..Self::C64_LANES {
					let x = a[i].re * a[i].re + a[i].im * a[i].im;
					out[i].re = x;
					out[i].im = x;
				}

				cast(out)
			}

			// Lanewise max(|re|, |im|), broadcast to both components; the
			// absolute values are taken first via `abs_f64s` over the f64 view.
			#[inline]
			fn abs_max_c64s(self, a: Self::c64s) -> Self::c64s {
				let mut out = [c64::ZERO; Self::C64_LANES];
				let a: [c64; Self::C64_LANES] = cast(self.abs_f64s(a));

				for i in 0..Self::C64_LANES {
					let x = f64::max(a[i].re, a[i].im);
					out[i].re = x;
					out[i].im = x;
				}

				cast(out)
			}

			// Horizontal complex sum; componentwise pairwise reduction.
			#[inline]
			fn reduce_sum_c64s(self, a: Self::c64s) -> c64 {
				let mut a: [c64; Self::C64_LANES] = cast(a);

				let mut n = Self::C64_LANES;
				while n > 1 {
					n /= 2;
					for i in 0..n {
						a[i].re += a[i + n].re;
						a[i].im += a[i + n].im;
					}
				}

				a[0]
			}

			// Horizontal min, taken independently per component.
			#[inline]
			fn reduce_min_c64s(self, a: Self::c64s) -> c64 {
				let mut a: [c64; Self::C64_LANES] = cast(a);

				let mut n = Self::C64_LANES;
				while n > 1 {
					n /= 2;
					for i in 0..n {
						a[i].re = f64::min(a[i].re, a[i + n].re);
						a[i].im = f64::min(a[i].im, a[i + n].im);
					}
				}

				a[0]
			}

			// Horizontal max, taken independently per component.
			#[inline]
			fn reduce_max_c64s(self, a: Self::c64s) -> c64 {
				let mut a: [c64; Self::C64_LANES] = cast(a);

				let mut n = Self::C64_LANES;
				while n > 1 {
					n /= 2;
					for i in 0..n {
						a[i].re = f64::max(a[i].re, a[i + n].re);
						a[i].im = f64::max(a[i].im, a[i + n].im);
					}
				}

				a[0]
			}

			// Rotate the u64 lanes right by `amount` positions (mod lane count).
			#[inline]
			fn rotate_right_u64s(self, a: Self::u64s, amount: usize) -> Self::u64s {
				let mut a: [u64; Self::F64_LANES] = cast(a);
				let amount = amount % Self::F64_LANES;
				a.rotate_right(amount);
				cast(a)
			}

			// Rotate the c64 lanes right by `amount` positions (mod lane count);
			// whole complex elements move together.
			#[inline]
			fn rotate_right_c64s(self, a: Self::c64s, amount: usize) -> Self::c64s {
				let mut a: [c64; Self::C64_LANES] = cast(a);
				let amount = amount % Self::C64_LANES;
				a.rotate_right(amount);
				cast(a)
			}
2976
			// The `_e` ("estimate"?) variants are allowed looser semantics; here
			// they simply delegate to the fused versions.
			// NOTE(review): naming suggests they may be non-fused on some
			// backends — confirm the intended contract before relying on exact
			// rounding.
			#[inline]
			fn mul_add_e_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
				self.mul_add_f32s(a, b, c)
			}

			#[inline]
			fn mul_add_e_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
				self.mul_add_f64s(a, b, c)
			}
2986		}
2987	};
2988}
2989
// Emulated SIMD types backed by scalar arithmetic: the width in the type name
// (128/256/512 bits) determines the lane counts of the vector types passed in.
// NOTE(review): the second argument is 16 for both Scalar128b and Scalar256b
// but 8 for Scalar512b — verify this asymmetry (presumably an effective
// register count) is intentional.
scalar_simd!(
	Scalar128b, 16, m32x4, f32x4, i32x4, u32x4, m64x2, f64x2, i64x2, u64x2
);
scalar_simd!(
	Scalar256b, 16, m32x8, f32x8, i32x8, u32x8, m64x4, f64x4, i64x4, u64x4
);
scalar_simd!(
	Scalar512b, 8, m32x16, f32x16, i32x16, u32x16, m64x8, f64x8, i64x8, u64x8
);
2999
3000impl Default for Scalar {
3001	#[inline]
3002	fn default() -> Self {
3003		Self::new()
3004	}
3005}
3006
impl Scalar {
	/// Creates the `Scalar` SIMD token. `Scalar` is a unit struct, so this is
	/// free and needs no feature detection.
	#[inline]
	pub fn new() -> Self {
		Self
	}
}
3013
// Presumably the sealed-trait marker gating who may implement `Simd` — confirm
// against the `Seal` trait definition.
impl Seal for Scalar {}
3015impl Simd for Scalar {
	// In the scalar fallback every "vector" is a single element and masks are
	// plain `bool`s.
	type c32s = c32;
	type c64s = c64;
	type f32s = f32;
	type f64s = f64;
	type i32s = i32;
	type i64s = i64;
	type m32s = bool;
	type m64s = bool;
	type u32s = u32;
	type u64s = u64;

	const IS_SCALAR: bool = true;
	// NOTE(review): 16 matches the count used for Scalar128b/Scalar256b above —
	// confirm this is the intended value for the plain scalar backend.
	const REGISTER_COUNT: usize = 16;
3029
	// Squared modulus |a|^2 = re^2 + im^2, broadcast to both components.
	#[inline]
	fn abs2_c32s(self, a: Self::c32s) -> Self::c32s {
		let norm2 = a.re * a.re + a.im * a.im;
		c32::new(norm2, norm2)
	}

	// f64 counterpart of `abs2_c32s`.
	#[inline]
	fn abs2_c64s(self, a: Self::c64s) -> Self::c64s {
		let norm2 = a.re * a.re + a.im * a.im;
		c64::new(norm2, norm2)
	}
3041
3042	#[inline(always)]
3043	fn abs_max_c32s(self, a: Self::c32s) -> Self::c32s {
3044		let re = if a.re > a.im { a.re } else { a.im };
3045		let im = re;
3046		Complex { re, im }
3047	}
3048
3049	#[inline(always)]
3050	fn abs_max_c64s(self, a: Self::c64s) -> Self::c64s {
3051		let re = if a.re > a.im { a.re } else { a.im };
3052		let im = re;
3053		Complex { re, im }
3054	}
3055
3056	#[inline]
3057	fn add_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
3058		a + b
3059	}
3060
3061	#[inline]
3062	fn add_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
3063		a + b
3064	}
3065
3066	#[inline]
3067	fn add_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
3068		a + b
3069	}
3070
3071	#[inline]
3072	fn add_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
3073		a + b
3074	}
3075
3076	#[inline]
3077	fn add_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
3078		a.wrapping_add(b)
3079	}
3080
3081	#[inline]
3082	fn add_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
3083		a.wrapping_add(b)
3084	}
3085
3086	#[inline]
3087	fn and_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
3088		a & b
3089	}
3090
3091	#[inline]
3092	fn and_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
3093		a & b
3094	}
3095
3096	#[inline]
3097	fn and_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
3098		a & b
3099	}
3100
3101	#[inline]
3102	fn and_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
3103		a & b
3104	}
3105
3106	#[inline]
3107	fn conj_c32s(self, a: Self::c32s) -> Self::c32s {
3108		a.conj()
3109	}
3110
3111	#[inline]
3112	fn conj_c64s(self, a: Self::c64s) -> Self::c64s {
3113		a.conj()
3114	}
3115
3116	#[inline]
3117	fn conj_mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
3118		let re = fma_f32(a.re, b.re, fma_f32(a.im, b.im, c.re));
3119		let im = fma_f32(a.re, b.im, -fma_f32(a.im, b.re, -c.im));
3120		Complex { re, im }
3121	}
3122
3123	#[inline]
3124	fn conj_mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
3125		let re = fma_f64(a.re, b.re, fma_f64(a.im, b.im, c.re));
3126		let im = fma_f64(a.re, b.im, -fma_f64(a.im, b.re, -c.im));
3127		Complex { re, im }
3128	}
3129
3130	#[inline]
3131	fn conj_mul_add_e_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
3132		a.conj() * b + c
3133	}
3134
3135	#[inline]
3136	fn conj_mul_add_e_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
3137		a.conj() * b + c
3138	}
3139
3140	#[inline]
3141	fn conj_mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
3142		let re = fma_f32(a.re, b.re, a.im * b.im);
3143		let im = fma_f32(a.re, b.im, -(a.im * b.re));
3144		Complex { re, im }
3145	}
3146
3147	#[inline]
3148	fn conj_mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
3149		let re = fma_f64(a.re, b.re, a.im * b.im);
3150		let im = fma_f64(a.re, b.im, -(a.im * b.re));
3151		Complex { re, im }
3152	}
3153
3154	#[inline]
3155	fn conj_mul_e_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
3156		a.conj() * b
3157	}
3158
3159	#[inline]
3160	fn conj_mul_e_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
3161		a.conj() * b
3162	}
3163
3164	#[inline]
3165	fn div_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
3166		a / b
3167	}
3168
3169	#[inline]
3170	fn div_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
3171		a / b
3172	}
3173
3174	#[inline]
3175	fn equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
3176		a == b
3177	}
3178
3179	#[inline]
3180	fn equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
3181		a == b
3182	}
3183
3184	#[inline(always)]
3185	fn first_true_m32s(self, mask: Self::m32s) -> usize {
3186		if mask { 0 } else { 1 }
3187	}
3188
3189	#[inline(always)]
3190	fn first_true_m64s(self, mask: Self::m64s) -> usize {
3191		if mask { 0 } else { 1 }
3192	}
3193
3194	#[inline]
3195	fn greater_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
3196		a >= b
3197	}
3198
3199	#[inline(always)]
3200	fn greater_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
3201		a >= b
3202	}
3203
3204	#[inline]
3205	fn greater_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
3206		a > b
3207	}
3208
3209	#[inline(always)]
3210	fn greater_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
3211		a > b
3212	}
3213
3214	#[inline]
3215	fn greater_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
3216		a >= b
3217	}
3218
3219	#[inline(always)]
3220	fn greater_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
3221		a >= b
3222	}
3223
3224	#[inline]
3225	fn greater_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
3226		a > b
3227	}
3228
3229	#[inline(always)]
3230	fn greater_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
3231		a > b
3232	}
3233
3234	#[inline]
3235	fn less_than_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
3236		a < b
3237	}
3238
3239	#[inline]
3240	fn less_than_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
3241		a < b
3242	}
3243
3244	#[inline]
3245	fn less_than_or_equal_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
3246		a <= b
3247	}
3248
3249	#[inline]
3250	fn less_than_or_equal_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
3251		a <= b
3252	}
3253
3254	#[inline]
3255	fn less_than_or_equal_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
3256		a <= b
3257	}
3258
3259	#[inline(always)]
3260	fn less_than_or_equal_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
3261		a <= b
3262	}
3263
3264	#[inline]
3265	fn less_than_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
3266		a < b
3267	}
3268
3269	#[inline(always)]
3270	fn less_than_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::m64s {
3271		a < b
3272	}
3273
3274	#[inline]
3275	fn less_than_or_equal_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
3276		a <= b
3277	}
3278
3279	#[inline(always)]
3280	fn less_than_or_equal_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
3281		a <= b
3282	}
3283
3284	#[inline]
3285	fn less_than_i32s(self, a: Self::i32s, b: Self::i32s) -> Self::m32s {
3286		a < b
3287	}
3288
3289	#[inline(always)]
3290	fn less_than_i64s(self, a: Self::i64s, b: Self::i64s) -> Self::m64s {
3291		a < b
3292	}
3293
3294	#[inline(always)]
3295	unsafe fn mask_load_ptr_c32s(self, mask: MemMask<Self::m32s>, ptr: *const c32) -> Self::c32s {
3296		if mask.mask { *ptr } else { core::mem::zeroed() }
3297	}
3298
3299	#[inline(always)]
3300	unsafe fn mask_load_ptr_c64s(self, mask: MemMask<Self::m64s>, ptr: *const c64) -> Self::c64s {
3301		if mask.mask { *ptr } else { core::mem::zeroed() }
3302	}
3303
3304	#[inline(always)]
3305	unsafe fn mask_load_ptr_u32s(self, mask: MemMask<Self::m32s>, ptr: *const u32) -> Self::u32s {
3306		if mask.mask { *ptr } else { 0 }
3307	}
3308
3309	#[inline(always)]
3310	unsafe fn mask_load_ptr_u64s(self, mask: MemMask<Self::m64s>, ptr: *const u64) -> Self::u64s {
3311		if mask.mask { *ptr } else { 0 }
3312	}
3313
3314	#[inline(always)]
3315	unsafe fn mask_store_ptr_c32s(
3316		self,
3317		mask: MemMask<Self::m32s>,
3318		ptr: *mut c32,
3319		values: Self::c32s,
3320	) {
3321		if mask.mask {
3322			*ptr = values
3323		}
3324	}
3325
3326	#[inline(always)]
3327	unsafe fn mask_store_ptr_c64s(
3328		self,
3329		mask: MemMask<Self::m64s>,
3330		ptr: *mut c64,
3331		values: Self::c64s,
3332	) {
3333		if mask.mask {
3334			*ptr = values
3335		}
3336	}
3337
3338	#[inline(always)]
3339	unsafe fn mask_store_ptr_u32s(
3340		self,
3341		mask: MemMask<Self::m32s>,
3342		ptr: *mut u32,
3343		values: Self::u32s,
3344	) {
3345		if mask.mask {
3346			*ptr = values
3347		}
3348	}
3349
3350	#[inline(always)]
3351	unsafe fn mask_store_ptr_u64s(
3352		self,
3353		mask: MemMask<Self::m64s>,
3354		ptr: *mut u64,
3355		values: Self::u64s,
3356	) {
3357		if mask.mask {
3358			*ptr = values
3359		}
3360	}
3361
3362	#[inline]
3363	fn max_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
3364		a.max(b)
3365	}
3366
3367	#[inline]
3368	fn max_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
3369		a.max(b)
3370	}
3371
3372	#[inline]
3373	fn min_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
3374		a.min(b)
3375	}
3376
3377	#[inline]
3378	fn min_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
3379		a.min(b)
3380	}
3381
3382	#[inline]
3383	fn mul_add_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
3384		let re = fma_f32(a.re, b.re, -fma_f32(a.im, b.im, -c.re));
3385		let im = fma_f32(a.re, b.im, fma_f32(a.im, b.re, c.im));
3386		Complex { re, im }
3387	}
3388
3389	#[inline]
3390	fn mul_add_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
3391		let re = fma_f64(a.re, b.re, -fma_f64(a.im, b.im, -c.re));
3392		let im = fma_f64(a.re, b.im, fma_f64(a.im, b.re, c.im));
3393		Complex { re, im }
3394	}
3395
3396	#[inline]
3397	fn mul_add_e_c32s(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
3398		a * b + c
3399	}
3400
3401	#[inline]
3402	fn mul_add_e_c64s(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
3403		a * b + c
3404	}
3405
3406	#[inline(always)]
3407	fn mul_add_e_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
3408		a * b + c
3409	}
3410
3411	#[inline(always)]
3412	fn mul_add_e_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
3413		a * b + c
3414	}
3415
3416	#[inline]
3417	fn mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
3418		fma_f32(a, b, c)
3419	}
3420
3421	#[inline]
3422	fn mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
3423		fma_f64(a, b, c)
3424	}
3425
3426	#[inline]
3427	fn mul_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
3428		let re = fma_f32(a.re, b.re, -(a.im * b.im));
3429		let im = fma_f32(a.re, b.im, a.im * b.re);
3430		Complex { re, im }
3431	}
3432
3433	#[inline]
3434	fn mul_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
3435		let re = fma_f64(a.re, b.re, -(a.im * b.im));
3436		let im = fma_f64(a.re, b.im, a.im * b.re);
3437		Complex { re, im }
3438	}
3439
3440	#[inline]
3441	fn mul_e_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
3442		a * b
3443	}
3444
3445	#[inline]
3446	fn mul_e_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
3447		a * b
3448	}
3449
3450	#[inline]
3451	fn mul_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
3452		a * b
3453	}
3454
3455	#[inline]
3456	fn mul_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
3457		a * b
3458	}
3459
3460	#[inline]
3461	fn neg_c32s(self, a: Self::c32s) -> Self::c32s {
3462		-a
3463	}
3464
3465	#[inline]
3466	fn neg_c64s(self, a: Self::c64s) -> Self::c64s {
3467		-a
3468	}
3469
3470	#[inline]
3471	fn not_m32s(self, a: Self::m32s) -> Self::m32s {
3472		!a
3473	}
3474
3475	#[inline]
3476	fn not_m64s(self, a: Self::m64s) -> Self::m64s {
3477		!a
3478	}
3479
3480	#[inline]
3481	fn not_u32s(self, a: Self::u32s) -> Self::u32s {
3482		!a
3483	}
3484
3485	#[inline]
3486	fn not_u64s(self, a: Self::u64s) -> Self::u64s {
3487		!a
3488	}
3489
3490	#[inline]
3491	fn or_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
3492		a | b
3493	}
3494
3495	#[inline]
3496	fn or_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
3497		a | b
3498	}
3499
3500	#[inline]
3501	fn or_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
3502		a | b
3503	}
3504
3505	#[inline]
3506	fn or_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
3507		a | b
3508	}
3509
3510	#[inline]
3511	fn partial_load_c64s(self, slice: &[c64]) -> Self::c64s {
3512		if let Some((head, _)) = slice.split_first() {
3513			*head
3514		} else {
3515			c64 { re: 0.0, im: 0.0 }
3516		}
3517	}
3518
3519	#[inline]
3520	fn partial_load_u32s(self, slice: &[u32]) -> Self::u32s {
3521		if let Some((head, _)) = slice.split_first() {
3522			*head
3523		} else {
3524			0
3525		}
3526	}
3527
3528	#[inline]
3529	fn partial_load_u64s(self, slice: &[u64]) -> Self::u64s {
3530		if let Some((head, _)) = slice.split_first() {
3531			*head
3532		} else {
3533			0
3534		}
3535	}
3536
3537	#[inline]
3538	fn partial_store_c64s(self, slice: &mut [c64], values: Self::c64s) {
3539		if let Some((head, _)) = slice.split_first_mut() {
3540			*head = values;
3541		}
3542	}
3543
3544	#[inline]
3545	fn partial_store_u32s(self, slice: &mut [u32], values: Self::u32s) {
3546		if let Some((head, _)) = slice.split_first_mut() {
3547			*head = values;
3548		}
3549	}
3550
3551	#[inline]
3552	fn partial_store_u64s(self, slice: &mut [u64], values: Self::u64s) {
3553		if let Some((head, _)) = slice.split_first_mut() {
3554			*head = values;
3555		}
3556	}
3557
3558	#[inline(always)]
3559	fn reduce_max_c32s(self, a: Self::c32s) -> c32 {
3560		a
3561	}
3562
3563	#[inline(always)]
3564	fn reduce_max_c64s(self, a: Self::c64s) -> c64 {
3565		a
3566	}
3567
3568	#[inline]
3569	fn reduce_max_f32s(self, a: Self::f32s) -> f32 {
3570		a
3571	}
3572
3573	#[inline]
3574	fn reduce_max_f64s(self, a: Self::f64s) -> f64 {
3575		a
3576	}
3577
3578	#[inline(always)]
3579	fn reduce_min_c32s(self, a: Self::c32s) -> c32 {
3580		a
3581	}
3582
3583	#[inline(always)]
3584	fn reduce_min_c64s(self, a: Self::c64s) -> c64 {
3585		a
3586	}
3587
3588	#[inline]
3589	fn reduce_min_f32s(self, a: Self::f32s) -> f32 {
3590		a
3591	}
3592
3593	#[inline]
3594	fn reduce_min_f64s(self, a: Self::f64s) -> f64 {
3595		a
3596	}
3597
3598	#[inline]
3599	fn reduce_product_f32s(self, a: Self::f32s) -> f32 {
3600		a
3601	}
3602
3603	#[inline]
3604	fn reduce_product_f64s(self, a: Self::f64s) -> f64 {
3605		a
3606	}
3607
3608	#[inline]
3609	fn reduce_sum_c32s(self, a: Self::c32s) -> c32 {
3610		a
3611	}
3612
3613	#[inline]
3614	fn reduce_sum_c64s(self, a: Self::c64s) -> c64 {
3615		a
3616	}
3617
3618	#[inline]
3619	fn reduce_sum_f32s(self, a: Self::f32s) -> f32 {
3620		a
3621	}
3622
3623	#[inline]
3624	fn reduce_sum_f64s(self, a: Self::f64s) -> f64 {
3625		a
3626	}
3627
3628	#[inline(always)]
3629	fn rotate_right_c32s(self, a: Self::c32s, _amount: usize) -> Self::c32s {
3630		a
3631	}
3632
3633	#[inline(always)]
3634	fn rotate_right_c64s(self, a: Self::c64s, _amount: usize) -> Self::c64s {
3635		a
3636	}
3637
3638	#[inline(always)]
3639	fn rotate_right_u32s(self, a: Self::u32s, _amount: usize) -> Self::u32s {
3640		a
3641	}
3642
3643	#[inline(always)]
3644	fn rotate_right_u64s(self, a: Self::u64s, _amount: usize) -> Self::u64s {
3645		a
3646	}
3647
3648	#[inline]
3649	fn select_u32s_m32s(
3650		self,
3651		mask: Self::m32s,
3652		if_true: Self::u32s,
3653		if_false: Self::u32s,
3654	) -> Self::u32s {
3655		if mask { if_true } else { if_false }
3656	}
3657
3658	#[inline]
3659	fn select_u64s_m64s(
3660		self,
3661		mask: Self::m64s,
3662		if_true: Self::u64s,
3663		if_false: Self::u64s,
3664	) -> Self::u64s {
3665		if mask { if_true } else { if_false }
3666	}
3667
3668	#[inline]
3669	fn splat_c32s(self, value: c32) -> Self::c32s {
3670		value
3671	}
3672
3673	#[inline]
3674	fn splat_c64s(self, value: c64) -> Self::c64s {
3675		value
3676	}
3677
3678	#[inline]
3679	fn splat_f32s(self, value: f32) -> Self::f32s {
3680		value
3681	}
3682
3683	#[inline]
3684	fn splat_f64s(self, value: f64) -> Self::f64s {
3685		value
3686	}
3687
3688	#[inline]
3689	fn splat_u32s(self, value: u32) -> Self::u32s {
3690		value
3691	}
3692
3693	#[inline]
3694	fn splat_u64s(self, value: u64) -> Self::u64s {
3695		value
3696	}
3697
3698	#[inline]
3699	fn sub_c32s(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
3700		a - b
3701	}
3702
3703	#[inline]
3704	fn sub_c64s(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
3705		a - b
3706	}
3707
3708	#[inline]
3709	fn sub_f32s(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
3710		a - b
3711	}
3712
3713	#[inline]
3714	fn sub_f64s(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
3715		a - b
3716	}
3717
3718	#[inline]
3719	fn sub_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
3720		a.wrapping_sub(b)
3721	}
3722
3723	#[inline]
3724	fn sub_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
3725		a.wrapping_sub(b)
3726	}
3727
3728	#[inline]
3729	fn swap_re_im_c32s(self, a: Self::c32s) -> Self::c32s {
3730		c32 { re: a.im, im: a.re }
3731	}
3732
3733	fn swap_re_im_c64s(self, a: Self::c64s) -> Self::c64s {
3734		c64 { re: a.im, im: a.re }
3735	}
3736
3737	#[inline]
3738	fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output {
3739		op.with_simd(self)
3740	}
3741
3742	#[inline]
3743	fn widening_mul_u32s(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s) {
3744		let c = a as u64 * b as u64;
3745		let lo = c as u32;
3746		let hi = (c >> 32) as u32;
3747		(lo, hi)
3748	}
3749
3750	#[inline]
3751	fn wrapping_dyn_shl_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
3752		a.wrapping_shl(amount)
3753	}
3754
3755	#[inline]
3756	fn wrapping_dyn_shr_u32s(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
3757		a.wrapping_shr(amount)
3758	}
3759
3760	#[inline]
3761	fn xor_m32s(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
3762		a ^ b
3763	}
3764
3765	#[inline]
3766	fn xor_m64s(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
3767		a ^ b
3768	}
3769
3770	#[inline]
3771	fn xor_u32s(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
3772		a ^ b
3773	}
3774
3775	#[inline]
3776	fn xor_u64s(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
3777		a ^ b
3778	}
3779}
3780
/// Reinterprets the longest prefix of `slice` as a slice of `U`, returning the
/// reinterpreted head and the leftover `T` tail.
///
/// # Safety
/// `U` must be layout-compatible with `[T; size_of::<U>() / size_of::<T>()]`
/// (e.g. a `#[repr(C)]` aggregate of `T`s); size divisibility and equal
/// alignment are checked at runtime below, field compatibility is not.
#[inline(always)]
unsafe fn split_slice<T, U>(slice: &[T]) -> (&[U], &[T]) {
	assert_eq!(core::mem::size_of::<U>() % core::mem::size_of::<T>(), 0);
	assert_eq!(core::mem::align_of::<U>(), core::mem::align_of::<T>());

	// Number of `T` elements that make up one `U`.
	let chunk_size = core::mem::size_of::<U>() / core::mem::size_of::<T>();

	let len = slice.len();
	let data = slice.as_ptr();

	let div = len / chunk_size;
	let rem = len % chunk_size;
	// Head: `div` whole `U`s over `[0, len - rem)`; tail: the trailing `rem`
	// `T`s. The ranges are disjoint and stay within the original slice.
	(
		from_raw_parts(data as *const U, div),
		from_raw_parts(data.add(len - rem), rem),
	)
}

/// Mutable variant of `split_slice`; same safety requirements. The two
/// returned slices cover disjoint ranges, so the mutable borrows don't alias.
#[inline(always)]
unsafe fn split_mut_slice<T, U>(slice: &mut [T]) -> (&mut [U], &mut [T]) {
	assert_eq!(core::mem::size_of::<U>() % core::mem::size_of::<T>(), 0);
	assert_eq!(core::mem::align_of::<U>(), core::mem::align_of::<T>());

	let chunk_size = core::mem::size_of::<U>() / core::mem::size_of::<T>();

	let len = slice.len();
	let data = slice.as_mut_ptr();

	let div = len / chunk_size;
	let rem = len % chunk_size;
	(
		from_raw_parts_mut(data as *mut U, div),
		from_raw_parts_mut(data.add(len - rem), rem),
	)
}
3816
/// Reinterprets the longest suffix of `slice` as a slice of `U`, returning the
/// leftover `T` head and the reinterpreted tail — the mirror of `split_slice`.
///
/// # Safety
/// `U` must be layout-compatible with `[T; size_of::<U>() / size_of::<T>()]`;
/// size divisibility and equal alignment are checked at runtime below.
#[inline(always)]
unsafe fn rsplit_slice<T, U>(slice: &[T]) -> (&[T], &[U]) {
	assert_eq!(core::mem::size_of::<U>() % core::mem::size_of::<T>(), 0);
	assert_eq!(core::mem::align_of::<U>(), core::mem::align_of::<T>());

	// Number of `T` elements that make up one `U`.
	let chunk_size = core::mem::size_of::<U>() / core::mem::size_of::<T>();

	let len = slice.len();
	let data = slice.as_ptr();

	let div = len / chunk_size;
	let rem = len % chunk_size;
	// Unlike `split_slice`, the remainder is taken from the *front*, so the
	// `U` chunks cover the tail range `[rem, len)`.
	(
		from_raw_parts(data, rem),
		from_raw_parts(data.add(rem) as *const U, div),
	)
}

/// Mutable variant of `rsplit_slice`; same safety requirements. The two
/// returned slices cover disjoint ranges, so the mutable borrows don't alias.
#[inline(always)]
unsafe fn rsplit_mut_slice<T, U>(slice: &mut [T]) -> (&mut [T], &mut [U]) {
	assert_eq!(core::mem::size_of::<U>() % core::mem::size_of::<T>(), 0);
	assert_eq!(core::mem::align_of::<U>(), core::mem::align_of::<T>());

	let chunk_size = core::mem::size_of::<U>() / core::mem::size_of::<T>();

	let len = slice.len();
	let data = slice.as_mut_ptr();

	let div = len / chunk_size;
	let rem = len % chunk_size;
	(
		from_raw_parts_mut(data, rem),
		from_raw_parts_mut(data.add(rem) as *mut U, div),
	)
}
3852
// Re-export the platform-specific `Arch` when this crate has a backend for
// the compilation target; otherwise define a scalar-only fallback exposing
// the same public API (`new`, `dispatch`, `Default`).
match_cfg!(item, match cfg!() {
	const { any(target_arch = "x86", target_arch = "x86_64") } => {
		pub use x86::Arch;
	},
	const { target_arch = "aarch64" } => {
		pub use aarch64::Arch;
	},
	_ => {
		/// Architecture token; on this target only the scalar fallback exists.
		#[derive(Debug, Clone, Copy)]
		#[non_exhaustive]
		pub enum Arch {
			Scalar,
		}

		impl Arch {
			/// Selects the best available instruction set (always scalar here).
			#[inline(always)]
			pub fn new() -> Self {
				Self::Scalar
			}

			/// Runs `op` with the selected SIMD token (always `Scalar` here).
			#[inline(always)]
			pub fn dispatch<Op: WithSimd>(self, op: Op) -> Op::Output {
				op.with_simd(Scalar)
			}
		}
		impl Default for Arch {
			#[inline]
			fn default() -> Self {
				Self::new()
			}
		}
	},
});
3886
// Compile-time size check, evaluated per monomorphization: referencing
// `VALID` (see `static_assert_same_size!`) forces the `assert!` in the
// associated const to be evaluated, failing the build if the sizes differ.
#[doc(hidden)]
pub struct CheckSameSize<T, U>(PhantomData<(T, U)>);
impl<T, U> CheckSameSize<T, U> {
	pub const VALID: () = {
		assert!(core::mem::size_of::<T>() == core::mem::size_of::<U>());
	};
}

// Same trick as `CheckSameSize`, but only requires `T` to be no larger
// than `U` (used by `static_assert_size_less_than_or_equal!`).
#[doc(hidden)]
pub struct CheckSizeLessThanOrEqual<T, U>(PhantomData<(T, U)>);
impl<T, U> CheckSizeLessThanOrEqual<T, U> {
	pub const VALID: () = {
		assert!(core::mem::size_of::<T>() <= core::mem::size_of::<U>());
	};
}
3902
/// Statement macro: fails compilation if `$t` and `$u` have different sizes.
/// Evaluating the associated const triggers the compile-time check.
#[macro_export]
macro_rules! static_assert_same_size {
	($t: ty, $u: ty) => {
		let _ = $crate::CheckSameSize::<$t, $u>::VALID;
	};
}
/// Statement macro: fails compilation if `$t` is larger than `$u`.
#[macro_export]
macro_rules! static_assert_size_less_than_or_equal {
	($t: ty, $u: ty) => {
		let _ = $crate::CheckSizeLessThanOrEqual::<$t, $u>::VALID;
	};
}
3915
/// Safe transmute function.
///
/// This function asserts at compile time that the two types have the same size.
#[inline(always)]
pub fn cast<T: NoUninit, U: AnyBitPattern>(value: T) -> U {
	static_assert_same_size!(T, U);
	// `ManuallyDrop` ensures `value` is not dropped after its bits are copied
	// out below. The bounds make this sound: `T: NoUninit` guarantees every
	// byte is initialized, and `U: AnyBitPattern` accepts any bit pattern.
	let value = core::mem::ManuallyDrop::new(value);
	let ptr = &value as *const core::mem::ManuallyDrop<T> as *const U;
	// `read_unaligned`: `U` may require stricter alignment than `T`.
	unsafe { ptr.read_unaligned() }
}
3926
/// Safe lossy transmute function, where the destination type may be smaller than the source type.
///
/// This property is checked at compile time.
#[inline(always)]
pub fn cast_lossy<T: NoUninit, U: AnyBitPattern>(value: T) -> U {
	static_assert_size_less_than_or_equal!(U, T);
	// Reads only the first `size_of::<U>()` bytes of `value`; the trailing
	// bytes are discarded. See `cast` for why `ManuallyDrop` and
	// `read_unaligned` are needed.
	let value = core::mem::ManuallyDrop::new(value);
	let ptr = &value as *const core::mem::ManuallyDrop<T> as *const U;
	unsafe { ptr.read_unaligned() }
}
3937
/// Splits a slice into chunks of equal size (known at compile time).
///
/// Returns the chunks and the remaining section of the input slice.
#[inline(always)]
pub fn as_arrays<const N: usize, T>(slice: &[T]) -> (&[[T; N]], &[T]) {
	let total = slice.len();
	let chunk_count = total / N;
	let split_pos = chunk_count * N;
	let base = slice.as_ptr();
	// SAFETY: `[T; N]` has the same layout as `N` consecutive `T`s, the first
	// `split_pos` elements form exactly `chunk_count` such arrays, and both
	// output slices stay within (and partition) the original slice.
	unsafe {
		let head = from_raw_parts(base as *const [T; N], chunk_count);
		let tail = from_raw_parts(base.add(split_pos), total - split_pos);
		(head, tail)
	}
}
3954
/// Splits a mutable slice into chunks of equal size (known at compile time).
///
/// Returns the chunks and the remaining section of the input slice.
#[inline(always)]
pub fn as_arrays_mut<const N: usize, T>(slice: &mut [T]) -> (&mut [[T; N]], &mut [T]) {
	let total = slice.len();
	let chunk_count = total / N;
	let split_pos = chunk_count * N;
	let base = slice.as_mut_ptr();
	// SAFETY: same layout argument as `as_arrays`; the two output slices
	// cover disjoint ranges of the input, so the mutable borrows don't alias.
	unsafe {
		let head = from_raw_parts_mut(base as *mut [T; N], chunk_count);
		let tail = from_raw_parts_mut(base.add(split_pos), total - split_pos);
		(head, tail)
	}
}
3971
3972/// Platform dependent intrinsics.
3973pub mod core_arch;
3974
3975#[allow(unused_macros)]
3976macro_rules! inherit {
3977    ({$(
3978        $(#[$attr: meta])*
3979        $(unsafe $($placeholder: lifetime)?)?
3980        fn $func: ident(self
3981            $(,$arg: ident: $ty: ty)* $(,)?
3982        ) $(-> $ret: ty)?;
3983    )*}) => {
3984        $(
3985            $(#[$attr])*
3986            #[inline(always)]
3987            $(unsafe $($placeholder)?)? fn $func (self, $($arg: $ty,)*) $(-> $ret)? {
3988                (*self).$func ($($arg,)*)
3989            }
3990        )*
3991    };
3992}
3993
3994#[allow(unused_macros)]
3995macro_rules! inherit_x2 {
3996    ($base: expr, {$(
3997        $(#[$attr: meta])*
3998        $(unsafe $($placeholder: lifetime)?)?
3999        fn $func: ident ($self: ident
4000            $(,$arg: ident: $ty: ty)* $(,)?
4001        ) $(-> $ret: ty)?;
4002    )*}) => {
4003        $(
4004            $(#[$attr])*
4005            #[inline(always)]
4006            $(unsafe $($placeholder)?)? fn $func ($self, $($arg: $ty,)*) $(-> $ret)? {
4007            	$(let $arg: [_; 2] = cast!($arg);)*
4008                cast!([($base).$func ($($arg[0],)*), ($base).$func ($($arg[1],)*)])
4009            }
4010        )*
4011    };
4012
4013    ($base: expr, splat, {$(
4014        $(#[$attr: meta])*
4015        $(unsafe $($placeholder: lifetime)?)?
4016        fn $func: ident ($self: ident
4017            $(,$arg: ident: $ty: ty)* $(,)?
4018        ) $(-> $ret: ty)?;
4019    )*}) => {
4020        $(
4021            $(#[$attr])*
4022            #[inline(always)]
4023            $(unsafe $($placeholder)?)? fn $func ($self, $($arg: $ty,)*) $(-> $ret)? {
4024                cast!([($base).$func ($($arg,)*), ($base).$func ($($arg,)*)])
4025            }
4026        )*
4027    };
4028
4029    ($base: expr, wide, {$(
4030        $(#[$attr: meta])*
4031        $(unsafe $($placeholder: lifetime)?)?
4032        fn $func: ident ($self: ident
4033            $(,$arg: ident: $ty: ty)* $(,)?
4034        ) $(-> $ret: ty)?;
4035    )*}) => {
4036        $(
4037            $(#[$attr])*
4038            #[inline(always)]
4039            $(unsafe $($placeholder)?)? fn $func ($self, $($arg: $ty,)*) $(-> $ret)? {
4040            	$(let $arg: [_; 2] = cast!($arg);)*
4041                let (r0, r1) = ($base).$func ($($arg[0],)*); let (s0, s1) = ($base).$func ($($arg[1],)*);
4042                (cast!([r0, s0]), cast!([r1, s1]))
4043            }
4044        )*
4045    };
4046}
4047
4048#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
4049#[cfg_attr(docsrs, doc(cfg(any(target_arch = "x86", target_arch = "x86_64"))))]
4050/// Low level x86 API.
4051pub mod x86;
4052
4053#[cfg(target_arch = "aarch64")]
4054#[cfg_attr(docsrs, doc(cfg(target_arch = "aarch64")))]
4055/// Low level aarch64 API.
4056pub mod aarch64;
4057
4058/// Mask type with 8 bits. Its bit pattern is either all ones or all zeros. Unsafe code must not
4059/// depend on this, however.
4060#[derive(Copy, Clone, PartialEq, Eq)]
4061#[repr(transparent)]
4062pub struct m8(u8);
4063/// Mask type with 16 bits. Its bit pattern is either all ones or all zeros. Unsafe code must not
4064/// depend on this, however.
4065#[derive(Copy, Clone, PartialEq, Eq)]
4066#[repr(transparent)]
4067pub struct m16(u16);
4068/// Mask type with 32 bits. Its bit pattern is either all ones or all zeros. Unsafe code must not
4069/// depend on this, however.
4070#[derive(Copy, Clone, PartialEq, Eq)]
4071#[repr(transparent)]
4072pub struct m32(u32);
4073/// Mask type with 64 bits. Its bit pattern is either all ones or all zeros. Unsafe code must not
4074/// depend on this, however.
4075#[derive(Copy, Clone, PartialEq, Eq)]
4076#[repr(transparent)]
4077pub struct m64(u64);
4078
4079/// Bitmask type for 8 elements, used for mask operations on AVX512.
4080#[derive(Copy, Clone, PartialEq, Eq)]
4081#[repr(transparent)]
4082pub struct b8(pub u8);
4083/// Bitmask type for 16 elements, used for mask operations on AVX512.
4084#[derive(Copy, Clone, PartialEq, Eq)]
4085#[repr(transparent)]
4086pub struct b16(pub u16);
4087/// Bitmask type for 32 elements, used for mask operations on AVX512.
4088#[derive(Copy, Clone, PartialEq, Eq)]
4089#[repr(transparent)]
4090pub struct b32(pub u32);
4091/// Bitmask type for 64 elements, used for mask operations on AVX512.
4092#[derive(Copy, Clone, PartialEq, Eq)]
4093#[repr(transparent)]
4094pub struct b64(pub u64);
4095
4096impl core::ops::Not for b8 {
4097	type Output = b8;
4098
4099	#[inline(always)]
4100	fn not(self) -> Self::Output {
4101		b8(!self.0)
4102	}
4103}
4104impl core::ops::BitAnd for b8 {
4105	type Output = b8;
4106
4107	#[inline(always)]
4108	fn bitand(self, rhs: Self) -> Self::Output {
4109		b8(self.0 & rhs.0)
4110	}
4111}
4112impl core::ops::BitOr for b8 {
4113	type Output = b8;
4114
4115	#[inline(always)]
4116	fn bitor(self, rhs: Self) -> Self::Output {
4117		b8(self.0 | rhs.0)
4118	}
4119}
4120impl core::ops::BitXor for b8 {
4121	type Output = b8;
4122
4123	#[inline(always)]
4124	fn bitxor(self, rhs: Self) -> Self::Output {
4125		b8(self.0 ^ rhs.0)
4126	}
4127}
4128
4129impl core::ops::Not for m8 {
4130	type Output = m8;
4131
4132	#[inline(always)]
4133	fn not(self) -> Self::Output {
4134		m8(!self.0)
4135	}
4136}
4137impl core::ops::BitAnd for m8 {
4138	type Output = m8;
4139
4140	#[inline(always)]
4141	fn bitand(self, rhs: Self) -> Self::Output {
4142		m8(self.0 & rhs.0)
4143	}
4144}
4145impl core::ops::BitOr for m8 {
4146	type Output = m8;
4147
4148	#[inline(always)]
4149	fn bitor(self, rhs: Self) -> Self::Output {
4150		m8(self.0 | rhs.0)
4151	}
4152}
4153impl core::ops::BitXor for m8 {
4154	type Output = m8;
4155
4156	#[inline(always)]
4157	fn bitxor(self, rhs: Self) -> Self::Output {
4158		m8(self.0 ^ rhs.0)
4159	}
4160}
4161
4162impl core::ops::Not for m16 {
4163	type Output = m16;
4164
4165	#[inline(always)]
4166	fn not(self) -> Self::Output {
4167		m16(!self.0)
4168	}
4169}
4170impl core::ops::BitAnd for m16 {
4171	type Output = m16;
4172
4173	#[inline(always)]
4174	fn bitand(self, rhs: Self) -> Self::Output {
4175		m16(self.0 & rhs.0)
4176	}
4177}
4178impl core::ops::BitOr for m16 {
4179	type Output = m16;
4180
4181	#[inline(always)]
4182	fn bitor(self, rhs: Self) -> Self::Output {
4183		m16(self.0 | rhs.0)
4184	}
4185}
4186impl core::ops::BitXor for m16 {
4187	type Output = m16;
4188
4189	#[inline(always)]
4190	fn bitxor(self, rhs: Self) -> Self::Output {
4191		m16(self.0 ^ rhs.0)
4192	}
4193}
4194
4195impl core::ops::Not for m32 {
4196	type Output = m32;
4197
4198	#[inline(always)]
4199	fn not(self) -> Self::Output {
4200		m32(!self.0)
4201	}
4202}
4203impl core::ops::BitAnd for m32 {
4204	type Output = m32;
4205
4206	#[inline(always)]
4207	fn bitand(self, rhs: Self) -> Self::Output {
4208		m32(self.0 & rhs.0)
4209	}
4210}
4211impl core::ops::BitOr for m32 {
4212	type Output = m32;
4213
4214	#[inline(always)]
4215	fn bitor(self, rhs: Self) -> Self::Output {
4216		m32(self.0 | rhs.0)
4217	}
4218}
4219impl core::ops::BitXor for m32 {
4220	type Output = m32;
4221
4222	#[inline(always)]
4223	fn bitxor(self, rhs: Self) -> Self::Output {
4224		m32(self.0 ^ rhs.0)
4225	}
4226}
4227
4228impl core::ops::Not for m64 {
4229	type Output = m64;
4230
4231	#[inline(always)]
4232	fn not(self) -> Self::Output {
4233		m64(!self.0)
4234	}
4235}
4236impl core::ops::BitAnd for m64 {
4237	type Output = m64;
4238
4239	#[inline(always)]
4240	fn bitand(self, rhs: Self) -> Self::Output {
4241		m64(self.0 & rhs.0)
4242	}
4243}
4244impl core::ops::BitOr for m64 {
4245	type Output = m64;
4246
4247	#[inline(always)]
4248	fn bitor(self, rhs: Self) -> Self::Output {
4249		m64(self.0 | rhs.0)
4250	}
4251}
4252impl core::ops::BitXor for m64 {
4253	type Output = m64;
4254
4255	#[inline(always)]
4256	fn bitxor(self, rhs: Self) -> Self::Output {
4257		m64(self.0 ^ rhs.0)
4258	}
4259}
4260
4261impl core::ops::Not for b16 {
4262	type Output = b16;
4263
4264	#[inline(always)]
4265	fn not(self) -> Self::Output {
4266		b16(!self.0)
4267	}
4268}
4269impl core::ops::BitAnd for b16 {
4270	type Output = b16;
4271
4272	#[inline(always)]
4273	fn bitand(self, rhs: Self) -> Self::Output {
4274		b16(self.0 & rhs.0)
4275	}
4276}
4277impl core::ops::BitOr for b16 {
4278	type Output = b16;
4279
4280	#[inline(always)]
4281	fn bitor(self, rhs: Self) -> Self::Output {
4282		b16(self.0 | rhs.0)
4283	}
4284}
4285impl core::ops::BitXor for b16 {
4286	type Output = b16;
4287
4288	#[inline(always)]
4289	fn bitxor(self, rhs: Self) -> Self::Output {
4290		b16(self.0 ^ rhs.0)
4291	}
4292}
4293
4294impl core::ops::Not for b32 {
4295	type Output = b32;
4296
4297	#[inline(always)]
4298	fn not(self) -> Self::Output {
4299		b32(!self.0)
4300	}
4301}
4302impl core::ops::BitAnd for b32 {
4303	type Output = b32;
4304
4305	#[inline(always)]
4306	fn bitand(self, rhs: Self) -> Self::Output {
4307		b32(self.0 & rhs.0)
4308	}
4309}
4310impl core::ops::BitOr for b32 {
4311	type Output = b32;
4312
4313	#[inline(always)]
4314	fn bitor(self, rhs: Self) -> Self::Output {
4315		b32(self.0 | rhs.0)
4316	}
4317}
4318impl core::ops::BitXor for b32 {
4319	type Output = b32;
4320
4321	#[inline(always)]
4322	fn bitxor(self, rhs: Self) -> Self::Output {
4323		b32(self.0 ^ rhs.0)
4324	}
4325}
4326
4327impl core::ops::Not for b64 {
4328	type Output = b64;
4329
4330	#[inline(always)]
4331	fn not(self) -> Self::Output {
4332		b64(!self.0)
4333	}
4334}
4335impl core::ops::BitAnd for b64 {
4336	type Output = b64;
4337
4338	#[inline(always)]
4339	fn bitand(self, rhs: Self) -> Self::Output {
4340		b64(self.0 & rhs.0)
4341	}
4342}
4343impl core::ops::BitOr for b64 {
4344	type Output = b64;
4345
4346	#[inline(always)]
4347	fn bitor(self, rhs: Self) -> Self::Output {
4348		b64(self.0 | rhs.0)
4349	}
4350}
4351impl core::ops::BitXor for b64 {
4352	type Output = b64;
4353
4354	#[inline(always)]
4355	fn bitxor(self, rhs: Self) -> Self::Output {
4356		b64(self.0 ^ rhs.0)
4357	}
4358}
4359
4360impl Debug for b8 {
4361	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
4362		#[allow(dead_code)]
4363		#[derive(Copy, Clone, Debug)]
4364		struct b8(bool, bool, bool, bool, bool, bool, bool, bool);
4365		b8(
4366			((self.0 >> 0) & 1) == 1,
4367			((self.0 >> 1) & 1) == 1,
4368			((self.0 >> 2) & 1) == 1,
4369			((self.0 >> 3) & 1) == 1,
4370			((self.0 >> 4) & 1) == 1,
4371			((self.0 >> 5) & 1) == 1,
4372			((self.0 >> 6) & 1) == 1,
4373			((self.0 >> 7) & 1) == 1,
4374		)
4375		.fmt(f)
4376	}
4377}
4378impl Debug for b16 {
4379	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
4380		#[allow(dead_code)]
4381		#[derive(Copy, Clone, Debug)]
4382		struct b16(
4383			bool,
4384			bool,
4385			bool,
4386			bool,
4387			bool,
4388			bool,
4389			bool,
4390			bool,
4391			bool,
4392			bool,
4393			bool,
4394			bool,
4395			bool,
4396			bool,
4397			bool,
4398			bool,
4399		);
4400		b16(
4401			((self.0 >> 00) & 1) == 1,
4402			((self.0 >> 01) & 1) == 1,
4403			((self.0 >> 02) & 1) == 1,
4404			((self.0 >> 03) & 1) == 1,
4405			((self.0 >> 04) & 1) == 1,
4406			((self.0 >> 05) & 1) == 1,
4407			((self.0 >> 06) & 1) == 1,
4408			((self.0 >> 07) & 1) == 1,
4409			((self.0 >> 08) & 1) == 1,
4410			((self.0 >> 09) & 1) == 1,
4411			((self.0 >> 10) & 1) == 1,
4412			((self.0 >> 11) & 1) == 1,
4413			((self.0 >> 12) & 1) == 1,
4414			((self.0 >> 13) & 1) == 1,
4415			((self.0 >> 14) & 1) == 1,
4416			((self.0 >> 15) & 1) == 1,
4417		)
4418		.fmt(f)
4419	}
4420}
4421impl Debug for b32 {
4422	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
4423		#[allow(dead_code)]
4424		#[derive(Copy, Clone, Debug)]
4425		struct b32(
4426			bool,
4427			bool,
4428			bool,
4429			bool,
4430			bool,
4431			bool,
4432			bool,
4433			bool,
4434			bool,
4435			bool,
4436			bool,
4437			bool,
4438			bool,
4439			bool,
4440			bool,
4441			bool,
4442			bool,
4443			bool,
4444			bool,
4445			bool,
4446			bool,
4447			bool,
4448			bool,
4449			bool,
4450			bool,
4451			bool,
4452			bool,
4453			bool,
4454			bool,
4455			bool,
4456			bool,
4457			bool,
4458		);
4459		b32(
4460			((self.0 >> 00) & 1) == 1,
4461			((self.0 >> 01) & 1) == 1,
4462			((self.0 >> 02) & 1) == 1,
4463			((self.0 >> 03) & 1) == 1,
4464			((self.0 >> 04) & 1) == 1,
4465			((self.0 >> 05) & 1) == 1,
4466			((self.0 >> 06) & 1) == 1,
4467			((self.0 >> 07) & 1) == 1,
4468			((self.0 >> 08) & 1) == 1,
4469			((self.0 >> 09) & 1) == 1,
4470			((self.0 >> 10) & 1) == 1,
4471			((self.0 >> 11) & 1) == 1,
4472			((self.0 >> 12) & 1) == 1,
4473			((self.0 >> 13) & 1) == 1,
4474			((self.0 >> 14) & 1) == 1,
4475			((self.0 >> 15) & 1) == 1,
4476			((self.0 >> 16) & 1) == 1,
4477			((self.0 >> 17) & 1) == 1,
4478			((self.0 >> 18) & 1) == 1,
4479			((self.0 >> 19) & 1) == 1,
4480			((self.0 >> 20) & 1) == 1,
4481			((self.0 >> 21) & 1) == 1,
4482			((self.0 >> 22) & 1) == 1,
4483			((self.0 >> 23) & 1) == 1,
4484			((self.0 >> 24) & 1) == 1,
4485			((self.0 >> 25) & 1) == 1,
4486			((self.0 >> 26) & 1) == 1,
4487			((self.0 >> 27) & 1) == 1,
4488			((self.0 >> 28) & 1) == 1,
4489			((self.0 >> 29) & 1) == 1,
4490			((self.0 >> 30) & 1) == 1,
4491			((self.0 >> 31) & 1) == 1,
4492		)
4493		.fmt(f)
4494	}
4495}
4496impl Debug for b64 {
4497	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
4498		#[allow(dead_code)]
4499		#[derive(Copy, Clone, Debug)]
4500		struct b64(
4501			bool,
4502			bool,
4503			bool,
4504			bool,
4505			bool,
4506			bool,
4507			bool,
4508			bool,
4509			bool,
4510			bool,
4511			bool,
4512			bool,
4513			bool,
4514			bool,
4515			bool,
4516			bool,
4517			bool,
4518			bool,
4519			bool,
4520			bool,
4521			bool,
4522			bool,
4523			bool,
4524			bool,
4525			bool,
4526			bool,
4527			bool,
4528			bool,
4529			bool,
4530			bool,
4531			bool,
4532			bool,
4533			bool,
4534			bool,
4535			bool,
4536			bool,
4537			bool,
4538			bool,
4539			bool,
4540			bool,
4541			bool,
4542			bool,
4543			bool,
4544			bool,
4545			bool,
4546			bool,
4547			bool,
4548			bool,
4549			bool,
4550			bool,
4551			bool,
4552			bool,
4553			bool,
4554			bool,
4555			bool,
4556			bool,
4557			bool,
4558			bool,
4559			bool,
4560			bool,
4561			bool,
4562			bool,
4563			bool,
4564			bool,
4565		);
4566		b64(
4567			((self.0 >> 00) & 1) == 1,
4568			((self.0 >> 01) & 1) == 1,
4569			((self.0 >> 02) & 1) == 1,
4570			((self.0 >> 03) & 1) == 1,
4571			((self.0 >> 04) & 1) == 1,
4572			((self.0 >> 05) & 1) == 1,
4573			((self.0 >> 06) & 1) == 1,
4574			((self.0 >> 07) & 1) == 1,
4575			((self.0 >> 08) & 1) == 1,
4576			((self.0 >> 09) & 1) == 1,
4577			((self.0 >> 10) & 1) == 1,
4578			((self.0 >> 11) & 1) == 1,
4579			((self.0 >> 12) & 1) == 1,
4580			((self.0 >> 13) & 1) == 1,
4581			((self.0 >> 14) & 1) == 1,
4582			((self.0 >> 15) & 1) == 1,
4583			((self.0 >> 16) & 1) == 1,
4584			((self.0 >> 17) & 1) == 1,
4585			((self.0 >> 18) & 1) == 1,
4586			((self.0 >> 19) & 1) == 1,
4587			((self.0 >> 20) & 1) == 1,
4588			((self.0 >> 21) & 1) == 1,
4589			((self.0 >> 22) & 1) == 1,
4590			((self.0 >> 23) & 1) == 1,
4591			((self.0 >> 24) & 1) == 1,
4592			((self.0 >> 25) & 1) == 1,
4593			((self.0 >> 26) & 1) == 1,
4594			((self.0 >> 27) & 1) == 1,
4595			((self.0 >> 28) & 1) == 1,
4596			((self.0 >> 29) & 1) == 1,
4597			((self.0 >> 30) & 1) == 1,
4598			((self.0 >> 31) & 1) == 1,
4599			((self.0 >> 32) & 1) == 1,
4600			((self.0 >> 33) & 1) == 1,
4601			((self.0 >> 34) & 1) == 1,
4602			((self.0 >> 35) & 1) == 1,
4603			((self.0 >> 36) & 1) == 1,
4604			((self.0 >> 37) & 1) == 1,
4605			((self.0 >> 38) & 1) == 1,
4606			((self.0 >> 39) & 1) == 1,
4607			((self.0 >> 40) & 1) == 1,
4608			((self.0 >> 41) & 1) == 1,
4609			((self.0 >> 42) & 1) == 1,
4610			((self.0 >> 43) & 1) == 1,
4611			((self.0 >> 44) & 1) == 1,
4612			((self.0 >> 45) & 1) == 1,
4613			((self.0 >> 46) & 1) == 1,
4614			((self.0 >> 47) & 1) == 1,
4615			((self.0 >> 48) & 1) == 1,
4616			((self.0 >> 49) & 1) == 1,
4617			((self.0 >> 50) & 1) == 1,
4618			((self.0 >> 51) & 1) == 1,
4619			((self.0 >> 52) & 1) == 1,
4620			((self.0 >> 53) & 1) == 1,
4621			((self.0 >> 54) & 1) == 1,
4622			((self.0 >> 55) & 1) == 1,
4623			((self.0 >> 56) & 1) == 1,
4624			((self.0 >> 57) & 1) == 1,
4625			((self.0 >> 58) & 1) == 1,
4626			((self.0 >> 59) & 1) == 1,
4627			((self.0 >> 60) & 1) == 1,
4628			((self.0 >> 61) & 1) == 1,
4629			((self.0 >> 62) & 1) == 1,
4630			((self.0 >> 63) & 1) == 1,
4631		)
4632		.fmt(f)
4633	}
4634}
4635
4636impl Debug for m8 {
4637	#[inline]
4638	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
4639		self.is_set().fmt(f)
4640	}
4641}
4642impl Debug for m16 {
4643	#[inline]
4644	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
4645		self.is_set().fmt(f)
4646	}
4647}
4648impl Debug for m32 {
4649	#[inline]
4650	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
4651		self.is_set().fmt(f)
4652	}
4653}
4654impl Debug for m64 {
4655	#[inline]
4656	fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
4657		self.is_set().fmt(f)
4658	}
4659}
4660
4661impl m8 {
4662	/// Returns a mask with all bits set one, if `flag` is true, otherwise returns a mask with all
4663	/// bits set to zero.
4664	#[inline(always)]
4665	pub const fn new(flag: bool) -> Self {
4666		Self(if flag { u8::MAX } else { 0 })
4667	}
4668
4669	/// Returns `false` if the mask bits are all zero, otherwise returns `true`.
4670	#[inline(always)]
4671	pub const fn is_set(self) -> bool {
4672		self.0 != 0
4673	}
4674}
4675impl m16 {
4676	/// Returns a mask with all bits set one, if `flag` is true, otherwise returns a mask with all
4677	/// bits set to zero.
4678	#[inline(always)]
4679	pub const fn new(flag: bool) -> Self {
4680		Self(if flag { u16::MAX } else { 0 })
4681	}
4682
4683	/// Returns `false` if the mask bits are all zero, otherwise returns `true`.
4684	#[inline(always)]
4685	pub const fn is_set(self) -> bool {
4686		self.0 != 0
4687	}
4688}
4689impl m32 {
4690	/// Returns a mask with all bits set one, if `flag` is true, otherwise returns a mask with all
4691	/// bits set to zero.
4692	#[inline(always)]
4693	pub const fn new(flag: bool) -> Self {
4694		Self(if flag { u32::MAX } else { 0 })
4695	}
4696
4697	/// Returns `false` if the mask bits are all zero, otherwise returns `true`.
4698	#[inline(always)]
4699	pub const fn is_set(self) -> bool {
4700		self.0 != 0
4701	}
4702}
4703impl m64 {
4704	/// Returns a mask with all bits set one, if `flag` is true, otherwise returns a mask with all
4705	/// bits set to zero.
4706	#[inline(always)]
4707	pub const fn new(flag: bool) -> Self {
4708		Self(if flag { u64::MAX } else { 0 })
4709	}
4710
4711	/// Returns `false` if the mask bits are all zero, otherwise returns `true`.
4712	#[inline(always)]
4713	pub const fn is_set(self) -> bool {
4714		self.0 != 0
4715	}
4716}
4717
/// A 128-bit SIMD vector with 16 elements of type [`i8`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct i8x16(
	pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
	pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
);
/// A 256-bit SIMD vector with 32 elements of type [`i8`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct i8x32(
	pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
	pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
	pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
	pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
);
/// A 512-bit SIMD vector with 64 elements of type [`i8`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct i8x64(
	pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
	pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
	pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
	pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
	pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
	pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
	pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
	pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
);
4845
/// A 128-bit SIMD vector with 16 elements of type [`u8`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct u8x16(
	pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
	pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
);
/// A 256-bit SIMD vector with 32 elements of type [`u8`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct u8x32(
	pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
	pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
	pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
	pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
);
/// A 512-bit SIMD vector with 64 elements of type [`u8`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct u8x64(
	pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
	pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
	pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
	pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
	pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
	pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
	pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
	pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
);
4973
4974/// A 128-bit SIMD vector with 16 elements of type [`m8`].
4975#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4976#[repr(C)]
4977pub struct m8x16(
4978	pub m8,
4979	pub m8,
4980	pub m8,
4981	pub m8,
4982	pub m8,
4983	pub m8,
4984	pub m8,
4985	pub m8,
4986	pub m8,
4987	pub m8,
4988	pub m8,
4989	pub m8,
4990	pub m8,
4991	pub m8,
4992	pub m8,
4993	pub m8,
4994);
4995/// A 256-bit SIMD vector with 32 elements of type [`m8`].
4996#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4997#[repr(C)]
4998pub struct m8x32(
4999	pub m8,
5000	pub m8,
5001	pub m8,
5002	pub m8,
5003	pub m8,
5004	pub m8,
5005	pub m8,
5006	pub m8,
5007	pub m8,
5008	pub m8,
5009	pub m8,
5010	pub m8,
5011	pub m8,
5012	pub m8,
5013	pub m8,
5014	pub m8,
5015	pub m8,
5016	pub m8,
5017	pub m8,
5018	pub m8,
5019	pub m8,
5020	pub m8,
5021	pub m8,
5022	pub m8,
5023	pub m8,
5024	pub m8,
5025	pub m8,
5026	pub m8,
5027	pub m8,
5028	pub m8,
5029	pub m8,
5030	pub m8,
5031);
5032
/// A 128-bit SIMD vector with 8 elements of type [`i16`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct i16x8(
	pub i16, pub i16, pub i16, pub i16, pub i16, pub i16, pub i16, pub i16,
);
/// A 256-bit SIMD vector with 16 elements of type [`i16`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct i16x16(
	pub i16, pub i16, pub i16, pub i16, pub i16, pub i16, pub i16, pub i16,
	pub i16, pub i16, pub i16, pub i16, pub i16, pub i16, pub i16, pub i16,
);
/// A 512-bit SIMD vector with 32 elements of type [`i16`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct i16x32(
	pub i16, pub i16, pub i16, pub i16, pub i16, pub i16, pub i16, pub i16,
	pub i16, pub i16, pub i16, pub i16, pub i16, pub i16, pub i16, pub i16,
	pub i16, pub i16, pub i16, pub i16, pub i16, pub i16, pub i16, pub i16,
	pub i16, pub i16, pub i16, pub i16, pub i16, pub i16, pub i16, pub i16,
);
5104
/// A 128-bit SIMD vector with 8 elements of type [`u16`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct u16x8(
	pub u16, pub u16, pub u16, pub u16, pub u16, pub u16, pub u16, pub u16,
);
/// A 256-bit SIMD vector with 16 elements of type [`u16`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct u16x16(
	pub u16, pub u16, pub u16, pub u16, pub u16, pub u16, pub u16, pub u16,
	pub u16, pub u16, pub u16, pub u16, pub u16, pub u16, pub u16, pub u16,
);
/// A 512-bit SIMD vector with 32 elements of type [`u16`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct u16x32(
	pub u16, pub u16, pub u16, pub u16, pub u16, pub u16, pub u16, pub u16,
	pub u16, pub u16, pub u16, pub u16, pub u16, pub u16, pub u16, pub u16,
	pub u16, pub u16, pub u16, pub u16, pub u16, pub u16, pub u16, pub u16,
	pub u16, pub u16, pub u16, pub u16, pub u16, pub u16, pub u16, pub u16,
);
5176
5177/// A 128-bit SIMD vector with 8 elements of type [`m16`].
5178#[derive(Debug, Copy, Clone, PartialEq, Eq)]
5179#[repr(C)]
5180pub struct m16x8(
5181	pub m16,
5182	pub m16,
5183	pub m16,
5184	pub m16,
5185	pub m16,
5186	pub m16,
5187	pub m16,
5188	pub m16,
5189);
5190/// A 256-bit SIMD vector with 16 elements of type [`m16`].
5191#[derive(Debug, Copy, Clone, PartialEq, Eq)]
5192#[repr(C)]
5193pub struct m16x16(
5194	pub m16,
5195	pub m16,
5196	pub m16,
5197	pub m16,
5198	pub m16,
5199	pub m16,
5200	pub m16,
5201	pub m16,
5202	pub m16,
5203	pub m16,
5204	pub m16,
5205	pub m16,
5206	pub m16,
5207	pub m16,
5208	pub m16,
5209	pub m16,
5210);
5211
/// A 128-bit SIMD vector with 4 elements of type [`f32`].
#[derive(Debug, Copy, Clone, PartialEq)]
#[repr(C)]
pub struct f32x4(pub f32, pub f32, pub f32, pub f32);
/// A 256-bit SIMD vector with 8 elements of type [`f32`].
#[derive(Debug, Copy, Clone, PartialEq)]
#[repr(C)]
#[rustfmt::skip]
pub struct f32x8(
	pub f32, pub f32, pub f32, pub f32, pub f32, pub f32, pub f32, pub f32,
);
/// A 512-bit SIMD vector with 16 elements of type [`f32`].
#[derive(Debug, Copy, Clone, PartialEq)]
#[repr(C)]
#[rustfmt::skip]
pub struct f32x16(
	pub f32, pub f32, pub f32, pub f32, pub f32, pub f32, pub f32, pub f32,
	pub f32, pub f32, pub f32, pub f32, pub f32, pub f32, pub f32, pub f32,
);
5250
/// A 128-bit SIMD vector with 4 elements of type [`i32`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
pub struct i32x4(pub i32, pub i32, pub i32, pub i32);
/// A 256-bit SIMD vector with 8 elements of type [`i32`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct i32x8(
	pub i32, pub i32, pub i32, pub i32, pub i32, pub i32, pub i32, pub i32,
);
/// A 512-bit SIMD vector with 16 elements of type [`i32`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct i32x16(
	pub i32, pub i32, pub i32, pub i32, pub i32, pub i32, pub i32, pub i32,
	pub i32, pub i32, pub i32, pub i32, pub i32, pub i32, pub i32, pub i32,
);
5289
/// A 128-bit SIMD vector with 4 elements of type [`u32`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
pub struct u32x4(pub u32, pub u32, pub u32, pub u32);
/// A 256-bit SIMD vector with 8 elements of type [`u32`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct u32x8(
	pub u32, pub u32, pub u32, pub u32, pub u32, pub u32, pub u32, pub u32,
);
/// A 512-bit SIMD vector with 16 elements of type [`u32`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct u32x16(
	pub u32, pub u32, pub u32, pub u32, pub u32, pub u32, pub u32, pub u32,
	pub u32, pub u32, pub u32, pub u32, pub u32, pub u32, pub u32, pub u32,
);
5328
5329/// A 128-bit SIMD vector with 4 elements of type [`m32`].
5330#[derive(Debug, Copy, Clone, PartialEq, Eq)]
5331#[repr(C)]
5332pub struct m32x4(pub m32, pub m32, pub m32, pub m32);
5333/// A 256-bit SIMD vector with 8 elements of type [`m32`].
5334#[derive(Debug, Copy, Clone, PartialEq, Eq)]
5335#[repr(C)]
5336pub struct m32x8(
5337	pub m32,
5338	pub m32,
5339	pub m32,
5340	pub m32,
5341	pub m32,
5342	pub m32,
5343	pub m32,
5344	pub m32,
5345);
5346/// A 512-bit SIMD vector with 16 elements of type [`m32`].
5347#[derive(Debug, Copy, Clone, PartialEq, Eq)]
5348#[repr(C)]
5349pub struct m32x16(
5350	pub m32,
5351	pub m32,
5352	pub m32,
5353	pub m32,
5354	pub m32,
5355	pub m32,
5356	pub m32,
5357	pub m32,
5358	pub m32,
5359	pub m32,
5360	pub m32,
5361	pub m32,
5362	pub m32,
5363	pub m32,
5364	pub m32,
5365	pub m32,
5366);
5367
/// A 128-bit SIMD vector with 2 elements of type [`f64`].
#[derive(Debug, Copy, Clone, PartialEq)]
#[repr(C)]
pub struct f64x2(pub f64, pub f64);
/// A 256-bit SIMD vector with 4 elements of type [`f64`].
#[derive(Debug, Copy, Clone, PartialEq)]
#[repr(C)]
pub struct f64x4(pub f64, pub f64, pub f64, pub f64);
/// A 512-bit SIMD vector with 8 elements of type [`f64`].
#[derive(Debug, Copy, Clone, PartialEq)]
#[repr(C)]
#[rustfmt::skip]
pub struct f64x8(
	pub f64, pub f64, pub f64, pub f64, pub f64, pub f64, pub f64, pub f64,
);
5389
/// A 128-bit SIMD vector with 2 elements of type [`i64`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
pub struct i64x2(pub i64, pub i64);
/// A 256-bit SIMD vector with 4 elements of type [`i64`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
pub struct i64x4(pub i64, pub i64, pub i64, pub i64);
/// A 512-bit SIMD vector with 8 elements of type [`i64`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct i64x8(
	pub i64, pub i64, pub i64, pub i64, pub i64, pub i64, pub i64, pub i64,
);
5411
/// A 128-bit SIMD vector with 2 elements of type [`u64`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
pub struct u64x2(pub u64, pub u64);
/// A 256-bit SIMD vector with 4 elements of type [`u64`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
pub struct u64x4(pub u64, pub u64, pub u64, pub u64);
/// A 512-bit SIMD vector with 8 elements of type [`u64`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
#[rustfmt::skip]
pub struct u64x8(
	pub u64, pub u64, pub u64, pub u64, pub u64, pub u64, pub u64, pub u64,
);
5433
5434/// A 128-bit SIMD vector with 2 elements of type [`m64`].
5435#[derive(Debug, Copy, Clone, PartialEq, Eq)]
5436#[repr(C)]
5437pub struct m64x2(pub m64, pub m64);
5438/// A 256-bit SIMD vector with 4 elements of type [`m64`].
5439#[derive(Debug, Copy, Clone, PartialEq, Eq)]
5440#[repr(C)]
5441pub struct m64x4(pub m64, pub m64, pub m64, pub m64);
5442/// A 512-bit SIMD vector with 8 elements of type [`m64`].
5443#[derive(Debug, Copy, Clone, PartialEq, Eq)]
5444#[repr(C)]
5445pub struct m64x8(
5446	pub m64,
5447	pub m64,
5448	pub m64,
5449	pub m64,
5450	pub m64,
5451	pub m64,
5452	pub m64,
5453	pub m64,
5454);
5455
// SAFETY: the mask and bitmask types wrap a single unsigned integer (see their
// definitions earlier in this file — TODO(review): confirm they stay plain
// single-field wrappers), so they contain no padding and every bit pattern,
// including all zeros, is a valid value — which is what `Zeroable` and `Pod`
// require.
unsafe impl Zeroable for m8 {}
unsafe impl Zeroable for m16 {}
unsafe impl Zeroable for m32 {}
unsafe impl Zeroable for m64 {}
unsafe impl Pod for m8 {}
unsafe impl Pod for m16 {}
unsafe impl Pod for m32 {}
unsafe impl Pod for m64 {}

unsafe impl Zeroable for b8 {}
unsafe impl Pod for b8 {}
unsafe impl Zeroable for b16 {}
unsafe impl Pod for b16 {}
unsafe impl Zeroable for b32 {}
unsafe impl Pod for b32 {}
unsafe impl Zeroable for b64 {}
unsafe impl Pod for b64 {}

// SAFETY (all vector impls below): the vector types are `#[repr(C)]` tuple
// structs whose fields are all `Pod` element types of uniform size, so they
// have no padding and accept any bit pattern.
unsafe impl Zeroable for i8x16 {}
unsafe impl Zeroable for i8x32 {}
unsafe impl Zeroable for i8x64 {}
unsafe impl Pod for i8x16 {}
unsafe impl Pod for i8x32 {}
unsafe impl Pod for i8x64 {}
unsafe impl Zeroable for u8x16 {}
unsafe impl Zeroable for u8x32 {}
unsafe impl Zeroable for u8x64 {}
unsafe impl Pod for u8x16 {}
unsafe impl Pod for u8x32 {}
unsafe impl Pod for u8x64 {}
unsafe impl Zeroable for m8x16 {}
unsafe impl Zeroable for m8x32 {}
unsafe impl Pod for m8x16 {}
unsafe impl Pod for m8x32 {}

unsafe impl Zeroable for i16x8 {}
unsafe impl Zeroable for i16x16 {}
unsafe impl Zeroable for i16x32 {}
unsafe impl Pod for i16x8 {}
unsafe impl Pod for i16x16 {}
unsafe impl Pod for i16x32 {}
unsafe impl Zeroable for u16x8 {}
unsafe impl Zeroable for u16x16 {}
unsafe impl Zeroable for u16x32 {}
unsafe impl Pod for u16x8 {}
unsafe impl Pod for u16x16 {}
unsafe impl Pod for u16x32 {}
unsafe impl Zeroable for m16x8 {}
unsafe impl Zeroable for m16x16 {}
unsafe impl Pod for m16x8 {}
unsafe impl Pod for m16x16 {}

unsafe impl Zeroable for f32x4 {}
unsafe impl Zeroable for f32x8 {}
unsafe impl Zeroable for f32x16 {}
unsafe impl Pod for f32x4 {}
unsafe impl Pod for f32x8 {}
unsafe impl Pod for f32x16 {}
unsafe impl Zeroable for i32x4 {}
unsafe impl Zeroable for i32x8 {}
unsafe impl Zeroable for i32x16 {}
unsafe impl Pod for i32x4 {}
unsafe impl Pod for i32x8 {}
unsafe impl Pod for i32x16 {}
unsafe impl Zeroable for u32x4 {}
unsafe impl Zeroable for u32x8 {}
unsafe impl Zeroable for u32x16 {}
unsafe impl Pod for u32x4 {}
unsafe impl Pod for u32x8 {}
unsafe impl Pod for u32x16 {}
unsafe impl Zeroable for m32x4 {}
unsafe impl Zeroable for m32x8 {}
unsafe impl Zeroable for m32x16 {}
unsafe impl Pod for m32x4 {}
unsafe impl Pod for m32x8 {}
unsafe impl Pod for m32x16 {}

unsafe impl Zeroable for f64x2 {}
unsafe impl Zeroable for f64x4 {}
unsafe impl Zeroable for f64x8 {}
unsafe impl Pod for f64x2 {}
unsafe impl Pod for f64x4 {}
unsafe impl Pod for f64x8 {}
unsafe impl Zeroable for i64x2 {}
unsafe impl Zeroable for i64x4 {}
unsafe impl Zeroable for i64x8 {}
unsafe impl Pod for i64x2 {}
unsafe impl Pod for i64x4 {}
unsafe impl Pod for i64x8 {}
unsafe impl Zeroable for u64x2 {}
unsafe impl Zeroable for u64x4 {}
unsafe impl Zeroable for u64x8 {}
unsafe impl Pod for u64x2 {}
unsafe impl Pod for u64x4 {}
unsafe impl Pod for u64x8 {}
unsafe impl Zeroable for m64x2 {}
unsafe impl Zeroable for m64x4 {}
unsafe impl Zeroable for m64x8 {}
unsafe impl Pod for m64x2 {}
unsafe impl Pod for m64x4 {}
unsafe impl Pod for m64x8 {}
5557
// Builds a `[$T; 32]` array where every 32-bit word of element `i` holds the
// value `i`, writing the pattern through `u32` pointers.
//
// NOTE(review): assumes `size_of::<$T>()` is a nonzero multiple of
// `size_of::<u32>()` — TODO confirm against the types this is instantiated
// with; if the size is smaller than a `u32`, the element would be left
// uninitialized.
macro_rules! iota_32 {
	($T: ty) => {{
		// Uninitialized storage for the `[$T; 32]` result.
		let mut iota = core::mem::MaybeUninit::uninit();
		unsafe {
			{
				// Reborrow the uninit array as an array of uninit elements so
				// each element can be addressed and written individually.
				let iota =
					&mut *((&mut iota) as *mut MaybeUninit<[$T; 32]> as *mut [MaybeUninit<$T>; 32]);
				let mut i = 0;
				while i < 32 {
					let v = (&mut iota[i]) as *mut _ as *mut u32;

					// Fill every 32-bit word of element `i` with the value `i`.
					let mut j = 0;
					while j < core::mem::size_of::<$T>() / core::mem::size_of::<u32>() {
						v.add(j).write_unaligned(i as u32);
						j += 1;
					}

					i += 1;
				}
			}
			// SAFETY: the loops above wrote every word of all 32 elements
			// (given the size assumption noted at the top).
			iota.assume_init()
		}
	}};
}
5582
// Same as `iota_32!`, but writes the index pattern through 64-bit words: every
// `u64` word of element `i` holds the value `i`.
//
// NOTE(review): assumes `size_of::<$T>()` is a nonzero multiple of
// `size_of::<u64>()` — TODO confirm against the types this is instantiated
// with.
macro_rules! iota_64 {
	($T: ty) => {{
		// Uninitialized storage for the `[$T; 32]` result.
		let mut iota = core::mem::MaybeUninit::uninit();
		unsafe {
			{
				// Reborrow the uninit array as an array of uninit elements so
				// each element can be addressed and written individually.
				let iota =
					&mut *((&mut iota) as *mut MaybeUninit<[$T; 32]> as *mut [MaybeUninit<$T>; 32]);
				let mut i = 0;
				while i < 32 {
					let v = (&mut iota[i]) as *mut _ as *mut u64;

					// Fill every 64-bit word of element `i` with the value `i`.
					let mut j = 0;
					while j < core::mem::size_of::<$T>() / core::mem::size_of::<u64>() {
						v.add(j).write_unaligned(i as u64);
						j += 1;
					}

					i += 1;
				}
			}
			// SAFETY: the loops above wrote every word of all 32 elements
			// (given the size assumption noted at the top).
			iota.assume_init()
		}
	}};
}
5607
/// Returns a `[T; 32]` array whose `i`-th element has every 32-bit word equal
/// to `i` (see `iota_32!`). `const` version, compiled in when the
/// `libpulp_const` cfg is set — presumably by the build script; TODO confirm.
#[cfg(libpulp_const)]
pub const fn iota_32<T: Interleave>() -> [T; 32] {
	iota_32!(T)
}
/// Returns a `[T; 32]` array whose `i`-th element has every 64-bit word equal
/// to `i` (see `iota_64!`). `const` version, gated on `libpulp_const`.
#[cfg(libpulp_const)]
pub const fn iota_64<T: Interleave>() -> [T; 32] {
	iota_64!(T)
}

/// Non-`const` fallback of `iota_32` with an identical body, used when the
/// `libpulp_const` cfg is not set.
#[cfg(not(libpulp_const))]
pub fn iota_32<T: Interleave>() -> [T; 32] {
	iota_32!(T)
}
/// Non-`const` fallback of `iota_64` with an identical body, used when the
/// `libpulp_const` cfg is not set.
#[cfg(not(libpulp_const))]
pub fn iota_64<T: Interleave>() -> [T; 32] {
	iota_64!(T)
}
5625
#[cfg(target_arch = "x86_64")]
#[cfg(test)]
mod tests {
	use super::*;

	// Round-trips (de)interleaving of `f64x4` registers through the fallback
	// paths, using `x86::V3` SIMD arithmetic to verify the layout: after
	// deinterleaving, register `k` should equal register 0 plus the constant
	// lane offset `0.k`.
	#[test]
	fn test_interleave() {
		// Silently skipped when the required x86 feature set is unavailable.
		if let Some(simd) = x86::V3::try_new() {
			{
				// Two registers of interleaved pairs: element j of register k
				// is `j.k`-style data with stride 2.
				let src = [f64x4(0.0, 0.1, 1.0, 1.1), f64x4(2.0, 2.1, 3.0, 3.1)];
				let dst = unsafe { deinterleave_fallback::<f64, f64x4, [f64x4; 2]>(src) };
				assert_eq!(dst[1], simd.add_f64x4(dst[0], simd.splat_f64x4(0.1)));
				// Interleaving must be the exact inverse of deinterleaving.
				assert_eq!(src, unsafe {
					interleave_fallback::<f64, f64x4, [f64x4; 2]>(dst)
				});
			}
			{
				// Same check with interleaved groups of four.
				let src = [
					f64x4(0.0, 0.1, 0.2, 0.3),
					f64x4(1.0, 1.1, 1.2, 1.3),
					f64x4(2.0, 2.1, 2.2, 2.3),
					f64x4(3.0, 3.1, 3.2, 3.3),
				];
				let dst = unsafe { deinterleave_fallback::<f64, f64x4, [f64x4; 4]>(src) };
				assert_eq!(dst[1], simd.add_f64x4(dst[0], simd.splat_f64x4(0.1)));
				assert_eq!(dst[2], simd.add_f64x4(dst[0], simd.splat_f64x4(0.2)));
				assert_eq!(dst[3], simd.add_f64x4(dst[0], simd.splat_f64x4(0.3)));
				assert_eq!(src, unsafe {
					interleave_fallback::<f64, f64x4, [f64x4; 4]>(dst)
				});
			}
		}
	}
}