pulp/
x86.rs

1use super::*;
2use crate::core_arch::x86::Avx2;
3
4#[cfg(target_arch = "x86")]
5use core::arch::x86::*;
6#[cfg(target_arch = "x86_64")]
7use core::arch::x86_64::*;
8
9mod v2;
10mod v3;
11
12#[cfg(feature = "nightly")]
13#[cfg_attr(docsrs, doc(cfg(feature = "nightly")))]
14mod v4;
15
16pub use v2::*;
17pub use v3::*;
18
19#[cfg(feature = "nightly")]
20#[cfg_attr(docsrs, doc(cfg(feature = "nightly")))]
21pub use v4::*;
22
#[target_feature(enable = "avx,avx2")]
#[inline]
/// Loads 8 `u32`s by jumping through the out-of-line asm stub `f`.
///
/// This uses a bespoke register convention rather than the C ABI (the
/// `extern "C" fn()` type is only a convenient function-pointer type here):
/// the source pointer is passed in `rax`/`eax`, the return address is
/// materialized into `rcx`/`ecx` for the stub to jump back to, the loaded
/// vector is produced in `ymm0`, and `ymm1` is clobbered as scratch.
/// The stubs themselves presumably come from the generated
/// `x86_64_asm.rs` included below — TODO confirm the convention there.
///
/// # Safety
/// `f` must be a stub following the convention above, and `ptr` must be
/// valid for however many of the 8 lanes the stub actually reads.
unsafe fn avx_ld_u32s(ptr: *const u32, f: unsafe extern "C" fn()) -> u32x8 {
	let ret: __m256;
	#[cfg(target_arch = "x86_64")]
	core::arch::asm! {
		// Compute the address of local label `2:` (the instruction after the
		// jump) so the stub can return to it via `rcx`.
		"lea rcx, [rip + 2f]",
		"jmp {f}",
		"2:",
		f = in(reg) f,
		in("rax") ptr,
		out("rcx") _,
		out("ymm0") ret,
		out("ymm1") _,
	};

	#[cfg(target_arch = "x86")]
	core::arch::asm! {
		// NOTE(review): 32-bit x86 has no EIP-relative addressing mode, so
		// `lea ecx, [eip + 2f]` looks like it cannot assemble — confirm this
		// path is ever built/tested on 32-bit targets.
		"lea ecx, [eip + 2f]",
		"jmp {f}",
		"2:",
		f = in(reg) f,
		in("eax") ptr,
		out("ecx") _,
		out("ymm0") ret,
		out("ymm1") _,
	};

	cast!(ret)
}
53
#[target_feature(enable = "avx,avx2")]
#[inline]
/// Stores 8 `u32`s by jumping through the out-of-line asm stub `f`.
///
/// Counterpart of `avx_ld_u32s`, with the same bespoke register convention:
/// destination pointer in `rax`/`eax`, return address in `rcx`/`ecx`, the
/// value to store passed in `ymm0` (consumed), and `ymm1` clobbered.
///
/// # Safety
/// `f` must be a stub following the convention above, and `ptr` must be
/// valid for however many of the 8 lanes the stub actually writes.
unsafe fn avx_st_u32s(ptr: *mut u32, value: u32x8, f: unsafe extern "C" fn()) {
	#[cfg(target_arch = "x86_64")]
	core::arch::asm! {
		// Compute the address of local label `2:` so the stub can return.
		"lea rcx, [rip + 2f]",
		"jmp {f}",
		"2:",
		f = in(reg) f,

		in("rax") ptr,
		out("rcx") _,
		inout("ymm0") cast::<_, __m256>(value) => _,
		out("ymm1") _,
	};

	#[cfg(target_arch = "x86")]
	core::arch::asm! {
		// NOTE(review): same concern as the load path — 32-bit x86 has no
		// EIP-relative addressing, so `[eip + 2f]` looks invalid; confirm.
		"lea ecx, [eip + 2f]",
		"jmp {f}",
		"2:",
		f = in(reg) f,

		in("eax") ptr,
		out("ecx") _,
		inout("ymm0") cast::<_, __m256>(value) => _,
		out("ymm1") _,
	};
}
83
/// x86 arch
///
/// Available instruction-set levels, from weakest to strongest. Marked
/// `#[non_exhaustive]` so more levels can be added without a breaking change.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
#[repr(u8)]
pub enum Arch {
	/// No vector extensions assumed; operations fall back to scalar code.
	Scalar = 0,

	/// `V3` instruction set (see the `v3` module), gated on the `x86-v3` feature.
	#[cfg(feature = "x86-v3")]
	#[cfg_attr(docsrs, doc(cfg(feature = "x86-v3")))]
	V3(V3) = 1,

	/// `V4` instruction set (see the `v4` module), nightly-only.
	#[cfg(feature = "nightly-x86-v4")]
	#[cfg_attr(docsrs, doc(cfg(feature = "nightly-x86-v4")))]
	V4(V4) = 2,
}
99
impl Arch {
	/// Detects the best available instruction set.
	#[inline]
	pub fn new() -> Self {
		// Probe from strongest to weakest so the best supported level wins.
		#[cfg(feature = "nightly-x86-v4")]
		if let Some(simd) = V4::try_new() {
			return Self::V4(simd);
		}
		#[cfg(feature = "x86-v3")]
		if let Some(simd) = V3::try_new() {
			return Self::V3(simd);
		}
		Self::Scalar
	}

	/// Dispatches `op` to the instruction set selected by this `Arch` value.
	#[inline(always)]
	pub fn dispatch<Op: WithSimd>(self, op: Op) -> Op::Output {
		match self {
			#[cfg(feature = "nightly-x86-v4")]
			Arch::V4(simd) => Simd::vectorize(simd, op),
			#[cfg(feature = "x86-v3")]
			Arch::V3(simd) => Simd::vectorize(simd, op),

			Arch::Scalar => Simd::vectorize(Scalar, op),
		}
	}
}
128
129impl Default for Arch {
130	#[inline]
131	fn default() -> Self {
132		Self::new()
133	}
134}
135
136include!(concat!(env!("OUT_DIR"), "/x86_64_asm.rs"));
137
138#[cfg(test)]
139mod tests {
140	extern crate alloc;
141
142	use super::*;
143	use alloc::vec;
144	use alloc::vec::Vec;
145	use assert_approx_eq::assert_approx_eq;
146	use core::iter::zip;
147	use rand::random;
148
	#[allow(unused_macros)]
	// Like `std::dbg!`, but formats the value with `{:#X?}` (alternate
	// uppercase hex), which reads better for lane masks and bit patterns.
	macro_rules! dbgx {
        () => {
            ::std::eprintln!("[{}:{}]", ::std::file!(), ::std::line!())
        };
        ($val:expr $(,)?) => {
            match $val {
                tmp => {
                    ::std::eprintln!("[{}:{}] {} = {:#X?}",
                        ::std::file!(), ::std::line!(), ::std::stringify!($val), &tmp);
                    tmp
                }
            }
        };
        ($($val:expr),+ $(,)?) => {
            ($(dbgx!($val)),+,)
        };
    }
167
168	#[test]
169	fn times_two() {
170		let n = 1312;
171		let mut v = (0..n).map(|i| i as f64).collect::<Vec<_>>();
172		let arch = Arch::new();
173
174		struct TimesThree<'a>(&'a mut [f64]);
175		impl WithSimd for TimesThree<'_> {
176			type Output = ();
177
178			#[inline(always)]
179			fn with_simd<S: Simd>(self, simd: S) -> Self::Output {
180				let v = self.0;
181				let (head, tail) = S::as_mut_simd_f64s(v);
182
183				let three = simd.splat_f64s(3.0);
184				for x in head {
185					*x = simd.mul_f64s(three, *x);
186				}
187
188				for x in tail {
189					*x *= 3.0;
190				}
191			}
192		}
193
194		arch.dispatch(|| {
195			for x in &mut v {
196				*x *= 2.0;
197			}
198		});
199
200		arch.dispatch(TimesThree(&mut v));
201
202		for (i, x) in v.into_iter().enumerate() {
203			assert_eq!(x, 6.0 * i as f64);
204		}
205	}
206
	#[test]
	fn cplx_ops() {
		// Checks the SIMD complex ops (mul, conjugate-mul, and their
		// multiply-add variants) against scalar `c32` arithmetic on random
		// inputs, for V3 and — on nightly — V4.
		let n = 16;
		let a = (0..n)
			.map(|_| c32 {
				re: random(),
				im: random(),
			})
			.collect::<Vec<_>>();
		let b = (0..n)
			.map(|_| c32 {
				re: random(),
				im: random(),
			})
			.collect::<Vec<_>>();
		let c = (0..n)
			.map(|_| c32 {
				re: random(),
				im: random(),
			})
			.collect::<Vec<_>>();

		// Scalar reference results.
		let axb_target = zip(&a, &b).map(|(a, b)| a * b).collect::<Vec<_>>();
		let conjaxb_target = zip(&a, &b).map(|(a, b)| a.conj() * b).collect::<Vec<_>>();
		let axbpc_target = zip(zip(&a, &b), &c)
			.map(|((a, b), c)| a * b + c)
			.collect::<Vec<_>>();
		let conjaxbpc_target = zip(zip(&a, &b), &c)
			.map(|((a, b), c)| a.conj() * b + c)
			.collect::<Vec<_>>();

		if let Some(simd) = V3::try_new() {
			let mut axb = vec![c32::new(0.0, 0.0); n];
			let mut conjaxb = vec![c32::new(0.0, 0.0); n];
			let mut axbpc = vec![c32::new(0.0, 0.0); n];
			let mut conjaxbpc = vec![c32::new(0.0, 0.0); n];

			{
				// Reinterpret the scalar slices as SIMD registers; only the
				// full-register head (`.0`) is used.
				let a = V3::as_simd_c32s(&a).0;
				let b = V3::as_simd_c32s(&b).0;
				let c = V3::as_simd_c32s(&c).0;
				let axb = V3::as_mut_simd_c32s(&mut axb).0;
				let conjaxb = V3::as_mut_simd_c32s(&mut conjaxb).0;
				let axbpc = V3::as_mut_simd_c32s(&mut axbpc).0;
				let conjaxbpc = V3::as_mut_simd_c32s(&mut conjaxbpc).0;

				for (axb, (a, b)) in zip(axb, zip(a, b)) {
					*axb = simd.mul_e_c32s(*a, *b);
				}
				for (conjaxb, (a, b)) in zip(conjaxb, zip(a, b)) {
					*conjaxb = simd.conj_mul_e_c32s(*a, *b);
				}
				for (axbpc, ((a, b), c)) in zip(axbpc, zip(zip(a, b), c)) {
					*axbpc = simd.mul_add_e_c32s(*a, *b, *c);
				}
				for (conjaxbpc, ((a, b), c)) in zip(conjaxbpc, zip(zip(a, b), c)) {
					*conjaxbpc = simd.conj_mul_add_e_c32s(*a, *b, *c);
				}
			}

			// Approximate comparison: the SIMD implementation may round
			// differently (e.g. via FMA) from the scalar reference.
			for (target, actual) in zip(&axb_target, &axb) {
				assert_approx_eq!(target.re, actual.re);
				assert_approx_eq!(target.im, actual.im);
			}
			for (target, actual) in zip(&conjaxb_target, &conjaxb) {
				assert_approx_eq!(target.re, actual.re);
				assert_approx_eq!(target.im, actual.im);
			}
			for (target, actual) in zip(&axbpc_target, &axbpc) {
				assert_approx_eq!(target.re, actual.re);
				assert_approx_eq!(target.im, actual.im);
			}
			for (target, actual) in zip(&conjaxbpc_target, &conjaxbpc) {
				assert_approx_eq!(target.re, actual.re);
				assert_approx_eq!(target.im, actual.im);
			}
		}

		// Same check against the V4 (AVX-512 class, nightly-only) backend.
		#[cfg(feature = "nightly")]
		if let Some(simd) = V4::try_new() {
			let mut axb = vec![c32::new(0.0, 0.0); n];
			let mut conjaxb = vec![c32::new(0.0, 0.0); n];
			let mut axbpc = vec![c32::new(0.0, 0.0); n];
			let mut conjaxbpc = vec![c32::new(0.0, 0.0); n];

			{
				let a = V4::as_simd_c32s(&a).0;
				let b = V4::as_simd_c32s(&b).0;
				let c = V4::as_simd_c32s(&c).0;
				let axb = V4::as_mut_simd_c32s(&mut axb).0;
				let conjaxb = V4::as_mut_simd_c32s(&mut conjaxb).0;
				let axbpc = V4::as_mut_simd_c32s(&mut axbpc).0;
				let conjaxbpc = V4::as_mut_simd_c32s(&mut conjaxbpc).0;

				for (axb, (a, b)) in zip(axb, zip(a, b)) {
					*axb = simd.mul_e_c32s(*a, *b);
				}
				for (conjaxb, (a, b)) in zip(conjaxb, zip(a, b)) {
					*conjaxb = simd.conj_mul_e_c32s(*a, *b);
				}
				for (axbpc, ((a, b), c)) in zip(axbpc, zip(zip(a, b), c)) {
					*axbpc = simd.mul_add_e_c32s(*a, *b, *c);
				}
				for (conjaxbpc, ((a, b), c)) in zip(conjaxbpc, zip(zip(a, b), c)) {
					*conjaxbpc = simd.conj_mul_add_e_c32s(*a, *b, *c);
				}
			}

			for (target, actual) in zip(&axb_target, &axb) {
				assert_approx_eq!(target.re, actual.re);
				assert_approx_eq!(target.im, actual.im);
			}
			for (target, actual) in zip(&conjaxb_target, &conjaxb) {
				assert_approx_eq!(target.re, actual.re);
				assert_approx_eq!(target.im, actual.im);
			}
			for (target, actual) in zip(&axbpc_target, &axbpc) {
				assert_approx_eq!(target.re, actual.re);
				assert_approx_eq!(target.im, actual.im);
			}
			for (target, actual) in zip(&conjaxbpc_target, &conjaxbpc) {
				assert_approx_eq!(target.re, actual.re);
				assert_approx_eq!(target.im, actual.im);
			}
		}
	}
333
334	#[test]
335	fn test_to_ref() {
336		let simd_ref = unsafe { V2::new_unchecked() }.to_ref();
337		let _ = *simd_ref;
338	}
339
340	#[test]
341	fn test_widening_mul_u32x4() {
342		if let Some(simd) = V2::try_new() {
343			const N: usize = 4;
344			let a = u32x4(2298413717, 568259975, 2905436181, 175547995);
345			let b = u32x4(2022374205, 1446824162, 3165580604, 3011091403);
346			let a_array: [u32; N] = cast!(a);
347			let b_array: [u32; N] = cast!(b);
348			let mut lo_array = [0u32; N];
349			let mut hi_array = [0u32; N];
350
351			for i in 0..N {
352				let prod = a_array[i] as u64 * b_array[i] as u64;
353				let lo = prod as u32;
354				let hi = (prod >> 32) as u32;
355				lo_array[i] = lo;
356				hi_array[i] = hi;
357			}
358
359			let (lo, hi) = simd.widening_mul_u32x4(a, b);
360			assert_eq!(lo, cast!(lo_array));
361			assert_eq!(hi, cast!(hi_array));
362		}
363		if let Some(simd) = V3::try_new() {
364			const N: usize = 8;
365			let a = u32x8(
366				2298413717, 568259975, 2905436181, 175547995, 2298413717, 568259975, 2905436181,
367				175547995,
368			);
369			let b = u32x8(
370				2022374205, 1446824162, 3165580604, 3011091403, 2022374205, 1446824162, 3165580604,
371				3011091403,
372			);
373			let a_array: [u32; N] = cast!(a);
374			let b_array: [u32; N] = cast!(b);
375			let mut lo_array = [0u32; N];
376			let mut hi_array = [0u32; N];
377
378			for i in 0..N {
379				let prod = a_array[i] as u64 * b_array[i] as u64;
380				let lo = prod as u32;
381				let hi = (prod >> 32) as u32;
382				lo_array[i] = lo;
383				hi_array[i] = hi;
384			}
385
386			let (lo, hi) = simd.widening_mul_u32x8(a, b);
387			assert_eq!(lo, cast!(lo_array));
388			assert_eq!(hi, cast!(hi_array));
389		}
390	}
391
392	#[test]
393	fn test_widening_mul_i32() {
394		if let Some(simd) = V2::try_new() {
395			const N: usize = 4;
396			let a = cast!(u32x4(2298413717, 568259975, 2905436181, 175547995));
397			let b = cast!(u32x4(2022374205, 1446824162, 3165580604, 3011091403));
398
399			let a_array: [i32; N] = cast!(a);
400			let b_array: [i32; N] = cast!(b);
401			let mut lo_array = [0i32; N];
402			let mut hi_array = [0i32; N];
403
404			for i in 0..N {
405				let prod = a_array[i] as i64 * b_array[i] as i64;
406				let lo = prod as i32;
407				let hi = (prod >> 32) as i32;
408				lo_array[i] = lo;
409				hi_array[i] = hi;
410			}
411
412			let (lo, hi) = simd.widening_mul_i32x4(a, b);
413			assert_eq!(lo, cast!(lo_array));
414			assert_eq!(hi, cast!(hi_array));
415		}
416		if let Some(simd) = V3::try_new() {
417			const N: usize = 8;
418			let a = cast!(u32x8(
419				2298413717, 568259975, 2905436181, 175547995, 2298413717, 568259975, 2905436181,
420				175547995,
421			));
422			let b = cast!(u32x8(
423				2022374205, 1446824162, 3165580604, 3011091403, 2022374205, 1446824162, 3165580604,
424				3011091403,
425			));
426
427			let a_array: [i32; N] = cast!(a);
428			let b_array: [i32; N] = cast!(b);
429			let mut lo_array = [0i32; N];
430			let mut hi_array = [0i32; N];
431
432			for i in 0..N {
433				let prod = a_array[i] as i64 * b_array[i] as i64;
434				let lo = prod as i32;
435				let hi = (prod >> 32) as i32;
436				lo_array[i] = lo;
437				hi_array[i] = hi;
438			}
439
440			let (lo, hi) = simd.widening_mul_i32x8(a, b);
441			assert_eq!(lo, cast!(lo_array));
442			assert_eq!(hi, cast!(hi_array));
443		}
444	}
445
446	#[test]
447	fn test_shift() {
448		if let Some(simd) = V2::try_new() {
449			let a = u16x8(54911, 46958, 49991, 22366, 46365, 39572, 22704, 60060);
450			assert_eq!(simd.shl_const_u16x8::<16>(a), simd.splat_u16x8(0));
451			assert_eq!(simd.shl_u16x8(a, simd.splat_u64x2(!0)), simd.splat_u16x8(0),);
452		}
453	}
454
455	#[test]
456	fn test_abs() {
457		if let Some(simd) = V2::try_new() {
458			let a = f32x4(1.0, -2.0, -1.0, 2.0);
459			assert_eq!(simd.abs_f32x4(a), f32x4(1.0, 2.0, 1.0, 2.0));
460			let a = f64x2(1.0, -2.0);
461			assert_eq!(simd.abs_f64x2(a), f64x2(1.0, 2.0));
462		}
463	}
464
465	#[test]
466	fn test_subadd() {
467		if let Some(simd) = V2::try_new() {
468			let a = f32x4(1.0, -2.0, -1.0, 2.0);
469			assert_eq!(simd.subadd_f32x4(a, a), f32x4(0.0, -4.0, 0.0, 4.0));
470		}
471	}
472
473	#[test]
474	fn test_signed_to_unsigned() {
475		if let Some(simd) = V2::try_new() {
476			let a = i8x16(1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
477			assert_eq!(simd.convert_i8x16_to_u64x2(a), u64x2(1, !0));
478		}
479	}
480
481	#[test]
482	fn test_int_cmp() {
483		if let Some(simd) = V2::try_new() {
484			{
485				const N: usize = 16;
486
487				let a = u8x16(
488					174, 191, 248, 232, 11, 186, 42, 236, 3, 59, 223, 72, 161, 146, 98, 69,
489				);
490				let b = u8x16(
491					97, 239, 164, 173, 208, 0, 121, 247, 218, 58, 119, 131, 213, 133, 22, 128,
492				);
493				let lt = simd.cmp_lt_u8x16(a, b);
494
495				let a_array: [u8; N] = cast!(a);
496				let b_array: [u8; N] = cast!(b);
497				let mut lt_array = [m8::new(false); N];
498
499				for i in 0..N {
500					lt_array[i] = m8::new(a_array[i] < b_array[i]);
501				}
502
503				assert_eq!(lt, cast!(lt_array));
504			}
505			{
506				const N: usize = 8;
507
508				let a = u16x8(174, 191, 248, 232, 11, 186, 42, 236);
509				let b = u16x8(97, 239, 164, 173, 208, 0, 121, 247);
510				let lt = simd.cmp_lt_u16x8(a, b);
511
512				let a_array: [u16; N] = cast!(a);
513				let b_array: [u16; N] = cast!(b);
514				let mut lt_array = [m16::new(false); N];
515
516				for i in 0..N {
517					lt_array[i] = m16::new(a_array[i] < b_array[i]);
518				}
519
520				assert_eq!(lt, cast!(lt_array));
521			}
522			{
523				const N: usize = 4;
524
525				let a = u32x4(174, 191, 248, 232);
526				let b = u32x4(97, 239, 164, 173);
527				let lt = simd.cmp_lt_u32x4(a, b);
528
529				let a_array: [u32; N] = cast!(a);
530				let b_array: [u32; N] = cast!(b);
531				let mut lt_array = [m32::new(false); N];
532
533				for i in 0..N {
534					lt_array[i] = m32::new(a_array[i] < b_array[i]);
535				}
536
537				assert_eq!(lt, cast!(lt_array));
538			}
539			{
540				const N: usize = 2;
541
542				let a = u64x2(174, 191);
543				let b = u64x2(97, 239);
544				let lt = simd.cmp_lt_u64x2(a, b);
545
546				let a_array: [u64; N] = cast!(a);
547				let b_array: [u64; N] = cast!(b);
548				let mut lt_array = [m64::new(false); N];
549
550				for i in 0..N {
551					lt_array[i] = m64::new(a_array[i] < b_array[i]);
552				}
553
554				assert_eq!(lt, cast!(lt_array));
555			}
556		}
557	}
558
559	#[test]
560	fn test_is_nan() {
561		if let Some(simd) = V2::try_new() {
562			assert_eq!(
563				simd.is_nan_f32x4(f32x4(0.0, f32::NAN, f32::INFINITY, -f32::NAN)),
564				m32x4(
565					m32::new(false),
566					m32::new(true),
567					m32::new(false),
568					m32::new(true),
569				),
570			);
571			assert_eq!(
572				simd.is_nan_f64x2(f64x2(0.0, f64::NAN)),
573				m64x2(m64::new(false), m64::new(true)),
574			);
575		}
576	}
577
578	#[test]
579	fn test_rotate() {
580		if let Some(simd) = V3::try_new() {
581			for amount in 0..128 {
582				let mut array = [0u32; 8];
583				for (i, dst) in array.iter_mut().enumerate() {
584					*dst = 1000 + i as u32;
585				}
586
587				let rot: [u32; 8] = cast!(simd.rotate_right_u32s(cast!(array), amount));
588				for i in 0..8 {
589					assert_eq!(rot[(i + amount) % 8], array[i]);
590				}
591			}
592			for amount in 0..128 {
593				let mut array = [0u64; 4];
594				for (i, dst) in array.iter_mut().enumerate() {
595					*dst = 1000 + i as u64;
596				}
597
598				let rot: [u64; 4] = cast!(simd.rotate_right_u64s(cast!(array), amount));
599				for i in 0..4 {
600					assert_eq!(rot[(i + amount) % 4], array[i]);
601				}
602			}
603		}
604
605		#[cfg(feature = "nightly")]
606		if let Some(simd) = V4::try_new() {
607			for amount in 0..128 {
608				let mut array = [0u32; 16];
609				for (i, dst) in array.iter_mut().enumerate() {
610					*dst = 1000 + i as u32;
611				}
612
613				let rot: [u32; 16] = cast!(simd.rotate_right_u32s(cast!(array), amount));
614				for i in 0..16 {
615					assert_eq!(rot[(i + amount) % 16], array[i]);
616				}
617			}
618			for amount in 0..128 {
619				let mut array = [0u64; 8];
620				for (i, dst) in array.iter_mut().enumerate() {
621					*dst = 1000 + i as u64;
622				}
623
624				let rot: [u64; 8] = cast!(simd.rotate_right_u64s(cast!(array), amount));
625				for i in 0..8 {
626					assert_eq!(rot[(i + amount) % 8], array[i]);
627				}
628			}
629		}
630	}
631
632	#[test]
633	fn test_partial() {
634		if let Some(simd) = V3::try_new() {
635			for n in 0..=8 {
636				let src = core::array::from_fn::<f32, 8, _>(|i| i as _);
637				let mut dst = [0.0f32; 8];
638				let zero = dst;
639
640				assert_eq!(simd.partial_load_f32s(&src[..n]), unsafe {
641					simd.mask_load_ptr_f32s(simd.mask_between_m32s(0, n as u32), src.as_ptr())
642				});
643				{
644					let src = &src[..n];
645					let dst = &mut dst[..n];
646
647					simd.partial_store_f32s(dst, simd.partial_load_f32s(src));
648
649					assert_eq!(src, dst);
650				}
651				assert_eq!(dst[n..], zero[n..]);
652			}
653		}
654
655		#[cfg(feature = "nightly")]
656		if let Some(simd) = V4::try_new() {
657			for n in 0..=16 {
658				let src = core::array::from_fn::<f32, 16, _>(|i| i as _);
659				let mut dst = [0.0f32; 16];
660				let zero = dst;
661
662				assert_eq!(simd.partial_load_f32s(&src[..n]), unsafe {
663					simd.mask_load_ptr_f32s(simd.mask_between_m32s(0, n as u32), src.as_ptr())
664				});
665
666				{
667					let src = &src[..n];
668					let dst = &mut dst[..n];
669
670					simd.partial_store_f32s(dst, simd.partial_load_f32s(src));
671
672					assert_eq!(src, dst);
673				}
674				assert_eq!(dst[n..], zero[n..]);
675			}
676		}
677	}
678
	#[test]
	fn test_interleave() {
		// Round-trip checks for (de)interleave shuffles: interleave must be
		// the exact inverse of deinterleave, and after deinterleaving, output
		// register `k` holds the `.k` fractional parts — i.e. it equals
		// register 0 plus a splat of 0.k.
		if let Some(simd) = V3::try_new() {
			// 2-way deinterleave of f64x4.
			{
				let src = [f64x4(0.0, 0.1, 1.0, 1.1), f64x4(2.0, 2.1, 3.0, 3.1)];
				let dst = simd.deinterleave_shfl_f64s(src);
				assert_eq!(dst[1], simd.add_f64x4(dst[0], simd.splat_f64x4(0.1)));
				assert_eq!(src, simd.interleave_shfl_f64s(dst));
			}
			// 4-way deinterleave of f64x4.
			{
				let src = [
					f64x4(0.0, 0.1, 0.2, 0.3),
					f64x4(1.0, 1.1, 1.2, 1.3),
					f64x4(2.0, 2.1, 2.2, 2.3),
					f64x4(3.0, 3.1, 3.2, 3.3),
				];
				let dst = simd.deinterleave_shfl_f64s(src);
				assert_eq!(dst[1], simd.add_f64x4(dst[0], simd.splat_f64x4(0.1)));
				assert_eq!(dst[2], simd.add_f64x4(dst[0], simd.splat_f64x4(0.2)));
				assert_eq!(dst[3], simd.add_f64x4(dst[0], simd.splat_f64x4(0.3)));
				assert_eq!(src, simd.interleave_shfl_f64s(dst));
			}
			// 2-way deinterleave of f32x8.
			{
				let src = [
					f32x8(0.0, 0.1, 1.0, 1.1, 2.0, 2.1, 3.0, 3.1),
					f32x8(4.0, 4.1, 5.0, 5.1, 6.0, 6.1, 7.0, 7.1),
				];
				let dst = simd.deinterleave_shfl_f32s(src);
				assert_eq!(dst[1], simd.add_f32x8(dst[0], simd.splat_f32x8(0.1)));
				assert_eq!(src, simd.interleave_shfl_f32s(dst));
			}
			// 4-way deinterleave of f32x8.
			{
				let src = [
					f32x8(0.0, 0.1, 0.2, 0.3, 1.0, 1.1, 1.2, 1.3),
					f32x8(2.0, 2.1, 2.2, 2.3, 3.0, 3.1, 3.2, 3.3),
					f32x8(4.0, 4.1, 4.2, 4.3, 5.0, 5.1, 5.2, 5.3),
					f32x8(6.0, 6.1, 6.2, 6.3, 7.0, 7.1, 7.2, 7.3),
				];
				let dst = simd.deinterleave_shfl_f32s(src);
				assert_eq!(dst[1], simd.add_f32x8(dst[0], simd.splat_f32x8(0.1)));
				assert_eq!(dst[2], simd.add_f32x8(dst[0], simd.splat_f32x8(0.2)));
				assert_eq!(dst[3], simd.add_f32x8(dst[0], simd.splat_f32x8(0.3)));
				assert_eq!(src, simd.interleave_shfl_f32s(dst));
			}
		}
		#[cfg(feature = "nightly")]
		if let Some(simd) = V4::try_new() {
			// 2-way deinterleave of f64x8.
			{
				let src = [
					f64x8(0.0, 0.1, 1.0, 1.1, 2.0, 2.1, 3.0, 3.1),
					f64x8(4.0, 4.1, 5.0, 5.1, 6.0, 6.1, 7.0, 7.1),
				];
				let dst = simd.deinterleave_shfl_f64s(src);
				assert_eq!(dst[1], simd.add_f64x8(dst[0], simd.splat_f64x8(0.1)));
				assert_eq!(src, simd.interleave_shfl_f64s(dst));
			}

			// 3-way deinterleave of f64x8 (groups straddle register
			// boundaries since 8 is not a multiple of 3).
			{
				let src = [
					f64x8(0.0, 0.1, 0.2, 1.0, 1.1, 1.2, 2.0, 2.1),
					f64x8(2.2, 3.0, 3.1, 3.2, 4.0, 4.1, 4.2, 5.0),
					f64x8(5.1, 5.2, 6.0, 6.1, 6.2, 7.0, 7.1, 7.2),
				];
				let dst = simd.deinterleave_shfl_f64s(src);
				assert_eq!(dst[1], simd.add_f64x8(dst[0], simd.splat_f64x8(0.1)));
				assert_eq!(dst[2], simd.add_f64x8(dst[0], simd.splat_f64x8(0.2)));
				assert_eq!(src, simd.interleave_shfl_f64s(dst));
			}
			// 4-way deinterleave of f64x8.
			{
				let src = [
					f64x8(0.0, 0.1, 0.2, 0.3, 1.0, 1.1, 1.2, 1.3),
					f64x8(2.0, 2.1, 2.2, 2.3, 3.0, 3.1, 3.2, 3.3),
					f64x8(4.0, 4.1, 4.2, 4.3, 5.0, 5.1, 5.2, 5.3),
					f64x8(6.0, 6.1, 6.2, 6.3, 7.0, 7.1, 7.2, 7.3),
				];
				let dst = simd.deinterleave_shfl_f64s(src);
				assert_eq!(dst[1], simd.add_f64x8(dst[0], simd.splat_f64x8(0.1)));
				assert_eq!(dst[2], simd.add_f64x8(dst[0], simd.splat_f64x8(0.2)));
				assert_eq!(dst[3], simd.add_f64x8(dst[0], simd.splat_f64x8(0.3)));
				assert_eq!(src, simd.interleave_shfl_f64s(dst));
			}

			// 2-way deinterleave of f32x16.
			{
				let src = [
					f32x16(
						0.0, 0.1, 1.0, 1.1, 2.0, 2.1, 3.0, 3.1, 4.0, 4.1, 5.0, 5.1, 6.0, 6.1, 7.0,
						7.1,
					),
					f32x16(
						8.0, 8.1, 9.0, 9.1, 10.0, 10.1, 11.0, 11.1, 12.0, 12.1, 13.0, 13.1, 14.0,
						14.1, 15.0, 15.1,
					),
				];
				let dst = simd.deinterleave_shfl_f32s(src);
				assert_eq!(dst[1], simd.add_f32x16(dst[0], simd.splat_f32x16(0.1)));
				assert_eq!(src, simd.interleave_shfl_f32s(dst));
			}
			// 4-way deinterleave of f32x16.
			{
				let src = [
					f32x16(
						0.0, 0.1, 0.2, 0.3, 1.0, 1.1, 1.2, 1.3, 2.0, 2.1, 2.2, 2.3, 3.0, 3.1, 3.2,
						3.3,
					),
					f32x16(
						4.0, 4.1, 4.2, 4.3, 5.0, 5.1, 5.2, 5.3, 6.0, 6.1, 6.2, 6.3, 7.0, 7.1, 7.2,
						7.3,
					),
					f32x16(
						8.0, 8.1, 8.2, 8.3, 9.0, 9.1, 9.2, 9.3, 10.0, 10.1, 10.2, 10.3, 11.0, 11.1,
						11.2, 11.3,
					),
					f32x16(
						12.0, 12.1, 12.2, 12.3, 13.0, 13.1, 13.2, 13.3, 14.0, 14.1, 14.2, 14.3,
						15.0, 15.1, 15.2, 15.3,
					),
				];
				let dst = simd.deinterleave_shfl_f32s(src);
				assert_eq!(dst[1], simd.add_f32x16(dst[0], simd.splat_f32x16(0.1)));
				assert_eq!(dst[2], simd.add_f32x16(dst[0], simd.splat_f32x16(0.2)));
				assert_eq!(dst[3], simd.add_f32x16(dst[0], simd.splat_f32x16(0.3)));
				assert_eq!(src, simd.interleave_shfl_f32s(dst));
			}
		}
	}
803}