8#if defined(__SSE__) || defined(__SSE2__)
11#if defined(__ARM_NEON)
22 const __m128 VECTOR4_MASK_X = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
23 const __m128 VECTOR4_MASK_Y = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
24 const __m128 VECTOR4_MASK_Z = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
25 const __m128 VECTOR4_MASK_W = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
27 inline __m128 _mm_cross3_ps(__m128 a, __m128 b) {
28 __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
29 __m128 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
30 __m128 c = _mm_sub_ps(
34 return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
37 inline __m128 _mm_normalize3_ps(__m128 v) {
38 __m128 dp = _mm_dp_ps(v, v, 0x7F);
39 __m128 rsqrt = _mm_rsqrt_ps(dp);
40 return _mm_mul_ps(v, rsqrt);
42#elif defined(__ARM_NEON)
43 const float32x4_t VECTOR4_MASK_X = (float32x4_t){ 1.0f, 0.0f, 0.0f, 0.0f };
44 const float32x4_t VECTOR4_MASK_Y = (float32x4_t){ 0.0f, 1.0f, 0.0f, 0.0f };
45 const float32x4_t VECTOR4_MASK_Z = (float32x4_t){ 0.0f, 0.0f, 1.0f, 0.0f };
47 inline float32x4_t vcrossq_f32(float32x4_t a, float32x4_t b)
52 static const uint8x16_t yzxw = {4,5,6,7, 8,9,10,11, 0,1,2,3, 12,13,14,15};
54 const float32x4_t a_yzx = vreinterpretq_f32_u8(vqtbl1q_u8(vreinterpretq_u8_f32(a), yzxw));
55 const float32x4_t b_yzx = vreinterpretq_f32_u8(vqtbl1q_u8(vreinterpretq_u8_f32(b), yzxw));
58 const float32x4_t c = vsubq_f32(vmulq_f32(a, b_yzx), vmulq_f32(a_yzx, b));
61 return vreinterpretq_f32_u8(vqtbl1q_u8(vreinterpretq_u8_f32(c), yzxw));
64 inline float32x4_t neon_normalize3(float32x4_t v)
66 float32x4_t dp = vmulq_f32(v, v);
67 const float sum = vgetq_lane_f32(dp, 0) + vgetq_lane_f32(dp, 1) + vgetq_lane_f32(dp, 2);
68 const float invLen = 1.0f / std::sqrt(sum);
69 return vmulq_n_f32(v, invLen);
72 inline float vdotq3_f32(float32x4_t a, float32x4_t b)
74 const float32x4_t mul = vmulq_f32(a, b);
75 return vgetq_lane_f32(mul, 0) + vgetq_lane_f32(mul, 1) + vgetq_lane_f32(mul, 2);
78 inline void neon_transpose4x4(
79 const float32x4_t row0,
const float32x4_t row1,
const float32x4_t row2,
const float32x4_t row3,
80 float32x4_t& col0, float32x4_t& col1, float32x4_t& col2, float32x4_t& col3)
84 const float32x4x2_t t01 = vtrnq_f32(row0, row1);
85 const float32x4x2_t t23 = vtrnq_f32(row2, row3);
89 col0 = vcombine_f32(vget_low_f32(t01.val[0]), vget_low_f32(t23.val[0]));
90 col1 = vcombine_f32(vget_low_f32(t01.val[1]), vget_low_f32(t23.val[1]));
91 col2 = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0]));
92 col3 = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1]));
94#elif defined(__APPLE__)