// Four-component dot product of this object with `other`, implemented once
// per SIMD backend. NOTE(review): the enclosing signature and the
// #else/#endif lines are outside this excerpt.
139#if defined(USE_SIMD_SSE)
// Lane-wise products of the two vectors.
141 __m128 mul = _mm_mul_ps(m128, other.m128);
// Horizontal add, step 1: swap neighbours inside each pair and add, giving
// [m0+m1, m0+m1, m2+m3, m2+m3].
143 __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
144 __m128 sums = _mm_add_ps(mul, shuf);
// Step 2: swap the low and high halves and add, so every lane holds the
// full four-term sum.
145 shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2));
146 sums = _mm_add_ps(sums, shuf);
// Extract lane 0 as the scalar result.
147 return _mm_cvtss_f32(sums);
148#elif defined(USE_SIMD_APPLE)
149 return simd_dot(m128, other.m128);
150#elif defined(USE_SIMD_NEON)
151 float32x4_t mul = vmulq_f32(m128, other.m128);
// Fold the high half onto the low half, then add the two remaining lanes.
152 const float32x2_t sum2 = vadd_f32(vget_low_f32(mul), vget_high_f32(mul));
153 const float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
// NOTE(review): the statement that returns `sum` is not visible here; the
// expression below is presumably the non-SIMD fallback — confirm.
156 return x * other.
x +
y * other.
y +
z * other.
z +
w * other.
w;
// Normalization fragment: each backend scales this object by the reciprocal
// of the length of its first three components. The NEON and scalar paths
// visibly leave the fourth component out of the length; the SSE and Apple
// paths are presumed to clear it in lines not shown.
// NOTE(review): the enclosing signature, the SSE definition of `xyz`, and
// several guard/return lines are outside this excerpt.
221#if defined(USE_SIMD_SSE)
// Squared components of `xyz` (lanes d0..d3).
224 __m128
dot = _mm_mul_ps(xyz, xyz);
// Swap neighbours inside each pair and add:
// sums = [d0+d1, d0+d1, d2+d3, d2+d3].
227 __m128 shuf = _mm_shuffle_ps(
dot,
dot, _MM_SHUFFLE(2, 3, 0, 1));
228 __m128 sums = _mm_add_ps(
dot, shuf);
// Lane 0 of the total needs sums[0] + sums[2], so lane 2 is broadcast before
// the scalar add. Broadcasting lane 1 would give 2*(d0+d1) and leave z*z out
// of the length, disagreeing with the NEON and scalar paths.
229 __m128 lengthSq = _mm_add_ss(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 2, 2, 2)));
// Despite its name, `len` holds the squared length; the sqrt is taken below.
231 float len = _mm_cvtss_f32(lengthSq);
// NOTE(review): no zero-length guard is visible on this path in the excerpt,
// unlike the `len > 0` checks on the Apple and NEON paths — confirm.
234 float invLen = 1.0f / std::sqrt(len);
// All four lanes of m128 are scaled, including the fourth.
235 __m128 scale = _mm_set1_ps(invLen);
236 __m128 scaled = _mm_mul_ps(m128, scale);
// Spill to a 16-byte-aligned array so the lanes can be returned by value.
237 alignas(16)
float out[4];
238 _mm_store_ps(out, scaled);
239 return {out[0], out[1], out[2], out[3]};
242#elif defined(USE_SIMD_APPLE)
243 simd_float4 xyz = m128;
// Only scale when the length is strictly positive.
245 if (
float len = simd_length(xyz); len > 0.0f)
247 float invLen = 1.0f / len;
251#elif defined(USE_SIMD_NEON)
// Copy the vector and clear lane 3 so it does not contribute to the length.
252 float32x4_t xyz = m128;
253 xyz = vsetq_lane_f32(0.0f, xyz, 3);
// Squared components, then fold the high half onto the low half.
255 float32x4_t
dot = vmulq_f32(xyz, xyz);
256 const float32x2_t sum2 = vadd_f32(vget_low_f32(
dot), vget_high_f32(
dot));
// `len` is the squared length here; skip the scale when it is not positive.
258 if (
float len = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1); len > 0)
260 float invLen = 1.0f / std::sqrt(len);
// Scale the original vector (all four lanes), not the lane-3-cleared copy.
261 const float32x4_t result = vmulq_n_f32(m128, invLen);
// Scalar path: squared length from the first three components only.
270 const float lenSq =
x *
x +
y *
y +
z *
z;
272 const float invLength = 1.0f / std::sqrt(lenSq);
// Evaluates x*point.x + y*point.y + z*point.z + w: the four-lane dot product
// of this object with `point` after point's fourth lane is forced to 1.
// NOTE(review): the enclosing signature and the #else/#endif lines are
// outside this excerpt.
288#if defined(USE_SIMD_SSE)
// Immediate 0x30 copies lane 0 of the second operand (1.0f) into lane 3 of
// point, leaving the other lanes untouched.
289 __m128 vec = _mm_insert_ps(point.m128, _mm_set_ss(1.0f), 0x30);
// Mask 0xF1: multiply all four lanes and write their sum to lane 0.
// NOTE(review): _mm_insert_ps and _mm_dp_ps are SSE4.1 intrinsics — confirm
// that USE_SIMD_SSE implies SSE4.1 support.
290 __m128 result = _mm_dp_ps(this->m128, vec, 0xF1);
291 return _mm_cvtss_f32(result);
292#elif defined(USE_SIMD_APPLE)
293 simd_float4 vec = point.m128;
// NOTE(review): a statement between the copy above and the return below is
// not visible here; the other backends set lane 3 to 1 at this point.
295 return simd_dot(m128, vec);
296#elif defined(USE_SIMD_NEON)
// Copy point and overwrite lane 3 with 1.
297 float32x4_t vec = point.m128;
298 vec = vsetq_lane_f32(1.0f, vec, 3);
299 const float32x4_t mul = vmulq_f32(this->m128, vec);
// Fold the high half onto the low half, then pairwise-add so lane 0 holds
// the full sum.
300 const float32x2_t sum1 = vadd_f32(vget_low_f32(mul), vget_high_f32(mul));
301 const float32x2_t sum2 = vpadd_f32(sum1, sum1);
302 return vget_lane_f32(sum2, 0);
// Presumably the non-SIMD fallback: the same sum written out per component.
304 return x * point.
x +
y * point.
y +
z * point.
z +
w;