VisuTwin Canvas
C++ 3D Engine — Metal Backend
Loading...
Searching...
No Matches
simd.h
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2025-2026 Arnis Lektauers
3//
4// Created by Arnis Lektauers on 25.07.2025.
5//
6#pragma once
7
8#if defined(__SSE__) || defined(__SSE2__)
9#include <immintrin.h>
10#endif
11#if defined(__ARM_NEON)
12#include <arm_neon.h>
13#endif
14#if defined(__APPLE__)
15#include <simd/simd.h>
16#endif
17#include <cmath>
18
19namespace visutwin::canvas
20{
21#if defined(__SSE__)
22 const __m128 VECTOR4_MASK_X = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f); // x = lowest index
23 const __m128 VECTOR4_MASK_Y = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
24 const __m128 VECTOR4_MASK_Z = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
25 const __m128 VECTOR4_MASK_W = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
26
27 inline __m128 _mm_cross3_ps(__m128 a, __m128 b) {
28 __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
29 __m128 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
30 __m128 c = _mm_sub_ps(
31 _mm_mul_ps(a, b_yzx),
32 _mm_mul_ps(a_yzx, b)
33 );
34 return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
35 }
36
37 inline __m128 _mm_normalize3_ps(__m128 v) {
38 __m128 dp = _mm_dp_ps(v, v, 0x7F); // dot3 in .x
39 __m128 rsqrt = _mm_rsqrt_ps(dp);
40 return _mm_mul_ps(v, rsqrt);
41 }
42#elif defined(__ARM_NEON)
43 const float32x4_t VECTOR4_MASK_X = (float32x4_t){ 1.0f, 0.0f, 0.0f, 0.0f };
44 const float32x4_t VECTOR4_MASK_Y = (float32x4_t){ 0.0f, 1.0f, 0.0f, 0.0f };
45 const float32x4_t VECTOR4_MASK_Z = (float32x4_t){ 0.0f, 0.0f, 1.0f, 0.0f };
46
47 inline float32x4_t vcrossq_f32(float32x4_t a, float32x4_t b)
48 {
49 // yzxw permutation via byte-level table lookup (AArch64)
50 // Maps: lane 0←lane1, lane 1←lane2, lane 2←lane0, lane 3←lane3
51 // vextq_f32(v,v,1) does circular rotation [1,2,3,0] which is WRONG for cross product
52 static const uint8x16_t yzxw = {4,5,6,7, 8,9,10,11, 0,1,2,3, 12,13,14,15};
53
54 const float32x4_t a_yzx = vreinterpretq_f32_u8(vqtbl1q_u8(vreinterpretq_u8_f32(a), yzxw));
55 const float32x4_t b_yzx = vreinterpretq_f32_u8(vqtbl1q_u8(vreinterpretq_u8_f32(b), yzxw));
56
57 // c = a * b_yzx - a_yzx * b → {a.x*b.y - a.y*b.x, a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, 0}
58 const float32x4_t c = vsubq_f32(vmulq_f32(a, b_yzx), vmulq_f32(a_yzx, b));
59
60 // Permute result back from (z', x', y', 0) to (x', y', z', 0)
61 return vreinterpretq_f32_u8(vqtbl1q_u8(vreinterpretq_u8_f32(c), yzxw));
62 }
63
64 inline float32x4_t neon_normalize3(float32x4_t v)
65 {
66 float32x4_t dp = vmulq_f32(v, v);
67 const float sum = vgetq_lane_f32(dp, 0) + vgetq_lane_f32(dp, 1) + vgetq_lane_f32(dp, 2);
68 const float invLen = 1.0f / std::sqrt(sum);
69 return vmulq_n_f32(v, invLen);
70 }
71
72 inline float vdotq3_f32(float32x4_t a, float32x4_t b)
73 {
74 const float32x4_t mul = vmulq_f32(a, b);
75 return vgetq_lane_f32(mul, 0) + vgetq_lane_f32(mul, 1) + vgetq_lane_f32(mul, 2);
76 }
77
78 inline void neon_transpose4x4(
79 const float32x4_t row0, const float32x4_t row1, const float32x4_t row2, const float32x4_t row3,
80 float32x4_t& col0, float32x4_t& col1, float32x4_t& col2, float32x4_t& col3)
81 {
82 // Step 1: Pairwise interleave at 32-bit granularity
83 // vtrnq_f32({a,b,c,d}, {e,f,g,h}) → val[0]={a,e,c,g}, val[1]={b,f,d,h}
84 const float32x4x2_t t01 = vtrnq_f32(row0, row1);
85 const float32x4x2_t t23 = vtrnq_f32(row2, row3);
86
87 // Step 2: Combine low/high halves to complete the transpose
88 // No second vtrn_f32 needed — just vcombine the correct halves
89 col0 = vcombine_f32(vget_low_f32(t01.val[0]), vget_low_f32(t23.val[0]));
90 col1 = vcombine_f32(vget_low_f32(t01.val[1]), vget_low_f32(t23.val[1]));
91 col2 = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0]));
92 col3 = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1]));
93 }
94#elif defined(__APPLE__)
95#endif
96}