VisuTwin Canvas
C++ 3D Engine — Metal Backend
Loading...
Searching...
No Matches
metalInstanceCullPass.cpp
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2025-2026 Arnis Lektauers
3//
4// Metal compute pass for GPU frustum culling -- implementation.
5//
6// Contains the embedded MSL compute kernels (instanceCull + writeIndirectArgs)
7// and CPU-side dispatch logic for the two-kernel pipeline.
8//
9// Custom shader -- no upstream GLSL equivalent exists.
10//
12
13#include "metalGraphicsDevice.h"
14#include "spdlog/spdlog.h"
15
16#include <cmath>
17#include <cstring>
18
19namespace visutwin::canvas
20{
21 namespace
22 {
23 // ── Embedded Metal Shading Language ─────────────────────────────
24 //
25 // Two compute kernels for GPU frustum culling of instances.
26 //
27 // Kernel 1 (instanceCull): Tests each instance bounding sphere vs 6 planes,
28 // compacts visible instances into an output buffer via atomic_fetch_add.
29 // Kernel 2 (writeIndirectArgs): Reads the atomic counter and writes
30 // MTLDrawIndexedPrimitivesIndirectArguments.
31 //
32 // Instance layout matches InstanceData in common.metal (80 bytes):
33 // float4x4 modelMatrix (64 bytes) + float4 diffuseColor (16 bytes).
34 //
// MSL source for both compute kernels, kept in one raw string so a single
// library compile serves both pipelines. ensureResources() compiles this text
// at runtime and looks up the entry points "instanceCull" and
// "writeIndirectArgs" by name, so renaming a kernel here requires the same
// rename there. The [[buffer(N)]] indices below are the slot numbers cull()
// passes to setBuffer(): 0 = params, 1 = input instances, 2 = compacted
// output, 3 = atomic counter, 4 = indirect args.
// Everything between R"( and )" is shader text, not C++: editing it (comments
// included) changes what is handed to the Metal compiler.
35 constexpr const char* INSTANCE_CULL_SOURCE = R"(
36#include <metal_stdlib>
37using namespace metal;
38
39// ── Uniform parameters ──────────────────────────────────────────────
40// Must match InstanceCullParams in metalInstanceCullPass.h (128 bytes).
41struct CullParams {
42 float4 frustumPlanes[6]; // (nx, ny, nz, d) per plane. dot(n,p)+d >= 0 = inside.
43 float boundingSphereRadius;
44 uint instanceCount;
45 uint indexCount; // mesh Primitive.count -> indirect args
46 uint indexStart; // mesh Primitive.base -> indirect args
47 int baseVertex; // mesh Primitive.baseVertex -> indirect args
48 uint baseInstance; // always 0
49 float _pad[2];
50};
51
52// ── Instance data layout (80 bytes) ────────────────────────────────
53// Matches common.metal InstanceData.
54struct InstanceData {
55 float4x4 modelMatrix; // 64 bytes
56 float4 diffuseColor; // 16 bytes
57};
58
59// ── Metal indirect draw arguments (20 bytes) ───────────────────────
60// Matches MTLDrawIndexedPrimitivesIndirectArguments.
61struct IndirectArgs {
62 uint indexCount;
63 uint instanceCount;
64 uint indexStart;
65 int baseVertex;
66 uint baseInstance;
67};
68
69// ── Kernel 1: Frustum cull instances and compact visible ones ──────
70// 64 threads per threadgroup, 1D dispatch.
71kernel void instanceCull(
72 constant CullParams& params [[buffer(0)]],
73 constant InstanceData* input [[buffer(1)]],
74 device InstanceData* output [[buffer(2)]],
75 device atomic_uint* counter [[buffer(3)]],
76 uint tid [[thread_position_in_grid]])
77{
78 if (tid >= params.instanceCount) return;
79
80 // Extract instance world position from translation column of model matrix.
81 const float3 center = float3(input[tid].modelMatrix[3][0],
82 input[tid].modelMatrix[3][1],
83 input[tid].modelMatrix[3][2]);
84 const float radius = params.boundingSphereRadius;
85
86 // Test bounding sphere against 6 frustum planes.
87 // dot(plane.xyz, center) + plane.w + radius >= 0 means (partially) inside.
88 for (int p = 0; p < 6; ++p) {
89 const float4 plane = params.frustumPlanes[p];
90 const float dist = dot(plane.xyz, center) + plane.w;
91 if (dist < -radius) {
92 return; // Outside this plane — culled.
93 }
94 }
95
96 // Visible: atomically allocate a slot and copy instance data.
97 const uint slot = atomic_fetch_add_explicit(counter, 1u, memory_order_relaxed);
98 output[slot] = input[tid];
99}
100
101// ── Kernel 2: Write indirect draw arguments ────────────────────────
102// Single thread reads the atomic counter and writes the indirect args.
103kernel void writeIndirectArgs(
104 constant CullParams& params [[buffer(0)]],
105 device atomic_uint* counter [[buffer(3)]],
106 device IndirectArgs* args [[buffer(4)]],
107 uint tid [[thread_position_in_grid]])
108{
109 if (tid != 0) return;
110
111 const uint visibleCount = atomic_load_explicit(counter, memory_order_relaxed);
112 args[0].indexCount = params.indexCount;
113 args[0].instanceCount = visibleCount;
114 args[0].indexStart = params.indexStart;
115 args[0].baseVertex = params.baseVertex;
116 args[0].baseInstance = params.baseInstance;
117}
118)";
119
// Threads per threadgroup for the instanceCull dispatch; cull() rounds the
// instance count up to a multiple of this, and the kernel's own
// `tid >= instanceCount` guard discards the surplus threads.
120 constexpr uint32_t THREADGROUP_SIZE = 64;
// Size of one InstanceData record (float4x4 + float4 = 64 + 16 bytes).
// reserve() multiplies by this to size the compacted output buffer, so it
// must stay in step with the InstanceData struct in the shader source.
121 constexpr size_t INSTANCE_DATA_SIZE = 80; // bytes per instance
122
123 } // anonymous namespace
124
125 // ─── Construction / Destruction ───────────────────────────────────
126
128 : device_(device)
129 {
 // Only the device pointer is stored here. Pipelines and buffers are
 // created on first use by ensureResources() / reserve(), both of which
 // are driven from cull().
130 }
131
133 {
 // Drop the reference this object holds on each Metal object it created.
 // Every member is null-checked because creation is lazy and may never
 // have happened (or may have failed part-way), and each pointer is
 // cleared after release so it cannot be released a second time.
134 if (cullPipeline_) { cullPipeline_->release(); cullPipeline_ = nullptr; }
135 if (writeArgsPipeline_) { writeArgsPipeline_->release(); writeArgsPipeline_ = nullptr; }
136 if (compactedBuffer_) { compactedBuffer_->release(); compactedBuffer_ = nullptr; }
137 if (indirectArgsBuffer_){ indirectArgsBuffer_->release(); indirectArgsBuffer_ = nullptr; }
138 if (counterBuffer_) { counterBuffer_->release(); counterBuffer_ = nullptr; }
139 if (uniformBuffer_) { uniformBuffer_->release(); uniformBuffer_ = nullptr; }
140 }
141
142 // ─── Lazy Resource Creation ──────────────────────────────────────
143
144 void MetalInstanceCullPass::ensureResources()
145 {
146 if (resourcesReady_) return;
147
148 auto* mtlDevice = device_->raw();
149 if (!mtlDevice) return;
150
151 // ── Compile compute shaders ────────────────────────────────
152 if (!cullPipeline_ || !writeArgsPipeline_) {
153 NS::Error* error = nullptr;
154 auto* source = NS::String::string(INSTANCE_CULL_SOURCE, NS::UTF8StringEncoding);
155 auto* library = mtlDevice->newLibrary(source, nullptr, &error);
156 if (!library) {
157 spdlog::error("[MetalInstanceCullPass] Failed to compile cull shaders: {}",
158 error ? error->localizedDescription()->utf8String() : "unknown");
159 return;
160 }
161
162 // Cull pipeline
163 if (!cullPipeline_) {
164 auto* funcName = NS::String::string("instanceCull", NS::UTF8StringEncoding);
165 auto* function = library->newFunction(funcName);
166 if (!function) {
167 spdlog::error("[MetalInstanceCullPass] Entry point 'instanceCull' not found");
168 library->release();
169 return;
170 }
171 cullPipeline_ = mtlDevice->newComputePipelineState(function, &error);
172 function->release();
173 if (!cullPipeline_) {
174 spdlog::error("[MetalInstanceCullPass] Failed to create cull pipeline: {}",
175 error ? error->localizedDescription()->utf8String() : "unknown");
176 library->release();
177 return;
178 }
179 }
180
181 // WriteArgs pipeline
182 if (!writeArgsPipeline_) {
183 auto* funcName = NS::String::string("writeIndirectArgs", NS::UTF8StringEncoding);
184 auto* function = library->newFunction(funcName);
185 if (!function) {
186 spdlog::error("[MetalInstanceCullPass] Entry point 'writeIndirectArgs' not found");
187 library->release();
188 return;
189 }
190 writeArgsPipeline_ = mtlDevice->newComputePipelineState(function, &error);
191 function->release();
192 if (!writeArgsPipeline_) {
193 spdlog::error("[MetalInstanceCullPass] Failed to create writeArgs pipeline: {}",
194 error ? error->localizedDescription()->utf8String() : "unknown");
195 library->release();
196 return;
197 }
198 }
199
200 library->release();
201 }
202
203 // ── Atomic counter buffer (single uint32) ──────────────────
204 if (!counterBuffer_) {
205 counterBuffer_ = mtlDevice->newBuffer(sizeof(uint32_t), MTL::ResourceStorageModeShared);
206 if (!counterBuffer_) {
207 spdlog::error("[MetalInstanceCullPass] Failed to create counter buffer");
208 return;
209 }
210 }
211
212 // ── Uniform buffer (InstanceCullParams) ────────────────────
213 if (!uniformBuffer_) {
214 uniformBuffer_ = mtlDevice->newBuffer(sizeof(InstanceCullParams), MTL::ResourceStorageModeShared);
215 if (!uniformBuffer_) {
216 spdlog::error("[MetalInstanceCullPass] Failed to create uniform buffer");
217 return;
218 }
219 }
220
221 // ── Indirect args buffer (20 bytes) ────────────────────────
222 if (!indirectArgsBuffer_) {
223 indirectArgsBuffer_ = mtlDevice->newBuffer(
224 5 * sizeof(uint32_t), MTL::ResourceStorageModeShared);
225 if (!indirectArgsBuffer_) {
226 spdlog::error("[MetalInstanceCullPass] Failed to create indirect args buffer");
227 return;
228 }
229 }
230
231 resourcesReady_ = (cullPipeline_ && writeArgsPipeline_ &&
232 counterBuffer_ && uniformBuffer_ && indirectArgsBuffer_);
233
234 if (resourcesReady_) {
235 spdlog::info("[MetalInstanceCullPass] Resources initialized successfully");
236 }
237 }
238
239 // ─── Buffer Reservation ──────────────────────────────────────────
240
241 void MetalInstanceCullPass::reserve(uint32_t maxInstances)
242 {
243 if (maxInstances <= maxInstances_ && compactedBuffer_) return;
244
245 auto* mtlDevice = device_->raw();
246 if (!mtlDevice) return;
247
248 if (compactedBuffer_) {
249 compactedBuffer_->release();
250 compactedBuffer_ = nullptr;
251 }
252
253 const size_t bufferSize = static_cast<size_t>(maxInstances) * INSTANCE_DATA_SIZE;
254 compactedBuffer_ = mtlDevice->newBuffer(bufferSize, MTL::ResourceStorageModeShared);
255 if (!compactedBuffer_) {
256 spdlog::error("[MetalInstanceCullPass] Failed to allocate compacted buffer ({} instances, {:.1f} KB)",
257 maxInstances, static_cast<double>(bufferSize) / 1024.0);
258 maxInstances_ = 0;
259 return;
260 }
261
262 maxInstances_ = maxInstances;
263 spdlog::debug("[MetalInstanceCullPass] Reserved compacted buffer for {} instances ({:.1f} KB)",
264 maxInstances, static_cast<double>(bufferSize) / 1024.0);
265 }
266
267 // ─── GPU Culling ─────────────────────────────────────────────────
268
269 void MetalInstanceCullPass::cull(MTL::Buffer* inputBuffer, const InstanceCullParams& params)
270 {
271 if (!inputBuffer || params.instanceCount == 0) return;
272
273 ensureResources();
274 if (!resourcesReady_) return;
275
276 // Ensure compacted buffer is large enough
277 reserve(params.instanceCount);
278 if (!compactedBuffer_) return;
279
280 // Upload uniforms
281 std::memcpy(uniformBuffer_->contents(), &params, sizeof(InstanceCullParams));
282
283 // Reset atomic counter to 0
284 uint32_t zero = 0;
285 std::memcpy(counterBuffer_->contents(), &zero, sizeof(uint32_t));
286
287 // Dispatch kernel 1 + kernel 2 in a single command buffer.
288 // Metal guarantees sequential execution of compute encoders within
289 // the same command buffer — no explicit barrier needed.
290 auto* commandBuffer = device_->_commandQueue->commandBuffer();
291 if (!commandBuffer) {
292 spdlog::warn("[MetalInstanceCullPass] Failed to create command buffer");
293 return;
294 }
295
296 // ── Kernel 1: Frustum cull instances ────────────────────────
297 {
298 auto* encoder = commandBuffer->computeCommandEncoder();
299 if (!encoder) {
300 spdlog::warn("[MetalInstanceCullPass] Failed to create compute encoder for cull");
301 return;
302 }
303
304 encoder->pushDebugGroup(
305 NS::String::string("InstanceCull", NS::UTF8StringEncoding));
306
307 encoder->setComputePipelineState(cullPipeline_);
308 encoder->setBuffer(uniformBuffer_, 0, 0); // [[buffer(0)]] params
309 encoder->setBuffer(inputBuffer, 0, 1); // [[buffer(1)]] input
310 encoder->setBuffer(compactedBuffer_, 0, 2); // [[buffer(2)]] output
311 encoder->setBuffer(counterBuffer_, 0, 3); // [[buffer(3)]] counter
312
313 const uint32_t threadgroups = (params.instanceCount + THREADGROUP_SIZE - 1) / THREADGROUP_SIZE;
314 encoder->dispatchThreadgroups(
315 MTL::Size(threadgroups, 1, 1),
316 MTL::Size(THREADGROUP_SIZE, 1, 1));
317
318 encoder->popDebugGroup();
319 encoder->endEncoding();
320 }
321
322 // ── Kernel 2: Write indirect draw arguments ─────────────────
323 {
324 auto* encoder = commandBuffer->computeCommandEncoder();
325 if (!encoder) {
326 spdlog::warn("[MetalInstanceCullPass] Failed to create compute encoder for writeArgs");
327 return;
328 }
329
330 encoder->pushDebugGroup(
331 NS::String::string("WriteIndirectArgs", NS::UTF8StringEncoding));
332
333 encoder->setComputePipelineState(writeArgsPipeline_);
334 encoder->setBuffer(uniformBuffer_, 0, 0); // [[buffer(0)]] params
335 encoder->setBuffer(counterBuffer_, 0, 3); // [[buffer(3)]] counter
336 encoder->setBuffer(indirectArgsBuffer_, 0, 4); // [[buffer(4)]] args
337
338 encoder->dispatchThreadgroups(
339 MTL::Size(1, 1, 1),
340 MTL::Size(1, 1, 1));
341
342 encoder->popDebugGroup();
343 encoder->endEncoding();
344 }
345
346 // MVP: synchronous wait. For production, this could be replaced with
347 // a shared event or fence to overlap compute with the previous frame's render.
348 commandBuffer->commit();
349 commandBuffer->waitUntilCompleted();
350 }
351
352 // ─── Frustum Plane Extraction (Gribb/Hartmann Method) ────────────
353
355 const float* m, float outPlanes[6][4])
356 {
 // Derives six clipping planes from a view-projection matrix by adding or
 // subtracting matrix rows, then normalizes each plane.
 //
 // m         : 16 floats, read only at indices 0..15.
 // outPlanes : receives (nx, ny, nz, d) per plane, in the order
 //             left, right, bottom, top, near, far. Every element is
 //             overwritten.
 //
 // NOTE(review): "row3 + row2" is the near plane for a clip space whose
 // depth spans -w..w. If the matrix passed in maps depth to 0..w, the near
 // plane would be row2 alone and this version is merely looser — confirm
 // against the projection convention used by the caller.
357 // Input: 4x4 view-projection matrix in column-major order.
358 // m[col*4 + row] — standard Metal/OpenGL layout.
359 //
360 // Row access helper: row i of column j = m[j*4 + i]
361 // Row 0: m[0], m[4], m[8], m[12]
362 // Row 1: m[1], m[5], m[9], m[13]
363 // Row 2: m[2], m[6], m[10], m[14]
364 // Row 3: m[3], m[7], m[11], m[15]
365
366 // Left: row3 + row0
367 outPlanes[0][0] = m[3] + m[0];
368 outPlanes[0][1] = m[7] + m[4];
369 outPlanes[0][2] = m[11] + m[8];
370 outPlanes[0][3] = m[15] + m[12];
371
372 // Right: row3 - row0
373 outPlanes[1][0] = m[3] - m[0];
374 outPlanes[1][1] = m[7] - m[4];
375 outPlanes[1][2] = m[11] - m[8];
376 outPlanes[1][3] = m[15] - m[12];
377
378 // Bottom: row3 + row1
379 outPlanes[2][0] = m[3] + m[1];
380 outPlanes[2][1] = m[7] + m[5];
381 outPlanes[2][2] = m[11] + m[9];
382 outPlanes[2][3] = m[15] + m[13];
383
384 // Top: row3 - row1
385 outPlanes[3][0] = m[3] - m[1];
386 outPlanes[3][1] = m[7] - m[5];
387 outPlanes[3][2] = m[11] - m[9];
388 outPlanes[3][3] = m[15] - m[13];
389
390 // Near: row3 + row2
391 outPlanes[4][0] = m[3] + m[2];
392 outPlanes[4][1] = m[7] + m[6];
393 outPlanes[4][2] = m[11] + m[10];
394 outPlanes[4][3] = m[15] + m[14];
395
396 // Far: row3 - row2
397 outPlanes[5][0] = m[3] - m[2];
398 outPlanes[5][1] = m[7] - m[6];
399 outPlanes[5][2] = m[11] - m[10];
400 outPlanes[5][3] = m[15] - m[14];
401
402 // Normalize each plane
 // All four components are divided by the length of the xyz normal, so the
 // 4th component becomes a distance in the same units as the input points.
 // The culling kernel compares that distance directly against the bounding
 // radius. A plane whose normal is (near) zero length is left unscaled to
 // avoid dividing by zero.
403 for (int i = 0; i < 6; ++i) {
404 const float len = std::sqrt(
405 outPlanes[i][0] * outPlanes[i][0] +
406 outPlanes[i][1] * outPlanes[i][1] +
407 outPlanes[i][2] * outPlanes[i][2]);
408 if (len > 1e-8f) {
409 const float invLen = 1.0f / len;
410 outPlanes[i][0] *= invLen;
411 outPlanes[i][1] *= invLen;
412 outPlanes[i][2] *= invLen;
413 outPlanes[i][3] *= invLen;
414 }
415 }
416 }
417
418} // namespace visutwin::canvas
static void extractFrustumPlanes(const float *vpMatrix4x4ColMajor, float outPlanes[6][4])
MetalInstanceCullPass(MetalGraphicsDevice *device)
void cull(MTL::Buffer *inputBuffer, const InstanceCullParams &params)
uint32_t instanceCount
Total input instances.