fix(audit): pass 2 — regression cleanup + remaining HIGH/MEDIUM

Pass 2 of /audit-code on PR #16. Both variants build clean. Critical (Pass 1 regressions): - buffer.zig:grow released the old VkBuffer BEFORE allocating the new one; if create() then failed the caller's deinit re-released the freed handles via the pool, crashing the driver. Now creates first and routes the old buffer through release after success. - Vulkan.zig:beginFrame partial failure (resetDescriptorPool fails after resetFences succeeds) left frame_fence unsignaled with no pending submit; the next Vulkan.deinit's waitForFences hung indefinitely. Fence reset is now LAST so an earlier failure leaves it signaled, plus an errdefer empty-submit re-signals it if Frame.begin itself fails post-reset. High: - RenderPass.step on pipeline reuse used to fall back to the pipeline's static descriptor set when step_pool.allocate failed — re-introducing the exact corruption the step_pool was added to prevent. Now drops the offending draw with a loud log instead. Same for missing step_pool. Medium: - vulkanizeGlsl `texture()` rewrite walks paren depth by raw bytes; added copySkippable inside the args so comments/strings containing `(` or `)` don't desync the depth tracker. - buffer_pool.cycle's OOM fallback held the pool mutex across vkDeviceWaitIdle, blocking every other renderer thread's release/acquire. Pending list is now moved into a local OUTSIDE the lock before the device wait. - SubsurfacePresenter binds zwp_linux_dmabuf_v1 at hardcoded v3; now skips the bind on a v1/v2 compositor (and clamps via std::min) to avoid a fatal protocol error that would tear down the wl_display. - presentVulkanDmabuf had no upper bound on `stride`; pathological stride near UINT32_MAX × height=65536 reached mmap with a ~280 TB request. Added stride <= MAX_DIM*16 cap. - drainVulkan early-return on m_hidden left m_pendingDmabuf.fd>=0; next post-Show present spuriously bumped m_droppedFrames every Hide/Show cycle. Now clears the slot under the lock. - shadertoy prefix-injection no-newline fallback used to write `#define` BEFORE `#version`, producing GLSL that violates the "#version must be first" rule. Now scans past the `#version` directive (number + optional profile) and synthesizes a newline before injecting defines. - shadertoy SPV target now validates non-empty + SPIR-V magic word (0x07230203) before alignedAlloc/memcpy; a zero-length file used to silently survive and crash vkCreateShaderModule. - Module.initFromSpirv mirrors the same checks for any future caller that bypasses glslang/shadertoy. Low: - Frame.complete's three-branch buffer_pool.cycle invocation collapsed to two branches (with/without deviceWaitIdle). - Buffer.deinit OOM fallback now logs via `log.warn` (the comment claiming "no logger in scope" was wrong; `log` IS imported). - GhosttySurface.h comment block referencing the removed 2 ms safety-net poll updated to point at m_droppedFrames instead. - m_droppedFrames now also counts legacy QImage path overwrites, matching the documented scope of the counter. - vulkanizeGlsl unnormalized_sampler_names list documented as a reserved-name constraint for user shaders (atlas_grayscale / atlas_color must not be used as user sampler names). - isHorizSpace + processIncludes whitespace check now accept \v and \f (legal GLSL preprocessor whitespace). - DescriptorPool: added error.InvalidPoolConfig for caller-side argument errors, distinct from driver-side error.VulkanFailed. - SubsurfacePresenter discoverGlobals now checks wl_display_roundtrip_queue's return value and logs disconnect- during-startup failures. Nit: - First-dmabuf log uses `unsigned long long` + `%llx` for drm_modifier (was `unsigned long` + `%lx` which truncates upper 32 bits on ILP32). Co-Authored-By: claude-flow <ruv@ruv.net>
2026-05-25 16:23:18 -05:00 · 2026-05-25 16:23:18 -05:00 · 7b13d526cb
parent 0f80583825
commit 7b13d526cb
10 changed files with 287 additions and 82 deletions
--- a/qt/src/GhosttySurface.cpp
+++ b/qt/src/GhosttySurface.cpp
@ -1638,9 +1638,9 @@ void GhosttySurface::presentVulkanDmabuf(
          expected, true, std::memory_order_relaxed)) {
    std::fprintf(stderr,
                 "[ghastty] first dmabuf for surface=%p: fd=%d %ux%u "
-                 "stride=%u fourcc=0x%08x mod=0x%lx image_backed=%d path=%s\n",
+                 "stride=%u fourcc=0x%08x mod=0x%llx image_backed=%d path=%s\n",
                 static_cast<void *>(this), dmabuf_fd, width, height, stride,
-                 drm_format, static_cast<unsigned long>(drm_modifier),
+                 drm_format, static_cast<unsigned long long>(drm_modifier),
                 image_backed ? 1 : 0, useSubsurface ? "subsurface" : "qimage");
  }

@ -1657,9 +1657,17 @@ void GhosttySurface::presentVulkanDmabuf(
  // terminal — and check that stride*height doesn't exceed
  // SIZE_MAX before promoting.
  constexpr quint32 MAX_DIM = 65536;
+  // Cap stride at MAX_DIM × 4 (BGRA8) × a small slack factor for
+  // tiled formats: ~4× the width-derived minimum is enough for any
+  // legitimate vendor tiling, and it keeps `stride * height`
+  // below ~64 GiB even at MAX_DIM. The previous lower-only bound
+  // let a pathological renderer with stride near UINT32_MAX and
+  // height=MAX_DIM reach mmap with a ~280 TB request.
+  constexpr quint32 MAX_STRIDE = MAX_DIM * 16;
  if (dmabuf_fd < 0 || width == 0 || height == 0) return;
  if (width > MAX_DIM || height > MAX_DIM) return;
  if (stride < static_cast<quint64>(width) * 4) return;
+  if (stride > MAX_STRIDE) return;
  // stride*height as 64-bit and check the size_t fit explicitly.
  const quint64 bytes64 = static_cast<quint64>(stride) * height;
  if (bytes64 > std::numeric_limits<std::size_t>::max()) return;
@ -1737,10 +1745,23 @@ void GhosttySurface::presentVulkanDmabuf(

  const double dpr_now = m_fbDpr.load(std::memory_order_acquire);
  if (dpr_now > 0) owned.setDevicePixelRatio(dpr_now);
+  bool overwrote_legacy = false;
  {
    QMutexLocker lock(&m_pendingMutex);
+    overwrote_legacy = !m_pending.isNull();
    m_pending = std::move(owned);
  }
+  if (overwrote_legacy) {
+    const auto count = m_droppedFrames.fetch_add(
+        1, std::memory_order_relaxed) + 1;
+    if (count <= 3 || count % 60 == 0) {
+      std::fprintf(stderr,
+                   "[ghastty] surface=%p dropped frame "
+                   "(legacy QImage path, total=%llu)\n",
+                   static_cast<void *>(this),
+                   static_cast<unsigned long long>(count));
+    }
+  }
  QMetaObject::invokeMethod(this, "drainVulkan", Qt::QueuedConnection);
 }

@ -1749,7 +1770,17 @@ void GhosttySurface::drainVulkan() {
  // under the mutex, then dispatch it to the presenter outside the
  // lock so a renderer-thread `presentVulkanDmabuf` parking the
  // next frame doesn't block on wl_display_flush.
-  if (m_hidden.load(std::memory_order_acquire)) return;
+  if (m_hidden.load(std::memory_order_acquire)) {
+    // Clear the parked descriptor on hide so the next post-Show
+    // present doesn't see a "stale frame still pending" state and
+    // spuriously bump m_droppedFrames every Hide/Show cycle. The
+    // fd itself is libghostty-owned (per ABI it's only valid for
+    // the duration of the original presentVulkanDmabuf call), so
+    // there's nothing to release here beyond marking the slot empty.
+    QMutexLocker lock(&m_pendingMutex);
+    m_pendingDmabuf.fd = -1;
+    return;
+  }
  if (m_useSubsurface.load(std::memory_order_acquire) &&
      m_subsurfacePresenter) {
    PendingDmabuf frame;
--- a/qt/src/GhosttySurface.h
+++ b/qt/src/GhosttySurface.h
@ -170,8 +170,9 @@ public:
  // mmap+memcpy'd QImage) and wakes the GUI thread via
  // `QMetaObject::invokeMethod(this, drainVulkan, Qt::QueuedConnection)`.
  // The GUI thread either commits the dmabuf to the wl_subsurface
-  // (zero-copy) or paints the QImage (fallback). A 2 ms safety-net
-  // poll catches anything `invokeMethod` ever fails to deliver.
+  // (zero-copy) or paints the QImage (fallback). The dropped-frame
+  // counter `m_droppedFrames` makes any genuine queue-loss visible
+  // (zero in the steady state).
  Q_INVOKABLE void presentVulkanDmabuf(
      int dmabuf_fd,
      quint32 drm_format,
--- a/qt/src/wayland/SubsurfacePresenter.cpp
+++ b/qt/src/wayland/SubsurfacePresenter.cpp
@ -86,16 +86,29 @@ void registryGlobal(void *data, wl_registry *registry, uint32_t name,
    g->subcompositor = static_cast<wl_subcompositor *>(
        wl_registry_bind(registry, name, &wl_subcompositor_interface, 1));
  } else if (std::strcmp(interface, zwp_linux_dmabuf_v1_interface.name) == 0) {
-    // v3 has `create_immed`, which we want (synchronous wl_buffer
-    // creation — the v2 async `create` + `created`/`failed` event
-    // dance would add a layer of callback machinery for no real win
-    // in our renderer's strict-fd-validity scenario). v4 adds the
-    // dynamic format/modifier feedback dance; we don't need it yet.
-    g->dmabuf = static_cast<zwp_linux_dmabuf_v1 *>(wl_registry_bind(
-        registry, name, &zwp_linux_dmabuf_v1_interface, 3));
-    // Add the listener immediately so the modifier events queued by
-    // the bind get delivered when the dispatch loop continues.
-    zwp_linux_dmabuf_v1_add_listener(g->dmabuf, &kDmabufListener, g);
+    // We want at least v3 for `create_immed` (synchronous wl_buffer
+    // creation — v1/v2 have only the async `create` + `created`/
+    // `failed` dance). A compositor that only advertises v1/v2
+    // can't satisfy our protocol assumptions; binding at v3 against
+    // such a compositor would protocol-error and tear down the
+    // entire wl_display. Skip the bind in that case so the
+    // legacy QImage fallback engages cleanly.
+    if (version < 3) {
+      std::fprintf(stderr,
+                   "[ghastty] wayland: linux-dmabuf-v1 advertised at "
+                   "version %u; need >= 3 for create_immed, falling back "
+                   "to QImage path\n",
+                   version);
+    } else {
+      // Cap at v3 — v4 adds the dynamic format/modifier feedback
+      // dance which we don't consume.
+      const uint32_t v = std::min<uint32_t>(version, 3u);
+      g->dmabuf = static_cast<zwp_linux_dmabuf_v1 *>(wl_registry_bind(
+          registry, name, &zwp_linux_dmabuf_v1_interface, v));
+      // Add the listener immediately so the modifier events queued
+      // by the bind get delivered when the dispatch loop continues.
+      zwp_linux_dmabuf_v1_add_listener(g->dmabuf, &kDmabufListener, g);
+    }
  } else if (std::strcmp(interface, wp_viewporter_interface.name) == 0) {
    g->viewporter = static_cast<wp_viewporter *>(
        wl_registry_bind(registry, name, &wp_viewporter_interface, 1));
@ -125,14 +138,24 @@ PresenterGlobals *discoverGlobals(wl_display *display) {
  // Roundtrip 1: bind compositor/subcompositor/dmabuf. Inside the
  // registry callback we attach the dmabuf listener immediately, so
  // any format/modifier events that arrive in the same dispatch
-  // pass fire on it.
-  wl_display_roundtrip_queue(display, queue);
+  // pass fire on it. A negative return means the wl_display
+  // disconnected mid-startup; subsequent tryCreate calls fall
+  // through to the QImage path (g->compositor etc. stay null).
+  if (wl_display_roundtrip_queue(display, queue) < 0) {
+    std::fprintf(stderr,
+                 "[ghastty] wayland: discoverGlobals roundtrip 1 failed; "
+                 "subsurface present path disabled\n");
+  }
  wl_registry_destroy(registry);
  // Roundtrip 2: belt-and-suspenders for any compositor that defers
  // the modifier events past the bind reply (most don't, but some
  // batch them). After this returns the modifier table is fully
  // populated and frozen for the process lifetime.
-  if (globals.dmabuf) wl_display_roundtrip_queue(display, queue);
+  if (globals.dmabuf && wl_display_roundtrip_queue(display, queue) < 0) {
+    std::fprintf(stderr,
+                 "[ghastty] wayland: discoverGlobals roundtrip 2 failed; "
+                 "modifier table may be incomplete\n");
+  }

  std::size_t total_mods = 0;
  for (const auto &kv : globals.modifiers) total_mods += kv.second.size();
--- a/src/renderer/Vulkan.zig
+++ b/src/renderer/Vulkan.zig
@ -206,25 +206,39 @@ pub const buffer_pool = struct {
    /// `Frame.complete` after `vkWaitForFences`.
    ///
    /// `dev` is needed only on the OOM fallback path: if `ready`
-    /// can't grow to absorb `pending`, we wait the device idle and
-    /// then destroy the pending entries directly so the next frame
-    /// doesn't double up on a pending list that can never drain.
+    /// can't grow to absorb `pending`, we wait the device idle
+    /// (OUTSIDE the mutex — see below) and then destroy the pending
+    /// entries directly so the next frame doesn't double up on a
+    /// pending list that can never drain.
    pub fn cycle(dev: *const Device) void {
-        mutex.lock();
-        defer mutex.unlock();
-        ready.appendSlice(std.heap.smp_allocator, pending.items) catch {
-            // Couldn't grow `ready` — destroy the pending GPU
-            // resources directly. Other renderer threads may still
-            // be submitting against the shared queue, so wait the
-            // device idle to make sure no command buffer in flight
-            // anywhere references these handles before we destroy.
-            _ = dev.dispatch.deviceWaitIdle(dev.device);
-            for (pending.items) |e| {
-                dev.dispatch.destroyBuffer(dev.device, e.buffer, null);
-                dev.dispatch.freeMemory(dev.device, e.memory, null);
+        // Try the fast path first — append `pending` to `ready`
+        // under the lock, then return. On OOM we have to destroy
+        // the pending entries, but `vkDeviceWaitIdle` is slow and
+        // holding the pool mutex across it would block every other
+        // renderer thread's release/acquire/cycle. Move the
+        // pending list into a local outside the lock, then drain.
+        var oom_pending: std.ArrayList(Entry) = .{};
+        defer oom_pending.deinit(std.heap.smp_allocator);
+        {
+            mutex.lock();
+            defer mutex.unlock();
+            if (ready.appendSlice(std.heap.smp_allocator, pending.items)) {
+                pending.clearRetainingCapacity();
+                return;
+            } else |_| {
+                // OOM. Move `pending` into our local so we can
+                // drain it without holding the mutex.
+                oom_pending = pending;
+                pending = .{};
            }
-        };
-        pending.clearRetainingCapacity();
+        }
+        // Mutex released. Other threads can release/acquire/cycle
+        // while we wait the device idle and destroy our slice.
+        _ = dev.dispatch.deviceWaitIdle(dev.device);
+        for (oom_pending.items) |e| {
+            dev.dispatch.destroyBuffer(dev.device, e.buffer, null);
+            dev.dispatch.freeMemory(dev.device, e.memory, null);
+        }
    }

    /// Tear down both lists. Call only when the device is idle
@ -583,19 +597,48 @@ pub fn beginFrame(
    }

    _ = self;
-    // Reset the command buffer + fence + step descriptor pool so
-    // this frame starts clean. `vkResetDescriptorPool` returns every
-    // set the previous frame allocated to the pool — much cheaper
-    // than freeing them individually, and removes any chance of
-    // last-frame's set being bound by accident.
+    // Reset this frame's per-frame state. ORDER MATTERS: the fence
+    // reset goes LAST. If an earlier reset fails and we return an
+    // error, `Frame.begin` never runs, no submit ever happens, and
+    // the fence stays in whatever state it was in (initially
+    // signaled, or signaled by the previous frame's submit). Any
+    // subsequent `Vulkan.deinit` then waits on a SIGNALED fence
+    // and returns immediately — pre-fix, an unsignaled fence with
+    // no pending submit hung the deinit forever on
+    // waitForFences(UINT64_MAX).
    if (dev.dispatch.resetCommandBuffer(frame_cb, 0) != vk.VK_SUCCESS)
        return error.VulkanFailed;
-    if (dev.dispatch.resetFences(dev.device, 1, &frame_fence) != vk.VK_SUCCESS)
-        return error.VulkanFailed;
    if (step_pool) |*p| {
        if (dev.dispatch.resetDescriptorPool(dev.device, p.pool, 0) != vk.VK_SUCCESS)
            return error.VulkanFailed;
    }
+    // `vkResetDescriptorPool` returns every set the previous frame
+    // allocated to the pool — much cheaper than freeing them
+    // individually, and removes any chance of last-frame's set
+    // being bound by accident.
+    if (dev.dispatch.resetFences(dev.device, 1, &frame_fence) != vk.VK_SUCCESS)
+        return error.VulkanFailed;
+    // From here on the fence is UNSIGNALED. If `Frame.begin`
+    // (vkBeginCommandBuffer) fails before any submit, we have to
+    // re-signal the fence ourselves — otherwise the next
+    // `Vulkan.deinit`'s waitForFences hangs indefinitely.
+    errdefer {
+        // Empty submit with this fence as the signal target is the
+        // simplest portable way to push it back to signaled
+        // without recording any commands.
+        const empty: vk.VkSubmitInfo = .{
+            .sType = vk.VK_STRUCTURE_TYPE_SUBMIT_INFO,
+            .pNext = null,
+            .waitSemaphoreCount = 0,
+            .pWaitSemaphores = null,
+            .pWaitDstStageMask = null,
+            .commandBufferCount = 0,
+            .pCommandBuffers = null,
+            .signalSemaphoreCount = 0,
+            .pSignalSemaphores = null,
+        };
+        _ = dev.queueSubmit(1, &empty, frame_fence);
+    }

    return try Frame.begin(
        .{
--- a/src/renderer/shadertoy.zig
+++ b/src/renderer/shadertoy.zig
@ -184,6 +184,30 @@ pub fn loadFromFile(
        .glsl => try glslFromSpv(alloc_gpa, spirv),
        .msl => try mslFromSpv(alloc_gpa, spirv),
        .spv => spv: {
+            // Validate before handing back: glslang has succeeded at
+            // this point but a zero-length SPIR-V output would
+            // crash `vkCreateShaderModule` (codeSize == 0). The
+            // SPIR-V magic word check is defensive against future
+            // backends that bypass glslang.
+            if (spirv.len < 4) {
+                std.log.warn(
+                    "shadertoy: empty SPIR-V output (size={})",
+                    .{spirv.len},
+                );
+                return error.InvalidShader;
+            }
+            // First 4 bytes are the SPIR-V magic word 0x07230203
+            // (little-endian). Reject anything else loudly instead
+            // of letting the driver crash.
+            const magic = std.mem.readInt(u32, spirv[0..4], .little);
+            if (magic != 0x07230203) {
+                std.log.warn(
+                    "shadertoy: SPIR-V output missing magic word " ++
+                        "(got 0x{x:0>8}, expected 0x07230203)",
+                    .{magic},
+                );
+                return error.InvalidShader;
+            }
            // Copy the SPIR-V binary out of the arena into a
            // 4-byte-aligned allocation under `alloc_gpa`. Vulkan
            // expects `pCode: []const u32`, so over-aligning is safe;
@ -219,10 +243,12 @@ pub fn glslFromShader(
        // Find the first newline after `#version ...` and inject the
        // defines on the following line. The prefix is expected to
        // start with `#version` followed by a newline; if a future
-        // edit ever drops that newline (e.g. a single-line prefix)
-        // we inject the defines BEFORE the prefix so glslang sees
-        // the directives on their own lines and reports a clear
-        // error instead of us crashing on a `null.?` unwrap.
+        // edit ever drops that newline (e.g. a single-line prefix
+        // entirely on one line), we synthesize one between
+        // `#version` and the rest, then inject the defines after.
+        // GLSL requires `#version` to be the first non-blank line,
+        // so injecting BEFORE it would silently produce invalid
+        // GLSL.
        if (std.mem.indexOfScalar(u8, prefix, '\n')) |first_nl| {
            try writer.writeAll(prefix[0 .. first_nl + 1]);
            for (defines) |def| {
@ -231,12 +257,31 @@ pub fn glslFromShader(
                try writer.writeAll("\n");
            }
            try writer.writeAll(prefix[first_nl + 1 ..]);
-        } else {
+        } else if (std.mem.startsWith(u8, prefix, "#version")) {
+            // No newline anywhere, but it does start with `#version`.
+            // Find the end of the version directive: scan past the
+            // version number to the first non-version-token char,
+            // synthesize a newline there, then write defines and
+            // the rest of the prefix.
+            var p: usize = "#version".len;
+            while (p < prefix.len and (prefix[p] == ' ' or prefix[p] == '\t')) p += 1;
+            while (p < prefix.len and prefix[p] >= '0' and prefix[p] <= '9') p += 1;
+            // Optional profile (`core` / `compatibility` / `es`).
+            while (p < prefix.len and (prefix[p] == ' ' or prefix[p] == '\t')) p += 1;
+            while (p < prefix.len and ((prefix[p] >= 'a' and prefix[p] <= 'z') or
+                (prefix[p] >= 'A' and prefix[p] <= 'Z'))) p += 1;
+            try writer.writeAll(prefix[0..p]);
+            try writer.writeByte('\n');
            for (defines) |def| {
                try writer.writeAll("#define ");
                try writer.writeAll(def);
                try writer.writeAll("\n");
            }
+            try writer.writeAll(prefix[p..]);
+        } else {
+            // Prefix doesn't start with `#version` either — the
+            // shader is malformed. Pass it through as-is so glslang
+            // reports a clear parse error.
            try writer.writeAll(prefix);
        }
    }
--- a/src/renderer/vulkan/DescriptorPool.zig
+++ b/src/renderer/vulkan/DescriptorPool.zig
@ -30,6 +30,11 @@ pub const Error = error{
    /// `vkCreateDescriptorPool` / `vkAllocateDescriptorSets` returned
    /// a non-success status.
    VulkanFailed,
+    /// Caller passed an invalid pool configuration (e.g. `max_sets ==
+    /// 0`, or every per-type cap is zero). Distinct from
+    /// `VulkanFailed` so callers can tell driver-side errors from
+    /// caller-side ones.
+    InvalidPoolConfig,
 };

 /// Construction caps. `max_sets` is the total number of descriptor
@ -55,7 +60,7 @@ pub fn init(opts: Options) Error!Self {
    // a downstream allocation failure.
    if (opts.max_sets == 0) {
        log.err("DescriptorPool.init: max_sets must be > 0", .{});
-        return error.VulkanFailed;
+        return error.InvalidPoolConfig;
    }
    if (opts.uniform_buffers == 0 and
        opts.combined_image_samplers == 0 and
@ -66,7 +71,7 @@ pub fn init(opts: Options) Error!Self {
                "(uniform_buffers, combined_image_samplers, storage_buffers)",
            .{},
        );
-        return error.VulkanFailed;
+        return error.InvalidPoolConfig;
    }

    // Build a small VkDescriptorPoolSize array from whichever caps
--- a/src/renderer/vulkan/Frame.zig
+++ b/src/renderer/vulkan/Frame.zig
@ -186,22 +186,14 @@ pub fn complete(self: *const Self, sync: bool) void {
    // Recycle the per-frame Buffer pool. Even on the error path we
    // still want to cycle: buffers that the failed submit referenced
    // are now stuck (we can't prove the GPU is done with them), so
-    // we conservatively wait the device idle on the unhealthy path
-    // before draining. Without this, every failed submit leaks
-    // every buffer the renderer queued for that frame.
-    if (health == .unhealthy and !submitted) {
-        // Submit never happened — nothing in flight references
-        // recorded buffers, safe to cycle directly.
-        Vulkan.buffer_pool.cycle(dev);
-    } else if (health == .unhealthy) {
-        // Submit happened but fence wait failed (DEVICE_LOST etc.).
-        // Drain the device before recycling to avoid use-after-free
-        // on whatever queue is still ticking.
+    // we conservatively wait the device idle when submit DID happen
+    // but the fence wait failed (DEVICE_LOST etc.) before draining.
+    // Without that wait, every failed submit could leak the buffers
+    // the renderer queued for the frame.
+    if (health == .unhealthy and submitted) {
        _ = dev.dispatch.deviceWaitIdle(dev.device);
-        Vulkan.buffer_pool.cycle(dev);
-    } else {
-        Vulkan.buffer_pool.cycle(dev);
    }
+    Vulkan.buffer_pool.cycle(dev);

    // Hand the rendered target off to the host. On the unhealthy
    // path we skip present — the dmabuf may be partially written
--- a/src/renderer/vulkan/RenderPass.zig
+++ b/src/renderer/vulkan/RenderPass.zig
@ -396,23 +396,42 @@ pub fn step(self: *Self, s: Step) void {
    var effective_sets: [Pipeline.MAX_DESCRIPTOR_SETS]vk.VkDescriptorSet =
        s.pipeline.descriptor_sets;
    const reused = self.markPipelineUsed(s.pipeline.pipeline);
-    if (reused) if (self.step_pool) |pool| {
+    if (reused) {
+        // No step_pool means the renderer thread has no per-frame
+        // descriptor pool wired up (test harness, smoke test). We
+        // can't safely re-use this pipeline — updating the static
+        // set in place would corrupt the prior draw's bindings.
+        // Drop the draw rather than corrupt the frame.
+        const pool = self.step_pool orelse {
+            log.err(
+                "RenderPass.step: pipeline re-used but no step_pool " ++
+                    "available; dropping draw to avoid corrupting prior draws",
+                .{},
+            );
+            return;
+        };
        for (s.pipeline.descriptor_set_layouts, 0..) |maybe_dsl, i| {
            if (i >= s.pipeline.set_count) break;
            const dsl = maybe_dsl orelse continue;
            if (pool.allocate(dsl)) |fresh| {
                effective_sets[i] = fresh;
            } else |err| {
+                // Pool exhausted. The previous behavior was to
+                // fall back to the pipeline's static set, but that
+                // re-introduces the exact corruption the step_pool
+                // mechanism exists to prevent. Drop the draw; the
+                // user sees one missed image rather than every
+                // image rendered with the last image's bindings.
                log.err(
                    "RenderPass.step: per-call descriptor set " ++
-                        "allocation for set {} failed ({}); falling " ++
-                        "back to the pipeline's static set, which " ++
-                        "may corrupt prior draws on this pipeline",
+                        "allocation for set {} failed ({}); dropping draw " ++
+                        "(step_pool exhausted — increase STEP_POOL_MAX_SETS)",
                    .{ i, err },
                );
+                return;
            }
        }
-    };
+    }

    // ---- update descriptor sets ---------------------------------
    //
--- a/src/renderer/vulkan/buffer.zig
+++ b/src/renderer/vulkan/buffer.zig
@ -111,13 +111,16 @@ pub fn Buffer(comptime T: type) type {
                self.memory,
                self.opts.usage,
                capacity_bytes,
-            ) catch {
+            ) catch |err| {
                // OOM growing the pool. The buffer may still be
                // referenced by an in-flight command buffer, so we
                // wait the entire device idle before destroying —
-                // expensive but correct. Logging here is awkward (no
-                // logger in scope) so we accept the loud failure and
-                // let Vulkan stderr diagnose anything that follows.
+                // expensive but correct.
+                log.warn(
+                    "Buffer.deinit: pool release failed ({}); falling " ++
+                        "back to vkDeviceWaitIdle + destroy",
+                    .{err},
+                );
                _ = dev.dispatch.deviceWaitIdle(dev.device);
                dev.dispatch.destroyBuffer(dev.device, self.buffer, null);
                dev.dispatch.freeMemory(dev.device, self.memory, null);
@ -261,14 +264,26 @@ pub fn Buffer(comptime T: type) type {
        }

        /// Grow the buffer to hold at least `new_len` Ts. Vulkan
-        /// buffers are immutable in size, so we route the old
-        /// buffer through the recycle pool (it may still be
-        /// referenced by the in-flight command buffer — destroying
-        /// it directly would race the GPU same as `deinit` would)
-        /// and create a fresh one. Contents are discarded; callers
+        /// buffers are immutable in size, so we allocate a fresh
+        /// one and then route the old one through the recycle pool
+        /// (it may still be referenced by the in-flight command
+        /// buffer — destroying it directly would race the GPU same
+        /// as `deinit` would). Contents are discarded; callers
        /// always `sync` immediately after `grow` returns.
+        ///
+        /// Order is critical: `create` first, `release` second.
+        /// If we released the old buffer first and `create`
+        /// failed, `self.{buffer,memory}` would be left dangling
+        /// at freed handles, and the caller's eventual
+        /// `self.deinit()` would double-destroy via the pool.
        fn grow(self: *Self, new_len: usize) Error!void {
            const dev = self.opts.device;
+            const replacement = try create(self.opts, new_len);
+            // From here on `self.{buffer,memory}` are the OLD pair;
+            // release them. If `release` itself OOMs, we have to
+            // destroy directly (same fallback as `deinit`), but the
+            // new pair is already constructed and `self.* =
+            // replacement` will reach a healthy state regardless.
            const bp = @import("../Vulkan.zig").buffer_pool;
            const capacity_bytes: u64 = @as(u64, self.len) * @sizeOf(T);
            bp.release(
@ -278,13 +293,10 @@ pub fn Buffer(comptime T: type) type {
                self.opts.usage,
                capacity_bytes,
            ) catch {
-                // OOM appending to the pool — wait the device idle
-                // and destroy directly. Same fallback as `deinit`.
                _ = dev.dispatch.deviceWaitIdle(dev.device);
                dev.dispatch.destroyBuffer(dev.device, self.buffer, null);
                dev.dispatch.freeMemory(dev.device, self.memory, null);
            };
-            const replacement = try create(self.opts, new_len);
            self.* = replacement;
        }

--- a/src/renderer/vulkan/shaders.zig
+++ b/src/renderer/vulkan/shaders.zig
@ -74,7 +74,9 @@ fn processIncludes(comptime contents: [:0]const u8) [:0]const u8 {
            // any horizontal whitespace and require exactly one
            // double-quoted path.
            var p = i + "#include".len;
-            while (p < contents.len and (contents[p] == ' ' or contents[p] == '\t')) : (p += 1) {}
+            while (p < contents.len and (contents[p] == ' ' or contents[p] == '\t' or
+                contents[p] == 0x0B or contents[p] == 0x0C)) : (p += 1)
+            {}
            if (p >= contents.len or contents[p] != '"') {
                @compileError("processIncludes: malformed #include directive in shader");
            }
@ -231,6 +233,10 @@ pub fn vulkanizeGlsl(
                    i += 1; // consume the '('
                    var depth: i32 = 1;
                    while (i < src.len and depth > 0) {
+                        // Skip comments and string literals verbatim
+                        // — a `(` or `)` inside `/* */` or `"..."`
+                        // shouldn't move our paren depth tracker.
+                        if (try copySkippable(alloc, &out, src, &i)) continue;
                        const cc = src[i];
                        if (cc == '(') depth += 1;
                        if (cc == ')') {
@ -390,6 +396,13 @@ fn isIdentChar(c: u8) bool {
 /// tiny — broader matching would force `textureLod` on the custom
 /// shader's `iChannel0`, which is normalized, and bypassing the
 /// implicit-LOD opcode path makes the driver work harder per call.
+///
+/// User-supplied custom shaders MUST NOT name a sampler `atlas_grayscale`
+/// or `atlas_color` — doing so will trigger this rewrite and replace
+/// `texture()` calls on that sampler with `textureLod(..., 0.0)`,
+/// which is incorrect for any normalized sampler. The shadertoy
+/// shader convention uses `iChannel0..3`, so the conflict is unlikely
+/// in practice.
 const unnormalized_sampler_names = [_][]const u8{
    "atlas_grayscale",
    "atlas_color",
@ -416,7 +429,10 @@ fn nextSamplerIsUnnormalized(src: []const u8, i: usize) bool {
 }

 fn isHorizSpace(c: u8) bool {
-    return c == ' ' or c == '\t';
+    // GLSL preprocessor whitespace per the spec: space, tab,
+    // vertical tab, form feed. Newline + carriage return are
+    // separately handled as line terminators.
+    return c == ' ' or c == '\t' or c == 0x0B or c == 0x0C;
 }

 fn isAnySpace(c: u8) bool {
@ -563,6 +579,24 @@ pub const Module = struct {
        spirv: []const u32,
        stage: Stage,
    ) Error!Module {
+        // Sanity-check the SPIR-V before handing it to the driver.
+        // The glslang and shadertoy paths both validate already, but
+        // a future caller wiring up the build-time-blob path could
+        // accidentally pass an empty or non-SPIR-V buffer; failing
+        // here with a clear error beats a `vkCreateShaderModule`
+        // segfault inside the loader.
+        if (spirv.len == 0) {
+            log.err("Module.initFromSpirv: zero-length SPIR-V buffer", .{});
+            return error.VulkanFailed;
+        }
+        if (spirv[0] != 0x07230203) {
+            log.err(
+                "Module.initFromSpirv: missing magic word " ++
+                    "(got 0x{x:0>8}, expected 0x07230203)",
+                .{spirv[0]},
+            );
+            return error.VulkanFailed;
+        }
        const info: vk.VkShaderModuleCreateInfo = .{
            .sType = vk.VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
            .pNext = null,