diff --git a/qt/src/GhosttySurface.cpp b/qt/src/GhosttySurface.cpp
index 2296fc42b..7a35457c1 100644
--- a/qt/src/GhosttySurface.cpp
+++ b/qt/src/GhosttySurface.cpp
@@ -1638,9 +1638,9 @@ void GhosttySurface::presentVulkanDmabuf(
           expected, true, std::memory_order_relaxed)) {
     std::fprintf(stderr,
                  "[ghastty] first dmabuf for surface=%p: fd=%d %ux%u "
-                 "stride=%u fourcc=0x%08x mod=0x%lx image_backed=%d path=%s\n",
+                 "stride=%u fourcc=0x%08x mod=0x%llx image_backed=%d path=%s\n",
                  static_cast<void *>(this), dmabuf_fd, width, height, stride,
-                 drm_format, static_cast<unsigned long>(drm_modifier),
+                 drm_format, static_cast<unsigned long long>(drm_modifier),
                  image_backed ? 1 : 0, useSubsurface ? "subsurface" : "qimage");
   }
 
@@ -1657,9 +1657,17 @@ void GhosttySurface::presentVulkanDmabuf(
   // terminal — and check that stride*height doesn't exceed
   // SIZE_MAX before promoting.
   constexpr quint32 MAX_DIM = 65536;
+  // Cap stride at MAX_DIM × 4 (BGRA8) × a small slack factor for
+  // tiled formats: ~4× the width-derived minimum is enough for any
+  // legitimate vendor tiling, and it keeps `stride * height`
+  // below ~64 GiB even at MAX_DIM. The previous lower-only bound
+  // let a pathological renderer with stride near UINT32_MAX and
+  // height=MAX_DIM reach mmap with a ~280 TB request.
+  constexpr quint32 MAX_STRIDE = MAX_DIM * 16;
   if (dmabuf_fd < 0 || width == 0 || height == 0) return;
   if (width > MAX_DIM || height > MAX_DIM) return;
   if (stride < static_cast<quint64>(width) * 4) return;
+  if (stride > MAX_STRIDE) return;
   // stride*height as 64-bit and check the size_t fit explicitly.
   const quint64 bytes64 = static_cast<quint64>(stride) * height;
   if (bytes64 > std::numeric_limits<std::size_t>::max()) return;
@@ -1737,10 +1745,23 @@ void GhosttySurface::presentVulkanDmabuf(
 
   const double dpr_now = m_fbDpr.load(std::memory_order_acquire);
   if (dpr_now > 0) owned.setDevicePixelRatio(dpr_now);
+  bool overwrote_legacy = false;
   {
     QMutexLocker lock(&m_pendingMutex);
+    overwrote_legacy = !m_pending.isNull();
     m_pending = std::move(owned);
   }
+  if (overwrote_legacy) {
+    const auto count = m_droppedFrames.fetch_add(
+        1, std::memory_order_relaxed) + 1;
+    if (count <= 3 || count % 60 == 0) {
+      std::fprintf(stderr,
+                   "[ghastty] surface=%p dropped frame "
+                   "(legacy QImage path, total=%llu)\n",
+                   static_cast<void *>(this),
+                   static_cast<unsigned long long>(count));
+    }
+  }
   QMetaObject::invokeMethod(this, "drainVulkan", Qt::QueuedConnection);
 }
 
@@ -1749,7 +1770,17 @@ void GhosttySurface::drainVulkan() {
   // under the mutex, then dispatch it to the presenter outside the
   // lock so a renderer-thread `presentVulkanDmabuf` parking the
   // next frame doesn't block on wl_display_flush.
-  if (m_hidden.load(std::memory_order_acquire)) return;
+  if (m_hidden.load(std::memory_order_acquire)) {
+    // Clear the parked descriptor on hide so the next post-Show
+    // present doesn't see a "stale frame still pending" state and
+    // spuriously bump m_droppedFrames every Hide/Show cycle. The
+    // fd itself is libghostty-owned (per ABI it's only valid for
+    // the duration of the original presentVulkanDmabuf call), so
+    // there's nothing to release here beyond marking the slot empty.
+    QMutexLocker lock(&m_pendingMutex);
+    m_pendingDmabuf.fd = -1;
+    return;
+  }
   if (m_useSubsurface.load(std::memory_order_acquire) &&
       m_subsurfacePresenter) {
     PendingDmabuf frame;
diff --git a/qt/src/GhosttySurface.h b/qt/src/GhosttySurface.h
index 41738d328..22cd82eb5 100644
--- a/qt/src/GhosttySurface.h
+++ b/qt/src/GhosttySurface.h
@@ -170,8 +170,9 @@ public:
   // mmap+memcpy'd QImage) and wakes the GUI thread via
   // `QMetaObject::invokeMethod(this, drainVulkan, Qt::QueuedConnection)`.
   // The GUI thread either commits the dmabuf to the wl_subsurface
-  // (zero-copy) or paints the QImage (fallback). A 2 ms safety-net
-  // poll catches anything `invokeMethod` ever fails to deliver.
+  // (zero-copy) or paints the QImage (fallback). The dropped-frame
+  // counter `m_droppedFrames` makes any genuine queue-loss visible
+  // (zero in the steady state).
   Q_INVOKABLE void presentVulkanDmabuf(
       int dmabuf_fd,
       quint32 drm_format,
diff --git a/qt/src/wayland/SubsurfacePresenter.cpp b/qt/src/wayland/SubsurfacePresenter.cpp
index f0b42655a..681b2f2f7 100644
--- a/qt/src/wayland/SubsurfacePresenter.cpp
+++ b/qt/src/wayland/SubsurfacePresenter.cpp
@@ -86,16 +86,29 @@ void registryGlobal(void *data, wl_registry *registry, uint32_t name,
     g->subcompositor = static_cast<wl_subcompositor *>(
         wl_registry_bind(registry, name, &wl_subcompositor_interface, 1));
   } else if (std::strcmp(interface, zwp_linux_dmabuf_v1_interface.name) == 0) {
-    // v3 has `create_immed`, which we want (synchronous wl_buffer
-    // creation — the v2 async `create` + `created`/`failed` event
-    // dance would add a layer of callback machinery for no real win
-    // in our renderer's strict-fd-validity scenario). v4 adds the
-    // dynamic format/modifier feedback dance; we don't need it yet.
-    g->dmabuf = static_cast<zwp_linux_dmabuf_v1 *>(wl_registry_bind(
-        registry, name, &zwp_linux_dmabuf_v1_interface, 3));
-    // Add the listener immediately so the modifier events queued by
-    // the bind get delivered when the dispatch loop continues.
-    zwp_linux_dmabuf_v1_add_listener(g->dmabuf, &kDmabufListener, g);
+    // We want at least v3 for `create_immed` (synchronous wl_buffer
+    // creation — v1/v2 have only the async `create` + `created`/
+    // `failed` dance). A compositor that only advertises v1/v2
+    // can't satisfy our protocol assumptions; binding at v3 against
+    // such a compositor would protocol-error and tear down the
+    // entire wl_display. Skip the bind in that case so the
+    // legacy QImage fallback engages cleanly.
+    if (version < 3) {
+      std::fprintf(stderr,
+                   "[ghastty] wayland: linux-dmabuf-v1 advertised at "
+                   "version %u; need >= 3 for create_immed, falling back "
+                   "to QImage path\n",
+                   version);
+    } else {
+      // Cap at v3 — v4 adds the dynamic format/modifier feedback
+      // dance which we don't consume.
+      const uint32_t v = std::min<uint32_t>(version, 3u);
+      g->dmabuf = static_cast<zwp_linux_dmabuf_v1 *>(wl_registry_bind(
+          registry, name, &zwp_linux_dmabuf_v1_interface, v));
+      // Add the listener immediately so the modifier events queued
+      // by the bind get delivered when the dispatch loop continues.
+      zwp_linux_dmabuf_v1_add_listener(g->dmabuf, &kDmabufListener, g);
+    }
   } else if (std::strcmp(interface, wp_viewporter_interface.name) == 0) {
     g->viewporter = static_cast<wp_viewporter *>(
         wl_registry_bind(registry, name, &wp_viewporter_interface, 1));
@@ -125,14 +138,24 @@ PresenterGlobals *discoverGlobals(wl_display *display) {
   // Roundtrip 1: bind compositor/subcompositor/dmabuf. Inside the
   // registry callback we attach the dmabuf listener immediately, so
   // any format/modifier events that arrive in the same dispatch
-  // pass fire on it.
-  wl_display_roundtrip_queue(display, queue);
+  // pass fire on it. A negative return means the wl_display
+  // disconnected mid-startup; subsequent tryCreate calls fall
+  // through to the QImage path (g->compositor etc. stay null).
+  if (wl_display_roundtrip_queue(display, queue) < 0) {
+    std::fprintf(stderr,
+                 "[ghastty] wayland: discoverGlobals roundtrip 1 failed; "
+                 "subsurface present path disabled\n");
+  }
   wl_registry_destroy(registry);
   // Roundtrip 2: belt-and-suspenders for any compositor that defers
   // the modifier events past the bind reply (most don't, but some
   // batch them). After this returns the modifier table is fully
   // populated and frozen for the process lifetime.
-  if (globals.dmabuf) wl_display_roundtrip_queue(display, queue);
+  if (globals.dmabuf && wl_display_roundtrip_queue(display, queue) < 0) {
+    std::fprintf(stderr,
+                 "[ghastty] wayland: discoverGlobals roundtrip 2 failed; "
+                 "modifier table may be incomplete\n");
+  }
 
   std::size_t total_mods = 0;
   for (const auto &kv : globals.modifiers) total_mods += kv.second.size();
diff --git a/src/renderer/Vulkan.zig b/src/renderer/Vulkan.zig
index 563dbbcc7..983822b0f 100644
--- a/src/renderer/Vulkan.zig
+++ b/src/renderer/Vulkan.zig
@@ -206,25 +206,39 @@ pub const buffer_pool = struct {
     /// `Frame.complete` after `vkWaitForFences`.
     ///
     /// `dev` is needed only on the OOM fallback path: if `ready`
-    /// can't grow to absorb `pending`, we wait the device idle and
-    /// then destroy the pending entries directly so the next frame
-    /// doesn't double up on a pending list that can never drain.
+    /// can't grow to absorb `pending`, we wait the device idle
+    /// (OUTSIDE the mutex — see below) and then destroy the pending
+    /// entries directly so the next frame doesn't double up on a
+    /// pending list that can never drain.
     pub fn cycle(dev: *const Device) void {
-        mutex.lock();
-        defer mutex.unlock();
-        ready.appendSlice(std.heap.smp_allocator, pending.items) catch {
-            // Couldn't grow `ready` — destroy the pending GPU
-            // resources directly. Other renderer threads may still
-            // be submitting against the shared queue, so wait the
-            // device idle to make sure no command buffer in flight
-            // anywhere references these handles before we destroy.
-            _ = dev.dispatch.deviceWaitIdle(dev.device);
-            for (pending.items) |e| {
-                dev.dispatch.destroyBuffer(dev.device, e.buffer, null);
-                dev.dispatch.freeMemory(dev.device, e.memory, null);
+        // Try the fast path first — append `pending` to `ready`
+        // under the lock, then return. On OOM we have to destroy
+        // the pending entries, but `vkDeviceWaitIdle` is slow and
+        // holding the pool mutex across it would block every other
+        // renderer thread's release/acquire/cycle. Move the
+        // pending list into a local outside the lock, then drain.
+        var oom_pending: std.ArrayList(Entry) = .{};
+        defer oom_pending.deinit(std.heap.smp_allocator);
+        {
+            mutex.lock();
+            defer mutex.unlock();
+            if (ready.appendSlice(std.heap.smp_allocator, pending.items)) {
+                pending.clearRetainingCapacity();
+                return;
+            } else |_| {
+                // OOM. Move `pending` into our local so we can
+                // drain it without holding the mutex.
+                oom_pending = pending;
+                pending = .{};
             }
-        };
-        pending.clearRetainingCapacity();
+        }
+        // Mutex released. Other threads can release/acquire/cycle
+        // while we wait the device idle and destroy our slice.
+        _ = dev.dispatch.deviceWaitIdle(dev.device);
+        for (oom_pending.items) |e| {
+            dev.dispatch.destroyBuffer(dev.device, e.buffer, null);
+            dev.dispatch.freeMemory(dev.device, e.memory, null);
+        }
     }
 
     /// Tear down both lists. Call only when the device is idle
@@ -583,19 +597,48 @@ pub fn beginFrame(
     }
 
     _ = self;
-    // Reset the command buffer + fence + step descriptor pool so
-    // this frame starts clean. `vkResetDescriptorPool` returns every
-    // set the previous frame allocated to the pool — much cheaper
-    // than freeing them individually, and removes any chance of
-    // last-frame's set being bound by accident.
+    // Reset this frame's per-frame state. ORDER MATTERS: the fence
+    // reset goes LAST. If an earlier reset fails and we return an
+    // error, `Frame.begin` never runs, no submit ever happens, and
+    // the fence stays in whatever state it was in (initially
+    // signaled, or signaled by the previous frame's submit). Any
+    // subsequent `Vulkan.deinit` then waits on a SIGNALED fence
+    // and returns immediately — pre-fix, an unsignaled fence with
+    // no pending submit hung the deinit forever on
+    // waitForFences(UINT64_MAX).
     if (dev.dispatch.resetCommandBuffer(frame_cb, 0) != vk.VK_SUCCESS)
         return error.VulkanFailed;
-    if (dev.dispatch.resetFences(dev.device, 1, &frame_fence) != vk.VK_SUCCESS)
-        return error.VulkanFailed;
     if (step_pool) |*p| {
         if (dev.dispatch.resetDescriptorPool(dev.device, p.pool, 0) != vk.VK_SUCCESS)
             return error.VulkanFailed;
     }
+    // `vkResetDescriptorPool` returns every set the previous frame
+    // allocated to the pool — much cheaper than freeing them
+    // individually, and removes any chance of last-frame's set
+    // being bound by accident.
+    if (dev.dispatch.resetFences(dev.device, 1, &frame_fence) != vk.VK_SUCCESS)
+        return error.VulkanFailed;
+    // From here on the fence is UNSIGNALED. If `Frame.begin`
+    // (vkBeginCommandBuffer) fails before any submit, we have to
+    // re-signal the fence ourselves — otherwise the next
+    // `Vulkan.deinit`'s waitForFences hangs indefinitely.
+    errdefer {
+        // Empty submit with this fence as the signal target is the
+        // simplest portable way to push it back to signaled
+        // without recording any commands.
+        const empty: vk.VkSubmitInfo = .{
+            .sType = vk.VK_STRUCTURE_TYPE_SUBMIT_INFO,
+            .pNext = null,
+            .waitSemaphoreCount = 0,
+            .pWaitSemaphores = null,
+            .pWaitDstStageMask = null,
+            .commandBufferCount = 0,
+            .pCommandBuffers = null,
+            .signalSemaphoreCount = 0,
+            .pSignalSemaphores = null,
+        };
+        _ = dev.queueSubmit(1, &empty, frame_fence);
+    }
 
     return try Frame.begin(
         .{
diff --git a/src/renderer/shadertoy.zig b/src/renderer/shadertoy.zig
index f85b98271..24db7e592 100644
--- a/src/renderer/shadertoy.zig
+++ b/src/renderer/shadertoy.zig
@@ -184,6 +184,30 @@ pub fn loadFromFile(
         .glsl => try glslFromSpv(alloc_gpa, spirv),
         .msl => try mslFromSpv(alloc_gpa, spirv),
         .spv => spv: {
+            // Validate before handing back: glslang has succeeded at
+            // this point but a zero-length SPIR-V output would
+            // crash `vkCreateShaderModule` (codeSize == 0). The
+            // SPIR-V magic word check is defensive against future
+            // backends that bypass glslang.
+            if (spirv.len < 4) {
+                std.log.warn(
+                    "shadertoy: empty SPIR-V output (size={})",
+                    .{spirv.len},
+                );
+                return error.InvalidShader;
+            }
+            // First 4 bytes are the SPIR-V magic word 0x07230203
+            // (little-endian). Reject anything else loudly instead
+            // of letting the driver crash.
+            const magic = std.mem.readInt(u32, spirv[0..4], .little);
+            if (magic != 0x07230203) {
+                std.log.warn(
+                    "shadertoy: SPIR-V output missing magic word " ++
+                        "(got 0x{x:0>8}, expected 0x07230203)",
+                    .{magic},
+                );
+                return error.InvalidShader;
+            }
             // Copy the SPIR-V binary out of the arena into a
             // 4-byte-aligned allocation under `alloc_gpa`. Vulkan
             // expects `pCode: []const u32`, so over-aligning is safe;
@@ -219,10 +243,12 @@ pub fn glslFromShader(
         // Find the first newline after `#version ...` and inject the
         // defines on the following line. The prefix is expected to
         // start with `#version` followed by a newline; if a future
-        // edit ever drops that newline (e.g. a single-line prefix)
-        // we inject the defines BEFORE the prefix so glslang sees
-        // the directives on their own lines and reports a clear
-        // error instead of us crashing on a `null.?` unwrap.
+        // edit ever drops that newline (e.g. a single-line prefix
+        // entirely on one line), we synthesize one between
+        // `#version` and the rest, then inject the defines after.
+        // GLSL requires `#version` to be the first non-blank line,
+        // so injecting BEFORE it would silently produce invalid
+        // GLSL.
         if (std.mem.indexOfScalar(u8, prefix, '\n')) |first_nl| {
             try writer.writeAll(prefix[0 .. first_nl + 1]);
             for (defines) |def| {
@@ -231,12 +257,31 @@ pub fn glslFromShader(
                 try writer.writeAll("\n");
             }
             try writer.writeAll(prefix[first_nl + 1 ..]);
-        } else {
+        } else if (std.mem.startsWith(u8, prefix, "#version")) {
+            // No newline anywhere, but it does start with `#version`.
+            // Find the end of the version directive: scan past the
+            // version number to the first non-version-token char,
+            // synthesize a newline there, then write defines and
+            // the rest of the prefix.
+            var p: usize = "#version".len;
+            while (p < prefix.len and (prefix[p] == ' ' or prefix[p] == '\t')) p += 1;
+            while (p < prefix.len and prefix[p] >= '0' and prefix[p] <= '9') p += 1;
+            // Optional profile (`core` / `compatibility` / `es`).
+            while (p < prefix.len and (prefix[p] == ' ' or prefix[p] == '\t')) p += 1;
+            while (p < prefix.len and ((prefix[p] >= 'a' and prefix[p] <= 'z') or
+                (prefix[p] >= 'A' and prefix[p] <= 'Z'))) p += 1;
+            try writer.writeAll(prefix[0..p]);
+            try writer.writeByte('\n');
             for (defines) |def| {
                 try writer.writeAll("#define ");
                 try writer.writeAll(def);
                 try writer.writeAll("\n");
             }
+            try writer.writeAll(prefix[p..]);
+        } else {
+            // Prefix doesn't start with `#version` either — the
+            // shader is malformed. Pass it through as-is so glslang
+            // reports a clear parse error.
             try writer.writeAll(prefix);
         }
     }
diff --git a/src/renderer/vulkan/DescriptorPool.zig b/src/renderer/vulkan/DescriptorPool.zig
index 373074ae6..3fb8510a1 100644
--- a/src/renderer/vulkan/DescriptorPool.zig
+++ b/src/renderer/vulkan/DescriptorPool.zig
@@ -30,6 +30,11 @@ pub const Error = error{
     /// `vkCreateDescriptorPool` / `vkAllocateDescriptorSets` returned
     /// a non-success status.
     VulkanFailed,
+    /// Caller passed an invalid pool configuration (e.g. `max_sets ==
+    /// 0`, or every per-type cap is zero). Distinct from
+    /// `VulkanFailed` so callers can tell driver-side errors from
+    /// caller-side ones.
+    InvalidPoolConfig,
 };
 
 /// Construction caps. `max_sets` is the total number of descriptor
@@ -55,7 +60,7 @@ pub fn init(opts: Options) Error!Self {
     // a downstream allocation failure.
     if (opts.max_sets == 0) {
         log.err("DescriptorPool.init: max_sets must be > 0", .{});
-        return error.VulkanFailed;
+        return error.InvalidPoolConfig;
     }
     if (opts.uniform_buffers == 0 and
         opts.combined_image_samplers == 0 and
@@ -66,7 +71,7 @@ pub fn init(opts: Options) Error!Self {
                 "(uniform_buffers, combined_image_samplers, storage_buffers)",
             .{},
         );
-        return error.VulkanFailed;
+        return error.InvalidPoolConfig;
     }
 
     // Build a small VkDescriptorPoolSize array from whichever caps
diff --git a/src/renderer/vulkan/Frame.zig b/src/renderer/vulkan/Frame.zig
index 0b9d6faa2..d63dabd6c 100644
--- a/src/renderer/vulkan/Frame.zig
+++ b/src/renderer/vulkan/Frame.zig
@@ -186,22 +186,14 @@ pub fn complete(self: *const Self, sync: bool) void {
     // Recycle the per-frame Buffer pool. Even on the error path we
     // still want to cycle: buffers that the failed submit referenced
     // are now stuck (we can't prove the GPU is done with them), so
-    // we conservatively wait the device idle on the unhealthy path
-    // before draining. Without this, every failed submit leaks
-    // every buffer the renderer queued for that frame.
-    if (health == .unhealthy and !submitted) {
-        // Submit never happened — nothing in flight references
-        // recorded buffers, safe to cycle directly.
-        Vulkan.buffer_pool.cycle(dev);
-    } else if (health == .unhealthy) {
-        // Submit happened but fence wait failed (DEVICE_LOST etc.).
-        // Drain the device before recycling to avoid use-after-free
-        // on whatever queue is still ticking.
+    // we conservatively wait the device idle when submit DID happen
+    // but the fence wait failed (DEVICE_LOST etc.) before draining.
+    // Without that wait, every failed submit could leak the buffers
+    // the renderer queued for the frame.
+    if (health == .unhealthy and submitted) {
         _ = dev.dispatch.deviceWaitIdle(dev.device);
-        Vulkan.buffer_pool.cycle(dev);
-    } else {
-        Vulkan.buffer_pool.cycle(dev);
     }
+    Vulkan.buffer_pool.cycle(dev);
 
     // Hand the rendered target off to the host. On the unhealthy
     // path we skip present — the dmabuf may be partially written
diff --git a/src/renderer/vulkan/RenderPass.zig b/src/renderer/vulkan/RenderPass.zig
index b3db3ce7e..5c68b1600 100644
--- a/src/renderer/vulkan/RenderPass.zig
+++ b/src/renderer/vulkan/RenderPass.zig
@@ -396,23 +396,42 @@ pub fn step(self: *Self, s: Step) void {
     var effective_sets: [Pipeline.MAX_DESCRIPTOR_SETS]vk.VkDescriptorSet =
         s.pipeline.descriptor_sets;
     const reused = self.markPipelineUsed(s.pipeline.pipeline);
-    if (reused) if (self.step_pool) |pool| {
+    if (reused) {
+        // No step_pool means the renderer thread has no per-frame
+        // descriptor pool wired up (test harness, smoke test). We
+        // can't safely re-use this pipeline — updating the static
+        // set in place would corrupt the prior draw's bindings.
+        // Drop the draw rather than corrupt the frame.
+        const pool = self.step_pool orelse {
+            log.err(
+                "RenderPass.step: pipeline re-used but no step_pool " ++
+                    "available; dropping draw to avoid corrupting prior draws",
+                .{},
+            );
+            return;
+        };
         for (s.pipeline.descriptor_set_layouts, 0..) |maybe_dsl, i| {
             if (i >= s.pipeline.set_count) break;
             const dsl = maybe_dsl orelse continue;
             if (pool.allocate(dsl)) |fresh| {
                 effective_sets[i] = fresh;
             } else |err| {
+                // Pool exhausted. The previous behavior was to
+                // fall back to the pipeline's static set, but that
+                // re-introduces the exact corruption the step_pool
+                // mechanism exists to prevent. Drop the draw; the
+                // user sees one missed image rather than every
+                // image rendered with the last image's bindings.
                 log.err(
                     "RenderPass.step: per-call descriptor set " ++
-                        "allocation for set {} failed ({}); falling " ++
-                        "back to the pipeline's static set, which " ++
-                        "may corrupt prior draws on this pipeline",
+                        "allocation for set {} failed ({}); dropping draw " ++
+                        "(step_pool exhausted — increase STEP_POOL_MAX_SETS)",
                     .{ i, err },
                 );
+                return;
             }
         }
-    };
+    }
 
     // ---- update descriptor sets ---------------------------------
     //
diff --git a/src/renderer/vulkan/buffer.zig b/src/renderer/vulkan/buffer.zig
index cd73eccce..0e29da584 100644
--- a/src/renderer/vulkan/buffer.zig
+++ b/src/renderer/vulkan/buffer.zig
@@ -111,13 +111,16 @@ pub fn Buffer(comptime T: type) type {
                 self.memory,
                 self.opts.usage,
                 capacity_bytes,
-            ) catch {
+            ) catch |err| {
                 // OOM growing the pool. The buffer may still be
                 // referenced by an in-flight command buffer, so we
                 // wait the entire device idle before destroying —
-                // expensive but correct. Logging here is awkward (no
-                // logger in scope) so we accept the loud failure and
-                // let Vulkan stderr diagnose anything that follows.
+                // expensive but correct.
+                log.warn(
+                    "Buffer.deinit: pool release failed ({}); falling " ++
+                        "back to vkDeviceWaitIdle + destroy",
+                    .{err},
+                );
                 _ = dev.dispatch.deviceWaitIdle(dev.device);
                 dev.dispatch.destroyBuffer(dev.device, self.buffer, null);
                 dev.dispatch.freeMemory(dev.device, self.memory, null);
@@ -261,14 +264,26 @@ pub fn Buffer(comptime T: type) type {
         }
 
         /// Grow the buffer to hold at least `new_len` Ts. Vulkan
-        /// buffers are immutable in size, so we route the old
-        /// buffer through the recycle pool (it may still be
-        /// referenced by the in-flight command buffer — destroying
-        /// it directly would race the GPU same as `deinit` would)
-        /// and create a fresh one. Contents are discarded; callers
+        /// buffers are immutable in size, so we allocate a fresh
+        /// one and then route the old one through the recycle pool
+        /// (it may still be referenced by the in-flight command
+        /// buffer — destroying it directly would race the GPU same
+        /// as `deinit` would). Contents are discarded; callers
         /// always `sync` immediately after `grow` returns.
+        ///
+        /// Order is critical: `create` first, `release` second.
+        /// If we released the old buffer first and `create`
+        /// failed, `self.{buffer,memory}` would be left dangling
+        /// at freed handles, and the caller's eventual
+        /// `self.deinit()` would double-destroy via the pool.
         fn grow(self: *Self, new_len: usize) Error!void {
             const dev = self.opts.device;
+            const replacement = try create(self.opts, new_len);
+            // From here on `self.{buffer,memory}` are the OLD pair;
+            // release them. If `release` itself OOMs, we have to
+            // destroy directly (same fallback as `deinit`), but the
+            // new pair is already constructed and `self.* =
+            // replacement` will reach a healthy state regardless.
             const bp = @import("../Vulkan.zig").buffer_pool;
             const capacity_bytes: u64 = @as(u64, self.len) * @sizeOf(T);
             bp.release(
@@ -278,13 +293,10 @@ pub fn Buffer(comptime T: type) type {
                 self.opts.usage,
                 capacity_bytes,
             ) catch {
-                // OOM appending to the pool — wait the device idle
-                // and destroy directly. Same fallback as `deinit`.
                 _ = dev.dispatch.deviceWaitIdle(dev.device);
                 dev.dispatch.destroyBuffer(dev.device, self.buffer, null);
                 dev.dispatch.freeMemory(dev.device, self.memory, null);
             };
-            const replacement = try create(self.opts, new_len);
             self.* = replacement;
         }
 
diff --git a/src/renderer/vulkan/shaders.zig b/src/renderer/vulkan/shaders.zig
index 767e11d5d..bab1b9cce 100644
--- a/src/renderer/vulkan/shaders.zig
+++ b/src/renderer/vulkan/shaders.zig
@@ -74,7 +74,9 @@ fn processIncludes(comptime contents: [:0]const u8) [:0]const u8 {
             // any horizontal whitespace and require exactly one
             // double-quoted path.
             var p = i + "#include".len;
-            while (p < contents.len and (contents[p] == ' ' or contents[p] == '\t')) : (p += 1) {}
+            while (p < contents.len and (contents[p] == ' ' or contents[p] == '\t' or
+                contents[p] == 0x0B or contents[p] == 0x0C)) : (p += 1)
+            {}
             if (p >= contents.len or contents[p] != '"') {
                 @compileError("processIncludes: malformed #include directive in shader");
             }
@@ -231,6 +233,10 @@ pub fn vulkanizeGlsl(
                     i += 1; // consume the '('
                     var depth: i32 = 1;
                     while (i < src.len and depth > 0) {
+                        // Skip comments and string literals verbatim
+                        // — a `(` or `)` inside `/* */` or `"..."`
+                        // shouldn't move our paren depth tracker.
+                        if (try copySkippable(alloc, &out, src, &i)) continue;
                         const cc = src[i];
                         if (cc == '(') depth += 1;
                         if (cc == ')') {
@@ -390,6 +396,13 @@ fn isIdentChar(c: u8) bool {
 /// tiny — broader matching would force `textureLod` on the custom
 /// shader's `iChannel0`, which is normalized, and bypassing the
 /// implicit-LOD opcode path makes the driver work harder per call.
+///
+/// User-supplied custom shaders MUST NOT name a sampler `atlas_grayscale`
+/// or `atlas_color` — doing so will trigger this rewrite and replace
+/// `texture()` calls on that sampler with `textureLod(..., 0.0)`,
+/// which is incorrect for any normalized sampler. The shadertoy
+/// shader convention uses `iChannel0..3`, so the conflict is unlikely
+/// in practice.
 const unnormalized_sampler_names = [_][]const u8{
     "atlas_grayscale",
     "atlas_color",
@@ -416,7 +429,10 @@ fn nextSamplerIsUnnormalized(src: []const u8, i: usize) bool {
 }
 
 fn isHorizSpace(c: u8) bool {
-    return c == ' ' or c == '\t';
+    // GLSL preprocessor whitespace per the spec: space, tab,
+    // vertical tab, form feed. Newline + carriage return are
+    // separately handled as line terminators.
+    return c == ' ' or c == '\t' or c == 0x0B or c == 0x0C;
 }
 
 fn isAnySpace(c: u8) bool {
@@ -563,6 +579,24 @@ pub const Module = struct {
         spirv: []const u32,
         stage: Stage,
     ) Error!Module {
+        // Sanity-check the SPIR-V before handing it to the driver.
+        // The glslang and shadertoy paths both validate already, but
+        // a future caller wiring up the build-time-blob path could
+        // accidentally pass an empty or non-SPIR-V buffer; failing
+        // here with a clear error beats a `vkCreateShaderModule`
+        // segfault inside the loader.
+        if (spirv.len == 0) {
+            log.err("Module.initFromSpirv: zero-length SPIR-V buffer", .{});
+            return error.VulkanFailed;
+        }
+        if (spirv[0] != 0x07230203) {
+            log.err(
+                "Module.initFromSpirv: missing magic word " ++
+                    "(got 0x{x:0>8}, expected 0x07230203)",
+                .{spirv[0]},
+            );
+            return error.VulkanFailed;
+        }
         const info: vk.VkShaderModuleCreateInfo = .{
             .sType = vk.VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
             .pNext = null,