diff --git a/qt/CMakeLists.txt b/qt/CMakeLists.txt
index 9f5a81e9d..e5729a939 100644
--- a/qt/CMakeLists.txt
+++ b/qt/CMakeLists.txt
@@ -143,6 +143,15 @@ set(GHOSTTY_SO "${GHOSTTY_LIB_DIR}/ghostty-internal.so")
 set(GHASTTY_VARIANT "opengl" CACHE STRING
     "Renderer variant: opengl (default) or vulkan")
 set_property(CACHE GHASTTY_VARIANT PROPERTY STRINGS opengl vulkan)
+# Validate the cache value: STRINGS only constrains the cmake-gui
+# dropdown, not the command-line. `-DGHASTTY_VARIANT=foo` would
+# otherwise silently fall into the OpenGL branch below.
+if(NOT GHASTTY_VARIANT STREQUAL "opengl" AND
+   NOT GHASTTY_VARIANT STREQUAL "vulkan")
+  message(FATAL_ERROR
+    "GHASTTY_VARIANT='${GHASTTY_VARIANT}' is invalid; "
+    "must be 'opengl' or 'vulkan'.")
+endif()
 if(GHASTTY_VARIANT STREQUAL "vulkan")
   set(GHASTTY_EXE_NAME "ghastty-vulkan")
   set(GHASTTY_LIB_SUBDIR "ghastty-vulkan")
@@ -189,7 +198,6 @@ add_custom_target(ghostty_link DEPENDS "${GHOSTTY_LINK_SO}")
 
 add_executable(ghastty
   src/main.cpp
-  src/vulkan/Host.cpp
   src/actions/ActionDispatcher.cpp
   src/actions/ChromeActions.cpp
   src/actions/InputActions.cpp
@@ -226,6 +234,15 @@ add_executable(ghastty
   "${FRACSCALE_HEADER}"
 )
 
+# Vulkan host glue is variant-only. Adding it to the OpenGL build
+# would force an unconditional libvulkan link on a binary that
+# never calls into Vulkan, contradicting the side-by-side
+# `~/.local/lib/libghostty.so` story that the variant block above
+# documents.
+if(GHASTTY_VARIANT STREQUAL "vulkan")
+  target_sources(ghastty PRIVATE src/vulkan/Host.cpp)
+endif()
+
 # Embed the app icon so it is available even running from the build tree.
 qt_add_resources(ghastty "appicon"
   PREFIX "/"
@@ -250,10 +267,18 @@ target_link_libraries(ghastty PRIVATE
   PkgConfig::XKBCOMMON
   PkgConfig::EGL
   LayerShellQt::Interface
-  vulkan
   "${GHOSTTY_LINK_SO}"
 )
 
+# libvulkan is Vulkan-variant only. The OpenGL variant compiles
+# nothing that references Vulkan symbols (vulkan/Host.cpp is gated
+# above), so not linking libvulkan keeps OpenGL-only systems from
+# needing the loader installed at runtime — matching the
+# documented side-by-side variant story above.
+if(GHASTTY_VARIANT STREQUAL "vulkan")
+  target_link_libraries(ghastty PRIVATE vulkan)
+endif()
+
 # Hook up the private QPA headers (see find_package above).
 #
 # Qt6::WaylandClientPrivate gives us QtWaylandClient::QWaylandWindow,
diff --git a/qt/src/GhosttySurface.cpp b/qt/src/GhosttySurface.cpp
index d0b67c22b..2296fc42b 100644
--- a/qt/src/GhosttySurface.cpp
+++ b/qt/src/GhosttySurface.cpp
@@ -8,7 +8,9 @@
 #include "SearchBar.h"
 #include "TabWidget.h"
 #include "Util.h"
+#ifdef GHASTTY_USE_VULKAN
 #include "vulkan/Host.h"
+#endif
 #include "wayland/EglDmabufTarget.h"
 #include "wayland/SubsurfacePresenter.h"
 
@@ -112,12 +114,43 @@ GhosttySurface::GhosttySurface(ghostty_app_t app, MainWindow *owner,
   // produce a mismatch crash. Mixing GL+VK on the same process
   // (e.g. NVIDIA's coexistence on one Wayland surface) is also
   // reportedly fragile.
-  vulkan::Host *vk_host = nullptr;
-#ifdef GHASTTY_USE_VULKAN
-  vk_host = vulkan::Host::instance();
-#endif
+  // The "use Vulkan" decision is purely compile-time on this fork:
+  // each binary is linked against exactly one libghostty.so variant
+  // (opengl or vulkan). A runtime fallback would just mis-initialize
+  // the surface against the wrong renderer.
+  ghostty_surface_config_s sc =
+      m_parentSurface
+          ? ghostty_surface_inherited_config(m_parentSurface,
+                                             GHOSTTY_SURFACE_CONTEXT_TAB)
+          : ghostty_surface_config_new();
 
-  if (vk_host == nullptr) {
+#ifdef GHASTTY_USE_VULKAN
+  {
+    vulkan::Host *vk_host = vulkan::Host::instance();
+    if (vk_host == nullptr) {
+      // libghostty was compiled with -Drenderer=vulkan and there's
+      // no GL fallback available: libghostty's GL surface init
+      // would crash on the first call. Fail loudly here.
+      std::fprintf(stderr,
+                   "[ghastty] Vulkan host bring-up failed (no Vulkan 1.3 "
+                   "GPU with VK_KHR_external_memory_fd + "
+                   "VK_EXT_external_memory_dma_buf). The Vulkan variant "
+                   "of libghostty has no OpenGL fallback — exiting.\n");
+      std::abort();
+    }
+    m_useVulkan = true;
+    sc.platform_tag = GHOSTTY_PLATFORM_VULKAN;
+    sc.platform.vulkan = vk_host->asPlatform(this);
+
+    // GUI-thread frame delivery is driven by
+    // `QMetaObject::invokeMethod` (Qt::QueuedConnection) from
+    // `presentVulkanDmabuf`. The earlier 2 ms safety-net polling
+    // timer was removed once delivery was shown to be reliable;
+    // any genuine loss is visible via the dropped-frame counter
+    // logged from `presentVulkanDmabuf`.
+  }
+#else
+  {
     // OpenGL path: stand up the private context + offscreen FBO
     // libghostty's GL renderer draws into.
     m_context = new QOpenGLContext(this);
@@ -140,33 +173,7 @@ GhosttySurface::GhosttySurface(ghostty_app_t app, MainWindow *owner,
     fmt.setInternalTextureFormat(GL_RGBA8);
     m_fbw = m_fbh = 16;
     m_fbo = new QOpenGLFramebufferObject(QSize(m_fbw, m_fbh), fmt);
-  }
 
-  ghostty_surface_config_s sc =
-      m_parentSurface
-          ? ghostty_surface_inherited_config(m_parentSurface,
-                                             GHOSTTY_SURFACE_CONTEXT_TAB)
-          : ghostty_surface_config_new();
-
-  if (vk_host != nullptr) {
-    m_useVulkan = true;
-    sc.platform_tag = GHOSTTY_PLATFORM_VULKAN;
-    sc.platform.vulkan = vk_host->asPlatform(this);
-
-    // GUI-thread frame drain. The renderer thread wakes us per frame
-    // via QMetaObject::invokeMethod (Qt::QueuedConnection) on each
-    // present — see `presentVulkanDmabuf`. The 2 ms timer is a
-    // safety net: if `invokeMethod` ever fails to deliver (the
-    // earlier QImage-handoff diagnostics suggested this could
-    // happen), the next tick drains the parked frame within at most
-    // 2 ms. Idle case has negligible CPU cost because `drainVulkan`
-    // returns immediately when nothing is pending.
-    m_vulkanPollTimer = new QTimer(this);
-    m_vulkanPollTimer->setInterval(2);
-    connect(m_vulkanPollTimer, &QTimer::timeout, this,
-            [this]() { drainVulkan(); });
-    m_vulkanPollTimer->start();
-  } else {
     sc.platform_tag = GHOSTTY_PLATFORM_OPENGL;
     sc.platform.opengl.userdata = this;
     sc.platform.opengl.get_proc_address = glGetProcAddress;
@@ -174,6 +181,7 @@ GhosttySurface::GhosttySurface(ghostty_app_t app, MainWindow *owner,
     sc.platform.opengl.release_current = glReleaseCurrent;
     sc.platform.opengl.present = glPresent;
   }
+#endif
   sc.userdata = this;
   sc.scale_factor = devicePixelRatioF();
 
@@ -234,10 +242,12 @@ void GhosttySurface::syncSurfaceSize() {
   // shave a pixel off the framebuffer relative to the QImage blit.
   const int w = std::max(1, static_cast<int>(std::lround(width() * dpr)));
   const int h = std::max(1, static_cast<int>(std::lround(height() * dpr)));
-  if (w == m_fbw && h == m_fbh && dpr == m_fbDpr) return;
+  if (w == m_fbw && h == m_fbh &&
+      dpr == m_fbDpr.load(std::memory_order_relaxed))
+    return;
   m_fbw = w;
   m_fbh = h;
-  m_fbDpr = dpr;
+  m_fbDpr.store(dpr, std::memory_order_release);
 
   // Vulkan path: libghostty manages the target image itself (it
   // allocates the dmabuf-exportable VkImage). Tell it the new
@@ -405,7 +415,19 @@ bool GhosttySurface::event(QEvent *e) {
         static_cast<QPlatformSurfaceEvent *>(e)->surfaceEventType();
     if (type == QPlatformSurfaceEvent::SurfaceAboutToBeDestroyed) {
       m_useSubsurface.store(false, std::memory_order_release);
-      m_eglTarget.reset();
+      // EglDmabufTarget's destructor deletes a GL framebuffer +
+      // texture allocated against `m_context`; without that
+      // context current its `QOpenGLContext::currentContext()`
+      // check sees the wrong (or no) context and silently skips
+      // the gl* calls, leaking the resources every time Qt
+      // re-creates the QPA window (QSplitter reparent, fullscreen
+      // toggle, screen change). Make the owning context current
+      // before tearing down. Vulkan-variant builds have no
+      // `m_context` and skip the makeCurrent.
+      if (m_eglTarget) {
+        if (m_context) makeCurrent();
+        m_eglTarget.reset();
+      }
       m_subsurfacePresenter.reset();
     }
     // SurfaceCreated is handled implicitly: the next QEvent::Show
@@ -623,7 +645,7 @@ void GhosttySurface::renderTerminal() {
   // (Scaling it to the widget instead made the whole frame — images
   // included — rubber-band while a resize was in flight.)
   m_image = m_fbo->toImage();
-  m_image.setDevicePixelRatio(m_fbDpr);
+  m_image.setDevicePixelRatio(m_fbDpr.load(std::memory_order_acquire));
   m_fbo->release();
 
   update();
@@ -1537,10 +1559,11 @@ QVariant GhosttySurface::inputMethodQuery(Qt::InputMethodQuery query) const {
           ghostty_surface_cursor_position(m_surface);
       // m_fbDpr defaults to 1.0 and only ever takes positive values
       // from syncSurfaceSize, so dividing is always safe.
-      return QRect(static_cast<int>(c.x / m_fbDpr),
-                   static_cast<int>(c.y / m_fbDpr),
-                   std::max(1, static_cast<int>(c.width / m_fbDpr)),
-                   std::max(1, static_cast<int>(c.height / m_fbDpr)));
+      const double dpr = m_fbDpr.load(std::memory_order_acquire);
+      return QRect(static_cast<int>(c.x / dpr),
+                   static_cast<int>(c.y / dpr),
+                   std::max(1, static_cast<int>(c.width / dpr)),
+                   std::max(1, static_cast<int>(c.height / dpr)));
     }
     default:
       return QWidget::inputMethodQuery(query);
@@ -1621,8 +1644,25 @@ void GhosttySurface::presentVulkanDmabuf(
                  image_backed ? 1 : 0, useSubsurface ? "subsurface" : "qimage");
   }
 
-  if (dmabuf_fd < 0 || width == 0 || height == 0 || stride < width * 4)
-    return;
+  // Validate the renderer-supplied dimensions. width / height /
+  // stride are all u32 and the multiplications below would wrap if
+  // they're hostile/buggy:
+  //   - `width * 4` (the minimum acceptable stride) wraps for
+  //     width >= 0x40000000, accepting any stride.
+  //   - `stride * height` (the legacy mmap path's byte count) wraps
+  //     to a small size_t when promoted on platforms where size_t
+  //     is 32-bit, causing an under-mapped buffer that we then
+  //     read past.
+  // Cap on a sane upper bound — 65536×65536 dwarfs any plausible
+  // terminal — and check that stride*height doesn't exceed
+  // SIZE_MAX before promoting.
+  constexpr quint32 MAX_DIM = 65536;
+  if (dmabuf_fd < 0 || width == 0 || height == 0) return;
+  if (width > MAX_DIM || height > MAX_DIM) return;
+  if (stride < static_cast<quint64>(width) * 4) return;
+  // stride*height as 64-bit and check the size_t fit explicitly.
+  const quint64 bytes64 = static_cast<quint64>(stride) * height;
+  if (bytes64 > std::numeric_limits<std::size_t>::max()) return;
 
   // Don't park / dispatch frames while we're hidden — racing the
   // renderer's final post-Hide frame past presenter.hide() is what
@@ -1670,8 +1710,9 @@ void GhosttySurface::presentVulkanDmabuf(
     return;
   }
 
-  // Fallback: mmap + memcpy into a QImage.
-  const size_t bytes = static_cast<size_t>(stride) * height;
+  // Fallback: mmap + memcpy into a QImage. `bytes64` was computed
+  // and bounds-checked above.
+  const size_t bytes = static_cast<size_t>(bytes64);
   void *mapped = ::mmap(nullptr, bytes, PROT_READ, MAP_SHARED, dmabuf_fd, 0);
   if (mapped == MAP_FAILED) {
     std::fprintf(stderr, "[ghastty] mmap of dmabuf fd=%d failed: %s\n",
@@ -1694,7 +1735,8 @@ void GhosttySurface::presentVulkanDmabuf(
   QImage owned = stamped.copy();
   ::munmap(mapped, bytes);
 
-  if (m_fbDpr > 0) owned.setDevicePixelRatio(m_fbDpr);
+  const double dpr_now = m_fbDpr.load(std::memory_order_acquire);
+  if (dpr_now > 0) owned.setDevicePixelRatio(dpr_now);
   {
     QMutexLocker lock(&m_pendingMutex);
     m_pending = std::move(owned);
diff --git a/qt/src/GhosttySurface.h b/qt/src/GhosttySurface.h
index 074d9e319..41738d328 100644
--- a/qt/src/GhosttySurface.h
+++ b/qt/src/GhosttySurface.h
@@ -288,13 +288,14 @@ private:
   bool m_useVulkan = false;
 
   // Cross-thread frame handoff for the Vulkan path. The renderer
-  // thread calls `presentVulkanDmabuf` with a borrowed dmabuf fd; a
-  // 16 ms `QTimer` on the GUI thread drains the pending frame and
-  // routes it through the wl_subsurface (zero-copy) when the
-  // SubsurfacePresenter is available, or falls back to the
-  // mmap+memcpy+QImage path otherwise. The polling timer was kept
-  // (rather than QMetaObject::invokeMethod) because queued lambdas
-  // from the renderer thread were unreliable in earlier diagnostics.
+  // thread calls `presentVulkanDmabuf` with a borrowed dmabuf fd
+  // and posts a queued `drainVulkan` invocation; the GUI thread
+  // runs `drainVulkan` and routes the parked descriptor through
+  // either the wl_subsurface presenter (zero-copy) or the
+  // mmap+memcpy+QImage fallback. The dropped-frame counter
+  // (`m_droppedFrames`) surfaces any queue-loss that ever happens
+  // in practice — the earlier safety-net polling timer was
+  // removed once delivery was shown to be reliable.
   //
   // `m_useSubsurface` is set once on the GUI thread when the
   // presenter comes up; the renderer thread reads it acquire-style
@@ -318,7 +319,6 @@ private:
   // null and paintEvent skips its blit.
   QImage m_pending;
   QMutex m_pendingMutex;
-  QTimer *m_vulkanPollTimer = nullptr;
 
   // GL objects for the alpha-premultiply pass.
   QOpenGLShaderProgram *m_premultProg = nullptr;
@@ -326,7 +326,13 @@ private:
 
   int m_fbw = 0;                       // framebuffer size, device pixels
   int m_fbh = 0;
-  double m_fbDpr = 1.0;                // DPR the framebuffer was sized at
+  // DPR the framebuffer was sized at. Atomic because the renderer
+  // thread reads it from `presentVulkanDmabuf` to tag the legacy
+  // QImage path while the GUI thread writes it from
+  // `syncSurfaceSize`. `double` writes aren't guaranteed atomic
+  // across threads on every architecture; std::atomic<double> uses
+  // CAS-loop fallbacks where needed.
+  std::atomic<double> m_fbDpr{1.0};    // DPR the framebuffer was sized at
 
   QLabel *m_exitOverlay = nullptr;     // "process exited" banner; lazily made
   QLabel *m_keySeqOverlay = nullptr;   // pending keybind chord; lazily made
diff --git a/qt/src/main.cpp b/qt/src/main.cpp
index bb943a60e..3a4fa30a3 100644
--- a/qt/src/main.cpp
+++ b/qt/src/main.cpp
@@ -11,7 +11,6 @@
 #include "GlobalShortcuts.h"
 #include "MainWindow.h"
 #include "ghostty.h"
-#include "vulkan/Host.h"
 
 // True when any argv entry starts with `+` — i.e. the user invoked a
 // libghostty CLI action (`+show-config`, `+list-fonts`, `+version`, …).
diff --git a/qt/src/wayland/SubsurfacePresenter.cpp b/qt/src/wayland/SubsurfacePresenter.cpp
index 99bd5b010..f0b42655a 100644
--- a/qt/src/wayland/SubsurfacePresenter.cpp
+++ b/qt/src/wayland/SubsurfacePresenter.cpp
@@ -49,9 +49,13 @@ void dmabufFormat(void *, zwp_linux_dmabuf_v1 *, uint32_t /*format*/) {}
 
 // `modifier` event: compositor advertises one (format, modifier) it
 // can scan out. Fires once per pair during the bind roundtrip; we
-// stash them all in the per-format vector. Duplicate-keyed inserts
-// are theoretically possible across compositor restarts but won't
-// happen within a single bind round, so we don't dedupe.
+// stash them all in the per-format vector. Only fires from inside
+// `discoverGlobals` because we keep the dmabuf proxy on a private
+// queue that's never dispatched after discovery — see the queue-
+// retention comment in `discoverGlobals`. That guarantee is what
+// lets the renderer thread read `globals.modifiers` without a
+// lock, and is also why we don't bother deduping (one bind round
+// only fires each pair once).
 void dmabufModifier(void *data, zwp_linux_dmabuf_v1 *, uint32_t format,
                     uint32_t modifier_hi, uint32_t modifier_lo) {
   auto *g = static_cast<PresenterGlobals *>(data);
@@ -140,21 +144,32 @@ PresenterGlobals *discoverGlobals(wl_display *display) {
   // Move the bound proxies back to the default queue so Qt's main
   // dispatch drives subsequent events on them, then drop the private
   // queue. (Same lifecycle dance as `blurManager`.)
+  //
+  // EXCEPT the dmabuf proxy: its listener mutates `globals.modifiers`
+  // on every `modifier` event, and the renderer thread reads that
+  // map from `supportedDmabufModifiers` without locking. If we
+  // moved the proxy back to the default queue, a compositor
+  // restart / hot-plug fires more `modifier` events that would
+  // race the reader. Keep the proxy on `queue` and intentionally
+  // never dispatch that queue again — the events queue up
+  // harmlessly and are reaped at proxy destruction. The map is
+  // genuinely frozen post-discovery now.
   if (globals.compositor)
     wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(globals.compositor),
                        nullptr);
   if (globals.subcompositor)
     wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(globals.subcompositor),
                        nullptr);
-  if (globals.dmabuf)
-    wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(globals.dmabuf), nullptr);
   if (globals.viewporter)
     wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(globals.viewporter),
                        nullptr);
   if (globals.fractionalScale)
     wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(globals.fractionalScale),
                        nullptr);
-  wl_event_queue_destroy(queue);
+  // We deliberately leak `queue` (and leave globals.dmabuf attached
+  // to it) for the process lifetime — it has no resources beyond a
+  // small kernel-side buffer and going away would put dmabuf events
+  // back on the default queue.
 
   return &globals;
 }
@@ -403,6 +418,33 @@ void SubsurfacePresenter::presentDmabuf(int fd, uint32_t drm_format,
   if (dest_width <= 0) dest_width = 1;
   if (dest_height <= 0) dest_height = 1;
 
+  // Validate the (format, modifier) pair against the compositor's
+  // advertised list before handing it to `create_immed`. If the
+  // pair isn't on the list, the compositor will reject the
+  // subsequent `create_immed` with `invalid_format` — a FATAL
+  // protocol error that kills the entire wl_display, taking down
+  // every window in the process. Better to drop this single frame
+  // than to take down the app.
+  {
+    const PresenterGlobals &g = globalState();
+    const auto it = g.modifiers.find(drm_format);
+    bool ok = false;
+    if (it != g.modifiers.end()) {
+      for (const uint64_t m : it->second) {
+        if (m == drm_modifier) { ok = true; break; }
+      }
+    }
+    if (!ok) {
+      std::fprintf(stderr,
+                   "[ghastty] SubsurfacePresenter: refusing dmabuf "
+                   "(fourcc=0x%08x mod=0x%llx) — compositor doesn't "
+                   "advertise this (format, modifier) pair\n",
+                   drm_format,
+                   static_cast<unsigned long long>(drm_modifier));
+      return;
+    }
+  }
+
   // Wrap libghostty's borrowed fd in a wl_buffer.
   zwp_linux_buffer_params_v1 *params =
       zwp_linux_dmabuf_v1_create_params(m_dmabuf);
diff --git a/qt/src/wayland/SubsurfacePresenter.h b/qt/src/wayland/SubsurfacePresenter.h
index 493c50d2b..3c1d3a081 100644
--- a/qt/src/wayland/SubsurfacePresenter.h
+++ b/qt/src/wayland/SubsurfacePresenter.h
@@ -109,8 +109,13 @@ public:
   // units of 1/120 (e.g. 144 = 1.2, 180 = 1.5, 240 = 2.0). Returns
   // 120 (= 1.0) until the compositor sends its first
   // wp_fractional_scale_v1.preferred_scale event for our surface.
-  // Renderer / GhosttySurface size their buffers at
-  // `logical * preferredScale120() / 120` device pixels.
+  //
+  // Currently INFORMATIONAL only: GhosttySurface uses Qt's
+  // devicePixelRatioF() for buffer sizing (which Qt derives from
+  // the same protocol on Wayland), so the two values agree at
+  // steady state. Exposed for diagnostics + a future direct-
+  // protocol path that bypasses Qt's DPR cache lag during a
+  // screen-change race.
   uint32_t preferredScale120() const { return m_preferredScale120; }
 
   // Stretch the existing subsurface buffer to a new destination
diff --git a/src/apprt/embedded.zig b/src/apprt/embedded.zig
index 7a850b682..c8702d2b4 100644
--- a/src/apprt/embedded.zig
+++ b/src/apprt/embedded.zig
@@ -547,24 +547,42 @@ pub const Platform = union(PlatformTag) {
 
             .vulkan => vulkan: {
                 const config = c_platform.vulkan;
+                // Collapse the eight per-callback "MustBeSet"
+                // variants into a single `error.MissingVulkanCallback`.
+                // Pre-this, every caller of `Platform.init` had to
+                // handle 8 separate error tags (or `try` swallow
+                // them) — eight names that all mean "the host
+                // didn't fill out one of these fields." Log which
+                // one was null for diagnostics; the error tag
+                // itself stays narrow.
+                const which: ?[]const u8 = blk: {
+                    if (config.get_instance_proc_addr == null) break :blk "get_instance_proc_addr";
+                    if (config.instance == null) break :blk "instance";
+                    if (config.physical_device == null) break :blk "physical_device";
+                    if (config.device == null) break :blk "device";
+                    if (config.queue == null) break :blk "queue";
+                    if (config.queue_family_index == null) break :blk "queue_family_index";
+                    if (config.get_supported_modifiers == null) break :blk "get_supported_modifiers";
+                    if (config.present == null) break :blk "present";
+                    break :blk null;
+                };
+                if (which) |name| {
+                    std.log.scoped(.embedded).err(
+                        "ghostty_platform_vulkan_s.{s} is null",
+                        .{name},
+                    );
+                    break :vulkan error.MissingVulkanCallback;
+                }
                 break :vulkan .{ .vulkan = .{
                     .userdata = config.userdata,
-                    .get_instance_proc_addr = config.get_instance_proc_addr orelse
-                        break :vulkan error.GetInstanceProcAddrMustBeSet,
-                    .instance = config.instance orelse
-                        break :vulkan error.InstanceMustBeSet,
-                    .physical_device = config.physical_device orelse
-                        break :vulkan error.PhysicalDeviceMustBeSet,
-                    .device = config.device orelse
-                        break :vulkan error.DeviceMustBeSet,
-                    .queue = config.queue orelse
-                        break :vulkan error.QueueMustBeSet,
-                    .queue_family_index = config.queue_family_index orelse
-                        break :vulkan error.QueueFamilyIndexMustBeSet,
-                    .get_supported_modifiers = config.get_supported_modifiers orelse
-                        break :vulkan error.GetSupportedModifiersMustBeSet,
-                    .present = config.present orelse
-                        break :vulkan error.PresentMustBeSet,
+                    .get_instance_proc_addr = config.get_instance_proc_addr.?,
+                    .instance = config.instance.?,
+                    .physical_device = config.physical_device.?,
+                    .device = config.device.?,
+                    .queue = config.queue.?,
+                    .queue_family_index = config.queue_family_index.?,
+                    .get_supported_modifiers = config.get_supported_modifiers.?,
+                    .present = config.present.?,
                 } };
             },
         };
diff --git a/src/renderer/Vulkan.zig b/src/renderer/Vulkan.zig
index ec65d51b5..563dbbcc7 100644
--- a/src/renderer/Vulkan.zig
+++ b/src/renderer/Vulkan.zig
@@ -115,8 +115,9 @@ var device: ?Device = null;
 var device_refcount: usize = 0;
 var device_mutex: std.Thread.Mutex = .{};
 
-/// Per-thread pool of `(VkBuffer, VkDeviceMemory)` pairs that get
-/// recycled across frames. Solves two problems together:
+/// Process-wide pool of `(VkBuffer, VkDeviceMemory)` pairs recycled
+/// across frames on the renderer thread. Solves two problems
+/// together:
 ///
 ///   1. Lifetime: `vulkan/buffer.zig`'s `Buffer.deinit` is called
 ///      mid-frame (by `renderer/image.zig:draw`'s `defer buf.deinit()`)
@@ -130,8 +131,22 @@ var device_mutex: std.Thread.Mutex = .{};
 /// Lifecycle: `Buffer.deinit` pushes to `pending`. `Frame.complete`
 /// after `vkWaitForFences` moves `pending` → `ready`. `Buffer.create`
 /// scans `ready` for an entry of matching usage + size and pops it
-/// before allocating new. The pool only grows; entries get destroyed
-/// when the device tears down (`Vulkan.deinit`).
+/// before allocating new.
+///
+/// Process-wide (not threadlocal) and mutex-protected: splits/tabs
+/// run independent renderer threads against the SAME shared
+/// VkDevice, and a per-thread pool would mean each thread leaks
+/// every staging buffer the other threads release. The mutex is
+/// uncontended in the steady state — entries are short-lived and
+/// the pool only grows.
+///
+/// Caller responsibilities:
+///   - Only call `release` from a code path whose VkBuffer reference
+///     is bounded by a fence the renderer thread will eventually
+///     wait on (i.e. the per-frame command buffer).
+///   - For one-shot uploads (e.g. atlas staging) the caller already
+///     does `vkQueueWaitIdle` post-submit; that path uses
+///     `Buffer.destroyImmediate` which bypasses this pool.
 pub const buffer_pool = struct {
     const Entry = struct {
         buffer: vk.VkBuffer,
@@ -140,8 +155,9 @@ pub const buffer_pool = struct {
         capacity: u64,
     };
 
-    threadlocal var pending: std.ArrayList(Entry) = .{};
-    threadlocal var ready: std.ArrayList(Entry) = .{};
+    var mutex: std.Thread.Mutex = .{};
+    var pending: std.ArrayList(Entry) = .{};
+    var ready: std.ArrayList(Entry) = .{};
 
     /// Queue a buffer for recycling. The buffer cannot be reused
     /// until the next fence-wait (handled by `cycle`); it sits in
@@ -154,6 +170,8 @@ pub const buffer_pool = struct {
         capacity: u64,
     ) !void {
         _ = dev;
+        mutex.lock();
+        defer mutex.unlock();
         try pending.append(std.heap.smp_allocator, .{
             .buffer = buffer,
             .memory = memory,
@@ -170,6 +188,8 @@ pub const buffer_pool = struct {
         usage: vk.VkBufferUsageFlags,
         min_capacity: u64,
     ) ?Entry {
+        mutex.lock();
+        defer mutex.unlock();
         var i: usize = 0;
         while (i < ready.items.len) : (i += 1) {
             const e = ready.items[i];
@@ -186,18 +206,19 @@ pub const buffer_pool = struct {
     /// `Frame.complete` after `vkWaitForFences`.
     ///
     /// `dev` is needed only on the OOM fallback path: if `ready`
-    /// can't grow to absorb `pending`, we destroy the pending
-    /// VkBuffers / VkDeviceMemory directly instead of leaking them
-    /// (the alternative would be to leave them in `pending` forever,
-    /// where each successive frame's `cycle` would try the same
-    /// failing append on an ever-growing list — guaranteed VkDevice
-    /// memory exhaustion).
+    /// can't grow to absorb `pending`, we wait the device idle and
+    /// then destroy the pending entries directly so the next frame
+    /// doesn't double up on a pending list that can never drain.
     pub fn cycle(dev: *const Device) void {
+        mutex.lock();
+        defer mutex.unlock();
         ready.appendSlice(std.heap.smp_allocator, pending.items) catch {
-            // Couldn't grow `ready` — destroy the GPU resources now
-            // (the GPU is provably done with them, the fence wait
-            // already returned) so the next frame doesn't double up
-            // on a pending list that can never drain.
+            // Couldn't grow `ready` — destroy the pending GPU
+            // resources directly. Other renderer threads may still
+            // be submitting against the shared queue, so wait the
+            // device idle to make sure no command buffer in flight
+            // anywhere references these handles before we destroy.
+            _ = dev.dispatch.deviceWaitIdle(dev.device);
             for (pending.items) |e| {
                 dev.dispatch.destroyBuffer(dev.device, e.buffer, null);
                 dev.dispatch.freeMemory(dev.device, e.memory, null);
@@ -207,8 +228,10 @@ pub const buffer_pool = struct {
     }
 
     /// Tear down both lists. Call only when the device is idle
-    /// (`vkDeviceWaitIdle` or surface destroy).
+    /// (`vkDeviceWaitIdle` or final surface destroy).
     pub fn drainAll(dev: *const Device) void {
+        mutex.lock();
+        defer mutex.unlock();
         for (pending.items) |e| {
             dev.dispatch.destroyBuffer(dev.device, e.buffer, null);
             dev.dispatch.freeMemory(dev.device, e.memory, null);
@@ -248,6 +271,28 @@ threadlocal var frame_cb: vk.VkCommandBuffer = null;
 /// in `Frame.complete` before handing the target dmabuf to the host.
 threadlocal var frame_fence: vk.VkFence = null;
 
+/// Per-thread descriptor pool used by `RenderPass.step` to allocate
+/// fresh descriptor sets when the same pipeline is bound more than
+/// once in a single pass (vkCmdDraw reads descriptors at submit
+/// time, so re-using the pipeline's static set would silently
+/// corrupt prior draws). Reset at the start of every `beginFrame`
+/// so this frame's allocations don't pile on the previous frame's;
+/// the per-pass usage is bounded by a small constant — see the
+/// `step_pool_*` caps below.
+threadlocal var step_pool: ?DescriptorPool = null;
+
+/// Caps for the per-frame `step_pool`. Sized for the worst pass
+/// shape (kitty image with N placements + the post pipelines): one
+/// set per (image_step × MAX_DESCRIPTOR_SETS) plus a handful of
+/// the renderer's other pipelines stepped once each. 256 is generous
+/// — actual frames stabilize well under that. If a frame ever
+/// exhausts the pool, `RenderPass.step` falls back to the pipeline's
+/// static set with a warning logged.
+const STEP_POOL_MAX_SETS: u32 = 256;
+const STEP_POOL_UNIFORM_BUFFERS: u32 = 256;
+const STEP_POOL_COMBINED_IMAGE_SAMPLERS: u32 = 256;
+const STEP_POOL_STORAGE_BUFFERS: u32 = 256;
+
 // ---- lifecycle ----------------------------------------------------------
 
 pub fn init(alloc: Allocator, opts: rendererpkg.Options) !Vulkan {
@@ -301,8 +346,27 @@ pub fn deinit(self: *Vulkan) void {
     // per surface), so it's always safe to clean them up regardless
     // of other surfaces' state.
     if (device) |*d| {
-        d.waitIdle();
+        // Per-surface teardown only needs THIS surface's submissions
+        // to be done — block on this thread's frame fence (if it
+        // exists) instead of `vkDeviceWaitIdle` on the shared device,
+        // which would stall every other tab/split's in-flight GPU
+        // work just to close one. The final-refcount path below does
+        // the device-wide waitIdle.
         if (frame_fence != null) {
+            const wait_r = d.dispatch.waitForFences(
+                d.device,
+                1,
+                &frame_fence,
+                vk.VK_TRUE,
+                std.math.maxInt(u64),
+            );
+            if (wait_r != vk.VK_SUCCESS) {
+                log.warn(
+                    "Vulkan.deinit: vkWaitForFences returned {}, falling back to device-wide wait",
+                    .{wait_r},
+                );
+                d.waitIdle();
+            }
             d.dispatch.destroyFence(d.device, frame_fence, null);
             frame_fence = null;
         }
@@ -314,13 +378,14 @@ pub fn deinit(self: *Vulkan) void {
             p.deinit();
             frame_pool = null;
         }
+        if (step_pool) |*p| {
+            p.deinit();
+            step_pool = null;
+        }
         // `last_target` is a borrow into this thread's FrameState
         // target slot. The SwapChain teardown destroys the target;
         // we just drop our reference.
         last_target = null;
-        // Recycle this thread's pooled buffers — the waitIdle above
-        // proves no GPU work references them anymore.
-        buffer_pool.drainAll(d);
     }
 
     // Decrement the shared-device refcount; only the last surface
@@ -330,9 +395,17 @@ pub fn deinit(self: *Vulkan) void {
     // renderer thread.
     device_mutex.lock();
     defer device_mutex.unlock();
+    std.debug.assert(device_refcount > 0);
     device_refcount -= 1;
     if (device_refcount == 0) {
-        if (device) |*d| d.deinit();
+        // Last surface: NOW we can safely drain the global buffer
+        // pool and tear the device down. The waitIdle is needed
+        // because non-final deinits skipped it.
+        if (device) |*d| {
+            d.waitIdle();
+            buffer_pool.drainAll(d);
+            d.deinit();
+        }
         device = null;
     }
     self.* = undefined;
@@ -499,16 +572,37 @@ pub fn beginFrame(
         if (dev.dispatch.createFence(dev.device, &fence_info, null, &frame_fence) != vk.VK_SUCCESS)
             return error.VulkanFailed;
     }
+    if (step_pool == null) {
+        step_pool = try DescriptorPool.init(.{
+            .device = dev,
+            .max_sets = STEP_POOL_MAX_SETS,
+            .uniform_buffers = STEP_POOL_UNIFORM_BUFFERS,
+            .combined_image_samplers = STEP_POOL_COMBINED_IMAGE_SAMPLERS,
+            .storage_buffers = STEP_POOL_STORAGE_BUFFERS,
+        });
+    }
 
     _ = self;
-    // Reset the command buffer + fence so this frame starts clean.
+    // Reset the command buffer + fence + step descriptor pool so
+    // this frame starts clean. `vkResetDescriptorPool` returns every
+    // set the previous frame allocated to the pool — much cheaper
+    // than freeing them individually, and removes any chance of
+    // last-frame's set being bound by accident.
     if (dev.dispatch.resetCommandBuffer(frame_cb, 0) != vk.VK_SUCCESS)
         return error.VulkanFailed;
     if (dev.dispatch.resetFences(dev.device, 1, &frame_fence) != vk.VK_SUCCESS)
         return error.VulkanFailed;
+    if (step_pool) |*p| {
+        if (dev.dispatch.resetDescriptorPool(dev.device, p.pool, 0) != vk.VK_SUCCESS)
+            return error.VulkanFailed;
+    }
 
     return try Frame.begin(
-        .{ .cb = frame_cb, .fence = frame_fence },
+        .{
+            .cb = frame_cb,
+            .fence = frame_fence,
+            .step_pool = if (step_pool) |*p| p else null,
+        },
         dev,
         renderer,
         target,
diff --git a/src/renderer/shadertoy.zig b/src/renderer/shadertoy.zig
index 7fe3142f7..f85b98271 100644
--- a/src/renderer/shadertoy.zig
+++ b/src/renderer/shadertoy.zig
@@ -217,17 +217,28 @@ pub fn glslFromShader(
         try writer.writeAll(prefix);
     } else {
         // Find the first newline after `#version ...` and inject the
-        // defines on the following line. We assume the prefix begins
-        // with a `#version` directive on its own line (true today;
-        // the comptime split below would crash loudly otherwise).
-        const first_nl = std.mem.indexOfScalar(u8, prefix, '\n').?;
-        try writer.writeAll(prefix[0 .. first_nl + 1]);
-        for (defines) |def| {
-            try writer.writeAll("#define ");
-            try writer.writeAll(def);
-            try writer.writeAll("\n");
+        // defines on the following line. The prefix is expected to
+        // start with `#version` followed by a newline; if a future
+        // edit ever drops that newline (e.g. a single-line prefix)
+        // we inject the defines BEFORE the prefix so glslang sees
+        // the directives on their own lines and reports a clear
+        // error instead of us crashing on a `null.?` unwrap.
+        if (std.mem.indexOfScalar(u8, prefix, '\n')) |first_nl| {
+            try writer.writeAll(prefix[0 .. first_nl + 1]);
+            for (defines) |def| {
+                try writer.writeAll("#define ");
+                try writer.writeAll(def);
+                try writer.writeAll("\n");
+            }
+            try writer.writeAll(prefix[first_nl + 1 ..]);
+        } else {
+            for (defines) |def| {
+                try writer.writeAll("#define ");
+                try writer.writeAll(def);
+                try writer.writeAll("\n");
+            }
+            try writer.writeAll(prefix);
         }
-        try writer.writeAll(prefix[first_nl + 1 ..]);
     }
     try writer.writeAll("\n\n");
     try writer.writeAll(src);
@@ -506,4 +517,3 @@ test "shadertoy to glsl" {
 
 const test_crt = @embedFile("shaders/test_shadertoy_crt.glsl");
 const test_invalid = @embedFile("shaders/test_shadertoy_invalid.glsl");
-const test_focus = @embedFile("shaders/test_shadertoy_focus.glsl");
diff --git a/src/renderer/vulkan/DescriptorPool.zig b/src/renderer/vulkan/DescriptorPool.zig
index 9248eb2b5..373074ae6 100644
--- a/src/renderer/vulkan/DescriptorPool.zig
+++ b/src/renderer/vulkan/DescriptorPool.zig
@@ -47,6 +47,28 @@ device: *const Device,
 pool: vk.VkDescriptorPool,
 
 pub fn init(opts: Options) Error!Self {
+    // Vulkan spec requires `maxSets > 0` and `poolSizeCount > 0` —
+    // a pool that vends N sets but doesn't admit any descriptor
+    // type would be useless and is rejected by some drivers
+    // (loose drivers accept it and fail at allocation time). Catch
+    // both shapes here so the caller gets a clear error instead of
+    // a downstream allocation failure.
+    if (opts.max_sets == 0) {
+        log.err("DescriptorPool.init: max_sets must be > 0", .{});
+        return error.VulkanFailed;
+    }
+    if (opts.uniform_buffers == 0 and
+        opts.combined_image_samplers == 0 and
+        opts.storage_buffers == 0)
+    {
+        log.err(
+            "DescriptorPool.init: at least one per-type cap must be > 0 " ++
+                "(uniform_buffers, combined_image_samplers, storage_buffers)",
+            .{},
+        );
+        return error.VulkanFailed;
+    }
+
     // Build a small VkDescriptorPoolSize array from whichever caps
     // are non-zero. Vulkan accepts an array; we cap at 3 entries
     // matching the three types `Options` exposes.
@@ -78,11 +100,12 @@ pub fn init(opts: Options) Error!Self {
         .sType = vk.VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
         .pNext = null,
         // No FREE_DESCRIPTOR_SET_BIT — we tear down by destroying
-        // the pool, which matches the per-frame reset pattern.
+        // the pool (or `vkResetDescriptorPool` for the per-frame
+        // step pool).
         .flags = 0,
         .maxSets = opts.max_sets,
         .poolSizeCount = n,
-        .pPoolSizes = if (n > 0) &sizes else null,
+        .pPoolSizes = &sizes,
     };
     var pool: vk.VkDescriptorPool = undefined;
     const r = opts.device.dispatch.createDescriptorPool(
diff --git a/src/renderer/vulkan/Device.zig b/src/renderer/vulkan/Device.zig
index ec6bd524e..1801dbb06 100644
--- a/src/renderer/vulkan/Device.zig
+++ b/src/renderer/vulkan/Device.zig
@@ -195,6 +195,7 @@ pub const Dispatch = struct {
     // the actual renderer integration lands.
     createDescriptorPool: std.meta.Child(vk.PFN_vkCreateDescriptorPool),
     destroyDescriptorPool: std.meta.Child(vk.PFN_vkDestroyDescriptorPool),
+    resetDescriptorPool: std.meta.Child(vk.PFN_vkResetDescriptorPool),
     allocateDescriptorSets: std.meta.Child(vk.PFN_vkAllocateDescriptorSets),
     updateDescriptorSets: std.meta.Child(vk.PFN_vkUpdateDescriptorSets),
     cmdBindDescriptorSets: std.meta.Child(vk.PFN_vkCmdBindDescriptorSets),
@@ -218,6 +219,12 @@ queue_family_index: u32,
 /// `error.UnsupportedVulkanVersion`).
 api_version: u32,
 
+/// Cached `VkPhysicalDeviceMemoryProperties`. The properties are
+/// immutable for the physical device's lifetime, so we query once
+/// at `init` time instead of on every `findMemoryType` call (which
+/// happens for every Buffer/Texture/Target allocation).
+memory_properties: vk.VkPhysicalDeviceMemoryProperties,
+
 dispatch: Dispatch,
 
 /// Process-wide mutex protecting access to `queue`. Vulkan requires
@@ -351,10 +358,41 @@ pub fn init(
 
     // ---- 4. extension check --------------------------------------
     var ext_count: u32 = 0;
-    _ = enumerate_device_extension_properties(physical_device, null, &ext_count, null);
+    {
+        const r = enumerate_device_extension_properties(physical_device, null, &ext_count, null);
+        // SUCCESS or INCOMPLETE both populate `ext_count`. INCOMPLETE
+        // shouldn't happen on the count-only call (no buffer to
+        // truncate) but we accept it defensively.
+        if (r != vk.VK_SUCCESS and r != vk.VK_INCOMPLETE) {
+            log.err("vkEnumerateDeviceExtensionProperties (count) failed: result={}", .{r});
+            return error.HostHandleMissing;
+        }
+    }
     const exts = try alloc.alloc(vk.VkExtensionProperties, ext_count);
     defer alloc.free(exts);
-    _ = enumerate_device_extension_properties(physical_device, null, &ext_count, exts.ptr);
+    {
+        const r = enumerate_device_extension_properties(physical_device, null, &ext_count, exts.ptr);
+        if (r != vk.VK_SUCCESS and r != vk.VK_INCOMPLETE) {
+            log.err("vkEnumerateDeviceExtensionProperties (fill) failed: result={}", .{r});
+            return error.HostHandleMissing;
+        }
+        // VK_INCOMPLETE here means the extension list grew between
+        // the count and fill calls (race with a driver hot-reload —
+        // very unlikely in practice but spec-permitted). The
+        // partially-filled buffer is still authoritative for the
+        // entries it does contain, but a required extension not yet
+        // populated would be missed. Treat as a hard fail since the
+        // extension presence check below would silently pass on a
+        // truncated list.
+        if (r == vk.VK_INCOMPLETE) {
+            log.err(
+                "vkEnumerateDeviceExtensionProperties returned INCOMPLETE; " ++
+                    "device extension list changed between count and fill",
+                .{},
+            );
+            return error.HostHandleMissing;
+        }
+    }
 
     inline for (REQUIRED_DEVICE_EXTENSIONS) |required| {
         var found = false;
@@ -501,6 +539,8 @@ pub fn init(
         try dl.load(vk.PFN_vkCreateDescriptorPool, "vkCreateDescriptorPool");
     const destroy_descriptor_pool =
         try dl.load(vk.PFN_vkDestroyDescriptorPool, "vkDestroyDescriptorPool");
+    const reset_descriptor_pool =
+        try dl.load(vk.PFN_vkResetDescriptorPool, "vkResetDescriptorPool");
     const allocate_descriptor_sets =
         try dl.load(vk.PFN_vkAllocateDescriptorSets, "vkAllocateDescriptorSets");
     const update_descriptor_sets =
@@ -508,6 +548,12 @@ pub fn init(
     const cmd_bind_descriptor_sets =
         try dl.load(vk.PFN_vkCmdBindDescriptorSets, "vkCmdBindDescriptorSets");
 
+    // Snapshot the memory properties once. They never change for
+    // the device's lifetime, so per-allocation re-queries (which
+    // findMemoryType used to do) were pure waste.
+    var memory_properties: vk.VkPhysicalDeviceMemoryProperties = undefined;
+    get_physical_device_memory_properties(physical_device, &memory_properties);
+
     return .{
         .platform = platform,
         .instance = instance,
@@ -516,6 +562,7 @@ pub fn init(
         .queue = queue,
         .queue_family_index = queue_family_index,
         .api_version = props.apiVersion,
+        .memory_properties = memory_properties,
         .dispatch = .{
             .getPhysicalDeviceProperties = get_physical_device_properties,
             .getPhysicalDeviceMemoryProperties = get_physical_device_memory_properties,
@@ -579,6 +626,7 @@ pub fn init(
             .cmdCopyImageToBuffer = cmd_copy_image_to_buffer,
             .createDescriptorPool = create_descriptor_pool,
             .destroyDescriptorPool = destroy_descriptor_pool,
+            .resetDescriptorPool = reset_descriptor_pool,
             .allocateDescriptorSets = allocate_descriptor_sets,
             .updateDescriptorSets = update_descriptor_sets,
             .cmdBindDescriptorSets = cmd_bind_descriptor_sets,
@@ -593,9 +641,16 @@ pub fn deinit(self: *Device) void {
 }
 
 /// Block until the device is idle. Useful before tearing down
-/// renderer resources to make sure no command buffers are in flight.
+/// renderer resources to make sure no command buffers are in
+/// flight. On `VK_ERROR_DEVICE_LOST` (or any other failure) we
+/// log the result so callers proceeding to destroy resources on
+/// a dead device leave a diagnostic crumb instead of silently
+/// crashing on the subsequent vkDestroy*.
 pub fn waitIdle(self: *const Device) void {
-    _ = self.dispatch.deviceWaitIdle(self.device);
+    const r = self.dispatch.deviceWaitIdle(self.device);
+    if (r != vk.VK_SUCCESS) {
+        log.warn("vkDeviceWaitIdle returned {}; teardown proceeding anyway", .{r});
+    }
 }
 
 /// Find a `VkMemoryType` index satisfying the requirements from a
@@ -609,8 +664,7 @@ pub fn findMemoryType(
     type_bits: u32,
     required_props: vk.VkMemoryPropertyFlags,
 ) ?u32 {
-    var props: vk.VkPhysicalDeviceMemoryProperties = undefined;
-    self.dispatch.getPhysicalDeviceMemoryProperties(self.physical_device, &props);
+    const props = &self.memory_properties;
     var i: u32 = 0;
     while (i < props.memoryTypeCount) : (i += 1) {
         const bit: u32 = @as(u32, 1) << @intCast(i);
diff --git a/src/renderer/vulkan/Frame.zig b/src/renderer/vulkan/Frame.zig
index 5c3d04a82..0b9d6faa2 100644
--- a/src/renderer/vulkan/Frame.zig
+++ b/src/renderer/vulkan/Frame.zig
@@ -37,6 +37,7 @@ const vk = @import("vulkan").c;
 
 const Device = @import("Device.zig");
 const Target = @import("Target.zig");
+const DescriptorPool = @import("DescriptorPool.zig");
 const RenderPass = @import("RenderPass.zig");
 
 const Vulkan = @import("../Vulkan.zig");
@@ -53,6 +54,15 @@ pub const Options = struct {
     /// Fence that gets signaled when the submit completes. Caller
     /// resets it to unsignaled before `begin` is called.
     fence: vk.VkFence,
+
+    /// Per-frame descriptor pool. `RenderPass.step` borrows it for
+    /// the per-call descriptor sets it allocates whenever a
+    /// pipeline is re-used within a single pass. The pool is
+    /// caller-owned (top-level `Vulkan.zig` keeps it threadlocal)
+    /// and must be reset (`vkResetDescriptorPool`) by the caller
+    /// before each Frame.begin so this frame's allocations don't
+    /// pile on the previous frame's.
+    step_pool: ?*DescriptorPool = null,
 };
 
 pub const Error = error{
@@ -67,6 +77,7 @@ renderer: *Renderer,
 target: *Target,
 cb: vk.VkCommandBuffer,
 fence: vk.VkFence,
+step_pool: ?*DescriptorPool = null,
 
 /// Begin recording a frame. The command buffer is reset and started
 /// with `ONE_TIME_SUBMIT` since we always submit before the next
@@ -95,6 +106,7 @@ pub fn begin(
         .target = target,
         .cb = opts.cb,
         .fence = opts.fence,
+        .step_pool = opts.step_pool,
     };
 }
 
@@ -112,75 +124,102 @@ pub fn complete(self: *const Self, sync: bool) void {
     _ = sync;
     const dev = self.device;
 
+    // `health` becomes `.unhealthy` on any GPU-side error below. We
+    // ALWAYS run `buffer_pool.cycle` and `frameCompleted` on the
+    // way out — skipping them on error left every retired buffer
+    // stuck in `pending` (unbounded growth) and held the renderer's
+    // swap-chain semaphore forever, so the NEXT `drawFrame` would
+    // hang with no diagnostic.
+    var health: Health = .healthy;
+    var submitted = false;
+
     // Make the rendered pixels visible to the host's mmap read. In
     // `.direct` mode this is just a memory barrier; in `.legacy_copy`
     // mode it also runs `vkCmdCopyImageToBuffer`. See `Target.zig`.
     self.target.recordPresentBarrier(self.cb);
 
-    {
+    end_cb: {
         const r = dev.dispatch.endCommandBuffer(self.cb);
         if (r != vk.VK_SUCCESS) {
             log.err("vkEndCommandBuffer (frame) failed: result={}", .{r});
-            return;
+            health = .unhealthy;
+            break :end_cb;
         }
-    }
 
-    const submit_info: vk.VkSubmitInfo = .{
-        .sType = vk.VK_STRUCTURE_TYPE_SUBMIT_INFO,
-        .pNext = null,
-        .waitSemaphoreCount = 0,
-        .pWaitSemaphores = null,
-        .pWaitDstStageMask = null,
-        .commandBufferCount = 1,
-        .pCommandBuffers = &self.cb,
-        .signalSemaphoreCount = 0,
-        .pSignalSemaphores = null,
-    };
-    {
+        const submit_info: vk.VkSubmitInfo = .{
+            .sType = vk.VK_STRUCTURE_TYPE_SUBMIT_INFO,
+            .pNext = null,
+            .waitSemaphoreCount = 0,
+            .pWaitSemaphores = null,
+            .pWaitDstStageMask = null,
+            .commandBufferCount = 1,
+            .pCommandBuffers = &self.cb,
+            .signalSemaphoreCount = 0,
+            .pSignalSemaphores = null,
+        };
         // Externally-synchronized via `Device.queueSubmit` — splits
         // and tabs share the host's VkQueue and Vulkan rejects
         // concurrent unsynchronized access.
-        const r = dev.queueSubmit(1, &submit_info, self.fence);
-        if (r != vk.VK_SUCCESS) {
-            log.err("vkQueueSubmit (frame) failed: result={}", .{r});
-            return;
+        const sr = dev.queueSubmit(1, &submit_info, self.fence);
+        if (sr != vk.VK_SUCCESS) {
+            log.err("vkQueueSubmit (frame) failed: result={}", .{sr});
+            health = .unhealthy;
+            break :end_cb;
         }
-    }
+        submitted = true;
 
-    // Wait for the GPU to finish writing the target before letting
-    // the host import the dmabuf. UINT64_MAX = "wait indefinitely".
-    {
-        const r = dev.dispatch.waitForFences(
+        // Wait for the GPU to finish writing the target before letting
+        // the host import the dmabuf. UINT64_MAX = "wait indefinitely".
+        const wr = dev.dispatch.waitForFences(
             dev.device,
             1,
             &self.fence,
             vk.VK_TRUE,
             std.math.maxInt(u64),
         );
-        if (r != vk.VK_SUCCESS) {
-            log.err("vkWaitForFences (frame) failed: result={}", .{r});
+        if (wr != vk.VK_SUCCESS) {
+            log.err("vkWaitForFences (frame) failed: result={}", .{wr});
+            health = .unhealthy;
         }
     }
 
-    // Recycle the per-frame Buffer pool now that the fence has
-    // signaled — every VkBuffer queued during this frame's
-    // recording is provably no longer in use by the GPU and is
-    // safe to hand to the next `Buffer.create` call. See
-    // `Vulkan.buffer_pool` for the lifecycle.
-    Vulkan.buffer_pool.cycle(dev);
+    // Recycle the per-frame Buffer pool. Even on the error path we
+    // still want to cycle: buffers that the failed submit referenced
+    // are now stuck (we can't prove the GPU is done with them), so
+    // we conservatively wait the device idle on the unhealthy path
+    // before draining. Without this, every failed submit leaks
+    // every buffer the renderer queued for that frame.
+    if (health == .unhealthy and !submitted) {
+        // Submit never happened — nothing in flight references
+        // recorded buffers, safe to cycle directly.
+        Vulkan.buffer_pool.cycle(dev);
+    } else if (health == .unhealthy) {
+        // Submit happened but fence wait failed (DEVICE_LOST etc.).
+        // Drain the device before recycling to avoid use-after-free
+        // on whatever queue is still ticking.
+        _ = dev.dispatch.deviceWaitIdle(dev.device);
+        Vulkan.buffer_pool.cycle(dev);
+    } else {
+        Vulkan.buffer_pool.cycle(dev);
+    }
 
-    // Hand the rendered target off to the host via `Vulkan.present`,
-    // which both calls the platform's present callback AND records
-    // the target pointer for `presentLastTarget` no-op republishes.
-    self.renderer.api.present(self.target) catch |err| {
-        log.err("present failed: {}", .{err});
-    };
+    // Hand the rendered target off to the host. On the unhealthy
+    // path we skip present — the dmabuf may be partially written
+    // and the host should see the previous frame instead (the
+    // generic renderer's no-op-frame logic re-presents
+    // `last_target`).
+    if (health == .healthy) {
+        self.renderer.api.present(self.target) catch |err| {
+            log.err("present failed: {}", .{err});
+            health = .unhealthy;
+        };
+    }
 
     // Tell the generic renderer the frame is done so it releases the
     // swap-chain semaphore. Without this, `SwapChain.nextFrame()`
     // blocks the second call to `drawFrame` forever (one buffer in
-    // the chain, never freed).
-    self.renderer.frameCompleted(.healthy);
+    // the chain, never freed). MUST run regardless of `health`.
+    self.renderer.frameCompleted(health);
 }
 
 /// Begin a render pass recording into this frame's command buffer.
@@ -193,6 +232,7 @@ pub inline fn renderPass(
     return RenderPass.begin(.{
         .device = self.device,
         .cb = self.cb,
+        .step_pool = self.step_pool,
         .attachments = attachments,
     });
 }
diff --git a/src/renderer/vulkan/Pipeline.zig b/src/renderer/vulkan/Pipeline.zig
index ec556ff95..324b3fdfd 100644
--- a/src/renderer/vulkan/Pipeline.zig
+++ b/src/renderer/vulkan/Pipeline.zig
@@ -136,9 +136,27 @@ layout: vk.VkPipelineLayout,
 /// `RenderPass.step` skips updating/binding it). `set_count` is one
 /// past the last non-null index, matching what
 /// `vkCmdBindDescriptorSets` needs as `setCount`.
+///
+/// HOT-PATH NOTE: these sets are SHARED across all `step()` calls
+/// that bind this pipeline within a single command buffer, but
+/// `vkCmdDraw` reads descriptors at submit time, so re-using the
+/// same pipeline twice with different per-call resources would
+/// cause both draws to see the LAST update's bindings.
+/// `RenderPass.step` defends against this by allocating a fresh
+/// per-call set from the pass's `step_pool` whenever the per-step
+/// resources differ; these `descriptor_sets[i]` slots act as
+/// pre-warmed defaults (used only when the call site is
+/// single-step-per-pipeline like bg_color / cell_bg).
 descriptor_sets: [MAX_DESCRIPTOR_SETS]vk.VkDescriptorSet = .{ null, null, null },
 set_count: u32 = 0,
 
+/// Descriptor set layouts associated with this pipeline, indexed by
+/// set number. `null` matches a `null` slot in `descriptor_sets`.
+/// Stored so `RenderPass.step` can allocate per-call sets from the
+/// pass's per-frame descriptor pool without round-tripping through
+/// the original `Shaders.init` layout-creation code path.
+descriptor_set_layouts: [MAX_DESCRIPTOR_SETS]vk.VkDescriptorSetLayout = .{ null, null, null },
+
 /// Binding number that `Step.uniforms` writes to within set 0.
 /// Defaults to 1 to match `common.glsl`'s
 /// `layout(binding = 1, std140) uniform Globals`. Override per
@@ -395,14 +413,20 @@ pub fn init(opts: Options) Error!Self {
             return error.VulkanFailed;
         }
     }
+    errdefer dev.dispatch.destroyPipeline(dev.device, pipeline, null);
 
     // Allocate one descriptor set per non-null entry in
-    // `opts.descriptor_set_layouts`. Null entries are placeholder
+    // `opts.descriptor_set_layouts`. Null entries are placeholders
     // (the shader's set=i isn't actually used) — nothing to allocate.
+    // Also remember the layouts on `Self` so `RenderPass.step` can
+    // allocate fresh per-call sets from a per-frame pool without
+    // re-creating layouts.
     var dsets: [MAX_DESCRIPTOR_SETS]vk.VkDescriptorSet = .{ null, null, null };
+    var dsls: [MAX_DESCRIPTOR_SETS]vk.VkDescriptorSetLayout = .{ null, null, null };
     if (opts.descriptor_pool) |pool_ptr| {
         for (opts.descriptor_set_layouts, 0..) |maybe_dsl, i| {
             if (maybe_dsl) |dsl| {
+                dsls[i] = dsl;
                 dsets[i] = pool_ptr.allocate(dsl) catch |err| {
                     log.err(
                         "Pipeline.init: descriptor set {} allocation failed: {}",
@@ -412,6 +436,10 @@ pub fn init(opts: Options) Error!Self {
                 };
             }
         }
+    } else {
+        for (opts.descriptor_set_layouts, 0..) |maybe_dsl, i| {
+            if (maybe_dsl) |dsl| dsls[i] = dsl;
+        }
     }
 
     return .{
@@ -419,6 +447,7 @@ pub fn init(opts: Options) Error!Self {
         .pipeline = pipeline,
         .layout = layout,
         .descriptor_sets = dsets,
+        .descriptor_set_layouts = dsls,
         .set_count = @intCast(opts.descriptor_set_layouts.len),
         .sampler = opts.sampler,
         .vertex_stride = if (opts.vertex_input) |vi| vi.stride else 0,
diff --git a/src/renderer/vulkan/RenderPass.zig b/src/renderer/vulkan/RenderPass.zig
index f679d2f14..b3db3ce7e 100644
--- a/src/renderer/vulkan/RenderPass.zig
+++ b/src/renderer/vulkan/RenderPass.zig
@@ -18,6 +18,7 @@ const Self = @This();
 const std = @import("std");
 const vk = @import("vulkan").c;
 
+const DescriptorPool = @import("DescriptorPool.zig");
 const Device = @import("Device.zig");
 const Pipeline = @import("Pipeline.zig");
 const Sampler = @import("Sampler.zig");
@@ -57,6 +58,16 @@ pub const Options = struct {
     /// by the enclosing `Frame`.
     cb: vk.VkCommandBuffer,
 
+    /// Per-frame descriptor pool. Used by `step` to allocate fresh
+    /// descriptor sets on the SECOND and later step() calls that
+    /// bind the same pipeline within this pass — without it,
+    /// mutating the pipeline's static `descriptor_sets[i]` for the
+    /// second call would overwrite the first call's bindings before
+    /// the GPU has read them (vkCmdDraw reads at submit time).
+    /// Optional: passes that never re-use a pipeline (bg_color,
+    /// cell_bg, cell_text) work without it.
+    step_pool: ?*DescriptorPool = null,
+
     /// Color attachments for the pass. With dynamic rendering each
     /// attachment is a render target + optional clear color.
     attachments: []const Attachment,
@@ -114,8 +125,21 @@ pub const Error = error{
 attachments: []const Options.Attachment,
 cb: vk.VkCommandBuffer,
 device: *const Device,
+step_pool: ?*DescriptorPool = null,
 step_number: usize = 0,
 
+/// VkPipeline handles already used by an earlier `step` in this
+/// pass. On second-and-later use of the same pipeline we allocate
+/// a fresh per-call descriptor set from `step_pool` instead of
+/// mutating `pipeline.descriptor_sets[i]` (vkCmdDraw reads at
+/// submit time, so re-updating the same set in place would
+/// overwrite the prior call's bindings before the GPU has read
+/// them). Capacity covers our worst case: per-pass image draws
+/// can fire dozens of pipeline reuses. The slice is empty when no
+/// step_pool was provided.
+seen_pipelines: [MAX_SEEN_PIPELINES]vk.VkPipeline = .{null} ** MAX_SEEN_PIPELINES,
+seen_pipelines_len: usize = 0,
+
 /// Last `Step.uniforms` value seen in this pass. The OpenGL backend
 /// keeps the bound UBO across draw calls implicitly (GL state
 /// persists), and the renderer's image/overlay draw calls in
@@ -126,6 +150,13 @@ step_number: usize = 0,
 /// to null at `begin`.
 last_uniforms: ?vk.VkBuffer = null,
 
+/// Cap on the number of distinct pipelines we'll track per pass
+/// for "first-use vs re-use" detection. The renderer's pass shape
+/// is: bg_color (1), cell_bg (1), cell_text (1), bg_image (1),
+/// image (varies). 8 is generous; we degrade gracefully to "always
+/// allocate fresh" past this cap.
+const MAX_SEEN_PIPELINES: usize = 8;
+
 /// Begin a render pass. Transitions the first attachment to
 /// `COLOR_ATTACHMENT_OPTIMAL` and opens a `vkCmdBeginRendering`
 /// scope with the caller's clear color (defaults to opaque black).
@@ -138,6 +169,7 @@ pub fn begin(opts: Options) Self {
         .attachments = opts.attachments,
         .cb = opts.cb,
         .device = opts.device,
+        .step_pool = opts.step_pool,
     };
 
     if (opts.attachments.len == 0) return self;
@@ -340,6 +372,48 @@ pub fn step(self: *Self, s: Step) void {
         }
     }
 
+    // Pick effective descriptor sets for this step.
+    //
+    // First time we see a given pipeline within this pass, we use
+    // its pre-allocated `descriptor_sets[]` slots and update them
+    // in place — cheap and avoids a per-pass-pool allocation in
+    // the common single-step case (bg_color/cell_bg/cell_text).
+    //
+    // SECOND-and-later use of the same pipeline within the same
+    // pass requires fresh sets: vkCmdDraw reads the descriptor
+    // contents at SUBMIT time, so re-updating the static sets in
+    // place would silently make every prior draw bound to this
+    // pipeline read the LAST update's UBO/sampler/storage. The
+    // image / kitty path issues N draws on the same `image`
+    // pipeline with per-call vertex buffers and textures — without
+    // this fix every kitty image rendered with the FINAL image's
+    // texture and the final draw's vertex buffer.
+    //
+    // The fresh sets come from `step_pool`, owned by the enclosing
+    // Frame and reset at frame start. When `step_pool` is null
+    // (test harnesses, smoke tests) we fall back to the static
+    // sets and accept the limitation.
+    var effective_sets: [Pipeline.MAX_DESCRIPTOR_SETS]vk.VkDescriptorSet =
+        s.pipeline.descriptor_sets;
+    const reused = self.markPipelineUsed(s.pipeline.pipeline);
+    if (reused) if (self.step_pool) |pool| {
+        for (s.pipeline.descriptor_set_layouts, 0..) |maybe_dsl, i| {
+            if (i >= s.pipeline.set_count) break;
+            const dsl = maybe_dsl orelse continue;
+            if (pool.allocate(dsl)) |fresh| {
+                effective_sets[i] = fresh;
+            } else |err| {
+                log.err(
+                    "RenderPass.step: per-call descriptor set " ++
+                        "allocation for set {} failed ({}); falling " ++
+                        "back to the pipeline's static set, which " ++
+                        "may corrupt prior draws on this pipeline",
+                    .{ i, err },
+                );
+            }
+        }
+    };
+
     // ---- update descriptor sets ---------------------------------
     //
     // We do one vkUpdateDescriptorSets call per descriptor write to
@@ -353,7 +427,7 @@ pub fn step(self: *Self, s: Step) void {
     // supply one. Track the new one for later steps.
     const ubo: ?vk.VkBuffer = s.uniforms orelse self.last_uniforms;
     if (s.uniforms) |b| self.last_uniforms = b;
-    if (s.pipeline.descriptor_sets[0] != null) if (ubo) |ubo_buffer| {
+    if (effective_sets[0] != null) if (ubo) |ubo_buffer| {
         const buffer_info: vk.VkDescriptorBufferInfo = .{
             .buffer = ubo_buffer,
             .offset = 0,
@@ -362,7 +436,7 @@ pub fn step(self: *Self, s: Step) void {
         const write: vk.VkWriteDescriptorSet = .{
             .sType = vk.VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
             .pNext = null,
-            .dstSet = s.pipeline.descriptor_sets[0],
+            .dstSet = effective_sets[0],
             .dstBinding = s.pipeline.uniforms_binding,
             .dstArrayElement = 0,
             .descriptorCount = 1,
@@ -375,7 +449,7 @@ pub fn step(self: *Self, s: Step) void {
     };
 
     // Samplers (set 1)
-    if (s.pipeline.descriptor_sets[1] != null) {
+    if (effective_sets[1] != null) {
         const slot_count = @max(s.textures.len, s.samplers.len);
         for (0..slot_count) |slot| {
             const tex_opt: ?Texture = if (slot < s.textures.len) s.textures[slot] else null;
@@ -396,7 +470,7 @@ pub fn step(self: *Self, s: Step) void {
             const write: vk.VkWriteDescriptorSet = .{
                 .sType = vk.VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
                 .pNext = null,
-                .dstSet = s.pipeline.descriptor_sets[1],
+                .dstSet = effective_sets[1],
                 .dstBinding = @intCast(slot),
                 .dstArrayElement = 0,
                 .descriptorCount = 1,
@@ -411,7 +485,7 @@ pub fn step(self: *Self, s: Step) void {
 
     // Storage buffers (set 2). `buffers[0]` is reserved for the
     // vertex buffer (handled above), so storage starts at slot 1.
-    if (s.pipeline.descriptor_sets[2] != null and s.buffers.len > 1) {
+    if (effective_sets[2] != null and s.buffers.len > 1) {
         for (s.buffers[1..], 1..) |maybe_buf, slot| {
             const buf = maybe_buf orelse continue;
             const buffer_info: vk.VkDescriptorBufferInfo = .{
@@ -422,7 +496,7 @@ pub fn step(self: *Self, s: Step) void {
             const write: vk.VkWriteDescriptorSet = .{
                 .sType = vk.VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
                 .pNext = null,
-                .dstSet = s.pipeline.descriptor_sets[2],
+                .dstSet = effective_sets[2],
                 .dstBinding = @intCast(slot),
                 .dstArrayElement = 0,
                 .descriptorCount = 1,
@@ -443,19 +517,19 @@ pub fn step(self: *Self, s: Step) void {
     // contiguous run of non-null sets.
     var start: usize = 0;
     while (start < s.pipeline.set_count) {
-        if (s.pipeline.descriptor_sets[start] == null) {
+        if (effective_sets[start] == null) {
             start += 1;
             continue;
         }
         var end = start + 1;
-        while (end < s.pipeline.set_count and s.pipeline.descriptor_sets[end] != null) : (end += 1) {}
+        while (end < s.pipeline.set_count and effective_sets[end] != null) : (end += 1) {}
         dev.dispatch.cmdBindDescriptorSets(
             self.cb,
             vk.VK_PIPELINE_BIND_POINT_GRAPHICS,
             s.pipeline.layout,
             @intCast(start),
             @intCast(end - start),
-            &s.pipeline.descriptor_sets[start],
+            &effective_sets[start],
             0,
             null,
         );
@@ -477,6 +551,26 @@ pub fn step(self: *Self, s: Step) void {
     self.step_number += 1;
 }
 
+/// Mark `pipeline` as used in this pass and report whether it was
+/// already seen. Returns `false` on the FIRST call (so `step` can
+/// safely update the pipeline's static descriptor sets in place);
+/// `true` on every subsequent call (so `step` allocates fresh sets
+/// from `step_pool` to avoid clobbering the prior call's bindings).
+///
+/// Beyond `MAX_SEEN_PIPELINES` we conservatively report `true` so
+/// callers always allocate fresh — the alternative (silently
+/// reverting to in-place updates) is the bug this whole mechanism
+/// exists to prevent.
+fn markPipelineUsed(self: *Self, pipeline: vk.VkPipeline) bool {
+    for (self.seen_pipelines[0..self.seen_pipelines_len]) |seen| {
+        if (seen == pipeline) return true;
+    }
+    if (self.seen_pipelines_len >= MAX_SEEN_PIPELINES) return true;
+    self.seen_pipelines[self.seen_pipelines_len] = pipeline;
+    self.seen_pipelines_len += 1;
+    return false;
+}
+
 /// Close the rendering scope and leave the attachment in a layout
 /// the host can read back via the dmabuf export. `GENERAL` is the
 /// safest choice for unknown consumer access patterns; the host
diff --git a/src/renderer/vulkan/Target.zig b/src/renderer/vulkan/Target.zig
index 513674a54..0a554a6b3 100644
--- a/src/renderer/vulkan/Target.zig
+++ b/src/renderer/vulkan/Target.zig
@@ -198,12 +198,17 @@ fn pickModifier(
     // work for AMD/Intel LINEAR but the compositor attach would
     // fail, so treat it as "no intersection."
     var host_mods: [MAX_MODIFIERS]u64 = undefined;
-    const host_count = dev.platform.get_supported_modifiers(
+    const host_returned = dev.platform.get_supported_modifiers(
         dev.platform.userdata,
         drm_format,
         &host_mods,
         MAX_MODIFIERS,
     );
+    // Clamp defensively. The C ABI contract is "host returns ≤ capacity",
+    // but we don't get to assume the host's implementation is correct
+    // — and in safe builds an OOB read on `host_mods[..host_returned]`
+    // panics, hiding the real diagnostic.
+    const host_count: usize = @min(host_returned, MAX_MODIFIERS);
     if (host_count == 0) {
         log.warn(
             "host advertises no dmabuf modifiers for format 0x{x}; " ++
@@ -465,7 +470,22 @@ fn initDirect(opts: Options, drm_format: u32, chosen_mod: u64) Error!Self {
         .fd = fd,
         .drm_format = drm_format,
         .drm_modifier = actual_mod,
-        .stride = @intCast(layout.rowPitch),
+        .stride = stride: {
+            // VkSubresourceLayout.rowPitch is u64 but the platform
+            // present callback accepts u32 stride. For a sanely-
+            // sized terminal target stride fits comfortably in u32,
+            // but vendor-tiled drivers at exotic resolutions could
+            // legitimately exceed it. Fail the init explicitly
+            // instead of letting `@intCast` panic in safe builds.
+            if (layout.rowPitch > std.math.maxInt(u32)) {
+                log.err(
+                    "Target.initDirect: rowPitch {} > u32 max; refusing direct mode",
+                    .{layout.rowPitch},
+                );
+                return error.UnsupportedFormat;
+            }
+            break :stride @intCast(layout.rowPitch);
+        },
     };
 }
 
diff --git a/src/renderer/vulkan/Texture.zig b/src/renderer/vulkan/Texture.zig
index 9d34506ce..366e1a963 100644
--- a/src/renderer/vulkan/Texture.zig
+++ b/src/renderer/vulkan/Texture.zig
@@ -74,6 +74,12 @@ image: vk.VkImage,
 memory: vk.VkDeviceMemory,
 view: vk.VkImageView,
 format: vk.VkFormat,
+/// Aspect mask the image was created with (e.g. COLOR_BIT for
+/// renderable textures, DEPTH_BIT for depth attachments). Stored
+/// so per-frame `replaceRegion` barrier/copy use the same aspect
+/// the image view was made with — hardcoding COLOR_BIT here was a
+/// silent validation error for any non-color caller.
+aspect: vk.VkImageAspectFlags,
 width: usize,
 height: usize,
 device: *const Device,
@@ -207,6 +213,7 @@ pub fn init(
         .memory = memory,
         .view = view,
         .format = opts.format,
+        .aspect = opts.aspect,
         .width = width,
         .height = height,
         .device = dev,
@@ -249,7 +256,15 @@ pub fn replaceRegion(
         .device = dev,
         .usage = vk.VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
     }, data);
-    defer staging.deinit();
+    // `destroyImmediate` instead of `deinit`: replaceRegion runs
+    // synchronously on the calling thread (typically the main /
+    // app-init thread, NOT the renderer thread), and
+    // `OneShot.endAndSubmit` below calls `vkQueueWaitIdle` so the
+    // staging buffer is provably done with the GPU before this
+    // defer fires. Routing it into `Vulkan.buffer_pool` from a
+    // non-renderer thread would leak it forever — the pool's
+    // `cycle()` runs only on the renderer thread.
+    defer staging.destroyImmediate();
 
     // ---- command pool (one-shot) --------------------------------
     var pool = try CommandPool.init(dev);
@@ -279,7 +294,7 @@ pub fn replaceRegion(
             .dstQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED,
             .image = self.image,
             .subresourceRange = .{
-                .aspectMask = vk.VK_IMAGE_ASPECT_COLOR_BIT,
+                .aspectMask = self.aspect,
                 .baseMipLevel = 0,
                 .levelCount = 1,
                 .baseArrayLayer = 0,
@@ -304,7 +319,7 @@ pub fn replaceRegion(
             .bufferRowLength = 0, // tightly packed
             .bufferImageHeight = 0,
             .imageSubresource = .{
-                .aspectMask = vk.VK_IMAGE_ASPECT_COLOR_BIT,
+                .aspectMask = self.aspect,
                 .mipLevel = 0,
                 .baseArrayLayer = 0,
                 .layerCount = 1,
@@ -343,7 +358,7 @@ pub fn replaceRegion(
             .dstQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED,
             .image = self.image,
             .subresourceRange = .{
-                .aspectMask = vk.VK_IMAGE_ASPECT_COLOR_BIT,
+                .aspectMask = self.aspect,
                 .baseMipLevel = 0,
                 .levelCount = 1,
                 .baseArrayLayer = 0,
diff --git a/src/renderer/vulkan/buffer.zig b/src/renderer/vulkan/buffer.zig
index 388717441..cd73eccce 100644
--- a/src/renderer/vulkan/buffer.zig
+++ b/src/renderer/vulkan/buffer.zig
@@ -83,20 +83,26 @@ pub fn Buffer(comptime T: type) type {
             return self;
         }
 
+        /// Hand the (VkBuffer, VkDeviceMemory) pair back to the
+        /// process-wide pool. The pool (see `Vulkan.buffer_pool`)
+        /// holds the entry until the current frame's fence has
+        /// signaled (the GPU is done with our recorded references)
+        /// and then makes it available to a future `Buffer.create`
+        /// call. Returning to the pool solves both:
+        ///   - `renderer/image.zig:draw`'s `defer buf.deinit()` no
+        ///     longer use-after-frees the in-flight buffer.
+        ///   - It avoids the per-frame allocation thrash that
+        ///     drove the driver to SIGSEGV on image-heavy frames.
+        ///
+        /// MUST be called only from the renderer thread (the path
+        /// whose fence will eventually retire references to this
+        /// buffer in `Frame.complete`). One-shot uploads (atlas
+        /// staging buffers, etc.) that already block on
+        /// `vkQueueWaitIdle` post-submit must use
+        /// `destroyImmediate` instead — they don't share the
+        /// renderer thread's fence cycle.
         pub fn deinit(self: Self) void {
             const dev = self.opts.device;
-            // Hand the (VkBuffer, VkDeviceMemory) pair back to the
-            // process-wide pool instead of destroying it. The pool
-            // (see `Vulkan.buffer_pool`) holds the entry until the
-            // current frame's fence has signaled (the GPU is done
-            // with our recorded references) and then makes it
-            // available to a future `Buffer.create` call. Returning
-            // to the pool solves BOTH:
-            //   - `renderer/image.zig:draw`'s `defer buf.deinit()`
-            //     no longer use-after-frees the in-flight buffer.
-            //   - It avoids the per-frame allocation thrash that
-            //     drove the driver to SIGSEGV on image-heavy
-            //     frames.
             const bp = @import("../Vulkan.zig").buffer_pool;
             const capacity_bytes: u64 = @as(u64, self.len) * @sizeOf(T);
             bp.release(
@@ -106,16 +112,35 @@ pub fn Buffer(comptime T: type) type {
                 self.opts.usage,
                 capacity_bytes,
             ) catch {
-                // OOM growing the pool — fall back to immediate
-                // destroy. Logging here is awkward (no logger in
-                // scope) so we accept the loud failure and let
-                // Vulkan stderr diagnose any use-after-free that
-                // follows.
+                // OOM growing the pool. The buffer may still be
+                // referenced by an in-flight command buffer, so we
+                // wait the entire device idle before destroying —
+                // expensive but correct. Logging here is awkward (no
+                // logger in scope) so we accept the loud failure and
+                // let Vulkan stderr diagnose anything that follows.
+                _ = dev.dispatch.deviceWaitIdle(dev.device);
                 dev.dispatch.destroyBuffer(dev.device, self.buffer, null);
                 dev.dispatch.freeMemory(dev.device, self.memory, null);
             };
         }
 
+        /// Destroy the buffer immediately, bypassing the recycle
+        /// pool. The caller MUST ensure no in-flight command buffer
+        /// references this buffer (e.g. by having waited on a fence
+        /// or `vkQueueWaitIdle` covering its submission).
+        ///
+        /// Used by short-lived staging buffers like
+        /// `Texture.replaceRegion` whose lifetime is bounded by a
+        /// `OneShot.endAndSubmit` that already drains the queue;
+        /// stuffing those into the pool from a non-renderer thread
+        /// would leak them (the renderer thread's `cycle` runs the
+        /// pool, so an upload thread's pushes never get reused).
+        pub fn destroyImmediate(self: Self) void {
+            const dev = self.opts.device;
+            dev.dispatch.destroyBuffer(dev.device, self.buffer, null);
+            dev.dispatch.freeMemory(dev.device, self.memory, null);
+        }
+
         /// Replace the buffer's contents. Grows (doubles) if needed —
         /// matches the OpenGL backend's behavior. Data shorter than
         /// the current capacity leaves the trailing slots untouched.
@@ -235,14 +260,30 @@ pub fn Buffer(comptime T: type) type {
             };
         }
 
-        /// Grow the buffer to hold at least `new_len` Ts. Destroys
-        /// and recreates the underlying VkBuffer (Vulkan buffers are
-        /// immutable in size). Contents are discarded — callers
+        /// Grow the buffer to hold at least `new_len` Ts. Vulkan
+        /// buffers are immutable in size, so we route the old
+        /// buffer through the recycle pool (it may still be
+        /// referenced by the in-flight command buffer — destroying
+        /// it directly would race the GPU same as `deinit` would)
+        /// and create a fresh one. Contents are discarded; callers
         /// always `sync` immediately after `grow` returns.
         fn grow(self: *Self, new_len: usize) Error!void {
             const dev = self.opts.device;
-            dev.dispatch.destroyBuffer(dev.device, self.buffer, null);
-            dev.dispatch.freeMemory(dev.device, self.memory, null);
+            const bp = @import("../Vulkan.zig").buffer_pool;
+            const capacity_bytes: u64 = @as(u64, self.len) * @sizeOf(T);
+            bp.release(
+                dev,
+                self.buffer,
+                self.memory,
+                self.opts.usage,
+                capacity_bytes,
+            ) catch {
+                // OOM appending to the pool — wait the device idle
+                // and destroy directly. Same fallback as `deinit`.
+                _ = dev.dispatch.deviceWaitIdle(dev.device);
+                dev.dispatch.destroyBuffer(dev.device, self.buffer, null);
+                dev.dispatch.freeMemory(dev.device, self.memory, null);
+            };
             const replacement = try create(self.opts, new_len);
             self.* = replacement;
         }
diff --git a/src/renderer/vulkan/shaders.zig b/src/renderer/vulkan/shaders.zig
index 1927fe3fe..767e11d5d 100644
--- a/src/renderer/vulkan/shaders.zig
+++ b/src/renderer/vulkan/shaders.zig
@@ -67,9 +67,20 @@ fn processIncludes(comptime contents: [:0]const u8) [:0]const u8 {
     var i: usize = 0;
     while (i < contents.len) {
         if (std.mem.startsWith(u8, contents[i..], "#include")) {
-            std.debug.assert(std.mem.startsWith(u8, contents[i..], "#include \""));
-            const start = i + "#include \"".len;
-            const end = std.mem.indexOfScalarPos(u8, contents, start, '"').?;
+            // Skip whitespace (space or tab) between `#include` and
+            // the opening quote. The previous literal-prefix
+            // `startsWith("#include \"")` assert tripped on legal
+            // `#include\t"…"` and `#include  "…"` variants. Accept
+            // any horizontal whitespace and require exactly one
+            // double-quoted path.
+            var p = i + "#include".len;
+            while (p < contents.len and (contents[p] == ' ' or contents[p] == '\t')) : (p += 1) {}
+            if (p >= contents.len or contents[p] != '"') {
+                @compileError("processIncludes: malformed #include directive in shader");
+            }
+            const start = p + 1;
+            const end = std.mem.indexOfScalarPos(u8, contents, start, '"') orelse
+                @compileError("processIncludes: unterminated #include path");
             return std.fmt.comptimePrint("{s}{s}{s}", .{
                 contents[0..i],
                 @embedFile("../shaders/glsl/" ++ contents[start..end]),
@@ -178,6 +189,12 @@ pub fn vulkanizeGlsl(
 
         var i: usize = 0;
         while (i < src.len) {
+            // Skip comments + string literals verbatim — anything
+            // that looks like an identifier inside one of those is
+            // not a real token, and rewriting it (e.g. a comment
+            // that mentions `gl_VertexID` or `texture(atlas_*, ...)`)
+            // would silently corrupt the shader source.
+            if (try copySkippable(alloc, &out, src, &i)) continue;
             const c = src[i];
             const is_ident_start = isIdentChar(c);
             if (is_ident_start) {
@@ -245,6 +262,11 @@ pub fn vulkanizeGlsl(
 
     var i: usize = 0;
     while (i < pass1.len) {
+        // Skip comments + string literals verbatim, same reason as
+        // pass 1 — rewriting `layout(binding=…)` text inside a
+        // comment would inject a `set =` qualifier into a comment
+        // that's never compiled, harmless today but a footgun.
+        if (try copySkippable(alloc, &out, pass1, &i)) continue;
         if (matchKeyword(pass1, i, "layout")) |layout_end| {
             // Skip whitespace between `layout` and `(`.
             var p = layout_end;
@@ -307,6 +329,52 @@ pub fn vulkanizeGlsl(
     return try out.toOwnedSliceSentinel(alloc, 0);
 }
 
+/// If position `i` in `src` is the start of a GLSL line comment
+/// (`//...\n`), block comment (`/* ... */`), or `"..."` string
+/// literal, copy the whole token verbatim into `out`, advance
+/// `*i` past it, and return true. Otherwise `*i` is unchanged
+/// and we return false.
+///
+/// Strings are unusual in GLSL but `#extension` directives can
+/// quote in some preprocessor flavors, and the safe thing is to
+/// leave any quoted run untouched.
+fn copySkippable(
+    alloc: std.mem.Allocator,
+    out: *std.ArrayList(u8),
+    src: []const u8,
+    i: *usize,
+) std.mem.Allocator.Error!bool {
+    const start = i.*;
+    if (start >= src.len) return false;
+    if (start + 1 < src.len and src[start] == '/' and src[start + 1] == '/') {
+        var p = start;
+        while (p < src.len and src[p] != '\n') : (p += 1) {}
+        try out.appendSlice(alloc, src[start..p]);
+        i.* = p;
+        return true;
+    }
+    if (start + 1 < src.len and src[start] == '/' and src[start + 1] == '*') {
+        var p = start + 2;
+        while (p + 1 < src.len and !(src[p] == '*' and src[p + 1] == '/')) : (p += 1) {}
+        // Include the closing `*/` if found; otherwise consume to EOF.
+        const end = if (p + 1 < src.len) p + 2 else src.len;
+        try out.appendSlice(alloc, src[start..end]);
+        i.* = end;
+        return true;
+    }
+    if (src[start] == '"') {
+        var p = start + 1;
+        while (p < src.len and src[p] != '"') : (p += 1) {
+            if (src[p] == '\\' and p + 1 < src.len) p += 1;
+        }
+        const end = if (p < src.len) p + 1 else src.len;
+        try out.appendSlice(alloc, src[start..end]);
+        i.* = end;
+        return true;
+    }
+    return false;
+}
+
 fn isIdentChar(c: u8) bool {
     return (c >= 'a' and c <= 'z') or
         (c >= 'A' and c <= 'Z') or
@@ -314,15 +382,6 @@ fn isIdentChar(c: u8) bool {
         c == '_';
 }
 
-/// True if the first non-space, non-comment character at or after
-/// position `i` in `src` is `(`. Used to recognize a function call
-/// when the caller is positioned right after the identifier name.
-fn nextNonSpaceIsOpenParen(src: []const u8, i: usize) bool {
-    var p = i;
-    while (p < src.len and isAnySpace(src[p])) : (p += 1) {}
-    return p < src.len and src[p] == '(';
-}
-
 /// Names of samplers we create with `unnormalized_coordinates =
 /// VK_TRUE`. The shaders here all use only the two atlas samplers
 /// for cell_text; if more get added (or renamed) update this list.
@@ -457,9 +516,11 @@ pub const Module = struct {
             return error.GlslangFailed;
         };
 
-        const translated = vulkanizeGlsl(alloc, src) catch {
-            return error.GlslangFailed;
-        };
+        // vulkanizeGlsl returns `Allocator.Error` only — surface it
+        // via `try` (Module.Error includes Allocator.Error) so OOM
+        // doesn't get reported as `error.GlslangFailed`. Conflating
+        // them masked the actual failure mode in earlier diagnostics.
+        const translated = try vulkanizeGlsl(alloc, src);
         defer alloc.free(translated);
 
         const c = glslang.c;
@@ -671,14 +732,19 @@ const empty_pipeline: Pipeline = .{
 /// `Shaders.deinit` walks the same set in reverse to destroy
 /// pipelines, layouts, samplers, the descriptor pool, and modules.
 pub const Shaders = struct {
+    /// Borrowed pointer to the host-owned VkDevice wrapper. Stored
+    /// so `deinit` can reach the device dispatch table without
+    /// reaching into an arbitrary module's `.device` field (which
+    /// would silently break if `Modules` is restructured). The
+    /// pointer outlives `Shaders` because the device is process-
+    /// global in `Vulkan.zig`.
+    device: *const Device,
     pipelines: PipelineCollection,
     /// One per user-supplied custom shader. Built by `Shaders.init`
     /// from the `post_shaders` arg — empty when no custom shaders.
-    /// Owned by `Shaders` (deinit destroys each).
+    /// Owned by `Shaders` (deinit destroys each + frees the slice
+    /// using the allocator passed to `deinit`).
     post_pipelines: []Pipeline,
-    /// Allocator used to allocate `post_pipelines`; held so deinit
-    /// can free the slice.
-    post_alloc: ?Allocator = null,
     /// Compiled `VkShaderModule`s for each user shader, parallel to
     /// `post_pipelines`. Owned by `Shaders` (deinit destroys each).
     post_modules: []Module = &.{},
@@ -1188,12 +1254,17 @@ pub const Shaders = struct {
             post_modules = try alloc.alloc(Module, post_shaders.len);
             errdefer alloc.free(post_modules);
 
-            // Init counter so partial failures can deinit only what
-            // was built.
-            var built: usize = 0;
+            // Init counters so partial failures deinit exactly what
+            // was built. We track modules and pipelines separately
+            // because the inner loop creates a module first, then
+            // tries to build a pipeline against it — if Pipeline.init
+            // fails after Module.initFromSpirv succeeded, the module
+            // is populated but the pipeline isn't.
+            var modules_built: usize = 0;
+            var pipelines_built: usize = 0;
             errdefer {
-                for (post_pipelines[0..built]) |p| p.deinit();
-                for (post_modules[0..built]) |m| m.deinit();
+                for (post_pipelines[0..pipelines_built]) |p| p.deinit();
+                for (post_modules[0..modules_built]) |m| m.deinit();
             }
 
             // Shared descriptor set layouts across post pipelines.
@@ -1224,6 +1295,7 @@ pub const Shaders = struct {
                 }
                 const spv_words: []const u32 = std.mem.bytesAsSlice(u32, @as([]align(@alignOf(u32)) const u8, @alignCast(spv_bytes)));
                 post_modules[i] = try Module.initFromSpirv(device, spv_words, .fragment);
+                modules_built = i + 1;
                 post_pipelines[i] = try Pipeline.init(.{
                     .device = device,
                     .descriptor_pool = &pool,
@@ -1236,14 +1308,14 @@ pub const Shaders = struct {
                     .blending_enabled = false,
                     .topology = vk.VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
                 });
-                built = i + 1;
+                pipelines_built = i + 1;
             }
         }
 
         return .{
+            .device = device,
             .pipelines = pipelines,
             .post_pipelines = post_pipelines,
-            .post_alloc = if (post_shaders.len > 0) alloc else null,
             .post_modules = post_modules,
             .modules = modules,
             .descriptor_pool = pool,
@@ -1292,7 +1364,6 @@ pub const Shaders = struct {
     }
 
     pub fn deinit(self: *Shaders, alloc: Allocator) void {
-        _ = alloc;
         if (self.defunct) return;
         self.defunct = true;
 
@@ -1314,10 +1385,12 @@ pub const Shaders = struct {
         // pipeline first (holds VkPipelineLayout), then shader module.
         for (self.post_pipelines) |p| p.deinit();
         for (self.post_modules) |m| m.deinit();
-        if (self.post_alloc) |a| {
-            a.free(self.post_pipelines);
-            a.free(self.post_modules);
-        }
+        // The slices were allocated from the same allocator the
+        // caller hands to deinit (the renderer's `self.alloc`).
+        // Use it directly — the previous `post_alloc` field was
+        // an extra source of truth for the same value.
+        if (self.post_pipelines.len > 0) alloc.free(self.post_pipelines);
+        if (self.post_modules.len > 0) alloc.free(self.post_modules);
 
         // Atlas sampler held by `Shaders` for the cell_text pipeline's
         // texture bindings.
@@ -1331,7 +1404,7 @@ pub const Shaders = struct {
 
         // Destroy every descriptor set layout we created. The empty
         // placeholder is one of the entries.
-        const dev = self.modules.full_screen_vert.device;
+        const dev = self.device;
         for (self.set_layouts[0..self.set_layouts_len]) |dsl| {
             if (dsl != null) dev.dispatch.destroyDescriptorSetLayout(
                 dev.device,