Merge pull request #17 from fuddlesworth/qt-vulkan-renderer

Vulkan renderer + Qt apprt with Wayland subsurface dmabuf presenter
2026-05-26 18:26:04 -05:00 · 2026-05-26 18:26:04 -05:00 · 4d01762849
parent 1d5651062e 5a24a90f4e
commit 4d01762849
64 changed files with 13378 additions and 145 deletions
--- a/build.zig.zon
+++ b/build.zig.zon
@ -74,6 +74,7 @@
        .macos = .{ .path = "./pkg/macos", .lazy = true },
        .oniguruma = .{ .path = "./pkg/oniguruma", .lazy = true },
        .opengl = .{ .path = "./pkg/opengl", .lazy = true },
+        .vulkan = .{ .path = "./pkg/vulkan", .lazy = true },
        .sentry = .{ .path = "./pkg/sentry", .lazy = true },
        .simdutf = .{ .path = "./pkg/simdutf", .lazy = true },
        .wuffs = .{ .path = "./pkg/wuffs", .lazy = true },
--- a/include/ghostty.h
+++ b/include/ghostty.h
@ -67,6 +67,11 @@ typedef enum {
  GHOSTTY_PLATFORM_MACOS,
  GHOSTTY_PLATFORM_IOS,
  GHOSTTY_PLATFORM_OPENGL,
+  // Vulkan: fork-only platform tag. The host owns the
+  // VkInstance/Device/Queue and hands them to libghostty via
+  // `ghostty_platform_vulkan_s`. Frames come back to the host as
+  // dmabuf fds for zero-copy compositing.
+  GHOSTTY_PLATFORM_VULKAN,
 } ghostty_platform_e;

 typedef enum {
@ -481,10 +486,87 @@ typedef struct {
  void (*present)(void* userdata);
 } ghostty_platform_opengl_s;

+// Vulkan host integration (fork-only). The host owns the
+// VkInstance / VkPhysicalDevice / VkDevice / VkQueue (same ownership
+// model as the OpenGL host); libghostty creates pipelines, command
+// pools, and images against that device. Frames are handed back to the
+// host as dmabuf file descriptors so a compositor-side toolkit (e.g.
+// Qt RHI via QRhiTexture) can sample them without a CPU readback.
+//
+// Handles are typed as void* here so consumers don't need the Vulkan
+// headers to compile the public C API; callers should treat them as
+// VkInstance, VkPhysicalDevice, VkDevice, VkQueue respectively.
+typedef struct {
+  // Userdata passed as the first argument to every callback below.
+  void* userdata;
+
+  // Return the address of vkGetInstanceProcAddr (as void*). libghostty
+  // uses this as the loader entry point for every other Vulkan
+  // function it needs.
+  //
+  // NOTE(review): the callback also receives a `name`, which reads
+  // like a vkGetInstanceProcAddr-style lookup of `name` rather than
+  // "return the loader itself". Confirm which of the two contracts a
+  // host is expected to implement.
+  void* (*get_instance_proc_addr)(void* userdata, const char* name);
+
+  // Host-owned Vulkan handles. libghostty does not destroy these; they
+  // remain owned by the host for the surface's lifetime.
+  void* (*instance)(void* userdata);          // VkInstance
+  void* (*physical_device)(void* userdata);   // VkPhysicalDevice
+  void* (*device)(void* userdata);            // VkDevice
+  void* (*queue)(void* userdata);             // VkQueue
+  uint32_t (*queue_family_index)(void* userdata);
+
+  // Compositor-supported DRM modifiers for a given DRM_FORMAT_*
+  // fourcc, as advertised by linux-dmabuf-v1's `modifier` events.
+  // libghostty intersects this with what its physical device
+  // supports for COLOR_ATTACHMENT to pick a tiling that the
+  // compositor will actually accept on attach. Without this
+  // intersection, drivers that don't expose COLOR_ATTACHMENT for
+  // the LINEAR modifier (NVIDIA) can't use the direct-export path
+  // and fall back to a CPU-readback path.
+  //
+  // Two-pass usage: call with `out=NULL, capacity=0` to query the
+  // total count; allocate; call again to fill. Returns the number
+  // of modifiers actually written (capped at `capacity`). May
+  // return 0 if the format isn't compositor-supported or the host
+  // doesn't speak linux-dmabuf-v1.
+  //
+  // NOTE(review): the two sentences above disagree for the query
+  // pass: a return value capped at `capacity` would be 0 when
+  // `capacity=0`, so it could not report the total count. Presumably
+  // the query pass returns the total available; confirm against the
+  // host implementation and the libghostty caller.
+  size_t (*get_supported_modifiers)(
+      void* userdata,
+      uint32_t drm_format,
+      uint64_t* out,
+      size_t capacity);
+
+  // Hand off a rendered frame to the host as a dmabuf fd. The host
+  // imports it (e.g. into Qt's RHI as a QRhiTexture, or attaches to
+  // a wl_subsurface via linux-dmabuf-v1) and composites.
+  //
+  // `image_backed` is true when the dmabuf was exported from a
+  // VkImage allocated with VK_EXT_image_drm_format_modifier — i.e.
+  // it's directly importable as a 2D image by the compositor or any
+  // GPU-side consumer. false when it was exported from a VkBuffer
+  // (the legacy NVIDIA fallback path where the driver doesn't
+  // advertise COLOR_ATTACHMENT for the LINEAR modifier on
+  // exportable images, so libghostty renders into an OPTIMAL image
+  // and copies the bytes into a linear VkBuffer for export). In the
+  // !image_backed case the fd is only usable via mmap + CPU
+  // readback — attempting a linux-dmabuf-v1 import will trigger an
+  // `invalid_wl_buffer` protocol error.
+  //
+  // libghostty retains ownership of the underlying VkDeviceMemory;
+  // the host must dup() the fd if it needs to hold it past the call.
+  //
+  // `drm_format` / `drm_modifier` describe the buffer layout using
+  // the same DRM_FORMAT_* fourcc and modifier values as
+  // `get_supported_modifiers`. `width` / `height` are presumably in
+  // pixels and `stride` presumably in bytes per row — TODO confirm
+  // against the renderer's call site.
+  void (*present)(
+      void* userdata,
+      int dmabuf_fd,
+      uint32_t drm_format,
+      uint64_t drm_modifier,
+      uint32_t width,
+      uint32_t height,
+      uint32_t stride,
+      bool image_backed);
+} ghostty_platform_vulkan_s;
+
 typedef union {
  ghostty_platform_macos_s macos;
  ghostty_platform_ios_s ios;
  ghostty_platform_opengl_s opengl;
+  ghostty_platform_vulkan_s vulkan;
 } ghostty_platform_u;

 typedef enum {
--- a/pkg/glslang/build.zig
+++ b/pkg/glslang/build.zig
@ -165,5 +165,20 @@ fn buildGlslang(
        );
    }

+    // Ghastty Vulkan-friendly compile shim. Wraps glslang's C++ API
+    // to expose features (auto-map bindings/locations, source/target
+    // environment translation) that the upstream C API doesn't, so
+    // the renderer can compile OpenGL-flavored GLSL — including
+    // user-supplied custom shaders — to Vulkan-targeted SPIR-V.
+    lib.addCSourceFiles(.{
+        .root = b.path("override"),
+        .flags = flags.items,
+        .files = &.{"ghastty_vk_shim.cpp"},
+    });
+    lib.installHeader(
+        b.path("override/ghastty_vk_shim.h"),
+        "ghastty_vk_shim.h",
+    );
+
    return lib;
 }
--- a/pkg/glslang/c.zig
+++ b/pkg/glslang/c.zig
@ -1,4 +1,10 @@
 pub const c = @cImport({
    @cInclude("glslang/Include/glslang_c_interface.h");
    @cInclude("glslang/Public/resource_limits_c.h");
+    // Ghastty-specific extension to glslang's C ABI: a Vulkan-
+    // friendly compile entry point that wraps the C++ TShader API
+    // (setAutoMapBindings / setAutoMapLocations / setEnvInput) the
+    // upstream C interface doesn't expose. See
+    // `pkg/glslang/override/ghastty_vk_shim.h`.
+    @cInclude("ghastty_vk_shim.h");
 });
--- a/pkg/glslang/main.zig
+++ b/pkg/glslang/main.zig
@ -4,6 +4,7 @@ const shader = @import("shader.zig");

 pub const c = @import("c.zig").c;
 pub const testing = @import("test.zig");
+pub const vk = @import("vk.zig");

 pub const init = initpkg.init;
 pub const finalize = initpkg.finalize;
--- a/pkg/glslang/override/ghastty_vk_shim.cpp
+++ b/pkg/glslang/override/ghastty_vk_shim.cpp
@ -0,0 +1,282 @@
+// See `ghastty_vk_shim.h` for the contract.
+
+#include "ghastty_vk_shim.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <glslang/Include/PoolAlloc.h>
+#include <glslang/Public/ShaderLang.h>
+#include <glslang/Public/ResourceLimits.h>
+#include <SPIRV/GlslangToSpv.h>
+
+// glslang's `InitializeProcess` / `FinalizeProcess` must bracket
+// any use of `glslang::TShader` / `glslang::TProgram`. The existing
+// C-API path in `pkg/glslang/init.zig` calls `glslang_initialize_process`
+// at startup, and per the glslang headers the C and C++ inits share
+// state, so we don't initialize again here — calling `InitializeProcess`
+// twice without a matching `FinalizeProcess` leaks reference counts.
+
+namespace {
+
+// Concatenate the info and debug logs of `shader` and then `program`
+// (either may be null) into one string, one log per line. Null or
+// empty logs are skipped.
+std::string drain_logs(glslang::TShader* shader, glslang::TProgram* program) {
+    std::string joined;
+    const auto append_line = [&joined](const char* text) {
+        if (text == nullptr || text[0] == '\0') return;
+        joined += text;
+        joined += "\n";
+    };
+
+    if (shader != nullptr) {
+        append_line(shader->getInfoLog());
+        append_line(shader->getInfoDebugLog());
+    }
+    if (program != nullptr) {
+        append_line(program->getInfoLog());
+        append_line(program->getInfoDebugLog());
+    }
+    return joined;
+}
+
+// Return a malloc-backed, NUL-terminated copy of `s`, or nullptr when
+// the allocation fails. The caller releases it with `free`.
+char* dup_to_c(const std::string& s) {
+    const size_t len = s.size();
+    void* raw = std::malloc(len + 1);
+    if (raw == nullptr) return nullptr;
+
+    char* copy = static_cast<char*>(raw);
+    std::memcpy(copy, s.data(), len);
+    copy[len] = '\0';
+    return copy;
+}
+
+// Process-wide SPIR-V cache keyed by (source, stage). The renderer
+// builds one Vulkan.Shaders per surface (per tab/split), which calls
+// `Module.init` → `compileToSpv` for all 9 built-in shaders + every
+// user custom shader. Each compile pulls memory from glslang's
+// thread-local TPoolAllocator, which is a raw pointer in glslang's
+// TLS that is NEVER released when a renderer thread exits (Zig
+// pthread spawn doesn't run C++ thread_local destructors and there
+// is no FinalizeThread hook). With N tabs, the leaked pool pages
+// add up to tens of MB — observed via heaptrack as the dominant
+// leak source (~17 MB across 15k+ allocations from
+// glslang::TPoolAllocator::allocate).
+//
+// Cache the resulting SPIR-V instead. The built-in shaders produce
+// byte-identical SPV regardless of which surface compiles them; the
+// custom shaders only change when the user edits their config. So
+// after the first surface, every other surface's compile is a
+// cache hit with zero glslang work and zero new pool pages.
+//
+// Key format: source bytes followed by a single byte stage tag
+// (0=vertex, 1=fragment). Disambiguates the rare case where two
+// stages share identical source text.
+// Accessor for the single mutex that guards the SPIR-V cache map.
+// The mutex is a function-local static, so it is constructed on first
+// use.
+std::mutex& spv_cache_mutex() {
+    static std::mutex cache_lock;
+    return cache_lock;
+}
+// Accessor for the cache map itself: key (see `make_cache_key`) to the
+// compiled SPIR-V words. Callers hold `spv_cache_mutex()` while
+// touching it.
+std::unordered_map<std::string, std::vector<uint32_t>>& spv_cache() {
+    static std::unordered_map<std::string, std::vector<uint32_t>> entries;
+    return entries;
+}
+
+// Build the cache key: the source text followed by one byte carrying
+// the stage tag.
+std::string make_cache_key(const char* source, ghastty_glslang_stage_t stage) {
+    std::string key = source;
+    key += static_cast<char>(stage);
+    return key;
+}
+
+} // namespace
+
+// Copy `word_count` 32-bit SPIR-V words starting at `words` into a
+// freshly malloc'd array that the C caller owns and releases with
+// `ghastty_glslang_free_spirv`. Returns nullptr when malloc fails.
+// Shared by the cache-hit and fresh-compile paths so both hand out
+// buffers built the same way.
+static uint32_t* copy_spv_words(const void* words, size_t word_count) {
+    const size_t bytes = word_count * sizeof(uint32_t);
+    uint32_t* out = static_cast<uint32_t*>(std::malloc(bytes));
+    if (out == nullptr) return nullptr;
+    std::memcpy(out, words, bytes);
+    return out;
+}
+
+// Compile `source` (null-terminated OpenGL-flavored GLSL) for `stage`
+// into Vulkan SPIR-V.
+//
+// Returns 0 on success with `*spv_out` / `*spv_len_out` set to a
+// malloc'd word array and its length in words, and `*err_out` null.
+// Returns 1 on failure with `*spv_out` null, `*spv_len_out` 0 and
+// `*err_out` set to a malloc'd message (null if that allocation
+// itself failed, or if an out-pointer was null).
+extern "C" int ghastty_glslang_compile_vulkan(
+    const char* source,
+    ghastty_glslang_stage_t stage,
+    uint32_t** spv_out,
+    size_t* spv_len_out,
+    char** err_out) {
+
+    // Reject any null out-pointer up-front. The in-tree Zig caller
+    // (`pkg/glslang/vk.zig`) always passes valid pointers, but this
+    // is a C ABI export: a consumer that omits an out-arg would
+    // otherwise crash on the stores below before any error could be
+    // reported. There is nowhere to put a message in that case, so
+    // only the return code signals the failure.
+    if (spv_out == nullptr || spv_len_out == nullptr || err_out == nullptr) {
+        return 1;
+    }
+
+    *spv_out = nullptr;
+    *spv_len_out = 0;
+    *err_out = nullptr;
+
+    if (source == nullptr) {
+        *err_out = dup_to_c("source pointer is null");
+        return 1;
+    }
+
+    // Cache hit: copy SPV from the cache and return without ever
+    // touching glslang, so no glslang pool memory is pulled for a
+    // (source, stage) pair that has already been compiled.
+    const std::string key = make_cache_key(source, stage);
+    {
+        std::lock_guard<std::mutex> lg(spv_cache_mutex());
+        auto it = spv_cache().find(key);
+        if (it != spv_cache().end()) {
+            const std::vector<uint32_t>& cached = it->second;
+            uint32_t* out = copy_spv_words(cached.data(), cached.size());
+            if (out == nullptr) {
+                *err_out = dup_to_c(
+                    "malloc failed for cached SPIR-V copy");
+                return 1;
+            }
+            *spv_out = out;
+            *spv_len_out = cached.size();
+            return 0;
+        }
+    }
+
+    EShLanguage lang;
+    switch (stage) {
+        case GHASTTY_GLSLANG_STAGE_VERTEX:   lang = EShLangVertex;   break;
+        case GHASTTY_GLSLANG_STAGE_FRAGMENT: lang = EShLangFragment; break;
+        default:
+            *err_out = dup_to_c("unknown stage");
+            return 1;
+    }
+
+    glslang::TShader shader(lang);
+    const char* sources[1] = { source };
+    shader.setStrings(sources, 1);
+
+    // Source environment is OpenGL GLSL, target environment is Vulkan.
+    // The cross-environment setup is what lets glslang translate
+    // OpenGL-only builtins (`gl_VertexID`, `gl_InstanceID`, etc.) to
+    // their Vulkan equivalents (`gl_VertexIndex`, `gl_InstanceIndex`)
+    // during SPIR-V generation. Matches `glslangValidator -V` and
+    // Qt's `QShaderBaker`.
+    shader.setEnvInput(
+        glslang::EShSourceGlsl,
+        lang,
+        glslang::EShClientVulkan,
+        /*version*/ 100);
+    shader.setEnvClient(
+        glslang::EShClientVulkan,
+        glslang::EShTargetVulkan_1_3);
+    shader.setEnvTarget(
+        glslang::EShTargetSpv,
+        glslang::EShTargetSpv_1_6);
+
+    // Auto-map: assign descriptor bindings and shader I/O locations
+    // for any `layout`-less declarations. Required for OpenGL GLSL
+    // that doesn't bother with explicit locations (which Vulkan SPIR-V
+    // requires).
+    shader.setAutoMapBindings(true);
+    shader.setAutoMapLocations(true);
+
+    const TBuiltInResource* resources = GetDefaultResources();
+    const EShMessages messages = static_cast<EShMessages>(
+        EShMsgDefault | EShMsgSpvRules | EShMsgVulkanRules);
+
+    if (!shader.parse(resources, /*default_version*/ 450,
+                      ECoreProfile, /*force_default*/ false,
+                      /*forward_compatible*/ true, messages)) {
+        *err_out = dup_to_c(drain_logs(&shader, nullptr));
+        return 1;
+    }
+
+    glslang::TProgram program;
+    program.addShader(&shader);
+    if (!program.link(messages)) {
+        *err_out = dup_to_c(drain_logs(&shader, &program));
+        return 1;
+    }
+    // mapIO() is what actually applies the auto-bind / auto-map
+    // resolution to the SPIR-V output. Without it the
+    // `setAutoMap*(true)` calls above are no-ops.
+    if (!program.mapIO()) {
+        std::string s = "glslang TProgram::mapIO() failed:\n";
+        s += drain_logs(&shader, &program);
+        *err_out = dup_to_c(s);
+        return 1;
+    }
+
+    std::vector<unsigned int> spv;
+    glslang::GlslangToSpv(*program.getIntermediate(lang), spv);
+    if (spv.empty()) {
+        *err_out = dup_to_c(
+            "GlslangToSpv produced no SPIR-V output");
+        return 1;
+    }
+
+    uint32_t* out = copy_spv_words(spv.data(), spv.size());
+    if (out == nullptr) {
+        *err_out = dup_to_c("malloc failed for SPIR-V output buffer");
+        return 1;
+    }
+    *spv_out = out;
+    *spv_len_out = spv.size();
+
+    // Populate the cache with the freshly-compiled SPV. Stored by
+    // value (std::move into the map); `out` was copied before the
+    // move, so the caller's malloc'd buffer and the cache entry are
+    // independent. Future calls with this (source, stage) skip
+    // glslang entirely. `emplace` keeps an existing entry, so a
+    // concurrent compile of the same key that got here first wins.
+    {
+        std::lock_guard<std::mutex> lg(spv_cache_mutex());
+        spv_cache().emplace(key, std::move(spv));
+    }
+    return 0;
+}
+
+// Release a SPIR-V buffer handed out by
+// `ghastty_glslang_compile_vulkan`. Those buffers come from
+// `std::malloc`, so this is a plain `std::free`; a null pointer is
+// accepted and ignored.
+extern "C" void ghastty_glslang_free_spirv(uint32_t* spv) {
+    std::free(spv);
+}
+
+// Release an error string handed out by
+// `ghastty_glslang_compile_vulkan`. Those strings come from
+// `std::malloc` (via `dup_to_c`), so this is a plain `std::free`; a
+// null pointer is accepted and ignored.
+extern "C" void ghastty_glslang_free_error(char* err) {
+    std::free(err);
+}
+
+// Tear down the shim's process-wide state: the SPIR-V cache, glslang's
+// shared process state, and the calling thread's TPoolAllocator.
+//
+// Only the first call does any work; later calls return immediately.
+// Without that guard a repeat call would run FinalizeProcess again
+// and `delete` whatever GetThreadPoolAllocator() yields after the
+// thread's pool pointer has already been reset to nullptr.
+extern "C" void ghastty_glslang_finalize_process(void) {
+    // Step 1: drop the cached SPV blobs. The map owns the std::vector
+    // pages it holds; clearing returns them to the heap. Done before
+    // FinalizeProcess so a post-finalize compile attempt (which would
+    // re-enter glslang on a dead process state) trips glslang's own
+    // checks rather than being served a stale cache hit.
+    //
+    // The one-shot flag lives under the same mutex so concurrent
+    // callers agree on who performs the teardown.
+    {
+        std::lock_guard<std::mutex> lg(spv_cache_mutex());
+        static bool finalized = false;
+        if (finalized) return;
+        finalized = true;
+        spv_cache().clear();
+    }
+
+    // Step 2: release glslang's process-wide shared state BEFORE
+    // touching the thread pool. This deletes
+    // SharedSymbolTables[v][s][p][src][stage] entries that hold
+    // pointers INTO the thread pool; we want their dtors to run
+    // while the pool memory is still live.
+    glslang::FinalizeProcess();
+
+    // Step 3: destroy this thread's TPoolAllocator entirely. popAll()
+    // alone is insufficient — it returns pages to glslang's
+    // internal free list but never gives them back to the system
+    // allocator (verified empirically: heaptrack total leaked
+    // unchanged after popAll). The pool is `new`-allocated in
+    // glslang::InitializeThreadPoolAllocator, so `delete` calls
+    // ~TPoolAllocator which `free()`s every page.
+    //
+    // heaptrack pointed the ~12 MB glslang leak at
+    // TPoolAllocator::allocate calls rooted in
+    // shadertoy.spirvFromGlsl on the GUI thread (since
+    // ghostty_surface_new runs glslang synchronously from
+    // MainWindow::newTab) — that pool's pages persist until the
+    // GUI thread exits, but a Qt app's GUI thread only exits at
+    // process termination, after atexit. Manual delete here gives
+    // the pages back before exit.
+    //
+    // Safe at atexit because every renderer thread has joined
+    // via Vulkan.threadExit (their pools are independent
+    // threadlocals already cleaned up), the SPV cache was just
+    // cleared, and FinalizeProcess just ran.
+    //
+    // NOTE(review): this relies on the thread's current pool being a
+    // heap object owned by nobody else — confirm against the vendored
+    // glslang version's PoolAlloc implementation.
+    glslang::TPoolAllocator* pool = &glslang::GetThreadPoolAllocator();
+    glslang::SetThreadPoolAllocator(nullptr);
+    delete pool;
+}
--- a/pkg/glslang/override/ghastty_vk_shim.h
+++ b/pkg/glslang/override/ghastty_vk_shim.h
@ -0,0 +1,81 @@
+// Vulkan-targeted GLSL compilation that exposes glslang's
+// C++-only features (auto-map bindings/locations, source/target
+// environment translation for `gl_VertexID` → `gl_VertexIndex`)
+// through a C-compatible entry point.
+//
+// glslang's public C API (`glslang_c_interface.h`) doesn't expose
+// `setAutoMapBindings` / `setAutoMapLocations` / `setEnvInput` —
+// they only live on the C++ `glslang::TShader` class. The CLI
+// (`glslangValidator -V --auto-map-locations --auto-map-bindings`)
+// and Qt's `QShaderBaker` both call them internally; this shim is
+// the equivalent for libghostty.
+//
+// Used by `src/renderer/vulkan/shaders.zig` for both the renderer's
+// built-in shaders and user-supplied custom shaders. The same
+// function covers both because user-shader compilation happens at
+// runtime against `libghostty.so`, not as a build step.
+
+#ifndef GHASTTY_VK_SHIM_H
+#define GHASTTY_VK_SHIM_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Shader stage selector for `ghastty_glslang_compile_vulkan`. Any
+// other value is rejected by the shim with an "unknown stage" error.
+typedef enum {
+    // Compile the source as a vertex shader.
+    GHASTTY_GLSLANG_STAGE_VERTEX = 0,
+    // Compile the source as a fragment shader.
+    GHASTTY_GLSLANG_STAGE_FRAGMENT = 1,
+} ghastty_glslang_stage_t;
+
+// Compile a null-terminated GLSL source to Vulkan-flavored SPIR-V.
+//
+// Preconditions: `spv_out`, `spv_len_out`, and `err_out` MUST all be
+//   non-null. The function rejects any null out-pointer with rc=1
+//   and no error string (since `err_out` is itself part of the
+//   contract). `source` may be null; that produces a normal failure
+//   with `*err_out` set.
+//
+// On success: returns 0. `*spv_out` points to a freshly allocated
+//   array of `*spv_len_out` 32-bit SPIR-V words. Caller frees it
+//   with `ghastty_glslang_free_spirv`. `*err_out` is NULL.
+//
+// On failure: returns non-zero. `*err_out` points to a freshly
+//   allocated null-terminated error message (or NULL on out-arg
+//   precondition violation OR on internal OOM). Caller frees it
+//   with `ghastty_glslang_free_error`. `*spv_out` is NULL,
+//   `*spv_len_out` is 0.
+int ghastty_glslang_compile_vulkan(
+    const char* source,
+    ghastty_glslang_stage_t stage,
+    uint32_t** spv_out,
+    size_t* spv_len_out,
+    char** err_out);
+
+void ghastty_glslang_free_spirv(uint32_t* spv);
+void ghastty_glslang_free_error(char* err);
+
+// Release the process-wide glslang state: the per-thread
+// TPoolAllocator pages (the high-water-mark pool memory that
+// otherwise leaks for the process lifetime because Zig pthreads
+// don't run C++ thread_local destructors) AND the shim's
+// SPV cache.
+//
+// Idempotent. Call ONCE from the host's shutdown path AFTER all
+// renderer threads have joined — calling it while a renderer
+// thread might still touch glslang::TShader / TProgram is
+// undefined behavior per glslang's contract.
+//
+// libghostty's own renderer-thread teardown (Vulkan.threadExit)
+// is what serializes this safely: by the time the host's main()
+// returns from QApplication::exec(), every renderer thread has
+// already run threadExit and is joined.
+void ghastty_glslang_finalize_process(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GHASTTY_VK_SHIM_H */
--- a/pkg/glslang/vk.zig
+++ b/pkg/glslang/vk.zig
@ -0,0 +1,90 @@
+//! Typed Zig wrapper around the Ghastty Vulkan-friendly glslang
+//! compile shim (`pkg/glslang/override/ghastty_vk_shim.h`). The shim
+//! itself is a small C entry point that wraps glslang's C++-only
+//! `setAutoMapBindings` / `setAutoMapLocations` / `setEnvInput` knobs
+//! the upstream C ABI doesn't expose.
+//!
+//! Callers use this instead of poking `glslang.c.ghastty_*` directly:
+//! the malloc/free dance for the shim's out-pointers is finicky
+//! (separate free entry points for SPIR-V and error strings, both
+//! optional, both have to be dropped on the right path) and was
+//! previously open-coded across two near-identical 25-line blocks
+//! in `src/renderer/vulkan/shaders.zig`. This module is the binding
+//! layer; the renderer just calls `compileToSpv` and gets a Zig
+//! `[]const u32` slice.
+
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+
+const c = @import("c.zig").c;
+
+const log = std.log.scoped(.glslang);
+
+/// Shader stage to compile for. Mirrors the C shim's
+/// `ghastty_glslang_stage_t` values.
+pub const Stage = enum {
+    vertex,
+    fragment,
+
+    /// Translate this stage into the matching `GHASTTY_GLSLANG_STAGE_*`
+    /// constant expected by the C shim. The switch is exhaustive, so
+    /// adding a stage here without a mapping is a compile error.
+    fn cValue(self: Stage) c.ghastty_glslang_stage_t {
+        return switch (self) {
+            .vertex => c.GHASTTY_GLSLANG_STAGE_VERTEX,
+            .fragment => c.GHASTTY_GLSLANG_STAGE_FRAGMENT,
+        };
+    }
+};
+
+pub const Error = error{
+    /// The compile-shim's underlying glslang C++ pipeline (TShader
+    /// preprocess / parse + TProgram link + GlslangToSpv) failed.
+    /// The shim's error message is logged via `std.log.err` before
+    /// this error is returned — no allocation is propagated to the
+    /// caller.
+    GlslangFailed,
+} || Allocator.Error;
+
+/// Compile a null-terminated GLSL source string to a Vulkan-flavored
+/// SPIR-V binary.
+///
+/// On success, returns a slice owned by `alloc`; the caller frees with
+/// `alloc.free(spv)`. The shim hands back its own malloc'd buffer
+/// which we copy into `alloc` so the caller's `defer alloc.free` works
+/// without remembering a separate `ghastty_glslang_free_spirv` call.
+///
+/// On failure, the shim's error string is logged with `std.log.err`
+/// and `error.GlslangFailed` is returned — the C-side malloc'd error
+/// buffer is freed before returning so callers don't have to. A
+/// success return code that comes back without a SPIR-V buffer is
+/// treated as a failure too. `error.OutOfMemory` is returned when the
+/// copy into `alloc` cannot be allocated.
+pub fn compileToSpv(
+    alloc: Allocator,
+    source: [:0]const u8,
+    stage: Stage,
+) Error![]const u32 {
+    // Start the out-params at null rather than `undefined`: both are
+    // read after the call (`err_ptr != null`, `spv_ptr[0..spv_len]`),
+    // so they must hold a defined value even if the shim returns
+    // before storing to them.
+    var spv_ptr: [*c]u32 = null;
+    var spv_len: usize = 0;
+    var err_ptr: [*c]u8 = null;
+
+    const rc = c.ghastty_glslang_compile_vulkan(
+        source.ptr,
+        stage.cValue(),
+        &spv_ptr,
+        &spv_len,
+        &err_ptr,
+    );
+    if (rc != 0) {
+        if (err_ptr != null) {
+            log.err("ghastty_glslang_compile_vulkan: rc={} {s}", .{
+                rc,
+                std.mem.span(@as([*:0]const u8, @ptrCast(err_ptr))),
+            });
+            c.ghastty_glslang_free_error(err_ptr);
+        } else {
+            log.err("ghastty_glslang_compile_vulkan: rc={} (no error string)", .{rc});
+        }
+        return error.GlslangFailed;
+    }
+    // The shim's free entry point is a plain `free`, so this is safe
+    // even when `spv_ptr` is null.
+    defer c.ghastty_glslang_free_spirv(spv_ptr);
+
+    // Defensive: never slice a null pointer if the shim ever reports
+    // success without producing a buffer.
+    if (spv_ptr == null) {
+        log.err("ghastty_glslang_compile_vulkan: rc=0 but no SPIR-V buffer", .{});
+        return error.GlslangFailed;
+    }
+
+    // Copy out of the shim's malloc into `alloc` so the caller's
+    // free path is symmetric with every other allocator-owned slice.
+    const owned = try alloc.alloc(u32, spv_len);
+    @memcpy(owned, spv_ptr[0..spv_len]);
+    return owned;
+}
--- a/pkg/vulkan/CommandPool.zig
+++ b/pkg/vulkan/CommandPool.zig
@ -0,0 +1,195 @@
+//! Wrapper for `VkCommandPool` with a one-shot command-buffer helper.
+//!
+//! Initially used by `vulkan/Texture.zig` for staging-buffer uploads:
+//! allocate a transient command buffer, record an upload + layout
+//! barriers, submit, wait for completion, free.
+//!
+//! Eventually the renderer will grow a separate per-frame command
+//! pool for the main draw stream; this pool stays around for
+//! infrequent operations like atlas uploads where blocking the
+//! caller is fine. The choice keeps the API small and avoids the
+//! complication of multi-frame fence tracking for resources that
+//! will outlive the upload.
+
+const Self = @This();
+
+const std = @import("std");
+const vk = @import("c.zig").c;
+
+const Device = @import("Device.zig");
+
+const log = std.log.scoped(.vulkan);
+
+pub const Error = error{
+    /// A `vkCreateCommandPool` / `vkAllocateCommandBuffers` /
+    /// `vkBeginCommandBuffer` / `vkEndCommandBuffer` / `vkQueueSubmit`
+    /// returned a non-success status. Logged with the raw `VkResult`.
+    VulkanFailed,
+};
+
+device: *const Device,
+pool: vk.VkCommandPool,
+
+/// Create a command pool on the device's graphics queue family. The
+/// pool is created with `TRANSIENT_BIT | RESET_COMMAND_BUFFER_BIT`
+/// because every command buffer we allocate here is short-lived and
+/// freed (or reset) immediately after submit.
+pub fn init(device: *const Device) Error!Self {
+    // Transient + resettable: buffers from this pool are short-lived.
+    const create_info: vk.VkCommandPoolCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+        .pNext = null,
+        .flags = vk.VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
+            vk.VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
+        .queueFamilyIndex = device.queue_family_index,
+    };
+
+    var handle: vk.VkCommandPool = undefined;
+    const result = device.dispatch.createCommandPool(
+        device.device,
+        &create_info,
+        null,
+        &handle,
+    );
+    if (result == vk.VK_SUCCESS) {
+        return .{ .device = device, .pool = handle };
+    }
+
+    log.err("vkCreateCommandPool failed: result={}", .{result});
+    return error.VulkanFailed;
+}
+
+/// Destroy the underlying `VkCommandPool` through the device dispatch
+/// table and reset `self` to `undefined`. The wrapper must not be used
+/// after this call.
+pub fn deinit(self: *Self) void {
+    self.device.dispatch.destroyCommandPool(self.device.device, self.pool, null);
+    self.* = undefined;
+}
+
+/// A one-shot recording session. Yielded from `beginOneShot`, drives
+/// `endAndSubmit` when the caller is done recording.
+pub const OneShot = struct {
+    pool: *Self,
+    cb: vk.VkCommandBuffer,
+
+    /// Finish recording, submit the buffer, block until the queue is
+    /// idle, and release the buffer.
+    ///
+    /// Commands are recorded beforehand directly via `cb` and the
+    /// device dispatch table (e.g.
+    /// `pool.device.dispatch.cmdPipelineBarrier(cb, …)`). The command
+    /// buffer has been freed by the time this returns, whether it
+    /// succeeds or returns `error.VulkanFailed`.
+    pub fn endAndSubmit(self: OneShot) Error!void {
+        const device = self.pool.device;
+
+        // Single cleanup point for every exit, success or failure.
+        // Leaving the buffer allocated on an error return would leak
+        // its slot until the pool is destroyed, and a caller that
+        // retries after `error.VulkanFailed` would eventually exhaust
+        // the pool.
+        //
+        // Vulkan-correctness wrinkle: a buffer in PENDING state
+        // (post-submit, pre-wait) cannot legally be freed — that's
+        // UB per the spec. `in_flight` is true exactly between a
+        // successful submit and a successful wait; if we leave in
+        // that window we `deviceWaitIdle` first to drag the buffer
+        // back to a safely-freeable state. Blocking the device once
+        // on an already-failed atlas upload is preferable to leaking
+        // the buffer or freeing it while PENDING.
+        var cmd = self.cb;
+        var in_flight: bool = false;
+        defer {
+            if (in_flight) {
+                _ = device.dispatch.deviceWaitIdle(device.device);
+            }
+            device.dispatch.freeCommandBuffers(device.device, self.pool.pool, 1, &cmd);
+        }
+
+        const end_result = device.dispatch.endCommandBuffer(cmd);
+        if (end_result != vk.VK_SUCCESS) {
+            log.err("vkEndCommandBuffer failed: result={}", .{end_result});
+            return error.VulkanFailed;
+        }
+
+        const submission: vk.VkSubmitInfo = .{
+            .sType = vk.VK_STRUCTURE_TYPE_SUBMIT_INFO,
+            .pNext = null,
+            .waitSemaphoreCount = 0,
+            .pWaitSemaphores = null,
+            .pWaitDstStageMask = null,
+            .commandBufferCount = 1,
+            .pCommandBuffers = &cmd,
+            .signalSemaphoreCount = 0,
+            .pSignalSemaphores = null,
+        };
+
+        // Goes through `Device.queueSubmit` rather than the raw
+        // dispatch entry because the queue is shared: splits/tabs
+        // submit atlas uploads here and the per-frame Frame.complete
+        // path uses the same queue.
+        const submit_result = device.queueSubmit(1, &submission, null);
+        if (submit_result != vk.VK_SUCCESS) {
+            log.err("vkQueueSubmit failed: result={}", .{submit_result});
+            return error.VulkanFailed;
+        }
+        in_flight = true;
+
+        // Block until the submit completes. Acceptable for one-shot
+        // uploads (atlas resizes are rare and the caller is willing
+        // to stall). Per-frame command submission will use fences
+        // and never queueWaitIdle.
+        const wait_result = device.queueWaitIdle();
+        if (wait_result != vk.VK_SUCCESS) {
+            log.err("vkQueueWaitIdle failed: result={}", .{wait_result});
+            return error.VulkanFailed;
+        }
+
+        // The queue is idle, so the deferred cleanup frees the buffer
+        // without draining the device. The pool itself stays around
+        // so back-to-back uploads can reuse it without re-allocating
+        // VkCommandPool.
+        in_flight = false;
+    }
+};
+
+/// Allocate + begin a transient command buffer for a one-shot
+/// upload. Pair with `OneShot.endAndSubmit`.
+pub fn beginOneShot(self: *Self) Error!OneShot {
+    const device = self.device;
+
+    // One primary command buffer out of this pool.
+    const allocate_info: vk.VkCommandBufferAllocateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+        .pNext = null,
+        .commandPool = self.pool,
+        .level = vk.VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+        .commandBufferCount = 1,
+    };
+    var cmd: vk.VkCommandBuffer = undefined;
+    const alloc_result = device.dispatch.allocateCommandBuffers(
+        device.device,
+        &allocate_info,
+        &cmd,
+    );
+    if (alloc_result != vk.VK_SUCCESS) {
+        log.err("vkAllocateCommandBuffers failed: result={}", .{alloc_result});
+        return error.VulkanFailed;
+    }
+    // From here on a failure must hand the buffer back to the pool.
+    errdefer device.dispatch.freeCommandBuffers(device.device, self.pool, 1, &cmd);
+
+    // Recorded once, submitted once.
+    const begin_info: vk.VkCommandBufferBeginInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+        .pNext = null,
+        .flags = vk.VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
+        .pInheritanceInfo = null,
+    };
+    const begin_result = device.dispatch.beginCommandBuffer(cmd, &begin_info);
+    if (begin_result != vk.VK_SUCCESS) {
+        log.err("vkBeginCommandBuffer failed: result={}", .{begin_result});
+        return error.VulkanFailed;
+    }
+
+    return .{ .pool = self, .cb = cmd };
+}
+
+test {
+    std.testing.refAllDecls(@This());
+}
--- a/pkg/vulkan/DescriptorPool.zig
+++ b/pkg/vulkan/DescriptorPool.zig
@ -0,0 +1,168 @@
+//! Wrapper for `VkDescriptorPool` with allocation + per-set helpers.
+//!
+//! Vulkan descriptor sets are the per-pipeline resource-binding
+//! handles: a descriptor set holds references to uniform buffers,
+//! sampled images, samplers, etc., that a particular shader stage
+//! draws from. They're allocated from a pool, populated via
+//! `vkUpdateDescriptorSets`, and bound at draw time with
+//! `vkCmdBindDescriptorSets`.
+//!
+//! Lifetime model: this wrapper assumes the pool outlives all sets
+//! allocated from it (caller arranges teardown order). Sets aren't
+//! individually freed — destroying the pool reclaims everything.
+//! That matches the per-frame pool pattern the renderer will use
+//! (reset the pool at frame start; reallocate the sets for that
+//! frame).
+//!
+//! Caps are caller-provided. Pass realistic numbers — over-pooling
+//! is fine; under-pooling fails at allocation time.
+
+const Self = @This();
+
+const std = @import("std");
+const vk = @import("c.zig").c;
+
+const Device = @import("Device.zig");
+
+const log = std.log.scoped(.vulkan);
+
+/// Failure modes of `init` and `allocate`. `InvalidPoolConfig` is
+/// only produced by `init`; `VulkanFailed` can come from either.
+pub const Error = error{
+    /// `vkCreateDescriptorPool` / `vkAllocateDescriptorSets` returned
+    /// a non-success status.
+    VulkanFailed,
+    /// Caller passed an invalid pool configuration (e.g. `max_sets ==
+    /// 0`, or every per-type cap is zero). Distinct from
+    /// `VulkanFailed` so callers can tell driver-side errors from
+    /// caller-side ones.
+    InvalidPoolConfig,
+};
+
+/// Construction caps. `max_sets` is the total number of descriptor
+/// sets the pool can ever vend; the per-type counts are individual
+/// resource counts pooled across all those sets.
+pub const Options = struct {
+    /// Device whose dispatch table and `VkDevice` handle are used to
+    /// create the pool. The pointer is stored in the resulting
+    /// wrapper and used again by `deinit` and `allocate`.
+    device: *const Device,
+    /// Forwarded to Vulkan as `maxSets`. `init` rejects 0 with
+    /// `error.InvalidPoolConfig`.
+    max_sets: u32,
+    /// Descriptor count for `VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER`.
+    /// 0 leaves the type out of the pool's size list.
+    uniform_buffers: u32 = 0,
+    /// Descriptor count for `VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER`.
+    /// 0 leaves the type out of the pool's size list.
+    combined_image_samplers: u32 = 0,
+    /// Descriptor count for `VK_DESCRIPTOR_TYPE_STORAGE_BUFFER`.
+    /// 0 leaves the type out of the pool's size list.
+    storage_buffers: u32 = 0,
+};
+
+device: *const Device,
+pool: vk.VkDescriptorPool,
+
+/// Create a descriptor pool sized according to `opts`.
+///
+/// Only descriptor types with a non-zero cap end up in the pool's
+/// size list. The pool is created with `flags = 0`, i.e. without
+/// `FREE_DESCRIPTOR_SET_BIT`.
+///
+/// Errors:
+///   - `error.InvalidPoolConfig` when `max_sets` is 0, or when all
+///     three per-type caps are 0. Both are checked before any Vulkan
+///     call is made.
+///   - `error.VulkanFailed` when `vkCreateDescriptorPool` does not
+///     return `VK_SUCCESS`; the result code is logged.
+pub fn init(opts: Options) Error!Self {
+    // A pool that can never vend a set is a caller bug; reject it
+    // here so the caller gets a clear error instead of a downstream
+    // allocation failure.
+    if (opts.max_sets == 0) {
+        log.err("DescriptorPool.init: max_sets must be > 0", .{});
+        return error.InvalidPoolConfig;
+    }
+
+    // One candidate entry per descriptor type that `Options` exposes,
+    // in the order they are handed to Vulkan.
+    const candidates = [3]vk.VkDescriptorPoolSize{
+        .{
+            .type = vk.VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .descriptorCount = opts.uniform_buffers,
+        },
+        .{
+            .type = vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+            .descriptorCount = opts.combined_image_samplers,
+        },
+        .{
+            .type = vk.VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .descriptorCount = opts.storage_buffers,
+        },
+    };
+
+    // Compact the non-zero candidates to the front of `pool_sizes`.
+    // Slots past `pool_size_count` stay undefined; Vulkan is told the
+    // exact count so they are never read.
+    var pool_sizes: [3]vk.VkDescriptorPoolSize = undefined;
+    var pool_size_count: u32 = 0;
+    for (candidates) |candidate| {
+        if (candidate.descriptorCount == 0) continue;
+        pool_sizes[pool_size_count] = candidate;
+        pool_size_count += 1;
+    }
+
+    // A pool that admits no descriptor type at all is useless.
+    if (pool_size_count == 0) {
+        log.err(
+            "DescriptorPool.init: at least one per-type cap must be > 0 " ++
+                "(uniform_buffers, combined_image_samplers, storage_buffers)",
+            .{},
+        );
+        return error.InvalidPoolConfig;
+    }
+
+    const device = opts.device;
+    const create_info: vk.VkDescriptorPoolCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+        .pNext = null,
+        // No FREE_DESCRIPTOR_SET_BIT: sets are reclaimed by destroying
+        // (or resetting) the pool, never individually.
+        .flags = 0,
+        .maxSets = opts.max_sets,
+        .poolSizeCount = pool_size_count,
+        .pPoolSizes = &pool_sizes,
+    };
+    var handle: vk.VkDescriptorPool = undefined;
+    const result = device.dispatch.createDescriptorPool(
+        device.device,
+        &create_info,
+        null,
+        &handle,
+    );
+    if (result != vk.VK_SUCCESS) {
+        log.err("vkCreateDescriptorPool failed: result={}", .{result});
+        return error.VulkanFailed;
+    }
+
+    return .{ .device = device, .pool = handle };
+}
+
+/// Destroy the underlying `VkDescriptorPool` and poison the wrapper.
+///
+/// `self` is overwritten with `undefined` afterwards and must not be
+/// used again.
+pub fn deinit(self: *Self) void {
+    const device = self.device;
+    device.dispatch.destroyDescriptorPool(device.device, self.pool, null);
+    self.* = undefined;
+}
+
+/// Vend one descriptor set from this pool, shaped by `layout`.
+///
+/// The returned set has nothing written to it yet; fill it with
+/// `vkUpdateDescriptorSets` before binding. A non-success result from
+/// `vkAllocateDescriptorSets` is logged and surfaces as
+/// `error.VulkanFailed`.
+pub fn allocate(
+    self: *Self,
+    layout: vk.VkDescriptorSetLayout,
+) Error!vk.VkDescriptorSet {
+    const device = self.device;
+
+    // Vulkan takes an array of layouts; we always pass exactly one.
+    const set_layouts = [1]vk.VkDescriptorSetLayout{layout};
+    const allocate_info: vk.VkDescriptorSetAllocateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+        .pNext = null,
+        .descriptorPool = self.pool,
+        .descriptorSetCount = 1,
+        .pSetLayouts = &set_layouts,
+    };
+
+    var descriptor_set: vk.VkDescriptorSet = undefined;
+    const result = device.dispatch.allocateDescriptorSets(
+        device.device,
+        &allocate_info,
+        &descriptor_set,
+    );
+    if (result == vk.VK_SUCCESS) return descriptor_set;
+
+    log.err("vkAllocateDescriptorSets failed: result={}", .{result});
+    return error.VulkanFailed;
+}
+
+// Reference every public declaration in this file so each one is
+// analyzed (and nested tests are picked up) when the tests are built.
+test {
+    std.testing.refAllDecls(@This());
+}
--- a/pkg/vulkan/Device.zig
+++ b/pkg/vulkan/Device.zig
@ -0,0 +1,679 @@
+//! Host-provided Vulkan device wrapper.
+//!
+//! libghostty does NOT call `vkCreateInstance` / `vkCreateDevice` for
+//! the Vulkan renderer: per `ghostty_platform_vulkan_s` in
+//! `include/ghostty.h`, the host (the apprt embedding libghostty —
+//! e.g. the Qt frontend) owns the entire Vulkan setup. We consume
+//! its handles via the platform callbacks, validate the version /
+//! extensions we need, and build a function-pointer dispatch table
+//! the rest of the renderer can use.
+//!
+//! Why host-owned? The host already has a Vulkan instance/device for
+//! its own compositing (Qt's RHI). Asking the host to share its
+//! device means rendered frames can be handed back as raw `VkImage`
+//! handles or dmabuf fds without a CPU readback or a second Vulkan
+//! instance fighting for the same GPU resources.
+//!
+//! Vulkan version: 1.3 (Jan 2022). Promotes dynamic rendering,
+//! sync2, extended dynamic state — all of which simplify a
+//! dirty-rect-style terminal renderer. Driver coverage is fine on
+//! every distro currently in support.
+//!
+//! Required device extensions (must be enabled on the host's
+//! VkDevice; we verify each on init):
+//!   - VK_KHR_external_memory_fd
+//!   - VK_EXT_external_memory_dma_buf
+//!   - VK_EXT_image_drm_format_modifier
+//!
+//! These are what let libghostty export the rendered VkImage memory
+//! as a dmabuf fd so the host can import it for zero-copy
+//! presentation (path 3 in the qt-vulkan-renderer scoping log:
+//! preserves Qt's QWidget composition model AND avoids the CPU
+//! readback the OpenGL path currently does).
+
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+
+const vk = @import("c.zig").c;
+
+const log = std.log.scoped(.vulkan);
+
+const Device = @This();
+
+/// Minimum Vulkan API version the renderer requires.
+pub const MIN_API_VERSION = vk.VK_API_VERSION_1_3;
+
+/// Device extensions libghostty requires on the host's VkDevice.
+/// The host must have created its VkDevice with these enabled;
+/// libghostty does not enable anything itself — `init` only checks
+/// that each name is listed by `vkEnumerateDeviceExtensionProperties`
+/// for the host's physical device.
+///
+/// `VK_EXT_image_drm_format_modifier` is what lets
+/// `vulkan/Target.zig` probe the per-modifier feature set (in
+/// particular: does `DRM_FORMAT_MOD_LINEAR` advertise
+/// `COLOR_ATTACHMENT_BIT`?) and, when supported, allocate the render
+/// image with `VkImageDrmFormatModifierExplicitCreateInfoEXT` so its
+/// memory can be exported as a dmabuf directly — no separate LINEAR
+/// `VkBuffer` and no end-of-frame `vkCmdCopyImageToBuffer`. Drivers
+/// where the modifier path can't satisfy the requested features fall
+/// back to the legacy OPTIMAL-plus-copy path inside `Target`.
+pub const REQUIRED_DEVICE_EXTENSIONS = [_][:0]const u8{
+    "VK_KHR_external_memory_fd",
+    "VK_EXT_external_memory_dma_buf",
+    "VK_EXT_image_drm_format_modifier",
+};
+
+/// Errors that can come out of `init` (in addition to allocator
+/// errors from the temporary extension-list buffer).
+pub const Error = error{
+    /// The host returned a null handle for `instance` / `device` /
+    /// `queue` / `physical_device`, or `get_instance_proc_addr`
+    /// failed to resolve a core Vulkan function we need to bootstrap.
+    ///
+    /// `init` also returns this when `vkGetDeviceProcAddr` yields
+    /// null for a device-level entry point, and when
+    /// `vkEnumerateDeviceExtensionProperties` fails or reports
+    /// `VK_INCOMPLETE` on the fill call.
+    ///
+    /// NOTE(review): no explicit null-handle check is visible in
+    /// `init` itself; presumably the caller validates the handles —
+    /// confirm.
+    HostHandleMissing,
+
+    /// The host's VkPhysicalDevice doesn't report a Vulkan API version
+    /// >= MIN_API_VERSION. Detected via `vkGetPhysicalDeviceProperties`.
+    UnsupportedVulkanVersion,
+
+    /// At least one entry in `REQUIRED_DEVICE_EXTENSIONS` was not
+    /// listed in `vkEnumerateDeviceExtensionProperties` for the
+    /// host's VkPhysicalDevice.
+    MissingRequiredExtension,
+};
+
+/// The function-pointer dispatch table libghostty resolves against the
+/// host's instance / device. We only enumerate the entry points the
+/// renderer actually uses; extending the table is the supported way
+/// for follow-up renderer code to call additional Vulkan functions.
+pub const Dispatch = struct {
+    // ---- instance-level -----------------------------------------
+    getPhysicalDeviceProperties: std.meta.Child(vk.PFN_vkGetPhysicalDeviceProperties),
+    getPhysicalDeviceMemoryProperties: std.meta.Child(vk.PFN_vkGetPhysicalDeviceMemoryProperties),
+    getPhysicalDeviceFormatProperties: std.meta.Child(vk.PFN_vkGetPhysicalDeviceFormatProperties),
+    /// Used by `Target` to chain `VkDrmFormatModifierPropertiesListEXT`
+    /// and enumerate which DRM modifiers the device exposes for a
+    /// given format. Vulkan 1.1 promoted `vkGetPhysicalDeviceFormatProperties2`
+    /// from `VK_KHR_get_physical_device_properties2` into core, so we
+    /// resolve it under the non-suffixed name — `MIN_API_VERSION`
+    /// (declared near the top of this file) is 1.3, well past the
+    /// promotion.
+    getPhysicalDeviceFormatProperties2: std.meta.Child(vk.PFN_vkGetPhysicalDeviceFormatProperties2),
+    enumerateDeviceExtensionProperties: std.meta.Child(vk.PFN_vkEnumerateDeviceExtensionProperties),
+    getDeviceProcAddr: std.meta.Child(vk.PFN_vkGetDeviceProcAddr),
+
+    // ---- device-level (resolved via getDeviceProcAddr) ----------
+    // Intentionally narrow for now — every additional renderer-side
+    // call adds a field here and a `DeviceLoader.load` lookup in
+    // `init`.
+    getDeviceQueue: std.meta.Child(vk.PFN_vkGetDeviceQueue),
+    deviceWaitIdle: std.meta.Child(vk.PFN_vkDeviceWaitIdle),
+
+    // Sampler — used by `vulkan/Sampler.zig`.
+    createSampler: std.meta.Child(vk.PFN_vkCreateSampler),
+    destroySampler: std.meta.Child(vk.PFN_vkDestroySampler),
+
+    // Texture (image + memory + view) — used by `vulkan/Texture.zig`.
+    createImage: std.meta.Child(vk.PFN_vkCreateImage),
+    destroyImage: std.meta.Child(vk.PFN_vkDestroyImage),
+    getImageMemoryRequirements: std.meta.Child(vk.PFN_vkGetImageMemoryRequirements),
+    allocateMemory: std.meta.Child(vk.PFN_vkAllocateMemory),
+    freeMemory: std.meta.Child(vk.PFN_vkFreeMemory),
+    bindImageMemory: std.meta.Child(vk.PFN_vkBindImageMemory),
+    createImageView: std.meta.Child(vk.PFN_vkCreateImageView),
+    destroyImageView: std.meta.Child(vk.PFN_vkDestroyImageView),
+
+    // Buffer (host-visible vertex / uniform / cell-data storage) —
+    // used by `vulkan/buffer.zig`.
+    createBuffer: std.meta.Child(vk.PFN_vkCreateBuffer),
+    destroyBuffer: std.meta.Child(vk.PFN_vkDestroyBuffer),
+    getBufferMemoryRequirements: std.meta.Child(vk.PFN_vkGetBufferMemoryRequirements),
+    bindBufferMemory: std.meta.Child(vk.PFN_vkBindBufferMemory),
+    mapMemory: std.meta.Child(vk.PFN_vkMapMemory),
+    unmapMemory: std.meta.Child(vk.PFN_vkUnmapMemory),
+
+    // Command pool / buffer + queue submit + recording —
+    // used by `vulkan/CommandPool.zig` and (later) per-frame command
+    // recording in `vulkan/Frame.zig`.
+    createCommandPool: std.meta.Child(vk.PFN_vkCreateCommandPool),
+    destroyCommandPool: std.meta.Child(vk.PFN_vkDestroyCommandPool),
+    allocateCommandBuffers: std.meta.Child(vk.PFN_vkAllocateCommandBuffers),
+    freeCommandBuffers: std.meta.Child(vk.PFN_vkFreeCommandBuffers),
+    beginCommandBuffer: std.meta.Child(vk.PFN_vkBeginCommandBuffer),
+    endCommandBuffer: std.meta.Child(vk.PFN_vkEndCommandBuffer),
+    queueSubmit: std.meta.Child(vk.PFN_vkQueueSubmit),
+    queueWaitIdle: std.meta.Child(vk.PFN_vkQueueWaitIdle),
+    cmdPipelineBarrier: std.meta.Child(vk.PFN_vkCmdPipelineBarrier),
+    cmdCopyBufferToImage: std.meta.Child(vk.PFN_vkCmdCopyBufferToImage),
+    cmdFillBuffer: std.meta.Child(vk.PFN_vkCmdFillBuffer),
+    cmdClearColorImage: std.meta.Child(vk.PFN_vkCmdClearColorImage),
+    cmdBindVertexBuffers: std.meta.Child(vk.PFN_vkCmdBindVertexBuffers),
+
+    // Shader modules — used by `vulkan/shaders.zig`.
+    createShaderModule: std.meta.Child(vk.PFN_vkCreateShaderModule),
+    destroyShaderModule: std.meta.Child(vk.PFN_vkDestroyShaderModule),
+
+    // Graphics pipeline + descriptor set layout —
+    // used by `vulkan/Pipeline.zig`.
+    createDescriptorSetLayout: std.meta.Child(vk.PFN_vkCreateDescriptorSetLayout),
+    destroyDescriptorSetLayout: std.meta.Child(vk.PFN_vkDestroyDescriptorSetLayout),
+    createPipelineLayout: std.meta.Child(vk.PFN_vkCreatePipelineLayout),
+    destroyPipelineLayout: std.meta.Child(vk.PFN_vkDestroyPipelineLayout),
+    createGraphicsPipelines: std.meta.Child(vk.PFN_vkCreateGraphicsPipelines),
+    destroyPipeline: std.meta.Child(vk.PFN_vkDestroyPipeline),
+
+    // External memory fd export — used by `vulkan/Target.zig`.
+    // `vkGetMemoryFdKHR` is from `VK_KHR_external_memory_fd`; needs
+    // device-level resolution like any other device function.
+    getMemoryFdKHR: std.meta.Child(vk.PFN_vkGetMemoryFdKHR),
+    getImageSubresourceLayout: std.meta.Child(vk.PFN_vkGetImageSubresourceLayout),
+    /// From `VK_EXT_image_drm_format_modifier`. Used by
+    /// `vulkan/Target.zig` after creating an image with the LIST
+    /// variant of the modifier create-info to discover which
+    /// modifier the driver actually chose.
+    getImageDrmFormatModifierPropertiesEXT: std.meta.Child(vk.PFN_vkGetImageDrmFormatModifierPropertiesEXT),
+
+    // Per-frame sync (fence + command-buffer reset) — used by
+    // `vulkan/Frame.zig`.
+    createFence: std.meta.Child(vk.PFN_vkCreateFence),
+    destroyFence: std.meta.Child(vk.PFN_vkDestroyFence),
+    waitForFences: std.meta.Child(vk.PFN_vkWaitForFences),
+    resetFences: std.meta.Child(vk.PFN_vkResetFences),
+    resetCommandBuffer: std.meta.Child(vk.PFN_vkResetCommandBuffer),
+
+    // Drawing — used by `vulkan/RenderPass.zig` (and the smoke
+    // test's renderTriangle helper). Vulkan 1.3 promoted
+    // `vkCmdBeginRendering` / `vkCmdEndRendering` from the
+    // `VK_KHR_dynamic_rendering` extension into core, so they're
+    // available without an extension opt-in.
+    cmdBeginRendering: std.meta.Child(vk.PFN_vkCmdBeginRendering),
+    cmdEndRendering: std.meta.Child(vk.PFN_vkCmdEndRendering),
+    cmdBindPipeline: std.meta.Child(vk.PFN_vkCmdBindPipeline),
+    cmdSetViewport: std.meta.Child(vk.PFN_vkCmdSetViewport),
+    cmdSetScissor: std.meta.Child(vk.PFN_vkCmdSetScissor),
+    cmdDraw: std.meta.Child(vk.PFN_vkCmdDraw),
+    cmdCopyImageToBuffer: std.meta.Child(vk.PFN_vkCmdCopyImageToBuffer),
+
+    // Descriptor sets — used by `vulkan/DescriptorPool.zig`. Per-
+    // surface lifetime today; per-frame pooling will follow when
+    // the actual renderer integration lands.
+    createDescriptorPool: std.meta.Child(vk.PFN_vkCreateDescriptorPool),
+    destroyDescriptorPool: std.meta.Child(vk.PFN_vkDestroyDescriptorPool),
+    resetDescriptorPool: std.meta.Child(vk.PFN_vkResetDescriptorPool),
+    allocateDescriptorSets: std.meta.Child(vk.PFN_vkAllocateDescriptorSets),
+    updateDescriptorSets: std.meta.Child(vk.PFN_vkUpdateDescriptorSets),
+    cmdBindDescriptorSets: std.meta.Child(vk.PFN_vkCmdBindDescriptorSets),
+};
+
+// ---- fields ---------------------------------------------------------
+
+instance: vk.VkInstance,
+physical_device: vk.VkPhysicalDevice,
+device: vk.VkDevice,
+queue: vk.VkQueue,
+queue_family_index: u32,
+
+/// The Vulkan API version the host's physical device reports. Always
+/// >= `MIN_API_VERSION` (if it were lower, `init` returns
+/// `error.UnsupportedVulkanVersion`).
+api_version: u32,
+
+/// Cached `VkPhysicalDeviceMemoryProperties`. The properties are
+/// immutable for the physical device's lifetime, so we query once
+/// at `init` time instead of on every `findMemoryType` call (which
+/// happens for every Buffer/Texture/Target allocation).
+memory_properties: vk.VkPhysicalDeviceMemoryProperties,
+
+dispatch: Dispatch,
+
+/// Process-wide mutex protecting access to `queue`. Vulkan requires
+/// external synchronization of `VkQueue` — `vkQueueSubmit` and
+/// `vkQueueWaitIdle` from multiple threads must not overlap. Splits
+/// and tabs share the host's single queue (one VkQueue per process),
+/// so the mutex serializes submissions across all renderer threads.
+/// Use via `Device.queueSubmit` / `Device.queueWaitIdle`.
+var queue_mutex: std.Thread.Mutex = .{};
+
+/// Submit work to the host queue while holding `queue_mutex`.
+///
+/// Thin wrapper over the dispatch table's `vkQueueSubmit`: the
+/// arguments are forwarded unchanged together with `self.queue`, and
+/// the raw `VkResult` is handed back without interpretation. Every
+/// submission to the host queue (Frame, atlas upload, etc.) MUST use
+/// this entry point so renderer threads from splits/tabs never call
+/// into the queue concurrently.
+pub fn queueSubmit(
+    self: *const Device,
+    submit_count: u32,
+    submits: [*c]const vk.VkSubmitInfo,
+    fence: vk.VkFence,
+) vk.VkResult {
+    const submit_fn = self.dispatch.queueSubmit;
+
+    queue_mutex.lock();
+    defer queue_mutex.unlock();
+
+    return submit_fn(
+        self.queue,
+        submit_count,
+        submits,
+        fence,
+    );
+}
+
+/// Wait for the host queue to go idle while holding `queue_mutex`.
+///
+/// Forwards to the dispatch table's `vkQueueWaitIdle` on `self.queue`
+/// and returns its `VkResult` as-is. The mutex is held for the whole
+/// call so it cannot overlap with a `queueSubmit` or another
+/// `queueWaitIdle` made through this wrapper.
+pub fn queueWaitIdle(self: *const Device) vk.VkResult {
+    const wait_idle_fn = self.dispatch.queueWaitIdle;
+
+    queue_mutex.lock();
+    defer queue_mutex.unlock();
+
+    return wait_idle_fn(self.queue);
+}
+
+// ---- API ------------------------------------------------------------
+
+/// Pre-resolved host-Vulkan handles passed into `Device.init`. Keeps
+/// `pkg/vulkan` independent of any apprt type — callers (e.g.
+/// libghostty's `src/renderer/Vulkan.zig`) translate their own
+/// platform-callback struct into this neutral shape.
+///
+/// `init` copies every handle below into the resulting `Device`
+/// unchanged; ownership stays with the host.
+pub const HostBootstrap = struct {
+    /// Instance passed to `vkGetInstanceProcAddr` when resolving the
+    /// instance-level entry points.
+    instance: vk.VkInstance,
+    /// Physical device queried for its API version, extension list
+    /// and memory properties.
+    physical_device: vk.VkPhysicalDevice,
+    /// Logical device passed to `vkGetDeviceProcAddr` when resolving
+    /// the device-level entry points.
+    device: vk.VkDevice,
+    /// Queue used by `Device.queueSubmit` / `Device.queueWaitIdle`.
+    queue: vk.VkQueue,
+    /// Stored on the `Device` as-is; `init` does not validate it.
+    queue_family_index: u32,
+    /// Root proc-addr resolver. `Device.init` uses this to pull
+    /// `vkGetInstanceProcAddr` itself plus every instance-level
+    /// function it needs to bootstrap the dispatch table. It is cast
+    /// to the `vkGetInstanceProcAddr` function-pointer type, so it
+    /// must point at a function with exactly that signature.
+    get_instance_proc_addr_raw: *const anyopaque,
+};
+
+/// Build a `Device` from pre-resolved host handles. Performs:
+///   1. Load the instance-level dispatch via `vkGetInstanceProcAddr`.
+///   2. Verify `physicalDeviceProperties.apiVersion >= 1.3`.
+///   3. Verify every entry in `REQUIRED_DEVICE_EXTENSIONS` is present
+///      on the physical device.
+///   4. Load the device-level dispatch via `vkGetDeviceProcAddr`.
+///
+/// On success the returned `Device` is ready for the renderer to
+/// build pipelines / images / command buffers against. The host
+/// retains ownership of `instance` / `device` / `queue` — `deinit`
+/// is a no-op stub for symmetry.
+pub fn init(
+    alloc: Allocator,
+    boot: HostBootstrap,
+) (Error || Allocator.Error)!Device {
+    const instance = boot.instance;
+    const physical_device = boot.physical_device;
+    const device = boot.device;
+    const queue = boot.queue;
+    const queue_family_index = boot.queue_family_index;
+
+    // ---- instance-level dispatch ---------------------------------
+    // The caller-provided get_instance_proc_addr is our root entry
+    // point. We resolve other functions via vkGetInstanceProcAddr
+    // (instance, name); per the Vulkan spec, passing a non-null
+    // instance is valid for any function that takes an instance,
+    // physical device, device, or child object of any of these — i.e.
+    // everything we care about.
+    const get_instance_proc_addr: std.meta.Child(vk.PFN_vkGetInstanceProcAddr) =
+        @ptrCast(@alignCast(boot.get_instance_proc_addr_raw));
+
+    const InstanceLoader = struct {
+        instance: vk.VkInstance,
+        get_instance_proc_addr: std.meta.Child(vk.PFN_vkGetInstanceProcAddr),
+
+        fn load(self: @This(), comptime T: type, name: [*:0]const u8) Error!std.meta.Child(T) {
+            const fp = self.get_instance_proc_addr(self.instance, name) orelse {
+                log.err("vkGetInstanceProcAddr returned null for {s}", .{name});
+                return error.HostHandleMissing;
+            };
+            return @ptrCast(fp);
+        }
+    };
+    const il: InstanceLoader = .{
+        .instance = instance,
+        .get_instance_proc_addr = get_instance_proc_addr,
+    };
+
+    const get_physical_device_properties =
+        try il.load(vk.PFN_vkGetPhysicalDeviceProperties, "vkGetPhysicalDeviceProperties");
+    const get_physical_device_memory_properties =
+        try il.load(vk.PFN_vkGetPhysicalDeviceMemoryProperties, "vkGetPhysicalDeviceMemoryProperties");
+    const get_physical_device_format_properties =
+        try il.load(vk.PFN_vkGetPhysicalDeviceFormatProperties, "vkGetPhysicalDeviceFormatProperties");
+    const get_physical_device_format_properties_2 =
+        try il.load(vk.PFN_vkGetPhysicalDeviceFormatProperties2, "vkGetPhysicalDeviceFormatProperties2");
+    const enumerate_device_extension_properties =
+        try il.load(vk.PFN_vkEnumerateDeviceExtensionProperties, "vkEnumerateDeviceExtensionProperties");
+    const get_device_proc_addr =
+        try il.load(vk.PFN_vkGetDeviceProcAddr, "vkGetDeviceProcAddr");
+
+    // ---- version check ------------------------------------------
+    var props: vk.VkPhysicalDeviceProperties = std.mem.zeroes(vk.VkPhysicalDeviceProperties);
+    get_physical_device_properties(physical_device, &props);
+    if (props.apiVersion < MIN_API_VERSION) {
+        log.err(
+            "host VkPhysicalDevice reports Vulkan {}.{}.{}, need >= {}.{}.{}",
+            .{
+                vk.VK_API_VERSION_MAJOR(props.apiVersion),
+                vk.VK_API_VERSION_MINOR(props.apiVersion),
+                vk.VK_API_VERSION_PATCH(props.apiVersion),
+                vk.VK_API_VERSION_MAJOR(MIN_API_VERSION),
+                vk.VK_API_VERSION_MINOR(MIN_API_VERSION),
+                vk.VK_API_VERSION_PATCH(MIN_API_VERSION),
+            },
+        );
+        return error.UnsupportedVulkanVersion;
+    }
+
+    // ---- extension check ----------------------------------------
+    var ext_count: u32 = 0;
+    {
+        const r = enumerate_device_extension_properties(physical_device, null, &ext_count, null);
+        // SUCCESS or INCOMPLETE both populate `ext_count`. INCOMPLETE
+        // shouldn't happen on the count-only call (no buffer to
+        // truncate) but we accept it defensively.
+        if (r != vk.VK_SUCCESS and r != vk.VK_INCOMPLETE) {
+            log.err("vkEnumerateDeviceExtensionProperties (count) failed: result={}", .{r});
+            return error.HostHandleMissing;
+        }
+    }
+    const exts = try alloc.alloc(vk.VkExtensionProperties, ext_count);
+    defer alloc.free(exts);
+    {
+        const r = enumerate_device_extension_properties(physical_device, null, &ext_count, exts.ptr);
+        if (r != vk.VK_SUCCESS and r != vk.VK_INCOMPLETE) {
+            log.err("vkEnumerateDeviceExtensionProperties (fill) failed: result={}", .{r});
+            return error.HostHandleMissing;
+        }
+        // VK_INCOMPLETE here means the extension list grew between
+        // the count and fill calls (race with a driver hot-reload —
+        // very unlikely in practice but spec-permitted). The
+        // partially-filled buffer is still authoritative for the
+        // entries it does contain, but a required extension not yet
+        // populated would be missed. Treat as a hard fail since the
+        // extension presence check below would silently pass on a
+        // truncated list.
+        if (r == vk.VK_INCOMPLETE) {
+            log.err(
+                "vkEnumerateDeviceExtensionProperties returned INCOMPLETE; " ++
+                    "device extension list changed between count and fill",
+                .{},
+            );
+            return error.HostHandleMissing;
+        }
+    }
+
+    inline for (REQUIRED_DEVICE_EXTENSIONS) |required| {
+        var found = false;
+        for (exts) |ext| {
+            const name_cstr: [*:0]const u8 = @ptrCast(&ext.extensionName);
+            if (std.mem.eql(u8, std.mem.span(name_cstr), required)) {
+                found = true;
+                break;
+            }
+        }
+        if (!found) {
+            log.err("required Vulkan device extension missing: {s}", .{required});
+            return error.MissingRequiredExtension;
+        }
+    }
+
+    // ---- device-level dispatch ----------------------------------
+    const DeviceLoader = struct {
+        device: vk.VkDevice,
+        get_device_proc_addr: std.meta.Child(vk.PFN_vkGetDeviceProcAddr),
+
+        fn load(self: @This(), comptime T: type, name: [*:0]const u8) Error!std.meta.Child(T) {
+            const fp = self.get_device_proc_addr(self.device, name) orelse {
+                log.err("vkGetDeviceProcAddr returned null for {s}", .{name});
+                return error.HostHandleMissing;
+            };
+            return @ptrCast(fp);
+        }
+    };
+    const dl: DeviceLoader = .{
+        .device = device,
+        .get_device_proc_addr = get_device_proc_addr,
+    };
+
+    const get_device_queue =
+        try dl.load(vk.PFN_vkGetDeviceQueue, "vkGetDeviceQueue");
+    const device_wait_idle =
+        try dl.load(vk.PFN_vkDeviceWaitIdle, "vkDeviceWaitIdle");
+    const create_sampler =
+        try dl.load(vk.PFN_vkCreateSampler, "vkCreateSampler");
+    const destroy_sampler =
+        try dl.load(vk.PFN_vkDestroySampler, "vkDestroySampler");
+    const create_image =
+        try dl.load(vk.PFN_vkCreateImage, "vkCreateImage");
+    const destroy_image =
+        try dl.load(vk.PFN_vkDestroyImage, "vkDestroyImage");
+    const get_image_memory_requirements =
+        try dl.load(vk.PFN_vkGetImageMemoryRequirements, "vkGetImageMemoryRequirements");
+    const allocate_memory =
+        try dl.load(vk.PFN_vkAllocateMemory, "vkAllocateMemory");
+    const free_memory =
+        try dl.load(vk.PFN_vkFreeMemory, "vkFreeMemory");
+    const bind_image_memory =
+        try dl.load(vk.PFN_vkBindImageMemory, "vkBindImageMemory");
+    const create_image_view =
+        try dl.load(vk.PFN_vkCreateImageView, "vkCreateImageView");
+    const destroy_image_view =
+        try dl.load(vk.PFN_vkDestroyImageView, "vkDestroyImageView");
+    const create_buffer =
+        try dl.load(vk.PFN_vkCreateBuffer, "vkCreateBuffer");
+    const destroy_buffer =
+        try dl.load(vk.PFN_vkDestroyBuffer, "vkDestroyBuffer");
+    const get_buffer_memory_requirements =
+        try dl.load(vk.PFN_vkGetBufferMemoryRequirements, "vkGetBufferMemoryRequirements");
+    const bind_buffer_memory =
+        try dl.load(vk.PFN_vkBindBufferMemory, "vkBindBufferMemory");
+    const map_memory =
+        try dl.load(vk.PFN_vkMapMemory, "vkMapMemory");
+    const unmap_memory =
+        try dl.load(vk.PFN_vkUnmapMemory, "vkUnmapMemory");
+    const create_command_pool =
+        try dl.load(vk.PFN_vkCreateCommandPool, "vkCreateCommandPool");
+    const destroy_command_pool =
+        try dl.load(vk.PFN_vkDestroyCommandPool, "vkDestroyCommandPool");
+    const allocate_command_buffers =
+        try dl.load(vk.PFN_vkAllocateCommandBuffers, "vkAllocateCommandBuffers");
+    const free_command_buffers =
+        try dl.load(vk.PFN_vkFreeCommandBuffers, "vkFreeCommandBuffers");
+    const begin_command_buffer =
+        try dl.load(vk.PFN_vkBeginCommandBuffer, "vkBeginCommandBuffer");
+    const end_command_buffer =
+        try dl.load(vk.PFN_vkEndCommandBuffer, "vkEndCommandBuffer");
+    const queue_submit =
+        try dl.load(vk.PFN_vkQueueSubmit, "vkQueueSubmit");
+    const queue_wait_idle =
+        try dl.load(vk.PFN_vkQueueWaitIdle, "vkQueueWaitIdle");
+    const cmd_pipeline_barrier =
+        try dl.load(vk.PFN_vkCmdPipelineBarrier, "vkCmdPipelineBarrier");
+    const cmd_copy_buffer_to_image =
+        try dl.load(vk.PFN_vkCmdCopyBufferToImage, "vkCmdCopyBufferToImage");
+    const cmd_fill_buffer =
+        try dl.load(vk.PFN_vkCmdFillBuffer, "vkCmdFillBuffer");
+    const cmd_clear_color_image =
+        try dl.load(vk.PFN_vkCmdClearColorImage, "vkCmdClearColorImage");
+    const cmd_bind_vertex_buffers =
+        try dl.load(vk.PFN_vkCmdBindVertexBuffers, "vkCmdBindVertexBuffers");
+    const create_shader_module =
+        try dl.load(vk.PFN_vkCreateShaderModule, "vkCreateShaderModule");
+    const destroy_shader_module =
+        try dl.load(vk.PFN_vkDestroyShaderModule, "vkDestroyShaderModule");
+    const create_descriptor_set_layout =
+        try dl.load(vk.PFN_vkCreateDescriptorSetLayout, "vkCreateDescriptorSetLayout");
+    const destroy_descriptor_set_layout =
+        try dl.load(vk.PFN_vkDestroyDescriptorSetLayout, "vkDestroyDescriptorSetLayout");
+    const create_pipeline_layout =
+        try dl.load(vk.PFN_vkCreatePipelineLayout, "vkCreatePipelineLayout");
+    const destroy_pipeline_layout =
+        try dl.load(vk.PFN_vkDestroyPipelineLayout, "vkDestroyPipelineLayout");
+    const create_graphics_pipelines =
+        try dl.load(vk.PFN_vkCreateGraphicsPipelines, "vkCreateGraphicsPipelines");
+    const destroy_pipeline =
+        try dl.load(vk.PFN_vkDestroyPipeline, "vkDestroyPipeline");
+    const get_memory_fd_khr =
+        try dl.load(vk.PFN_vkGetMemoryFdKHR, "vkGetMemoryFdKHR");
+    const get_image_subresource_layout =
+        try dl.load(vk.PFN_vkGetImageSubresourceLayout, "vkGetImageSubresourceLayout");
+    const get_image_drm_format_modifier_properties_ext =
+        try dl.load(vk.PFN_vkGetImageDrmFormatModifierPropertiesEXT, "vkGetImageDrmFormatModifierPropertiesEXT");
+    const create_fence =
+        try dl.load(vk.PFN_vkCreateFence, "vkCreateFence");
+    const destroy_fence =
+        try dl.load(vk.PFN_vkDestroyFence, "vkDestroyFence");
+    const wait_for_fences =
+        try dl.load(vk.PFN_vkWaitForFences, "vkWaitForFences");
+    const reset_fences =
+        try dl.load(vk.PFN_vkResetFences, "vkResetFences");
+    const reset_command_buffer =
+        try dl.load(vk.PFN_vkResetCommandBuffer, "vkResetCommandBuffer");
+    const cmd_begin_rendering =
+        try dl.load(vk.PFN_vkCmdBeginRendering, "vkCmdBeginRendering");
+    const cmd_end_rendering =
+        try dl.load(vk.PFN_vkCmdEndRendering, "vkCmdEndRendering");
+    const cmd_bind_pipeline =
+        try dl.load(vk.PFN_vkCmdBindPipeline, "vkCmdBindPipeline");
+    const cmd_set_viewport =
+        try dl.load(vk.PFN_vkCmdSetViewport, "vkCmdSetViewport");
+    const cmd_set_scissor =
+        try dl.load(vk.PFN_vkCmdSetScissor, "vkCmdSetScissor");
+    const cmd_draw =
+        try dl.load(vk.PFN_vkCmdDraw, "vkCmdDraw");
+    const cmd_copy_image_to_buffer =
+        try dl.load(vk.PFN_vkCmdCopyImageToBuffer, "vkCmdCopyImageToBuffer");
+    const create_descriptor_pool =
+        try dl.load(vk.PFN_vkCreateDescriptorPool, "vkCreateDescriptorPool");
+    const destroy_descriptor_pool =
+        try dl.load(vk.PFN_vkDestroyDescriptorPool, "vkDestroyDescriptorPool");
+    const reset_descriptor_pool =
+        try dl.load(vk.PFN_vkResetDescriptorPool, "vkResetDescriptorPool");
+    const allocate_descriptor_sets =
+        try dl.load(vk.PFN_vkAllocateDescriptorSets, "vkAllocateDescriptorSets");
+    const update_descriptor_sets =
+        try dl.load(vk.PFN_vkUpdateDescriptorSets, "vkUpdateDescriptorSets");
+    const cmd_bind_descriptor_sets =
+        try dl.load(vk.PFN_vkCmdBindDescriptorSets, "vkCmdBindDescriptorSets");
+
+    // Snapshot the memory properties once. They never change for
+    // the device's lifetime, so per-allocation re-queries (which
+    // findMemoryType used to do) were pure waste.
+    var memory_properties: vk.VkPhysicalDeviceMemoryProperties = undefined;
+    get_physical_device_memory_properties(physical_device, &memory_properties);
+
+    return .{
+        .instance = instance,
+        .physical_device = physical_device,
+        .device = device,
+        .queue = queue,
+        .queue_family_index = queue_family_index,
+        .api_version = props.apiVersion,
+        .memory_properties = memory_properties,
+        .dispatch = .{
+            .getPhysicalDeviceProperties = get_physical_device_properties,
+            .getPhysicalDeviceMemoryProperties = get_physical_device_memory_properties,
+            .getPhysicalDeviceFormatProperties = get_physical_device_format_properties,
+            .getPhysicalDeviceFormatProperties2 = get_physical_device_format_properties_2,
+            .enumerateDeviceExtensionProperties = enumerate_device_extension_properties,
+            .getDeviceProcAddr = get_device_proc_addr,
+            .getDeviceQueue = get_device_queue,
+            .deviceWaitIdle = device_wait_idle,
+            .createSampler = create_sampler,
+            .destroySampler = destroy_sampler,
+            .createImage = create_image,
+            .destroyImage = destroy_image,
+            .getImageMemoryRequirements = get_image_memory_requirements,
+            .allocateMemory = allocate_memory,
+            .freeMemory = free_memory,
+            .bindImageMemory = bind_image_memory,
+            .createImageView = create_image_view,
+            .destroyImageView = destroy_image_view,
+            .createBuffer = create_buffer,
+            .destroyBuffer = destroy_buffer,
+            .getBufferMemoryRequirements = get_buffer_memory_requirements,
+            .bindBufferMemory = bind_buffer_memory,
+            .mapMemory = map_memory,
+            .unmapMemory = unmap_memory,
+            .createCommandPool = create_command_pool,
+            .destroyCommandPool = destroy_command_pool,
+            .allocateCommandBuffers = allocate_command_buffers,
+            .freeCommandBuffers = free_command_buffers,
+            .beginCommandBuffer = begin_command_buffer,
+            .endCommandBuffer = end_command_buffer,
+            .queueSubmit = queue_submit,
+            .queueWaitIdle = queue_wait_idle,
+            .cmdPipelineBarrier = cmd_pipeline_barrier,
+            .cmdCopyBufferToImage = cmd_copy_buffer_to_image,
+            .cmdFillBuffer = cmd_fill_buffer,
+            .cmdClearColorImage = cmd_clear_color_image,
+            .cmdBindVertexBuffers = cmd_bind_vertex_buffers,
+            .createShaderModule = create_shader_module,
+            .destroyShaderModule = destroy_shader_module,
+            .createDescriptorSetLayout = create_descriptor_set_layout,
+            .destroyDescriptorSetLayout = destroy_descriptor_set_layout,
+            .createPipelineLayout = create_pipeline_layout,
+            .destroyPipelineLayout = destroy_pipeline_layout,
+            .createGraphicsPipelines = create_graphics_pipelines,
+            .destroyPipeline = destroy_pipeline,
+            .getMemoryFdKHR = get_memory_fd_khr,
+            .getImageSubresourceLayout = get_image_subresource_layout,
+            .getImageDrmFormatModifierPropertiesEXT = get_image_drm_format_modifier_properties_ext,
+            .createFence = create_fence,
+            .destroyFence = destroy_fence,
+            .waitForFences = wait_for_fences,
+            .resetFences = reset_fences,
+            .resetCommandBuffer = reset_command_buffer,
+            .cmdBeginRendering = cmd_begin_rendering,
+            .cmdEndRendering = cmd_end_rendering,
+            .cmdBindPipeline = cmd_bind_pipeline,
+            .cmdSetViewport = cmd_set_viewport,
+            .cmdSetScissor = cmd_set_scissor,
+            .cmdDraw = cmd_draw,
+            .cmdCopyImageToBuffer = cmd_copy_image_to_buffer,
+            .createDescriptorPool = create_descriptor_pool,
+            .destroyDescriptorPool = destroy_descriptor_pool,
+            .resetDescriptorPool = reset_descriptor_pool,
+            .allocateDescriptorSets = allocate_descriptor_sets,
+            .updateDescriptorSets = update_descriptor_sets,
+            .cmdBindDescriptorSets = cmd_bind_descriptor_sets,
+        },
+    };
+}
+
+/// Symmetry-only: every handle is host-owned, so no Vulkan object is
+/// destroyed here. Provided so callers can `defer device.deinit()`
+/// without special-casing.
+pub fn deinit(self: *Device) void {
+    // Overwrite the whole struct with `undefined` so the stored
+    // handles and dispatch table are not reused by accident after
+    // this call.
+    self.* = undefined;
+}
+
+/// Wait for all work on the device to finish (`vkDeviceWaitIdle`).
+/// Intended to run ahead of renderer teardown so that no command
+/// buffer is still in flight. The function itself never fails: a
+/// non-success result such as `VK_ERROR_DEVICE_LOST` is only logged
+/// as a warning, so that callers who go on to destroy resources on a
+/// dead device leave a diagnostic trail for any later failure in a
+/// vkDestroy* call.
+pub fn waitIdle(self: *const Device) void {
+    const status = self.dispatch.deviceWaitIdle(self.device);
+    if (status == vk.VK_SUCCESS) return;
+
+    log.warn("vkDeviceWaitIdle returned {}; teardown proceeding anyway", .{status});
+}
+
+/// Pick the first memory type index that is both permitted by
+/// `type_bits` (a `VkMemoryRequirements.memoryTypeBits` bitmask, one
+/// bit per memory type index) and has every flag in `required_props`
+/// set. Returns null when no memory type qualifies.
+///
+/// The lookup reads the memory properties cached on the `Device`, so
+/// it performs no Vulkan calls.
+///
+/// Used by `vulkan/Texture.zig` (and later `vulkan/Buffer.zig`) to
+/// pick an appropriate heap for a freshly created image/buffer.
+pub fn findMemoryType(
+    self: *const Device,
+    type_bits: u32,
+    required_props: vk.VkMemoryPropertyFlags,
+) ?u32 {
+    const count = self.memory_properties.memoryTypeCount;
+    var index: u32 = 0;
+    while (index < count) : (index += 1) {
+        // Bit `index` of the mask says whether this type is allowed.
+        const allowed = ((type_bits >> @intCast(index)) & 1) != 0;
+        if (!allowed) continue;
+
+        const flags = self.memory_properties.memoryTypes[index].propertyFlags;
+        if ((flags & required_props) != required_props) continue;
+
+        return index;
+    }
+    return null;
+}
+
+test {
+    // Reference every public decl in this file so it gets
+    // semantically analyzed; this lets the renderer bring-up catch
+    // signature mismatches against the Vulkan binding before the
+    // apprt-side wiring lands. The actual init path requires a real
+    // host-provided Vulkan device and is exercised end-to-end once
+    // the Qt frontend wires up `ghostty_platform_vulkan_s`.
+    std.testing.refAllDecls(@This());
+}
--- a/pkg/vulkan/Sampler.zig
+++ b/pkg/vulkan/Sampler.zig
@ -0,0 +1,163 @@
+//! Wrapper for `VkSampler` — the immutable filter / wrap configuration
+//! the GPU uses when sampling a texture.
+//!
+//! libghostty doesn't share samplers across textures (the OpenGL
+//! backend already creates one per texture-shaped need); we keep the
+//! same per-callsite ownership model so the renderer interface
+//! matches.
+//!
+//! Counterpart: `src/renderer/opengl/Sampler.zig`.
+
+const Self = @This();
+
+const std = @import("std");
+const vk = @import("c.zig").c;
+
+const Device = @import("Device.zig");
+
+const log = std.log.scoped(.vulkan);
+
+/// Texel filter mode. Maps 1:1 to `VkFilter` (which is a `c_uint`):
+/// each tag's integer value is the matching `VK_FILTER_*` constant,
+/// so `@intFromEnum` can be stored directly in
+/// `VkSamplerCreateInfo.magFilter` / `minFilter`.
+pub const Filter = enum(c_uint) {
+    nearest = vk.VK_FILTER_NEAREST,
+    linear = vk.VK_FILTER_LINEAR,
+};
+
+/// Texture coordinate wrap mode. Maps 1:1 to `VkSamplerAddressMode`
+/// (a `c_uint`): each tag's integer value is the matching
+/// `VK_SAMPLER_ADDRESS_MODE_*` constant, so `@intFromEnum` can be
+/// stored directly in `VkSamplerCreateInfo.addressModeU` / `V`.
+pub const AddressMode = enum(c_uint) {
+    repeat = vk.VK_SAMPLER_ADDRESS_MODE_REPEAT,
+    mirrored_repeat = vk.VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT,
+    clamp_to_edge = vk.VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
+    clamp_to_border = vk.VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER,
+};
+
+/// Sampler construction parameters. The same shape as the OpenGL
+/// backend's `Sampler.Options` (so generic.zig can call
+/// `Sampler.init(api.samplerOptions())` against either backend), with
+/// a `device` reference so we can call `vkCreateSampler` against the
+/// host's VkDevice without threading a global through.
+pub const Options = struct {
+    /// Device used to create the sampler. The pointer is stored in
+    /// the returned `Sampler` and dereferenced again in `deinit`, so
+    /// it must stay valid until the sampler is destroyed.
+    device: *const Device,
+    /// Minification filter. Ignored when `unnormalized_coordinates`
+    /// is set; `mag_filter` is used for both filters in that case.
+    min_filter: Filter,
+    /// Magnification filter.
+    mag_filter: Filter,
+    /// Wrap mode written to `addressModeU`. Ignored when
+    /// `unnormalized_coordinates` is set.
+    wrap_s: AddressMode,
+    /// Wrap mode written to `addressModeV`. Ignored when
+    /// `unnormalized_coordinates` is set.
+    wrap_t: AddressMode,
+
+    /// Vulkan-only: enable sampling with non-normalized texel
+    /// coordinates (so `texture(s, p)` reads texel `p` directly
+    /// rather than mapping `[0,1] x [0,1]` over the image).
+    ///
+    /// This is what makes `sampler2D` behave like the OpenGL
+    /// `sampler2DRect` the renderer's text shaders were originally
+    /// authored against (after `vulkanizeGlsl` rewrites the type
+    /// name). Vulkan imposes a long list of co-requirements when
+    /// this is enabled — `init` forces them rather than rejecting
+    /// inputs that violate them:
+    ///
+    ///   - `magFilter == minFilter`  (we use `mag_filter`)
+    ///   - `mipmapMode = NEAREST`
+    ///   - `addressModeU/V` must be CLAMP_TO_EDGE / CLAMP_TO_BORDER
+    ///     (we force CLAMP_TO_EDGE, ignoring `wrap_s/wrap_t`)
+    ///   - `anisotropyEnable = FALSE`
+    ///   - `compareEnable = FALSE`
+    ///   - `minLod == maxLod == 0`
+    ///
+    /// The bound image view must also be 1D or 2D with one mip
+    /// level and one array layer — true for the glyph atlas.
+    unnormalized_coordinates: bool = false,
+};
+
+/// Errors returned by `init`.
+pub const Error = error{
+    /// `vkCreateSampler` returned a non-success status. The raw
+    /// `VkResult` value is logged before this error is returned.
+    VulkanFailed,
+};
+
+/// Handle produced by `vkCreateSampler` in `init`; destroyed in
+/// `deinit`.
+sampler: vk.VkSampler,
+/// Device the sampler was created on (copied from `Options.device`).
+/// `deinit` goes through it to call `vkDestroySampler`.
+device: *const Device,
+
+/// Build a `VkSampler` on the host's VkDevice from `opts`. The
+/// returned value owns the handle (libghostty controls its
+/// lifetime); release it with `deinit`.
+///
+/// With `opts.unnormalized_coordinates` set, the create-info is
+/// coerced into the shape Vulkan demands for unnormalized sampling
+/// instead of rejecting the request: `min_filter` is replaced by
+/// `mag_filter`, both address modes become CLAMP_TO_EDGE, and the
+/// mipmap mode becomes NEAREST.
+///
+/// Returns `error.VulkanFailed` (after logging the raw `VkResult`)
+/// when `vkCreateSampler` does not succeed.
+pub fn init(opts: Options) Error!Self {
+    const device = opts.device;
+    const unnormalized = opts.unnormalized_coordinates;
+
+    // Unnormalized sampling requires magFilter == minFilter (the
+    // sampling stage doesn't get to pick between them), so the
+    // caller's `min_filter` is overridden by `mag_filter`.
+    const min_filter: Filter = if (unnormalized) opts.mag_filter else opts.min_filter;
+
+    // Unnormalized sampling only permits CLAMP_TO_EDGE or
+    // CLAMP_TO_BORDER; there is no use for the latter here, so the
+    // caller's wrap modes are replaced by CLAMP_TO_EDGE.
+    const wrap_u: AddressMode = if (unnormalized) .clamp_to_edge else opts.wrap_s;
+    const wrap_v: AddressMode = if (unnormalized) .clamp_to_edge else opts.wrap_t;
+
+    const create_info: vk.VkSamplerCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
+        .pNext = null,
+        .flags = 0,
+        .magFilter = @intFromEnum(opts.mag_filter),
+        .minFilter = @intFromEnum(min_filter),
+        // The glyph atlases are 2D textures without mips, and with
+        // `minLod == maxLod == 0` the mipmap mode has no effect.
+        // NEAREST is mandatory for unnormalized sampling; LINEAR is
+        // kept otherwise in case atlas mips are ever generated.
+        .mipmapMode = if (unnormalized)
+            vk.VK_SAMPLER_MIPMAP_MODE_NEAREST
+        else
+            vk.VK_SAMPLER_MIPMAP_MODE_LINEAR,
+        .addressModeU = @intFromEnum(wrap_u),
+        .addressModeV = @intFromEnum(wrap_v),
+        // 2D textures never sample along W, but the field still has
+        // to hold a valid mode.
+        .addressModeW = vk.VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
+        .mipLodBias = 0,
+        // Anisotropy is a per-physical-device feature toggle that
+        // the terminal grid gains nothing from; leaving it off
+        // avoids coordinating the feature flag with the host and is
+        // also required for unnormalized sampling.
+        .anisotropyEnable = vk.VK_FALSE,
+        .maxAnisotropy = 1,
+        .compareEnable = vk.VK_FALSE,
+        .compareOp = vk.VK_COMPARE_OP_ALWAYS,
+        .minLod = 0,
+        .maxLod = 0,
+        .borderColor = vk.VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK,
+        .unnormalizedCoordinates = @intFromBool(unnormalized),
+    };
+
+    var handle: vk.VkSampler = undefined;
+    const status = device.dispatch.createSampler(device.device, &create_info, null, &handle);
+    if (status != vk.VK_SUCCESS) {
+        log.err("vkCreateSampler failed: result={}", .{status});
+        return error.VulkanFailed;
+    }
+
+    return .{ .sampler = handle, .device = device };
+}
+
+/// Destroy the `VkSampler` through the owning device's dispatch
+/// table.
+///
+/// Takes `Self` by value rather than `*Self`: `Sampler` is held by
+/// value at every call site (`const samp = try Sampler.init(...)`),
+/// and taking the address of a `const` binding yields a
+/// `*const Self`, which does not coerce to `*Self`.
+/// CommandPool/DescriptorPool take `*Self` because they're held in
+/// mutable slots; this asymmetry follows container ownership, not a
+/// stylistic choice.
+pub fn deinit(self: Self) void {
+    self.device.dispatch.destroySampler(self.device.device, self.sampler, null);
+}
+
+test {
+    // Reference the public decls in this file so they are
+    // semantically analyzed under `zig test`, even though creating a
+    // sampler needs a `Device` supplied from elsewhere.
+    std.testing.refAllDecls(@This());
+}
--- a/pkg/vulkan/build.zig
+++ b/pkg/vulkan/build.zig
@ -0,0 +1,14 @@
+const std = @import("std");
+
+/// Package build entry point: registers the `vulkan` module rooted
+/// at `main.zig`. Nothing in the body can currently fail, so the
+/// error union in the return type is unused.
+pub fn build(b: *std.Build) !void {
+    // `addModule` registers "vulkan" on `b`'s module table; consumers
+    // (`src/build/SharedDeps.zig`) reach it via
+    // `b.lazyDependency("vulkan", ...).module("vulkan")`. No return
+    // value or further wiring is needed here — Vulkan headers
+    // (`vulkan-headers` package) sit on the default system include
+    // path and libvulkan is link-system'd by the top-level build.
+    // Same pattern as `pkg/opengl/build.zig`.
+    _ = b.addModule("vulkan", .{
+        .root_source_file = b.path("main.zig"),
+    });
+}
--- a/pkg/vulkan/c.zig
+++ b/pkg/vulkan/c.zig
@ -0,0 +1,16 @@
+// Vulkan core API + the dmabuf-related extensions the renderer relies
+// on for zero-copy presentation:
+//
+//   - VK_KHR_external_memory / VK_KHR_external_memory_fd
+//   - VK_EXT_external_memory_dma_buf
+//   - VK_EXT_image_drm_format_modifier
+//
+// VK_USE_PLATFORM_* macros are intentionally NOT set here — the
+// renderer talks to its host purely via dmabuf fds (handed back to
+// the apprt's `ghostty_platform_vulkan_s.present` callback), so
+// libghostty never sees a wl_display or xcb_connection. That keeps
+// the binding portable and lets the host (Qt RHI) do all the
+// platform-specific compositing.
+pub const c = @cImport({
+    @cInclude("vulkan/vulkan.h");
+});
--- a/pkg/vulkan/main.zig
+++ b/pkg/vulkan/main.zig
@ -0,0 +1,30 @@
+//! Vulkan bindings.
+//!
+//! Shaped after `pkg/opengl/`: `c` is the raw C API (a thin `@cImport`
+//! wrapper around the system Vulkan headers); the per-resource files
+//! alongside provide opinionated typed wrappers the renderer
+//! consumes as primitives.
+//!
+//! The Vulkan renderer in `src/renderer/vulkan/` builds renderer
+//! policy on top of these (Pipeline / RenderPass / Frame / Target
+//! etc.); anything that's pure Vulkan-API plumbing belongs here.
+//!
+//! Vulkan core API + the dmabuf-related extensions the renderer relies
+//! on for zero-copy presentation:
+//!
+//!   - VK_KHR_external_memory / VK_KHR_external_memory_fd
+//!   - VK_EXT_external_memory_dma_buf
+//!   - VK_EXT_image_drm_format_modifier
+//!
+//! VK_USE_PLATFORM_* macros are intentionally NOT set in `c.zig` —
+//! libghostty talks to its host purely via dmabuf fds (handed back to
+//! the apprt's `ghostty_platform_vulkan_s.present` callback), so it
+//! never sees a `wl_display` or `xcb_connection`. That keeps the
+//! binding portable and lets the host (Qt RHI) do all the
+//! platform-specific compositing.
+
+pub const c = @import("c.zig").c;
+pub const Device = @import("Device.zig");
+pub const Sampler = @import("Sampler.zig");
+pub const CommandPool = @import("CommandPool.zig");
+pub const DescriptorPool = @import("DescriptorPool.zig");
--- a/qt/CMakeLists.txt
+++ b/qt/CMakeLists.txt
@ -30,7 +30,7 @@ set(CMAKE_AUTOMOC ON)
 include(GNUInstallDirs)

 find_package(Qt6 REQUIRED COMPONENTS Gui Widgets OpenGL DBus
-  Multimedia Svg)
+  Multimedia Svg WaylandClient)
 # WindowBlur + XkbTracker use qpa/qplatformnativeinterface.h to reach
 # the wl_display / wl_surface / wl_seat for native compositor calls
 # (blur, layer-shell screen pinning, raw wl_keyboard listeners). The
@ -43,7 +43,7 @@ find_package(Qt6 REQUIRED COMPONENTS Gui Widgets OpenGL DBus
 # CMake config (older Debian) and we fall back to hand-wiring the
 # include dir below.
 set(QT_NO_PRIVATE_MODULE_WARNING ON)
-find_package(Qt6 QUIET OPTIONAL_COMPONENTS GuiPrivate)
+find_package(Qt6 QUIET OPTIONAL_COMPONENTS GuiPrivate WaylandClientPrivate)

 # LayerShellQt: the quick terminal is a wlr-layer-shell dropdown window.
 find_package(LayerShellQt REQUIRED)
@ -53,6 +53,11 @@ find_package(LayerShellQt REQUIRED)
 # QPA native-handle accessors.
 find_package(PkgConfig REQUIRED)
 pkg_check_modules(WAYLAND_CLIENT REQUIRED IMPORTED_TARGET wayland-client)
+# libEGL is only needed by the OpenGL variant — `EglDmabufTarget`
+# uses EGL_MESA_image_dma_buf_export to export an FBO-backed
+# texture as a dmabuf. The Vulkan variant gets dmabufs straight
+# from `VK_KHR_external_memory_fd` and never calls into EGL, so
+# the EGL pkg-config + IMPORTED_TARGET is gated below.
 # libxkbcommon: derive the unshifted Unicode codepoint for a key event
 # from its XKB keycode, so libghostty's kitty encoder finds an entry for
 # punctuation keys (Qt's ev->key() reports the SHIFTED symbol, e.g.
@ -60,22 +65,105 @@ pkg_check_modules(WAYLAND_CLIENT REQUIRED IMPORTED_TARGET wayland-client)
 pkg_check_modules(XKBCOMMON REQUIRED IMPORTED_TARGET xkbcommon)
 find_program(WAYLAND_SCANNER wayland-scanner REQUIRED)

-# Generate client glue for the org_kde_kwin_blur protocol.
-set(BLUR_XML "${CMAKE_CURRENT_SOURCE_DIR}/protocols/blur.xml")
-set(BLUR_HEADER "${CMAKE_CURRENT_BINARY_DIR}/blur-client-protocol.h")
-set(BLUR_CODE "${CMAKE_CURRENT_BINARY_DIR}/blur-protocol.c")
-add_custom_command(OUTPUT "${BLUR_HEADER}"
-  COMMAND "${WAYLAND_SCANNER}" client-header "${BLUR_XML}" "${BLUR_HEADER}"
-  DEPENDS "${BLUR_XML}" VERBATIM)
-add_custom_command(OUTPUT "${BLUR_CODE}"
-  COMMAND "${WAYLAND_SCANNER}" private-code "${BLUR_XML}" "${BLUR_CODE}"
-  DEPENDS "${BLUR_XML}" VERBATIM)
+# `ghastty_wayland_protocol(<basename> <header_var> <code_var>)` —
+# Generates `<basename>-client-protocol.h` + `<basename>-protocol.c`
+# in `CMAKE_CURRENT_BINARY_DIR` from `protocols/<basename>.xml` via
+# `wayland-scanner`. Sets `<header_var>` and `<code_var>` in the
+# caller's scope to the generated paths so the caller can hand them
+# to `add_executable`'s source list.
+#
+# Each `add_custom_command` is independent and depends only on the
+# XML: the `private-code` output is self-contained and does not
+# `#include` the generated client header. Both outputs must still
+# appear in `add_executable`'s source list — that is what makes
+# CMake run these commands for the target, and it ensures the header
+# is generated before the project sources that include it are
+# compiled. Mirrors the pre-collapse pattern (two custom commands
+# per protocol) — only the boilerplate is gone.
+function(ghastty_wayland_protocol basename header_var code_var)
+  set(xml "${CMAKE_CURRENT_SOURCE_DIR}/protocols/${basename}.xml")
+  set(hdr "${CMAKE_CURRENT_BINARY_DIR}/${basename}-client-protocol.h")
+  set(src "${CMAKE_CURRENT_BINARY_DIR}/${basename}-protocol.c")
+  add_custom_command(OUTPUT "${hdr}"
+    COMMAND "${WAYLAND_SCANNER}" client-header "${xml}" "${hdr}"
+    DEPENDS "${xml}" VERBATIM)
+  add_custom_command(OUTPUT "${src}"
+    COMMAND "${WAYLAND_SCANNER}" private-code "${xml}" "${src}"
+    DEPENDS "${xml}" VERBATIM)
+  # Export the generated paths to the caller's scope.
+  set("${header_var}" "${hdr}" PARENT_SCOPE)
+  set("${code_var}" "${src}" PARENT_SCOPE)
+endfunction()
+
+# Per-protocol notes:
+#   - `blur` (`org_kde_kwin_blur`)             — KWin background-blur.
+#   - `linux-dmabuf-v1`                        — Vulkan present path:
+#       wrap libghostty's dmabuf fd in a `wl_buffer` for the
+#       wayland::SubsurfacePresenter's `wl_surface`.
+#   - `viewporter` (`wp_viewporter`)           — destination size in
+#       surface-local coords; decouples the buffer's pixel dimensions
+#       from how big the subsurface appears on screen (fractional
+#       scaling).
+#   - `fractional-scale-v1` (`wp_fractional_scale_v1`)
+#       — compositor reports per-surface fractional scale (120ths).
+#       Used as the authoritative scale for buffer sizing, avoiding
+#       any sync lag with Qt's `devicePixelRatioF()` cache.
+ghastty_wayland_protocol(blur                 BLUR_HEADER       BLUR_CODE)
+ghastty_wayland_protocol(linux-dmabuf-v1      DMABUF_HEADER     DMABUF_CODE)
+ghastty_wayland_protocol(viewporter           VIEWPORTER_HEADER VIEWPORTER_CODE)
+ghastty_wayland_protocol(fractional-scale-v1  FRACSCALE_HEADER  FRACSCALE_CODE)
+#   - `alpha-modifier-v1` (`wp_alpha_modifier_v1`)
+#       — compositor-side per-surface alpha multiplier. QtWayland has no
+#       built-in setWindowOpacity equivalent (the QPA plugin warns
+#       "This plugin does not support setting window opacity" on every
+#       call), so QuickTerminal's fade-in/out drives this protocol
+#       directly. Supported on KWin, wlroots ≥0.17, Hyprland; NOT yet
+#       on mutter/GNOME.
+ghastty_wayland_protocol(alpha-modifier-v1    ALPHAMOD_HEADER   ALPHAMOD_CODE)

 # libghostty is built out-of-tree by Zig.
 get_filename_component(GHOSTTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/.." ABSOLUTE)
 set(GHOSTTY_LIB_DIR "${GHOSTTY_ROOT}/zig-out/lib")
 set(GHOSTTY_SO "${GHOSTTY_LIB_DIR}/ghostty-internal.so")

+# Variant: which libghostty.so this build links against. The
+# rendering backend is baked into libghostty (Zig builds with
+# `-Drenderer=opengl` vs `-Drenderer=vulkan` produce ABI-compatible
+# but functionally distinct .so's), so the variant here is purely a
+# *compile-time selector*. The binary name and install layout do
+# NOT change — `${CMAKE_INSTALL_BINDIR}/ghastty` and
+# `${CMAKE_INSTALL_LIBDIR}/libghostty.so` for both. Developers who
+# want both flavors installed at once should use distinct prefixes
+# (`cmake --install --prefix /tmp/ghastty-vulkan`).
+#
+# Set via `cmake -DGHASTTY_VARIANT=vulkan -S qt -B qt/build-vulkan`.
+set(GHASTTY_VARIANT "opengl" CACHE STRING
+    "Renderer variant: opengl (default) or vulkan")
+set_property(CACHE GHASTTY_VARIANT PROPERTY STRINGS opengl vulkan)
+# Validate the cache value: STRINGS only constrains the cmake-gui
+# dropdown, not the command-line. `-DGHASTTY_VARIANT=foo` would
+# otherwise silently fall into the OpenGL branch below.
+if(NOT GHASTTY_VARIANT STREQUAL "opengl" AND
+   NOT GHASTTY_VARIANT STREQUAL "vulkan")
+  message(FATAL_ERROR
+    "GHASTTY_VARIANT='${GHASTTY_VARIANT}' is invalid; "
+    "must be 'opengl' or 'vulkan'.")
+endif()
+message(STATUS "Building variant=${GHASTTY_VARIANT}")
+
+# Compile-time renderer pick. Each binary is linked against exactly
+# one libghostty.so variant (opengl or vulkan), so the renderer
+# choice is inherent to the binary — no need for a runtime env var.
+# GhosttySurface.cpp branches on GHASTTY_USE_VULKAN to spin up the
+# Vulkan host vs the OpenGL context.
+if(GHASTTY_VARIANT STREQUAL "vulkan")
+  add_compile_definitions(GHASTTY_USE_VULKAN)
+endif()
+
+# libEGL: needed by `opengl/EglDmabufTarget.cpp` for the OpenGL
+# variant's zero-copy present path. Vulkan-variant binaries never
+# pull in this source file (gated below) so the loader doesn't have
+# to be installed for Vulkan-only systems.
+if(GHASTTY_VARIANT STREQUAL "opengl")
+  pkg_check_modules(EGL REQUIRED IMPORTED_TARGET egl)
+endif()
+
 if(NOT EXISTS "${GHOSTTY_SO}")
  message(FATAL_ERROR
    "libghostty not found at ${GHOSTTY_SO}\n"
@ -119,12 +207,40 @@ add_executable(ghastty
  src/TabWidget.cpp
  src/undo/UndoStack.cpp
  src/Util.cpp
+  src/wayland/AlphaModifier.cpp
+  src/wayland/SubsurfacePresenter.cpp
  src/WindowBlur.cpp
  src/XkbTracker.cpp
  "${BLUR_CODE}"
  "${BLUR_HEADER}"
+  "${DMABUF_CODE}"
+  "${DMABUF_HEADER}"
+  "${VIEWPORTER_CODE}"
+  "${VIEWPORTER_HEADER}"
+  "${FRACSCALE_CODE}"
+  "${FRACSCALE_HEADER}"
+  "${ALPHAMOD_CODE}"
+  "${ALPHAMOD_HEADER}"
 )

+# Vulkan host glue is variant-only. Adding it to the OpenGL build
+# would force an unconditional libvulkan link on a binary that
+# never calls into Vulkan, defeating the point of the variant
+# split documented above: each binary should depend only on the
+# libraries its own renderer needs.
+if(GHASTTY_VARIANT STREQUAL "vulkan")
+  target_sources(ghastty PRIVATE src/vulkan/Host.cpp)
+endif()
+
+# `opengl/EglDmabufTarget.cpp` is OpenGL-variant only. The Vulkan
+# variant exports dmabufs straight from VkDeviceMemory via
+# VK_KHR_external_memory_fd and never calls into EGL, so excluding
+# this source file from the Vulkan binary lets it stay free of
+# libEGL too.
+if(GHASTTY_VARIANT STREQUAL "opengl")
+  target_sources(ghastty PRIVATE src/opengl/EglDmabufTarget.cpp)
+endif()
+
 # Embed the app icon so it is available even running from the build tree.
 qt_add_resources(ghastty "appicon"
  PREFIX "/"
@ -151,7 +267,32 @@ target_link_libraries(ghastty PRIVATE
  "${GHOSTTY_LINK_SO}"
 )

+# libEGL is OpenGL-variant only — gated alongside the source file
+# in the variant block above. Vulkan-variant binaries don't pull
+# in libEGL at all.
+if(GHASTTY_VARIANT STREQUAL "opengl")
+  target_link_libraries(ghastty PRIVATE PkgConfig::EGL)
+endif()
+
+# libvulkan is Vulkan-variant only. The OpenGL variant compiles
+# nothing that references Vulkan symbols (vulkan/Host.cpp is gated
+# above), so not linking libvulkan keeps OpenGL-only systems from
+# needing the loader installed at runtime — matching the
+# documented side-by-side variant story above.
+#
+# The loader is resolved through pkg-config, the same mechanism this
+# file uses for wayland-client, xkbcommon and EGL, instead of a bare
+# `vulkan` library name. A missing loader/dev package then fails at
+# configure time with a clear message rather than at link time, and
+# the include path from vulkan.pc is propagated to the target.
+if(GHASTTY_VARIANT STREQUAL "vulkan")
+  pkg_check_modules(VULKAN REQUIRED IMPORTED_TARGET vulkan)
+  target_link_libraries(ghastty PRIVATE PkgConfig::VULKAN)
+endif()
+
 # Hook up the private QPA headers (see find_package above).
+#
+# Qt6::WaylandClientPrivate gives us QtWaylandClient::QWaylandWindow,
+# which we cast the QPA platform window to in GhosttySurface to call
+# `commit()` directly — that forces a parent wl_surface commit at the
+# moment our subsurface state is ready, instead of waiting on Qt's
+# backing-store flush which never fires for our translucent widget.
+if(TARGET Qt6::WaylandClientPrivate)
+  target_link_libraries(ghastty PRIVATE Qt6::WaylandClientPrivate)
+endif()
 if(TARGET Qt6::GuiPrivate)
  target_link_libraries(ghastty PRIVATE Qt6::GuiPrivate)
 else()
@ -187,6 +328,9 @@ endif()
 #   actual zig-out artifact), and the .so's NEEDED entries also point
 #   into zig-out/lib for transitive deps.
 # - Installed: libghostty.so lives next to the binary ($ORIGIN/../lib).
+# Same layout regardless of variant — the binary name doesn't change,
+# the .so path doesn't change. Side-by-side installs of two variants
+# need separate `--prefix`es.
 set_target_properties(ghastty PROPERTIES
  BUILD_RPATH "${GHOSTTY_LINK_DIR};${GHOSTTY_LIB_DIR}"
  INSTALL_RPATH "$ORIGIN/../${CMAKE_INSTALL_LIBDIR}"
--- a/qt/protocols/alpha-modifier-v1.xml
+++ b/qt/protocols/alpha-modifier-v1.xml
@ -0,0 +1,118 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<protocol name="alpha_modifier_v1">
+  <copyright>
+    Copyright 2023 Xaver Hugl
+
+    Permission is hereby granted, free of charge, to any person obtaining a
+    copy of this software and associated documentation files (the "Software"),
+    to deal in the Software without restriction, including without limitation
+    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+    and/or sell copies of the Software, and to permit persons to whom the
+    Software is furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice (including the next
+    paragraph) shall be included in all copies or substantial portions of the
+    Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+  </copyright>
+
+  <description summary="surface alpha modifier">
+    This interface allows a client to set a factor for the alpha values on a
+    surface, which can be used to offload such operations to the compositor,
+    which can in turn for example offload them to KMS.
+
+    Warning! The protocol described in this file is currently in the testing
+    phase. Backward compatible changes may be added together with the
+    corresponding interface version bump. Backward incompatible changes can
+    only be done by creating a new major version of the extension.
+  </description>
+
+  <interface name="wp_alpha_modifier_v1" version="1">
+    <description summary="surface alpha modifier manager">
+      This interface allows a client to set a factor for the alpha values on
+      a surface, which can be used to offload such operations to the
+      compositor. The default factor is UINT32_MAX.
+
+      This interface can be used to set an arbitrary alpha value for the
+      surface, allowing it to be made fully transparent by setting the factor
+      to 0, fully opaque by setting it to UINT32_MAX, or any value in
+      between.
+
+      Warning! The protocol described in this file is currently in the
+      testing phase. Backward compatible changes may be added together with
+      the corresponding interface version bump. Backward incompatible changes
+      can only be done by creating a new major version of the extension.
+    </description>
+
+    <request name="destroy" type="destructor">
+      <description summary="destroy the alpha modifier manager object">
+        Destroy the alpha modifier manager. This doesn't destroy objects
+        created with the manager.
+      </description>
+    </request>
+
+    <request name="get_surface">
+      <description summary="create a new alpha modifier surface object">
+        Create a new alpha modifier surface object associated with the given
+        wl_surface. If there is already such an object associated with the
+        wl_surface, the already_constructed error will be raised.
+      </description>
+      <arg name="id" type="new_id" interface="wp_alpha_modifier_surface_v1"/>
+      <arg name="surface" type="object" interface="wl_surface"/>
+    </request>
+
+    <enum name="error">
+      <entry name="already_constructed" value="0"
+             summary="wl_surface already has a alpha modifier object associated"/>
+    </enum>
+  </interface>
+
+  <interface name="wp_alpha_modifier_surface_v1" version="1">
+    <description summary="modifier object for a surface">
+      This interface allows the client to set a factor for the alpha values on
+      a surface, which can be used to offload such operations to the
+      compositor. Multiple alpha modifiers can be attached to the same
+      surface, in which case the resulting alpha will be the product of all
+      the multiplicative factors.
+
+      The default factor is UINT32_MAX.
+    </description>
+
+    <request name="destroy" type="destructor">
+      <description summary="remove the alpha modifier from the surface">
+        This destroys the object, and is equivalent to set_multiplier with
+        a value of UINT32_MAX, with the same double-buffered semantics as
+        set_multiplier.
+      </description>
+    </request>
+
+    <request name="set_multiplier">
+      <description summary="set the alpha multiplier">
+        Sets the alpha multiplier for the surface. The alpha multiplier is
+        double-buffered state, see wl_surface.commit for details.
+
+        The default factor is UINT32_MAX.
+
+        This factor is applied in the compositor's blending space, as an
+        additional step after the processing of per-pixel alpha values for
+        the surface. It allows to set an arbitrary alpha value for the
+        surface, including making the surface partially transparent even when
+        all the pixels are fully opaque, or fully transparent even when the
+        pixels are not.
+      </description>
+      <arg name="factor" type="uint" summary="the new alpha multiplier for the surface"/>
+    </request>
+
+    <enum name="error">
+      <entry name="no_surface" value="0"
+             summary="wl_surface was destroyed"/>
+    </enum>
+  </interface>
+</protocol>
--- a/qt/protocols/fractional-scale-v1.xml
+++ b/qt/protocols/fractional-scale-v1.xml
@ -0,0 +1,102 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<protocol name="fractional_scale_v1">
+  <copyright>
+    Copyright © 2022 Kenny Levinsen
+
+    Permission is hereby granted, free of charge, to any person obtaining a
+    copy of this software and associated documentation files (the "Software"),
+    to deal in the Software without restriction, including without limitation
+    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+    and/or sell copies of the Software, and to permit persons to whom the
+    Software is furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice (including the next
+    paragraph) shall be included in all copies or substantial portions of the
+    Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+  </copyright>
+
+  <description summary="Protocol for requesting fractional surface scales">
+    This protocol allows a compositor to suggest for surfaces to render at
+    fractional scales.
+
+    A client can submit scaled content by utilizing wp_viewport. This is done by
+    creating a wp_viewport object for the surface and setting the destination
+    rectangle to the surface size before the scale factor is applied.
+
+    The buffer size is calculated by multiplying the surface size by the
+    intended scale.
+
+    The wl_surface buffer scale should remain set to 1.
+
+    If a surface has a surface-local size of 100 px by 50 px and wishes to
+    submit buffers with a scale of 1.5, then a buffer of 150px by 75 px should
+    be used and the wp_viewport destination rectangle should be 100 px by 50 px.
+
+    For toplevel surfaces, the size is rounded halfway away from zero. The
+    rounding algorithm for subsurface position and size is not defined.
+  </description>
+
+  <interface name="wp_fractional_scale_manager_v1" version="1">
+    <description summary="fractional surface scale information">
+      A global interface for requesting surfaces to use fractional scales.
+    </description>
+
+    <request name="destroy" type="destructor">
+      <description summary="unbind the fractional surface scale interface">
+        Informs the server that the client will not be using this protocol
+        object anymore. This does not affect any other objects,
+        wp_fractional_scale_v1 objects included.
+      </description>
+    </request>
+
+    <enum name="error">
+      <entry name="fractional_scale_exists" value="0"
+        summary="the surface already has a fractional_scale object associated"/>
+    </enum>
+
+    <request name="get_fractional_scale">
+      <description summary="extend surface interface for scale information">
+        Create an add-on object for the wl_surface to let the compositor
+        request fractional scales. If the given wl_surface already has a
+        wp_fractional_scale_v1 object associated, the fractional_scale_exists
+        protocol error is raised.
+      </description>
+      <arg name="id" type="new_id" interface="wp_fractional_scale_v1"
+           summary="the new surface scale info interface id"/>
+      <arg name="surface" type="object" interface="wl_surface"
+           summary="the surface"/>
+    </request>
+  </interface>
+
+  <interface name="wp_fractional_scale_v1" version="1">
+    <description summary="fractional scale interface to a wl_surface">
+      An additional interface to a wl_surface object which allows the compositor
+      to inform the client of the preferred scale.
+    </description>
+
+    <request name="destroy" type="destructor">
+      <description summary="remove surface scale information for surface">
+        Destroy the fractional scale object. When this object is destroyed,
+        preferred_scale events will no longer be sent.
+      </description>
+    </request>
+
+    <event name="preferred_scale">
+      <description summary="notify of new preferred scale">
+        Notification of a new preferred scale for this surface that the
+        compositor suggests that the client should use.
+
+        The sent scale is the numerator of a fraction with a denominator of 120.
+      </description>
+      <arg name="scale" type="uint" summary="the new preferred scale"/>
+    </event>
+  </interface>
+</protocol>
--- a/qt/protocols/linux-dmabuf-v1.xml
+++ b/qt/protocols/linux-dmabuf-v1.xml
@ -0,0 +1,585 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<protocol name="linux_dmabuf_v1">
+
+  <copyright>
+    Copyright © 2014, 2015 Collabora, Ltd.
+
+    Permission is hereby granted, free of charge, to any person obtaining a
+    copy of this software and associated documentation files (the "Software"),
+    to deal in the Software without restriction, including without limitation
+    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+    and/or sell copies of the Software, and to permit persons to whom the
+    Software is furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice (including the next
+    paragraph) shall be included in all copies or substantial portions of the
+    Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+  </copyright>
+
+  <interface name="zwp_linux_dmabuf_v1" version="5">
+    <description summary="factory for creating dmabuf-based wl_buffers">
+      This interface offers ways to create generic dmabuf-based wl_buffers.
+
+      For more information about dmabuf, see:
+      https://www.kernel.org/doc/html/next/userspace-api/dma-buf-alloc-exchange.html
+
+      Clients can use the get_surface_feedback request to get dmabuf feedback
+      for a particular surface. If the client wants to retrieve feedback not
+      tied to a surface, they can use the get_default_feedback request.
+
+      The following are required from clients:
+
+      - Clients must ensure that either all data in the dma-buf is
+        coherent for all subsequent read access or that coherency is
+        correctly handled by the underlying kernel-side dma-buf
+        implementation.
+
+      - Don't make any more attachments after sending the buffer to the
+        compositor. Making more attachments later increases the risk of
+        the compositor not being able to use (re-import) an existing
+        dmabuf-based wl_buffer.
+
+      The underlying graphics stack must ensure the following:
+
+      - The dmabuf file descriptors relayed to the server will stay valid
+        for the whole lifetime of the wl_buffer. This means the server may
+        at any time use those fds to import the dmabuf into any kernel
+        sub-system that might accept it.
+
+      However, when the underlying graphics stack fails to deliver the
+      promise, because of e.g. a device hot-unplug which raises internal
+      errors, after the wl_buffer has been successfully created the
+      compositor must not raise protocol errors to the client when dmabuf
+      import later fails.
+
+      To create a wl_buffer from one or more dmabufs, a client creates a
+      zwp_linux_buffer_params_v1 object with a zwp_linux_dmabuf_v1.create_params
+      request. All planes required by the intended format are added with
+      the 'add' request. Finally, a 'create' or 'create_immed' request is
+      issued, which has the following outcome depending on the import success.
+
+      The 'create' request,
+      - on success, triggers a 'created' event which provides the final
+        wl_buffer to the client.
+      - on failure, triggers a 'failed' event to convey that the server
+        cannot use the dmabufs received from the client.
+
+      For the 'create_immed' request,
+      - on success, the server immediately imports the added dmabufs to
+        create a wl_buffer. No event is sent from the server in this case.
+      - on failure, the server can choose to either:
+        - terminate the client by raising a fatal error.
+        - mark the wl_buffer as failed, and send a 'failed' event to the
+          client. If the client uses a failed wl_buffer as an argument to any
+          request, the behaviour is compositor implementation-defined.
+
+      For all DRM formats and unless specified in another protocol extension,
+      pre-multiplied alpha is used for pixel values.
+
+      Unless specified otherwise in another protocol extension, implicit
+      synchronization is used. In other words, compositors and clients must
+      wait and signal fences implicitly passed via the DMA-BUF's reservation
+      mechanism.
+    </description>
+
+    <request name="destroy" type="destructor">
+      <description summary="unbind the factory">
+        Objects created through this interface, especially wl_buffers, will
+        remain valid.
+      </description>
+    </request>
+
+    <request name="create_params">
+      <description summary="create a temporary object for buffer parameters">
+        This temporary object is used to collect multiple dmabuf handles into
+        a single batch to create a wl_buffer. It can only be used once and
+        should be destroyed after a 'created' or 'failed' event has been
+        received.
+      </description>
+      <arg name="params_id" type="new_id" interface="zwp_linux_buffer_params_v1"
+           summary="the new temporary"/>
+    </request>
+
+    <event name="format" deprecated-since="4">
+      <description summary="supported buffer format">
+        This event advertises one buffer format that the server supports.
+        All the supported formats are advertised once when the client
+        binds to this interface. A roundtrip after binding guarantees
+        that the client has received all supported formats.
+
+        For the definition of the format codes, see the
+        zwp_linux_buffer_params_v1::create request.
+
+        Starting version 4, the format event is deprecated and must not be
+        sent by compositors. Instead, use get_default_feedback or
+        get_surface_feedback.
+      </description>
+      <arg name="format" type="uint" summary="DRM_FORMAT code"/>
+    </event>
+
+    <event name="modifier" since="3" deprecated-since="4">
+      <description summary="supported buffer format modifier">
+        This event advertises the formats that the server supports, along with
+        the modifiers supported for each format. All the supported modifiers
+        for all the supported formats are advertised once when the client
+        binds to this interface. A roundtrip after binding guarantees that
+        the client has received all supported format-modifier pairs.
+
+        For legacy support, DRM_FORMAT_MOD_INVALID (that is, modifier_hi ==
+        0x00ffffff and modifier_lo == 0xffffffff) is allowed in this event.
+        It indicates that the server can support the format with an implicit
+        modifier. When a plane has DRM_FORMAT_MOD_INVALID as its modifier, it
+        is as if no explicit modifier is specified. The effective modifier
+        will be derived from the dmabuf.
+
+        A compositor that sends valid modifiers and DRM_FORMAT_MOD_INVALID for
+        a given format supports both explicit modifiers and implicit modifiers.
+
+        For the definition of the format and modifier codes, see the
+        zwp_linux_buffer_params_v1::create and zwp_linux_buffer_params_v1::add
+        requests.
+
+        Starting version 4, the modifier event is deprecated and must not be
+        sent by compositors. Instead, use get_default_feedback or
+        get_surface_feedback.
+      </description>
+      <arg name="format" type="uint" summary="DRM_FORMAT code"/>
+      <arg name="modifier_hi" type="uint"
+           summary="high 32 bits of layout modifier"/>
+      <arg name="modifier_lo" type="uint"
+           summary="low 32 bits of layout modifier"/>
+    </event>
+
+    <!-- Version 4 additions -->
+
+    <request name="get_default_feedback" since="4">
+      <description summary="get default feedback">
+        This request creates a new zwp_linux_dmabuf_feedback_v1 object not
+        bound to a particular surface. This object will deliver feedback about
+        dmabuf parameters to use if the client doesn't support per-surface
+        feedback (see get_surface_feedback).
+      </description>
+      <arg name="id" type="new_id" interface="zwp_linux_dmabuf_feedback_v1"/>
+    </request>
+
+    <request name="get_surface_feedback" since="4">
+      <description summary="get feedback for a surface">
+        This request creates a new zwp_linux_dmabuf_feedback_v1 object for
+        the specified wl_surface. This object will deliver feedback about
+        dmabuf parameters to use for buffers attached to this surface.
+
+        If the surface is destroyed before the zwp_linux_dmabuf_feedback_v1
+        object, the feedback object becomes inert.
+      </description>
+      <arg name="id" type="new_id" interface="zwp_linux_dmabuf_feedback_v1"/>
+      <arg name="surface" type="object" interface="wl_surface"/>
+    </request>
+  </interface>
+
+  <interface name="zwp_linux_buffer_params_v1" version="5">
+    <description summary="parameters for creating a dmabuf-based wl_buffer">
+      This temporary object is a collection of dmabufs and other
+      parameters that together form a single logical buffer. The temporary
+      object may eventually create one wl_buffer unless cancelled by
+      destroying it before requesting 'create'.
+
+      Single-planar formats only require one dmabuf, however
+      multi-planar formats may require more than one dmabuf. For all
+      formats, an 'add' request must be called once per plane (even if the
+      underlying dmabuf fd is identical).
+
+      You must use consecutive plane indices ('plane_idx' argument for 'add')
+      from zero to the number of planes used by the drm_fourcc format code.
+      All planes required by the format must be given exactly once, but can
+      be given in any order. Each plane index can only be set once; subsequent
+      calls with a plane index which has already been set will result in a
+      plane_set error being generated.
+    </description>
+
+    <enum name="error">
+      <entry name="already_used" value="0"
+             summary="the dmabuf_batch object has already been used to create a wl_buffer"/>
+      <entry name="plane_idx" value="1"
+             summary="plane index out of bounds"/>
+      <entry name="plane_set" value="2"
+             summary="the plane index was already set"/>
+      <entry name="incomplete" value="3"
+             summary="missing or too many planes to create a buffer"/>
+      <entry name="invalid_format" value="4"
+             summary="format not supported"/>
+      <entry name="invalid_dimensions" value="5"
+             summary="invalid width or height"/>
+      <entry name="out_of_bounds" value="6"
+             summary="offset + stride * height goes out of dmabuf bounds"/>
+      <entry name="invalid_wl_buffer" value="7"
+             summary="invalid wl_buffer resulted from importing dmabufs via
+               the create_immed request on given buffer_params"/>
+    </enum>
+
+    <request name="destroy" type="destructor">
+      <description summary="delete this object, used or not">
+        Cleans up the temporary data sent to the server for dmabuf-based
+        wl_buffer creation.
+      </description>
+    </request>
+
+    <request name="add">
+      <description summary="add a dmabuf to the temporary set">
+        This request adds one dmabuf to the set in this
+        zwp_linux_buffer_params_v1.
+
+        The 64-bit unsigned value combined from modifier_hi and modifier_lo
+        is the dmabuf layout modifier. DRM AddFB2 ioctl calls this the
+        fb modifier, which is defined in drm_mode.h of Linux UAPI.
+        This is an opaque token. Drivers use this token to express tiling,
+        compression, etc. driver-specific modifications to the base format
+        defined by the DRM fourcc code.
+
+        Starting from version 4, the invalid_format protocol error is sent if
+        the format + modifier pair was not advertised as supported.
+
+        Starting from version 5, the invalid_format protocol error is sent if
+        all planes don't use the same modifier.
+
+        This request raises the PLANE_IDX error if plane_idx is too large.
+        The error PLANE_SET is raised if attempting to set a plane that
+        was already set.
+      </description>
+      <arg name="fd" type="fd" summary="dmabuf fd"/>
+      <arg name="plane_idx" type="uint" summary="plane index"/>
+      <arg name="offset" type="uint" summary="offset in bytes"/>
+      <arg name="stride" type="uint" summary="stride in bytes"/>
+      <arg name="modifier_hi" type="uint"
+           summary="high 32 bits of layout modifier"/>
+      <arg name="modifier_lo" type="uint"
+           summary="low 32 bits of layout modifier"/>
+    </request>
+
+    <enum name="flags" bitfield="true">
+      <entry name="y_invert" value="1" summary="contents are y-inverted"/>
+      <entry name="interlaced" value="2" summary="content is interlaced"/>
+      <entry name="bottom_first" value="4" summary="bottom field first"/>
+    </enum>
+
+    <request name="create">
+      <description summary="create a wl_buffer from the given dmabufs">
+        This asks for creation of a wl_buffer from the added dmabuf
+        buffers. The wl_buffer is not created immediately but returned via
+        the 'created' event if the dmabuf sharing succeeds. The sharing
+        may fail at runtime for reasons a client cannot predict, in
+        which case the 'failed' event is triggered.
+
+        The 'format' argument is a DRM_FORMAT code, as defined by the
+        libdrm's drm_fourcc.h. The Linux kernel's DRM sub-system is the
+        authoritative source on how the format codes should work.
+
+        The 'flags' is a bitfield of the flags defined in enum "flags".
+        'y_invert' means that the image needs to be y-flipped.
+
+        Flag 'interlaced' means that the frame in the buffer is not
+        progressive as usual, but interlaced. An interlaced buffer as
+        supported here must always contain both top and bottom fields.
+        The top field always begins on the first pixel row. The temporal
+        ordering between the two fields is top field first, unless
+        'bottom_first' is specified. It is undefined whether 'bottom_first'
+        is ignored if 'interlaced' is not set.
+
+        This protocol does not convey any information about field rate,
+        duration, or timing, other than the relative ordering between the
+        two fields in one buffer. A compositor may have to estimate the
+        intended field rate from the incoming buffer rate. It is undefined
+        whether the time of receiving wl_surface.commit with a new buffer
+        attached, applying the wl_surface state, wl_surface.frame callback
+        trigger, presentation, or any other point in the compositor cycle
+        is used to measure the frame or field times. There is no support
+        for detecting missed or late frames/fields/buffers either, and
+        there is no support whatsoever for cooperating with interlaced
+        compositor output.
+
+        The composited image quality resulting from the use of interlaced
+        buffers is explicitly undefined. A compositor may use elaborate
+        hardware features or software to deinterlace and create progressive
+        output frames from a sequence of interlaced input buffers, or it
+        may produce substandard image quality. However, compositors that
+        cannot guarantee reasonable image quality in all cases are recommended
+        to just reject all interlaced buffers.
+
+        Any argument errors, including non-positive width or height,
+        mismatch between the number of planes and the format, bad
+        format, bad offset or stride, may be indicated by fatal protocol
+        errors: INCOMPLETE, INVALID_FORMAT, INVALID_DIMENSIONS,
+        OUT_OF_BOUNDS.
+
+        Dmabuf import errors in the server that are not obvious client
+        bugs are returned via the 'failed' event as non-fatal. This
+        allows attempting dmabuf sharing and falling back in the client
+        if it fails.
+
+        This request can be sent only once in the object's lifetime, after
+        which the only legal request is destroy. This object should be
+        destroyed after issuing a 'create' request. Attempting to use this
+        object after issuing 'create' raises ALREADY_USED protocol error.
+
+        It is not mandatory to issue 'create'. If a client wants to
+        cancel the buffer creation, it can just destroy this object.
+      </description>
+      <arg name="width" type="int" summary="base plane width in pixels"/>
+      <arg name="height" type="int" summary="base plane height in pixels"/>
+      <arg name="format" type="uint" summary="DRM_FORMAT code"/>
+      <arg name="flags" type="uint" enum="flags" summary="see enum flags"/>
+    </request>
+
+    <event name="created">
+      <description summary="buffer creation succeeded">
+        This event indicates that the attempted buffer creation was
+        successful. It provides the new wl_buffer referencing the dmabuf(s).
+
+        Upon receiving this event, the client should destroy the
+        zwp_linux_buffer_params_v1 object.
+      </description>
+      <arg name="buffer" type="new_id" interface="wl_buffer"
+           summary="the newly created wl_buffer"/>
+    </event>
+
+    <event name="failed">
+      <description summary="buffer creation failed">
+        This event indicates that the attempted buffer creation has
+        failed. It usually means that one of the dmabuf constraints
+        has not been fulfilled.
+
+        Upon receiving this event, the client should destroy the
+        zwp_linux_buffer_params_v1 object.
+      </description>
+    </event>
+
+    <request name="create_immed" since="2">
+      <description summary="immediately create a wl_buffer from the given
+                     dmabufs">
+        This asks for immediate creation of a wl_buffer by importing the
+        added dmabufs.
+
+        In case of import success, no event is sent from the server, and the
+        wl_buffer is ready to be used by the client.
+
+        Upon import failure, either of the following may happen, as seen fit
+        by the implementation:
+        - the client is terminated with one of the following fatal protocol
+          errors:
+          - INCOMPLETE, INVALID_FORMAT, INVALID_DIMENSIONS, OUT_OF_BOUNDS,
+            in case of argument errors such as mismatch between the number
+            of planes and the format, bad format, non-positive width or
+            height, or bad offset or stride.
+          - INVALID_WL_BUFFER, in case the cause for failure is unknown or
+            platform specific.
+        - the server creates an invalid wl_buffer, marks it as failed and
+          sends a 'failed' event to the client. The result of using this
+          invalid wl_buffer as an argument in any request by the client is
+          defined by the compositor implementation.
+
+        This takes the same arguments as a 'create' request, and obeys the
+        same restrictions.
+      </description>
+      <arg name="buffer_id" type="new_id" interface="wl_buffer"
+           summary="id for the newly created wl_buffer"/>
+      <arg name="width" type="int" summary="base plane width in pixels"/>
+      <arg name="height" type="int" summary="base plane height in pixels"/>
+      <arg name="format" type="uint" summary="DRM_FORMAT code"/>
+      <arg name="flags" type="uint" enum="flags" summary="see enum flags"/>
+    </request>
+  </interface>
+
+  <interface name="zwp_linux_dmabuf_feedback_v1" version="5">
+    <description summary="dmabuf feedback">
+      This object advertises dmabuf parameters feedback. This includes the
+      preferred devices and the supported formats/modifiers.
+
+      The parameters are sent once when this object is created and whenever they
+      change. The done event is always sent once after all parameters have been
+      sent. When a single parameter changes, all parameters are re-sent by the
+      compositor.
+
+      Compositors can re-send the parameters when the current client buffer
+      allocations are sub-optimal. Compositors should not re-send the
+      parameters if re-allocating the buffers would not result in a more optimal
+      configuration. In particular, compositors should avoid sending the exact
+      same parameters multiple times in a row.
+
+      The tranche_target_device and tranche_formats events are grouped by
+      tranches of preference. For each tranche, a tranche_target_device, one
+      tranche_flags and one or more tranche_formats events are sent, followed
+      by a tranche_done event finishing the list. The tranches are sent in
+      descending order of preference. All formats and modifiers in the same
+      tranche have the same preference.
+
+      To send parameters, the compositor sends one main_device event, tranches
+      (each consisting of one tranche_target_device event, one tranche_flags
+      event, tranche_formats events and then a tranche_done event), then one
+      done event.
+    </description>
+
+    <request name="destroy" type="destructor">
+      <description summary="destroy the feedback object">
+        Using this request a client can tell the server that it is not going to
+        use the zwp_linux_dmabuf_feedback_v1 object anymore.
+      </description>
+    </request>
+
+    <event name="done">
+      <description summary="all feedback has been sent">
+        This event is sent after all parameters of a
+        zwp_linux_dmabuf_feedback_v1 object have been sent.
+
+        This allows changes to the zwp_linux_dmabuf_feedback_v1 parameters to
+        be seen as atomic, even if they happen via multiple events.
+      </description>
+    </event>
+
+    <event name="format_table">
+      <description summary="format and modifier table">
+        This event provides a file descriptor which can be memory-mapped to
+        access the format and modifier table.
+
+        The table contains a tightly packed array of consecutive format +
+        modifier pairs. Each pair is 16 bytes wide. It contains a format as a
+        32-bit unsigned integer, followed by 4 bytes of unused padding, and a
+        modifier as a 64-bit unsigned integer. The native endianness is used.
+
+        The client must map the file descriptor in read-only private mode.
+
+        Compositors are not allowed to mutate the table file contents once this
+        event has been sent. Instead, compositors must create a new, separate
+        table file and re-send feedback parameters. Compositors are allowed to
+        store duplicate format + modifier pairs in the table.
+      </description>
+      <arg name="fd" type="fd" summary="table file descriptor"/>
+      <arg name="size" type="uint" summary="table size, in bytes"/>
+    </event>
+
+    <event name="main_device">
+      <description summary="preferred main device">
+        This event advertises the main device that the server prefers to use
+        when direct scan-out to the target device isn't possible. The
+        advertised main device may be different for each
+        zwp_linux_dmabuf_feedback_v1 object, and may change over time.
+
+        There is exactly one main device. The compositor must send at least
+        one preference tranche with tranche_target_device equal to main_device.
+
+        Clients need to create buffers that the main device can import and
+        read from, otherwise creating the dmabuf wl_buffer will fail (see the
+        zwp_linux_buffer_params_v1.create and create_immed requests for details).
+        The main device will also likely be kept active by the compositor,
+        so clients can use it instead of waking up another device for power
+        savings.
+
+        In general the device is a DRM node. The DRM node type (primary vs.
+        render) is unspecified. Clients must not rely on the compositor sending
+        a particular node type. Clients cannot check two devices for equality
+        by comparing the dev_t value.
+
+        If explicit modifiers are not supported and the client performs buffer
+        allocations on a different device than the main device, then the client
+        must force the buffer to have a linear layout.
+      </description>
+      <arg name="device" type="array" summary="device dev_t value"/>
+    </event>
+
+    <event name="tranche_done">
+      <description summary="a preference tranche has been sent">
+        This event splits tranche_target_device and tranche_formats events in
+        preference tranches. It is sent after a set of tranche_target_device
+        and tranche_formats events; it represents the end of a tranche. The
+        next tranche will have a lower preference.
+      </description>
+    </event>
+
+    <event name="tranche_target_device">
+      <description summary="target device">
+        This event advertises the target device that the server prefers to use
+        for a buffer created given this tranche. The advertised target device
+        may be different for each preference tranche, and may change over time.
+
+        There is exactly one target device per tranche.
+
+        The target device may be a scan-out device, for example if the
+        compositor prefers to directly scan-out a buffer created given this
+        tranche. The target device may be a rendering device, for example if
+        the compositor prefers to texture from said buffer.
+
+        The client can use this hint to allocate the buffer in a way that makes
+        it accessible from the target device, ideally directly. The buffer must
+        still be accessible from the main device, either through direct import
+        or through a potentially more expensive fallback path. If the buffer
+        can't be directly imported from the main device then clients must be
+        prepared for the compositor changing the tranche priority or making
+        wl_buffer creation fail (see the zwp_linux_buffer_params_v1.create
+        and create_immed requests for details).
+
+        If the device is a DRM node, the DRM node type (primary vs. render) is
+        unspecified. Clients must not rely on the compositor sending a
+        particular node type. Clients cannot check two devices for equality by
+        comparing the dev_t value.
+
+        This event is tied to a preference tranche, see the tranche_done event.
+      </description>
+      <arg name="device" type="array" summary="device dev_t value"/>
+    </event>
+
+    <event name="tranche_formats">
+      <description summary="supported buffer format modifier">
+        This event advertises the format + modifier combinations that the
+        compositor supports.
+
+        It carries an array of indices, each referring to a format + modifier
+        pair in the last received format table (see the format_table event).
+        Each index is a 16-bit unsigned integer in native endianness.
+
+        For legacy support, DRM_FORMAT_MOD_INVALID is an allowed modifier.
+        It indicates that the server can support the format with an implicit
+        modifier. When a buffer has DRM_FORMAT_MOD_INVALID as its modifier, it
+        is as if no explicit modifier is specified. The effective modifier
+        will be derived from the dmabuf.
+
+        A compositor that sends valid modifiers and DRM_FORMAT_MOD_INVALID for
+        a given format supports both explicit modifiers and implicit modifiers.
+
+        Compositors must not send duplicate format + modifier pairs within the
+        same tranche or across two different tranches with the same target
+        device and flags.
+
+        This event is tied to a preference tranche, see the tranche_done event.
+
+        For the definition of the format and modifier codes, see the
+        zwp_linux_buffer_params_v1.create request.
+      </description>
+      <arg name="indices" type="array" summary="array of 16-bit indexes"/>
+    </event>
+
+    <enum name="tranche_flags" bitfield="true">
+      <entry name="scanout" value="1" summary="direct scan-out tranche"/>
+    </enum>
+
+    <event name="tranche_flags">
+      <description summary="tranche flags">
+        This event sets tranche-specific flags.
+
+        The scanout flag is a hint that direct scan-out may be attempted by the
+        compositor on the target device if the client appropriately allocates a
+        buffer. How to allocate a buffer that can be scanned out on the target
+        device is implementation-defined.
+
+        This event is tied to a preference tranche, see the tranche_done event.
+      </description>
+      <arg name="flags" type="uint" enum="tranche_flags" summary="tranche flags"/>
+    </event>
+  </interface>
+
+</protocol>
--- a/qt/protocols/viewporter.xml
+++ b/qt/protocols/viewporter.xml
@ -0,0 +1,177 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<protocol name="viewporter">
+
+  <copyright>
+    Copyright © 2013-2016 Collabora, Ltd.
+
+    Permission is hereby granted, free of charge, to any person obtaining a
+    copy of this software and associated documentation files (the "Software"),
+    to deal in the Software without restriction, including without limitation
+    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+    and/or sell copies of the Software, and to permit persons to whom the
+    Software is furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice (including the next
+    paragraph) shall be included in all copies or substantial portions of the
+    Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+  </copyright>
+
+  <interface name="wp_viewporter" version="1">
+    <description summary="surface cropping and scaling">
+      The global interface exposing surface cropping and scaling
+      capabilities is used to instantiate an interface extension for a
+      wl_surface object. This extended interface will then allow
+      cropping and scaling the surface contents, effectively
+      disconnecting the direct relationship between the buffer and the
+      surface size.
+    </description>
+
+    <request name="destroy" type="destructor">
+      <description summary="unbind from the cropping and scaling interface">
+	Informs the server that the client will not be using this
+	protocol object anymore. This does not affect any other objects,
+	wp_viewport objects included.
+      </description>
+    </request>
+
+    <enum name="error">
+      <entry name="viewport_exists" value="0"
+             summary="the surface already has a viewport object associated"/>
+    </enum>
+
+    <request name="get_viewport">
+      <description summary="extend surface interface for crop and scale">
+	Instantiate an interface extension for the given wl_surface to
+	crop and scale its content. If the given wl_surface already has
+	a wp_viewport object associated, the viewport_exists
+	protocol error is raised.
+      </description>
+      <arg name="id" type="new_id" interface="wp_viewport"
+           summary="the new viewport interface id"/>
+      <arg name="surface" type="object" interface="wl_surface"
+           summary="the surface"/>
+    </request>
+  </interface>
+
+  <interface name="wp_viewport" version="1">
+    <description summary="crop and scale interface to a wl_surface">
+      An additional interface to a wl_surface object, which allows the
+      client to specify the cropping and scaling of the surface
+      contents.
+
+      This interface works with two concepts: the source rectangle (src_x,
+      src_y, src_width, src_height), and the destination size (dst_width,
+      dst_height). The contents of the source rectangle are scaled to the
+      destination size, and content outside the source rectangle is ignored.
+      This state is double-buffered, see wl_surface.commit.
+
+      The two parts of crop and scale state are independent: the source
+      rectangle, and the destination size. Initially both are unset, that
+      is, no scaling is applied. The whole of the current wl_buffer is
+      used as the source, and the surface size is as defined in
+      wl_surface.attach.
+
+      If the destination size is set, it causes the surface size to become
+      dst_width, dst_height. The source (rectangle) is scaled to exactly
+      this size. This overrides whatever the attached wl_buffer size is,
+      unless the wl_buffer is NULL. If the wl_buffer is NULL, the surface
+      has no content and therefore no size. Otherwise, the size is always
+      at least 1x1 in surface local coordinates.
+
+      If the source rectangle is set, it defines what area of the wl_buffer is
+      taken as the source. If the source rectangle is set and the destination
+      size is not set, then src_width and src_height must be integers, and the
+      surface size becomes the source rectangle size. This results in cropping
+      without scaling. If src_width or src_height are not integers and
+      destination size is not set, the bad_size protocol error is raised when
+      the surface state is applied.
+
+      The coordinate transformations from buffer pixel coordinates up to
+      the surface-local coordinates happen in the following order:
+        1. buffer_transform (wl_surface.set_buffer_transform)
+        2. buffer_scale (wl_surface.set_buffer_scale)
+        3. crop and scale (wp_viewport.set*)
+      This means, that the source rectangle coordinates of crop and scale
+      are given in the coordinates after the buffer transform and scale,
+      i.e. in the coordinates that would be the surface-local coordinates
+      if the crop and scale was not applied.
+
+      If src_x or src_y are negative, the bad_value protocol error is raised.
+      Otherwise, if the source rectangle is partially or completely outside of
+      the non-NULL wl_buffer, then the out_of_buffer protocol error is raised
+      when the surface state is applied. A NULL wl_buffer does not raise the
+      out_of_buffer error.
+
+      If the wl_surface associated with the wp_viewport is destroyed,
+      all wp_viewport requests except 'destroy' raise the protocol error
+      no_surface.
+
+      If the wp_viewport object is destroyed, the crop and scale
+      state is removed from the wl_surface. The change will be applied
+      on the next wl_surface.commit.
+    </description>
+
+    <request name="destroy" type="destructor">
+      <description summary="remove scaling and cropping from the surface">
+	The associated wl_surface's crop and scale state is removed.
+	The change is applied on the next wl_surface.commit.
+      </description>
+    </request>
+
+    <enum name="error">
+      <entry name="bad_value" value="0"
+	     summary="negative or zero values in width or height"/>
+      <entry name="bad_size" value="1"
+	     summary="destination size is not integer"/>
+      <entry name="out_of_buffer" value="2"
+	     summary="source rectangle extends outside of the content area"/>
+      <entry name="no_surface" value="3"
+	     summary="the wl_surface was destroyed"/>
+    </enum>
+
+    <request name="set_source">
+      <description summary="set the source rectangle for cropping">
+	Set the source rectangle of the associated wl_surface. See
+	wp_viewport for the description, and relation to the wl_buffer
+	size.
+
+	If all of x, y, width and height are -1.0, the source rectangle is
+	unset instead. Any other set of values where width or height are zero
+	or negative, or x or y are negative, raise the bad_value protocol
+	error.
+
+	The crop and scale state is double-buffered, see wl_surface.commit.
+      </description>
+      <arg name="x" type="fixed" summary="source rectangle x"/>
+      <arg name="y" type="fixed" summary="source rectangle y"/>
+      <arg name="width" type="fixed" summary="source rectangle width"/>
+      <arg name="height" type="fixed" summary="source rectangle height"/>
+    </request>
+
+    <request name="set_destination">
+      <description summary="set the surface size for scaling">
+	Set the destination size of the associated wl_surface. See
+	wp_viewport for the description, and relation to the wl_buffer
+	size.
+
+	If width is -1 and height is -1, the destination size is unset
+	instead. Any other pair of values for width and height that
+	contains zero or negative values raises the bad_value protocol
+	error.
+
+	The crop and scale state is double-buffered, see wl_surface.commit.
+      </description>
+      <arg name="width" type="int" summary="surface width"/>
+      <arg name="height" type="int" summary="surface height"/>
+    </request>
+  </interface>
+
+</protocol>
--- a/qt/src/GhosttySurface.cpp
+++ b/qt/src/GhosttySurface.cpp
--- a/qt/src/GhosttySurface.h
+++ b/qt/src/GhosttySurface.h
@ -1,14 +1,30 @@
 #pragma once

 #include <atomic>
+#include <condition_variable>
+#include <cstdint>
+#include <memory>
+#include <mutex>

 #include <QImage>
+#include <QMutex>
 #include <QPointer>
 #include <QString>
 #include <QStringList>
+#include <QTimer>
 #include <QWidget>

 #include "ghostty.h"
+#include "vulkan/Host.h"
+
+namespace wayland {
+class SubsurfacePresenter;
+}
+#ifndef GHASTTY_USE_VULKAN
+namespace opengl {
+class EglDmabufTarget;
+}
+#endif

 class MainWindow;
 class QContextMenuEvent;
@ -31,15 +47,27 @@ class OverlayScrollbar;

 // One Ghostty terminal pane.
 //
-// libghostty's OpenGL renderer draws the terminal into an offscreen
-// framebuffer owned by a private QOpenGLContext (there is no on-screen
-// GL surface). Each frame is read back into a QImage and painted with
-// QPainter. That keeps this an ordinary translucent QWidget, so it
-// embeds in the QTabWidget / QSplitter tree and its transparent
-// background composites to the desktop exactly like the rest of the
-// widget chrome — avoiding QOpenGLWidget (composites opaque on Wayland)
-// and an embedded QOpenGLWindow (does not present when embedded).
-class GhosttySurface : public QWidget {
+// Terminal pixels reach the screen via a wl_subsurface attached to
+// the top-level QWindow's wl_surface (see wayland::SubsurfacePresenter).
+// libghostty's renderer (Vulkan or OpenGL, picked at compile time
+// via GHASTTY_USE_VULKAN) hands us a dmabuf fd per frame; we wrap
+// it in a wl_buffer via zwp_linux_dmabuf_v1 and the compositor
+// scans it out directly — no readback, no QPainter blit for the
+// terminal area. Each pane in a split is a sibling subsurface
+// under the same top-level wl_surface, positioned at its offset
+// within the top-level via setPosition.
+//
+// This QWidget itself keeps WA_TranslucentBackground so the
+// terminal area of the parent surface is transparent (the
+// subsurface below shows through) and chrome (SearchBar,
+// overlays, scrollbar) painted in paintEvent stays visible on top.
+//
+// Legacy fallback: if the compositor lacks the required Wayland
+// globals (linux-dmabuf-v1, viewporter, subcompositor) or the
+// renderer reports image_backed=false (NVIDIA Vulkan's
+// legacy_copy path on this branch), the frame goes through a
+// mmap+memcpy+QImage+QPainter::drawImage path instead.
+class GhosttySurface : public QWidget, public vulkan::PresentSink {
  Q_OBJECT

 public:
@ -143,10 +171,67 @@ public:
  void setPwd(const QString &pwd);
  const QString &pwd() const { return m_pwd; }

+  // Apprt-side entry point for the Vulkan `present` callback. Fires
+  // on the renderer thread. Parks the dmabuf descriptor under
+  // `m_pendingMutex` (plus, for the legacy fallback path, an
+  // mmap+memcpy'd QImage) and wakes the GUI thread via
+  // `QMetaObject::invokeMethod(this, drainVulkan, Qt::QueuedConnection)`.
+  // The GUI thread either commits the dmabuf to the wl_subsurface
+  // (zero-copy) or paints the QImage (fallback). The dropped-frame
+  // counter `m_droppedFrames` makes any genuine queue-loss visible
+  // (zero in the steady state).
+  void presentVulkanDmabuf(
+      int dmabuf_fd,
+      quint32 drm_format,
+      quint64 drm_modifier,
+      quint32 width,
+      quint32 height,
+      quint32 stride,
+      bool image_backed);
+
+  // Implements the `vulkan::PresentSink` interface by delegating,
+  // argument for argument, to `presentVulkanDmabuf`; the real work
+  // (and its documentation) lives on that method. `vulkan::Host`'s
+  // present-callback trampoline invokes this on the libghostty
+  // renderer thread.
+  void presentDmabuf(
+      int dmabuf_fd,
+      std::uint32_t drm_format,
+      std::uint64_t drm_modifier,
+      std::uint32_t width,
+      std::uint32_t height,
+      std::uint32_t stride,
+      bool image_backed) override {
+    presentVulkanDmabuf(
+        dmabuf_fd, drm_format, drm_modifier,
+        width, height, stride,
+        image_backed);
+  }
+
+  // GUI-thread drain step: hands the most recent pending frame
+  // either to the SubsurfacePresenter (zero-copy path) or the
+  // QImage paint pipeline (fallback). Idempotent: returns
+  // immediately if nothing's pending. Invoked from the queued
+  // invocations posted by the renderer thread and re-pumped from
+  // onWaylandFrameReady.
+  Q_INVOKABLE void drainVulkan();
+
+  // Compositor frame-callback handler. Fires (on the GUI thread,
+  // via Wayland event-queue dispatch) when the compositor signals
+  // it's ready to display our next commit. Clears the in-flight
+  // flag and re-pumps drainVulkan to consume any frame the renderer
+  // parked while we were waiting. Q_INVOKABLE so it can also be
+  // posted via QMetaObject::invokeMethod from a queued context.
+  Q_INVOKABLE void onWaylandFrameReady();
+
+  // Force a wl_surface.commit on our parent native window via the
+  // QtWaylandClient::QWaylandWindow private API. The wl_subsurface
+  // is in sync mode, so child state changes only apply when the
+  // parent commits — but Qt's backing-store flush doesn't fire for
+  // a translucent QWidget with no paint damage. Calling this after
+  // every child commit ensures the cached child state actually
+  // reaches the compositor. Returns false on non-Wayland QPA or if
+  // the cast fails (no Qt private headers available).
+  bool forceParentCommit();
+
 protected:
  bool event(QEvent *) override;
  void paintEvent(QPaintEvent *) override;
  void resizeEvent(QResizeEvent *) override;
+  void moveEvent(QMoveEvent *) override;

  // Disable Qt's Tab/Backtab focus traversal so those keys reach
  // keyPressEvent and can be forwarded to the terminal.
@ -207,19 +292,125 @@ private:
  ghostty_surface_t m_parentSurface;   // inherited-config source; may be null
  ghostty_surface_t m_surface = nullptr;

-  // Private offscreen GL context libghostty renders into.
+  // Private offscreen GL context libghostty renders into. Null for
+  // the Vulkan-backed renderer (libghostty hands frames back via a
+  // dmabuf fd to the apprt's `present` callback — no GL involved).
  QOpenGLContext *m_context = nullptr;
  QOffscreenSurface *m_offscreen = nullptr;
  QOpenGLFramebufferObject *m_fbo = nullptr;
+#ifndef GHASTTY_USE_VULKAN
+  // Dmabuf-exporting GL target (zero-copy path). Set when the EGL
+  // display advertises EGL_MESA_image_dma_buf_export and the
+  // wl_subsurface presenter is up; the renderer draws into this
+  // texture-backed framebuffer and we attach its fd straight to the
+  // subsurface — no glReadPixels, no QImage, no QPainter blit.
+  // Stays null when EGL support is missing or the subsurface failed
+  // to bring up, and the legacy m_fbo path runs as fallback.
+  //
+  // Vulkan-variant builds export dmabufs directly from
+  // VkDeviceMemory via VK_KHR_external_memory_fd and never touch
+  // EGL, so the field (and the entire EglDmabufTarget translation
+  // unit) is excluded from those binaries — matching the libEGL
+  // gating in qt/CMakeLists.txt.
+  std::unique_ptr<opengl::EglDmabufTarget> m_eglTarget;
+#endif
  QImage m_image;                      // last frame, read back from m_fbo

+  // True when this surface is using the Vulkan platform. The
+  // paintEvent uses this to draw a visible placeholder when no
+  // dmabuf has been imported yet; once
+  // `presentVulkanDmabuf` has filled `m_image` the placeholder
+  // gives way to the actual rendered content.
+  bool m_useVulkan = false;
+
+  // Cross-thread frame handoff for the Vulkan path. The renderer
+  // thread calls `presentVulkanDmabuf` with a borrowed dmabuf fd
+  // and posts a queued `drainVulkan` invocation; the GUI thread
+  // runs `drainVulkan` and routes the parked descriptor through
+  // either the wl_subsurface presenter (zero-copy) or the
+  // mmap+memcpy+QImage fallback. The dropped-frame counter
+  // (`m_droppedFrames`) surfaces any queue-loss that ever happens
+  // in practice — the earlier safety-net polling timer was
+  // removed once delivery was shown to be reliable.
+  //
+  // `m_useSubsurface` is set once on the GUI thread when the
+  // presenter comes up; the renderer thread reads it acquire-style
+  // to decide which path to populate per frame.
+  std::atomic<bool> m_useSubsurface{false};
+  // Subsurface (zero-copy) path: renderer thread parks the
+  // borrowed-fd descriptor here; drainVulkan, running on the GUI
+  // thread, hands it to the presenter.
+  struct PendingDmabuf {
+    int fd = -1;
+    quint32 drm_format = 0;
+    quint64 drm_modifier = 0;
+    quint32 width = 0;
+    quint32 height = 0;
+    quint32 stride = 0;
+  };
+  PendingDmabuf m_pendingDmabuf;
+  // Compositor-paced present gate. Now BACKPRESSURES THE RENDERER
+  // THREAD: presentVulkanDmabuf blocks (with a 100 ms safety
+  // timeout) until the compositor signals ready, so the renderer
+  // produces frames at the compositor's refresh rate instead of
+  // its own 125 FPS draw timer. Saves the GPU work + renderer-
+  // thread CPU that the prior GUI-side-drop model was paying for
+  // every wasted frame.
+  //
+  // State machine:
+  //   - Initial: ready=true (first present goes through).
+  //   - Renderer present: wait_for(ready || hidden); claim
+  //     ready=false; park dmabuf; post drain.
+  //   - GUI drain: consume + commit + register wl_surface.frame.
+  //   - Compositor frame_done → onWaylandFrameReady: ready=true,
+  //     notify CV. Renderer's next present unblocks immediately.
+  //   - Hide / PlatformSurface destroy: ready=true, notify_all to
+  //     unblock any in-flight renderer wait (predicate also checks
+  //     m_hidden so the renderer bails without parking).
+  std::mutex m_compositorMutex;
+  std::condition_variable m_compositorCv;
+  bool m_compositorReady = true;
+  // True once drainVulkan has successfully attached a dmabuf
+  // whose dimensions match the widget's current device-pixel
+  // size. paintEvent reads this to decide whether to fill the
+  // terminal area with the configured background color (hides
+  // the otherwise-transparent flash on new-tab open) or with
+  // Qt::transparent (lets the subsurface buffer show through).
+  // Reset to false on Hide and on PlatformSurface destroy so
+  // the next Show re-paints the placeholder until a real frame
+  // is attached.
+  std::atomic<bool> m_subsurfaceHasFrame{false};
+  // Dedupes queued drainVulkan invocations posted from the renderer
+  // thread. Each renderer-thread `presentVulkanDmabuf` used to post
+  // a QueuedConnection invokeMethod unconditionally — at 125 FPS
+  // that's 125 Qt-event-queue allocations + dispatches per second,
+  // most of which no-op now that the compositor gate may not yet
+  // be ready. CAS to true to claim the slot; drainVulkan resets to
+  // false before consuming so a follow-up renderer frame can
+  // schedule its own drain. The pending-dmabuf "latest wins"
+  // semantic guarantees the renderer's newest frame is what
+  // drainVulkan sees regardless of how many parks happened between.
+  std::atomic<bool> m_drainScheduled{false};
+  // Legacy (mmap+memcpy) path: kept as a fallback when the
+  // presenter isn't available (e.g. compositor missing
+  // linux-dmabuf-v1). When the subsurface path is active this stays
+  // null and paintEvent skips its blit.
+  QImage m_pending;
+  QMutex m_pendingMutex;
+
  // GL objects for the alpha-premultiply pass.
  QOpenGLShaderProgram *m_premultProg = nullptr;
  QOpenGLVertexArrayObject *m_premultVao = nullptr;

  int m_fbw = 0;                       // framebuffer size, device pixels
  int m_fbh = 0;
-  double m_fbDpr = 1.0;                // DPR the framebuffer was sized at
+  // DPR the framebuffer was sized at. Atomic because the renderer
+  // thread reads it from `presentVulkanDmabuf` to tag the legacy
+  // QImage path while the GUI thread writes it from
+  // `syncSurfaceSize`. A plain `double` shared between those
+  // threads without synchronization would be a data race;
+  // std::atomic<double> makes the concurrent load/store well-defined.
+  std::atomic<double> m_fbDpr{1.0};    // DPR the framebuffer was sized at

  QLabel *m_exitOverlay = nullptr;     // "process exited" banner; lazily made
  QLabel *m_keySeqOverlay = nullptr;   // pending keybind chord; lazily made
@ -268,4 +459,46 @@ private:
  // first PWD notification (libghostty fires one at spawn from the
  // inherited config, then on every cwd change).
  QString m_pwd;
+
+  // Wayland subsurface for the GPU-direct present path. Lazily
+  // created on first `QEvent::Show` once the top-level QWindow
+  // exists; null if the compositor lacks the required globals
+  // (linux-dmabuf-v1, viewporter, subcompositor), in which case
+  // the legacy mmap+memcpy+QImage+QPainter path renders pixels.
+  std::unique_ptr<wayland::SubsurfacePresenter> m_subsurfacePresenter;
+  // Per-surface latch for the first-dmabuf log breadcrumb so each
+  // pane / split prints its own line on first frame. Atomic because
+  // the renderer thread is what hits `presentVulkanDmabuf` and the
+  // first-frame check would otherwise race a sibling renderer
+  // thread on the same widget — relaxed CAS means at most one log
+  // line per surface, even under concurrent first frames.
+  std::atomic<bool> m_loggedFirstFrame{false};
+
+  // Count of frames overwritten in `m_pendingDmabuf` before the GUI
+  // thread drained them. Each overwrite is a missed compositor
+  // present — fd lifetime is unaffected (libghostty owns the
+  // dmabuf), but a sustained nonzero rate means the GUI thread is
+  // falling behind the renderer. Logged sparsely from
+  // `presentVulkanDmabuf`.
+  std::atomic<std::uint64_t> m_droppedFrames{0};
+  // Set true on QEvent::Hide, false on QEvent::Show. Guards the
+  // present path against a race where libghostty's renderer thread
+  // fires one more frame after we've detached the subsurface
+  // buffer on Hide — without this gate, that stray frame re-
+  // attaches a buffer and the now-inactive tab ghosts on top of
+  // whatever tab the user just switched to. `std::atomic` because
+  // the renderer thread reads it in `presentVulkanDmabuf` /
+  // `drainVulkan` while the GUI thread writes from event().
+  std::atomic<bool> m_hidden{false};
+
+  // Cache of the result of `dynamic_cast<QtWaylandClient::QWaylandWindow*>`
+  // for the top-level QWindow's QPA handle, used by
+  // `forceParentCommit`. The cast is non-trivial and the function
+  // is on the present hot path (called per Vulkan frame, per GL
+  // frame, per moveEvent, on Hide, etc.). Resolved on first
+  // successful call; invalidated whenever the platform-surface
+  // QWindow handle is recreated (PlatformSurfaceAboutToBeDestroyed
+  // event). Stored as void* so the header doesn't have to include
+  // any Qt private QPA headers; the .cpp casts back at use sites.
+  void *m_cachedWaylandWindow = nullptr;
 };
--- a/qt/src/XkbTracker.cpp
+++ b/qt/src/XkbTracker.cpp
@ -65,9 +65,17 @@ XkbTracker::XkbTracker() {
  if (m_keyboard == nullptr)
    wl_display_roundtrip_queue(display, queue);

-  // The keyboard proxy is hot — move it onto the default queue so
-  // Qt's event loop dispatches our listeners alongside Qt's own
-  // input events.
+  // The keyboard + seat proxies are long-lived — move them onto the
+  // default queue so Qt's event loop dispatches our listeners
+  // alongside Qt's own input events, AND so they don't dangle on
+  // the about-to-be-destroyed private queue. Failing to migrate the
+  // seat caused a SIGSEGV at process exit: libwayland warned
+  // ("queue X destroyed while proxies still attached: wl_seat#NN")
+  // and then later seat events / display teardown dereferenced the
+  // dead queue.
+  if (m_seat) {
+    wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(m_seat), nullptr);
+  }
  if (m_keyboard) {
    wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(m_keyboard), nullptr);
  }
@ -78,6 +86,7 @@ XkbTracker::~XkbTracker() {
  // Process-wide singleton; OS reclaims at exit. Explicit teardown
  // keeps leak checkers quiet and documents ownership.
  if (m_keyboard) wl_keyboard_destroy(m_keyboard);
+  if (m_seat) wl_seat_destroy(m_seat);
  if (m_state) xkb_state_unref(m_state);
  if (m_keymap) xkb_keymap_unref(m_keymap);
  if (m_ctx) xkb_context_unref(m_ctx);
@ -108,6 +117,12 @@ void XkbTracker::onRegistryGlobal(void *data, wl_registry *registry,
  auto *seat = static_cast<wl_seat *>(
      wl_registry_bind(registry, name, &wl_seat_interface, 5));
  if (!seat) return;
+  // Stash the seat on the tracker so it outlives this callback and
+  // its private-queue registry. wl_seat is a long-lived proxy: we
+  // keep the listener alive for the full process lifetime so future
+  // capability changes (keyboard hot-plug, layout change) flow into
+  // onSeatCapabilities and we can re-bind the wl_keyboard.
+  self->m_seat = seat;
  // Subscribe to capability changes; we'll grab the keyboard from
  // the capability callback once the seat tells us it has one.
  wl_seat_add_listener(seat, &kSeatListener, self);
--- a/qt/src/XkbTracker.h
+++ b/qt/src/XkbTracker.h
@ -94,6 +94,12 @@ class XkbTracker {
  // a keymap is loaded.
  uint32_t m_idxCapsLock = ~0u;
  uint32_t m_idxNumLock = ~0u;
+  // wl_seat handle, owned by us via wl_registry_bind. Kept alive for
+  // the singleton's lifetime so capability changes (keyboard
+  // hot-plug, layout switch) keep flowing to onSeatCapabilities, and
+  // so the proxy isn't dangling on the private registry queue we
+  // destroy at the end of the ctor.
+  struct wl_seat *m_seat = nullptr;
  // wl_keyboard handle, owned by us via wl_seat_get_keyboard.
  struct wl_keyboard *m_keyboard = nullptr;
 };
--- a/qt/src/actions/SystemActions.cpp
+++ b/qt/src/actions/SystemActions.cpp
@ -251,7 +251,7 @@ bool handleSystem(const Context &ctx, const ghostty_action_s &action) {
      // abnormal threshold (default 250ms). Banner = "the process
      // died unexpectedly," not "the process exited."
      uint32_t threshold = 250;
-      config::get(&threshold, "abnormal-command-exit-runtime");
+      (void)config::get(&threshold, "abnormal-command-exit-runtime");
      if (ce.runtime_ms < threshold) return true;
      const int code = static_cast<int>(ce.exit_code);
      post(src, [srcp, code]() {
--- a/qt/src/main.cpp
+++ b/qt/src/main.cpp
@ -1,4 +1,14 @@
 #include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+// (The atexit hook to ghastty_glslang_finalize_process that used
+// to live here was removed: now that build-time SPV precompile
+// is in place, the runtime libghostty no longer calls the glslang
+// shim at all for built-ins, so the shim's symbols get DCE'd out
+// of libghostty.so. The cosmetic FinalizeProcess+popAll cleanup
+// also didn't reduce heaptrack's reported leak in practice, so
+// the call wasn't pulling its weight anyway.)

 #include <QApplication>
 #include <QCoreApplication>
@ -22,7 +32,51 @@ static bool isCliActionInvocation(int argc, char **argv) {
  return false;
 }

+// Default-disable MangoHud for this process. The Vulkan implicit
+// layer hooks every vkQueueSubmit / vkAcquireNextImage / etc. to
+// render its own overlay, which on this branch's animated-shader
+// + multi-pane workload added ~25% extra main-thread CPU at idle
+// (measured against a baseline of ~10% for the Wayland-buffer
+// cache path). For a terminal, that's a steep tax on a feature
+// users typically associate with games. A system-wide MANGOHUD=1
+// (common in `~/.profile` for users who want the HUD on games) is
+// explicitly OVERRIDDEN here — the user is invoking ghastty, not
+// a game, and we don't want them to silently pay 25% extra CPU.
+//
+// Two layers of MangoHud's loading model:
+//   - VK_LOADER_LAYERS_DISABLE: Vulkan loader skips the layer
+//     entirely (no interception overhead).
+//   - DISABLE_MANGOHUD: belt-and-suspenders if the loader didn't
+//     honor the env var (older loaders) or another runtime force-
+//     loaded the layer through a different path.
+//
+// Escape hatch: GHASTTY_ALLOW_OVERLAY=1 skips the guard entirely
+// so a user who genuinely wants MangoHud on the terminal (e.g.
+// debugging the renderer with the HUD's frame-time graph) can
+// opt back in without removing the layer JSON system-wide.
+//
+// MANGOHUD and DISABLE_MANGOHUD are set with overwrite=1: the whole
+// point is to override a pre-existing MANGOHUD=1 /
+// DISABLE_MANGOHUD=0 / etc. VK_LOADER_LAYERS_DISABLE is handled
+// differently: it is a comma-delimited list of layer-name globs the
+// user may already be using to disable other layers, so the
+// MangoHud glob is appended to an existing value rather than
+// replacing it.
+static void defaultDisableMangoHud() {
+  if (const char *opt = ::getenv("GHASTTY_ALLOW_OVERLAY");
+      opt && opt[0] == '1') return;
+  ::setenv("MANGOHUD", "0", 1);
+  ::setenv("DISABLE_MANGOHUD", "1", 1);
+
+  static constexpr char kMangoHudGlob[] = "*MANGOHUD*";
+  const char *existing = ::getenv("VK_LOADER_LAYERS_DISABLE");
+  if (existing == nullptr || existing[0] == '\0') {
+    ::setenv("VK_LOADER_LAYERS_DISABLE", kMangoHudGlob, 1);
+    return;
+  }
+  // Glob already present (e.g. inherited from a parent ghastty
+  // process): leave the user's list exactly as it is.
+  if (std::strstr(existing, kMangoHudGlob) != nullptr) return;
+
+  // Build "<existing>,*MANGOHUD*". The merged string is assembled
+  // before calling setenv because setenv may invalidate the pointer
+  // getenv returned. sizeof(kMangoHudGlob) already counts the NUL;
+  // the extra 1 is the comma.
+  const std::size_t len =
+      std::strlen(existing) + 1 + sizeof(kMangoHudGlob);
+  char *merged = static_cast<char *>(std::malloc(len));
+  if (merged == nullptr) {
+    // Out of memory: fall back to the plain override so MangoHud is
+    // still disabled.
+    ::setenv("VK_LOADER_LAYERS_DISABLE", kMangoHudGlob, 1);
+    return;
+  }
+  std::snprintf(merged, len, "%s,%s", existing, kMangoHudGlob);
+  ::setenv("VK_LOADER_LAYERS_DISABLE", merged, 1);
+  std::free(merged);  // setenv copies its arguments
+}
+
 int main(int argc, char **argv) {
+  // Set the env BEFORE Qt's QApplication ctor (which can probe
+  // GL/Vulkan via QPA) and before the CLI action path (since
+  // libghostty action handlers may also touch the renderer).
+  defaultDisableMangoHud();
+
+  // (Build-time SPV precompile means the runtime libghostty no
+  // longer invokes glslang for built-in shaders, so the per-
+  // thread TPoolAllocator pages we used to leak from first-
+  // surface init don't exist on the Vulkan variant anymore. No
+  // atexit cleanup needed.)
+
  // CLI action fast path: skip Qt entirely. ghostty_init parses argv
  // for the `+action`; ghostty_cli_try_action runs it and exits the
  // process. If something fails (unknown action, multiple actions),
@ -104,6 +158,15 @@ int main(int argc, char **argv) {
    return 1;
  }

+  // The Vulkan host is intentionally NOT bootstrapped here: doing it
+  // before any window is mapped on Wayland can interact badly with
+  // Qt's Wayland integration (the VkInstance starts grabbing display
+  // resources before Qt has finished its own connection setup, and
+  // on some compositor + driver combos the result is a process that
+  // runs but never actually displays a window). It's brought up
+  // lazily on the first surface that needs it — see
+  // `GhosttySurface.cpp`.
+
  // initial-window: when false, start headless (no window mapped at
  // launch). Combined with quit-after-last-window-closed=false this
  // is how a user runs ghastty as a daemon for the global quick-
--- a/qt/src/opengl/EglDmabufTarget.cpp
+++ b/qt/src/opengl/EglDmabufTarget.cpp
@ -0,0 +1,275 @@
+#include "EglDmabufTarget.h"
+
+#include <cstdio>
+#include <cstring>
+#include <unistd.h>
+
+#include <QOpenGLContext>
+#include <QOpenGLFunctions>
+
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+
+namespace opengl {
+
+namespace {
+
+// EGL_MESA_image_dma_buf_export entry points (loaded once per
+// process). Resolved via `eglGetProcAddress`, which returns null if
+// the extension isn't present.
+using PFNeglExportDMABUFImageQueryMESA =
+    EGLBoolean (*)(EGLDisplay dpy, EGLImageKHR image, int *fourcc,
+                   int *num_planes, EGLuint64KHR *modifiers);
+using PFNeglExportDMABUFImageMESA =
+    EGLBoolean (*)(EGLDisplay dpy, EGLImageKHR image, int *fds,
+                   EGLint *strides, EGLint *offsets);
+
+// Resolved EGL extension entry points plus the state of the one-shot
+// resolution performed by `ensureEglFns`.
+struct EglFns {
+  PFNEGLCREATEIMAGEKHRPROC createImage = nullptr;
+  PFNEGLDESTROYIMAGEKHRPROC destroyImage = nullptr;
+  PFNeglExportDMABUFImageQueryMESA queryExport = nullptr;
+  PFNeglExportDMABUFImageMESA exportImage = nullptr;
+  // Set on the first `ensureEglFns` call, whether or not it succeeds.
+  bool resolved = false;
+  // True only once both extensions were found and all four pointers
+  // above resolved to non-null.
+  bool available = false;
+};
+
+// Accessor for the single function-local-static `EglFns` table.
+EglFns &eglFns() {
+  static EglFns table;
+  return table;
+}
+
+// Resolve the EGL entry points once and cache the verdict. Every call
+// after the first returns the cached result without touching EGL.
+bool ensureEglFns(EGLDisplay display) {
+  EglFns &fns = eglFns();
+  if (fns.resolved) return fns.available;
+  fns.resolved = true;
+
+  const char *extensionList = eglQueryString(display, EGL_EXTENSIONS);
+  if (extensionList == nullptr) return false;
+
+  // Walk the space-separated list token by token and compare whole
+  // tokens, so a name that is merely a prefix/suffix of another
+  // extension never matches.
+  const auto advertises = [extensionList](const char *wanted) -> bool {
+    const std::size_t wantedLen = std::strlen(wanted);
+    const char *cursor = extensionList;
+    while (*cursor != '\0') {
+      if (*cursor == ' ') {
+        ++cursor;
+        continue;
+      }
+      const char *tokenEnd = cursor;
+      while (*tokenEnd != '\0' && *tokenEnd != ' ') ++tokenEnd;
+      const std::size_t tokenLen = static_cast<std::size_t>(tokenEnd - cursor);
+      if (tokenLen == wantedLen && std::strncmp(cursor, wanted, wantedLen) == 0)
+        return true;
+      cursor = tokenEnd;
+    }
+    return false;
+  };
+
+  const bool hasImageBase = advertises("EGL_KHR_image_base");
+  const bool hasDmabufExport =
+      hasImageBase && advertises("EGL_MESA_image_dma_buf_export");
+  if (!hasDmabufExport) {
+    std::fprintf(stderr,
+                 "[ghastty] EglDmabufTarget: EGL display lacks "
+                 "EGL_KHR_image_base or EGL_MESA_image_dma_buf_export\n");
+    return false;
+  }
+
+  fns.createImage = reinterpret_cast<PFNEGLCREATEIMAGEKHRPROC>(
+      eglGetProcAddress("eglCreateImageKHR"));
+  fns.destroyImage = reinterpret_cast<PFNEGLDESTROYIMAGEKHRPROC>(
+      eglGetProcAddress("eglDestroyImageKHR"));
+  fns.queryExport = reinterpret_cast<PFNeglExportDMABUFImageQueryMESA>(
+      eglGetProcAddress("eglExportDMABUFImageQueryMESA"));
+  fns.exportImage = reinterpret_cast<PFNeglExportDMABUFImageMESA>(
+      eglGetProcAddress("eglExportDMABUFImageMESA"));
+
+  const bool allResolved = fns.createImage != nullptr &&
+                           fns.destroyImage != nullptr &&
+                           fns.queryExport != nullptr &&
+                           fns.exportImage != nullptr;
+  if (!allResolved) {
+    std::fprintf(stderr,
+                 "[ghastty] EglDmabufTarget: eglGetProcAddress returned "
+                 "null for required entry points\n");
+    return false;
+  }
+  fns.available = true;
+  return true;
+}
+
+// Thin wrapper over eglGetCurrentDisplay(); callers compare the result
+// against EGL_NO_DISPLAY.
+EGLDisplay currentEglDisplay() {
+  const EGLDisplay display = eglGetCurrentDisplay();
+  return display;
+}
+
+// GL constants come from <QOpenGLFunctions> indirectly via the Qt
+// GL headers — GL_TEXTURE_2D / GL_RGBA8 / GL_FRAMEBUFFER etc. are
+// in scope without further includes.
+
+} // namespace
+
+bool EglDmabufTarget::available(QOpenGLContext *ctx) {
+  // A null or invalid context can never export dmabufs.
+  if (ctx == nullptr || !ctx->isValid()) return false;
+
+  // The display is read from the calling thread's current EGL state,
+  // so a context must already be current when this runs.
+  const EGLDisplay display = currentEglDisplay();
+  if (display != EGL_NO_DISPLAY) return ensureEglFns(display);
+
+  std::fprintf(
+      stderr,
+      "[ghastty] EglDmabufTarget: no current EGL display (call after "
+      "QOpenGLContext::makeCurrent on a Wayland QPA)\n");
+  return false;
+}
+
+std::unique_ptr<EglDmabufTarget> EglDmabufTarget::create(QOpenGLContext *ctx,
+                                                          int width_px,
+                                                          int height_px) {
+  // Builds a dmabuf-exporting render target of `width_px` x `height_px`
+  // pixels. `ctx` must be current on the calling thread: the EGL
+  // display is read from the thread's current EGL state. Returns
+  // nullptr on any validation, EGL, or GL failure.
+  if (!ctx || !ctx->isValid()) return nullptr;
+  if (width_px <= 0 || height_px <= 0) return nullptr;
+  EGLDisplay dpy = currentEglDisplay();
+  if (dpy == EGL_NO_DISPLAY) return nullptr;
+  if (!ensureEglFns(dpy)) return nullptr;
+  const EglFns &fns = eglFns();
+  auto *gl = ctx->functions();
+  if (!gl) return nullptr;
+
+  // `target->m_*` is populated AS each resource is acquired; every
+  // failure path just returns nullptr and lets ~EglDmabufTarget
+  // release whatever has been stored so far. The destructor is the
+  // ONLY cleanup path — no manual glDelete* / ::close(fd) on early
+  // returns, so no branch can double-free or forget a resource.
+  auto target = std::unique_ptr<EglDmabufTarget>(new EglDmabufTarget());
+  target->m_eglDisplay = dpy;
+  target->m_width = width_px;
+  target->m_height = height_px;
+
+  // 1. Allocate a GL texture sized to the desired framebuffer.
+  unsigned int tex = 0;
+  gl->glGenTextures(1, &tex);
+  if (tex == 0) return nullptr;
+  target->m_texture = tex;
+  gl->glBindTexture(GL_TEXTURE_2D, tex);
+  gl->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  gl->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+  gl->glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width_px, height_px, 0, GL_RGBA,
+                   GL_UNSIGNED_BYTE, nullptr);
+  gl->glBindTexture(GL_TEXTURE_2D, 0);
+
+  // 2. Wrap as an EGLImage targeting the GL texture. Prefer the
+  //    context handle Qt reports for `ctx`; fall back to whatever EGL
+  //    context is current when the native interface is unavailable.
+  auto *eglIface = ctx->nativeInterface<QNativeInterface::QEGLContext>();
+  EGLContext eglCtx =
+      eglIface ? reinterpret_cast<EGLContext>(eglIface->nativeContext())
+               : eglGetCurrentContext();
+  EGLImageKHR img = fns.createImage(
+      dpy, eglCtx, EGL_GL_TEXTURE_2D_KHR,
+      reinterpret_cast<EGLClientBuffer>(static_cast<uintptr_t>(tex)), nullptr);
+  if (img == EGL_NO_IMAGE_KHR) {
+    std::fprintf(stderr,
+                 "[ghastty] EglDmabufTarget: eglCreateImageKHR failed (0x%x)\n",
+                 eglGetError());
+    return nullptr;
+  }
+  target->m_eglImage = img;
+
+  // 3. Query the export metadata (fourcc, plane count, modifier).
+  //    The query may write one modifier per plane, and the plane count
+  //    is only known AFTER the call, so the output buffer must be
+  //    sized for the worst case (DRM framebuffers carry at most 4
+  //    planes). Passing the address of a single value would let a
+  //    multi-plane image overrun the stack before we get a chance to
+  //    reject it below.
+  constexpr int kMaxDmabufPlanes = 4;
+  int fourcc = 0;
+  int num_planes = 0;
+  EGLuint64KHR modifiers[kMaxDmabufPlanes] = {};
+  if (!fns.queryExport(dpy, img, &fourcc, &num_planes, modifiers)) {
+    std::fprintf(stderr,
+                 "[ghastty] EglDmabufTarget: eglExportDMABUFImageQueryMESA "
+                 "failed (0x%x)\n",
+                 eglGetError());
+    return nullptr;
+  }
+  if (num_planes != 1) {
+    // Multi-plane modifiers need a wider present-callback ABI on the
+    // subsurface side; refuse cleanly so the caller falls back to the
+    // QImage path. This also guarantees the export call below writes
+    // exactly one fd / stride / offset.
+    std::fprintf(stderr,
+                 "[ghastty] EglDmabufTarget: refusing multi-plane export "
+                 "(num_planes=%d fourcc=0x%x mod=0x%llx)\n",
+                 num_planes, fourcc,
+                 static_cast<unsigned long long>(modifiers[0]));
+    return nullptr;
+  }
+  target->m_drmFormat = static_cast<std::uint32_t>(fourcc);
+  target->m_drmModifier = static_cast<std::uint64_t>(modifiers[0]);
+
+  // 4. Export the dmabuf fd + per-plane stride/offset.
+  int fd = -1;
+  EGLint stride = 0;
+  EGLint offset = 0;
+  if (!fns.exportImage(dpy, img, &fd, &stride, &offset) || fd < 0) {
+    std::fprintf(stderr,
+                 "[ghastty] EglDmabufTarget: eglExportDMABUFImageMESA failed "
+                 "(0x%x fd=%d)\n",
+                 eglGetError(), fd);
+    return nullptr;
+  }
+  // Ownership of the fd moves to `target` immediately; every return
+  // below relies on the destructor to close it.
+  target->m_fd = fd;
+  if (stride <= 0) {
+    // A non-positive stride would wrap when stored as unsigned.
+    std::fprintf(stderr,
+                 "[ghastty] EglDmabufTarget: invalid stride=%d from "
+                 "eglExportDMABUFImageMESA\n",
+                 stride);
+    return nullptr;
+  }
+  target->m_stride = static_cast<std::uint32_t>(stride);
+  // The `wayland::SubsurfacePresenter` present path hardcodes
+  // `offset = 0` when wrapping this fd in a wl_buffer (see
+  // SubsurfacePresenter.cpp's zwp_linux_buffer_params_v1_add call).
+  // Reject anything else loudly so an EGL implementation that returns
+  // a non-zero offset doesn't silently render at the wrong location.
+  if (offset != 0) {
+    std::fprintf(stderr,
+                 "[ghastty] EglDmabufTarget: unexpected non-zero offset=%d "
+                 "from eglExportDMABUFImageMESA; SubsurfacePresenter assumes "
+                 "offset=0 for single-plane LINEAR exports\n",
+                 offset);
+    return nullptr;
+  }
+
+  // 5. Attach to a framebuffer so libghostty can render into it.
+  unsigned int fbo = 0;
+  gl->glGenFramebuffers(1, &fbo);
+  if (fbo == 0) return nullptr;
+  target->m_framebuffer = fbo;
+  gl->glBindFramebuffer(GL_FRAMEBUFFER, fbo);
+  gl->glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
+                             GL_TEXTURE_2D, tex, 0);
+  const unsigned int status = gl->glCheckFramebufferStatus(GL_FRAMEBUFFER);
+  gl->glBindFramebuffer(GL_FRAMEBUFFER, 0);
+  if (status != GL_FRAMEBUFFER_COMPLETE) {
+    std::fprintf(stderr,
+                 "[ghastty] EglDmabufTarget: framebuffer incomplete (0x%x)\n",
+                 status);
+    return nullptr;
+  }
+
+  std::fprintf(stderr,
+               "[ghastty] EglDmabufTarget: %dx%d fd=%d fourcc=0x%x mod=0x%llx "
+               "stride=%u\n",
+               width_px, height_px, fd, target->m_drmFormat,
+               static_cast<unsigned long long>(target->m_drmModifier),
+               target->m_stride);
+  return target;
+}
+
+// Private default constructor: instances are produced by `create()`,
+// which fills in the members as resources are acquired.
+EglDmabufTarget::EglDmabufTarget() = default;
+
+EglDmabufTarget::~EglDmabufTarget() {
+  // GL objects are only deleted when some QOpenGLContext is current on
+  // this thread; the caller is expected to have made the owning
+  // context current (GhosttySurface does so via `makeCurrent` before
+  // tearing the target down).
+  if (QOpenGLContext *current = QOpenGLContext::currentContext()) {
+    auto *glFns = current->functions();
+    if (m_framebuffer != 0) glFns->glDeleteFramebuffers(1, &m_framebuffer);
+    if (m_texture != 0) glFns->glDeleteTextures(1, &m_texture);
+  }
+
+  // The EGLImage and the exported fd do not depend on a current GL
+  // context, so they are released unconditionally.
+  const bool haveImage = m_eglImage != nullptr && m_eglDisplay != nullptr;
+  if (haveImage) eglFns().destroyImage(m_eglDisplay, m_eglImage);
+
+  if (m_fd >= 0) ::close(m_fd);
+}
+
+void EglDmabufTarget::bind() const {
+  // Nothing to bind without a framebuffer or without a current context.
+  if (m_framebuffer == 0) return;
+  QOpenGLContext *current = QOpenGLContext::currentContext();
+  if (current == nullptr) return;
+  current->functions()->glBindFramebuffer(GL_FRAMEBUFFER, m_framebuffer);
+}
+
+void EglDmabufTarget::release() const {
+  // Rebind framebuffer 0 on the current context, if there is one.
+  if (QOpenGLContext *current = QOpenGLContext::currentContext()) {
+    current->functions()->glBindFramebuffer(GL_FRAMEBUFFER, 0);
+  }
+}
+
+} // namespace opengl
--- a/qt/src/opengl/EglDmabufTarget.h
+++ b/qt/src/opengl/EglDmabufTarget.h
@ -0,0 +1,87 @@
+// Dmabuf-exporting GL render target for the OpenGL present path.
+//
+// libghostty's GL renderer draws into a host-owned framebuffer (see
+// GhosttySurface's `m_fbo`). Today that framebuffer's pixels get
+// pulled back through `glReadPixels` (via `QOpenGLFramebufferObject::toImage`)
+// into a QImage, then re-uploaded to the QWidget backing store by
+// QPainter. After this class is wired in, the host instead allocates
+// a GL texture, wraps it as an `EGLImage` via `eglCreateImageKHR`,
+// exports its memory as a dmabuf via `eglExportDMABUFImageMESA`,
+// and attaches that texture to a GL framebuffer for libghostty to
+// draw into. The cached dmabuf fd / fourcc / modifier / stride are
+// then handed straight to the `wayland::SubsurfacePresenter` — same
+// zero-copy path the Vulkan renderer's Target uses, just sourced
+// from EGL instead of Vulkan.
+//
+// Requires `EGL_KHR_image_base` and `EGL_MESA_image_dma_buf_export`
+// (both checked by the static `available()` predicate). Wayland-only
+// by project decision.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+class QOpenGLContext;
+
+namespace opengl {
+
+// Owns one GL texture + framebuffer whose storage is exported as a
+// single-plane dmabuf. Non-copyable; built only through `create()`.
+class EglDmabufTarget {
+public:
+  // Detect at runtime whether the current EGL display advertises
+  // `EGL_KHR_image_base` and `EGL_MESA_image_dma_buf_export` and
+  // exposes the required entry points. `ctx` must be valid AND already
+  // current on the calling thread: the display is read from the
+  // thread's current EGL state, and the call returns false when there
+  // is none. The extension lookup is cached after the first call that
+  // reaches it.
+  static bool available(QOpenGLContext *ctx);
+
+  // Build a target of the given device-pixel size. Returns nullptr
+  // on any EGL / GL failure (caller falls back to the legacy
+  // QOpenGLFramebufferObject + toImage path), and also for a null or
+  // invalid `ctx` or a non-positive size. `ctx` must be current
+  // on the calling thread when called.
+  static std::unique_ptr<EglDmabufTarget> create(QOpenGLContext *ctx,
+                                                  int width_px,
+                                                  int height_px);
+
+  // Releases the framebuffer and texture (only if a QOpenGLContext is
+  // current on the destroying thread), the EGLImage, and the dmabuf fd.
+  ~EglDmabufTarget();
+
+  // Bind the framebuffer for draw operations. Caller is responsible
+  // for `glViewport` / `glClear` etc. Mirrors `QOpenGLFramebufferObject::bind`.
+  // Both calls are no-ops when no QOpenGLContext is current.
+  void bind() const;
+  // Rebinds framebuffer 0.
+  void release() const;
+
+  // Pixel + dmabuf metadata. Stable for the lifetime of this target;
+  // resize allocates a new target. `stride` is the value returned by
+  // `eglExportDMABUFImageMESA` for plane 0. The fd stays owned by this
+  // object and is closed in the destructor.
+  int width() const { return m_width; }
+  int height() const { return m_height; }
+  int fd() const { return m_fd; }
+  std::uint32_t drmFormat() const { return m_drmFormat; }
+  std::uint64_t drmModifier() const { return m_drmModifier; }
+  std::uint32_t stride() const { return m_stride; }
+  // Raw GL framebuffer object id for glBlitFramebuffer callers that
+  // need to write into the dmabuf-backed FBO from a different
+  // attached target (e.g. blitting from m_fbo with an inverted dst
+  // rect to flip Y, since the linux-dmabuf-v1 Y_INVERT flag is not
+  // universally supported).
+  unsigned int framebuffer() const { return m_framebuffer; }
+
+  EglDmabufTarget(const EglDmabufTarget &) = delete;
+  EglDmabufTarget &operator=(const EglDmabufTarget &) = delete;
+
+private:
+  // Only `create()` constructs instances.
+  EglDmabufTarget();
+
+  // Opaque to callers (and avoids leaking EGL/GL handle types into
+  // the header). The .cpp owns the EGLDisplay/EGLImage casts.
+  void *m_eglDisplay = nullptr;
+  void *m_eglImage = nullptr;
+  // 0 means "not created" for both GL object names.
+  unsigned int m_texture = 0;
+  unsigned int m_framebuffer = 0;
+  int m_width = 0;
+  int m_height = 0;
+  // -1 means "no fd held".
+  int m_fd = -1;
+  std::uint32_t m_drmFormat = 0;
+  std::uint64_t m_drmModifier = 0;
+  std::uint32_t m_stride = 0;
+};
+
+} // namespace opengl
--- a/qt/src/quickterm/QuickTerminal.cpp
+++ b/qt/src/quickterm/QuickTerminal.cpp
@ -6,17 +6,18 @@
 #include <QCursor>
 #include <QEasingCurve>
 #include <QGuiApplication>
-#include <QPropertyAnimation>
 #include <QScreen>
 #include <QSize>
 #include <QString>
 #include <QStringLiteral>
+#include <QVariantAnimation>
 #include <QWidget>
 #include <QWindow>

 #include <LayerShellQt/window.h>

 #include "../config/Config.h"
+#include "../wayland/AlphaModifier.h"
 #include "ghostty.h"

 namespace quickterm {
@ -43,14 +44,36 @@ int animationMs() {
  return std::clamp(static_cast<int>(secs * 1000.0), 1, 1000);
 }

+// Apply opacity to the window. Uses wp_alpha_modifier_v1 when the
+// compositor supports it (real per-surface alpha multiplier on the
+// compositor side); otherwise falls through to a no-op (the
+// animation still runs but the window just appears at the end —
+// previously this called QWindow::setOpacity which spammed
+// "This plugin does not support setting window opacity" warnings
+// on every animation tick because QtWayland's QPA plugin has no
+// implementation).
+void applyOpacity(QWidget *window, double opacity) {
+  // Without a QWindow handle there is nothing to apply the opacity to.
+  if (QWindow *nativeWindow = window->windowHandle()) {
+    wayland::AlphaModifier::setOpacity(nativeWindow, opacity);
+  }
+}
+
 // Lazily fetch (or build) the per-window opacity animation, parented
-// to `window` so its lifetime tracks the widget's.
-QPropertyAnimation *animFor(QWidget *window) {
-  auto *existing = window->property(kAnimProperty).value<QPropertyAnimation *>();
+// to `window` so its lifetime tracks the widget's. We use
+// QVariantAnimation (not QPropertyAnimation on windowOpacity) so
+// the per-tick value is delivered to our applyOpacity handler
+// instead of QWindow::setOpacity (which QtWayland's QPA plugin
+// doesn't implement — see applyOpacity comment).
+QVariantAnimation *animFor(QWidget *window) {
+  // Reuse the animation cached on the widget's dynamic property, if any.
+  auto *existing = window->property(kAnimProperty).value<QVariantAnimation *>();
  if (existing) return existing;
-  auto *anim = new QPropertyAnimation(window, "windowOpacity", window);
+  auto *anim = new QVariantAnimation(window);
+  // Forward every interpolated value to applyOpacity. `window` is also
+  // passed as the connection's context object.
+  QObject::connect(anim, &QVariantAnimation::valueChanged, window,
+                   [window](const QVariant &v) {
+                     applyOpacity(window, v.toDouble());
+                   });
  window->setProperty(kAnimProperty,
-                      QVariant::fromValue<QPropertyAnimation *>(anim));
+                      QVariant::fromValue<QVariantAnimation *>(anim));
  return anim;
 }

@ -167,25 +190,33 @@ void setupLayerShell(QWidget *window) {
 }

 void animateIn(QWidget *window) {
-  window->setWindowOpacity(0.0);
+  // Show with opacity 0 first so the compositor never paints a
+  // fully-opaque frame before the animation kicks in. The
+  // QVariantAnimation valueChanged → applyOpacity path needs the
+  // wl_surface to exist, which means after show(). We call
+  // applyOpacity twice on either side of show() — once at 0.0 as
+  // a best-effort pre-show (no-op if wl_surface isn't up yet),
+  // once at 0.0 immediately after to lock in the start state.
+  applyOpacity(window, 0.0);
  window->show();
  window->raise();
  window->activateWindow();
+  applyOpacity(window, 0.0);
  const int ms = animationMs();
  if (ms <= 0) {
-    window->setWindowOpacity(1.0);
+    applyOpacity(window, 1.0);
    return;
  }
  // Stop any running fade so toggling rapidly doesn't stack
  // animations.
-  QPropertyAnimation *anim = animFor(window);
+  QVariantAnimation *anim = animFor(window);
  anim->stop();
  // animateOut leaves a `finished -> hide()` handler attached to the
  // shared animation object. If a fade-out was interrupted by this
  // fade-in (rapid out/in cycle), the leftover handler would fire at
  // the end of the in-fade and silently hide the just-revealed
  // window — clear it before starting.
-  QObject::disconnect(anim, &QPropertyAnimation::finished, window, nullptr);
+  QObject::disconnect(anim, &QVariantAnimation::finished, window, nullptr);
  anim->setDuration(ms);
  anim->setStartValue(0.0);
  anim->setEndValue(1.0);
@ -199,17 +230,21 @@ void animateOut(QWidget *window) {
    window->hide();
    return;
  }
-  QPropertyAnimation *anim = animFor(window);
+  QVariantAnimation *anim = animFor(window);
  anim->stop();
  anim->setDuration(ms);
-  anim->setStartValue(window->windowOpacity());
+  // Start from the animation's last delivered value if we have one
+  // (a rapid in-then-out cycle interrupts at some intermediate
+  // alpha); otherwise assume the window was fully visible.
+  const QVariant cur = anim->currentValue();
+  anim->setStartValue(cur.isValid() ? cur.toDouble() : 1.0);
  anim->setEndValue(0.0);
  anim->setEasingCurve(QEasingCurve::InCubic);
  // Disconnect any previous handler before reconnecting; otherwise a
  // toggle-out-then-in cycle accumulates handlers that all fire on
  // the next out.
-  QObject::disconnect(anim, &QPropertyAnimation::finished, window, nullptr);
-  QObject::connect(anim, &QPropertyAnimation::finished, window,
+  QObject::disconnect(anim, &QVariantAnimation::finished, window, nullptr);
+  QObject::connect(anim, &QVariantAnimation::finished, window,
                   [window]() { window->hide(); });
  anim->start();
 }
--- a/qt/src/vulkan/Host.cpp
+++ b/qt/src/vulkan/Host.cpp
@ -0,0 +1,267 @@
+// See `Host.h` for the contract.
+
+#include "Host.h"
+
+#include <array>
+#include <cstdio>
+#include <cstring>
+#include <mutex>
+#include <optional>
+#include <vector>
+
+#include "../wayland/DmabufRegistry.h"
+
+namespace vulkan {
+
+namespace {
+
+// Device extensions a physical device must advertise to be selected;
+// the same list is enabled verbatim when the logical device is created.
+constexpr const char *kRequiredDeviceExtensions[] = {
+    "VK_KHR_external_memory_fd",
+    "VK_EXT_external_memory_dma_buf",
+    // Needed so libghostty can allocate render images with a chosen
+    // DRM modifier (vendor-tiled where supported) and query the
+    // driver-chosen layout back via
+    // `vkGetImageDrmFormatModifierPropertiesEXT`. Without it on the
+    // host's VkDevice, the device-level proc-addr lookup for that
+    // function returns null and Target.init fails.
+    "VK_EXT_image_drm_format_modifier",
+};
+
+// Returns true when `pd` advertises every extension listed in
+// `kRequiredDeviceExtensions`. Any enumeration failure is treated as
+// "not supported" so the caller simply skips the device.
+bool hasRequiredExtensions(VkPhysicalDevice pd) {
+  uint32_t n = 0;
+  if (vkEnumerateDeviceExtensionProperties(pd, nullptr, &n, nullptr) !=
+          VK_SUCCESS ||
+      n == 0) {
+    return false;
+  }
+  std::vector<VkExtensionProperties> exts(n);
+  // VK_INCOMPLETE only means the list grew between the two calls; the
+  // `n` entries that were written are still valid, so keep them. Any
+  // other non-success result leaves the buffer contents unspecified.
+  const VkResult res =
+      vkEnumerateDeviceExtensionProperties(pd, nullptr, &n, exts.data());
+  if (res != VK_SUCCESS && res != VK_INCOMPLETE) return false;
+  // The second call may report fewer entries than were allocated.
+  exts.resize(n);
+
+  for (const char *req : kRequiredDeviceExtensions) {
+    bool found = false;
+    for (const auto &e : exts) {
+      if (std::strcmp(e.extensionName, req) == 0) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) return false;
+  }
+  return true;
+}
+
+// Index of the first queue family on `pd` whose flags include
+// VK_QUEUE_GRAPHICS_BIT, or nullopt when there is none.
+std::optional<uint32_t> findGraphicsQueueFamily(VkPhysicalDevice pd) {
+  uint32_t familyCount = 0;
+  vkGetPhysicalDeviceQueueFamilyProperties(pd, &familyCount, nullptr);
+  if (familyCount == 0) return std::nullopt;
+
+  std::vector<VkQueueFamilyProperties> families(familyCount);
+  vkGetPhysicalDeviceQueueFamilyProperties(pd, &familyCount, families.data());
+
+  std::optional<uint32_t> graphicsFamily;
+  for (uint32_t idx = 0; idx < familyCount && !graphicsFamily; ++idx) {
+    const bool supportsGraphics =
+        (families[idx].queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0;
+    if (supportsGraphics) graphicsFamily = idx;
+  }
+  return graphicsFamily;
+}
+
+// ---- Platform callback trampolines ----------------------------------
+//
+// `ghostty_platform_vulkan_s` is a plain C ABI; the callback signatures
+// take a `void *userdata` that libghostty hands back to each callback.
+// The handle-lookup callbacks (instance / physical_device / device /
+// queue / queue_family_index / get_instance_proc_addr) ignore the
+// userdata and resolve through the process singleton — there's only
+// one Vulkan setup per process. The `present` callback DOES use the
+// userdata: it's the `PresentSink *` (implemented by `GhosttySurface`)
+// that receives the rendered frame, so we can hand the dmabuf back to
+// the right widget.
+
+// Resolves `name` against the singleton's VkInstance; null when the
+// host could not be brought up. `ud` is unused.
+void *cbGetInstanceProcAddr(void *ud, const char *name) {
+  static_cast<void>(ud);
+  Host *const singleton = Host::instance();
+  if (!singleton) return nullptr;
+  return reinterpret_cast<void *>(
+      vkGetInstanceProcAddr(singleton->vkInstance(), name));
+}
+
+// VkInstance of the singleton host, or null if there is no host.
+void *cbInstance(void *ud) {
+  static_cast<void>(ud);
+  Host *const singleton = Host::instance();
+  if (singleton == nullptr) return nullptr;
+  return singleton->vkInstance();
+}
+// VkPhysicalDevice of the singleton host, or null if there is no host.
+void *cbPhysicalDevice(void *ud) {
+  static_cast<void>(ud);
+  Host *const singleton = Host::instance();
+  if (singleton == nullptr) return nullptr;
+  return singleton->vkPhysicalDevice();
+}
+// VkDevice of the singleton host, or null if there is no host.
+void *cbDevice(void *ud) {
+  static_cast<void>(ud);
+  Host *const singleton = Host::instance();
+  if (singleton == nullptr) return nullptr;
+  return singleton->vkDevice();
+}
+// VkQueue of the singleton host, or null if there is no host.
+void *cbQueue(void *ud) {
+  static_cast<void>(ud);
+  Host *const singleton = Host::instance();
+  if (singleton == nullptr) return nullptr;
+  return singleton->vkQueue();
+}
+// Queue family index of the singleton host; 0 when there is no host.
+uint32_t cbQueueFamilyIndex(void *ud) {
+  static_cast<void>(ud);
+  Host *const singleton = Host::instance();
+  if (singleton == nullptr) return 0;
+  return singleton->vkQueueFamilyIndex();
+}
+
+size_t cbGetSupportedModifiers(void *ud, uint32_t drm_format,
+                                uint64_t *out, size_t capacity) {
+  static_cast<void>(ud);
+  // Pure forwarder to the Wayland-side modifier registry. That table
+  // is primed on the GUI thread by
+  // `wayland::primeDmabufModifierRegistry` (from `GhosttySurface`'s
+  // ctor, Vulkan branch) BEFORE the libghostty renderer thread for the
+  // surface is spawned, and is treated as immutable afterwards — which
+  // is what makes this lock-free read sound. If priming has not
+  // happened, `wayland::supportedDmabufModifiers` returns 0, so the
+  // failure mode is fail-safe: the renderer sees an empty modifier
+  // list and falls back to legacy_copy mode.
+  const size_t written =
+      ::wayland::supportedDmabufModifiers(drm_format, out, capacity);
+  return written;
+}
+
+// Forwards a presented frame to the `PresentSink` carried in `ud`.
+// A null `ud` drops the frame.
+void cbPresent(
+    void *ud,
+    int dmabuf_fd,
+    uint32_t drm_format,
+    uint64_t drm_modifier,
+    uint32_t width,
+    uint32_t height,
+    uint32_t stride,
+    bool image_backed) {
+  auto *sink = static_cast<PresentSink *>(ud);
+  if (!sink) return;
+  sink->presentDmabuf(dmabuf_fd, drm_format, drm_modifier, width, height,
+                      stride, image_backed);
+}
+
+} // namespace
+
+// Brings up the instance, picks the first physical device that meets
+// every requirement, then creates the logical device and fetches its
+// queue. Returns false on the first failure; handles created so far
+// stay in the members and are released by ~Host.
+bool Host::init() {
+  // ---- instance ---------------------------------------------------
+  VkApplicationInfo appInfo{};
+  appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
+  appInfo.pApplicationName = "ghastty";
+  appInfo.applicationVersion = 1;
+  appInfo.pEngineName = "ghastty";
+  appInfo.engineVersion = 1;
+  appInfo.apiVersion = VK_API_VERSION_1_3;
+
+  VkInstanceCreateInfo instInfo{};
+  instInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
+  instInfo.pApplicationInfo = &appInfo;
+  if (vkCreateInstance(&instInfo, nullptr, &m_instance) != VK_SUCCESS) {
+    std::fprintf(stderr, "[vulkan] vkCreateInstance failed\n");
+    return false;
+  }
+
+  // ---- physical device -------------------------------------------
+  uint32_t pdCount = 0;
+  if (vkEnumeratePhysicalDevices(m_instance, &pdCount, nullptr) !=
+          VK_SUCCESS ||
+      pdCount == 0) {
+    std::fprintf(stderr, "[vulkan] no physical devices\n");
+    return false;
+  }
+  std::vector<VkPhysicalDevice> pds(pdCount);
+  // VK_INCOMPLETE still yields `pdCount` valid handles (a device showed
+  // up between the two calls); anything else is a hard failure.
+  const VkResult enumRes =
+      vkEnumeratePhysicalDevices(m_instance, &pdCount, pds.data());
+  if (enumRes != VK_SUCCESS && enumRes != VK_INCOMPLETE) {
+    std::fprintf(stderr, "[vulkan] vkEnumeratePhysicalDevices failed\n");
+    return false;
+  }
+  pds.resize(pdCount);
+
+  // First device that satisfies every requirement wins.
+  for (auto pd : pds) {
+    VkPhysicalDeviceProperties props;
+    vkGetPhysicalDeviceProperties(pd, &props);
+    if (props.apiVersion < VK_API_VERSION_1_3) continue;
+    if (!hasRequiredExtensions(pd)) continue;
+    auto qfi = findGraphicsQueueFamily(pd);
+    if (!qfi) continue;
+    m_physicalDevice = pd;
+    m_queueFamilyIndex = *qfi;
+    break;
+  }
+  if (m_physicalDevice == VK_NULL_HANDLE) {
+    // Keep this list in sync with the checks in the loop above and
+    // with `kRequiredDeviceExtensions`.
+    std::fprintf(stderr,
+                 "[vulkan] no suitable physical device "
+                 "(need Vulkan 1.3 + a graphics queue + "
+                 "VK_KHR_external_memory_fd + "
+                 "VK_EXT_external_memory_dma_buf + "
+                 "VK_EXT_image_drm_format_modifier)\n");
+    return false;
+  }
+
+  // ---- logical device + queue ------------------------------------
+  float queuePriority = 1.0f;
+  VkDeviceQueueCreateInfo qci{};
+  qci.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
+  qci.queueFamilyIndex = m_queueFamilyIndex;
+  qci.queueCount = 1;
+  qci.pQueuePriorities = &queuePriority;
+
+  // libghostty's Vulkan renderer uses Vulkan 1.3 dynamic rendering
+  // (vkCmdBeginRendering / vkCmdEndRendering, no VkRenderPass).
+  // That feature has to be explicitly enabled at device creation
+  // time via VkPhysicalDeviceVulkan13Features.
+  VkPhysicalDeviceVulkan13Features vk13features{};
+  vk13features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES;
+  vk13features.dynamicRendering = VK_TRUE;
+  vk13features.synchronization2 = VK_TRUE;
+
+  VkDeviceCreateInfo dci{};
+  dci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
+  dci.pNext = &vk13features;
+  dci.queueCreateInfoCount = 1;
+  dci.pQueueCreateInfos = &qci;
+  dci.enabledExtensionCount =
+      static_cast<uint32_t>(std::size(kRequiredDeviceExtensions));
+  dci.ppEnabledExtensionNames = kRequiredDeviceExtensions;
+
+  if (vkCreateDevice(m_physicalDevice, &dci, nullptr, &m_device) != VK_SUCCESS) {
+    std::fprintf(stderr, "[vulkan] vkCreateDevice failed\n");
+    return false;
+  }
+
+  vkGetDeviceQueue(m_device, m_queueFamilyIndex, 0, &m_queue);
+
+  VkPhysicalDeviceProperties props;
+  vkGetPhysicalDeviceProperties(m_physicalDevice, &props);
+  std::fprintf(stderr,
+               "[vulkan] device ready: %s (Vulkan %u.%u.%u, qfi=%u)\n",
+               props.deviceName,
+               VK_API_VERSION_MAJOR(props.apiVersion),
+               VK_API_VERSION_MINOR(props.apiVersion),
+               VK_API_VERSION_PATCH(props.apiVersion),
+               m_queueFamilyIndex);
+  return true;
+}
+
+// Tears down in reverse creation order: logical device first, then the
+// instance. Handles that were never created stay VK_NULL_HANDLE and
+// are skipped, which is what lets a failed init() unwind through here.
+Host::~Host() {
+  if (m_device != VK_NULL_HANDLE) {
+    vkDestroyDevice(m_device, nullptr);
+  }
+  if (m_instance != VK_NULL_HANDLE) {
+    vkDestroyInstance(m_instance, nullptr);
+  }
+}
+
+// Fills a zero-initialised callback table. Only `present` consumes
+// the userdata (`sink`); the other trampolines resolve through
+// `Host::instance()`.
+ghostty_platform_vulkan_s Host::asPlatform(PresentSink *sink) const {
+  ghostty_platform_vulkan_s platform{};
+
+  // Per-surface part.
+  platform.userdata = sink;
+  platform.present = cbPresent;
+
+  // Process-wide handle lookups.
+  platform.get_instance_proc_addr = cbGetInstanceProcAddr;
+  platform.instance = cbInstance;
+  platform.physical_device = cbPhysicalDevice;
+  platform.device = cbDevice;
+  platform.queue = cbQueue;
+  platform.queue_family_index = cbQueueFamilyIndex;
+
+  // Compositor modifier registry.
+  platform.get_supported_modifiers = cbGetSupportedModifiers;
+  return platform;
+}
+
+Host *Host::instance() {
+  static std::once_flag initFlag;
+  static std::unique_ptr<Host> singleton;
+  std::call_once(initFlag, [] {
+    std::unique_ptr<Host> fresh(new Host());
+    // On failure `fresh` goes out of scope here and ~Host releases
+    // whatever init() managed to create; `singleton` stays null.
+    if (!fresh->init()) return;
+    singleton = std::move(fresh);
+  });
+  // Note: priming the dmabuf modifier registry deliberately does NOT
+  // happen here any more. It is a Wayland-protocol concern and now
+  // lives in `GhosttySurface`'s ctor, keeping this function about
+  // Vulkan setup only.
+  return singleton.get();
+}
+
+} // namespace vulkan
--- a/qt/src/vulkan/Host.h
+++ b/qt/src/vulkan/Host.h
@ -0,0 +1,97 @@
+// Vulkan host setup for the Ghastty Qt frontend.
+//
+// libghostty (when built with `-Drenderer=vulkan`) doesn't create
+// its own VkInstance / VkDevice — the host does, then hands the
+// handles down via the `ghostty_platform_vulkan_s` callback struct
+// declared in `include/ghostty.h`. This class is the Qt-side owner
+// of those handles.
+//
+// The host is process-singleton (one Vulkan instance + device shared
+// across every `GhosttySurface`), constructed lazily on first use
+// via `instance()`. Requires a physical device that supports
+// VK_KHR_external_memory_fd, VK_EXT_external_memory_dma_buf, and
+// VK_EXT_image_drm_format_modifier — all three are needed for the
+// dmabuf-as-importable-image export path libghostty's Vulkan
+// renderer uses to hand frames back to the host.
+//
+// The compositor dmabuf modifier registry that this host's
+// `get_supported_modifiers` callback reads is primed elsewhere
+// (in `GhosttySurface`'s ctor on the GUI thread, via
+// `wayland::primeDmabufModifierRegistry` from
+// `qt/src/wayland/DmabufRegistry.h`). That priming is a Wayland
+// concern and used to leak into `Host::instance`'s `call_once` —
+// which made `Host` (a Vulkan object) responsible for a
+// Wayland-protocol concern it doesn't otherwise touch.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include <vulkan/vulkan.h>
+
+#include "ghostty.h"
+
+namespace vulkan {
+
+/// Receiver for a presented dmabuf-backed frame. Implemented by
+/// `GhosttySurface`; abstract so `vulkan::Host` doesn't need to
+/// know about the widget type. Replaces an earlier cross-TU
+/// forward declaration of a free function `presentToGhosttySurface`
+/// that coupled `Host.cpp` directly to `GhosttySurface.cpp`.
+///
+/// A pointer to the sink travels through libghostty as the platform
+/// struct's `userdata` and is handed back to the `present` callback.
+class PresentSink {
+public:
+  virtual ~PresentSink() = default;
+  /// Hand off a rendered frame. Called on the libghostty renderer
+  /// thread; the implementation is responsible for marshalling to
+  /// whatever thread it composites on. The fd is borrowed for the
+  /// duration of the call — implementations that need to retain
+  /// it must `dup()`.
+  ///
+  /// The arguments are forwarded unchanged from the C `present`
+  /// callback: the dmabuf fd, its DRM fourcc and modifier, the frame
+  /// size, the stride, and the `image_backed` flag.
+  virtual void presentDmabuf(int dmabuf_fd, std::uint32_t drm_format,
+                              std::uint64_t drm_modifier,
+                              std::uint32_t width, std::uint32_t height,
+                              std::uint32_t stride, bool image_backed) = 0;
+};
+
+/// Process-wide Vulkan setup. One per Ghastty process; threadsafe
+/// to call `instance()` from anywhere (constructs once via
+/// std::call_once on first access).
+class Host {
+public:
+  /// Return the process-wide host, or nullptr if Vulkan can't be
+  /// brought up on this system. Cached after the first call so
+  /// repeated lookups are cheap. A failed bring-up is cached too:
+  /// initialisation is attempted exactly once.
+  static Host *instance();
+
+  /// Build a `ghostty_platform_vulkan_s` callback struct whose
+  /// `present` callback delivers frames to `sink`. `sink` must
+  /// outlive the lifetime of any libghostty surface that was
+  /// configured with the returned platform struct. Other callbacks
+  /// (handle lookups, modifier registry) ignore `sink` and route
+  /// through the process singleton.
+  ghostty_platform_vulkan_s asPlatform(PresentSink *sink) const;
+
+  /// Raw handle accessors. The handles stay owned by this object.
+  VkInstance vkInstance() const { return m_instance; }
+  VkPhysicalDevice vkPhysicalDevice() const { return m_physicalDevice; }
+  VkDevice vkDevice() const { return m_device; }
+  VkQueue vkQueue() const { return m_queue; }
+  uint32_t vkQueueFamilyIndex() const { return m_queueFamilyIndex; }
+
+  /// Destroys the logical device, then the instance (each only if it
+  /// was created).
+  ~Host();
+
+  // No copy/move — singleton.
+  Host(const Host &) = delete;
+  Host &operator=(const Host &) = delete;
+
+private:
+  // Construction is reserved for `instance()`.
+  Host() = default;
+  // Creates instance, picks a physical device, creates the logical
+  // device and fetches its queue. Returns false on the first failure.
+  bool init();
+
+  VkInstance m_instance = VK_NULL_HANDLE;
+  VkPhysicalDevice m_physicalDevice = VK_NULL_HANDLE;
+  VkDevice m_device = VK_NULL_HANDLE;
+  VkQueue m_queue = VK_NULL_HANDLE;
+  // Family index of the graphics queue stored in `m_queue`.
+  uint32_t m_queueFamilyIndex = 0;
+};
+
+} // namespace vulkan
--- a/qt/src/wayland/AlphaModifier.cpp
+++ b/qt/src/wayland/AlphaModifier.cpp
@ -0,0 +1,193 @@
+#include "AlphaModifier.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <mutex>
+#include <unordered_map>
+
+#include <QGuiApplication>
+#include <QWindow>
+#include <qpa/qplatformnativeinterface.h>
+
+#include <wayland-client.h>
+
+#include "alpha-modifier-v1-client-protocol.h"
+
+namespace wayland {
+
+namespace {
+
+// Process-wide protocol binding, filled in by the one-shot
+// initialiser the first time supported() or setOpacity() runs.
+// Nothing in this file ever tears it down: once bound, the manager
+// proxy is kept for the rest of the process.
+struct GlobalState {
+  // Display obtained from Qt's native interface; null when that
+  // lookup failed.
+  wl_display *display{nullptr};
+  // Bound wp_alpha_modifier_v1 proxy; stays null when the compositor
+  // does not advertise the global.
+  wp_alpha_modifier_v1 *manager{nullptr};
+  // Set once the initialiser has run to completion, whether or not
+  // the bind succeeded. Written only; no code in this file reads it.
+  bool ready{false};
+};
+
+// Accessor for the single GlobalState instance (function-local
+// static, constructed on first call).
+GlobalState &globalState() {
+  static GlobalState instance;
+  return instance;
+}
+
+// wl_registry.global handler used during the binding pass. Ignores
+// every global except wp_alpha_modifier_v1; for that one it binds
+// version 1 and stores the proxy in the GlobalState passed as
+// `data`. The advertised version is deliberately unused — v1 is
+// the only version of this staging protocol so far.
+void onRegistryGlobal(void *data, wl_registry *registry, uint32_t name,
+                      const char *interface, uint32_t /*version*/) {
+  const bool isAlphaModifier =
+      std::strcmp(interface, wp_alpha_modifier_v1_interface.name) == 0;
+  if (isAlphaModifier) {
+    void *bound =
+        wl_registry_bind(registry, name, &wp_alpha_modifier_v1_interface, 1);
+    static_cast<GlobalState *>(data)->manager =
+        static_cast<wp_alpha_modifier_v1 *>(bound);
+  }
+}
+
+// wl_registry.global_remove handler: intentionally does nothing.
+void onRegistryGlobalRemove(void *, wl_registry *, uint32_t) {
+}
+
+// Listener table handed to wl_registry_add_listener; the entries
+// are positional (global, then global_remove).
+const wl_registry_listener kRegistryListener = {
+    onRegistryGlobal,
+    onRegistryGlobalRemove,
+};
+
+// Bind the manager global lazily on first use. Idempotent under
+// std::call_once: the body runs exactly once per process and every
+// exit path sets `ready`.
+//
+// Discovery runs on a private wl_event_queue so the roundtrip only
+// dispatches our registry events. The bound proxy is migrated onto
+// the default queue before the private queue is destroyed, so
+// future calls (set_multiplier, get_surface) dispatch on Qt's
+// event loop instead of a dangling queue.
+//
+// If the native interface, the wl_display, the private queue, or
+// the registry cannot be obtained, the manager simply stays null
+// and supported()/setOpacity() report the protocol as unavailable.
+void initOnce() {
+  static std::once_flag once;
+  std::call_once(once, []() {
+    auto &g = globalState();
+    QPlatformNativeInterface *native =
+        QGuiApplication::platformNativeInterface();
+    if (!native) {
+      g.ready = true;
+      return;
+    }
+    g.display = static_cast<wl_display *>(
+        native->nativeResourceForIntegration("wl_display"));
+    if (!g.display) {
+      g.ready = true;
+      return;
+    }
+
+    // Both creation calls can return NULL; the wl_proxy_set_queue
+    // call below would dereference a NULL registry, so bail out
+    // instead.
+    wl_event_queue *queue = wl_display_create_queue(g.display);
+    if (!queue) {
+      g.ready = true;
+      return;
+    }
+    wl_registry *registry = wl_display_get_registry(g.display);
+    if (!registry) {
+      wl_event_queue_destroy(queue);
+      g.ready = true;
+      return;
+    }
+    wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(registry), queue);
+    wl_registry_add_listener(registry, &kRegistryListener, &g);
+    // A failed roundtrip needs no special handling here: the
+    // listener only ever assigns `manager` after a bind, so the
+    // null check below covers the "nothing was bound" outcome.
+    wl_display_roundtrip_queue(g.display, queue);
+    wl_registry_destroy(registry);
+
+    // Migrate the manager onto the default queue BEFORE destroying
+    // the private one, so the proxy never refers to a destroyed
+    // queue.
+    if (g.manager) {
+      wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(g.manager), nullptr);
+    }
+    wl_event_queue_destroy(queue);
+    g.ready = true;
+  });
+}
+
+// Cache of alpha-modifier objects, one per wl_surface, so repeated
+// opacity updates (e.g. animation ticks) reuse the same protocol
+// object instead of requesting a new one each time.
+//
+// The key is the raw wl_surface pointer and entries are removed
+// only by detach(). Nothing here notices a wl_surface being
+// destroyed, so a window whose surface goes away without detach()
+// leaves a stale entry behind — callers MUST detach() from the
+// QWindow's destruction path. Accessed from the GUI thread only;
+// there is no locking.
+struct Cache {
+  using Map = std::unordered_map<wl_surface *, wp_alpha_modifier_surface_v1 *>;
+  Map entries;
+};
+
+// Accessor for the single Cache instance (function-local static,
+// constructed on first call).
+Cache &cache() {
+  static Cache instance;
+  return instance;
+}
+
+// Look up the native wl_surface behind `window` through Qt's
+// platform native interface. Returns nullptr when `window` is
+// null, when there is no native interface, or when the lookup
+// itself yields nothing.
+wl_surface *surfaceFor(QWindow *window) {
+  QPlatformNativeInterface *native =
+      window ? QGuiApplication::platformNativeInterface() : nullptr;
+  if (native == nullptr) return nullptr;
+  void *raw = native->nativeResourceForWindow("surface", window);
+  return static_cast<wl_surface *>(raw);
+}
+
+// Return the alpha-modifier object for `surface`, creating and
+// caching it on first use. Returns nullptr when the manager global
+// is not bound or the object cannot be created; failures are not
+// cached, so a later call tries again.
+wp_alpha_modifier_surface_v1 *getOrCreate(wl_surface *surface) {
+  auto &entries = cache().entries;
+  if (const auto hit = entries.find(surface); hit != entries.end())
+    return hit->second;
+
+  wp_alpha_modifier_v1 *manager = globalState().manager;
+  if (manager == nullptr) return nullptr;
+
+  wp_alpha_modifier_surface_v1 *created =
+      wp_alpha_modifier_v1_get_surface(manager, surface);
+  if (created != nullptr) entries.emplace(surface, created);
+  return created;
+}
+
+}  // namespace
+
+// True when the one-shot initialiser managed to bind the
+// wp_alpha_modifier_v1 global. Triggers that initialiser on the
+// first call.
+bool AlphaModifier::supported() {
+  initOnce();
+  const GlobalState &state = globalState();
+  return state.manager != nullptr;
+}
+
+// Apply `opacity` to `window` as a wp_alpha_modifier_v1 multiplier.
+//
+// `opacity` is clamped to [0.0, 1.0]; NaN is treated as 1.0 (fully
+// opaque, the protocol's neutral multiplier). Returns false without
+// touching the surface when the protocol is not bound, the window
+// has no native wl_surface, or the per-surface modifier object
+// cannot be obtained. On success the multiplier is set, the
+// window's wl_surface is committed, the display is flushed, and
+// true is returned.
+bool AlphaModifier::setOpacity(QWindow *window, double opacity) {
+  initOnce();
+  auto &g = globalState();
+  if (!g.manager) return false;
+  wl_surface *surface = surfaceFor(window);
+  if (!surface) return false;
+  auto *mod = getOrCreate(surface);
+  if (!mod) return false;
+
+  // Convert [0.0, 1.0] → [0, UINT32_MAX]. std::clamp hands NaN back
+  // unchanged (every comparison against it is false), which would
+  // feed NaN into the rounding call below, so map it to a defined
+  // value first.
+  const double clamped =
+      std::isnan(opacity) ? 1.0 : std::clamp(opacity, 0.0, 1.0);
+  // Round to nearest so the endpoints land exactly (1.0 → fully
+  // opaque, 0.0 → fully transparent). llround rather than lround:
+  // UINT32_MAX does not fit in `long` on targets where `long` is
+  // 32 bits, whereas `long long` is always wide enough.
+  const uint32_t factor = static_cast<uint32_t>(
+      std::llround(clamped * static_cast<double>(UINT32_MAX)));
+  wp_alpha_modifier_surface_v1_set_multiplier(mod, factor);
+  // Alpha multiplier is double-buffered on the wl_surface; the
+  // change applies on the next wl_surface.commit. Commit here so
+  // the caller doesn't need to know about Wayland's double-buffer
+  // semantics.
+  wl_surface_commit(surface);
+  // And flush so the commit reaches the compositor immediately
+  // rather than sitting in libwayland-client's send buffer until
+  // Qt's next event-loop iteration.
+  wl_display_flush(g.display);
+  return true;
+}
+
+// Destroy and forget the cached alpha-modifier object for
+// `window`. Does nothing when the window has no native wl_surface
+// or no object was ever cached for it.
+void AlphaModifier::detach(QWindow *window) {
+  wl_surface *surface = surfaceFor(window);
+  if (surface == nullptr) return;
+
+  auto &entries = cache().entries;
+  const auto found = entries.find(surface);
+  if (found != entries.end()) {
+    wp_alpha_modifier_surface_v1_destroy(found->second);
+    entries.erase(found);
+  }
+}
+
+}  // namespace wayland
--- a/qt/src/wayland/AlphaModifier.h
+++ b/qt/src/wayland/AlphaModifier.h
@ -0,0 +1,51 @@
+// Per-window alpha multiplier via wp_alpha_modifier_v1.
+//
+// QtWayland's QPA plugin doesn't implement QWindow::setOpacity (it
+// logs "This plugin does not support setting window opacity" on
+// every call). For the QuickTerminal fade-in/out we need real
+// per-surface alpha, so we drive the wp_alpha_modifier_v1 staging
+// Wayland protocol ourselves.
+//
+// Compositor support (as of 2026-05): KWin (KDE 6+), wlroots
+// (≥0.17), Hyprland — yes. mutter/GNOME — no. If the protocol
+// isn't advertised, `setOpacity` returns false and the caller can
+// either skip the animation or fall back to instant show/hide.
+//
+// Wayland-only by project decision (see feedback-qt-no-x11 memory).
+
+#pragma once
+
+struct wp_alpha_modifier_v1;
+struct wp_alpha_modifier_surface_v1;
+class QWindow;
+
+namespace wayland {
+
+// Static-only facade over the process-wide wp_alpha_modifier_v1
+// binding; there is no per-instance state.
+class AlphaModifier {
+public:
+  // Returns true if the compositor advertises wp_alpha_modifier_v1
+  // and we've successfully bound it. Cheap after the first call
+  // (the binding is cached process-wide). Use this to decide
+  // whether to drive an opacity animation or fall through to
+  // instant show/hide.
+  static bool supported();
+
+  // Set the window's alpha multiplier in [0.0, 1.0]; values outside
+  // that range are clamped. Must be called on the GUI thread (the
+  // thread that owns wl_display dispatch). Returns false if
+  // `window`'s native wl_surface isn't available yet (e.g. before
+  // first show), or if the compositor doesn't support the protocol.
+  //
+  // The wp_alpha_modifier_surface_v1 object is created lazily per
+  // wl_surface and cached for the surface's lifetime — repeated
+  // calls during an animation just emit set_multiplier + commit.
+  // On success the implementation commits the window's wl_surface
+  // and flushes the display itself, so callers need no extra
+  // Wayland calls to make the change take effect.
+  static bool setOpacity(QWindow *window, double opacity);
+
+  // Release the per-surface alpha modifier object for this window.
+  // Call when the window is being destroyed (or before re-creating
+  // its native surface). Equivalent to set_multiplier(UINT32_MAX)
+  // followed by destroy on the surface object.
+  //
+  // No-op when the window has no native wl_surface or nothing was
+  // cached for it. The implementation only destroys the protocol
+  // object; it does not commit the surface.
+  static void detach(QWindow *window);
+};
+
+}  // namespace wayland
--- a/qt/src/wayland/DmabufRegistry.h
+++ b/qt/src/wayland/DmabufRegistry.h
@ -0,0 +1,55 @@
+// Compositor dmabuf modifier registry.
+//
+// Process-wide read-only table of `(drm_format, [modifier])` pairs the
+// compositor advertises via `zwp_linux_dmabuf_v1`. libghostty's Vulkan
+// renderer queries this through the
+// `ghostty_platform_vulkan_s.get_supported_modifiers` callback when
+// picking a modifier the compositor will accept on attach — without
+// that intersection, drivers that don't expose `COLOR_ATTACHMENT_BIT`
+// for `LINEAR` (NVIDIA) can't get into Target's direct-export mode at
+// all and have to fall back to the legacy CPU-readback path.
+//
+// Why a header of its own instead of living on
+// `wayland::SubsurfacePresenter`? The presenter is per-widget; the
+// registry is process-wide and read-only after a one-shot prime. They
+// share `globalState()` machinery internally
+// (`SubsurfacePresenter.cpp`) but their public surfaces are unrelated
+// concerns.
+//
+// Wayland-only by project decision (the Qt frontend is Wayland-only;
+// see `feedback-qt-no-x11` memory). On non-Wayland QPA both functions
+// are no-ops — `primeDmabufModifierRegistry` returns immediately and
+// `supportedDmabufModifiers` returns 0 — so callers can stay
+// runtime-agnostic.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace wayland {
+
+// Eagerly discover the compositor's dmabuf modifier list on the
+// CALLING THREAD. MUST be called from the GUI thread before any
+// `supportedDmabufModifiers` reader runs (typically the libghostty
+// renderer thread). Safe to call multiple times — discovery happens
+// exactly once via the underlying `globalState`'s latched `searched`
+// flag.
+//
+// Idempotent no-op if the QPA isn't Wayland or the
+// QPlatformNativeInterface lookup fails.
+void primeDmabufModifierRegistry();
+
+// Read the cached compositor-supported DRM modifiers for the given
+// DRM_FORMAT_* fourcc. When `out` is non-null and `capacity` is
+// non-zero, copies up to `capacity` modifiers into `out` and
+// returns the number written. When `out` is nullptr or `capacity`
+// is 0, writes nothing and returns the total number of modifiers
+// cached for the format (use this to size the buffer).
+//
+// Thread-safe for readers once `primeDmabufModifierRegistry` has
+// returned. Returns 0 if the registry hasn't been primed yet or the
+// format isn't advertised.
+std::size_t supportedDmabufModifiers(std::uint32_t drm_format,
+                                     std::uint64_t *out,
+                                     std::size_t capacity);
+
+} // namespace wayland
--- a/qt/src/wayland/SubsurfacePresenter.cpp
+++ b/qt/src/wayland/SubsurfacePresenter.cpp
@ -0,0 +1,785 @@
+#include "SubsurfacePresenter.h"
+#include "DmabufRegistry.h"
+
+#include <algorithm>
+#include <climits>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <sys/stat.h>  // ::fstat — wl_buffer cache identity via st_ino
+#include <unordered_map>
+#include <vector>
+
+#include <QGuiApplication>
+#include <QLatin1String>
+#include <QWindow>
+#include <qpa/qplatformnativeinterface.h>
+
+#include <wayland-client.h>
+
+#include "fractional-scale-v1-client-protocol.h"
+#include "linux-dmabuf-v1-client-protocol.h"
+#include "viewporter-client-protocol.h"
+
+namespace wayland {
+
+namespace {
+
+// Everything the presenter needs from the compositor, shared by
+// the whole process: the bound global proxies plus the
+// format → modifier list table built from zwp_linux_dmabuf_v1's
+// `modifier` events. `discoverGlobals` fills this in once on the
+// GUI thread. The renderer thread later reads `modifiers` without
+// a lock, which is only sound because nothing mutates the table
+// after that initial discovery.
+struct PresenterGlobals {
+  wl_compositor *compositor{nullptr};
+  wl_subcompositor *subcompositor{nullptr};
+  zwp_linux_dmabuf_v1 *dmabuf{nullptr};
+  wp_viewporter *viewporter{nullptr};
+  // Optional: null when the compositor lacks fractional scaling.
+  wp_fractional_scale_manager_v1 *fractionalScale{nullptr};
+  // DRM fourcc → modifiers advertised for that format.
+  std::unordered_map<uint32_t, std::vector<uint64_t>> modifiers;
+  // Latched by discoverGlobals so discovery is attempted only once.
+  bool searched{false};
+};
+
+// Accessor for the single PresenterGlobals instance (function-local
+// static, constructed on first call).
+PresenterGlobals &globalState() {
+  static PresenterGlobals instance;
+  return instance;
+}
+
+// zwp_linux_dmabuf_v1.format handler: deliberately empty. The
+// event is a leftover from v1/v2, before modifiers existed; at v3
+// every (format, modifier) pair — LINEAR included — also arrives
+// through `modifier`, which is the event we record.
+void dmabufFormat(void *, zwp_linux_dmabuf_v1 *, uint32_t /*format*/) {
+}
+
+// zwp_linux_dmabuf_v1.modifier handler: the compositor advertises
+// one (format, modifier) pair per event. The 64-bit modifier
+// arrives split into two 32-bit halves; reassemble it and append
+// it to that format's list in the PresenterGlobals passed as
+// `data`.
+//
+// No deduplication is attempted. The dmabuf proxy stays on a
+// private queue that `discoverGlobals` stops dispatching once
+// discovery is over, so this handler only ever runs during that
+// one bind pass — which is also what lets the renderer thread read
+// `modifiers` without a lock afterwards.
+void dmabufModifier(void *data, zwp_linux_dmabuf_v1 *, uint32_t format,
+                    uint32_t modifier_hi, uint32_t modifier_lo) {
+  uint64_t combined = modifier_hi;
+  combined = (combined << 32) | modifier_lo;
+  static_cast<PresenterGlobals *>(data)->modifiers[format].push_back(combined);
+}
+
+// Listener table for the dmabuf global; entries are positional
+// (format, then modifier).
+const zwp_linux_dmabuf_v1_listener kDmabufListener = {
+    &dmabufFormat,
+    &dmabufModifier,
+};
+
+// wl_registry.global handler for the presenter's discovery pass.
+// Binds each global the presenter cares about into the
+// PresenterGlobals passed as `data` and ignores everything else.
+void registryGlobal(void *data, wl_registry *registry, uint32_t name,
+                    const char *interface, uint32_t version) {
+  auto *g = static_cast<PresenterGlobals *>(data);
+  const auto matches = [interface](const wl_interface &candidate) {
+    return std::strcmp(interface, candidate.name) == 0;
+  };
+
+  if (matches(wl_compositor_interface)) {
+    // Take whatever version is advertised, up to v6 (the highest
+    // tested). v3+ is what gives child surfaces `set_buffer_scale`
+    // for HiDPI; on anything older `presentDmabuf` skips that call.
+    const uint32_t bindVersion = std::min<uint32_t>(version, 6u);
+    g->compositor = static_cast<wl_compositor *>(wl_registry_bind(
+        registry, name, &wl_compositor_interface, bindVersion));
+    return;
+  }
+
+  if (matches(wl_subcompositor_interface)) {
+    g->subcompositor = static_cast<wl_subcompositor *>(
+        wl_registry_bind(registry, name, &wl_subcompositor_interface, 1));
+    return;
+  }
+
+  if (matches(zwp_linux_dmabuf_v1_interface)) {
+    // `create_immed` (synchronous wl_buffer creation) needs v3.
+    // Binding v3 against a compositor that only offers v1/v2 would
+    // be a protocol error that takes down the whole wl_display, so
+    // leave the proxy unbound and let the QImage fallback engage.
+    if (version < 3) {
+      std::fprintf(stderr,
+                   "[ghastty] wayland: linux-dmabuf-v1 advertised at "
+                   "version %u; need >= 3 for create_immed, falling back "
+                   "to QImage path\n",
+                   version);
+      return;
+    }
+    // Never go above v3: v4 replaces these events with the dynamic
+    // feedback mechanism, which this code does not consume.
+    const uint32_t bindVersion = std::min<uint32_t>(version, 3u);
+    g->dmabuf = static_cast<zwp_linux_dmabuf_v1 *>(wl_registry_bind(
+        registry, name, &zwp_linux_dmabuf_v1_interface, bindVersion));
+    // Attach the listener right away so the modifier events that
+    // follow the bind are delivered as dispatch continues.
+    zwp_linux_dmabuf_v1_add_listener(g->dmabuf, &kDmabufListener, g);
+    return;
+  }
+
+  if (matches(wp_viewporter_interface)) {
+    g->viewporter = static_cast<wp_viewporter *>(
+        wl_registry_bind(registry, name, &wp_viewporter_interface, 1));
+    return;
+  }
+
+  if (matches(wp_fractional_scale_manager_v1_interface)) {
+    g->fractionalScale = static_cast<wp_fractional_scale_manager_v1 *>(
+        wl_registry_bind(registry, name,
+                         &wp_fractional_scale_manager_v1_interface, 1));
+  }
+}
+// wl_registry.global_remove handler: intentionally does nothing.
+void registryGlobalRemove(void *, wl_registry *, uint32_t) {
+}
+
+// Listener table for the discovery registry; entries are
+// positional (global, then global_remove).
+const wl_registry_listener kRegistryListener = {
+    &registryGlobal,
+    &registryGlobalRemove,
+};
+
+// One-shot discovery of the Wayland globals the presenter needs.
+//
+// The first call latches `searched`, binds the globals through a
+// registry on a private event queue (up to two roundtrips), logs
+// how many (format, modifier) pairs were collected, and moves every
+// bound proxy except the dmabuf one back to the default queue.
+// Every later call returns the same process-wide PresenterGlobals
+// without touching the display again.
+//
+// The return value is never null. Callers must check the
+// individual proxy fields, which stay nullptr when the compositor
+// lacks a global or discovery failed.
+//
+// Note that `searched` is set BEFORE discovery runs, so it records
+// that discovery was attempted, not that it has completed.
+PresenterGlobals *discoverGlobals(wl_display *display) {
+  PresenterGlobals &globals = globalState();
+  if (globals.searched) return &globals;
+  globals.searched = true;
+
+  wl_event_queue *queue = wl_display_create_queue(display);
+  wl_registry *registry = wl_display_get_registry(display);
+  wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(registry), queue);
+  wl_registry_add_listener(registry, &kRegistryListener, &globals);
+  // Roundtrip 1: bind compositor/subcompositor/dmabuf. Inside the
+  // registry callback we attach the dmabuf listener immediately, so
+  // any format/modifier events that arrive in the same dispatch
+  // pass fire on it. A negative return means the wl_display
+  // disconnected mid-startup; subsequent tryCreate calls fall
+  // through to the QImage path (g->compositor etc. stay null).
+  if (wl_display_roundtrip_queue(display, queue) < 0) {
+    std::fprintf(stderr,
+                 "[ghastty] wayland: discoverGlobals roundtrip 1 failed; "
+                 "subsurface present path disabled\n");
+  }
+  wl_registry_destroy(registry);
+  // Roundtrip 2: belt-and-suspenders for any compositor that defers
+  // the modifier events past the bind reply (most don't, but some
+  // batch them). After this returns the modifier table is fully
+  // populated and frozen for the process lifetime.
+  if (globals.dmabuf && wl_display_roundtrip_queue(display, queue) < 0) {
+    std::fprintf(stderr,
+                 "[ghastty] wayland: discoverGlobals roundtrip 2 failed; "
+                 "modifier table is incomplete — disabling dmabuf path\n");
+    // Drop whatever modifier entries we did get. A partially-
+    // populated table is dangerous: presentDmabuf would treat it
+    // as authoritative, hand a "supported" modifier to the
+    // compositor that the compositor may actually not accept, and
+    // the resulting `invalid_format` is a FATAL protocol error
+    // that kills the entire wl_display. Falling back to QImage
+    // path (modifiers map empty → tryCreate's checks fail / the
+    // Vulkan renderer drops to legacy_copy mode) is much safer.
+    // The dmabuf proxy itself is not destroyed here; only the
+    // pointer to it is cleared.
+    globals.modifiers.clear();
+    globals.dmabuf = nullptr;
+  }
+
+  std::size_t total_mods = 0;
+  for (const auto &kv : globals.modifiers) total_mods += kv.second.size();
+  std::fprintf(stderr,
+               "[ghastty] wayland: discovered %zu dmabuf (format,modifier) "
+               "pairs across %zu formats\n",
+               total_mods, globals.modifiers.size());
+
+  // Move the bound proxies back to the default queue so Qt's main
+  // dispatch drives subsequent events on them, then drop the private
+  // queue. (Same lifecycle dance as `blurManager`.)
+  //
+  // EXCEPT the dmabuf proxy: its listener mutates `globals.modifiers`
+  // on every `modifier` event, and the renderer thread reads that
+  // map from `supportedDmabufModifiers` without locking. If we
+  // moved the proxy back to the default queue, a compositor
+  // restart / hot-plug fires more `modifier` events that would
+  // race the reader. Keep the proxy on `queue` and intentionally
+  // never dispatch that queue again — the events queue up
+  // harmlessly and are reaped at proxy destruction. The map is
+  // genuinely frozen post-discovery now.
+  if (globals.compositor)
+    wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(globals.compositor),
+                       nullptr);
+  if (globals.subcompositor)
+    wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(globals.subcompositor),
+                       nullptr);
+  if (globals.viewporter)
+    wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(globals.viewporter),
+                       nullptr);
+  if (globals.fractionalScale)
+    wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(globals.fractionalScale),
+                       nullptr);
+  // We deliberately leak `queue` (and leave globals.dmabuf attached
+  // to it) for the process lifetime — it has no resources beyond a
+  // small kernel-side buffer and going away would put dmabuf events
+  // back on the default queue.
+
+  return &globals;
+}
+
+// Fetch the wl_display Qt is connected to. Returns nullptr when
+// the platform plugin's name does not start with "wayland", when
+// no native interface is available, or when the lookup itself
+// yields nothing.
+wl_display *acquireWaylandDisplay() {
+  const bool onWayland =
+      QGuiApplication::platformName().startsWith(QLatin1String("wayland"));
+  if (!onWayland) return nullptr;
+
+  QPlatformNativeInterface *native = QGuiApplication::platformNativeInterface();
+  if (native == nullptr) return nullptr;
+
+  void *raw = native->nativeResourceForIntegration("wl_display");
+  return static_cast<wl_display *>(raw);
+}
+
+// wl_buffer.release handler: the compositor has finished sampling
+// the buffer for any committed surface state. Nothing happens
+// here on purpose — the wl_buffer is kept alive across releases
+// because libghostty re-uses the same dmabuf fd from frame to
+// frame until a resize, and the presenter re-attaches its cached
+// wl_buffer on every present (see `m_cachedBuffer` in the header).
+// The buffer goes away only when the dmabuf shape changes (the
+// next `presentDmabuf` invalidates the cache) or when the
+// presenter itself is destroyed.
+//
+// The dmabuf memory belongs to libghostty and its fd is never
+// closed from here; the SCM_RIGHTS transfer performed by
+// zwp_linux_buffer_params.add gave the compositor a reference of
+// its own that lives independently of our wl_buffer.
+void bufferRelease(void *, wl_buffer *) {
+}
+
+// Listener table for cached wl_buffers (single entry: release).
+const wl_buffer_listener kBufferListener = {
+    &bufferRelease,
+};
+
+// wl_callback.done handler for compositor-paced presents. `data`
+// is the owning SubsurfacePresenter; the handler forwards the
+// spent callback to its onFrameCallbackDone, which is responsible
+// for the proxy from there on.
+//
+// A null `data` is not expected — Wayland delivers no events for
+// a destroyed proxy, and the presenter destroys its pending
+// callback in its destructor — but if it ever happens, just
+// destroy the callback rather than dereference a null presenter.
+void frameCallbackDone(void *data, wl_callback *cb, uint32_t /*time*/) {
+  if (auto *presenter = static_cast<wayland::SubsurfacePresenter *>(data)) {
+    presenter->onFrameCallbackDone(cb);
+    return;
+  }
+  wl_callback_destroy(cb);
+}
+
+// Listener table for frame callbacks (single entry: done).
+const wl_callback_listener kFrameCallbackListener = {
+    &frameCallbackDone,
+};
+
+} // namespace
+
+// Run global discovery now, on the calling thread, if a Wayland
+// display is available; otherwise do nothing. The discovery result
+// is intentionally discarded — readers go through
+// supportedDmabufModifiers.
+void primeDmabufModifierRegistry() {
+  wl_display *display = acquireWaylandDisplay();
+  if (display == nullptr) return;
+  static_cast<void>(discoverGlobals(display));
+}
+
+// Report the modifiers cached for `drm_format`.
+//
+// Returns 0 when discovery has not been attempted yet or the
+// format has no entry. With a null `out` or zero `capacity`
+// nothing is written and the total count is returned. Otherwise
+// up to `capacity` modifiers are copied into `out` and the number
+// copied is returned.
+std::size_t supportedDmabufModifiers(std::uint32_t drm_format,
+                                     std::uint64_t *out,
+                                     std::size_t capacity) {
+  const PresenterGlobals &globals = globalState();
+  if (!globals.searched) return 0;
+
+  const auto entry = globals.modifiers.find(drm_format);
+  if (entry == globals.modifiers.end()) return 0;
+
+  const std::vector<uint64_t> &list = entry->second;
+  const bool countOnly = (out == nullptr) || (capacity == 0);
+  if (countOnly) return list.size();
+
+  const std::size_t toCopy = std::min(list.size(), capacity);
+  std::copy_n(list.begin(), toCopy, out);
+  return toCopy;
+}
+
+// Create a presenter whose child wl_surface is a subsurface of
+// `topLevel`'s native wl_surface.
+//
+// Returns nullptr when `topLevel` is null, the QPA is not Wayland,
+// the native interface / wl_display / parent wl_surface is
+// unavailable, a required global (wl_compositor, wl_subcompositor,
+// zwp_linux_dmabuf_v1, wp_viewporter) is missing, a Wayland object
+// cannot be created, or the display reports an error after setup.
+// Most of those paths log the reason to stderr. Every Wayland
+// object created before a failure is destroyed before returning.
+//
+// On success the returned presenter is handed the display, child
+// surface, subsurface, dmabuf global, viewport, and the optional
+// fractional-scale object (nullptr when that global is missing).
+std::unique_ptr<SubsurfacePresenter>
+SubsurfacePresenter::tryCreate(QWindow *topLevel) {
+  if (!topLevel) return nullptr;
+
+  if (!QGuiApplication::platformName().startsWith(QLatin1String("wayland"))) {
+    std::fprintf(stderr,
+                 "[ghastty] SubsurfacePresenter: not on Wayland QPA\n");
+    return nullptr;
+  }
+
+  QPlatformNativeInterface *native = QGuiApplication::platformNativeInterface();
+  if (!native) return nullptr;
+
+  auto *display = static_cast<wl_display *>(
+      native->nativeResourceForIntegration("wl_display"));
+  auto *parentSurface = static_cast<wl_surface *>(
+      native->nativeResourceForWindow("surface", topLevel));
+  if (!display || !parentSurface) {
+    std::fprintf(stderr,
+                 "[ghastty] SubsurfacePresenter: missing wl_display or "
+                 "parent wl_surface (display=%p surface=%p)\n",
+                 static_cast<void *>(display),
+                 static_cast<void *>(parentSurface));
+    return nullptr;
+  }
+
+  PresenterGlobals *g = discoverGlobals(display);
+  if (!g->compositor || !g->subcompositor || !g->dmabuf || !g->viewporter) {
+    std::fprintf(
+        stderr,
+        "[ghastty] SubsurfacePresenter: compositor missing required globals "
+        "(compositor=%p subcompositor=%p dmabuf=%p viewporter=%p)\n",
+        static_cast<void *>(g->compositor),
+        static_cast<void *>(g->subcompositor), static_cast<void *>(g->dmabuf),
+        static_cast<void *>(g->viewporter));
+    return nullptr;
+  }
+  // wp_fractional_scale_manager_v1 is optional — if missing we
+  // assume integer scale 1.0 and let wp_viewport.set_destination
+  // still do its job. Most modern compositors support it.
+
+  wl_surface *child = wl_compositor_create_surface(g->compositor);
+  if (!child) return nullptr;
+
+  wl_subsurface *sub =
+      wl_subcompositor_get_subsurface(g->subcompositor, child, parentSurface);
+  if (!sub) {
+    wl_surface_destroy(child);
+    return nullptr;
+  }
+
+  // Sync mode (the wl_subsurface default): wl_surface.commit on
+  // the child caches state until the parent commits, at which point
+  // both apply atomically. This is what guarantees lockstep resize
+  // behavior — parent grows to the new size and our matching
+  // new-size buffer apply in the same compositor frame, no gap.
+  //
+  // Sync mode requires the parent to commit for our state to
+  // apply. Qt's backing-store flush doesn't fire for our
+  // translucent QWidget (paintEvent produces no damage), so
+  // GhosttySurface forces the parent commit explicitly via
+  // QtWaylandClient::QWaylandWindow::commit() (Qt6::WaylandClient-
+  // Private) after every child commit + viewport update. See
+  // `forceParentCommit` in GhosttySurface.cpp.
+  //
+  // The earlier desync-mode attempt avoided the Qt-private
+  // dependency but couldn't deliver lockstep resize because the
+  // two surfaces commit independently in that mode.
+
+  // Initial subsurface position: (0,0) in parent-surface coords.
+  // GhosttySurface immediately calls setPosition after tryCreate
+  // returns with the pane's real offset within the top-level (and
+  // updates it on every moveEvent / resizeEvent).
+  wl_subsurface_set_position(sub, 0, 0);
+
+  // Stack the subsurface BELOW the parent so Qt's child widgets
+  // (SearchBar, overlays, scrollbar, exit/health/link/resize hints)
+  // remain visible — they're painted into the parent's backing
+  // store, and Wayland's default subsurface stacking is *above*
+  // parent which would hide all of them. With place_below the
+  // parent QWidget renders on top; WA_TranslucentBackground means
+  // the terminal area of the parent is transparent so the
+  // subsurface shows through, while the chrome painted by
+  // paintEvent stays visible on top.
+  wl_subsurface_place_below(sub, parentSurface);
+
+  // Set an empty input region so pointer/touch events fall through
+  // to the parent surface (Qt's QWindow). The default input region
+  // is the whole attached buffer, which would mean our subsurface
+  // captures every click in the terminal area — Qt's QWidget would
+  // never see contextMenuEvent (right-click menu), mouse press/
+  // release, or any other pointer event in the terminal. wl_region
+  // with no add_rectangle calls = empty = "no input." The region
+  // can be destroyed immediately after set_input_region; the
+  // compositor copies its state into the surface's pending state.
+  wl_region *empty = wl_compositor_create_region(g->compositor);
+  if (empty) {
+    wl_surface_set_input_region(child, empty);
+    wl_region_destroy(empty);
+  }
+
+  // wp_viewport: per-surface object that lets us tell the compositor
+  // the destination size in surface-local coords, independent of
+  // the buffer's pixel dimensions. With fractional scaling we
+  // render at, say, 960x720 device pixels into an 800x600 surface
+  // area, and the viewport handles the mapping.
+  wp_viewport *viewport =
+      wp_viewporter_get_viewport(g->viewporter, child);
+  if (!viewport) {
+    wl_subsurface_destroy(sub);
+    wl_surface_destroy(child);
+    return nullptr;
+  }
+
+  // wp_fractional_scale_v1: subscribe to the compositor's
+  // per-surface preferred scale. Optional — if the global is
+  // missing we stick with default 120 (= 1.0×).
+  wp_fractional_scale_v1 *frac_scale = nullptr;
+  if (g->fractionalScale) {
+    frac_scale = wp_fractional_scale_manager_v1_get_fractional_scale(
+        g->fractionalScale, child);
+  }
+
+  // Push the creation requests out, then bail (undoing everything
+  // created above) if the display has already recorded an error.
+  wl_display_flush(display);
+  if (int err = wl_display_get_error(display); err != 0) {
+    std::fprintf(stderr,
+                 "[ghastty] SubsurfacePresenter: wl_display error %d after "
+                 "subsurface creation\n",
+                 err);
+    if (frac_scale) wp_fractional_scale_v1_destroy(frac_scale);
+    wp_viewport_destroy(viewport);
+    wl_subsurface_destroy(sub);
+    wl_surface_destroy(child);
+    return nullptr;
+  }
+
+  std::fprintf(stderr,
+               "[ghastty] SubsurfacePresenter: ready (parent=%p child=%p "
+               "sub=%p dmabuf=%p viewport=%p frac_scale=%p)\n",
+               static_cast<void *>(parentSurface), static_cast<void *>(child),
+               static_cast<void *>(sub), static_cast<void *>(g->dmabuf),
+               static_cast<void *>(viewport),
+               static_cast<void *>(frac_scale));
+
+  return std::unique_ptr<SubsurfacePresenter>(new SubsurfacePresenter(
+      display, child, sub, g->dmabuf, viewport, frac_scale));
+}
+
+// Listener table for the per-surface fractional-scale object
+// (single entry: preferred_scale).
+const wp_fractional_scale_v1_listener kFractionalScaleListener = {
+    &SubsurfacePresenter::onPreferredScale,
+};
+
+// wp_fractional_scale_v1.preferred_scale handler. `scale` is in
+// 120ths (120 = 1.0×). A zero scale is ignored as a compositor
+// bug; a value equal to the stored one is ignored silently; any
+// other value is logged and stored on the presenter passed as
+// `data`.
+void SubsurfacePresenter::onPreferredScale(void *data,
+                                            wp_fractional_scale_v1 *,
+                                            uint32_t scale) {
+  if (scale == 0) return;
+
+  auto *presenter = static_cast<SubsurfacePresenter *>(data);
+  if (presenter->m_preferredScale120 == scale) return;
+
+  std::fprintf(stderr,
+               "[ghastty] SubsurfacePresenter: preferred scale %u/120 = "
+               "%.3f\n",
+               scale, static_cast<double>(scale) / 120.0);
+  presenter->m_preferredScale120 = scale;
+}
+
+// Private constructor: stores the already-created Wayland objects
+// handed over by the factory. The destructor destroys `child`, `sub`,
+// `viewport` and `frac_scale`; `display` and `dmabuf` are only used,
+// never destroyed, by this class.
+//
+// `frac_scale` may be null (the fractional-scale object is optional);
+// when present, the preferred-scale listener is attached here with
+// `this` as its data pointer.
+SubsurfacePresenter::SubsurfacePresenter(wl_display *display, wl_surface *child,
+                                         wl_subsurface *sub,
+                                         zwp_linux_dmabuf_v1 *dmabuf,
+                                         wp_viewport *viewport,
+                                         wp_fractional_scale_v1 *frac_scale)
+    : m_display(display),
+      m_childSurface(child),
+      m_subsurface(sub),
+      m_dmabuf(dmabuf),
+      m_viewport(viewport),
+      m_fractionalScale(frac_scale) {
+  if (m_fractionalScale) {
+    wp_fractional_scale_v1_add_listener(m_fractionalScale,
+                                         &kFractionalScaleListener, this);
+  }
+}
+
+// Tears down every Wayland object this presenter created, in an order
+// that keeps dependent objects alive until their dependents are gone:
+// frame callback, cached buffer, fractional-scale object, viewport,
+// subsurface role, child surface — then flushes the display.
+SubsurfacePresenter::~SubsurfacePresenter() {
+  // A pending frame callback carries `this` as its listener data.
+  // Destroying the proxy first means later dispatches of the event
+  // queue cannot deliver its done event (Wayland guarantees no events
+  // on a destroyed proxy), so the soon-to-dangle pointer never fires.
+  if (m_frameCallback != nullptr) {
+    wl_callback_destroy(m_frameCallback);
+    m_frameCallback = nullptr;
+  }
+
+  // The cached wl_buffer may still be attached to the child surface,
+  // so it is destroyed BEFORE that surface. This is safe whether or
+  // not the compositor has released the buffer (again: no further
+  // events arrive on a destroyed proxy).
+  if (m_cachedBuffer != nullptr) {
+    wl_buffer_destroy(m_cachedBuffer);
+    m_cachedBuffer = nullptr;
+  }
+
+  if (m_fractionalScale != nullptr) {
+    wp_fractional_scale_v1_destroy(m_fractionalScale);
+  }
+  if (m_viewport != nullptr) {
+    wp_viewport_destroy(m_viewport);
+  }
+  if (m_subsurface != nullptr) {
+    wl_subsurface_destroy(m_subsurface);
+  }
+  if (m_childSurface != nullptr) {
+    wl_surface_destroy(m_childSurface);
+  }
+
+  // Push the queued destroy requests out to the compositor.
+  if (m_display != nullptr) {
+    wl_display_flush(m_display);
+  }
+}
+
+// Handles the done event of a wl_surface.frame callback: releases the
+// spent proxy, frees the pending-callback slot, and tells the consumer
+// that the compositor is ready for another frame.
+void SubsurfacePresenter::onFrameCallbackDone(wl_callback *cb) {
+  // Only clear our slot when `cb` really is the callback we track. A
+  // mismatch shouldn't happen — the listener data routes to exactly
+  // this presenter and at most one callback is outstanding — but the
+  // check keeps future refactors from clobbering a newer callback.
+  const bool is_tracked = (m_frameCallback == cb);
+  if (is_tracked) {
+    m_frameCallback = nullptr;
+  }
+
+  // wl_callback is single-shot; once done has fired the proxy is spent.
+  wl_callback_destroy(cb);
+
+  // Notify the consumer (e.g. GhosttySurface). This runs on the thread
+  // that pumps Wayland events (the Qt GUI thread), so the consumer may
+  // touch GUI-thread state directly.
+  if (m_onFrameReady) {
+    m_onFrameReady();
+  }
+}
+
+// Present one dmabuf-backed frame on the child surface.
+//
+// Steps, in order:
+//   1. Validate the inputs: fd / required objects, dimensions, stride,
+//      and the (format, modifier) pair against the compositor's list.
+//   2. Look up the cached wl_buffer for this dma-buf, or create a new
+//      one via create_immed (destroying the stale cached buffer first).
+//   3. Update the viewport destination if it changed.
+//   4. Attach, damage the full buffer, request a frame callback if none
+//      is pending, commit, and flush the display.
+//
+// Every rejection returns before anything is attached or committed.
+// Most rejections are logged to stderr; the initial guard (negative fd
+// or missing dmabuf/surface/viewport) and a null create_params result
+// return silently. Note that a failure on the cache-miss path leaves
+// the cache empty, because the previous buffer has already been
+// destroyed by then.
+//
+// `dest_width` / `dest_height` values <= 0 are clamped to 1 rather
+// than rejected. The fd is only read here (fstat + params.add); this
+// function never closes it.
+void SubsurfacePresenter::presentDmabuf(int fd, uint32_t drm_format,
+                                        uint64_t drm_modifier, uint32_t width,
+                                        uint32_t height, uint32_t stride,
+                                        int dest_width, int dest_height,
+                                        bool y_invert) {
+  if (fd < 0 || !m_dmabuf || !m_childSurface || !m_viewport) return;
+  if (dest_width <= 0) dest_width = 1;
+  if (dest_height <= 0) dest_height = 1;
+
+  // System-boundary input validation. width/height/stride flow in
+  // from libghostty's renderer thread and are about to be passed
+  // verbatim to the compositor. linux-dmabuf-v1 protocol errors
+  // (`invalid_dimensions`, `invalid_format`, etc.) are FATAL — they
+  // tear down the entire wl_display, killing every window in the
+  // process. We MUST reject malformed inputs locally rather than
+  // letting the compositor do it.
+  //
+  // Specifically reject: zero dimensions or stride, or any value
+  // that would silently flip negative when cast to int32_t at the
+  // create_immed call below (the wayland C API takes signed ints
+  // for dimensions; uint32_t >= 2^31 wraps to negative).
+  constexpr uint32_t kMaxDim = static_cast<uint32_t>(INT32_MAX);
+  if (width == 0 || height == 0 || stride == 0 ||
+      width > kMaxDim || height > kMaxDim || stride > kMaxDim) {
+    std::fprintf(stderr,
+                 "[ghastty] SubsurfacePresenter: rejecting dmabuf with "
+                 "out-of-range dimensions (w=%u h=%u stride=%u)\n",
+                 width, height, stride);
+    return;
+  }
+  // Stride sanity: must be at least 4 bytes per pixel for
+  // 32-bit ARGB/XRGB/etc. — the only formats this presenter
+  // currently advertises support for. Tighter than the protocol's
+  // minimum but matches what the compositor will accept on attach.
+  // The comparison is done in 64 bits so `width * 4` cannot overflow.
+  if (stride < static_cast<uint64_t>(width) * 4) {
+    std::fprintf(stderr,
+                 "[ghastty] SubsurfacePresenter: rejecting dmabuf with "
+                 "stride=%u too small for width=%u (need >= %llu)\n",
+                 stride, width,
+                 static_cast<unsigned long long>(static_cast<uint64_t>(width) * 4));
+    return;
+  }
+
+  // Validate the (format, modifier) pair against the compositor's
+  // advertised list before handing it to `create_immed`. If the
+  // pair isn't on the list, the compositor will reject the
+  // subsequent `create_immed` with `invalid_format` — a FATAL
+  // protocol error that kills the entire wl_display, taking down
+  // every window in the process. Better to drop this single frame
+  // than to take down the app.
+  {
+    const PresenterGlobals &g = globalState();
+    const auto it = g.modifiers.find(drm_format);
+    bool ok = false;
+    if (it != g.modifiers.end()) {
+      for (const uint64_t m : it->second) {
+        if (m == drm_modifier) { ok = true; break; }
+      }
+    }
+    if (!ok) {
+      std::fprintf(stderr,
+                   "[ghastty] SubsurfacePresenter: refusing dmabuf "
+                   "(fourcc=0x%08x mod=0x%llx) — compositor doesn't "
+                   "advertise this (format, modifier) pair\n",
+                   drm_format,
+                   static_cast<unsigned long long>(drm_modifier));
+      return;
+    }
+  }
+
+  // Wrap libghostty's borrowed fd in a wl_buffer. Cached across
+  // frames by (kernel inode, shape) — see m_cachedInode in the
+  // header for the full rationale. fstat the dmabuf fd to get the
+  // anon_inode that uniquely identifies the dma-buf object; it's
+  // stable across the dup that GhosttySurface did before parking,
+  // and changes only when libghostty allocates a new Target.
+  // fstat failure (rare; e.g. a non-negative fd that has already
+  // been closed — the `fd < 0` guard above only rejects negative
+  // values) leaves `inode` at 0, which forces a cache miss below.
+  // create_immed will likely fail too in that case, but the error
+  // path there already logs cleanly.
+  struct stat st;
+  unsigned long inode = 0;
+  if (::fstat(fd, &st) == 0) inode = static_cast<unsigned long>(st.st_ino);
+  const bool cache_hit = m_cachedBuffer != nullptr &&
+                         inode != 0 &&
+                         m_cachedInode == inode &&
+                         m_cachedWidth == width &&
+                         m_cachedHeight == height &&
+                         m_cachedStride == stride &&
+                         m_cachedFormat == drm_format &&
+                         m_cachedModifier == drm_modifier &&
+                         m_cachedYInvert == y_invert;
+  wl_buffer *buffer = nullptr;
+  if (cache_hit) {
+    buffer = m_cachedBuffer;
+  } else {
+    // Cache miss — destroy any stale buffer first so a failed
+    // create_immed below leaves the cache empty (rather than half-
+    // populated with the previous buffer that no longer matches the
+    // new inputs).
+    if (m_cachedBuffer) {
+      wl_buffer_destroy(m_cachedBuffer);
+      m_cachedBuffer = nullptr;
+      m_cachedInode = 0;
+    }
+    zwp_linux_buffer_params_v1 *params =
+        zwp_linux_dmabuf_v1_create_params(m_dmabuf);
+    if (!params) return;
+    // Single-plane buffer at offset 0; the 64-bit modifier is passed
+    // as two 32-bit halves (high word first).
+    zwp_linux_buffer_params_v1_add(params, fd, /*plane_idx*/ 0,
+                                   /*offset*/ 0, stride,
+                                   static_cast<uint32_t>(drm_modifier >> 32),
+                                   static_cast<uint32_t>(drm_modifier & 0xFFFFFFFFu));
+    const uint32_t buffer_flags =
+        y_invert ? ZWP_LINUX_BUFFER_PARAMS_V1_FLAGS_Y_INVERT : 0;
+    buffer = zwp_linux_buffer_params_v1_create_immed(
+        params, static_cast<int32_t>(width), static_cast<int32_t>(height),
+        drm_format, buffer_flags);
+    // The params object is only needed to build the buffer.
+    zwp_linux_buffer_params_v1_destroy(params);
+    if (!buffer) {
+      // Surface the wl_display error code if the failure was a
+      // protocol-fatal error (compositor rejected the buffer with
+      // `invalid_format` / `invalid_dimensions` / etc., which kills
+      // the wl_display). Without this, every subsequent presentDmabuf
+      // call silently no-ops on the dead display and the cause stays
+      // hidden until something else logs the disconnection.
+      const int wl_err = wl_display_get_error(m_display);
+      std::fprintf(stderr,
+                   "[ghastty] SubsurfacePresenter: create_immed returned null "
+                   "(fd=%d %ux%u fmt=0x%x mod=0x%llx wl_display_error=%d)\n",
+                   fd, width, height, drm_format,
+                   static_cast<unsigned long long>(drm_modifier), wl_err);
+      return;
+    }
+    // Listener data is unused — see `bufferRelease` for why this is
+    // nullptr (and the no-op release semantics that make the cache
+    // safe).
+    wl_buffer_add_listener(buffer, &kBufferListener, nullptr);
+    // Record the full cache key alongside the new buffer.
+    m_cachedBuffer = buffer;
+    m_cachedInode = inode;
+    m_cachedWidth = width;
+    m_cachedHeight = height;
+    m_cachedStride = stride;
+    m_cachedFormat = drm_format;
+    m_cachedModifier = drm_modifier;
+    m_cachedYInvert = y_invert;
+  }
+
+  // Tell the compositor the destination size in surface-local
+  // coordinates. With fractional scaling this is the logical pixel
+  // size (e.g. 800x600) while the buffer is at device pixels (e.g.
+  // 960x720 for 1.2× DPR). wp_viewport handles the mapping;
+  // wl_surface.set_buffer_scale is intentionally NOT used here
+  // because (a) it only supports integer scales, and (b) when
+  // wp_fractional_scale_v1 is active the protocol forbids using
+  // set_buffer_scale to anything other than 1.
+  if (dest_width != m_lastDestWidth || dest_height != m_lastDestHeight) {
+    wp_viewport_set_destination(m_viewport, dest_width, dest_height);
+    m_lastDestWidth = dest_width;
+    m_lastDestHeight = dest_height;
+  }
+
+  wl_surface_attach(m_childSurface, buffer, 0, 0);
+  // Damage the full buffer extent — terminals tend to update large
+  // dirty rects anyway (cursor blink, scroll, repaint) so a precise
+  // damage region wouldn't save much, and `damage_buffer` (vs
+  // `damage`) uses buffer coordinates so it's resolution-correct.
+  wl_surface_damage_buffer(m_childSurface, 0, 0, static_cast<int32_t>(width),
+                           static_cast<int32_t>(height));
+  // Register a wl_surface.frame callback BEFORE the commit so the
+  // compositor knows we want to be paced. Only request a new one if
+  // none is outstanding — re-requesting before the prior fires would
+  // leak callbacks. The done handler clears m_frameCallback, so the
+  // next call here will register fresh.
+  if (!m_frameCallback) {
+    m_frameCallback = wl_surface_frame(m_childSurface);
+    if (m_frameCallback) {
+      wl_callback_add_listener(m_frameCallback, &kFrameCallbackListener,
+                               this);
+    }
+  }
+  wl_surface_commit(m_childSurface);
+
+  // Flush so the frame reaches the compositor now, then report (but do
+  // not otherwise act on) any display error that has been recorded.
+  wl_display_flush(m_display);
+  if (int err = wl_display_get_error(m_display); err != 0) {
+    std::fprintf(
+        stderr,
+        "[ghastty] SubsurfacePresenter: wl_display error %d after present\n",
+        err);
+  }
+}
+
+// Changes the viewport destination and commits the child surface
+// WITHOUT attaching a new buffer. Does nothing when the viewport or
+// child surface is missing, when either dimension is not positive, or
+// when the size equals the last one applied.
+void SubsurfacePresenter::resizeDestination(int dest_width, int dest_height) {
+  const bool have_objects = m_viewport && m_childSurface;
+  if (!have_objects) return;
+
+  const bool positive_size = dest_width > 0 && dest_height > 0;
+  if (!positive_size) return;
+
+  const bool size_changed =
+      dest_width != m_lastDestWidth || dest_height != m_lastDestHeight;
+  if (!size_changed) return;
+
+  // Remember what we are about to apply so repeat calls are no-ops.
+  m_lastDestWidth = dest_width;
+  m_lastDestHeight = dest_height;
+
+  // In desync mode the commit takes effect immediately and the
+  // compositor stretches the currently-attached buffer over the new
+  // destination extent. The next presentDmabuf replaces it with a
+  // properly-sized buffer; until then the subsurface covers the new
+  // area instead of leaving a transparent gap during the parent's
+  // resize commit.
+  wp_viewport_set_destination(m_viewport, dest_width, dest_height);
+  wl_surface_commit(m_childSurface);
+  wl_display_flush(m_display);
+}
+
+// Queues a new subsurface position. Does nothing when there is no
+// subsurface or when (x, y) matches the last position requested.
+void SubsurfacePresenter::setPosition(int x, int y) {
+  if (m_subsurface == nullptr) {
+    return;
+  }
+
+  const bool moved = (x != m_lastX) || (y != m_lastY);
+  if (!moved) {
+    return;
+  }
+
+  m_lastX = x;
+  m_lastY = y;
+  wl_subsurface_set_position(m_subsurface, x, y);
+
+  // Position is double-buffered on the parent surface, so it only
+  // lands once the caller triggers a parent commit (forceParentCommit
+  // on the GhosttySurface side). Flushing here just makes sure the
+  // request is already on the wire when that commit happens.
+  wl_display_flush(m_display);
+}
+
+// Makes the subsurface invisible by committing it with no buffer
+// attached. No-op when there is no child surface.
+void SubsurfacePresenter::hide() {
+  if (m_childSurface == nullptr) {
+    return;
+  }
+
+  // Attaching a null buffer means "no content": after this commit and
+  // a parent commit the subsurface contributes nothing to the
+  // compositor's frame. The parent commit is the caller's job
+  // (forceParentCommit on its side).
+  wl_buffer *const no_buffer = nullptr;
+  wl_surface_attach(m_childSurface, no_buffer, 0, 0);
+  wl_surface_commit(m_childSurface);
+  wl_display_flush(m_display);
+}
+
+// Flushes queued requests on the display, if there is one.
+void SubsurfacePresenter::flushDisplay() {
+  if (m_display == nullptr) {
+    return;
+  }
+  wl_display_flush(m_display);
+}
+
+// Re-attaches and commits the cached wl_buffer so the subsurface shows
+// its last frame again. Returns false (doing nothing) when there is no
+// child surface or no cached buffer; returns true otherwise.
+bool SubsurfacePresenter::reattachCached() {
+  const bool can_reattach = m_childSurface && m_cachedBuffer;
+  if (!can_reattach) {
+    return false;
+  }
+
+  // Re-show whatever was attached before `hide()`. The cached
+  // wl_buffer survives hide/show because the release listener no-ops
+  // (see `bufferRelease`). The dmabuf behind it is still alive:
+  // libghostty owns the underlying VkDeviceMemory until the next
+  // Target.deinit (resize), and dma-buf kernel ref-counting keeps the
+  // pages pinned regardless of our client-side state.
+  //
+  // The pixels may be one frame stale (whatever was rendered just
+  // before Hide). That beats a transparent gap while the renderer
+  // thread produces its first frame after Show — the parent surface
+  // has WA_TranslucentBackground, so without this the user would see
+  // straight through the window. The renderer's next frame replaces
+  // the stale content within DRAW_INTERVAL.
+  const auto damage_w = static_cast<int32_t>(m_cachedWidth);
+  const auto damage_h = static_cast<int32_t>(m_cachedHeight);
+  wl_surface_attach(m_childSurface, m_cachedBuffer, 0, 0);
+  wl_surface_damage_buffer(m_childSurface, 0, 0, damage_w, damage_h);
+
+  // Request a frame callback (unless one is already pending) so the
+  // consumer's pacing state machine also gets a "compositor is ready"
+  // event after this re-attach — otherwise a tab switch could leave
+  // m_compositorReady stuck false (a stale frame callback from the
+  // pre-Hide commit may have been discarded by the compositor on the
+  // NULL attach).
+  if (m_frameCallback == nullptr) {
+    m_frameCallback = wl_surface_frame(m_childSurface);
+    if (m_frameCallback != nullptr) {
+      wl_callback_add_listener(m_frameCallback, &kFrameCallbackListener,
+                               this);
+    }
+  }
+
+  wl_surface_commit(m_childSurface);
+  wl_display_flush(m_display);
+  return true;
+}
+
+} // namespace wayland
--- a/qt/src/wayland/SubsurfacePresenter.h
+++ b/qt/src/wayland/SubsurfacePresenter.h
@ -0,0 +1,240 @@
+// Wayland subsurface presenter for `GhosttySurface`.
+//
+// Owns one `wl_subsurface` parented to the `GhosttySurface`'s native
+// `wl_surface`, plus the `zwp_linux_dmabuf_v1` machinery for wrapping
+// libghostty's dmabuf fds in `wl_buffer`s and attaching them to that
+// subsurface. The compositor scans the buffers out directly — no
+// mmap, no memcpy, no QImage, no QPainter blit on the present path.
+//
+// The process-wide compositor modifier registry that used to share
+// this header now lives in `DmabufRegistry.h`. The implementations
+// share `globalState()` machinery in `SubsurfacePresenter.cpp` but
+// the API surfaces are disjoint: presenter is per-widget, registry
+// is process-wide and read-only.
+//
+// Wayland-only by project decision (the Qt frontend is Wayland-only;
+// see `feedback-qt-no-x11` memory). If the host isn't on a Wayland
+// QPA platform or the compositor lacks the required globals,
+// `tryCreate` returns nullptr — the caller decides whether that's a
+// fatal error.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <utility>
+
+struct wl_buffer;
+struct wl_callback;
+struct wl_display;
+struct wl_subsurface;
+struct wl_surface;
+struct zwp_linux_dmabuf_v1;
+struct wp_viewport;
+struct wp_fractional_scale_v1;
+class QWindow;
+
+namespace wayland {
+
+// Per-widget presenter that shows dmabuf-backed frames on a Wayland
+// subsurface. Instances are created only through `tryCreate` (the
+// constructor is private) and are non-copyable. The destructor
+// destroys the child surface, subsurface role, viewport,
+// fractional-scale object, cached wl_buffer and any pending frame
+// callback; the wl_display and the linux-dmabuf global are used but
+// not destroyed by this class.
+class SubsurfacePresenter {
+public:
+  // Build a subsurface parented to `topLevel`'s native `wl_surface`,
+  // and bind the linux-dmabuf-v1 global on the same display. Pass
+  // the TOP-LEVEL QWindow (e.g. `widget->window()->windowHandle()`)
+  // — NOT a per-widget native QWindow. We attach all panes/splits
+  // as siblings under the top-level surface and position each with
+  // `setPosition`, instead of giving each pane its own QWindow
+  // (which Qt's QSplitter-embedded child widgets don't handle
+  // cleanly: "QWidgetWindow must be a top level window" warning,
+  // and the result renders black).
+  //
+  // Returns nullptr if any prerequisite is missing (non-Wayland QPA,
+  // null `wl_display`, `wl_subcompositor` unbindable,
+  // `zwp_linux_dmabuf_v1` unbindable, etc.).
+  static std::unique_ptr<SubsurfacePresenter> tryCreate(QWindow *topLevel);
+
+  ~SubsurfacePresenter();
+
+  // Hand a dmabuf-backed frame to the compositor: wrap the fd in a
+  // `wl_buffer` via `zwp_linux_buffer_params_v1.create_immed`, attach
+  // to the subsurface, damage, commit. MUST be called on the Qt GUI
+  // thread (the thread that owns the wl_display dispatch); the
+  // renderer thread should marshal frames through a Qt-side queue.
+  //
+  // libghostty owns the fd; this method does not close it. The
+  // wayland client library duplicates the fd kernel-side via
+  // SCM_RIGHTS, so the compositor's reference survives even after
+  // libghostty reuses or closes its handle.
+  //
+  // `dest_width` / `dest_height` are the size of the subsurface in
+  // PARENT surface-local coordinates (i.e. logical pixels). For
+  // integer scales they match the buffer dimensions divided by the
+  // scale; for fractional scales they're independent (set via
+  // wp_viewport.set_destination, which decouples buffer dimensions
+  // from surface area).
+  //
+  // `y_invert` requests the compositor flip the buffer vertically
+  // when sampling. The OpenGL renderer's coordinate convention is
+  // bottom-left origin (Y up), but Wayland/DRM samples top-down —
+  // without the flag, GL frames render upside-down. Vulkan
+  // rasterizes Y-down by default and passes false.
+  void presentDmabuf(int fd, uint32_t drm_format, uint64_t drm_modifier,
+                     uint32_t width, uint32_t height, uint32_t stride,
+                     int dest_width, int dest_height,
+                     bool y_invert = false);
+
+  // Compositor-preferred fractional scale for this surface, in
+  // units of 1/120 (e.g. 144 = 1.2, 180 = 1.5, 240 = 2.0). Returns
+  // 120 (= 1.0) until the compositor sends its first
+  // wp_fractional_scale_v1.preferred_scale event for our surface.
+  //
+  // Currently INFORMATIONAL only: GhosttySurface uses Qt's
+  // devicePixelRatioF() for buffer sizing (which Qt derives from
+  // the same protocol on Wayland), so the two values agree at
+  // steady state. Exposed for diagnostics + a future direct-
+  // protocol path that bypasses Qt's DPR cache lag during a
+  // screen-change race.
+  uint32_t preferredScale120() const { return m_preferredScale120; }
+
+  // Stretch the existing subsurface buffer to a new destination
+  // size WITHOUT attaching a new buffer. Used at the *start* of a
+  // resize, before the renderer has produced a new-size frame:
+  // wp_viewport.set_destination is double-buffered on the child
+  // surface, so committing the child here in desync mode applies
+  // the new destination immediately and the compositor stretches
+  // the old buffer to fill it. Result: the parent surface can grow
+  // to its new size with the subsurface already covering the new
+  // area (briefly stretched), instead of leaving a one-frame
+  // transparent gap where the translucent parent shows through.
+  //
+  // The next presentDmabuf call (with the real new-size buffer)
+  // replaces the stretched content, ending the brief blur.
+  //
+  // Same pattern mpv's vo_dmabuf_wayland uses for its video
+  // subsurface during resize.
+  void resizeDestination(int dest_width, int dest_height);
+
+  // Update the subsurface position in parent-surface-local coords.
+  // For panes inside splits / tabs, position is the GhosttySurface
+  // widget's offset within the top-level (`mapTo(window(),
+  // QPoint(0,0))`). wl_subsurface.set_position is double-buffered
+  // on the *parent* surface — caller must trigger a parent commit
+  // (Qt's QtWaylandClient::QWaylandWindow::commit()) for the new
+  // position to apply. No-op if the position hasn't changed.
+  void setPosition(int x, int y);
+
+  // Detach the currently-attached buffer so the subsurface becomes
+  // invisible. Called when the owning GhosttySurface hides (tab
+  // switch) so the inactive pane's pixels don't ghost on top of
+  // whatever the active tab is showing in the same on-screen
+  // region. The next presentDmabuf call (or `reattachCached`, which
+  // re-attaches the cached buffer) makes the subsurface visible
+  // again.
+  void hide();
+
+  // Register a callback fired (on the GUI thread, via Wayland event
+  // queue dispatch) when the compositor signals it's ready for the
+  // next frame on this subsurface. Lets the caller pace presents at
+  // the compositor's refresh rate instead of unconditionally
+  // committing every renderer frame.
+  //
+  // The callback fires AT MOST ONCE per `presentDmabuf` /
+  // `reattachCached` call — the underlying `wl_surface.frame`
+  // request is single-shot per commit. After the callback fires,
+  // the next present's commit will register a new frame_callback.
+  //
+  // Passing an empty std::function clears the callback.
+  using OnFrameReady = std::function<void()>;
+  void setOnFrameReady(OnFrameReady cb) { m_onFrameReady = std::move(cb); }
+
+  // Flush the underlying wl_display to push any queued requests
+  // to the compositor. Useful after a forceParentCommit on the
+  // Qt side (which queues a parent wl_surface.commit but doesn't
+  // wl_display_flush), so the combined "child commit + parent
+  // commit" reach the compositor in one shot rather than racing
+  // Qt's next event-loop flush.
+  void flushDisplay();
+
+  // Re-attach + commit the most recently cached wl_buffer, if any.
+  // Called from `QEvent::Show` so a tab-switch / re-show sees the
+  // last frame immediately rather than a transparent area while
+  // the renderer thread spins up its first new frame. Without this,
+  // the parent surface paints through (WA_TranslucentBackground)
+  // and the user sees a flash of whatever is behind the window.
+  // Returns true if a cached buffer was actually re-attached;
+  // false if the cache was empty (first show — caller is
+  // responsible for the new-tab flash mitigation if needed).
+  bool reattachCached();
+
+  // Called from the wp_fractional_scale_v1.preferred_scale event.
+  // Public so the C-style listener struct at file scope in the .cpp
+  // can name it; not part of the API for other call sites.
+  // `data` must be the owning SubsurfacePresenter; a `scale` of 0 is
+  // ignored.
+  static void onPreferredScale(void *data, wp_fractional_scale_v1 *,
+                                uint32_t scale);
+
+  // wl_callback::done dispatch from the file-scope listener. Public
+  // for the same reason as onPreferredScale: C-style Wayland
+  // listeners need a static-callable entry point and we route the
+  // result back into the owning presenter via the listener's `data`
+  // pointer. Destroys the callback proxy, clears m_frameCallback,
+  // and invokes m_onFrameReady if set. Not part of the API for
+  // other call sites.
+  void onFrameCallbackDone(wl_callback *cb);
+
+  // Non-copyable: the presenter destroys the Wayland proxies it holds,
+  // so a copy would destroy them twice.
+  SubsurfacePresenter(const SubsurfacePresenter &) = delete;
+  SubsurfacePresenter &operator=(const SubsurfacePresenter &) = delete;
+
+private:
+  // Stores the already-created objects; only `tryCreate` calls this.
+  // `frac_scale` may be null when no fractional-scale object exists.
+  SubsurfacePresenter(wl_display *display, wl_surface *child,
+                      wl_subsurface *sub, zwp_linux_dmabuf_v1 *dmabuf,
+                      wp_viewport *viewport,
+                      wp_fractional_scale_v1 *frac_scale);
+
+  // Wayland objects. The destructor destroys m_childSurface,
+  // m_subsurface, m_viewport and m_fractionalScale; m_display and
+  // m_dmabuf are used but never destroyed here.
+  wl_display *m_display;
+  wl_surface *m_childSurface;
+  wl_subsurface *m_subsurface;
+  zwp_linux_dmabuf_v1 *m_dmabuf;
+  wp_viewport *m_viewport;
+  wp_fractional_scale_v1 *m_fractionalScale;
+  uint32_t m_preferredScale120 = 120; // default: 1.0×
+  // Last destination size / position sent to the compositor; used to
+  // skip redundant set_destination / set_position requests.
+  int m_lastDestWidth = 0;
+  int m_lastDestHeight = 0;
+  int m_lastX = 0;
+  int m_lastY = 0;
+
+  // Pending wl_surface.frame callback for compositor-paced presents.
+  // Null between frame_done and the next commit that requests one
+  // (presentDmabuf or reattachCached); non-null from that request
+  // until frame_done. Single-shot — the done handler destroys it and
+  // clears the field, then invokes `m_onFrameReady` if set. The
+  // destructor also destroys a callback that is still pending.
+  wl_callback *m_frameCallback = nullptr;
+  OnFrameReady m_onFrameReady;
+
+  // wl_buffer cache keyed by dma-buf identity (kernel inode of the
+  // anon_inode backing the dma-buf, which is unique per Target
+  // regardless of fd-number reuse) plus the layout-relevant shape.
+  // libghostty re-uses the same dmabuf across frames until the
+  // next Target.deinit (resize); cache hits skip the create_immed
+  // round-trip + compositor-side dmabuf import that dominated
+  // GUI-thread CPU at 125 FPS.
+  //
+  // We can't key on the caller's fd value because GhosttySurface
+  // now dups the fd on the renderer thread (to outlive libghostty's
+  // close — see 22713b0d3) so the value is fresh per frame. Inode
+  // identity is stable across our dup AND across libghostty's
+  // close → reopen cycles, so cache invalidation matches Target
+  // identity exactly: same Target → same inode → cache hit; new
+  // Target → new inode → cache miss → recreate.
+  //
+  // Cache only stores the wl_buffer; the compositor SCM_RIGHTS-
+  // dup'd the fd into its own address space at create_immed time,
+  // so the cached wl_buffer doesn't need our fd to outlive the
+  // call. The caller owns + closes its own dup.
+  wl_buffer *m_cachedBuffer = nullptr;
+  unsigned long m_cachedInode = 0;  // 0 = empty cache (anon_inode ino > 0)
+  uint32_t m_cachedWidth = 0;
+  uint32_t m_cachedHeight = 0;
+  uint32_t m_cachedStride = 0;
+  uint32_t m_cachedFormat = 0;
+  uint64_t m_cachedModifier = 0;
+  bool m_cachedYInvert = false;
+};
+
+} // namespace wayland
--- a/src/apprt/embedded.zig
+++ b/src/apprt/embedded.zig
@ -353,6 +353,7 @@ pub const Platform = union(PlatformTag) {
    macos: MacOS,
    ios: IOS,
    opengl: OpenGL,
+    vulkan: Vulkan,

    // If our build target for libghostty is not darwin then we do
    // not include macos support at all.
@ -395,6 +396,70 @@ pub const Platform = union(PlatformTag) {
        present: *const fn (?*anyopaque) callconv(.c) void,
    };

+    /// Configuration for a host that owns a Vulkan device libghostty
+    /// should render against (fork-only). The host owns the
+    /// VkInstance / VkPhysicalDevice / VkDevice / VkQueue — same
+    /// ownership model as `OpenGL` above. Frames are handed back to
+    /// the host as dmabuf file descriptors so the host can sample
+    /// them without a CPU readback.
+    ///
+    /// Handles are `?*anyopaque` here so callers don't need Vulkan
+    /// headers to compile against the C API; treat them as VkInstance,
+    /// VkPhysicalDevice, VkDevice, VkQueue respectively.
+    ///
+    /// Every callback is a non-optional function pointer in this
+    /// struct; the nullable C ABI form lives in `C.vulkan`.
+    pub const Vulkan = struct {
+        /// Opaque host pointer. Every callback below takes a
+        /// `?*anyopaque` as its first parameter, which is where this
+        /// value is passed.
+        userdata: ?*anyopaque,
+
+        /// Resolve `vkGetInstanceProcAddr` (returned as `?*anyopaque`).
+        /// libghostty bootstraps the rest of the Vulkan loader from it.
+        /// The second parameter is a NUL-terminated name.
+        get_instance_proc_addr: *const fn (
+            ?*anyopaque,
+            [*:0]const u8,
+        ) callconv(.c) ?*anyopaque,
+
+        /// Host-owned Vulkan handles. libghostty does not destroy
+        /// these.
+        instance: *const fn (?*anyopaque) callconv(.c) ?*anyopaque,
+        /// Returns the host's VkPhysicalDevice (host-owned).
+        physical_device: *const fn (?*anyopaque) callconv(.c) ?*anyopaque,
+        /// Returns the host's VkDevice (host-owned).
+        device: *const fn (?*anyopaque) callconv(.c) ?*anyopaque,
+        /// Returns the host's VkQueue (host-owned).
+        queue: *const fn (?*anyopaque) callconv(.c) ?*anyopaque,
+        /// Returns a queue family index — presumably the family that
+        /// `queue` belongs to; confirm against `include/ghostty.h`.
+        queue_family_index: *const fn (?*anyopaque) callconv(.c) u32,
+
+        /// Query the compositor-supported DRM modifiers for a given
+        /// DRM_FORMAT_* fourcc. Two-pass usage: call with
+        /// `out=null, capacity=0` for the count, then again with a
+        /// buffer of that size. Returns the number of modifiers
+        /// actually written. The renderer intersects this with the
+        /// GPU's per-modifier feature set to pick a tiling the
+        /// compositor will accept on attach.
+        get_supported_modifiers: *const fn (
+            ?*anyopaque,
+            u32, // DRM_FORMAT_*
+            ?[*]u64, // out
+            usize, // capacity
+        ) callconv(.c) usize,
+
+        /// Hand off a rendered frame to the host as a dmabuf fd. The
+        /// host imports it for composition; libghostty retains
+        /// ownership of the underlying VkDeviceMemory and the fd is
+        /// valid only for the duration of the call (host must `dup()`
+        /// if it needs to hold the fd longer). `image_backed` tells
+        /// the host whether the fd was exported from a VkImage
+        /// (directly importable as a 2D image via linux-dmabuf-v1)
+        /// or from a VkBuffer (only usable via mmap + CPU readback);
+        /// see `vulkan/Target.zig` and `include/ghostty.h` for the
+        /// full rationale.
+        present: *const fn (
+            ?*anyopaque,
+            i32, // dmabuf fd
+            u32, // DRM_FORMAT_*
+            u64, // DRM modifier
+            u32, // width (pixels)
+            u32, // height (pixels)
+            u32, // stride (bytes)
+            bool, // image_backed
+        ) callconv(.c) void,
+    };
+
    // The C ABI compatible version of this union. The tag is expected
    // to be stored elsewhere.
    pub const C = extern union {
@ -416,6 +481,35 @@ pub const Platform = union(PlatformTag) {
            release_current: ?*const fn (?*anyopaque) callconv(.c) void,
            present: ?*const fn (?*anyopaque) callconv(.c) void,
        },
+
+        /// C ABI form of `Vulkan`: the same fields in the same order,
+        /// but every callback is an optional pointer so that a null
+        /// supplied by the host can be detected and rejected in
+        /// `init` rather than stored as a non-null function pointer.
+        vulkan: extern struct {
+            userdata: ?*anyopaque,
+            get_instance_proc_addr: ?*const fn (
+                ?*anyopaque,
+                [*:0]const u8,
+            ) callconv(.c) ?*anyopaque,
+            instance: ?*const fn (?*anyopaque) callconv(.c) ?*anyopaque,
+            physical_device: ?*const fn (?*anyopaque) callconv(.c) ?*anyopaque,
+            device: ?*const fn (?*anyopaque) callconv(.c) ?*anyopaque,
+            queue: ?*const fn (?*anyopaque) callconv(.c) ?*anyopaque,
+            queue_family_index: ?*const fn (?*anyopaque) callconv(.c) u32,
+            // Parameters: userdata, DRM_FORMAT_* fourcc, out buffer,
+            // capacity. Returns the number of modifiers written.
+            get_supported_modifiers: ?*const fn (
+                ?*anyopaque,
+                u32,
+                ?[*]u64,
+                usize,
+            ) callconv(.c) usize,
+            // Parameters: userdata, dmabuf fd, DRM_FORMAT_* fourcc,
+            // DRM modifier, width (pixels), height (pixels),
+            // stride (bytes), image_backed.
+            present: ?*const fn (
+                ?*anyopaque,
+                i32,
+                u32,
+                u64,
+                u32,
+                u32,
+                u32,
+                bool,
+            ) callconv(.c) void,
+        },
    };

    /// Initialize a Platform a tag and configuration from the C ABI.
@ -450,6 +544,47 @@ pub const Platform = union(PlatformTag) {
                        break :opengl error.PresentMustBeSet,
                } };
            },
+
+            .vulkan => vulkan: {
+                const config = c_platform.vulkan;
+                // Collapse the eight per-callback "MustBeSet"
+                // variants into a single `error.MissingVulkanCallback`.
+                // Previously, every caller of `Platform.init` had to
+                // handle eight separate error tags (or propagate them
+                // with `try`) — eight names that all mean "the host
+                // didn't fill out one of these fields." We log the
+                // first null field for diagnostics; the error tag
+                // itself stays narrow.
+                const which: ?[]const u8 = blk: {
+                    if (config.get_instance_proc_addr == null) break :blk "get_instance_proc_addr";
+                    if (config.instance == null) break :blk "instance";
+                    if (config.physical_device == null) break :blk "physical_device";
+                    if (config.device == null) break :blk "device";
+                    if (config.queue == null) break :blk "queue";
+                    if (config.queue_family_index == null) break :blk "queue_family_index";
+                    if (config.get_supported_modifiers == null) break :blk "get_supported_modifiers";
+                    if (config.present == null) break :blk "present";
+                    break :blk null;
+                };
+                if (which) |name| {
+                    std.log.scoped(.embedded).err(
+                        "ghostty_platform_vulkan_s.{s} is null",
+                        .{name},
+                    );
+                    break :vulkan error.MissingVulkanCallback;
+                }
+                break :vulkan .{ .vulkan = .{
+                    .userdata = config.userdata,
+                    .get_instance_proc_addr = config.get_instance_proc_addr.?,
+                    .instance = config.instance.?,
+                    .physical_device = config.physical_device.?,
+                    .device = config.device.?,
+                    .queue = config.queue.?,
+                    .queue_family_index = config.queue_family_index.?,
+                    .get_supported_modifiers = config.get_supported_modifiers.?,
+                    .present = config.present.?,
+                } };
+            },
        };
    }
 };
@ -461,6 +596,8 @@ pub const PlatformTag = enum(c_int) {
    macos = 1,
    ios = 2,
    opengl = 3,
+    // Fork-only platform tag for hosts that drive `src/renderer/Vulkan.zig`.
+    vulkan = 4,
 };

 pub const EnvVar = extern struct {
@ -538,6 +675,25 @@ pub const Surface = struct {
                .x = @floatCast(opts.scale_factor),
                .y = @floatCast(opts.scale_factor),
            },
+            // Initial surface size. Must be large enough for the
+            // terminal to have at least a few cols/rows by default,
+            // because the shell process is forked as part of
+            // Surface.init and the PTY's winsize is whatever this
+            // size translates to. Tools like fastfetch query winsize
+            // (TIOCGWINSZ) on startup and lay out their kitty-image
+            // escape codes based on what they see; if winsize reports
+            // 0 cols × 0 rows, fastfetch sends the image with c=0
+            // r=0, and `Placement.pixelSize` (graphics_storage.zig)
+            // returns the image's NATIVE pixel dimensions — visible
+            // to the user as a giant Kusanagi (or whatever logo)
+            // filling the whole pane. 800×600 was the historic
+            // default; restoring it. Race against a real wrong-size
+            // first frame coinciding with the widget's device-pixel
+            // size at a fractional DPR is handled separately by the
+            // host apprt sending its real size as early as possible
+            // (Qt: immediate ghostty_surface_set_size right after
+            // ghostty_surface_new, inheriting the parent surface's
+            // size for new tabs).
            .size = .{ .width = 800, .height = 600 },
            .cursor_pos = .{ .x = -1, .y = -1 },
        };
--- a/src/build/Config.zig
+++ b/src/build/Config.zig
@ -688,6 +688,14 @@ pub const ExeEntrypoint = enum {
    webgen_config,
    webgen_actions,
    webgen_commands,
+    /// Build-time tool: compiles one of the renderer's built-in
+    /// GLSL shaders to SPIR-V and writes the bytes to stdout.
+    /// Invoked by `src/build/VulkanSpv.zig` once per (shader, stage)
+    /// pair so libghostty can `@embedFile` the resulting .spv
+    /// instead of running glslang at runtime — eliminates the
+    /// per-process TPoolAllocator high-water-mark leak (~10 MB)
+    /// that the Vulkan path otherwise pays on first surface init.
+    vulkan_spvgen,
 };

 /// The release channel for the build.
--- a/src/build/SharedDeps.zig
+++ b/src/build/SharedDeps.zig
@ -8,6 +8,7 @@ const HelpStrings = @import("HelpStrings.zig");
 const MetallibStep = @import("MetallibStep.zig");
 const UnicodeTables = @import("UnicodeTables.zig");
 const GhosttyFrameData = @import("GhosttyFrameData.zig");
+const VulkanSpv = @import("VulkanSpv.zig");
 const DistResource = @import("GhosttyDist.zig").Resource;

 config: *const Config,
@ -18,6 +19,9 @@ metallib: ?*MetallibStep,
 unicode_tables: UnicodeTables,
 framedata: GhosttyFrameData,
 uucode_tables: std.Build.LazyPath,
+/// Vulkan-only: build-time SPIR-V blobs for the renderer's
+/// built-in shaders. Null on non-Vulkan builds.
+vulkan_spv: ?VulkanSpv,

 /// Used to keep track of a list of file sources.
 pub const LazyPathList = std.ArrayList(std.Build.LazyPath);
@ -37,6 +41,15 @@ pub fn init(b: *std.Build, cfg: *const Config) !SharedDeps {
        .unicode_tables = try .init(b, uucode_tables),
        .framedata = try .init(b),
        .uucode_tables = uucode_tables,
+        // Vulkan-only build artifact: precompiled SPV blobs for
+        // the renderer's built-in shaders. Skipping the build
+        // step entirely on non-Vulkan builds avoids paying for
+        // a host-target glslang link the OpenGL/Metal renderers
+        // would never use.
+        .vulkan_spv = if (cfg.renderer == .vulkan)
+            try VulkanSpv.init(b, cfg)
+        else
+            null,

        // Setup by retarget
        .options = undefined,
@ -452,6 +465,14 @@ pub fn add(
    if (b.lazyDependency("opengl", .{})) |dep| {
        step.root_module.addImport("opengl", dep.module("opengl"));
    }
+    // The Vulkan binding is only imported when the renderer is .vulkan
+    // (still in development — see `src/renderer/Vulkan.zig`). Linking
+    // libvulkan happens further down in this function.
+    if (self.config.renderer == .vulkan) {
+        if (b.lazyDependency("vulkan", .{})) |dep| {
+            step.root_module.addImport("vulkan", dep.module("vulkan"));
+        }
+    }
    if (b.lazyDependency("vaxis", .{})) |dep| {
        step.root_module.addImport("vaxis", dep.module("vaxis"));
    }
@ -600,6 +621,15 @@ pub fn add(
        });
    }

+    // Link the system Vulkan loader for the Vulkan renderer. The
+    // bindings themselves are in `pkg/vulkan` (added above as a Zig
+    // module). On Linux this resolves to libvulkan.so via the standard
+    // dynamic linker; Vulkan headers (`vulkan/vulkan.h`) come from the
+    // standard system include path (`vulkan-headers` package).
+    if (self.config.renderer == .vulkan) {
+        step.linkSystemLibrary2("vulkan", dynamic_link_opts);
+    }
+
    // If we're building an exe then we have additional dependencies.
    if (step.kind != .lib) {
        // When we're targeting flatpak we ALWAYS link GTK so we
@ -615,6 +645,7 @@ pub fn add(
    self.help_strings.addImport(step);
    self.unicode_tables.addImport(step);
    self.framedata.addImport(step);
+    if (self.vulkan_spv) |*v| v.addImport(step);

    return static_libs;
 }
--- a/src/build/VulkanSpv.zig
+++ b/src/build/VulkanSpv.zig
@ -0,0 +1,167 @@
+//! Build-time SPV precompile for the renderer's 9 built-in
+//! shaders. Builds a host-target executable from
+//! `src/vulkan_spvgen.zig` that takes (shader_name, stage) on
+//! argv and emits SPIR-V bytes on stdout, then runs it 9 times
+//! at build time and generates a `vulkan_spv.zig` module that
+//! exposes each blob as a `pub const X: []const u32` view over
+//! a u32-aligned copy of `@embedFile("X.spv")`.
+//!
+//! Why: see `src/vulkan_spvgen.zig` for the leak/perf rationale.
+//! Pre-compiling built-ins at build time lets the runtime call
+//! `Module.initFromSpirv` instead of `Module.init`, skipping
+//! glslang entirely on the per-process first-surface init that
+//! otherwise hits glslang's TLS TPoolAllocator and leaves
+//! ~10 MB of un-releasable pool pages.
+//!
+//! Mirrors `HelpStrings.zig`'s structure. Conditional: only
+//! constructed when the build is targeting the Vulkan renderer
+//! (caller gates this).
+
+const VulkanSpv = @This();
+
+const std = @import("std");
+const Config = @import("Config.zig");
+
+/// One (name, stage) entry per built-in shader (9 total). `init` passes
+/// both fields to the generator as argv and reuses `name` as the `.spv`
+/// file stem and the generated decl name. Keep in sync with the decls of
+/// `renderer.vulkan.shaders.source` and the `Module.init` call sites in `renderer/vulkan/shaders.zig::Shaders.init`.
+const Shader = struct { name: []const u8, stage: []const u8 };
+const shaders = [_]Shader{
+    .{ .name = "bg_color_frag", .stage = "fragment" },
+    .{ .name = "bg_image_frag", .stage = "fragment" },
+    .{ .name = "bg_image_vert", .stage = "vertex" },
+    .{ .name = "cell_bg_frag", .stage = "fragment" },
+    .{ .name = "cell_text_frag", .stage = "fragment" },
+    .{ .name = "cell_text_vert", .stage = "vertex" },
+    .{ .name = "full_screen_vert", .stage = "vertex" },
+    .{ .name = "image_frag", .stage = "fragment" },
+    .{ .name = "image_vert", .stage = "vertex" },
+};
+
+/// Host-target executable; built once, run 9 times.
+exe: *std.Build.Step.Compile,
+
+/// LazyPath to the generated `vulkan_spv.zig` module.
+output: std.Build.LazyPath,
+
+/// Build the host-side `vulkan_spvgen` tool and wire up one run of it
+/// per entry in `shaders`. Each run's captured stdout is stored as
+/// `<name>.spv` next to a generated `vulkan_spv.zig` that embeds it.
+///
+/// `cfg` is copied with `exe_entrypoint` forced to `.vulkan_spvgen`
+/// before being turned into the tool's `build_options`; the caller's
+/// config is not modified. Allocation failures and errors from
+/// `Config.addOptions` are propagated.
+pub fn init(b: *std.Build, cfg: *const Config) !VulkanSpv {
+    const exe = b.addExecutable(.{
+        .name = "vulkan_spvgen",
+        .root_module = b.createModule(.{
+            // Through main.zig so the exe_entrypoint switch
+            // resolves to vulkan_spvgen.zig. Matches the helpgen
+            // pattern (also root_source_file=main.zig + the
+            // entrypoint enum picks the actual main).
+            .root_source_file = b.path("src/main.zig"),
+            .target = b.graph.host,
+            // ReleaseFast is required: Debug mode produces
+            // R_X86_64_PC64 relocations when linking glslang's
+            // large static library that Zig's bundled linker
+            // can't handle. Release mode uses the small code
+            // model + system linker.
+            .optimize = .ReleaseFast,
+            .strip = false,
+            .omit_frame_pointer = false,
+            .unwind_tables = .sync,
+        }),
+    });
+
+    // Pin the entrypoint via build_options.
+    const spv_config = config: {
+        var copy = cfg.*;
+        copy.exe_entrypoint = .vulkan_spvgen;
+        break :config copy;
+    };
+    const options = b.addOptions();
+    try spv_config.addOptions(options);
+    exe.root_module.addOptions("build_options", options);
+
+    // Transitive imports the gen tool needs (mirrors what
+    // SharedDeps adds for the renderer build, but pinned to
+    // b.graph.host since this exe runs on the build machine).
+    if (b.lazyDependency("glslang", .{
+        .target = b.graph.host,
+        .optimize = .ReleaseFast,
+    })) |glslang_dep| {
+        exe.root_module.addImport("glslang", glslang_dep.module("glslang"));
+        exe.linkLibrary(glslang_dep.artifact("glslang"));
+    }
+    // `vulkan` is a header-only Zig module — its build.zig only
+    // calls `b.addModule(...)`, so it doesn't accept target /
+    // optimize args.
+    if (b.lazyDependency("vulkan", .{})) |vulkan_dep| {
+        exe.root_module.addImport("vulkan", vulkan_dep.module("vulkan"));
+    }
+
+    // Run the exe once per shader, capture stdout, drop the
+    // resulting bytes into a single WriteFiles directory under
+    // distinct .spv filenames. Also generate a .zig stub that
+    // exposes each blob as a `[]const u32` decl named after the
+    // shader (backed by an aligned copy of the @embedFile bytes)
+    // — that's what the renderer imports as "vulkan_spv".
+    //
+    // `wf` is a pointer to the step and is never reassigned, so
+    // it is `const`; the step itself is still mutated through it.
+    const wf = b.addWriteFiles();
+    var module_src: std.ArrayList(u8) = .empty;
+    defer module_src.deinit(b.allocator);
+    try module_src.appendSlice(b.allocator,
+        \\// AUTO-GENERATED by src/build/VulkanSpv.zig — do not edit.
+        \\// Re-run `zig build -Drenderer=vulkan` after editing any
+        \\// of the renderer's built-in GLSL shaders.
+        \\//
+        \\// Each shader is exposed as `[]const u32` directly. The
+        \\// underlying storage is a comptime-aligned u8 array
+        \\// (`align(@alignOf(u32))`) so the bytesAsSlice cast is
+        \\// safe — the previous `@alignCast` of an unaligned
+        \\// @embedFile slice was UB and caused subtle SPIR-V
+        \\// misinterpretation (images rendered at wrong size on
+        \\// NVIDIA, which accepted the misaligned data and treated
+        \\// it as a slightly different program). Module.initFromSpirv
+        \\// takes []const u32 directly so callers can use these
+        \\// decls without further casts.
+        \\
+        \\const std = @import("std");
+        \\
+        \\
+    );
+    for (shaders) |s| {
+        const run = b.addRunArtifact(exe);
+        run.addArgs(&.{ s.name, s.stage });
+        const captured = run.captureStdOut();
+        const file_name = b.fmt("{s}.spv", .{s.name});
+        _ = wf.addCopyFile(captured, file_name);
+        // Two declarations per shader:
+        //   - `<name>_raw` is the storage: a const array of u8
+        //     aligned to @alignOf(u32) (forces .rodata layout to
+        //     start on a 4-byte boundary, dereferences the
+        //     @embedFile pointer to put bytes inline).
+        //   - `<name>` is the public []const u32 view via
+        //     bytesAsSlice (which asserts the runtime pointer's
+        //     alignment matches the type's required alignment;
+        //     guaranteed by the align() on _raw).
+        try module_src.writer(b.allocator).print(
+            \\const {0s}_raw align(@alignOf(u32)) = @embedFile("{1s}").*;
+            \\pub const {0s}: []const u32 = std.mem.bytesAsSlice(u32, {0s}_raw[0..]);
+            \\
+        ,
+            .{ s.name, file_name },
+        );
+    }
+    const output = wf.add(
+        "vulkan_spv.zig",
+        try module_src.toOwnedSlice(b.allocator),
+    );
+
+    return .{
+        .exe = exe,
+        .output = output,
+    };
+}
+
+/// Make the generated module importable as `"vulkan_spv"` from
+/// `step`'s root module, and register `step` as depending on
+/// whatever produces the generated file.
+pub fn addImport(self: *const VulkanSpv, step: *std.Build.Step.Compile) void {
+    const generated = self.output;
+    // The generated file must exist before `step` compiles.
+    generated.addStepDependencies(&step.step);
+    const root = step.root_module;
+    root.addAnonymousImport("vulkan_spv", .{ .root_source_file = generated });
+}
--- a/src/main.zig
+++ b/src/main.zig
@ -10,6 +10,7 @@ const entrypoint = switch (build_config.exe_entrypoint) {
    .webgen_config => @import("build/webgen/main_config.zig"),
    .webgen_actions => @import("build/webgen/main_actions.zig"),
    .webgen_commands => @import("build/webgen/main_commands.zig"),
+    .vulkan_spvgen => @import("vulkan_spvgen.zig"),
 };

 /// The main entrypoint for the program.
--- a/src/renderer.zig
+++ b/src/renderer.zig
@ -17,6 +17,7 @@ pub const Backend = @import("renderer/backend.zig").Backend;
 pub const GenericRenderer = @import("renderer/generic.zig").Renderer;
 pub const Metal = @import("renderer/Metal.zig");
 pub const OpenGL = @import("renderer/OpenGL.zig");
+pub const Vulkan = @import("renderer/Vulkan.zig");
 pub const WebGL = @import("renderer/WebGL.zig");
 pub const Options = @import("renderer/Options.zig");
 pub const Overlay = @import("renderer/Overlay.zig");
@ -39,6 +40,7 @@ pub const Renderer = switch (build_config.renderer) {
    .metal => GenericRenderer(Metal),
    .opengl => GenericRenderer(OpenGL),
    .webgl => WebGL,
+    .vulkan => GenericRenderer(Vulkan),
 };

 /// The health status of a renderer. These must be shared across all
--- a/src/renderer/Metal.zig
+++ b/src/renderer/Metal.zig
@ -100,9 +100,10 @@ pub fn init(alloc: Allocator, opts: rendererpkg.Options) !Metal {
                .macos => |v| v.nsview,
                .ios => |v| v.uiview,

-                // The OpenGL platform is only valid with the OpenGL
-                // renderer; it cannot provide a view for Metal.
-                .opengl => return error.UnsupportedPlatform,
+                // The OpenGL / Vulkan platforms are only valid with
+                // their respective renderers; neither provides a view
+                // for Metal.
+                .opengl, .vulkan => return error.UnsupportedPlatform,
            },
        },

@ -199,12 +200,29 @@ pub fn drawFrameEnd(self: *Metal) void {
 pub fn initShaders(
    self: *const Metal,
    alloc: Allocator,
-    custom_shaders: []const [:0]const u8,
+    custom_shaders: []const []const u8,
 ) !shaders.Shaders {
+    // `shadertoy.loadFromFiles` returns `[]const []const u8` (a unified
+    // type so the SPV-target Vulkan path can share the loader); for
+    // `.msl` the underlying allocation IS null-terminated
+    // (`shadertoy.mslFromSpv` returns `[:0]const u8` and writes a
+    // sentinel one past `.len`). Reattach the sentinel for our
+    // downstream `Shaders.init` which expects `[:0]const u8`.
+    // Same pattern as `OpenGL.initShaders`.
+    const z_shaders = try alloc.alloc([:0]const u8, custom_shaders.len);
+    defer alloc.free(z_shaders);
+    for (custom_shaders, z_shaders) |bytes, *out| {
+        // Sentinel guard: `@ptrCast` does NOT verify the sentinel,
+        // so without this assert a future `loadFromFiles` change
+        // that forgets the trailing null would surface as an
+        // OOB read inside the Metal library compile.
+        std.debug.assert(bytes.len == 0 or bytes.ptr[bytes.len] == 0);
+        out.* = @ptrCast(bytes);
+    }
    return try shaders.Shaders.init(
        alloc,
        self.device,
-        custom_shaders,
+        z_shaders,
        // Using an `*_srgb` pixel format makes Metal gamma encode
        // the pixels written to it *after* blending, which means
        // we get linear alpha blending rather than gamma-incorrect
--- a/src/renderer/OpenGL.zig
+++ b/src/renderer/OpenGL.zig
@ -27,6 +27,11 @@ pub const custom_shader_target: shadertoy.Target = .glsl;
 // The fragCoord for OpenGL shaders is +Y = up.
 pub const custom_shader_y_is_down = false;

+/// Custom shaders are supported (the renderer ships a working "post"
+/// pass that composites `CustomShaderState.back_texture` through the
+/// user's shader into `frame.target`).
+pub const supports_custom_shaders: bool = true;
+
 /// Because OpenGL's frame completion is always
 /// sync, we have no need for multi-buffering.
 pub const swap_chain_count = 1;
@ -211,8 +216,9 @@ pub fn surfaceInit(surface: *apprt.Surface) !void {
                try prepareContext(&gladHostLoader);
            },

-            // macOS and iOS use the Metal renderer.
-            .macos, .ios => return error.UnsupportedPlatform,
+            // macOS and iOS use the Metal renderer; the Vulkan platform
+            // is only valid with the Vulkan renderer.
+            .macos, .ios, .vulkan => return error.UnsupportedPlatform,
        },
    }

@ -295,12 +301,33 @@ pub fn drawFrameEnd(self: *OpenGL) void {
 pub fn initShaders(
    self: *const OpenGL,
    alloc: Allocator,
-    custom_shaders: []const [:0]const u8,
+    custom_shaders: []const []const u8,
 ) !shaders.Shaders {
-    _ = alloc;
+    _ = self;
+    // `shadertoy.loadFromFiles` returns `[]const []const u8` so the
+    // SPV-target Vulkan path can share the loader, but for `.glsl`
+    // the underlying allocation IS null-terminated
+    // (`shadertoy.glslFromSpv` returns `[:0]const u8` and writes a
+    // sentinel one past `.len`). Reattach the sentinel for our
+    // downstream `Pipeline.init` calls that expect `[:0]const u8`.
+    //
+    // Use the caller-provided `alloc` (matches `Metal.initShaders`)
+    // — this is a transient scratch slice torn down at function
+    // exit.
+    const z_shaders = try alloc.alloc([:0]const u8, custom_shaders.len);
+    defer alloc.free(z_shaders);
+    for (custom_shaders, z_shaders) |bytes, *out| {
+        // Defense against a future `loadFromFiles` change that
+        // forgets to null-terminate: assert the sentinel before we
+        // pretend the slice is `[:0]const u8`. `@ptrCast` does NOT
+        // verify the sentinel — without this assert, a missing
+        // terminator surfaces as a downstream OOB read.
+        std.debug.assert(bytes.len == 0 or bytes.ptr[bytes.len] == 0);
+        out.* = @ptrCast(bytes);
+    }
    return try shaders.Shaders.init(
-        self.alloc,
-        custom_shaders,
+        alloc,
+        z_shaders,
    );
 }

--- a/src/renderer/Thread.zig
+++ b/src/renderer/Thread.zig
@ -293,6 +293,18 @@ fn setQosClass(self: *const Thread) void {
 }

 fn syncDrawTimer(self: *Thread) void {
+    // Hidden surfaces have no business running the animation
+    // draw timer — `drawFrame` would just early-return on the
+    // `!flags.visible` check and we'd burn 125 wakeups/sec on
+    // a no-op. With N background tabs each holding an animation
+    // timer, this dominated CPU on multi-tab sessions. The
+    // `.visible → true` mailbox handler re-runs `syncDrawTimer`
+    // to re-arm when the tab becomes visible again.
+    if (!self.flags.visible) {
+        self.draw_active = false;
+        return;
+    }
+
    skip: {
        // If our renderer supports animations and has them, then we
        // can apply draw timer based on custom shader animation configuration.
@ -360,6 +372,12 @@ fn drainMailbox(self: *Thread) !void {
                // Visibility affects our QoS class
                self.setQosClass();

+                // Visibility also gates the animation draw timer
+                // (see syncDrawTimer): hidden surfaces don't arm
+                // the 125 FPS timer, visible ones do. Re-run on
+                // every transition.
+                self.syncDrawTimer();
+
                // If we became visible then we immediately rebuild cells
                // (renderCallback skips updateFrame while invisible) and draw.
                if (v) {
@ -623,8 +641,15 @@ fn renderCallback(
    ) catch |err|
        log.warn("error rendering err={}", .{err});

-    // Draw
-    t.drawFrame(false);
+    // Draw. When the animation draw timer is already running
+    // (custom-shader-animation engaged), it will pick up the
+    // newly-updated cells at its next DRAW_INTERVAL tick — drawing
+    // here too would double-up frames during animated-shader periods
+    // and burn host-thread CPU (per-frame Wayland buffer attach +
+    // commit on the Qt apprt) for no visible benefit. Without the
+    // timer, wakeup-driven draws are the only way frames reach the
+    // host, so we always draw in that case.
+    if (!t.draw_active) t.drawFrame(false);

    return .disarm;
 }
--- a/src/renderer/Vulkan.zig
+++ b/src/renderer/Vulkan.zig
@ -0,0 +1,661 @@
+//! Vulkan graphics API for libghostty's `GenericRenderer`. Active
+//! on `-Drenderer=vulkan` builds; the host (e.g. the Qt frontend)
+//! supplies a VkInstance / VkDevice / VkQueue via the
+//! `ghostty_platform_vulkan_s` C ABI, libghostty drives all
+//! pipeline / image / command-buffer work against those handles,
+//! and rendered frames go back to the host as dmabuf fds for
+//! zero-copy compositing.
+//!
+//! Per-frame model: fence-paced submit-then-wait (one frame in
+//! flight), `Target` is the dmabuf-exportable render image,
+//! `Frame.complete` waits on the fence before handing the fd to
+//! the platform `present` callback.
+//!
+//! Submodules — pure Vulkan-API wrappers live in `pkg/vulkan/`
+//! (mirror of `pkg/opengl/`); renderer-policy modules live alongside
+//! this file under `vulkan/`.
+//!
+//! In `pkg/vulkan/` (re-exported from this file as
+//! `Vulkan.{Device,Sampler,CommandPool,DescriptorPool}`):
+//!   - `Device.zig`        — host-handle wrapper + dispatch table.
+//!   - `Sampler.zig`       — VkSampler.
+//!   - `CommandPool.zig`   — VkCommandPool + one-shot helper.
+//!   - `DescriptorPool.zig`— per-frame descriptor pool.
+//!
+//! In `src/renderer/vulkan/`:
+//!   - `Texture.zig`      — VkImage + memory + view + staging upload.
+//!   - `Target.zig`       — dmabuf-exportable render target
+//!                           (direct or legacy_copy mode).
+//!   - `buffer.zig`       — Buffer(T) host-coherent.
+//!   - `buffer_pool.zig`  — cross-frame VkBuffer recycle pool
+//!                           (per-thread pending, shared ready).
+//!   - `ThreadState.zig`  — per-renderer-thread frame fence /
+//!                           command buffer / step pool / last-target.
+//!   - `Pipeline.zig`     — VkPipeline + layout (dynamic rendering).
+//!   - `RenderPass.zig`   — dynamic-rendering pass + step recorder.
+//!   - `Frame.zig`        — per-draw context (fence-paced).
+//!   - `shaders.zig`      — GLSL→SPIR-V→VkShaderModule + the
+//!                           OpenGL-GLSL → Vulkan-GLSL rewriter.
+
+pub const Vulkan = @This();
+
+const std = @import("std");
+const builtin = @import("builtin");
+const Allocator = std.mem.Allocator;
+const vulkan = @import("vulkan");
+const vk = vulkan.c;
+
+const apprt = @import("../apprt.zig");
+const configpkg = @import("../config.zig");
+const font = @import("../font/main.zig");
+const rendererpkg = @import("../renderer.zig");
+const shadertoy = @import("shadertoy.zig");
+
+pub const GraphicsAPI = Vulkan;
+// Device-dispatch primitives live in `pkg/vulkan/` so they can be
+// reused by anything that needs a typed Vulkan binding (mirrors how
+// `pkg/opengl/` houses Buffer/Program/Texture/etc.). The renderer
+// re-exports them from this top-level so call sites continue to write
+// `Vulkan.Device`, `Vulkan.Sampler`, etc.
+pub const Device = vulkan.Device;
+pub const Sampler = vulkan.Sampler;
+pub const CommandPool = vulkan.CommandPool;
+pub const DescriptorPool = vulkan.DescriptorPool;
+
+// Renderer-policy primitives stay in `src/renderer/vulkan/` (dmabuf
+// export, our pipeline + render-pass wiring, frame fence pacing, the
+// GLSL→SPIR-V loader).
+pub const Texture = @import("vulkan/Texture.zig");
+pub const Target = @import("vulkan/Target.zig");
+pub const Pipeline = @import("vulkan/Pipeline.zig");
+pub const RenderPass = @import("vulkan/RenderPass.zig");
+pub const Frame = @import("vulkan/Frame.zig");
+pub const shaders = @import("vulkan/shaders.zig");
+
+const bufferpkg = @import("vulkan/buffer.zig");
+pub const Buffer = bufferpkg.Buffer;
+
+// ---- comptime contract --------------------------------------------------
+
+/// Custom user shaders compile to SPIR-V directly — skip the
+/// GLSL → SPIR-V → GLSL roundtrip that `.glsl` would do. The
+/// roundtrip exists for backends that consume GLSL (OpenGL, Metal
+/// via MSL), but Vulkan ingests SPIR-V natively and we already have
+/// a glslang shim for the renderer's built-in shaders. Bypassing
+/// the roundtrip halves the per-shader compile cost AND avoids the
+/// spirv-cross-emitted main() losing the upstream `gl_FragCoord.xy`
+/// pattern we hook for the Y-flip fix.
+pub const custom_shader_target: shadertoy.Target = .spv;
+
+/// Custom shaders ARE now supported on the Vulkan backend.
+/// `shaders.Shaders.init` builds one post pipeline per user shader
+/// (UBO at set 0 binding 1, iChannel0 sampler at set 1 binding 0,
+/// matching `shadertoy_prefix.glsl` after `vulkanizeGlsl` rewrites
+/// the layouts). The renderer's post pass at the end of `drawFrame`
+/// chains them — first pipeline samples `back_texture` and writes
+/// `front_texture`, swap, repeat; the last one writes
+/// `frame.target` instead.
+pub const supports_custom_shaders: bool = true;
+
+/// Vulkan's clip-space Y axis points down (unlike OpenGL).
+pub const custom_shader_y_is_down = true;
+
+/// Extra `#define` lines `shadertoy.loadFromFile` injects into the
+/// prefix between `#version` and the rest. `GHASTTY_VULKAN`
+/// activates the Vulkan-side `gl_FragCoord` flip + `texture()`
+/// upper-left wrap so `mainImage` sees shadertoy-convention coords
+/// even though Vulkan rasterizes Y-down. OpenGL/MSL backends omit
+/// this decl entirely and pass `&.{}` from `generic.zig`.
+pub const custom_shader_extra_defines: []const []const u8 = &.{"GHASTTY_VULKAN 1"};
+
+/// GLSL → GLSL rewriter `shadertoy.loadFromFile` runs after the
+/// prefix splice and before the SPIR-V compile. Plugs the
+/// `vulkanizeGlsl` pass that rewrites `layout(binding = N)` into
+/// `layout(set = S, binding = N)` so the resulting SPIR-V matches
+/// the renderer's multi-set descriptor layout. Without this, the
+/// shader's `iChannel0` lands at set 0 binding 0 while the post
+/// pipeline binds it at set 1 binding 0 → sampler returns garbage.
+pub const rewriteCustomShaderSource = shaders.vulkanizeGlsl;
+
+/// Single-buffered for v1; fence-paced submit-then-wait means there's
+/// only ever one frame in flight.
+pub const swap_chain_count = 1;
+
+const log = std.log.scoped(.vulkan);
+
+// ---- per-surface state --------------------------------------------------
+
+alloc: Allocator,
+blending: configpkg.Config.AlphaBlending,
+rt_surface: *apprt.Surface,
+
+/// Process-wide Vulkan device. The host owns one VkDevice shared
+/// across every surface, so we mirror that as a single global slot
+/// (not threadlocal — the renderer thread is distinct from the main
+/// thread that constructs the surface, and threadlocal doesn't
+/// survive that boundary).
+///
+/// Initialized in `Vulkan.init` on the surface-construction thread;
+/// read by every other thread via `devicePtr` after that. The renderer
+/// holds `*const Vulkan` from `generic.zig` so we can't mutate fields
+/// on the value — same reason OpenGL uses a `threadlocal var gl_host`
+/// (though OpenGL gets away with threadlocal because the OpenGL
+/// platform callbacks are read on the same thread that set them).
+var device: ?Device = null;
+
+/// Refcount of live `Vulkan` renderer instances that share `device`.
+/// Each `init` increments; each `deinit` decrements. The device is
+/// only torn down when the count returns to 0, so closing one tab
+/// (or one split) doesn't yank the VkDevice out from under the
+/// surfaces still running in other tabs. Process-wide (matches
+/// `device`'s scope). Mutated under `device_mutex` because
+/// surfaces' renderer threads run independently and may init/deinit
+/// concurrently.
+var device_refcount: usize = 0;
+var device_mutex: std.Thread.Mutex = .{}; // locked by `init` around the `device` null-check/creation and the refcount bump
+
+/// Cross-frame buffer recycle pool. See `vulkan/buffer_pool.zig`
+/// for the full lifecycle / multi-thread contract. Re-exported so
+/// existing callers (`Vulkan.buffer_pool.cycle` etc.) keep working
+/// unchanged.
+pub const buffer_pool = @import("vulkan/buffer_pool.zig");
+
+/// Per-renderer-thread state (frame command buffer, fence, descriptor
+/// pool, last-target pointer). See `vulkan/ThreadState.zig` for the
+/// lifecycle.
+const ThreadState = @import("vulkan/ThreadState.zig");
+
+// ---- lifecycle ----------------------------------------------------------
+
+/// Create the per-surface renderer state and take a reference on the
+/// process-wide `device`, creating the device from the host's
+/// platform callbacks on first use.
+///
+/// Returns `error.UnsupportedPlatform` when the surface was not
+/// created with the `.vulkan` platform; errors from
+/// `bootstrapFromPlatform` and `Device.init` are propagated.
+pub fn init(alloc: Allocator, opts: rendererpkg.Options) !Vulkan {
+    // Validate the platform for EVERY surface, not only the one that
+    // happens to create the device. When this check lived inside the
+    // `device == null` branch, a non-Vulkan surface created after
+    // the first one skipped it and still took a reference on the
+    // shared device.
+    const platform = switch (apprt.runtime) {
+        // The Vulkan renderer is embedded-only by design: the
+        // host owns the VkInstance/Device/Queue and hands them
+        // to libghostty via `ghostty_platform_vulkan_s`. There
+        // is no Vulkan path through the GTK apprt and never
+        // will be from this side. Compile-error any other
+        // runtime so a misconfigured `-Drenderer=vulkan
+        // -Dapp-runtime=gtk` build fails loudly at compile time
+        // instead of crashing at first surface init. Mirrors
+        // OpenGL.zig's `@compileError("unsupported app
+        // runtime for OpenGL")` pattern.
+        else => @compileError("unsupported app runtime for Vulkan (embedded-only)"),
+        apprt.embedded => switch (opts.rt_surface.platform) {
+            .vulkan => |v| v,
+            // The Platform union is decided at host-call time
+            // (the C ABI lets the host pick), so this arm
+            // really is a runtime check — the host plugged us
+            // into a non-Vulkan surface.
+            .opengl, .macos, .ios => return error.UnsupportedPlatform,
+        },
+    };
+
+    // Vulkan needs the device populated before the renderer's
+    // `FrameState.init` starts asking for buffer/texture options.
+    // Process-wide (not threadlocal): the renderer thread is
+    // distinct from the main thread that constructs the surface.
+    device_mutex.lock();
+    defer device_mutex.unlock();
+    if (device == null) {
+        device = try Device.init(alloc, try bootstrapFromPlatform(platform));
+        log.info(
+            "Vulkan device ready (api=0x{x})",
+            .{device.?.api_version},
+        );
+    }
+    // Only reached once a device exists, so the count never
+    // covers a surface that failed to initialize.
+    device_refcount += 1;
+    return .{
+        .alloc = alloc,
+        .blending = opts.config.blending,
+        .rt_surface = opts.rt_surface,
+    };
+}
+
+/// Release this renderer's reference on the process-wide `device`.
+/// The device is only torn down when the last reference is dropped.
+/// `self` is set to `undefined` before returning.
+pub fn deinit(self: *Vulkan) void {
+    // ThreadState.cleanup is NOT called here — it runs in
+    // `threadExit` on the renderer thread, which is where the
+    // `threadlocal var` state was populated. Calling it here would
+    // read the GUI thread's empty TLS and silently leak everything.
+    // See the comment in `threadExit` for the full rationale.
+
+    // Decrement the shared-device refcount; only the last surface
+    // to deinit gets to destroy the VkDevice. Closing one of N tabs
+    // must NOT pull the device out from under the others — that
+    // crashes (or invisibly silences) every other surface's
+    // renderer thread.
+    {
+        device_mutex.lock();
+        defer device_mutex.unlock();
+        // Refcount-underflow guard. Was `std.debug.assert(refcount > 0)`,
+        // but assertions compile out in ReleaseFast / ReleaseSmall — a
+        // double-deinit would silently underflow the unsigned counter
+        // to a huge value, blocking the device tear-down forever (the
+        // refcount==0 branch below would never trigger). Hard-log
+        // even in release: a stale deinit is a contract violation
+        // we'd rather surface than mask. We still poison `self` at
+        // function exit so the caller sees consistent UB on either
+        // path.
+        if (device_refcount == 0) {
+            log.err("Vulkan.deinit: refcount underflow — double-deinit?", .{});
+        } else {
+            device_refcount -= 1;
+            if (device_refcount == 0) {
+                // Last surface: NOW we can safely drain the shared
+                // `ready` list of the buffer pool and tear the device
+                // down. The waitIdle is needed because non-final
+                // deinits skipped it. This function never touches a
+                // per-thread `pending` list; per the comment in
+                // `threadExit`, that list is expected to have been
+                // released on each renderer thread by
+                // `ThreadState.cleanup` (NOTE(review): confirm against
+                // `vulkan/ThreadState.zig`), so this path only needs
+                // to handle the cross-thread `ready`.
+                if (device) |*d| {
+                    d.waitIdle();
+                    buffer_pool.drainShared(d);
+                    d.deinit();
+                }
+                device = null;
+            }
+        }
+    }
+    self.* = undefined;
+}
+
+/// Per-surface hook that runs early in surface setup. Vulkan does
+/// nothing here: the host has not finished installing the platform
+/// callbacks at this point, so all device wiring is deferred to
+/// `Vulkan.init` (which runs once the platform is reachable through
+/// `opts`).
+pub fn surfaceInit(_: *apprt.Surface) !void {}
+
+/// Main-thread hook that runs right before the renderer thread is
+/// started. Intentionally empty: the device is built in
+/// `Vulkan.init`, because the renderer's FrameState init path calls
+/// the option getters before `threadEnter` and those getters need
+/// the device — so it must be ready earlier than OpenGL requires.
+pub fn finalizeSurfaceInit(_: *const Vulkan, _: *apprt.Surface) !void {}
+
+/// Renderer-thread entry hook. Intentionally empty: the device is
+/// already up by the time this runs (it is created in `init`, since
+/// the renderer's FrameState init path calls the option getters
+/// before `threadEnter`). The declaration is kept so that
+/// `@hasDecl(GraphicsAPI, "threadEnter")` still resolves true in
+/// `generic.zig`.
+pub fn threadEnter(_: *const Vulkan, _: *apprt.Surface) !void {}
+
+/// Renderer-thread exit hook: releases this thread's Vulkan state
+/// and then waits for the device to go idle. Does nothing when the
+/// shared device is not set.
+pub fn threadExit(_: *const Vulkan) void {
+    const dev = if (device) |*d| d else return;
+
+    // Per-thread Vulkan state lives in `threadlocal var` slots that
+    // were populated on THIS thread, so `ThreadState.cleanup` has to
+    // run here and not in `Vulkan.deinit` (which runs on the GUI
+    // thread after the renderer thread has joined — see
+    // Surface.deinit). Run from the GUI thread it would read that
+    // thread's empty TLS, every destroy would no-op, and the per-tab
+    // DescriptorPool / VkCommandBuffer / VkFence + buffer_pool
+    // pending list would leak forever. heaptrack on a 20-tab
+    // open+close session attributed ~6 MB / 42 calls of NVIDIA
+    // driver-internal state to exactly this: DescriptorPool.init →
+    // ThreadState.ensureInit pages that nothing ever released.
+    //
+    // The device is still alive at this point: the refcount stays
+    // > 0 until `Vulkan.deinit` decrements it on the GUI thread.
+    ThreadState.cleanup(dev);
+
+    // Pre-fix behavior, kept as belt-and-suspenders for any
+    // non-ThreadState in-flight work this thread may have submitted
+    // through the shared queue.
+    dev.waitIdle();
+}
+
+/// Display-realized hook. No-op for Vulkan.
+pub fn displayRealized(_: *Vulkan) void {}
+
+/// Display-unrealized hook. No-op for Vulkan.
+pub fn displayUnrealized(_: *Vulkan) void {}
+
+/// Start-of-frame hook. No-op for Vulkan.
+pub fn drawFrameStart(_: *Vulkan) void {}
+
+/// End-of-frame hook. No-op for Vulkan.
+pub fn drawFrameEnd(_: *Vulkan) void {}
+
+/// Build the shader set against the shared device.
+pub fn initShaders(
+    _: *const Vulkan,
+    alloc: Allocator,
+    /// SPIR-V binaries for Vulkan (loaded with
+    /// `shadertoy.Target = .spv`) rather than GLSL strings — see
+    /// `custom_shader_target` above.
+    custom_shaders: []const []const u8,
+) !shaders.Shaders {
+    const dev = devicePtr();
+    return try shaders.Shaders.init(alloc, dev, custom_shaders);
+}
+
+/// Create a render target of the given size for this surface.
+/// Fails with `error.UnsupportedPlatform` when the surface does not
+/// carry the Vulkan platform.
+pub fn initTarget(self: *const Vulkan, width: usize, height: usize) !Target {
+    // The platform is taken from THIS surface's `rt_surface` so the
+    // `present` callback's `userdata` points at this surface's
+    // window. Splits and tabs share the process-wide Device but each
+    // owns its own platform copy — without per-surface routing, all
+    // dmabuf frames would funnel through whichever surface
+    // initialized the device first.
+    if (surfacePlatform(self.rt_surface)) |platform| {
+        return try Target.init(.{
+            .device = devicePtr(),
+            // SRGB so the hardware gamma-encodes the shaders' linear
+            // premultiplied output at framebuffer-write time. With a
+            // non-sRGB format the bytes in memory would be linear,
+            // and Qt (which expects sRGB premultiplied) would treat
+            // them as already encoded — colors would look far too
+            // dark. The DRM fourcc the host sees is still ARGB8888;
+            // the SRGB encoding is a Vulkan-side concern only.
+            .format = vk.VK_FORMAT_B8G8R8A8_SRGB,
+            .width = @intCast(width),
+            .height = @intCast(height),
+            .platform = platform,
+        });
+    }
+    return error.UnsupportedPlatform;
+}
+
+/// Convert the apprt's `Platform.Vulkan` callback struct into the
+/// apprt-agnostic `Device.HostBootstrap`. Every host handle and the
+/// root proc-addr resolver are queried here, up front, so the
+/// binding never sees an apprt type. Returns
+/// `error.HostHandleMissing` as soon as any of them comes back null.
+fn bootstrapFromPlatform(
+    platform: apprt.embedded.Platform.Vulkan,
+) Device.Error!Device.HostBootstrap {
+    const ud = platform.userdata;
+
+    const instance = platform.instance(ud) orelse
+        return error.HostHandleMissing;
+    const physical_device = platform.physical_device(ud) orelse
+        return error.HostHandleMissing;
+    const logical_device = platform.device(ud) orelse
+        return error.HostHandleMissing;
+    const queue = platform.queue(ud) orelse
+        return error.HostHandleMissing;
+    const resolver = platform.get_instance_proc_addr(
+        ud,
+        "vkGetInstanceProcAddr",
+    ) orelse return error.HostHandleMissing;
+
+    // Queried last, after every handle check has passed.
+    const queue_family = platform.queue_family_index(ud);
+
+    return .{
+        .instance = @ptrCast(instance),
+        .physical_device = @ptrCast(physical_device),
+        .device = @ptrCast(logical_device),
+        .queue = @ptrCast(queue),
+        .queue_family_index = queue_family,
+        .get_instance_proc_addr_raw = resolver,
+    };
+}
+
+/// Return the surface's Vulkan platform callbacks, or null when the
+/// surface carries any other platform tag. Callers are expected to
+/// turn null into `error.UnsupportedPlatform`. (`Vulkan.init`
+/// performs the same rejection up front, so seeing a non-Vulkan
+/// platform here implies a surface plumbed through after that gate.)
+fn surfacePlatform(rt_surface: *apprt.Surface) ?apprt.embedded.Platform.Vulkan {
+    // Non-embedded runtimes are already rejected by the
+    // `@compileError` in `init()`; repeating the gate here lets the
+    // body below handle the embedded platform union only.
+    if (apprt.runtime != apprt.embedded)
+        @compileError("unsupported app runtime for Vulkan (embedded-only)");
+
+    switch (rt_surface.platform) {
+        .vulkan => |vulkan_platform| return vulkan_platform,
+        else => return null,
+    }
+}
+
+/// Report the current size stored on this renderer's surface.
+pub fn surfaceSize(self: *const Vulkan) !struct { width: u32, height: u32 } {
+    return .{
+        .width = self.rt_surface.size.width,
+        .height = self.rt_surface.size.height,
+    };
+}
+
+/// Hand `target` to the host and remember it for
+/// `presentLastTarget`.
+pub fn present(_: *Vulkan, target: *Target) !void {
+    // By the time we get here `Frame.complete` has ended the command
+    // buffer, submitted it with the fence, and waited for the GPU to
+    // finish, so the target is fully populated and its dmabuf fd is
+    // safe to hand off.
+    target.present();
+
+    // Store the ADDRESS of the target, not a value copy, so that
+    // `presentLastTarget` can re-present it on no-op frames. A
+    // subsequent `frame.resize` destroys the old Target and writes a
+    // new one into the same FrameState slot; holding the pointer
+    // follows that transparently, whereas a copy would leave us with
+    // a closed fd and freed VkImage handles.
+    ThreadState.last_target = target;
+}
+
+/// Re-present the target most recently passed to `present` on this
+/// thread. Does nothing if no target has been recorded yet.
+pub fn presentLastTarget(self: *Vulkan) !void {
+    const last = ThreadState.last_target orelse return;
+    try self.present(last);
+}
+
+/// Start a new frame on the calling renderer thread: lazily set up
+/// the per-thread resources, reset the per-frame state, and return a
+/// `Frame` recording into this thread's command buffer. If anything
+/// after the `errdefer` registration fails, an empty submit (with a
+/// device-wait-idle fallback) is issued to re-signal the frame fence.
+pub fn beginFrame(
+    self: *const Vulkan,
+    renderer: *rendererpkg.Renderer,
+    target: *Target,
+) !Frame {
+    _ = self;
+    const dev = devicePtr();
+
+    // Lazy per-thread resource init (no-op after the first frame on
+    // this thread). Sets up the command pool + buffer + fence +
+    // descriptor pool that get reused for every subsequent frame.
+    try ThreadState.ensureInit(dev);
+
+    // Reset this frame's per-frame state. The fence is the load-
+    // bearing piece for tear-down correctness: any error path that
+    // could leave the fence in an UNSIGNALED-with-no-pending-submit
+    // state will hang the next `Vulkan.deinit` on
+    // `waitForFences(UINT64_MAX)`.
+    //
+    // Defense: register the re-signal `errdefer` BEFORE the
+    // `beginFrameReset` call (which is the one that calls
+    // `vkResetFences`). If any reset fails, the errdefer fires
+    // an empty submit with this fence as the signal target,
+    // restoring the signaled state.
+    errdefer {
+        // Empty submit with this fence as the signal target is the
+        // simplest portable way to push it back to signaled without
+        // recording any commands. The fence in this errdefer can
+        // be in any of three states:
+        //   1. Reset by `beginFrameReset` (the failing path). The
+        //      empty submit signals it cleanly.
+        //   2. Still in its prior-frame state (the resetFences call
+        //      failed — spec says the fence is in an undefined
+        //      state). The empty submit re-signals once any prior
+        //      pending submit on the queue retires; queueSubmit
+        //      spec semantics guarantee the fence is signaled
+        //      after all earlier submits complete.
+        //   3. Driver-lost on DEVICE_LOST. queueSubmit returns
+        //      DEVICE_LOST too; we fall back to deviceWaitIdle.
+        // The fallback `vkDeviceWaitIdle` is the actual safety net
+        // — without one of those signaling paths succeeding, the
+        // next `Vulkan.deinit` hangs on `waitForFences(UINT64_MAX)`.
+        //
+        // NOTE(review): this errdefer is registered ahead of both
+        // `try` statements below, so it also runs when `Frame.begin`
+        // fails, and when `beginFrameReset` fails before it reaches
+        // `vkResetFences`. In the latter case the fence may still be
+        // signaled; the Vulkan spec requires a fence passed to
+        // vkQueueSubmit to be unsignaled, so confirm that
+        // `beginFrameReset` cannot fail ahead of the fence reset.
+        const empty: vk.VkSubmitInfo = .{
+            .sType = vk.VK_STRUCTURE_TYPE_SUBMIT_INFO,
+            .pNext = null,
+            .waitSemaphoreCount = 0,
+            .pWaitSemaphores = null,
+            .pWaitDstStageMask = null,
+            .commandBufferCount = 0,
+            .pCommandBuffers = null,
+            .signalSemaphoreCount = 0,
+            .pSignalSemaphores = null,
+        };
+        const sr = dev.queueSubmit(1, &empty, ThreadState.frame_fence);
+        if (sr != vk.VK_SUCCESS) {
+            log.warn(
+                "beginFrame errdefer: empty queueSubmit failed " ++
+                    "(result={}); waiting device idle to ensure the fence " ++
+                    "doesn't hang the next deinit",
+                .{sr},
+            );
+            // Return value deliberately discarded: this is the last
+            // resort on an error path that is already unwinding.
+            _ = dev.dispatch.deviceWaitIdle(dev.device);
+        }
+    }
+    try ThreadState.beginFrameReset(dev);
+
+    // The frame borrows this thread's command buffer and fence; the
+    // step pool is passed by pointer when present.
+    return try Frame.begin(
+        .{
+            .cb = ThreadState.frame_cb,
+            .fence = ThreadState.frame_fence,
+            .step_pool = if (ThreadState.step_pool) |*p| p else null,
+        },
+        dev,
+        renderer,
+        target,
+    );
+}
+
+// ---- buffer / texture / sampler option getters --------------------------
+//
+// `GenericRenderer` calls these without knowing or caring about Vulkan
+// specifics; the returned `Options` structs are what each backend's
+// resource wrapper expects to be passed back to its `init`. The
+// Vulkan-flavored ones embed a `*const Device` reference plus
+// backend-specific usage flags.
+
+/// Pointer to the process-wide device. Panics if the device has not
+/// been set.
+inline fn devicePtr() *const Device {
+    // Kept behind a getter so a later refactor (e.g. moving `Device`
+    // to the heap) stays local to this function. Today the device is
+    // a process-wide `?Device` that `Vulkan.init` fills in BEFORE the
+    // renderer's `FrameState.init` calls any option getter.
+    if (device) |*dev| return dev;
+
+    // Null means device construction failed AND an option getter was
+    // called anyway — a programming error, not a runtime condition
+    // we can recover from.
+    @panic("Vulkan.devicePtr: device not initialized — option getter called before Vulkan.init succeeded");
+}
+
+/// Default buffer options. Vulkan needs an explicit usage bitmask;
+/// callers that want a specific kind use the per-kind getters below.
+/// (The receiver is unused — the device comes from the process-wide
+/// `device` slot via `devicePtr`.)
+pub fn bufferOptions(_: *const Vulkan) bufferpkg.Options {
+    const opts: bufferpkg.Options = .{
+        .usage = vk.VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
+        .device = devicePtr(),
+    };
+    return opts;
+}
+
+/// Options for per-instance vertex data (vertex-buffer usage).
+pub fn instanceBufferOptions(_: *const Vulkan) bufferpkg.Options {
+    const opts: bufferpkg.Options = .{
+        .usage = vk.VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
+        .device = devicePtr(),
+    };
+    return opts;
+}
+
+/// Options for uniform buffers (uniform-buffer usage).
+pub fn uniformBufferOptions(_: *const Vulkan) bufferpkg.Options {
+    const opts: bufferpkg.Options = .{
+        .usage = vk.VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
+        .device = devicePtr(),
+    };
+    return opts;
+}
+
+/// Foreground cell buffer: same options as an instance buffer.
+pub fn fgBufferOptions(self: *const Vulkan) bufferpkg.Options {
+    return instanceBufferOptions(self);
+}
+
+/// Background cell buffer (storage-buffer usage).
+pub fn bgBufferOptions(_: *const Vulkan) bufferpkg.Options {
+    // The cell_bg fragment shader and the cell_text vertex shader
+    // both read this buffer as a STORAGE BUFFER (binding
+    // `bg_cells`). OpenGL makes no such distinction — every buffer
+    // is reusable across roles — but Vulkan validates usage flags at
+    // descriptor-write time.
+    const opts: bufferpkg.Options = .{
+        .usage = vk.VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+        .device = devicePtr(),
+    };
+    return opts;
+}
+
+/// Image buffer: same options as an instance buffer.
+pub fn imageBufferOptions(self: *const Vulkan) bufferpkg.Options {
+    return instanceBufferOptions(self);
+}
+
+/// Background-image buffer: same options as an instance buffer.
+pub fn bgImageBufferOptions(self: *const Vulkan) bufferpkg.Options {
+    return instanceBufferOptions(self);
+}
+
+/// Options for general renderer textures: sampled, transfer
+/// destination, and usable as a color attachment.
+pub fn textureOptions(_: *const Vulkan) Texture.Options {
+    // Textures of this shape serve two roles: glyph atlases (only
+    // ever sampled) and the custom-shader back_texture (sampled AND
+    // rendered into). The wider usage set is returned so both work.
+    // The format is the same `B8G8R8A8_SRGB` that `initTarget`
+    // picks, so a render → sample → render chain through the
+    // custom-shader pass keeps a single color format.
+    const opts: Texture.Options = .{
+        .device = devicePtr(),
+        .format = vk.VK_FORMAT_B8G8R8A8_SRGB,
+        .usage = vk.VK_IMAGE_USAGE_SAMPLED_BIT |
+            vk.VK_IMAGE_USAGE_TRANSFER_DST_BIT |
+            vk.VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
+    };
+    return opts;
+}
+
+/// Sampler options: linear min/mag filtering, clamp-to-edge on both
+/// axes.
+pub fn samplerOptions(_: *const Vulkan) Sampler.Options {
+    const opts: Sampler.Options = .{
+        .device = devicePtr(),
+        .min_filter = .linear,
+        .mag_filter = .linear,
+        .wrap_s = .clamp_to_edge,
+        .wrap_t = .clamp_to_edge,
+    };
+    return opts;
+}
+
+/// Re-export so callers can write `Vulkan.ImageTextureFormat` —
+/// matches the `OpenGL.ImageTextureFormat` shape on the OpenGL side.
+/// Definition lives in `vulkan/Texture.zig` next to `Texture`
+/// itself.
+pub const ImageTextureFormat = Texture.ImageTextureFormat;
+
+/// Options for an image texture in the given `format` (sampled and
+/// transfer-destination usage). `srgb` is forwarded to
+/// `format.toVk` to select the Vulkan format.
+pub fn imageTextureOptions(
+    _: *const Vulkan,
+    format: ImageTextureFormat,
+    srgb: bool,
+) Texture.Options {
+    const opts: Texture.Options = .{
+        .device = devicePtr(),
+        .format = format.toVk(srgb),
+        .usage = vk.VK_IMAGE_USAGE_SAMPLED_BIT |
+            vk.VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+    };
+    return opts;
+}
+
+/// Create a square texture sized to `atlas.size` in the Vulkan
+/// format matching the atlas' pixel format. Returns
+/// `error.UnsupportedAtlasFormat` for any atlas format other than
+/// grayscale or bgra.
+pub fn initAtlasTexture(
+    _: *const Vulkan,
+    atlas: *const font.Atlas,
+) !Texture {
+    // Resolve the format first so an unsupported atlas is rejected
+    // before the device is touched.
+    const format: vk.VkFormat = switch (atlas.format) {
+        .grayscale => vk.VK_FORMAT_R8_UNORM,
+        .bgra => vk.VK_FORMAT_B8G8R8A8_UNORM,
+        else => return error.UnsupportedAtlasFormat,
+    };
+
+    const options: Texture.Options = .{
+        .device = devicePtr(),
+        .format = format,
+        .usage = vk.VK_IMAGE_USAGE_SAMPLED_BIT |
+            vk.VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+    };
+
+    // Width and height are both the atlas size; no initial data.
+    const side = atlas.size;
+    return try Texture.init(options, side, side, null);
+}
--- a/src/renderer/backend.zig
+++ b/src/renderer/backend.zig
@ -6,6 +6,11 @@ pub const Backend = enum {
    opengl,
    metal,
    webgl,
+    /// Vulkan is on this fork only. Embedded-only — the host owns
+    /// the VkInstance/Device/Queue and hands them in via
+    /// `ghostty_platform_vulkan_s`; libghostty renders against
+    /// those handles and exports the result as a dmabuf fd.
+    vulkan,

    pub fn default(
        target: std.Target,
--- a/src/renderer/generic.zig
+++ b/src/renderer/generic.zig
@ -838,14 +838,52 @@ pub fn Renderer(comptime GraphicsAPI: type) type {
            defer arena.deinit();
            const arena_alloc = arena.allocator();

-            // Load our custom shaders
-            const custom_shaders: []const [:0]const u8 = shadertoy.loadFromFiles(
-                arena_alloc,
-                self.config.custom_shaders,
-                GraphicsAPI.custom_shader_target,
-            ) catch |err| err: {
-                log.warn("error loading custom shaders err={}", .{err});
-                break :err &.{};
+            // Load our custom shaders.
+            //
+            // GraphicsAPI advertises whether it can actually run them
+            // (`supports_custom_shaders`). The Vulkan backend currently
+            // can't — its post-pass / compositor pipeline that wires
+            // CustomShaderState.back_texture → frame.target through the
+            // user's shader hasn't been built yet. Loading + flagging
+            // `has_custom_shaders` anyway would route bg_color into the
+            // back_texture and leave frame.target blank. Skip the load
+            // when the backend can't consume the result, and emit a
+            // one-line warning so the user knows their config item was
+            // ignored.
+            const can_use_custom = !@hasDecl(GraphicsAPI, "supports_custom_shaders") or
+                GraphicsAPI.supports_custom_shaders;
+            const custom_shaders: []const []const u8 = if (can_use_custom)
+                (shadertoy.loadFromFiles(
+                    arena_alloc,
+                    self.config.custom_shaders,
+                    .{
+                        .target = GraphicsAPI.custom_shader_target,
+                        // Optional per-backend hooks. Resolved at
+                        // comptime via `@hasDecl`, so backends that
+                        // don't need them stay free of extra-define /
+                        // GLSL-rewrite logic.
+                        .extra_defines = if (@hasDecl(GraphicsAPI, "custom_shader_extra_defines"))
+                            GraphicsAPI.custom_shader_extra_defines
+                        else
+                            &.{},
+                        .rewrite = if (@hasDecl(GraphicsAPI, "rewriteCustomShaderSource"))
+                            GraphicsAPI.rewriteCustomShaderSource
+                        else
+                            null,
+                    },
+                ) catch |err| err: {
+                    log.warn("error loading custom shaders err={}", .{err});
+                    break :err &.{};
+                })
+            else custom: {
+                if (self.config.custom_shaders.value.items.len > 0) {
+                    log.warn(
+                        "custom-shader config ignored: backend lacks " ++
+                            "post-pipeline support (Vulkan TODO)",
+                        .{},
+                    );
+                }
+                break :custom &.{};
            };

            const has_custom_shaders = custom_shaders.len > 0;
@ -1431,15 +1469,6 @@ pub fn Renderer(comptime GraphicsAPI: type) type {
            self: *Self,
            sync: bool,
        ) !void {
-            // const start = std.time.Instant.now() catch unreachable;
-            // const start_micro = std.time.microTimestamp();
-            // defer {
-            //     const end = std.time.Instant.now() catch unreachable;
-            //     log.warn(
-            //         "[drawFrame time] start_micro={} duration={}ns",
-            //         .{ start_micro, end.since(start) / std.time.ns_per_us },
-            //     );
-            // }

            // We hold a the draw mutex to prevent changes to any
            // data we access while we're in the middle of drawing.
@ -1632,6 +1661,7 @@ pub fn Renderer(comptime GraphicsAPI: type) type {
                self.images.draw(
                    &self.api,
                    self.shaders.pipelines.image,
+                    frame.uniforms,
                    &pass,
                    .kitty_below_bg,
                );
@ -1648,6 +1678,7 @@ pub fn Renderer(comptime GraphicsAPI: type) type {
                self.images.draw(
                    &self.api,
                    self.shaders.pipelines.image,
+                    frame.uniforms,
                    &pass,
                    .kitty_below_text,
                );
@ -1675,6 +1706,7 @@ pub fn Renderer(comptime GraphicsAPI: type) type {
                self.images.draw(
                    &self.api,
                    self.shaders.pipelines.image,
+                    frame.uniforms,
                    &pass,
                    .kitty_above_text,
                );
@ -1684,6 +1716,7 @@ pub fn Renderer(comptime GraphicsAPI: type) type {
                if (self.overlay != null) self.images.draw(
                    &self.api,
                    self.shaders.pipelines.image,
+                    frame.uniforms,
                    &pass,
                    .overlay,
                );
--- a/src/renderer/image.zig
+++ b/src/renderer/image.zig
@ -105,6 +105,7 @@ pub const State = struct {
        self: *State,
        api: *GraphicsAPI,
        pipeline: GraphicsAPI.Pipeline,
+        uniforms: GraphicsAPI.Buffer(GraphicsAPI.shaders.Uniforms),
        pass: *GraphicsAPI.RenderPass,
        placement_type: DrawPlacements,
    ) void {
@ -168,6 +169,21 @@ pub const State = struct {

            pass.step(.{
                .pipeline = pipeline,
+                // Bind uniforms explicitly per image step. Without
+                // this, the image pipeline relied on whatever
+                // uniforms a previous (cell_bg / cell_text) step
+                // happened to bind in the same render pass — works
+                // if the renderer always draws cells before images,
+                // but a race on first-frame init (precompiled-SPV
+                // path returned from Shaders.init fast enough that
+                // image.draw could fire before the cell steps had
+                // populated the descriptor set) showed the image
+                // shader reading garbage cell_size from a stale
+                // UBO binding, producing image quads that covered
+                // the entire viewport. Defensive explicit bind
+                // makes the image pipeline's UBO source independent
+                // of prior-step ordering.
+                .uniforms = uniforms.buffer,
                .buffers = &.{buf.buffer},
                .textures = &.{texture},
                .draw = .{
--- a/src/renderer/shaders/glsl/image.v.glsl
+++ b/src/renderer/shaders/glsl/image.v.glsl
@ -43,5 +43,10 @@ void main() {
    vec2 image_pos = (cell_size * grid_pos) + cell_offset;
    image_pos += dest_size * corner;

-    gl_Position = projection_matrix * vec4(image_pos.xy, 1.0, 1.0);
+    // Z=0 (not 1) so we land in the middle of Vulkan's [0,1] NDC
+    // depth range after `ortho2d`'s `-1` z scale. OpenGL accepts
+    // either since there's no depth attachment, but Vulkan clips
+    // NDC z<0 (which `vec4(_, _, 1.0, 1.0)` would produce) and
+    // erases the entire image. Matches `cell_text.v.glsl`.
+    gl_Position = projection_matrix * vec4(image_pos.xy, 0.0, 1.0);
 }
--- a/src/renderer/shaders/shadertoy_prefix.glsl
+++ b/src/renderer/shaders/shadertoy_prefix.glsl
@ -49,4 +49,24 @@ layout(location = 0) out vec4 _fragColor;
 #define texture2D texture

 void mainImage( out vec4 fragColor, in vec2 fragCoord );
-void main() { mainImage (_fragColor, gl_FragCoord.xy); }
+
+// Vulkan-only: wrap `texture(sampler2D, vec2[, bias])` so iChannel0
+// (back_texture, in Vulkan top-left orientation) appears to
+// the author in OpenGL/shadertoy convention (lower-left).
+// Defined BEFORE the `#define`, so the inner `texture(s, ...)`
+// calls here resolve to the GLSL built-in, not back to ourselves
+// (no preprocessor recursion).
+//
+// The `#define` below rewrites EVERY `texture(...)` call in the user
+// shader, so each built-in form we want to keep working needs a
+// matching overload here. The 3-argument overload covers the
+// fragment-stage `texture(sampler2D, vec2, float bias)` form, which
+// would otherwise fail to compile once the macro is active.
+#ifdef GHASTTY_VULKAN
+vec4 _ghastty_tex2d(sampler2D s, vec2 uv) {
+    return texture(s, vec2(uv.x, 1.0 - uv.y));
+}
+vec4 _ghastty_tex2d(sampler2D s, vec2 uv, float bias) {
+    return texture(s, vec2(uv.x, 1.0 - uv.y), bias);
+}
+#define texture _ghastty_tex2d
+#endif
+
+// Entry point: forwards to the user's `mainImage`. Under
+// GHASTTY_VULKAN the fragment Y coordinate is mirrored against
+// `iResolution.y` before the call; otherwise `gl_FragCoord.xy` is
+// passed through unchanged.
+void main() {
+#ifdef GHASTTY_VULKAN
+    mainImage(_fragColor, vec2(gl_FragCoord.x, iResolution.y - gl_FragCoord.y));
+#else
+    mainImage(_fragColor, gl_FragCoord.xy);
+#endif
+}
--- a/src/renderer/shadertoy.zig
+++ b/src/renderer/shadertoy.zig
@ -40,16 +40,69 @@ pub const Uniforms = extern struct {
 };

 /// The target to load shaders for.
-pub const Target = enum { glsl, msl };
+///
+///   - `.glsl`: roundtripped through SPIR-V back to GLSL via
+///     spirv-cross. Normalizes/validates the source. The OpenGL
+///     backend consumes this.
+///   - `.msl`: spirv-cross translation to Metal Shading Language.
+///   - `.spv`: raw SPIR-V binary (no spirv-cross roundtrip). The
+///     Vulkan backend consumes this — Vulkan compiles GLSL → SPIR-V
+///     itself via glslang for its built-in shaders, and feeding
+///     the user shader through GLSL→SPIR-V→GLSL→SPIR-V again costs
+///     2× the compile work AND loses the original source structure
+///     (which broke our `gl_FragCoord` Y-flip rewrite when the
+///     spirv-cross-emitted main() didn't match the upstream prefix).
+pub const Target = enum { glsl, msl, spv };
+
+/// Optional GLSL → GLSL rewriter applied between the prefix splice
+/// and the SPIR-V compile. Vulkan plugs in `vulkanizeGlsl` here so
+/// SPIR-V output uses the renderer's multi-set descriptor layout;
+/// other backends pass `null`. Owns its allocation under the
+/// caller's allocator (`shadertoy.loadFromFile` runs it inside an
+/// arena that's torn down at function exit, so the rewriter's
+/// returned slice may be arena-owned).
+pub const Rewriter = *const fn (
+    alloc: Allocator,
+    src: []const u8,
+) Allocator.Error![:0]const u8;
+
+/// What `loadFromFile`/`loadFromFiles` need beyond the path itself.
+/// Keeps the function decoupled from any specific backend — every
+/// backend-flavored knob becomes an explicit field, and `shadertoy`
+/// itself reaches into no other backend's submodules.
+pub const LoadOptions = struct {
+    /// Output language / format. See `Target` for the per-variant
+    /// rationale.
+    target: Target,
+
+    /// `#define <body>` lines injected after the prefix's
+    /// `#version` directive. Vulkan passes
+    /// `&.{"GHASTTY_VULKAN 1"}` so the prefix's `main()` flips
+    /// `gl_FragCoord.y` and wraps `texture()` for upper-left
+    /// sampling; OpenGL/MSL pass `&.{}`.
+    extra_defines: []const []const u8 = &.{},
+
+    /// Optional second-pass GLSL transform run between the prefix
+    /// splice and the SPIR-V compile. Vulkan installs
+    /// `vulkan/shaders.zig:vulkanizeGlsl` here for the multi-set
+    /// descriptor layout rewrite; other backends leave it null.
+    rewrite: ?Rewriter = null,
+};

 /// Load a set of shaders from files and convert them to the target
 /// format. The shader order is preserved.
+///
+/// Result element type depends on `opts.target`: `.glsl`/`.msl`
+/// produce null-terminated UTF-8 source strings; `.spv` produces
+/// SPIR-V binary bytes (4-byte-aligned, no trailing null). We unify
+/// the return type as `[]const []const u8` and have the caller cast/
+/// reinterpret as needed.
 pub fn loadFromFiles(
    alloc_gpa: Allocator,
    paths: configpkg.RepeatablePath,
-    target: Target,
-) ![]const [:0]const u8 {
-    var list: std.ArrayList([:0]const u8) = .empty;
+    opts: LoadOptions,
+) ![]const []const u8 {
+    var list: std.ArrayList([]const u8) = .empty;
    defer list.deinit(alloc_gpa);
    errdefer for (list.items) |shader| alloc_gpa.free(shader);

@ -59,13 +112,19 @@ pub fn loadFromFiles(
            .required => |path| .{ path, false },
        };

-        const shader = loadFromFile(alloc_gpa, path, target) catch |err| {
+        const shader = loadFromFile(alloc_gpa, path, opts) catch |err| {
            if (err == error.FileNotFound and optional) {
                continue;
            }

            return err;
        };
+        // Take ownership of `shader` immediately. If the subsequent
+        // `list.append` itself OOMs, the freshly-loaded slice would
+        // leak — `errdefer` at the function level only iterates
+        // `list.items`, and `shader` isn't in `list` yet. Free it
+        // explicitly on the error path before propagating.
+        errdefer alloc_gpa.free(shader);
        log.info("loaded custom shader path={s}", .{path});
        try list.append(alloc_gpa, shader);
    }
@ -75,11 +134,16 @@ pub fn loadFromFiles(

 /// Load a single shader from a file and convert it to the target language
 /// ready to be used with renderers.
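+///
+/// For `.glsl` / `.msl` the returned slice is a null-terminated UTF-8
+/// source string; the underlying allocation is `[:0]const u8` and
+/// callers that need the sentinel may safely cast. For `.spv` the
+/// returned slice is raw SPIR-V bytes — no terminator, 4-byte aligned.
+///
+/// NOTE(review): the `[]const u8` result type carries neither the
+/// sentinel nor the `u32` alignment. Freeing the result through that
+/// type therefore presumably hands the allocator a length one byte
+/// short (`.glsl` / `.msl`) or an alignment of 1 instead of 4
+/// (`.spv`), while `std.mem.Allocator` expects both to match the
+/// original allocation — confirm every owner restores the original
+/// slice type before calling `free`.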
 pub fn loadFromFile(
    alloc_gpa: Allocator,
    path: []const u8,
-    target: Target,
-) ![:0]const u8 {
+    opts: LoadOptions,
+) ![]const u8 {
    var arena = ArenaAllocator.init(alloc_gpa);
    defer arena.deinit();
    const alloc = arena.allocator();
@ -97,14 +161,33 @@ pub fn loadFromFile(
        );
    };

-    // Convert to full GLSL
-    const glsl: [:0]const u8 = glsl: {
+    // Convert to full GLSL. `opts.extra_defines` lets a backend
+    // inject `#define <body>` lines after the prefix's `#version`
+    // directive — Vulkan uses this to flip `gl_FragCoord.y` and
+    // wrap `texture()` for upper-left sampling so `mainImage` sees
+    // shadertoy-convention coords; OpenGL/MSL pass `&.{}` and use
+    // the GL-native paths unchanged.
+    const glsl_raw: [:0]const u8 = glsl: {
        var stream: std.Io.Writer.Allocating = .init(alloc);
-        try glslFromShader(&stream.writer, src);
+        try glslFromShader(&stream.writer, src, opts.extra_defines);
        try stream.writer.writeByte(0);
        break :glsl stream.written()[0 .. stream.written().len - 1 :0];
    };

+    // Optional second-pass GLSL transform. Vulkan installs
+    // `vulkanizeGlsl` here so the resulting SPIR-V uses the
+    // renderer's multi-set descriptor layout (UBO=set 0,
+    // samplers=set 1, storage=set 2). Without that rewrite,
+    // glslang assigns everything to `set 0` and the post pipeline's
+    // descriptor set layout points at the wrong slots — the
+    // shader's `iChannel0` ends up at set 0 binding 0 while the
+    // pipeline binds it at set 1 binding 0, sampling returns
+    // garbage / zero, output is transparent.
+    const glsl: [:0]const u8 = if (opts.rewrite) |f|
+        try f(alloc, glsl_raw)
+    else
+        glsl_raw;
+
    // Convert to SPIR-V
    const spirv: []const u8 = spirv: {
        var stream: std.Io.Writer.Allocating = .init(alloc);
@ -129,12 +212,47 @@ pub fn loadFromFile(
        break :spirv list.items;
    };

-    // Convert to MSL
-    return switch (target) {
-        // Important: using the alloc_gpa here on purpose because this
-        // is the final result that will be returned to the caller.
+    // Validate the SPIR-V regardless of target. glslang has succeeded
+    // at this point but a zero-length output would crash
+    // `vkCreateShaderModule` on the Vulkan path AND would make
+    // `glslFromSpv` / `mslFromSpv` produce empty/garbage GLSL/MSL
+    // with poor diagnostics. Hoist the checks above the switch so
+    // every backend gets the same defensive validation.
+    if (spirv.len < 4) {
+        std.log.warn(
+            "shadertoy: empty SPIR-V output (size={})",
+            .{spirv.len},
+        );
+        return error.InvalidShader;
+    }
+    // First 4 bytes are the SPIR-V magic word 0x07230203
+    // (little-endian). Reject anything else loudly.
+    const magic = std.mem.readInt(u32, spirv[0..4], .little);
+    if (magic != 0x07230203) {
+        std.log.warn(
+            "shadertoy: SPIR-V output missing magic word " ++
+                "(got 0x{x:0>8}, expected 0x07230203)",
+            .{magic},
+        );
+        return error.InvalidShader;
+    }
+
+    // Important: using the alloc_gpa here on purpose because this is
+    // the final result that will be returned to the caller (the arena
+    // gets torn down on function exit).
+    return switch (opts.target) {
        .glsl => try glslFromSpv(alloc_gpa, spirv),
        .msl => try mslFromSpv(alloc_gpa, spirv),
+        .spv => spv: {
+            // Copy the SPIR-V binary out of the arena into a
+            // 4-byte-aligned allocation under `alloc_gpa`. Vulkan
+            // expects `pCode: []const u32`, so over-aligning is safe;
+            // we return as `[]const u8` to share the unified return
+            // type with the GLSL/MSL paths.
+            const dst = try alloc_gpa.alignedAlloc(u8, .of(u32), spirv.len);
+            @memcpy(dst, spirv);
+            break :spv dst;
+        },
    };
 }

@ -144,19 +262,97 @@ pub fn loadFromFile(
 /// mainImage function and don't define any of the uniforms. This function
 /// will convert the ShaderToy shader into a valid GLSL shader that can be
 /// compiled and linked.
-pub fn glslFromShader(writer: *std.Io.Writer, src: []const u8) !void {
+pub fn glslFromShader(
+    writer: *std.Io.Writer,
+    src: []const u8,
+    /// Macros to inject as `#define <body>` lines after the prefix's
+    /// `#version` directive (GLSL requires `#version` first, so we
+    /// can't simply prepend). Empty for the default OpenGL/MSL paths;
+    /// the Vulkan SPV path uses this to flag the prefix's `main()`
+    /// to Y-flip `gl_FragCoord`.
+    defines: []const []const u8,
+) !void {
     const prefix = @embedFile("shaders/shadertoy_prefix.glsl");
-    try writer.writeAll(prefix);
+    if (defines.len == 0) {
+        try writer.writeAll(prefix);
+    } else {
+        // GLSL requires `#version` to be the first non-blank line,
+        // so we can't simply prepend defines. Find the first
+        // newline after `#version …` and inject defines on the
+        // following line.
+        //
+        // The prefix is `@embedFile`'d at comptime, so its bytes
+        // are known to the compiler — assert its shape once here
+        // rather than threading branchy fallback paths through the
+        // runtime. A future prefix edit that drops the leading
+        // `#version` or leaves the file without any newline will
+        // fail at comptime, not silently at runtime.
+        comptime {
+            if (std.mem.indexOfScalar(u8, prefix, '\n') == null) {
+                @compileError(
+                    "shadertoy_prefix.glsl must contain at least one newline " ++
+                        "for `#define` injection — see glslFromShader",
+                );
+            }
+            if (!std.mem.startsWith(u8, prefix, "#version")) {
+                @compileError(
+                    "shadertoy_prefix.glsl must start with `#version` " ++
+                        "(GLSL spec requirement) — see glslFromShader",
+                );
+            }
+        }
+        // Split point: everything up to and including the first
+        // newline is the `#version` line (guaranteed by the checks
+        // above); the defines go between it and the rest.
+        const first_nl = comptime std.mem.indexOfScalar(u8, prefix, '\n').?;
+        try writer.writeAll(prefix[0 .. first_nl + 1]);
+        for (defines) |def| {
+            try writer.writeAll("#define ");
+            try writer.writeAll(def);
+            try writer.writeAll("\n");
+        }
+        try writer.writeAll(prefix[first_nl + 1 ..]);
+    }
     try writer.writeAll("\n\n");
     try writer.writeAll(src);
 }

+/// Process-wide cache of compiled SPIR-V keyed by GLSL source bytes.
+/// The C-API glslang path (`Shader.create` / `program.spirvGenerate`)
+/// used below pulls allocations from glslang's thread-local
+/// TPoolAllocator on every call — pages that are never released
+/// because Zig pthreads don't run C++ thread_local destructors. With
+/// N tabs each calling `loadFromFiles` → `loadFromFile` →
+/// `spirvFromGlsl` for the same custom shader file, that's N
+/// renderer threads each leaking a per-thread pool. Caching the SPV
+/// bytes lets every call after the first short-circuit without
+/// touching glslang.
+///
+/// Same problem and same fix as the C++ shim's spv_cache in
+/// pkg/glslang/override/ghastty_vk_shim.cpp; this one covers the
+/// C-API path that the shim doesn't see.
+var spv_cache_mutex: std.Thread.Mutex = .{};
+var spv_cache: std.StringHashMapUnmanaged([]const u8) = .empty;
+
 /// Convert a GLSL shader into SPIR-V assembly.
 pub fn spirvFromGlsl(
    writer: *std.Io.Writer,
    errlog: ?*SpirvLog,
    src: [:0]const u8,
 ) !void {
+    // Cache check. On hit, write the cached SPV to the writer and
+    // return without entering glslang. Strict-equality keying on
+    // the source bytes (`src.len` bytes; the NUL sentinel is not
+    // part of the key) — the input is deterministically generated
+    // upstream from a stable shader file + a small set of
+    // `#define` lines, so identical sources produce identical SPV.
+    {
+        spv_cache_mutex.lock();
+        defer spv_cache_mutex.unlock();
+        const key: []const u8 = src[0..src.len];
+        if (spv_cache.get(key)) |cached| {
+            try writer.writeAll(cached);
+            return;
+        }
+    }
+
    // So we can run unit tests without fear.
    if (builtin.is_test) try glslang.testing.ensureInit();

@ -205,6 +401,26 @@ pub fn spirvFromGlsl(
    const ptr_u8: [*]u8 = @ptrCast(ptr);
    const slice_u8: []u8 = ptr_u8[0 .. size * 4];
    try writer.writeAll(slice_u8);
+
+    // Populate the cache so the next surface's compile of the same
+    // source short-circuits. Allocations are process-lifetime
+    // (smp_allocator, never freed) — the keys + values are bounded
+    // by the number of distinct shaders the user has configured,
+    // which is small (typically 1-3); even at 100 KB per shader
+    // the total cache cost is negligible against the per-tab pool
+    // pages we'd otherwise leak.
+    //
+    // Caching is best-effort: the SPIR-V has already been written to
+    // `writer`, so an allocation failure here must not fail the
+    // call. Every bail-out below is therefore a plain `return`
+    // (success), which means `errdefer` would never fire — each
+    // failure path frees whatever it already duplicated by hand.
+    spv_cache_mutex.lock();
+    defer spv_cache_mutex.unlock();
+    const key: []const u8 = src[0..src.len];
+    // Another thread may have inserted the same source while we were
+    // compiling without the lock held; keep the existing entry.
+    if (spv_cache.contains(key)) return;
+
+    const gpa = std.heap.smp_allocator;
+    const key_copy = gpa.dupe(u8, key) catch return;
+    const spv_copy = gpa.dupe(u8, slice_u8) catch {
+        gpa.free(key_copy);
+        return;
+    };
+    spv_cache.put(gpa, key_copy, spv_copy) catch {
+        // The map took ownership of neither slice; release both.
+        gpa.free(spv_copy);
+        gpa.free(key_copy);
+    };
 }

 /// Retrieve errors from spirv compilation.
@ -348,7 +564,7 @@ fn spvCross(
 fn testGlslZ(alloc: Allocator, src: []const u8) ![:0]const u8 {
     var buf: std.Io.Writer.Allocating = .init(alloc);
     defer buf.deinit();
-    try glslFromShader(&buf.writer, src);
+    // No extra defines: the prefix is written unmodified.
+    try glslFromShader(&buf.writer, src, &.{});
     return try buf.toOwnedSliceSentinel(0);
 }

@ -424,4 +640,3 @@ test "shadertoy to glsl" {

 const test_crt = @embedFile("shaders/test_shadertoy_crt.glsl");
 const test_invalid = @embedFile("shaders/test_shadertoy_invalid.glsl");
-const test_focus = @embedFile("shaders/test_shadertoy_focus.glsl");
--- a/src/renderer/vulkan/Frame.zig
+++ b/src/renderer/vulkan/Frame.zig
@ -0,0 +1,242 @@
+//! Per-draw recording context. Lifecycle: `begin` → caller records
+//! commands (via the `renderPass()` accessor) → `complete`.
+//!
+//! Unlike `opengl/Frame.zig` (which is a zero-state wrapper around
+//! the implicit GL context), Vulkan's Frame drives the explicit
+//! sync model: a fence is signaled when the GPU finishes the
+//! frame's submit, and `complete` waits on it before handing the
+//! dmabuf fd to the host. That's required for correctness — the
+//! host shouldn't sample memory the GPU is still writing — and
+//! acceptable for perf because terminal frames cap at ~60Hz.
+//!
+//! Ownership: the command buffer and fence are owned by the
+//! top-level renderer (`Vulkan.zig`) and passed into
+//! `begin` via `Options`. Frame just borrows them. The top-level
+//! is responsible for creating/destroying them and for resetting
+//! the fence to unsignaled state before `begin` (this layer would
+//! conflate ownership otherwise).
+//!
+//! Why not semaphores? With dmabuf export to the host (rather than
+//! a `VkSwapchain` we own), we have no acquire/present semaphore
+//! pair to sync against. Fence-only is the right model when
+//! libghostty hands the host a "GPU is done writing to this fd"
+//! guarantee at present time. The host's own compositor handles
+//! display sync from there.
+//!
+//! `renderPass()` opens a `vulkan/RenderPass.zig` pass that records
+//! into this frame's command buffer. Callers issue per-pipeline
+//! `step()` calls on the returned pass and finalize it with
+//! `complete()` before calling the frame's own `complete`.
+//!
+//! Counterpart: `src/renderer/opengl/Frame.zig`.
+
+const Self = @This();
+
+const std = @import("std");
+const vulkan = @import("vulkan");
+const vk = vulkan.c;
+
+const Device = vulkan.Device;
+const DescriptorPool = vulkan.DescriptorPool;
+const Target = @import("Target.zig");
+const RenderPass = @import("RenderPass.zig");
+
+const Vulkan = @import("../Vulkan.zig");
+const Renderer = @import("../generic.zig").Renderer(Vulkan);
+const Health = @import("../../renderer.zig").Health;
+
+const log = std.log.scoped(.vulkan);
+
+/// Caller-owned resources a frame borrows while it records. `begin`
+/// copies these handles into the `Frame` it returns.
+pub const Options = struct {
+    /// Command buffer this frame's commands record into. Caller
+    /// resets it to a fresh state before `begin` is called.
+    cb: vk.VkCommandBuffer,
+
+    /// Fence that gets signaled when the submit completes. Caller
+    /// resets it to unsignaled before `begin` is called.
+    fence: vk.VkFence,
+
+    /// Per-frame descriptor pool. `RenderPass.step` borrows it for
+    /// the per-call descriptor sets it allocates whenever a
+    /// pipeline is re-used within a single pass. The pool is
+    /// caller-owned (top-level `Vulkan.zig` keeps it threadlocal)
+    /// and must be reset (`vkResetDescriptorPool`) by the caller
+    /// before each Frame.begin so this frame's allocations don't
+    /// pile on the previous frame's. `renderPass` forwards it
+    /// unchanged to `RenderPass.begin`.
+    step_pool: ?*DescriptorPool = null,
+};
+
+/// Errors returned by `begin`. `complete` returns `void`: it logs its
+/// own Vulkan failures and reports them to the renderer as an
+/// unhealthy frame via `frameCompleted` instead of returning them.
+pub const Error = error{
+    /// A Vulkan call returned a non-success status. In this file
+    /// only `vkBeginCommandBuffer` (in `begin`) surfaces as this
+    /// error; `vkEndCommandBuffer` / `vkQueueSubmit` /
+    /// `vkWaitForFences` failures in `complete` are logged and
+    /// folded into `Health` rather than returned.
+    VulkanFailed,
+};
+
+device: *const Device,
+renderer: *Renderer,
+target: *Target,
+cb: vk.VkCommandBuffer,
+fence: vk.VkFence,
+step_pool: ?*DescriptorPool = null,
+
+/// Start recording a frame into `opts.cb`. Recording begins with
+/// `ONE_TIME_SUBMIT` because each recording is submitted once before
+/// the next `begin` reuses the buffer. Neither the command buffer
+/// nor the fence is reset here; `Options` makes that the caller's
+/// job before calling in.
+///
+/// Logs the `VkResult` and returns `error.VulkanFailed` when
+/// `vkBeginCommandBuffer` reports anything but `VK_SUCCESS`.
+pub fn begin(
+    opts: Options,
+    device: *const Device,
+    renderer: *Renderer,
+    target: *Target,
+) Error!Self {
+    const info: vk.VkCommandBufferBeginInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+        .pNext = null,
+        .flags = vk.VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
+        .pInheritanceInfo = null,
+    };
+
+    const status = device.dispatch.beginCommandBuffer(opts.cb, &info);
+    if (status == vk.VK_SUCCESS) {
+        // Recording is live: hand back a Frame that borrows the
+        // caller's handles for the duration of the draw.
+        return .{
+            .device = device,
+            .renderer = renderer,
+            .target = target,
+            .cb = opts.cb,
+            .fence = opts.fence,
+            .step_pool = opts.step_pool,
+        };
+    }
+
+    log.err("vkBeginCommandBuffer (frame) failed: result={}", .{status});
+    return error.VulkanFailed;
+}
+
+/// End recording, submit to the queue with `self.fence`, and (if
+/// `sync` is true, which it always is for our dmabuf-export model)
+/// wait on the fence so the GPU is guaranteed to be done before
+/// the host imports the target's dmabuf.
+///
+/// `sync == false` is accepted by the interface for parity with
+/// `opengl/Frame.zig`, but currently still does the wait — without
+/// it, handing the dmabuf fd to the host would race the GPU. The
+/// argument may eventually drive multi-frame pipelining once a
+/// proper queue of frames is in flight.
+///
+/// Returns nothing: every Vulkan failure below is logged, folded
+/// into a `Health` value, and delivered to the renderer through
+/// `frameCompleted`, which is called on every path.
+pub fn complete(self: *const Self, sync: bool) void {
+    // `sync` is part of the cross-backend `Frame.complete` interface
+    // (OpenGL / Metal / Vulkan all share it). The Vulkan path is
+    // always synchronous today: we waitForFences before handing the
+    // dmabuf fd to the host, and the host cannot sample a buffer
+    // mid-GPU-write. So `sync=false` is silently treated as
+    // `sync=true`. If multi-frame pipelining ever lands, this is
+    // where the param would gate the wait.
+    _ = sync;
+    const dev = self.device;
+
+    // `health` becomes `.unhealthy` on any GPU-side error below. We
+    // ALWAYS run `buffer_pool.cycle` and `frameCompleted` on the
+    // way out — skipping them on error left every retired buffer
+    // stuck in `pending` (unbounded growth) and held the renderer's
+    // swap-chain semaphore forever, so the NEXT `drawFrame` would
+    // hang with no diagnostic.
+    var health: Health = .healthy;
+    // Set once `queueSubmit` has succeeded; only consulted to decide
+    // whether the `deviceWaitIdle` fallback below is needed.
+    var submitted = false;
+
+    // Make the rendered pixels visible to the host's mmap read. In
+    // `.direct` mode this is just a memory barrier; in `.legacy_copy`
+    // mode it also runs `vkCmdCopyImageToBuffer`. See `Target.zig`.
+    self.target.recordPresentBarrier(self.cb);
+
+    // Labeled block: each failing stage marks the frame unhealthy and
+    // `break`s straight to the shared cleanup below instead of
+    // returning early.
+    end_cb: {
+        const r = dev.dispatch.endCommandBuffer(self.cb);
+        if (r != vk.VK_SUCCESS) {
+            log.err("vkEndCommandBuffer (frame) failed: result={}", .{r});
+            health = .unhealthy;
+            break :end_cb;
+        }
+
+        const submit_info: vk.VkSubmitInfo = .{
+            .sType = vk.VK_STRUCTURE_TYPE_SUBMIT_INFO,
+            .pNext = null,
+            .waitSemaphoreCount = 0,
+            .pWaitSemaphores = null,
+            .pWaitDstStageMask = null,
+            .commandBufferCount = 1,
+            .pCommandBuffers = &self.cb,
+            .signalSemaphoreCount = 0,
+            .pSignalSemaphores = null,
+        };
+        // Externally-synchronized via `Device.queueSubmit` — splits
+        // and tabs share the host's VkQueue and Vulkan rejects
+        // concurrent unsynchronized access.
+        const sr = dev.queueSubmit(1, &submit_info, self.fence);
+        if (sr != vk.VK_SUCCESS) {
+            log.err("vkQueueSubmit (frame) failed: result={}", .{sr});
+            health = .unhealthy;
+            // NOTE(review): on this path (and the endCommandBuffer
+            // failure above) `self.fence` was never handed to a
+            // successful submit — confirm the fence owner does not
+            // later wait on it expecting a signal.
+            break :end_cb;
+        }
+        submitted = true;
+
+        // Wait for the GPU to finish writing the target before letting
+        // the host import the dmabuf. UINT64_MAX = "wait indefinitely".
+        const wr = dev.dispatch.waitForFences(
+            dev.device,
+            1,
+            &self.fence,
+            vk.VK_TRUE,
+            std.math.maxInt(u64),
+        );
+        if (wr != vk.VK_SUCCESS) {
+            log.err("vkWaitForFences (frame) failed: result={}", .{wr});
+            health = .unhealthy;
+        }
+    }
+
+    // Recycle the per-frame Buffer pool. Even on the error path we
+    // still want to cycle: buffers that the failed submit referenced
+    // are now stuck (we can't prove the GPU is done with them), so
+    // we conservatively wait the device idle when submit DID happen
+    // but the fence wait failed (DEVICE_LOST etc.) before draining.
+    // Without that wait, every failed submit could leak the buffers
+    // the renderer queued for the frame.
+    if (health == .unhealthy and submitted) {
+        // Best-effort: the result is deliberately discarded, the
+        // frame is already being reported as unhealthy.
+        _ = dev.dispatch.deviceWaitIdle(dev.device);
+    }
+    Vulkan.buffer_pool.cycle(dev);
+
+    // Hand the rendered target off to the host. On the unhealthy
+    // path we skip present — the dmabuf may be partially written
+    // and the host should see the previous frame instead (the
+    // generic renderer's no-op-frame logic re-presents
+    // `last_target`).
+    if (health == .healthy) {
+        self.renderer.api.present(self.target) catch |err| {
+            log.err("present failed: {}", .{err});
+            health = .unhealthy;
+        };
+    }
+
+    // Tell the generic renderer the frame is done so it releases the
+    // swap-chain semaphore. Without this, `SwapChain.nextFrame()`
+    // blocks the second call to `drawFrame` forever (one buffer in
+    // the chain, never freed). MUST run regardless of `health`.
+    self.renderer.frameCompleted(health);
+}
+
+/// Open a render pass that records into this frame's command
+/// buffer. The frame's device, command buffer and per-frame
+/// descriptor pool are forwarded to `RenderPass.begin` together with
+/// `attachments`. Callers issue `step()` calls on the returned
+/// `RenderPass` for the per-pipeline draw work and finish it with
+/// `complete()`.
+pub inline fn renderPass(
+    self: *const Self,
+    attachments: []const RenderPass.Options.Attachment,
+) RenderPass {
+    return RenderPass.begin(.{
+        .attachments = attachments,
+        .step_pool = self.step_pool,
+        .cb = self.cb,
+        .device = self.device,
+    });
+}
+
+test {
+    std.testing.refAllDecls(@This());
+}
--- a/src/renderer/vulkan/Pipeline.zig
+++ b/src/renderer/vulkan/Pipeline.zig
@ -0,0 +1,466 @@
+//! `VkPipeline` (graphics) + the `VkPipelineLayout` that backs it.
+//!
+//! Vulkan 1.3 with **dynamic rendering**: we use
+//! `VkPipelineRenderingCreateInfo` (chained into the pipeline create
+//! info via `pNext`) instead of constructing a `VkRenderPass` + a
+//! framebuffer per target. This removes the entire RenderPass /
+//! Framebuffer object lifecycle the OpenGL backend never had to
+//! think about — saves significant boilerplate.
+//!
+//! Wrapper scope: the renderer-level "what shaders + what attachment
+//! format" lives in `vulkan/shaders.zig`'s eventual `Shaders` struct
+//! (mirroring `opengl/shaders.zig`). This file is the bare
+//! `VkPipeline` wrapper that takes everything explicitly:
+//! pre-compiled shader modules, descriptor set layouts, push
+//! constant ranges, vertex input description, color attachment
+//! format. The renderer's pipeline-collection assembly layer is
+//! responsible for plumbing those together — Pipeline.zig has no
+//! per-shader knowledge.
+//!
+//! Counterpart: `src/renderer/opengl/Pipeline.zig`.
+
+const Self = @This();
+
+const std = @import("std");
+const vulkan = @import("vulkan");
+const vk = vulkan.c;
+
+const Device = vulkan.Device;
+const DescriptorPool = vulkan.DescriptorPool;
+
+const log = std.log.scoped(.vulkan);
+
+/// How a vertex buffer advances during a draw. `Pipeline.init`
+/// translates this into the binding's Vulkan input rate.
+pub const StepFunction = enum {
+    /// Constant value across all vertices (no vertex input).
+    /// `Pipeline.init` maps this to `VK_VERTEX_INPUT_RATE_VERTEX`,
+    /// the same rate it uses for `per_vertex`.
+    constant,
+    /// One per vertex.
+    per_vertex,
+    /// One per instance (`VK_VERTEX_INPUT_RATE_INSTANCE`).
+    per_instance,
+};
+
+/// Vertex input description. Pass `null` for shaders that don't read
+/// vertex attributes (e.g. screen-quad shaders that derive position
+/// from `gl_VertexIndex`). `Pipeline.init` turns a non-null value
+/// into a single vertex binding description at `binding = 0`.
+pub const VertexInput = struct {
+    /// Byte stride of the vertex buffer.
+    stride: u32,
+
+    /// Whether the buffer is stepped per-vertex or per-instance.
+    step_fn: StepFunction = .per_vertex,
+
+    /// `binding = 0` attribute descriptions describing each field of
+    /// the vertex struct. The caller is responsible for building
+    /// these (offsets, formats) — Pipeline doesn't introspect; the
+    /// slice is passed through to Vulkan unchanged.
+    attributes: []const vk.VkVertexInputAttributeDescription,
+};
+
+/// Maximum descriptor sets a single pipeline can address. The
+/// preprocessor in `shaders.zig` bins resources into 3 sets (UBO=0,
+/// sampler=1, storage=2), so 3 is sufficient. Bump if/when a fourth
+/// resource class is introduced.
+pub const MAX_DESCRIPTOR_SETS: usize = 3;
+
+/// Everything `Pipeline.init` needs, passed explicitly. Handles in
+/// here are borrowed from the caller unless a field says otherwise.
+pub const Options = struct {
+    device: *const Device,
+
+    /// Optional descriptor pool. If provided, `Pipeline.init`
+    /// allocates one descriptor set per non-null entry in
+    /// `descriptor_set_layouts` and stores them on
+    /// `Pipeline.descriptor_sets[i]`, indexed by set number.
+    /// `RenderPass.step` updates + binds them per frame.
+    descriptor_pool: ?*DescriptorPool = null,
+
+    /// Shader modules. The caller owns these — Pipeline does not
+    /// destroy them on deinit (they're typically reused across
+    /// multiple pipelines and outlive any one of them).
+    vertex_module: vk.VkShaderModule,
+    fragment_module: vk.VkShaderModule,
+
+    /// Optional vertex input. `null` ⇒ no vertex bindings.
+    vertex_input: ?VertexInput = null,
+
+    /// Per-set descriptor layouts. Element i corresponds to `set = i`
+    /// in the shader. `null` slots are placeholders for sets the
+    /// pipeline doesn't actually use — Vulkan requires the pipeline
+    /// layout's `pSetLayouts` to be contiguous up to the max used
+    /// set number, so we substitute `empty_set_layout` for nulls.
+    ///
+    /// At most `MAX_DESCRIPTOR_SETS` entries: `init` logs and fails
+    /// with `error.VulkanFailed` for a longer slice.
+    descriptor_set_layouts: []const ?vk.VkDescriptorSetLayout = &.{},
+
+    /// 0-binding placeholder layout used to fill `null` entries in
+    /// `descriptor_set_layouts`. Required when any entry is null;
+    /// can stay null when every entry is non-null. Owned by the
+    /// caller (`Shaders.init` caches one and reuses it).
+    ///
+    /// If a null entry is encountered while this is also null,
+    /// `init` logs the offending set index and fails with
+    /// `error.VulkanFailed`.
+    empty_set_layout: vk.VkDescriptorSetLayout = null,
+
+    /// Push constant ranges referenced by the shaders.
+    push_constant_ranges: []const vk.VkPushConstantRange = &.{},
+
+    /// Default sampler the pipeline owns and uses for every
+    /// combined-image-sampler binding the caller doesn't supply a
+    /// sampler for. Lets the renderer pass plain `textures` (parallel
+    /// to OpenGL's per-texture `glBindTextureUnit` model) without
+    /// having to also track per-binding samplers; the pipeline knows
+    /// the right sampler for its own atlases (e.g. cell_text uses
+    /// unnormalized coords for `sampler2D` standing in for the old
+    /// `sampler2DRect`). The handle is borrowed, not owned by
+    /// `Pipeline` — `Shaders.init` owns the lifetime.
+    sampler: vk.VkSampler = null,
+
+    /// Color attachment format. With dynamic rendering this must
+    /// match the format of the image the renderer eventually targets
+    /// in `vkCmdBeginRendering`.
+    color_format: vk.VkFormat,
+
+    /// Pre-multiplied-alpha source-over blending. Disable for
+    /// the bg_color pass (full opaque background).
+    blending_enabled: bool = true,
+
+    /// Primitive topology. The renderer's shaders use TRIANGLE_STRIP
+    /// for the full-screen quad and TRIANGLE_LIST for instanced cells.
+    topology: vk.VkPrimitiveTopology = vk.VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
+};
+
+pub const Error = error{
+    /// `vkCreatePipelineLayout` or `vkCreateGraphicsPipelines`
+    /// returned a non-success status. `init` also returns this for
+    /// invalid options: more than `MAX_DESCRIPTOR_SETS` set layouts,
+    /// or a null set layout with no `empty_set_layout` to stand in.
+    VulkanFailed,
+};
+
+device: *const Device,
+pipeline: vk.VkPipeline,
+layout: vk.VkPipelineLayout,
+
+/// Descriptor sets allocated from `opts.descriptor_pool`, indexed by
+/// set number. `descriptor_sets[i]` is the set bound at `set = i` in
+/// the shader; `null` means the pipeline doesn't use that set (so
+/// `RenderPass.step` skips updating/binding it). `set_count` is one
+/// past the last non-null index, matching what
+/// `vkCmdBindDescriptorSets` needs as `setCount`.
+///
+/// HOT-PATH NOTE: these sets are SHARED across all `step()` calls
+/// that bind this pipeline within a single command buffer, but
+/// `vkCmdDraw` reads descriptors at submit time, so re-using the
+/// same pipeline twice with different per-call resources would
+/// cause both draws to see the LAST update's bindings.
+/// `RenderPass.step` defends against this by allocating a fresh
+/// per-call set from the pass's `step_pool` whenever the per-step
+/// resources differ; these `descriptor_sets[i]` slots act as
+/// pre-warmed defaults (used only when the call site is
+/// single-step-per-pipeline like bg_color / cell_bg).
+descriptor_sets: [MAX_DESCRIPTOR_SETS]vk.VkDescriptorSet = .{ null, null, null },
+set_count: u32 = 0,
+
+/// Descriptor set layouts associated with this pipeline, indexed by
+/// set number. `null` matches a `null` slot in `descriptor_sets`.
+/// Stored so `RenderPass.step` can allocate per-call sets from the
+/// pass's per-frame descriptor pool without round-tripping through
+/// the original `Shaders.init` layout-creation code path.
+descriptor_set_layouts: [MAX_DESCRIPTOR_SETS]vk.VkDescriptorSetLayout = .{ null, null, null },
+
+/// Binding number that `Step.uniforms` writes to within set 0.
+/// Defaults to 1 to match `common.glsl`'s
+/// `layout(binding = 1, std140) uniform Globals`. Override per
+/// pipeline if a different shader uses a different slot.
+uniforms_binding: u32 = 1,
+
+/// Pipeline-owned fallback sampler. See `Options.sampler`.
+sampler: vk.VkSampler = null,
+
+/// Vertex buffer stride (bytes). Needed so `RenderPass.step` can
+/// bind a vertex buffer with the right per-instance/per-vertex
+/// stride. Defaults to 0 (no vertex buffer); set automatically when
+/// `Options.vertex_input` is non-null.
+vertex_stride: u32 = 0,
+
+pub fn init(opts: Options) Error!Self {
+    const dev = opts.device;
+
+    if (opts.descriptor_set_layouts.len > MAX_DESCRIPTOR_SETS) {
+        log.err(
+            "Pipeline.init: {} descriptor sets exceeds MAX_DESCRIPTOR_SETS={}",
+            .{ opts.descriptor_set_layouts.len, MAX_DESCRIPTOR_SETS },
+        );
+        return error.VulkanFailed;
+    }
+
+    // ---- pipeline layout ---------------------------------------
+    //
+    // Build a flat array of VkDescriptorSetLayout where index i is
+    // the layout for set=i. Null entries in `opts.descriptor_set_layouts`
+    // get substituted with `opts.empty_set_layout` — Vulkan rejects
+    // VK_NULL_HANDLE in `pSetLayouts`. `Shaders.init` always supplies
+    // an empty layout when any null appears.
+    var flat_dsls: [MAX_DESCRIPTOR_SETS]vk.VkDescriptorSetLayout = .{ null, null, null };
+    for (opts.descriptor_set_layouts, 0..) |maybe_dsl, i| {
+        if (maybe_dsl) |dsl| {
+            flat_dsls[i] = dsl;
+        } else if (opts.empty_set_layout != null) {
+            flat_dsls[i] = opts.empty_set_layout;
+        } else {
+            log.err(
+                "Pipeline.init: set {} is null but no empty_set_layout was provided",
+                .{i},
+            );
+            return error.VulkanFailed;
+        }
+    }
+    const layout_info: vk.VkPipelineLayoutCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+        .pNext = null,
+        .flags = 0,
+        .setLayoutCount = @intCast(opts.descriptor_set_layouts.len),
+        .pSetLayouts = if (opts.descriptor_set_layouts.len > 0) &flat_dsls else null,
+        .pushConstantRangeCount = @intCast(opts.push_constant_ranges.len),
+        .pPushConstantRanges = if (opts.push_constant_ranges.len > 0)
+            opts.push_constant_ranges.ptr
+        else
+            null,
+    };
+    var layout: vk.VkPipelineLayout = undefined;
+    {
+        const r = dev.dispatch.createPipelineLayout(dev.device, &layout_info, null, &layout);
+        if (r != vk.VK_SUCCESS) {
+            log.err("vkCreatePipelineLayout failed: result={}", .{r});
+            return error.VulkanFailed;
+        }
+    }
+    errdefer dev.dispatch.destroyPipelineLayout(dev.device, layout, null);
+
+    // ---- shader stages -----------------------------------------
+    const stages: [2]vk.VkPipelineShaderStageCreateInfo = .{
+        .{
+            .sType = vk.VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+            .pNext = null,
+            .flags = 0,
+            .stage = vk.VK_SHADER_STAGE_VERTEX_BIT,
+            .module = opts.vertex_module,
+            .pName = "main",
+            .pSpecializationInfo = null,
+        },
+        .{
+            .sType = vk.VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+            .pNext = null,
+            .flags = 0,
+            .stage = vk.VK_SHADER_STAGE_FRAGMENT_BIT,
+            .module = opts.fragment_module,
+            .pName = "main",
+            .pSpecializationInfo = null,
+        },
+    };
+
+    // ---- vertex input -------------------------------------------
+    var vi_binding: vk.VkVertexInputBindingDescription = undefined;
+    const vertex_input: vk.VkPipelineVertexInputStateCreateInfo = if (opts.vertex_input) |vi| blk: {
+        vi_binding = .{
+            .binding = 0,
+            .stride = vi.stride,
+            .inputRate = switch (vi.step_fn) {
+                .constant, .per_vertex => vk.VK_VERTEX_INPUT_RATE_VERTEX,
+                .per_instance => vk.VK_VERTEX_INPUT_RATE_INSTANCE,
+            },
+        };
+        break :blk .{
+            .sType = vk.VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+            .pNext = null,
+            .flags = 0,
+            .vertexBindingDescriptionCount = 1,
+            .pVertexBindingDescriptions = &vi_binding,
+            .vertexAttributeDescriptionCount = @intCast(vi.attributes.len),
+            .pVertexAttributeDescriptions = vi.attributes.ptr,
+        };
+    } else .{
+        .sType = vk.VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+        .pNext = null,
+        .flags = 0,
+        .vertexBindingDescriptionCount = 0,
+        .pVertexBindingDescriptions = null,
+        .vertexAttributeDescriptionCount = 0,
+        .pVertexAttributeDescriptions = null,
+    };
+
+    // ---- input assembly + viewport (dynamic) + raster + ms ------
+    const input_assembly: vk.VkPipelineInputAssemblyStateCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .pNext = null,
+        .flags = 0,
+        .topology = opts.topology,
+        .primitiveRestartEnable = vk.VK_FALSE,
+    };
+    const viewport_state: vk.VkPipelineViewportStateCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+        .pNext = null,
+        .flags = 0,
+        .viewportCount = 1,
+        .pViewports = null,
+        .scissorCount = 1,
+        .pScissors = null,
+    };
+    const rasterization: vk.VkPipelineRasterizationStateCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+        .pNext = null,
+        .flags = 0,
+        .depthClampEnable = vk.VK_FALSE,
+        .rasterizerDiscardEnable = vk.VK_FALSE,
+        .polygonMode = vk.VK_POLYGON_MODE_FILL,
+        .cullMode = vk.VK_CULL_MODE_NONE,
+        .frontFace = vk.VK_FRONT_FACE_COUNTER_CLOCKWISE,
+        .depthBiasEnable = vk.VK_FALSE,
+        .depthBiasConstantFactor = 0,
+        .depthBiasClamp = 0,
+        .depthBiasSlopeFactor = 0,
+        .lineWidth = 1.0,
+    };
+    const multisample: vk.VkPipelineMultisampleStateCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+        .pNext = null,
+        .flags = 0,
+        .rasterizationSamples = vk.VK_SAMPLE_COUNT_1_BIT,
+        .sampleShadingEnable = vk.VK_FALSE,
+        .minSampleShading = 0,
+        .pSampleMask = null,
+        .alphaToCoverageEnable = vk.VK_FALSE,
+        .alphaToOneEnable = vk.VK_FALSE,
+    };
+
+    // ---- color blend --------------------------------------------
+    // Pre-multiplied alpha source-over: out = src + dst*(1-src.a).
+    // Same as the OpenGL backend's default blend (and what the
+    // shaders are written to produce).
+    const blend_attachment: vk.VkPipelineColorBlendAttachmentState = .{
+        .blendEnable = if (opts.blending_enabled) vk.VK_TRUE else vk.VK_FALSE,
+        .srcColorBlendFactor = vk.VK_BLEND_FACTOR_ONE,
+        .dstColorBlendFactor = vk.VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA,
+        .colorBlendOp = vk.VK_BLEND_OP_ADD,
+        .srcAlphaBlendFactor = vk.VK_BLEND_FACTOR_ONE,
+        .dstAlphaBlendFactor = vk.VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA,
+        .alphaBlendOp = vk.VK_BLEND_OP_ADD,
+        .colorWriteMask = vk.VK_COLOR_COMPONENT_R_BIT |
+            vk.VK_COLOR_COMPONENT_G_BIT |
+            vk.VK_COLOR_COMPONENT_B_BIT |
+            vk.VK_COLOR_COMPONENT_A_BIT,
+    };
+    const blend_state: vk.VkPipelineColorBlendStateCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+        .pNext = null,
+        .flags = 0,
+        .logicOpEnable = vk.VK_FALSE,
+        .logicOp = vk.VK_LOGIC_OP_COPY,
+        .attachmentCount = 1,
+        .pAttachments = &blend_attachment,
+        .blendConstants = .{ 0, 0, 0, 0 },
+    };
+
+    // ---- dynamic state -----------------------------------------
+    const dynamic_states = [_]vk.VkDynamicState{
+        vk.VK_DYNAMIC_STATE_VIEWPORT,
+        vk.VK_DYNAMIC_STATE_SCISSOR,
+    };
+    const dynamic_state: vk.VkPipelineDynamicStateCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
+        .pNext = null,
+        .flags = 0,
+        .dynamicStateCount = @intCast(dynamic_states.len),
+        .pDynamicStates = &dynamic_states,
+    };
+
+    // ---- dynamic rendering info (chained via pNext) ------------
+    var color_format = opts.color_format;
+    const rendering_info: vk.VkPipelineRenderingCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO,
+        .pNext = null,
+        .viewMask = 0,
+        .colorAttachmentCount = 1,
+        .pColorAttachmentFormats = &color_format,
+        .depthAttachmentFormat = vk.VK_FORMAT_UNDEFINED,
+        .stencilAttachmentFormat = vk.VK_FORMAT_UNDEFINED,
+    };
+
+    // ---- assemble + create -------------------------------------
+    const pipeline_info: vk.VkGraphicsPipelineCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+        .pNext = &rendering_info,
+        .flags = 0,
+        .stageCount = stages.len,
+        .pStages = &stages,
+        .pVertexInputState = &vertex_input,
+        .pInputAssemblyState = &input_assembly,
+        .pTessellationState = null,
+        .pViewportState = &viewport_state,
+        .pRasterizationState = &rasterization,
+        .pMultisampleState = &multisample,
+        .pDepthStencilState = null,
+        .pColorBlendState = &blend_state,
+        .pDynamicState = &dynamic_state,
+        .layout = layout,
+        // renderPass / subpass intentionally null — dynamic rendering.
+        .renderPass = null,
+        .subpass = 0,
+        .basePipelineHandle = null,
+        .basePipelineIndex = -1,
+    };
+    var pipeline: vk.VkPipeline = undefined;
+    {
+        const r = dev.dispatch.createGraphicsPipelines(
+            dev.device,
+            null, // pipeline cache
+            1,
+            &pipeline_info,
+            null,
+            &pipeline,
+        );
+        if (r != vk.VK_SUCCESS) {
+            log.err("vkCreateGraphicsPipelines failed: result={}", .{r});
+            return error.VulkanFailed;
+        }
+    }
+    errdefer dev.dispatch.destroyPipeline(dev.device, pipeline, null);
+
+    // Allocate one descriptor set per non-null entry in
+    // `opts.descriptor_set_layouts`. Null entries are placeholders
+    // (the shader's set=i isn't actually used) — nothing to allocate.
+    // Also remember the layouts on `Self` so `RenderPass.step` can
+    // allocate fresh per-call sets from a per-frame pool without
+    // re-creating layouts.
+    var dsets: [MAX_DESCRIPTOR_SETS]vk.VkDescriptorSet = .{ null, null, null };
+    var dsls: [MAX_DESCRIPTOR_SETS]vk.VkDescriptorSetLayout = .{ null, null, null };
+    if (opts.descriptor_pool) |pool_ptr| {
+        for (opts.descriptor_set_layouts, 0..) |maybe_dsl, i| {
+            if (maybe_dsl) |dsl| {
+                dsls[i] = dsl;
+                dsets[i] = pool_ptr.allocate(dsl) catch |err| {
+                    log.err(
+                        "Pipeline.init: descriptor set {} allocation failed: {}",
+                        .{ i, err },
+                    );
+                    return error.VulkanFailed;
+                };
+            }
+        }
+    } else {
+        for (opts.descriptor_set_layouts, 0..) |maybe_dsl, i| {
+            if (maybe_dsl) |dsl| dsls[i] = dsl;
+        }
+    }
+
+    return .{
+        .device = dev,
+        .pipeline = pipeline,
+        .layout = layout,
+        .descriptor_sets = dsets,
+        .descriptor_set_layouts = dsls,
+        .set_count = @intCast(opts.descriptor_set_layouts.len),
+        .sampler = opts.sampler,
+        .vertex_stride = if (opts.vertex_input) |vi| vi.stride else 0,
+    };
+}
+
+/// Destroy the Vulkan objects this pipeline created: the `VkPipeline`
+/// first, then its `VkPipelineLayout`. Both calls go through the
+/// device's dispatch table with a null allocator.
+///
+/// Descriptor sets handed out by the descriptor pool in `init` are
+/// not released here.
+/// NOTE(review): presumably they are reclaimed when the owning pool
+/// is reset or destroyed — confirm against the pool's owner.
+pub fn deinit(self: *const Self) void {
+    self.device.dispatch.destroyPipeline(
+        self.device.device,
+        self.pipeline,
+        null,
+    );
+    self.device.dispatch.destroyPipelineLayout(
+        self.device.device,
+        self.layout,
+        null,
+    );
+}
+
+// Reference every declaration in this file so that `zig test`
+// semantically analyzes them (and picks up any nested `test` blocks)
+// even when nothing else in the test build uses them.
+test {
+    std.testing.refAllDecls(@This());
+}
--- a/src/renderer/vulkan/README.md
+++ b/src/renderer/vulkan/README.md
@ -0,0 +1,49 @@
+# Vulkan renderer backend
+
+This directory holds the **renderer-policy** Vulkan files for libghostty.
+Pure Vulkan-API wrappers (Device dispatch table, Sampler, CommandPool,
+DescriptorPool) live in `pkg/vulkan/`, mirroring how `pkg/opengl/`
+relates to `src/renderer/opengl/`.
+
+## File layout
+
+Renderer policy (this directory):
+
+| File                | OpenGL counterpart        | Notes                                                              |
+| ------------------- | ------------------------- | ------------------------------------------------------------------ |
+| `Target.zig`        | `opengl/Target.zig`       | Render image + dmabuf export (direct or legacy_copy mode).         |
+| `Texture.zig`       | `opengl/Texture.zig`      | `VkImage` + `VkImageView` + upload helpers for the glyph atlas.    |
+| `buffer.zig`        | `opengl/buffer.zig`       | `Buffer(T)` host-coherent.                                         |
+| `buffer_pool.zig`   | (none — GL implicit)      | Cross-frame `VkBuffer` recycle pool, per-thread pending list.      |
+| `ThreadState.zig`   | (none — GL implicit)      | Per-renderer-thread frame fence / CB / descriptor pool / last-tgt. |
+| `Pipeline.zig`      | `opengl/Pipeline.zig`     | Graphics pipeline + descriptor set layout creation.                |
+| `RenderPass.zig`    | `opengl/RenderPass.zig`   | Dynamic-rendering pass + step recorder.                            |
+| `Frame.zig`         | `opengl/Frame.zig`        | Per-draw command buffer + fence-paced submit-then-wait.            |
+| `shaders.zig`       | `opengl/shaders.zig`      | GLSL → SPIR-V via glslang + the OpenGL-GLSL → Vulkan-GLSL rewrite. |
+
+Pure Vulkan-API wrappers (in `pkg/vulkan/`):
+
+| File                  | OpenGL counterpart       | Notes                                                              |
+| --------------------- | ------------------------ | ------------------------------------------------------------------ |
+| `Device.zig`          | (no analogue — GL ctx)   | Host-provided VkInstance/Device/Queue + function dispatch table.   |
+| `Sampler.zig`         | `pkg/opengl/Sampler.zig` | `VkSampler` (linear for atlases, nearest for cells).               |
+| `CommandPool.zig`     | (none)                   | `VkCommandPool` + one-shot record/submit helper.                   |
+| `DescriptorPool.zig`  | (none)                   | Per-frame `VkDescriptorPool`.                                      |
+
+The renderer's top-level module lives one directory up at
+`../Vulkan.zig` and is the only module `src/renderer.zig` imports when
+`build_config.renderer == .vulkan`. It re-exports the `pkg/vulkan/`
+types as `Vulkan.Device`, `Vulkan.Sampler`, etc., so call sites use a
+single `Vulkan.*` namespace regardless of where each type physically
+lives.
+
+## Why dmabuf, not Vulkan swapchains?
+
+The Qt frontend wants to keep `GhosttySurface` as a `QWidget` so that
+splits (`QSplitter`), tabs (`QTabWidget`), and translucent composition
+keep working. That rules out `QVulkanWindow`. Instead libghostty
+exports the rendered `VkImage` memory as a dmabuf fd
+(`VK_KHR_external_memory_fd` + `VK_EXT_image_drm_format_modifier`); the
+Qt side imports it via `zwp_linux_dmabuf_v1` and attaches it to a
+`wl_subsurface` parented to the top-level `wl_surface`. The compositor
+scans the buffer out directly — no readback, no QImage round trip.
--- a/src/renderer/vulkan/RenderPass.zig
+++ b/src/renderer/vulkan/RenderPass.zig
@ -0,0 +1,673 @@
+//! Per-pass recording helper for `vkCmdBeginRendering` /
+//! `vkCmdEndRendering` (Vulkan 1.3 dynamic rendering — no
+//! `VkRenderPass` object needed) plus the per-`step` resource
+//! binding + draw-call emission.
+//!
+//! `begin` transitions the attachment from its current layout to
+//! `COLOR_ATTACHMENT_OPTIMAL` and opens a rendering scope with the
+//! caller's clear color. `step` updates the pipeline's descriptor
+//! sets from the Step's resources and records a draw call;
+//! `complete` closes the rendering scope and transitions the
+//! attachment to its consumer-facing layout (SHADER_READ_ONLY for
+//! intermediate textures, GENERAL for the dmabuf-backed target).
+//!
+//! Counterpart: `src/renderer/opengl/RenderPass.zig`.
+
+const Self = @This();
+
+const std = @import("std");
+const vulkan = @import("vulkan");
+const vk = vulkan.c;
+
+const Device = vulkan.Device;
+const DescriptorPool = vulkan.DescriptorPool;
+const Sampler = vulkan.Sampler;
+const Pipeline = @import("Pipeline.zig");
+const Target = @import("Target.zig");
+const Texture = @import("Texture.zig");
+const bufferpkg = @import("buffer.zig");
+
+const log = std.log.scoped(.vulkan);
+
+/// Backend-neutral primitive kind for a draw call.
+///
+/// The variant names deliberately mirror `gl.Primitive` in
+/// `pkg/opengl/primitives.zig`, so shared call sites in `generic.zig`
+/// (e.g. `.draw = .{ .type = .triangle, ... }`) compile against
+/// either backend without per-backend branching. `toVk` performs the
+/// translation to `VkPrimitiveTopology`.
+pub const Primitive = enum {
+    point,
+    line,
+    line_strip,
+    triangle,
+    triangle_strip,
+
+    /// Translate this primitive into the matching Vulkan topology
+    /// constant. The switch is exhaustive, so adding a variant
+    /// without a mapping is a compile error.
+    pub fn toVk(self: Primitive) vk.VkPrimitiveTopology {
+        const topology: vk.VkPrimitiveTopology = switch (self) {
+            // List topologies: every N vertices form one primitive.
+            .point => vk.VK_PRIMITIVE_TOPOLOGY_POINT_LIST,
+            .line => vk.VK_PRIMITIVE_TOPOLOGY_LINE_LIST,
+            .triangle => vk.VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
+            // Strip topologies: consecutive primitives share vertices.
+            .line_strip => vk.VK_PRIMITIVE_TOPOLOGY_LINE_STRIP,
+            .triangle_strip => vk.VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
+        };
+        return topology;
+    }
+};
+
+/// Inputs to `begin`. `device`, `cb`, `step_pool` and `attachments`
+/// are copied verbatim onto the returned pass.
+pub const Options = struct {
+    /// Device + dispatch table for recording commands.
+    device: *const Device,
+    /// Caller-recorded command buffer to emit commands into. Provided
+    /// by the enclosing `Frame`.
+    cb: vk.VkCommandBuffer,
+
+    /// Per-frame descriptor pool. Used by `step` to allocate fresh
+    /// descriptor sets on the SECOND and later step() calls that
+    /// bind the same pipeline within this pass — without it,
+    /// mutating the pipeline's static `descriptor_sets[i]` for the
+    /// second call would overwrite the first call's bindings before
+    /// the GPU has read them (vkCmdDraw reads at submit time).
+    /// Optional: passes that never re-use a pipeline (bg_color,
+    /// cell_bg, cell_text) work without it.
+    step_pool: ?*DescriptorPool = null,
+
+    /// Color attachments for the pass. With dynamic rendering each
+    /// attachment is a render target + optional clear color.
+    ///
+    /// `begin` currently acts on `attachments[0]` only, and records
+    /// nothing at all when this slice is empty.
+    attachments: []const Attachment,
+
+    /// One color attachment: the image to render into plus the color
+    /// it is cleared to when the pass begins.
+    pub const Attachment = struct {
+        // Held by value to match the OpenGL backend's Attachment
+        // shape (so `generic.zig`'s call sites remain identical).
+        // Vulkan's `Texture` and `Target` carry a `layout` field
+        // that mutates across passes — `RenderPass.begin` reads it
+        // to emit the right source-layout barrier, and
+        // `RenderPass.complete` updates the value-copy here. Because
+        // the value is a copy, that update doesn't propagate back
+        // to the caller; the call sites in `generic.zig` are
+        // intentionally fine with that — they always pass the
+        // CURRENT `frame.target` / `state.{front,back}_texture`
+        // (whose `layout` was last updated by the previous pass's
+        // `recordPresentBarrier` / pipeline-end barrier in
+        // `Target.recordPresentBarrier` / `Texture.replaceRegion`)
+        // when constructing a new pass.
+        target: union(enum) {
+            texture: Texture,
+            target: Target,
+        },
+        /// RGBA clear color. `begin` always clears the attachment
+        /// (`loadOp = CLEAR`); when this is null it clears to opaque
+        /// black (`0, 0, 0, 1`).
+        clear_color: ?[4]f32 = null,
+    };
+};
+
+/// Describes one rendering step within the pass: which pipeline to
+/// bind, which resources (uniforms / vertex buffers / textures /
+/// samplers) to bind, and the draw call to issue.
+pub const Step = struct {
+    /// Pipeline to draw with, held by value. `step` logs a warning
+    /// and skips the draw when its `pipeline` handle is null.
+    pipeline: Pipeline,
+    /// Uniform buffer written to descriptor set 0. When null, `step`
+    /// re-uses the last non-null `uniforms` seen earlier in the pass.
+    uniforms: ?vk.VkBuffer = null,
+    /// `buffers[0]` is bound as the vertex buffer at binding 0 when
+    /// the pipeline has a non-zero `vertex_stride`. Per `step`'s
+    /// documented conventions, `buffers[i]` for i >= 1 are storage
+    /// buffers in set 2 at binding `i`.
+    buffers: []const ?vk.VkBuffer = &.{},
+    /// Per `step`'s documented conventions, `textures[i]` is a
+    /// combined image sampler in set 1 at binding `i`.
+    textures: []const ?Texture = &.{},
+    /// Optional sampler per texture slot. Per `step`'s documented
+    /// conventions, a missing entry falls back to `pipeline.sampler`.
+    samplers: []const ?Sampler = &.{},
+    /// The draw call to record once resources are bound.
+    draw: Draw,
+
+    /// Parameters of the draw call for a step.
+    pub const Draw = struct {
+        /// Primitive kind for the draw.
+        /// NOTE(review): `Pipeline.init` fixes the topology in the
+        /// pipeline from its own options and topology is not listed
+        /// as dynamic state — confirm how `step` uses this field.
+        type: Primitive,
+        /// Number of vertices to draw. `step` returns without
+        /// recording anything when this is 0.
+        vertex_count: usize,
+        /// Number of instances to draw; defaults to a single instance.
+        instance_count: usize = 1,
+    };
+};
+
+/// Error set reserved for the render-pass recorder. Neither `begin`
+/// (returns `Self`) nor `step` (returns `void`) currently returns it.
+pub const Error = error{
+    /// Reserved for command-recording failures. Currently unused —
+    /// the recorder relies on Vulkan's silent-failure model
+    /// (record bad input → validation flags it / next submit
+    /// returns DEVICE_LOST), but the slot stays open in case a
+    /// future step wants to fail-fast at record time.
+    VulkanFailed,
+};
+
+/// Attachments for this pass, copied from `Options.attachments` by
+/// `begin`.
+attachments: []const Options.Attachment,
+/// Command buffer that `begin` and `step` record into.
+cb: vk.VkCommandBuffer,
+/// Device + dispatch table used for every recorded command.
+device: *const Device,
+/// Per-frame descriptor pool for per-call descriptor sets; see
+/// `Options.step_pool`. When null, `step` logs an error and drops
+/// any draw that re-uses a pipeline already used in this pass.
+step_pool: ?*DescriptorPool = null,
+// NOTE(review): not touched by `begin`; presumably a running count
+// of recorded steps — confirm against `step` / `complete`.
+step_number: usize = 0,
+
+/// VkPipeline handles already used by an earlier `step` in this
+/// pass. On second-and-later use of the same pipeline `step`
+/// allocates fresh per-call descriptor sets from `step_pool` instead
+/// of mutating `pipeline.descriptor_sets[i]` (vkCmdDraw reads at
+/// submit time, so re-updating the same set in place would
+/// overwrite the prior call's bindings before the GPU has read
+/// them). Capacity is `MAX_SEEN_PIPELINES` *distinct* pipelines;
+/// see that constant for what happens past the cap. `step` consults
+/// this list whether or not a `step_pool` was provided — without a
+/// pool, a detected re-use makes `step` drop the draw.
+seen_pipelines: [MAX_SEEN_PIPELINES]vk.VkPipeline = .{null} ** MAX_SEEN_PIPELINES,
+/// Number of leading entries of `seen_pipelines` that are in use.
+seen_pipelines_len: usize = 0,
+
+/// Last `Step.uniforms` value seen in this pass. The OpenGL backend
+/// keeps the bound UBO across draw calls implicitly (GL state
+/// persists), and the renderer's image/overlay draw calls in
+/// `image.zig` don't pass `uniforms` at all — they expect the
+/// previously-bound UBO to still be live. Vulkan needs explicit
+/// descriptor-set updates per pipeline, so we cache the last UBO
+/// buffer here and reuse it when a step doesn't supply one. Starts
+/// as null on every pass returned by `begin`.
+last_uniforms: ?vk.VkBuffer = null,
+
+/// Cap on the number of distinct pipelines we'll track per pass
+/// for "first-use vs re-use" detection. The renderer's pass shape
+/// is: bg_color (1), cell_bg (1), cell_text (1), bg_image (1),
+/// image (varies). 8 is generous; we degrade gracefully to "always
+/// allocate fresh" past this cap.
+const MAX_SEEN_PIPELINES: usize = 8;
+
+/// Begin a render pass on `opts.cb`.
+///
+/// Three things are recorded, in this order:
+///
+///   1. an image barrier taking the first attachment from its
+///      current layout to `COLOR_ATTACHMENT_OPTIMAL`,
+///   2. `vkCmdBeginRendering` with `loadOp = CLEAR`, using the
+///      attachment's `clear_color` (opaque black when null),
+///   3. the dynamic viewport (Y-flipped) and scissor, both covering
+///      the whole attachment.
+///
+/// Only `attachments[0]` is acted on for now — the renderer's calls
+/// always pass exactly one attachment per pass, matching the OpenGL
+/// backend's use of `RenderPass.Options.attachments`. When the
+/// attachment list is empty nothing is recorded and the pass is
+/// returned as-is.
+pub fn begin(opts: Options) Self {
+    const pass: Self = .{
+        .attachments = opts.attachments,
+        .cb = opts.cb,
+        .device = opts.device,
+        .step_pool = opts.step_pool,
+    };
+    if (opts.attachments.len == 0) return pass;
+
+    // Flatten the texture/target union into just the handles, extent
+    // and layout this function needs. `Texture` dimensions are cast
+    // to u32; `Target` dimensions are used directly.
+    const Resolved = struct {
+        view: vk.VkImageView,
+        image: vk.VkImage,
+        width: u32,
+        height: u32,
+        layout: vk.VkImageLayout,
+    };
+    const first = opts.attachments[0];
+    const dst: Resolved = switch (first.target) {
+        .texture => |tex| .{
+            .view = tex.view,
+            .image = tex.image,
+            .width = @intCast(tex.width),
+            .height = @intCast(tex.height),
+            .layout = tex.layout,
+        },
+        .target => |tgt| .{
+            .view = tgt.view,
+            .image = tgt.image,
+            .width = tgt.width,
+            .height = tgt.height,
+            .layout = tgt.layout,
+        },
+    };
+
+    // ---- layout transition → COLOR_ATTACHMENT_OPTIMAL -----------
+    //
+    // The barrier's source side is derived from the attachment's
+    // recorded layout so that a re-used image is transitioned
+    // truthfully: a `Target` in `.direct` mode is left in GENERAL by
+    // the previous frame's `recordDirectBarrier`, `.legacy_copy`
+    // leaves TRANSFER_SRC_OPTIMAL after `recordCopyToDmabuf`, and a
+    // `Texture` is left in SHADER_READ_ONLY_OPTIMAL by the previous
+    // pass's `complete`. UNDEFINED (fresh image) would also be
+    // acceptable for contents we don't care about, but `loadOp =
+    // CLEAR` already covers the discard, so validation always sees
+    // the real old layout.
+    //
+    // Source stage/access per old layout:
+    //   - COLOR_ATTACHMENT_OPTIMAL: prior color writes.
+    //   - TRANSFER_SRC_OPTIMAL: the read has already retired, but it
+    //     is named conservatively.
+    //   - SHADER_READ_ONLY_OPTIMAL: the prior fragment-stage read.
+    //   - anything else (UNDEFINED, GENERAL): no-op source mask.
+    //     GENERAL was last written by the present barrier, and
+    //     `recordDirectBarrier` already chained that visibility into
+    //     HOST, so the next frame has nothing to re-flush.
+    const SrcScope = struct {
+        access: vk.VkAccessFlags,
+        stage: vk.VkPipelineStageFlags,
+    };
+    const src: SrcScope = switch (dst.layout) {
+        vk.VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL => .{
+            .access = vk.VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
+            .stage = vk.VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
+        },
+        vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL => .{
+            .access = vk.VK_ACCESS_TRANSFER_READ_BIT,
+            .stage = vk.VK_PIPELINE_STAGE_TRANSFER_BIT,
+        },
+        vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL => .{
+            .access = vk.VK_ACCESS_SHADER_READ_BIT,
+            .stage = vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
+        },
+        else => .{
+            .access = 0,
+            .stage = vk.VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+        },
+    };
+    const to_color_attachment: vk.VkImageMemoryBarrier = .{
+        .sType = vk.VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+        .pNext = null,
+        .srcAccessMask = src.access,
+        .dstAccessMask = vk.VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
+        .oldLayout = dst.layout,
+        .newLayout = vk.VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+        .srcQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED,
+        .image = dst.image,
+        .subresourceRange = .{
+            .aspectMask = vk.VK_IMAGE_ASPECT_COLOR_BIT,
+            .baseMipLevel = 0,
+            .levelCount = 1,
+            .baseArrayLayer = 0,
+            .layerCount = 1,
+        },
+    };
+    opts.device.dispatch.cmdPipelineBarrier(
+        opts.cb,
+        src.stage,
+        vk.VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
+        0, // dependency flags
+        0, // no global memory barriers
+        null,
+        0, // no buffer memory barriers
+        null,
+        1, // one image memory barrier
+        &to_color_attachment,
+    );
+
+    // ---- open the rendering scope -------------------------------
+    //
+    // The whole attachment is both the render area and (below) the
+    // scissor rectangle.
+    const full_area: vk.VkRect2D = .{
+        .offset = .{ .x = 0, .y = 0 },
+        .extent = .{ .width = dst.width, .height = dst.height },
+    };
+    // Missing clear color means opaque black.
+    const clear_rgba: [4]f32 = first.clear_color orelse .{ 0, 0, 0, 1 };
+    const color_attachment: vk.VkRenderingAttachmentInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
+        .pNext = null,
+        .imageView = dst.view,
+        .imageLayout = vk.VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+        .resolveMode = vk.VK_RESOLVE_MODE_NONE,
+        .resolveImageView = null,
+        .resolveImageLayout = vk.VK_IMAGE_LAYOUT_UNDEFINED,
+        // Unconditional clear: every cell is redrawn each frame, so
+        // the previous contents are never needed. CLEAR is also free
+        // on tiled GPUs because it skips the full attachment load.
+        .loadOp = vk.VK_ATTACHMENT_LOAD_OP_CLEAR,
+        .storeOp = vk.VK_ATTACHMENT_STORE_OP_STORE,
+        .clearValue = .{ .color = .{ .float32 = clear_rgba } },
+    };
+    const rendering: vk.VkRenderingInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_RENDERING_INFO,
+        .pNext = null,
+        .flags = 0,
+        .renderArea = full_area,
+        .layerCount = 1,
+        .viewMask = 0,
+        .colorAttachmentCount = 1,
+        .pColorAttachments = &color_attachment,
+        .pDepthAttachment = null,
+        .pStencilAttachment = null,
+    };
+    opts.device.dispatch.cmdBeginRendering(opts.cb, &rendering);
+
+    // ---- dynamic state: viewport + scissor ----------------------
+    //
+    // The viewport is Y-flipped for every attachment kind by giving
+    // it a negative height (Vulkan 1.1 maintenance1 / core) and
+    // starting it at y = height. That keeps the renderer's
+    // OpenGL-style projection matrices (Y-up clip space, `ortho2d`
+    // with bottom > top) landing pixels where they belong; without
+    // it everything renders upside-down. `gl_FragCoord` still
+    // reports origin-upper-left, matching `cell_bg.f.glsl`'s
+    // `layout(origin_upper_left)` request.
+    //
+    // The flip must be identical across attachment kinds because
+    // `cell_text` is projection-driven (vertex shader applies
+    // `projection_matrix` to pixel coords) while `cell_bg` derives
+    // grid_pos from `gl_FragCoord.xy / cell_size`; a mismatch shows
+    // up as cell backgrounds at the top and cell text at the bottom
+    // on the custom-shader (back_texture) path. The dmabuf `Target`
+    // needs the flip anyway (Qt mmaps origin-upper-left), and with
+    // both back_texture and frame.target flipped, an upper-left
+    // `gl_FragCoord` in the post fragment maps to texel y=0 — what
+    // shadertoy's `uv = fragCoord/iResolution` +
+    // `texture(iChannel0, uv)` expects in Vulkan-native convention.
+    const width_px: f32 = @floatFromInt(dst.width);
+    const height_px: f32 = @floatFromInt(dst.height);
+    const flipped_viewport: vk.VkViewport = .{
+        .x = 0,
+        .y = height_px,
+        .width = width_px,
+        .height = -height_px,
+        .minDepth = 0,
+        .maxDepth = 1,
+    };
+    opts.device.dispatch.cmdSetViewport(opts.cb, 0, 1, &flipped_viewport);
+    opts.device.dispatch.cmdSetScissor(opts.cb, 0, 1, &full_area);
+
+    return pass;
+}
+
+/// Record one step of the pass.
+///
+/// Updates the pipeline's descriptor sets from the Step's resources
+/// and emits the draw call. Resource conventions match the OpenGL
+/// backend (so `generic.zig` call sites work unchanged):
+///
+///   - `uniforms`     → set 0, binding `pipeline.uniforms_binding`
+///                      (UBO; the Globals block from `common.glsl`)
+///   - `buffers[0]`   → vertex buffer at binding 0 (when the pipeline
+///                      has a non-zero `vertex_stride`; ignored
+///                      otherwise). Matches OpenGL's "0th buffer is
+///                      the VBO" convention.
+///   - `buffers[i]`, i≥1
+///                    → set 2, binding `i` (storage buffer)
+///   - `textures[i]`  → set 1, binding `i` (combined image sampler).
+///                      The sampler is `samplers[i]` if provided,
+///                      otherwise the pipeline's owned fallback
+///                      `pipeline.sampler` (so the renderer can pass
+///                      plain textures and let the pipeline pick the
+///                      sampler config it needs).
+///
+/// Skips when the pipeline hasn't been constructed yet
+/// (`VkPipeline == null`) — pipelines for shaders we haven't wired
+/// up are default-null and we filter them out instead of crashing
+/// on a null handle. A null pipeline reaching here once
+/// shader bring-up has completed indicates a config / build issue
+/// (e.g. a custom-shader compile failure that left the post pipeline
+/// half-init); log so the missing draw is visible instead of a
+/// silently-blank surface.
+pub fn step(self: *Self, s: Step) void {
+    if (s.pipeline.pipeline == null) {
+        log.warn("RenderPass.step: skipping draw — pipeline not constructed", .{});
+        return;
+    }
+    if (s.draw.vertex_count == 0) return;
+
+    const dev = self.device;
+
+    // ---- vertex buffer (buffers[0]) ----------------------------
+    if (s.pipeline.vertex_stride > 0 and s.buffers.len > 0) {
+        if (s.buffers[0]) |vbo| {
+            const offsets = [_]vk.VkDeviceSize{0};
+            const bufs = [_]vk.VkBuffer{vbo};
+            dev.dispatch.cmdBindVertexBuffers(
+                self.cb,
+                0, // first binding
+                1, // binding count
+                &bufs,
+                &offsets,
+            );
+        }
+    }
+
+    // Pick effective descriptor sets for this step.
+    //
+    // First time we see a given pipeline within this pass, we use
+    // its pre-allocated `descriptor_sets[]` slots and update them
+    // in place — cheap and avoids a per-pass-pool allocation in
+    // the common single-step case (bg_color/cell_bg/cell_text).
+    //
+    // SECOND-and-later use of the same pipeline within the same
+    // pass requires fresh sets: vkCmdDraw reads the descriptor
+    // contents at SUBMIT time, so re-updating the static sets in
+    // place would silently make every prior draw bound to this
+    // pipeline read the LAST update's UBO/sampler/storage. The
+    // image / kitty path issues N draws on the same `image`
+    // pipeline with per-call vertex buffers and textures — without
+    // this fix every kitty image rendered with the FINAL image's
+    // texture and the final draw's vertex buffer.
+    //
+    // The fresh sets come from `step_pool`, owned by the enclosing
+    // Frame and reset at frame start. When `step_pool` is null
+    // (test harnesses, smoke tests) a re-using draw is dropped
+    // (see below) rather than corrupting the prior draw's bindings.
+    var effective_sets: [Pipeline.MAX_DESCRIPTOR_SETS]vk.VkDescriptorSet =
+        s.pipeline.descriptor_sets;
+    const reused = self.markPipelineUsed(s.pipeline.pipeline);
+    if (reused) {
+        // No step_pool means the renderer thread has no per-frame
+        // descriptor pool wired up (test harness, smoke test). We
+        // can't safely re-use this pipeline — updating the static
+        // set in place would corrupt the prior draw's bindings.
+        // Drop the draw rather than corrupt the frame.
+        const pool = self.step_pool orelse {
+            log.err(
+                "RenderPass.step: pipeline re-used but no step_pool " ++
+                    "available; dropping draw to avoid corrupting prior draws",
+                .{},
+            );
+            return;
+        };
+        for (s.pipeline.descriptor_set_layouts, 0..) |maybe_dsl, i| {
+            if (i >= s.pipeline.set_count) break;
+            const dsl = maybe_dsl orelse continue;
+            if (pool.allocate(dsl)) |fresh| {
+                effective_sets[i] = fresh;
+            } else |err| {
+                // Pool exhausted. The previous behavior was to
+                // fall back to the pipeline's static set, but that
+                // re-introduces the exact corruption the step_pool
+                // mechanism exists to prevent. Drop the draw; the
+                // user sees one missed image rather than every
+                // image rendered with the last image's bindings.
+                log.err(
+                    "RenderPass.step: per-call descriptor set " ++
+                        "allocation for set {} failed ({}); dropping draw " ++
+                        "(step_pool exhausted — increase STEP_POOL_MAX_SETS)",
+                    .{ i, err },
+                );
+                return;
+            }
+        }
+    }
+
+    // ---- update descriptor sets ---------------------------------
+    //
+    // We do one vkUpdateDescriptorSets call per descriptor write to
+    // keep the code straightforward; the total writes per frame are
+    // tiny (1 UBO + a handful of storage buffers + a handful of
+    // samplers) so batching wouldn't move the needle.
+
+    // UBO (set 0). The OpenGL backend's image/overlay draws don't
+    // pass `uniforms` — they expect the previously-bound UBO to
+    // persist. Fall back to `last_uniforms` when the Step doesn't
+    // supply one. Track the new one for later steps.
+    const ubo: ?vk.VkBuffer = s.uniforms orelse self.last_uniforms;
+    if (s.uniforms) |b| self.last_uniforms = b;
+    if (effective_sets[0] != null) if (ubo) |ubo_buffer| {
+        const buffer_info: vk.VkDescriptorBufferInfo = .{
+            .buffer = ubo_buffer,
+            .offset = 0,
+            .range = vk.VK_WHOLE_SIZE,
+        };
+        const write: vk.VkWriteDescriptorSet = .{
+            .sType = vk.VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+            .pNext = null,
+            .dstSet = effective_sets[0],
+            .dstBinding = s.pipeline.uniforms_binding,
+            .dstArrayElement = 0,
+            .descriptorCount = 1,
+            .descriptorType = vk.VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .pImageInfo = null,
+            .pBufferInfo = &buffer_info,
+            .pTexelBufferView = null,
+        };
+        dev.dispatch.updateDescriptorSets(dev.device, 1, &write, 0, null);
+    };
+
+    // Samplers (set 1)
+    if (effective_sets[1] != null) {
+        const slot_count = @max(s.textures.len, s.samplers.len);
+        for (0..slot_count) |slot| {
+            const tex_opt: ?Texture = if (slot < s.textures.len) s.textures[slot] else null;
+            const tex = tex_opt orelse continue;
+            const samp_opt: ?Sampler = if (slot < s.samplers.len) s.samplers[slot] else null;
+            const sampler_handle: vk.VkSampler = if (samp_opt) |samp|
+                samp.sampler
+            else if (s.pipeline.sampler != null)
+                s.pipeline.sampler
+            else
+                continue;
+
+            const image_info: vk.VkDescriptorImageInfo = .{
+                .sampler = sampler_handle,
+                .imageView = tex.view,
+                .imageLayout = vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+            };
+            const write: vk.VkWriteDescriptorSet = .{
+                .sType = vk.VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+                .pNext = null,
+                .dstSet = effective_sets[1],
+                .dstBinding = @intCast(slot),
+                .dstArrayElement = 0,
+                .descriptorCount = 1,
+                .descriptorType = vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+                .pImageInfo = &image_info,
+                .pBufferInfo = null,
+                .pTexelBufferView = null,
+            };
+            dev.dispatch.updateDescriptorSets(dev.device, 1, &write, 0, null);
+        }
+    }
+
+    // Storage buffers (set 2). `buffers[0]` is reserved for the
+    // vertex buffer (handled above), so storage starts at slot 1.
+    if (effective_sets[2] != null and s.buffers.len > 1) {
+        for (s.buffers[1..], 1..) |maybe_buf, slot| {
+            const buf = maybe_buf orelse continue;
+            const buffer_info: vk.VkDescriptorBufferInfo = .{
+                .buffer = buf,
+                .offset = 0,
+                .range = vk.VK_WHOLE_SIZE,
+            };
+            const write: vk.VkWriteDescriptorSet = .{
+                .sType = vk.VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+                .pNext = null,
+                .dstSet = effective_sets[2],
+                .dstBinding = @intCast(slot),
+                .dstArrayElement = 0,
+                .descriptorCount = 1,
+                .descriptorType = vk.VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+                .pImageInfo = null,
+                .pBufferInfo = &buffer_info,
+                .pTexelBufferView = null,
+            };
+            dev.dispatch.updateDescriptorSets(dev.device, 1, &write, 0, null);
+        }
+    }
+
+    // ---- bind descriptor sets -----------------------------------
+    //
+    // `cmdBindDescriptorSets` only accepts contiguous, non-null
+    // handles starting at `firstSet`. To handle the cell_bg case
+    // (sets 0 and 2, no set 1), we make one call per maximal
+    // contiguous run of non-null sets.
+    var start: usize = 0;
+    while (start < s.pipeline.set_count) {
+        if (effective_sets[start] == null) {
+            start += 1;
+            continue;
+        }
+        var end = start + 1;
+        while (end < s.pipeline.set_count and effective_sets[end] != null) : (end += 1) {}
+        dev.dispatch.cmdBindDescriptorSets(
+            self.cb,
+            vk.VK_PIPELINE_BIND_POINT_GRAPHICS,
+            s.pipeline.layout,
+            @intCast(start),
+            @intCast(end - start),
+            &effective_sets[start],
+            0,
+            null,
+        );
+        start = end;
+    }
+
+    dev.dispatch.cmdBindPipeline(
+        self.cb,
+        vk.VK_PIPELINE_BIND_POINT_GRAPHICS,
+        s.pipeline.pipeline,
+    );
+    dev.dispatch.cmdDraw(
+        self.cb,
+        @intCast(s.draw.vertex_count),
+        @intCast(s.draw.instance_count),
+        0,
+        0,
+    );
+    self.step_number += 1;
+}
+
+/// Record that `pipeline` has been drawn with in this pass and tell
+/// the caller whether that had already happened.
+///
+/// Returns `false` only the first time a given pipeline is seen;
+/// `step` may then write the pipeline's static descriptor sets in
+/// place. Every later call for the same pipeline returns `true`, and
+/// `step` responds by allocating fresh sets from `step_pool` so the
+/// earlier draw's bindings are left untouched.
+///
+/// Once `MAX_SEEN_PIPELINES` distinct pipelines are tracked, any
+/// pipeline that is not already in the table is reported as `true`
+/// as well. Over-reporting is the safe direction: under-reporting
+/// would bring back the in-place-update corruption this tracking
+/// exists to prevent.
+fn markPipelineUsed(self: *Self, pipeline: vk.VkPipeline) bool {
+    var idx: usize = 0;
+    while (idx < self.seen_pipelines_len) : (idx += 1) {
+        if (self.seen_pipelines[idx] == pipeline) return true;
+    }
+
+    // Table full: this pipeline cannot be remembered, so answer
+    // conservatively.
+    const has_room = self.seen_pipelines_len < MAX_SEEN_PIPELINES;
+    if (!has_room) return true;
+
+    self.seen_pipelines[self.seen_pipelines_len] = pipeline;
+    self.seen_pipelines_len += 1;
+    return false;
+}
+
+/// End the rendering scope and move attachment 0 out of
+/// `COLOR_ATTACHMENT_OPTIMAL` into the layout its next consumer
+/// expects:
+///   - `.texture` attachment: `SHADER_READ_ONLY_OPTIMAL`, made
+///     visible to fragment-shader reads.
+///   - `.target` attachment: `GENERAL`, with no destination access
+///     (the follow-up present barrier takes it from there).
+///
+/// Does nothing when the pass has no attachments.
+pub fn complete(self: *const Self) void {
+    if (self.attachments.len == 0) return;
+
+    self.device.dispatch.cmdEndRendering(self.cb);
+
+    // Everything about the barrier that depends on who consumes the
+    // attachment next.
+    const Handoff = struct {
+        image: vk.VkImage,
+        layout: vk.VkImageLayout,
+        stage: vk.VkPipelineStageFlags,
+        access: vk.VkAccessFlags,
+    };
+    const handoff: Handoff = switch (self.attachments[0].target) {
+        // Custom-shader back_texture: sampled by the post pass, so
+        // the real layout has to match the SHADER_READ_ONLY layout
+        // declared in the descriptor write (otherwise validation
+        // flags VUID-vkCmdDraw-imageLayout-00344 and some drivers
+        // can mishandle sampling from an out-of-spec layout).
+        .texture => |tex| .{
+            .image = tex.image,
+            .layout = vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+            .stage = vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
+            .access = vk.VK_ACCESS_SHADER_READ_BIT,
+        },
+        // dmabuf-backed `frame.target`: the next op is
+        // `Target.recordPresentBarrier`, which expects GENERAL on
+        // entry (it stays in GENERAL in `.direct` mode or moves to
+        // TRANSFER_SRC_OPTIMAL in `.legacy_copy`).
+        .target => |tgt| .{
+            .image = tgt.image,
+            .layout = vk.VK_IMAGE_LAYOUT_GENERAL,
+            .stage = vk.VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
+            .access = 0,
+        },
+    };
+
+    const color_range: vk.VkImageSubresourceRange = .{
+        .aspectMask = vk.VK_IMAGE_ASPECT_COLOR_BIT,
+        .baseMipLevel = 0,
+        .levelCount = 1,
+        .baseArrayLayer = 0,
+        .layerCount = 1,
+    };
+    const to_consumer: vk.VkImageMemoryBarrier = .{
+        .sType = vk.VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+        .pNext = null,
+        .srcAccessMask = vk.VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
+        .dstAccessMask = handoff.access,
+        .oldLayout = vk.VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+        .newLayout = handoff.layout,
+        .srcQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED,
+        .image = handoff.image,
+        .subresourceRange = color_range,
+    };
+
+    // Single image barrier: color-attachment writes must finish
+    // before the consumer stage touches the image.
+    self.device.dispatch.cmdPipelineBarrier(
+        self.cb,
+        vk.VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
+        handoff.stage,
+        0,
+        0,
+        null,
+        0,
+        null,
+        1,
+        &to_consumer,
+    );
+}
+
+// Reference every declaration in this file so that any nested
+// `test` blocks are pulled into the test build.
+test {
+    std.testing.refAllDecls(@This());
+}
--- a/src/renderer/vulkan/Target.zig
+++ b/src/renderer/vulkan/Target.zig
@ -0,0 +1,914 @@
+//! Render target: a `VkImage` whose memory is exported as a dmabuf
+//! fd so the host (Qt) can present it via
+//! `ghostty_platform_vulkan_s.present` without a CPU readback round
+//! trip through libghostty.
+//!
+//! Two construction modes, picked at `init` time after probing
+//! `VK_EXT_image_drm_format_modifier`:
+//!
+//!   - `.direct` — the render image itself is allocated with
+//!     `VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT`, using a single-plane
+//!     modifier that both the host compositor and the GPU accept
+//!     (vendor-tiled preferred, `DRM_FORMAT_MOD_LINEAR` otherwise; see
+//!     `pickModifier`). Its `VkDeviceMemory` is what we
+//!     `vkGetMemoryFdKHR` and hand to the host. No second allocation,
+//!     no end-of-frame copy. Used when the driver advertises
+//!     `COLOR_ATTACHMENT_BIT | TRANSFER_SRC_BIT | SAMPLED_BIT` for
+//!     that modifier in
+//!     `VkDrmFormatModifierPropertiesEXT.drmFormatModifierTilingFeatures`.
+//!
+//!   - `.legacy_copy` — fallback for drivers (notably NVIDIA at time
+//!     of writing) that don't expose `COLOR_ATTACHMENT_BIT` for
+//!     LINEAR via either the legacy `vkGetPhysicalDeviceFormatProperties`
+//!     query or the modifier-extension query. Allocates an OPTIMAL-
+//!     tiled render image plus a separate dmabuf-exported LINEAR
+//!     `VkBuffer`, and inserts a `vkCmdCopyImageToBuffer` at the end
+//!     of each frame. Behavior identical to the pre-modifier-path
+//!     code.
+//!
+//! Why two modes? NVIDIA's `linearTilingFeatures` for BGRA8 doesn't
+//! include `COLOR_ATTACHMENT_BIT`, so a LINEAR `VkImage` silently
+//! rasterizes nothing (confirmed via
+//! `vkGetPhysicalDeviceFormatProperties`: linearTilingFeatures=0x1dc03
+//! for `B8G8R8A8_UNORM`). The modifier-extension query is a separate
+//! channel and *may* expose different feature bits per modifier — so
+//! we always probe. Where the probe says yes, we drop the redundant
+//! buffer + copy; where it says no, we keep working.
+//!
+//! Ownership: libghostty owns the image, any buffer, all memory, and
+//! the dmabuf fd for the lifetime of the `Target`. The fd is passed
+//! to the host via `present` as a borrow; the host must `dup()` if
+//! it needs to hold it past the call. `deinit` closes the fd and
+//! frees all the memory.
+//!
+//! Counterpart: `src/renderer/opengl/Target.zig`.
+
+const Self = @This();
+
+const std = @import("std");
+const vk = @import("vulkan").c;
+
+const apprt = @import("../../apprt.zig");
+const Device = @import("vulkan").Device;
+
+const log = std.log.scoped(.vulkan);
+
+/// DRM modifier sentinel for "linear, no tiling". Matches
+/// `DRM_FORMAT_MOD_LINEAR` from `<drm/drm_fourcc.h>`. Hardcoded so we
+/// don't pull in libdrm headers just for a single constant.
+pub const DRM_FORMAT_MOD_LINEAR: u64 = 0;
+
+/// Upper bound for the number of DRM format modifiers we ever expect
+/// a driver to expose for a single format. Real-world drivers expose
+/// well under 20 (mostly LINEAR + a handful of vendor tiled variants);
+/// 64 gives us comfortable headroom with a ~1.5 KiB stack buffer and
+/// avoids allocator threading through the per-surface init path.
+///
+/// `pickModifier` sizes both its host-side and GPU-side arrays with
+/// this value: a longer host list is clamped to it, and a longer GPU
+/// list is truncated to it with a warning.
+const MAX_MODIFIERS: usize = 64;
+
+/// Which dmabuf-export strategy this `Target` settled on. See the
+/// module-level doc comment for the full rationale.
+pub const Tiling = enum {
+    /// Render image's own memory is exported as the dmabuf. Single
+    /// plane; the modifier is whichever one `pickModifier` selected
+    /// (a vendor-tiled modifier when available, otherwise
+    /// `DRM_FORMAT_MOD_LINEAR`). No separate buffer, no copy.
+    direct,
+
+    /// OPTIMAL render image + separate LINEAR `VkBuffer` dmabuf
+    /// target. End-of-frame `vkCmdCopyImageToBuffer`. Used when
+    /// `pickModifier` finds no single-plane modifier that both the
+    /// host and the GPU accept with the required format features
+    /// (`COLOR_ATTACHMENT_BIT` among them).
+    legacy_copy,
+};
+
+/// Construction parameters for `init`.
+pub const Options = struct {
+    /// Device wrapper whose dispatch table, `VkDevice`, and
+    /// `VkPhysicalDevice` are used for every call made during init.
+    device: *const Device,
+    /// Format of the render image. It must also map to a DRM fourcc:
+    /// `init` runs it through `vkFormatToDrmFourcc` first.
+    format: vk.VkFormat,
+    /// Render image width in pixels.
+    width: u32,
+    /// Render image height in pixels.
+    height: u32,
+    /// Extra `VkImageUsageFlagBits` for the render image, beyond the
+    /// defaults (`COLOR_ATTACHMENT_BIT | SAMPLED_BIT |
+    /// TRANSFER_SRC_BIT`). Rarely needed.
+    extra_usage: vk.VkImageUsageFlags = 0,
+
+    /// Per-surface platform callbacks. The host's process-wide
+    /// VkDevice is shared across splits/tabs, but each surface gets
+    /// its own platform copy with the right `userdata`, so
+    /// `present()` reaches the right window — and `pickModifier`
+    /// asks the right host (compositor and host can in principle
+    /// differ across surfaces, e.g. mixed-DPI multi-screen).
+    platform: apprt.embedded.Platform.Vulkan,
+};
+
+/// Errors `init` and its helpers can return.
+pub const Error = error{
+    /// A Vulkan call returned something other than `VK_SUCCESS`
+    /// (image creation, memory allocation, or memory bind in the
+    /// init paths).
+    VulkanFailed,
+    /// `Device.findMemoryType` found no memory type satisfying the
+    /// image's requirements with the requested property flags.
+    NoSuitableMemoryType,
+    /// Returned by `initDirect` when the plane's `rowPitch` does not
+    /// fit the `u32` stride. Presumably also returned by
+    /// `vkFormatToDrmFourcc` for formats without a DRM fourcc
+    /// mapping — TODO confirm against that helper.
+    UnsupportedFormat,
+};
+
+/// Device the target's Vulkan objects were created on (copied from
+/// `Options.device`).
+device: *const Device,
+
+/// Per-surface platform — see `Options.platform`.
+platform: apprt.embedded.Platform.Vulkan,
+
+/// Which present strategy this target uses. Decides whether
+/// `recordPresentBarrier` emits a copy.
+tiling: Tiling,
+
+// ---- render image ---------------------------------------------------
+// In `.direct` mode this image's memory is the dmabuf; in
+// `.legacy_copy` mode it's internal OPTIMAL memory we copy out of.
+image: vk.VkImage,
+image_memory: vk.VkDeviceMemory,
+view: vk.VkImageView,
+
+// ---- dmabuf buffer (legacy mode only) -------------------------------
+// `null` in `.direct` mode — the image's memory is the dmabuf.
+dmabuf_buffer: ?vk.VkBuffer,
+dmabuf_memory: ?vk.VkDeviceMemory,
+
+// ---- image description (copied from `Options`) ----------------------
+format: vk.VkFormat,
+width: u32,
+height: u32,
+
+// ---- dmabuf description handed to the host --------------------------
+/// Exported dmabuf file descriptor. In `.direct` mode it comes from
+/// `exportDmabufFd` on `image_memory`.
+fd: i32,
+/// DRM fourcc corresponding to `format` (from `vkFormatToDrmFourcc`).
+drm_format: u32,
+/// DRM format modifier describing the dmabuf's layout. In `.direct`
+/// mode this is the modifier the driver reports for the image, or
+/// the requested one when that query is not made or fails.
+drm_modifier: u64,
+/// Row pitch in bytes. In `.direct` mode this is plane 0's
+/// `rowPitch` as reported by `vkGetImageSubresourceLayout`.
+/// NOTE(review): `.legacy_copy` appears to use a packed
+/// `width * 4` — confirm in `initLegacyCopy`.
+stride: u32,
+
+/// Current layout of the render image. Tracked so
+/// `recordPresentBarrier` knows what oldLayout to use in its barrier.
+/// The renderer transitions it elsewhere too (RenderPass).
+layout: vk.VkImageLayout = vk.VK_IMAGE_LAYOUT_UNDEFINED,
+
+/// Create a render target for `opts`, choosing the dmabuf-export
+/// strategy at runtime.
+///
+/// `pickModifier` is asked for a single-plane modifier that both the
+/// host and the GPU accept with COLOR_ATTACHMENT / TRANSFER_SRC /
+/// SAMPLED_IMAGE features. If it finds one, `.direct` mode is
+/// attempted with it. If there is no such modifier, or if direct
+/// construction itself fails (for example the driver rejects the
+/// image layout, or the resulting row pitch does not fit a `u32`
+/// stride), the target is built in `.legacy_copy` mode instead.
+/// `initDirect` releases everything it created on failure, so the
+/// fallback starts from a clean slate.
+///
+/// Errors: `UnsupportedFormat` when `opts.format` has no DRM fourcc
+/// mapping, plus whatever `pickModifier` or `initLegacyCopy` return.
+pub fn init(opts: Options) Error!Self {
+    const dev = opts.device;
+    const drm_format = try vkFormatToDrmFourcc(opts.format);
+
+    const required_features: vk.VkFormatFeatureFlags =
+        @as(vk.VkFormatFeatureFlags, vk.VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT) |
+        vk.VK_FORMAT_FEATURE_TRANSFER_SRC_BIT |
+        vk.VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT;
+
+    const picked = try pickModifier(dev, opts.platform, opts.format, drm_format, required_features);
+    if (picked) |m| {
+        const tag: []const u8 = if (m == DRM_FORMAT_MOD_LINEAR)
+            "LINEAR"
+        else
+            "vendor-tiled";
+        log.info(
+            "Target: direct dmabuf export ({s} modifier 0x{x}) {}x{}",
+            .{ tag, m, opts.width, opts.height },
+        );
+        if (initDirect(opts, drm_format, m)) |target| {
+            return target;
+        } else |err| {
+            // The probe said yes but construction said no. Keep the
+            // surface working through the copy path instead of
+            // failing the whole target.
+            log.warn(
+                "Target: direct dmabuf export with modifier 0x{x} failed ({}); " ++
+                    "falling back to OPTIMAL render + LINEAR-buffer copy",
+                .{ m, err },
+            );
+        }
+    } else {
+        log.warn(
+            "Target: no usable single-plane modifier with COLOR_ATTACHMENT " ++
+                "in compositor ∩ GPU intersection; falling back to " ++
+                "OPTIMAL render + LINEAR-buffer copy",
+            .{},
+        );
+    }
+    return try initLegacyCopy(opts, drm_format);
+}
+
+/// Choose the DRM format modifier for `.direct` mode, or report that
+/// none is usable.
+///
+/// A modifier qualifies when all of the following hold:
+///   - the GPU lists it for `format` (queried through
+///     `VK_EXT_image_drm_format_modifier`),
+///   - it is single-plane,
+///   - its tiling features include every bit in `required_features`,
+///   - the host's `get_supported_modifiers` callback lists it for
+///     `drm_format`.
+///
+/// Among qualifying modifiers the first non-LINEAR one in the GPU's
+/// order wins (vendor-tiled — NVIDIA block-linear, AMD DCC variants,
+/// Intel Y-tiled; these are where the perf win lives on most
+/// hardware). LINEAR is returned only when no tiled modifier
+/// qualifies. `null` means nothing qualifies and the caller should
+/// drop to `.legacy_copy`.
+///
+/// Why both sides of the intersection matter:
+///   - GPU-only: passes on AMD/Intel for LINEAR but NVIDIA never
+///     exposes COLOR_ATTACHMENT for LINEAR — direct mode would
+///     create the image OK but rasterize nothing.
+///   - Compositor-only: GPU may not be able to render into the
+///     compositor's preferred tilings (drivers don't always expose
+///     COLOR_ATTACHMENT for every modifier).
+fn pickModifier(
+    dev: *const Device,
+    platform: apprt.embedded.Platform.Vulkan,
+    format: vk.VkFormat,
+    drm_format: u32,
+    required_features: vk.VkFormatFeatureFlags,
+) Error!?u64 {
+    // ---- compositor side -------------------------------------------
+    // Ask the host which modifiers it will accept on attach. An empty
+    // answer means the compositor doesn't speak linux-dmabuf-v1 or
+    // doesn't advertise this format; the attach would fail even where
+    // the GPU could render, so it counts as "no intersection".
+    var accepted_buf: [MAX_MODIFIERS]u64 = undefined;
+    const reported = platform.get_supported_modifiers(
+        platform.userdata,
+        drm_format,
+        &accepted_buf,
+        MAX_MODIFIERS,
+    );
+    // The C ABI contract is "host returns ≤ capacity", but a buggy
+    // host must not turn into an out-of-bounds slice here.
+    const accepted_len: usize = @min(reported, MAX_MODIFIERS);
+    const accepted = accepted_buf[0..accepted_len];
+    if (accepted.len == 0) {
+        log.warn(
+            "host advertises no dmabuf modifiers for format 0x{x}; " ++
+                "cannot use direct mode",
+            .{drm_format},
+        );
+        return null;
+    }
+
+    // ---- GPU side --------------------------------------------------
+    // Standard two-call enumeration: first call reports the count,
+    // second call fills the array.
+    var gpu_buf: [MAX_MODIFIERS]vk.VkDrmFormatModifierPropertiesEXT = undefined;
+    var gpu_list: vk.VkDrmFormatModifierPropertiesListEXT = .{
+        .sType = vk.VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT,
+        .pNext = null,
+        .drmFormatModifierCount = 0,
+        .pDrmFormatModifierProperties = null,
+    };
+    var query: vk.VkFormatProperties2 = .{
+        .sType = vk.VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2,
+        .pNext = &gpu_list,
+        .formatProperties = std.mem.zeroes(vk.VkFormatProperties),
+    };
+    dev.dispatch.getPhysicalDeviceFormatProperties2(
+        dev.physical_device,
+        format,
+        &query,
+    );
+    if (gpu_list.drmFormatModifierCount == 0) return null;
+    if (gpu_list.drmFormatModifierCount > MAX_MODIFIERS) {
+        log.warn(
+            "GPU modifier list truncated: driver reports {}, MAX_MODIFIERS={}",
+            .{ gpu_list.drmFormatModifierCount, MAX_MODIFIERS },
+        );
+        gpu_list.drmFormatModifierCount = MAX_MODIFIERS;
+    }
+    gpu_list.pDrmFormatModifierProperties = &gpu_buf[0];
+    dev.dispatch.getPhysicalDeviceFormatProperties2(
+        dev.physical_device,
+        format,
+        &query,
+    );
+
+    // ---- intersect -------------------------------------------------
+    var linear_ok = false;
+    var tiled_pick: ?u64 = null;
+    for (gpu_buf[0..gpu_list.drmFormatModifierCount]) |candidate| {
+        // Single-plane only: the present callback ABI passes one fd /
+        // offset / stride. Multi-plane (AMD AFBC, some video formats)
+        // needs a wider ABI.
+        const single_plane = candidate.drmFormatModifierPlaneCount == 1;
+        const features_ok =
+            (candidate.drmFormatModifierTilingFeatures & required_features) == required_features;
+        if (!single_plane or !features_ok) continue;
+
+        const compositor_accepts = for (accepted) |host_mod| {
+            if (host_mod == candidate.drmFormatModifier) break true;
+        } else false;
+        if (!compositor_accepts) continue;
+
+        if (candidate.drmFormatModifier == DRM_FORMAT_MOD_LINEAR) {
+            linear_ok = true;
+        } else if (tiled_pick == null) {
+            tiled_pick = candidate.drmFormatModifier;
+        }
+    }
+
+    if (tiled_pick) |m| return m;
+    return if (linear_ok) DRM_FORMAT_MOD_LINEAR else null;
+}
+
+/// `.direct` mode: allocate the render image with
+/// `VK_EXT_image_drm_format_modifier` so its own memory can be
+/// exported as the dmabuf. Two create-info variants depending on
+/// the chosen modifier:
+///   - LINEAR: EXPLICIT layout (we request rowPitch = width*bpp).
+///   - non-LINEAR (vendor-tiled): LIST with a single-modifier list.
+///     The driver picks the only option and computes its own
+///     internal layout; we recover the chosen modifier via
+///     `vkGetImageDrmFormatModifierPropertiesEXT` (sanity check —
+///     it should equal `chosen_mod`).
+/// In both cases the reported `stride` is plane 0's `rowPitch` from
+/// `vkGetImageSubresourceLayout`.
+///
+/// The image memory is allocated as a dedicated allocation for this
+/// image. Implementations may report dmabuf-exportable images as
+/// dedicated-only, and a dedicated allocation is also valid where
+/// it is not required, so it is requested unconditionally.
+///
+/// Errors: `VulkanFailed` when image creation, allocation, or bind
+/// fails, `NoSuitableMemoryType` when no DEVICE_LOCAL memory type
+/// fits, `UnsupportedFormat` when the row pitch does not fit a `u32`,
+/// plus whatever `createView` / `exportDmabufFd` return. Everything
+/// created so far is released on any error path.
+fn initDirect(opts: Options, drm_format: u32, chosen_mod: u64) Error!Self {
+    const dev = opts.device;
+
+    const image_usage = @as(vk.VkImageUsageFlags, vk.VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) |
+        vk.VK_IMAGE_USAGE_SAMPLED_BIT |
+        vk.VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
+        opts.extra_usage;
+
+    const bytes_per_pixel: u32 = 4;
+    const row_pitch: vk.VkDeviceSize = @as(vk.VkDeviceSize, opts.width) * bytes_per_pixel;
+
+    // ---- 1. Image: modifier-aware, externally-shareable -----------
+    const plane_layout: vk.VkSubresourceLayout = .{
+        .offset = 0,
+        .size = 0, // ignored for EXPLICIT create-info
+        .rowPitch = row_pitch,
+        .arrayPitch = 0,
+        .depthPitch = 0,
+    };
+    const explicit_create: vk.VkImageDrmFormatModifierExplicitCreateInfoEXT = .{
+        .sType = vk.VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT,
+        .pNext = null,
+        .drmFormatModifier = DRM_FORMAT_MOD_LINEAR,
+        .drmFormatModifierPlaneCount = 1,
+        .pPlaneLayouts = &plane_layout,
+    };
+    // Single-modifier list — the driver "picks" the only option, but
+    // crucially computes its own opaque internal layout for the
+    // tiling, which we don't have to know.
+    const list_mod = chosen_mod;
+    const list_create: vk.VkImageDrmFormatModifierListCreateInfoEXT = .{
+        .sType = vk.VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT,
+        .pNext = null,
+        .drmFormatModifierCount = 1,
+        .pDrmFormatModifiers = &list_mod,
+    };
+    const mod_pnext: ?*const anyopaque = if (chosen_mod == DRM_FORMAT_MOD_LINEAR)
+        @ptrCast(&explicit_create)
+    else
+        @ptrCast(&list_create);
+    const ext_image_info: vk.VkExternalMemoryImageCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO,
+        .pNext = mod_pnext,
+        .handleTypes = vk.VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
+    };
+    const image_info: vk.VkImageCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+        .pNext = &ext_image_info,
+        .flags = 0,
+        .imageType = vk.VK_IMAGE_TYPE_2D,
+        .format = opts.format,
+        .extent = .{ .width = opts.width, .height = opts.height, .depth = 1 },
+        .mipLevels = 1,
+        .arrayLayers = 1,
+        .samples = vk.VK_SAMPLE_COUNT_1_BIT,
+        .tiling = vk.VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT,
+        .usage = image_usage,
+        .sharingMode = vk.VK_SHARING_MODE_EXCLUSIVE,
+        .queueFamilyIndexCount = 0,
+        .pQueueFamilyIndices = null,
+        .initialLayout = vk.VK_IMAGE_LAYOUT_UNDEFINED,
+    };
+    var image: vk.VkImage = undefined;
+    if (dev.dispatch.createImage(dev.device, &image_info, null, &image) != vk.VK_SUCCESS) {
+        log.err("vkCreateImage (Target direct, mod=0x{x}) failed", .{chosen_mod});
+        return error.VulkanFailed;
+    }
+    errdefer dev.dispatch.destroyImage(dev.device, image, null);
+
+    // ---- 2. Image memory: exportable ---------------------------------
+    var image_reqs: vk.VkMemoryRequirements = undefined;
+    dev.dispatch.getImageMemoryRequirements(dev.device, image, &image_reqs);
+
+    // In direct mode the host doesn't mmap the dmabuf — it imports it
+    // as a 2D image into the compositor (`image_backed=true` per
+    // `Target.present`). So DEVICE_LOCAL is the right choice: GPU-
+    // local memory is faster for the COLOR_ATTACHMENT_OUTPUT writes,
+    // and vendor-tiled modifiers often require it on drivers like
+    // NVIDIA (which won't expose HOST_VISIBLE memory types for the
+    // bits a tiled exportable image requires anyway).
+    const image_mem_idx = dev.findMemoryType(
+        image_reqs.memoryTypeBits,
+        vk.VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+    ) orelse {
+        log.err(
+            "no DEVICE_LOCAL memory type for direct dmabuf image " ++
+                "(mod=0x{x} typeBits=0x{x})",
+            .{ chosen_mod, image_reqs.memoryTypeBits },
+        );
+        return error.NoSuitableMemoryType;
+    };
+    // Dedicated allocation tied to this image. The allocation size
+    // equals the image's memory requirement and the image is bound at
+    // offset 0 below, which is what a dedicated allocation demands.
+    const dedicated_info: vk.VkMemoryDedicatedAllocateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO,
+        .pNext = null,
+        .image = image,
+        .buffer = null,
+    };
+    const export_info: vk.VkExportMemoryAllocateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO,
+        .pNext = &dedicated_info,
+        .handleTypes = vk.VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
+    };
+    const image_alloc: vk.VkMemoryAllocateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+        .pNext = &export_info,
+        .allocationSize = image_reqs.size,
+        .memoryTypeIndex = image_mem_idx,
+    };
+    var image_memory: vk.VkDeviceMemory = undefined;
+    if (dev.dispatch.allocateMemory(dev.device, &image_alloc, null, &image_memory) != vk.VK_SUCCESS) {
+        log.err("vkAllocateMemory (Target direct image) failed", .{});
+        return error.VulkanFailed;
+    }
+    errdefer dev.dispatch.freeMemory(dev.device, image_memory, null);
+    if (dev.dispatch.bindImageMemory(dev.device, image, image_memory, 0) != vk.VK_SUCCESS) {
+        log.err("vkBindImageMemory (Target direct image) failed", .{});
+        return error.VulkanFailed;
+    }
+
+    // ---- 3. View ---------------------------------------------------
+    const view = try createView(dev, image, opts.format);
+    errdefer dev.dispatch.destroyImageView(dev.device, view, null);
+
+    // ---- 4. Export memory as dmabuf fd -----------------------------
+    const fd = try exportDmabufFd(dev, image_memory);
+    errdefer std.posix.close(fd);
+
+    // ---- 5. Confirm the actual modifier + plane layout -------------
+    // For non-LINEAR we used LIST create-info (one entry), so the
+    // driver "picked" the only option. We query back via
+    // `vkGetImageDrmFormatModifierPropertiesEXT` as a sanity check
+    // and log a warning if the driver returned a different modifier
+    // — that would indicate a driver bug or our list being ignored.
+    var actual_mod = chosen_mod;
+    if (chosen_mod != DRM_FORMAT_MOD_LINEAR) {
+        var mod_props: vk.VkImageDrmFormatModifierPropertiesEXT = .{
+            .sType = vk.VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT,
+            .pNext = null,
+            .drmFormatModifier = 0,
+        };
+        if (dev.dispatch.getImageDrmFormatModifierPropertiesEXT(
+            dev.device,
+            image,
+            &mod_props,
+        ) == vk.VK_SUCCESS) {
+            actual_mod = mod_props.drmFormatModifier;
+            if (actual_mod != chosen_mod) {
+                log.warn(
+                    "driver chose modifier 0x{x}, we asked for 0x{x}",
+                    .{ actual_mod, chosen_mod },
+                );
+            }
+        } else {
+            // Not fatal: keep reporting the modifier we requested,
+            // but leave a trace that it could not be confirmed.
+            log.warn(
+                "vkGetImageDrmFormatModifierPropertiesEXT failed; " ++
+                    "reporting requested modifier 0x{x} unverified",
+                .{chosen_mod},
+            );
+        }
+    }
+
+    // Plane 0 layout: rowPitch is what we report as `stride` to the
+    // compositor. For LINEAR this is width*bpp (possibly padded).
+    // For vendor-tiled formats the value is implementation-specific —
+    // the compositor's GPU knows how to interpret it given the
+    // modifier we report alongside.
+    var subres: vk.VkImageSubresource = .{
+        .aspectMask = vk.VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT,
+        .mipLevel = 0,
+        .arrayLayer = 0,
+    };
+    var layout: vk.VkSubresourceLayout = undefined;
+    dev.dispatch.getImageSubresourceLayout(dev.device, image, &subres, &layout);
+
+    return .{
+        .device = dev,
+        .platform = opts.platform,
+        .tiling = .direct,
+        .image = image,
+        .image_memory = image_memory,
+        .view = view,
+        .dmabuf_buffer = null,
+        .dmabuf_memory = null,
+        .format = opts.format,
+        .width = opts.width,
+        .height = opts.height,
+        .fd = fd,
+        .drm_format = drm_format,
+        .drm_modifier = actual_mod,
+        .stride = stride: {
+            // VkSubresourceLayout.rowPitch is u64 but the platform
+            // present callback accepts u32 stride. For a sanely-
+            // sized terminal target stride fits comfortably in u32,
+            // but vendor-tiled drivers at exotic resolutions could
+            // legitimately exceed it. Fail the init explicitly
+            // instead of letting `@intCast` panic in safe builds.
+            if (layout.rowPitch > std.math.maxInt(u32)) {
+                log.err(
+                    "Target.initDirect: rowPitch {} > u32 max; refusing direct mode",
+                    .{layout.rowPitch},
+                );
+                return error.UnsupportedFormat;
+            }
+            break :stride @intCast(layout.rowPitch);
+        },
+    };
+}
+
+/// `.legacy_copy` mode: OPTIMAL-tiled render image plus a separate
+/// LINEAR, dmabuf-exported `VkBuffer` that the frame is copied into.
+/// Behavior identical to the pre-modifier-path code.
+///
+/// `drm_format` is stored on the result unchanged. The reported
+/// modifier is always `DRM_FORMAT_MOD_LINEAR` and the reported stride
+/// is the tightly packed `width * 4`.
+///
+/// Errors:
+///   - `error.UnsupportedFormat`: `width * 4` does not fit the `u32`
+///     stride.
+///   - `error.NoSuitableMemoryType`: no DEVICE_LOCAL memory type for
+///     the image, or no HOST_VISIBLE | HOST_COHERENT memory type for
+///     the dmabuf buffer.
+///   - `error.VulkanFailed`: a Vulkan call failed (logged at the call
+///     site).
+///
+/// Every handle created before a failure is released by the matching
+/// `errdefer`, so a failed init leaves nothing behind.
+fn initLegacyCopy(opts: Options, drm_format: u32) Error!Self {
+    const dev = opts.device;
+
+    // BGRA8 — 4 bytes/pixel, packed (no per-row padding).
+    const bytes_per_pixel: u32 = 4;
+    // Checked multiply: a plain `*` traps in safe builds (and is
+    // undefined in ReleaseFast) once `width * 4` no longer fits u32.
+    // Fail the init explicitly instead, the same way the direct path
+    // refuses an oversized rowPitch.
+    const stride: u32 = std.math.mul(u32, opts.width, bytes_per_pixel) catch {
+        log.err(
+            "Target.initLegacyCopy: width {} * {} bytes/pixel overflows u32 stride",
+            .{ opts.width, bytes_per_pixel },
+        );
+        return error.UnsupportedFormat;
+    };
+    // Widened to VkDeviceSize before multiplying so stride * height
+    // is computed in the wider type.
+    const buffer_size: vk.VkDeviceSize = @as(vk.VkDeviceSize, stride) * opts.height;
+
+    // ---- 1. Render image: OPTIMAL tiling, internal memory ----------
+    // TRANSFER_SRC is required because the frame is copied out of this
+    // image into the dmabuf buffer at present time.
+    const image_usage = @as(vk.VkImageUsageFlags, vk.VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) |
+        vk.VK_IMAGE_USAGE_SAMPLED_BIT |
+        vk.VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
+        opts.extra_usage;
+    const image_info: vk.VkImageCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+        .pNext = null,
+        .flags = 0,
+        .imageType = vk.VK_IMAGE_TYPE_2D,
+        .format = opts.format,
+        .extent = .{ .width = opts.width, .height = opts.height, .depth = 1 },
+        .mipLevels = 1,
+        .arrayLayers = 1,
+        .samples = vk.VK_SAMPLE_COUNT_1_BIT,
+        .tiling = vk.VK_IMAGE_TILING_OPTIMAL,
+        .usage = image_usage,
+        .sharingMode = vk.VK_SHARING_MODE_EXCLUSIVE,
+        .queueFamilyIndexCount = 0,
+        .pQueueFamilyIndices = null,
+        .initialLayout = vk.VK_IMAGE_LAYOUT_UNDEFINED,
+    };
+    var image: vk.VkImage = undefined;
+    if (dev.dispatch.createImage(dev.device, &image_info, null, &image) != vk.VK_SUCCESS) {
+        log.err("vkCreateImage (Target legacy render) failed", .{});
+        return error.VulkanFailed;
+    }
+    errdefer dev.dispatch.destroyImage(dev.device, image, null);
+
+    var image_reqs: vk.VkMemoryRequirements = undefined;
+    dev.dispatch.getImageMemoryRequirements(dev.device, image, &image_reqs);
+    const image_mem_idx = dev.findMemoryType(
+        image_reqs.memoryTypeBits,
+        vk.VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+    ) orelse return error.NoSuitableMemoryType;
+    const image_alloc: vk.VkMemoryAllocateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+        .pNext = null,
+        .allocationSize = image_reqs.size,
+        .memoryTypeIndex = image_mem_idx,
+    };
+    var image_memory: vk.VkDeviceMemory = undefined;
+    if (dev.dispatch.allocateMemory(dev.device, &image_alloc, null, &image_memory) != vk.VK_SUCCESS) {
+        log.err("vkAllocateMemory (Target legacy render image) failed", .{});
+        return error.VulkanFailed;
+    }
+    errdefer dev.dispatch.freeMemory(dev.device, image_memory, null);
+    if (dev.dispatch.bindImageMemory(dev.device, image, image_memory, 0) != vk.VK_SUCCESS) {
+        log.err("vkBindImageMemory (Target legacy render image) failed", .{});
+        return error.VulkanFailed;
+    }
+
+    // ---- 2. View ---------------------------------------------------
+    const view = try createView(dev, image, opts.format);
+    errdefer dev.dispatch.destroyImageView(dev.device, view, null);
+
+    // ---- 3. Dmabuf buffer: LINEAR pixel data, external memory -----
+    // The buffer is declared dmabuf-exportable at creation time and
+    // again at allocation time (`export_info` below).
+    const ext_buffer_info: vk.VkExternalMemoryBufferCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO,
+        .pNext = null,
+        .handleTypes = vk.VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
+    };
+    const buffer_info: vk.VkBufferCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+        .pNext = &ext_buffer_info,
+        .flags = 0,
+        .size = buffer_size,
+        .usage = vk.VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+        .sharingMode = vk.VK_SHARING_MODE_EXCLUSIVE,
+        .queueFamilyIndexCount = 0,
+        .pQueueFamilyIndices = null,
+    };
+    var dmabuf_buffer: vk.VkBuffer = undefined;
+    if (dev.dispatch.createBuffer(dev.device, &buffer_info, null, &dmabuf_buffer) != vk.VK_SUCCESS) {
+        log.err("vkCreateBuffer (Target dmabuf) failed", .{});
+        return error.VulkanFailed;
+    }
+    errdefer dev.dispatch.destroyBuffer(dev.device, dmabuf_buffer, null);
+
+    var buf_reqs: vk.VkMemoryRequirements = undefined;
+    dev.dispatch.getBufferMemoryRequirements(dev.device, dmabuf_buffer, &buf_reqs);
+    // Prefer HOST_CACHED so reads from the mmap'd dmabuf are fast.
+    // Without it (HOST_VISIBLE | HOST_COHERENT only), NVIDIA gives
+    // back write-combining memory: GPU writes are fast but HOST reads
+    // crawl (~10 MB/s) because the mapping is uncached. The Qt
+    // `presentVulkanDmabuf` `QImage::copy()` reads every pixel, so a
+    // small ~3 MB frame took ~260 ms there. HOST_COHERENT is still
+    // requested so we don't need explicit flushes between GPU writes
+    // and host reads; HOST_CACHED on top makes the host reads
+    // cacheable.
+    const host_flags_cached =
+        @as(vk.VkMemoryPropertyFlags, vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) |
+        vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+        vk.VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+    const host_flags_uncached =
+        @as(vk.VkMemoryPropertyFlags, vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) |
+        vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+    // Cached first, uncached as the fallback; only fail when neither
+    // combination is available for this buffer.
+    const dmabuf_mem_idx = dev.findMemoryType(buf_reqs.memoryTypeBits, host_flags_cached) orelse
+        dev.findMemoryType(buf_reqs.memoryTypeBits, host_flags_uncached) orelse
+        {
+            log.err(
+                "no HOST_VISIBLE memory type for dmabuf (typeBits=0x{x})",
+                .{buf_reqs.memoryTypeBits},
+            );
+            return error.NoSuitableMemoryType;
+        };
+    const export_info: vk.VkExportMemoryAllocateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO,
+        .pNext = null,
+        .handleTypes = vk.VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
+    };
+    const buf_alloc: vk.VkMemoryAllocateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+        .pNext = &export_info,
+        .allocationSize = buf_reqs.size,
+        .memoryTypeIndex = dmabuf_mem_idx,
+    };
+    var dmabuf_memory: vk.VkDeviceMemory = undefined;
+    if (dev.dispatch.allocateMemory(dev.device, &buf_alloc, null, &dmabuf_memory) != vk.VK_SUCCESS) {
+        log.err("vkAllocateMemory (Target dmabuf) failed", .{});
+        return error.VulkanFailed;
+    }
+    errdefer dev.dispatch.freeMemory(dev.device, dmabuf_memory, null);
+    if (dev.dispatch.bindBufferMemory(dev.device, dmabuf_buffer, dmabuf_memory, 0) != vk.VK_SUCCESS) {
+        log.err("vkBindBufferMemory (Target dmabuf) failed", .{});
+        return error.VulkanFailed;
+    }
+
+    const fd = try exportDmabufFd(dev, dmabuf_memory);
+    // Nothing below can fail today; this only matters if a fallible
+    // step is ever added after the export.
+    errdefer std.posix.close(fd);
+
+    return .{
+        .device = dev,
+        .platform = opts.platform,
+        .tiling = .legacy_copy,
+        .image = image,
+        .image_memory = image_memory,
+        .view = view,
+        .dmabuf_buffer = dmabuf_buffer,
+        .dmabuf_memory = dmabuf_memory,
+        .format = opts.format,
+        .width = opts.width,
+        .height = opts.height,
+        .fd = fd,
+        .drm_format = drm_format,
+        .drm_modifier = DRM_FORMAT_MOD_LINEAR,
+        .stride = stride,
+    };
+}
+
+/// Create a 2D color view over `image`: one mip level, one array
+/// layer, identity swizzle on all four channels. A failing
+/// `vkCreateImageView` is logged and reported as `error.VulkanFailed`.
+fn createView(
+    dev: *const Device,
+    image: vk.VkImage,
+    format: vk.VkFormat,
+) Error!vk.VkImageView {
+    // Every channel maps straight through.
+    const identity = vk.VK_COMPONENT_SWIZZLE_IDENTITY;
+
+    const create_info: vk.VkImageViewCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+        .pNext = null,
+        .flags = 0,
+        .image = image,
+        .viewType = vk.VK_IMAGE_VIEW_TYPE_2D,
+        .format = format,
+        .components = .{ .r = identity, .g = identity, .b = identity, .a = identity },
+        // Whole image: the single mip level and single layer.
+        .subresourceRange = .{
+            .aspectMask = vk.VK_IMAGE_ASPECT_COLOR_BIT,
+            .baseMipLevel = 0,
+            .levelCount = 1,
+            .baseArrayLayer = 0,
+            .layerCount = 1,
+        },
+    };
+
+    var handle: vk.VkImageView = undefined;
+    const result = dev.dispatch.createImageView(dev.device, &create_info, null, &handle);
+    if (result == vk.VK_SUCCESS) return handle;
+
+    log.err("vkCreateImageView (Target) failed", .{});
+    return error.VulkanFailed;
+}
+
+/// Export `memory` as a dmabuf file descriptor via `vkGetMemoryFdKHR`.
+/// Both a non-success result and a negative descriptor are logged and
+/// reported as `error.VulkanFailed`.
+fn exportDmabufFd(dev: *const Device, memory: vk.VkDeviceMemory) Error!i32 {
+    const request: vk.VkMemoryGetFdInfoKHR = .{
+        .sType = vk.VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR,
+        .pNext = null,
+        .memory = memory,
+        .handleType = vk.VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
+    };
+
+    // Seeded with -1 so an untouched out-parameter reads as invalid.
+    var exported: c_int = -1;
+    const result = dev.dispatch.getMemoryFdKHR(dev.device, &request, &exported);
+    if (result == vk.VK_SUCCESS and exported >= 0) return exported;
+
+    log.err("vkGetMemoryFdKHR (Target) failed: fd={}", .{exported});
+    return error.VulkanFailed;
+}
+
+/// Release everything the target holds: the exported fd, the optional
+/// dmabuf buffer and its memory, then the view, the render image and
+/// the image memory. `self` is set to `undefined` afterwards and must
+/// not be used again.
+pub fn deinit(self: *Self) void {
+    const ctx = self.device;
+
+    // A negative fd means there is nothing to close.
+    if (self.fd >= 0) std.posix.close(self.fd);
+
+    // The dmabuf buffer and memory are optionals; skip them when null.
+    if (self.dmabuf_buffer) |buffer| ctx.dispatch.destroyBuffer(ctx.device, buffer, null);
+    if (self.dmabuf_memory) |memory| ctx.dispatch.freeMemory(ctx.device, memory, null);
+
+    // View first, then the image it refers to, then the backing memory.
+    ctx.dispatch.destroyImageView(ctx.device, self.view, null);
+    ctx.dispatch.destroyImage(ctx.device, self.image, null);
+    ctx.dispatch.freeMemory(ctx.device, self.image_memory, null);
+
+    self.* = undefined;
+}
+
+/// Record the end-of-frame synchronization that makes the rendered
+/// pixels visible to the host's later mmap read. What gets recorded
+/// depends on `self.tiling`:
+///
+///   - `.legacy_copy`: transition the render image to
+///     `TRANSFER_SRC_OPTIMAL`, `vkCmdCopyImageToBuffer` into the
+///     dmabuf buffer, then a buffer-memory barrier for HOST_READ
+///     visibility.
+///
+///   - `.direct`: a single image memory barrier. The render image's
+///     own memory is the dmabuf, so the layout stays
+///     `GENERAL → GENERAL` and only visibility changes:
+///     `COLOR_ATTACHMENT_WRITE → HOST_READ` access across the
+///     `COLOR_ATTACHMENT_OUTPUT → HOST` stages.
+///
+/// Must be called AFTER all RenderPass work has been recorded into
+/// `cb` and BEFORE `vkEndCommandBuffer`.
+pub fn recordPresentBarrier(self: *Self, cb: vk.VkCommandBuffer) void {
+    // Exhaustive on purpose: a new tiling mode must pick a strategy.
+    switch (self.tiling) {
+        .legacy_copy => self.recordCopyToDmabuf(cb),
+        .direct => self.recordDirectBarrier(cb),
+    }
+}
+
+/// `.direct` end-of-frame recording: one image memory barrier that
+/// makes color-attachment writes visible to host reads. The layout is
+/// GENERAL on both sides of the barrier, and `self.layout` is updated
+/// to GENERAL to match.
+fn recordDirectBarrier(self: *Self, cb: vk.VkCommandBuffer) void {
+    const ctx = self.device;
+
+    // No layout change: the image is both the render target and the
+    // host-mapped surface, so it stays in GENERAL. Only the access
+    // masks differ (attachment write → host read).
+    const visibility: vk.VkImageMemoryBarrier = .{
+        .sType = vk.VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+        .pNext = null,
+        .image = self.image,
+        .oldLayout = vk.VK_IMAGE_LAYOUT_GENERAL,
+        .newLayout = vk.VK_IMAGE_LAYOUT_GENERAL,
+        .srcAccessMask = vk.VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
+        .dstAccessMask = vk.VK_ACCESS_HOST_READ_BIT,
+        // No queue-family ownership transfer.
+        .srcQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED,
+        .subresourceRange = .{
+            .aspectMask = vk.VK_IMAGE_ASPECT_COLOR_BIT,
+            .baseMipLevel = 0,
+            .levelCount = 1,
+            .baseArrayLayer = 0,
+            .layerCount = 1,
+        },
+    };
+
+    dev_call: {
+        ctx.dispatch.cmdPipelineBarrier(
+            cb,
+            vk.VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // source stage
+            vk.VK_PIPELINE_STAGE_HOST_BIT, // destination stage
+            0, // dependencyFlags
+            0,
+            null, // no global memory barriers
+            0,
+            null, // no buffer barriers
+            1,
+            &visibility,
+        );
+        break :dev_call;
+    }
+
+    self.layout = vk.VK_IMAGE_LAYOUT_GENERAL;
+}
+
+/// `.legacy_copy` end-of-frame recording, in three steps:
+///   1. image barrier GENERAL → TRANSFER_SRC_OPTIMAL,
+///   2. `vkCmdCopyImageToBuffer` of the full image into the dmabuf
+///      buffer,
+///   3. buffer barrier making the transfer write visible to host
+///      reads.
+/// Afterwards `self.layout` records TRANSFER_SRC_OPTIMAL. Requires
+/// `self.dmabuf_buffer` to be non-null (it is unwrapped with `.?`).
+fn recordCopyToDmabuf(self: *Self, cb: vk.VkCommandBuffer) void {
+    const ctx = self.device;
+
+    // ---- 1. Image: GENERAL → TRANSFER_SRC_OPTIMAL -----------------
+    // The old layout is hard-coded to GENERAL, the layout the
+    // RenderPass leaves the image in on completion.
+    const to_transfer_src: vk.VkImageMemoryBarrier = .{
+        .sType = vk.VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+        .pNext = null,
+        .image = self.image,
+        .oldLayout = vk.VK_IMAGE_LAYOUT_GENERAL,
+        .newLayout = vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+        .srcAccessMask = vk.VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
+        .dstAccessMask = vk.VK_ACCESS_TRANSFER_READ_BIT,
+        .srcQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED,
+        .subresourceRange = .{
+            .aspectMask = vk.VK_IMAGE_ASPECT_COLOR_BIT,
+            .baseMipLevel = 0,
+            .levelCount = 1,
+            .baseArrayLayer = 0,
+            .layerCount = 1,
+        },
+    };
+    ctx.dispatch.cmdPipelineBarrier(
+        cb,
+        vk.VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
+        vk.VK_PIPELINE_STAGE_TRANSFER_BIT,
+        0, // dependencyFlags
+        0,
+        null, // no global memory barriers
+        0,
+        null, // no buffer barriers
+        1,
+        &to_transfer_src,
+    );
+
+    // ---- 2. Copy image → buffer -----------------------------------
+    // BGRA8, packed (stride = width*4). Row length / image height of
+    // 0 mean "tightly packed", i.e. taken from imageExtent.
+    const dst_buffer = self.dmabuf_buffer.?;
+    const whole_image: vk.VkBufferImageCopy = .{
+        .bufferOffset = 0,
+        .bufferRowLength = 0,
+        .bufferImageHeight = 0,
+        .imageOffset = .{ .x = 0, .y = 0, .z = 0 },
+        .imageExtent = .{ .width = self.width, .height = self.height, .depth = 1 },
+        .imageSubresource = .{
+            .aspectMask = vk.VK_IMAGE_ASPECT_COLOR_BIT,
+            .mipLevel = 0,
+            .baseArrayLayer = 0,
+            .layerCount = 1,
+        },
+    };
+    ctx.dispatch.cmdCopyImageToBuffer(
+        cb,
+        self.image,
+        vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+        dst_buffer,
+        1,
+        &whole_image,
+    );
+
+    // ---- 3. Buffer: TRANSFER_WRITE → HOST_READ --------------------
+    // Makes the copied bytes visible to the host's later mmap read.
+    // (External fd consumers may need an explicit sync2 release
+    // barrier, but for an mmap-based read after a fence-wait this is
+    // sufficient on the GPU side.)
+    const to_host: vk.VkBufferMemoryBarrier = .{
+        .sType = vk.VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+        .pNext = null,
+        .buffer = dst_buffer,
+        .offset = 0,
+        .size = vk.VK_WHOLE_SIZE,
+        .srcAccessMask = vk.VK_ACCESS_TRANSFER_WRITE_BIT,
+        .dstAccessMask = vk.VK_ACCESS_HOST_READ_BIT,
+        .srcQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED,
+    };
+    ctx.dispatch.cmdPipelineBarrier(
+        cb,
+        vk.VK_PIPELINE_STAGE_TRANSFER_BIT,
+        vk.VK_PIPELINE_STAGE_HOST_BIT,
+        0, // dependencyFlags
+        0,
+        null, // no global memory barriers
+        1,
+        &to_host,
+        0,
+        null, // no image barriers
+    );
+
+    // Keep the tracked layout in sync with what was just recorded so
+    // the next frame's RenderPass.begin does not see stale state (it
+    // currently transitions from UNDEFINED unconditionally; this is
+    // defensive).
+    self.layout = vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
+}
+
+/// Hand the target's dmabuf to the host through the per-surface
+/// platform callback, together with its DRM format, modifier,
+/// dimensions and stride.
+pub fn present(self: *const Self) void {
+    // Tells the host whether the fd can be imported by a 2D-image
+    // consumer (Wayland linux-dmabuf-v1, Vulkan external image, etc.).
+    // True in `.direct` mode, where the fd was exported from a
+    // VkImage; false in `.legacy_copy`, where it was exported from a
+    // VkBuffer and can only be read via mmap.
+    const image_backed = self.tiling == .direct;
+
+    // The platform is per-surface: its `userdata` points at THIS
+    // surface's GhosttySurface, so the frame reaches the right window.
+    const host = self.platform;
+    host.present(
+        host.userdata,
+        self.fd,
+        self.drm_format,
+        self.drm_modifier,
+        self.width,
+        self.height,
+        self.stride,
+        image_backed,
+    );
+}
+
+/// Translate a Vulkan color format into the DRM fourcc code reported
+/// to the host. The UNORM and SRGB variants of a channel order share
+/// one code: B8G8R8A8 maps to 'AR24' and R8G8B8A8 maps to 'AB24'.
+/// Any other format yields `error.UnsupportedFormat`.
+fn vkFormatToDrmFourcc(format: vk.VkFormat) Error!u32 {
+    const Fourcc = struct {
+        /// Pack four ASCII bytes with the first one in the low byte.
+        fn pack(b0: u8, b1: u8, b2: u8, b3: u8) u32 {
+            const low_half = @as(u32, b0) | (@as(u32, b1) << 8);
+            const high_half = (@as(u32, b2) << 16) | (@as(u32, b3) << 24);
+            return low_half | high_half;
+        }
+    };
+
+    switch (format) {
+        vk.VK_FORMAT_R8G8B8A8_UNORM,
+        vk.VK_FORMAT_R8G8B8A8_SRGB,
+        => return Fourcc.pack('A', 'B', '2', '4'),
+
+        vk.VK_FORMAT_B8G8R8A8_UNORM,
+        vk.VK_FORMAT_B8G8R8A8_SRGB,
+        => return Fourcc.pack('A', 'R', '2', '4'),
+
+        else => return error.UnsupportedFormat,
+    }
+}
+
+// Reference every declaration in this file so nested tests are
+// picked up when this file is tested.
+test {
+    std.testing.refAllDecls(@This());
+}
--- a/src/renderer/vulkan/Texture.zig
+++ b/src/renderer/vulkan/Texture.zig
@ -0,0 +1,430 @@
+//! Wrapper for `VkImage` + `VkDeviceMemory` + `VkImageView` with a
+//! staging-buffer upload path.
+//!
+//! Holds a 2D image, the backing device-local memory, and a view
+//! configured for color sampling. All three handles are libghostty-
+//! owned and destroyed in `deinit`.
+//!
+//! Uploads go through a temporary `Buffer(u8)` staging buffer
+//! (`HOST_VISIBLE | HOST_COHERENT | TRANSFER_SRC`) and a per-call
+//! `CommandPool` that drives the layout-transition →
+//! `vkCmdCopyBufferToImage` → layout-transition sequence. Both
+//! resources are destroyed by the time `replaceRegion` returns — the
+//! upload is synchronous from the caller's perspective. That's the
+//! right tradeoff for atlas resizes (rare; the renderer can afford
+//! the stall) but won't fit the eventual per-frame upload path,
+//! which will reuse a long-lived `CommandPool` and fence-paced
+//! submission.
+//!
+//! Layout tracking: a single `layout: VkImageLayout` field records
+//! whether the image currently sits in `UNDEFINED` (fresh) or
+//! `SHADER_READ_ONLY_OPTIMAL` (after at least one upload). The
+//! barrier sequence in `replaceRegion` reads this field to pick the
+//! right `srcAccessMask` / `srcStageMask`.
+//!
+//! Counterpart: `src/renderer/opengl/Texture.zig`.
+
+const Self = @This();
+
+const std = @import("std");
+const vulkan = @import("vulkan");
+const vk = vulkan.c;
+
+const Device = vulkan.Device;
+const CommandPool = vulkan.CommandPool;
+const bufferpkg = @import("buffer.zig");
+
+const log = std.log.scoped(.vulkan);
+
+/// Pixel-layout hint mirroring `opengl/OpenGL.zig`'s
+/// `ImageTextureFormat`. `Vulkan.imageTextureOptions` uses it to pick
+/// the `VkFormat` for kitty graphics / background-image uploads. It is
+/// declared next to `Texture` rather than at the renderer top level so
+/// the policy it carries (SRGB vs UNORM for color channels) can be
+/// read in one place.
+pub const ImageTextureFormat = enum {
+    gray,
+    rgba,
+    bgra,
+
+    /// Resolve the hint to a concrete `VkFormat`. `srgb` selects the
+    /// SRGB variant for the two color layouts and is ignored for
+    /// `gray`.
+    pub fn toVk(self: ImageTextureFormat, srgb: bool) vk.VkFormat {
+        return switch (self) {
+            // Color layouts honor `srgb`. For an image authored in
+            // sRGB (the common case for kitty graphics) the SRGB
+            // format makes the sampler linearize on read, so
+            // `texture()` returns linear values which the renderer's
+            // `unlinearize()` re-encodes for the sRGB framebuffer.
+            // With UNORM the sampler decode is skipped, the sRGB
+            // bytes get encoded again by `unlinearize()` and once
+            // more by the SRGB framebuffer — visible as washed-out
+            // kitty graphics.
+            .bgra => switch (srgb) {
+                true => vk.VK_FORMAT_B8G8R8A8_SRGB,
+                false => vk.VK_FORMAT_B8G8R8A8_UNORM,
+            },
+            .rgba => switch (srgb) {
+                true => vk.VK_FORMAT_R8G8B8A8_SRGB,
+                false => vk.VK_FORMAT_R8G8B8A8_UNORM,
+            },
+            // Single-channel R8: no color, so no gamma to consider.
+            .gray => vk.VK_FORMAT_R8_UNORM,
+        };
+    }
+};
+
+/// Texture construction parameters. Vulkan-native rather than mirroring
+/// the OpenGL backend's separate `format` / `internal_format` — Vulkan
+/// encodes both into one `VkFormat`.
+pub const Options = struct {
+    /// Device the image, its memory and its view are created on. The
+    /// pointer is stored on the texture and used again for uploads
+    /// and teardown.
+    device: *const Device,
+
+    /// Pixel format, used for both the image and its view. Common
+    /// choices:
+    ///   - `VK_FORMAT_R8G8B8A8_UNORM`     — color atlases, render target.
+    ///   - `VK_FORMAT_R8G8B8A8_SRGB`      — sRGB color atlases.
+    ///   - `VK_FORMAT_R8_UNORM`           — grayscale glyph atlas.
+    format: vk.VkFormat,
+
+    /// `VkImageUsageFlagBits` for the image. Typical:
+    ///   - Atlas:           `SAMPLED | TRANSFER_DST`
+    ///   - Render target:   `COLOR_ATTACHMENT | SAMPLED` (+ external
+    ///                       memory flags wired in by the export path)
+    /// `TRANSFER_DST_BIT` is forced on at create time so the upload
+    /// path always works — callers don't have to remember.
+    usage: vk.VkImageUsageFlags,
+
+    /// Aspect mask for the image view, also reused by the upload
+    /// barriers and copy. Defaults to color; depth images would
+    /// override.
+    aspect: vk.VkImageAspectFlags = vk.VK_IMAGE_ASPECT_COLOR_BIT,
+};
+
+/// Errors returned by `init` and `replaceRegion`.
+pub const Error = error{
+    /// A `vkCreate*`, `vkAllocate*` or `vkBind*` call returned a
+    /// non-success status. `init` logs the raw `VkResult` before
+    /// returning this.
+    VulkanFailed,
+    /// `findMemoryType` couldn't find a `DEVICE_LOCAL` memory type
+    /// matching the image's requirements. Effectively unrecoverable
+    /// — typical Vulkan devices always expose at least one.
+    NoSuitableMemoryType,
+};
+
+image: vk.VkImage,
+memory: vk.VkDeviceMemory,
+view: vk.VkImageView,
+format: vk.VkFormat,
+/// Aspect mask the image was created with (e.g. COLOR_BIT for
+/// renderable textures, DEPTH_BIT for depth attachments). Stored
+/// so per-frame `replaceRegion` barrier/copy use the same aspect
+/// the image view was made with — hardcoding COLOR_BIT here was a
+/// silent validation error for any non-color caller.
+aspect: vk.VkImageAspectFlags,
+width: usize,
+height: usize,
+device: *const Device,
+
+/// Current image layout. Starts at `UNDEFINED`; `replaceRegion`
+/// drives it to `SHADER_READ_ONLY_OPTIMAL` on the first call and
+/// keeps it there afterwards. Read by the barrier sequence in
+/// `replaceRegion` to pick the right transition source.
+layout: vk.VkImageLayout = vk.VK_IMAGE_LAYOUT_UNDEFINED,
+
+/// Create a `width` × `height` 2D texture: image, DEVICE_LOCAL memory
+/// bound at offset 0, and a 2D view using `opts.aspect`.
+///
+/// When `data` is non-null it is uploaded to the full image through
+/// `replaceRegion`, which leaves the image in
+/// `SHADER_READ_ONLY_OPTIMAL`. An empty `data` slice or a zero
+/// dimension makes that upload a no-op. When `data` is null, or the
+/// upload was a no-op, the image stays in `UNDEFINED` and the caller
+/// transitions it later (typically via `replaceRegion` or by using it
+/// as a render target).
+///
+/// On any failure the handles created so far are destroyed before the
+/// error is returned.
+pub fn init(
+    opts: Options,
+    width: usize,
+    height: usize,
+    data: ?[]const u8,
+) Error!Self {
+    const ctx = opts.device;
+
+    // ---- 1. VkImage ---------------------------------------------
+    // TRANSFER_DST_BIT is OR-ed in unconditionally so `replaceRegion`
+    // works even when the caller did not ask for it.
+    const transfer_dst = @as(vk.VkImageUsageFlags, vk.VK_IMAGE_USAGE_TRANSFER_DST_BIT);
+    const create_info: vk.VkImageCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+        .pNext = null,
+        .flags = 0,
+        .imageType = vk.VK_IMAGE_TYPE_2D,
+        .format = opts.format,
+        .extent = .{
+            .width = @intCast(width),
+            .height = @intCast(height),
+            .depth = 1,
+        },
+        .mipLevels = 1,
+        .arrayLayers = 1,
+        .samples = vk.VK_SAMPLE_COUNT_1_BIT,
+        .tiling = vk.VK_IMAGE_TILING_OPTIMAL,
+        .usage = opts.usage | transfer_dst,
+        .sharingMode = vk.VK_SHARING_MODE_EXCLUSIVE,
+        .queueFamilyIndexCount = 0,
+        .pQueueFamilyIndices = null,
+        .initialLayout = vk.VK_IMAGE_LAYOUT_UNDEFINED,
+    };
+    var image: vk.VkImage = undefined;
+    const image_result = ctx.dispatch.createImage(ctx.device, &create_info, null, &image);
+    if (image_result != vk.VK_SUCCESS) {
+        log.err("vkCreateImage failed: result={}", .{image_result});
+        return error.VulkanFailed;
+    }
+    errdefer ctx.dispatch.destroyImage(ctx.device, image, null);
+
+    // ---- 2. VkDeviceMemory --------------------------------------
+    var requirements: vk.VkMemoryRequirements = undefined;
+    ctx.dispatch.getImageMemoryRequirements(ctx.device, image, &requirements);
+
+    const type_index = ctx.findMemoryType(
+        requirements.memoryTypeBits,
+        vk.VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+    ) orelse {
+        log.err(
+            "no DEVICE_LOCAL memory type found for image (typeBits=0x{x})",
+            .{requirements.memoryTypeBits},
+        );
+        return error.NoSuitableMemoryType;
+    };
+
+    const allocation: vk.VkMemoryAllocateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+        .pNext = null,
+        .allocationSize = requirements.size,
+        .memoryTypeIndex = type_index,
+    };
+    var memory: vk.VkDeviceMemory = undefined;
+    const alloc_result = ctx.dispatch.allocateMemory(ctx.device, &allocation, null, &memory);
+    if (alloc_result != vk.VK_SUCCESS) {
+        log.err("vkAllocateMemory failed: result={}", .{alloc_result});
+        return error.VulkanFailed;
+    }
+    errdefer ctx.dispatch.freeMemory(ctx.device, memory, null);
+
+    const bind_result = ctx.dispatch.bindImageMemory(ctx.device, image, memory, 0);
+    if (bind_result != vk.VK_SUCCESS) {
+        log.err("vkBindImageMemory failed: result={}", .{bind_result});
+        return error.VulkanFailed;
+    }
+
+    // ---- 3. VkImageView -----------------------------------------
+    const identity = vk.VK_COMPONENT_SWIZZLE_IDENTITY;
+    const view_create_info: vk.VkImageViewCreateInfo = .{
+        .sType = vk.VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+        .pNext = null,
+        .flags = 0,
+        .image = image,
+        .viewType = vk.VK_IMAGE_VIEW_TYPE_2D,
+        .format = opts.format,
+        .components = .{ .r = identity, .g = identity, .b = identity, .a = identity },
+        .subresourceRange = .{
+            .aspectMask = opts.aspect,
+            .baseMipLevel = 0,
+            .levelCount = 1,
+            .baseArrayLayer = 0,
+            .layerCount = 1,
+        },
+    };
+    var view: vk.VkImageView = undefined;
+    const view_result = ctx.dispatch.createImageView(ctx.device, &view_create_info, null, &view);
+    if (view_result != vk.VK_SUCCESS) {
+        log.err("vkCreateImageView failed: result={}", .{view_result});
+        return error.VulkanFailed;
+    }
+    errdefer ctx.dispatch.destroyImageView(ctx.device, view, null);
+
+    // ---- 4. Assemble, then optionally upload --------------------
+    // `layout` keeps its field default (UNDEFINED); a successful
+    // upload below is what moves it.
+    var texture: Self = .{
+        .device = ctx,
+        .image = image,
+        .memory = memory,
+        .view = view,
+        .format = opts.format,
+        .aspect = opts.aspect,
+        .width = width,
+        .height = height,
+    };
+
+    if (data) |pixels| {
+        try texture.replaceRegion(0, 0, width, height, pixels);
+    }
+    return texture;
+}
+
+/// Destroy the view, the image and the backing memory, in that order.
+/// The texture must not be used afterwards.
+pub fn deinit(self: Self) void {
+    const ctx = self.device;
+
+    // The view refers to the image, and the image is bound to the
+    // memory, so release them from the outside in.
+    ctx.dispatch.destroyImageView(ctx.device, self.view, null);
+    ctx.dispatch.destroyImage(ctx.device, self.image, null);
+    ctx.dispatch.freeMemory(ctx.device, self.memory, null);
+}
+
+/// Overwrite the `width` × `height` region at (`x`, `y`) with `data`,
+/// which is read as tightly packed rows. The upload is synchronous:
+///   1. Fill a temporary TRANSFER_SRC staging buffer with `data`.
+///   2. Record a one-shot command buffer:
+///      a. barrier: current layout → TRANSFER_DST_OPTIMAL,
+///      b. `vkCmdCopyBufferToImage`,
+///      c. barrier: TRANSFER_DST_OPTIMAL → SHADER_READ_ONLY_OPTIMAL.
+///   3. `endAndSubmit` the one-shot session.
+///   4. Destroy the staging buffer and the command pool on return.
+///
+/// On success `self.layout` is `SHADER_READ_ONLY_OPTIMAL`. When
+/// `data` is empty or either dimension is zero the call does nothing
+/// and the layout is left untouched.
+pub fn replaceRegion(
+    self: *Self,
+    x: usize,
+    y: usize,
+    width: usize,
+    height: usize,
+    data: []const u8,
+) Error!void {
+    // Nothing to upload means nothing happens at all — in particular
+    // no layout transition. Transitioning anyway would cost a one-shot
+    // command buffer + queueWaitIdle for no benefit and would surprise
+    // a caller that relies on the current layout being preserved. A
+    // layout-only transition, if ever needed, belongs in a separate
+    // `transitionToShaderRead` API rather than in this empty-data
+    // path.
+    const nothing_to_upload = data.len == 0 or width == 0 or height == 0;
+    if (nothing_to_upload) return;
+
+    const ctx = self.device;
+
+    // ---- staging buffer -----------------------------------------
+    var staging = try bufferpkg.Buffer(u8).initFill(.{
+        .device = ctx,
+        .usage = vk.VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+    }, data);
+    // `destroyImmediate`, not `deinit`: this function runs
+    // synchronously on the calling thread (typically the main /
+    // app-init thread, NOT the renderer thread), and
+    // `OneShot.endAndSubmit` below calls `vkQueueWaitIdle`, so the GPU
+    // is provably done with the staging buffer before this defer
+    // fires. Routing it into `Vulkan.buffer_pool` from a non-renderer
+    // thread would leak it forever — the pool's `cycle()` runs only on
+    // the renderer thread.
+    defer staging.destroyImmediate();
+
+    // ---- command pool (one-shot) --------------------------------
+    var pool = try CommandPool.init(ctx);
+    defer pool.deinit();
+    const session = try pool.beginOneShot();
+
+    // ---- barrier: current → TRANSFER_DST_OPTIMAL ----------------
+    // Only a previously sampled image has earlier accesses to wait
+    // for; every other starting layout uses an empty access mask at
+    // TOP_OF_PIPE.
+    const previous_layout = self.layout;
+    const was_sampled = previous_layout == vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+    const wait_access: vk.VkAccessFlags =
+        if (was_sampled) @as(vk.VkAccessFlags, vk.VK_ACCESS_SHADER_READ_BIT) else 0;
+    const wait_stage: vk.VkPipelineStageFlags = if (was_sampled)
+        @as(vk.VkPipelineStageFlags, vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT)
+    else
+        @as(vk.VkPipelineStageFlags, vk.VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
+
+    // One barrier struct serves both transitions; the access masks
+    // and layouts are rewritten between the two recordings.
+    var transition: vk.VkImageMemoryBarrier = .{
+        .sType = vk.VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+        .pNext = null,
+        .srcAccessMask = wait_access,
+        .dstAccessMask = vk.VK_ACCESS_TRANSFER_WRITE_BIT,
+        .oldLayout = previous_layout,
+        .newLayout = vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+        .srcQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED,
+        .image = self.image,
+        .subresourceRange = .{
+            .aspectMask = self.aspect,
+            .baseMipLevel = 0,
+            .levelCount = 1,
+            .baseArrayLayer = 0,
+            .layerCount = 1,
+        },
+    };
+    ctx.dispatch.cmdPipelineBarrier(
+        session.cb,
+        wait_stage,
+        vk.VK_PIPELINE_STAGE_TRANSFER_BIT,
+        0, // dependencyFlags
+        0,
+        null, // memory barriers
+        0,
+        null, // buffer memory barriers
+        1,
+        &transition,
+    );
+
+    // ---- vkCmdCopyBufferToImage ---------------------------------
+    const copy: vk.VkBufferImageCopy = .{
+        .bufferOffset = 0,
+        .bufferRowLength = 0, // tightly packed
+        .bufferImageHeight = 0,
+        .imageSubresource = .{
+            .aspectMask = self.aspect,
+            .mipLevel = 0,
+            .baseArrayLayer = 0,
+            .layerCount = 1,
+        },
+        .imageOffset = .{
+            .x = @intCast(x),
+            .y = @intCast(y),
+            .z = 0,
+        },
+        .imageExtent = .{
+            .width = @intCast(width),
+            .height = @intCast(height),
+            .depth = 1,
+        },
+    };
+    ctx.dispatch.cmdCopyBufferToImage(
+        session.cb,
+        staging.buffer,
+        self.image,
+        vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+        1,
+        &copy,
+    );
+
+    // ---- barrier: TRANSFER_DST → SHADER_READ_ONLY ---------------
+    transition.srcAccessMask = vk.VK_ACCESS_TRANSFER_WRITE_BIT;
+    transition.dstAccessMask = vk.VK_ACCESS_SHADER_READ_BIT;
+    transition.oldLayout = vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+    transition.newLayout = vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+    ctx.dispatch.cmdPipelineBarrier(
+        session.cb,
+        vk.VK_PIPELINE_STAGE_TRANSFER_BIT,
+        vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
+        0, // dependencyFlags
+        0,
+        null, // memory barriers
+        0,
+        null, // buffer memory barriers
+        1,
+        &transition,
+    );
+
+    // The tracked layout only changes once the submission succeeded.
+    try session.endAndSubmit();
+    self.layout = vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+}
+
+// Reference every declaration in this file so nested tests are
+// picked up when this file is tested.
+test {
+    std.testing.refAllDecls(@This());
+}
--- a/src/renderer/vulkan/ThreadState.zig
+++ b/src/renderer/vulkan/ThreadState.zig
@ -0,0 +1,232 @@
+//! Per-renderer-thread Vulkan state. Lifecycle:
+//!
+//!   - first `Vulkan.beginFrame` on a thread → `ensureInit(dev)`
+//!     lazily creates a `CommandPool`, a single command buffer
+//!     allocated from it, a fence (created signaled), and a
+//!     `DescriptorPool` sized for one frame's worst-case usage.
+//!     All four are reused across frames; only the descriptor
+//!     pool is reset every frame.
+//!   - `Vulkan.deinit` on a surface (one per renderer thread) →
+//!     `cleanup(dev)` waits the per-thread fence, frees CB,
+//!     destroys pool + fence, drops the cached `last_target`
+//!     pointer, and drains the per-thread `buffer_pool` pending
+//!     list (which is bounded by the same fence we just waited).
+//!
+//! Why threadlocal? Splits/tabs share the host's process-wide
+//! `VkDevice`, but each renderer thread submits independently and
+//! its fence-paced single-frame-in-flight model needs its own
+//! fence + command buffer to avoid stomping the previous frame's
+//! still-in-flight work. Threadlocal also matches the lifetime of
+//! the buffer-pool's per-thread `pending` list (both are bounded
+//! by the same `Frame.complete` fence wait).
+//!
+//! `last_target` lives here too because it's logically per-thread:
+//! `presentLastTarget` re-presents whatever the renderer thread
+//! handed to `present` last, and pointing at another thread's
+//! target would route a different surface's frames to this
+//! thread's window.
+
+const std = @import("std");
+const vulkan = @import("vulkan");
+const vk = vulkan.c;
+
+const Device = vulkan.Device;
+const CommandPool = vulkan.CommandPool;
+const DescriptorPool = vulkan.DescriptorPool;
+const Target = @import("Target.zig");
+const buffer_pool = @import("buffer_pool.zig");
+
+const log = std.log.scoped(.vulkan);
+
+/// Caps for the per-frame `step_pool`. Sized for the worst pass
+/// shape (kitty image with N placements + the post pipelines): one
+/// set per (image_step × MAX_DESCRIPTOR_SETS) plus a handful of
+/// the renderer's other pipelines stepped once each. 256 is generous
+/// — actual frames stabilize well under that. If a frame ever
+/// exhausts the pool, `RenderPass.step` falls back to the pipeline's
+/// static set with a warning logged.
+///
+/// Passed as `max_sets` to `DescriptorPool.init` in `ensureInit`.
+pub const STEP_POOL_MAX_SETS: u32 = 256;
+/// Passed as `uniform_buffers` to `DescriptorPool.init` in `ensureInit`.
+pub const STEP_POOL_UNIFORM_BUFFERS: u32 = 256;
+/// Passed as `combined_image_samplers` to `DescriptorPool.init` in
+/// `ensureInit`.
+pub const STEP_POOL_COMBINED_IMAGE_SAMPLERS: u32 = 256;
+/// Passed as `storage_buffers` to `DescriptorPool.init` in `ensureInit`.
+pub const STEP_POOL_STORAGE_BUFFERS: u32 = 256;
+
+/// Error set returned by `ensureInit`.
+///
+/// NOTE(review): the `std.mem.Allocator.Error` member is presumably
+/// there to absorb allocation failures surfaced by `CommandPool.init`
+/// / `DescriptorPool.init` — confirm against those functions' error
+/// sets.
+pub const Error = error{
+    /// `vkAllocateCommandBuffers` / `vkCreateFence` returned a
+    /// non-success status. Wrapped here so the lazy-init path in
+    /// `ensureInit` can surface a single error type to callers.
+    VulkanFailed,
+    /// `DescriptorPool.init` rejected the caps we passed it (e.g.
+    /// max_sets == 0). Surfaces here so callers' error set matches.
+    InvalidPoolConfig,
+} || std.mem.Allocator.Error;
+
+/// Most recently presented target, used by `presentLastTarget` when
+/// the renderer decides nothing new needs drawing. Stored as a
+/// POINTER (not a value copy) into the FrameState's `target` slot
+/// so it follows the target through a resize: `frame.resize` calls
+/// `target.deinit()` on the old Target and overwrites the slot with
+/// a new one — a value copy would now reference a closed fd and
+/// freed VkImage/VkBuffer/VkDeviceMemory handles, and Qt's mmap on
+/// the closed fd could read whatever a later open() recycled the fd
+/// for. Following the pointer instead always re-presents the
+/// currently-live target.
+///
+/// Reset to `null` by `cleanup`; the pointee is not owned here.
+pub threadlocal var last_target: ?*Target = null;
+
+/// Per-surface (per-thread) command pool used for the frame's
+/// command buffer. Lazily created in `ensureInit` on the first call;
+/// destroyed in `cleanup`.
+///
+/// `ensureInit` uses `frame_pool == null` as the "not yet
+/// initialized" test for the whole pool / command buffer / fence
+/// triple.
+pub threadlocal var frame_pool: ?CommandPool = null;
+
+/// The single command buffer allocated from `frame_pool` and reused
+/// across frames. `vkResetCommandBuffer` is called at the start of
+/// each `beginFrameReset` to clear prior recording.
+///
+/// `null` until `ensureInit` commits it; `cleanup` frees it and sets
+/// it back to `null`.
+pub threadlocal var frame_cb: vk.VkCommandBuffer = null;
+
+/// Fence signaled when each frame's submit completes. Caller waits
+/// on it in `Frame.complete` before handing the target dmabuf to
+/// the host.
+///
+/// Created already signaled by `ensureInit`, reset by
+/// `beginFrameReset`, and waited on + destroyed by `cleanup`.
+pub threadlocal var frame_fence: vk.VkFence = null;
+
+/// Per-thread descriptor pool used by `RenderPass.step` to allocate
+/// fresh descriptor sets when the same pipeline is bound more than
+/// once in a single pass (vkCmdDraw reads descriptors at submit
+/// time, so re-using the pipeline's static set would silently
+/// corrupt prior draws). Reset at the start of every
+/// `beginFrameReset` so this frame's allocations don't pile on the
+/// previous frame's; the per-pass usage is bounded by a small
+/// constant — see the `STEP_POOL_*` caps above.
+///
+/// Initialized independently of the `frame_pool` triple: `ensureInit`
+/// retries it on its own whenever it is still `null`.
+pub threadlocal var step_pool: ?DescriptorPool = null;
+
+/// Create this thread's frame resources on first use.
+///
+/// The first call on a renderer thread builds the command pool, the
+/// single command buffer allocated from it, the frame fence (created
+/// signaled), and the per-frame descriptor pool. Every later call
+/// finds them in place and does nothing.
+///
+/// Failure-mode contract: an error leaves the threadlocals exactly as
+/// they were before the call, so the next `ensureInit` retries from
+/// a clean slate. A partially committed state such as
+/// `frame_pool != null and frame_cb == null` would make the
+/// `frame_pool == null` guard skip re-init forever and lock the
+/// thread out of the renderer.
+pub fn ensureInit(dev: *const Device) Error!void {
+    if (frame_pool == null) {
+        // Everything is built in locals; the threadlocals are only
+        // written once nothing in this block can fail any more. The
+        // errdefers unwind in reverse order of registration.
+        var new_pool = try CommandPool.init(dev);
+        errdefer new_pool.deinit();
+
+        var new_cb: vk.VkCommandBuffer = null;
+        const cb_info: vk.VkCommandBufferAllocateInfo = .{
+            .sType = vk.VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+            .pNext = null,
+            .commandPool = new_pool.pool,
+            .level = vk.VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+            .commandBufferCount = 1,
+        };
+        const cb_status = dev.dispatch.allocateCommandBuffers(dev.device, &cb_info, &new_cb);
+        if (cb_status != vk.VK_SUCCESS) return error.VulkanFailed;
+        // Registered after the pool's errdefer, so on failure the
+        // command buffer is freed before its pool is destroyed.
+        errdefer dev.dispatch.freeCommandBuffers(dev.device, new_pool.pool, 1, &new_cb);
+
+        var new_fence: vk.VkFence = null;
+        const fence_info: vk.VkFenceCreateInfo = .{
+            .sType = vk.VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
+            .pNext = null,
+            // Start signaled so the very first `Frame.complete`
+            // doesn't try to reset an unsignaled fence.
+            .flags = vk.VK_FENCE_CREATE_SIGNALED_BIT,
+        };
+        const fence_status = dev.dispatch.createFence(dev.device, &fence_info, null, &new_fence);
+        if (fence_status != vk.VK_SUCCESS) return error.VulkanFailed;
+
+        // The fence needs no errdefer: nothing fallible remains in
+        // this block, and the descriptor-pool step below lives in its
+        // own scope, so the three assignments land together.
+        frame_pool = new_pool;
+        frame_cb = new_cb;
+        frame_fence = new_fence;
+    }
+
+    // The descriptor pool is independent of the triple above. If its
+    // creation fails, the triple stays committed and only `step_pool`
+    // remains null, which the next call retries.
+    if (step_pool != null) return;
+    step_pool = try DescriptorPool.init(.{
+        .device = dev,
+        .max_sets = STEP_POOL_MAX_SETS,
+        .uniform_buffers = STEP_POOL_UNIFORM_BUFFERS,
+        .combined_image_samplers = STEP_POOL_COMBINED_IMAGE_SAMPLERS,
+        .storage_buffers = STEP_POOL_STORAGE_BUFFERS,
+    });
+}
+
+/// Rewind this thread's per-frame Vulkan state at the top of
+/// `beginFrame`: the command buffer first, then the descriptor pool
+/// (when one exists), and the frame fence last.
+///
+/// Returns `error.VulkanFailed` as soon as any of the three reset
+/// calls reports a non-success status; later steps are not attempted.
+///
+/// The caller is responsible for installing an `errdefer` re-signal
+/// of the fence so a failure here doesn't hang the next
+/// `Vulkan.deinit` on `waitForFences(UINT64_MAX)` — see the comment
+/// in `Vulkan.beginFrame` for the full rationale.
+pub fn beginFrameReset(dev: *const Device) error{VulkanFailed}!void {
+    const cb_status = dev.dispatch.resetCommandBuffer(frame_cb, 0);
+    if (cb_status != vk.VK_SUCCESS) return error.VulkanFailed;
+
+    if (step_pool) |*descriptors| {
+        const pool_status = dev.dispatch.resetDescriptorPool(dev.device, descriptors.pool, 0);
+        if (pool_status != vk.VK_SUCCESS) return error.VulkanFailed;
+    }
+
+    const fence_status = dev.dispatch.resetFences(dev.device, 1, &frame_fence);
+    if (fence_status != vk.VK_SUCCESS) return error.VulkanFailed;
+}
+
+/// Release everything `ensureInit` set up for THIS thread. Called
+/// from `Vulkan.deinit` on each surface.
+///
+/// Steps, in order: wait on the frame fence (covers any in-flight
+/// submit) and destroy it, free the command buffer, destroy the
+/// command pool and the descriptor pool, drain this thread's
+/// `buffer_pool` pending list (bounded by the same fence wait), and
+/// clear `last_target`. Each step is skipped when its resource was
+/// never created, so calling this on an uninitialized thread is safe.
+///
+/// Per-surface teardown only needs THIS surface's submissions to be
+/// done, so it blocks on this thread's fence rather than calling
+/// `vkDeviceWaitIdle` on the shared device, which would stall every
+/// other tab/split's in-flight GPU work just to close one. The
+/// final-refcount path in `Vulkan.deinit` does the device-wide
+/// waitIdle.
+pub fn cleanup(dev: *const Device) void {
+    if (frame_fence != null) {
+        const status = dev.dispatch.waitForFences(
+            dev.device,
+            1,
+            &frame_fence,
+            vk.VK_TRUE,
+            std.math.maxInt(u64),
+        );
+        if (status != vk.VK_SUCCESS) {
+            // The fence wait could not prove completion; fall back to
+            // the heavier device-wide wait before destroying anything.
+            log.warn(
+                "ThreadState.cleanup: vkWaitForFences returned {}, falling back to device-wide wait",
+                .{status},
+            );
+            dev.waitIdle();
+        }
+        dev.dispatch.destroyFence(dev.device, frame_fence, null);
+        frame_fence = null;
+    }
+
+    if (frame_pool) |*cmd_pool| {
+        // The command buffer has to go back to its pool before the
+        // pool itself is destroyed.
+        if (frame_cb != null) {
+            dev.dispatch.freeCommandBuffers(dev.device, cmd_pool.pool, 1, &frame_cb);
+            frame_cb = null;
+        }
+        cmd_pool.deinit();
+        frame_pool = null;
+    }
+
+    if (step_pool) |*descriptors| {
+        descriptors.deinit();
+        step_pool = null;
+    }
+
+    // The pending buffer-pool entries live in this thread's
+    // threadlocal storage, so only this thread can reach them — the
+    // final-refcount drainShared cannot. The fence wait above proved
+    // the GPU is finished with them.
+    buffer_pool.drainSelf(dev);
+
+    // `last_target` only borrows this thread's FrameState target
+    // slot. The SwapChain teardown destroys the target itself; here
+    // the reference is simply dropped.
+    last_target = null;
+}
--- a/src/renderer/vulkan/buffer.zig
+++ b/src/renderer/vulkan/buffer.zig
@ -0,0 +1,352 @@
+//! Host-coherent `VkBuffer` wrapper, generic over element type.
+//!
+//! Mirrors `src/renderer/opengl/buffer.zig`: `Buffer(T)` returns a
+//! struct that holds one buffer's worth of `T`s, with init / initFill
+//! / sync / syncFromArrayLists semantics that match the OpenGL
+//! contract.
+//!
+//! Storage strategy: `HOST_VISIBLE | HOST_COHERENT` memory.
+//! - HOST_VISIBLE lets us `vkMapMemory` the buffer and write directly.
+//! - HOST_COHERENT means the writes are visible to the GPU without a
+//!   `vkFlushMappedMemoryRanges` round-trip.
+//! - This is the simplest "dynamic" buffer pattern in Vulkan. It does
+//!   pay a small cost over device-local + staging on discrete GPUs,
+//!   but the renderer's per-frame buffer payloads are KBs (cell
+//!   instances + uniforms), not bandwidth-bound. The OpenGL backend
+//!   uses `dynamic_draw` for the same buffers, which behaves
+//!   similarly on most drivers.
+//!
+//! Growth policy: matches the OpenGL backend — `sync` doubles the
+//! buffer when content outgrows it, with no shrink. The buffer is
+//! recreated (destroy/create) on growth because Vulkan buffers are
+//! immutable in size.
+
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+const vulkan = @import("vulkan");
+const vk = vulkan.c;
+
+const Device = vulkan.Device;
+
+const log = std.log.scoped(.vulkan);
+
+/// Buffer construction parameters. The OpenGL backend's `target` /
+/// `usage` enums don't map to Vulkan — `target` (vertex vs element
+/// binding point) is replaced by descriptor binding at draw time, and
+/// `usage` (static_draw / dynamic_draw / etc.) is implicit in our
+/// host-coherent allocation strategy. What's left is the Vulkan
+/// `VkBufferUsageFlags` bitmask, which the renderer's `api.*BufferOptions`
+/// methods will return differently per buffer kind (VERTEX_BUFFER_BIT
+/// for instance buffers, UNIFORM_BUFFER_BIT for uniforms, etc.).
+pub const Options = struct {
+    /// Device whose dispatch table and `VkDevice` handle are used for
+    /// every Vulkan call made on behalf of the buffer. Held as a
+    /// pointer; the buffer code in this file never destroys it.
+    device: *const Device,
+    /// `VkBufferUsageFlagBits` for the buffer. Also the key the
+    /// recycle pool matches on when a released buffer is reused.
+    usage: vk.VkBufferUsageFlags,
+};
+
+/// Errors returned by `Buffer(T)` construction, growth and writes.
+pub const Error = error{
+    /// A `vkCreate*` / `vkAllocateMemory` / `vkBindBufferMemory` /
+    /// `vkMapMemory` returned a non-success status. Also returned by
+    /// `create` when the requested byte size (`len * @sizeOf(T)`)
+    /// overflows `u64`.
+    VulkanFailed,
+    /// `Device.findMemoryType` couldn't find a `HOST_VISIBLE | HOST_COHERENT`
+    /// memory type matching the buffer's requirements. Unlikely on any
+    /// real driver but worth flagging distinctly.
+    NoSuitableMemoryType,
+};
+
+/// `Buffer(T)`: a `VkBuffer` + backing `VkDeviceMemory` typed to hold
+/// some number of `T`s. Mirrors `opengl/buffer.zig`'s `Buffer(T)` so
+/// the renderer's call sites don't need a per-backend branch.
+pub fn Buffer(comptime T: type) type {
+    return struct {
+        const Self = @This();
+
+        /// Underlying `VkBuffer` handle.
+        buffer: vk.VkBuffer,
+        /// Backing memory. Host-coherent; mappable directly.
+        memory: vk.VkDeviceMemory,
+        /// Options this buffer was allocated with.
+        opts: Options,
+        /// Current capacity, in number of `T`s.
+        len: usize,
+
+        /// Create a buffer able to hold at least `len` `T`s. The
+        /// contents are uninitialized; call `sync` to populate them.
+        ///
+        /// The resulting `len` field can exceed the request when a
+        /// larger buffer is recycled from the pool.
+        pub fn init(opts: Options, len: usize) Error!Self {
+            return create(opts, len);
+        }
+
+        /// Create a buffer sized for `data` and copy `data` into it
+        /// starting at element 0. If the copy fails, the freshly
+        /// created buffer is handed to `deinit` before the error
+        /// propagates.
+        pub fn initFill(opts: Options, data: []const T) Error!Self {
+            const filled = try create(opts, data.len);
+            errdefer filled.deinit();
+            try filled.write(0, data);
+            return filled;
+        }
+
+        /// Hand the (VkBuffer, VkDeviceMemory) pair back to the
+        /// process-wide pool. The pool (see `Vulkan.buffer_pool`)
+        /// holds the entry until the current frame's fence has
+        /// signaled (the GPU is done with our recorded references)
+        /// and then makes it available to a future `Buffer.create`
+        /// call. Returning to the pool solves both:
+        ///   - `renderer/image.zig:draw`'s `defer buf.deinit()` no
+        ///     longer use-after-frees the in-flight buffer.
+        ///   - It avoids the per-frame allocation thrash that
+        ///     drove the driver to SIGSEGV on image-heavy frames.
+        ///
+        /// MUST be called only from the renderer thread (the path
+        /// whose fence will eventually retire references to this
+        /// buffer in `Frame.complete`). One-shot uploads (atlas
+        /// staging buffers, etc.) that already block on
+        /// `vkQueueWaitIdle` post-submit must use
+        /// `destroyImmediate` instead — they don't share the
+        /// renderer thread's fence cycle.
+        ///
+        /// The capacity reported to the pool is clamped to at least
+        /// one byte, mirroring the `@max(1, …)` rounding in `create`.
+        /// A zero-length buffer is still backed by a 1-byte VkBuffer;
+        /// reporting it as 0 bytes would create a pool entry that no
+        /// `acquire` (which always asks for >= 1 byte) can ever match,
+        /// so it would sit in the ready list until final teardown.
+        pub fn deinit(self: Self) void {
+            const dev = self.opts.device;
+            const bp = @import("../Vulkan.zig").buffer_pool;
+            const capacity_bytes: u64 = @max(1, @as(u64, self.len) * @sizeOf(T));
+            bp.release(
+                dev,
+                self.buffer,
+                self.memory,
+                self.opts.usage,
+                capacity_bytes,
+            ) catch |err| {
+                // OOM growing the pool. The buffer may still be
+                // referenced by an in-flight command buffer, so we
+                // wait the entire device idle before destroying —
+                // expensive but correct.
+                log.warn(
+                    "Buffer.deinit: pool release failed ({}); falling " ++
+                        "back to vkDeviceWaitIdle + destroy",
+                    .{err},
+                );
+                _ = dev.dispatch.deviceWaitIdle(dev.device);
+                dev.dispatch.destroyBuffer(dev.device, self.buffer, null);
+                dev.dispatch.freeMemory(dev.device, self.memory, null);
+            };
+        }
+
+        /// Destroy the `VkBuffer` and free its memory right now,
+        /// without going through the recycle pool.
+        ///
+        /// The caller MUST guarantee that no in-flight command buffer
+        /// still references this buffer (e.g. by having waited on a
+        /// fence or `vkQueueWaitIdle` covering its submission).
+        ///
+        /// Intended for short-lived staging buffers such as the one in
+        /// `Texture.replaceRegion`, whose lifetime is bounded by a
+        /// `OneShot.endAndSubmit` that already drains the queue.
+        /// Pushing those into the pool from a non-renderer thread
+        /// would leak them: the renderer thread's `cycle` runs the
+        /// pool, so an upload thread's pushes never get reused.
+        pub fn destroyImmediate(self: Self) void {
+            const device = self.opts.device;
+            const vkd = device.dispatch;
+            vkd.destroyBuffer(device.device, self.buffer, null);
+            vkd.freeMemory(device.device, self.memory, null);
+        }
+
+        /// Overwrite the start of the buffer with `data`.
+        ///
+        /// When `data` has more elements than the current capacity,
+        /// the buffer is first regrown to twice `data.len` (the same
+        /// growth policy as the OpenGL backend). When `data` is
+        /// shorter than the capacity, the trailing slots are left
+        /// untouched.
+        pub fn sync(self: *Self, data: []const T) Error!void {
+            const needs_growth = data.len > self.len;
+            if (needs_growth) {
+                try self.grow(data.len * 2);
+            }
+            try self.write(0, data);
+        }
+
+        /// Like `sync`, but the payload is the concatenation of
+        /// several `ArrayList`s, written back to back in the order
+        /// given. Grows to twice the combined length when the
+        /// combined length exceeds the current capacity.
+        ///
+        /// Returns the total number of elements written.
+        pub fn syncFromArrayLists(
+            self: *Self,
+            lists: []const std.ArrayListUnmanaged(T),
+        ) Error!usize {
+            const total = count: {
+                var sum: usize = 0;
+                for (lists) |list| sum += list.items.len;
+                break :count sum;
+            };
+
+            if (self.len < total) try self.grow(total * 2);
+
+            // `cursor` is the element offset at which the next
+            // non-empty list is written.
+            var cursor: usize = 0;
+            for (lists) |list| {
+                const items = list.items;
+                if (items.len > 0) {
+                    try self.write(cursor, items);
+                    cursor += items.len;
+                }
+            }
+            return total;
+        }
+
+        // ---- internals -------------------------------------------
+
+        /// Produce a buffer holding at least `len` `T`s: recycle a
+        /// matching entry from the buffer pool when one is ready,
+        /// otherwise create a new `VkBuffer`, allocate
+        /// `HOST_VISIBLE | HOST_COHERENT` memory for it and bind the
+        /// two together.
+        ///
+        /// Errors: `VulkanFailed` when the byte size overflows `u64`
+        /// or a Vulkan call reports failure; `NoSuitableMemoryType`
+        /// when no host-visible, host-coherent memory type fits.
+        fn create(opts: Options, len: usize) Error!Self {
+            const dev = opts.device;
+
+            // Vulkan requires `size > 0` for buffer creation, so a
+            // zero request is rounded up to one byte; the buffer then
+            // exists and can be grown later via `sync`. (OpenGL
+            // silently accepts size=0.)
+            //
+            // The byte size is computed in u64 with a checked
+            // multiply so it cannot wrap on 32-bit hosts (or, in
+            // theory, on a 64-bit host with an astronomically large
+            // `len`). An overflow is reported as `error.VulkanFailed`:
+            // the request is unserviceable no matter how it came
+            // about.
+            const elem_count: u64 = @intCast(len);
+            const requested_bytes = std.math.mul(u64, elem_count, @sizeOf(T)) catch
+                return error.VulkanFailed;
+            const byte_size: u64 = @max(1, requested_bytes);
+
+            // Try the recycle pool before touching the allocator: a
+            // previously released buffer with the same usage and
+            // enough capacity can be reused as-is. Image-draw frames
+            // stabilize at ~hundreds of pool entries per (usage,
+            // size) bucket.
+            const pool = @import("../Vulkan.zig").buffer_pool;
+            if (pool.acquire(opts.usage, byte_size)) |recycled| {
+                return .{
+                    .buffer = recycled.buffer,
+                    .memory = recycled.memory,
+                    .opts = opts,
+                    // A recycled buffer may be bigger than requested;
+                    // expose its full capacity in elements.
+                    .len = @intCast(recycled.capacity / @sizeOf(T)),
+                };
+            }
+
+            // ---- VkBuffer ----------------------------------------
+            var buffer: vk.VkBuffer = undefined;
+            const buffer_info: vk.VkBufferCreateInfo = .{
+                .sType = vk.VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+                .pNext = null,
+                .flags = 0,
+                .size = byte_size,
+                .usage = opts.usage,
+                .sharingMode = vk.VK_SHARING_MODE_EXCLUSIVE,
+                .queueFamilyIndexCount = 0,
+                .pQueueFamilyIndices = null,
+            };
+            const create_status = dev.dispatch.createBuffer(dev.device, &buffer_info, null, &buffer);
+            if (create_status != vk.VK_SUCCESS) {
+                log.err("vkCreateBuffer failed: result={}", .{create_status});
+                return error.VulkanFailed;
+            }
+            errdefer dev.dispatch.destroyBuffer(dev.device, buffer, null);
+
+            // ---- backing memory ----------------------------------
+            var requirements: vk.VkMemoryRequirements = undefined;
+            dev.dispatch.getBufferMemoryRequirements(dev.device, buffer, &requirements);
+
+            const wanted_props = vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+            const type_index = dev.findMemoryType(requirements.memoryTypeBits, wanted_props) orelse {
+                log.err(
+                    "no HOST_VISIBLE|HOST_COHERENT memory type for buffer (typeBits=0x{x})",
+                    .{requirements.memoryTypeBits},
+                );
+                return error.NoSuitableMemoryType;
+            };
+
+            var memory: vk.VkDeviceMemory = undefined;
+            const memory_info: vk.VkMemoryAllocateInfo = .{
+                .sType = vk.VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+                .pNext = null,
+                .allocationSize = requirements.size,
+                .memoryTypeIndex = type_index,
+            };
+            const alloc_status = dev.dispatch.allocateMemory(dev.device, &memory_info, null, &memory);
+            if (alloc_status != vk.VK_SUCCESS) {
+                log.err("vkAllocateMemory (buffer) failed: result={}", .{alloc_status});
+                return error.VulkanFailed;
+            }
+            errdefer dev.dispatch.freeMemory(dev.device, memory, null);
+
+            // ---- bind --------------------------------------------
+            const bind_status = dev.dispatch.bindBufferMemory(dev.device, buffer, memory, 0);
+            if (bind_status != vk.VK_SUCCESS) {
+                log.err("vkBindBufferMemory failed: result={}", .{bind_status});
+                return error.VulkanFailed;
+            }
+
+            return .{
+                .buffer = buffer,
+                .memory = memory,
+                .opts = opts,
+                .len = len,
+            };
+        }
+
+        /// Grow the buffer to hold at least `new_len` Ts. Vulkan
+        /// buffers are immutable in size, so we allocate a fresh
+        /// one and then route the old one through the recycle pool
+        /// (it may still be referenced by the in-flight command
+        /// buffer — destroying it directly would race the GPU same
+        /// as `deinit` would). Contents are discarded; callers
+        /// always `sync` immediately after `grow` returns.
+        ///
+        /// Order is critical: `create` first, `release` second.
+        /// If we released the old buffer first and `create`
+        /// failed, `self.{buffer,memory}` would be left dangling
+        /// at freed handles, and the caller's eventual
+        /// `self.deinit()` would double-destroy via the pool.
+        ///
+        /// Returns whatever `create` returns on failure; in that case
+        /// `self` is left untouched and still owns the old pair.
+        fn grow(self: *Self, new_len: usize) Error!void {
+            const dev = self.opts.device;
+            const replacement = try create(self.opts, new_len);
+            // From here on `self.{buffer,memory}` are the OLD pair;
+            // release them. If `release` itself OOMs, we have to
+            // destroy directly (same fallback as `deinit`), but the
+            // new pair is already constructed and `self.* =
+            // replacement` will reach a healthy state regardless.
+            const bp = @import("../Vulkan.zig").buffer_pool;
+            // Clamp to one byte to match the `@max(1, …)` rounding in
+            // `create`: a zero-length buffer is still backed by a
+            // 1-byte VkBuffer, and a pool entry reported as 0 bytes
+            // could never satisfy an `acquire`, so it would linger in
+            // the ready list until final teardown.
+            const capacity_bytes: u64 = @max(1, @as(u64, self.len) * @sizeOf(T));
+            bp.release(
+                dev,
+                self.buffer,
+                self.memory,
+                self.opts.usage,
+                capacity_bytes,
+            ) catch |err| {
+                // Don't swallow the failure silently — log it the
+                // same way `deinit` does before the slow fallback.
+                log.warn(
+                    "Buffer.grow: pool release failed ({}); falling " ++
+                        "back to vkDeviceWaitIdle + destroy",
+                    .{err},
+                );
+                _ = dev.dispatch.deviceWaitIdle(dev.device);
+                dev.dispatch.destroyBuffer(dev.device, self.buffer, null);
+                dev.dispatch.freeMemory(dev.device, self.memory, null);
+            };
+            self.* = replacement;
+        }
+
+        /// Copy `data` into the buffer, beginning at element offset
+        /// `elem_off`. The affected byte range is mapped, filled and
+        /// unmapped again; because the memory is host-coherent the
+        /// GPU sees the writes without an explicit flush.
+        ///
+        /// An empty `data` is a no-op. Returns `error.VulkanFailed`
+        /// when `vkMapMemory` fails. The caller is expected to have
+        /// ensured `elem_off + data.len` fits in the buffer.
+        fn write(self: *const Self, elem_off: usize, data: []const T) Error!void {
+            if (data.len == 0) return;
+
+            const dev = self.opts.device;
+            const payload = std.mem.sliceAsBytes(data);
+            const map_offset: u64 = elem_off * @sizeOf(T);
+            const map_size: u64 = payload.len;
+
+            var mapped: ?*anyopaque = null;
+            const status = dev.dispatch.mapMemory(
+                dev.device,
+                self.memory,
+                map_offset,
+                map_size,
+                0,
+                &mapped,
+            );
+            if (status != vk.VK_SUCCESS) {
+                log.err("vkMapMemory failed: result={}", .{status});
+                return error.VulkanFailed;
+            }
+            defer dev.dispatch.unmapMemory(dev.device, self.memory);
+
+            // The mapping starts at `map_offset`, so the destination
+            // view begins at index 0 of the mapped pointer.
+            const dst: [*]u8 = @ptrCast(mapped.?);
+            @memcpy(dst[0..payload.len], payload);
+        }
+    };
+}
+
+test {
+    // Exercise top-level decls of a representative instantiation so
+    // type errors in the generic body surface during compile-check.
+    // `u32` is an arbitrary element type picked for that purpose; the
+    // generic body is not analyzed until it is instantiated.
+    std.testing.refAllDecls(Buffer(u32));
+}
--- a/src/renderer/vulkan/buffer_pool.zig
+++ b/src/renderer/vulkan/buffer_pool.zig
@ -0,0 +1,189 @@
+//! Process-wide pool of `(VkBuffer, VkDeviceMemory)` pairs recycled
+//! across frames on the renderer thread. Solves two problems
+//! together:
+//!
+//!   1. Lifetime: `vulkan/buffer.zig`'s `Buffer.deinit` is called
+//!      mid-frame (by `renderer/image.zig:draw`'s `defer buf.deinit()`)
+//!      while the command buffer that references the buffer hasn't
+//!      been submitted yet. Naive immediate destroy → use-after-free.
+//!   2. Allocation thrash: a frame with N kitty-image placements
+//!      would otherwise allocate N tiny VkBuffers + VkDeviceMemories
+//!      per frame, every frame. NVIDIA driver SIGSEGVs after a few
+//!      seconds of that.
+//!
+//! Multi-thread design: `pending` is THREADLOCAL (each renderer
+//! thread accumulates the buffers IT released during the current
+//! frame), while `ready` is process-wide and mutex-protected (any
+//! thread can recycle from it). Splits/tabs run independent
+//! renderer threads against the SAME shared VkDevice — a single
+//! shared `pending` list would let thread A's `Frame.complete`
+//! retire buffers thread B released but whose fence hasn't
+//! signaled yet, handing B's still-GPU-in-flight buffer back to a
+//! new `acquire`. Per-thread pending bounds the visibility of
+//! each entry to the thread that knows when its fence signals.
+//!
+//! Lifecycle:
+//!   - `release(dev, …)` (renderer thread) pushes to THAT thread's
+//!     `pending`.
+//!   - `cycle(dev)` (renderer thread, after `vkWaitForFences` on
+//!     the SAME thread's per-frame fence) moves THAT thread's
+//!     `pending` → shared `ready` under the mutex.
+//!   - `acquire(…)` (any thread) pops a matching entry from `ready`
+//!     under the mutex.
+//!
+//! Caller responsibilities:
+//!   - Only call `release` from the renderer thread whose fence
+//!     the frame's GPU work signals; calling from a thread that
+//!     never reaches its own `Frame.complete` would leak entries
+//!     (they sit in that thread's `pending` forever). For one-shot
+//!     uploads from a non-renderer thread (atlas staging), use
+//!     `Buffer.destroyImmediate` instead, which bypasses this
+//!     pool entirely.
+
+const std = @import("std");
+const vulkan = @import("vulkan");
+const vk = vulkan.c;
+
+const Device = vulkan.Device;
+
+const log = std.log.scoped(.vulkan);
+
+/// One recyclable buffer: the Vulkan handles plus the two keys
+/// (`usage`, `capacity`) that `acquire` matches on.
+pub const Entry = struct {
+    /// The pooled `VkBuffer` handle.
+    buffer: vk.VkBuffer,
+    /// The `VkDeviceMemory` handle released together with `buffer`.
+    memory: vk.VkDeviceMemory,
+    /// Usage flags the buffer was created with. `acquire` only hands
+    /// out an entry whose flags equal the request exactly.
+    usage: vk.VkBufferUsageFlags,
+    /// Capacity as reported by the releasing caller. `acquire`
+    /// requires it to be at least the requested minimum.
+    capacity: u64,
+};
+
+/// Guards the process-wide `ready` list. Per-thread `pending` is
+/// threadlocal and never under this mutex.
+var ready_mutex: std.Thread.Mutex = .{};
+
+/// Per-thread pending list. Entries here were released by THIS
+/// thread during the current frame and are bounded by the
+/// fence THIS thread will wait on in `Frame.complete`. Moved
+/// to the shared `ready` list by `cycle()` after that wait
+/// returns.
+///
+/// Backing storage comes from `std.heap.smp_allocator`.
+threadlocal var pending: std.ArrayList(Entry) = .{};
+
+/// Process-wide ready list. Entries here are provably retired
+/// (the bounding fence has signaled) and any thread may
+/// `acquire` them.
+///
+/// Backing storage comes from `std.heap.smp_allocator`; every access
+/// in this file happens with `ready_mutex` held.
+var ready: std.ArrayList(Entry) = .{};
+
+/// Park a buffer for later reuse. The entry is appended to THIS
+/// thread's `pending` list and stays there until the next `cycle`
+/// (i.e. until this thread's per-frame fence has been waited on);
+/// only then can an `acquire` hand it out again. See the per-thread
+/// pending rationale at the top of this module.
+///
+/// `dev` is accepted but not used. Fails only when the pending list
+/// cannot grow; in that case the caller still owns the buffer.
+pub fn release(
+    dev: *const Device,
+    buffer: vk.VkBuffer,
+    memory: vk.VkDeviceMemory,
+    usage: vk.VkBufferUsageFlags,
+    capacity: u64,
+) !void {
+    _ = dev;
+    const entry: Entry = .{
+        .buffer = buffer,
+        .memory = memory,
+        .usage = usage,
+        .capacity = capacity,
+    };
+    // `pending` is threadlocal — no other thread can touch it, so no
+    // lock is taken here.
+    try pending.append(std.heap.smp_allocator, entry);
+}
+
+/// Remove and return the first `ready` entry whose usage flags equal
+/// `usage` and whose capacity is at least `min_capacity`, or `null`
+/// when no entry qualifies. Runs under `ready_mutex`.
+///
+/// The scan is linear: pools tend to hold only a few distinct
+/// (usage, size) shapes (image: 48B VERTEX, bg_image: 8B VERTEX), so
+/// this stays cheap.
+pub fn acquire(
+    usage: vk.VkBufferUsageFlags,
+    min_capacity: u64,
+) ?Entry {
+    ready_mutex.lock();
+    defer ready_mutex.unlock();
+
+    for (ready.items, 0..) |candidate, idx| {
+        const fits = candidate.usage == usage and candidate.capacity >= min_capacity;
+        if (!fits) continue;
+        // Returning straight away, so disturbing the order of the
+        // list while iterating is harmless.
+        return ready.swapRemove(idx);
+    }
+    return null;
+}
+
+/// Publish THIS thread's `pending` entries to the shared `ready`
+/// list. Call from `Frame.complete` after `vkWaitForFences`: once
+/// this thread's fence has signaled, the GPU is done with every
+/// buffer in `pending`.
+///
+/// `dev` is only used on the out-of-memory fallback: if `ready`
+/// cannot grow to absorb `pending`, the pending entries are destroyed
+/// directly (after a device-wide idle wait performed OUTSIDE the
+/// mutex) so the list does not keep growing without ever draining.
+pub fn cycle(dev: *const Device) void {
+    const gpa = std.heap.smp_allocator;
+
+    // Receives this thread's pending list when the hand-off below
+    // fails. Stays empty on the fast path.
+    var orphaned: std.ArrayList(Entry) = .{};
+    defer orphaned.deinit(gpa);
+
+    handoff: {
+        ready_mutex.lock();
+        defer ready_mutex.unlock();
+
+        ready.appendSlice(gpa, pending.items) catch {
+            // OOM. `vkDeviceWaitIdle` is slow, and holding the pool
+            // mutex across it would block every other renderer
+            // thread's acquire/cycle — so take ownership of the list
+            // here and do the teardown after the lock is dropped.
+            orphaned = pending;
+            pending = .{};
+            break :handoff;
+        };
+
+        // Fast path: everything now lives in `ready`.
+        pending.clearRetainingCapacity();
+        return;
+    }
+
+    // Mutex released; other threads may use the pool while this one
+    // waits for the device and destroys its own entries.
+    _ = dev.dispatch.deviceWaitIdle(dev.device);
+    for (orphaned.items) |stale| {
+        dev.dispatch.destroyBuffer(dev.device, stale.buffer, null);
+        dev.dispatch.freeMemory(dev.device, stale.memory, null);
+    }
+}
+
+/// Destroy THIS thread's `pending` entries directly. Call from
+/// the same thread's `Vulkan.deinit` AFTER `vkWaitForFences`
+/// on this thread's frame fence — the bounding fence has
+/// signaled so the GPU is provably done with these buffers.
+///
+/// Each renderer thread is responsible for cleaning up its own
+/// pending list because Zig threadlocal storage is the calling
+/// thread's; the final-refcount tear-down (`drainShared`) only
+/// handles the process-wide `ready` list.
+///
+/// The list's backing allocation is released as well: this runs at
+/// thread teardown, and nothing else would ever free a threadlocal
+/// list's heap storage. The list remains valid (empty) afterwards,
+/// so a later `release` on the same thread still works.
+pub fn drainSelf(dev: *const Device) void {
+    for (pending.items) |e| {
+        dev.dispatch.destroyBuffer(dev.device, e.buffer, null);
+        dev.dispatch.freeMemory(dev.device, e.memory, null);
+    }
+    pending.clearAndFree(std.heap.smp_allocator);
+}
+
+/// Destroy every entry in the shared `ready` list. Call only
+/// from the FINAL surface tear-down (the path that hits
+/// `device_refcount == 0`) and only after every other renderer
+/// thread has already run `drainSelf` on its own pending list.
+///
+/// The list's backing allocation is released too, since this is the
+/// last use of the pool for the device. The list remains valid
+/// (empty) afterwards, so the pool can be reused if a new device is
+/// brought up later in the same process.
+pub fn drainShared(dev: *const Device) void {
+    ready_mutex.lock();
+    defer ready_mutex.unlock();
+    for (ready.items) |e| {
+        dev.dispatch.destroyBuffer(dev.device, e.buffer, null);
+        dev.dispatch.freeMemory(dev.device, e.memory, null);
+    }
+    ready.clearAndFree(std.heap.smp_allocator);
+}
--- a/src/renderer/vulkan/shaders.zig
+++ b/src/renderer/vulkan/shaders.zig
--- a/src/terminal/PageList.zig
+++ b/src/terminal/PageList.zig
@ -1051,10 +1051,26 @@ fn resizeCols(
            break :wrapped wrapped;
        };

+        // `c.y` is the cursor row from BEFORE this resize. When the
+        // call sequence is `resizeWithoutReflow(new_rows, old_cols)`
+        // → `resizeCols(new_cols)` (the `.lt` arm above), `self.rows`
+        // has already been reduced to the new row count by the time
+        // we run, so a cursor strictly past the new bottom (`c.y >=
+        // self.rows`) would underflow `self.rows - c.y - 1`. Clamp
+        // to zero remaining rows in that case — the cursor
+        // effectively sits on the last visible row after the
+        // shrink. Note: `c.y == self.rows - 1` (cursor AT the new
+        // bottom) does NOT underflow, but the `c.y + 1 >= self.rows`
+        // form still returns 0 there, matching the old
+        // `self.rows - c.y - 1 == 0` result.
+        const remaining_rows: usize = if (c.y + 1 >= self.rows)
+            0
+        else
+            self.rows - c.y - 1;
        break :cursor .{
            .tracked_pin = c.pin orelse try self.trackPin(p),
            .untrack = c.pin == null,
-            .remaining_rows = self.rows - c.y - 1,
+            .remaining_rows = remaining_rows,
            .wrapped_rows = wrapped,
        };
    } else null;
--- a/src/vulkan_spvgen.zig
+++ b/src/vulkan_spvgen.zig
@ -0,0 +1,88 @@
+//! Build-time tool: compiles one of `src/renderer/vulkan/shaders.zig`'s
+//! `source.*` constants to SPIR-V and writes the bytes to stdout.
+//!
+//! Invoked by `src/build/VulkanSpv.zig` once per (shader_name, stage)
+//! pair so the renderer can `@embedFile` the resulting .spv blobs
+//! and call `Module.initFromSpirv` for built-ins instead of going
+//! through `glslang.vk.compileToSpv` at runtime. The runtime path
+//! is what populates glslang's per-thread `TPoolAllocator`, which
+//! never releases its high-water-mark pages (Zig pthreads don't
+//! run C++ thread_local destructors) — heaptrack attributed ~10 MB
+//! to that residual leak on the Vulkan variant, exactly the delta
+//! over OpenGL (which never invokes glslang for its built-ins
+//! because the GPU driver compiles GLSL natively).
+//!
+//! Usage:
+//!   vulkan_spvgen <shader_name> <stage>
+//!
+//! Where `shader_name` is one of the public decls of
+//! `vulkan.shaders.source` (e.g. `bg_color_frag`, `cell_text_vert`)
+//! and `stage` is `vertex` or `fragment`.
+//!
+//! On success: writes binary SPIR-V to stdout, exits 0.
+//! On failure: writes a diagnostic to stderr, exits 1.
+
+const std = @import("std");
+const shaders = @import("renderer/vulkan/shaders.zig");
+const glslang = @import("glslang");
+
+pub fn main() !void {
+    var debug_allocator: std.heap.GeneralPurposeAllocator(.{}) = .{};
+    defer _ = debug_allocator.deinit();
+    const allocator = debug_allocator.allocator();
+
+    const argv = try std.process.argsAlloc(allocator);
+    defer std.process.argsFree(allocator, argv);
+
+    // Expect exactly the program name plus <shader_name> and <stage>.
+    if (argv.len != 3) {
+        std.debug.print(
+            "usage: {s} <shader_name> <vertex|fragment>\n",
+            .{argv[0]},
+        );
+        std.process.exit(1);
+    }
+    const shader_name = argv[1];
+    const stage = std.meta.stringToEnum(shaders.Stage, argv[2]) orelse {
+        std.debug.print("invalid stage: {s}\n", .{argv[2]});
+        std.process.exit(1);
+    };
+
+    try glslang.init();
+    defer glslang.finalize();
+
+    // Find the `shaders.source` declaration named on the command
+    // line. The declaration list is walked at comptime (`inline for`)
+    // while each name comparison happens at runtime, so the build
+    // step can select any built-in by string.
+    var found: ?[:0]const u8 = null;
+    inline for (@typeInfo(shaders.source).@"struct".decls) |decl| {
+        if (std.mem.eql(u8, decl.name, shader_name)) {
+            found = @field(shaders.source, decl.name);
+        }
+    }
+    const glsl = found orelse {
+        std.debug.print("unknown shader: {s}\n", .{shader_name});
+        std.process.exit(1);
+    };
+
+    // Apply the Vulkan-flavor rewrite (gl_VertexID → gl_VertexIndex,
+    // multi-set descriptor layout, etc.), the same translation the
+    // runtime path applied before shaders were precompiled.
+    const vk_glsl = try shaders.vulkanizeGlsl(allocator, glsl);
+    defer allocator.free(vk_glsl);
+
+    const spirv = try glslang.vk.compileToSpv(
+        allocator,
+        vk_glsl,
+        stage.vkBindingStage(),
+    );
+    defer allocator.free(spirv);
+
+    // Emit the SPIR-V words as raw bytes in host byte order; the build
+    // step captures stdout into the .spv file the renderer embeds.
+    var out_buf: [4096]u8 = undefined;
+    var out = std.fs.File.stdout().writerStreaming(&out_buf);
+    try out.interface.writeAll(std.mem.sliceAsBytes(spirv));
+    try out.end();
+}