From 9a7a31ac3766f45a1dbc03951b6230ad1f2b3136 Mon Sep 17 00:00:00 2001
From: Nathan <nate0001@gmail.com>
Date: Sun, 24 May 2026 22:56:07 -0500
Subject: [PATCH] qt/wayland: zero-copy dmabuf present via wl_subsurface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SubsurfacePresenter now binds zwp_linux_dmabuf_v1 (vendored XML;
hermetic build), wraps libghostty's dmabuf fd in a wl_buffer via
create_immed, then attaches, damages, and commits it on the subsurface.
The compositor scans the buffer out directly — no mmap, no memcpy, no
QImage, no QPainter blit on the terminal pixels. paintEvent skips
its blit when the subsurface path is active so the translucent
QWidget background lets the subsurface show through; chrome (split
dim, bell flash, resize overlay) still paints on top.

Frame delivery is QMetaObject::invokeMethod(Qt::QueuedConnection)
per present, with a 2 ms QTimer as a safety net for any missed
queued lambda (the prior 16 ms poll was a leftover from the QImage
path, capped present at 60 Hz, and added up to a frame of latency).

C ABI: ghostty_platform_vulkan_s.present grows a bool `image_backed`
parameter. NVIDIA in legacy_copy mode exports the dmabuf from a
VkBuffer that linux-dmabuf-v1 cannot import as a 2D image —
attempting it would trigger an `invalid_wl_buffer` protocol error,
which is fatal for the wl_display connection. Target.present sets
the flag based on Target.tiling; the host only takes the subsurface
path when set, falls back to the QImage/QPainter path otherwise.

Verified on NVIDIA RTX 2080 (legacy_copy → image_backed=0 →
path=qimage → no protocol error). Subsurface presenter still
constructs and would activate on AMD/Intel hardware where Phase 1's
direct mode succeeds. Subsequent phases will add vendor-tiled
modifier support so NVIDIA can use the zero-copy path too.

Co-Authored-By: claude-flow <ruv@ruv.net>
---
 include/ghostty.h                      |  19 +-
 qt/CMakeLists.txt                      |  17 +
 qt/protocols/linux-dmabuf-v1.xml       | 585 +++++++++++++++++++++++++
 qt/src/GhosttySurface.cpp              | 191 +++++---
 qt/src/GhosttySurface.h                |  64 ++-
 qt/src/vulkan/Host.cpp                 |  10 +-
 qt/src/wayland/SubsurfacePresenter.cpp | 126 +++++-
 qt/src/wayland/SubsurfacePresenter.h   |  64 +--
 src/apprt/embedded.zig                 |   9 +-
 src/renderer/vulkan/Target.zig         |   6 +
 10 files changed, 956 insertions(+), 135 deletions(-)
 create mode 100644 qt/protocols/linux-dmabuf-v1.xml

diff --git a/include/ghostty.h b/include/ghostty.h
index 1fec03a93..aced06412 100644
--- a/include/ghostty.h
+++ b/include/ghostty.h
@@ -514,7 +514,21 @@ typedef struct {
   uint32_t (*queue_family_index)(void* userdata);
 
   // Hand off a rendered frame to the host as a dmabuf fd. The host
-  // imports it (e.g. into Qt's RHI as a QRhiTexture) and composites.
+  // imports it (e.g. into Qt's RHI as a QRhiTexture, or attaches to
+  // a wl_subsurface via linux-dmabuf-v1) and composites.
+  //
+  // `image_backed` is true when the dmabuf was exported from a
+  // VkImage allocated with VK_EXT_image_drm_format_modifier — i.e.
+  // it's directly importable as a 2D image by the compositor or any
+  // GPU-side consumer. false when it was exported from a VkBuffer
+  // (the legacy NVIDIA fallback path where the driver doesn't
+  // advertise COLOR_ATTACHMENT for the LINEAR modifier on
+  // exportable images, so libghostty renders into an OPTIMAL image
+  // and copies the bytes into a linear VkBuffer for export). In the
+  // !image_backed case the fd is only usable via mmap + CPU
+  // readback — attempting a linux-dmabuf-v1 import can trigger a
+  // fatal `invalid_wl_buffer` protocol error.
+  //
   // libghostty retains ownership of the underlying VkDeviceMemory;
   // the host must dup() the fd if it needs to hold it past the call.
   void (*present)(
@@ -524,7 +538,8 @@ typedef struct {
       uint64_t drm_modifier,
       uint32_t width,
       uint32_t height,
-      uint32_t stride);
+      uint32_t stride,
+      bool image_backed);
 } ghostty_platform_vulkan_s;
 
 typedef union {
diff --git a/qt/CMakeLists.txt b/qt/CMakeLists.txt
index 1a78bad66..41186a7dc 100644
--- a/qt/CMakeLists.txt
+++ b/qt/CMakeLists.txt
@@ -71,6 +71,21 @@ add_custom_command(OUTPUT "${BLUR_CODE}"
   COMMAND "${WAYLAND_SCANNER}" private-code "${BLUR_XML}" "${BLUR_CODE}"
   DEPENDS "${BLUR_XML}" VERBATIM)
 
+# Generate client glue for the linux-dmabuf-v1 protocol (used by the
+# Vulkan present path: wrap libghostty's dmabuf fd in a wl_buffer and
+# attach it to the wayland::SubsurfacePresenter's wl_surface). Vendored
+# in qt/protocols/ so the build doesn't depend on
+# /usr/share/wayland-protocols being installed.
+set(DMABUF_XML "${CMAKE_CURRENT_SOURCE_DIR}/protocols/linux-dmabuf-v1.xml")
+set(DMABUF_HEADER "${CMAKE_CURRENT_BINARY_DIR}/linux-dmabuf-v1-client-protocol.h")
+set(DMABUF_CODE "${CMAKE_CURRENT_BINARY_DIR}/linux-dmabuf-v1-protocol.c")
+add_custom_command(OUTPUT "${DMABUF_HEADER}"
+  COMMAND "${WAYLAND_SCANNER}" client-header "${DMABUF_XML}" "${DMABUF_HEADER}"
+  DEPENDS "${DMABUF_XML}" VERBATIM)
+add_custom_command(OUTPUT "${DMABUF_CODE}"
+  COMMAND "${WAYLAND_SCANNER}" private-code "${DMABUF_XML}" "${DMABUF_CODE}"
+  DEPENDS "${DMABUF_XML}" VERBATIM)
+
 # libghostty is built out-of-tree by Zig.
 get_filename_component(GHOSTTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/.." ABSOLUTE)
 set(GHOSTTY_LIB_DIR "${GHOSTTY_ROOT}/zig-out/lib")
@@ -152,6 +167,8 @@ add_executable(ghastty
   src/XkbTracker.cpp
   "${BLUR_CODE}"
   "${BLUR_HEADER}"
+  "${DMABUF_CODE}"
+  "${DMABUF_HEADER}"
 )
 
 # Embed the app icon so it is available even running from the build tree.
diff --git a/qt/protocols/linux-dmabuf-v1.xml b/qt/protocols/linux-dmabuf-v1.xml
new file mode 100644
index 000000000..12d09fb28
--- /dev/null
+++ b/qt/protocols/linux-dmabuf-v1.xml
@@ -0,0 +1,585 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<protocol name="linux_dmabuf_v1">
+
+  <copyright>
+    Copyright © 2014, 2015 Collabora, Ltd.
+
+    Permission is hereby granted, free of charge, to any person obtaining a
+    copy of this software and associated documentation files (the "Software"),
+    to deal in the Software without restriction, including without limitation
+    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+    and/or sell copies of the Software, and to permit persons to whom the
+    Software is furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice (including the next
+    paragraph) shall be included in all copies or substantial portions of the
+    Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+  </copyright>
+
+  <interface name="zwp_linux_dmabuf_v1" version="5">
+    <description summary="factory for creating dmabuf-based wl_buffers">
+      This interface offers ways to create generic dmabuf-based wl_buffers.
+
+      For more information about dmabuf, see:
+      https://www.kernel.org/doc/html/next/userspace-api/dma-buf-alloc-exchange.html
+
+      Clients can use the get_surface_feedback request to get dmabuf feedback
+      for a particular surface. If the client wants to retrieve feedback not
+      tied to a surface, they can use the get_default_feedback request.
+
+      The following are required from clients:
+
+      - Clients must ensure that either all data in the dma-buf is
+        coherent for all subsequent read access or that coherency is
+        correctly handled by the underlying kernel-side dma-buf
+        implementation.
+
+      - Don't make any more attachments after sending the buffer to the
+        compositor. Making more attachments later increases the risk of
+        the compositor not being able to use (re-import) an existing
+        dmabuf-based wl_buffer.
+
+      The underlying graphics stack must ensure the following:
+
+      - The dmabuf file descriptors relayed to the server will stay valid
+        for the whole lifetime of the wl_buffer. This means the server may
+        at any time use those fds to import the dmabuf into any kernel
+        sub-system that might accept it.
+
+      However, when the underlying graphics stack fails to deliver the
+      promise, because of e.g. a device hot-unplug which raises internal
+      errors, after the wl_buffer has been successfully created the
+      compositor must not raise protocol errors to the client when dmabuf
+      import later fails.
+
+      To create a wl_buffer from one or more dmabufs, a client creates a
+      zwp_linux_buffer_params_v1 object with a zwp_linux_dmabuf_v1.create_params
+      request. All planes required by the intended format are added with
+      the 'add' request. Finally, a 'create' or 'create_immed' request is
+      issued, which has the following outcome depending on the import success.
+
+      The 'create' request,
+      - on success, triggers a 'created' event which provides the final
+        wl_buffer to the client.
+      - on failure, triggers a 'failed' event to convey that the server
+        cannot use the dmabufs received from the client.
+
+      For the 'create_immed' request,
+      - on success, the server immediately imports the added dmabufs to
+        create a wl_buffer. No event is sent from the server in this case.
+      - on failure, the server can choose to either:
+        - terminate the client by raising a fatal error.
+        - mark the wl_buffer as failed, and send a 'failed' event to the
+          client. If the client uses a failed wl_buffer as an argument to any
+          request, the behaviour is compositor implementation-defined.
+
+      For all DRM formats and unless specified in another protocol extension,
+      pre-multiplied alpha is used for pixel values.
+
+      Unless specified otherwise in another protocol extension, implicit
+      synchronization is used. In other words, compositors and clients must
+      wait and signal fences implicitly passed via the DMA-BUF's reservation
+      mechanism.
+    </description>
+
+    <request name="destroy" type="destructor">
+      <description summary="unbind the factory">
+        Objects created through this interface, especially wl_buffers, will
+        remain valid.
+      </description>
+    </request>
+
+    <request name="create_params">
+      <description summary="create a temporary object for buffer parameters">
+        This temporary object is used to collect multiple dmabuf handles into
+        a single batch to create a wl_buffer. It can only be used once and
+        should be destroyed after a 'created' or 'failed' event has been
+        received.
+      </description>
+      <arg name="params_id" type="new_id" interface="zwp_linux_buffer_params_v1"
+           summary="the new temporary"/>
+    </request>
+
+    <event name="format" deprecated-since="4">
+      <description summary="supported buffer format">
+        This event advertises one buffer format that the server supports.
+        All the supported formats are advertised once when the client
+        binds to this interface. A roundtrip after binding guarantees
+        that the client has received all supported formats.
+
+        For the definition of the format codes, see the
+        zwp_linux_buffer_params_v1::create request.
+
+        Starting version 4, the format event is deprecated and must not be
+        sent by compositors. Instead, use get_default_feedback or
+        get_surface_feedback.
+      </description>
+      <arg name="format" type="uint" summary="DRM_FORMAT code"/>
+    </event>
+
+    <event name="modifier" since="3" deprecated-since="4">
+      <description summary="supported buffer format modifier">
+        This event advertises the formats that the server supports, along with
+        the modifiers supported for each format. All the supported modifiers
+        for all the supported formats are advertised once when the client
+        binds to this interface. A roundtrip after binding guarantees that
+        the client has received all supported format-modifier pairs.
+
+        For legacy support, DRM_FORMAT_MOD_INVALID (that is, modifier_hi ==
+        0x00ffffff and modifier_lo == 0xffffffff) is allowed in this event.
+        It indicates that the server can support the format with an implicit
+        modifier. When a plane has DRM_FORMAT_MOD_INVALID as its modifier, it
+        is as if no explicit modifier is specified. The effective modifier
+        will be derived from the dmabuf.
+
+        A compositor that sends valid modifiers and DRM_FORMAT_MOD_INVALID for
+        a given format supports both explicit modifiers and implicit modifiers.
+
+        For the definition of the format and modifier codes, see the
+        zwp_linux_buffer_params_v1::create and zwp_linux_buffer_params_v1::add
+        requests.
+
+        Starting version 4, the modifier event is deprecated and must not be
+        sent by compositors. Instead, use get_default_feedback or
+        get_surface_feedback.
+      </description>
+      <arg name="format" type="uint" summary="DRM_FORMAT code"/>
+      <arg name="modifier_hi" type="uint"
+           summary="high 32 bits of layout modifier"/>
+      <arg name="modifier_lo" type="uint"
+           summary="low 32 bits of layout modifier"/>
+    </event>
+
+    <!-- Version 4 additions -->
+
+    <request name="get_default_feedback" since="4">
+      <description summary="get default feedback">
+        This request creates a new wp_linux_dmabuf_feedback object not bound
+        to a particular surface. This object will deliver feedback about dmabuf
+        parameters to use if the client doesn't support per-surface feedback
+        (see get_surface_feedback).
+      </description>
+      <arg name="id" type="new_id" interface="zwp_linux_dmabuf_feedback_v1"/>
+    </request>
+
+    <request name="get_surface_feedback" since="4">
+      <description summary="get feedback for a surface">
+        This request creates a new wp_linux_dmabuf_feedback object for the
+        specified wl_surface. This object will deliver feedback about dmabuf
+        parameters to use for buffers attached to this surface.
+
+        If the surface is destroyed before the wp_linux_dmabuf_feedback object,
+        the feedback object becomes inert.
+      </description>
+      <arg name="id" type="new_id" interface="zwp_linux_dmabuf_feedback_v1"/>
+      <arg name="surface" type="object" interface="wl_surface"/>
+    </request>
+  </interface>
+
+  <interface name="zwp_linux_buffer_params_v1" version="5">
+    <description summary="parameters for creating a dmabuf-based wl_buffer">
+      This temporary object is a collection of dmabufs and other
+      parameters that together form a single logical buffer. The temporary
+      object may eventually create one wl_buffer unless cancelled by
+      destroying it before requesting 'create'.
+
+      Single-planar formats only require one dmabuf, however
+      multi-planar formats may require more than one dmabuf. For all
+      formats, an 'add' request must be called once per plane (even if the
+      underlying dmabuf fd is identical).
+
+      You must use consecutive plane indices ('plane_idx' argument for 'add')
+      from zero to the number of planes used by the drm_fourcc format code.
+      All planes required by the format must be given exactly once, but can
+      be given in any order. Each plane index can only be set once; subsequent
+      calls with a plane index which has already been set will result in a
+      plane_set error being generated.
+    </description>
+
+    <enum name="error">
+      <entry name="already_used" value="0"
+             summary="the dmabuf_batch object has already been used to create a wl_buffer"/>
+      <entry name="plane_idx" value="1"
+             summary="plane index out of bounds"/>
+      <entry name="plane_set" value="2"
+             summary="the plane index was already set"/>
+      <entry name="incomplete" value="3"
+             summary="missing or too many planes to create a buffer"/>
+      <entry name="invalid_format" value="4"
+             summary="format not supported"/>
+      <entry name="invalid_dimensions" value="5"
+             summary="invalid width or height"/>
+      <entry name="out_of_bounds" value="6"
+             summary="offset + stride * height goes out of dmabuf bounds"/>
+      <entry name="invalid_wl_buffer" value="7"
+             summary="invalid wl_buffer resulted from importing dmabufs via
+               the create_immed request on given buffer_params"/>
+    </enum>
+
+    <request name="destroy" type="destructor">
+      <description summary="delete this object, used or not">
+        Cleans up the temporary data sent to the server for dmabuf-based
+        wl_buffer creation.
+      </description>
+    </request>
+
+    <request name="add">
+      <description summary="add a dmabuf to the temporary set">
+        This request adds one dmabuf to the set in this
+        zwp_linux_buffer_params_v1.
+
+        The 64-bit unsigned value combined from modifier_hi and modifier_lo
+        is the dmabuf layout modifier. DRM AddFB2 ioctl calls this the
+        fb modifier, which is defined in drm_mode.h of Linux UAPI.
+        This is an opaque token. Drivers use this token to express tiling,
+        compression, etc. driver-specific modifications to the base format
+        defined by the DRM fourcc code.
+
+        Starting from version 4, the invalid_format protocol error is sent if
+        the format + modifier pair was not advertised as supported.
+
+        Starting from version 5, the invalid_format protocol error is sent if
+        all planes don't use the same modifier.
+
+        This request raises the PLANE_IDX error if plane_idx is too large.
+        The error PLANE_SET is raised if attempting to set a plane that
+        was already set.
+      </description>
+      <arg name="fd" type="fd" summary="dmabuf fd"/>
+      <arg name="plane_idx" type="uint" summary="plane index"/>
+      <arg name="offset" type="uint" summary="offset in bytes"/>
+      <arg name="stride" type="uint" summary="stride in bytes"/>
+      <arg name="modifier_hi" type="uint"
+           summary="high 32 bits of layout modifier"/>
+      <arg name="modifier_lo" type="uint"
+           summary="low 32 bits of layout modifier"/>
+    </request>
+
+    <enum name="flags" bitfield="true">
+      <entry name="y_invert" value="1" summary="contents are y-inverted"/>
+      <entry name="interlaced" value="2" summary="content is interlaced"/>
+      <entry name="bottom_first" value="4" summary="bottom field first"/>
+    </enum>
+
+    <request name="create">
+      <description summary="create a wl_buffer from the given dmabufs">
+        This asks for creation of a wl_buffer from the added dmabuf
+        buffers. The wl_buffer is not created immediately but returned via
+        the 'created' event if the dmabuf sharing succeeds. The sharing
+        may fail at runtime for reasons a client cannot predict, in
+        which case the 'failed' event is triggered.
+
+        The 'format' argument is a DRM_FORMAT code, as defined by the
+        libdrm's drm_fourcc.h. The Linux kernel's DRM sub-system is the
+        authoritative source on how the format codes should work.
+
+        The 'flags' is a bitfield of the flags defined in enum "flags".
+        'y_invert' means that the image needs to be y-flipped.
+
+        Flag 'interlaced' means that the frame in the buffer is not
+        progressive as usual, but interlaced. An interlaced buffer as
+        supported here must always contain both top and bottom fields.
+        The top field always begins on the first pixel row. The temporal
+        ordering between the two fields is top field first, unless
+        'bottom_first' is specified. It is undefined whether 'bottom_first'
+        is ignored if 'interlaced' is not set.
+
+        This protocol does not convey any information about field rate,
+        duration, or timing, other than the relative ordering between the
+        two fields in one buffer. A compositor may have to estimate the
+        intended field rate from the incoming buffer rate. It is undefined
+        whether the time of receiving wl_surface.commit with a new buffer
+        attached, applying the wl_surface state, wl_surface.frame callback
+        trigger, presentation, or any other point in the compositor cycle
+        is used to measure the frame or field times. There is no support
+        for detecting missed or late frames/fields/buffers either, and
+        there is no support whatsoever for cooperating with interlaced
+        compositor output.
+
+        The composited image quality resulting from the use of interlaced
+        buffers is explicitly undefined. A compositor may use elaborate
+        hardware features or software to deinterlace and create progressive
+        output frames from a sequence of interlaced input buffers, or it
+        may produce substandard image quality. However, compositors that
+        cannot guarantee reasonable image quality in all cases are recommended
+        to just reject all interlaced buffers.
+
+        Any argument errors, including non-positive width or height,
+        mismatch between the number of planes and the format, bad
+        format, bad offset or stride, may be indicated by fatal protocol
+        errors: INCOMPLETE, INVALID_FORMAT, INVALID_DIMENSIONS,
+        OUT_OF_BOUNDS.
+
+        Dmabuf import errors in the server that are not obvious client
+        bugs are returned via the 'failed' event as non-fatal. This
+        allows attempting dmabuf sharing and falling back in the client
+        if it fails.
+
+        This request can be sent only once in the object's lifetime, after
+        which the only legal request is destroy. This object should be
+        destroyed after issuing a 'create' request. Attempting to use this
+        object after issuing 'create' raises ALREADY_USED protocol error.
+
+        It is not mandatory to issue 'create'. If a client wants to
+        cancel the buffer creation, it can just destroy this object.
+      </description>
+      <arg name="width" type="int" summary="base plane width in pixels"/>
+      <arg name="height" type="int" summary="base plane height in pixels"/>
+      <arg name="format" type="uint" summary="DRM_FORMAT code"/>
+      <arg name="flags" type="uint" enum="flags" summary="see enum flags"/>
+    </request>
+
+    <event name="created">
+      <description summary="buffer creation succeeded">
+        This event indicates that the attempted buffer creation was
+        successful. It provides the new wl_buffer referencing the dmabuf(s).
+
+        Upon receiving this event, the client should destroy the
+        zwp_linux_buffer_params_v1 object.
+      </description>
+      <arg name="buffer" type="new_id" interface="wl_buffer"
+           summary="the newly created wl_buffer"/>
+    </event>
+
+    <event name="failed">
+      <description summary="buffer creation failed">
+        This event indicates that the attempted buffer creation has
+        failed. It usually means that one of the dmabuf constraints
+        has not been fulfilled.
+
+        Upon receiving this event, the client should destroy the
+        zwp_linux_buffer_params_v1 object.
+      </description>
+    </event>
+
+    <request name="create_immed" since="2">
+      <description summary="immediately create a wl_buffer from the given
+                     dmabufs">
+        This asks for immediate creation of a wl_buffer by importing the
+        added dmabufs.
+
+        In case of import success, no event is sent from the server, and the
+        wl_buffer is ready to be used by the client.
+
+        Upon import failure, either of the following may happen, as seen fit
+        by the implementation:
+        - the client is terminated with one of the following fatal protocol
+          errors:
+          - INCOMPLETE, INVALID_FORMAT, INVALID_DIMENSIONS, OUT_OF_BOUNDS,
+            in case of argument errors such as mismatch between the number
+            of planes and the format, bad format, non-positive width or
+            height, or bad offset or stride.
+          - INVALID_WL_BUFFER, in case the cause for failure is unknown or
+            platform specific.
+        - the server creates an invalid wl_buffer, marks it as failed and
+          sends a 'failed' event to the client. The result of using this
+          invalid wl_buffer as an argument in any request by the client is
+          defined by the compositor implementation.
+
+        This takes the same arguments as a 'create' request, and obeys the
+        same restrictions.
+      </description>
+      <arg name="buffer_id" type="new_id" interface="wl_buffer"
+           summary="id for the newly created wl_buffer"/>
+      <arg name="width" type="int" summary="base plane width in pixels"/>
+      <arg name="height" type="int" summary="base plane height in pixels"/>
+      <arg name="format" type="uint" summary="DRM_FORMAT code"/>
+      <arg name="flags" type="uint" enum="flags" summary="see enum flags"/>
+    </request>
+  </interface>
+
+  <interface name="zwp_linux_dmabuf_feedback_v1" version="5">
+    <description summary="dmabuf feedback">
+      This object advertises dmabuf parameters feedback. This includes the
+      preferred devices and the supported formats/modifiers.
+
+      The parameters are sent once when this object is created and whenever they
+      change. The done event is always sent once after all parameters have been
+      sent. When a single parameter changes, all parameters are re-sent by the
+      compositor.
+
+      Compositors can re-send the parameters when the current client buffer
+      allocations are sub-optimal. Compositors should not re-send the
+      parameters if re-allocating the buffers would not result in a more optimal
+      configuration. In particular, compositors should avoid sending the exact
+      same parameters multiple times in a row.
+
+      The tranche_target_device and tranche_formats events are grouped by
+      tranches of preference. For each tranche, a tranche_target_device, one
+      tranche_flags and one or more tranche_formats events are sent, followed
+      by a tranche_done event finishing the list. The tranches are sent in
+      descending order of preference. All formats and modifiers in the same
+      tranche have the same preference.
+
+      To send parameters, the compositor sends one main_device event, tranches
+      (each consisting of one tranche_target_device event, one tranche_flags
+      event, tranche_formats events and then a tranche_done event), then one
+      done event.
+    </description>
+
+    <request name="destroy" type="destructor">
+      <description summary="destroy the feedback object">
+        Using this request a client can tell the server that it is not going to
+        use the wp_linux_dmabuf_feedback object anymore.
+      </description>
+    </request>
+
+    <event name="done">
+      <description summary="all feedback has been sent">
+        This event is sent after all parameters of a wp_linux_dmabuf_feedback
+        object have been sent.
+
+        This allows changes to the wp_linux_dmabuf_feedback parameters to be
+        seen as atomic, even if they happen via multiple events.
+      </description>
+    </event>
+
+    <event name="format_table">
+      <description summary="format and modifier table">
+        This event provides a file descriptor which can be memory-mapped to
+        access the format and modifier table.
+
+        The table contains a tightly packed array of consecutive format +
+        modifier pairs. Each pair is 16 bytes wide. It contains a format as a
+        32-bit unsigned integer, followed by 4 bytes of unused padding, and a
+        modifier as a 64-bit unsigned integer. The native endianness is used.
+
+        The client must map the file descriptor in read-only private mode.
+
+        Compositors are not allowed to mutate the table file contents once this
+        event has been sent. Instead, compositors must create a new, separate
+        table file and re-send feedback parameters. Compositors are allowed to
+        store duplicate format + modifier pairs in the table.
+      </description>
+      <arg name="fd" type="fd" summary="table file descriptor"/>
+      <arg name="size" type="uint" summary="table size, in bytes"/>
+    </event>
+
+    <event name="main_device">
+      <description summary="preferred main device">
+        This event advertises the main device that the server prefers to use
+        when direct scan-out to the target device isn't possible. The
+        advertised main device may be different for each
+        wp_linux_dmabuf_feedback object, and may change over time.
+
+        There is exactly one main device. The compositor must send at least
+        one preference tranche with tranche_target_device equal to main_device.
+
+        Clients need to create buffers that the main device can import and
+        read from, otherwise creating the dmabuf wl_buffer will fail (see the
+        wp_linux_buffer_params.create and create_immed requests for details).
+        The main device will also likely be kept active by the compositor,
+        so clients can use it instead of waking up another device for power
+        savings.
+
+        In general the device is a DRM node. The DRM node type (primary vs.
+        render) is unspecified. Clients must not rely on the compositor sending
+        a particular node type. Clients cannot check two devices for equality
+        by comparing the dev_t value.
+
+        If explicit modifiers are not supported and the client performs buffer
+        allocations on a different device than the main device, then the client
+        must force the buffer to have a linear layout.
+      </description>
+      <arg name="device" type="array" summary="device dev_t value"/>
+    </event>
+
+    <event name="tranche_done">
+      <description summary="a preference tranche has been sent">
+        This event splits tranche_target_device and tranche_formats events in
+        preference tranches. It is sent after a set of tranche_target_device
+        and tranche_formats events; it represents the end of a tranche. The
+        next tranche will have a lower preference.
+      </description>
+    </event>
+
+    <event name="tranche_target_device">
+      <description summary="target device">
+        This event advertises the target device that the server prefers to use
+        for a buffer created given this tranche. The advertised target device
+        may be different for each preference tranche, and may change over time.
+
+        There is exactly one target device per tranche.
+
+        The target device may be a scan-out device, for example if the
+        compositor prefers to directly scan-out a buffer created given this
+        tranche. The target device may be a rendering device, for example if
+        the compositor prefers to texture from said buffer.
+
+        The client can use this hint to allocate the buffer in a way that makes
+        it accessible from the target device, ideally directly. The buffer must
+        still be accessible from the main device, either through direct import
+        or through a potentially more expensive fallback path. If the buffer
+        can't be directly imported from the main device then clients must be
+        prepared for the compositor changing the tranche priority or making
+        wl_buffer creation fail (see the wp_linux_buffer_params.create and
+        create_immed requests for details).
+
+        If the device is a DRM node, the DRM node type (primary vs. render) is
+        unspecified. Clients must not rely on the compositor sending a
+        particular node type. Clients cannot check two devices for equality by
+        comparing the dev_t value.
+
+        This event is tied to a preference tranche, see the tranche_done event.
+      </description>
+      <arg name="device" type="array" summary="device dev_t value"/>
+    </event>
+
+    <event name="tranche_formats">
+      <description summary="supported buffer format modifier">
+        This event advertises the format + modifier combinations that the
+        compositor supports.
+
+        It carries an array of indices, each referring to a format + modifier
+        pair in the last received format table (see the format_table event).
+        Each index is a 16-bit unsigned integer in native endianness.
+
+        For legacy support, DRM_FORMAT_MOD_INVALID is an allowed modifier.
+        It indicates that the server can support the format with an implicit
+        modifier. When a buffer has DRM_FORMAT_MOD_INVALID as its modifier, it
+        is as if no explicit modifier is specified. The effective modifier
+        will be derived from the dmabuf.
+
+        A compositor that sends valid modifiers and DRM_FORMAT_MOD_INVALID for
+        a given format supports both explicit modifiers and implicit modifiers.
+
+        Compositors must not send duplicate format + modifier pairs within the
+        same tranche or across two different tranches with the same target
+        device and flags.
+
+        This event is tied to a preference tranche, see the tranche_done event.
+
+        For the definition of the format and modifier codes, see the
+        wp_linux_buffer_params.create request.
+      </description>
+      <arg name="indices" type="array" summary="array of 16-bit indexes"/>
+    </event>
+
+    <enum name="tranche_flags" bitfield="true">
+      <entry name="scanout" value="1" summary="direct scan-out tranche"/>
+    </enum>
+
+    <event name="tranche_flags">
+      <description summary="tranche flags">
+        This event sets tranche-specific flags.
+
+        The scanout flag is a hint that direct scan-out may be attempted by the
+        compositor on the target device if the client appropriately allocates a
+        buffer. How to allocate a buffer that can be scanned out on the target
+        device is implementation-defined.
+
+        This event is tied to a preference tranche, see the tranche_done event.
+      </description>
+      <arg name="flags" type="uint" enum="tranche_flags" summary="tranche flags"/>
+    </event>
+  </interface>
+
+</protocol>
diff --git a/qt/src/GhosttySurface.cpp b/qt/src/GhosttySurface.cpp
index 739c5b576..e4f84c128 100644
--- a/qt/src/GhosttySurface.cpp
+++ b/qt/src/GhosttySurface.cpp
@@ -127,21 +127,18 @@ GhosttySurface::GhosttySurface(ghostty_app_t app, MainWindow *owner,
     sc.platform_tag = GHOSTTY_PLATFORM_VULKAN;
     sc.platform.vulkan = vk_host->asPlatform(this);
 
-    // Polling timer on the GUI thread: every 16ms, check if the
-    // renderer thread parked a new frame in `m_pending` and swap
-    // it into `m_image` for paintEvent to pick up.
+    // GUI-thread frame drain. The renderer thread wakes us per frame
+    // via QMetaObject::invokeMethod (Qt::QueuedConnection) on each
+    // present — see `presentVulkanDmabuf`. The 2 ms timer is a
+    // safety net: if `invokeMethod` ever fails to deliver (the
+    // earlier QImage-handoff diagnostics suggested this could
+    // happen), the next tick drains the parked frame within at most
+    // 2 ms. Idle case has negligible CPU cost because `drainVulkan`
+    // returns immediately when nothing is pending.
     m_vulkanPollTimer = new QTimer(this);
-    m_vulkanPollTimer->setInterval(16);  // ≈60 Hz
-    connect(m_vulkanPollTimer, &QTimer::timeout, this, [this]() {
-      QImage frame;
-      {
-        QMutexLocker lock(&m_pendingMutex);
-        if (m_pending.isNull()) return;
-        frame = std::move(m_pending);
-      }
-      m_image = std::move(frame);
-      update();
-    });
+    m_vulkanPollTimer->setInterval(2);
+    connect(m_vulkanPollTimer, &QTimer::timeout, this,
+            [this]() { drainVulkan(); });
     m_vulkanPollTimer->start();
   } else {
     sc.platform_tag = GHOSTTY_PLATFORM_OPENGL;
@@ -324,9 +321,18 @@ bool GhosttySurface::event(QEvent *e) {
         // WA_NativeWindow ensures windowHandle() is non-null even if
         // GhosttySurface is embedded in a non-native parent.
         setAttribute(Qt::WA_NativeWindow);
-        if (auto *h = windowHandle())
+        if (auto *h = windowHandle()) {
           m_subsurfacePresenter =
               wayland::SubsurfacePresenter::tryCreate(h);
+          if (m_subsurfacePresenter && m_useVulkan) {
+            // Flip the Vulkan present path over to the zero-copy
+            // wl_subsurface route. Release-style store pairs with
+            // the renderer thread's acquire-load — once it observes
+            // true, it stops parking QImages and just hands us the
+            // dmabuf descriptor for compositor handoff.
+            m_useSubsurface.store(true, std::memory_order_release);
+          }
+        }
       }
     } else if (e->type() == QEvent::Hide) {
       ghostty_surface_set_occlusion(m_surface, false);
@@ -424,6 +430,14 @@ void GhosttySurface::renderTerminal() {
 }
 
 void GhosttySurface::paintEvent(QPaintEvent *) {
+  // Subsurface zero-copy path: the wl_subsurface IS the terminal
+  // pixels — they reach the compositor without ever touching our
+  // QPainter. With `WA_TranslucentBackground` set, the QWidget
+  // paints transparent over the subsurface so chrome (dim overlay,
+  // bell flash, resize hint) still composites on top.
+  const bool subsurfaceActive =
+      m_useSubsurface.load(std::memory_order_acquire) && m_subsurfacePresenter;
+
   // No frame yet — leave the widget background untouched. With
   // `WA_TranslucentBackground` set the area is transparent until
   // the first frame imports, matching the OpenGL path. New surfaces
@@ -431,18 +445,20 @@ void GhosttySurface::paintEvent(QPaintEvent *) {
   // thread has emitted its first frame; the gap is short enough
   // that flashing a debug placeholder is more jarring than the
   // brief see-through.
-  if (m_image.isNull()) return;
+  if (!subsurfaceActive && m_image.isNull()) return;
   QPainter painter(this);
-  // Blit the framebuffer 1:1. m_image carries the device pixel ratio, so
-  // the QPointF overload draws it at its true logical size. When in
-  // sync that exactly fills the widget; mid-resize, the previous frame
-  // stays at its real size in the top-left corner (rather than being
-  // stretched to the new widget rect, which the user dislikes more
-  // than the transient gap).
-  // CompositionMode_Source replaces the transparent widget pixels with
-  // the terminal image, alpha included, so its translucency is kept.
-  painter.setCompositionMode(QPainter::CompositionMode_Source);
-  painter.drawImage(QPointF(0, 0), m_image);
+  if (!subsurfaceActive) {
+    // Blit the framebuffer 1:1. m_image carries the device pixel ratio, so
+    // the QPointF overload draws it at its true logical size. When in
+    // sync that exactly fills the widget; mid-resize, the previous frame
+    // stays at its real size in the top-left corner (rather than being
+    // stretched to the new widget rect, which the user dislikes more
+    // than the transient gap).
+    // CompositionMode_Source replaces the transparent widget pixels with
+    // the terminal image, alpha included, so its translucency is kept.
+    painter.setCompositionMode(QPainter::CompositionMode_Source);
+    painter.drawImage(QPointF(0, 0), m_image);
+  }
 
   // Unfocused-split dimming: a translucent fill over an inactive pane.
   // Only split panes (a QSplitter parent) are dimmed, matching GTK.
@@ -1343,13 +1359,34 @@ void GhosttySurface::presentVulkanDmabuf(
     quint64 drm_modifier,
     quint32 width,
     quint32 height,
-    quint32 stride) {
-  // Called from the renderer thread. We mmap the dmabuf, copy the
-  // bytes into a QImage, and hand the QImage to the GUI thread for
-  // paint via `QMetaObject::invokeMethod`. The fd is a borrow (per
-  // the `ghostty_platform_vulkan_s` contract); libghostty closes it
-  // when the underlying memory is freed.
-  (void)drm_modifier;  // LINEAR for v1; not used here.
+    quint32 stride,
+    bool image_backed) {
+  // Called from the renderer thread. Two paths, picked per frame
+  // based on whether the wl_subsurface presenter is up:
+  //
+  //   Subsurface (zero-copy): park the dmabuf metadata; GUI thread
+  //   wraps the fd in a wl_buffer and attach/commits to our
+  //   wl_subsurface. The compositor scans it out directly.
+  //
+  //   Fallback (legacy mmap+memcpy): map the fd, copy into a
+  //   QImage, GUI thread paints via QPainter. Used when the
+  //   subsurface presenter failed to come up (e.g. compositor
+  //   missing linux-dmabuf-v1).
+  //
+  // The fd is a borrow per the `ghostty_platform_vulkan_s` contract;
+  // libghostty closes it when the underlying memory is freed. The
+  // subsurface path parks the raw fd (no dup) until `drainVulkan`
+  // hands it to libwayland. NOTE(review): confirm it stays valid.
+
+  // The subsurface path requires `image_backed` (i.e. the renderer
+  // is in `.direct` mode and the fd points at a VkImage). When the
+  // renderer falls back to `.legacy_copy` — NVIDIA today, the fd is
+  // a VkBuffer — linux-dmabuf-v1 import would fail with
+  // `invalid_wl_buffer` and that's a fatal protocol error on the
+  // wl_display. So we gate per-frame and stay on the QImage path
+  // when the fd isn't compositor-importable.
+  const bool useSubsurface =
+      image_backed && m_useSubsurface.load(std::memory_order_acquire);
 
   // One-shot breadcrumb so logs confirm the dmabuf hand-off is
   // wired. Subsequent frames are silent so we don't spam stderr.
@@ -1357,15 +1394,31 @@ void GhosttySurface::presentVulkanDmabuf(
   if (!logged_first) {
     logged_first = true;
     std::fprintf(stderr,
-                 "[ghastty] first Vulkan dmabuf frame: fd=%d %ux%u stride=%u fourcc=0x%08x mod=0x%lx\n",
+                 "[ghastty] first Vulkan dmabuf frame: fd=%d %ux%u stride=%u "
+                 "fourcc=0x%08x mod=0x%lx image_backed=%d path=%s\n",
                  dmabuf_fd, width, height, stride, drm_format,
-                 static_cast<unsigned long>(drm_modifier));
+                 static_cast<unsigned long>(drm_modifier), image_backed ? 1 : 0,
+                 useSubsurface ? "subsurface" : "qimage");
   }
 
-  // sanity check the size before we allocate / mmap.
   if (dmabuf_fd < 0 || width == 0 || height == 0 || stride < width * 4)
     return;
 
+  if (useSubsurface) {
+    // Subsurface path. Park the descriptor under the mutex (so
+    // a concurrent drainVulkan sees a consistent snapshot) and
+    // wake the GUI thread.
+    {
+      QMutexLocker lock(&m_pendingMutex);
+      m_pendingDmabuf = PendingDmabuf{
+          dmabuf_fd, drm_format, drm_modifier, width, height, stride,
+      };
+    }
+    QMetaObject::invokeMethod(this, "drainVulkan", Qt::QueuedConnection);
+    return;
+  }
+
+  // Fallback: mmap + memcpy into a QImage.
   const size_t bytes = static_cast<size_t>(stride) * height;
   void *mapped = ::mmap(nullptr, bytes, PROT_READ, MAP_SHARED, dmabuf_fd, 0);
   if (mapped == MAP_FAILED) {
@@ -1373,19 +1426,12 @@ void GhosttySurface::presentVulkanDmabuf(
                  dmabuf_fd, std::strerror(errno));
     return;
   }
-  // QImage holds the pixel data by copying when constructed with
-  // `Format_ARGB32_Premultiplied` from a buffer with explicit stride.
-  // We then detach (copy()) so the QImage survives the unmap.
-  //
   // drm_format ARGB8888 (0x34325241 = "AR24") matches QImage's
-  // ARGB32 byte order on little-endian (B,G,R,A in memory).
-  //
-  // We use the *premultiplied* variant because the renderer's
-  // fragment shaders output premultiplied alpha and the render
-  // target is `VK_FORMAT_B8G8R8A8_SRGB` (hardware gamma-encodes the
-  // linear shader output at framebuffer-write time). The bytes
-  // landing in this buffer are therefore sRGB-encoded premultiplied
-  // ARGB — exactly what Format_ARGB32_Premultiplied expects.
+  // ARGB32 byte order on little-endian (B,G,R,A in memory). The
+  // renderer's fragment shaders output premultiplied alpha into
+  // `VK_FORMAT_B8G8R8A8_SRGB`, so the buffer is sRGB-encoded
+  // premultiplied ARGB — exactly what Format_ARGB32_Premultiplied
+  // expects.
   (void)drm_format;
   const QImage stamped(
       static_cast<const uchar *>(mapped),
@@ -1396,20 +1442,45 @@ void GhosttySurface::presentVulkanDmabuf(
   QImage owned = stamped.copy();
   ::munmap(mapped, bytes);
 
-  // Tell QPainter the image's pixels are device pixels at the same
-  // DPR the framebuffer was sized at. Without this, `drawImage` would
-  // treat the image as logical pixels and re-scale to framebuffer
-  // pixels on a HiDPI display (DPR>1) — glyphs come out 2× too big.
-  // `m_fbDpr` is the DPR `syncSurfaceSize` used when telling
-  // libghostty the framebuffer size, so it matches what the renderer
-  // actually drew.
   if (m_fbDpr > 0) owned.setDevicePixelRatio(m_fbDpr);
-
-  // Stash for the GUI-thread polling timer to pick up.
   {
     QMutexLocker lock(&m_pendingMutex);
     m_pending = std::move(owned);
   }
+  QMetaObject::invokeMethod(this, "drainVulkan", Qt::QueuedConnection);
+}
+
+void GhosttySurface::drainVulkan() {
+  // Subsurface (zero-copy) path: take the parked dmabuf descriptor
+  // under the mutex, then dispatch it to the presenter outside the
+  // lock so a renderer-thread `presentVulkanDmabuf` parking the
+  // next frame doesn't block on wl_display_flush.
+  if (m_useSubsurface.load(std::memory_order_acquire) &&
+      m_subsurfacePresenter) {
+    PendingDmabuf frame;
+    {
+      QMutexLocker lock(&m_pendingMutex);
+      if (m_pendingDmabuf.fd < 0) return;
+      frame = m_pendingDmabuf;
+      m_pendingDmabuf.fd = -1;  // mark consumed
+    }
+    const int scale =
+        std::max(1, static_cast<int>(std::lround(devicePixelRatioF())));
+    m_subsurfacePresenter->presentDmabuf(frame.fd, frame.drm_format,
+                                          frame.drm_modifier, frame.width,
+                                          frame.height, frame.stride, scale);
+    return;
+  }
+
+  // Fallback: hand the QImage to paintEvent.
+  QImage frame;
+  {
+    QMutexLocker lock(&m_pendingMutex);
+    if (m_pending.isNull()) return;
+    frame = std::move(m_pending);
+  }
+  m_image = std::move(frame);
+  update();
 }
 
 // Trampoline so `Host.cpp` doesn't need to include the full
@@ -1425,10 +1496,12 @@ void presentToGhosttySurface(
     uint64_t drm_modifier,
     uint32_t width,
     uint32_t height,
-    uint32_t stride) {
+    uint32_t stride,
+    bool image_backed) {
   if (surface == nullptr) return;
   static_cast<GhosttySurface *>(surface)->presentVulkanDmabuf(
-      dmabuf_fd, drm_format, drm_modifier, width, height, stride);
+      dmabuf_fd, drm_format, drm_modifier, width, height, stride,
+      image_backed);
 }
 
 } // namespace vulkan
diff --git a/qt/src/GhosttySurface.h b/qt/src/GhosttySurface.h
index 6d3ff6ed2..9bb2d8d66 100644
--- a/qt/src/GhosttySurface.h
+++ b/qt/src/GhosttySurface.h
@@ -150,20 +150,30 @@ public:
   void setPwd(const QString &pwd);
   const QString &pwd() const { return m_pwd; }
 
-  // Apprt-side entry point for the Vulkan `present` callback.
-  // libghostty hands us a dmabuf fd pointing at the rendered
-  // VkImage's memory; we mmap it (LINEAR tiling means the bytes
-  // are directly readable as BGRA), copy the pixels into a QImage,
-  // and schedule a repaint. Thread-safe: the callback fires from
-  // the renderer thread; the QImage handoff goes through
-  // `QMetaObject::invokeMethod` to the GUI thread.
+  // Apprt-side entry point for the Vulkan `present` callback. Fires
+  // on the renderer thread. Parks either the dmabuf descriptor
+  // (subsurface path) or an mmap+memcpy'd QImage (fallback path)
+  // under `m_pendingMutex`, then wakes the GUI thread via
+  // `QMetaObject::invokeMethod(this, "drainVulkan", Qt::QueuedConnection)`.
+  // The GUI thread either commits the dmabuf to the wl_subsurface
+  // (zero-copy) or paints the QImage (fallback). A 2 ms safety-net
+  // poll catches anything `invokeMethod` ever fails to deliver.
   Q_INVOKABLE void presentVulkanDmabuf(
       int dmabuf_fd,
       quint32 drm_format,
       quint64 drm_modifier,
       quint32 width,
       quint32 height,
-      quint32 stride);
+      quint32 stride,
+      bool image_backed);
+
+  // GUI-thread drain step: hands the most recent pending frame
+  // either to the SubsurfacePresenter (zero-copy path) or the
+  // QImage paint pipeline (fallback). Idempotent: returns
+  // immediately if nothing's pending. Invoked from the polling
+  // safety net AND from queued invocations triggered by the
+  // renderer thread.
+  Q_INVOKABLE void drainVulkan();
 
 protected:
   bool event(QEvent *) override;
@@ -244,15 +254,35 @@ private:
   // gives way to the actual rendered content.
   bool m_useVulkan = false;
 
-  // Cross-thread frame handoff for the Vulkan path. `presentVulkanDmabuf`
-  // (renderer thread) writes a freshly-imported QImage to `m_pending`
-  // under `m_pendingMutex`; a 16 ms `QTimer` on the GUI thread checks
-  // `m_pending`, atomically swaps it into `m_image`, and triggers a
-  // repaint. The polling timer is the simplest reliable cross-thread
-  // path we could land — the obvious Qt mechanisms
-  // (QMetaObject::invokeMethod / postEvent) were both not firing
-  // their queued lambdas under the renderer-thread → GUI-thread
-  // handoff, see the commit message for diagnostics.
+  // Cross-thread frame handoff for the Vulkan path. The renderer
+  // thread calls `presentVulkanDmabuf` with a borrowed dmabuf fd,
+  // parks the frame, and wakes the GUI thread with a queued
+  // `drainVulkan` invocation. `drainVulkan` routes the frame through
+  // the wl_subsurface (zero-copy) when the SubsurfacePresenter is
+  // active, or through the mmap+memcpy+QImage path otherwise. A 2 ms
+  // `QTimer` (`m_vulkanPollTimer`) also calls `drainVulkan` as a
+  // safety net in case a queued invocation is not delivered.
+  //
+  // `m_useSubsurface` is set once on the GUI thread when the
+  // presenter comes up; the renderer thread reads it acquire-style
+  // to decide which path to populate per frame.
+  std::atomic<bool> m_useSubsurface{false};
+  // Subsurface (zero-copy) path: renderer thread parks the
+  // borrowed-fd descriptor here; `drainVulkan` on the GUI thread
+  // hands it to the presenter.
+  struct PendingDmabuf {
+    int fd = -1;
+    quint32 drm_format = 0;
+    quint64 drm_modifier = 0;
+    quint32 width = 0;
+    quint32 height = 0;
+    quint32 stride = 0;
+  };
+  PendingDmabuf m_pendingDmabuf;
+  // Legacy (mmap+memcpy) path: `presentVulkanDmabuf` parks a QImage
+  // here when the presenter is unavailable (e.g. compositor missing
+  // linux-dmabuf-v1) or the frame is not `image_backed`. NOTE(review):
+  // `drainVulkan`/`paintEvent` skip it while `m_useSubsurface` is set.
   QImage m_pending;
   QMutex m_pendingMutex;
   QTimer *m_vulkanPollTimer = nullptr;
diff --git a/qt/src/vulkan/Host.cpp b/qt/src/vulkan/Host.cpp
index ce3fdbaa2..e9551567e 100644
--- a/qt/src/vulkan/Host.cpp
+++ b/qt/src/vulkan/Host.cpp
@@ -22,7 +22,8 @@ void presentToGhosttySurface(
     uint64_t drm_modifier,
     uint32_t width,
     uint32_t height,
-    uint32_t stride);
+    uint32_t stride,
+    bool image_backed);
 
 namespace {
 
@@ -114,10 +115,11 @@ void cbPresent(
     uint64_t drm_modifier,
     uint32_t width,
     uint32_t height,
-    uint32_t stride) {
+    uint32_t stride,
+    bool image_backed) {
   if (ud == nullptr) return;
-  ::vulkan::presentToGhosttySurface(ud, dmabuf_fd, drm_format,
-                                    drm_modifier, width, height, stride);
+  ::vulkan::presentToGhosttySurface(ud, dmabuf_fd, drm_format, drm_modifier,
+                                    width, height, stride, image_backed);
 }
 
 } // namespace
diff --git a/qt/src/wayland/SubsurfacePresenter.cpp b/qt/src/wayland/SubsurfacePresenter.cpp
index 77207a109..d02454ea5 100644
--- a/qt/src/wayland/SubsurfacePresenter.cpp
+++ b/qt/src/wayland/SubsurfacePresenter.cpp
@@ -10,6 +10,8 @@
 
 #include <wayland-client.h>
 
+#include "linux-dmabuf-v1-client-protocol.h"
+
 namespace wayland {
 
 namespace {
@@ -21,6 +23,7 @@ namespace {
 struct PresenterGlobals {
   wl_compositor *compositor = nullptr;
   wl_subcompositor *subcompositor = nullptr;
+  zwp_linux_dmabuf_v1 *dmabuf = nullptr;
   bool searched = false;
 };
 
@@ -33,6 +36,14 @@ void registryGlobal(void *data, wl_registry *registry, uint32_t name,
   } else if (std::strcmp(interface, wl_subcompositor_interface.name) == 0) {
     g->subcompositor = static_cast<wl_subcompositor *>(
         wl_registry_bind(registry, name, &wl_subcompositor_interface, 1));
+  } else if (std::strcmp(interface, zwp_linux_dmabuf_v1_interface.name) == 0) {
+    // Bind v3: `create_immed` (available since v2) gives synchronous
+    // wl_buffer creation, avoiding the async `create` +
+    // `created`/`failed` event dance. v4 adds dynamic format/modifier
+    // feedback, which we don't need yet. NOTE(review): binds v3
+    // without checking the advertised version — confirm it is >= 3.
+    g->dmabuf = static_cast<zwp_linux_dmabuf_v1 *>(wl_registry_bind(
+        registry, name, &zwp_linux_dmabuf_v1_interface, 3));
   }
 }
 void registryGlobalRemove(void *, wl_registry *, uint32_t) {}
@@ -63,20 +74,32 @@ PresenterGlobals *discoverGlobals(wl_display *display) {
   if (globals.subcompositor)
     wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(globals.subcompositor),
                        nullptr);
+  if (globals.dmabuf)
+    wl_proxy_set_queue(reinterpret_cast<wl_proxy *>(globals.dmabuf), nullptr);
   wl_event_queue_destroy(queue);
 
   return &globals;
 }
 
+// wl_buffer::release listener: the compositor is done sampling the
+// buffer for any committed surface state, so we can destroy our
+// client-side handle. The underlying dmabuf memory is owned by
+// libghostty; we never close that fd here (the SCM_RIGHTS transfer
+// in zwp_linux_buffer_params.add gave the compositor its own
+// reference, which lives independently of our wl_buffer).
+void bufferRelease(void *, wl_buffer *buffer) {
+  wl_buffer_destroy(buffer);
+}
+const wl_buffer_listener kBufferListener = {
+    bufferRelease,
+};
+
 } // namespace
 
 std::unique_ptr<SubsurfacePresenter>
 SubsurfacePresenter::tryCreate(QWindow *parent) {
   if (!parent) return nullptr;
 
-  // The Qt frontend is Wayland-only; if we're not on Wayland, the
-  // native-interface lookups below would return null anyway, but
-  // bail explicitly so the log message is useful.
   if (!QGuiApplication::platformName().startsWith(QLatin1String("wayland"))) {
     std::fprintf(stderr,
                  "[ghastty] SubsurfacePresenter: not on Wayland QPA\n");
@@ -100,13 +123,13 @@ SubsurfacePresenter::tryCreate(QWindow *parent) {
   }
 
   PresenterGlobals *g = discoverGlobals(display);
-  if (!g->compositor || !g->subcompositor) {
+  if (!g->compositor || !g->subcompositor || !g->dmabuf) {
     std::fprintf(stderr,
-                 "[ghastty] SubsurfacePresenter: compositor lacks "
-                 "wl_compositor or wl_subcompositor (compositor=%p "
-                 "subcompositor=%p)\n",
+                 "[ghastty] SubsurfacePresenter: compositor missing required "
+                 "globals (compositor=%p subcompositor=%p dmabuf=%p)\n",
                  static_cast<void *>(g->compositor),
-                 static_cast<void *>(g->subcompositor));
+                 static_cast<void *>(g->subcompositor),
+                 static_cast<void *>(g->dmabuf));
     return nullptr;
   }
 
@@ -126,18 +149,13 @@ SubsurfacePresenter::tryCreate(QWindow *parent) {
   // for the parent's next commit. `set_desync` is what allows that.
   wl_subsurface_set_desync(sub);
 
-  // Subsurface covers the parent at the origin. Phase 3 will keep
-  // this in sync on resize; for Phase 2 it doesn't matter because
-  // we never attach a buffer.
+  // Subsurface covers the parent at the origin. Phase 4 will keep
+  // this in sync on splits/tabs/etc.; for now the GhosttySurface
+  // forces WA_NativeWindow so its QWindow IS the terminal's native
+  // wayland surface and (0,0) is correct.
   wl_subsurface_set_position(sub, 0, 0);
 
-  // Flush so the compositor sees the subsurface creation. We do NOT
-  // commit the child surface — per protocol an uncommitted subsurface
-  // with no attached buffer contributes nothing to the parent's
-  // display, which is exactly the no-behavior-change state we want
-  // for Phase 2.
   wl_display_flush(display);
-
   if (int err = wl_display_get_error(display); err != 0) {
     std::fprintf(stderr,
                  "[ghastty] SubsurfacePresenter: wl_display error %d after "
@@ -149,18 +167,22 @@ SubsurfacePresenter::tryCreate(QWindow *parent) {
   }
 
   std::fprintf(stderr,
-               "[ghastty] SubsurfacePresenter: subsurface ready (parent=%p "
-               "child=%p sub=%p)\n",
-               static_cast<void *>(parentSurface),
-               static_cast<void *>(child), static_cast<void *>(sub));
+               "[ghastty] SubsurfacePresenter: ready (parent=%p child=%p "
+               "sub=%p dmabuf=%p)\n",
+               static_cast<void *>(parentSurface), static_cast<void *>(child),
+               static_cast<void *>(sub), static_cast<void *>(g->dmabuf));
 
   return std::unique_ptr<SubsurfacePresenter>(
-      new SubsurfacePresenter(display, child, sub));
+      new SubsurfacePresenter(display, child, sub, g->dmabuf));
 }
 
 SubsurfacePresenter::SubsurfacePresenter(wl_display *display, wl_surface *child,
-                                         wl_subsurface *sub)
-    : m_display(display), m_childSurface(child), m_subsurface(sub) {}
+                                         wl_subsurface *sub,
+                                         zwp_linux_dmabuf_v1 *dmabuf)
+    : m_display(display),
+      m_childSurface(child),
+      m_subsurface(sub),
+      m_dmabuf(dmabuf) {}
 
 SubsurfacePresenter::~SubsurfacePresenter() {
   if (m_subsurface) wl_subsurface_destroy(m_subsurface);
@@ -168,4 +190,60 @@ SubsurfacePresenter::~SubsurfacePresenter() {
   if (m_display) wl_display_flush(m_display);
 }
 
+void SubsurfacePresenter::presentDmabuf(int fd, uint32_t drm_format,
+                                        uint64_t drm_modifier, uint32_t width,
+                                        uint32_t height, uint32_t stride,
+                                        int buffer_scale) {
+  if (fd < 0 || !m_dmabuf || !m_childSurface) return;
+  if (buffer_scale < 1) buffer_scale = 1;
+
+  // Wrap libghostty's borrowed fd in a wl_buffer.
+  zwp_linux_buffer_params_v1 *params =
+      zwp_linux_dmabuf_v1_create_params(m_dmabuf);
+  if (!params) return;
+  zwp_linux_buffer_params_v1_add(params, fd, /*plane_idx*/ 0,
+                                 /*offset*/ 0, stride,
+                                 static_cast<uint32_t>(drm_modifier >> 32),
+                                 static_cast<uint32_t>(drm_modifier & 0xFFFFFFFFu));
+  wl_buffer *buffer = zwp_linux_buffer_params_v1_create_immed(
+      params, static_cast<int32_t>(width), static_cast<int32_t>(height),
+      drm_format, /*flags*/ 0);
+  zwp_linux_buffer_params_v1_destroy(params);
+  if (!buffer) {
+    std::fprintf(stderr,
+                 "[ghastty] SubsurfacePresenter: create_immed returned null "
+                 "(fd=%d %ux%u fmt=0x%x mod=0x%llx)\n",
+                 fd, width, height, drm_format,
+                 static_cast<unsigned long long>(drm_modifier));
+    return;
+  }
+  wl_buffer_add_listener(buffer, &kBufferListener, this);
+
+  // Set buffer scale only when it changes — calling on every present
+  // is harmless but the compositor's bookkeeping is cheaper if we
+  // skip the redundant request.
+  if (buffer_scale != m_lastBufferScale) {
+    wl_surface_set_buffer_scale(m_childSurface, buffer_scale);
+    m_lastBufferScale = buffer_scale;
+  }
+
+  wl_surface_attach(m_childSurface, buffer, 0, 0);
+  // Damage the full buffer extent — terminals tend to update large
+  // dirty rects anyway (cursor blink, scroll, repaint) so a precise
+  // damage region wouldn't save much, and `damage_buffer` (vs
+  // `damage`) uses buffer coordinates so it's resolution-correct
+  // regardless of buffer_scale.
+  wl_surface_damage_buffer(m_childSurface, 0, 0, static_cast<int32_t>(width),
+                           static_cast<int32_t>(height));
+  wl_surface_commit(m_childSurface);
+
+  wl_display_flush(m_display);
+  if (int err = wl_display_get_error(m_display); err != 0) {
+    std::fprintf(
+        stderr,
+        "[ghastty] SubsurfacePresenter: wl_display error %d after present\n",
+        err);
+  }
+}
+
 } // namespace wayland
diff --git a/qt/src/wayland/SubsurfacePresenter.h b/qt/src/wayland/SubsurfacePresenter.h
index 4c762c61d..daa17968f 100644
--- a/qt/src/wayland/SubsurfacePresenter.h
+++ b/qt/src/wayland/SubsurfacePresenter.h
@@ -1,67 +1,75 @@
 // Wayland subsurface presenter for `GhosttySurface`.
 //
-// Scaffolding for the GPU-direct present path (issue: Phase 2 of the
-// dmabuf-as-importable-surface rework). This class owns one
-// `wl_subsurface` parented to the `GhosttySurface`'s native
-// `wl_surface`. Its eventual job is to receive dmabuf fds from
-// libghostty's renderer, wrap each one in a `wl_buffer` via
-// `zwp_linux_dmabuf_v1`, and attach it to the subsurface so the
-// compositor scans it out directly — bypassing the current mmap +
-// memcpy + QImage + QPainter pipeline.
-//
-// In Phase 2 (this commit) the presenter only creates and tears down
-// the subsurface. No buffer is ever attached; the existing
-// `presentVulkanDmabuf` path keeps running unchanged. The proof this
-// scaffolding works is that `ghastty-vulkan` still launches and
-// renders identically with no Wayland protocol errors.
+// Owns one `wl_subsurface` parented to the `GhosttySurface`'s native
+// `wl_surface`, plus the `zwp_linux_dmabuf_v1` machinery for wrapping
+// libghostty's dmabuf fds in `wl_buffer`s and attaching them to that
+// subsurface. The compositor scans the buffers out directly — no
+// mmap, no memcpy, no QImage, no QPainter blit on the present path.
 //
 // Wayland-only by project decision (the Qt frontend is Wayland-only;
 // see `feedback-qt-no-x11` memory). If the host isn't on a Wayland
-// QPA platform or the compositor lacks `wl_subcompositor`,
-// `tryCreate` returns nullptr — Phase 2 silently ignores that
-// because nothing consumes the presenter yet; Phase 3 will treat it
-// as fatal.
+// QPA platform or the compositor lacks the required globals,
+// `tryCreate` returns nullptr — the caller decides whether that's a
+// fatal error.
 
 #pragma once
 
+#include <cstdint>
 #include <memory>
 
 struct wl_display;
 struct wl_subsurface;
 struct wl_surface;
+struct zwp_linux_dmabuf_v1;
 class QWindow;
 
 namespace wayland {
 
 class SubsurfacePresenter {
 public:
-  // Build a subsurface parented to `parent`'s native `wl_surface`.
+  // Build a subsurface parented to `parent`'s native `wl_surface`,
+  // and bind the linux-dmabuf-v1 global on the same display.
   // Returns nullptr if any prerequisite is missing (non-Wayland QPA,
-  // null `wl_display`, `wl_subcompositor` unbindable, etc.).
+  // null `wl_display`, `wl_subcompositor` unbindable,
+  // `zwp_linux_dmabuf_v1` unbindable, etc.).
   //
-  // Forces `Qt::WA_NativeWindow` on the caller is the *caller's*
+  // Forcing `Qt::WA_NativeWindow` on the caller is the *caller's*
   // responsibility — `tryCreate` only reads `parent->surfaceHandle`.
   static std::unique_ptr<SubsurfacePresenter> tryCreate(QWindow *parent);
 
   ~SubsurfacePresenter();
 
-  // Phase-3 accessors: when the present path moves to dmabuf-attach,
-  // the caller will need the child `wl_surface` to attach buffers to
-  // and the `wl_display` to flush. Exposed now so the API surface
-  // doesn't churn between phases.
-  wl_surface *childSurface() const { return m_childSurface; }
-  wl_display *display() const { return m_display; }
+  // Hand a dmabuf-backed frame to the compositor: wrap the fd in a
+  // `wl_buffer` via `zwp_linux_buffer_params_v1.create_immed`, attach
+  // to the subsurface, damage, commit. MUST be called on the Qt GUI
+  // thread (the thread that owns the wl_display dispatch); the
+  // renderer thread should marshal frames through a Qt-side queue.
+  //
+  // libghostty owns the fd; this method does not close it. The
+  // wayland client library duplicates the fd kernel-side via
+  // SCM_RIGHTS, so the compositor's reference survives even after
+  // libghostty reuses or closes its handle.
+  //
+  // `buffer_scale` is the Wayland buffer scale factor (1 for stock
+  // DPI, 2 for HiDPI, etc.) — set on the child surface so the
+  // compositor scales the buffer correctly relative to the parent's
+  // surface-local coordinates.
+  void presentDmabuf(int fd, uint32_t drm_format, uint64_t drm_modifier,
+                     uint32_t width, uint32_t height, uint32_t stride,
+                     int buffer_scale);
 
   SubsurfacePresenter(const SubsurfacePresenter &) = delete;
   SubsurfacePresenter &operator=(const SubsurfacePresenter &) = delete;
 
 private:
   SubsurfacePresenter(wl_display *display, wl_surface *child,
-                      wl_subsurface *sub);
+                      wl_subsurface *sub, zwp_linux_dmabuf_v1 *dmabuf);
 
   wl_display *m_display;
   wl_surface *m_childSurface;
   wl_subsurface *m_subsurface;
+  zwp_linux_dmabuf_v1 *m_dmabuf;
+  int m_lastBufferScale = 0;
 };
 
 } // namespace wayland
diff --git a/src/apprt/embedded.zig b/src/apprt/embedded.zig
index b5af8a319..4e9775246 100644
--- a/src/apprt/embedded.zig
+++ b/src/apprt/embedded.zig
@@ -428,7 +428,12 @@ pub const Platform = union(PlatformTag) {
         /// host imports it for composition; libghostty retains
         /// ownership of the underlying VkDeviceMemory and the fd is
         /// valid only for the duration of the call (host must `dup()`
-        /// if it needs to hold the fd longer).
+        /// if it needs to hold the fd longer). `image_backed` tells
+        /// the host whether the fd was exported from a VkImage
+        /// (directly importable as a 2D image via linux-dmabuf-v1)
+        /// or from a VkBuffer (only usable via mmap + CPU readback);
+        /// see `vulkan/Target.zig` and `include/ghostty.h` for the
+        /// full rationale.
         present: *const fn (
             ?*anyopaque,
             i32, // dmabuf fd
@@ -437,6 +442,7 @@ pub const Platform = union(PlatformTag) {
             u32, // width (pixels)
             u32, // height (pixels)
             u32, // stride (bytes)
+            bool, // image_backed
         ) callconv(.c) void,
     };
 
@@ -481,6 +487,7 @@ pub const Platform = union(PlatformTag) {
                 u32,
                 u32,
                 u32,
+                bool,
             ) callconv(.c) void,
         },
     };
diff --git a/src/renderer/vulkan/Target.zig b/src/renderer/vulkan/Target.zig
index 19df63eb4..c857bdaa6 100644
--- a/src/renderer/vulkan/Target.zig
+++ b/src/renderer/vulkan/Target.zig
@@ -747,6 +747,11 @@ pub fn present(self: *const Self) void {
     // Fall back to the device's singleton copy when no platform was
     // attached (only the smoke test does this).
     const platform = if (self.platform) |p| p else self.device.platform;
+    // `image_backed` is the host's signal that this fd is importable
+    // by a 2D-image consumer (Wayland linux-dmabuf-v1, Vulkan
+    // external image, etc.). True in `.direct` mode where the fd was
+    // exported from a VkImage; false in `.legacy_copy` where it was
+    // exported from a VkBuffer and can only be read via mmap.
     platform.present(
         platform.userdata,
         self.fd,
@@ -755,6 +760,7 @@ pub fn present(self: *const Self) void {
         self.width,
         self.height,
         self.stride,
+        self.tiling == .direct,
     );
 }