From 9a7a31ac3766f45a1dbc03951b6230ad1f2b3136 Mon Sep 17 00:00:00 2001 From: Nathan Date: Sun, 24 May 2026 22:56:07 -0500 Subject: [PATCH] qt/wayland: zero-copy dmabuf present via wl_subsurface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SubsurfacePresenter now binds zwp_linux_dmabuf_v1 (vendored XML; hermetic build), wraps libghostty's dmabuf fd in a wl_buffer via create_immed, and attach/damage/commits it to the subsurface. The compositor scans the buffer out directly — no mmap, no memcpy, no QImage, no QPainter blit on the terminal pixels. paintEvent skips its blit when the subsurface path is active so the translucent QWidget background lets the subsurface show through; chrome (split dim, bell flash, resize overlay) still paints on top. Frame delivery is QMetaObject::invokeMethod(Qt::QueuedConnection) per present, with a 2 ms QTimer as a safety net for any missed queued lambda (the prior 16 ms poll was a leftover from the QImage path, capped present at 60 Hz, and added up to a frame of latency). C ABI: ghostty_platform_vulkan_s.present grows a bool `image_backed` parameter. NVIDIA in legacy_copy mode exports the dmabuf from a VkBuffer that linux-dmabuf-v1 cannot import as a 2D image — attempting it would trigger an `invalid_wl_buffer` protocol error, which is fatal for the wl_display connection. Target.present sets the flag based on Target.tiling; the host only takes the subsurface path when set, falls back to the QImage/QPainter path otherwise. Verified on NVIDIA RTX 2080 (legacy_copy → image_backed=0 → path=qimage → no protocol error). Subsurface presenter still constructs and would activate on AMD/Intel hardware where Phase 1's direct mode succeeds. Subsequent phases will add vendor-tiled modifier support so NVIDIA can use the zero-copy path too. 
Co-Authored-By: claude-flow --- include/ghostty.h | 19 +- qt/CMakeLists.txt | 17 + qt/protocols/linux-dmabuf-v1.xml | 585 +++++++++++++++++++++++++ qt/src/GhosttySurface.cpp | 191 +++++--- qt/src/GhosttySurface.h | 64 ++- qt/src/vulkan/Host.cpp | 10 +- qt/src/wayland/SubsurfacePresenter.cpp | 126 +++++- qt/src/wayland/SubsurfacePresenter.h | 64 +-- src/apprt/embedded.zig | 9 +- src/renderer/vulkan/Target.zig | 6 + 10 files changed, 956 insertions(+), 135 deletions(-) create mode 100644 qt/protocols/linux-dmabuf-v1.xml diff --git a/include/ghostty.h b/include/ghostty.h index 1fec03a93..aced06412 100644 --- a/include/ghostty.h +++ b/include/ghostty.h @@ -514,7 +514,21 @@ typedef struct { uint32_t (*queue_family_index)(void* userdata); // Hand off a rendered frame to the host as a dmabuf fd. The host - // imports it (e.g. into Qt's RHI as a QRhiTexture) and composites. + // imports it (e.g. into Qt's RHI as a QRhiTexture, or attaches it to + // a wl_subsurface via linux-dmabuf-v1) and composites. + // + // `image_backed` is true when the dmabuf was exported from a + // VkImage allocated with VK_EXT_image_drm_format_modifier — i.e. + // it's directly importable as a 2D image by the compositor or any + // GPU-side consumer. false when it was exported from a VkBuffer + // (the legacy NVIDIA fallback path where the driver doesn't + // advertise COLOR_ATTACHMENT for the LINEAR modifier on + // exportable images, so libghostty renders into an OPTIMAL image + // and copies the bytes into a linear VkBuffer for export). In the + // !image_backed case the fd is only usable via mmap + CPU + // readback — attempting a linux-dmabuf-v1 import can trigger a + // fatal `invalid_wl_buffer` protocol error. + // // libghostty retains ownership of the underlying VkDeviceMemory; // the host must dup() the fd if it needs to hold it past the call. 
void (*present)( @@ -524,7 +538,8 @@ typedef struct { uint64_t drm_modifier, uint32_t width, uint32_t height, - uint32_t stride); + uint32_t stride, + bool image_backed); } ghostty_platform_vulkan_s; typedef union { diff --git a/qt/CMakeLists.txt b/qt/CMakeLists.txt index 1a78bad66..41186a7dc 100644 --- a/qt/CMakeLists.txt +++ b/qt/CMakeLists.txt @@ -71,6 +71,21 @@ add_custom_command(OUTPUT "${BLUR_CODE}" COMMAND "${WAYLAND_SCANNER}" private-code "${BLUR_XML}" "${BLUR_CODE}" DEPENDS "${BLUR_XML}" VERBATIM) +# Generate client glue for the linux-dmabuf-v1 protocol (used by the +# Vulkan present path: wrap libghostty's dmabuf fd in a wl_buffer and +# attach it to the wayland::SubsurfacePresenter's wl_surface). Vendored +# in qt/protocols/ so the build doesn't depend on +# /usr/share/wayland-protocols being installed. +set(DMABUF_XML "${CMAKE_CURRENT_SOURCE_DIR}/protocols/linux-dmabuf-v1.xml") +set(DMABUF_HEADER "${CMAKE_CURRENT_BINARY_DIR}/linux-dmabuf-v1-client-protocol.h") +set(DMABUF_CODE "${CMAKE_CURRENT_BINARY_DIR}/linux-dmabuf-v1-protocol.c") +add_custom_command(OUTPUT "${DMABUF_HEADER}" + COMMAND "${WAYLAND_SCANNER}" client-header "${DMABUF_XML}" "${DMABUF_HEADER}" + DEPENDS "${DMABUF_XML}" VERBATIM) +add_custom_command(OUTPUT "${DMABUF_CODE}" + COMMAND "${WAYLAND_SCANNER}" private-code "${DMABUF_XML}" "${DMABUF_CODE}" + DEPENDS "${DMABUF_XML}" VERBATIM) + # libghostty is built out-of-tree by Zig. get_filename_component(GHOSTTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/.." ABSOLUTE) set(GHOSTTY_LIB_DIR "${GHOSTTY_ROOT}/zig-out/lib") @@ -152,6 +167,8 @@ add_executable(ghastty src/XkbTracker.cpp "${BLUR_CODE}" "${BLUR_HEADER}" + "${DMABUF_CODE}" + "${DMABUF_HEADER}" ) # Embed the app icon so it is available even running from the build tree. 
diff --git a/qt/protocols/linux-dmabuf-v1.xml b/qt/protocols/linux-dmabuf-v1.xml new file mode 100644 index 000000000..12d09fb28 --- /dev/null +++ b/qt/protocols/linux-dmabuf-v1.xml @@ -0,0 +1,585 @@ + + + + + Copyright © 2014, 2015 Collabora, Ltd. + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice (including the next + paragraph) shall be included in all copies or substantial portions of the + Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + + + + + This interface offers ways to create generic dmabuf-based wl_buffers. + + For more information about dmabuf, see: + https://www.kernel.org/doc/html/next/userspace-api/dma-buf-alloc-exchange.html + + Clients can use the get_surface_feedback request to get dmabuf feedback + for a particular surface. If the client wants to retrieve feedback not + tied to a surface, they can use the get_default_feedback request. 
+ + The following are required from clients: + + - Clients must ensure that either all data in the dma-buf is + coherent for all subsequent read access or that coherency is + correctly handled by the underlying kernel-side dma-buf + implementation. + + - Don't make any more attachments after sending the buffer to the + compositor. Making more attachments later increases the risk of + the compositor not being able to use (re-import) an existing + dmabuf-based wl_buffer. + + The underlying graphics stack must ensure the following: + + - The dmabuf file descriptors relayed to the server will stay valid + for the whole lifetime of the wl_buffer. This means the server may + at any time use those fds to import the dmabuf into any kernel + sub-system that might accept it. + + However, when the underlying graphics stack fails to deliver the + promise, because of e.g. a device hot-unplug which raises internal + errors, after the wl_buffer has been successfully created the + compositor must not raise protocol errors to the client when dmabuf + import later fails. + + To create a wl_buffer from one or more dmabufs, a client creates a + zwp_linux_dmabuf_params_v1 object with a zwp_linux_dmabuf_v1.create_params + request. All planes required by the intended format are added with + the 'add' request. Finally, a 'create' or 'create_immed' request is + issued, which has the following outcome depending on the import success. + + The 'create' request, + - on success, triggers a 'created' event which provides the final + wl_buffer to the client. + - on failure, triggers a 'failed' event to convey that the server + cannot use the dmabufs received from the client. + + For the 'create_immed' request, + - on success, the server immediately imports the added dmabufs to + create a wl_buffer. No event is sent from the server in this case. + - on failure, the server can choose to either: + - terminate the client by raising a fatal error. 
+ - mark the wl_buffer as failed, and send a 'failed' event to the + client. If the client uses a failed wl_buffer as an argument to any + request, the behaviour is compositor implementation-defined. + + For all DRM formats and unless specified in another protocol extension, + pre-multiplied alpha is used for pixel values. + + Unless specified otherwise in another protocol extension, implicit + synchronization is used. In other words, compositors and clients must + wait and signal fences implicitly passed via the DMA-BUF's reservation + mechanism. + + + + + Objects created through this interface, especially wl_buffers, will + remain valid. + + + + + + This temporary object is used to collect multiple dmabuf handles into + a single batch to create a wl_buffer. It can only be used once and + should be destroyed after a 'created' or 'failed' event has been + received. + + + + + + + This event advertises one buffer format that the server supports. + All the supported formats are advertised once when the client + binds to this interface. A roundtrip after binding guarantees + that the client has received all supported formats. + + For the definition of the format codes, see the + zwp_linux_buffer_params_v1::create request. + + Starting version 4, the format event is deprecated and must not be + sent by compositors. Instead, use get_default_feedback or + get_surface_feedback. + + + + + + + This event advertises the formats that the server supports, along with + the modifiers supported for each format. All the supported modifiers + for all the supported formats are advertised once when the client + binds to this interface. A roundtrip after binding guarantees that + the client has received all supported format-modifier pairs. + + For legacy support, DRM_FORMAT_MOD_INVALID (that is, modifier_hi == + 0x00ffffff and modifier_lo == 0xffffffff) is allowed in this event. + It indicates that the server can support the format with an implicit + modifier. 
When a plane has DRM_FORMAT_MOD_INVALID as its modifier, it + is as if no explicit modifier is specified. The effective modifier + will be derived from the dmabuf. + + A compositor that sends valid modifiers and DRM_FORMAT_MOD_INVALID for + a given format supports both explicit modifiers and implicit modifiers. + + For the definition of the format and modifier codes, see the + zwp_linux_buffer_params_v1::create and zwp_linux_buffer_params_v1::add + requests. + + Starting version 4, the modifier event is deprecated and must not be + sent by compositors. Instead, use get_default_feedback or + get_surface_feedback. + + + + + + + + + + + This request creates a new wp_linux_dmabuf_feedback object not bound + to a particular surface. This object will deliver feedback about dmabuf + parameters to use if the client doesn't support per-surface feedback + (see get_surface_feedback). + + + + + + + This request creates a new wp_linux_dmabuf_feedback object for the + specified wl_surface. This object will deliver feedback about dmabuf + parameters to use for buffers attached to this surface. + + If the surface is destroyed before the wp_linux_dmabuf_feedback object, + the feedback object becomes inert. + + + + + + + + + This temporary object is a collection of dmabufs and other + parameters that together form a single logical buffer. The temporary + object may eventually create one wl_buffer unless cancelled by + destroying it before requesting 'create'. + + Single-planar formats only require one dmabuf, however + multi-planar formats may require more than one dmabuf. For all + formats, an 'add' request must be called once per plane (even if the + underlying dmabuf fd is identical). + + You must use consecutive plane indices ('plane_idx' argument for 'add') + from zero to the number of planes used by the drm_fourcc format code. + All planes required by the format must be given exactly once, but can + be given in any order. 
Each plane index can only be set once; subsequent + calls with a plane index which has already been set will result in a + plane_set error being generated. + + + + + + + + + + + + + + + + Cleans up the temporary data sent to the server for dmabuf-based + wl_buffer creation. + + + + + + This request adds one dmabuf to the set in this + zwp_linux_buffer_params_v1. + + The 64-bit unsigned value combined from modifier_hi and modifier_lo + is the dmabuf layout modifier. DRM AddFB2 ioctl calls this the + fb modifier, which is defined in drm_mode.h of Linux UAPI. + This is an opaque token. Drivers use this token to express tiling, + compression, etc. driver-specific modifications to the base format + defined by the DRM fourcc code. + + Starting from version 4, the invalid_format protocol error is sent if + the format + modifier pair was not advertised as supported. + + Starting from version 5, the invalid_format protocol error is sent if + all planes don't use the same modifier. + + This request raises the PLANE_IDX error if plane_idx is too large. + The error PLANE_SET is raised if attempting to set a plane that + was already set. + + + + + + + + + + + + + + + + + + This asks for creation of a wl_buffer from the added dmabuf + buffers. The wl_buffer is not created immediately but returned via + the 'created' event if the dmabuf sharing succeeds. The sharing + may fail at runtime for reasons a client cannot predict, in + which case the 'failed' event is triggered. + + The 'format' argument is a DRM_FORMAT code, as defined by the + libdrm's drm_fourcc.h. The Linux kernel's DRM sub-system is the + authoritative source on how the format codes should work. + + The 'flags' is a bitfield of the flags defined in enum "flags". + 'y_invert' means the that the image needs to be y-flipped. + + Flag 'interlaced' means that the frame in the buffer is not + progressive as usual, but interlaced. An interlaced buffer as + supported here must always contain both top and bottom fields. 
+ The top field always begins on the first pixel row. The temporal + ordering between the two fields is top field first, unless + 'bottom_first' is specified. It is undefined whether 'bottom_first' + is ignored if 'interlaced' is not set. + + This protocol does not convey any information about field rate, + duration, or timing, other than the relative ordering between the + two fields in one buffer. A compositor may have to estimate the + intended field rate from the incoming buffer rate. It is undefined + whether the time of receiving wl_surface.commit with a new buffer + attached, applying the wl_surface state, wl_surface.frame callback + trigger, presentation, or any other point in the compositor cycle + is used to measure the frame or field times. There is no support + for detecting missed or late frames/fields/buffers either, and + there is no support whatsoever for cooperating with interlaced + compositor output. + + The composited image quality resulting from the use of interlaced + buffers is explicitly undefined. A compositor may use elaborate + hardware features or software to deinterlace and create progressive + output frames from a sequence of interlaced input buffers, or it + may produce substandard image quality. However, compositors that + cannot guarantee reasonable image quality in all cases are recommended + to just reject all interlaced buffers. + + Any argument errors, including non-positive width or height, + mismatch between the number of planes and the format, bad + format, bad offset or stride, may be indicated by fatal protocol + errors: INCOMPLETE, INVALID_FORMAT, INVALID_DIMENSIONS, + OUT_OF_BOUNDS. + + Dmabuf import errors in the server that are not obvious client + bugs are returned via the 'failed' event as non-fatal. This + allows attempting dmabuf sharing and falling back in the client + if it fails. + + This request can be sent only once in the object's lifetime, after + which the only legal request is destroy. 
This object should be + destroyed after issuing a 'create' request. Attempting to use this + object after issuing 'create' raises ALREADY_USED protocol error. + + It is not mandatory to issue 'create'. If a client wants to + cancel the buffer creation, it can just destroy this object. + + + + + + + + + + This event indicates that the attempted buffer creation was + successful. It provides the new wl_buffer referencing the dmabuf(s). + + Upon receiving this event, the client should destroy the + zwp_linux_buffer_params_v1 object. + + + + + + + This event indicates that the attempted buffer creation has + failed. It usually means that one of the dmabuf constraints + has not been fulfilled. + + Upon receiving this event, the client should destroy the + zwp_linux_buffer_params_v1 object. + + + + + + This asks for immediate creation of a wl_buffer by importing the + added dmabufs. + + In case of import success, no event is sent from the server, and the + wl_buffer is ready to be used by the client. + + Upon import failure, either of the following may happen, as seen fit + by the implementation: + - the client is terminated with one of the following fatal protocol + errors: + - INCOMPLETE, INVALID_FORMAT, INVALID_DIMENSIONS, OUT_OF_BOUNDS, + in case of argument errors such as mismatch between the number + of planes and the format, bad format, non-positive width or + height, or bad offset or stride. + - INVALID_WL_BUFFER, in case the cause for failure is unknown or + platform specific. + - the server creates an invalid wl_buffer, marks it as failed and + sends a 'failed' event to the client. The result of using this + invalid wl_buffer as an argument in any request by the client is + defined by the compositor implementation. + + This takes the same arguments as a 'create' request, and obeys the + same restrictions. + + + + + + + + + + + + This object advertises dmabuf parameters feedback. This includes the + preferred devices and the supported formats/modifiers. 
+ + The parameters are sent once when this object is created and whenever they + change. The done event is always sent once after all parameters have been + sent. When a single parameter changes, all parameters are re-sent by the + compositor. + + Compositors can re-send the parameters when the current client buffer + allocations are sub-optimal. Compositors should not re-send the + parameters if re-allocating the buffers would not result in a more optimal + configuration. In particular, compositors should avoid sending the exact + same parameters multiple times in a row. + + The tranche_target_device and tranche_formats events are grouped by + tranches of preference. For each tranche, a tranche_target_device, one + tranche_flags and one or more tranche_formats events are sent, followed + by a tranche_done event finishing the list. The tranches are sent in + descending order of preference. All formats and modifiers in the same + tranche have the same preference. + + To send parameters, the compositor sends one main_device event, tranches + (each consisting of one tranche_target_device event, one tranche_flags + event, tranche_formats events and then a tranche_done event), then one + done event. + + + + + Using this request a client can tell the server that it is not going to + use the wp_linux_dmabuf_feedback object anymore. + + + + + + This event is sent after all parameters of a wp_linux_dmabuf_feedback + object have been sent. + + This allows changes to the wp_linux_dmabuf_feedback parameters to be + seen as atomic, even if they happen via multiple events. + + + + + + This event provides a file descriptor which can be memory-mapped to + access the format and modifier table. + + The table contains a tightly packed array of consecutive format + + modifier pairs. Each pair is 16 bytes wide. It contains a format as a + 32-bit unsigned integer, followed by 4 bytes of unused padding, and a + modifier as a 64-bit unsigned integer. The native endianness is used. 
+ + The client must map the file descriptor in read-only private mode. + + Compositors are not allowed to mutate the table file contents once this + event has been sent. Instead, compositors must create a new, separate + table file and re-send feedback parameters. Compositors are allowed to + store duplicate format + modifier pairs in the table. + + + + + + + + This event advertises the main device that the server prefers to use + when direct scan-out to the target device isn't possible. The + advertised main device may be different for each + wp_linux_dmabuf_feedback object, and may change over time. + + There is exactly one main device. The compositor must send at least + one preference tranche with tranche_target_device equal to main_device. + + Clients need to create buffers that the main device can import and + read from, otherwise creating the dmabuf wl_buffer will fail (see the + wp_linux_buffer_params.create and create_immed requests for details). + The main device will also likely be kept active by the compositor, + so clients can use it instead of waking up another device for power + savings. + + In general the device is a DRM node. The DRM node type (primary vs. + render) is unspecified. Clients must not rely on the compositor sending + a particular node type. Clients cannot check two devices for equality + by comparing the dev_t value. + + If explicit modifiers are not supported and the client performs buffer + allocations on a different device than the main device, then the client + must force the buffer to have a linear layout. + + + + + + + This event splits tranche_target_device and tranche_formats events in + preference tranches. It is sent after a set of tranche_target_device + and tranche_formats events; it represents the end of a tranche. The + next tranche will have a lower preference. + + + + + + This event advertises the target device that the server prefers to use + for a buffer created given this tranche. 
The advertised target device + may be different for each preference tranche, and may change over time. + + There is exactly one target device per tranche. + + The target device may be a scan-out device, for example if the + compositor prefers to directly scan-out a buffer created given this + tranche. The target device may be a rendering device, for example if + the compositor prefers to texture from said buffer. + + The client can use this hint to allocate the buffer in a way that makes + it accessible from the target device, ideally directly. The buffer must + still be accessible from the main device, either through direct import + or through a potentially more expensive fallback path. If the buffer + can't be directly imported from the main device then clients must be + prepared for the compositor changing the tranche priority or making + wl_buffer creation fail (see the wp_linux_buffer_params.create and + create_immed requests for details). + + If the device is a DRM node, the DRM node type (primary vs. render) is + unspecified. Clients must not rely on the compositor sending a + particular node type. Clients cannot check two devices for equality by + comparing the dev_t value. + + This event is tied to a preference tranche, see the tranche_done event. + + + + + + + This event advertises the format + modifier combinations that the + compositor supports. + + It carries an array of indices, each referring to a format + modifier + pair in the last received format table (see the format_table event). + Each index is a 16-bit unsigned integer in native endianness. + + For legacy support, DRM_FORMAT_MOD_INVALID is an allowed modifier. + It indicates that the server can support the format with an implicit + modifier. When a buffer has DRM_FORMAT_MOD_INVALID as its modifier, it + is as if no explicit modifier is specified. The effective modifier + will be derived from the dmabuf. 
+ + A compositor that sends valid modifiers and DRM_FORMAT_MOD_INVALID for + a given format supports both explicit modifiers and implicit modifiers. + + Compositors must not send duplicate format + modifier pairs within the + same tranche or across two different tranches with the same target + device and flags. + + This event is tied to a preference tranche, see the tranche_done event. + + For the definition of the format and modifier codes, see the + wp_linux_buffer_params.create request. + + + + + + + + + + + This event sets tranche-specific flags. + + The scanout flag is a hint that direct scan-out may be attempted by the + compositor on the target device if the client appropriately allocates a + buffer. How to allocate a buffer that can be scanned out on the target + device is implementation-defined. + + This event is tied to a preference tranche, see the tranche_done event. + + + + + + diff --git a/qt/src/GhosttySurface.cpp b/qt/src/GhosttySurface.cpp index 739c5b576..e4f84c128 100644 --- a/qt/src/GhosttySurface.cpp +++ b/qt/src/GhosttySurface.cpp @@ -127,21 +127,18 @@ GhosttySurface::GhosttySurface(ghostty_app_t app, MainWindow *owner, sc.platform_tag = GHOSTTY_PLATFORM_VULKAN; sc.platform.vulkan = vk_host->asPlatform(this); - // Polling timer on the GUI thread: every 16ms, check if the - // renderer thread parked a new frame in `m_pending` and swap - // it into `m_image` for paintEvent to pick up. + // GUI-thread frame drain. The renderer thread wakes us per frame + // via QMetaObject::invokeMethod (Qt::QueuedConnection) on each + // present — see `presentVulkanDmabuf`. The 2 ms timer is a + // safety net: if `invokeMethod` ever fails to deliver (the + // earlier QImage-handoff diagnostics suggested this could + // happen), the next tick drains the parked frame within at most + // 2 ms. Idle case has negligible CPU cost because `drainVulkan` + // returns immediately when nothing is pending. 
m_vulkanPollTimer = new QTimer(this); - m_vulkanPollTimer->setInterval(16); // ≈60 Hz - connect(m_vulkanPollTimer, &QTimer::timeout, this, [this]() { - QImage frame; - { - QMutexLocker lock(&m_pendingMutex); - if (m_pending.isNull()) return; - frame = std::move(m_pending); - } - m_image = std::move(frame); - update(); - }); + m_vulkanPollTimer->setInterval(2); + connect(m_vulkanPollTimer, &QTimer::timeout, this, + [this]() { drainVulkan(); }); m_vulkanPollTimer->start(); } else { sc.platform_tag = GHOSTTY_PLATFORM_OPENGL; @@ -324,9 +321,18 @@ bool GhosttySurface::event(QEvent *e) { // WA_NativeWindow ensures windowHandle() is non-null even if // GhosttySurface is embedded in a non-native parent. setAttribute(Qt::WA_NativeWindow); - if (auto *h = windowHandle()) + if (auto *h = windowHandle()) { m_subsurfacePresenter = wayland::SubsurfacePresenter::tryCreate(h); + if (m_subsurfacePresenter && m_useVulkan) { + // Flip the Vulkan present path over to the zero-copy + // wl_subsurface route. Release-style store pairs with + // the renderer thread's acquire-load — once it observes + // true, it stops parking QImages and just hands us the + // dmabuf descriptor for compositor handoff. + m_useSubsurface.store(true, std::memory_order_release); + } + } } } else if (e->type() == QEvent::Hide) { ghostty_surface_set_occlusion(m_surface, false); @@ -424,6 +430,14 @@ void GhosttySurface::renderTerminal() { } void GhosttySurface::paintEvent(QPaintEvent *) { + // Subsurface zero-copy path: the wl_subsurface IS the terminal + // pixels — they reach the compositor without ever touching our + // QPainter. With `WA_TranslucentBackground` set, the QWidget + // paints transparent over the subsurface so chrome (dim overlay, + // bell flash, resize hint) still composites on top. + const bool subsurfaceActive = + m_useSubsurface.load(std::memory_order_acquire) && m_subsurfacePresenter; + // No frame yet — leave the widget background untouched. 
With // `WA_TranslucentBackground` set the area is transparent until // the first frame imports, matching the OpenGL path. New surfaces @@ -431,18 +445,20 @@ void GhosttySurface::paintEvent(QPaintEvent *) { // thread has emitted its first frame; the gap is short enough // that flashing a debug placeholder is more jarring than the // brief see-through. - if (m_image.isNull()) return; + if (!subsurfaceActive && m_image.isNull()) return; QPainter painter(this); - // Blit the framebuffer 1:1. m_image carries the device pixel ratio, so - // the QPointF overload draws it at its true logical size. When in - // sync that exactly fills the widget; mid-resize, the previous frame - // stays at its real size in the top-left corner (rather than being - // stretched to the new widget rect, which the user dislikes more - // than the transient gap). - // CompositionMode_Source replaces the transparent widget pixels with - // the terminal image, alpha included, so its translucency is kept. - painter.setCompositionMode(QPainter::CompositionMode_Source); - painter.drawImage(QPointF(0, 0), m_image); + if (!subsurfaceActive) { + // Blit the framebuffer 1:1. m_image carries the device pixel ratio, so + // the QPointF overload draws it at its true logical size. When in + // sync that exactly fills the widget; mid-resize, the previous frame + // stays at its real size in the top-left corner (rather than being + // stretched to the new widget rect, which the user dislikes more + // than the transient gap). + // CompositionMode_Source replaces the transparent widget pixels with + // the terminal image, alpha included, so its translucency is kept. + painter.setCompositionMode(QPainter::CompositionMode_Source); + painter.drawImage(QPointF(0, 0), m_image); + } // Unfocused-split dimming: a translucent fill over an inactive pane. // Only split panes (a QSplitter parent) are dimmed, matching GTK. 
@@ -1343,13 +1359,34 @@ void GhosttySurface::presentVulkanDmabuf( quint64 drm_modifier, quint32 width, quint32 height, - quint32 stride) { - // Called from the renderer thread. We mmap the dmabuf, copy the - // bytes into a QImage, and hand the QImage to the GUI thread for - // paint via `QMetaObject::invokeMethod`. The fd is a borrow (per - // the `ghostty_platform_vulkan_s` contract); libghostty closes it - // when the underlying memory is freed. - (void)drm_modifier; // LINEAR for v1; not used here. + quint32 stride, + bool image_backed) { + // Called from the renderer thread. Two paths, picked per frame + // based on whether the wl_subsurface presenter is up: + // + // Subsurface (zero-copy): park the dmabuf metadata; GUI thread + // wraps the fd in a wl_buffer and attach/commits to our + // wl_subsurface. The compositor scans it out directly. + // + // Fallback (legacy mmap+memcpy): map the fd, copy into a + // QImage, GUI thread paints via QPainter. Used when the + // subsurface presenter failed to come up (e.g. compositor + // missing linux-dmabuf-v1). + // + // The fd is a borrow per the `ghostty_platform_vulkan_s` contract; + // libghostty closes it when the underlying memory is freed. In + // the subsurface path the wayland client lib SCM_RIGHTS-dups the + // fd so the compositor's reference outlives our park-and-drain. + + // The subsurface path requires `image_backed` (i.e. the renderer + // is in `.direct` mode and the fd points at a VkImage). When the + // renderer falls back to `.legacy_copy` — NVIDIA today, the fd is + // a VkBuffer — linux-dmabuf-v1 import would fail with + // `invalid_wl_buffer` and that's a fatal protocol error on the + // wl_display. So we gate per-frame and stay on the QImage path + // when the fd isn't compositor-importable. + const bool useSubsurface = + image_backed && m_useSubsurface.load(std::memory_order_acquire); // One-shot breadcrumb so logs confirm the dmabuf hand-off is // wired. 
Subsequent frames are silent so we don't spam stderr. @@ -1357,15 +1394,31 @@ void GhosttySurface::presentVulkanDmabuf( if (!logged_first) { logged_first = true; std::fprintf(stderr, - "[ghastty] first Vulkan dmabuf frame: fd=%d %ux%u stride=%u fourcc=0x%08x mod=0x%lx\n", + "[ghastty] first Vulkan dmabuf frame: fd=%d %ux%u stride=%u " + "fourcc=0x%08x mod=0x%lx image_backed=%d path=%s\n", dmabuf_fd, width, height, stride, drm_format, - static_cast(drm_modifier)); + static_cast(drm_modifier), image_backed ? 1 : 0, + useSubsurface ? "subsurface" : "qimage"); } - // sanity check the size before we allocate / mmap. if (dmabuf_fd < 0 || width == 0 || height == 0 || stride < width * 4) return; + if (useSubsurface) { + // Subsurface path. Park the descriptor under the mutex (so + // a concurrent drainVulkan sees a consistent snapshot) and + // wake the GUI thread. + { + QMutexLocker lock(&m_pendingMutex); + m_pendingDmabuf = PendingDmabuf{ + dmabuf_fd, drm_format, drm_modifier, width, height, stride, + }; + } + QMetaObject::invokeMethod(this, "drainVulkan", Qt::QueuedConnection); + return; + } + + // Fallback: mmap + memcpy into a QImage. const size_t bytes = static_cast(stride) * height; void *mapped = ::mmap(nullptr, bytes, PROT_READ, MAP_SHARED, dmabuf_fd, 0); if (mapped == MAP_FAILED) { @@ -1373,19 +1426,12 @@ void GhosttySurface::presentVulkanDmabuf( dmabuf_fd, std::strerror(errno)); return; } - // QImage holds the pixel data by copying when constructed with - // `Format_ARGB32_Premultiplied` from a buffer with explicit stride. - // We then detach (copy()) so the QImage survives the unmap. - // // drm_format ARGB8888 (0x34325241 = "AR24") matches QImage's - // ARGB32 byte order on little-endian (B,G,R,A in memory). - // - // We use the *premultiplied* variant because the renderer's - // fragment shaders output premultiplied alpha and the render - // target is `VK_FORMAT_B8G8R8A8_SRGB` (hardware gamma-encodes the - // linear shader output at framebuffer-write time). 
The bytes - // landing in this buffer are therefore sRGB-encoded premultiplied - // ARGB — exactly what Format_ARGB32_Premultiplied expects. + // ARGB32 byte order on little-endian (B,G,R,A in memory). The + // renderer's fragment shaders output premultiplied alpha into + // `VK_FORMAT_B8G8R8A8_SRGB`, so the buffer is sRGB-encoded + // premultiplied ARGB — exactly what Format_ARGB32_Premultiplied + // expects. (void)drm_format; const QImage stamped( static_cast(mapped), @@ -1396,20 +1442,45 @@ void GhosttySurface::presentVulkanDmabuf( QImage owned = stamped.copy(); ::munmap(mapped, bytes); - // Tell QPainter the image's pixels are device pixels at the same - // DPR the framebuffer was sized at. Without this, `drawImage` would - // treat the image as logical pixels and re-scale to framebuffer - // pixels on a HiDPI display (DPR>1) — glyphs come out 2× too big. - // `m_fbDpr` is the DPR `syncSurfaceSize` used when telling - // libghostty the framebuffer size, so it matches what the renderer - // actually drew. if (m_fbDpr > 0) owned.setDevicePixelRatio(m_fbDpr); - - // Stash for the GUI-thread polling timer to pick up. { QMutexLocker lock(&m_pendingMutex); m_pending = std::move(owned); } + QMetaObject::invokeMethod(this, "drainVulkan", Qt::QueuedConnection); +} + +void GhosttySurface::drainVulkan() { + // Subsurface (zero-copy) path: take the parked dmabuf descriptor + // under the mutex, then dispatch it to the presenter outside the + // lock so a renderer-thread `presentVulkanDmabuf` parking the + // next frame doesn't block on wl_display_flush. 
+ if (m_useSubsurface.load(std::memory_order_acquire) && + m_subsurfacePresenter) { + PendingDmabuf frame; + { + QMutexLocker lock(&m_pendingMutex); + if (m_pendingDmabuf.fd < 0) return; + frame = m_pendingDmabuf; + m_pendingDmabuf.fd = -1; // mark consumed + } + const int scale = + std::max(1, static_cast(std::lround(devicePixelRatioF()))); + m_subsurfacePresenter->presentDmabuf(frame.fd, frame.drm_format, + frame.drm_modifier, frame.width, + frame.height, frame.stride, scale); + return; + } + + // Fallback: hand the QImage to paintEvent. + QImage frame; + { + QMutexLocker lock(&m_pendingMutex); + if (m_pending.isNull()) return; + frame = std::move(m_pending); + } + m_image = std::move(frame); + update(); } // Trampoline so `Host.cpp` doesn't need to include the full @@ -1425,10 +1496,12 @@ void presentToGhosttySurface( uint64_t drm_modifier, uint32_t width, uint32_t height, - uint32_t stride) { + uint32_t stride, + bool image_backed) { if (surface == nullptr) return; static_cast(surface)->presentVulkanDmabuf( - dmabuf_fd, drm_format, drm_modifier, width, height, stride); + dmabuf_fd, drm_format, drm_modifier, width, height, stride, + image_backed); } } // namespace vulkan diff --git a/qt/src/GhosttySurface.h b/qt/src/GhosttySurface.h index 6d3ff6ed2..9bb2d8d66 100644 --- a/qt/src/GhosttySurface.h +++ b/qt/src/GhosttySurface.h @@ -150,20 +150,30 @@ public: void setPwd(const QString &pwd); const QString &pwd() const { return m_pwd; } - // Apprt-side entry point for the Vulkan `present` callback. - // libghostty hands us a dmabuf fd pointing at the rendered - // VkImage's memory; we mmap it (LINEAR tiling means the bytes - // are directly readable as BGRA), copy the pixels into a QImage, - // and schedule a repaint. Thread-safe: the callback fires from - // the renderer thread; the QImage handoff goes through - // `QMetaObject::invokeMethod` to the GUI thread. + // Apprt-side entry point for the Vulkan `present` callback. Fires + // on the renderer thread. 
Parks the dmabuf descriptor under + // `m_pendingMutex` (plus, for the legacy fallback path, an + // mmap+memcpy'd QImage) and wakes the GUI thread via + // `QMetaObject::invokeMethod(this, drainVulkan, Qt::QueuedConnection)`. + // The GUI thread either commits the dmabuf to the wl_subsurface + // (zero-copy) or paints the QImage (fallback). A 2 ms safety-net + // poll catches anything `invokeMethod` ever fails to deliver. Q_INVOKABLE void presentVulkanDmabuf( int dmabuf_fd, quint32 drm_format, quint64 drm_modifier, quint32 width, quint32 height, - quint32 stride); + quint32 stride, + bool image_backed); + + // GUI-thread drain step: hands the most recent pending frame + // either to the SubsurfacePresenter (zero-copy path) or the + // QImage paint pipeline (fallback). Idempotent: returns + // immediately if nothing's pending. Invoked from the polling + // safety net AND from queued invocations triggered by the + // renderer thread. + Q_INVOKABLE void drainVulkan(); protected: bool event(QEvent *) override; @@ -244,15 +254,35 @@ private: // gives way to the actual rendered content. bool m_useVulkan = false; - // Cross-thread frame handoff for the Vulkan path. `presentVulkanDmabuf` - // (renderer thread) writes a freshly-imported QImage to `m_pending` - // under `m_pendingMutex`; a 16 ms `QTimer` on the GUI thread checks - // `m_pending`, atomically swaps it into `m_image`, and triggers a - // repaint. The polling timer is the simplest reliable cross-thread - // path we could land — the obvious Qt mechanisms - // (QMetaObject::invokeMethod / postEvent) were both not firing - // their queued lambdas under the renderer-thread → GUI-thread - // handoff, see the commit message for diagnostics. + // Cross-thread frame handoff for the Vulkan path. 
The renderer + // thread calls `presentVulkanDmabuf` with a borrowed dmabuf fd and + // queues `drainVulkan` on the GUI thread, which drains the pending + // frame and routes it through the wl_subsurface (zero-copy) when + // the SubsurfacePresenter is available, or falls back to the + // mmap+memcpy+QImage path otherwise. A 2 ms `QTimer` also calls + // `drainVulkan` as a safety net in case a queued invocation from + // the renderer thread is ever missed. + // + // `m_useSubsurface` is set once on the GUI thread when the + // presenter comes up; the renderer thread reads it acquire-style + // to decide which path to populate per frame. + std::atomic m_useSubsurface{false}; + // Subsurface (zero-copy) path: renderer thread parks the + // borrowed-fd descriptor here; `drainVulkan` on the GUI thread + // hands it to the presenter. + struct PendingDmabuf { + int fd = -1; + quint32 drm_format = 0; + quint64 drm_modifier = 0; + quint32 width = 0; + quint32 height = 0; + quint32 stride = 0; + }; + PendingDmabuf m_pendingDmabuf; + // Legacy (mmap+memcpy) path: filled when the presenter isn't + // available (e.g. compositor missing linux-dmabuf-v1) or a frame + // is not `image_backed`. Frames that take the subsurface path + // leave this untouched; paintEvent skips its blit while that path is active. 
QImage m_pending; QMutex m_pendingMutex; QTimer *m_vulkanPollTimer = nullptr; diff --git a/qt/src/vulkan/Host.cpp b/qt/src/vulkan/Host.cpp index ce3fdbaa2..e9551567e 100644 --- a/qt/src/vulkan/Host.cpp +++ b/qt/src/vulkan/Host.cpp @@ -22,7 +22,8 @@ void presentToGhosttySurface( uint64_t drm_modifier, uint32_t width, uint32_t height, - uint32_t stride); + uint32_t stride, + bool image_backed); namespace { @@ -114,10 +115,11 @@ void cbPresent( uint64_t drm_modifier, uint32_t width, uint32_t height, - uint32_t stride) { + uint32_t stride, + bool image_backed) { if (ud == nullptr) return; - ::vulkan::presentToGhosttySurface(ud, dmabuf_fd, drm_format, - drm_modifier, width, height, stride); + ::vulkan::presentToGhosttySurface(ud, dmabuf_fd, drm_format, drm_modifier, + width, height, stride, image_backed); } } // namespace diff --git a/qt/src/wayland/SubsurfacePresenter.cpp b/qt/src/wayland/SubsurfacePresenter.cpp index 77207a109..d02454ea5 100644 --- a/qt/src/wayland/SubsurfacePresenter.cpp +++ b/qt/src/wayland/SubsurfacePresenter.cpp @@ -10,6 +10,8 @@ #include +#include "linux-dmabuf-v1-client-protocol.h" + namespace wayland { namespace { @@ -21,6 +23,7 @@ namespace { struct PresenterGlobals { wl_compositor *compositor = nullptr; wl_subcompositor *subcompositor = nullptr; + zwp_linux_dmabuf_v1 *dmabuf = nullptr; bool searched = false; }; @@ -33,6 +36,14 @@ void registryGlobal(void *data, wl_registry *registry, uint32_t name, } else if (std::strcmp(interface, wl_subcompositor_interface.name) == 0) { g->subcompositor = static_cast( wl_registry_bind(registry, name, &wl_subcompositor_interface, 1)); + } else if (std::strcmp(interface, zwp_linux_dmabuf_v1_interface.name) == 0) { + // v3 has `create_immed` (since v2), which we want: synchronous + // wl_buffer creation — the v1 async `create` + `created`/`failed` + // event dance would add a layer of callback machinery for no real + // win in our renderer's strict-fd-validity scenario. 
v4 adds the + // dynamic format/modifier feedback dance; we don't need it yet. + g->dmabuf = static_cast(wl_registry_bind( + registry, name, &zwp_linux_dmabuf_v1_interface, 3)); } } void registryGlobalRemove(void *, wl_registry *, uint32_t) {} @@ -63,20 +74,32 @@ PresenterGlobals *discoverGlobals(wl_display *display) { if (globals.subcompositor) wl_proxy_set_queue(reinterpret_cast(globals.subcompositor), nullptr); + if (globals.dmabuf) + wl_proxy_set_queue(reinterpret_cast(globals.dmabuf), nullptr); wl_event_queue_destroy(queue); return &globals; } +// wl_buffer::release listener: the compositor is done sampling the +// buffer for any committed surface state, so we can destroy our +// client-side handle. The underlying dmabuf memory is owned by +// libghostty; we never close that fd here (the SCM_RIGHTS transfer +// in zwp_linux_buffer_params.add gave the compositor its own +// reference, which lives independently of our wl_buffer). +void bufferRelease(void *, wl_buffer *buffer) { + wl_buffer_destroy(buffer); +} +const wl_buffer_listener kBufferListener = { + bufferRelease, +}; + } // namespace std::unique_ptr SubsurfacePresenter::tryCreate(QWindow *parent) { if (!parent) return nullptr; - // The Qt frontend is Wayland-only; if we're not on Wayland, the - // native-interface lookups below would return null anyway, but - // bail explicitly so the log message is useful. 
if (!QGuiApplication::platformName().startsWith(QLatin1String("wayland"))) { std::fprintf(stderr, "[ghastty] SubsurfacePresenter: not on Wayland QPA\n"); @@ -100,13 +123,13 @@ SubsurfacePresenter::tryCreate(QWindow *parent) { } PresenterGlobals *g = discoverGlobals(display); - if (!g->compositor || !g->subcompositor) { + if (!g->compositor || !g->subcompositor || !g->dmabuf) { std::fprintf(stderr, - "[ghastty] SubsurfacePresenter: compositor lacks " - "wl_compositor or wl_subcompositor (compositor=%p " - "subcompositor=%p)\n", + "[ghastty] SubsurfacePresenter: compositor missing required " + "globals (compositor=%p subcompositor=%p dmabuf=%p)\n", static_cast(g->compositor), - static_cast(g->subcompositor)); + static_cast(g->subcompositor), + static_cast(g->dmabuf)); return nullptr; } @@ -126,18 +149,13 @@ SubsurfacePresenter::tryCreate(QWindow *parent) { // for the parent's next commit. `set_desync` is what allows that. wl_subsurface_set_desync(sub); - // Subsurface covers the parent at the origin. Phase 3 will keep - // this in sync on resize; for Phase 2 it doesn't matter because - // we never attach a buffer. + // Subsurface covers the parent at the origin. Phase 4 will keep + // this in sync on splits/tabs/etc.; for now the GhosttySurface + // forces WA_NativeWindow so its QWindow IS the terminal's native + // wayland surface and (0,0) is correct. wl_subsurface_set_position(sub, 0, 0); - // Flush so the compositor sees the subsurface creation. We do NOT - // commit the child surface — per protocol an uncommitted subsurface - // with no attached buffer contributes nothing to the parent's - // display, which is exactly the no-behavior-change state we want - // for Phase 2. 
wl_display_flush(display); - if (int err = wl_display_get_error(display); err != 0) { std::fprintf(stderr, "[ghastty] SubsurfacePresenter: wl_display error %d after " @@ -149,18 +167,22 @@ SubsurfacePresenter::tryCreate(QWindow *parent) { } std::fprintf(stderr, - "[ghastty] SubsurfacePresenter: subsurface ready (parent=%p " - "child=%p sub=%p)\n", - static_cast(parentSurface), - static_cast(child), static_cast(sub)); + "[ghastty] SubsurfacePresenter: ready (parent=%p child=%p " + "sub=%p dmabuf=%p)\n", + static_cast(parentSurface), static_cast(child), + static_cast(sub), static_cast(g->dmabuf)); return std::unique_ptr( - new SubsurfacePresenter(display, child, sub)); + new SubsurfacePresenter(display, child, sub, g->dmabuf)); } SubsurfacePresenter::SubsurfacePresenter(wl_display *display, wl_surface *child, - wl_subsurface *sub) - : m_display(display), m_childSurface(child), m_subsurface(sub) {} + wl_subsurface *sub, + zwp_linux_dmabuf_v1 *dmabuf) + : m_display(display), + m_childSurface(child), + m_subsurface(sub), + m_dmabuf(dmabuf) {} SubsurfacePresenter::~SubsurfacePresenter() { if (m_subsurface) wl_subsurface_destroy(m_subsurface); @@ -168,4 +190,60 @@ SubsurfacePresenter::~SubsurfacePresenter() { if (m_display) wl_display_flush(m_display); } +void SubsurfacePresenter::presentDmabuf(int fd, uint32_t drm_format, + uint64_t drm_modifier, uint32_t width, + uint32_t height, uint32_t stride, + int buffer_scale) { + if (fd < 0 || !m_dmabuf || !m_childSurface) return; + if (buffer_scale < 1) buffer_scale = 1; + + // Wrap libghostty's borrowed fd in a wl_buffer. 
+ zwp_linux_buffer_params_v1 *params = + zwp_linux_dmabuf_v1_create_params(m_dmabuf); + if (!params) return; + zwp_linux_buffer_params_v1_add(params, fd, /*plane_idx*/ 0, + /*offset*/ 0, stride, + static_cast(drm_modifier >> 32), + static_cast(drm_modifier & 0xFFFFFFFFu)); + wl_buffer *buffer = zwp_linux_buffer_params_v1_create_immed( + params, static_cast(width), static_cast(height), + drm_format, /*flags*/ 0); + zwp_linux_buffer_params_v1_destroy(params); + if (!buffer) { + std::fprintf(stderr, + "[ghastty] SubsurfacePresenter: create_immed returned null " + "(fd=%d %ux%u fmt=0x%x mod=0x%llx)\n", + fd, width, height, drm_format, + static_cast(drm_modifier)); + return; + } + wl_buffer_add_listener(buffer, &kBufferListener, this); + + // Set buffer scale only when it changes — calling on every present + // is harmless but the compositor's bookkeeping is cheaper if we + // skip the redundant request. + if (buffer_scale != m_lastBufferScale) { + wl_surface_set_buffer_scale(m_childSurface, buffer_scale); + m_lastBufferScale = buffer_scale; + } + + wl_surface_attach(m_childSurface, buffer, 0, 0); + // Damage the full buffer extent — terminals tend to update large + // dirty rects anyway (cursor blink, scroll, repaint) so a precise + // damage region wouldn't save much, and `damage_buffer` (vs + // `damage`) uses buffer coordinates so it's resolution-correct + // regardless of buffer_scale. 
+ wl_surface_damage_buffer(m_childSurface, 0, 0, static_cast(width), + static_cast(height)); + wl_surface_commit(m_childSurface); + + wl_display_flush(m_display); + if (int err = wl_display_get_error(m_display); err != 0) { + std::fprintf( + stderr, + "[ghastty] SubsurfacePresenter: wl_display error %d after present\n", + err); + } +} + } // namespace wayland diff --git a/qt/src/wayland/SubsurfacePresenter.h b/qt/src/wayland/SubsurfacePresenter.h index 4c762c61d..daa17968f 100644 --- a/qt/src/wayland/SubsurfacePresenter.h +++ b/qt/src/wayland/SubsurfacePresenter.h @@ -1,67 +1,75 @@ // Wayland subsurface presenter for `GhosttySurface`. // -// Scaffolding for the GPU-direct present path (issue: Phase 2 of the -// dmabuf-as-importable-surface rework). This class owns one -// `wl_subsurface` parented to the `GhosttySurface`'s native -// `wl_surface`. Its eventual job is to receive dmabuf fds from -// libghostty's renderer, wrap each one in a `wl_buffer` via -// `zwp_linux_dmabuf_v1`, and attach it to the subsurface so the -// compositor scans it out directly — bypassing the current mmap + -// memcpy + QImage + QPainter pipeline. -// -// In Phase 2 (this commit) the presenter only creates and tears down -// the subsurface. No buffer is ever attached; the existing -// `presentVulkanDmabuf` path keeps running unchanged. The proof this -// scaffolding works is that `ghastty-vulkan` still launches and -// renders identically with no Wayland protocol errors. +// Owns one `wl_subsurface` parented to the `GhosttySurface`'s native +// `wl_surface`, plus the `zwp_linux_dmabuf_v1` machinery for wrapping +// libghostty's dmabuf fds in `wl_buffer`s and attaching them to that +// subsurface. The compositor scans the buffers out directly — no +// mmap, no memcpy, no QImage, no QPainter blit on the present path. // // Wayland-only by project decision (the Qt frontend is Wayland-only; // see `feedback-qt-no-x11` memory). 
If the host isn't on a Wayland -// QPA platform or the compositor lacks `wl_subcompositor`, -// `tryCreate` returns nullptr — Phase 2 silently ignores that -// because nothing consumes the presenter yet; Phase 3 will treat it -// as fatal. +// QPA platform or the compositor lacks the required globals, +// `tryCreate` returns nullptr — the caller decides whether that's a +// fatal error. #pragma once +#include #include struct wl_display; struct wl_subsurface; struct wl_surface; +struct zwp_linux_dmabuf_v1; class QWindow; namespace wayland { class SubsurfacePresenter { public: - // Build a subsurface parented to `parent`'s native `wl_surface`. + // Build a subsurface parented to `parent`'s native `wl_surface`, + // and bind the linux-dmabuf-v1 global on the same display. // Returns nullptr if any prerequisite is missing (non-Wayland QPA, - // null `wl_display`, `wl_subcompositor` unbindable, etc.). + // null `wl_display`, `wl_subcompositor` unbindable, + // `zwp_linux_dmabuf_v1` unbindable, etc.). // - // Forces `Qt::WA_NativeWindow` on the caller is the *caller's* + // Forcing `Qt::WA_NativeWindow` on the caller is the *caller's* // responsibility — `tryCreate` only reads `parent->surfaceHandle`. static std::unique_ptr tryCreate(QWindow *parent); ~SubsurfacePresenter(); - // Phase-3 accessors: when the present path moves to dmabuf-attach, - // the caller will need the child `wl_surface` to attach buffers to - // and the `wl_display` to flush. Exposed now so the API surface - // doesn't churn between phases. - wl_surface *childSurface() const { return m_childSurface; } - wl_display *display() const { return m_display; } + // Hand a dmabuf-backed frame to the compositor: wrap the fd in a + // `wl_buffer` via `zwp_linux_buffer_params_v1.create_immed`, attach + // to the subsurface, damage, commit. MUST be called on the Qt GUI + // thread (the thread that owns the wl_display dispatch); the + // renderer thread should marshal frames through a Qt-side queue. 
+ // + // libghostty owns the fd; this method does not close it. The + // wayland client library duplicates the fd kernel-side via + // SCM_RIGHTS, so the compositor's reference survives even after + // libghostty reuses or closes its handle. + // + // `buffer_scale` is the Wayland buffer scale factor (1 for stock + // DPI, 2 for HiDPI, etc.) — set on the child surface so the + // compositor scales the buffer correctly relative to the parent's + // surface-local coordinates. + void presentDmabuf(int fd, uint32_t drm_format, uint64_t drm_modifier, + uint32_t width, uint32_t height, uint32_t stride, + int buffer_scale); SubsurfacePresenter(const SubsurfacePresenter &) = delete; SubsurfacePresenter &operator=(const SubsurfacePresenter &) = delete; private: SubsurfacePresenter(wl_display *display, wl_surface *child, - wl_subsurface *sub); + wl_subsurface *sub, zwp_linux_dmabuf_v1 *dmabuf); wl_display *m_display; wl_surface *m_childSurface; wl_subsurface *m_subsurface; + zwp_linux_dmabuf_v1 *m_dmabuf; + int m_lastBufferScale = 0; }; } // namespace wayland diff --git a/src/apprt/embedded.zig b/src/apprt/embedded.zig index b5af8a319..4e9775246 100644 --- a/src/apprt/embedded.zig +++ b/src/apprt/embedded.zig @@ -428,7 +428,12 @@ pub const Platform = union(PlatformTag) { /// host imports it for composition; libghostty retains /// ownership of the underlying VkDeviceMemory and the fd is /// valid only for the duration of the call (host must `dup()` - /// if it needs to hold the fd longer). + /// if it needs to hold the fd longer). `image_backed` tells + /// the host whether the fd was exported from a VkImage + /// (directly importable as a 2D image via linux-dmabuf-v1) + /// or from a VkBuffer (only usable via mmap + CPU readback); + /// see `vulkan/Target.zig` and `include/ghostty.h` for the + /// full rationale. 
present: *const fn ( ?*anyopaque, i32, // dmabuf fd @@ -437,6 +442,7 @@ pub const Platform = union(PlatformTag) { u32, // width (pixels) u32, // height (pixels) u32, // stride (bytes) + bool, // image_backed ) callconv(.c) void, }; @@ -481,6 +487,7 @@ pub const Platform = union(PlatformTag) { u32, u32, u32, + bool, ) callconv(.c) void, }, }; diff --git a/src/renderer/vulkan/Target.zig b/src/renderer/vulkan/Target.zig index 19df63eb4..c857bdaa6 100644 --- a/src/renderer/vulkan/Target.zig +++ b/src/renderer/vulkan/Target.zig @@ -747,6 +747,11 @@ pub fn present(self: *const Self) void { // Fall back to the device's singleton copy when no platform was // attached (only the smoke test does this). const platform = if (self.platform) |p| p else self.device.platform; + // `image_backed` is the host's signal that this fd is importable + // by a 2D-image consumer (Wayland linux-dmabuf-v1, Vulkan + // external image, etc.). True in `.direct` mode where the fd was + // exported from a VkImage; false in `.legacy_copy` where it was + // exported from a VkBuffer and can only be read via mmap. platform.present( platform.userdata, self.fd, @@ -755,6 +760,7 @@ pub fn present(self: *const Self) void { self.width, self.height, self.stride, + self.tiling == .direct, ); }