qt/wayland: cache wl_buffer across presents instead of churning per frame

presentDmabuf was creating a fresh wl_buffer via create_immed and
destroying it on every release event — at 125 FPS (the rate the
animated post-shader timer fires) with multiple panes, that's a
Wayland round-trip + compositor dmabuf import on every frame, and
it dominated GUI-thread CPU at idle (measured ~half the cost in a
single-window steady state).

libghostty re-uses the same dmabuf fd across frames until the next
Target.deinit (a resize), so the shape inputs to create_immed are
stable for hundreds-to-thousands of consecutive frames. Cache the
wl_buffer keyed on the full shape tuple (fd + width + height +
stride + drm_format + drm_modifier + y_invert); re-attach the
cached buffer on every present, only recreate on a key mismatch.

Buffer release listener now no-ops (was destroying on each release)
so the same wl_buffer survives many attach/release cycles. Cache
invalidation paths: shape mismatch in presentDmabuf, and the dtor.

Measured: GUI-thread CPU at idle with animation=always drops from
~16% steady-state to ~10% on this branch. The win compounds with
panes and tabs (each presenter has its own cache).

Co-Authored-By: claude-flow <ruv@ruv.net>
pull/12846/head
Nathan 2026-05-25 20:48:45 -05:00
parent 63b71ac3a1
commit 88788948fa
2 changed files with 108 additions and 40 deletions

View File

@ -220,13 +220,20 @@ wl_display *acquireWaylandDisplay() {
}
// wl_buffer::release listener: the compositor is done sampling the
// buffer for any committed surface state, so we can destroy our
// client-side handle. The underlying dmabuf memory is owned by
// libghostty; we never close that fd here (the SCM_RIGHTS transfer
// in zwp_linux_buffer_params.add gave the compositor its own
// buffer for any committed surface state. We KEEP the wl_buffer
// alive across releases — libghostty re-uses the same dmabuf fd
// across frames until resize, so we re-attach the cached wl_buffer
// on every present (see `m_cachedBuffer` in the header). The buffer
// is destroyed only when (a) the dmabuf shape changes (next
// `presentDmabuf` invalidates the cache) or (b) the presenter is
// destroyed.
//
// The underlying dmabuf memory is owned by libghostty; we never
// close that fd here (the SCM_RIGHTS transfer in
// zwp_linux_buffer_params.add gave the compositor its own
// reference, which lives independently of our wl_buffer).
void bufferRelease(void *, wl_buffer *buffer) {
wl_buffer_destroy(buffer);
void bufferRelease(void *, wl_buffer *) {
// No-op. See cache rationale above.
}
const wl_buffer_listener kBufferListener = {
bufferRelease,
@ -438,6 +445,14 @@ SubsurfacePresenter::SubsurfacePresenter(wl_display *display, wl_surface *child,
}
SubsurfacePresenter::~SubsurfacePresenter() {
// Destroy the cached wl_buffer BEFORE the child surface — the
// buffer may still be attached. wl_buffer_destroy is safe whether
// or not the compositor has released it (Wayland guarantees no
// further events on a destroyed proxy).
if (m_cachedBuffer) {
wl_buffer_destroy(m_cachedBuffer);
m_cachedBuffer = nullptr;
}
if (m_fractionalScale) wp_fractional_scale_v1_destroy(m_fractionalScale);
if (m_viewport) wp_viewport_destroy(m_viewport);
if (m_subsurface) wl_subsurface_destroy(m_subsurface);
@ -515,41 +530,76 @@ void SubsurfacePresenter::presentDmabuf(int fd, uint32_t drm_format,
}
}
// Wrap libghostty's borrowed fd in a wl_buffer.
zwp_linux_buffer_params_v1 *params =
zwp_linux_dmabuf_v1_create_params(m_dmabuf);
if (!params) return;
zwp_linux_buffer_params_v1_add(params, fd, /*plane_idx*/ 0,
/*offset*/ 0, stride,
static_cast<uint32_t>(drm_modifier >> 32),
static_cast<uint32_t>(drm_modifier & 0xFFFFFFFFu));
const uint32_t buffer_flags =
y_invert ? ZWP_LINUX_BUFFER_PARAMS_V1_FLAGS_Y_INVERT : 0;
wl_buffer *buffer = zwp_linux_buffer_params_v1_create_immed(
params, static_cast<int32_t>(width), static_cast<int32_t>(height),
drm_format, buffer_flags);
zwp_linux_buffer_params_v1_destroy(params);
if (!buffer) {
// Surface the wl_display error code if the failure was a
// protocol-fatal error (compositor rejected the buffer with
// `invalid_format` / `invalid_dimensions` / etc., which kills
// the wl_display). Without this, every subsequent presentDmabuf
// call silently no-ops on the dead display and the cause stays
// hidden until something else logs the disconnection.
const int wl_err = wl_display_get_error(m_display);
std::fprintf(stderr,
"[ghastty] SubsurfacePresenter: create_immed returned null "
"(fd=%d %ux%u fmt=0x%x mod=0x%llx wl_display_error=%d)\n",
fd, width, height, drm_format,
static_cast<unsigned long long>(drm_modifier), wl_err);
return;
// Wrap libghostty's borrowed fd in a wl_buffer. Cached across
// frames: libghostty re-uses the same dmabuf fd until the next
// Target.deinit (a resize), so the shape inputs below stay stable
// for hundreds-to-thousands of consecutive frames at an animated-
// shader frame rate. Pre-cache, every present round-tripped
// `create_immed` to the compositor (Wayland sync call + compositor-
// side dmabuf import) and destroyed the buffer on release — ~half
// the GUI-thread CPU at 125 FPS.
const bool cache_hit = m_cachedBuffer != nullptr &&
m_cachedFd == fd &&
m_cachedWidth == width &&
m_cachedHeight == height &&
m_cachedStride == stride &&
m_cachedFormat == drm_format &&
m_cachedModifier == drm_modifier &&
m_cachedYInvert == y_invert;
wl_buffer *buffer = nullptr;
if (cache_hit) {
buffer = m_cachedBuffer;
} else {
// Cache miss — destroy any stale buffer first so a failed
// create_immed below leaves the cache empty (rather than half-
// populated with the previous buffer that no longer matches the
// new inputs).
if (m_cachedBuffer) {
wl_buffer_destroy(m_cachedBuffer);
m_cachedBuffer = nullptr;
m_cachedFd = -1;
}
zwp_linux_buffer_params_v1 *params =
zwp_linux_dmabuf_v1_create_params(m_dmabuf);
if (!params) return;
zwp_linux_buffer_params_v1_add(params, fd, /*plane_idx*/ 0,
/*offset*/ 0, stride,
static_cast<uint32_t>(drm_modifier >> 32),
static_cast<uint32_t>(drm_modifier & 0xFFFFFFFFu));
const uint32_t buffer_flags =
y_invert ? ZWP_LINUX_BUFFER_PARAMS_V1_FLAGS_Y_INVERT : 0;
buffer = zwp_linux_buffer_params_v1_create_immed(
params, static_cast<int32_t>(width), static_cast<int32_t>(height),
drm_format, buffer_flags);
zwp_linux_buffer_params_v1_destroy(params);
if (!buffer) {
// Surface the wl_display error code if the failure was a
// protocol-fatal error (compositor rejected the buffer with
// `invalid_format` / `invalid_dimensions` / etc., which kills
// the wl_display). Without this, every subsequent presentDmabuf
// call silently no-ops on the dead display and the cause stays
// hidden until something else logs the disconnection.
const int wl_err = wl_display_get_error(m_display);
std::fprintf(stderr,
"[ghastty] SubsurfacePresenter: create_immed returned null "
"(fd=%d %ux%u fmt=0x%x mod=0x%llx wl_display_error=%d)\n",
fd, width, height, drm_format,
static_cast<unsigned long long>(drm_modifier), wl_err);
return;
}
// Listener data is unused — see `bufferRelease` for why this is
// nullptr (and the no-op release semantics that make the cache
// safe).
wl_buffer_add_listener(buffer, &kBufferListener, nullptr);
m_cachedBuffer = buffer;
m_cachedFd = fd;
m_cachedWidth = width;
m_cachedHeight = height;
m_cachedStride = stride;
m_cachedFormat = drm_format;
m_cachedModifier = drm_modifier;
m_cachedYInvert = y_invert;
}
// Pass nullptr as listener data — `bufferRelease` does not read it.
// Storing `this` would create a dangling-pointer hazard if the
// SubsurfacePresenter is destroyed before the compositor sends
// `release`; today the listener doesn't dereference `data` so it
// works by accident, but a future addition that reads it would UAF.
wl_buffer_add_listener(buffer, &kBufferListener, nullptr);
// Tell the compositor the destination size in surface-local
// coordinates. With fractional scaling this is the logical pixel

View File

@ -24,6 +24,7 @@
#include <cstdint>
#include <memory>
struct wl_buffer;
struct wl_display;
struct wl_subsurface;
struct wl_surface;
@ -154,6 +155,23 @@ private:
int m_lastDestHeight = 0;
int m_lastX = 0;
int m_lastY = 0;
// wl_buffer cache. libghostty re-uses the same dmabuf fd across
// frames until the next Target.deinit (i.e. until a resize), so
// we can wrap the fd in a wl_buffer ONCE and re-attach it every
// frame instead of round-tripping `create_immed` per present.
// create_immed costs a Wayland round-trip + compositor-side
// dmabuf import; at 125 FPS (animated post shader) with multiple
// panes this was ~half of the GUI-thread CPU at idle. Invalidate
// the cache when any of the dmabuf-shape inputs change.
wl_buffer *m_cachedBuffer = nullptr;
int m_cachedFd = -1;
uint32_t m_cachedWidth = 0;
uint32_t m_cachedHeight = 0;
uint32_t m_cachedStride = 0;
uint32_t m_cachedFormat = 0;
uint64_t m_cachedModifier = 0;
bool m_cachedYInvert = false;
};
} // namespace wayland