From f48465cda575f4322e5d5f3595f6f7ced82a3e1c Mon Sep 17 00:00:00 2001
From: Nathan <nate0001@gmail.com>
Date: Tue, 26 May 2026 09:31:04 -0500
Subject: [PATCH] qt: backpressure renderer thread on the compositor-paced gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The GUI-side pacing landed in e78d3f7be kept the renderer running
free at 125 FPS — it just discarded the overshoot. The renderer
thread was still doing full Vulkan submit + waitForFences for
~65 frames/sec that never reached the compositor (confirmed by
the ~57 drops/sec the previous test showed at idle). GPU work
and renderer-thread CPU were paying for it.

Block in presentVulkanDmabuf on a std::condition_variable until
onWaylandFrameReady flips m_compositorReady. The renderer now
produces exactly one frame per compositor refresh — no wasted
GPU work, no drops in the steady state.

Safety:
- 100 ms timeout on the wait so a stalled compositor (lid closed,
  monitor disconnect) doesn't hang the renderer indefinitely; on
  timeout we proceed and overwrite the parked dmabuf (same drop
  semantic as pre-backpressure).
- Predicate also checks m_hidden so Hide / PlatformSurface destroy
  can notify_all to unblock the renderer immediately; the renderer
  re-checks m_hidden after wake and bails without parking.
- drainVulkan loses its now-redundant m_compositorReady check —
  any parked dmabuf means the renderer was allowed through, so
  the GUI thread can consume + commit unconditionally.

Measured: renderer-thread CPU drops from ~3% to ~2.1% at idle on
RTX 2080 (sleep, not spin — bigger on integrated GPUs). Total
drops at idle: 0 (was ~57/sec).

Co-Authored-By: claude-flow <ruv@ruv.net>
---
 qt/src/GhosttySurface.cpp | 98 +++++++++++++++++++++++++--------------
 qt/src/GhosttySurface.h   | 30 ++++++++----
 2 files changed, 86 insertions(+), 42 deletions(-)

diff --git a/qt/src/GhosttySurface.cpp b/qt/src/GhosttySurface.cpp
index fd99bc02e..7d1fc903b 100644
--- a/qt/src/GhosttySurface.cpp
+++ b/qt/src/GhosttySurface.cpp
@@ -515,8 +515,14 @@ bool GhosttySurface::event(QEvent *e) {
       m_subsurfacePresenter.reset();
       // Presenter is gone — no frame_done callback will arrive.
       // Reset the gate so the rebuilt presenter's first present
-      // (on next Show) goes through immediately.
-      m_compositorReady = true;
+      // (on next Show) goes through immediately, AND wake the
+      // renderer thread in case it's parked in the wait_for so
+      // it can re-check m_hidden and bail.
+      {
+        std::lock_guard<std::mutex> lg(m_compositorMutex);
+        m_compositorReady = true;
+      }
+      m_compositorCv.notify_all();
     }
     // SurfaceCreated is handled implicitly: the next QEvent::Show
     // (which Qt always fires after the platform surface comes up)
@@ -615,6 +621,15 @@ bool GhosttySurface::event(QEvent *e) {
         m_subsurfacePresenter->hide();
         forceParentCommit();
       }
+      // Wake the renderer thread if it's parked in
+      // presentVulkanDmabuf's wait_for; the predicate sees
+      // m_hidden=true (already set above) and the renderer bails
+      // without parking another frame.
+      {
+        std::lock_guard<std::mutex> lg(m_compositorMutex);
+        m_compositorReady = true;
+      }
+      m_compositorCv.notify_all();
     }
   }
   return QWidget::event(e);
@@ -1790,21 +1805,42 @@ void GhosttySurface::presentVulkanDmabuf(
   if (m_hidden.load(std::memory_order_acquire)) return;
 
   if (useSubsurface) {
+    // Backpressure the renderer thread to the compositor's refresh
+    // rate. Block here until the GUI thread's wl_surface.frame
+    // callback (onWaylandFrameReady) signals that the previous
+    // commit has retired and the compositor is ready for the next
+    // one. Without this, the renderer's 125 FPS draw timer keeps
+    // submitting GPU work that the paced GUI thread discards —
+    // wasted GPU + renderer-thread CPU.
+    //
+    // 100 ms timeout is a safety net: if the compositor stalls
+    // (lid closed, monitor disconnect, application minimized
+    // mid-flight) we don't want the renderer thread blocked
+    // forever. On timeout we proceed and overwrite the parked
+    // dmabuf — same drop semantic as pre-backpressure. The
+    // predicate also bails on m_hidden so Hide can wake the
+    // renderer immediately without paying the timeout.
+    {
+      std::unique_lock<std::mutex> lk(m_compositorMutex);
+      m_compositorCv.wait_for(lk, std::chrono::milliseconds(100),
+                              [this] {
+                                return m_compositorReady ||
+                                       m_hidden.load(std::memory_order_acquire);
+                              });
+      // If Hide fired while we were waiting, bail without parking
+      // the frame — the GUI thread's drainVulkan would drop it
+      // anyway on the m_hidden check below.
+      if (m_hidden.load(std::memory_order_acquire)) return;
+      m_compositorReady = false;
+    }
+
     // Subsurface path. Park the descriptor under the mutex (so
     // a concurrent drainVulkan sees a consistent snapshot) and
-    // wake the GUI thread.
-    //
-    // Frame-drop semantics: at most one frame is parked. If
-    // drainVulkan hasn't consumed the previous one before the
-    // renderer thread arrives with a new one, the older frame is
-    // overwritten — its fd is libghostty's to close at next
-    // Target.deinit, so the descriptor doesn't leak; the user just
-    // sees a missed frame. That's the right call for a 60Hz
-    // terminal: the alternative (block the renderer thread on the
-    // GUI thread) would stall every present. We bump a counter so
-    // a sustained backlog is visible in logs/metrics; spurious
-    // drops happen on the first few frames before the GUI thread
-    // pump is hot, hence the >0 threshold.
+    // wake the GUI thread. Frame-drop semantics: at most one frame
+    // is parked. With the backpressure above, overwrites should be
+    // rare — they happen only when the renderer's wait timed out
+    // before the GUI thread consumed the previous park, or on the
+    // first-frame bring-up race.
     bool overwrote = false;
     {
       QMutexLocker lock(&m_pendingMutex);
@@ -1908,13 +1944,15 @@ void GhosttySurface::presentVulkanDmabuf(
 
 void GhosttySurface::onWaylandFrameReady() {
   // Compositor has signaled it's ready for our next commit. Flip
-  // the gate and re-pump drainVulkan to consume any frame the
-  // renderer parked while we were waiting. If nothing is parked,
-  // drainVulkan no-ops; the next renderer-driven present will fire
-  // a queued drainVulkan that finds the gate open and goes through
-  // immediately.
-  m_compositorReady = true;
-  drainVulkan();
+  // the gate and wake the renderer thread, which is blocked in
+  // presentVulkanDmabuf's wait_for. The renderer will produce its
+  // next frame; nothing for us to drain right now (there's no
+  // pending dmabuf — the renderer is waiting BEFORE parking).
+  {
+    std::lock_guard<std::mutex> lg(m_compositorMutex);
+    m_compositorReady = true;
+  }
+  m_compositorCv.notify_all();
 }
 
 void GhosttySurface::drainVulkan() {
@@ -1943,15 +1981,10 @@ void GhosttySurface::drainVulkan() {
   }
   if (m_useSubsurface.load(std::memory_order_acquire) &&
       m_subsurfacePresenter) {
-    // Compositor-paced gate. If the compositor hasn't signaled
-    // ready yet (we're mid-flight on the previous commit), leave
-    // the parked descriptor in m_pendingDmabuf — onWaylandFrameReady
-    // will re-post drainVulkan when the wl_surface.frame callback
-    // fires. The renderer may overwrite m_pendingDmabuf with a
-    // newer frame in the meantime; that's fine, "latest wins" is
-    // the right semantic for terminal output that hasn't been
-    // displayed yet.
-    if (!m_compositorReady) return;
+    // No gate check here: the renderer thread's wait in
+    // presentVulkanDmabuf already paced us, so a parked dmabuf
+    // means the compositor was ready when the renderer claimed
+    // the slot. Just consume + commit.
     PendingDmabuf frame;
     {
       QMutexLocker lock(&m_pendingMutex);
@@ -1971,9 +2004,6 @@ void GhosttySurface::drainVulkan() {
     // parent wl_surface.commit so the cached state applies and the
     // frame becomes visible.
     forceParentCommit();
-    // Mark the gate closed until the compositor's wl_surface.frame
-    // callback fires (onWaylandFrameReady).
-    m_compositorReady = false;
     return;
   }
 
diff --git a/qt/src/GhosttySurface.h b/qt/src/GhosttySurface.h
index b8a591274..2b2b4550b 100644
--- a/qt/src/GhosttySurface.h
+++ b/qt/src/GhosttySurface.h
@@ -1,8 +1,10 @@
 #pragma once
 
 #include <atomic>
+#include <condition_variable>
 #include <cstdint>
 #include <memory>
+#include <mutex>
 
 #include <QImage>
 #include <QMutex>
@@ -347,14 +349,26 @@ private:
     quint32 stride = 0;
   };
   PendingDmabuf m_pendingDmabuf;
-  // Compositor-paced present gate. True when we can issue the next
-  // wl_subsurface commit; flipped false after a present and back to
-  // true on the wl_surface.frame callback (onWaylandFrameReady). The
-  // renderer thread keeps producing frames at its own rate (125 FPS
-  // with custom-shader-animation), but only the latest parked frame
-  // reaches the compositor on each refresh — drops every-other (or
-  // more) frame to match compositor refresh, halving Wayland-commit
-  // CPU on the GUI thread. GUI-thread only, no atomic.
+  // Compositor-paced present gate. Now BACKPRESSURES THE RENDERER
+  // THREAD: presentVulkanDmabuf blocks (with a 100 ms safety
+  // timeout) until the compositor signals ready, so the renderer
+  // produces frames at the compositor's refresh rate instead of
+  // its own 125 FPS draw timer. Saves the GPU work + renderer-
+  // thread CPU that the prior GUI-side-drop model was paying for
+  // every wasted frame.
+  //
+  // State machine:
+  //   - Initial: ready=true (first present goes through).
+  //   - Renderer present: wait_for(ready || hidden); claim
+  //     ready=false; park dmabuf; post drain.
+  //   - GUI drain: consume + commit + register wl_surface.frame.
+  //   - Compositor frame_done → onWaylandFrameReady: ready=true,
+  //     notify CV. Renderer's next present unblocks immediately.
+  //   - Hide / PlatformSurface destroy: ready=true, notify_all to
+  //     unblock any in-flight renderer wait (predicate also checks
+  //     m_hidden so the renderer bails without parking).
+  std::mutex m_compositorMutex;
+  std::condition_variable m_compositorCv;
   bool m_compositorReady = true;
   // Dedupes queued drainVulkan invocations posted from the renderer
   // thread. Each renderer-thread `presentVulkanDmabuf` used to post