diff --git a/qt/CMakeLists.txt b/qt/CMakeLists.txt index 9f5a81e9d..e5729a939 100644 --- a/qt/CMakeLists.txt +++ b/qt/CMakeLists.txt @@ -143,6 +143,15 @@ set(GHOSTTY_SO "${GHOSTTY_LIB_DIR}/ghostty-internal.so") set(GHASTTY_VARIANT "opengl" CACHE STRING "Renderer variant: opengl (default) or vulkan") set_property(CACHE GHASTTY_VARIANT PROPERTY STRINGS opengl vulkan) +# Validate the cache value: STRINGS only constrains the cmake-gui +# dropdown, not the command-line. `-DGHASTTY_VARIANT=foo` would +# otherwise silently fall into the OpenGL branch below. +if(NOT GHASTTY_VARIANT STREQUAL "opengl" AND + NOT GHASTTY_VARIANT STREQUAL "vulkan") + message(FATAL_ERROR + "GHASTTY_VARIANT='${GHASTTY_VARIANT}' is invalid; " + "must be 'opengl' or 'vulkan'.") +endif() if(GHASTTY_VARIANT STREQUAL "vulkan") set(GHASTTY_EXE_NAME "ghastty-vulkan") set(GHASTTY_LIB_SUBDIR "ghastty-vulkan") @@ -189,7 +198,6 @@ add_custom_target(ghostty_link DEPENDS "${GHOSTTY_LINK_SO}") add_executable(ghastty src/main.cpp - src/vulkan/Host.cpp src/actions/ActionDispatcher.cpp src/actions/ChromeActions.cpp src/actions/InputActions.cpp @@ -226,6 +234,15 @@ add_executable(ghastty "${FRACSCALE_HEADER}" ) +# Vulkan host glue is variant-only. Adding it to the OpenGL build +# would force an unconditional libvulkan link on a binary that +# never calls into Vulkan, contradicting the side-by-side +# `~/.local/lib/libghostty.so` story that the variant block above +# documents. +if(GHASTTY_VARIANT STREQUAL "vulkan") + target_sources(ghastty PRIVATE src/vulkan/Host.cpp) +endif() + # Embed the app icon so it is available even running from the build tree. qt_add_resources(ghastty "appicon" PREFIX "/" @@ -250,10 +267,18 @@ target_link_libraries(ghastty PRIVATE PkgConfig::XKBCOMMON PkgConfig::EGL LayerShellQt::Interface - vulkan "${GHOSTTY_LINK_SO}" ) +# libvulkan is Vulkan-variant only. The OpenGL variant compiles +# nothing that references Vulkan symbols (vulkan/Host.cpp is gated +# above), so not linking libvulkan keeps OpenGL-only systems from +# needing the loader installed at runtime — matching the +# documented side-by-side variant story above. +if(GHASTTY_VARIANT STREQUAL "vulkan") + target_link_libraries(ghastty PRIVATE vulkan) +endif() + # Hook up the private QPA headers (see find_package above). # # Qt6::WaylandClientPrivate gives us QtWaylandClient::QWaylandWindow, diff --git a/qt/src/GhosttySurface.cpp b/qt/src/GhosttySurface.cpp index d0b67c22b..2296fc42b 100644 --- a/qt/src/GhosttySurface.cpp +++ b/qt/src/GhosttySurface.cpp @@ -8,7 +8,9 @@ #include "SearchBar.h" #include "TabWidget.h" #include "Util.h" +#ifdef GHASTTY_USE_VULKAN #include "vulkan/Host.h" +#endif #include "wayland/EglDmabufTarget.h" #include "wayland/SubsurfacePresenter.h" @@ -112,12 +114,43 @@ GhosttySurface::GhosttySurface(ghostty_app_t app, MainWindow *owner, // produce a mismatch crash. Mixing GL+VK on the same process // (e.g. NVIDIA's coexistence on one Wayland surface) is also // reportedly fragile. - vulkan::Host *vk_host = nullptr; -#ifdef GHASTTY_USE_VULKAN - vk_host = vulkan::Host::instance(); -#endif + // The "use Vulkan" decision is purely compile-time on this fork: + // each binary is linked against exactly one libghostty.so variant + // (opengl or vulkan). A runtime fallback would just mis-initialize + // the surface against the wrong renderer. + ghostty_surface_config_s sc = + m_parentSurface + ? ghostty_surface_inherited_config(m_parentSurface, + GHOSTTY_SURFACE_CONTEXT_TAB) + : ghostty_surface_config_new(); - if (vk_host == nullptr) { +#ifdef GHASTTY_USE_VULKAN + { + vulkan::Host *vk_host = vulkan::Host::instance(); + if (vk_host == nullptr) { + // libghostty was compiled with -Drenderer=vulkan and there's + // no GL fallback available: libghostty's GL surface init + // would crash on the first call. Fail loudly here. + std::fprintf(stderr, + "[ghastty] Vulkan host bring-up failed (no Vulkan 1.3 " + "GPU with VK_KHR_external_memory_fd + " + "VK_EXT_external_memory_dma_buf). The Vulkan variant " + "of libghostty has no OpenGL fallback — exiting.\n"); + std::abort(); + } + m_useVulkan = true; + sc.platform_tag = GHOSTTY_PLATFORM_VULKAN; + sc.platform.vulkan = vk_host->asPlatform(this); + + // GUI-thread frame delivery is driven by + // `QMetaObject::invokeMethod` (Qt::QueuedConnection) from + // `presentVulkanDmabuf`. The earlier 2 ms safety-net polling + // timer was removed once delivery was shown to be reliable; + // any genuine loss is visible via the dropped-frame counter + // logged from `presentVulkanDmabuf`. + } +#else + { // OpenGL path: stand up the private context + offscreen FBO // libghostty's GL renderer draws into. m_context = new QOpenGLContext(this); @@ -140,33 +173,7 @@ GhosttySurface::GhosttySurface(ghostty_app_t app, MainWindow *owner, fmt.setInternalTextureFormat(GL_RGBA8); m_fbw = m_fbh = 16; m_fbo = new QOpenGLFramebufferObject(QSize(m_fbw, m_fbh), fmt); - } - ghostty_surface_config_s sc = - m_parentSurface - ? ghostty_surface_inherited_config(m_parentSurface, - GHOSTTY_SURFACE_CONTEXT_TAB) - : ghostty_surface_config_new(); - - if (vk_host != nullptr) { - m_useVulkan = true; - sc.platform_tag = GHOSTTY_PLATFORM_VULKAN; - sc.platform.vulkan = vk_host->asPlatform(this); - - // GUI-thread frame drain. The renderer thread wakes us per frame - // via QMetaObject::invokeMethod (Qt::QueuedConnection) on each - // present — see `presentVulkanDmabuf`. The 2 ms timer is a - // safety net: if `invokeMethod` ever fails to deliver (the - // earlier QImage-handoff diagnostics suggested this could - // happen), the next tick drains the parked frame within at most - // 2 ms. Idle case has negligible CPU cost because `drainVulkan` - // returns immediately when nothing is pending. - m_vulkanPollTimer = new QTimer(this); - m_vulkanPollTimer->setInterval(2); - connect(m_vulkanPollTimer, &QTimer::timeout, this, - [this]() { drainVulkan(); }); - m_vulkanPollTimer->start(); - } else { sc.platform_tag = GHOSTTY_PLATFORM_OPENGL; sc.platform.opengl.userdata = this; sc.platform.opengl.get_proc_address = glGetProcAddress; @@ -174,6 +181,7 @@ GhosttySurface::GhosttySurface(ghostty_app_t app, MainWindow *owner, sc.platform.opengl.release_current = glReleaseCurrent; sc.platform.opengl.present = glPresent; } +#endif sc.userdata = this; sc.scale_factor = devicePixelRatioF(); @@ -234,10 +242,12 @@ void GhosttySurface::syncSurfaceSize() { // shave a pixel off the framebuffer relative to the QImage blit. const int w = std::max(1, static_cast(std::lround(width() * dpr))); const int h = std::max(1, static_cast(std::lround(height() * dpr))); - if (w == m_fbw && h == m_fbh && dpr == m_fbDpr) return; + if (w == m_fbw && h == m_fbh && + dpr == m_fbDpr.load(std::memory_order_relaxed)) + return; m_fbw = w; m_fbh = h; - m_fbDpr = dpr; + m_fbDpr.store(dpr, std::memory_order_release); // Vulkan path: libghostty manages the target image itself (it // allocates the dmabuf-exportable VkImage). Tell it the new @@ -405,7 +415,19 @@ bool GhosttySurface::event(QEvent *e) { static_cast(e)->surfaceEventType(); if (type == QPlatformSurfaceEvent::SurfaceAboutToBeDestroyed) { m_useSubsurface.store(false, std::memory_order_release); - m_eglTarget.reset(); + // EglDmabufTarget's destructor deletes a GL framebuffer + + // texture allocated against `m_context`; without that + // context current its `QOpenGLContext::currentContext()` + // check sees the wrong (or no) context and silently skips + // the gl* calls, leaking the resources every time Qt + // re-creates the QPA window (QSplitter reparent, fullscreen + // toggle, screen change). Make the owning context current + // before tearing down. Vulkan-variant builds have no + // `m_context` and skip the makeCurrent. + if (m_eglTarget) { + if (m_context) makeCurrent(); + m_eglTarget.reset(); + } m_subsurfacePresenter.reset(); } // SurfaceCreated is handled implicitly: the next QEvent::Show @@ -623,7 +645,7 @@ void GhosttySurface::renderTerminal() { // (Scaling it to the widget instead made the whole frame — images // included — rubber-band while a resize was in flight.) m_image = m_fbo->toImage(); - m_image.setDevicePixelRatio(m_fbDpr); + m_image.setDevicePixelRatio(m_fbDpr.load(std::memory_order_acquire)); m_fbo->release(); update(); @@ -1537,10 +1559,11 @@ QVariant GhosttySurface::inputMethodQuery(Qt::InputMethodQuery query) const { ghostty_surface_cursor_position(m_surface); // m_fbDpr defaults to 1.0 and only ever takes positive values // from syncSurfaceSize, so dividing is always safe. - return QRect(static_cast(c.x / m_fbDpr), - static_cast(c.y / m_fbDpr), - std::max(1, static_cast(c.width / m_fbDpr)), - std::max(1, static_cast(c.height / m_fbDpr))); + const double dpr = m_fbDpr.load(std::memory_order_acquire); + return QRect(static_cast(c.x / dpr), + static_cast(c.y / dpr), + std::max(1, static_cast(c.width / dpr)), + std::max(1, static_cast(c.height / dpr))); } default: return QWidget::inputMethodQuery(query); @@ -1621,8 +1644,25 @@ void GhosttySurface::presentVulkanDmabuf( image_backed ? 1 : 0, useSubsurface ? "subsurface" : "qimage"); } - if (dmabuf_fd < 0 || width == 0 || height == 0 || stride < width * 4) - return; + // Validate the renderer-supplied dimensions. width / height / + // stride are all u32 and the multiplications below would wrap if + // they're hostile/buggy: + // - `width * 4` (the minimum acceptable stride) wraps for + // width >= 0x40000000, accepting any stride. + // - `stride * height` (the legacy mmap path's byte count) wraps + // to a small size_t when promoted on platforms where size_t + // is 32-bit, causing an under-mapped buffer that we then + // read past. + // Cap on a sane upper bound — 65536×65536 dwarfs any plausible + // terminal — and check that stride*height doesn't exceed + // SIZE_MAX before promoting. + constexpr quint32 MAX_DIM = 65536; + if (dmabuf_fd < 0 || width == 0 || height == 0) return; + if (width > MAX_DIM || height > MAX_DIM) return; + if (stride < static_cast(width) * 4) return; + // stride*height as 64-bit and check the size_t fit explicitly. + const quint64 bytes64 = static_cast(stride) * height; + if (bytes64 > std::numeric_limits::max()) return; // Don't park / dispatch frames while we're hidden — racing the // renderer's final post-Hide frame past presenter.hide() is what @@ -1670,8 +1710,9 @@ void GhosttySurface::presentVulkanDmabuf( return; } - // Fallback: mmap + memcpy into a QImage. - const size_t bytes = static_cast(stride) * height; + // Fallback: mmap + memcpy into a QImage. `bytes64` was computed + // and bounds-checked above. + const size_t bytes = static_cast(bytes64); void *mapped = ::mmap(nullptr, bytes, PROT_READ, MAP_SHARED, dmabuf_fd, 0); if (mapped == MAP_FAILED) { std::fprintf(stderr, "[ghastty] mmap of dmabuf fd=%d failed: %s\n", @@ -1694,7 +1735,8 @@ void GhosttySurface::presentVulkanDmabuf( QImage owned = stamped.copy(); ::munmap(mapped, bytes); - if (m_fbDpr > 0) owned.setDevicePixelRatio(m_fbDpr); + const double dpr_now = m_fbDpr.load(std::memory_order_acquire); + if (dpr_now > 0) owned.setDevicePixelRatio(dpr_now); { QMutexLocker lock(&m_pendingMutex); m_pending = std::move(owned); diff --git a/qt/src/GhosttySurface.h b/qt/src/GhosttySurface.h index 074d9e319..41738d328 100644 --- a/qt/src/GhosttySurface.h +++ b/qt/src/GhosttySurface.h @@ -288,13 +288,14 @@ private: bool m_useVulkan = false; // Cross-thread frame handoff for the Vulkan path. The renderer - // thread calls `presentVulkanDmabuf` with a borrowed dmabuf fd; a - // 16 ms `QTimer` on the GUI thread drains the pending frame and - // routes it through the wl_subsurface (zero-copy) when the - // SubsurfacePresenter is available, or falls back to the - // mmap+memcpy+QImage path otherwise. The polling timer was kept - // (rather than QMetaObject::invokeMethod) because queued lambdas - // from the renderer thread were unreliable in earlier diagnostics. + // thread calls `presentVulkanDmabuf` with a borrowed dmabuf fd + // and posts a queued `drainVulkan` invocation; the GUI thread + // runs `drainVulkan` and routes the parked descriptor through + // either the wl_subsurface presenter (zero-copy) or the + // mmap+memcpy+QImage fallback. The dropped-frame counter + // (`m_droppedFrames`) surfaces any queue-loss that ever happens + // in practice — the earlier safety-net polling timer was + // removed once delivery was shown to be reliable. // // `m_useSubsurface` is set once on the GUI thread when the // presenter comes up; the renderer thread reads it acquire-style @@ -318,7 +319,6 @@ private: // null and paintEvent skips its blit. QImage m_pending; QMutex m_pendingMutex; - QTimer *m_vulkanPollTimer = nullptr; // GL objects for the alpha-premultiply pass. QOpenGLShaderProgram *m_premultProg = nullptr; @@ -326,7 +326,13 @@ private: int m_fbw = 0; // framebuffer size, device pixels int m_fbh = 0; - double m_fbDpr = 1.0; // DPR the framebuffer was sized at + // DPR the framebuffer was sized at. Atomic because the renderer + // thread reads it from `presentVulkanDmabuf` to tag the legacy + // QImage path while the GUI thread writes it from + // `syncSurfaceSize`. `double` writes aren't guaranteed atomic + // across threads on every architecture; std::atomic uses + // CAS-loop fallbacks where needed. + std::atomic m_fbDpr{1.0}; // DPR the framebuffer was sized at QLabel *m_exitOverlay = nullptr; // "process exited" banner; lazily made QLabel *m_keySeqOverlay = nullptr; // pending keybind chord; lazily made diff --git a/qt/src/main.cpp b/qt/src/main.cpp index bb943a60e..3a4fa30a3 100644 --- a/qt/src/main.cpp +++ b/qt/src/main.cpp @@ -11,7 +11,6 @@ #include "GlobalShortcuts.h" #include "MainWindow.h" #include "ghostty.h" -#include "vulkan/Host.h" // True when any argv entry starts with `+` — i.e. the user invoked a // libghostty CLI action (`+show-config`, `+list-fonts`, `+version`, …). diff --git a/qt/src/wayland/SubsurfacePresenter.cpp b/qt/src/wayland/SubsurfacePresenter.cpp index 99bd5b010..f0b42655a 100644 --- a/qt/src/wayland/SubsurfacePresenter.cpp +++ b/qt/src/wayland/SubsurfacePresenter.cpp @@ -49,9 +49,13 @@ void dmabufFormat(void *, zwp_linux_dmabuf_v1 *, uint32_t /*format*/) {} // `modifier` event: compositor advertises one (format, modifier) it // can scan out. Fires once per pair during the bind roundtrip; we -// stash them all in the per-format vector. Duplicate-keyed inserts -// are theoretically possible across compositor restarts but won't -// happen within a single bind round, so we don't dedupe. +// stash them all in the per-format vector. Only fires from inside +// `discoverGlobals` because we keep the dmabuf proxy on a private +// queue that's never dispatched after discovery — see the queue- +// retention comment in `discoverGlobals`. That guarantee is what +// lets the renderer thread read `globals.modifiers` without a +// lock, and is also why we don't bother deduping (one bind round +// only fires each pair once). void dmabufModifier(void *data, zwp_linux_dmabuf_v1 *, uint32_t format, uint32_t modifier_hi, uint32_t modifier_lo) { auto *g = static_cast(data); @@ -140,21 +144,32 @@ PresenterGlobals *discoverGlobals(wl_display *display) { // Move the bound proxies back to the default queue so Qt's main // dispatch drives subsequent events on them, then drop the private // queue. (Same lifecycle dance as `blurManager`.) + // + // EXCEPT the dmabuf proxy: its listener mutates `globals.modifiers` + // on every `modifier` event, and the renderer thread reads that + // map from `supportedDmabufModifiers` without locking. If we + // moved the proxy back to the default queue, a compositor + // restart / hot-plug fires more `modifier` events that would + // race the reader. Keep the proxy on `queue` and intentionally + // never dispatch that queue again — the events queue up + // harmlessly and are reaped at proxy destruction. The map is + // genuinely frozen post-discovery now. if (globals.compositor) wl_proxy_set_queue(reinterpret_cast(globals.compositor), nullptr); if (globals.subcompositor) wl_proxy_set_queue(reinterpret_cast(globals.subcompositor), nullptr); - if (globals.dmabuf) - wl_proxy_set_queue(reinterpret_cast(globals.dmabuf), nullptr); if (globals.viewporter) wl_proxy_set_queue(reinterpret_cast(globals.viewporter), nullptr); if (globals.fractionalScale) wl_proxy_set_queue(reinterpret_cast(globals.fractionalScale), nullptr); - wl_event_queue_destroy(queue); + // We deliberately leak `queue` (and leave globals.dmabuf attached + // to it) for the process lifetime — it has no resources beyond a + // small kernel-side buffer and going away would put dmabuf events + // back on the default queue. return &globals; } @@ -403,6 +418,33 @@ void SubsurfacePresenter::presentDmabuf(int fd, uint32_t drm_format, if (dest_width <= 0) dest_width = 1; if (dest_height <= 0) dest_height = 1; + // Validate the (format, modifier) pair against the compositor's + // advertised list before handing it to `create_immed`. If the + // pair isn't on the list, the compositor will reject the + // subsequent `create_immed` with `invalid_format` — a FATAL + // protocol error that kills the entire wl_display, taking down + // every window in the process. Better to drop this single frame + // than to take down the app. + { + const PresenterGlobals &g = globalState(); + const auto it = g.modifiers.find(drm_format); + bool ok = false; + if (it != g.modifiers.end()) { + for (const uint64_t m : it->second) { + if (m == drm_modifier) { ok = true; break; } + } + } + if (!ok) { + std::fprintf(stderr, + "[ghastty] SubsurfacePresenter: refusing dmabuf " + "(fourcc=0x%08x mod=0x%llx) — compositor doesn't " + "advertise this (format, modifier) pair\n", + drm_format, + static_cast(drm_modifier)); + return; + } + } + // Wrap libghostty's borrowed fd in a wl_buffer. zwp_linux_buffer_params_v1 *params = zwp_linux_dmabuf_v1_create_params(m_dmabuf); diff --git a/qt/src/wayland/SubsurfacePresenter.h b/qt/src/wayland/SubsurfacePresenter.h index 493c50d2b..3c1d3a081 100644 --- a/qt/src/wayland/SubsurfacePresenter.h +++ b/qt/src/wayland/SubsurfacePresenter.h @@ -109,8 +109,13 @@ public: // units of 1/120 (e.g. 144 = 1.2, 180 = 1.5, 240 = 2.0). Returns // 120 (= 1.0) until the compositor sends its first // wp_fractional_scale_v1.preferred_scale event for our surface. - // Renderer / GhosttySurface size their buffers at - // `logical * preferredScale120() / 120` device pixels. + // + // Currently INFORMATIONAL only: GhosttySurface uses Qt's + // devicePixelRatioF() for buffer sizing (which Qt derives from + // the same protocol on Wayland), so the two values agree at + // steady state. Exposed for diagnostics + a future direct- + // protocol path that bypasses Qt's DPR cache lag during a + // screen-change race. uint32_t preferredScale120() const { return m_preferredScale120; } // Stretch the existing subsurface buffer to a new destination diff --git a/src/apprt/embedded.zig b/src/apprt/embedded.zig index 7a850b682..c8702d2b4 100644 --- a/src/apprt/embedded.zig +++ b/src/apprt/embedded.zig @@ -547,24 +547,42 @@ pub const Platform = union(PlatformTag) { .vulkan => vulkan: { const config = c_platform.vulkan; + // Collapse the eight per-callback "MustBeSet" + // variants into a single `error.MissingVulkanCallback`. + // Pre-this, every caller of `Platform.init` had to + // handle 8 separate error tags (or `try` swallow + // them) — eight names that all mean "the host + // didn't fill out one of these fields." Log which + // one was null for diagnostics; the error tag + // itself stays narrow. + const which: ?[]const u8 = blk: { + if (config.get_instance_proc_addr == null) break :blk "get_instance_proc_addr"; + if (config.instance == null) break :blk "instance"; + if (config.physical_device == null) break :blk "physical_device"; + if (config.device == null) break :blk "device"; + if (config.queue == null) break :blk "queue"; + if (config.queue_family_index == null) break :blk "queue_family_index"; + if (config.get_supported_modifiers == null) break :blk "get_supported_modifiers"; + if (config.present == null) break :blk "present"; + break :blk null; + }; + if (which) |name| { + std.log.scoped(.embedded).err( + "ghostty_platform_vulkan_s.{s} is null", + .{name}, + ); + break :vulkan error.MissingVulkanCallback; + } break :vulkan .{ .vulkan = .{ .userdata = config.userdata, - .get_instance_proc_addr = config.get_instance_proc_addr orelse - break :vulkan error.GetInstanceProcAddrMustBeSet, - .instance = config.instance orelse - break :vulkan error.InstanceMustBeSet, - .physical_device = config.physical_device orelse - break :vulkan error.PhysicalDeviceMustBeSet, - .device = config.device orelse - break :vulkan error.DeviceMustBeSet, - .queue = config.queue orelse - break :vulkan error.QueueMustBeSet, - .queue_family_index = config.queue_family_index orelse - break :vulkan error.QueueFamilyIndexMustBeSet, - .get_supported_modifiers = config.get_supported_modifiers orelse - break :vulkan error.GetSupportedModifiersMustBeSet, - .present = config.present orelse - break :vulkan error.PresentMustBeSet, + .get_instance_proc_addr = config.get_instance_proc_addr.?, + .instance = config.instance.?, + .physical_device = config.physical_device.?, + .device = config.device.?, + .queue = config.queue.?, + .queue_family_index = config.queue_family_index.?, + .get_supported_modifiers = config.get_supported_modifiers.?, + .present = config.present.?, } }; }, }; diff --git a/src/renderer/Vulkan.zig b/src/renderer/Vulkan.zig index ec65d51b5..563dbbcc7 100644 --- a/src/renderer/Vulkan.zig +++ b/src/renderer/Vulkan.zig @@ -115,8 +115,9 @@ var device: ?Device = null; var device_refcount: usize = 0; var device_mutex: std.Thread.Mutex = .{}; -/// Per-thread pool of `(VkBuffer, VkDeviceMemory)` pairs that get -/// recycled across frames. Solves two problems together: +/// Process-wide pool of `(VkBuffer, VkDeviceMemory)` pairs recycled +/// across frames on the renderer thread. Solves two problems +/// together: /// /// 1. Lifetime: `vulkan/buffer.zig`'s `Buffer.deinit` is called /// mid-frame (by `renderer/image.zig:draw`'s `defer buf.deinit()`) @@ -130,8 +131,22 @@ var device_mutex: std.Thread.Mutex = .{}; /// Lifecycle: `Buffer.deinit` pushes to `pending`. `Frame.complete` /// after `vkWaitForFences` moves `pending` → `ready`. `Buffer.create` /// scans `ready` for an entry of matching usage + size and pops it -/// before allocating new. The pool only grows; entries get destroyed -/// when the device tears down (`Vulkan.deinit`). +/// before allocating new. +/// +/// Process-wide (not threadlocal) and mutex-protected: splits/tabs +/// run independent renderer threads against the SAME shared +/// VkDevice, and a per-thread pool would mean each thread leaks +/// every staging buffer the other threads release. The mutex is +/// uncontended in the steady state — entries are short-lived and +/// the pool only grows. +/// +/// Caller responsibilities: +/// - Only call `release` from a code path whose VkBuffer reference +/// is bounded by a fence the renderer thread will eventually +/// wait on (i.e. the per-frame command buffer). +/// - For one-shot uploads (e.g. atlas staging) the caller already +/// does `vkQueueWaitIdle` post-submit; that path uses +/// `Buffer.destroyImmediate` which bypasses this pool. pub const buffer_pool = struct { const Entry = struct { buffer: vk.VkBuffer, @@ -140,8 +155,9 @@ pub const buffer_pool = struct { capacity: u64, }; - threadlocal var pending: std.ArrayList(Entry) = .{}; - threadlocal var ready: std.ArrayList(Entry) = .{}; + var mutex: std.Thread.Mutex = .{}; + var pending: std.ArrayList(Entry) = .{}; + var ready: std.ArrayList(Entry) = .{}; /// Queue a buffer for recycling. The buffer cannot be reused /// until the next fence-wait (handled by `cycle`); it sits in @@ -154,6 +170,8 @@ pub const buffer_pool = struct { capacity: u64, ) !void { _ = dev; + mutex.lock(); + defer mutex.unlock(); try pending.append(std.heap.smp_allocator, .{ .buffer = buffer, .memory = memory, @@ -170,6 +188,8 @@ pub const buffer_pool = struct { usage: vk.VkBufferUsageFlags, min_capacity: u64, ) ?Entry { + mutex.lock(); + defer mutex.unlock(); var i: usize = 0; while (i < ready.items.len) : (i += 1) { const e = ready.items[i]; @@ -186,18 +206,19 @@ pub const buffer_pool = struct { /// `Frame.complete` after `vkWaitForFences`. /// /// `dev` is needed only on the OOM fallback path: if `ready` - /// can't grow to absorb `pending`, we destroy the pending - /// VkBuffers / VkDeviceMemory directly instead of leaking them - /// (the alternative would be to leave them in `pending` forever, - /// where each successive frame's `cycle` would try the same - /// failing append on an ever-growing list — guaranteed VkDevice - /// memory exhaustion). + /// can't grow to absorb `pending`, we wait the device idle and + /// then destroy the pending entries directly so the next frame + /// doesn't double up on a pending list that can never drain. pub fn cycle(dev: *const Device) void { + mutex.lock(); + defer mutex.unlock(); ready.appendSlice(std.heap.smp_allocator, pending.items) catch { - // Couldn't grow `ready` — destroy the GPU resources now - // (the GPU is provably done with them, the fence wait - // already returned) so the next frame doesn't double up - // on a pending list that can never drain. + // Couldn't grow `ready` — destroy the pending GPU + // resources directly. Other renderer threads may still + // be submitting against the shared queue, so wait the + // device idle to make sure no command buffer in flight + // anywhere references these handles before we destroy. + _ = dev.dispatch.deviceWaitIdle(dev.device); for (pending.items) |e| { dev.dispatch.destroyBuffer(dev.device, e.buffer, null); dev.dispatch.freeMemory(dev.device, e.memory, null); @@ -207,8 +228,10 @@ pub const buffer_pool = struct { } /// Tear down both lists. Call only when the device is idle - /// (`vkDeviceWaitIdle` or surface destroy). + /// (`vkDeviceWaitIdle` or final surface destroy). pub fn drainAll(dev: *const Device) void { + mutex.lock(); + defer mutex.unlock(); for (pending.items) |e| { dev.dispatch.destroyBuffer(dev.device, e.buffer, null); dev.dispatch.freeMemory(dev.device, e.memory, null); @@ -248,6 +271,28 @@ threadlocal var frame_cb: vk.VkCommandBuffer = null; /// in `Frame.complete` before handing the target dmabuf to the host. threadlocal var frame_fence: vk.VkFence = null; +/// Per-thread descriptor pool used by `RenderPass.step` to allocate +/// fresh descriptor sets when the same pipeline is bound more than +/// once in a single pass (vkCmdDraw reads descriptors at submit +/// time, so re-using the pipeline's static set would silently +/// corrupt prior draws). Reset at the start of every `beginFrame` +/// so this frame's allocations don't pile on the previous frame's; +/// the per-pass usage is bounded by a small constant — see the +/// `step_pool_*` caps below. +threadlocal var step_pool: ?DescriptorPool = null; + +/// Caps for the per-frame `step_pool`. Sized for the worst pass +/// shape (kitty image with N placements + the post pipelines): one +/// set per (image_step × MAX_DESCRIPTOR_SETS) plus a handful of +/// the renderer's other pipelines stepped once each. 256 is generous +/// — actual frames stabilize well under that. If a frame ever +/// exhausts the pool, `RenderPass.step` falls back to the pipeline's +/// static set with a warning logged. +const STEP_POOL_MAX_SETS: u32 = 256; +const STEP_POOL_UNIFORM_BUFFERS: u32 = 256; +const STEP_POOL_COMBINED_IMAGE_SAMPLERS: u32 = 256; +const STEP_POOL_STORAGE_BUFFERS: u32 = 256; + // ---- lifecycle ---------------------------------------------------------- pub fn init(alloc: Allocator, opts: rendererpkg.Options) !Vulkan { @@ -301,8 +346,27 @@ pub fn deinit(self: *Vulkan) void { // per surface), so it's always safe to clean them up regardless // of other surfaces' state. if (device) |*d| { - d.waitIdle(); + // Per-surface teardown only needs THIS surface's submissions + // to be done — block on this thread's frame fence (if it + // exists) instead of `vkDeviceWaitIdle` on the shared device, + // which would stall every other tab/split's in-flight GPU + // work just to close one. The final-refcount path below does + // the device-wide waitIdle. if (frame_fence != null) { + const wait_r = d.dispatch.waitForFences( + d.device, + 1, + &frame_fence, + vk.VK_TRUE, + std.math.maxInt(u64), + ); + if (wait_r != vk.VK_SUCCESS) { + log.warn( + "Vulkan.deinit: vkWaitForFences returned {}, falling back to device-wide wait", + .{wait_r}, + ); + d.waitIdle(); + } d.dispatch.destroyFence(d.device, frame_fence, null); frame_fence = null; } @@ -314,13 +378,14 @@ pub fn deinit(self: *Vulkan) void { p.deinit(); frame_pool = null; } + if (step_pool) |*p| { + p.deinit(); + step_pool = null; + } // `last_target` is a borrow into this thread's FrameState // target slot. The SwapChain teardown destroys the target; // we just drop our reference. last_target = null; - // Recycle this thread's pooled buffers — the waitIdle above - // proves no GPU work references them anymore. - buffer_pool.drainAll(d); } // Decrement the shared-device refcount; only the last surface @@ -330,9 +395,17 @@ pub fn deinit(self: *Vulkan) void { // renderer thread. device_mutex.lock(); defer device_mutex.unlock(); + std.debug.assert(device_refcount > 0); device_refcount -= 1; if (device_refcount == 0) { - if (device) |*d| d.deinit(); + // Last surface: NOW we can safely drain the global buffer + // pool and tear the device down. The waitIdle is needed + // because non-final deinits skipped it. + if (device) |*d| { + d.waitIdle(); + buffer_pool.drainAll(d); + d.deinit(); + } device = null; } self.* = undefined; @@ -499,16 +572,37 @@ pub fn beginFrame( if (dev.dispatch.createFence(dev.device, &fence_info, null, &frame_fence) != vk.VK_SUCCESS) return error.VulkanFailed; } + if (step_pool == null) { + step_pool = try DescriptorPool.init(.{ + .device = dev, + .max_sets = STEP_POOL_MAX_SETS, + .uniform_buffers = STEP_POOL_UNIFORM_BUFFERS, + .combined_image_samplers = STEP_POOL_COMBINED_IMAGE_SAMPLERS, + .storage_buffers = STEP_POOL_STORAGE_BUFFERS, + }); + } _ = self; - // Reset the command buffer + fence so this frame starts clean. + // Reset the command buffer + fence + step descriptor pool so + // this frame starts clean. `vkResetDescriptorPool` returns every + // set the previous frame allocated to the pool — much cheaper + // than freeing them individually, and removes any chance of + // last-frame's set being bound by accident. if (dev.dispatch.resetCommandBuffer(frame_cb, 0) != vk.VK_SUCCESS) return error.VulkanFailed; if (dev.dispatch.resetFences(dev.device, 1, &frame_fence) != vk.VK_SUCCESS) return error.VulkanFailed; + if (step_pool) |*p| { + if (dev.dispatch.resetDescriptorPool(dev.device, p.pool, 0) != vk.VK_SUCCESS) + return error.VulkanFailed; + } return try Frame.begin( - .{ .cb = frame_cb, .fence = frame_fence }, + .{ + .cb = frame_cb, + .fence = frame_fence, + .step_pool = if (step_pool) |*p| p else null, + }, dev, renderer, target, diff --git a/src/renderer/shadertoy.zig b/src/renderer/shadertoy.zig index 7fe3142f7..f85b98271 100644 --- a/src/renderer/shadertoy.zig +++ b/src/renderer/shadertoy.zig @@ -217,17 +217,28 @@ pub fn glslFromShader( try writer.writeAll(prefix); } else { // Find the first newline after `#version ...` and inject the - // defines on the following line. We assume the prefix begins - // with a `#version` directive on its own line (true today; - // the comptime split below would crash loudly otherwise). - const first_nl = std.mem.indexOfScalar(u8, prefix, '\n').?; - try writer.writeAll(prefix[0 .. first_nl + 1]); - for (defines) |def| { - try writer.writeAll("#define "); - try writer.writeAll(def); - try writer.writeAll("\n"); + // defines on the following line. The prefix is expected to + // start with `#version` followed by a newline; if a future + // edit ever drops that newline (e.g. a single-line prefix) + // we inject the defines BEFORE the prefix so glslang sees + // the directives on their own lines and reports a clear + // error instead of us crashing on a `null.?` unwrap. + if (std.mem.indexOfScalar(u8, prefix, '\n')) |first_nl| { + try writer.writeAll(prefix[0 .. first_nl + 1]); + for (defines) |def| { + try writer.writeAll("#define "); + try writer.writeAll(def); + try writer.writeAll("\n"); + } + try writer.writeAll(prefix[first_nl + 1 ..]); + } else { + for (defines) |def| { + try writer.writeAll("#define "); + try writer.writeAll(def); + try writer.writeAll("\n"); + } + try writer.writeAll(prefix); } - try writer.writeAll(prefix[first_nl + 1 ..]); } try writer.writeAll("\n\n"); try writer.writeAll(src); @@ -506,4 +517,3 @@ test "shadertoy to glsl" { const test_crt = @embedFile("shaders/test_shadertoy_crt.glsl"); const test_invalid = @embedFile("shaders/test_shadertoy_invalid.glsl"); -const test_focus = @embedFile("shaders/test_shadertoy_focus.glsl"); diff --git a/src/renderer/vulkan/DescriptorPool.zig b/src/renderer/vulkan/DescriptorPool.zig index 9248eb2b5..373074ae6 100644 --- a/src/renderer/vulkan/DescriptorPool.zig +++ b/src/renderer/vulkan/DescriptorPool.zig @@ -47,6 +47,28 @@ device: *const Device, pool: vk.VkDescriptorPool, pub fn init(opts: Options) Error!Self { + // Vulkan spec requires `maxSets > 0` and `poolSizeCount > 0` — + // a pool that vends N sets but doesn't admit any descriptor + // type would be useless and is rejected by some drivers + // (loose drivers accept it and fail at allocation time). Catch + // both shapes here so the caller gets a clear error instead of + // a downstream allocation failure. + if (opts.max_sets == 0) { + log.err("DescriptorPool.init: max_sets must be > 0", .{}); + return error.VulkanFailed; + } + if (opts.uniform_buffers == 0 and + opts.combined_image_samplers == 0 and + opts.storage_buffers == 0) + { + log.err( + "DescriptorPool.init: at least one per-type cap must be > 0 " ++ + "(uniform_buffers, combined_image_samplers, storage_buffers)", + .{}, + ); + return error.VulkanFailed; + } + // Build a small VkDescriptorPoolSize array from whichever caps // are non-zero. Vulkan accepts an array; we cap at 3 entries // matching the three types `Options` exposes. @@ -78,11 +100,12 @@ pub fn init(opts: Options) Error!Self { .sType = vk.VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, .pNext = null, // No FREE_DESCRIPTOR_SET_BIT — we tear down by destroying - // the pool, which matches the per-frame reset pattern. + // the pool (or `vkResetDescriptorPool` for the per-frame + // step pool). .flags = 0, .maxSets = opts.max_sets, .poolSizeCount = n, - .pPoolSizes = if (n > 0) &sizes else null, + .pPoolSizes = &sizes, }; var pool: vk.VkDescriptorPool = undefined; const r = opts.device.dispatch.createDescriptorPool( diff --git a/src/renderer/vulkan/Device.zig b/src/renderer/vulkan/Device.zig index ec6bd524e..1801dbb06 100644 --- a/src/renderer/vulkan/Device.zig +++ b/src/renderer/vulkan/Device.zig @@ -195,6 +195,7 @@ pub const Dispatch = struct { // the actual renderer integration lands. createDescriptorPool: std.meta.Child(vk.PFN_vkCreateDescriptorPool), destroyDescriptorPool: std.meta.Child(vk.PFN_vkDestroyDescriptorPool), + resetDescriptorPool: std.meta.Child(vk.PFN_vkResetDescriptorPool), allocateDescriptorSets: std.meta.Child(vk.PFN_vkAllocateDescriptorSets), updateDescriptorSets: std.meta.Child(vk.PFN_vkUpdateDescriptorSets), cmdBindDescriptorSets: std.meta.Child(vk.PFN_vkCmdBindDescriptorSets), @@ -218,6 +219,12 @@ queue_family_index: u32, /// `error.UnsupportedVulkanVersion`). api_version: u32, +/// Cached `VkPhysicalDeviceMemoryProperties`. The properties are +/// immutable for the physical device's lifetime, so we query once +/// at `init` time instead of on every `findMemoryType` call (which +/// happens for every Buffer/Texture/Target allocation). +memory_properties: vk.VkPhysicalDeviceMemoryProperties, + dispatch: Dispatch, /// Process-wide mutex protecting access to `queue`. Vulkan requires @@ -351,10 +358,41 @@ pub fn init( // ---- 4. extension check -------------------------------------- var ext_count: u32 = 0; - _ = enumerate_device_extension_properties(physical_device, null, &ext_count, null); + { + const r = enumerate_device_extension_properties(physical_device, null, &ext_count, null); + // SUCCESS or INCOMPLETE both populate `ext_count`. INCOMPLETE + // shouldn't happen on the count-only call (no buffer to + // truncate) but we accept it defensively. + if (r != vk.VK_SUCCESS and r != vk.VK_INCOMPLETE) { + log.err("vkEnumerateDeviceExtensionProperties (count) failed: result={}", .{r}); + return error.HostHandleMissing; + } + } const exts = try alloc.alloc(vk.VkExtensionProperties, ext_count); defer alloc.free(exts); - _ = enumerate_device_extension_properties(physical_device, null, &ext_count, exts.ptr); + { + const r = enumerate_device_extension_properties(physical_device, null, &ext_count, exts.ptr); + if (r != vk.VK_SUCCESS and r != vk.VK_INCOMPLETE) { + log.err("vkEnumerateDeviceExtensionProperties (fill) failed: result={}", .{r}); + return error.HostHandleMissing; + } + // VK_INCOMPLETE here means the extension list grew between + // the count and fill calls (race with a driver hot-reload — + // very unlikely in practice but spec-permitted). The + // partially-filled buffer is still authoritative for the + // entries it does contain, but a required extension not yet + // populated would be missed. Treat as a hard fail since the + // extension presence check below would silently pass on a + // truncated list. + if (r == vk.VK_INCOMPLETE) { + log.err( + "vkEnumerateDeviceExtensionProperties returned INCOMPLETE; " ++ + "device extension list changed between count and fill", + .{}, + ); + return error.HostHandleMissing; + } + } inline for (REQUIRED_DEVICE_EXTENSIONS) |required| { var found = false; @@ -501,6 +539,8 @@ pub fn init( try dl.load(vk.PFN_vkCreateDescriptorPool, "vkCreateDescriptorPool"); const destroy_descriptor_pool = try dl.load(vk.PFN_vkDestroyDescriptorPool, "vkDestroyDescriptorPool"); + const reset_descriptor_pool = + try dl.load(vk.PFN_vkResetDescriptorPool, "vkResetDescriptorPool"); const allocate_descriptor_sets = try dl.load(vk.PFN_vkAllocateDescriptorSets, "vkAllocateDescriptorSets"); const update_descriptor_sets = @@ -508,6 +548,12 @@ pub fn init( const cmd_bind_descriptor_sets = try dl.load(vk.PFN_vkCmdBindDescriptorSets, "vkCmdBindDescriptorSets"); + // Snapshot the memory properties once. They never change for + // the device's lifetime, so per-allocation re-queries (which + // findMemoryType used to do) were pure waste. + var memory_properties: vk.VkPhysicalDeviceMemoryProperties = undefined; + get_physical_device_memory_properties(physical_device, &memory_properties); + return .{ .platform = platform, .instance = instance, @@ -516,6 +562,7 @@ pub fn init( .queue = queue, .queue_family_index = queue_family_index, .api_version = props.apiVersion, + .memory_properties = memory_properties, .dispatch = .{ .getPhysicalDeviceProperties = get_physical_device_properties, .getPhysicalDeviceMemoryProperties = get_physical_device_memory_properties, @@ -579,6 +626,7 @@ pub fn init( .cmdCopyImageToBuffer = cmd_copy_image_to_buffer, .createDescriptorPool = create_descriptor_pool, .destroyDescriptorPool = destroy_descriptor_pool, + .resetDescriptorPool = reset_descriptor_pool, .allocateDescriptorSets = allocate_descriptor_sets, .updateDescriptorSets = update_descriptor_sets, .cmdBindDescriptorSets = cmd_bind_descriptor_sets, @@ -593,9 +641,16 @@ pub fn deinit(self: *Device) void { } /// Block until the device is idle. Useful before tearing down -/// renderer resources to make sure no command buffers are in flight. +/// renderer resources to make sure no command buffers are in +/// flight. On `VK_ERROR_DEVICE_LOST` (or any other failure) we +/// log the result so callers proceeding to destroy resources on +/// a dead device leave a diagnostic crumb instead of silently +/// crashing on the subsequent vkDestroy*. pub fn waitIdle(self: *const Device) void { - _ = self.dispatch.deviceWaitIdle(self.device); + const r = self.dispatch.deviceWaitIdle(self.device); + if (r != vk.VK_SUCCESS) { + log.warn("vkDeviceWaitIdle returned {}; teardown proceeding anyway", .{r}); + } } /// Find a `VkMemoryType` index satisfying the requirements from a @@ -609,8 +664,7 @@ pub fn findMemoryType( type_bits: u32, required_props: vk.VkMemoryPropertyFlags, ) ?u32 { - var props: vk.VkPhysicalDeviceMemoryProperties = undefined; - self.dispatch.getPhysicalDeviceMemoryProperties(self.physical_device, &props); + const props = &self.memory_properties; var i: u32 = 0; while (i < props.memoryTypeCount) : (i += 1) { const bit: u32 = @as(u32, 1) << @intCast(i); diff --git a/src/renderer/vulkan/Frame.zig b/src/renderer/vulkan/Frame.zig index 5c3d04a82..0b9d6faa2 100644 --- a/src/renderer/vulkan/Frame.zig +++ b/src/renderer/vulkan/Frame.zig @@ -37,6 +37,7 @@ const vk = @import("vulkan").c; const Device = @import("Device.zig"); const Target = @import("Target.zig"); +const DescriptorPool = @import("DescriptorPool.zig"); const RenderPass = @import("RenderPass.zig"); const Vulkan = @import("../Vulkan.zig"); @@ -53,6 +54,15 @@ pub const Options = struct { /// Fence that gets signaled when the submit completes. Caller /// resets it to unsignaled before `begin` is called. fence: vk.VkFence, + + /// Per-frame descriptor pool. `RenderPass.step` borrows it for + /// the per-call descriptor sets it allocates whenever a + /// pipeline is re-used within a single pass. The pool is + /// caller-owned (top-level `Vulkan.zig` keeps it threadlocal) + /// and must be reset (`vkResetDescriptorPool`) by the caller + /// before each Frame.begin so this frame's allocations don't + /// pile on the previous frame's. + step_pool: ?*DescriptorPool = null, }; pub const Error = error{ @@ -67,6 +77,7 @@ renderer: *Renderer, target: *Target, cb: vk.VkCommandBuffer, fence: vk.VkFence, +step_pool: ?*DescriptorPool = null, /// Begin recording a frame. The command buffer is reset and started /// with `ONE_TIME_SUBMIT` since we always submit before the next @@ -95,6 +106,7 @@ pub fn begin( .target = target, .cb = opts.cb, .fence = opts.fence, + .step_pool = opts.step_pool, }; } @@ -112,75 +124,102 @@ pub fn complete(self: *const Self, sync: bool) void { _ = sync; const dev = self.device; + // `health` becomes `.unhealthy` on any GPU-side error below. We + // ALWAYS run `buffer_pool.cycle` and `frameCompleted` on the + // way out — skipping them on error left every retired buffer + // stuck in `pending` (unbounded growth) and held the renderer's + // swap-chain semaphore forever, so the NEXT `drawFrame` would + // hang with no diagnostic. + var health: Health = .healthy; + var submitted = false; + // Make the rendered pixels visible to the host's mmap read. In // `.direct` mode this is just a memory barrier; in `.legacy_copy` // mode it also runs `vkCmdCopyImageToBuffer`. See `Target.zig`. self.target.recordPresentBarrier(self.cb); - { + end_cb: { const r = dev.dispatch.endCommandBuffer(self.cb); if (r != vk.VK_SUCCESS) { log.err("vkEndCommandBuffer (frame) failed: result={}", .{r}); - return; + health = .unhealthy; + break :end_cb; } - } - const submit_info: vk.VkSubmitInfo = .{ - .sType = vk.VK_STRUCTURE_TYPE_SUBMIT_INFO, - .pNext = null, - .waitSemaphoreCount = 0, - .pWaitSemaphores = null, - .pWaitDstStageMask = null, - .commandBufferCount = 1, - .pCommandBuffers = &self.cb, - .signalSemaphoreCount = 0, - .pSignalSemaphores = null, - }; - { + const submit_info: vk.VkSubmitInfo = .{ + .sType = vk.VK_STRUCTURE_TYPE_SUBMIT_INFO, + .pNext = null, + .waitSemaphoreCount = 0, + .pWaitSemaphores = null, + .pWaitDstStageMask = null, + .commandBufferCount = 1, + .pCommandBuffers = &self.cb, + .signalSemaphoreCount = 0, + .pSignalSemaphores = null, + }; // Externally-synchronized via `Device.queueSubmit` — splits // and tabs share the host's VkQueue and Vulkan rejects // concurrent unsynchronized access. - const r = dev.queueSubmit(1, &submit_info, self.fence); - if (r != vk.VK_SUCCESS) { - log.err("vkQueueSubmit (frame) failed: result={}", .{r}); - return; + const sr = dev.queueSubmit(1, &submit_info, self.fence); + if (sr != vk.VK_SUCCESS) { + log.err("vkQueueSubmit (frame) failed: result={}", .{sr}); + health = .unhealthy; + break :end_cb; } - } + submitted = true; - // Wait for the GPU to finish writing the target before letting - // the host import the dmabuf. UINT64_MAX = "wait indefinitely". - { - const r = dev.dispatch.waitForFences( + // Wait for the GPU to finish writing the target before letting + // the host import the dmabuf. UINT64_MAX = "wait indefinitely". + const wr = dev.dispatch.waitForFences( dev.device, 1, &self.fence, vk.VK_TRUE, std.math.maxInt(u64), ); - if (r != vk.VK_SUCCESS) { - log.err("vkWaitForFences (frame) failed: result={}", .{r}); + if (wr != vk.VK_SUCCESS) { + log.err("vkWaitForFences (frame) failed: result={}", .{wr}); + health = .unhealthy; } } - // Recycle the per-frame Buffer pool now that the fence has - // signaled — every VkBuffer queued during this frame's - // recording is provably no longer in use by the GPU and is - // safe to hand to the next `Buffer.create` call. See - // `Vulkan.buffer_pool` for the lifecycle. - Vulkan.buffer_pool.cycle(dev); + // Recycle the per-frame Buffer pool. Even on the error path we + // still want to cycle: buffers that the failed submit referenced + // are now stuck (we can't prove the GPU is done with them), so + // we conservatively wait the device idle on the unhealthy path + // before draining. Without this, every failed submit leaks + // every buffer the renderer queued for that frame. + if (health == .unhealthy and !submitted) { + // Submit never happened — nothing in flight references + // recorded buffers, safe to cycle directly. + Vulkan.buffer_pool.cycle(dev); + } else if (health == .unhealthy) { + // Submit happened but fence wait failed (DEVICE_LOST etc.). + // Drain the device before recycling to avoid use-after-free + // on whatever queue is still ticking. + _ = dev.dispatch.deviceWaitIdle(dev.device); + Vulkan.buffer_pool.cycle(dev); + } else { + Vulkan.buffer_pool.cycle(dev); + } - // Hand the rendered target off to the host via `Vulkan.present`, - // which both calls the platform's present callback AND records - // the target pointer for `presentLastTarget` no-op republishes. - self.renderer.api.present(self.target) catch |err| { - log.err("present failed: {}", .{err}); - }; + // Hand the rendered target off to the host. On the unhealthy + // path we skip present — the dmabuf may be partially written + // and the host should see the previous frame instead (the + // generic renderer's no-op-frame logic re-presents + // `last_target`). + if (health == .healthy) { + self.renderer.api.present(self.target) catch |err| { + log.err("present failed: {}", .{err}); + health = .unhealthy; + }; + } // Tell the generic renderer the frame is done so it releases the // swap-chain semaphore. Without this, `SwapChain.nextFrame()` // blocks the second call to `drawFrame` forever (one buffer in - // the chain, never freed). - self.renderer.frameCompleted(.healthy); + // the chain, never freed). MUST run regardless of `health`. + self.renderer.frameCompleted(health); } /// Begin a render pass recording into this frame's command buffer. @@ -193,6 +232,7 @@ pub inline fn renderPass( return RenderPass.begin(.{ .device = self.device, .cb = self.cb, + .step_pool = self.step_pool, .attachments = attachments, }); } diff --git a/src/renderer/vulkan/Pipeline.zig b/src/renderer/vulkan/Pipeline.zig index ec556ff95..324b3fdfd 100644 --- a/src/renderer/vulkan/Pipeline.zig +++ b/src/renderer/vulkan/Pipeline.zig @@ -136,9 +136,27 @@ layout: vk.VkPipelineLayout, /// `RenderPass.step` skips updating/binding it). `set_count` is one /// past the last non-null index, matching what /// `vkCmdBindDescriptorSets` needs as `setCount`. +/// +/// HOT-PATH NOTE: these sets are SHARED across all `step()` calls +/// that bind this pipeline within a single command buffer, but +/// `vkCmdDraw` reads descriptors at submit time, so re-using the +/// same pipeline twice with different per-call resources would +/// cause both draws to see the LAST update's bindings. +/// `RenderPass.step` defends against this by allocating a fresh +/// per-call set from the pass's `step_pool` whenever the per-step +/// resources differ; these `descriptor_sets[i]` slots act as +/// pre-warmed defaults (used only when the call site is +/// single-step-per-pipeline like bg_color / cell_bg). descriptor_sets: [MAX_DESCRIPTOR_SETS]vk.VkDescriptorSet = .{ null, null, null }, set_count: u32 = 0, +/// Descriptor set layouts associated with this pipeline, indexed by +/// set number. `null` matches a `null` slot in `descriptor_sets`. +/// Stored so `RenderPass.step` can allocate per-call sets from the +/// pass's per-frame descriptor pool without round-tripping through +/// the original `Shaders.init` layout-creation code path. +descriptor_set_layouts: [MAX_DESCRIPTOR_SETS]vk.VkDescriptorSetLayout = .{ null, null, null }, + /// Binding number that `Step.uniforms` writes to within set 0. /// Defaults to 1 to match `common.glsl`'s /// `layout(binding = 1, std140) uniform Globals`. Override per @@ -395,14 +413,20 @@ pub fn init(opts: Options) Error!Self { return error.VulkanFailed; } } + errdefer dev.dispatch.destroyPipeline(dev.device, pipeline, null); // Allocate one descriptor set per non-null entry in - // `opts.descriptor_set_layouts`. Null entries are placeholder + // `opts.descriptor_set_layouts`. Null entries are placeholders // (the shader's set=i isn't actually used) — nothing to allocate. + // Also remember the layouts on `Self` so `RenderPass.step` can + // allocate fresh per-call sets from a per-frame pool without + // re-creating layouts. var dsets: [MAX_DESCRIPTOR_SETS]vk.VkDescriptorSet = .{ null, null, null }; + var dsls: [MAX_DESCRIPTOR_SETS]vk.VkDescriptorSetLayout = .{ null, null, null }; if (opts.descriptor_pool) |pool_ptr| { for (opts.descriptor_set_layouts, 0..) |maybe_dsl, i| { if (maybe_dsl) |dsl| { + dsls[i] = dsl; dsets[i] = pool_ptr.allocate(dsl) catch |err| { log.err( "Pipeline.init: descriptor set {} allocation failed: {}", @@ -412,6 +436,10 @@ pub fn init(opts: Options) Error!Self { }; } } + } else { + for (opts.descriptor_set_layouts, 0..) |maybe_dsl, i| { + if (maybe_dsl) |dsl| dsls[i] = dsl; + } } return .{ @@ -419,6 +447,7 @@ pub fn init(opts: Options) Error!Self { .pipeline = pipeline, .layout = layout, .descriptor_sets = dsets, + .descriptor_set_layouts = dsls, .set_count = @intCast(opts.descriptor_set_layouts.len), .sampler = opts.sampler, .vertex_stride = if (opts.vertex_input) |vi| vi.stride else 0, diff --git a/src/renderer/vulkan/RenderPass.zig b/src/renderer/vulkan/RenderPass.zig index f679d2f14..b3db3ce7e 100644 --- a/src/renderer/vulkan/RenderPass.zig +++ b/src/renderer/vulkan/RenderPass.zig @@ -18,6 +18,7 @@ const Self = @This(); const std = @import("std"); const vk = @import("vulkan").c; +const DescriptorPool = @import("DescriptorPool.zig"); const Device = @import("Device.zig"); const Pipeline = @import("Pipeline.zig"); const Sampler = @import("Sampler.zig"); @@ -57,6 +58,16 @@ pub const Options = struct { /// by the enclosing `Frame`. cb: vk.VkCommandBuffer, + /// Per-frame descriptor pool. Used by `step` to allocate fresh + /// descriptor sets on the SECOND and later step() calls that + /// bind the same pipeline within this pass — without it, + /// mutating the pipeline's static `descriptor_sets[i]` for the + /// second call would overwrite the first call's bindings before + /// the GPU has read them (vkCmdDraw reads at submit time). + /// Optional: passes that never re-use a pipeline (bg_color, + /// cell_bg, cell_text) work without it. + step_pool: ?*DescriptorPool = null, + /// Color attachments for the pass. With dynamic rendering each /// attachment is a render target + optional clear color. attachments: []const Attachment, @@ -114,8 +125,21 @@ pub const Error = error{ attachments: []const Options.Attachment, cb: vk.VkCommandBuffer, device: *const Device, +step_pool: ?*DescriptorPool = null, step_number: usize = 0, +/// VkPipeline handles already used by an earlier `step` in this +/// pass. On second-and-later use of the same pipeline we allocate +/// a fresh per-call descriptor set from `step_pool` instead of +/// mutating `pipeline.descriptor_sets[i]` (vkCmdDraw reads at +/// submit time, so re-updating the same set in place would +/// overwrite the prior call's bindings before the GPU has read +/// them). Capacity covers our worst case: per-pass image draws +/// can fire dozens of pipeline reuses. The slice is empty when no +/// step_pool was provided. +seen_pipelines: [MAX_SEEN_PIPELINES]vk.VkPipeline = .{null} ** MAX_SEEN_PIPELINES, +seen_pipelines_len: usize = 0, + /// Last `Step.uniforms` value seen in this pass. The OpenGL backend /// keeps the bound UBO across draw calls implicitly (GL state /// persists), and the renderer's image/overlay draw calls in @@ -126,6 +150,13 @@ step_number: usize = 0, /// to null at `begin`. last_uniforms: ?vk.VkBuffer = null, +/// Cap on the number of distinct pipelines we'll track per pass +/// for "first-use vs re-use" detection. The renderer's pass shape +/// is: bg_color (1), cell_bg (1), cell_text (1), bg_image (1), +/// image (varies). 8 is generous; we degrade gracefully to "always +/// allocate fresh" past this cap. +const MAX_SEEN_PIPELINES: usize = 8; + /// Begin a render pass. Transitions the first attachment to /// `COLOR_ATTACHMENT_OPTIMAL` and opens a `vkCmdBeginRendering` /// scope with the caller's clear color (defaults to opaque black). @@ -138,6 +169,7 @@ pub fn begin(opts: Options) Self { .attachments = opts.attachments, .cb = opts.cb, .device = opts.device, + .step_pool = opts.step_pool, }; if (opts.attachments.len == 0) return self; @@ -340,6 +372,48 @@ pub fn step(self: *Self, s: Step) void { } } + // Pick effective descriptor sets for this step. + // + // First time we see a given pipeline within this pass, we use + // its pre-allocated `descriptor_sets[]` slots and update them + // in place — cheap and avoids a per-pass-pool allocation in + // the common single-step case (bg_color/cell_bg/cell_text). + // + // SECOND-and-later use of the same pipeline within the same + // pass requires fresh sets: vkCmdDraw reads the descriptor + // contents at SUBMIT time, so re-updating the static sets in + // place would silently make every prior draw bound to this + // pipeline read the LAST update's UBO/sampler/storage. The + // image / kitty path issues N draws on the same `image` + // pipeline with per-call vertex buffers and textures — without + // this fix every kitty image rendered with the FINAL image's + // texture and the final draw's vertex buffer. + // + // The fresh sets come from `step_pool`, owned by the enclosing + // Frame and reset at frame start. When `step_pool` is null + // (test harnesses, smoke tests) we fall back to the static + // sets and accept the limitation. + var effective_sets: [Pipeline.MAX_DESCRIPTOR_SETS]vk.VkDescriptorSet = + s.pipeline.descriptor_sets; + const reused = self.markPipelineUsed(s.pipeline.pipeline); + if (reused) if (self.step_pool) |pool| { + for (s.pipeline.descriptor_set_layouts, 0..) |maybe_dsl, i| { + if (i >= s.pipeline.set_count) break; + const dsl = maybe_dsl orelse continue; + if (pool.allocate(dsl)) |fresh| { + effective_sets[i] = fresh; + } else |err| { + log.err( + "RenderPass.step: per-call descriptor set " ++ + "allocation for set {} failed ({}); falling " ++ + "back to the pipeline's static set, which " ++ + "may corrupt prior draws on this pipeline", + .{ i, err }, + ); + } + } + }; + // ---- update descriptor sets --------------------------------- // // We do one vkUpdateDescriptorSets call per descriptor write to @@ -353,7 +427,7 @@ pub fn step(self: *Self, s: Step) void { // supply one. Track the new one for later steps. const ubo: ?vk.VkBuffer = s.uniforms orelse self.last_uniforms; if (s.uniforms) |b| self.last_uniforms = b; - if (s.pipeline.descriptor_sets[0] != null) if (ubo) |ubo_buffer| { + if (effective_sets[0] != null) if (ubo) |ubo_buffer| { const buffer_info: vk.VkDescriptorBufferInfo = .{ .buffer = ubo_buffer, .offset = 0, @@ -362,7 +436,7 @@ pub fn step(self: *Self, s: Step) void { const write: vk.VkWriteDescriptorSet = .{ .sType = vk.VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, .pNext = null, - .dstSet = s.pipeline.descriptor_sets[0], + .dstSet = effective_sets[0], .dstBinding = s.pipeline.uniforms_binding, .dstArrayElement = 0, .descriptorCount = 1, @@ -375,7 +449,7 @@ pub fn step(self: *Self, s: Step) void { }; // Samplers (set 1) - if (s.pipeline.descriptor_sets[1] != null) { + if (effective_sets[1] != null) { const slot_count = @max(s.textures.len, s.samplers.len); for (0..slot_count) |slot| { const tex_opt: ?Texture = if (slot < s.textures.len) s.textures[slot] else null; @@ -396,7 +470,7 @@ pub fn step(self: *Self, s: Step) void { const write: vk.VkWriteDescriptorSet = .{ .sType = vk.VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, .pNext = null, - .dstSet = s.pipeline.descriptor_sets[1], + .dstSet = effective_sets[1], .dstBinding = @intCast(slot), .dstArrayElement = 0, .descriptorCount = 1, @@ -411,7 +485,7 @@ pub fn step(self: *Self, s: Step) void { // Storage buffers (set 2). `buffers[0]` is reserved for the // vertex buffer (handled above), so storage starts at slot 1. - if (s.pipeline.descriptor_sets[2] != null and s.buffers.len > 1) { + if (effective_sets[2] != null and s.buffers.len > 1) { for (s.buffers[1..], 1..) |maybe_buf, slot| { const buf = maybe_buf orelse continue; const buffer_info: vk.VkDescriptorBufferInfo = .{ @@ -422,7 +496,7 @@ pub fn step(self: *Self, s: Step) void { const write: vk.VkWriteDescriptorSet = .{ .sType = vk.VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, .pNext = null, - .dstSet = s.pipeline.descriptor_sets[2], + .dstSet = effective_sets[2], .dstBinding = @intCast(slot), .dstArrayElement = 0, .descriptorCount = 1, @@ -443,19 +517,19 @@ pub fn step(self: *Self, s: Step) void { // contiguous run of non-null sets. var start: usize = 0; while (start < s.pipeline.set_count) { - if (s.pipeline.descriptor_sets[start] == null) { + if (effective_sets[start] == null) { start += 1; continue; } var end = start + 1; - while (end < s.pipeline.set_count and s.pipeline.descriptor_sets[end] != null) : (end += 1) {} + while (end < s.pipeline.set_count and effective_sets[end] != null) : (end += 1) {} dev.dispatch.cmdBindDescriptorSets( self.cb, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, s.pipeline.layout, @intCast(start), @intCast(end - start), - &s.pipeline.descriptor_sets[start], + &effective_sets[start], 0, null, ); @@ -477,6 +551,26 @@ pub fn step(self: *Self, s: Step) void { self.step_number += 1; } +/// Mark `pipeline` as used in this pass and report whether it was +/// already seen. Returns `false` on the FIRST call (so `step` can +/// safely update the pipeline's static descriptor sets in place); +/// `true` on every subsequent call (so `step` allocates fresh sets +/// from `step_pool` to avoid clobbering the prior call's bindings). +/// +/// Beyond `MAX_SEEN_PIPELINES` we conservatively report `true` so +/// callers always allocate fresh — the alternative (silently +/// reverting to in-place updates) is the bug this whole mechanism +/// exists to prevent. +fn markPipelineUsed(self: *Self, pipeline: vk.VkPipeline) bool { + for (self.seen_pipelines[0..self.seen_pipelines_len]) |seen| { + if (seen == pipeline) return true; + } + if (self.seen_pipelines_len >= MAX_SEEN_PIPELINES) return true; + self.seen_pipelines[self.seen_pipelines_len] = pipeline; + self.seen_pipelines_len += 1; + return false; +} + /// Close the rendering scope and leave the attachment in a layout /// the host can read back via the dmabuf export. `GENERAL` is the /// safest choice for unknown consumer access patterns; the host diff --git a/src/renderer/vulkan/Target.zig b/src/renderer/vulkan/Target.zig index 513674a54..0a554a6b3 100644 --- a/src/renderer/vulkan/Target.zig +++ b/src/renderer/vulkan/Target.zig @@ -198,12 +198,17 @@ fn pickModifier( // work for AMD/Intel LINEAR but the compositor attach would // fail, so treat it as "no intersection." var host_mods: [MAX_MODIFIERS]u64 = undefined; - const host_count = dev.platform.get_supported_modifiers( + const host_returned = dev.platform.get_supported_modifiers( dev.platform.userdata, drm_format, &host_mods, MAX_MODIFIERS, ); + // Clamp defensively. The C ABI contract is "host returns ≤ capacity", + // but we don't get to assume the host's implementation is correct + // — and in safe builds an OOB read on `host_mods[..host_returned]` + // panics, hiding the real diagnostic. + const host_count: usize = @min(host_returned, MAX_MODIFIERS); if (host_count == 0) { log.warn( "host advertises no dmabuf modifiers for format 0x{x}; " ++ @@ -465,7 +470,22 @@ fn initDirect(opts: Options, drm_format: u32, chosen_mod: u64) Error!Self { .fd = fd, .drm_format = drm_format, .drm_modifier = actual_mod, - .stride = @intCast(layout.rowPitch), + .stride = stride: { + // VkSubresourceLayout.rowPitch is u64 but the platform + // present callback accepts u32 stride. For a sanely- + // sized terminal target stride fits comfortably in u32, + // but vendor-tiled drivers at exotic resolutions could + // legitimately exceed it. Fail the init explicitly + // instead of letting `@intCast` panic in safe builds. + if (layout.rowPitch > std.math.maxInt(u32)) { + log.err( + "Target.initDirect: rowPitch {} > u32 max; refusing direct mode", + .{layout.rowPitch}, + ); + return error.UnsupportedFormat; + } + break :stride @intCast(layout.rowPitch); + }, }; } diff --git a/src/renderer/vulkan/Texture.zig b/src/renderer/vulkan/Texture.zig index 9d34506ce..366e1a963 100644 --- a/src/renderer/vulkan/Texture.zig +++ b/src/renderer/vulkan/Texture.zig @@ -74,6 +74,12 @@ image: vk.VkImage, memory: vk.VkDeviceMemory, view: vk.VkImageView, format: vk.VkFormat, +/// Aspect mask the image was created with (e.g. COLOR_BIT for +/// renderable textures, DEPTH_BIT for depth attachments). Stored +/// so per-frame `replaceRegion` barrier/copy use the same aspect +/// the image view was made with — hardcoding COLOR_BIT here was a +/// silent validation error for any non-color caller. +aspect: vk.VkImageAspectFlags, width: usize, height: usize, device: *const Device, @@ -207,6 +213,7 @@ pub fn init( .memory = memory, .view = view, .format = opts.format, + .aspect = opts.aspect, .width = width, .height = height, .device = dev, @@ -249,7 +256,15 @@ pub fn replaceRegion( .device = dev, .usage = vk.VK_BUFFER_USAGE_TRANSFER_SRC_BIT, }, data); - defer staging.deinit(); + // `destroyImmediate` instead of `deinit`: replaceRegion runs + // synchronously on the calling thread (typically the main / + // app-init thread, NOT the renderer thread), and + // `OneShot.endAndSubmit` below calls `vkQueueWaitIdle` so the + // staging buffer is provably done with the GPU before this + // defer fires. Routing it into `Vulkan.buffer_pool` from a + // non-renderer thread would leak it forever — the pool's + // `cycle()` runs only on the renderer thread. + defer staging.destroyImmediate(); // ---- command pool (one-shot) -------------------------------- var pool = try CommandPool.init(dev); @@ -279,7 +294,7 @@ pub fn replaceRegion( .dstQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED, .image = self.image, .subresourceRange = .{ - .aspectMask = vk.VK_IMAGE_ASPECT_COLOR_BIT, + .aspectMask = self.aspect, .baseMipLevel = 0, .levelCount = 1, .baseArrayLayer = 0, @@ -304,7 +319,7 @@ pub fn replaceRegion( .bufferRowLength = 0, // tightly packed .bufferImageHeight = 0, .imageSubresource = .{ - .aspectMask = vk.VK_IMAGE_ASPECT_COLOR_BIT, + .aspectMask = self.aspect, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1, @@ -343,7 +358,7 @@ pub fn replaceRegion( .dstQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED, .image = self.image, .subresourceRange = .{ - .aspectMask = vk.VK_IMAGE_ASPECT_COLOR_BIT, + .aspectMask = self.aspect, .baseMipLevel = 0, .levelCount = 1, .baseArrayLayer = 0, diff --git a/src/renderer/vulkan/buffer.zig b/src/renderer/vulkan/buffer.zig index 388717441..cd73eccce 100644 --- a/src/renderer/vulkan/buffer.zig +++ b/src/renderer/vulkan/buffer.zig @@ -83,20 +83,26 @@ pub fn Buffer(comptime T: type) type { return self; } + /// Hand the (VkBuffer, VkDeviceMemory) pair back to the + /// process-wide pool. The pool (see `Vulkan.buffer_pool`) + /// holds the entry until the current frame's fence has + /// signaled (the GPU is done with our recorded references) + /// and then makes it available to a future `Buffer.create` + /// call. Returning to the pool solves both: + /// - `renderer/image.zig:draw`'s `defer buf.deinit()` no + /// longer use-after-frees the in-flight buffer. + /// - It avoids the per-frame allocation thrash that + /// drove the driver to SIGSEGV on image-heavy frames. + /// + /// MUST be called only from the renderer thread (the path + /// whose fence will eventually retire references to this + /// buffer in `Frame.complete`). One-shot uploads (atlas + /// staging buffers, etc.) that already block on + /// `vkQueueWaitIdle` post-submit must use + /// `destroyImmediate` instead — they don't share the + /// renderer thread's fence cycle. pub fn deinit(self: Self) void { const dev = self.opts.device; - // Hand the (VkBuffer, VkDeviceMemory) pair back to the - // process-wide pool instead of destroying it. The pool - // (see `Vulkan.buffer_pool`) holds the entry until the - // current frame's fence has signaled (the GPU is done - // with our recorded references) and then makes it - // available to a future `Buffer.create` call. Returning - // to the pool solves BOTH: - // - `renderer/image.zig:draw`'s `defer buf.deinit()` - // no longer use-after-frees the in-flight buffer. - // - It avoids the per-frame allocation thrash that - // drove the driver to SIGSEGV on image-heavy - // frames. const bp = @import("../Vulkan.zig").buffer_pool; const capacity_bytes: u64 = @as(u64, self.len) * @sizeOf(T); bp.release( @@ -106,16 +112,35 @@ pub fn Buffer(comptime T: type) type { self.opts.usage, capacity_bytes, ) catch { - // OOM growing the pool — fall back to immediate - // destroy. Logging here is awkward (no logger in - // scope) so we accept the loud failure and let - // Vulkan stderr diagnose any use-after-free that - // follows. + // OOM growing the pool. The buffer may still be + // referenced by an in-flight command buffer, so we + // wait the entire device idle before destroying — + // expensive but correct. Logging here is awkward (no + // logger in scope) so we accept the loud failure and + // let Vulkan stderr diagnose anything that follows. + _ = dev.dispatch.deviceWaitIdle(dev.device); dev.dispatch.destroyBuffer(dev.device, self.buffer, null); dev.dispatch.freeMemory(dev.device, self.memory, null); }; } + /// Destroy the buffer immediately, bypassing the recycle + /// pool. The caller MUST ensure no in-flight command buffer + /// references this buffer (e.g. by having waited on a fence + /// or `vkQueueWaitIdle` covering its submission). + /// + /// Used by short-lived staging buffers like + /// `Texture.replaceRegion` whose lifetime is bounded by a + /// `OneShot.endAndSubmit` that already drains the queue; + /// stuffing those into the pool from a non-renderer thread + /// would leak them (the renderer thread's `cycle` runs the + /// pool, so an upload thread's pushes never get reused). + pub fn destroyImmediate(self: Self) void { + const dev = self.opts.device; + dev.dispatch.destroyBuffer(dev.device, self.buffer, null); + dev.dispatch.freeMemory(dev.device, self.memory, null); + } + /// Replace the buffer's contents. Grows (doubles) if needed — /// matches the OpenGL backend's behavior. Data shorter than /// the current capacity leaves the trailing slots untouched. @@ -235,14 +260,30 @@ pub fn Buffer(comptime T: type) type { }; } - /// Grow the buffer to hold at least `new_len` Ts. Destroys - /// and recreates the underlying VkBuffer (Vulkan buffers are - /// immutable in size). Contents are discarded — callers + /// Grow the buffer to hold at least `new_len` Ts. Vulkan + /// buffers are immutable in size, so we route the old + /// buffer through the recycle pool (it may still be + /// referenced by the in-flight command buffer — destroying + /// it directly would race the GPU same as `deinit` would) + /// and create a fresh one. Contents are discarded; callers /// always `sync` immediately after `grow` returns. fn grow(self: *Self, new_len: usize) Error!void { const dev = self.opts.device; - dev.dispatch.destroyBuffer(dev.device, self.buffer, null); - dev.dispatch.freeMemory(dev.device, self.memory, null); + const bp = @import("../Vulkan.zig").buffer_pool; + const capacity_bytes: u64 = @as(u64, self.len) * @sizeOf(T); + bp.release( + dev, + self.buffer, + self.memory, + self.opts.usage, + capacity_bytes, + ) catch { + // OOM appending to the pool — wait the device idle + // and destroy directly. Same fallback as `deinit`. + _ = dev.dispatch.deviceWaitIdle(dev.device); + dev.dispatch.destroyBuffer(dev.device, self.buffer, null); + dev.dispatch.freeMemory(dev.device, self.memory, null); + }; const replacement = try create(self.opts, new_len); self.* = replacement; } diff --git a/src/renderer/vulkan/shaders.zig b/src/renderer/vulkan/shaders.zig index 1927fe3fe..767e11d5d 100644 --- a/src/renderer/vulkan/shaders.zig +++ b/src/renderer/vulkan/shaders.zig @@ -67,9 +67,20 @@ fn processIncludes(comptime contents: [:0]const u8) [:0]const u8 { var i: usize = 0; while (i < contents.len) { if (std.mem.startsWith(u8, contents[i..], "#include")) { - std.debug.assert(std.mem.startsWith(u8, contents[i..], "#include \"")); - const start = i + "#include \"".len; - const end = std.mem.indexOfScalarPos(u8, contents, start, '"').?; + // Skip whitespace (space or tab) between `#include` and + // the opening quote. The previous literal-prefix + // `startsWith("#include \"")` assert tripped on legal + // `#include\t"…"` and `#include "…"` variants. Accept + // any horizontal whitespace and require exactly one + // double-quoted path. + var p = i + "#include".len; + while (p < contents.len and (contents[p] == ' ' or contents[p] == '\t')) : (p += 1) {} + if (p >= contents.len or contents[p] != '"') { + @compileError("processIncludes: malformed #include directive in shader"); + } + const start = p + 1; + const end = std.mem.indexOfScalarPos(u8, contents, start, '"') orelse + @compileError("processIncludes: unterminated #include path"); return std.fmt.comptimePrint("{s}{s}{s}", .{ contents[0..i], @embedFile("../shaders/glsl/" ++ contents[start..end]), @@ -178,6 +189,12 @@ pub fn vulkanizeGlsl( var i: usize = 0; while (i < src.len) { + // Skip comments + string literals verbatim — anything + // that looks like an identifier inside one of those is + // not a real token, and rewriting it (e.g. a comment + // that mentions `gl_VertexID` or `texture(atlas_*, ...)`) + // would silently corrupt the shader source. + if (try copySkippable(alloc, &out, src, &i)) continue; const c = src[i]; const is_ident_start = isIdentChar(c); if (is_ident_start) { @@ -245,6 +262,11 @@ pub fn vulkanizeGlsl( var i: usize = 0; while (i < pass1.len) { + // Skip comments + string literals verbatim, same reason as + // pass 1 — rewriting `layout(binding=…)` text inside a + // comment would inject a `set =` qualifier into a comment + // that's never compiled, harmless today but a footgun. + if (try copySkippable(alloc, &out, pass1, &i)) continue; if (matchKeyword(pass1, i, "layout")) |layout_end| { // Skip whitespace between `layout` and `(`. var p = layout_end; @@ -307,6 +329,52 @@ pub fn vulkanizeGlsl( return try out.toOwnedSliceSentinel(alloc, 0); } +/// If position `i` in `src` is the start of a GLSL line comment +/// (`//...\n`), block comment (`/* ... */`), or `"..."` string +/// literal, copy the whole token verbatim into `out`, advance +/// `*i` past it, and return true. Otherwise `*i` is unchanged +/// and we return false. +/// +/// Strings are unusual in GLSL but `#extension` directives can +/// quote in some preprocessor flavors, and the safe thing is to +/// leave any quoted run untouched. +fn copySkippable( + alloc: std.mem.Allocator, + out: *std.ArrayList(u8), + src: []const u8, + i: *usize, +) std.mem.Allocator.Error!bool { + const start = i.*; + if (start >= src.len) return false; + if (start + 1 < src.len and src[start] == '/' and src[start + 1] == '/') { + var p = start; + while (p < src.len and src[p] != '\n') : (p += 1) {} + try out.appendSlice(alloc, src[start..p]); + i.* = p; + return true; + } + if (start + 1 < src.len and src[start] == '/' and src[start + 1] == '*') { + var p = start + 2; + while (p + 1 < src.len and !(src[p] == '*' and src[p + 1] == '/')) : (p += 1) {} + // Include the closing `*/` if found; otherwise consume to EOF. + const end = if (p + 1 < src.len) p + 2 else src.len; + try out.appendSlice(alloc, src[start..end]); + i.* = end; + return true; + } + if (src[start] == '"') { + var p = start + 1; + while (p < src.len and src[p] != '"') : (p += 1) { + if (src[p] == '\\' and p + 1 < src.len) p += 1; + } + const end = if (p < src.len) p + 1 else src.len; + try out.appendSlice(alloc, src[start..end]); + i.* = end; + return true; + } + return false; +} + fn isIdentChar(c: u8) bool { return (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z') or @@ -314,15 +382,6 @@ fn isIdentChar(c: u8) bool { c == '_'; } -/// True if the first non-space, non-comment character at or after -/// position `i` in `src` is `(`. Used to recognize a function call -/// when the caller is positioned right after the identifier name. -fn nextNonSpaceIsOpenParen(src: []const u8, i: usize) bool { - var p = i; - while (p < src.len and isAnySpace(src[p])) : (p += 1) {} - return p < src.len and src[p] == '('; -} - /// Names of samplers we create with `unnormalized_coordinates = /// VK_TRUE`. The shaders here all use only the two atlas samplers /// for cell_text; if more get added (or renamed) update this list. @@ -457,9 +516,11 @@ pub const Module = struct { return error.GlslangFailed; }; - const translated = vulkanizeGlsl(alloc, src) catch { - return error.GlslangFailed; - }; + // vulkanizeGlsl returns `Allocator.Error` only — surface it + // via `try` (Module.Error includes Allocator.Error) so OOM + // doesn't get reported as `error.GlslangFailed`. Conflating + // them masked the actual failure mode in earlier diagnostics. + const translated = try vulkanizeGlsl(alloc, src); defer alloc.free(translated); const c = glslang.c; @@ -671,14 +732,19 @@ const empty_pipeline: Pipeline = .{ /// `Shaders.deinit` walks the same set in reverse to destroy /// pipelines, layouts, samplers, the descriptor pool, and modules. pub const Shaders = struct { + /// Borrowed pointer to the host-owned VkDevice wrapper. Stored + /// so `deinit` can reach the device dispatch table without + /// reaching into an arbitrary module's `.device` field (which + /// would silently break if `Modules` is restructured). The + /// pointer outlives `Shaders` because the device is process- + /// global in `Vulkan.zig`. + device: *const Device, pipelines: PipelineCollection, /// One per user-supplied custom shader. Built by `Shaders.init` /// from the `post_shaders` arg — empty when no custom shaders. - /// Owned by `Shaders` (deinit destroys each). + /// Owned by `Shaders` (deinit destroys each + frees the slice + /// using the allocator passed to `deinit`). post_pipelines: []Pipeline, - /// Allocator used to allocate `post_pipelines`; held so deinit - /// can free the slice. - post_alloc: ?Allocator = null, /// Compiled `VkShaderModule`s for each user shader, parallel to /// `post_pipelines`. Owned by `Shaders` (deinit destroys each). post_modules: []Module = &.{}, @@ -1188,12 +1254,17 @@ pub const Shaders = struct { post_modules = try alloc.alloc(Module, post_shaders.len); errdefer alloc.free(post_modules); - // Init counter so partial failures can deinit only what - // was built. - var built: usize = 0; + // Init counters so partial failures deinit exactly what + // was built. We track modules and pipelines separately + // because the inner loop creates a module first, then + // tries to build a pipeline against it — if Pipeline.init + // fails after Module.initFromSpirv succeeded, the module + // is populated but the pipeline isn't. + var modules_built: usize = 0; + var pipelines_built: usize = 0; errdefer { - for (post_pipelines[0..built]) |p| p.deinit(); - for (post_modules[0..built]) |m| m.deinit(); + for (post_pipelines[0..pipelines_built]) |p| p.deinit(); + for (post_modules[0..modules_built]) |m| m.deinit(); } // Shared descriptor set layouts across post pipelines. @@ -1224,6 +1295,7 @@ pub const Shaders = struct { } const spv_words: []const u32 = std.mem.bytesAsSlice(u32, @as([]align(@alignOf(u32)) const u8, @alignCast(spv_bytes))); post_modules[i] = try Module.initFromSpirv(device, spv_words, .fragment); + modules_built = i + 1; post_pipelines[i] = try Pipeline.init(.{ .device = device, .descriptor_pool = &pool, @@ -1236,14 +1308,14 @@ pub const Shaders = struct { .blending_enabled = false, .topology = vk.VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, }); - built = i + 1; + pipelines_built = i + 1; } } return .{ + .device = device, .pipelines = pipelines, .post_pipelines = post_pipelines, - .post_alloc = if (post_shaders.len > 0) alloc else null, .post_modules = post_modules, .modules = modules, .descriptor_pool = pool, @@ -1292,7 +1364,6 @@ pub const Shaders = struct { } pub fn deinit(self: *Shaders, alloc: Allocator) void { - _ = alloc; if (self.defunct) return; self.defunct = true; @@ -1314,10 +1385,12 @@ pub const Shaders = struct { // pipeline first (holds VkPipelineLayout), then shader module. for (self.post_pipelines) |p| p.deinit(); for (self.post_modules) |m| m.deinit(); - if (self.post_alloc) |a| { - a.free(self.post_pipelines); - a.free(self.post_modules); - } + // The slices were allocated from the same allocator the + // caller hands to deinit (the renderer's `self.alloc`). + // Use it directly — the previous `post_alloc` field was + // an extra source of truth for the same value. + if (self.post_pipelines.len > 0) alloc.free(self.post_pipelines); + if (self.post_modules.len > 0) alloc.free(self.post_modules); // Atlas sampler held by `Shaders` for the cell_text pipeline's // texture bindings. @@ -1331,7 +1404,7 @@ pub const Shaders = struct { // Destroy every descriptor set layout we created. The empty // placeholder is one of the entries. - const dev = self.modules.full_screen_vert.device; + const dev = self.device; for (self.set_layouts[0..self.set_layouts_len]) |dsl| { if (dsl != null) dev.dispatch.destroyDescriptorSetLayout( dev.device,