unicode: isolate properties, tables, and ziglyph into separate files (#8810)

This makes it cleaner visually where the separation of concerns is. There is now the generic `Properties.zig`, and then the implementation-specific `props_<impl>.zig` files. Despite Zig's lazy analysis, I find this is much easier to understand as a human. Doing this resulted in finding one part in `src/terminal` where we were still inadvertently using ziglyph directly instead of our LUTs! I switched this out. After this PR, `src/terminal` as a standalone module no longer depends on `ziglyph` at all.[^1] cc @jacobsandlund this is going to cause conflicts in your PR. I'm sorry about that. But it should make it cleaner to bring in the uucode work by adding a dedicated `props_uucode.zig` file! [^1]: Why would I be talking about `src/terminal` as a standalone module? That's interesting.
2025-09-20 15:15:15 -07:00 · 2025-09-20 15:15:15 -07:00 · 511314e1a1
parent a96b2abf7c bf1278deff
commit 511314e1a1
5 changed files with 194 additions and 6 deletions
--- a/src/terminal/Terminal.zig
+++ b/src/terminal/Terminal.zig
@ -344,7 +344,7 @@ pub fn print(self: *Terminal, c: u21) !void {
            // VS15 makes it narrow.
            if (c == 0xFE0F or c == 0xFE0E) {
                // This only applies to emoji
-                const prev_props = unicode.getProperties(prev.cell.content.codepoint);
+                const prev_props = unicode.table.get(prev.cell.content.codepoint);
                const emoji = prev_props.grapheme_boundary_class.isExtendedPictographic();
                if (!emoji) return;

@ -470,7 +470,7 @@ pub fn print(self: *Terminal, c: u21) !void {

        // If this is a emoji variation selector, prev must be an emoji
        if (c == 0xFE0F or c == 0xFE0E) {
-            const prev_props = unicode.getProperties(prev.content.codepoint);
+            const prev_props = unicode.table.get(prev.content.codepoint);
            const emoji = prev_props.grapheme_boundary_class == .extended_pictographic;
            if (!emoji) return;
        }
--- a/src/unicode/Properties.zig
+++ b/src/unicode/Properties.zig
@ -0,0 +1,75 @@
+//! Property set per codepoint that Ghostty cares about.
+//!
+//! Adding to this lets you find new properties but also potentially makes
+//! our lookup tables less efficient. Any changes to this should run the
+//! benchmarks in src/bench to verify that we haven't regressed.
+const Properties = @This();
+
+const std = @import("std");
+
+/// Codepoint width. We clamp to [0, 2] since Ghostty handles control
+/// characters and we max out at 2 for wide characters (i.e. 3-em dash
+/// becomes a 2-em dash).
+width: u2 = 0,
+
+/// Grapheme boundary class.
+grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
+
+/// Reports whether two property sets agree on every field.
+/// Needed for lut.Generator.
+pub fn eql(a: Properties, b: Properties) bool {
+    if (a.width != b.width) return false;
+    return a.grapheme_boundary_class == b.grapheme_boundary_class;
+}
+
+/// Writes this property set to `writer` in Zig anonymous struct literal
+/// syntax (`.{ .width= N, .grapheme_boundary_class= .tag }`). The `layout`
+/// and `opts` arguments are ignored. Needed for lut.Generator.
+pub fn format(
+    self: Properties,
+    comptime layout: []const u8,
+    opts: std.fmt.FormatOptions,
+    writer: anytype,
+) !void {
+    _ = opts;
+    _ = layout;
+
+    // Doubled braces are emitted as literal braces by std.fmt.
+    const template =
+        \\.{{
+        \\    .width= {},
+        \\    .grapheme_boundary_class= .{s},
+        \\}}
+    ;
+    const class_name = @tagName(self.grapheme_boundary_class);
+    try std.fmt.format(writer, template, .{ self.width, class_name });
+}
+
+/// Grapheme boundary classes that Ghostty distinguishes. This is not an
+/// exhaustive list: control, CR, LF, etc. are omitted because in Ghostty's
+/// usage they are impossible here; the terminal handles them.
+pub const GraphemeBoundaryClass = enum(u4) {
+    invalid,
+    L,
+    V,
+    T,
+    LV,
+    LVT,
+    prepend,
+    extend,
+    zwj,
+    spacing_mark,
+    regional_indicator,
+    extended_pictographic,
+    extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
+    emoji_modifier, // \p{Emoji_Modifier}
+
+    /// Returns true for any of the extended pictographic classes. Prefer
+    /// this over comparing against a single enum value, because more than
+    /// one tag counts as extended pictographic.
+    pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
+        return self == .extended_pictographic or
+            self == .extended_pictographic_base;
+    }
+};
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@ -1,14 +1,13 @@
 pub const lut = @import("lut.zig");

 const grapheme = @import("grapheme.zig");
-const props = @import("props.zig");
-pub const table = props.table;
-pub const Properties = props.Properties;
-pub const getProperties = props.get;
+pub const table = @import("props_table.zig").table;
+pub const Properties = @import("Properties.zig");
 pub const graphemeBreak = grapheme.graphemeBreak;
 pub const GraphemeBreakState = grapheme.BreakState;

 test {
+    _ = @import("props_ziglyph.zig");
    _ = @import("symbols.zig");
    @import("std").testing.refAllDecls(@This());
 }
--- a/src/unicode/props_table.zig
+++ b/src/unicode/props_table.zig
@ -0,0 +1,18 @@
+const Properties = @import("Properties.zig");
+const lut = @import("lut.zig");
+
+/// The lookup tables for Ghostty, built from the generated stage arrays.
+pub const table = blk: {
+    // The `unicode_tables` module only exists after a generator has run as
+    // part of the Ghostty build.zig process. Thanks to Zig's lazy analysis
+    // it can still be referenced here.
+    //
+    // An example generator is the `main` in `props_ziglyph.zig`.
+    const generated = @import("unicode_tables").Tables(Properties);
+    const result: lut.Tables(Properties) = .{
+        .stage1 = &generated.stage1,
+        .stage2 = &generated.stage2,
+        .stage3 = &generated.stage3,
+    };
+    break :blk result;
+};
--- a/src/unicode/props_ziglyph.zig
+++ b/src/unicode/props_ziglyph.zig
@ -0,0 +1,96 @@
+const props = @This();
+
+const std = @import("std");
+const assert = std.debug.assert;
+const ziglyph = @import("ziglyph");
+const lut = @import("lut.zig");
+const Properties = @import("Properties.zig");
+const GraphemeBoundaryClass = Properties.GraphemeBoundaryClass;
+
+/// Classifies a codepoint into the grapheme boundary class Ghostty tracks,
+/// by querying ziglyph predicates one after another. This is VERY SLOW and
+/// is only meant for generating lookup tables.
+fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
+    const emoji = ziglyph.emoji;
+    const gb = ziglyph.grapheme_break;
+
+    // Modifier bases are special-cased (and checked first) because we
+    // should not break if a modifier isn't next to a base.
+    if (emoji.isEmojiModifierBase(cp)) {
+        assert(emoji.isExtendedPictographic(cp));
+        return .extended_pictographic_base;
+    }
+
+    // The order of the remaining checks is significant: the first
+    // matching predicate wins.
+    if (emoji.isEmojiModifier(cp)) return .emoji_modifier;
+    if (emoji.isExtendedPictographic(cp)) return .extended_pictographic;
+    if (gb.isL(cp)) return .L;
+    if (gb.isV(cp)) return .V;
+    if (gb.isT(cp)) return .T;
+    if (gb.isLv(cp)) return .LV;
+    if (gb.isLvt(cp)) return .LVT;
+    if (gb.isPrepend(cp)) return .prepend;
+    if (gb.isExtend(cp)) return .extend;
+    if (gb.isZwj(cp)) return .zwj;
+    if (gb.isSpacingmark(cp)) return .spacing_mark;
+    if (gb.isRegionalIndicator(cp)) return .regional_indicator;
+
+    // Not truly "invalid": every codepoint has SOME grapheme boundary
+    // class. We just don't care about anything outside the categories
+    // checked above.
+    return .invalid;
+}
+
+/// Computes the property set for a codepoint by querying ziglyph directly.
+/// The grapheme boundary class comes from `graphemeBoundaryClass`, which is
+/// very slow, so this is intended for table generation rather than lookups
+/// at runtime.
+pub fn get(cp: u21) Properties {
+    const zg_width = ziglyph.display_width.codePointWidth(cp, .half);
+    return .{
+        // Clamp to [0, 2] so the value fits the `u2` width field.
+        .width = @intCast(@min(2, @max(0, zg_width))),
+        // `GraphemeBoundaryClass` has no `init` declaration; the classifier
+        // is the file-local `graphemeBoundaryClass` function.
+        .grapheme_boundary_class = graphemeBoundaryClass(cp),
+    };
+}
+
+/// Entry point of the generator binary: builds the lookup tables from
+/// `props.get` and writes them as Zig source to stdout.
+pub fn main() !void {
+    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
+    defer arena.deinit();
+    const alloc = arena.allocator();
+
+    // Adapter handed to the generator: how to compute the properties of a
+    // codepoint and how to compare two property sets.
+    const Context = struct {
+        pub fn get(ctx: @This(), cp: u21) !Properties {
+            _ = ctx;
+            return props.get(cp);
+        }
+
+        pub fn eql(ctx: @This(), a: Properties, b: Properties) bool {
+            _ = ctx;
+            return a.eql(b);
+        }
+    };
+    const gen: lut.Generator(Properties, Context) = .{};
+
+    const tables = try gen.generate(alloc);
+    defer alloc.free(tables.stage1);
+    defer alloc.free(tables.stage2);
+    defer alloc.free(tables.stage3);
+
+    const stdout = std.io.getStdOut().writer();
+    try tables.writeZig(stdout);
+
+    // Uncomment when manually debugging to see our table sizes.
+    // std.log.warn("stage1={} stage2={} stage3={}", .{
+    //     tables.stage1.len,
+    //     tables.stage2.len,
+    //     tables.stage3.len,
+    // });
+}
+
+// This is not very fast in debug modes, so it's commented out by default.
+// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
+// test "unicode props: tables match ziglyph" {
+//     const testing = std.testing;
+//     const table = @import("props_table.zig").table;
+//
+//     const min = 0xFF + 1; // start outside ascii
+//     for (min..std.math.maxInt(u21)) |cp| {
+//         const t = table.get(@intCast(cp));
+//         const zg = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half)));
+//         if (t.width != zg) {
+//             std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });
+//             try testing.expect(false);
+//         }
+//     }
+// }