Merge 7bddbfed1e into a4cb73db84

2025-12-17 16:48:47 -05:00 · 2025-12-17 16:48:47 -05:00 · a60c00aa5f
parent a4cb73db84 7bddbfed1e
commit a60c00aa5f
8 changed files with 149 additions and 196 deletions
--- a/src/benchmark/GraphemeBreak.zig
+++ b/src/benchmark/GraphemeBreak.zig
@ -10,6 +10,7 @@ const Benchmark = @import("Benchmark.zig");
 const options = @import("options.zig");
 const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
 const unicode = @import("../unicode/main.zig");
+const uucode = @import("uucode");

 const log = std.log.scoped(.@"terminal-stream-bench");

@ -118,7 +119,7 @@ fn stepTable(ptr: *anyopaque) Benchmark.Error!void {
    var r = &f_reader.interface;

    var d: UTF8Decoder = .{};
-    var state: unicode.GraphemeBreakState = .{};
+    var state: uucode.grapheme.BreakState = .default;
    var cp1: u21 = 0;
    var buf: [4096]u8 align(std.atomic.cache_line) = undefined;
    while (true) {
--- a/src/build/uucode_config.zig
+++ b/src/build/uucode_config.zig
@ -4,6 +4,7 @@ const config = @import("config.zig");
 const config_x = @import("config.x.zig");
 const d = config.default;
 const wcwidth = config_x.wcwidth;
+const grapheme_break_no_control = config_x.grapheme_break_no_control;

 const Allocator = std.mem.Allocator;

@ -85,10 +86,15 @@ pub const tables = [_]config.Table{
    },
    .{
        .name = "buildtime",
-        .extensions = &.{ wcwidth, width, is_symbol },
+        .extensions = &.{
+            wcwidth,
+            grapheme_break_no_control,
+            width,
+            is_symbol,
+        },
        .fields = &.{
            width.field("width"),
-            d.field("grapheme_break"),
+            grapheme_break_no_control.field("grapheme_break_no_control"),
            is_symbol.field("is_symbol"),
            d.field("is_emoji_vs_base"),
        },
--- a/src/font/shaper/web_canvas.zig
+++ b/src/font/shaper/web_canvas.zig
@ -4,6 +4,7 @@ const Allocator = std.mem.Allocator;
 const font = @import("../main.zig");
 const terminal = @import("../../terminal/main.zig");
 const unicode = @import("../../unicode/main.zig");
+const uucode = @import("uucode");

 const log = std.log.scoped(.font_shaper);

@ -111,7 +112,7 @@ pub const Shaper = struct {
        // font ligatures. However, we do support grapheme clustering.
        // This means we can render things like skin tone emoji but
        // we can't render things like single glyph "=>".
-        var break_state: unicode.GraphemeBreakState = .{};
+        var break_state: uucode.grapheme.BreakState = .default;
        var cp1: u21 = @intCast(codepoints[0]);

        var start: usize = 0;
--- a/src/terminal/Terminal.zig
+++ b/src/terminal/Terminal.zig
@ -9,6 +9,7 @@ const assert = @import("../quirks.zig").inlineAssert;
 const testing = std.testing;
 const Allocator = std.mem.Allocator;
 const unicode = @import("../unicode/main.zig");
+const uucode = @import("uucode");

 const ansi = @import("ansi.zig");
 const modespkg = @import("modes.zig");
@ -361,7 +362,7 @@ pub fn print(self: *Terminal, c: u21) !void {
        if (prev.cell.codepoint() == 0) break :grapheme;

        const grapheme_break = brk: {
-            var state: unicode.GraphemeBreakState = .{};
+            var state: uucode.grapheme.BreakState = .default;
            var cp1: u21 = prev.cell.content.codepoint;
            if (prev.cell.hasGrapheme()) {
                const cps = self.screens.active.cursor.page_pin.node.data.lookupGrapheme(prev.cell).?;
@ -512,7 +513,7 @@ pub fn print(self: *Terminal, c: u21) !void {
        // If this is a emoji variation selector, prev must be an emoji
        if (c == 0xFE0F or c == 0xFE0E) {
            const prev_props = unicode.table.get(prev.content.codepoint);
-            const emoji = prev_props.grapheme_boundary_class == .extended_pictographic;
+            const emoji = prev_props.grapheme_break == .extended_pictographic;
            if (!emoji) return;
        }

@ -4014,6 +4015,53 @@ test "Terminal: overwrite multicodepoint grapheme tail clears grapheme data" {
    try testing.expectEqual(@as(usize, 0), page.graphemeCount());
 }

+test "Terminal: print breaks valid grapheme cluster with Prepend + ASCII for speed" {
+    const alloc = testing.allocator;
+    var t = try init(alloc, .{ .rows = 5, .cols = 5 });
+    defer t.deinit(alloc);
+    t.modes.set(.grapheme_cluster, true);
+
+    // Make sure we're not at cursor.x == 0 for the next char.
+    try t.print('_');
+
+    // U+0600 ARABIC NUMBER SIGN (Prepend)
+    try t.print(0x0600);
+    try t.print('1');
+
+    // We should have 3 cells taken up, each narrow. Note that this is
+    // **incorrect** grapheme break behavior, since a Prepend code point should
+    // not break with the one following it per UAX #29 GB9b. However, as an
+    // optimization we assume a grapheme break when c <= 255. This deviation
+    // only affects very uncommon sequences, since a Prepend such as the Arabic
+    // number sign should precede Arabic-script digits rather than ASCII.
+    try testing.expectEqual(@as(usize, 0), t.screens.active.cursor.y);
+    try testing.expectEqual(@as(usize, 3), t.screens.active.cursor.x);
+    // This is what we'd expect if we followed GB9b and did not break:
+    //try testing.expectEqual(@as(usize, 2), t.screens.active.cursor.x);
+
+    // Assert various properties about our screen to verify
+    // we have all expected cells.
+    {
+        const list_cell = t.screens.active.pages.getCell(.{ .screen = .{ .x = 1, .y = 0 } }).?;
+        const cell = list_cell.cell;
+        try testing.expectEqual(@as(u21, 0x0600), cell.content.codepoint);
+        try testing.expect(!cell.hasGrapheme());
+        // This is what we'd expect if we followed GB9b and did not break:
+        //try testing.expect(cell.hasGrapheme());
+        //try testing.expectEqualSlices(u21, &.{'1'}, list_cell.node.data.lookupGrapheme(cell).?);
+        try testing.expectEqual(Cell.Wide.narrow, cell.wide);
+    }
+    {
+        const list_cell = t.screens.active.pages.getCell(.{ .screen = .{ .x = 2, .y = 0 } }).?;
+        const cell = list_cell.cell;
+        try testing.expectEqual(@as(u21, '1'), cell.content.codepoint);
+        // This is what we'd expect if we followed GB9b and did not break:
+        //try testing.expectEqual(@as(u21, 0), cell.content.codepoint);
+        try testing.expect(!cell.hasGrapheme());
+        try testing.expectEqual(Cell.Wide.narrow, cell.wide);
+    }
+}
+
 test "Terminal: print writes to bottom if scrolled" {
    var t = try init(testing.allocator, .{ .cols = 5, .rows = 2 });
    defer t.deinit(testing.allocator);
--- a/src/unicode/grapheme.zig
+++ b/src/unicode/grapheme.zig
@ -1,6 +1,6 @@
 const std = @import("std");
 const table = @import("props_table.zig").table;
-const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
+const uucode = @import("uucode");

 /// Determines if there is a grapheme break between two codepoints. This
 /// must be called sequentially maintaining the state between calls.
@ -9,11 +9,11 @@ const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
 /// line feeds, and carriage returns are expected to be filtered out before
 /// calling this function. This is because this function is tuned for
 /// Ghostty.
-pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
+pub fn graphemeBreak(cp1: u21, cp2: u21, state: *uucode.grapheme.BreakState) bool {
    const value = Precompute.data[
        (Precompute.Key{
-            .gbc1 = table.get(cp1).grapheme_boundary_class,
-            .gbc2 = table.get(cp2).grapheme_boundary_class,
+            .gb1 = table.get(cp1).grapheme_break,
+            .gb2 = table.get(cp2).grapheme_break,
            .state = state.*,
        }).index()
    ];
@ -21,133 +21,64 @@ pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
    return value.result;
 }

-/// The state that must be maintained between calls to `graphemeBreak`.
-pub const BreakState = packed struct(u2) {
-    extended_pictographic: bool = false,
-    regional_indicator: bool = false,
-};
-
 /// This is all the structures and data for the precomputed lookup table
-/// for all possible permutations of state and grapheme boundary classes.
-/// Precomputation only requires 2^10 keys of 3 bit values so the whole
-/// table is less than 1KB.
+/// for all possible permutations of state and grapheme break properties.
+/// Precomputation requires 2^13 keys; each 4-bit value occupies one byte, so
+/// the whole table is 8KB.
 const Precompute = struct {
-    const Key = packed struct(u10) {
-        state: BreakState,
-        gbc1: GraphemeBoundaryClass,
-        gbc2: GraphemeBoundaryClass,
+    const Key = packed struct(u13) {
+        state: uucode.grapheme.BreakState,
+        gb1: uucode.x.types.GraphemeBreakNoControl,
+        gb2: uucode.x.types.GraphemeBreakNoControl,

        fn index(self: Key) usize {
-            return @intCast(@as(u10, @bitCast(self)));
+            return @intCast(@as(u13, @bitCast(self)));
        }
    };

-    const Value = packed struct(u3) {
+    const Value = packed struct(u4) {
        result: bool,
-        state: BreakState,
+        state: uucode.grapheme.BreakState,
    };

    const data = precompute: {
-        var result: [std.math.maxInt(u10)]Value = undefined;
+        var result: [std.math.maxInt(u13) + 1]Value = undefined;

-        @setEvalBranchQuota(3_000);
-        const info = @typeInfo(GraphemeBoundaryClass).@"enum";
-        for (0..std.math.maxInt(u2) + 1) |state_init| {
+        const max_state_int = blk: {
+            var max: usize = 0;
+            for (@typeInfo(uucode.grapheme.BreakState).@"enum".fields) |field| {
+                if (field.value > max) max = field.value;
+            }
+            break :blk max;
+        };
+
+        @setEvalBranchQuota(10_000);
+        const info = @typeInfo(uucode.x.types.GraphemeBreakNoControl).@"enum";
+        for (0..max_state_int + 1) |state_int| {
            for (info.fields) |field1| {
                for (info.fields) |field2| {
-                    var state: BreakState = @bitCast(@as(u2, @intCast(state_init)));
+                    var state: uucode.grapheme.BreakState = @enumFromInt(state_int);
+
                    const key: Key = .{
-                        .gbc1 = @field(GraphemeBoundaryClass, field1.name),
-                        .gbc2 = @field(GraphemeBoundaryClass, field2.name),
+                        .gb1 = @field(uucode.x.types.GraphemeBreakNoControl, field1.name),
+                        .gb2 = @field(uucode.x.types.GraphemeBreakNoControl, field2.name),
                        .state = state,
                    };
-                    const v = graphemeBreakClass(key.gbc1, key.gbc2, &state);
+                    const v = uucode.x.grapheme.computeGraphemeBreakNoControl(
+                        key.gb1,
+                        key.gb2,
+                        &state,
+                    );
                    result[key.index()] = .{ .result = v, .state = state };
                }
            }
        }

+        std.debug.assert(@sizeOf(@TypeOf(result)) == 8192);
        break :precompute result;
    };
 };

-/// This is the algorithm from utf8proc. We only use this offline for
-/// precomputing the lookup table.
-fn graphemeBreakClass(
-    gbc1: GraphemeBoundaryClass,
-    gbc2: GraphemeBoundaryClass,
-    state: *BreakState,
-) bool {
-    // GB11: Emoji Extend* ZWJ x Emoji
-    if (!state.extended_pictographic and gbc1.isExtendedPictographic()) {
-        state.extended_pictographic = true;
-    }
-
-    // These two properties are ignored because they're not relevant to
-    // Ghostty -- they're filtered out before checking grapheme boundaries.
-    // GB3: CR x LF
-    // GB4: Control
-
-    // GB6: Hangul L x (L|V|LV|VT)
-    if (gbc1 == .L) {
-        if (gbc2 == .L or
-            gbc2 == .V or
-            gbc2 == .LV or
-            gbc2 == .LVT) return false;
-    }
-
-    // GB7: Hangul (LV | V) x (V | T)
-    if (gbc1 == .LV or gbc1 == .V) {
-        if (gbc2 == .V or
-            gbc2 == .T) return false;
-    }
-
-    // GB8: Hangul (LVT | T) x T
-    if (gbc1 == .LVT or gbc1 == .T) {
-        if (gbc2 == .T) return false;
-    }
-
-    // GB9b: x (Extend | ZWJ)
-    if (gbc2 == .extend or gbc2 == .zwj) return false;
-
-    // GB9a: x Spacing
-    if (gbc2 == .spacing_mark) return false;
-
-    // GB9b: Prepend x
-    if (gbc1 == .prepend) return false;
-
-    // GB12, GB13: RI x RI
-    if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
-        if (state.regional_indicator) {
-            state.regional_indicator = false;
-            return true;
-        } else {
-            state.regional_indicator = true;
-            return false;
-        }
-    }
-
-    // GB11: Emoji Extend* ZWJ x Emoji
-    if (state.extended_pictographic and
-        gbc1 == .zwj and
-        gbc2.isExtendedPictographic())
-    {
-        state.extended_pictographic = false;
-        return false;
-    }
-
-    // UTS #51. This isn't covered by UAX #29 as far as I can tell (but
-    // I'm probably wrong). This is a special case for emoji modifiers
-    // which only do not break if they're next to a base.
-    //
-    // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
-    if (gbc2 == .emoji_modifier and gbc1 == .extended_pictographic_base) {
-        return false;
-    }
-
-    return true;
-}
-
 /// If you build this file as a binary, we will verify the grapheme break
 /// implementation. This iterates over billions of codepoints so it is
 /// SLOW. It's not meant to be run in CI, but it's useful for debugging.
@ -156,13 +87,11 @@ fn graphemeBreakClass(
 /// adding a `-Demit-unicode-test` option for `zig build`, but that
 /// hasn't been done here.
 pub fn main() !void {
-    const uucode = @import("uucode");
-
    // Set the min and max to control the test range.
    const min = 0;
    const max = uucode.config.max_code_point + 1;

-    var state: BreakState = .{};
+    var state: uucode.grapheme.BreakState = .default;
    var uu_state: uucode.grapheme.BreakState = .default;
    for (min..max) |cp1| {
        if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});
@ -199,13 +128,53 @@ test "grapheme break: emoji modifier" {

    // Emoji and modifier
    {
-        var state: BreakState = .{};
+        var state: uucode.grapheme.BreakState = .default;
        try testing.expect(!graphemeBreak(0x261D, 0x1F3FF, &state));
    }

    // Non-emoji and emoji modifier
    {
-        var state: BreakState = .{};
+        var state: uucode.grapheme.BreakState = .default;
        try testing.expect(graphemeBreak(0x22, 0x1F3FF, &state));
    }
 }
+
+test "long emoji zwj sequences" {
+    var state: uucode.grapheme.BreakState = .default;
+    // 👩‍👩‍👧‍👦 (family: woman, woman, girl, boy)
+    var it = uucode.utf8.Iterator.init("\u{1F469}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}_");
+    var cp1 = it.next() orelse unreachable;
+    var cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x1F469); // 👩
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x200D);
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x1F469); // 👩
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x200D);
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x1F467); // 👧
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x200D);
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x1F466); // 👦
+    try std.testing.expect(graphemeBreak(cp1, cp2, &state)); // break
+}
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@ -4,7 +4,6 @@ const grapheme = @import("grapheme.zig");
 pub const table = @import("props_table.zig").table;
 pub const Properties = @import("props.zig").Properties;
 pub const graphemeBreak = grapheme.graphemeBreak;
-pub const GraphemeBreakState = grapheme.BreakState;

 test {
    @import("std").testing.refAllDecls(@This());
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@ -5,6 +5,7 @@
 //! benchmarks in src/bench to verify that we haven't regressed.

 const std = @import("std");
+const uucode = @import("uucode");

 pub const Properties = packed struct {
    /// Codepoint width. We clamp to [0, 2] since Ghostty handles control
@ -12,8 +13,8 @@ pub const Properties = packed struct {
    /// becomes a 2-em dash).
    width: u2 = 0,

-    /// Grapheme boundary class.
-    grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
+    /// Grapheme break property.
+    grapheme_break: uucode.x.types.GraphemeBreakNoControl = .other,

    /// Emoji VS compatibility
    emoji_vs_base: bool = false,
@ -21,7 +22,7 @@ pub const Properties = packed struct {
    // Needed for lut.Generator
    pub fn eql(a: Properties, b: Properties) bool {
        return a.width == b.width and
-            a.grapheme_boundary_class == b.grapheme_boundary_class and
+            a.grapheme_break == b.grapheme_break and
            a.emoji_vs_base == b.emoji_vs_base;
    }

@ -33,46 +34,13 @@ pub const Properties = packed struct {
        try writer.print(
            \\.{{
            \\    .width= {},
-            \\    .grapheme_boundary_class= .{s},
+            \\    .grapheme_break= .{s},
            \\    .emoji_vs_base= {},
            \\}}
        , .{
            self.width,
-            @tagName(self.grapheme_boundary_class),
+            @tagName(self.grapheme_break),
            self.emoji_vs_base,
        });
    }
 };
-
-/// Possible grapheme boundary classes. This isn't an exhaustive list:
-/// we omit control, CR, LF, etc. because in Ghostty's usage that are
-/// impossible because they're handled by the terminal.
-pub const GraphemeBoundaryClass = enum(u4) {
-    invalid,
-    L,
-    V,
-    T,
-    LV,
-    LVT,
-    prepend,
-    extend,
-    zwj,
-    spacing_mark,
-    regional_indicator,
-    extended_pictographic,
-    extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
-    emoji_modifier, // \p{Emoji_Modifier}
-
-    /// Returns true if this is an extended pictographic type. This
-    /// should be used instead of comparing the enum value directly
-    /// because we classify multiple.
-    pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
-        return switch (self) {
-            .extended_pictographic,
-            .extended_pictographic_base,
-            => true,
-
-            else => false,
-        };
-    }
-};
--- a/src/unicode/props_uucode.zig
+++ b/src/unicode/props_uucode.zig
@ -4,56 +4,17 @@ const assert = std.debug.assert;
 const uucode = @import("uucode");
 const lut = @import("lut.zig");
 const Properties = @import("props.zig").Properties;
-const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
-
-/// Gets the grapheme boundary class for a codepoint.
-/// The use case for this is only in generating lookup tables.
-fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
-    if (cp > uucode.config.max_code_point) return .invalid;
-
-    return switch (uucode.get(.grapheme_break, cp)) {
-        .extended_pictographic => .extended_pictographic,
-        .l => .L,
-        .v => .V,
-        .t => .T,
-        .lv => .LV,
-        .lvt => .LVT,
-        .prepend => .prepend,
-        .zwj => .zwj,
-        .spacing_mark => .spacing_mark,
-        .regional_indicator => .regional_indicator,
-        .emoji_modifier => .emoji_modifier,
-        .emoji_modifier_base => .extended_pictographic_base,
-
-        .zwnj,
-        .indic_conjunct_break_extend,
-        .indic_conjunct_break_linker,
-        => .extend,
-
-        // This is obviously not INVALID invalid, there is SOME grapheme
-        // boundary class for every codepoint. But we don't care about
-        // anything that doesn't fit into the above categories. Also note
-        // that `indic_conjunct_break_consonant` is `other` in
-        // 'GraphemeBreakProperty.txt' (it's missing).
-        .other,
-        .indic_conjunct_break_consonant,
-        .cr,
-        .lf,
-        .control,
-        => .invalid,
-    };
-}

 pub fn get(cp: u21) Properties {
    if (cp > uucode.config.max_code_point) return .{
        .width = 1,
-        .grapheme_boundary_class = .invalid,
+        .grapheme_break = .other,
        .emoji_vs_base = false,
    };

    return .{
        .width = uucode.get(.width, cp),
-        .grapheme_boundary_class = graphemeBoundaryClass(cp),
+        .grapheme_break = uucode.get(.grapheme_break_no_control, cp),
        .emoji_vs_base = uucode.get(.is_emoji_vs_base, cp),
    };
 }