Jacob Sandlund 2025-12-17 16:48:47 -05:00 committed by GitHub
commit a60c00aa5f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 149 additions and 196 deletions

View File

@ -10,6 +10,7 @@ const Benchmark = @import("Benchmark.zig");
const options = @import("options.zig");
const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
const unicode = @import("../unicode/main.zig");
const uucode = @import("uucode");
const log = std.log.scoped(.@"terminal-stream-bench");
@ -118,7 +119,7 @@ fn stepTable(ptr: *anyopaque) Benchmark.Error!void {
var r = &f_reader.interface;
var d: UTF8Decoder = .{};
var state: unicode.GraphemeBreakState = .{};
var state: uucode.grapheme.BreakState = .default;
var cp1: u21 = 0;
var buf: [4096]u8 align(std.atomic.cache_line) = undefined;
while (true) {

View File

@ -4,6 +4,7 @@ const config = @import("config.zig");
const config_x = @import("config.x.zig");
const d = config.default;
const wcwidth = config_x.wcwidth;
const grapheme_break_no_control = config_x.grapheme_break_no_control;
const Allocator = std.mem.Allocator;
@ -85,10 +86,15 @@ pub const tables = [_]config.Table{
},
.{
.name = "buildtime",
.extensions = &.{ wcwidth, width, is_symbol },
.extensions = &.{
wcwidth,
grapheme_break_no_control,
width,
is_symbol,
},
.fields = &.{
width.field("width"),
d.field("grapheme_break"),
grapheme_break_no_control.field("grapheme_break_no_control"),
is_symbol.field("is_symbol"),
d.field("is_emoji_vs_base"),
},

View File

@ -4,6 +4,7 @@ const Allocator = std.mem.Allocator;
const font = @import("../main.zig");
const terminal = @import("../../terminal/main.zig");
const unicode = @import("../../unicode/main.zig");
const uucode = @import("uucode");
const log = std.log.scoped(.font_shaper);
@ -111,7 +112,7 @@ pub const Shaper = struct {
// font ligatures. However, we do support grapheme clustering.
// This means we can render things like skin tone emoji but
// we can't render things like single glyph "=>".
var break_state: unicode.GraphemeBreakState = .{};
var break_state: uucode.grapheme.BreakState = .default;
var cp1: u21 = @intCast(codepoints[0]);
var start: usize = 0;

View File

@ -9,6 +9,7 @@ const assert = @import("../quirks.zig").inlineAssert;
const testing = std.testing;
const Allocator = std.mem.Allocator;
const unicode = @import("../unicode/main.zig");
const uucode = @import("uucode");
const ansi = @import("ansi.zig");
const modespkg = @import("modes.zig");
@ -361,7 +362,7 @@ pub fn print(self: *Terminal, c: u21) !void {
if (prev.cell.codepoint() == 0) break :grapheme;
const grapheme_break = brk: {
var state: unicode.GraphemeBreakState = .{};
var state: uucode.grapheme.BreakState = .default;
var cp1: u21 = prev.cell.content.codepoint;
if (prev.cell.hasGrapheme()) {
const cps = self.screens.active.cursor.page_pin.node.data.lookupGrapheme(prev.cell).?;
@ -512,7 +513,7 @@ pub fn print(self: *Terminal, c: u21) !void {
// If this is an emoji variation selector, prev must be an emoji
if (c == 0xFE0F or c == 0xFE0E) {
const prev_props = unicode.table.get(prev.content.codepoint);
const emoji = prev_props.grapheme_boundary_class == .extended_pictographic;
const emoji = prev_props.grapheme_break == .extended_pictographic;
if (!emoji) return;
}
@ -4014,6 +4015,53 @@ test "Terminal: overwrite multicodepoint grapheme tail clears grapheme data" {
try testing.expectEqual(@as(usize, 0), page.graphemeCount());
}
test "Terminal: print breaks valid grapheme cluster with Prepend + ASCII for speed" {
    var t = try init(testing.allocator, .{ .rows = 5, .cols = 5 });
    defer t.deinit(testing.allocator);

    // Grapheme clustering has to be enabled for this test to mean anything.
    t.modes.set(.grapheme_cluster, true);

    // Print a filler character first so the following code points are not
    // printed at cursor.x == 0.
    try t.print('_');

    // U+0600 ARABIC NUMBER SIGN has the Prepend property; follow it with an
    // ASCII digit.
    try t.print(0x0600);
    try t.print('1');

    // Per UAX #29 GB9b a Prepend code point must not break from the code
    // point after it, so a conforming implementation would leave the cursor
    // at x == 2. We deliberately deviate: as a speed optimization a grapheme
    // break is assumed whenever c <= 255. The deviation only shows up in very
    // uncommon input (the Arabic number sign is meant to precede
    // Arabic-script digits), so all three code points get their own narrow
    // cell.
    try testing.expectEqual(@as(usize, 3), t.screens.active.cursor.x);
    try testing.expectEqual(@as(usize, 0), t.screens.active.cursor.y);
    // With correct breaking this would instead be:
    //try testing.expectEqual(@as(usize, 2), t.screens.active.cursor.x);

    // Cell 1: the Prepend code point, on its own.
    {
        const prepend_pin = t.screens.active.pages.getCell(.{ .screen = .{ .x = 1, .y = 0 } }).?;
        const prepend_cell = prepend_pin.cell;
        try testing.expectEqual(Cell.Wide.narrow, prepend_cell.wide);
        try testing.expectEqual(@as(u21, 0x0600), prepend_cell.content.codepoint);
        try testing.expect(!prepend_cell.hasGrapheme());
        // With correct breaking the digit would be attached to this cell:
        //try testing.expect(cell.hasGrapheme());
        //try testing.expectEqualSlices(u21, &.{'1'}, list_cell.node.data.lookupGrapheme(cell).?);
    }

    // Cell 2: the ASCII digit, which was split off into its own cell.
    {
        const digit_pin = t.screens.active.pages.getCell(.{ .screen = .{ .x = 2, .y = 0 } }).?;
        const digit_cell = digit_pin.cell;
        try testing.expectEqual(Cell.Wide.narrow, digit_cell.wide);
        try testing.expectEqual(@as(u21, '1'), digit_cell.content.codepoint);
        // With correct breaking this cell would be empty:
        //try testing.expectEqual(@as(u21, 0), cell.content.codepoint);
        try testing.expect(!digit_cell.hasGrapheme());
    }
}
test "Terminal: print writes to bottom if scrolled" {
var t = try init(testing.allocator, .{ .cols = 5, .rows = 2 });
defer t.deinit(testing.allocator);

View File

@ -1,6 +1,6 @@
const std = @import("std");
const table = @import("props_table.zig").table;
const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
const uucode = @import("uucode");
/// Determines if there is a grapheme break between two codepoints. This
/// must be called sequentially maintaining the state between calls.
@ -9,11 +9,11 @@ const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
/// line feeds, and carriage returns are expected to be filtered out before
/// calling this function. This is because this function is tuned for
/// Ghostty.
pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
pub fn graphemeBreak(cp1: u21, cp2: u21, state: *uucode.grapheme.BreakState) bool {
const value = Precompute.data[
(Precompute.Key{
.gbc1 = table.get(cp1).grapheme_boundary_class,
.gbc2 = table.get(cp2).grapheme_boundary_class,
.gb1 = table.get(cp1).grapheme_break,
.gb2 = table.get(cp2).grapheme_break,
.state = state.*,
}).index()
];
@ -21,133 +21,64 @@ pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
return value.result;
}
/// The state that must be maintained between calls to `graphemeBreak`.
/// Both flags are read and written by `graphemeBreakClass`; the struct is
/// packed into two bits so it can be embedded in the precompute table key.
pub const BreakState = packed struct(u2) {
// Set when the left-hand code point is an extended pictographic; cleared
// again once a ZWJ joins it to a following pictographic (GB11).
extended_pictographic: bool = false,
// Set after one RI x RI pair has been joined, so that the next RI x RI
// boundary breaks and resets the flag (GB12, GB13).
regional_indicator: bool = false,
};
/// This is all the structures and data for the precomputed lookup table
/// for all possible permutations of state and grapheme boundary classes.
/// Precomputation only requires 2^10 keys of 3 bit values so the whole
/// table is less than 1KB.
/// for all possible permutations of state and grapheme break properties.
/// Precomputation requires 2^13 keys of 4 bit values so the whole table is
/// 8KB.
const Precompute = struct {
// Lookup key: the current break state plus the classes of the two adjacent
// code points, bit-packed so the whole key can be used as an array index.
const Key = packed struct(u10) {
state: BreakState,
gbc1: GraphemeBoundaryClass,
gbc2: GraphemeBoundaryClass,
const Key = packed struct(u13) {
state: uucode.grapheme.BreakState,
gb1: uucode.x.types.GraphemeBreakNoControl,
gb2: uucode.x.types.GraphemeBreakNoControl,
// Reinterprets the packed key as its backing integer to index `data`.
fn index(self: Key) usize {
return @intCast(@as(u10, @bitCast(self)));
return @intCast(@as(u13, @bitCast(self)));
}
};
// Table entry: the break decision plus the state to carry into the next
// call.
const Value = packed struct(u3) {
const Value = packed struct(u4) {
result: bool,
state: BreakState,
state: uucode.grapheme.BreakState,
};
// The table itself, filled in by the labeled block below by enumerating
// every (state, class, class) combination.
const data = precompute: {
var result: [std.math.maxInt(u10)]Value = undefined;
var result: [std.math.maxInt(u13) + 1]Value = undefined;
@setEvalBranchQuota(3_000);
const info = @typeInfo(GraphemeBoundaryClass).@"enum";
for (0..std.math.maxInt(u2) + 1) |state_init| {
// Largest integer value among the break state enum fields, so the outer
// loop below covers every state value from 0 up to it.
const max_state_int = blk: {
var max: usize = 0;
for (@typeInfo(uucode.grapheme.BreakState).@"enum".fields) |field| {
if (field.value > max) max = field.value;
}
break :blk max;
};
@setEvalBranchQuota(10_000);
const info = @typeInfo(uucode.x.types.GraphemeBreakNoControl).@"enum";
for (0..max_state_int + 1) |state_int| {
for (info.fields) |field1| {
for (info.fields) |field2| {
// `state` is mutable because the break computation updates it in
// place; the key captures the value from before that update.
var state: BreakState = @bitCast(@as(u2, @intCast(state_init)));
var state: uucode.grapheme.BreakState = @enumFromInt(state_int);
const key: Key = .{
.gbc1 = @field(GraphemeBoundaryClass, field1.name),
.gbc2 = @field(GraphemeBoundaryClass, field2.name),
.gb1 = @field(uucode.x.types.GraphemeBreakNoControl, field1.name),
.gb2 = @field(uucode.x.types.GraphemeBreakNoControl, field2.name),
.state = state,
};
const v = graphemeBreakClass(key.gbc1, key.gbc2, &state);
const v = uucode.x.grapheme.computeGraphemeBreakNoControl(
key.gb1,
key.gb2,
&state,
);
// Store the break result together with the updated state.
result[key.index()] = .{ .result = v, .state = state };
}
}
}
// Guard the table size: it must stay exactly 8192 bytes.
std.debug.assert(@sizeOf(@TypeOf(result)) == 8192);
break :precompute result;
};
};
/// Reference grapheme break rules, following the algorithm from utf8proc.
/// This is only evaluated offline, to fill the precomputed lookup table.
///
/// Returns true when there is a break between `gbc1` and `gbc2`. `state` is
/// read and updated in place and must be carried over to the next call.
fn graphemeBreakClass(
    gbc1: GraphemeBoundaryClass,
    gbc2: GraphemeBoundaryClass,
    state: *BreakState,
) bool {
    // GB11 bookkeeping: remember that a pictographic was seen on the left so
    // that a later ZWJ can join it to the next pictographic.
    if (gbc1.isExtendedPictographic()) state.extended_pictographic = true;

    // GB3 (CR x LF) and GB4 (Control) are deliberately not implemented:
    // those code points are filtered out before grapheme boundaries are
    // checked, so they are not relevant to Ghostty.

    // GB6: Hangul L x (L | V | LV | LVT)
    // GB7: Hangul (LV | V) x (V | T)
    // GB8: Hangul (LVT | T) x T
    const hangul_join = switch (gbc1) {
        .L => switch (gbc2) {
            .L, .V, .LV, .LVT => true,
            else => false,
        },
        .LV, .V => switch (gbc2) {
            .V, .T => true,
            else => false,
        },
        .LVT, .T => gbc2 == .T,
        else => false,
    };
    if (hangul_join) return false;

    // GB9: x (Extend | ZWJ)
    // GB9a: x SpacingMark
    switch (gbc2) {
        .extend, .zwj, .spacing_mark => return false,
        else => {},
    }

    // GB9b: Prepend x
    if (gbc1 == .prepend) return false;

    // GB12, GB13: regional indicators join in pairs. The flag records that
    // a pair has just been joined, in which case the next RI x RI boundary
    // breaks and the flag is reset.
    if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
        const pair_complete = state.regional_indicator;
        state.regional_indicator = !pair_complete;
        return pair_complete;
    }

    // GB11: pictographic Extend* ZWJ x pictographic
    const zwj_joins_emoji = state.extended_pictographic and
        gbc1 == .zwj and
        gbc2.isExtendedPictographic();
    if (zwj_joins_emoji) {
        state.extended_pictographic = false;
        return false;
    }

    // UTS #51 emoji modifier sequences (possibly not covered by UAX #29):
    // an emoji modifier stays attached only when it directly follows a
    // modifier base.
    //
    // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
    const modifier_sequence = gbc1 == .extended_pictographic_base and
        gbc2 == .emoji_modifier;
    return !modifier_sequence;
}
/// If you build this file as a binary, we will verify the grapheme break
/// implementation. This iterates over billions of codepoints so it is
/// SLOW. It's not meant to be run in CI, but it's useful for debugging.
@ -156,13 +87,11 @@ fn graphemeBreakClass(
/// adding a `-Demit-unicode-test` option for `zig build`, but that
/// hasn't been done here.
pub fn main() !void {
const uucode = @import("uucode");
// Set the min and max to control the test range.
const min = 0;
const max = uucode.config.max_code_point + 1;
var state: BreakState = .{};
var state: uucode.grapheme.BreakState = .default;
var uu_state: uucode.grapheme.BreakState = .default;
for (min..max) |cp1| {
if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});
@ -199,13 +128,53 @@ test "grapheme break: emoji modifier" {
// Emoji and modifier
{
var state: BreakState = .{};
var state: uucode.grapheme.BreakState = .default;
try testing.expect(!graphemeBreak(0x261D, 0x1F3FF, &state));
}
// Non-emoji and emoji modifier
{
var state: BreakState = .{};
var state: uucode.grapheme.BreakState = .default;
try testing.expect(graphemeBreak(0x22, 0x1F3FF, &state));
}
}
test "long emoji zwj sequences" {
    var state: uucode.grapheme.BreakState = .default;

    // 👩👩👧👦 (family: woman, woman, girl, boy) followed by '_'. The whole
    // ZWJ sequence must stay in one cluster; the only break is before '_'.
    var it = uucode.utf8.Iterator.init("\u{1F469}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}_");

    // One entry per adjacent pair: the expected left-hand code point and
    // whether a break is expected between it and the code point after it.
    const Step = struct { left: u21, breaks: bool };
    const steps = [_]Step{
        .{ .left = 0x1F469, .breaks = false }, // 👩
        .{ .left = 0x200D, .breaks = false },
        .{ .left = 0x1F469, .breaks = false }, // 👩
        .{ .left = 0x200D, .breaks = false },
        .{ .left = 0x1F467, .breaks = false }, // 👧
        .{ .left = 0x200D, .breaks = false },
        .{ .left = 0x1F466, .breaks = true }, // 👦, then break before '_'
    };

    var left = it.next() orelse unreachable;
    for (steps) |step| {
        const right = it.next() orelse unreachable;
        try std.testing.expect(left == step.left);
        try std.testing.expect(graphemeBreak(left, right, &state) == step.breaks);
        left = right;
    }
}

View File

@ -4,7 +4,6 @@ const grapheme = @import("grapheme.zig");
pub const table = @import("props_table.zig").table;
pub const Properties = @import("props.zig").Properties;
pub const graphemeBreak = grapheme.graphemeBreak;
pub const GraphemeBreakState = grapheme.BreakState;
test {
@import("std").testing.refAllDecls(@This());

View File

@ -5,6 +5,7 @@
//! benchmarks in src/bench to verify that we haven't regressed.
const std = @import("std");
const uucode = @import("uucode");
pub const Properties = packed struct {
/// Codepoint width. We clamp to [0, 2] since Ghostty handles control
@ -12,8 +13,8 @@ pub const Properties = packed struct {
/// becomes a 2-em dash).
width: u2 = 0,
/// Grapheme boundary class.
grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
/// Grapheme break property.
grapheme_break: uucode.x.types.GraphemeBreakNoControl = .other,
/// Emoji VS compatibility
emoji_vs_base: bool = false,
@ -21,7 +22,7 @@ pub const Properties = packed struct {
// Needed for lut.Generator
pub fn eql(a: Properties, b: Properties) bool {
return a.width == b.width and
a.grapheme_boundary_class == b.grapheme_boundary_class and
a.grapheme_break == b.grapheme_break and
a.emoji_vs_base == b.emoji_vs_base;
}
@ -33,46 +34,13 @@ pub const Properties = packed struct {
try writer.print(
\\.{{
\\ .width= {},
\\ .grapheme_boundary_class= .{s},
\\ .grapheme_break= .{s},
\\ .emoji_vs_base= {},
\\}}
, .{
self.width,
@tagName(self.grapheme_boundary_class),
@tagName(self.grapheme_break),
self.emoji_vs_base,
});
}
};
/// Grapheme boundary classes used by Ghostty. The list is intentionally not
/// exhaustive: control, CR, LF, etc. are omitted because the terminal
/// handles those code points itself, so they never reach grapheme handling.
pub const GraphemeBoundaryClass = enum(u4) {
    invalid,
    L,
    V,
    T,
    LV,
    LVT,
    prepend,
    extend,
    zwj,
    spacing_mark,
    regional_indicator,
    extended_pictographic,
    /// \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
    extended_pictographic_base,
    /// \p{Emoji_Modifier}
    emoji_modifier,

    /// Reports whether this class is an extended pictographic. Prefer this
    /// over comparing against `.extended_pictographic` directly, because
    /// more than one class counts as extended pictographic.
    pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
        return self == .extended_pictographic or
            self == .extended_pictographic_base;
    }
};

View File

@ -4,56 +4,17 @@ const assert = std.debug.assert;
const uucode = @import("uucode");
const lut = @import("lut.zig");
const Properties = @import("props.zig").Properties;
const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
/// Maps a code point to Ghostty's `GraphemeBoundaryClass` based on the
/// `grapheme_break` property reported by uucode. Only intended for use
/// while generating lookup tables.
fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
    // Code points beyond what uucode covers have no class.
    if (cp > uucode.config.max_code_point) return .invalid;

    const grapheme_break = uucode.get(.grapheme_break, cp);
    return switch (grapheme_break) {
        // Hangul syllable types.
        .l => .L,
        .v => .V,
        .t => .T,
        .lv => .LV,
        .lvt => .LVT,

        // Classes that map one-to-one.
        .prepend => .prepend,
        .zwj => .zwj,
        .spacing_mark => .spacing_mark,
        .regional_indicator => .regional_indicator,

        // Emoji-related classes.
        .extended_pictographic => .extended_pictographic,
        .emoji_modifier_base => .extended_pictographic_base,
        .emoji_modifier => .emoji_modifier,

        // Everything that behaves like Extend for our purposes.
        .zwnj,
        .indic_conjunct_break_extend,
        .indic_conjunct_break_linker,
        => .extend,

        // Every code point does have some grapheme boundary class, so
        // `.invalid` here really means "a class we don't care about", i.e.
        // one that fits none of the categories above. Note also that
        // `indic_conjunct_break_consonant` is `other` in
        // 'GraphemeBreakProperty.txt' (it is not listed there).
        .other,
        .indic_conjunct_break_consonant,
        .cr,
        .lf,
        .control,
        => .invalid,
    };
}
/// Builds the `Properties` entry for the code point `cp`.
///
/// Code points above `uucode.config.max_code_point` are not looked up in
/// uucode; they get fixed fallback values (width 1, no emoji VS base).
/// All other code points take their values from uucode.
pub fn get(cp: u21) Properties {
// Out of range for uucode: return the fallback entry.
if (cp > uucode.config.max_code_point) return .{
.width = 1,
.grapheme_boundary_class = .invalid,
.grapheme_break = .other,
.emoji_vs_base = false,
};
return .{
.width = uucode.get(.width, cp),
.grapheme_boundary_class = graphemeBoundaryClass(cp),
.grapheme_break = uucode.get(.grapheme_break_no_control, cp),
.emoji_vs_base = uucode.get(.is_emoji_vs_base, cp),
};
}