unicode: isolate properties, tables, and ziglyph into separate files (#8810)

This makes the separation of concerns visually clearer.
There is now the generic `Properties.zig`, and then the
implementation-specific `props_<impl>.zig` files. Despite Zig's lazy
analysis, I find this is much easier to understand as a human.

Doing this resulted in finding one part in `src/terminal` where we were
still inadvertently using ziglyph directly instead of our LUTs! I
switched this out.

After this PR, `src/terminal` as a standalone module no longer depends
on `ziglyph` at all.[^1]

cc @jacobsandlund this is going to cause conflicts in your PR. I'm sorry
about that. But it should make it cleaner to bring in the uucode work by
adding a dedicated `props_uucode.zig` file!

[^1]: Why would I be talking about `src/terminal` as a standalone
module? That's interesting.
pull/8812/head
Mitchell Hashimoto 2025-09-20 15:15:15 -07:00 committed by GitHub
commit 511314e1a1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 194 additions and 6 deletions

View File

@ -344,7 +344,7 @@ pub fn print(self: *Terminal, c: u21) !void {
// VS15 makes it narrow.
if (c == 0xFE0F or c == 0xFE0E) {
// This only applies to emoji
const prev_props = unicode.getProperties(prev.cell.content.codepoint);
const prev_props = unicode.table.get(prev.cell.content.codepoint);
const emoji = prev_props.grapheme_boundary_class.isExtendedPictographic();
if (!emoji) return;
@ -470,7 +470,7 @@ pub fn print(self: *Terminal, c: u21) !void {
// If this is a emoji variation selector, prev must be an emoji
if (c == 0xFE0F or c == 0xFE0E) {
const prev_props = unicode.getProperties(prev.content.codepoint);
const prev_props = unicode.table.get(prev.content.codepoint);
const emoji = prev_props.grapheme_boundary_class == .extended_pictographic;
if (!emoji) return;
}

View File

@ -0,0 +1,75 @@
//! Property set per codepoint that Ghostty cares about.
//!
//! Adding to this lets you find new properties but also potentially makes
//! our lookup tables less efficient. Any changes to this should run the
//! benchmarks in src/bench to verify that we haven't regressed.
const Properties = @This();
const std = @import("std");
/// Codepoint width. We clamp to [0, 2] since Ghostty handles control
/// characters and we max out at 2 for wide characters (i.e. 3-em dash
/// becomes a 2-em dash).
width: u2 = 0,
/// Grapheme boundary class.
grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
/// Reports whether two property sets are equal, comparing every field.
/// Needed for lut.Generator.
pub fn eql(a: Properties, b: Properties) bool {
    if (a.width != b.width) return false;
    return a.grapheme_boundary_class == b.grapheme_boundary_class;
}
/// Writes this property set to `writer` as a struct literal containing
/// the width and the tag name of the grapheme boundary class.
/// Needed for lut.Generator.
pub fn format(
    self: Properties,
    comptime layout: []const u8,
    opts: std.fmt.FormatOptions,
    writer: anytype,
) !void {
    // The output is fixed; neither the layout string nor the options
    // are consulted.
    _ = opts;
    _ = layout;

    const class_name = @tagName(self.grapheme_boundary_class);
    try std.fmt.format(writer,
        \\.{{
        \\ .width= {},
        \\ .grapheme_boundary_class= .{s},
        \\}}
    , .{ self.width, class_name });
}
/// Possible grapheme boundary classes. This isn't an exhaustive list:
/// we omit control, CR, LF, etc. because in Ghostty's usage they are
/// impossible: they're handled by the terminal.
pub const GraphemeBoundaryClass = enum(u4) {
    invalid,
    L,
    V,
    T,
    LV,
    LVT,
    prepend,
    extend,
    zwj,
    spacing_mark,
    regional_indicator,
    extended_pictographic,
    extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
    emoji_modifier, // \p{Emoji_Modifier}

    /// Returns true if this is an extended pictographic class. Use this
    /// instead of comparing against `.extended_pictographic` directly,
    /// because more than one enum value counts as extended pictographic.
    pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
        return self == .extended_pictographic or
            self == .extended_pictographic_base;
    }
};

View File

@ -1,14 +1,13 @@
pub const lut = @import("lut.zig");
const grapheme = @import("grapheme.zig");
const props = @import("props.zig");
pub const table = props.table;
pub const Properties = props.Properties;
pub const getProperties = props.get;
pub const table = @import("props_table.zig").table;
pub const Properties = @import("Properties.zig");
pub const graphemeBreak = grapheme.graphemeBreak;
pub const GraphemeBreakState = grapheme.BreakState;
test {
    const testing = @import("std").testing;

    // Explicitly reference these files from the test block.
    _ = @import("symbols.zig");
    _ = @import("props_ziglyph.zig");

    testing.refAllDecls(@This());
}

View File

@ -0,0 +1,18 @@
const Properties = @import("Properties.zig");
const lut = @import("lut.zig");
/// The lookup tables for Ghostty.
pub const table: lut.Tables(Properties) = tables: {
    // The `unicode_tables` module only exists after a generator has run
    // as part of the Ghostty build.zig process. Thanks to Zig's lazy
    // analysis we can still reference it here.
    //
    // The `main` in `props_ziglyph.zig` is an example of such a generator.
    const generated = @import("unicode_tables").Tables(Properties);

    break :tables .{
        .stage1 = &generated.stage1,
        .stage2 = &generated.stage2,
        .stage3 = &generated.stage3,
    };
};

View File

@ -0,0 +1,96 @@
const props = @This();
const std = @import("std");
const assert = std.debug.assert;
const ziglyph = @import("ziglyph");
const lut = @import("lut.zig");
const Properties = @import("Properties.zig");
const GraphemeBoundaryClass = Properties.GraphemeBoundaryClass;
/// Determines the grapheme boundary class of a codepoint by querying
/// ziglyph. This is VERY SLOW and is only meant to be used while
/// generating lookup tables.
fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
    const emoji = ziglyph.emoji;
    const gb = ziglyph.grapheme_break;

    // Modifier bases get their own class because we should not break
    // if a modifier isn't next to a base. This check must come before
    // the plain extended pictographic check below.
    if (emoji.isEmojiModifierBase(cp)) {
        assert(emoji.isExtendedPictographic(cp));
        return .extended_pictographic_base;
    }

    if (emoji.isEmojiModifier(cp)) return .emoji_modifier;
    if (emoji.isExtendedPictographic(cp)) return .extended_pictographic;

    if (gb.isL(cp)) return .L;
    if (gb.isV(cp)) return .V;
    if (gb.isT(cp)) return .T;
    if (gb.isLv(cp)) return .LV;
    if (gb.isLvt(cp)) return .LVT;
    if (gb.isPrepend(cp)) return .prepend;
    if (gb.isExtend(cp)) return .extend;
    if (gb.isZwj(cp)) return .zwj;
    if (gb.isSpacingmark(cp)) return .spacing_mark;
    if (gb.isRegionalIndicator(cp)) return .regional_indicator;

    // Every codepoint has SOME grapheme boundary class, so this isn't
    // truly "invalid"; we just don't care about anything that doesn't
    // fit one of the categories above.
    return .invalid;
}
/// Computes the properties of a codepoint directly from ziglyph.
///
/// The grapheme boundary class comes from `graphemeBoundaryClass`, which
/// is documented as very slow, so this is meant for table generation
/// rather than runtime lookups.
pub fn get(cp: u21) Properties {
    const zg_width = ziglyph.display_width.codePointWidth(cp, .half);

    return .{
        // Clamp into [0, 2] so the value fits the `u2` width field.
        .width = @intCast(@min(2, @max(0, zg_width))),
        // `GraphemeBoundaryClass` declares no `init`, so a `.init(cp)`
        // decl literal cannot resolve here. The ziglyph-backed
        // classification lives in this file's `graphemeBoundaryClass`.
        .grapheme_boundary_class = graphemeBoundaryClass(cp),
    };
}
/// Runnable binary that generates the lookup tables and writes them to
/// stdout as Zig source.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const alloc = arena.allocator();

    // Context handed to the generator: how to look up the properties of
    // a codepoint and how to compare two property sets.
    const Context = struct {
        pub fn get(ctx: @This(), cp: u21) !Properties {
            _ = ctx;
            return props.get(cp);
        }

        pub fn eql(ctx: @This(), a: Properties, b: Properties) bool {
            _ = ctx;
            return a.eql(b);
        }
    };
    const generator: lut.Generator(Properties, Context) = .{};

    const tables = try generator.generate(alloc);
    defer alloc.free(tables.stage1);
    defer alloc.free(tables.stage2);
    defer alloc.free(tables.stage3);

    try tables.writeZig(std.io.getStdOut().writer());

    // Uncomment when manually debugging to see our table sizes.
    // std.log.warn("stage1={} stage2={} stage3={}", .{
    //     tables.stage1.len,
    //     tables.stage2.len,
    //     tables.stage3.len,
    // });
}
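// This is not very fast in debug modes, so it's commented out by default.
// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
// NOTE: `table` is not imported in this file; bring it in from
// `props_table.zig` when uncommenting.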
// test "unicode props: tables match ziglyph" {
// const testing = std.testing;
//
// const min = 0xFF + 1; // start outside ascii
// for (min..std.math.maxInt(u21)) |cp| {
// const t = table.get(@intCast(cp));
// const zg = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half)));
// if (t.width != zg) {
// std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });
// try testing.expect(false);
// }
// }
// }