unicode: isolate properties, tables, and ziglyph into separate files
This makes it cleaner to add new sources of table generation and also avoids inadvertently depending on different modules (despite Zig's lazy analysis). This also fixes up terminal to only use our lookup tables, which avoids bringing ziglyph in for the terminal module. pull/8810/head
parent
c277ef8d82
commit
bf1278deff
|
|
@ -344,7 +344,7 @@ pub fn print(self: *Terminal, c: u21) !void {
|
||||||
// VS15 makes it narrow.
|
// VS15 makes it narrow.
|
||||||
if (c == 0xFE0F or c == 0xFE0E) {
|
if (c == 0xFE0F or c == 0xFE0E) {
|
||||||
// This only applies to emoji
|
// This only applies to emoji
|
||||||
const prev_props = unicode.getProperties(prev.cell.content.codepoint);
|
const prev_props = unicode.table.get(prev.cell.content.codepoint);
|
||||||
const emoji = prev_props.grapheme_boundary_class.isExtendedPictographic();
|
const emoji = prev_props.grapheme_boundary_class.isExtendedPictographic();
|
||||||
if (!emoji) return;
|
if (!emoji) return;
|
||||||
|
|
||||||
|
|
@ -470,7 +470,7 @@ pub fn print(self: *Terminal, c: u21) !void {
|
||||||
|
|
||||||
// If this is an emoji variation selector, prev must be an emoji
|
// If this is an emoji variation selector, prev must be an emoji
|
||||||
if (c == 0xFE0F or c == 0xFE0E) {
|
if (c == 0xFE0F or c == 0xFE0E) {
|
||||||
const prev_props = unicode.getProperties(prev.content.codepoint);
|
const prev_props = unicode.table.get(prev.content.codepoint);
|
||||||
const emoji = prev_props.grapheme_boundary_class == .extended_pictographic;
|
const emoji = prev_props.grapheme_boundary_class == .extended_pictographic;
|
||||||
if (!emoji) return;
|
if (!emoji) return;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,75 @@
|
||||||
|
//! Property set per codepoint that Ghostty cares about.
|
||||||
|
//!
|
||||||
|
//! Adding to this lets you find new properties but also potentially makes
|
||||||
|
//! our lookup tables less efficient. Any changes to this should run the
|
||||||
|
//! benchmarks in src/bench to verify that we haven't regressed.
|
||||||
|
const Properties = @This();
|
||||||
|
|
||||||
|
const std = @import("std");
|
||||||
|
|
||||||
|
/// Codepoint width. We clamp to [0, 2] since Ghostty handles control
|
||||||
|
/// characters and we max out at 2 for wide characters (e.g. a 3-em dash
|
||||||
|
/// becomes a 2-em dash).
|
||||||
|
width: u2 = 0,
|
||||||
|
|
||||||
|
/// Grapheme boundary class.
|
||||||
|
grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
|
||||||
|
|
||||||
|
/// Field-wise equality of two property sets: both the width and the
/// grapheme boundary class must match. Needed for lut.Generator.
pub fn eql(a: Properties, b: Properties) bool {
    if (a.width != b.width) return false;
    return a.grapheme_boundary_class == b.grapheme_boundary_class;
}
|
||||||
|
|
||||||
|
/// Writes this property set to `writer` in Zig struct-literal syntax
/// (`.{ .width= N, .grapheme_boundary_class= .tag, }`). The `layout` and
/// `opts` arguments are accepted but ignored. Needed for lut.Generator.
pub fn format(
    self: Properties,
    comptime layout: []const u8,
    opts: std.fmt.FormatOptions,
    writer: anytype,
) !void {
    // Neither the layout string nor the options influence the output.
    _ = opts;
    _ = layout;

    // Doubled braces are escapes and come out as literal `{` / `}`.
    const template =
        \\.{{
        \\ .width= {},
        \\ .grapheme_boundary_class= .{s},
        \\}}
    ;
    const class_name = @tagName(self.grapheme_boundary_class);
    try std.fmt.format(writer, template, .{ self.width, class_name });
}
|
||||||
|
|
||||||
|
/// Grapheme boundary classes that Ghostty distinguishes. The list is
/// deliberately not exhaustive: control, CR, LF, etc. are omitted because
/// the terminal handles them, so they are impossible in Ghostty's usage.
pub const GraphemeBoundaryClass = enum(u4) {
    invalid,
    L,
    V,
    T,
    LV,
    LVT,
    prepend,
    extend,
    zwj,
    spacing_mark,
    regional_indicator,
    extended_pictographic,
    extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
    emoji_modifier, // \p{Emoji_Modifier}

    /// Reports whether this class counts as extended pictographic. Prefer
    /// this over comparing against `.extended_pictographic` directly,
    /// because more than one tag is classified as extended pictographic.
    pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
        return self == .extended_pictographic or
            self == .extended_pictographic_base;
    }
};
|
||||||
|
|
@ -1,14 +1,13 @@
|
||||||
pub const lut = @import("lut.zig");
|
pub const lut = @import("lut.zig");
|
||||||
|
|
||||||
const grapheme = @import("grapheme.zig");
|
const grapheme = @import("grapheme.zig");
|
||||||
const props = @import("props.zig");
|
pub const table = @import("props_table.zig").table;
|
||||||
pub const table = props.table;
|
pub const Properties = @import("Properties.zig");
|
||||||
pub const Properties = props.Properties;
|
|
||||||
pub const getProperties = props.get;
|
|
||||||
pub const graphemeBreak = grapheme.graphemeBreak;
|
pub const graphemeBreak = grapheme.graphemeBreak;
|
||||||
pub const GraphemeBreakState = grapheme.BreakState;
|
pub const GraphemeBreakState = grapheme.BreakState;
|
||||||
|
|
||||||
test {
|
test {
|
||||||
|
_ = @import("props_ziglyph.zig");
|
||||||
_ = @import("symbols.zig");
|
_ = @import("symbols.zig");
|
||||||
@import("std").testing.refAllDecls(@This());
|
@import("std").testing.refAllDecls(@This());
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,18 @@
|
||||||
|
const Properties = @import("Properties.zig");
|
||||||
|
const lut = @import("lut.zig");
|
||||||
|
|
||||||
|
/// The lookup tables for Ghostty.
pub const table: lut.Tables(Properties) = table: {
    // The `unicode_tables` module only becomes available after a generator
    // has been run as part of the Ghostty build.zig process, but thanks to
    // Zig's lazy analysis it can still be referenced from here.
    //
    // An example of such a generator is the `main` in `props_ziglyph.zig`.
    const generated = @import("unicode_tables").Tables(Properties);
    break :table .{
        .stage1 = &generated.stage1,
        .stage2 = &generated.stage2,
        .stage3 = &generated.stage3,
    };
};
|
||||||
|
|
@ -0,0 +1,96 @@
|
||||||
|
const props = @This();
|
||||||
|
|
||||||
|
const std = @import("std");
|
||||||
|
const assert = std.debug.assert;
|
||||||
|
const ziglyph = @import("ziglyph");
|
||||||
|
const lut = @import("lut.zig");
|
||||||
|
const Properties = @import("Properties.zig");
|
||||||
|
const GraphemeBoundaryClass = Properties.GraphemeBoundaryClass;
|
||||||
|
|
||||||
|
/// Classifies a codepoint into one of the grapheme boundary classes that
/// Ghostty tracks, by querying ziglyph. This is VERY SLOW and is meant to
/// be used only when generating lookup tables.
fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
    const emoji = ziglyph.emoji;
    const gb = ziglyph.grapheme_break;

    // Emoji classes are checked first. Modifier bases are special-cased
    // because we should not break if a modifier isn't next to a base.
    if (emoji.isEmojiModifierBase(cp)) {
        assert(emoji.isExtendedPictographic(cp));
        return .extended_pictographic_base;
    }
    if (emoji.isEmojiModifier(cp)) return .emoji_modifier;
    if (emoji.isExtendedPictographic(cp)) return .extended_pictographic;

    // Hangul syllable classes.
    if (gb.isL(cp)) return .L;
    if (gb.isV(cp)) return .V;
    if (gb.isT(cp)) return .T;
    if (gb.isLv(cp)) return .LV;
    if (gb.isLvt(cp)) return .LVT;

    // Remaining classes we care about.
    if (gb.isPrepend(cp)) return .prepend;
    if (gb.isExtend(cp)) return .extend;
    if (gb.isZwj(cp)) return .zwj;
    if (gb.isSpacingmark(cp)) return .spacing_mark;
    if (gb.isRegionalIndicator(cp)) return .regional_indicator;

    // Not truly "invalid": every codepoint has SOME grapheme boundary
    // class. We just don't care about anything that fell through the
    // checks above.
    return .invalid;
}
|
||||||
|
|
||||||
|
/// Computes the properties for a single codepoint directly from ziglyph.
///
/// The display width reported by ziglyph is clamped to [0, 2] so that it
/// fits the `u2` width field. The grapheme boundary class comes from
/// `graphemeBoundaryClass`, which is documented as very slow, so this is
/// intended for table generation rather than per-character lookups.
pub fn get(cp: u21) Properties {
    const zg_width = ziglyph.display_width.codePointWidth(cp, .half);
    return .{
        .width = @intCast(@min(2, @max(0, zg_width))),
        // `Properties.GraphemeBoundaryClass` declares no `init`, so the
        // classification has to go through the ziglyph-backed helper in
        // this file.
        .grapheme_boundary_class = graphemeBoundaryClass(cp),
    };
}
|
||||||
|
|
||||||
|
/// Runnable binary that generates the lookup tables from ziglyph-derived
/// properties and writes them to stdout as Zig source.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const alloc = arena.allocator();

    // Stateless adapter giving the generator its per-codepoint lookup and
    // its equality check.
    const Context = struct {
        pub fn get(ctx: @This(), cp: u21) !Properties {
            _ = ctx;
            return props.get(cp);
        }

        pub fn eql(ctx: @This(), a: Properties, b: Properties) bool {
            _ = ctx;
            return a.eql(b);
        }
    };
    const gen: lut.Generator(Properties, Context) = .{};

    const tables = try gen.generate(alloc);
    defer alloc.free(tables.stage1);
    defer alloc.free(tables.stage2);
    defer alloc.free(tables.stage3);

    const stdout = std.io.getStdOut().writer();
    try tables.writeZig(stdout);

    // Uncomment when manually debugging to see our table sizes.
    // std.log.warn("stage1={} stage2={} stage3={}", .{
    //     tables.stage1.len,
    //     tables.stage2.len,
    //     tables.stage3.len,
    // });
}
|
||||||
|
|
||||||
|
// This is not very fast in debug modes, so it's commented out by default.
|
||||||
|
// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
|
||||||
|
// test "unicode props: tables match ziglyph" {
|
||||||
|
// const testing = std.testing;
|
||||||
|
//
|
||||||
|
// const min = 0xFF + 1; // start outside ascii
|
||||||
|
// for (min..std.math.maxInt(u21)) |cp| {
|
||||||
|
// const t = @import("props_table.zig").table.get(@intCast(cp));
|
||||||
|
// const zg = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half)));
|
||||||
|
// if (t.width != zg) {
|
||||||
|
// std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });
|
||||||
|
// try testing.expect(false);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
Loading…
Reference in New Issue