update uucode and cleanups

pull/8757/head
Jacob Sandlund 2025-09-18 09:26:09 -04:00
parent ec5e1e504d
commit 3275903611
5 changed files with 59 additions and 225 deletions

View File

@ -37,8 +37,8 @@
.lazy = true,
},
.uucode = .{
.url = "https://github.com/jacobsandlund/uucode/archive/a1833012b50197bdf7d43543e38f8be5c5a75016.tar.gz",
.hash = "uucode-0.0.0-ZZjBPu_RPwA01mgYvIKApZJX_JMUTKD2kyBsyYCdzfaz",
.url = "https://github.com/jacobsandlund/uucode/archive/ef173d765bd756eeecf7ce89f93c4f70c9038ab6.tar.gz",
.hash = "uucode-0.0.0-ZZjBPtMqQABaVqHdy8MX_XwChpQyZBAGchp-1cPuiQ6J",
},
.zig_wayland = .{
// codeberg ifreund/zig-wayland

View File

@ -138,146 +138,13 @@ fn stepTable(ptr: *anyopaque) Benchmark.Error!void {
}
}
// Boundary class type, taken from uucode's `grapheme_boundary_class` field
// via `uucode.TypeOfX`.
const GraphemeBoundaryClass = uucode.TypeOfX(.grapheme_boundary_class);
/// State carried from one break check to the next by
/// `computeGraphemeBoundaryClass`. It records which multi-code-point
/// sequence, if any, the preceding pairs have started.
const BreakState = enum(u3) {
/// No sequence is being tracked.
default,
/// Set after a regional indicator pair has been joined; the next
/// regional indicator pair then breaks and resets the state.
regional_indicator,
/// Set once an extended pictographic has been followed by extend or ZWJ;
/// kept while only extend / ZWJ / extended pictographic classes follow.
extended_pictographic,
};
/// Decides whether a grapheme cluster break falls between two adjacent
/// code points, given their boundary classes `gb1` (left) and `gb2` (right).
///
/// `state` carries context between calls and is updated in place: it tracks
/// regional indicator pairs (GB12/GB13) and emoji ZWJ sequences (GB11).
///
/// Returns `false` when one of the rules below forbids a break between the
/// two classes, and `true` otherwise.
///
/// The rules are order-dependent: GB9 is deliberately checked last so that
/// the stateful GB11 handling sees extend / ZWJ first.
fn computeGraphemeBoundaryClass(
gb1: GraphemeBoundaryClass,
gb2: GraphemeBoundaryClass,
state: *BreakState,
) bool {
// Set state back to default when `gb1` or `gb2` is not expected in sequence.
switch (state.*) {
.regional_indicator => {
// A regional indicator run only continues if both sides are
// regional indicators.
if (gb1 != .regional_indicator or gb2 != .regional_indicator) {
state.* = .default;
}
},
.extended_pictographic => {
// An emoji ZWJ sequence may only contain these three classes on
// either side; anything else abandons the sequence.
switch (gb1) {
.extend,
.zwj,
.extended_pictographic,
=> {},
else => state.* = .default,
}
switch (gb2) {
.extend,
.zwj,
.extended_pictographic,
=> {},
else => state.* = .default,
}
},
.default => {},
}
// GB6: L x (L | V | LV | LVT)
if (gb1 == .L) {
if (gb2 == .L or
gb2 == .V or
gb2 == .LV or
gb2 == .LVT) return false;
}
// GB7: (LV | V) x (V | T)
if (gb1 == .LV or gb1 == .V) {
if (gb2 == .V or gb2 == .T) return false;
}
// GB8: (LVT | T) x T
if (gb1 == .LVT or gb1 == .T) {
if (gb2 == .T) return false;
}
// Handle GB9 (Extend | ZWJ) later, since it can also match the start of
// GB9c (Indic) and GB11 (Emoji ZWJ)
// GB9a: SpacingMark
if (gb2 == .spacing_mark) return false;
// GB9b: Prepend
if (gb1 == .prepend) return false;
// GB11: Emoji ZWJ sequence
if (gb1 == .extended_pictographic) {
// start of sequence:
// In normal operation, we'll be in this state, but
// precomputeGraphemeBreak iterates all states.
// std.debug.assert(state.* == .default);
if (gb2 == .extend or gb2 == .zwj) {
state.* = .extended_pictographic;
return false;
}
// else, not an Emoji ZWJ sequence
} else if (state.* == .extended_pictographic) {
// continue or end sequence:
if (gb1 == .extend and (gb2 == .extend or gb2 == .zwj)) {
// continue extend* ZWJ sequence
return false;
} else if (gb1 == .zwj and gb2 == .extended_pictographic) {
// ZWJ -> end of sequence
state.* = .default;
return false;
} else {
// Not a valid Emoji ZWJ sequence
state.* = .default;
}
}
// GB12 and GB13: Regional Indicator
if (gb1 == .regional_indicator and gb2 == .regional_indicator) {
if (state.* == .default) {
// First pair: join the two indicators and remember that a pair
// has been consumed.
state.* = .regional_indicator;
return false;
} else {
// A pair was already joined, so break before the next indicator
// and start over.
state.* = .default;
return true;
}
}
// GB9: x (Extend | ZWJ)
if (gb2 == .extend or gb2 == .zwj) return false;
// GB999: Otherwise, break everywhere
return true;
}
/// Reports whether a grapheme break falls between code points `cp1` and
/// `cp2`, updating `state` in place for the next call.
///
/// The decision is a lookup in a table built at comptime by
/// `uucode.grapheme.precomputeGraphemeBreak` from
/// `computeGraphemeBoundaryClass`; no rule logic is invoked directly here.
/// The table is indexed by the boundary classes of both code points plus
/// the incoming state.
pub fn isBreak(
    cp1: u21,
    cp2: u21,
    state: *BreakState,
) bool {
    const lookup = comptime uucode.grapheme.precomputeGraphemeBreak(
        GraphemeBoundaryClass,
        BreakState,
        computeGraphemeBoundaryClass,
    );
    const entry = lookup.get(
        uucode.getX(.grapheme_boundary_class, cp1),
        uucode.getX(.grapheme_boundary_class, cp2),
        state.*,
    );
    // Each table entry carries both the answer and the successor state.
    state.* = entry.state;
    return entry.result;
}
fn stepUucode(ptr: *anyopaque) Benchmark.Error!void {
const self: *GraphemeBreak = @ptrCast(@alignCast(ptr));
const f = self.data_f orelse return;
var r = std.io.bufferedReader(f.reader());
var d: UTF8Decoder = .{};
var state: BreakState = .default;
var state: uucode.grapheme.BreakState = .default;
var cp1: u21 = 0;
var buf: [4096]u8 = undefined;
while (true) {
@ -291,7 +158,7 @@ fn stepUucode(ptr: *anyopaque) Benchmark.Error!void {
const cp_, const consumed = d.next(c);
assert(consumed);
if (cp_) |cp2| {
std.mem.doNotOptimizeAway(isBreak(cp1, @intCast(cp2), &state));
std.mem.doNotOptimizeAway(uucode.grapheme.isBreak(cp1, @intCast(cp2), &state));
cp1 = cp2;
}
}

View File

@ -1,96 +1,42 @@
const std = @import("std");
const config = @import("config.zig");
const config_x = @import("config.x.zig");
const d = config.default;
const wcwidth = config_x.wcwidth;
const Allocator = std.mem.Allocator;
pub const log_level = .debug;
fn computeWidth(cp: u21, data: anytype, backing: anytype, tracking: anytype) void {
fn computeWidth(
alloc: std.mem.Allocator,
cp: u21,
data: anytype,
backing: anytype,
tracking: anytype,
) Allocator.Error!void {
_ = alloc;
_ = cp;
_ = backing;
_ = tracking;
if (data.wcwidth < 0) {
data.width = 0;
} else if (data.wcwidth > 2) {
data.width = 2;
} else {
data.width = @intCast(data.wcwidth);
}
data.width = @intCast(@min(2, @max(0, data.wcwidth)));
}
const width = config.Extension{ .inputs = &.{"wcwidth"}, .compute = &computeWidth, .fields = &.{
.{ .name = "width", .type = u2 },
} };
/// Grapheme boundary class stored in the `grapheme_boundary_class` field.
/// Values are assigned per code point by `computeGraphemeBoundaryClass`
/// from the `grapheme_break`, `is_emoji_modifier` and
/// `is_emoji_modifier_base` inputs.
pub const GraphemeBoundaryClass = enum(u4) {
// Assigned for every grapheme_break value not mapped elsewhere
// (other, indic_conjunct_break_consonant, cr, lf, control).
invalid,
// L, V, T, LV, LVT mirror the grapheme_break values l, v, t, lv, lvt.
L,
V,
T,
LV,
LVT,
prepend,
// Also assigned for zwnj and the indic_conjunct_break extend/linker values.
extend,
zwj,
spacing_mark,
regional_indicator,
extended_pictographic,
extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
emoji_modifier, // \p{Emoji_Modifier}
};
/// Fills in `data.grapheme_boundary_class` from the code point's
/// `is_emoji_modifier`, `is_emoji_modifier_base` and `grapheme_break` data.
///
/// The emoji modifier flags take priority over `grapheme_break`. Only
/// `data` is used; `cp`, `backing` and `tracking` are ignored.
fn computeGraphemeBoundaryClass(cp: u21, data: anytype, backing: anytype, tracking: anytype) void {
    _ = tracking;
    _ = backing;
    _ = cp;

    data.grapheme_boundary_class = if (data.is_emoji_modifier)
        .emoji_modifier
    else if (data.is_emoji_modifier_base)
        .extended_pictographic_base
    else switch (data.grapheme_break) {
        // Hangul syllable classes map one-to-one.
        .l => .L,
        .v => .V,
        .t => .T,
        .lv => .LV,
        .lvt => .LVT,

        .prepend => .prepend,
        .spacing_mark => .spacing_mark,
        .regional_indicator => .regional_indicator,
        .extended_pictographic => .extended_pictographic,
        .zwj => .zwj,

        // Everything that behaves like Extend for our purposes.
        .zwnj,
        .indic_conjunct_break_extend,
        .indic_conjunct_break_linker,
        => .extend,

        // Every code point does have a real boundary class, so `.invalid`
        // is a misnomer here: it simply marks the classes we do not need
        // to distinguish.
        .other,
        .indic_conjunct_break_consonant,
        .cr,
        .lf,
        .control,
        => .invalid,
    };
}
const grapheme_boundary_class = config.Extension{
.inputs = &.{
"grapheme_break",
"is_emoji_modifier",
"is_emoji_modifier_base",
},
.compute = &computeGraphemeBoundaryClass,
const width = config.Extension{
.inputs = &.{"wcwidth"},
.compute = &computeWidth,
.fields = &.{
.{ .name = "grapheme_boundary_class", .type = GraphemeBoundaryClass },
.{ .name = "width", .type = u2 },
},
};
fn computeIsSymbol(cp: u21, data: anytype, backing: anytype, tracking: anytype) void {
fn computeIsSymbol(
alloc: Allocator,
cp: u21,
data: anytype,
backing: anytype,
tracking: anytype,
) Allocator.Error!void {
_ = alloc;
_ = cp;
_ = backing;
_ = tracking;
@ -117,24 +63,26 @@ pub const tables = [_]config.Table{
.{
.extensions = &.{wcwidth},
.fields = &.{
wcwidth.field("wcwidth"),
d.field("general_category"),
d.field("block"),
d.field("is_emoji_presentation"),
d.field("case_folding_full"),
// Alternative:
// d.field("case_folding_simple"),
d.field("is_emoji_modifier"),
d.field("is_emoji_modifier_base"),
d.field("grapheme_break"),
},
},
.{
.stages = .two,
.extensions = &.{ wcwidth, width, grapheme_boundary_class },
.extensions = &.{ wcwidth, width },
.fields = &.{
width.field("width"),
grapheme_boundary_class.field("grapheme_boundary_class"),
},
},
.{
.stages = .two,
.extensions = &.{},
.fields = &.{
d.field("grapheme_break"),
},
},
.{

View File

@ -1629,7 +1629,11 @@ pub const Trigger = struct {
// If more codepoints are produced then we return the codepoint
// as-is which isn't correct but until we have a failing test
// then I don't want to handle this.
return uucode.get(.case_folding_full, cp).array(cp);
var buffer: [1]u21 = undefined;
const slice = uucode.get(.case_folding_full, cp).with(&buffer, cp);
var array: [3]u21 = [_]u21{0} ** 3;
@memcpy(array[0..slice.len], slice);
return array;
}
/// Convert the trigger to a C API compatible trigger.

View File

@ -61,7 +61,22 @@ pub const Properties = struct {
/// Possible grapheme boundary classes. This isn't an exhaustive list:
/// we omit control, CR, LF, etc. because in Ghostty's usage those are
/// impossible: they're handled by the terminal.
pub const GraphemeBoundaryClass = uucode.TypeOfX(.grapheme_boundary_class);
pub const GraphemeBoundaryClass = enum(u4) {
// Catch-all for code points whose class is not tracked separately.
invalid,
// L, V, T, LV, LVT: presumably the Hangul syllable classes of UAX #29
// (rules GB6-GB8) — confirm against the rule implementation.
L,
V,
T,
LV,
LVT,
prepend,
extend,
zwj,
spacing_mark,
regional_indicator,
extended_pictographic,
extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
emoji_modifier, // \p{Emoji_Modifier}
};
/// Gets the grapheme boundary class for a codepoint.
/// The use case for this is only in generating lookup tables.
@ -113,13 +128,13 @@ pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
}
pub fn get(cp: u21) Properties {
const wcwidth = if (cp > uucode.config.max_code_point)
const width = if (cp > uucode.config.max_code_point)
0
else
uucode.get(.wcwidth, cp);
uucode.getX(.width, cp);
return .{
.width = @intCast(@min(2, @max(0, wcwidth))),
.width = width,
.grapheme_boundary_class = computeGraphemeBoundaryClass(cp),
};
}