add two LUT-based implementations of isSymbol
parent
968b9d536d
commit
a7da96faee
|
|
@ -0,0 +1,172 @@
|
||||||
|
//! This benchmark tests the throughput of the `isSymbol` codepoint check.
//! It can run the ziglyph-based implementation or either of the two
//! generated lookup tables (symbols1 and symbols2), selected by `mode`.
|
||||||
|
const IsSymbol = @This();
|
||||||
|
|
||||||
|
const std = @import("std");
|
||||||
|
const builtin = @import("builtin");
|
||||||
|
const assert = std.debug.assert;
|
||||||
|
const Allocator = std.mem.Allocator;
|
||||||
|
const Benchmark = @import("Benchmark.zig");
|
||||||
|
const options = @import("options.zig");
|
||||||
|
const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
|
||||||
|
const symbols1 = @import("../unicode/symbols1.zig");
|
||||||
|
const symbols2 = @import("../unicode/symbols2.zig");
|
||||||
|
|
||||||
|
const log = std.log.scoped(.@"is-symbol-bench");
|
||||||
|
|
||||||
|
opts: Options,
|
||||||
|
|
||||||
|
/// The file, opened in the setup function.
|
||||||
|
data_f: ?std.fs.File = null,
|
||||||
|
|
||||||
|
pub const Options = struct {
    /// Selects which implementation the benchmark exercises.
    mode: Mode = .ziglyph,

    /// Filepath of the data to read. "-" means read stdin. When this is
    /// unset the benchmark does nothing (it is a noop). Using stdin by
    /// default would be more unixy, but a CLI command that hangs with
    /// no interaction is a bit annoying.
    data: ?[]const u8 = null,
};
|
||||||
|
|
||||||
|
pub const Mode = enum {
    /// "Naive" implementation that goes through ziglyph.
    ziglyph,

    /// Ghostty's table-based approach, using the `symbols1` table.
    table1,

    /// Ghostty's table-based approach, using the `symbols2` table.
    table2,
};
|
||||||
|
|
||||||
|
/// Allocate a new `IsSymbol` benchmark configured with `opts`.
///
/// The returned pointer is owned by the caller and should be released
/// with `destroy`. The only failure mode is the allocation itself.
pub fn create(
    alloc: Allocator,
    opts: Options,
) !*IsSymbol {
    const ptr = try alloc.create(IsSymbol);
    // Nothing below can fail, so no errdefer is needed for `ptr`.
    ptr.* = .{ .opts = opts };
    return ptr;
}
|
||||||
|
|
||||||
|
/// Free a benchmark instance that was allocated by `create` with `alloc`.
pub fn destroy(self: *IsSymbol, alloc: Allocator) void {
    alloc.destroy(self);
}
|
||||||
|
|
||||||
|
/// Build the `Benchmark` descriptor for this instance. The step function
/// is chosen from the configured mode; setup and teardown are shared by
/// all modes.
pub fn benchmark(self: *IsSymbol) Benchmark {
    return .init(self, .{
        .stepFn = switch (self.opts.mode) {
            .ziglyph => stepZiglyph,
            .table1 => stepTable1,
            // This used to point at stepTable1, which meant the symbols2
            // table was never actually benchmarked.
            .table2 => stepTable2,
        },
        .setupFn = setup,
        .teardownFn = teardown,
    });
}
|
||||||
|
|
||||||
|
/// Benchmark setup hook: opens the configured data source and stores the
/// result in `data_f`. Returns `error.BenchmarkFailed` (after logging the
/// cause) if the data file cannot be opened.
fn setup(ptr: *anyopaque) Benchmark.Error!void {
    const self: *IsSymbol = @ptrCast(@alignCast(ptr));

    // We must not already hold a file from a previous setup.
    assert(self.data_f == null);

    // Open our data file to prepare for reading. We can do more
    // validation here eventually.
    const file = options.dataFile(self.opts.data) catch |err| {
        log.warn("error opening data file err={}", .{err});
        return error.BenchmarkFailed;
    };
    self.data_f = file;
}
|
||||||
|
|
||||||
|
/// Benchmark teardown hook: closes the data file if one is open and
/// clears `data_f` so that `setup` may run again.
fn teardown(ptr: *anyopaque) void {
    const self: *IsSymbol = @ptrCast(@alignCast(ptr));
    const file = self.data_f orelse return;
    file.close();
    self.data_f = null;
}
|
||||||
|
|
||||||
|
/// Step function for the `.ziglyph` mode. Reads the data file in chunks,
/// decodes it with `UTF8Decoder` and calls `symbols1.isSymbol` on every
/// decoded codepoint. Does nothing if no data file is open.
fn stepZiglyph(ptr: *anyopaque) Benchmark.Error!void {
    const self: *IsSymbol = @ptrCast(@alignCast(ptr));
    const file = self.data_f orelse return;

    var reader = std.io.bufferedReader(file.reader());
    var decoder: UTF8Decoder = .{};
    var chunk: [4096]u8 = undefined;
    while (true) {
        const len = reader.read(&chunk) catch |err| {
            log.warn("error reading data file err={}", .{err});
            return error.BenchmarkFailed;
        };
        if (len == 0) return; // EOF reached

        for (chunk[0..len]) |byte| {
            const maybe_cp, const consumed = decoder.next(byte);
            // NOTE(review): assumes the decoder always consumes the byte;
            // confirm how UTF8Decoder reports invalid input.
            assert(consumed);
            const cp = maybe_cp orelse continue;
            std.mem.doNotOptimizeAway(symbols1.isSymbol(cp));
        }
    }
}
|
||||||
|
|
||||||
|
/// Step function for the `.table1` mode. Reads the data file in chunks,
/// decodes it with `UTF8Decoder` and looks every decoded codepoint up in
/// `symbols1.table`. Does nothing if no data file is open.
fn stepTable1(ptr: *anyopaque) Benchmark.Error!void {
    const self: *IsSymbol = @ptrCast(@alignCast(ptr));
    const file = self.data_f orelse return;

    var reader = std.io.bufferedReader(file.reader());
    var decoder: UTF8Decoder = .{};
    var chunk: [4096]u8 = undefined;
    while (true) {
        const len = reader.read(&chunk) catch |err| {
            log.warn("error reading data file err={}", .{err});
            return error.BenchmarkFailed;
        };
        if (len == 0) return; // EOF reached

        for (chunk[0..len]) |byte| {
            const maybe_cp, const consumed = decoder.next(byte);
            // NOTE(review): assumes the decoder always consumes the byte;
            // confirm how UTF8Decoder reports invalid input.
            assert(consumed);
            const cp = maybe_cp orelse continue;
            std.mem.doNotOptimizeAway(symbols1.table.get(cp));
        }
    }
}
|
||||||
|
|
||||||
|
/// Step function for the `.table2` mode. Reads the data file in chunks,
/// decodes it with `UTF8Decoder` and looks every decoded codepoint up in
/// `symbols2.table`. Does nothing if no data file is open.
fn stepTable2(ptr: *anyopaque) Benchmark.Error!void {
    const self: *IsSymbol = @ptrCast(@alignCast(ptr));
    const file = self.data_f orelse return;

    var reader = std.io.bufferedReader(file.reader());
    var decoder: UTF8Decoder = .{};
    var chunk: [4096]u8 = undefined;
    while (true) {
        const len = reader.read(&chunk) catch |err| {
            log.warn("error reading data file err={}", .{err});
            return error.BenchmarkFailed;
        };
        if (len == 0) return; // EOF reached

        for (chunk[0..len]) |byte| {
            const maybe_cp, const consumed = decoder.next(byte);
            // NOTE(review): assumes the decoder always consumes the byte;
            // confirm how UTF8Decoder reports invalid input.
            assert(consumed);
            const cp = maybe_cp orelse continue;
            std.mem.doNotOptimizeAway(symbols2.table.get(cp));
        }
    }
}
|
||||||
|
|
||||||
|
// Smoke test: with default options (no data file) a single benchmark run
// must complete without error.
test IsSymbol {
    const alloc = std.testing.allocator;

    const impl = try IsSymbol.create(alloc, .{});
    defer impl.destroy(alloc);

    const bench = impl.benchmark();
    _ = try bench.run(.once);
}
|
||||||
|
|
@ -10,6 +10,7 @@ pub const Action = enum {
|
||||||
@"grapheme-break",
|
@"grapheme-break",
|
||||||
@"terminal-parser",
|
@"terminal-parser",
|
||||||
@"terminal-stream",
|
@"terminal-stream",
|
||||||
|
@"is-symbol",
|
||||||
|
|
||||||
/// Returns the struct associated with the action. The struct
|
/// Returns the struct associated with the action. The struct
|
||||||
/// should have a few decls:
|
/// should have a few decls:
|
||||||
|
|
@ -25,6 +26,7 @@ pub const Action = enum {
|
||||||
.@"codepoint-width" => @import("CodepointWidth.zig"),
|
.@"codepoint-width" => @import("CodepointWidth.zig"),
|
||||||
.@"grapheme-break" => @import("GraphemeBreak.zig"),
|
.@"grapheme-break" => @import("GraphemeBreak.zig"),
|
||||||
.@"terminal-parser" => @import("TerminalParser.zig"),
|
.@"terminal-parser" => @import("TerminalParser.zig"),
|
||||||
|
.@"is-symbol" => @import("IsSymbol.zig"),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ pub const TerminalStream = @import("TerminalStream.zig");
|
||||||
pub const CodepointWidth = @import("CodepointWidth.zig");
|
pub const CodepointWidth = @import("CodepointWidth.zig");
|
||||||
pub const GraphemeBreak = @import("GraphemeBreak.zig");
|
pub const GraphemeBreak = @import("GraphemeBreak.zig");
|
||||||
pub const TerminalParser = @import("TerminalParser.zig");
|
pub const TerminalParser = @import("TerminalParser.zig");
|
||||||
|
pub const IsSymbol = @import("IsSymbol.zig");
|
||||||
|
|
||||||
test {
|
test {
|
||||||
@import("std").testing.refAllDecls(@This());
|
@import("std").testing.refAllDecls(@This());
|
||||||
|
|
|
||||||
|
|
@ -61,6 +61,7 @@ emit_termcap: bool = false,
|
||||||
emit_test_exe: bool = false,
|
emit_test_exe: bool = false,
|
||||||
emit_xcframework: bool = false,
|
emit_xcframework: bool = false,
|
||||||
emit_webdata: bool = false,
|
emit_webdata: bool = false,
|
||||||
|
emit_unicode_table_gen: bool = false,
|
||||||
|
|
||||||
/// Environmental properties
|
/// Environmental properties
|
||||||
env: std.process.EnvMap,
|
env: std.process.EnvMap,
|
||||||
|
|
@ -299,6 +300,12 @@ pub fn init(b: *std.Build) !Config {
|
||||||
"Build and install test executables with 'build'",
|
"Build and install test executables with 'build'",
|
||||||
) orelse false;
|
) orelse false;
|
||||||
|
|
||||||
|
config.emit_unicode_table_gen = b.option(
|
||||||
|
bool,
|
||||||
|
"emit-unicode-table-gen",
|
||||||
|
"Build and install executables that generate unicode tables with 'build'",
|
||||||
|
) orelse false;
|
||||||
|
|
||||||
config.emit_bench = b.option(
|
config.emit_bench = b.option(
|
||||||
bool,
|
bool,
|
||||||
"emit-bench",
|
"emit-bench",
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,7 @@ pub fn init(b: *std.Build, cfg: *const Config) !SharedDeps {
|
||||||
.metallib = undefined,
|
.metallib = undefined,
|
||||||
};
|
};
|
||||||
try result.initTarget(b, cfg.target);
|
try result.initTarget(b, cfg.target);
|
||||||
|
if (cfg.emit_unicode_table_gen) result.unicode_tables.install(b);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,14 +4,18 @@ const std = @import("std");
|
||||||
const Config = @import("Config.zig");
|
const Config = @import("Config.zig");
|
||||||
|
|
||||||
/// The exe.
|
/// The exe.
|
||||||
exe: *std.Build.Step.Compile,
|
props_exe: *std.Build.Step.Compile,
|
||||||
|
symbols1_exe: *std.Build.Step.Compile,
|
||||||
|
symbols2_exe: *std.Build.Step.Compile,
|
||||||
|
|
||||||
/// The output path for the unicode tables
|
/// The output path for the unicode tables
|
||||||
output: std.Build.LazyPath,
|
props_output: std.Build.LazyPath,
|
||||||
|
symbols1_output: std.Build.LazyPath,
|
||||||
|
symbols2_output: std.Build.LazyPath,
|
||||||
|
|
||||||
pub fn init(b: *std.Build) !UnicodeTables {
|
pub fn init(b: *std.Build) !UnicodeTables {
|
||||||
const exe = b.addExecutable(.{
|
const props_exe = b.addExecutable(.{
|
||||||
.name = "unigen",
|
.name = "props-unigen",
|
||||||
.root_module = b.createModule(.{
|
.root_module = b.createModule(.{
|
||||||
.root_source_file = b.path("src/unicode/props.zig"),
|
.root_source_file = b.path("src/unicode/props.zig"),
|
||||||
.target = b.graph.host,
|
.target = b.graph.host,
|
||||||
|
|
@ -21,31 +25,72 @@ pub fn init(b: *std.Build) !UnicodeTables {
|
||||||
}),
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const symbols1_exe = b.addExecutable(.{
|
||||||
|
.name = "symbols1-unigen",
|
||||||
|
.root_module = b.createModule(.{
|
||||||
|
.root_source_file = b.path("src/unicode/symbols1.zig"),
|
||||||
|
.target = b.graph.host,
|
||||||
|
.strip = false,
|
||||||
|
.omit_frame_pointer = false,
|
||||||
|
.unwind_tables = .sync,
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
|
||||||
|
const symbols2_exe = b.addExecutable(.{
|
||||||
|
.name = "symbols2-unigen",
|
||||||
|
.root_module = b.createModule(.{
|
||||||
|
.root_source_file = b.path("src/unicode/symbols2.zig"),
|
||||||
|
.target = b.graph.host,
|
||||||
|
.strip = false,
|
||||||
|
.omit_frame_pointer = false,
|
||||||
|
.unwind_tables = .sync,
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
|
||||||
if (b.lazyDependency("ziglyph", .{
|
if (b.lazyDependency("ziglyph", .{
|
||||||
.target = b.graph.host,
|
.target = b.graph.host,
|
||||||
})) |ziglyph_dep| {
|
})) |ziglyph_dep| {
|
||||||
exe.root_module.addImport(
|
inline for (&.{ props_exe, symbols1_exe, symbols2_exe }) |exe| {
|
||||||
"ziglyph",
|
exe.root_module.addImport(
|
||||||
ziglyph_dep.module("ziglyph"),
|
"ziglyph",
|
||||||
);
|
ziglyph_dep.module("ziglyph"),
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const run = b.addRunArtifact(exe);
|
const props_run = b.addRunArtifact(props_exe);
|
||||||
|
const symbols1_run = b.addRunArtifact(symbols1_exe);
|
||||||
|
const symbols2_run = b.addRunArtifact(symbols2_exe);
|
||||||
|
|
||||||
return .{
|
return .{
|
||||||
.exe = exe,
|
.props_exe = props_exe,
|
||||||
.output = run.captureStdOut(),
|
.symbols1_exe = symbols1_exe,
|
||||||
|
.symbols2_exe = symbols2_exe,
|
||||||
|
.props_output = props_run.captureStdOut(),
|
||||||
|
.symbols1_output = symbols1_run.captureStdOut(),
|
||||||
|
.symbols2_output = symbols2_run.captureStdOut(),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add the "unicode_tables" import.
|
/// Add the "unicode_tables" import.
|
||||||
pub fn addImport(self: *const UnicodeTables, step: *std.Build.Step.Compile) void {
|
pub fn addImport(self: *const UnicodeTables, step: *std.Build.Step.Compile) void {
|
||||||
self.output.addStepDependencies(&step.step);
|
self.props_output.addStepDependencies(&step.step);
|
||||||
step.root_module.addAnonymousImport("unicode_tables", .{
|
step.root_module.addAnonymousImport("unicode_tables", .{
|
||||||
.root_source_file = self.output,
|
.root_source_file = self.props_output,
|
||||||
|
});
|
||||||
|
self.symbols1_output.addStepDependencies(&step.step);
|
||||||
|
step.root_module.addAnonymousImport("symbols1_tables", .{
|
||||||
|
.root_source_file = self.symbols1_output,
|
||||||
|
});
|
||||||
|
self.symbols2_output.addStepDependencies(&step.step);
|
||||||
|
step.root_module.addAnonymousImport("symbols2_tables", .{
|
||||||
|
.root_source_file = self.symbols2_output,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Install the exe
|
/// Install the exe
|
||||||
pub fn install(self: *const UnicodeTables, b: *std.Build) void {
|
pub fn install(self: *const UnicodeTables, b: *std.Build) void {
|
||||||
b.installArtifact(self.exe);
|
b.installArtifact(self.props_exe);
|
||||||
|
b.installArtifact(self.symbols1_exe);
|
||||||
|
b.installArtifact(self.symbols2_exe);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,12 @@
|
||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const Allocator = std.mem.Allocator;
|
const Allocator = std.mem.Allocator;
|
||||||
const assert = std.debug.assert;
|
const assert = std.debug.assert;
|
||||||
const ziglyph = @import("ziglyph");
|
|
||||||
const font = @import("../font/main.zig");
|
const font = @import("../font/main.zig");
|
||||||
const terminal = @import("../terminal/main.zig");
|
const terminal = @import("../terminal/main.zig");
|
||||||
const renderer = @import("../renderer.zig");
|
const renderer = @import("../renderer.zig");
|
||||||
const shaderpkg = renderer.Renderer.API.shaders;
|
const shaderpkg = renderer.Renderer.API.shaders;
|
||||||
const ArrayListCollection = @import("../datastruct/array_list_collection.zig").ArrayListCollection;
|
const ArrayListCollection = @import("../datastruct/array_list_collection.zig").ArrayListCollection;
|
||||||
|
const symbols = @import("../unicode/symbols1.zig").table;
|
||||||
|
|
||||||
/// The possible cell content keys that exist.
|
/// The possible cell content keys that exist.
|
||||||
pub const Key = enum {
|
pub const Key = enum {
|
||||||
|
|
@ -249,15 +249,7 @@ pub fn isCovering(cp: u21) bool {
|
||||||
/// In the future it may be prudent to expand this to encompass more
|
/// In the future it may be prudent to expand this to encompass more
|
||||||
/// symbol-like characters, and/or exclude some PUA sections.
|
/// symbol-like characters, and/or exclude some PUA sections.
|
||||||
pub fn isSymbol(cp: u21) bool {
|
pub fn isSymbol(cp: u21) bool {
|
||||||
// TODO: This should probably become a codegen'd LUT
|
return symbols.get(cp);
|
||||||
return ziglyph.general_category.isPrivateUse(cp) or
|
|
||||||
ziglyph.blocks.isDingbats(cp) or
|
|
||||||
ziglyph.blocks.isEmoticons(cp) or
|
|
||||||
ziglyph.blocks.isMiscellaneousSymbols(cp) or
|
|
||||||
ziglyph.blocks.isEnclosedAlphanumerics(cp) or
|
|
||||||
ziglyph.blocks.isEnclosedAlphanumericSupplement(cp) or
|
|
||||||
ziglyph.blocks.isMiscellaneousSymbolsAndPictographs(cp) or
|
|
||||||
ziglyph.blocks.isTransportAndMapSymbols(cp);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the appropriate `constraint_width` for
|
/// Returns the appropriate `constraint_width` for
|
||||||
|
|
|
||||||
|
|
@ -142,6 +142,32 @@ pub fn Tables(comptime Elem: type) type {
|
||||||
return self.stage3[self.stage2[self.stage1[high] + low]];
|
return self.stage3[self.stage2[self.stage1[high] + low]];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Inline variant of the three-stage lookup: the upper bits of `cp`
/// select a stage1 entry, the low 8 bits are added to it to index
/// stage2, and that value indexes stage3.
pub inline fn getInline(self: *const Self, cp: u21) Elem {
    const stage2_index = self.stage1[cp >> 8] + (cp & 0xFF);
    const stage3_index = self.stage2[stage2_index];
    return self.stage3[stage3_index];
}
|
||||||
|
|
||||||
|
/// Boolean lookup that skips the stage3 load.
///
/// Only valid when `Elem` is `bool` and stage3 is exactly
/// `{ false, true }`: in that case the stage2 entry (0 or 1) already
/// encodes the answer, so it is compared against zero directly.
pub fn getBool(self: *const Self, cp: u21) bool {
    // The element type is known at compile time, so reject misuse there
    // instead of with a runtime assert that disappears in ReleaseFast.
    comptime assert(Elem == bool);

    // The table contents are runtime data; these remain runtime checks.
    assert(self.stage3.len == 2);
    assert(self.stage3[0] == false);
    assert(self.stage3[1] == true);

    const high = cp >> 8;
    const low = cp & 0xFF;
    return self.stage2[self.stage1[high] + low] != 0;
}
|
||||||
|
|
||||||
|
/// Inline boolean lookup that skips the stage3 load.
///
/// Only valid when `Elem` is `bool` and stage3 is exactly
/// `{ false, true }`: in that case the stage2 entry (0 or 1) already
/// encodes the answer, so it is compared against zero directly.
pub inline fn getBoolInline(self: *const Self, cp: u21) bool {
    // The element type is known at compile time, so reject misuse there
    // instead of with a runtime assert that disappears in ReleaseFast.
    comptime assert(Elem == bool);

    // The table contents are runtime data; these remain runtime checks.
    assert(self.stage3.len == 2);
    assert(self.stage3[0] == false);
    assert(self.stage3[1] == true);

    const high = cp >> 8;
    const low = cp & 0xFF;
    return self.stage2[self.stage1[high] + low] != 0;
}
|
||||||
|
|
||||||
/// Writes the lookup table as Zig to the given writer. The
|
/// Writes the lookup table as Zig to the given writer. The
|
||||||
/// written file exports three constants: stage1, stage2, and
|
/// written file exports three constants: stage1, stage2, and
|
||||||
/// stage3. These can be used to rebuild the lookup table in Zig.
|
/// stage3. These can be used to rebuild the lookup table in Zig.
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,183 @@
|
||||||
|
const std = @import("std");
|
||||||
|
const assert = std.debug.assert;
|
||||||
|
const Allocator = std.mem.Allocator;
|
||||||
|
|
||||||
|
// This whole file is based on the algorithm described here:
|
||||||
|
// https://here-be-braces.com/fast-lookup-of-unicode-properties/
|
||||||
|
|
||||||
|
const set_size = @typeInfo(usize).int.bits;
|
||||||
|
// const Set = std.bit_set.ArrayBitSet(usize, set_size);
|
||||||
|
const Set = std.bit_set.IntegerBitSet(set_size);
|
||||||
|
const cp_shift = std.math.log2_int(u21, set_size);
|
||||||
|
const cp_mask = set_size - 1;
|
||||||
|
|
||||||
|
/// Creates a type that is able to generate a 2-level lookup table
/// from a Unicode codepoint to a mapping of type bool. The lookup table
/// generally is expected to be codegen'd and then reloaded, although it
/// can in theory be generated at runtime.
///
/// Context must have one function:
///   - `get(Context, u21) bool`: returns the mapping for a given codepoint
///
pub fn Generator(
    comptime Context: type,
) type {
    return struct {
        const Self = @This();

        /// Mapping of a block (one bit set of `set_size` codepoints) to
        /// its index in the stage2 array. Used to deduplicate blocks.
        const SetMap = std.HashMap(
            Set,
            u16,
            struct {
                pub fn hash(ctx: @This(), k: Set) u64 {
                    _ = ctx;
                    var hasher = std.hash.Wyhash.init(0);
                    std.hash.autoHashStrat(&hasher, k, .DeepRecursive);
                    return hasher.final();
                }

                pub fn eql(ctx: @This(), a: Set, b: Set) bool {
                    _ = ctx;
                    return a.eql(b);
                }
            },
            std.hash_map.default_max_load_percentage,
        );

        ctx: Context = undefined,

        /// Generate the lookup tables. The arrays in the return value
        /// are owned by the caller and must be freed.
        ///
        /// `min`/`max` in the result are the smallest and largest codepoint
        /// for which the context returned true. If no codepoint matched
        /// they keep their initial values (maxInt(u21) and 0).
        ///
        /// Returns `error.Stage2TooLarge` if the number of distinct blocks
        /// does not fit in a u16, or an allocation error.
        pub fn generate(self: *const Self, alloc: Allocator) !Tables {
            var min: u21 = std.math.maxInt(u21);
            var max: u21 = std.math.minInt(u21);

            // Maps block => stage2 index
            var set_map = SetMap.init(alloc);
            defer set_map.deinit();

            // Our stages
            var stage1 = std.ArrayList(u16).init(alloc);
            defer stage1.deinit();
            var stage2 = std.ArrayList(Set).init(alloc);
            defer stage2.deinit();

            var set: Set = .initEmpty();

            // ensure that the 1st entry is always all false
            try stage2.append(set);
            try set_map.putNoClobber(set, 0);

            for (0..std.math.maxInt(u21) + 1) |cp_| {
                const cp: u21 = @intCast(cp_);
                const high = cp >> cp_shift;
                const low = cp & cp_mask;

                if (self.ctx.get(cp)) {
                    if (cp < min) min = cp;
                    if (cp > max) max = cp;
                    set.set(low);
                }

                // If we still have space and we're not done with codepoints,
                // we keep building up the block. Conversely: we finalize this
                // block if we've filled it or are out of codepoints.
                if (low + 1 < set_size and cp != std.math.maxInt(u21)) continue;

                // Look for the stage2 index for this block. If it doesn't exist
                // we add it to stage2 and update the mapping.
                const gop = try set_map.getOrPut(set);
                if (!gop.found_existing) {
                    gop.value_ptr.* = std.math.cast(
                        u16,
                        stage2.items.len,
                    ) orelse return error.Stage2TooLarge;
                    try stage2.append(set);
                }

                // Map stage1 => stage2 and reset our block
                try stage1.append(gop.value_ptr.*);
                set = .initEmpty();
                assert(stage1.items.len - 1 == high);
            }

            // Only the stage1 *values* (indices into stage2) have to fit in
            // a u16. The stage1 length itself is indexed by `cp >> cp_shift`
            // and may exceed maxInt(u16), e.g. 65536 entries when set_size
            // is 32, so it is deliberately not asserted here.
            assert(stage2.items.len <= std.math.maxInt(u16));

            const stage1_owned = try stage1.toOwnedSlice();
            errdefer alloc.free(stage1_owned);
            // Last fallible operation, so stage2 needs no errdefer.
            const stage2_owned = try stage2.toOwnedSlice();

            return .{
                .min = min,
                .max = max,
                .stage1 = stage1_owned,
                .stage2 = stage2_owned,
            };
        }
    };
}
|
||||||
|
|
||||||
|
/// A 2-level lookup table from codepoint to bool, with helpers to look up
/// the mapping for a given codepoint and to encode the table out to Zig.
pub const Tables = struct {
    const Self = @This();

    /// Lower bound: `get` returns false for any codepoint below this.
    min: u21,

    /// Upper bound: `get` returns false for any codepoint above this.
    max: u21,

    /// Indexed by `cp >> cp_shift`; each entry is an index into `stage2`.
    stage1: []const u16,

    /// Bit sets of `set_size` codepoints each, indexed by the low bits of
    /// the codepoint. `get` relies on entry 0 being the all-false set.
    stage2: []const Set,

    /// Given a codepoint, returns the mapping for that codepoint.
    pub fn get(self: *const Self, cp: u21) bool {
        if (cp < self.min) return false;
        if (cp > self.max) return false;
        const high = cp >> cp_shift;
        const stage2 = self.stage1[high];
        // take advantage of the fact that the first entry is always all false
        if (stage2 == 0) return false;
        const low = cp & cp_mask;
        return self.stage2[stage2].isSet(low);
    }

    /// Writes the lookup table as Zig to the given writer. The
    /// written file exports `min`, `max`, `stage1`, the `Set` type and
    /// `stage2`. These can be used to rebuild the lookup table in Zig.
    pub fn writeZig(self: *const Self, writer: anytype) !void {
        try writer.print(
            \\//! This file is auto-generated. Do not edit.
            \\const std = @import("std");
            \\
            \\pub const min: u21 = {};
            \\pub const max: u21 = {};
            \\
            \\pub const stage1: [{}]u16 = .{{
        , .{ self.min, self.max, self.stage1.len });
        for (self.stage1) |entry| try writer.print("{},", .{entry});

        try writer.print(
            \\
            \\}};
            \\
            \\pub const Set = std.bit_set.IntegerBitSet({d});
            \\pub const stage2: [{d}]Set = .{{
            \\
        , .{ set_size, self.stage2.len });
        // Each bit set is written as its raw integer mask.
        for (self.stage2) |entry| {
            try writer.print(" .{{ .mask = {d} }},\n", .{entry.mask});
        }
        try writer.writeAll("};\n");
    }
};
|
||||||
|
|
@ -9,5 +9,7 @@ pub const graphemeBreak = grapheme.graphemeBreak;
|
||||||
pub const GraphemeBreakState = grapheme.BreakState;
|
pub const GraphemeBreakState = grapheme.BreakState;
|
||||||
|
|
||||||
test {
|
test {
|
||||||
|
_ = @import("symbols1.zig");
|
||||||
|
_ = @import("symbols2.zig");
|
||||||
@import("std").testing.refAllDecls(@This());
|
@import("std").testing.refAllDecls(@This());
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -166,7 +166,7 @@ pub fn main() !void {
|
||||||
|
|
||||||
// This is not very fast in debug modes, so its commented by default.
|
// This is not very fast in debug modes, so its commented by default.
|
||||||
// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
|
// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
|
||||||
// test "tables match ziglyph" {
|
// test "unicode props: tables match ziglyph" {
|
||||||
// const testing = std.testing;
|
// const testing = std.testing;
|
||||||
//
|
//
|
||||||
// const min = 0xFF + 1; // start outside ascii
|
// const min = 0xFF + 1; // start outside ascii
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,93 @@
|
||||||
|
const props = @This();
|
||||||
|
const std = @import("std");
|
||||||
|
const assert = std.debug.assert;
|
||||||
|
const ziglyph = @import("ziglyph");
|
||||||
|
const lut = @import("lut.zig");
|
||||||
|
|
||||||
|
/// The symbols1 lookup table, assembled from the generated stage arrays.
pub const table: lut.Tables(bool) = table: {
    // This is only available after running main() below as part of the Ghostty
    // build.zig, but due to Zig's lazy analysis we can still reference it here.
    const generated = @import("symbols1_tables").Tables(bool);
    break :table .{
        .stage1 = &generated.stage1,
        .stage2 = &generated.stage2,
        .stage3 = &generated.stage3,
    };
};
|
||||||
|
|
||||||
|
/// Returns true if the codepoint is a "symbol-like" character. For now
/// that means anything in a private use area, plus anything in one of
/// these unicode blocks:
///   - Dingbats
///   - Emoticons
///   - Miscellaneous Symbols
///   - Enclosed Alphanumerics
///   - Enclosed Alphanumeric Supplement
///   - Miscellaneous Symbols and Pictographs
///   - Transport and Map Symbols
///
/// In the future it may be prudent to expand this to encompass more
/// symbol-like characters, and/or exclude some PUA sections.
pub fn isSymbol(cp: u21) bool {
    if (ziglyph.general_category.isPrivateUse(cp)) return true;

    const blocks = ziglyph.blocks;
    return blocks.isDingbats(cp) or
        blocks.isEmoticons(cp) or
        blocks.isMiscellaneousSymbols(cp) or
        blocks.isEnclosedAlphanumerics(cp) or
        blocks.isEnclosedAlphanumericSupplement(cp) or
        blocks.isMiscellaneousSymbolsAndPictographs(cp) or
        blocks.isTransportAndMapSymbols(cp);
}
|
||||||
|
|
||||||
|
/// Entry point of the generator binary: builds the symbols1 lookup tables
/// from `isSymbol` and writes them as Zig source to stdout.
pub fn main() !void {
    var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena_state.deinit();
    const alloc = arena_state.allocator();

    // Generator context: the mapping is `isSymbol`, and two mapped
    // values are equal when the bools are equal.
    const Context = struct {
        pub fn get(ctx: @This(), cp: u21) !bool {
            _ = ctx;
            return isSymbol(cp);
        }

        pub fn eql(ctx: @This(), a: bool, b: bool) bool {
            _ = ctx;
            return a == b;
        }
    };
    const gen: lut.Generator(bool, Context) = .{};

    const t = try gen.generate(alloc);
    defer alloc.free(t.stage1);
    defer alloc.free(t.stage2);
    defer alloc.free(t.stage3);

    const stdout = std.io.getStdOut().writer();
    try t.writeZig(stdout);

    // Uncomment when manually debugging to see our table sizes.
    // std.log.warn("stage1={} stage2={} stage3={}", .{
    //     t.stage1.len,
    //     t.stage2.len,
    //     t.stage3.len,
    // });
}
|
||||||
|
|
||||||
|
// Compares the generated table against the ziglyph-based `isSymbol` for
// each codepoint in the range below. This is not very fast in debug modes.
// NOTE(review): the range stops before maxInt(u21) itself, so the very
// last u21 value is not compared.
test "unicode symbols1: tables match ziglyph" {
    const testing = std.testing;

    for (0..std.math.maxInt(u21)) |i| {
        const cp: u21 = @intCast(i);
        const t = table.get(cp);
        const zg = isSymbol(cp);
        if (t == zg) continue;

        std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });
        try testing.expect(false);
    }
}
|
||||||
|
|
@ -0,0 +1,85 @@
|
||||||
|
const props = @This();
|
||||||
|
const std = @import("std");
|
||||||
|
const assert = std.debug.assert;
|
||||||
|
const ziglyph = @import("ziglyph");
|
||||||
|
const lut2 = @import("lut2.zig");
|
||||||
|
|
||||||
|
/// The symbols2 lookup table, assembled from the generated constants.
pub const table: lut2.Tables = table: {
    // This is only available after running main() below as part of the Ghostty
    // build.zig, but due to Zig's lazy analysis we can still reference it here.
    const generated = @import("symbols2_tables");
    break :table .{
        .min = generated.min,
        .max = generated.max,
        .stage1 = &generated.stage1,
        .stage2 = &generated.stage2,
    };
};
|
||||||
|
|
||||||
|
/// Returns true if the codepoint is a "symbol-like" character. For now
/// that means anything in a private use area, plus anything in one of
/// these unicode blocks:
///   - Dingbats
///   - Emoticons
///   - Miscellaneous Symbols
///   - Enclosed Alphanumerics
///   - Enclosed Alphanumeric Supplement
///   - Miscellaneous Symbols and Pictographs
///   - Transport and Map Symbols
///
/// In the future it may be prudent to expand this to encompass more
/// symbol-like characters, and/or exclude some PUA sections.
pub fn isSymbol(cp: u21) bool {
    if (ziglyph.general_category.isPrivateUse(cp)) return true;

    const blocks = ziglyph.blocks;
    return blocks.isDingbats(cp) or
        blocks.isEmoticons(cp) or
        blocks.isMiscellaneousSymbols(cp) or
        blocks.isEnclosedAlphanumerics(cp) or
        blocks.isEnclosedAlphanumericSupplement(cp) or
        blocks.isMiscellaneousSymbolsAndPictographs(cp) or
        blocks.isTransportAndMapSymbols(cp);
}
|
||||||
|
|
||||||
|
/// Entry point of the generator binary: builds the symbols2 lookup tables
/// from `isSymbol` and writes them as Zig source to stdout.
pub fn main() !void {
    var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena_state.deinit();
    const alloc = arena_state.allocator();

    // Generator context: the mapping for a codepoint is `isSymbol`.
    const Context = struct {
        pub fn get(ctx: @This(), cp: u21) bool {
            _ = ctx;
            return isSymbol(cp);
        }
    };
    const gen: lut2.Generator(Context) = .{};

    const t = try gen.generate(alloc);
    defer alloc.free(t.stage1);
    defer alloc.free(t.stage2);

    const stdout = std.io.getStdOut().writer();
    try t.writeZig(stdout);

    // Uncomment when manually debugging to see our table sizes.
    // std.log.warn("stage1={} stage2={}", .{
    //     t.stage1.len,
    //     t.stage2.len,
    // });
}
|
||||||
|
|
||||||
|
// Compares the generated table against the ziglyph-based `isSymbol` for
// every u21 value, including maxInt(u21) itself. This is not very fast
// in debug modes because it walks the entire codepoint space.
test "unicode symbols2: tables match ziglyph" {
    const testing = std.testing;

    // The upper bound is exclusive, hence the +1 to cover the last value.
    for (0..std.math.maxInt(u21) + 1) |cp| {
        const t1 = table.get(@intCast(cp));
        const zg = isSymbol(@intCast(cp));

        if (t1 != zg) {
            std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t1, zg });
            try testing.expect(false);
        }
    }
}
|
||||||
Loading…
Reference in New Issue