diff --git a/src/benchmark/IsSymbol.zig b/src/benchmark/IsSymbol.zig index 46ebb8c66..940207619 100644 --- a/src/benchmark/IsSymbol.zig +++ b/src/benchmark/IsSymbol.zig @@ -10,8 +10,7 @@ const Allocator = std.mem.Allocator; const Benchmark = @import("Benchmark.zig"); const options = @import("options.zig"); const UTF8Decoder = @import("../terminal/UTF8Decoder.zig"); -const symbols1 = @import("../unicode/symbols1.zig"); -const symbols2 = @import("../unicode/symbols2.zig"); +const symbols = @import("../unicode/symbols.zig"); const log = std.log.scoped(.@"is-symbol-bench"); @@ -37,8 +36,7 @@ pub const Mode = enum { ziglyph, /// Ghostty's table-based approach. - table1, - table2, + table, }; /// Create a new terminal stream handler for the given arguments. @@ -60,8 +58,7 @@ pub fn benchmark(self: *IsSymbol) Benchmark { return .init(self, .{ .stepFn = switch (self.opts.mode) { .ziglyph => stepZiglyph, - .table1 => stepTable1, - .table2 => stepTable1, + .table => stepTable, }, .setupFn = setup, .teardownFn = teardown, @@ -106,13 +103,13 @@ fn stepZiglyph(ptr: *anyopaque) Benchmark.Error!void { const cp_, const consumed = d.next(c); assert(consumed); if (cp_) |cp| { - std.mem.doNotOptimizeAway(symbols1.isSymbol(cp)); + std.mem.doNotOptimizeAway(symbols.isSymbol(cp)); } } } } -fn stepTable1(ptr: *anyopaque) Benchmark.Error!void { +fn stepTable(ptr: *anyopaque) Benchmark.Error!void { const self: *IsSymbol = @ptrCast(@alignCast(ptr)); const f = self.data_f orelse return; @@ -130,31 +127,7 @@ fn stepTable1(ptr: *anyopaque) Benchmark.Error!void { const cp_, const consumed = d.next(c); assert(consumed); if (cp_) |cp| { - std.mem.doNotOptimizeAway(symbols1.table.get(cp)); - } - } - } -} - -fn stepTable2(ptr: *anyopaque) Benchmark.Error!void { - const self: *IsSymbol = @ptrCast(@alignCast(ptr)); - - const f = self.data_f orelse return; - var r = std.io.bufferedReader(f.reader()); - var d: UTF8Decoder = .{}; - var buf: [4096]u8 = undefined; - while (true) { - const n = r.read(&buf) catch |err| { - log.warn("error reading data file err={}", .{err}); - return error.BenchmarkFailed; - }; - if (n == 0) break; // EOF reached - - for (buf[0..n]) |c| { - const cp_, const consumed = d.next(c); - assert(consumed); - if (cp_) |cp| { - std.mem.doNotOptimizeAway(symbols2.table.get(cp)); + std.mem.doNotOptimizeAway(symbols.table.get(cp)); } } } diff --git a/src/build/UnicodeTables.zig b/src/build/UnicodeTables.zig index dd9a6bdf2..6bb656a29 100644 --- a/src/build/UnicodeTables.zig +++ b/src/build/UnicodeTables.zig @@ -5,13 +5,11 @@ const Config = @import("Config.zig"); /// The exe. props_exe: *std.Build.Step.Compile, -symbols1_exe: *std.Build.Step.Compile, -symbols2_exe: *std.Build.Step.Compile, +symbols_exe: *std.Build.Step.Compile, /// The output path for the unicode tables props_output: std.Build.LazyPath, -symbols1_output: std.Build.LazyPath, -symbols2_output: std.Build.LazyPath, +symbols_output: std.Build.LazyPath, pub fn init(b: *std.Build) !UnicodeTables { const props_exe = b.addExecutable(.{ @@ -25,21 +23,10 @@ pub fn init(b: *std.Build) !UnicodeTables { }), }); - const symbols1_exe = b.addExecutable(.{ - .name = "symbols1-unigen", + const symbols_exe = b.addExecutable(.{ + .name = "symbols-unigen", .root_module = b.createModule(.{ - .root_source_file = b.path("src/unicode/symbols1.zig"), - .target = b.graph.host, - .strip = false, - .omit_frame_pointer = false, - .unwind_tables = .sync, - }), - }); - - const symbols2_exe = b.addExecutable(.{ - .name = "symbols2-unigen", - .root_module = b.createModule(.{ - .root_source_file = b.path("src/unicode/symbols2.zig"), + .root_source_file = b.path("src/unicode/symbols.zig"), .target = b.graph.host, .strip = false, .omit_frame_pointer = false, @@ -50,7 +37,7 @@ pub fn init(b: *std.Build) !UnicodeTables { if (b.lazyDependency("ziglyph", .{ .target = b.graph.host, })) |ziglyph_dep| { - inline for (&.{ props_exe, symbols1_exe, symbols2_exe }) |exe| { + inline for (&.{ props_exe, symbols_exe }) |exe| { exe.root_module.addImport( "ziglyph", ziglyph_dep.module("ziglyph"), @@ -59,16 +46,13 @@ pub fn init(b: *std.Build) !UnicodeTables { } const props_run = b.addRunArtifact(props_exe); - const symbols1_run = b.addRunArtifact(symbols1_exe); - const symbols2_run = b.addRunArtifact(symbols2_exe); + const symbols_run = b.addRunArtifact(symbols_exe); return .{ .props_exe = props_exe, - .symbols1_exe = symbols1_exe, - .symbols2_exe = symbols2_exe, + .symbols_exe = symbols_exe, .props_output = props_run.captureStdOut(), - .symbols1_output = symbols1_run.captureStdOut(), - .symbols2_output = symbols2_run.captureStdOut(), + .symbols_output = symbols_run.captureStdOut(), }; } @@ -78,19 +62,14 @@ pub fn addImport(self: *const UnicodeTables, step: *std.Build.Step.Compile) void step.root_module.addAnonymousImport("unicode_tables", .{ .root_source_file = self.props_output, }); - self.symbols1_output.addStepDependencies(&step.step); - step.root_module.addAnonymousImport("symbols1_tables", .{ - .root_source_file = self.symbols1_output, - }); - self.symbols2_output.addStepDependencies(&step.step); - step.root_module.addAnonymousImport("symbols2_tables", .{ - .root_source_file = self.symbols2_output, + self.symbols_output.addStepDependencies(&step.step); + step.root_module.addAnonymousImport("symbols_tables", .{ + .root_source_file = self.symbols_output, }); } /// Install the exe pub fn install(self: *const UnicodeTables, b: *std.Build) void { b.installArtifact(self.props_exe); - b.installArtifact(self.symbols1_exe); - b.installArtifact(self.symbols2_exe); + b.installArtifact(self.symbols_exe); } diff --git a/src/renderer/cell.zig b/src/renderer/cell.zig index a75fddf52..6ada849ed 100644 --- a/src/renderer/cell.zig +++ b/src/renderer/cell.zig @@ -6,7 +6,7 @@ const terminal = @import("../terminal/main.zig"); const renderer = @import("../renderer.zig"); const shaderpkg = renderer.Renderer.API.shaders; const ArrayListCollection = @import("../datastruct/array_list_collection.zig").ArrayListCollection; -const symbols = @import("../unicode/symbols1.zig").table; +const symbols = @import("../unicode/symbols.zig").table; /// The possible cell content keys that exist. pub const Key = enum { diff --git a/src/unicode/lut2.zig b/src/unicode/lut2.zig deleted file mode 100644 index ef5c886a2..000000000 --- a/src/unicode/lut2.zig +++ /dev/null @@ -1,183 +0,0 @@ -const std = @import("std"); -const assert = std.debug.assert; -const Allocator = std.mem.Allocator; - -// This whole file is based on the algorithm described here: -// https://here-be-braces.com/fast-lookup-of-unicode-properties/ - -const set_size = @typeInfo(usize).int.bits; -// const Set = std.bit_set.ArrayBitSet(usize, set_size); -const Set = std.bit_set.IntegerBitSet(set_size); -const cp_shift = std.math.log2_int(u21, set_size); -const cp_mask = set_size - 1; - -/// Creates a type that is able to generate a 2-level lookup table -/// from a Unicode codepoint to a mapping of type bool. The lookup table -/// generally is expected to be codegen'd and then reloaded, although it -/// can in theory be generated at runtime. -/// -/// Context must have one function: -/// - `get(Context, u21) bool`: returns the mapping for a given codepoint -/// -pub fn Generator( - comptime Context: type, -) type { - return struct { - const Self = @This(); - - /// Mapping of a block to its index in the stage2 array. - const SetMap = std.HashMap( - Set, - u16, - struct { - pub fn hash(ctx: @This(), k: Set) u64 { - _ = ctx; - var hasher = std.hash.Wyhash.init(0); - std.hash.autoHashStrat(&hasher, k, .DeepRecursive); - return hasher.final(); - } - - pub fn eql(ctx: @This(), a: Set, b: Set) bool { - _ = ctx; - return a.eql(b); - } - }, - std.hash_map.default_max_load_percentage, - ); - - ctx: Context = undefined, - - /// Generate the lookup tables. The arrays in the return value - /// are owned by the caller and must be freed. - pub fn generate(self: *const Self, alloc: Allocator) !Tables { - var min: u21 = std.math.maxInt(u21); - var max: u21 = std.math.minInt(u21); - - // Maps block => stage2 index - var set_map = SetMap.init(alloc); - defer set_map.deinit(); - - // Our stages - var stage1 = std.ArrayList(u16).init(alloc); - defer stage1.deinit(); - var stage2 = std.ArrayList(Set).init(alloc); - defer stage2.deinit(); - - var set: Set = .initEmpty(); - - // ensure that the 1st entry is always all false - try stage2.append(set); - try set_map.putNoClobber(set, 0); - - for (0..std.math.maxInt(u21) + 1) |cp_| { - const cp: u21 = @intCast(cp_); - const high = cp >> cp_shift; - const low = cp & cp_mask; - - if (self.ctx.get(cp)) { - if (cp < min) min = cp; - if (cp > max) max = cp; - set.set(low); - } - - // If we still have space and we're not done with codepoints, - // we keep building up the block. Conversely: we finalize this - // block if we've filled it or are out of codepoints. - if (low + 1 < set_size and cp != std.math.maxInt(u21)) continue; - - // Look for the stage2 index for this block. If it doesn't exist - // we add it to stage2 and update the mapping. - const gop = try set_map.getOrPut(set); - if (!gop.found_existing) { - gop.value_ptr.* = std.math.cast( - u16, - stage2.items.len, - ) orelse return error.Stage2TooLarge; - try stage2.append(set); - } - - // Map stage1 => stage2 and reset our block - try stage1.append(gop.value_ptr.*); - set = .initEmpty(); - assert(stage1.items.len - 1 == high); - } - - // All of our lengths must fit in a u16 for this to work - assert(stage1.items.len <= std.math.maxInt(u16)); - assert(stage2.items.len <= std.math.maxInt(u16)); - - const stage1_owned = try stage1.toOwnedSlice(); - errdefer alloc.free(stage1_owned); - const stage2_owned = try stage2.toOwnedSlice(); - errdefer alloc.free(stage2_owned); - - return .{ - .min = min, - .max = max, - .stage1 = stage1_owned, - .stage2 = stage2_owned, - }; - } - }; -} - -/// Creates a type that given a 3-level lookup table, can be used to -/// look up a mapping for a given codepoint, encode it out to Zig, etc. -pub const Tables = struct { - const Self = @This(); - - min: u21, - max: u21, - stage1: []const u16, - stage2: []const Set, - - /// Given a codepoint, returns the mapping for that codepoint. - pub fn get(self: *const Self, cp: u21) bool { - if (cp < self.min) return false; - if (cp > self.max) return false; - const high = cp >> cp_shift; - const stage2 = self.stage1[high]; - // take advantage of the fact that the first entry is always all false - if (stage2 == 0) return false; - const low = cp & cp_mask; - return self.stage2[stage2].isSet(low); - } - - /// Writes the lookup table as Zig to the given writer. The - /// written file exports three constants: stage1, stage2, and - /// stage3. These can be used to rebuild the lookup table in Zig. - pub fn writeZig(self: *const Self, writer: anytype) !void { - try writer.print( - \\//! This file is auto-generated. Do not edit. - \\const std = @import("std"); - \\ - \\pub const min: u21 = {}; - \\pub const max: u21 = {}; - \\ - \\pub const stage1: [{}]u16 = .{{ - , .{ self.min, self.max, self.stage1.len }); - for (self.stage1) |entry| try writer.print("{},", .{entry}); - - try writer.print( - \\ - \\}}; - \\ - \\pub const Set = std.bit_set.IntegerBitSet({d}); - \\pub const stage2: [{d}]Set = .{{ - \\ - , .{ set_size, self.stage2.len }); - // for (self.stage2) |entry| { - // try writer.print(" .{{\n", .{}); - // try writer.print(" .masks = [{d}]{s}{{\n", .{ entry.masks.len, @typeName(Set.MaskInt) }); - // for (entry.masks) |mask| { - // try writer.print(" {d},\n", .{mask}); - // } - // try writer.print(" }},\n", .{}); - // try writer.print(" }},\n", .{}); - // } - for (self.stage2) |entry| { - try writer.print(" .{{ .mask = {d} }},\n", .{entry.mask}); - } - try writer.writeAll("};\n"); - } -}; diff --git a/src/unicode/main.zig b/src/unicode/main.zig index 91dfd482c..17c86deca 100644 --- a/src/unicode/main.zig +++ b/src/unicode/main.zig @@ -9,7 +9,6 @@ pub const graphemeBreak = grapheme.graphemeBreak; pub const GraphemeBreakState = grapheme.BreakState; test { - _ = @import("symbols1.zig"); - _ = @import("symbols2.zig"); + _ = @import("symbols.zig"); @import("std").testing.refAllDecls(@This()); } diff --git a/src/unicode/symbols1.zig b/src/unicode/symbols.zig similarity index 97% rename from src/unicode/symbols1.zig rename to src/unicode/symbols.zig index e5b8cc22a..3e038fe7d 100644 --- a/src/unicode/symbols1.zig +++ b/src/unicode/symbols.zig @@ -8,7 +8,7 @@ const lut = @import("lut.zig"); pub const table = table: { // This is only available after running main() below as part of the Ghostty // build.zig, but due to Zig's lazy analysis we can still reference it here. - const generated = @import("symbols1_tables").Tables(bool); + const generated = @import("symbols_tables").Tables(bool); const Tables = lut.Tables(bool); break :table Tables{ .stage1 = &generated.stage1, diff --git a/src/unicode/symbols2.zig b/src/unicode/symbols2.zig deleted file mode 100644 index 1d23c51be..000000000 --- a/src/unicode/symbols2.zig +++ /dev/null @@ -1,85 +0,0 @@ -const props = @This(); -const std = @import("std"); -const assert = std.debug.assert; -const ziglyph = @import("ziglyph"); -const lut2 = @import("lut2.zig"); - -/// The lookup tables for Ghostty. -pub const table = table: { - // This is only available after running main() below as part of the Ghostty - // build.zig, but due to Zig's lazy analysis we can still reference it here. - const generated = @import("symbols2_tables"); - break :table lut2.Tables{ - .min = generated.min, - .max = generated.max, - .stage1 = &generated.stage1, - .stage2 = &generated.stage2, - }; -}; - -/// Returns true of the codepoint is a "symbol-like" character, which -/// for now we define as anything in a private use area and anything -/// in several unicode blocks: -/// - Dingbats -/// - Emoticons -/// - Miscellaneous Symbols -/// - Enclosed Alphanumerics -/// - Enclosed Alphanumeric Supplement -/// - Miscellaneous Symbols and Pictographs -/// - Transport and Map Symbols -/// -/// In the future it may be prudent to expand this to encompass more -/// symbol-like characters, and/or exclude some PUA sections. -pub fn isSymbol(cp: u21) bool { - return ziglyph.general_category.isPrivateUse(cp) or - ziglyph.blocks.isDingbats(cp) or - ziglyph.blocks.isEmoticons(cp) or - ziglyph.blocks.isMiscellaneousSymbols(cp) or - ziglyph.blocks.isEnclosedAlphanumerics(cp) or - ziglyph.blocks.isEnclosedAlphanumericSupplement(cp) or - ziglyph.blocks.isMiscellaneousSymbolsAndPictographs(cp) or - ziglyph.blocks.isTransportAndMapSymbols(cp); -} - -/// Runnable binary to generate the lookup tables and output to stdout. -pub fn main() !void { - var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator); - defer arena_state.deinit(); - const alloc = arena_state.allocator(); - - const gen: lut2.Generator( - struct { - pub fn get(ctx: @This(), cp: u21) bool { - _ = ctx; - return isSymbol(cp); - } - }, - ) = .{}; - - const t = try gen.generate(alloc); - defer alloc.free(t.stage1); - defer alloc.free(t.stage2); - try t.writeZig(std.io.getStdOut().writer()); - - // Uncomment when manually debugging to see our table sizes. - // std.log.warn("stage1={} stage2={}", .{ - // t.stage1.len, - // t.stage2.len, - // }); -} - -// This is not very fast in debug modes, so its commented by default. -// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CHANGES. -test "unicode symbols2: tables match ziglyph" { - const testing = std.testing; - - for (0..std.math.maxInt(u21)) |cp| { - const t1 = table.get(@intCast(cp)); - const zg = isSymbol(@intCast(cp)); - - if (t1 != zg) { - std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t1, zg }); - try testing.expect(false); - } - } -}