deps: Replace ziglyph with uucode (#8757)

This replaces `ziglyph` with
[uucode](https://github.com/jacobsandlund/uucode), maintaining the
Ghostty unicode props and symbols tables, but now built with `uucode`
and Unicode 16.

See the above `uucode` repository for a description of the library, but
the short summary is that it aims to be a fast and flexible Unicode
library in Zig, that is very configurable during build time.

This PR adds tests to make sure that the symbol and props tables match
the previous built-by-ziglyph implementation (minus differences in
Unicode 16).

## Benchmarks

Also added are benchmarks comparing the Ghostty tables with `uucode`,
with the following results (MacBook Pro 2021 M1 16 GB).

Summary:

| Benchmark | Dataset | Table Time (ms) | uucode Time (ms) | Fastest |
Fastest Speedup |

|-----------|---------|-----------------|------------------|---------|----------------|
| codepoint-width | data.txt | 929.3 (± 2.3) | 924.1 (± 0.9) | uucode |
1.01x |
| | Arabic | 2,652.0 (± 21.0) | 2,589.0 (± 8.0) | uucode | 1.02x |
| | Japanese | 2,302.0 (± 1.0) | 2,250.0 (± 2.0) | uucode | 1.02x |
| | English | 1,770.0 (± 5.0) | 1,770.0 (± 5.0) | tie | 1.00x |
| grapheme-break | data.txt | 974.5 (± 1.2) | 977.4 (± 0.7) | table |
1.00x |
| | Arabic | 4,135.0 (± 22.0) | 4,037.0 (± 9.0) | uucode | 1.02x |
| | Japanese | 3,537.0 (± 3.0) | 3,459.0 (± 1.0) | uucode | 1.02x |
| | English | 4,799.0 (± 4.0) | 5,119.0 (± 2.0) | table | 1.07x |
| is-symbol | data.txt | 877.9 (± 2.0) | 883.5 (± 1.5) | table | 1.01x |
| | Arabic | 2,735.0 (± 8.0) | 2,611.0 (± 1.0) | uucode | 1.05x |
| | Japanese | 2,391.0 (± 2.0) | 2,276.0 (± 4.0) | uucode | 1.05x |
| | English | 2,450.0 (± 5.0) | 2,212.0 (± 2.0) | uucode | 1.11x |

Sources:

```
# data.txt
zig-out/bin/ghostty-gen +utf8 | head -c 200000000 > data.txt

# Arabic
curl -L "https://www.dropbox.com/scl/fi/86gjpfzopssavk2nzo69u/arwiki-20180920-corpus.xml.bz2?dl=1&e=1&file_subpath=%2Fdata&rlkey=dmjlaw1xegg8vsje4xrn040v8" | bzcat | head -c 1000000000 > arwiki-20180920-corpus.xml

# Japanese
curl -L "https://www.dropbox.com/scl/fi/vru4zxv5qff1klod9xiht/jawiki-20181001-corpus.xml.bz2?rlkey=utuuooiwyupws3x5517u8n8jl&e=1&dl=1" | bzcat | head -c 1000000000 > jawiki-20181001-corpus.xml

# English
curl -L "https://www.dropbox.com/scl/fi/la1nvupgk2honb3n6m9zc/enwiki-20181001-corpus.xml.bz2?rlkey=8vg4vokbaijh1lg5lw3ytc864&e=1&dl=1" | bzcat | head -c 1000000000 > enwiki-20181001-corpus.xml
```

### codepoint-width

data.txt

```
hyperfine --warmup 4 'zig-out/bin/ghostty-bench +codepoint-width --data=data.txt --mode=table' 'zig-out/bin/ghostty-bench +codepoint-width --data=data.txt --mode=uucode'
Benchmark 1: zig-out/bin/ghostty-bench +codepoint-width --data=data.txt --mode=table
  Time (mean ± σ):     929.3 ms ±   2.3 ms    [User: 884.8 ms, System: 42.1 ms]
  Range (min … max):   925.2 ms … 933.2 ms    10 runs

Benchmark 2: zig-out/bin/ghostty-bench +codepoint-width --data=data.txt --mode=uucode
  Time (mean ± σ):     924.1 ms ±   0.9 ms    [User: 879.0 ms, System: 43.3 ms]
  Range (min … max):   922.8 ms … 925.4 ms    10 runs

Summary
  zig-out/bin/ghostty-bench +codepoint-width --data=data.txt --mode=uucode ran
    1.01 ± 0.00 times faster than zig-out/bin/ghostty-bench +codepoint-width --data=data.txt --mode=table
```

Arabic

```
hyperfine --warmup 1 --runs=5 zig-out/bin/ghostty-bench +codepoint-width --data=arwiki-20180920-corpus.xml --mode=table zig-out/bin/ghostty-bench +codepoint-width --data=arwiki-20180920-corpus.xml --mode=uucode
Benchmark 1: zig-out/bin/ghostty-bench +codepoint-width --data=arwiki-20180920-corpus.xml --mode=table
  Time (mean ± σ):      2.652 s ±  0.021 s    [User: 2.458 s, System: 0.180 s]
  Range (min … max):    2.622 s …  2.677 s    5 runs

Benchmark 2: zig-out/bin/ghostty-bench +codepoint-width --data=arwiki-20180920-corpus.xml --mode=uucode
  Time (mean ± σ):      2.589 s ±  0.008 s    [User: 2.405 s, System: 0.181 s]
  Range (min … max):    2.583 s …  2.598 s    5 runs

Summary
  zig-out/bin/ghostty-bench +codepoint-width --data=arwiki-20180920-corpus.xml --mode=uucode ran
    1.02 ± 0.01 times faster than zig-out/bin/ghostty-bench +codepoint-width --data=arwiki-20180920-corpus.xml --mode=table
```

Japanese

```
hyperfine --warmup 1 --runs=5 zig-out/bin/ghostty-bench +codepoint-width --data=jawiki-20181001-corpus.xml --mode=table zig-out/bin/ghostty-bench +codepoint-width --data=jawiki-20181001-corpus.xml --mode=uucode
Benchmark 1: zig-out/bin/ghostty-bench +codepoint-width --data=jawiki-20181001-corpus.xml --mode=table
  Time (mean ± σ):      2.302 s ±  0.001 s    [User: 2.126 s, System: 0.173 s]
  Range (min … max):    2.301 s …  2.303 s    5 runs

Benchmark 2: zig-out/bin/ghostty-bench +codepoint-width --data=jawiki-20181001-corpus.xml --mode=uucode
  Time (mean ± σ):      2.250 s ±  0.002 s    [User: 2.074 s, System: 0.174 s]
  Range (min … max):    2.246 s …  2.251 s    5 runs

Summary
  zig-out/bin/ghostty-bench +codepoint-width --data=jawiki-20181001-corpus.xml --mode=uucode ran
    1.02 ± 0.00 times faster than zig-out/bin/ghostty-bench +codepoint-width --data=jawiki-20181001-corpus.xml --mode=table
```

English

```
hyperfine --warmup 1 --runs=5 zig-out/bin/ghostty-bench +codepoint-width --data=enwiki-20181001-corpus.xml --mode=table zig-out/bin/ghostty-bench +codepoint-width --data=enwiki-20181001-corpus.xml --mode=uucode
Benchmark 1: zig-out/bin/ghostty-bench +codepoint-width --data=enwiki-20181001-corpus.xml --mode=table
  Time (mean ± σ):      1.770 s ±  0.005 s    [User: 1.600 s, System: 0.168 s]
  Range (min … max):    1.766 s …  1.776 s    5 runs

Benchmark 2: zig-out/bin/ghostty-bench +codepoint-width --data=enwiki-20181001-corpus.xml --mode=uucode
  Time (mean ± σ):      1.770 s ±  0.005 s    [User: 1.598 s, System: 0.169 s]
  Range (min … max):    1.765 s …  1.775 s    5 runs

Summary
  zig-out/bin/ghostty-bench +codepoint-width --data=enwiki-20181001-corpus.xml --mode=uucode ran
    1.00 ± 0.00 times faster than zig-out/bin/ghostty-bench +codepoint-width --data=enwiki-20181001-corpus.xml --mode=table
```

### grapheme-break

data.txt

```
hyperfine --warmup 4 --runs=10 zig-out/bin/ghostty-bench +grapheme-break --data=data.txt --mode=table zig-out/bin/ghostty-bench +grapheme-break --data=data.txt --mode=uucode
Benchmark 1: zig-out/bin/ghostty-bench +grapheme-break --data=data.txt --mode=table
  Time (mean ± σ):     974.5 ms ±   1.2 ms    [User: 929.2 ms, System: 43.3 ms]
  Range (min … max):   972.7 ms … 976.8 ms    10 runs

Benchmark 2: zig-out/bin/ghostty-bench +grapheme-break --data=data.txt --mode=uucode
  Time (mean ± σ):     977.4 ms ±   0.7 ms    [User: 931.5 ms, System: 44.0 ms]
  Range (min … max):   976.4 ms … 978.5 ms    10 runs

Summary
  zig-out/bin/ghostty-bench +grapheme-break --data=data.txt --mode=table ran
    1.00 ± 0.00 times faster than zig-out/bin/ghostty-bench +grapheme-break --data=data.txt --mode=uucode
```

Arabic

```
hyperfine --warmup 1 --runs=5 zig-out/bin/ghostty-bench +grapheme-break --data=arwiki-20180920-corpus.xml --mode=table zig-out/bin/ghostty-bench +grapheme-break --data=arwiki-20180920-corpus.xml --mode=uucode
Benchmark 1: zig-out/bin/ghostty-bench +grapheme-break --data=arwiki-20180920-corpus.xml --mode=table
  Time (mean ± σ):      4.135 s ±  0.022 s    [User: 3.942 s, System: 0.186 s]
  Range (min … max):    4.111 s …  4.159 s    5 runs

Benchmark 2: zig-out/bin/ghostty-bench +grapheme-break --data=arwiki-20180920-corpus.xml --mode=uucode
  Time (mean ± σ):      4.037 s ±  0.009 s    [User: 3.867 s, System: 0.163 s]
  Range (min … max):    4.026 s …  4.048 s    5 runs

Summary
  zig-out/bin/ghostty-bench +grapheme-break --data=arwiki-20180920-corpus.xml --mode=uucode ran
    1.02 ± 0.01 times faster than zig-out/bin/ghostty-bench +grapheme-break --data=arwiki-20180920-corpus.xml --mode=table
```

Japanese

```
hyperfine --warmup 1 --runs=5 zig-out/bin/ghostty-bench +grapheme-break --data=jawiki-20181001-corpus.xml --mode=table zig-out/bin/ghostty-bench +grapheme-break --data=jawiki-20181001-corpus.xml --mode=uucode
Benchmark 1: zig-out/bin/ghostty-bench +grapheme-break --data=jawiki-20181001-corpus.xml --mode=table
  Time (mean ± σ):      3.537 s ±  0.003 s    [User: 3.337 s, System: 0.195 s]
  Range (min … max):    3.533 s …  3.541 s    5 runs

Benchmark 2: zig-out/bin/ghostty-bench +grapheme-break --data=jawiki-20181001-corpus.xml --mode=uucode
  Time (mean ± σ):      3.459 s ±  0.001 s    [User: 3.265 s, System: 0.191 s]
  Range (min … max):    3.458 s …  3.460 s    5 runs

Summary
  zig-out/bin/ghostty-bench +grapheme-break --data=jawiki-20181001-corpus.xml --mode=uucode ran
    1.02 ± 0.00 times faster than zig-out/bin/ghostty-bench +grapheme-break --data=jawiki-20181001-corpus.xml --mode=table
```

English

```
hyperfine --warmup 1 --runs=5 zig-out/bin/ghostty-bench +grapheme-break --data=enwiki-20181001-corpus.xml --mode=table zig-out/bin/ghostty-bench +grapheme-break --data=enwiki-20181001-corpus.xml --mode=uucode
Benchmark 1: zig-out/bin/ghostty-bench +grapheme-break --data=enwiki-20181001-corpus.xml --mode=table
  Time (mean ± σ):      4.799 s ±  0.004 s    [User: 4.587 s, System: 0.207 s]
  Range (min … max):    4.795 s …  4.807 s    5 runs

Benchmark 2: zig-out/bin/ghostty-bench +grapheme-break --data=enwiki-20181001-corpus.xml --mode=uucode
  Time (mean ± σ):      5.119 s ±  0.002 s    [User: 4.907 s, System: 0.208 s]
  Range (min … max):    5.116 s …  5.121 s    5 runs

Summary
  zig-out/bin/ghostty-bench +grapheme-break --data=enwiki-20181001-corpus.xml --mode=table ran
    1.07 ± 0.00 times faster than zig-out/bin/ghostty-bench +grapheme-break --data=enwiki-20181001-corpus.xml --mode=uucode
```

### is-symbol

data.txt

```
hyperfine --warmup 4 --runs=10 zig-out/bin/ghostty-bench +is-symbol --data=data.txt --mode=table zig-out/bin/ghostty-bench +is-symbol --data=data.txt --mode=uucode
Benchmark 1: zig-out/bin/ghostty-bench +is-symbol --data=data.txt --mode=table
  Time (mean ± σ):     877.9 ms ±   2.0 ms    [User: 832.2 ms, System: 43.8 ms]
  Range (min … max):   873.8 ms … 881.2 ms    10 runs

Benchmark 2: zig-out/bin/ghostty-bench +is-symbol --data=data.txt --mode=uucode
  Time (mean ± σ):     883.5 ms ±   1.5 ms    [User: 837.8 ms, System: 43.8 ms]
  Range (min … max):   881.9 ms … 886.8 ms    10 runs

Summary
  zig-out/bin/ghostty-bench +is-symbol --data=data.txt --mode=table ran
    1.01 ± 0.00 times faster than zig-out/bin/ghostty-bench +is-symbol --data=data.txt --mode=uucode
```

Arabic

```
hyperfine --warmup 1 --runs=5 zig-out/bin/ghostty-bench +is-symbol --data=arwiki-20180920-corpus.xml --mode=table zig-out/bin/ghostty-bench +is-symbol --data=arwiki-20180920-corpus.xml --mode=uucode
Benchmark 1: zig-out/bin/ghostty-bench +is-symbol --data=arwiki-20180920-corpus.xml --mode=table
  Time (mean ± σ):      2.735 s ±  0.008 s    [User: 2.548 s, System: 0.183 s]
  Range (min … max):    2.724 s …  2.743 s    5 runs

Benchmark 2: zig-out/bin/ghostty-bench +is-symbol --data=arwiki-20180920-corpus.xml --mode=uucode
  Time (mean ± σ):      2.611 s ±  0.001 s    [User: 2.427 s, System: 0.181 s]
  Range (min … max):    2.610 s …  2.612 s    5 runs

Summary
  zig-out/bin/ghostty-bench +is-symbol --data=arwiki-20180920-corpus.xml --mode=uucode ran
    1.05 ± 0.00 times faster than zig-out/bin/ghostty-bench +is-symbol --data=arwiki-20180920-corpus.xml --mode=table
```

Japanese

```
hyperfine --warmup 1 --runs=5 zig-out/bin/ghostty-bench +is-symbol --data=jawiki-20181001-corpus.xml --mode=table zig-out/bin/ghostty-bench +is-symbol --data=jawiki-20181001-corpus.xml --mode=uucode
Benchmark 1: zig-out/bin/ghostty-bench +is-symbol --data=jawiki-20181001-corpus.xml --mode=table
  Time (mean ± σ):      2.391 s ±  0.002 s    [User: 2.210 s, System: 0.178 s]
  Range (min … max):    2.387 s …  2.393 s    5 runs

Benchmark 2: zig-out/bin/ghostty-bench +is-symbol --data=jawiki-20181001-corpus.xml --mode=uucode
  Time (mean ± σ):      2.276 s ±  0.004 s    [User: 2.100 s, System: 0.173 s]
  Range (min … max):    2.274 s …  2.283 s    5 runs

Summary
  zig-out/bin/ghostty-bench +is-symbol --data=jawiki-20181001-corpus.xml --mode=uucode ran
    1.05 ± 0.00 times faster than zig-out/bin/ghostty-bench +is-symbol --data=jawiki-20181001-corpus.xml --mode=table
```

English

```
hyperfine --warmup 1 --runs=5 zig-out/bin/ghostty-bench +is-symbol --data=enwiki-20181001-corpus.xml --mode=table zig-out/bin/ghostty-bench +is-symbol --data=enwiki-20181001-corpus.xml --mode=uucode
Benchmark 1: zig-out/bin/ghostty-bench +is-symbol --data=enwiki-20181001-corpus.xml --mode=table
  Time (mean ± σ):      2.450 s ±  0.005 s    [User: 2.267 s, System: 0.179 s]
  Range (min … max):    2.446 s …  2.457 s    5 runs

Benchmark 2: zig-out/bin/ghostty-bench +is-symbol --data=enwiki-20181001-corpus.xml --mode=uucode
  Time (mean ± σ):      2.212 s ±  0.002 s    [User: 2.036 s, System: 0.173 s]
  Range (min … max):    2.208 s …  2.214 s    5 runs

Summary
  zig-out/bin/ghostty-bench +is-symbol --data=enwiki-20181001-corpus.xml --mode=uucode ran
    1.11 ± 0.00 times faster than zig-out/bin/ghostty-bench +is-symbol --data=enwiki-20181001-corpus.xml --mode=table
```
pull/8938/head
Mitchell Hashimoto 2025-09-30 06:54:17 -07:00 committed by GitHub
commit 6da3727431
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
25 changed files with 468 additions and 183 deletions

View File

@ -42,6 +42,10 @@
.hash = "ziglyph-0.11.2-AAAAAHPtHwB4Mbzn1KvOV7Wpjo82NYEc_v0WC8oCLrkf",
.lazy = true,
},
.uucode = .{
.url = "https://github.com/jacobsandlund/uucode/archive/190706c6b56f0842d29778007f74f7d3d1335fc5.tar.gz",
.hash = "uucode-0.1.0-ZZjBPpAFQABNCvd9cVPBg4I7233Ays-NWfWphPNqGbyE",
},
.zig_wayland = .{
// codeberg ifreund/zig-wayland
.url = "https://codeberg.org/ifreund/zig-wayland/archive/f3c5d503e540ada8cbcb056420de240af0c094f7.tar.gz",

5
build.zig.zon.json generated
View File

@ -109,6 +109,11 @@
"url": "https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz",
"hash": "sha256-/8ZooxDndgfTk/PBizJxXyI9oerExNbgV5oR345rWc8="
},
"uucode-0.0.0-ZZjBPk0GQACuYIoFqT_Vzkvn8Ur_M3dE7o4DNUE65Z7v": {
"name": "uucode",
"url": "https://github.com/jacobsandlund/uucode/archive/f748edb9639d9b3c8ae13d6190953f419a615343.tar.gz",
"hash": "sha256-j1ZNumH19olw0DHTEb6sChnCZpYhK9+1Q/rr6nxPRcQ="
},
"vaxis-0.1.0-BWNV_FUICQAFZnTCL11TUvnUr1Y0_ZdqtXHhd51d76Rn": {
"name": "vaxis",
"url": "git+https://github.com/rockorager/libvaxis#1f41c121e8fc153d9ce8c6eb64b2bbab68ad7d23",

8
build.zig.zon.nix generated
View File

@ -258,6 +258,14 @@ in
hash = "sha256-/8ZooxDndgfTk/PBizJxXyI9oerExNbgV5oR345rWc8=";
};
}
{
name = "uucode-0.0.0-ZZjBPk0GQACuYIoFqT_Vzkvn8Ur_M3dE7o4DNUE65Z7v";
path = fetchZigArtifact {
name = "uucode";
url = "https://github.com/jacobsandlund/uucode/archive/f748edb9639d9b3c8ae13d6190953f419a615343.tar.gz";
hash = "sha256-j1ZNumH19olw0DHTEb6sChnCZpYhK9+1Q/rr6nxPRcQ=";
};
}
{
name = "vaxis-0.1.0-BWNV_FUICQAFZnTCL11TUvnUr1Y0_ZdqtXHhd51d76Rn";
path = fetchZigArtifact {

1
build.zig.zon.txt generated
View File

@ -28,6 +28,7 @@ https://deps.files.ghostty.org/zig_js-12205a66d423259567764fa0fc60c82be35365c21a
https://deps.files.ghostty.org/ziglyph-b89d43d1e3fb01b6074bc1f7fc980324b04d26a5.tar.gz
https://deps.files.ghostty.org/zlib-1220fed0c74e1019b3ee29edae2051788b080cd96e90d56836eea857b0b966742efb.tar.gz
https://github.com/ghostty-org/zig-gobject/releases/download/2025-09-20-20-1/ghostty-gobject-2025-09-20-20-1.tar.zst
https://github.com/jacobsandlund/uucode/archive/f748edb9639d9b3c8ae13d6190953f419a615343.tar.gz
https://github.com/mbadolato/iTerm2-Color-Schemes/releases/download/release-20250922-150534-d28055b/ghostty-themes.tgz
https://github.com/mitchellh/libxev/archive/7f803181b158a10fec8619f793e3b4df515566cb.tar.gz
https://github.com/mitchellh/zig-objc/archive/c9e917a4e15a983b672ca779c7985d738a2d517c.tar.gz

View File

@ -131,6 +131,12 @@
"dest": "vendor/p/N-V-__8AAHffAgDU0YQmynL8K35WzkcnMUmBVQHQ0jlcKpjH",
"sha256": "ffc668a310e77607d393f3c18b32715f223da1eac4c4d6e0579a11df8e6b59cf"
},
{
"type": "archive",
"url": "https://github.com/jacobsandlund/uucode/archive/f748edb9639d9b3c8ae13d6190953f419a615343.tar.gz",
"dest": "vendor/p/uucode-0.0.0-ZZjBPk0GQACuYIoFqT_Vzkvn8Ur_M3dE7o4DNUE65Z7v",
"sha256": "8f564dba61f5f68970d031d311beac0a19c26696212bdfb543faebea7c4f45c4"
},
{
"type": "git",
"url": "https://github.com/rockorager/libvaxis",

View File

@ -10,6 +10,7 @@ const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const Benchmark = @import("Benchmark.zig");
const options = @import("options.zig");
const uucode = @import("uucode");
const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
const simd = @import("../simd/main.zig");
const table = @import("../unicode/main.zig").table;
@ -47,6 +48,9 @@ pub const Mode = enum {
/// Test our lookup table implementation.
table,
/// Using uucode, with custom `width` extension based on `wcwidth`.
uucode,
};
/// Create a new terminal stream handler for the given arguments.
@ -71,6 +75,7 @@ pub fn benchmark(self: *CodepointWidth) Benchmark {
.wcwidth => stepWcwidth,
.table => stepTable,
.simd => stepSimd,
.uucode => stepUucode,
},
.setupFn = setup,
.teardownFn = teardown,
@ -180,6 +185,35 @@ fn stepSimd(ptr: *anyopaque) Benchmark.Error!void {
}
}
fn stepUucode(ptr: *anyopaque) Benchmark.Error!void {
const self: *CodepointWidth = @ptrCast(@alignCast(ptr));
const f = self.data_f orelse return;
var r = std.io.bufferedReader(f.reader());
var d: UTF8Decoder = .{};
var buf: [4096]u8 align(std.atomic.cache_line) = undefined;
while (true) {
const n = r.read(&buf) catch |err| {
log.warn("error reading data file err={}", .{err});
return error.BenchmarkFailed;
};
if (n == 0) break; // EOF reached
for (buf[0..n]) |c| {
const cp_, const consumed = d.next(c);
assert(consumed);
if (cp_) |cp| {
// This is the same trick we do in terminal.zig so we
// keep it here.
std.mem.doNotOptimizeAway(if (cp <= 0xFF)
1
else
uucode.get(.width, @intCast(cp)));
}
}
}
}
test CodepointWidth {
const testing = std.testing;
const alloc = testing.allocator;

View File

@ -8,6 +8,7 @@ const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const Benchmark = @import("Benchmark.zig");
const options = @import("options.zig");
const uucode = @import("uucode");
const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
const unicode = @import("../unicode/main.zig");
@ -38,6 +39,9 @@ pub const Mode = enum {
/// Ghostty's table-based approach.
table,
/// uucode implementation
uucode,
};
/// Create a new terminal stream handler for the given arguments.
@ -60,6 +64,7 @@ pub fn benchmark(self: *GraphemeBreak) Benchmark {
.stepFn = switch (self.opts.mode) {
.noop => stepNoop,
.table => stepTable,
.uucode => stepUucode,
},
.setupFn = setup,
.teardownFn = teardown,
@ -133,6 +138,33 @@ fn stepTable(ptr: *anyopaque) Benchmark.Error!void {
}
}
fn stepUucode(ptr: *anyopaque) Benchmark.Error!void {
const self: *GraphemeBreak = @ptrCast(@alignCast(ptr));
const f = self.data_f orelse return;
var r = std.io.bufferedReader(f.reader());
var d: UTF8Decoder = .{};
var state: uucode.grapheme.BreakState = .default;
var cp1: u21 = 0;
var buf: [4096]u8 align(std.atomic.cache_line) = undefined;
while (true) {
const n = r.read(&buf) catch |err| {
log.warn("error reading data file err={}", .{err});
return error.BenchmarkFailed;
};
if (n == 0) break; // EOF reached
for (buf[0..n]) |c| {
const cp_, const consumed = d.next(c);
assert(consumed);
if (cp_) |cp2| {
std.mem.doNotOptimizeAway(uucode.grapheme.isBreak(cp1, @intCast(cp2), &state));
cp1 = cp2;
}
}
}
}
test GraphemeBreak {
const testing = std.testing;
const alloc = testing.allocator;

View File

@ -10,7 +10,7 @@ const Allocator = std.mem.Allocator;
const Benchmark = @import("Benchmark.zig");
const options = @import("options.zig");
const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
const symbols = @import("../unicode/symbols_ziglyph.zig");
const uucode = @import("uucode");
const symbols_table = @import("../unicode/symbols_table.zig").table;
const log = std.log.scoped(.@"is-symbol-bench");
@ -22,7 +22,7 @@ data_f: ?std.fs.File = null,
pub const Options = struct {
/// Which test to run.
mode: Mode = .ziglyph,
mode: Mode = .uucode,
/// The data to read as a filepath. If this is "-" then
/// we will read stdin. If this is unset, then we will
@ -33,8 +33,8 @@ pub const Options = struct {
};
pub const Mode = enum {
/// "Naive" ziglyph implementation.
ziglyph,
/// uucode implementation
uucode,
/// Ghostty's table-based approach.
table,
@ -58,7 +58,7 @@ pub fn destroy(self: *IsSymbol, alloc: Allocator) void {
pub fn benchmark(self: *IsSymbol) Benchmark {
return .init(self, .{
.stepFn = switch (self.opts.mode) {
.ziglyph => stepZiglyph,
.uucode => stepUucode,
.table => stepTable,
},
.setupFn = setup,
@ -86,7 +86,7 @@ fn teardown(ptr: *anyopaque) void {
}
}
fn stepZiglyph(ptr: *anyopaque) Benchmark.Error!void {
fn stepUucode(ptr: *anyopaque) Benchmark.Error!void {
const self: *IsSymbol = @ptrCast(@alignCast(ptr));
const f = self.data_f orelse return;
@ -104,7 +104,7 @@ fn stepZiglyph(ptr: *anyopaque) Benchmark.Error!void {
const cp_, const consumed = d.next(c);
assert(consumed);
if (cp_) |cp| {
std.mem.doNotOptimizeAway(symbols.isSymbol(cp));
std.mem.doNotOptimizeAway(uucode.get(.is_symbol, cp));
}
}
}

View File

@ -79,7 +79,7 @@ fn step(ptr: *anyopaque) Benchmark.Error!void {
var p: terminalpkg.Parser = .init();
var buf: [4096]u8 = undefined;
var buf: [4096]u8 align(std.atomic.cache_line) = undefined;
while (true) {
const n = r.read(&buf) catch |err| {
log.warn("error reading data file err={}", .{err});

View File

@ -115,7 +115,7 @@ fn step(ptr: *anyopaque) Benchmark.Error!void {
const f = self.data_f orelse return;
var r = std.io.bufferedReader(f.reader());
var buf: [4096]u8 = undefined;
var buf: [4096]u8 align(std.atomic.cache_line) = undefined;
while (true) {
const n = r.read(&buf) catch |err| {
log.warn("error reading data file err={}", .{err});

View File

@ -17,16 +17,26 @@ help_strings: HelpStrings,
metallib: ?*MetallibStep,
unicode_tables: UnicodeTables,
framedata: GhosttyFrameData,
uucode_tables: std.Build.LazyPath,
/// Used to keep track of a list of file sources.
pub const LazyPathList = std.ArrayList(std.Build.LazyPath);
pub fn init(b: *std.Build, cfg: *const Config) !SharedDeps {
const uucode_tables = blk: {
const uucode = b.dependency("uucode", .{
.build_config_path = b.path("src/build/uucode_config.zig"),
});
break :blk uucode.namedLazyPath("tables.zig");
};
var result: SharedDeps = .{
.config = cfg,
.help_strings = try .init(b, cfg),
.unicode_tables = try .init(b),
.unicode_tables = try .init(b, uucode_tables),
.framedata = try .init(b),
.uucode_tables = uucode_tables,
// Setup by retarget
.options = undefined,
@ -393,12 +403,22 @@ pub fn add(
})) |dep| {
step.root_module.addImport("z2d", dep.module("z2d"));
}
if (step.kind == .@"test") {
if (b.lazyDependency("ziglyph", .{
.target = target,
.optimize = optimize,
.target = step.root_module.resolved_target.?,
.optimize = step.root_module.optimize.?,
})) |dep| {
step.root_module.addImport("ziglyph", dep.module("ziglyph"));
}
}
if (b.lazyDependency("uucode", .{
.target = target,
.optimize = optimize,
.tables_path = self.uucode_tables,
.build_config_path = b.path("src/build/uucode_config.zig"),
})) |dep| {
step.root_module.addImport("uucode", dep.module("uucode"));
}
if (b.lazyDependency("zf", .{
.target = target,
.optimize = optimize,

View File

@ -11,11 +11,11 @@ symbols_exe: *std.Build.Step.Compile,
props_output: std.Build.LazyPath,
symbols_output: std.Build.LazyPath,
pub fn init(b: *std.Build) !UnicodeTables {
pub fn init(b: *std.Build, uucode_tables: std.Build.LazyPath) !UnicodeTables {
const props_exe = b.addExecutable(.{
.name = "props-unigen",
.root_module = b.createModule(.{
.root_source_file = b.path("src/unicode/props_ziglyph.zig"),
.root_source_file = b.path("src/unicode/props_uucode.zig"),
.target = b.graph.host,
.strip = false,
.omit_frame_pointer = false,
@ -26,7 +26,7 @@ pub fn init(b: *std.Build) !UnicodeTables {
const symbols_exe = b.addExecutable(.{
.name = "symbols-unigen",
.root_module = b.createModule(.{
.root_source_file = b.path("src/unicode/symbols_ziglyph.zig"),
.root_source_file = b.path("src/unicode/symbols_uucode.zig"),
.target = b.graph.host,
.strip = false,
.omit_frame_pointer = false,
@ -34,14 +34,13 @@ pub fn init(b: *std.Build) !UnicodeTables {
}),
});
if (b.lazyDependency("ziglyph", .{
if (b.lazyDependency("uucode", .{
.target = b.graph.host,
})) |ziglyph_dep| {
.tables_path = uucode_tables,
.build_config_path = b.path("src/build/uucode_config.zig"),
})) |dep| {
inline for (&.{ props_exe, symbols_exe }) |exe| {
exe.root_module.addImport(
"ziglyph",
ziglyph_dep.module("ziglyph"),
);
exe.root_module.addImport("uucode", dep.module("uucode"));
}
}

View File

@ -0,0 +1,81 @@
const std = @import("std");
const config = @import("config.zig");
const config_x = @import("config.x.zig");
const d = config.default;
const wcwidth = config_x.wcwidth;
const Allocator = std.mem.Allocator;
fn computeWidth(
alloc: std.mem.Allocator,
cp: u21,
data: anytype,
backing: anytype,
tracking: anytype,
) Allocator.Error!void {
_ = alloc;
_ = cp;
_ = backing;
_ = tracking;
data.width = @intCast(@min(2, @max(0, data.wcwidth)));
}
const width = config.Extension{
.inputs = &.{"wcwidth"},
.compute = &computeWidth,
.fields = &.{
.{ .name = "width", .type = u2 },
},
};
fn computeIsSymbol(
alloc: Allocator,
cp: u21,
data: anytype,
backing: anytype,
tracking: anytype,
) Allocator.Error!void {
_ = alloc;
_ = cp;
_ = backing;
_ = tracking;
const block = data.block;
data.is_symbol = data.general_category == .other_private_use or
block == .dingbats or
block == .emoticons or
block == .miscellaneous_symbols or
block == .enclosed_alphanumerics or
block == .enclosed_alphanumeric_supplement or
block == .miscellaneous_symbols_and_pictographs or
block == .transport_and_map_symbols;
}
const is_symbol = config.Extension{
.inputs = &.{ "block", "general_category" },
.compute = &computeIsSymbol,
.fields = &.{
.{ .name = "is_symbol", .type = bool },
},
};
pub const tables = [_]config.Table{
.{
.name = "runtime",
.extensions = &.{},
.fields = &.{
d.field("is_emoji_presentation"),
d.field("case_folding_full"),
},
},
.{
.name = "buildtime",
.extensions = &.{ wcwidth, width, is_symbol },
.fields = &.{
width.field("width"),
d.field("grapheme_break"),
is_symbol.field("is_symbol"),
d.field("is_emoji_modifier"),
d.field("is_emoji_modifier_base"),
},
},
};

View File

@ -13,7 +13,7 @@ const CodepointResolver = @This();
const std = @import("std");
const Allocator = std.mem.Allocator;
const ziglyph = @import("ziglyph");
const uucode = @import("uucode");
const font = @import("main.zig");
const Atlas = font.Atlas;
const CodepointMap = font.CodepointMap;
@ -150,7 +150,7 @@ pub fn getIndex(
// we'll do this multiple times if we recurse, but this is a cached function
// call higher up (GroupCache) so this should be rare.
const p_mode: Collection.PresentationMode = if (p) |v| .{ .explicit = v } else .{
.default = if (ziglyph.emoji.isEmojiPresentation(@intCast(cp)))
.default = if (uucode.get(.is_emoji_presentation, @intCast(cp)))
.emoji
else
.text,

View File

@ -1,9 +1,9 @@
const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const ziglyph = @import("ziglyph");
const font = @import("../main.zig");
const terminal = @import("../../terminal/main.zig");
const unicode = @import("../../unicode/main.zig");
const log = std.log.scoped(.font_shaper);
@ -111,7 +111,7 @@ pub const Shaper = struct {
// font ligatures. However, we do support grapheme clustering.
// This means we can render things like skin tone emoji but
// we can't render things like single glyph "=>".
var break_state: u3 = 0;
var break_state: unicode.GraphemeBreakState = .{};
var cp1: u21 = @intCast(codepoints[0]);
var start: usize = 0;
@ -126,7 +126,7 @@ pub const Shaper = struct {
const cp2: u21 = @intCast(codepoints[i]);
defer cp1 = cp2;
break :blk ziglyph.graphemeBreak(
break :blk unicode.graphemeBreak(
cp1,
cp2,
&break_state,

View File

@ -6,7 +6,7 @@ const std = @import("std");
const Allocator = std.mem.Allocator;
const assert = std.debug.assert;
const build_config = @import("../build_config.zig");
const ziglyph = @import("ziglyph");
const uucode = @import("uucode");
const key = @import("key.zig");
const KeyEvent = key.KeyEvent;
@ -1618,15 +1618,19 @@ pub const Trigger = struct {
/// in more codepoints so we need to use a 3 element array.
fn foldedCodepoint(cp: u21) [3]u21 {
// ASCII fast path
if (ziglyph.letter.isAsciiLetter(cp)) {
return .{ ziglyph.letter.toLower(cp), 0, 0 };
if (uucode.ascii.isAlphabetic(cp)) {
return .{ uucode.ascii.toLower(cp), 0, 0 };
}
// Unicode slow path. Case folding can result in more codepoints.
// If more codepoints are produced then we return the codepoint
// as-is which isn't correct but until we have a failing test
// then I don't want to handle this.
return ziglyph.letter.toCaseFold(cp);
var buffer: [1]u21 = undefined;
const slice = uucode.get(.case_folding_full, cp).with(&buffer, cp);
var array: [3]u21 = [_]u21{0} ** 3;
@memcpy(array[0..slice.len], slice);
return array;
}
/// Convert the trigger to a C API compatible trigger.

View File

@ -191,8 +191,8 @@ test {
_ = @import("simd/main.zig");
_ = @import("synthetic/main.zig");
_ = @import("unicode/main.zig");
_ = @import("unicode/props_ziglyph.zig");
_ = @import("unicode/symbols_ziglyph.zig");
_ = @import("unicode/props_uucode.zig");
_ = @import("unicode/symbols_uucode.zig");
// Extra
_ = @import("extra/bash.zig");

View File

@ -6,7 +6,9 @@ extern "c" fn ghostty_simd_codepoint_width(u32) i8;
pub fn codepointWidth(cp: u32) i8 {
if (comptime options.simd) return ghostty_simd_codepoint_width(cp);
return @import("ziglyph").display_width.codePointWidth(@intCast(cp), .half);
const uucode = @import("uucode");
if (cp > uucode.config.max_code_point) return 1;
return @import("uucode").get(.width, @intCast(cp));
}
test "codepointWidth basic" {
@ -20,26 +22,30 @@ test "codepointWidth basic" {
try testing.expectEqual(@as(i8, 2), codepointWidth(0xF900)); //
try testing.expectEqual(@as(i8, 2), codepointWidth(0x20000)); // 𠀀
try testing.expectEqual(@as(i8, 2), codepointWidth(0x30000)); // 𠀀
// try testing.expectEqual(@as(i8, 1), @import("ziglyph").display_width.codePointWidth(0x100, .half));
// try testing.expectEqual(@as(i8, 1), @import("uucode").get(.width, 0x100));
}
// This is not very fast in debug modes, so its commented by default.
// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
// test "codepointWidth matches ziglyph" {
// test "codepointWidth matches uucode" {
// const testing = std.testing;
// const ziglyph = @import("ziglyph");
// const uucode = @import("uucode");
//
// const min = 0xFF + 1; // start outside ascii
// for (min..std.math.maxInt(u21)) |cp| {
// const max = std.math.maxInt(u21) + 1;
// for (min..max) |cp| {
// const simd = codepointWidth(@intCast(cp));
// const zg = ziglyph.display_width.codePointWidth(@intCast(cp), .half);
// if (simd != zg) mismatch: {
// const uu = if (cp > uucode.config.max_code_point)
// 1
// else
// uucode.get(.width, @intCast(cp));
// if (simd != uu) mismatch: {
// if (cp == 0x2E3B) {
// try testing.expectEqual(@as(i8, 2), simd);
// break :mismatch;
// }
//
// std.log.warn("mismatch cp=U+{x} simd={} zg={}", .{ cp, simd, zg });
// std.log.warn("mismatch cp=U+{x} simd={} uucode={}", .{ cp, simd, uu });
// try testing.expect(false);
// }
// }

View File

@ -434,8 +434,8 @@ pub fn print(self: *Terminal, c: u21) !void {
// control characters because they're always filtered prior.
const width: usize = if (c <= 0xFF) 1 else @intCast(unicode.table.get(c).width);
// Note: it is possible to have a width of "3" and a width of "-1"
// from ziglyph. We should look into those cases and handle them
// Note: it is possible to have a width of "3" and a width of "-1" from
// uucode.x's wcwidth. We should look into those cases and handle them
// appropriately.
assert(width <= 2);
// log.debug("c={x} width={}", .{ c, width });

View File

@ -151,35 +151,39 @@ fn graphemeBreakClass(
/// If you build this file as a binary, we will verify the grapheme break
/// implementation. This iterates over billions of codepoints so it is
/// SLOW. It's not meant to be run in CI, but it's useful for debugging.
/// TODO: this is hard to build with newer zig build, so
/// https://github.com/ghostty-org/ghostty/pull/7806 took the approach of
/// adding a `-Demit-unicode-test` option for `zig build`, but that
/// hasn't been done here.
pub fn main() !void {
const ziglyph = @import("ziglyph");
const uucode = @import("uucode");
// Set the min and max to control the test range.
const min = 0;
const max = std.math.maxInt(u21) + 1;
const max = uucode.config.max_code_point + 1;
var state: BreakState = .{};
var zg_state: u3 = 0;
var uu_state: uucode.grapheme.BreakState = .default;
for (min..max) |cp1| {
if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});
if (cp1 == '\r' or cp1 == '\n' or
ziglyph.grapheme_break.isControl(@intCast(cp1))) continue;
uucode.get(.grapheme_break, @intCast(cp1)) == .control) continue;
for (min..max) |cp2| {
if (cp2 == '\r' or cp2 == '\n' or
ziglyph.grapheme_break.isControl(@intCast(cp2))) continue;
uucode.get(.grapheme_break, @intCast(cp1)) == .control) continue;
const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
const zg_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state);
if (gb != zg_gb) {
std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
const uu_gb = uucode.grapheme.isBreak(@intCast(cp1), @intCast(cp2), &uu_state);
if (gb != uu_gb) {
std.log.warn("cp1={x} cp2={x} gb={} state={} uu_gb={} uu_state={}", .{
cp1,
cp2,
gb,
state,
zg_gb,
zg_state,
uu_gb,
uu_state,
});
}
}

View File

@ -7,7 +7,7 @@ pub const table = table: {
// build.zig process, but due to Zig's lazy analysis we can still reference
// it here.
//
// An example process is the `main` in `props_ziglyph.zig`
// An example process is the `main` in `props_uucode.zig`
const generated = @import("unicode_tables").Tables(Properties);
const Tables = lut.Tables(Properties);
break :table Tables{

View File

@ -0,0 +1,173 @@
const props = @This();
const std = @import("std");
const assert = std.debug.assert;
const uucode = @import("uucode");
const lut = @import("lut.zig");
const Properties = @import("Properties.zig");
const GraphemeBoundaryClass = Properties.GraphemeBoundaryClass;
/// Gets the grapheme boundary class for a codepoint.
/// The use case for this is only in generating lookup tables.
fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
if (cp > uucode.config.max_code_point) return .invalid;
// We special-case modifier bases because we should not break
// if a modifier isn't next to a base.
if (uucode.get(.is_emoji_modifier, cp)) return .emoji_modifier;
if (uucode.get(.is_emoji_modifier_base, cp)) return .extended_pictographic_base;
return switch (uucode.get(.grapheme_break, cp)) {
.extended_pictographic => .extended_pictographic,
.l => .L,
.v => .V,
.t => .T,
.lv => .LV,
.lvt => .LVT,
.prepend => .prepend,
.zwj => .zwj,
.spacing_mark => .spacing_mark,
.regional_indicator => .regional_indicator,
.zwnj,
.indic_conjunct_break_extend,
.indic_conjunct_break_linker,
=> .extend,
// This is obviously not INVALID invalid, there is SOME grapheme
// boundary class for every codepoint. But we don't care about
// anything that doesn't fit into the above categories. Also note
// that `indic_conjunct_break_consonant` is `other` in
// 'GraphemeBreakProperty.txt' (it's missing).
.other,
.indic_conjunct_break_consonant,
.cr,
.lf,
.control,
=> .invalid,
};
}
pub fn get(cp: u21) Properties {
const width = if (cp > uucode.config.max_code_point)
1
else
uucode.get(.width, cp);
return .{
.width = width,
.grapheme_boundary_class = graphemeBoundaryClass(cp),
};
}
/// Runnable binary to generate the lookup tables and output to stdout.
pub fn main() !void {
var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena_state.deinit();
const alloc = arena_state.allocator();
const gen: lut.Generator(
Properties,
struct {
pub fn get(ctx: @This(), cp: u21) !Properties {
_ = ctx;
return props.get(cp);
}
pub fn eql(ctx: @This(), a: Properties, b: Properties) bool {
_ = ctx;
return a.eql(b);
}
},
) = .{};
const t = try gen.generate(alloc);
defer alloc.free(t.stage1);
defer alloc.free(t.stage2);
defer alloc.free(t.stage3);
try t.writeZig(std.io.getStdOut().writer());
// Uncomment when manually debugging to see our table sizes.
// std.log.warn("stage1={} stage2={} stage3={}", .{
// t.stage1.len,
// t.stage2.len,
// t.stage3.len,
// });
}
test "unicode props: tables match uucode" {
if (std.valgrind.runningOnValgrind() > 0) return error.SkipZigTest;
const testing = std.testing;
const table = @import("props_table.zig").table;
const min = 0xFF + 1; // start outside ascii
const max = std.math.maxInt(u21) + 1;
for (min..max) |cp| {
const t = table.get(@intCast(cp));
const uu = if (cp > uucode.config.max_code_point)
1
else
uucode.get(.width, @intCast(cp));
if (t.width != uu) {
std.log.warn("mismatch cp=U+{x} t={} uu={}", .{ cp, t.width, uu });
try testing.expect(false);
}
}
}
test "unicode props: tables match ziglyph" {
if (std.valgrind.runningOnValgrind() > 0) return error.SkipZigTest;
const testing = std.testing;
const table = @import("props_table.zig").table;
const ziglyph = @import("ziglyph");
const min = 0xFF + 1; // start outside ascii
const max = std.math.maxInt(u21) + 1;
for (min..max) |cp| {
const t = table.get(@intCast(cp));
const zg = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half)));
if (t.width != zg) {
// Known exceptions
if (cp == 0x0897) continue; // non-spacing mark (t = 0)
if (cp == 0x2065) continue; // unassigned (t = 1)
if (cp >= 0x2630 and cp <= 0x2637) continue; // east asian width is wide (t = 2)
if (cp >= 0x268A and cp <= 0x268F) continue; // east asian width is wide (t = 2)
if (cp >= 0x2FFC and cp <= 0x2FFF) continue; // east asian width is wide (t = 2)
if (cp == 0x31E4 or cp == 0x31E5) continue; // east asian width is wide (t = 2)
if (cp == 0x31EF) continue; // east asian width is wide (t = 2)
if (cp >= 0x4DC0 and cp <= 0x4DFF) continue; // east asian width is wide (t = 2)
if (cp >= 0xFFF0 and cp <= 0xFFF8) continue; // unassigned (t = 1)
if (cp >= 0xFFF0 and cp <= 0xFFF8) continue; // unassigned (t = 1)
if (cp >= 0x10D69 and cp <= 0x10D6D) continue; // non-spacing mark, despite being east asian width normal (t = 0)
if (cp >= 0x10EFC and cp <= 0x10EFF) continue; // non-spacing mark, despite being east asian width normal (t = 0)
if (cp >= 0x113BB and cp <= 0x113C0) continue; // non-spacing mark, despite being east asian width normal (t = 0)
if (cp == 0x113CE) continue; // non-spacing mark, despite being east asian width normal (t = 0)
if (cp == 0x113D0) continue; // non-spacing mark, despite being east asian width normal (t = 0)
if (cp == 0x113D2) continue; // non-spacing mark, despite being east asian width normal (t = 0)
if (cp == 0x113E1) continue; // non-spacing mark, despite being east asian width normal (t = 0)
if (cp == 0x113E2) continue; // non-spacing mark, despite being east asian width normal (t = 0)
if (cp == 0x1171E) continue; // mark spacing combining (t = 1)
if (cp == 0x11F5A) continue; // non-spacing mark, despite being east asian width normal (t = 0)
if (cp == 0x1611E) continue; // non-spacing mark, despite being east asian width normal (t = 0)
if (cp == 0x1611F) continue; // non-spacing mark, despite being east asian width normal (t = 0)
if (cp >= 0x16120 and cp <= 0x1612F) continue; // non-spacing mark, despite being east asian width normal (t = 0)
if (cp >= 0xE0000 and cp <= 0xE0FFF) continue; // ziglyph ignores these with 0, but many are unassigned (t = 1)
if (cp == 0x18CFF) continue; // east asian width is wide (t = 2)
if (cp >= 0x1D300 and cp <= 0x1D376) continue; // east asian width is wide (t = 2)
if (cp == 0x1E5EE) continue; // non-spacing mark, despite being east asian width normal (t = 0)
if (cp == 0x1E5EF) continue; // non-spacing mark, despite being east asian width normal (t = 0)
if (cp == 0x1FA89) continue; // east asian width is wide (t = 2)
if (cp == 0x1FA8F) continue; // east asian width is wide (t = 2)
if (cp == 0x1FABE) continue; // east asian width is wide (t = 2)
if (cp == 0x1FAC6) continue; // east asian width is wide (t = 2)
if (cp == 0x1FADC) continue; // east asian width is wide (t = 2)
if (cp == 0x1FADF) continue; // east asian width is wide (t = 2)
if (cp == 0x1FAE9) continue; // east asian width is wide (t = 2)
std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t.width, zg });
try testing.expect(false);
}
}
}

View File

@ -1,96 +0,0 @@
const props = @This();
const std = @import("std");
const assert = std.debug.assert;
const ziglyph = @import("ziglyph");
const lut = @import("lut.zig");
const Properties = @import("Properties.zig");
const GraphemeBoundaryClass = Properties.GraphemeBoundaryClass;
/// Gets the grapheme boundary class for a codepoint. This is VERY
/// SLOW. The use case for this is only in generating lookup tables.
fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
// We special-case modifier bases because we should not break
// if a modifier isn't next to a base.
if (ziglyph.emoji.isEmojiModifierBase(cp)) {
assert(ziglyph.emoji.isExtendedPictographic(cp));
return .extended_pictographic_base;
}
if (ziglyph.emoji.isEmojiModifier(cp)) return .emoji_modifier;
if (ziglyph.emoji.isExtendedPictographic(cp)) return .extended_pictographic;
if (ziglyph.grapheme_break.isL(cp)) return .L;
if (ziglyph.grapheme_break.isV(cp)) return .V;
if (ziglyph.grapheme_break.isT(cp)) return .T;
if (ziglyph.grapheme_break.isLv(cp)) return .LV;
if (ziglyph.grapheme_break.isLvt(cp)) return .LVT;
if (ziglyph.grapheme_break.isPrepend(cp)) return .prepend;
if (ziglyph.grapheme_break.isExtend(cp)) return .extend;
if (ziglyph.grapheme_break.isZwj(cp)) return .zwj;
if (ziglyph.grapheme_break.isSpacingmark(cp)) return .spacing_mark;
if (ziglyph.grapheme_break.isRegionalIndicator(cp)) return .regional_indicator;
// This is obviously not INVALID invalid, there is SOME grapheme
// boundary class for every codepoint. But we don't care about
// anything that doesn't fit into the above categories.
return .invalid;
}
pub fn get(cp: u21) Properties {
const zg_width = ziglyph.display_width.codePointWidth(cp, .half);
return .{
.width = @intCast(@min(2, @max(0, zg_width))),
.grapheme_boundary_class = graphemeBoundaryClass(cp),
};
}
/// Runnable binary to generate the lookup tables and output to stdout.
pub fn main() !void {
var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena_state.deinit();
const alloc = arena_state.allocator();
const gen: lut.Generator(
Properties,
struct {
pub fn get(ctx: @This(), cp: u21) !Properties {
_ = ctx;
return props.get(cp);
}
pub fn eql(ctx: @This(), a: Properties, b: Properties) bool {
_ = ctx;
return a.eql(b);
}
},
) = .{};
const t = try gen.generate(alloc);
defer alloc.free(t.stage1);
defer alloc.free(t.stage2);
defer alloc.free(t.stage3);
try t.writeZig(std.io.getStdOut().writer());
// Uncomment when manually debugging to see our table sizes.
// std.log.warn("stage1={} stage2={} stage3={}", .{
// t.stage1.len,
// t.stage2.len,
// t.stage3.len,
// });
}
// This is not very fast in debug modes, so its commented by default.
// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
// test "unicode props: tables match ziglyph" {
// const testing = std.testing;
//
// const min = 0xFF + 1; // start outside ascii
// for (min..std.math.maxInt(u21)) |cp| {
// const t = table.get(@intCast(cp));
// const zg = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half)));
// if (t.width != zg) {
// std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });
// try testing.expect(false);
// }
// }
// }

View File

@ -6,7 +6,7 @@ pub const table = table: {
// build.zig process, but due to Zig's lazy analysis we can still reference
// it here.
//
// An example process is the `main` in `symbols_ziglyph.zig`
// An example process is the `main` in `symbols_uucode.zig`
const generated = @import("symbols_tables").Tables(bool);
const Tables = lut.Tables(bool);
break :table Tables{

View File

@ -1,33 +1,7 @@
const props = @This();
const std = @import("std");
const assert = std.debug.assert;
const ziglyph = @import("ziglyph");
const uucode = @import("uucode");
const lut = @import("lut.zig");
/// Returns true of the codepoint is a "symbol-like" character, which
/// for now we define as anything in a private use area and anything
/// in several unicode blocks:
/// - Dingbats
/// - Emoticons
/// - Miscellaneous Symbols
/// - Enclosed Alphanumerics
/// - Enclosed Alphanumeric Supplement
/// - Miscellaneous Symbols and Pictographs
/// - Transport and Map Symbols
///
/// In the future it may be prudent to expand this to encompass more
/// symbol-like characters, and/or exclude some PUA sections.
pub fn isSymbol(cp: u21) bool {
return ziglyph.general_category.isPrivateUse(cp) or
ziglyph.blocks.isDingbats(cp) or
ziglyph.blocks.isEmoticons(cp) or
ziglyph.blocks.isMiscellaneousSymbols(cp) or
ziglyph.blocks.isEnclosedAlphanumerics(cp) or
ziglyph.blocks.isEnclosedAlphanumericSupplement(cp) or
ziglyph.blocks.isMiscellaneousSymbolsAndPictographs(cp) or
ziglyph.blocks.isTransportAndMapSymbols(cp);
}
/// Runnable binary to generate the lookup tables and output to stdout.
pub fn main() !void {
var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
@ -39,7 +13,10 @@ pub fn main() !void {
struct {
pub fn get(ctx: @This(), cp: u21) !bool {
_ = ctx;
return isSymbol(cp);
return if (cp > uucode.config.max_code_point)
false
else
uucode.get(.is_symbol, @intCast(cp));
}
pub fn eql(ctx: @This(), a: bool, b: bool) bool {
@ -63,9 +40,7 @@ pub fn main() !void {
// });
}
// This is not very fast in debug modes, so its commented by default.
// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CHANGES.
test "unicode symbols: tables match ziglyph" {
test "unicode symbols: tables match uucode" {
if (std.valgrind.runningOnValgrind() > 0) return error.SkipZigTest;
const testing = std.testing;
@ -73,7 +48,36 @@ test "unicode symbols: tables match ziglyph" {
for (0..std.math.maxInt(u21)) |cp| {
const t = table.get(@intCast(cp));
const zg = isSymbol(@intCast(cp));
const uu = if (cp > uucode.config.max_code_point)
false
else
uucode.get(.is_symbol, @intCast(cp));
if (t != uu) {
std.log.warn("mismatch cp=U+{x} t={} uu={}", .{ cp, t, uu });
try testing.expect(false);
}
}
}
test "unicode symbols: tables match ziglyph" {
if (std.valgrind.runningOnValgrind() > 0) return error.SkipZigTest;
const testing = std.testing;
const table = @import("symbols_table.zig").table;
const ziglyph = @import("ziglyph");
for (0..std.math.maxInt(u21)) |cp_usize| {
const cp: u21 = @intCast(cp_usize);
const t = table.get(cp);
const zg = ziglyph.general_category.isPrivateUse(cp) or
ziglyph.blocks.isDingbats(cp) or
ziglyph.blocks.isEmoticons(cp) or
ziglyph.blocks.isMiscellaneousSymbols(cp) or
ziglyph.blocks.isEnclosedAlphanumerics(cp) or
ziglyph.blocks.isEnclosedAlphanumericSupplement(cp) or
ziglyph.blocks.isMiscellaneousSymbolsAndPictographs(cp) or
ziglyph.blocks.isTransportAndMapSymbols(cp);
if (t != zg) {
std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });