unicode: fix VS15/VS16 check to consider emoji bases (#9679)

This PR builds on https://github.com/ghostty-org/ghostty/pull/9678 ~so
the diff from there is included here (it's not possible to stack PRs
unless it's a PR against my own fork)--review that one first!~

This PR fixes an issue with the VS15/VS16 handling in terminal `print`,
where before we didn't check for valid sequences if the base code point
is an emoji. This meant that a VS15 after an emoji that isn't part of a
valid sequence would narrow the cell, despite that the emoji doesn't
have a text presentation.

This PR also uses a single `is_emoji_vs_base` instead of the
`is_emoji_vs_emoji` and `is_emoji_vs_text`. See the [uucode
comment](215ff09730/src/config.zig (L239-L254))
for why I'm recommending this.

AI was used in some of the uucode changes in
https://github.com/ghostty-org/ghostty/pull/9678 (Amp--primarily for
tests), but everything was carefully vetted and much of it done by hand.
This PR was made without AI.
pull/9685/head
Mitchell Hashimoto 2025-11-24 08:20:07 -08:00 committed by GitHub
commit 98e7c17fcc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 113 additions and 31 deletions

View File

@ -90,8 +90,7 @@ pub const tables = [_]config.Table{
width.field("width"),
d.field("grapheme_break"),
is_symbol.field("is_symbol"),
d.field("is_emoji_vs_text"),
d.field("is_emoji_vs_emoji"),
d.field("is_emoji_vs_base"),
},
},
};

View File

@ -374,20 +374,10 @@ pub fn print(self: *Terminal, c: u21) !void {
// the cell width accordingly. VS16 makes the character wide and
// VS15 makes it narrow.
if (c == 0xFE0F or c == 0xFE0E) {
// This check below isn't robust enough to be correct.
// But it is correct enough (the emoji check alone served us
// well through Ghostty 1.2.3!) and we can fix it up later.
// Emoji always allow VS15/16
const prev_props = unicode.table.get(prev.cell.content.codepoint);
const emoji = prev_props.grapheme_boundary_class.isExtendedPictographic();
if (!emoji) valid_check: {
// If not an emoji, check if it is a defined variation
// sequence in emoji-variation-sequences.txt
if (c == 0xFE0F and prev_props.emoji_vs_emoji) break :valid_check;
if (c == 0xFE0E and prev_props.emoji_vs_text) break :valid_check;
return;
}
// Check if it is a valid variation sequence in
// emoji-variation-sequences.txt, and if not, ignore the char.
if (!prev_props.emoji_vs_base) return;
switch (c) {
0xFE0F => wide: {
@ -3288,7 +3278,7 @@ test "Terminal: print invalid VS16 non-grapheme" {
try t.print('x');
try t.print(0xFE0F);
// We should have 2 cells taken up. It is one character but "wide".
// We should have 1 narrow cell.
try testing.expectEqual(@as(usize, 0), t.screens.active.cursor.y);
try testing.expectEqual(@as(usize, 1), t.screens.active.cursor.x);
@ -3601,6 +3591,71 @@ test "Terminal: VS15 on already narrow emoji" {
}
}
test "Terminal: print invalid VS15 following emoji is wide" {
var t = try init(testing.allocator, .{ .cols = 80, .rows = 80 });
defer t.deinit(testing.allocator);
// Enable grapheme clustering
t.modes.set(.grapheme_cluster, true);
try t.print('\u{1F9E0}'); // 🧠
try t.print(0xFE0E); // not valid with U+1F9E0 as base
// We should have 2 cells taken up. It is one character but "wide".
try testing.expectEqual(@as(usize, 0), t.screens.active.cursor.y);
try testing.expectEqual(@as(usize, 2), t.screens.active.cursor.x);
// Assert various properties about our screen to verify
// we have all expected cells.
{
const list_cell = t.screens.active.pages.getCell(.{ .screen = .{ .x = 0, .y = 0 } }).?;
const cell = list_cell.cell;
try testing.expectEqual(@as(u21, '\u{1F9E0}'), cell.content.codepoint);
try testing.expect(!cell.hasGrapheme());
try testing.expectEqual(Cell.Wide.wide, cell.wide);
}
{
const list_cell = t.screens.active.pages.getCell(.{ .screen = .{ .x = 1, .y = 0 } }).?;
const cell = list_cell.cell;
try testing.expectEqual(@as(u21, 0), cell.content.codepoint);
try testing.expectEqual(Cell.Wide.spacer_tail, cell.wide);
}
}
test "Terminal: print invalid VS15 in emoji ZWJ sequence" {
var t = try init(testing.allocator, .{ .cols = 80, .rows = 80 });
defer t.deinit(testing.allocator);
// Enable grapheme clustering
t.modes.set(.grapheme_cluster, true);
try t.print('\u{1F469}'); // 👩
try t.print(0xFE0E); // not valid with U+1F469 as base
try t.print('\u{200D}'); // ZWJ
try t.print('\u{1F466}'); // 👦
// We should have 2 cells taken up. It is one character but "wide".
try testing.expectEqual(@as(usize, 0), t.screens.active.cursor.y);
try testing.expectEqual(@as(usize, 2), t.screens.active.cursor.x);
// Assert various properties about our screen to verify
// we have all expected cells.
{
const list_cell = t.screens.active.pages.getCell(.{ .screen = .{ .x = 0, .y = 0 } }).?;
const cell = list_cell.cell;
try testing.expectEqual(@as(u21, '\u{1F469}'), cell.content.codepoint);
try testing.expect(cell.hasGrapheme());
try testing.expectEqualSlices(u21, &.{ '\u{200D}', '\u{1F466}' }, list_cell.node.data.lookupGrapheme(cell).?);
try testing.expectEqual(Cell.Wide.wide, cell.wide);
}
{
const list_cell = t.screens.active.pages.getCell(.{ .screen = .{ .x = 1, .y = 0 } }).?;
const cell = list_cell.cell;
try testing.expectEqual(@as(u21, 0), cell.content.codepoint);
try testing.expectEqual(Cell.Wide.spacer_tail, cell.wide);
}
}
test "Terminal: VS15 to make narrow character with pending wrap" {
var t = try init(testing.allocator, .{ .rows = 5, .cols = 2 });
defer t.deinit(testing.allocator);
@ -3723,9 +3778,9 @@ test "Terminal: print invalid VS16 grapheme" {
// https://github.com/mitchellh/ghostty/issues/1482
try t.print('x');
try t.print(0xFE0F);
try t.print(0xFE0F); // invalid VS16
// We should have 2 cells taken up. It is one character but "wide".
// We should have 1 cells taken up, and narrow.
try testing.expectEqual(@as(usize, 0), t.screens.active.cursor.y);
try testing.expectEqual(@as(usize, 1), t.screens.active.cursor.x);
@ -3758,7 +3813,7 @@ test "Terminal: print invalid VS16 with second char" {
try t.print(0xFE0F);
try t.print('y');
// We should have 2 cells taken up. It is one character but "wide".
// We should have 2 cells taken up, from two separate narrow characters.
try testing.expectEqual(@as(usize, 0), t.screens.active.cursor.y);
try testing.expectEqual(@as(usize, 2), t.screens.active.cursor.x);
@ -3781,6 +3836,40 @@ test "Terminal: print invalid VS16 with second char" {
}
}
test "Terminal: print invalid VS16 with second char (combining)" {
var t = try init(testing.allocator, .{ .cols = 80, .rows = 80 });
defer t.deinit(testing.allocator);
// Enable grapheme clustering
t.modes.set(.grapheme_cluster, true);
// https://github.com/mitchellh/ghostty/issues/1482
try t.print('n');
try t.print(0xFE0F); // invalid VS16
try t.print(0x0303); // combining tilde
// We should have 1 cells taken up, and narrow.
try testing.expectEqual(@as(usize, 0), t.screens.active.cursor.y);
try testing.expectEqual(@as(usize, 1), t.screens.active.cursor.x);
// Assert various properties about our screen to verify
// we have all expected cells.
{
const list_cell = t.screens.active.pages.getCell(.{ .screen = .{ .x = 0, .y = 0 } }).?;
const cell = list_cell.cell;
try testing.expectEqual(@as(u21, 'n'), cell.content.codepoint);
try testing.expect(cell.hasGrapheme());
try testing.expectEqualSlices(u21, &.{'\u{0303}'}, list_cell.node.data.lookupGrapheme(cell).?);
try testing.expectEqual(Cell.Wide.narrow, cell.wide);
}
{
const list_cell = t.screens.active.pages.getCell(.{ .screen = .{ .x = 1, .y = 0 } }).?;
const cell = list_cell.cell;
try testing.expectEqual(@as(u21, 0), cell.content.codepoint);
try testing.expectEqual(Cell.Wide.narrow, cell.wide);
}
}
test "Terminal: overwrite grapheme should clear grapheme data" {
var t = try init(testing.allocator, .{ .rows = 5, .cols = 5 });
defer t.deinit(testing.allocator);

View File

@ -16,15 +16,13 @@ pub const Properties = packed struct {
grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
/// Emoji VS compatibility
emoji_vs_text: bool = false,
emoji_vs_emoji: bool = false,
emoji_vs_base: bool = false,
// Needed for lut.Generator
pub fn eql(a: Properties, b: Properties) bool {
return a.width == b.width and
a.grapheme_boundary_class == b.grapheme_boundary_class and
a.emoji_vs_text == b.emoji_vs_text and
a.emoji_vs_emoji == b.emoji_vs_emoji;
a.emoji_vs_base == b.emoji_vs_base;
}
// Needed for lut.Generator
@ -36,14 +34,12 @@ pub const Properties = packed struct {
\\.{{
\\ .width= {},
\\ .grapheme_boundary_class= .{s},
\\ .emoji_vs_text= {},
\\ .emoji_vs_emoji= {},
\\ .emoji_vs_base= {},
\\}}
, .{
self.width,
@tagName(self.grapheme_boundary_class),
self.emoji_vs_text,
self.emoji_vs_emoji,
self.emoji_vs_base,
});
}
};

View File

@ -48,15 +48,13 @@ pub fn get(cp: u21) Properties {
if (cp > uucode.config.max_code_point) return .{
.width = 1,
.grapheme_boundary_class = .invalid,
.emoji_vs_text = false,
.emoji_vs_emoji = false,
.emoji_vs_base = false,
};
return .{
.width = uucode.get(.width, cp),
.grapheme_boundary_class = graphemeBoundaryClass(cp),
.emoji_vs_text = uucode.get(.is_emoji_vs_text, cp),
.emoji_vs_emoji = uucode.get(.is_emoji_vs_emoji, cp),
.emoji_vs_base = uucode.get(.is_emoji_vs_base, cp),
};
}