simd: fix scalar utf8 decoding

2025-09-22 09:12:48 -07:00 · 2025-09-22 09:12:48 -07:00 · 530633b9e9
parent 5a29dd3ef5
commit 530633b9e9
1 changed files with 38 additions and 11 deletions
--- a/src/simd/vt.zig
+++ b/src/simd/vt.zig
@ -43,19 +43,44 @@ fn utf8DecodeUntilControlSeqScalar(
 ) DecodeResult {
    // Find our escape
    const idx = indexOf(input, 0x1B) orelse input.len;
+    const decode = input[0..idx];

-    // Copy up to the escape
-    const view = std.unicode.Utf8View.init(input[0..idx]) catch unreachable;
-    var it = view.iterator();
-    var i: usize = 0;
-    while (it.nextCodepoint()) |cp| {
-        output[i] = @intCast(cp);
-        i += 1;
+    // Go through and decode one item at a time.
+    var decode_offset: usize = 0;
+    var decode_count: usize = 0;
+    while (decode_offset < decode.len) {
+        const decode_rem = decode[decode_offset..];
+        const cp_len = std.unicode.utf8ByteSequenceLength(decode_rem[0]) catch {
+            // Note, this is matching our SIMD behavior, but it is admittedly
+            // a bit weird. See our "decode invalid leading byte" test too.
+            // SIMD should be our source of truth then we copy behavior here.
+            break;
+        };
+
+        // If we don't have that number of bytes available. we finish. We
+        // assume this is a partial input and we defer to the future.
+        if (decode_rem.len < cp_len) break;
+
+        // We have the bytes available, so move forward
+        const cp_bytes = decode_rem[0..cp_len];
+        decode_offset += cp_len;
+        if (std.unicode.utf8Decode(cp_bytes)) |cp| {
+            output[decode_count] = @intCast(cp);
+            decode_count += 1;
+        } else |_| {
+            // If decoding failed, we replace the leading byte with the
+            // replacement char and then continue decoding after that
+            // byte. This matches the SIMD behavior and is tested by the
+            // "invalid UTF-8" tests.
+            output[decode_count] = 0xFFFD;
+            decode_count += 1;
+            decode_offset -= cp_len - 1;
+        }
    }

    return .{
-        .consumed = idx,
-        .decoded = i,
+        .consumed = decode_offset,
+        .decoded = decode_count,
    };
 }

@ -139,16 +164,18 @@ test "decode invalid UTF-8" {

    var output: [64]u32 = undefined;

-    // Invalid leading 1s
+    // Invalid leading 2-byte sequence
    {
-        const str = "hello\xc2\x00";
+        const str = "hello\xc2\x01";
        try testing.expectEqual(DecodeResult{
            .consumed = 7,
            .decoded = 7,
        }, utf8DecodeUntilControlSeq(str, &output));
    }

+    // Replacement will only replace the invalid leading byte.
    try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
+    try testing.expectEqual(@as(u32, 0x01), output[6]);
 }

 // This is testing our current behavior so that we know we have to handle