pkg/highway: no libc requirement (#12402)
This uses a custom fork of Google Highway that removes all libc usage. Most of that usage was logging, which we can simply remove. For target detection, we moved the logic to an extern function implemented in Zig on top of the Zig standard library, so we can avoid libc.

# Benchmark Results

All benchmarks use 50 MB pre-generated inputs (`ghostty-gen +utf8 --seed=42`) built and run with `-Doptimize=ReleaseFast` on Apple Silicon (aarch64-macos).

## Input Descriptions

| Input | Description |
|:------|:------------|
| ascii-only | 1-byte sequences only, printable ASCII |
| 2byte-only | 2-byte sequences only (Latin/Cyrillic/etc.) |
| 3byte-only | 3-byte sequences only (CJK/BMP) |
| 4byte-only | 4-byte sequences only (emoji/supplementary planes) |
| mixed-equal | Equal weight across all 4 lengths |
| mostly-ascii | ~80% ASCII, ~20% multibyte |
| cjk-heavy | ~80% 3-byte, ~20% other |
| 10pct-invalid | Equal-weight mix with 10% malformed sequences |

## Terminal Parser (byte-by-byte DFA, no SIMD)

| Input | Mean [ms] | Min [ms] | Max [ms] | Relative |
|:------|----------:|---------:|---------:|---------:|
| ascii-only | 46.3 ± 0.8 | 45.4 | 48.1 | 1.00 |
| 2byte-only | 59.1 ± 1.2 | 57.8 | 62.7 | 1.28 ± 0.03 |
| 3byte-only | 65.4 ± 2.1 | 64.1 | 78.6 | 1.41 ± 0.05 |
| 4byte-only | 59.3 ± 1.3 | 57.2 | 63.5 | 1.28 ± 0.04 |
| mixed-equal | 180.7 ± 0.7 | 179.5 | 182.3 | 3.90 ± 0.07 |
| mostly-ascii | 59.3 ± 1.0 | 57.3 | 61.1 | 1.28 ± 0.03 |
| cjk-heavy | 142.4 ± 2.0 | 140.4 | 149.9 | 3.08 ± 0.07 |
| 10pct-invalid | 180.2 ± 1.5 | 178.4 | 184.9 | 3.89 ± 0.08 |

## Terminal Stream (SIMD UTF-8 decode + terminal handling)

| Input | Mean [ms] | Min [ms] | Max [ms] | Relative |
|:------|----------:|---------:|---------:|---------:|
| ascii-only | 377.0 ± 8.7 | 357.1 | 386.4 | 2.42 ± 0.08 |
| 2byte-only | 664.5 ± 4.0 | 656.9 | 672.6 | 4.27 ± 0.11 |
| 3byte-only | 233.5 ± 0.9 | 231.1 | 234.8 | 1.50 ± 0.04 |
| 4byte-only | 155.5 ± 4.0 | 149.6 | 161.3 | 1.00 |
| mixed-equal | 467.0 ± 3.4 | 461.8 | 473.9 | 3.00 ± 0.08 |
| mostly-ascii | 470.8 ± 7.2 | 459.6 | 482.8 | 3.03 ± 0.09 |
| cjk-heavy | 338.4 ± 2.4 | 334.3 | 341.7 | 2.18 ± 0.06 |
| 10pct-invalid | 635.1 ± 3.5 | 630.5 | 640.8 | 4.08 ± 0.11 |

## Branch Comparison: `main` vs `fixed`

### Terminal Parser

| Input | main [ms] | fixed [ms] | Δ |
|:------|----------:|-----------:|:--|
| ascii-only | 46.9 ± 0.7 | 47.3 ± 0.9 | ~same |
| 2byte-only | 59.0 ± 0.5 | 59.1 ± 1.2 | ~same |
| 3byte-only | 65.9 ± 2.1 | 65.4 ± 2.1 | ~same |
| 4byte-only | 58.8 ± 0.5 | 59.3 ± 1.3 | ~same |
| mixed-equal | 182.5 ± 0.9 | 180.7 ± 0.7 | fixed 1% faster |
| mostly-ascii | 59.0 ± 0.5 | 59.3 ± 1.0 | ~same |
| cjk-heavy | 144.1 ± 1.7 | 142.4 ± 2.0 | ~same |
| 10pct-invalid | 181.7 ± 1.0 | 180.2 ± 1.5 | ~same |

### Terminal Stream

| Input | main [ms] | fixed [ms] | Δ |
|:------|----------:|-----------:|:--|
| ascii-only | 388.4 ± 8.8 | 383.1 ± 7.6 | ~same |
| 2byte-only | 687.7 ± 4.8 | 672.9 ± 2.6 | fixed 2% faster |
| 3byte-only | 235.5 ± 1.2 | 236.3 ± 2.5 | ~same |
| 4byte-only | 166.2 ± 2.9 | 159.9 ± 3.1 | fixed 4% faster |
| mixed-equal | 481.8 ± 3.3 | 480.7 ± 6.3 | ~same |
| mostly-ascii | 483.8 ± 6.7 | 475.9 ± 4.3 | ~same |
| cjk-heavy | 341.7 ± 3.1 | 341.6 ± 2.0 | ~same |
| 10pct-invalid | 647.6 ± 3.3 | 640.4 ± 3.4 | ~same |

No regressions in either benchmark. The `fixed` branch is equal to or slightly faster than `main` across all inputs.
## Reproduction

```bash
# Generate inputs (do NOT regenerate when comparing branches)
for profile in \
  "--weight-one=1 --weight-two=0 --weight-three=0 --weight-four=0 --ascii-printable-only=true" \
  "--weight-one=0 --weight-two=1 --weight-three=0 --weight-four=0" \
  "--weight-one=0 --weight-two=0 --weight-three=1 --weight-four=0" \
  "--weight-one=0 --weight-two=0 --weight-three=0 --weight-four=1" \
  "--weight-one=1 --weight-two=1 --weight-three=1 --weight-four=1" \
  "--weight-one=10 --weight-two=1 --weight-three=1 --weight-four=0.5 --ascii-printable-only=true" \
  "--weight-one=1 --weight-two=0.5 --weight-three=10 --weight-four=0.5" \
  "--weight-one=1 --weight-two=1 --weight-three=1 --weight-four=1 --invalid-rate=0.1"; do
  ghostty-gen +utf8 --seed=42 $profile | head -c 50000000 > /tmp/ghostty-bench-data/<name>.dat
done

# Build
zig build -Demit-bench -Doptimize=ReleaseFast -Demit-macos-app=false

# Run
hyperfine --warmup 3 --min-runs 10 \
  './zig-out/bin/ghostty-bench +terminal-stream --data=<path>'
```

pull/12412/head
commit
5f43437576
|
|
@ -1,90 +0,0 @@
|
|||
#include <hwy/abort.h>
|
||||
#include <hwy/base.h>
|
||||
#include <hwy/targets.h>
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
namespace hwy {
|
||||
namespace {
|
||||
|
||||
// Highway's upstream abort.cc pulls in libc++ even when the rest of the
|
||||
// library is compiled with HWY_NO_LIBCXX. Ghostty only needs Highway's dynamic
|
||||
// dispatch/runtime target selection, so we provide the tiny Warn/Abort surface
|
||||
// that targets.cc/per_target.cc expect and keep the package free of libc++.
|
||||
WarnFunc g_warn_func = nullptr;
|
||||
AbortFunc g_abort_func = nullptr;
|
||||
|
||||
// Mirror the upstream behavior closely enough for Highway's internal callers:
|
||||
// format into a fixed buffer, fall back to a generic error if formatting fails,
|
||||
// and then dispatch to either the registered hook or stderr.
|
||||
void format_message(const char* format, va_list args, char* buffer, size_t size) {
|
||||
const int written = vsnprintf(buffer, size, format, args);
|
||||
if (written < 0) {
|
||||
snprintf(buffer, size, "%s", "failed to format highway message");
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
WarnFunc& GetWarnFunc() {
|
||||
return g_warn_func;
|
||||
}
|
||||
|
||||
AbortFunc& GetAbortFunc() {
|
||||
return g_abort_func;
|
||||
}
|
||||
|
||||
WarnFunc SetWarnFunc(WarnFunc func) {
|
||||
// Highway documents these setters as thread-safe. Using the compiler builtin
|
||||
// keeps that guarantee without depending on std::atomic.
|
||||
return __atomic_exchange_n(&g_warn_func, func, __ATOMIC_SEQ_CST);
|
||||
}
|
||||
|
||||
AbortFunc SetAbortFunc(AbortFunc func) {
|
||||
return __atomic_exchange_n(&g_abort_func, func, __ATOMIC_SEQ_CST);
|
||||
}
|
||||
|
||||
void Warn(const char* file, int line, const char* format, ...) {
|
||||
char message[1024];
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
format_message(format, args, message, sizeof(message));
|
||||
va_end(args);
|
||||
|
||||
if (WarnFunc func = g_warn_func) {
|
||||
func(file, line, message);
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s:%d: %s\n", file, line, message);
|
||||
}
|
||||
|
||||
HWY_NORETURN void Abort(const char* file, int line, const char* format, ...) {
|
||||
char message[1024];
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
format_message(format, args, message, sizeof(message));
|
||||
va_end(args);
|
||||
|
||||
if (AbortFunc func = g_abort_func) {
|
||||
func(file, line, message);
|
||||
} else {
|
||||
fprintf(stderr, "%s:%d: %s\n", file, line, message);
|
||||
}
|
||||
|
||||
abort();
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
extern "C" {
|
||||
|
||||
// Zig reads HWY_SUPPORTED_TARGETS via this C shim so it can keep its target
|
||||
// enum in sync with the vendored Highway build without parsing C++ headers.
|
||||
int64_t hwy_supported_targets() {
|
||||
return HWY_SUPPORTED_TARGETS;
|
||||
}
|
||||
}
|
||||
|
|
@ -7,7 +7,7 @@ pub fn build(b: *std.Build) !void {
|
|||
const upstream_ = b.lazyDependency("highway", .{});
|
||||
|
||||
const module = b.addModule("highway", .{
|
||||
.root_source_file = b.path("main.zig"),
|
||||
.root_source_file = b.path("src/main.zig"),
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
});
|
||||
|
|
@ -15,22 +15,23 @@ pub fn build(b: *std.Build) !void {
|
|||
const lib = b.addLibrary(.{
|
||||
.name = "highway",
|
||||
.root_module = b.createModule(.{
|
||||
.root_source_file = b.path("src/detect.zig"),
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
}),
|
||||
.linkage = .static,
|
||||
});
|
||||
|
||||
// Our highway package is free of libc at runtime (uses no symbols)
|
||||
// but does require libc headers at compile time.
|
||||
lib.linkLibC();
|
||||
|
||||
lib.addIncludePath(b.path("src/cpp"));
|
||||
if (upstream_) |upstream| {
|
||||
lib.addIncludePath(upstream.path(""));
|
||||
module.addIncludePath(upstream.path(""));
|
||||
}
|
||||
|
||||
if (target.result.os.tag.isDarwin()) {
|
||||
const apple_sdk = @import("apple_sdk");
|
||||
try apple_sdk.addPaths(b, lib);
|
||||
}
|
||||
|
||||
if (target.result.abi.isAndroid()) {
|
||||
const android_ndk = @import("android_ndk");
|
||||
try android_ndk.addPaths(b, lib);
|
||||
|
|
@ -93,19 +94,13 @@ pub fn build(b: *std.Build) !void {
|
|||
});
|
||||
}
|
||||
|
||||
lib.addCSourceFiles(.{ .flags = flags.items, .files = &.{"bridge.cpp"} });
|
||||
lib.addCSourceFiles(.{ .flags = flags.items, .files = &.{
|
||||
"src/cpp/abort.cc",
|
||||
"src/cpp/per_target.cc",
|
||||
"src/cpp/targets.cpp",
|
||||
} });
|
||||
|
||||
if (upstream_) |upstream| {
|
||||
lib.addCSourceFiles(.{
|
||||
.root = upstream.path(""),
|
||||
.flags = flags.items,
|
||||
.files = &.{
|
||||
// These provide the runtime target selection used by
|
||||
// HWY_DYNAMIC_DISPATCH. The benchmark, timer, print, and
|
||||
// aligned allocator support files are unused by Ghostty.
|
||||
"hwy/per_target.cc",
|
||||
"hwy/targets.cc",
|
||||
},
|
||||
});
|
||||
lib.installHeadersDirectory(
|
||||
upstream.path("hwy"),
|
||||
"hwy",
|
||||
|
|
@ -119,7 +114,7 @@ pub fn build(b: *std.Build) !void {
|
|||
const test_exe = b.addTest(.{
|
||||
.name = "test",
|
||||
.root_module = b.createModule(.{
|
||||
.root_source_file = b.path("main.zig"),
|
||||
.root_source_file = b.path("src/main.zig"),
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
}),
|
||||
|
|
|
|||
|
|
@ -11,7 +11,6 @@
|
|||
.lazy = true,
|
||||
},
|
||||
|
||||
.apple_sdk = .{ .path = "../apple-sdk" },
|
||||
.android_ndk = .{ .path = "../android-ndk" },
|
||||
},
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,57 +0,0 @@
|
|||
extern "c" fn hwy_supported_targets() i64;
|
||||
|
||||
/// Bit-for-bit view of the 64-bit target mask returned by
/// `hwy_supported_targets()`. Fields are laid out from bit 0 upwards; the bit
/// positions noted below follow from the field widths in this struct.
///
/// Note that some `_reserved_*` names do not match the bits they actually
/// occupy (e.g. `_reserved_30_36` is 6 bits wide and covers bits 30..35).
pub const Targets = packed struct(i64) {
    // x86_64 (bits 0..14)
    _reserved: u4 = 0, // bits 0..3
    avx3_spr: bool = false, // bit 4
    _reserved_5: u1 = 0, // bit 5
    avx3_zen4: bool = false, // bit 6
    avx3_dl: bool = false, // bit 7
    avx3: bool = false, // bit 8
    avx2: bool = false, // bit 9
    _reserved_10: u1 = 0, // bit 10
    sse4: bool = false, // bit 11
    ssse3: bool = false, // bit 12
    _reserved_13: u1 = 0, // SSE3 reserved (bit 13)
    sse2: bool = false, // bit 14
    _reserved_15_23: u9 = 0, // bits 15..23

    // aarch64 (bits 24..29)
    sve2_128: bool = false, // bit 24
    sve_256: bool = false, // bit 25
    sve2: bool = false, // bit 26
    sve: bool = false, // bit 27
    neon: bool = false, // bit 28
    neon_without_aes: bool = false, // bit 29
    _reserved_30_36: u6 = 0, // bits 30..35

    // risc-v
    rvv: bool = false, // bit 36
    _reserved_38_46: u9 = 0, // bits 37..45

    // IBM Power
    ppc10: bool = false, // bit 46
    ppc9: bool = false, // bit 47
    ppc8: bool = false, // bit 48
    z15: bool = false, // bit 49
    z14: bool = false, // bit 50
    _reserved_52_57: u6 = 0, // bits 51..56

    // WebAssembly
    wasm_emu256: bool = false, // bit 57
    wasm: bool = false, // bit 58
    _reserved_60_61: u2 = 0, // bits 59..60

    // Emulation
    emu128: bool = false, // bit 61
    scalar: bool = false, // bit 62
    _reserved_63: u1 = 0, // bit 63
};
|
||||
|
||||
/// Queries the C shim for the supported-target mask and reinterprets the raw
/// 64-bit value as a `Targets` bit set.
pub fn supported_targets() Targets {
    const raw_mask: i64 = hwy_supported_targets();
    return @bitCast(raw_mask);
}
|
||||
|
||||
test {
    // Smoke test: only checks that the call links and returns.
    const detected = supported_targets();
    _ = detected;
}
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
// Copyright 2019 Google LLC
|
||||
// Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
// Vendored from google/highway hwy/abort.cc at commit:
|
||||
// 66486a10623fa0d72fe91260f96c892e41aceb06
|
||||
//
|
||||
// Local modifications:
|
||||
// - Removed stdio/stdlib/string/sanitizer-backed formatting and logging paths
|
||||
// so this file no longer pulls in libc/libc++ symbols.
|
||||
// - Replaced std::atomic storage with compiler atomics on plain function
|
||||
// pointers to preserve thread-safe handler installation without libc++.
|
||||
// - Kept only the Warn/Abort symbol surface Highway's runtime dispatch needs,
|
||||
// with a trap-only fallback when no abort handler is installed.
|
||||
//
|
||||
// Why:
|
||||
// - Ghostty only needs Highway's runtime dispatch support here, not its
|
||||
// formatted stderr diagnostics.
|
||||
// - Keeping this translation unit libc/libc++ free lets pkg/highway build as a
|
||||
// small vendored shim around Zig-driven target detection.
|
||||
|
||||
#include "hwy/abort.h"
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
namespace hwy {
|
||||
|
||||
namespace {
|
||||
|
||||
WarnFunc g_warn_func = nullptr;
|
||||
AbortFunc g_abort_func = nullptr;
|
||||
|
||||
} // namespace
|
||||
|
||||
HWY_DLLEXPORT WarnFunc& GetWarnFunc() {
|
||||
return g_warn_func;
|
||||
}
|
||||
|
||||
HWY_DLLEXPORT AbortFunc& GetAbortFunc() {
|
||||
return g_abort_func;
|
||||
}
|
||||
|
||||
HWY_DLLEXPORT WarnFunc SetWarnFunc(WarnFunc func) {
|
||||
return __atomic_exchange_n(&g_warn_func, func, __ATOMIC_SEQ_CST);
|
||||
}
|
||||
|
||||
HWY_DLLEXPORT AbortFunc SetAbortFunc(AbortFunc func) {
|
||||
return __atomic_exchange_n(&g_abort_func, func, __ATOMIC_SEQ_CST);
|
||||
}
|
||||
|
||||
HWY_DLLEXPORT void HWY_FORMAT(3, 4)
|
||||
Warn(const char* file, int line, const char* format, ...) {
|
||||
WarnFunc handler = __atomic_load_n(&g_warn_func, __ATOMIC_SEQ_CST);
|
||||
if (handler != nullptr) {
|
||||
handler(file, line, format);
|
||||
}
|
||||
}
|
||||
|
||||
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
|
||||
Abort(const char* file, int line, const char* format, ...) {
|
||||
AbortFunc handler = __atomic_load_n(&g_abort_func, __ATOMIC_SEQ_CST);
|
||||
if (handler != nullptr) {
|
||||
handler(file, line, format);
|
||||
}
|
||||
|
||||
__builtin_trap();
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
|
|
@ -0,0 +1,91 @@
|
|||
// Copyright 2022 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Vendored from google/highway hwy/per_target.cc at commit:
|
||||
// 66486a10623fa0d72fe91260f96c892e41aceb06
|
||||
//
|
||||
// Local modifications:
|
||||
// - Changed HWY_TARGET_INCLUDE from the upstream path to the local vendored
|
||||
// filename so Highway's multi-pass include machinery resolves this copy.
|
||||
// - Left the implementation otherwise identical to upstream.
|
||||
//
|
||||
// Why:
|
||||
// - Ghostty vendors only the specific Highway .cc files it needs in this
|
||||
// directory, so the original source-relative include path no longer exists.
|
||||
// - Keeping the logic unchanged aside from the include path reduces fork
|
||||
// maintenance cost while still allowing a minimal vendored source set.
|
||||
|
||||
// Enable all targets so that calling Have* does not call into a null pointer.
|
||||
#ifndef HWY_COMPILE_ALL_ATTAINABLE
|
||||
#define HWY_COMPILE_ALL_ATTAINABLE
|
||||
#endif
|
||||
#include "hwy/per_target.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "per_target.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
#include "hwy/highway.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace {
|
||||
int64_t GetTarget() { return HWY_TARGET; }
|
||||
size_t GetVectorBytes() { return Lanes(ScalableTag<uint8_t>()); }
|
||||
bool GetHaveInteger64() { return HWY_HAVE_INTEGER64 != 0; }
|
||||
bool GetHaveFloat16() { return HWY_HAVE_FLOAT16 != 0; }
|
||||
bool GetHaveFloat64() { return HWY_HAVE_FLOAT64 != 0; }
|
||||
} // namespace
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(GetTarget);
|
||||
HWY_EXPORT(GetVectorBytes);
|
||||
HWY_EXPORT(GetHaveInteger64);
|
||||
HWY_EXPORT(GetHaveFloat16);
|
||||
HWY_EXPORT(GetHaveFloat64);
|
||||
} // namespace
|
||||
|
||||
HWY_DLLEXPORT int64_t DispatchedTarget() {
|
||||
return HWY_DYNAMIC_DISPATCH(GetTarget)();
|
||||
}
|
||||
|
||||
HWY_DLLEXPORT size_t VectorBytes() {
|
||||
return HWY_DYNAMIC_DISPATCH(GetVectorBytes)();
|
||||
}
|
||||
|
||||
HWY_DLLEXPORT bool HaveInteger64() {
|
||||
return HWY_DYNAMIC_DISPATCH(GetHaveInteger64)();
|
||||
}
|
||||
|
||||
HWY_DLLEXPORT bool HaveFloat16() {
|
||||
return HWY_DYNAMIC_DISPATCH(GetHaveFloat16)();
|
||||
}
|
||||
|
||||
HWY_DLLEXPORT bool HaveFloat64() {
|
||||
return HWY_DYNAMIC_DISPATCH(GetHaveFloat64)();
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
// Vendored from google/highway hwy/targets.cc at commit:
|
||||
// 66486a10623fa0d72fe91260f96c892e41aceb06
|
||||
//
|
||||
// Local modifications:
|
||||
// - Dropped upstream CPU feature probing and platform-specific detection code
|
||||
// in favor of Ghostty's Zig-provided ghostty_hwy_detect_targets().
|
||||
// - Removed the HWY_WARN baseline-mismatch diagnostic path so this file does
|
||||
// not depend on libc-backed formatting/logging.
|
||||
// - Kept only the chosen-target bookkeeping and runtime dispatch state that
|
||||
// Highway's HWY_DYNAMIC_DISPATCH machinery needs.
|
||||
// - Added hwy_supported_targets() as a small C shim for Zig to query the final
|
||||
// supported target mask.
|
||||
//
|
||||
// Why:
|
||||
// - Ghostty wants a minimal vendored Highway runtime that avoids direct libc
|
||||
// usage and lets Zig own target detection policy.
|
||||
// - Narrowing this file to dispatch state makes the local fork easier to audit
|
||||
// and maintain than carrying upstream's full platform detection surface.
|
||||
|
||||
#include "hwy/targets.h"
|
||||
|
||||
namespace hwy {
|
||||
|
||||
extern "C" int64_t ghostty_hwy_detect_targets();
|
||||
|
||||
// Vendored from Highway's hwy/targets.cc. Ghostty provides target detection in
|
||||
// Zig, so this TU only retains the runtime dispatch/chosen-target state.
|
||||
static int64_t DetectTargets() {
|
||||
int64_t bits = HWY_SCALAR | HWY_EMU128;
|
||||
|
||||
#if (HWY_ARCH_X86 || HWY_ARCH_ARM || HWY_ARCH_PPC || HWY_ARCH_S390X || \
|
||||
HWY_ARCH_RISCV || HWY_ARCH_LOONGARCH) && \
|
||||
HWY_HAVE_RUNTIME_DISPATCH
|
||||
bits |= ghostty_hwy_detect_targets();
|
||||
#else
|
||||
bits |= HWY_ENABLED_BASELINE;
|
||||
#endif
|
||||
|
||||
return bits;
|
||||
}
|
||||
|
||||
// When running tests, this value can be set to the mocked supported targets
|
||||
// mask. Only written to from a single thread before the test starts.
|
||||
static int64_t supported_targets_for_test_ = 0;
|
||||
|
||||
// Mask of targets disabled at runtime with DisableTargets.
|
||||
static int64_t supported_mask_ = LimitsMax<int64_t>();
|
||||
|
||||
HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) {
|
||||
supported_mask_ = static_cast<int64_t>(~disabled_targets);
|
||||
GetChosenTarget().DeInit();
|
||||
}
|
||||
|
||||
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) {
|
||||
supported_targets_for_test_ = targets;
|
||||
GetChosenTarget().DeInit();
|
||||
}
|
||||
|
||||
HWY_DLLEXPORT int64_t SupportedTargets() {
|
||||
int64_t targets = supported_targets_for_test_;
|
||||
if (HWY_LIKELY(targets == 0)) {
|
||||
targets = DetectTargets();
|
||||
GetChosenTarget().Update(targets);
|
||||
}
|
||||
|
||||
targets &= supported_mask_;
|
||||
return targets == 0 ? HWY_STATIC_TARGET : targets;
|
||||
}
|
||||
|
||||
HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
|
||||
static ChosenTarget chosen_target;
|
||||
return chosen_target;
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
extern "C" int64_t hwy_supported_targets() {
|
||||
return hwy::SupportedTargets();
|
||||
}
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
const builtin = @import("builtin");
|
||||
const HwyTargets = @import("targets.zig").Targets;
|
||||
|
||||
const x86 = @import("detect/x86.zig");
|
||||
const aarch64_darwin = @import("detect/aarch64_darwin.zig");
|
||||
const aarch64_linux = @import("detect/aarch64_linux.zig");
|
||||
const ppc = @import("detect/ppc.zig");
|
||||
const s390x = @import("detect/s390x.zig");
|
||||
const riscv = @import("detect/riscv.zig");
|
||||
const loongarch = @import("detect/loongarch.zig");
|
||||
|
||||
/// Runtime Highway target detection via minimal, direct CPU feature probing.
///
/// Earlier versions went through std.zig.system.resolveTargetQuery, which
/// pulls in Zig's full target/CPU model tables for every architecture. That
/// bloated the binary by ~300 KB and caused code-layout regressions in
/// unrelated hot paths (icache / branch-predictor pressure).
///
/// This version relies only on inline assembly (CPUID on x86, MRS on AArch64)
/// and lightweight syscalls (sysctlbyname on Darwin, getauxval on Linux), so
/// it adds no data tables and no std.Target dependency.
///
/// The architecture is selected at compile time; architectures without a
/// detector yield an empty mask (0).
pub export fn ghostty_hwy_detect_targets() callconv(.c) i64 {
    const mask: i64 = switch (builtin.cpu.arch) {
        .x86_64, .x86 => x86.detect(),
        .aarch64, .aarch64_be => detectAarch64(),
        .powerpc, .powerpc64, .powerpc64le => ppc.detect(),
        .s390x => s390x.detect(),
        .riscv32, .riscv64 => riscv.detect(),
        .loongarch32, .loongarch64 => loongarch.detect(),
        else => 0,
    };
    return mask;
}
|
||||
|
||||
/// AArch64 detection: starts from the NEON-without-AES baseline and lets the
/// OS-specific probe (Darwin or Linux) add further targets. Any other OS gets
/// the baseline only.
fn detectAarch64() i64 {
    var targets: HwyTargets = .{};

    // All AArch64 implementations have NEON.
    targets.neon_without_aes = true;

    if (comptime builtin.os.tag.isDarwin()) return aarch64_darwin.detect(&targets);
    if (comptime builtin.os.tag == .linux) return aarch64_linux.detect(&targets);

    // Other OS: return baseline NEON.
    return @bitCast(targets);
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
const HwyTargets = @import("../targets.zig").Targets;
|
||||
|
||||
/// Darwin/AArch64 probe. Marks `neon` unconditionally, adds `neon_bf16` when
/// the `hw.optional.arm.FEAT_BF16` sysctl reports support, and returns the
/// updated target set as a raw mask. `t` is modified in place.
pub fn detect(t: *HwyTargets) i64 {
    // All Apple Silicon has AES.
    t.neon = true;

    // Every Apple chip from A11 (2017) onward has FP16 + DotProd, so only
    // BF16 (which arrived with M2 / A15, ARM_BLIZZARD_AVALANCHE, 2022) needs
    // an explicit probe.
    if (darwinSysctlBool("hw.optional.arm.FEAT_BF16")) {
        t.neon_bf16 = true;
    }

    // Apple Silicon does not support SVE, so no SVE bits are set here.
    return @bitCast(t.*);
}
|
||||
|
||||
/// Reads the integer sysctl `name` and reports whether it is set. Returns
/// false both when the value is zero and when the sysctlbyname call fails.
fn darwinSysctlBool(comptime name: [:0]const u8) bool {
    var flag: c_int = 0;
    var flag_len: usize = @sizeOf(c_int);

    const status = sysctlbyname(name.ptr, &flag, &flag_len, null, 0);
    if (status != 0) return false;
    return flag != 0;
}
|
||||
|
||||
// We can rely on libc for macOS because libsystem is always available.
|
||||
extern "c" fn sysctlbyname(
|
||||
name: [*:0]const u8,
|
||||
oldp: ?*anyopaque,
|
||||
oldlenp: ?*usize,
|
||||
newp: ?*const anyopaque,
|
||||
newlen: usize,
|
||||
) c_int;
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
const HwyTargets = @import("../targets.zig").Targets;
|
||||
const linux = @import("linux.zig");
|
||||
|
||||
/// Linux/AArch64 probe based on the AT_HWCAP / AT_HWCAP2 auxiliary-vector
/// entries. Sets the NEON and SVE variants on `t` in place and returns the
/// updated target set as a raw mask.
pub fn detect(t: *HwyTargets) i64 {
    // Linux exposes AArch64 features via getauxval(AT_HWCAP / AT_HWCAP2).
    const AT_HWCAP: usize = 16;
    const AT_HWCAP2: usize = 26;

    // Bit positions from Linux UAPI asm/hwcap.h
    const HWCAP_AES: usize = 1 << 3;
    const HWCAP_FPHP: usize = 1 << 9; // FEAT_FP16
    const HWCAP_ASIMDDP: usize = 1 << 20; // DotProd
    const HWCAP_SVE: usize = 1 << 22;

    const HWCAP2_BF16: usize = 1 << 14;
    const HWCAP2_SVE2: usize = 1 << 1;
    const HWCAP2_SVEAES: usize = 1 << 2;

    const caps = linux.getauxval(AT_HWCAP);
    const caps2 = linux.getauxval(AT_HWCAP2);

    const has_aes = (caps & HWCAP_AES) != 0;
    const has_fp16 = (caps & HWCAP_FPHP) != 0;
    const has_dotprod = (caps & HWCAP_ASIMDDP) != 0;
    const has_sve = (caps & HWCAP_SVE) != 0;
    const has_bf16 = (caps2 & HWCAP2_BF16) != 0;
    const has_sve2 = (caps2 & HWCAP2_SVE2) != 0;
    const has_sve_aes = (caps2 & HWCAP2_SVEAES) != 0;

    // NEON (with AES); the BF16 variant additionally needs FP16 + DotProd.
    if (has_aes) {
        t.neon = true;
        if (has_fp16 and has_dotprod and has_bf16) t.neon_bf16 = true;
    }

    // The vector length is only queried when SVE is present.
    if (has_sve) {
        const vl_bytes = sveVectorBytes();
        const is_wide = vl_bytes >= 32;

        // Plain SVE is only enabled for vectors of at least 256 bits.
        if (is_wide) {
            t.sve = true;
            if (vl_bytes == 32) t.sve_256 = true;
        }

        // SVE2 needs the SVE AES extension too; 128-bit vectors select the
        // specialized sve2_128 target instead of the generic one.
        if (has_sve2 and has_sve_aes) {
            if (is_wide) {
                t.sve2 = true;
            } else if (vl_bytes == 16) {
                t.sve2_128 = true;
            }
        }
    }

    return @bitCast(t.*);
}
|
||||
|
||||
/// Returns the SVE vector length in bytes as reported by
/// prctl(PR_SVE_GET_VL), or 16 when the call fails.
fn sveVectorBytes() usize {
    const PR_SVE_GET_VL: i32 = 51;

    const raw = linux.prctl(PR_SVE_GET_VL, 0, 0, 0, 0);
    const as_signed: isize = @bitCast(raw);

    // A negative result means prctl failed: assume 128-bit (NEON-width,
    // conservative).
    if (as_signed < 0) return 16;

    // PR_SVE_GET_VL returns the SVE vector length in the lower 16 bits.
    return raw & 0xFFFF;
}
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
/// Looks up `key` in the ELF auxiliary vector (set by the kernel at process
/// start) through std.os.linux. Does not call into libc.
pub inline fn getauxval(key: usize) usize {
    const os_linux = @import("std").os.linux;
    return os_linux.getauxval(key);
}
|
||||
|
||||
/// Direct syscall wrapper for prctl(2), forwarding all arguments unchanged to
/// std.os.linux.prctl and returning its raw result.
pub inline fn prctl(option: i32, a2: usize, a3: usize, a4: usize, a5: usize) usize {
    const os_linux = @import("std").os.linux;
    return os_linux.prctl(option, a2, a3, a4, a5);
}
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
const builtin = @import("builtin");
|
||||
const HwyTargets = @import("../targets.zig").Targets;
|
||||
const linux = @import("linux.zig");
|
||||
|
||||
/// LoongArch probe. On Linux, reads AT_HWCAP and enables `lsx`, plus `lasx`
/// when both LSX and LASX are reported. On any other OS the mask is empty.
pub fn detect() i64 {
    var targets: HwyTargets = .{};

    if (comptime builtin.os.tag != .linux) return @bitCast(targets);

    const AT_HWCAP: usize = 16;

    // From Linux arch/loongarch/include/uapi/asm/hwcap.h
    const HWCAP_LSX: usize = 1 << 4;
    const HWCAP_LASX: usize = 1 << 5;

    const caps = linux.getauxval(AT_HWCAP);
    const has_lsx = (caps & HWCAP_LSX) != 0;
    const has_lasx = (caps & HWCAP_LASX) != 0;

    // LASX is only considered when LSX is available.
    if (!has_lsx) return @bitCast(targets);
    targets.lsx = true;
    if (has_lasx) targets.lasx = true;

    return @bitCast(targets);
}
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
const builtin = @import("builtin");
|
||||
const HwyTargets = @import("../targets.zig").Targets;
|
||||
const linux = @import("linux.zig");
|
||||
|
||||
/// PowerPC probe. On Linux, reads AT_HWCAP / AT_HWCAP2 and enables the
/// ppc8 / ppc9 / ppc10 targets cumulatively: each level is only considered
/// when the previous one qualified. On any other OS the mask is empty.
pub fn detect() i64 {
    var targets: HwyTargets = .{};

    if (comptime builtin.os.tag != .linux) return @bitCast(targets);

    const AT_HWCAP: usize = 16;
    const AT_HWCAP2: usize = 26;

    // From Linux arch/powerpc/include/uapi/asm/cputable.h
    const PPC_FEATURE_HAS_ALTIVEC: usize = 0x10000000;
    const PPC_FEATURE_HAS_VSX: usize = 0x00000080;
    const PPC_FEATURE2_ARCH_2_07: usize = 0x80000000; // POWER8
    const PPC_FEATURE2_VEC_CRYPTO: usize = 0x02000000;
    const PPC_FEATURE2_ARCH_3_00: usize = 0x00800000; // POWER9
    const PPC_FEATURE2_ARCH_3_1: usize = 0x00040000; // POWER10
    const PPC_FEATURE2_MMA: usize = 0x00020000;

    const caps = linux.getauxval(AT_HWCAP);
    const caps2 = linux.getauxval(AT_HWCAP2);

    const has_altivec = (caps & PPC_FEATURE_HAS_ALTIVEC) != 0;
    const has_vsx = (caps & PPC_FEATURE_HAS_VSX) != 0;
    const has_arch_2_07 = (caps2 & PPC_FEATURE2_ARCH_2_07) != 0;
    const has_vec_crypto = (caps2 & PPC_FEATURE2_VEC_CRYPTO) != 0;
    const has_arch_3_00 = (caps2 & PPC_FEATURE2_ARCH_3_00) != 0;
    const has_arch_3_1 = (caps2 & PPC_FEATURE2_ARCH_3_1) != 0;
    const has_mma = (caps2 & PPC_FEATURE2_MMA) != 0;

    // POWER8 level: AltiVec + VSX + ISA 2.07 + vector crypto.
    const is_ppc8 = has_altivec and has_vsx and has_arch_2_07 and has_vec_crypto;
    if (!is_ppc8) return @bitCast(targets);
    targets.ppc8 = true;

    // POWER9 level: additionally ISA 3.00.
    if (!has_arch_3_00) return @bitCast(targets);
    targets.ppc9 = true;

    // POWER10 level: additionally ISA 3.1 + MMA.
    if (has_arch_3_1 and has_mma) targets.ppc10 = true;

    return @bitCast(targets);
}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
const builtin = @import("builtin");
|
||||
const HwyTargets = @import("../targets.zig").Targets;
|
||||
const linux = @import("linux.zig");
|
||||
|
||||
/// RISC-V probe. On Linux, enables `rvv` when the 'V' bit is set in AT_HWCAP.
/// On any other OS the mask is empty.
pub fn detect() i64 {
    var targets: HwyTargets = .{};

    if (comptime builtin.os.tag != .linux) return @bitCast(targets);

    const AT_HWCAP: usize = 16;

    // ISA extension bit for 'V' (vector).
    // Letter-based bits: bit position = letter - 'A'.
    const HWCAP_V: usize = 1 << ('V' - 'A');

    const caps = linux.getauxval(AT_HWCAP);
    const has_vector = (caps & HWCAP_V) != 0;
    if (has_vector) targets.rvv = true;

    return @bitCast(targets);
}
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
const builtin = @import("builtin");
|
||||
const HwyTargets = @import("../targets.zig").Targets;
|
||||
const linux = @import("linux.zig");
|
||||
|
||||
/// s390x probe. On Linux, reads AT_HWCAP and enables `z14` when both VX and
/// VXE are reported, plus `z15` when VXE2 is reported as well. On any other
/// OS the mask is empty.
pub fn detect() i64 {
    var targets: HwyTargets = .{};

    if (comptime builtin.os.tag != .linux) return @bitCast(targets);

    const AT_HWCAP: usize = 16;

    // From Linux arch/s390/include/asm/elf.h
    const HWCAP_VX: usize = 1 << 11;
    const HWCAP_VXE: usize = 1 << 13; // z14
    const HWCAP_VXE2: usize = 1 << 15; // z15

    const caps = linux.getauxval(AT_HWCAP);
    const has_vx = (caps & HWCAP_VX) != 0;
    const has_vxe = (caps & HWCAP_VXE) != 0;
    const has_vxe2 = (caps & HWCAP_VXE2) != 0;

    // z15 is only considered once the z14 requirements are met.
    if (has_vx and has_vxe) {
        targets.z14 = true;
        if (has_vxe2) targets.z15 = true;
    }

    return @bitCast(targets);
}
|
||||
|
|
@ -0,0 +1,166 @@
|
|||
const builtin = @import("builtin");
|
||||
const HwyTargets = @import("../targets.zig").Targets;
|
||||
|
||||
/// The four registers written by a single CPUID invocation.
const CpuidResult = struct {
    eax: u32,
    ebx: u32,
    ecx: u32,
    edx: u32,
};
|
||||
|
||||
/// Executes the CPUID instruction with `leaf` in EAX and `subleaf` in ECX and
/// returns the resulting EAX/EBX/ECX/EDX values.
fn cpuid(leaf: u32, subleaf: u32) CpuidResult {
    var a: u32 = undefined;
    var b: u32 = undefined;
    var c: u32 = undefined;
    var d: u32 = undefined;

    asm volatile ("cpuid"
        : [_] "={eax}" (a),
          [_] "={ebx}" (b),
          [_] "={ecx}" (c),
          [_] "={edx}" (d),
        : [_] "{eax}" (leaf),
          [_] "{ecx}" (subleaf),
    );

    const result: CpuidResult = .{
        .eax = a,
        .ebx = b,
        .ecx = c,
        .edx = d,
    };
    return result;
}
|
||||
|
||||
/// Reports whether bit `pos` (0 = least significant) of `val` is set.
inline fn bit(val: u32, comptime pos: u5) bool {
    const mask: u32 = @as(u32, 1) << pos;
    return (val & mask) != 0;
}
|
||||
|
||||
/// Detects which Highway x86 targets the current CPU and OS can run.
///
/// Queries CPUID (and XGETBV when CPUID reports OS XSAVE support and AVX)
/// and sets the matching flags in a `HwyTargets` value. The result is
/// returned bit-cast to the `i64` representation of that bitfield.
///
/// The 512-bit targets (`avx3`, `avx3_dl`, `avx3_zen4`, `avx3_spr`),
/// including those implied by AVX10.1-512, are only reported when AVX2 was
/// detected and the OS preserves the opmask/ZMM register state. Each AVX-512
/// tier additionally requires the tier below it, mirroring Highway's
/// cumulative feature groups.
pub fn detect() i64 {
    var t: HwyTargets = .{};

    // x86_64 always has SSE2.
    if (comptime builtin.cpu.arch == .x86_64) {
        t.sse2 = true;
    }

    const leaf0 = cpuid(0, 0);
    const max_leaf = leaf0.eax;
    if (max_leaf < 1) return @bitCast(t);

    const leaf1 = cpuid(1, 0);

    // -- SSE2 on 32-bit x86 -------------------------------------------------
    if (comptime builtin.cpu.arch == .x86) {
        if (bit(leaf1.edx, 25) and // SSE
            bit(leaf1.edx, 26)) // SSE2
        {
            t.sse2 = true;
        }
    }

    // -- SSSE3 ---------------------------------------------------------------
    if (bit(leaf1.ecx, 0) and // SSE3
        bit(leaf1.ecx, 9)) // SSSE3
    {
        t.ssse3 = true;
    }

    // -- SSE4 ----------------------------------------------------------------
    if (bit(leaf1.ecx, 19) and // SSE4.1
        bit(leaf1.ecx, 20) and // SSE4.2
        bit(leaf1.ecx, 1) and // PCLMUL
        bit(leaf1.ecx, 25)) // AES
    {
        t.sse4 = true;
    }

    // Check XSAVE / AVX OS support before enabling any AVX-dependent target.
    // XGETBV is only executed when CPUID reports both bits; otherwise XCR0 is
    // treated as 0, which disables every AVX-dependent target below.
    const has_xsave = bit(leaf1.ecx, 27); // OSXSAVE
    const has_avx_bit = bit(leaf1.ecx, 28); // AVX
    const xcr0: u32 = if (has_xsave and has_avx_bit) asm volatile ("xgetbv"
        : [_] "={eax}" (-> u32),
        : [_] "{ecx}" (@as(u32, 0)),
        : .{ .edx = true }) else 0;
    const has_avx_save = (xcr0 & 0x6) == 0x6; // SSE + AVX state

    // Darwin lazily saves AVX-512 context on first use.
    const has_avx512_save = if (comptime builtin.os.tag.isDarwin())
        true
    else
        (xcr0 & 0xE0) == 0xE0; // opmask + zmm_hi256 + hi16_zmm

    if (has_avx_save and max_leaf >= 7) {
        const leaf7 = cpuid(7, 0);

        // -- AVX2 ------------------------------------------------------------
        if (bit(leaf7.ebx, 5) and // AVX2
            bit(leaf1.ecx, 12) and // FMA
            bit(leaf1.ecx, 29)) // F16C
        {
            // Also need LZCNT (extended leaf), BMI, BMI2.
            const leaf_ext = cpuid(0x80000001, 0);
            if (bit(leaf_ext.ecx, 5) and // LZCNT
                bit(leaf7.ebx, 3) and // BMI
                bit(leaf7.ebx, 8)) // BMI2
            {
                t.avx2 = true;
            }
        }

        // -- AVX-512 / AVX10 -------------------------------------------------
        // Everything in here uses 512-bit state, so it is gated on both the
        // AVX2 baseline and the OS saving the opmask/ZMM registers.
        if (t.avx2 and has_avx512_save) {
            if (bit(leaf7.ebx, 16) and // AVX512F
                bit(leaf7.ebx, 31) and // AVX512VL
                bit(leaf7.ebx, 17) and // AVX512DQ
                bit(leaf7.ebx, 30) and // AVX512BW
                bit(leaf7.ebx, 28)) // AVX512CD
            {
                t.avx3 = true;
            }

            if (t.avx3 and
                bit(leaf7.ecx, 11) and // AVX512VNNI
                bit(leaf7.ecx, 10) and // VPCLMULQDQ (AVX save ok)
                bit(leaf7.ecx, 1) and // AVX512VBMI
                bit(leaf7.ecx, 6) and // AVX512VBMI2
                bit(leaf7.ecx, 9) and // VAES (AVX save ok)
                bit(leaf7.ecx, 14) and // AVX512VPOPCNTDQ
                bit(leaf7.ecx, 12) and // AVX512BITALG
                bit(leaf7.ecx, 8)) // GFNI
            {
                t.avx3_dl = true;
            }

            // Leaf 7 sub-leaf 1 exists only when leaf 7 reports at least one
            // sub-leaf in EAX. Without it BF16 and AVX10 cannot be checked,
            // so avx3_zen4 / avx3_spr stay off.
            if (leaf7.eax >= 1) {
                const leaf7_1 = cpuid(7, 1);

                // AVX512BF16 is in leaf 7 sub-1.
                if (t.avx3_dl and bit(leaf7_1.eax, 5)) {
                    if (isAMD()) {
                        t.avx3_zen4 = true;
                    }
                    if (bit(leaf7.edx, 23)) { // AVX512FP16
                        t.avx3_spr = true;
                    }
                }

                // -- AVX10 ---------------------------------------------------
                if (bit(leaf7_1.edx, 19) and max_leaf >= 0x24) { // AVX10.1-256
                    const leaf24 = cpuid(0x24, 0);
                    if (bit(leaf24.ebx, 18)) { // AVX10.1-512
                        t.avx3_spr = true;
                        t.avx3_dl = true;
                        t.avx3 = true;
                    }
                }

                // AVX10.2 detection would require a leaf we can't
                // reliably check yet; leave for future.
            }
        }
    }

    return @bitCast(t);
}
|
||||
|
||||
/// Returns true when CPUID leaf 0 reports the vendor string "AuthenticAMD".
fn isAMD() bool {
    const vendor = cpuid(0, 0);

    // The 12-byte vendor string is spread over EBX, EDX, ECX (in that order).
    if (vendor.ebx != 0x68747541) return false; // "Auth"
    if (vendor.edx != 0x69746e65) return false; // "enti"
    return vendor.ecx == 0x444d4163; // "cAMD"
}
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
extern "c" fn hwy_supported_targets() i64;
|
||||
|
||||
pub const Targets = @import("targets.zig").Targets;
|
||||
|
||||
/// Returns the value reported by the external `hwy_supported_targets`
/// function, reinterpreted as a typed `Targets` bitfield.
pub fn supported_targets() Targets {
    const raw: i64 = hwy_supported_targets();
    return @bitCast(raw);
}
|
||||
|
||||
test {
    // Reference the detection module so it is compiled with this test.
    _ = @import("runtime_detect.zig");

    // Smoke test: the call must link and return; the result is not inspected.
    _ = supported_targets();
}
|
||||
|
|
@ -0,0 +1,109 @@
|
|||
const assert = @import("std").debug.assert;
|
||||
|
||||
/// 64-bit set of Highway target flags, one `bool` per target, with explicit
/// reserved padding so each flag sits at a fixed bit position. The comptime
/// block at the bottom pins every flag to its expected bit index.
pub const Targets = packed struct(i64) {
    // ---- x86 (bits 0..17) ----
    _reserved_0_2: u3 = 0,
    avx10_2_512: bool = false,
    avx3_spr: bool = false,
    avx10_2: bool = false,
    avx3_zen4: bool = false,
    avx3_dl: bool = false,
    avx3: bool = false,
    avx2: bool = false,
    _reserved_10: u1 = 0,
    sse4: bool = false,
    ssse3: bool = false,
    _reserved_13: u1 = 0,
    sse2: bool = false,
    _reserved_15_17: u3 = 0,

    // ---- aarch64 (bits 18..36) ----
    sve2_128: bool = false,
    sve_256: bool = false,
    _reserved_20_22: u3 = 0,
    sve2: bool = false,
    sve: bool = false,
    _reserved_25: u1 = 0,
    neon_bf16: bool = false,
    _reserved_27: u1 = 0,
    neon: bool = false,
    neon_without_aes: bool = false,
    _reserved_30_36: u7 = 0,

    // ---- RISC-V (bits 37..39) ----
    rvv: bool = false,
    _reserved_38_39: u2 = 0,

    // ---- LoongArch (bits 40..46) ----
    lasx: bool = false,
    lsx: bool = false,
    _reserved_42_46: u5 = 0,

    // ---- IBM Power and IBM Z (bits 47..57) ----
    ppc10: bool = false,
    ppc9: bool = false,
    ppc8: bool = false,
    z15: bool = false,
    z14: bool = false,
    _reserved_52_57: u6 = 0,

    // ---- WebAssembly (bits 58..60) ----
    wasm_emu256: bool = false,
    wasm: bool = false,
    _reserved_60: u1 = 0,

    // ---- Emulation (bits 61..63) ----
    emu128: bool = false,
    scalar: bool = false,
    _reserved_63: u1 = 0,

    /// Bit index of the named field within the packed struct.
    fn bitPos(comptime field_name: []const u8) comptime_int {
        return @bitOffsetOf(Targets, field_name);
    }

    // Verify at comptime that each flag field matches its Highway bit constant.
    comptime {
        const Expected = struct { name: []const u8, pos: u16 };
        const layout = [_]Expected{
            // x86
            .{ .name = "avx10_2_512", .pos = 3 },
            .{ .name = "avx3_spr", .pos = 4 },
            .{ .name = "avx10_2", .pos = 5 },
            .{ .name = "avx3_zen4", .pos = 6 },
            .{ .name = "avx3_dl", .pos = 7 },
            .{ .name = "avx3", .pos = 8 },
            .{ .name = "avx2", .pos = 9 },
            .{ .name = "sse4", .pos = 11 },
            .{ .name = "ssse3", .pos = 12 },
            .{ .name = "sse2", .pos = 14 },

            // aarch64
            .{ .name = "sve2_128", .pos = 18 },
            .{ .name = "sve_256", .pos = 19 },
            .{ .name = "sve2", .pos = 23 },
            .{ .name = "sve", .pos = 24 },
            .{ .name = "neon_bf16", .pos = 26 },
            .{ .name = "neon", .pos = 28 },
            .{ .name = "neon_without_aes", .pos = 29 },

            // risc-v
            .{ .name = "rvv", .pos = 37 },

            // LoongArch
            .{ .name = "lasx", .pos = 40 },
            .{ .name = "lsx", .pos = 41 },

            // IBM Power / IBM Z
            .{ .name = "ppc10", .pos = 47 },
            .{ .name = "ppc9", .pos = 48 },
            .{ .name = "ppc8", .pos = 49 },
            .{ .name = "z15", .pos = 50 },
            .{ .name = "z14", .pos = 51 },

            // WebAssembly
            .{ .name = "wasm_emu256", .pos = 58 },
            .{ .name = "wasm", .pos = 59 },

            // Emulation
            .{ .name = "emu128", .pos = 61 },
            .{ .name = "scalar", .pos = 62 },
        };

        for (layout) |entry| {
            assert(bitPos(entry.name) == entry.pos);
        }
    }
};
|
||||
|
|
@ -88,11 +88,17 @@ fn step(ptr: *anyopaque) Benchmark.Error!void {
|
|||
return error.BenchmarkFailed;
|
||||
};
|
||||
if (n == 0) break; // EOF reached
|
||||
for (buf[0..n]) |c| {
|
||||
const actions = p.next(c);
|
||||
//std.log.warn("actions={any}", .{actions});
|
||||
_ = actions;
|
||||
}
|
||||
parseAll(&p, buf[0..n]);
|
||||
}
|
||||
}
|
||||
|
||||
/// Feeds every byte of `data` to the parser and discards the returned
/// actions.
///
/// Kept out of line from `step` so that the tight per-byte loop gets its own
/// function alignment, insulating it from code-layout changes elsewhere in
/// the binary that would otherwise shift its cache-line placement.
noinline fn parseAll(p: *terminalpkg.Parser, data: []const u8) void {
    for (data) |byte| {
        _ = p.next(byte);
    }
}
|
||||
|
||||
|
|
|
|||
|
|
@ -7,8 +7,14 @@
|
|||
#ifndef GHOSTTY_SIMD_CPW_HELPERS_
|
||||
#define GHOSTTY_SIMD_CPW_HELPERS_
|
||||
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
// Debug-only assertion: traps via __builtin_trap() when `cond` is false.
// With NDEBUG defined it expands to a no-op and `cond` is not evaluated.
#ifdef NDEBUG
#define GHOSTTY_SIMD_ASSERT(cond) ((void)0)
#else
#define GHOSTTY_SIMD_ASSERT(cond) ((cond) ? (void)0 : __builtin_trap())
#endif
|
||||
|
||||
// Replacement for std::size() that works without libc++.
|
||||
template <typename T, size_t N>
|
||||
|
|
@ -249,8 +255,8 @@ static_assert(array_size(nsm_gte16) == array_size(nsm_lte16));
|
|||
/// Handles 16-bit codepoints.
|
||||
template <class D, typename T = uint16_t>
|
||||
int8_t CodepointWidth16(D d, uint16_t input) {
|
||||
assert(input > 0xFF);
|
||||
assert(input <= 0xFFFF);
|
||||
GHOSTTY_SIMD_ASSERT(input > 0xFF);
|
||||
GHOSTTY_SIMD_ASSERT(input <= 0xFFFF);
|
||||
|
||||
const size_t N = hn::Lanes(d);
|
||||
const hn::Vec<D> input_vec = Set(d, input);
|
||||
|
|
@ -287,7 +293,7 @@ int8_t CodepointWidth16(D d, uint16_t input) {
|
|||
return 2;
|
||||
}
|
||||
}
|
||||
assert(i >= 7); // We should have checked all the ranges.
|
||||
GHOSTTY_SIMD_ASSERT(i >= 7); // We should have checked all the ranges.
|
||||
}
|
||||
|
||||
{
|
||||
|
|
@ -353,7 +359,7 @@ int8_t CodepointWidth16(D d, uint16_t input) {
|
|||
/// Handles codepoints larger than 16-bit.
|
||||
template <class D, typename T = uint32_t>
|
||||
int8_t CodepointWidth32(D d, T input) {
|
||||
assert(input > 0xFFFF);
|
||||
GHOSTTY_SIMD_ASSERT(input > 0xFFFF);
|
||||
|
||||
const size_t N = hn::Lanes(d);
|
||||
const hn::Vec<D> input_vec = Set(d, input);
|
||||
|
|
@ -379,7 +385,7 @@ int8_t CodepointWidth32(D d, T input) {
|
|||
return 2;
|
||||
}
|
||||
}
|
||||
assert(i >= 2); // We should have checked all the ranges.
|
||||
GHOSTTY_SIMD_ASSERT(i >= 2); // We should have checked all the ranges.
|
||||
}
|
||||
|
||||
{
|
||||
|
|
|
|||
|
|
@ -5,8 +5,6 @@
|
|||
#include <hwy/highway.h>
|
||||
|
||||
#include <simdutf.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <simd/index_of.h>
|
||||
#include <simd/vt.h>
|
||||
|
|
|
|||
Loading…
Reference in New Issue