From 057c7629bd7bf368a4c955e9a592a6f31ab0b961 Mon Sep 17 00:00:00 2001 From: Huan Yu Date: Mon, 13 Apr 2026 22:18:33 +0800 Subject: [PATCH] align ocr detection normalization with paddle ppocrv5 --- machine-learning/immich_ml/models/ocr/detection.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/machine-learning/immich_ml/models/ocr/detection.py b/machine-learning/immich_ml/models/ocr/detection.py index 0a2cb8ad91..18ef4c1013 100644 --- a/machine-learning/immich_ml/models/ocr/detection.py +++ b/machine-learning/immich_ml/models/ocr/detection.py @@ -25,8 +25,12 @@ class TextDetector(InferenceModel): def __init__(self, model_name: str, min_score: float = 0.5, **model_kwargs: Any) -> None: super().__init__(model_name.split("__")[-1], **model_kwargs, model_format=ModelFormat.ONNX) self.max_resolution = 736 - self.mean = np.array([0.5, 0.5, 0.5], dtype=np.float32) - self.std_inv = np.float32(1.0) / (np.array([0.5, 0.5, 0.5], dtype=np.float32) * 255.0) + # Align with Paddle NormalizeImage: + # scale=1/255, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + # This implementation works on raw 0..255 pixels, so we fold scale into mean/std: + # (x/255 - mean) / std == (x - mean*255) / (std*255) + self.mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) * 255.0 + self.std_inv = np.float32(1.0) / (np.array([0.229, 0.224, 0.225], dtype=np.float32) * 255.0) self._empty: TextDetectionOutput = { "boxes": np.empty(0, dtype=np.float32), "scores": np.empty(0, dtype=np.float32),