mirror-immich/machine-learning/immich_ml/sessions/rknn/immich_session.py

from __future__ import annotations

import threading
import time
from concurrent.futures import Future, ThreadPoolExecutor
from pathlib import Path
from types import TracebackType
from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Protocol, Sequence, cast

import numpy as np
from numpy.typing import NDArray

from immich_ml.config import log, settings
from immich_ml.models.constants import RKNN_SUPPORTED_SOCS

from .native import rknn_pool as _native_mod  # pragma: no cover - compiled extension load

if TYPE_CHECKING:
    class NativeRKNNExecutor(Protocol):
        def infer(self, inputs: list[NDArray[np.float32]]) -> list[NDArray[np.float32]]: ...

        def get_io_info(self) -> dict[str, Any]: ...
else:
    NativeRKNNExecutor = _native_mod.NativeRKNNExecutor


__all__ = [
    "RknnSession",
    "RknnPoolExecutor",
    "run_inference",
    "is_available",
    "soc_name",
    "model_prefix",
]


def get_soc(device_tree_path: Path | str) -> str | None:
    try:
        with Path(device_tree_path).open() as f:
            device_compatible_str = f.read().lower()
            for soc in RKNN_SUPPORTED_SOCS:
                if soc in device_compatible_str:
                    return soc
    except OSError as exc:
        log.debug("Could not read device tree %s: %s", device_tree_path, exc)
    return None


soc_name = get_soc("/proc/device-tree/compatible")
is_available = soc_name is not None and settings.rknn
model_prefix = Path("rknpu") / soc_name if is_available and soc_name else None


class SessionNode(NamedTuple):
    name: Optional[str]
    shape: tuple[int, ...]


class RKNNInferenceResult(NamedTuple):
    tag: Any
    start_time: float
    end_time: float
    duration_s: float
    outputs: list[NDArray[np.float32]]


class InferenceExecutor(Protocol):
    def infer(self, inputs: list[NDArray[np.float32]]) -> list[NDArray[np.float32]]: ...


def run_inference(executor: InferenceExecutor, inputs: list[NDArray[np.float32]]) -> list[NDArray[np.float32]]:
    return executor.infer(inputs)


class RknnPoolExecutor:
    def __init__(self, model_path: str | Path, tpes: int) -> None:
        if tpes < 1:
            raise ValueError("tpes must be >= 1")
        model_path_str = Path(model_path).as_posix()
        self._native = NativeRKNNExecutor(model_path_str, num_workers=tpes)
        self._executor = ThreadPoolExecutor(max_workers=tpes, thread_name_prefix="rknn-worker")
        self._closed = False

    def _run_inference(self, inputs: list[NDArray[np.float32]], tag: Any) -> RKNNInferenceResult:
        start = time.perf_counter()
        outputs = self._native.infer(inputs)
        end = time.perf_counter()
        return RKNNInferenceResult(
            tag=tag,
            start_time=start,
            end_time=end,
            duration_s=end - start,
            outputs=outputs,
        )

    def submit(self, inputs: Sequence[NDArray[np.float32]], *, tag: Any = None) -> Future[RKNNInferenceResult]:
        if self._closed:
            raise RuntimeError("Pool is closed")
        return self._executor.submit(self._run_inference, list(inputs), tag)

    def put(self, inputs: Sequence[NDArray[np.float32]], *, tag: Any = None) -> Future[RKNNInferenceResult]:
        return self.submit(inputs, tag=tag)

    def close(self, *, wait: bool = True) -> None:
        if self._closed:
            return
        self._closed = True
        self._executor.shutdown(wait=wait)

    @property
    def executor(self) -> NativeRKNNExecutor:
        return self._native

    def __enter__(self) -> "RknnPoolExecutor":
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        self.close()


class RknnSession:
    def __init__(
        self,
        model_path: Path | str,
        *,
        num_workers: Optional[int] = None,
        logger: Any = None,
    ) -> None:
        if not is_available:
            raise RuntimeError("RKNN is not available on this device")
        self.model_path = Path(model_path)
        self.log = logger or log
        default_workers = getattr(settings, "rknn_threads", 1)
        self.tpe = num_workers or default_workers
        if self.tpe < 1:
            raise ValueError("num_workers must be >= 1")
        self.log.info(
            "Loading RKNN model from %s with %s worker(s).",
            self.model_path,
            self.tpe,
        )
        self.rknnpool = RknnPoolExecutor(self.model_path, self.tpe)
        self._io_info = self._normalize_io_info(self.rknnpool.executor.get_io_info())
        self._input_nodes = self._build_nodes("inputs")
        self._output_nodes = self._build_nodes("outputs")
        self.log.info("Loaded RKNN model from %s.", self.model_path)

    @property
    def io_info(self) -> dict[str, Any]:
        return self._io_info

    def get_inputs(self) -> list[SessionNode]:
        return self._input_nodes

    def get_outputs(self) -> list[SessionNode]:
        return self._output_nodes

    def run(
        self,
        _output_names: Sequence[str] | None,
        input_feed: dict[str, NDArray[np.float32]] | dict[str, NDArray[np.int32]],
        _run_options: Any = None,
    ) -> list[NDArray[np.float32]]:
        return self.run_async(_output_names, input_feed, _run_options).result().outputs

    def run_async(
        self,
        _output_names: Sequence[str] | None,
        input_feed: dict[str, NDArray[np.float32]] | dict[str, NDArray[np.int32]],
        _run_options: Any = None,
    ) -> Future[RKNNInferenceResult]:
        inputs_list = list(input_feed.values())
        if not inputs_list:
            raise ValueError("input_feed must not be empty")

        batch_sizes = {int(x.shape[0]) for x in inputs_list}
        if len(batch_sizes) != 1:
            raise ValueError(f"All inputs must have the same batch size, got {sorted(batch_sizes)}")

        batch_size = batch_sizes.pop()
        if batch_size <= 1:
            return self.rknnpool.put(inputs_list)

        # Split each input tensor into per-sample slices of shape (1, ...)
        per_sample_inputs = [[inp[i : i + 1] for inp in inputs_list] for i in range(batch_size)]
        sub_futures = [self.rknnpool.put(sample) for sample in per_sample_inputs]

        parent_future: Future[RKNNInferenceResult] = Future()

        def _aggregate() -> None:
            try:
                results = [f.result() for f in sub_futures]
                num_outputs = len(results[0].outputs)
                stacked_outputs = [np.concatenate([r.outputs[j] for r in results], axis=0) for j in range(num_outputs)]
                start_time = min(r.start_time for r in results)
                end_time = max(r.end_time for r in results)
                parent_future.set_result(
                    RKNNInferenceResult(
                        tag=None,
                        start_time=start_time,
                        end_time=end_time,
                        duration_s=end_time - start_time,
                        outputs=stacked_outputs,
                    ),
                )
            except Exception as exc:  # noqa: BLE001
                if not parent_future.done():
                    parent_future.set_exception(exc)

        threading.Thread(target=_aggregate, daemon=True).start()
        return parent_future

    def close(self) -> None:
        self.rknnpool.close()

    def _build_nodes(self, key: str) -> list[SessionNode]:
        nodes: list[SessionNode] = []
        for entry in self._io_info.get(key, []):
            shape = self._shape_from_entry(entry)
            if key == "inputs" and shape:
                # Represent the batch dimension symbolically for readability while
                # keeping the static type compatible with the ModelSession protocol.
                symbolic_shape_any: tuple[Any, ...] = ("batch", *shape[1:])
                symbolic_shape = cast(tuple[int, ...], symbolic_shape_any)
            else:
                symbolic_shape = shape
            nodes.append(SessionNode(name=entry.get("name"), shape=symbolic_shape))
        return nodes

    @staticmethod
    def _shape_from_entry(entry: dict[str, Any]) -> tuple[int, ...]:
        if dims := entry.get("dims"):
            return tuple(int(dim) for dim in dims)
        dyn = entry.get("dynamic", {})
        ranges = dyn.get("ranges", [])
        if ranges:
            return tuple(int(dim) for dim in ranges[-1])
        raise ValueError(f"Cannot determine shape from entry: {entry}")

    def _normalize_io_info(self, info: dict[str, Any]) -> dict[str, Any]:
        return {
            **info,
            "inputs": [self._normalize_tensor_desc(t) for t in info.get("inputs", [])],
            "outputs": [self._normalize_tensor_desc(t) for t in info.get("outputs", [])],
        }

    @staticmethod
    def _normalize_tensor_desc(tensor: dict[str, Any]) -> dict[str, Any]:
        dims = list(RknnSession._shape_from_entry(tensor))
        desc = {**tensor, "dims": dims, "n_dims": len(dims)}
        # Force NCHW format if the runtime reports NHWC tensors
        if tensor.get("fmt") == 1 and len(dims) == 4:
            n, h, w, c = dims
            desc["dims"] = [n, c, h, w]
            desc["fmt"] = 0
            dyn = desc.get("dynamic", {})
            if "ranges" in dyn:
                dyn["ranges"] = [
                    [shape[0], shape[3], shape[1], shape[2]] if len(shape) == 4 else shape for shape in dyn["ranges"]
                ]
        return desc