optimized scrfd code

mirror of https://github.com/immich-app/immich.git

machine-learning/app/models/facial_recognition/detection.py
@@ -1,44 +1,33 @@
 from pathlib import Path
 from typing import Any

 import numpy as np
 import onnxruntime as ort
-from insightface.model_zoo import RetinaFace
 from numpy.typing import NDArray

 from app.models.base import InferenceModel
-from app.models.session import ort_has_batch_dim, ort_squeeze_outputs
-from app.models.transforms import decode_cv2
+from app.models.session import ort_has_batch_dim, ort_expand_outputs
+from app.models.transforms import decode_pil
 from app.schemas import FaceDetectionOutput, ModelSession, ModelTask, ModelType
+
+from .scrfd import SCRFD
+from PIL import Image
+from PIL.ImageOps import pad


 class FaceDetector(InferenceModel):
     depends = []
     identity = (ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION)

     def __init__(
         self,
         model_name: str,
         min_score: float = 0.7,
         cache_dir: Path | str | None = None,
         **model_kwargs: Any,
     ) -> None:
         self.min_score = model_kwargs.pop("minScore", min_score)
         super().__init__(model_name, cache_dir, **model_kwargs)

     def _load(self) -> ModelSession:
         session = self._make_session(self.model_path)
-        if isinstance(session, ort.InferenceSession) and ort_has_batch_dim(session):
-            ort_squeeze_outputs(session)
-        self.model = RetinaFace(session=session)
-        self.model.prepare(ctx_id=0, det_thresh=self.min_score, input_size=(640, 640))
+        if isinstance(session, ort.InferenceSession) and not ort_has_batch_dim(session):
+            ort_expand_outputs(session)
+        self.model = SCRFD(session=session)

         return session

-    def _predict(self, inputs: NDArray[np.uint8] | bytes, **kwargs: Any) -> FaceDetectionOutput:
-        inputs = decode_cv2(inputs)
+    def _predict(self, inputs: NDArray[np.uint8] | bytes | Image.Image, **kwargs: Any) -> FaceDetectionOutput:
+        inputs = self._transform(inputs)

-        bboxes, landmarks = self._detect(inputs)
+        [bboxes], [landmarks] = self.model.detect(inputs, threshold=kwargs.pop("minScore", 0.7))
         return {
             "boxes": bboxes[:, :4].round(),
             "scores": bboxes[:, 4],
@@ -48,5 +37,7 @@ class FaceDetector(InferenceModel):
-    def _detect(self, inputs: NDArray[np.uint8] | bytes) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
-        return self.model.detect(inputs)  # type: ignore
-
     def configure(self, **kwargs: Any) -> None:
         self.model.det_thresh = kwargs.pop("minScore", self.model.det_thresh)
+
+    def _transform(self, inputs: NDArray[np.uint8] | bytes | Image.Image) -> NDArray[np.uint8]:
+        image = decode_pil(inputs)
+        padded = pad(image, (640, 640), method=Image.Resampling.BICUBIC)
+        return np.array(padded, dtype=np.uint8)[None, ...]
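Note on the new _transform path: a minimal standalone sketch of what it does, assuming a raw bytes input (decode_pil is the Immich helper; it is approximated here with PIL directly). pad letterboxes the image to the fixed 640x640 model input instead of stretching it, and the trailing [None, ...] adds the batch axis the wrapped session expects:

from io import BytesIO

import numpy as np
from PIL import Image
from PIL.ImageOps import pad

def transform(image_bytes: bytes) -> np.ndarray:
    # Decode to RGB, resize the longer side to fit 640x640 while keeping
    # aspect ratio, fill the remainder with background, then add a batch axis.
    image = Image.open(BytesIO(image_bytes)).convert("RGB")
    padded = pad(image, (640, 640), method=Image.Resampling.BICUBIC)
    return np.array(padded, dtype=np.uint8)[None, ...]  # shape (1, 640, 640, 3)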
machine-learning/app/models/facial_recognition/scrfd.py (new file, 325 lines)
@@ -0,0 +1,325 @@
# Based on InsightFace-REST by SthPhoenix https://github.com/SthPhoenix/InsightFace-REST/blob/master/src/api_trt/modules/model_zoo/detectors/scrfd.py
# Primary changes made:
# 1. Removed CuPy-related code
# 2. Adapted proposal generation to be thread-safe
# 3. Added typing
# 4. Assume RGB input
# 5. Removed unused variables

# Copyright 2021 SthPhoenix

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# -*- coding: utf-8 -*-
# Based on Jia Guo reference implementation at
# https://github.com/deepinsight/insightface/blob/master/detection/scrfd/tools/scrfd.py


from __future__ import division

import cv2
import numpy as np
from numba import njit
from app.schemas import ModelSession
from numpy.typing import NDArray

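# Greedy IoU-based non-maximum suppression over [x1, y1, x2, y2, score] rows,
# compiled with Numba (nogil) so it can run concurrently across worker threads.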
@njit(cache=True, nogil=True)
def nms(dets: NDArray[np.float32], threshold: float = 0.4) -> NDArray[np.float32]:
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    scores = dets[:, 4]

    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        # Keep the highest-scoring remaining box...
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)

        # ...and drop every box whose IoU with it exceeds the threshold.
        inds = np.where(ovr <= threshold)[0]
        order = order[inds + 1]

    return np.asarray(keep)

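# SCRFD regresses box and keypoint positions as distances from anchor centers,
# expressed in stride units. The helpers below scale those distances by the
# stride and convert them in place to absolute pixel coordinates.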
@njit(fastmath=True, cache=True, nogil=True)
def single_distance2bbox(point: NDArray[np.float32], distance: NDArray[np.float32], stride: int) -> NDArray[np.float32]:
    """
    Fast conversion of single bbox distances to coordinates

    :param point: Anchor point
    :param distance: Bbox distances from anchor point
    :param stride: Current stride scale
    :return: bbox
    """
    distance[0] = point[0] - distance[0] * stride
    distance[1] = point[1] - distance[1] * stride
    distance[2] = point[0] + distance[2] * stride
    distance[3] = point[1] + distance[3] * stride
    return distance


@njit(fastmath=True, cache=True, nogil=True)
def single_distance2kps(point: NDArray[np.float32], distance: NDArray[np.float32], stride: int) -> NDArray[np.float32]:
    """
    Fast conversion of single keypoint distances to coordinates

    :param point: Anchor point
    :param distance: Keypoint distances from anchor point
    :param stride: Current stride scale
    :return: keypoint
    """
    for ix in range(0, distance.shape[0], 2):
        distance[ix] = distance[ix] * stride + point[0]
        distance[ix + 1] = distance[ix + 1] * stride + point[1]
    return distance

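# Proposals are built in two passes (collect passing indices, then fill
# pre-sized output arrays) so the njit-compiled function never grows arrays
# dynamically; per the header comments, proposal generation was also adapted
# to be thread-safe relative to the upstream implementation.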
@njit(fastmath=True, cache=True, nogil=True)
def generate_proposals(
    score_blob: NDArray[np.float32],
    bbox_blob: NDArray[np.float32],
    kpss_blob: NDArray[np.float32],
    stride: int,
    anchors: NDArray[np.float32],
    threshold: float,
) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
    """
    Convert distances from anchors to actual coordinates on source image
    and filter proposals by confidence threshold.

    :param score_blob: Raw scores for stride
    :param bbox_blob: Raw bbox distances for stride
    :param kpss_blob: Raw keypoints distances for stride
    :param stride: Stride scale
    :param anchors: Precomputed anchors for stride
    :param threshold: Confidence threshold
    :return: Filtered scores, bboxes and keypoints
    """

    idxs = []
    for ix in range(score_blob.shape[0]):
        if score_blob[ix][0] > threshold:
            idxs.append(ix)

    score_out = np.empty((len(idxs), 1), dtype="float32")
    bbox_out = np.empty((len(idxs), 4), dtype="float32")
    kpss_out = np.empty((len(idxs), 10), dtype="float32")

    for i in range(len(idxs)):
        ix = idxs[i]
        score_out[i] = score_blob[ix]
        bbox_out[i] = single_distance2bbox(anchors[ix], bbox_blob[ix], stride)
        kpss_out[i] = single_distance2kps(anchors[ix], kpss_blob[ix], stride)

    return score_out, bbox_out, kpss_out

@njit(fastmath=True, cache=True, nogil=True)
def filter(
    bboxes_list: NDArray[np.float32],
    kpss_list: NDArray[np.float32],
    scores_list: NDArray[np.float32],
    nms_threshold: float = 0.4,
) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
    """
    Filter postprocessed network outputs with NMS

    :param bboxes_list: List of bboxes (np.ndarray)
    :param kpss_list: List of keypoints (np.ndarray)
    :param scores_list: List of scores (np.ndarray)
    :param nms_threshold: IoU threshold for NMS
    :return: Face bboxes with scores [t,l,b,r,score], and key points
    """

    pre_det = np.hstack((bboxes_list, scores_list))
    keep = nms(pre_det, threshold=nms_threshold)
    det = pre_det[keep, :]
    kpss = kpss_list[keep, :]
    kpss = kpss.reshape((kpss.shape[0], -1, 2))

    return det, kpss

class SCRFD:
    def __init__(self, session: ModelSession):
        self.session = session
        self.center_cache: dict[tuple[int, int], list[NDArray[np.float32]]] = {}
        self.nms_threshold = 0.4
        self.fmc = 3
        self._feat_stride_fpn = [8, 16, 32]
        self._num_anchors = 2

    def prepare(self, nms_threshold: float = 0.4) -> None:
        """
        Populate class parameters

        :param nms_threshold: Threshold for NMS IoU
        """

        self.nms_threshold = nms_threshold

    def detect(
        self, imgs: NDArray[np.uint8], threshold: float = 0.5
    ) -> tuple[list[NDArray[np.float32]], list[NDArray[np.float32]]]:
        """
        Run detection pipeline for provided images

        :param imgs: Raw images as np.ndarray with NHWC shape
        :param threshold: Confidence threshold
        :return: Face bboxes with scores [t,l,b,r,score], and key points
        """

        height, width = imgs.shape[1:3]
        blob = self._preprocess(imgs)
        net_outs = self._forward(blob)

        batch_bboxes, batch_kpss, batch_scores = self._postprocess(net_outs, height, width, threshold)

        dets_list = []
        kpss_list = []
        for e in range(imgs.shape[0]):
            if len(batch_bboxes[e]) == 0:
                det, kpss = np.zeros((0, 5), dtype="float32"), np.zeros((0, 10), dtype="float32")
            else:
                det, kpss = filter(batch_bboxes[e], batch_kpss[e], batch_scores[e], self.nms_threshold)

            dets_list.append(det)
            kpss_list.append(kpss)

        return dets_list, kpss_list

    @staticmethod
    def _build_anchors(
        input_height: int, input_width: int, strides: list[int], num_anchors: int
    ) -> list[NDArray[np.float32]]:
        """
        Precompute anchor points for provided image size

        :param input_height: Input image height
        :param input_width: Input image width
        :param strides: Model strides
        :param num_anchors: Model num anchors
        :return: box centers, one array per stride
        """

        centers = []
        for stride in strides:
            height = input_height // stride
            width = input_width // stride

            # Grid of (x, y) centers for every cell of this stride's feature map,
            # scaled back to input-image coordinates.
            anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)
            anchor_centers = (anchor_centers * stride).reshape((-1, 2))
            if num_anchors > 1:
                # Duplicate each center once per anchor, e.g. 80 * 80 * 2 = 12800
                # rows for stride 8 on a 640x640 input.
                anchor_centers = np.stack([anchor_centers] * num_anchors, axis=1).reshape((-1, 2))
            centers.append(anchor_centers)
        return centers

    def _preprocess(self, images: NDArray[np.uint8]) -> NDArray[np.float32]:
        """
        Normalize images on CPU

        :param images: Raw images as np.ndarray with NHWC shape
        :return: Preprocessed images as an NCHW float32 blob
        """

        input_size = tuple(images[0].shape[0:2][::-1])
        # Equivalent to (img - 127.5) / 128 per channel; swapRB=False because input is RGB.
        return cv2.dnn.blobFromImages(images, 1.0 / 128, input_size, (127.5, 127.5, 127.5), swapRB=False)

    def _forward(self, blob: NDArray[np.float32]) -> list[NDArray[np.float32]]:
        """
        Send input data to inference backend.

        :param blob: Preprocessed images of shape NCHW
        :return: network outputs
        """

        return self.session.run(None, {"input.1": blob})

    def _postprocess(
        self, net_outs: list[NDArray[np.float32]], height: int, width: int, threshold: float
    ) -> tuple[list[NDArray[np.float32]], list[NDArray[np.float32]], list[NDArray[np.float32]]]:
        """
        Precompute anchor points for provided image size and process network outputs

        :param net_outs: Network outputs
        :param height: Input image height
        :param width: Input image width
        :param threshold: Confidence threshold
        :return: filtered bboxes, keypoints and scores
        """

        key = (height, width)

        if not self.center_cache.get(key):
            self.center_cache[key] = self._build_anchors(height, width, self._feat_stride_fpn, self._num_anchors)
        anchor_centers = self.center_cache[key]
        bboxes, kpss, scores = self._process_strides(net_outs, threshold, anchor_centers)
        return bboxes, kpss, scores

    def _process_strides(
        self, net_outs: list[NDArray[np.float32]], threshold: float, anchors: list[NDArray[np.float32]]
    ) -> tuple[list[NDArray[np.float32]], list[NDArray[np.float32]], list[NDArray[np.float32]]]:
        """
        Process network outputs by strides and return proposals filtered by threshold

        :param net_outs: Network outputs
        :param threshold: Confidence threshold
        :param anchors: Precomputed anchor centers for all strides
        :return: filtered bboxes, keypoints and scores
        """

        batch_size = net_outs[0].shape[0]
        bboxes_by_img = []
        kpss_by_img = []
        scores_by_img = []

        for batch in range(batch_size):
            scores_strided = []
            bboxes_strided = []
            kpss_strided = []
            for idx, stride in enumerate(self._feat_stride_fpn):
                # Outputs are grouped by head type: the first fmc entries are score
                # maps, the next fmc are bbox distances, and the last fmc are
                # keypoint distances, one entry per stride.
                score_blob = net_outs[idx][batch]
                bbox_blob = net_outs[idx + self.fmc][batch]
                kpss_blob = net_outs[idx + self.fmc * 2][batch]
                stride_anchors = anchors[idx]
                score_list, bbox_list, kpss_list = generate_proposals(
                    score_blob,
                    bbox_blob,
                    kpss_blob,
                    stride,
                    stride_anchors,
                    threshold,
                )

                scores_strided.append(score_list)
                bboxes_strided.append(bbox_list)
                kpss_strided.append(kpss_list)
            bboxes_by_img.append(np.concatenate(bboxes_strided, axis=0))
            kpss_by_img.append(np.concatenate(kpss_strided, axis=0))
            scores_by_img.append(np.concatenate(scores_strided, axis=0))

        return bboxes_by_img, kpss_by_img, scores_by_img
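For orientation, a minimal sketch of driving SCRFD directly. The model path is illustrative only; in Immich the session comes from InferenceModel._make_session and is wrapped by ort_expand_outputs when its outputs lack a batch dimension:

import numpy as np
import onnxruntime as ort

# "det_10g.onnx" is a placeholder path for a SCRFD detection model.
session = ort.InferenceSession("det_10g.onnx")

detector = SCRFD(session=session)
detector.prepare(nms_threshold=0.4)

# A batch of one RGB image already padded to the 640x640 input (NHWC, uint8).
batch = np.zeros((1, 640, 640, 3), dtype=np.uint8)

dets_list, kpss_list = detector.detect(batch, threshold=0.7)
dets, kpss = dets_list[0], kpss_list[0]
# dets has shape (n, 5): four box coordinates plus a confidence score per face;
# kpss has shape (n, 5, 2): five (x, y) landmark points per face.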
machine-learning/app/models/session.py
@@ -12,12 +12,12 @@ def ort_has_batch_dim(session: ort.InferenceSession) -> bool:
     return session.get_inputs()[0].shape[0] == "batch"


-def ort_squeeze_outputs(session: ort.InferenceSession) -> None:
+def ort_expand_outputs(session: ort.InferenceSession) -> None:
     original_run = session.run

     def run(output_names: list[str], input_feed: dict[str, NDArray[np.float32]]) -> list[NDArray[np.float32]]:
         out: list[NDArray[np.float32]] = original_run(output_names, input_feed)
-        out = [o.squeeze(axis=0) for o in out]
+        out = [np.expand_dims(o, axis=0) for o in out]
         return out

     session.run = run
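The renamed helper inverts the old behavior: instead of squeezing a batch axis away, the patched run inserts one, so a model exported without a batch dimension still yields outputs indexable per image. Roughly, assuming unbatched raw outputs with illustrative shapes:

import numpy as np

# Shapes a batchless SCRFD session might emit for one stride's score and bbox heads.
raw_outputs = [np.zeros((12800, 1), dtype=np.float32), np.zeros((12800, 4), dtype=np.float32)]

batched = [np.expand_dims(o, axis=0) for o in raw_outputs]
assert batched[0].shape == (1, 12800, 1)
assert batched[1].shape == (1, 12800, 4)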
machine-learning/app/models/transforms.py
@@ -3,6 +3,7 @@ from typing import IO
 import cv2
 import numpy as np
+from numba import njit
 from numpy.typing import NDArray
 from PIL import Image
@@ -30,10 +31,11 @@ def to_numpy(img: Image.Image) -> NDArray[np.float32]:
     return np.asarray(img if img.mode == "RGB" else img.convert("RGB"), dtype=np.float32) / 255.0


+@njit(cache=True, fastmath=True, nogil=True)
 def normalize(
     img: NDArray[np.float32], mean: float | NDArray[np.float32], std: float | NDArray[np.float32]
 ) -> NDArray[np.float32]:
-    return np.divide(img - mean, std, dtype=np.float32)
+    return (img - mean) / std


 def get_pil_resampling(resample: str) -> Image.Resampling:
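On the normalize change: np.divide(..., dtype=np.float32) is swapped for a plain expression, presumably because Numba's nopython mode does not support the dtype keyword on ufuncs; with float32 inputs the result stays float32 anyway. A quick check mirroring the patched function (standalone sketch, not the module itself):

import numpy as np
from numba import njit

@njit(cache=True, fastmath=True, nogil=True)
def normalize(img, mean, std):
    return (img - mean) / std

img = np.random.rand(224, 224, 3).astype(np.float32)
out = normalize(img, np.float32(127.5), np.float32(128.0))  # first call triggers JIT compilation
assert out.dtype == np.float32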