Mirror of https://github.com/immich-app/immich.git (synced 2026-02-15 13:28:24 +03:00)

tflite CLIP export
@@ -22,5 +22,5 @@ dependencies:
   - pip:
       - multilingual-clip
       - onnx-simplifier
-      - tensorflow
+      - tensorflow==2.14.*
 category: main
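The pin suggests the export path is sensitive to the exact TensorFlow release. A hypothetical guard, not part of this commit, that an export script could use to fail fast when the environment drifts from the pin:

# Hypothetical guard, not part of this commit: fail fast if the installed
# TensorFlow does not match the 2.14.* pin from the environment file.
import tensorflow as tf

if not tf.__version__.startswith("2.14."):
    raise RuntimeError(f"expected TensorFlow 2.14.*, found {tf.__version__}")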
@@ -13,20 +13,22 @@ class _CLIPWrapper(tf.Module):
         self.model = TFCLIPModel.from_pretrained(model_name)

     @tf.function()
-    def encode_image(self, input):
-        return self.model.get_image_features(input)
+    def encode_image(self, input_tensor):
+        return self.model.get_image_features(input_tensor)

     @tf.function()
-    def encode_text(self, input):
-        return self.model.get_text_features(input)
+    def encode_text(self, input_tensor):
+        return self.model.get_text_features(input_tensor)


 # exported model signatures use batch size 2 because of the following reasons:
-# 1. ARM-NN cannot use dynamic batch sizes
+# 1. ARM-NN cannot use dynamic batch sizes for complex models like CLIP ViT
 # 2. batch size 1 creates a larger TF-Lite model that uses a lot (50%) more RAM
-# 3. batch size 2 is ~50% faster on GPU than 1 while 4 (or larger) are not faster
+# 3. batch size 2 is ~50% faster on GPU than 1 while 4 (or larger) are not really faster
 # 4. batch size >2 wastes more computation if only a single image is processed
-BATCH_SIZE = 2
+BATCH_SIZE_IMAGE = 2
+# On most small-scale systems there will only be one query at a time, no sense in batching
+BATCH_SIZE_TEXT = 1

 SIGNATURE_TEXT = "encode_text"
 SIGNATURE_IMAGE = "encode_image"
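Because the image signature is frozen at batch size 2, a caller with a single image has to pad the batch and discard the extra embedding. A minimal client-side sketch, not part of this commit, assuming a ViT-B-style input of 3x224x224 (NCHW, matching the TensorSpec below), the "encode_image" signature and input_tensor argument name from the wrapper above, and a made-up model file name:

# Hypothetical client-side usage of the exported image encoder.
import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path="clip_image_encoder.tflite")  # path is an assumption
encode_image = interpreter.get_signature_runner("encode_image")

image = np.zeros((1, 3, 224, 224), dtype=np.float32)  # one preprocessed image (NCHW)
batch = np.concatenate([image, image], axis=0)        # pad to the fixed batch size of 2
outputs = encode_image(input_tensor=batch)            # dict of output name -> array
embedding = next(iter(outputs.values()))[0]           # keep only the real image's embedding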
@@ -52,19 +54,19 @@ def _export_temporary_tf_model(model_name, tmp_path: str, context_length: int):
     wrapper = _CLIPWrapper(model_name)
     conf = wrapper.model.config.vision_config
     spec_visual = tf.TensorSpec(
-        shape=(BATCH_SIZE, conf.num_channels, conf.image_size, conf.image_size), dtype=tf.float32
+        shape=(BATCH_SIZE_IMAGE, conf.num_channels, conf.image_size, conf.image_size), dtype=tf.float32
     )
     encode_image = wrapper.encode_image.get_concrete_function(spec_visual)
-    spec_text = tf.TensorSpec(shape=(BATCH_SIZE, context_length), dtype=tf.int32)
+    spec_text = tf.TensorSpec(shape=(BATCH_SIZE_TEXT, context_length), dtype=tf.int32)
     encode_text = wrapper.encode_text.get_concrete_function(spec_text)
-    signatures = {"encode_text": encode_text, "encode_image": encode_image}
+    signatures = {SIGNATURE_IMAGE: encode_image, SIGNATURE_TEXT: encode_text}
     tf.saved_model.save(wrapper, tmp_path, signatures)


 def _export_tflite_model(tmp_path: str, signature: str, output_path: str):
     converter = tf.lite.TFLiteConverter.from_saved_model(tmp_path, signature_keys=[signature])
     converter.optimizations = [tf.lite.Optimize.DEFAULT]
-    converter.target_spec.supported_types = [tf.float32]
+    converter.target_spec.supported_types = [tf.float16]
     tflite_model = converter.convert()
     with open(output_path, "wb") as f:
         f.write(tflite_model)
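The driver code is not shown in this hunk. A minimal sketch of how the two helpers might be chained, assuming it lives in the same module as the functions above (the export_tflite name and output file names are made up):

# Hypothetical driver: one temporary SavedModel export, then one TF-Lite
# conversion per signature, each written to its own file.
from tempfile import TemporaryDirectory

def export_tflite(model_name: str, context_length: int, output_dir: str) -> None:
    with TemporaryDirectory() as tmp_path:
        _export_temporary_tf_model(model_name, tmp_path, context_length)
        _export_tflite_model(tmp_path, SIGNATURE_IMAGE, f"{output_dir}/image_encoder.tflite")
        _export_tflite_model(tmp_path, SIGNATURE_TEXT, f"{output_dir}/text_encoder.tflite")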
@@ -4,9 +4,10 @@ from pathlib import Path
 from tempfile import TemporaryDirectory

 from huggingface_hub import create_repo, login, upload_folder
-from models import mclip, openclip, tfclip
 from rich.progress import Progress

+from models import mclip, openclip, tfclip
+
 models = [
     "RN50::openai",
     "RN50::yfcc15m",