From 5f6ad9e23988ca9cef6bca0e669f9b52057a45d8 Mon Sep 17 00:00:00 2001
From: Fynn Petersen-Frey
Date: Sat, 4 Nov 2023 09:34:19 +0100
Subject: [PATCH] feat(ml): ARM NN acceleration

---
 machine-learning/export/env.yaml         |  5 +-
 machine-learning/export/models/tfclip.py | 70 ++++++++++++++++++++++++
 machine-learning/export/models/util.py   | 10 +++-
 machine-learning/export/run.py           |  7 ++-
 4 files changed, 86 insertions(+), 6 deletions(-)
 create mode 100644 machine-learning/export/models/tfclip.py

diff --git a/machine-learning/export/env.yaml b/machine-learning/export/env.yaml
index f7144812d0..f53f5b0011 100644
--- a/machine-learning/export/env.yaml
+++ b/machine-learning/export/env.yaml
@@ -20,6 +20,7 @@ dependencies:
   - torchvision
   - transformers==4.*
   - pip:
-      - multilingual-clip
-      - onnx-simplifier
+      - multilingual-clip
+      - onnx-simplifier
+      - tensorflow
 category: main
diff --git a/machine-learning/export/models/tfclip.py b/machine-learning/export/models/tfclip.py
new file mode 100644
index 0000000000..4dbe00d10b
--- /dev/null
+++ b/machine-learning/export/models/tfclip.py
@@ -0,0 +1,70 @@
+import tempfile
+from pathlib import Path
+
+import tensorflow as tf
+from transformers import TFCLIPModel
+
+from .util import ModelType, get_model_path
+
+
+class _CLIPWrapper(tf.Module):
+    def __init__(self, model_name: str):
+        super().__init__()
+        self.model = TFCLIPModel.from_pretrained(model_name)
+
+    @tf.function()
+    def encode_image(self, input):
+        return self.model.get_image_features(input)
+
+    @tf.function()
+    def encode_text(self, input):
+        return self.model.get_text_features(input)
+
+
+# exported model signatures use batch size 2 for the following reasons:
+# 1. ARM-NN cannot use dynamic batch sizes
+# 2. batch size 1 creates a larger TF-Lite model that uses considerably (~50%) more RAM
+# 3. batch size 2 is ~50% faster on GPU than 1, while 4 (or larger) is not faster
+# 4. batch size >2 wastes more computation if only a single image is processed
+BATCH_SIZE = 2
+
+SIGNATURE_TEXT = "encode_text"
+SIGNATURE_IMAGE = "encode_image"
+
+
+def to_tflite(
+    model_name,
+    output_path_image: Path | str | None,
+    output_path_text: Path | str | None,
+    context_length: int = 77,
+):
+    with tempfile.TemporaryDirectory() as tmpdir:
+        _export_temporary_tf_model(model_name, tmpdir, context_length)
+        if output_path_image is not None:
+            image_path = get_model_path(output_path_image, ModelType.TFLITE)
+            _export_tflite_model(tmpdir, SIGNATURE_IMAGE, image_path.as_posix())
+        if output_path_text is not None:
+            text_path = get_model_path(output_path_text, ModelType.TFLITE)
+            _export_tflite_model(tmpdir, SIGNATURE_TEXT, text_path.as_posix())
+
+
+def _export_temporary_tf_model(model_name, tmp_path: str, context_length: int):
+    wrapper = _CLIPWrapper(model_name)
+    conf = wrapper.model.config.vision_config
+    spec_visual = tf.TensorSpec(
+        shape=(BATCH_SIZE, conf.num_channels, conf.image_size, conf.image_size), dtype=tf.float32
+    )
+    encode_image = wrapper.encode_image.get_concrete_function(spec_visual)
+    spec_text = tf.TensorSpec(shape=(BATCH_SIZE, context_length), dtype=tf.int32)
+    encode_text = wrapper.encode_text.get_concrete_function(spec_text)
+    signatures = {"encode_text": encode_text, "encode_image": encode_image}
+    tf.saved_model.save(wrapper, tmp_path, signatures)
+
+
+def _export_tflite_model(tmp_path: str, signature: str, output_path: str):
+    converter = tf.lite.TFLiteConverter.from_saved_model(tmp_path, signature_keys=[signature])
+    converter.optimizations = [tf.lite.Optimize.DEFAULT]
+    converter.target_spec.supported_types = [tf.float32]
+    tflite_model = converter.convert()
+    with open(output_path, "wb") as f:
+        f.write(tflite_model)
diff --git a/machine-learning/export/models/util.py b/machine-learning/export/models/util.py
index 67e582af46..73a974ec00 100644
--- a/machine-learning/export/models/util.py
+++ b/machine-learning/export/models/util.py
@@ -1,12 +1,18 @@
 import json
+from enum import Enum
 from pathlib import Path
 from typing import Any
 
 
-def get_model_path(output_dir: Path | str) -> Path:
+class ModelType(Enum):
+    ONNX = "onnx"
+    TFLITE = "tflite"
+
+
+def get_model_path(output_dir: Path | str, model_type: ModelType = ModelType.ONNX) -> Path:
     output_dir = Path(output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
-    return output_dir / "model.onnx"
+    return output_dir / f"model.{model_type.value}"
 
 
 def save_config(config: Any, output_path: Path | str) -> None:
diff --git a/machine-learning/export/run.py b/machine-learning/export/run.py
index 5ce32189e2..49dfef5a11 100644
--- a/machine-learning/export/run.py
+++ b/machine-learning/export/run.py
@@ -4,7 +4,7 @@ from pathlib import Path
 from tempfile import TemporaryDirectory
 
 from huggingface_hub import create_repo, login, upload_folder
-from models import mclip, openclip
+from models import mclip, openclip, tfclip
 from rich.progress import Progress
 
 models = [
@@ -37,9 +37,10 @@ models = [
     "M-CLIP/XLM-Roberta-Large-Vit-B-32",
     "M-CLIP/XLM-Roberta-Large-Vit-B-16Plus",
     "M-CLIP/XLM-Roberta-Large-Vit-L-14",
+    "openai/clip-vit-base-patch32",
 ]
 
-login(token=os.environ["HF_AUTH_TOKEN"])
+# login(token=os.environ["HF_AUTH_TOKEN"])
 
 with Progress() as progress:
     task1 = progress.add_task("[green]Exporting models...", total=len(models))
@@ -65,6 +66,8 @@ with Progress() as progress:
             textual_dir = tmpdir / model_name / "textual"
             if model.startswith("M-CLIP"):
                 mclip.to_onnx(model, visual_dir, textual_dir)
+            elif "/" in model:
+                tfclip.to_tflite(model, visual_dir.as_posix(), textual_dir.as_posix())
             else:
                 name, _, pretrained = model_name.partition("__")
                 openclip.to_onnx(openclip.OpenCLIPModelConfig(name, pretrained), visual_dir, textual_dir)
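
Note on the util.py change: ModelType defaults to ONNX, so the existing exporters keep writing model.onnx and only callers that explicitly pass ModelType.TFLITE get a .tflite path. A small illustration, not part of the patch; the directory names are hypothetical and it assumes the models package is importable, e.g. when run from machine-learning/export/:

    from models.util import ModelType, get_model_path

    # Default stays ONNX, so the existing ONNX export code is unaffected ...
    print(get_model_path("out/ViT-B-32__openai/visual"))
    # out/ViT-B-32__openai/visual/model.onnx

    # ... while the TF-Lite path is opted into explicitly. The helper also
    # creates the output directory as a side effect.
    print(get_model_path("out/clip-vit-base-patch32/visual", ModelType.TFLITE))
    # out/clip-vit-base-patch32/visual/model.tflite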
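
Not part of the patch, but a quick way to sanity-check the exported artifacts: each model.tflite carries a single signature that can be exercised with tf.lite.Interpreter. The sketch below is based on assumptions rather than anything the patch guarantees: that the visual tower of openai/clip-vit-base-patch32 was exported to output/visual/model.tflite (a hypothetical path), and that the signature's input key matches the wrapper's parameter name, input.

    import numpy as np
    import tensorflow as tf

    # Each exported .tflite file contains exactly one signature:
    # "encode_image" for the visual tower, "encode_text" for the textual one.
    interpreter = tf.lite.Interpreter(model_path="output/visual/model.tflite")
    encode_image = interpreter.get_signature_runner("encode_image")

    # The signature is fixed to BATCH_SIZE = 2 with channels-first (NCHW) input,
    # so a single image has to be duplicated or zero-padded to fill the batch.
    pixels = np.random.rand(2, 3, 224, 224).astype(np.float32)
    outputs = encode_image(input=pixels)

    # The runner returns a dict of output name -> array; for ViT-B/32 the
    # image embeddings should come out with shape (2, 512).
    for name, value in outputs.items():
        print(name, value.shape)

The textual tower works the same way with the "encode_text" signature and an int32 token batch of shape (2, 77), matching the default context_length.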