From 9a81b6d19da4dc97b2e7bd94149b62feb981a3bc Mon Sep 17 00:00:00 2001
From: Jael Gu
Date: Mon, 15 Aug 2022 16:39:31 +0800
Subject: [PATCH] Update test

Signed-off-by: Jael Gu
---
 README.md         | 24 +++++++++++++--
 nn_fingerprint.py | 76 ++++++++++++++++++++++++++++++-----------------
 performance.md    | 46 ++++++++++++++++++++++++++++
 test.py           | 19 +++++++++---
 4 files changed, 130 insertions(+), 35 deletions(-)
 create mode 100644 performance.md

diff --git a/README.md b/README.md
index c186644..adb3953 100644
--- a/README.md
+++ b/README.md
@@ -56,7 +56,7 @@ import towhee
 
 Create the operator via the following factory method
 
-***audio_embedding.nnfp(params=None, checkpoint_path=None, framework='pytorch')***
+***audio_embedding.nnfp(params=None, model_path=None, framework='pytorch')***
 
 **Parameters:**
 
@@ -64,9 +64,10 @@
 *params: dict*
 
 A dictionary of model parameters. If None, it will use default parameters to create model.
 
-*checkpoint_path: str*
+*model_path: str*
 
-The path to model weights. If None, it will load default model weights.
+The path to the model. If None, it will load default model weights.
+When the path ends with '.onnx', the operator will use ONNX inference.
 
 *framework: str*
@@ -79,6 +80,8 @@ Default value is "pytorch" since the model is implemented in Pytorch.
 
 An audio embedding operator generates vectors in numpy.ndarray given towhee audio frames.
 
+***\_\_call\_\_(data)***
+
 **Parameters:**
 
 *data: List[towhee.types.audio_frame.AudioFrame]*
@@ -93,3 +96,18 @@ The audio input should be at least 1s.
 
 Audio embeddings in shape (num_clips, 128).
 Each embedding stands for features of an audio clip with length of 1s.
+
+
+***save_model(format='pytorch', path='default')***
+
+**Parameters:**
+
+*format: str*
+
+Format used to save the model, defaults to 'pytorch'.
+Accepted formats: 'pytorch', 'torchscript', 'onnx', 'tensorrt' (in progress).
+
+*path: str*
+
+Path to save the model, defaults to 'default'.
+The default path is under 'saved' in the same directory as the operator cache.
\ No newline at end of file
diff --git a/nn_fingerprint.py b/nn_fingerprint.py
index 5f9d7ce..3623a7d 100644
--- a/nn_fingerprint.py
+++ b/nn_fingerprint.py
@@ -22,6 +22,7 @@ from typing import List
 import torch
 import numpy
 import resampy
+import onnxruntime
 
 from towhee.operator.base import NNOperator
 from towhee import register
@@ -43,8 +44,9 @@ class NNFingerprint(NNOperator):
 
     def __init__(self,
                  params: dict = None,
-                 checkpoint_path: str = None,
-                 framework: str = 'pytorch'):
+                 model_path: str = None,
+                 framework: str = 'pytorch',
+                 ):
         super().__init__(framework=framework)
         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
         if params is None:
@@ -52,38 +54,47 @@
         else:
             self.params = params
 
-        dim = self.params['dim']
-        h = self.params['h']
-        u = self.params['u']
-        f_bin = self.params['n_mels']
-        n_seg = int(self.params['segment_size'] * self.params['sample_rate'])
-        t = (n_seg + self.params['hop_length'] - 1) // self.params['hop_length']
-
-        log.info('Creating model...')
-        self.model = NNFp(
-            dim=dim, h=h, u=u,
-            in_f=f_bin, in_t=t,
-            fuller=self.params['fuller'],
-            activation=self.params['activation']
-        ).to(self.device)
-
-        log.info('Loading weights...')
-        if checkpoint_path is None:
+        log.info('Loading model...')
+        if model_path is None:
             path = str(Path(__file__).parent)
-            checkpoint_path = os.path.join(path, 'saved_model', 'pfann_fma_m.pt')
-        state_dict = torch.load(checkpoint_path, map_location=self.device)
-        if isinstance(state_dict, torch.nn.Module):
-            self.model = state_dict
+            model_path = os.path.join(path, 'saved_model', 'pfann_fma_m.pt')
+        if model_path.endswith('.onnx'):
+            log.warning('Using onnx.')
+            self.model = onnxruntime.InferenceSession(model_path)
         else:
-            self.model.load_state_dict(state_dict)
-        self.model.eval()
+            state_dict = torch.load(model_path, map_location=self.device)
+            if isinstance(state_dict, torch.nn.Module):
+                self.model = state_dict
+            else:
+                dim = self.params['dim']
+                h = self.params['h']
+                u = self.params['u']
+                f_bin = self.params['n_mels']
+                n_seg = int(self.params['segment_size'] * self.params['sample_rate'])
+                t = (n_seg + self.params['hop_length'] - 1) // self.params['hop_length']
+                log.info('Creating model with parameters...')
+                self.model = NNFp(
+                    dim=dim, h=h, u=u,
+                    in_f=f_bin, in_t=t,
+                    fuller=self.params['fuller'],
+                    activation=self.params['activation']
+                ).to(self.device)
+                self.model.load_state_dict(state_dict)
+            self.model.eval()
         log.info('Model is loaded.')
 
     def __call__(self, data: List[AudioFrame]) -> numpy.ndarray:
         audio_tensors = self.preprocess(data).to(self.device)
         # print(audio_tensors.shape)
-        features = self.model(audio_tensors)
-        return features.detach().cpu().numpy()
+        if isinstance(self.model, onnxruntime.InferenceSession):
+            audio_numpy = audio_tensors.detach().cpu().numpy() if audio_tensors.requires_grad \
+                else audio_tensors.cpu().numpy()
+            ort_inputs = {self.model.get_inputs()[0].name: audio_numpy}
+            outs = self.model.run(None, ort_inputs)[0]
+        else:
+            features = self.model(audio_tensors)
+            outs = features.detach().cpu().numpy()
+        return outs
 
     def preprocess(self, frames: List[AudioFrame]):
         sr = frames[0].sample_rate
@@ -137,7 +148,7 @@
             log.warning('Converting float dtype from %s to %s.', wav.dtype, dtype)
         return wav.astype(dtype)
 
-    def save_model(self, format: str='pytorch', path: str = 'default'):
+    def save_model(self, format: str = 'pytorch', path: str = 'default'):
         if path == 'default':
             path = str(Path(__file__).parent)
             path = os.path.join(path, 'saved', format)
@@ -156,6 +167,9 @@
             try:
                 jit_model = torch.jit.script(self.model)
             except Exception:
+                log.warning(
+                    'Failed to directly export as torchscript. '
+                    'Using dummy input in shape of %s now.', dummy_input.shape)
                 jit_model = torch.jit.trace(self.model, dummy_input, strict=False)
             torch.jit.save(jit_model, path)
         except Exception as e:
@@ -181,3 +195,9 @@
         # todo: elif format == 'tensorrt':
         else:
             log.error(f'Unsupported format "{format}".')
+
+    def input_schema(self):
+        return [(AudioFrame, (1024,))]
+
+    def output_schema(self):
+        return [(numpy.ndarray, (-1, self.params['dim']))]
diff --git a/performance.md b/performance.md
new file mode 100644
index 0000000..5e4830c
--- /dev/null
+++ b/performance.md
@@ -0,0 +1,46 @@
+# Inference Performance
+
+## Test Script
+
+```python
+from towhee import ops
+import time
+
+decode = ops.audio_decode.ffmpeg()
+audio = [x[0] for x in decode('path/to/test.wav')]
+
+op = ops.audio_embedding.nnfp()
+# op = ops.audio_embedding.nnfp(
+#     model_path='path/to/torchscript/model.pt')
+# op = ops.audio_embedding.nnfp(
+#     model_path='path/to/model.onnx')
+
+
+start = time.time()
+for _ in range(100):
+    embs = op(audio)
+    assert embs.shape == (10, 128)
+end = time.time()
+
+print((end - start) / 100)
+```
+
+## Results
+
+- Device: macOS, 2.3 GHz Quad-Core Intel Core i7, 8 CPUs
+- Input: 10s audio, looped 100 times
+
+| inference method | memory usage | avg time |
+| -- | -- | -- |
+| pytorch | 0.3G | 0.451s |
+| torchscript | 0.3G | 0.470s |
+| onnx | 0.3G | 0.378s |
+
+- Device: macOS, 2.3 GHz Quad-Core Intel Core i7, 8 CPUs
+- Input: 188s audio, looped 100 times
+
+| inference method | memory usage | avg time |
+| -- | -- | -- |
+| pytorch | 2.6G | 8.162s |
+| torchscript | 2.8G | 7.507s |
+| onnx | 1.7G | 6.769s |
diff --git a/test.py b/test.py
index da05983..62ab8a4 100644
--- a/test.py
+++ b/test.py
@@ -1,4 +1,7 @@
 from towhee import ops
+
+import warnings
+
 import torch
 import numpy
 import onnx
@@ -17,11 +20,19 @@ op = ops.audio_embedding.nnfp()
 out0 = op.get_op().model(audio)
 # print(out0)
 
+# Test Pytorch
 op.get_op().save_model(format='pytorch')
 op = ops.audio_embedding.nnfp(checkpoint_path='./saved/pytorch/nnfp.pt')
 out1 = op.get_op().model(audio)
-assert((out0 == out1).all())
+assert ((out0 == out1).all())
+
+# Test Torchscript
+op.get_op().save_model(format='torchscript')
+op = ops.audio_embedding.nnfp(model_path='./saved/torchscript/nnfp.pt')
+out2 = op.get_op().model(audio)
+assert ((out0 == out2).all())
 
+# Test ONNX
 op.get_op().save_model(format='onnx')
 op = ops.audio_embedding.nnfp()
 onnx_model = onnx.load('./saved/onnx/nnfp.onnx')
@@ -30,6 +41,6 @@ onnx.checker.check_model(onnx_model)
 ort_session = onnxruntime.InferenceSession('./saved/onnx/nnfp.onnx')
 ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(audio)}
 ort_outs = ort_session.run(None, ort_inputs)
-out2 = ort_outs[0]
-# print(out2)
-assert(numpy.allclose(to_numpy(out0), out2, rtol=1e-03, atol=1e-05))
+out3 = ort_outs[0]
+# print(out3)
+assert (numpy.allclose(to_numpy(out0), out3, rtol=1e-03, atol=1e-05))
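
For completeness, the TorchScript artifact produced by `save_model(format='torchscript')` can also be checked outside the Towhee operator, mirroring the ONNX round-trip in test.py. This is a minimal sketch, assuming the file is written to `./saved/torchscript/nnfp.pt` and that `audio`, `out0`, and `to_numpy` are prepared exactly as in test.py:

```python
import numpy
import torch

# Load the TorchScript module directly, bypassing the Towhee operator wrapper.
jit_model = torch.jit.load('./saved/torchscript/nnfp.pt')
jit_model.eval()

# Run inference without tracking gradients, as in the eager-mode checks above.
with torch.no_grad():
    jit_out = jit_model(audio)

# A scripted model usually matches eager outputs exactly; a tolerance-based
# comparison is the safer check in case tracing was used as the fallback.
assert numpy.allclose(to_numpy(out0), to_numpy(jit_out), rtol=1e-03, atol=1e-05)
```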