From 9a81b6d19da4dc97b2e7bd94149b62feb981a3bc Mon Sep 17 00:00:00 2001
From: Jael Gu
Date: Mon, 15 Aug 2022 16:39:31 +0800
Subject: [PATCH] Update test

Signed-off-by: Jael Gu
---
 README.md         | 24 +++++++++++++--
 nn_fingerprint.py | 76 ++++++++++++++++++++++++++++++-----------------
 performance.md    | 46 ++++++++++++++++++++++++++++
 test.py           | 19 +++++++++---
 4 files changed, 130 insertions(+), 35 deletions(-)
 create mode 100644 performance.md

diff --git a/README.md b/README.md
index c186644..adb3953 100644
--- a/README.md
+++ b/README.md
@@ -56,7 +56,7 @@ import towhee
 
 Create the operator via the following factory method
 
-***audio_embedding.nnfp(params=None, checkpoint_path=None, framework='pytorch')***
+***audio_embedding.nnfp(params=None, model_path=None, framework='pytorch')***
 
 **Parameters:**
 
@@ -64,9 +64,10 @@
 *params: dict*
 
 A dictionary of model parameters. If None, it will use default parameters to create model.
 
-*checkpoint_path: str*
+*model_path: str*
 
-The path to model weights. If None, it will load default model weights.
+The path to the model. If None, it will load default model weights.
+When the path ends with '.onnx', the operator will use ONNX inference.
 
 *framework: str*
@@ -79,6 +80,8 @@ Default value is "pytorch" since the model is implemented in Pytorch.
 
 An audio embedding operator generates vectors in numpy.ndarray given towhee audio frames.
 
+***\_\_call\_\_(data)***
+
 **Parameters:**
 
 *data: List[towhee.types.audio_frame.AudioFrame]*
@@ -93,3 +96,18 @@ The audio input should be at least 1s.
 
 Audio embeddings in shape (num_clips, 128).
 Each embedding stands for features of an audio clip with length of 1s.
+
+
+***save_model(format='pytorch', path='default')***
+
+**Parameters:**
+
+*format: str*
+
+Format used to save the model, defaults to 'pytorch'.
+Accepted formats: 'pytorch', 'torchscript', 'onnx', 'tensorrt' (in progress).
+
+*path: str*
+
+Path to save the model, defaults to 'default'.
+The default path is under 'saved' in the same directory as the operator cache.
\ No newline at end of file
diff --git a/nn_fingerprint.py b/nn_fingerprint.py
index 5f9d7ce..3623a7d 100644
--- a/nn_fingerprint.py
+++ b/nn_fingerprint.py
@@ -22,6 +22,7 @@ from typing import List
 import torch
 import numpy
 import resampy
+import onnxruntime
 
 from towhee.operator.base import NNOperator
 from towhee import register
@@ -43,8 +44,9 @@ class NNFingerprint(NNOperator):
 
     def __init__(self,
                  params: dict = None,
-                 checkpoint_path: str = None,
-                 framework: str = 'pytorch'):
+                 model_path: str = None,
+                 framework: str = 'pytorch',
+                 ):
         super().__init__(framework=framework)
         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
         if params is None:
@@ -52,38 +54,47 @@
         else:
             self.params = params
 
-        dim = self.params['dim']
-        h = self.params['h']
-        u = self.params['u']
-        f_bin = self.params['n_mels']
-        n_seg = int(self.params['segment_size'] * self.params['sample_rate'])
-        t = (n_seg + self.params['hop_length'] - 1) // self.params['hop_length']
-
-        log.info('Creating model...')
-        self.model = NNFp(
-            dim=dim, h=h, u=u,
-            in_f=f_bin, in_t=t,
-            fuller=self.params['fuller'],
-            activation=self.params['activation']
-        ).to(self.device)
-
-        log.info('Loading weights...')
-        if checkpoint_path is None:
+        log.info('Loading model...')
+        if model_path is None:
             path = str(Path(__file__).parent)
-            checkpoint_path = os.path.join(path, 'saved_model', 'pfann_fma_m.pt')
-        state_dict = torch.load(checkpoint_path, map_location=self.device)
-        if isinstance(state_dict, torch.nn.Module):
-            self.model = state_dict
+            model_path = os.path.join(path, 'saved_model', 'pfann_fma_m.pt')
+        if model_path.endswith('.onnx'):
+            log.warning('Using onnx.')
+            self.model = onnxruntime.InferenceSession(model_path)
         else:
-            self.model.load_state_dict(state_dict)
-        self.model.eval()
+            state_dict = torch.load(model_path, map_location=self.device)
+            if isinstance(state_dict, torch.nn.Module):
+                self.model = state_dict
+            else:
+                dim = self.params['dim']
+                h = self.params['h']
+                u = self.params['u']
+                f_bin = self.params['n_mels']
+                n_seg = int(self.params['segment_size'] * self.params['sample_rate'])
+                t = (n_seg + self.params['hop_length'] - 1) // self.params['hop_length']
+                log.info('Creating model with parameters...')
+                self.model = NNFp(
+                    dim=dim, h=h, u=u,
+                    in_f=f_bin, in_t=t,
+                    fuller=self.params['fuller'],
+                    activation=self.params['activation']
+                ).to(self.device)
+                self.model.load_state_dict(state_dict)
+            self.model.eval()
         log.info('Model is loaded.')
 
     def __call__(self, data: List[AudioFrame]) -> numpy.ndarray:
         audio_tensors = self.preprocess(data).to(self.device)
         # print(audio_tensors.shape)
-        features = self.model(audio_tensors)
-        return features.detach().cpu().numpy()
+        if isinstance(self.model, onnxruntime.InferenceSession):
+            audio_numpy = audio_tensors.detach().cpu().numpy() if audio_tensors.requires_grad \
+                else audio_tensors.cpu().numpy()
+            ort_inputs = {self.model.get_inputs()[0].name: audio_numpy}
+            outs = self.model.run(None, ort_inputs)[0]
+        else:
+            features = self.model(audio_tensors)
+            outs = features.detach().cpu().numpy()
+        return outs
 
     def preprocess(self, frames: List[AudioFrame]):
         sr = frames[0].sample_rate
@@ -137,7 +148,7 @@
             log.warning('Converting float dtype from %s to %s.', wav.dtype, dtype)
         return wav.astype(dtype)
 
-    def save_model(self, format: str='pytorch', path: str = 'default'):
+    def save_model(self, format: str = 'pytorch', path: str = 'default'):
         if path == 'default':
             path = str(Path(__file__).parent)
             path = os.path.join(path, 'saved', format)
@@ -156,6 +167,9 @@
             try:
                 jit_model = torch.jit.script(self.model)
             except Exception:
+                log.warning(
+                    'Failed to directly export as torchscript. '
+                    'Using dummy input in shape of %s now.', dummy_input.shape)
                 jit_model = torch.jit.trace(self.model, dummy_input, strict=False)
             torch.jit.save(jit_model, path)
         except Exception as e:
@@ -181,3 +195,9 @@
         # todo: elif format == 'tensorrt':
         else:
             log.error(f'Unsupported format "{format}".')
+
+    def input_schema(self):
+        return [(AudioFrame, (1024,))]
+
+    def output_schema(self):
+        return [(numpy.ndarray, (-1, self.params['dim']))]
diff --git a/performance.md b/performance.md
new file mode 100644
index 0000000..5e4830c
--- /dev/null
+++ b/performance.md
@@ -0,0 +1,46 @@
+# Inference Performance
+
+## Test Script
+
+```python
+from towhee import ops
+import time
+
+decode = ops.audio_decode.ffmpeg()
+audio = [x[0] for x in decode('path/to/test.wav')]
+
+op = ops.audio_embedding.nnfp()
+# op = ops.audio_embedding.nnfp(
+#     model_path='path/to/torchscript/model.pt')
+# op = ops.audio_embedding.nnfp(
+#     model_path='path/to/model.onnx')
+
+
+start = time.time()
+for _ in range(100):
+    embs = op(audio)
+    assert embs.shape == (10, 128)
+end = time.time()
+
+print((end - start) / 100)
+```
+
+## Results
+
+- Device: macOS, 2.3 GHz Quad-Core Intel Core i7, 8 CPUs
+- Input: 10s audio, looped 100 times
+
+| inference method | memory usage | avg time |
+| -- | -- | -- |
+| pytorch | 0.3G | 0.451s |
+| torchscript | 0.3G | 0.470s |
+| onnx | 0.3G | 0.378s |
+
+- Device: macOS, 2.3 GHz Quad-Core Intel Core i7, 8 CPUs
+- Input: 188s audio, looped 100 times
+
+| inference method | memory usage | avg time |
+| -- | -- | -- |
+| pytorch | 2.6G | 8.162s |
+| torchscript | 2.8G | 7.507s |
+| onnx | 1.7G | 6.769s |
diff --git a/test.py b/test.py
index da05983..62ab8a4 100644
--- a/test.py
+++ b/test.py
@@ -1,4 +1,7 @@
 from towhee import ops
+
+import warnings
+
 import torch
 import numpy
 import onnx
@@ -17,11 +20,19 @@ op = ops.audio_embedding.nnfp()
 out0 = op.get_op().model(audio)
 # print(out0)
 
+# Test Pytorch
 op.get_op().save_model(format='pytorch')
 op = ops.audio_embedding.nnfp(checkpoint_path='./saved/pytorch/nnfp.pt')
 out1 = op.get_op().model(audio)
-assert((out0 == out1).all())
+assert ((out0 == out1).all())
+
+# Test Torchscript
+op.get_op().save_model(format='torchscript')
+op = ops.audio_embedding.nnfp(model_path='./saved/torchscript/nnfp.pt')
+out2 = op.get_op().model(audio)
+assert ((out0 == out2).all())
 
+# Test ONNX
 op.get_op().save_model(format='onnx')
 op = ops.audio_embedding.nnfp()
 onnx_model = onnx.load('./saved/onnx/nnfp.onnx')
@@ -30,6 +41,6 @@ onnx.checker.check_model(onnx_model)
 ort_session = onnxruntime.InferenceSession('./saved/onnx/nnfp.onnx')
 ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(audio)}
 ort_outs = ort_session.run(None, ort_inputs)
-out2 = ort_outs[0]
-# print(out2)
-assert(numpy.allclose(to_numpy(out0), out2, rtol=1e-03, atol=1e-05))
+out3 = ort_outs[0]
+# print(out3)
+assert (numpy.allclose(to_numpy(out0), out3, rtol=1e-03, atol=1e-05))
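
For completeness, the TorchScript artifact produced by `save_model(format='torchscript')` can also be checked outside the Towhee operator, mirroring the ONNX round-trip in test.py. This is a minimal sketch, assuming the file is written to `./saved/torchscript/nnfp.pt` and that `audio`, `out0`, and `to_numpy` are prepared exactly as in test.py:

```python
import numpy
import torch

# Load the TorchScript module directly, bypassing the Towhee operator wrapper.
jit_model = torch.jit.load('./saved/torchscript/nnfp.pt')
jit_model.eval()

# Run inference without tracking gradients, as in the eager-mode checks above.
with torch.no_grad():
    jit_out = jit_model(audio)

# A scripted model usually matches eager outputs exactly; a tolerance-based
# comparison is the safer check in case tracing was used as the fallback.
assert numpy.allclose(to_numpy(out0), to_numpy(jit_out), rtol=1e-03, atol=1e-05)
```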