@@ -1,2 +1,95 @@
# nnfp

# Audio Embedding with Neural Network Fingerprint

*Author: [Jael Gu](https://github.com/jaelgu)*

<br />

## Description

The audio embedding operator converts an input audio into a dense vector which can be used to represent the audio clip's semantics.
Each vector represents an audio clip with a fixed length of around 1s.
This operator generates audio embeddings with the fingerprinting method introduced by [Neural Audio Fingerprint](https://arxiv.org/abs/2010.11910).
The model is implemented in PyTorch.
We have also trained the nnfp model on the [FMA dataset](https://github.com/mdeff/fma) (plus some noise audio) and share the pretrained weights in this operator.
The nnfp operator is suitable for generating audio fingerprints.

<br />
## Code Example

Generate embeddings for the audio "test.wav".

*Write the pipeline in simplified style:*

```python
import towhee

(
    towhee.glob('test.wav')
          .audio_decode.ffmpeg()
          .runas_op(func=lambda x: [y[0] for y in x])
          .audio_embedding.nnfp()  # use default model
          .show()
)
```
<img src="./result1.png" width="800px"/>

*Write the same pipeline with explicit input and output names:*

```python
import towhee

(
    towhee.glob['path']('test.wav')
          .audio_decode.ffmpeg['path', 'frames']()
          .runas_op['frames', 'frames'](func=lambda x: [y[0] for y in x])
          .audio_embedding.nnfp['frames', 'vecs']()
          .select['path', 'vecs']()
          .show()
)
```
<img src="./result2.png" width="800px"/>

<br />

## Factory Constructor

Create the operator via the following factory method:

***audio_embedding.nnfp(params=None, checkpoint_path=None, framework='pytorch')***

**Parameters:**

*params: dict*

A dictionary of model parameters. If None, the operator uses the default parameters to create the model.

*checkpoint_path: str*

The path to model weights. If None, the operator loads the default pretrained weights.

*framework: str*

The framework of the model implementation. The default value is 'pytorch' since the model is implemented in PyTorch.
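
For example, the snippet below is a minimal sketch of customizing the constructor via the towhee `ops` interface (assuming the towhee 0.x API shown in the pipelines above); `my_weights.pt` is a hypothetical placeholder for your own checkpoint file:

```python
import towhee

# A minimal sketch, assuming the towhee 0.x ops interface;
# 'my_weights.pt' is a hypothetical placeholder checkpoint.
op = towhee.ops.audio_embedding.nnfp(checkpoint_path='my_weights.pt')
```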

<br />

## Interface

An audio embedding operator generates vectors in numpy.ndarray given towhee audio frames.

**Parameters:**

*data: List[towhee.types.audio_frame.AudioFrame]*

Input audio data as a list of towhee audio frames.
The input data should represent an audio clip longer than 1s.

**Returns:**

*numpy.ndarray*

Audio embeddings in shape (num_clips, 128).
Each embedding stands for the features of an audio clip with a length of 1s.
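
As a sketch of this interface, the operator can also be called directly on decoded frames outside a pipeline. This assumes the towhee 0.x `ops` interface and the same local `test.wav` used above; the decoder is assumed to yield items whose first element is an AudioFrame, matching the `runas_op` lambda in the pipelines above:

```python
import towhee

# A minimal sketch, assuming the towhee 0.x ops interface and a local 'test.wav'.
decoder = towhee.ops.audio_decode.ffmpeg()
frames = [item[0] for item in decoder('test.wav')]  # List[AudioFrame]

op = towhee.ops.audio_embedding.nnfp()
vecs = op(frames)
print(vecs.shape)  # (num_clips, 128): roughly one clip per second of audio
```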
@@ -0,0 +1,19 @@
# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .nn_fingerprint import NNFingerprint


def nnfp(params: dict = None, checkpoint_path: str = None, framework: str = 'pytorch'):
    # Forward the factory arguments documented in the README to the operator.
    return NNFingerprint(params=params, checkpoint_path=checkpoint_path, framework=framework)
@@ -0,0 +1,36 @@
# Parameter configs for nnfp, inspired by https://github.com/stdio2016/pfann
#
# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


default_params = {
    "dim": 128,             # output embedding dimension
    "h": 1024,              # projection hidden size (see the NNFp paper)
    "u": 32,                # projection split units (see the NNFp paper)
    "fuller": True,
    "activation": "relu",
    "sample_rate": 8000,    # Hz; audio is resampled to this rate
    "window_length": 1024,  # STFT window length in samples
    "hop_length": 256,      # STFT hop length in samples
    "n_mels": 256,          # mel frequency bins
    "f_min": 300,           # lower bound of the mel filterbank (Hz)
    "f_max": 4000,          # upper bound of the mel filterbank (Hz)
    "segment_size": 1,      # seconds of audio per clip
    "hop_size": 1,          # seconds between consecutive clips
    "frame_shift_mul": 1,
    "naf_mode": False,
    "mel_log": "log",
    "spec_norm": "l2"
}
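

# Illustrative only: how the model's expected input shape follows from the
# defaults above, mirroring the arithmetic in NNFingerprint.__init__.
if __name__ == '__main__':
    n_seg = int(default_params['segment_size'] * default_params['sample_rate'])    # 8000 samples per 1s clip
    t = (n_seg + default_params['hop_length'] - 1) // default_params['hop_length']  # 32 time frames
    print(default_params['n_mels'], t)  # input mel spectrogram: 256 bins x 32 frames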
@@ -0,0 +1,135 @@
# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import warnings
from pathlib import Path
from typing import List

import torch
import numpy
import resampy

from towhee.operator.base import NNOperator
from towhee import register
from towhee.types.audio_frame import AudioFrame
from towhee.models.nnfp import NNFp
from towhee.models.utils.audio_preprocess import preprocess_wav, MelSpec

from .configs import default_params

warnings.filterwarnings('ignore')
log = logging.getLogger()

@register(output_schema=['vecs'])
class NNFingerprint(NNOperator):
    """
    Audio embedding operator using Neural Network Fingerprint.
    """

    def __init__(self,
                 params: dict = None,
                 checkpoint_path: str = None,
                 framework: str = 'pytorch'):
        super().__init__(framework=framework)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if params is None:
            self.params = default_params
        else:
            self.params = params

        dim = self.params['dim']
        h = self.params['h']
        u = self.params['u']
        f_bin = self.params['n_mels']  # frequency bins of the input spectrogram
        # Samples per segment, and the resulting number of spectrogram time frames.
        n_seg = int(self.params['segment_size'] * self.params['sample_rate'])
        t = (n_seg + self.params['hop_length'] - 1) // self.params['hop_length']

        log.info('Creating model...')
        self.model = NNFp(
            dim=dim, h=h, u=u,
            in_f=f_bin, in_t=t,
            fuller=self.params['fuller'],
            activation=self.params['activation']
        ).to(self.device)

        log.info('Loading weights...')
        if checkpoint_path is None:
            path = str(Path(__file__).parent)
            checkpoint_path = os.path.join(path, 'checkpoints/pfann_fma_m.pt')
        state_dict = torch.load(checkpoint_path, map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.model.eval()
        log.info('Model is loaded.')

    def __call__(self, data: List[AudioFrame]) -> numpy.ndarray:
        audio_tensors = self.preprocess(data).to(self.device)
        features = self.model(audio_tensors)
        return features.detach().cpu().numpy()

    def preprocess(self, frames: List[AudioFrame]):
        sr = frames[0].sample_rate
        layout = frames[0].layout
        if layout == 'stereo':
            # Interleaved stereo samples: reshape each frame to (n, 2),
            # stack all frames, then transpose to (2, num_samples).
            frames = [frame.reshape(-1, 2) for frame in frames]
            audio = numpy.vstack(frames).transpose()
        else:
            # Mono: concatenate frames into a single (1, num_samples) array.
            audio = numpy.hstack(frames)
            audio = audio[None, :]

        audio = self.int2float(audio)

        # Resample to the model's expected sample rate if necessary.
        if sr != self.params['sample_rate']:
            audio = resampy.resample(audio, sr, self.params['sample_rate'])

        # Split the waveform into fixed-length segments.
        wav = preprocess_wav(audio,
                             segment_size=int(self.params['sample_rate'] * self.params['segment_size']),
                             hop_size=int(self.params['sample_rate'] * self.params['hop_size']),
                             frame_shift_mul=self.params['frame_shift_mul']).to(self.device)
        wav = wav.to(torch.float32)
        # Convert each segment into a mel spectrogram.
        mel = MelSpec(sample_rate=self.params['sample_rate'],
                      window_length=self.params['window_length'],
                      hop_length=self.params['hop_length'],
                      f_min=self.params['f_min'],
                      f_max=self.params['f_max'],
                      n_mels=self.params['n_mels'],
                      naf_mode=self.params['naf_mode'],
                      mel_log=self.params['mel_log'],
                      spec_norm=self.params['spec_norm']).to(self.device)
        wav = mel(wav)
        return wav

    @staticmethod
    def int2float(wav: numpy.ndarray, dtype: str = 'float64'):
        """
        Convert integer audio samples to float.
        The input dtype is expected to be an integer type.
        The output dtype is controlled by the parameter `dtype`, which defaults to 'float64'.

        The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
        """
        dtype = numpy.dtype(dtype)
        assert dtype.kind == 'f'

        if wav.dtype.kind in 'iu':
            # Scale signed/unsigned integers into [-1.0, 1.0) around the type's midpoint.
            ii = numpy.iinfo(wav.dtype)
            abs_max = 2 ** (ii.bits - 1)
            offset = ii.min + abs_max
            return (wav.astype(dtype) - offset) / abs_max
        else:
            log.warning('Converting float dtype from %s to %s.', wav.dtype, dtype)
            return wav.astype(dtype)
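

# Illustrative only: int2float scales signed-integer audio into [-1.0, 1.0);
# for int16, abs_max is 32768 and the offset is 0.
if __name__ == '__main__':
    pcm = numpy.array([-32768, 0, 32767], dtype=numpy.int16)
    print(NNFingerprint.int2float(pcm))  # [-1.  0.  0.99996948]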