Update

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
3 years ago · 5d3f8c394c
4 changed files with 109 additions and 43 deletions
--- a/README.md
+++ b/README.md
@@ -1,10 +1,10 @@
 # Audio Embedding with CLMR

-*Author: Jael Gu*
+*Author: [Jael Gu](https://github.com/jaelgu)*

 <br />

-## Desription
+## Description

 The audio embedding operator converts an input audio into a dense vector which can be used to represent the audio clip's semantics.
 Each vector represents for an audio clip with a fixed length of around 2s.
@@ -22,11 +22,13 @@ Generate embeddings for the audio "test.wav".
 ```python
 import towhee

-towhee.glob('test.wav') \
-      .audio_decode() \
-      .time_window(range=10) \
-      .audio_embedding.clmr() \
-      .show()
+(
+    towhee.glob('test.wav')
+          .audio_decode.ffmpeg()
+          .runas_op(func=lambda x:[y[0] for y in x])
+          .audio_embedding.clmr()
+          .show()
+)
 ```
    | [-2.1045141, 0.55381, 0.4537212, ...] shape=(6, 512) |

@@ -35,12 +37,13 @@ towhee.glob('test.wav') \
 ```python
 import towhee

-towhee.glob['path']('test.wav') \
-      .audio_decode['path', 'audio']() \
-      .time_window['audio', 'frames'](range=10) \
-      .audio_embedding.clmr['frames', 'vecs']() \
-      .select('vecs') \
-      .to_vec()
+(
+    towhee.glob['path']('test.wav')
+          .audio_decode.ffmpeg['path', 'frames']()
+          .runas_op['frames', 'frames'](func=lambda x:[y[0] for y in x])
+          .audio_embedding.clmr['frames', 'vecs']()
+          .show()
+)
 ```
    [array([[-2.1045141 ,  0.55381   ,  0.4537212 , ...,  0.18805158,
          0.3079657 , -1.216063  ],
@@ -74,14 +77,13 @@ Default value is "pytorch" since the model is implemented in Pytorch.

 ## Interface

-An audio embedding operator generates vectors in numpy.ndarray given an audio file path or a [towhee audio](link/to/AudioFrame/api/doc).
+An audio embedding operator generates vectors in numpy.ndarray given towhee audio frames.

 **Parameters:**

-*Union[str, towhee.types.Audio (a sub-class of numpy.ndarray]*
+*data: List[towhee.types.audio_frame.AudioFrame]*

-The audio path or link in string.
-Or audio input data in towhee audio frames.
+Input audio data is a list of towhee audio frames.
 The input data should represent for an audio longer than 2s.

 **Returns**:
--- a/clmr_magnatagatune.py
+++ b/clmr_magnatagatune.py
@@ -11,19 +11,34 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# Copyright 2021 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+

 import os
 import sys
 import logging
 from pathlib import Path
-from typing import Union
+from typing import List

-import torchaudio
+import resampy
 import torch
 import numpy

 from towhee.operator import NNOperator
 from towhee import register
+from towhee.types.audio_frame import AudioFrame

 sys.path.append(str(Path(__file__).parent))
 from clmr_checkpoint import load_encoder_checkpoint
@@ -56,30 +71,57 @@ class ClmrMagnatagatune(NNOperator):
        self.model.eval()
        self.model.to(self.device)

-    def __call__(self, audio: Union[str, numpy.ndarray], sample_rate: int = None) -> numpy.ndarray:
+    def __call__(self, data: List[AudioFrame]) -> numpy.ndarray:
+        audio_tensors = self.preprocess(data).to(self.device)
+        features = self.model(audio_tensors)
+        outs = features.to("cpu")
+        return outs.detach().numpy()
+
+    def __call__(self, data: List[AudioFrame]) -> numpy.ndarray:
        _sr = 22050
        audio_length = 59049

-        if isinstance(audio, str):
-            source = os.path.abspath(audio)
-            audio, sr = torchaudio.load(source)
-        elif isinstance(audio, numpy.ndarray):
-            sr = sample_rate
-            audio = torch.tensor(audio).to(torch.float32)
-
+        sr = data[0].sample_rate
+        layout = data[0].layout
+        if layout == 'stereo':
+            frames = [frame.reshape(-1, 2) for frame in data]
+            audio = numpy.vstack(frames).transpose()
+            # audio = numpy.mean(audio, axis=0)
+            # audio = numpy.expand_dims(audio, 0)
+        else:
+            audio = numpy.hstack(data)
+            audio = numpy.expand_dims(audio, 0)
+
+        audio = self.int2float(audio).astype('float32')
        if sr != _sr:
-            transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=_sr)
-            audio = transform(audio)
-
+            audio = resampy.resample(audio, sr, _sr)
        with torch.no_grad():
+            audio = torch.from_numpy(audio)
            batch = torch.split(audio, audio_length, dim=1)
            batch = torch.cat(batch[:-1])
            batch = batch.unsqueeze(dim=1)
            batch = batch.to(self.device)
            features = numpy.squeeze(self.model(batch))

-        embeddings = features.to("cpu")
-        return embeddings.detach().numpy()
+        return features.to('cpu').detach().numpy()
+
+    def int2float(self, wav: numpy.ndarray, dtype: str = 'float64'):
+        """
+        Convert audio data from int to float.
+        The input dtype must be integers.
+        The output dtype is controlled by the parameter `dtype`, defaults to 'float64'.
+
+        The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
+        """
+        dtype = numpy.dtype(dtype)
+        assert dtype.kind == 'f'
+        if wav.dtype.kind in 'iu':
+            ii = numpy.iinfo(wav.dtype)
+            abs_max = 2 ** (ii.bits - 1)
+            offset = ii.min + abs_max
+            return (wav.astype(dtype) - offset) / abs_max
+        else:
+            return wav.astype(dtype)


 # if __name__ == "__main__":
--- a/clmr_model.py
+++ b/clmr_model.py
@@ -1,5 +1,20 @@
-import torch.nn as nn
-import numpy as np
+# Original implementation by https://github.com/Spijkervet/CLMR
+# Copyright 2021 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from torch import nn


 class Model(nn.Module):
@@ -7,17 +22,9 @@ class Model(nn.Module):
        super(Model, self).__init__()

    def initialize(self, m):
-        if isinstance(m, (nn.Conv1d)):
+        if isinstance(m, nn.Conv1d):
            # nn.init.xavier_uniform_(m.weight)
            # if m.bias is not None:
            #     nn.init.xavier_uniform_(m.bias)

            nn.init.kaiming_uniform_(m.weight, mode="fan_in", nonlinearity="relu")
-
-
-class Identity(nn.Module):
-    def __init__(self):
-        super(Identity, self).__init__()
-
-    def forward(self, x):
-        return x
--- a/sample_cnn.py
+++ b/sample_cnn.py
@@ -1,3 +1,18 @@
+# Original implementation by https://github.com/Spijkervet/CLMR
+# Copyright 2021 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from torch import nn
 from clmr_model import Model