Replace resampy with torchaudio

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
3 years ago · 6644e26cb4
3 changed files with 29 additions and 26 deletions
--- a/README.md
+++ b/README.md
@@ -85,11 +85,11 @@ An audio embedding operator generates vectors in numpy.ndarray given towhee audi
 *data: List[towhee.types.audio_frame.AudioFrame]*

 Input audio data is a list of towhee audio frames.
-The input data should represent for an audio longer than 2s.
+The input data should represent an audio clip longer than 3s.

 **Returns**:

 *numpy.ndarray*

 Audio embeddings in shape (num_clips, 512).
-Each embedding stands for features of an audio clip with length of 2s.
+Each embedding represents the features of an audio clip with a length of 2.7s.
--- a/clmr_magnatagatune.py
+++ b/clmr_magnatagatune.py
@@ -32,8 +32,8 @@ import logging
 from pathlib import Path
 from typing import List

-import resampy
 import torch
+import torchaudio
 import numpy

 from towhee.operator import NNOperator
@@ -85,22 +85,23 @@ class ClmrMagnatagatune(NNOperator):
        layout = data[0].layout
        if layout == 'stereo':
            frames = [frame.reshape(-1, 2) for frame in data]
-            audio = numpy.vstack(frames).transpose()
-            # audio = numpy.mean(audio, axis=0)
-            # audio = numpy.expand_dims(audio, 0)
+            audio = numpy.vstack(frames)
+            audio = numpy.mean(audio, axis=1)
        else:
            audio = numpy.hstack(data)
-            audio = numpy.expand_dims(audio, 0)
+            if len(audio.shape) != 1:
+                audio = audio.squeeze()
+        audio = self.int2float(audio, dtype='float32')
+        audio = torch.from_numpy(audio)

-        audio = self.int2float(audio).astype('float32')
        if sr != _sr:
-            audio = resampy.resample(audio, sr, _sr)
+            resampler = torchaudio.transforms.Resample(sr, _sr, dtype=audio.dtype)
+            audio = resampler(audio)
        with torch.no_grad():
-            audio = torch.from_numpy(audio)
-            batch = torch.split(audio, audio_length, dim=1)
-            batch = torch.cat(batch[:-1])
-            batch = batch.unsqueeze(dim=1)
-            batch = batch.to(self.device)
+            batch = torch.split(audio, audio_length)
+            batch = [x for x in batch if len(x) == audio_length]
+            batch = torch.vstack(batch)
+            batch = batch.unsqueeze(dim=1).to(self.device)
            features = numpy.squeeze(self.model(batch))

        return features.to('cpu').detach().numpy()
@@ -116,22 +117,26 @@ class ClmrMagnatagatune(NNOperator):
        dtype = numpy.dtype(dtype)
        assert dtype.kind == 'f'
        if wav.dtype.kind in 'iu':
-            ii = numpy.iinfo(wav.dtype)
-            abs_max = 2 ** (ii.bits - 1)
-            offset = ii.min + abs_max
-            return (wav.astype(dtype) - offset) / abs_max
+            # ii = numpy.iinfo(wav.dtype)
+            # abs_max = 2 ** (ii.bits - 1)
+            # offset = ii.min + abs_max
+            # return (wav.astype(dtype) - offset) / abs_max
+            if wav.dtype != 'int16':
+                wav = (wav >> 16).astype(numpy.int16)
+            assert wav.dtype == 'int16'
+            wav = (wav / 32768.0).astype(dtype)
+            return wav
        else:
            return wav.astype(dtype)


 # if __name__ == "__main__":
-#     encoder = ClmrMagnatagatune()
+#     import towhee
 #
-#     # audio_path = "/audio/path/or/link"
-#     # vec = encoder(audio_path)
+#     audio_path = "path/to/audio.wav"
+#     frames = towhee.glob(audio_path).audio_decode.ffmpeg(99999).flatten()[0]
 #
-#     audio_data = numpy.zeros((2, 441344))
-#     sample_rate = 44100
-#     vec = encoder(audio_data, sample_rate)
+#     encoder = ClmrMagnatagatune()
+#     vec = encoder(frames)
 #
 #     print(vec.shape)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,3 @@
 torchaudio
 torch
-numpy>=1.19.5
-resampy
 towhee>=0.7.0