From 6644e26cb4be70d9039d9836f79729067a4d3eac Mon Sep 17 00:00:00 2001
From: Jael Gu <mengjia.gu@zilliz.com>
Date: Thu, 29 Sep 2022 15:50:21 +0800
Subject: [PATCH] Replace resampy with torchaudio

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
---
 README.md             |  4 ++--
 clmr_magnatagatune.py | 49 ++++++++++++++++++++++++-------------------
 requirements.txt      |  2 --
 3 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index d65b530..311a7f0 100644
--- a/README.md
+++ b/README.md
@@ -85,11 +85,11 @@ An audio embedding operator generates vectors in numpy.ndarray given towhee audi
 *data: List[towhee.types.audio_frame.AudioFrame]*
 
 Input audio data is a list of towhee audio frames.
-The input data should represent for an audio longer than 2s.
+The input data should represent an audio clip longer than 3s.
 
 **Returns**:
 
 *numpy.ndarray*
 
 Audio embeddings in shape (num_clips, 512).
-Each embedding stands for features of an audio clip with length of 2s.
+Each embedding represents the features of an audio clip 2.7s long.
diff --git a/clmr_magnatagatune.py b/clmr_magnatagatune.py
index dd43e4a..743edc7 100644
--- a/clmr_magnatagatune.py
+++ b/clmr_magnatagatune.py
@@ -32,8 +32,8 @@ import logging
 from pathlib import Path
 from typing import List
 
-import resampy
 import torch
+import torchaudio
 import numpy
 
 from towhee.operator import NNOperator
@@ -85,22 +85,23 @@ class ClmrMagnatagatune(NNOperator):
         layout = data[0].layout
         if layout == 'stereo':
             frames = [frame.reshape(-1, 2) for frame in data]
-            audio = numpy.vstack(frames).transpose()
-            # audio = numpy.mean(audio, axis=0)
-            # audio = numpy.expand_dims(audio, 0)
+            audio = numpy.vstack(frames)
+            audio = numpy.mean(audio, axis=1)
         else:
             audio = numpy.hstack(data)
-            audio = numpy.expand_dims(audio, 0)
+            if len(audio.shape) != 1:
+                audio = audio.squeeze()
+        audio = self.int2float(audio, dtype='float32')
+        audio = torch.from_numpy(audio)
 
-        audio = self.int2float(audio).astype('float32')
         if sr != _sr:
-            audio = resampy.resample(audio, sr, _sr)
+            resampler = torchaudio.transforms.Resample(sr, _sr, dtype=audio.dtype)
+            audio = resampler(audio)
         with torch.no_grad():
-            audio = torch.from_numpy(audio)
-            batch = torch.split(audio, audio_length, dim=1)
-            batch = torch.cat(batch[:-1])
-            batch = batch.unsqueeze(dim=1)
-            batch = batch.to(self.device)
+            batch = torch.split(audio, audio_length)
+            batch = [x for x in batch if len(x) == audio_length]
+            batch = torch.vstack(batch)
+            batch = batch.unsqueeze(dim=1).to(self.device)
             features = numpy.squeeze(self.model(batch))
 
         return features.to('cpu').detach().numpy()
@@ -116,22 +117,26 @@ class ClmrMagnatagatune(NNOperator):
         dtype = numpy.dtype(dtype)
         assert dtype.kind == 'f'
         if wav.dtype.kind in 'iu':
-            ii = numpy.iinfo(wav.dtype)
-            abs_max = 2 ** (ii.bits - 1)
-            offset = ii.min + abs_max
-            return (wav.astype(dtype) - offset) / abs_max
+            # ii = numpy.iinfo(wav.dtype)
+            # abs_max = 2 ** (ii.bits - 1)
+            # offset = ii.min + abs_max
+            # return (wav.astype(dtype) - offset) / abs_max
+            if wav.dtype != 'int16':
+                wav = (wav >> 16).astype(numpy.int16)
+            assert wav.dtype == 'int16'
+            wav = (wav / 32768.0).astype(dtype)
+            return wav
         else:
             return wav.astype(dtype)
 
 
 # if __name__ == "__main__":
-#     encoder = ClmrMagnatagatune()
+#     import towhee
 #
-#     # audio_path = "/audio/path/or/link"
-#     # vec = encoder(audio_path)
+#     audio_path = "path/to/audio.wav"
+#     frames = towhee.glob(audio_path).audio_decode.ffmpeg(99999).flatten()[0]
 #
-#     audio_data = numpy.zeros((2, 441344))
-#     sample_rate = 44100
-#     vec = encoder(audio_data, sample_rate)
+#     encoder = ClmrMagnatagatune()
+#     vec = encoder(frames)
 #
 #     print(vec.shape)
diff --git a/requirements.txt b/requirements.txt
index 01a7c43..de85780 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,3 @@
 torchaudio
 torch
-numpy>=1.19.5
-resampy
 towhee>=0.7.0