@@ -32,8 +32,8 @@ import logging
 from pathlib import Path
 from typing import List
 
-import resampy
 import torch
+import torchaudio
 import numpy
 
 from towhee.operator import NNOperator
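
Note on the import swap: resampy resamples numpy arrays, while torchaudio's Resample transform works on torch tensors, which is why the numpy-to-torch conversion moves earlier in the next hunk. A minimal before/after sketch (rates and data are made up, not from the patch):

    import numpy
    import torch
    import torchaudio

    wav = numpy.random.randn(44100).astype('float32')
    # before: audio = resampy.resample(wav, 44100, 22050)  # numpy in, numpy out
    audio = torch.from_numpy(wav)                          # after: convert to a tensor first
    audio = torchaudio.transforms.Resample(44100, 22050, dtype=audio.dtype)(audio)
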
@@ -85,22 +85,23 @@ class ClmrMagnatagatune(NNOperator):
         layout = data[0].layout
         if layout == 'stereo':
             frames = [frame.reshape(-1, 2) for frame in data]
-            audio = numpy.vstack(frames).transpose()
-            # audio = numpy.mean(audio, axis=0)
-            # audio = numpy.expand_dims(audio, 0)
+            audio = numpy.vstack(frames)
+            audio = numpy.mean(audio, axis=1)
         else:
             audio = numpy.hstack(data)
-            audio = numpy.expand_dims(audio, 0)
+        if len(audio.shape) != 1:
+            audio = audio.squeeze()
+        audio = self.int2float(audio, dtype='float32')
+        audio = torch.from_numpy(audio)
 
-        audio = self.int2float(audio).astype('float32')
         if sr != _sr:
-            audio = resampy.resample(audio, sr, _sr)
+            resampler = torchaudio.transforms.Resample(sr, _sr, dtype=audio.dtype)
+            audio = resampler(audio)
         with torch.no_grad():
-            audio = torch.from_numpy(audio)
-            batch = torch.split(audio, audio_length, dim=1)
-            batch = torch.cat(batch[:-1])
-            batch = batch.unsqueeze(dim=1)
-            batch = batch.to(self.device)
+            batch = torch.split(audio, audio_length)
+            batch = [x for x in batch if len(x) == audio_length]
+            batch = torch.vstack(batch)
+            batch = batch.unsqueeze(dim=1).to(self.device)
             features = numpy.squeeze(self.model(batch))
 
         return features.to('cpu').detach().numpy()
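
Two behavioral changes above are worth a sanity check: stereo input is now mixed down to a 1-D mono signal (numpy.vstack yields an (n_samples, 2) array, and the mean over axis 1 averages the two channels), and the old 2-D split/cat batching becomes a 1-D split that keeps only full-length windows. A rough sketch of the new windowing, assuming CLMR's usual 59049-sample SampleCNN input (the real audio_length comes from the operator, not this note):

    import torch

    audio = torch.randn(200_000)                            # hypothetical mono waveform
    audio_length = 59049                                    # assumed SampleCNN window size
    chunks = torch.split(audio, audio_length)               # last chunk is usually shorter
    chunks = [c for c in chunks if len(c) == audio_length]  # keep full windows only
    batch = torch.vstack(chunks).unsqueeze(dim=1)           # -> torch.Size([3, 1, 59049])
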
@@ -116,22 +117,26 @@ class ClmrMagnatagatune(NNOperator):
         dtype = numpy.dtype(dtype)
         assert dtype.kind == 'f'
         if wav.dtype.kind in 'iu':
-            ii = numpy.iinfo(wav.dtype)
-            abs_max = 2 ** (ii.bits - 1)
-            offset = ii.min + abs_max
-            return (wav.astype(dtype) - offset) / abs_max
+            # ii = numpy.iinfo(wav.dtype)
+            # abs_max = 2 ** (ii.bits - 1)
+            # offset = ii.min + abs_max
+            # return (wav.astype(dtype) - offset) / abs_max
+            if wav.dtype != 'int16':
+                wav = (wav >> 16).astype(numpy.int16)
+            assert wav.dtype == 'int16'
+            wav = (wav / 32768.0).astype(dtype)
+            return wav
         else:
             return wav.astype(dtype)
 
 
 # if __name__ == "__main__":
-#     encoder = ClmrMagnatagatune()
+#     import towhee
 #
-#     # audio_path = "/audio/path/or/link"
-#     # vec = encoder(audio_path)
+#     audio_path = "path/to/audio.wav"
+#     frames = towhee.glob(audio_path).audio_decode.ffmpeg(99999).flatten()[0]
 #
-#     audio_data = numpy.zeros((2, 441344))
-#     sample_rate = 44100
-#     vec = encoder(audio_data, sample_rate)
+#     encoder = ClmrMagnatagatune()
+#     vec = encoder(frames)
 #
 #     print(vec.shape)
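
A quick numeric check of the new int2float, as I read it: int16 samples are scaled by 1/32768 into [-1.0, 1.0), and wider integer formats are assumed to carry the signal in their high 16 bits, hence the >> 16 before scaling (note this replaces the old iinfo-based handling of unsigned and 8-bit input, kept above only as commented-out reference). Hand-picked values:

    import numpy

    wav = numpy.array([-32768, 0, 16384], dtype=numpy.int16)
    print((wav / 32768.0).astype('float32'))            # [-1.   0.   0.5]

    wav32 = numpy.array([1 << 30], dtype=numpy.int32)   # e.g. a 32-bit PCM sample
    print((wav32 >> 16).astype(numpy.int16))            # [16384], then scaled as above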