diff --git a/README.md b/README.md index d65b530..311a7f0 100644 --- a/README.md +++ b/README.md @@ -85,11 +85,11 @@ An audio embedding operator generates vectors in numpy.ndarray given towhee audi *data: List[towhee.types.audio_frame.AudioFrame]* Input audio data is a list of towhee audio frames. -The input data should represent for an audio longer than 2s. +The input data should represent an audio clip longer than 3s. **Returns**: *numpy.ndarray* Audio embeddings in shape (num_clips, 512). -Each embedding stands for features of an audio clip with length of 2s. +Each embedding stands for the features of an audio clip with a length of 2.7s. diff --git a/clmr_magnatagatune.py b/clmr_magnatagatune.py index dd43e4a..743edc7 100644 --- a/clmr_magnatagatune.py +++ b/clmr_magnatagatune.py @@ -32,8 +32,8 @@ import logging from pathlib import Path from typing import List -import resampy import torch +import torchaudio import numpy from towhee.operator import NNOperator @@ -85,22 +85,23 @@ class ClmrMagnatagatune(NNOperator): layout = data[0].layout if layout == 'stereo': frames = [frame.reshape(-1, 2) for frame in data] - audio = numpy.vstack(frames).transpose() - # audio = numpy.mean(audio, axis=0) - # audio = numpy.expand_dims(audio, 0) + audio = numpy.vstack(frames) + audio = numpy.mean(audio, axis=1) else: audio = numpy.hstack(data) - audio = numpy.expand_dims(audio, 0) + if len(audio.shape) != 1: + audio = audio.squeeze() + audio = self.int2float(audio, dtype='float32') + audio = torch.from_numpy(audio) - audio = self.int2float(audio).astype('float32') if sr != _sr: - audio = resampy.resample(audio, sr, _sr) + resampler = torchaudio.transforms.Resample(sr, _sr, dtype=audio.dtype) + audio = resampler(audio) with torch.no_grad(): - audio = torch.from_numpy(audio) - batch = torch.split(audio, audio_length, dim=1) - batch = torch.cat(batch[:-1]) - batch = batch.unsqueeze(dim=1) - batch = batch.to(self.device) + batch = torch.split(audio, audio_length) + batch = [x for x in batch 
if len(x) == audio_length] + batch = torch.vstack(batch) + batch = batch.unsqueeze(dim=1).to(self.device) features = numpy.squeeze(self.model(batch)) return features.to('cpu').detach().numpy() @@ -116,22 +117,26 @@ class ClmrMagnatagatune(NNOperator): dtype = numpy.dtype(dtype) assert dtype.kind == 'f' if wav.dtype.kind in 'iu': - ii = numpy.iinfo(wav.dtype) - abs_max = 2 ** (ii.bits - 1) - offset = ii.min + abs_max - return (wav.astype(dtype) - offset) / abs_max + # ii = numpy.iinfo(wav.dtype) + # abs_max = 2 ** (ii.bits - 1) + # offset = ii.min + abs_max + # return (wav.astype(dtype) - offset) / abs_max + if wav.dtype != 'int16': + wav = (wav >> 16).astype(numpy.int16) + assert wav.dtype == 'int16' + wav = (wav / 32768.0).astype(dtype) + return wav else: return wav.astype(dtype) # if __name__ == "__main__": -# encoder = ClmrMagnatagatune() +# import towhee # -# # audio_path = "/audio/path/or/link" -# # vec = encoder(audio_path) +# audio_path = "path/to/audio.wav" +# frames = towhee.glob(audio_path).audio_decode.ffmpeg(99999).flatten()[0] # -# audio_data = numpy.zeros((2, 441344)) -# sample_rate = 44100 -# vec = encoder(audio_data, sample_rate) +# encoder = ClmrMagnatagatune() +# vec = encoder(frames) # # print(vec.shape) diff --git a/requirements.txt b/requirements.txt index 01a7c43..de85780 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,3 @@ torchaudio torch -numpy>=1.19.5 -resampy towhee>=0.7.0