logo
Browse Source

Replace resampy with torchaudio

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
main
Jael Gu 2 years ago
parent
commit
6644e26cb4
  1. 4
      README.md
  2. 49
      clmr_magnatagatune.py
  3. 2
      requirements.txt

4
README.md

@ -85,11 +85,11 @@ An audio embedding operator generates vectors in numpy.ndarray given towhee audi
*data: List[towhee.types.audio_frame.AudioFrame]*
Input audio data is a list of towhee audio frames.
The input data should represent audio longer than 2s.
The input data should represent audio longer than 3s.
**Returns**:
*numpy.ndarray*
Audio embeddings in shape (num_clips, 512).
Each embedding stands for the features of an audio clip with a length of 2s.
Each embedding stands for the features of an audio clip with a length of 2.7s.

49
clmr_magnatagatune.py

@ -32,8 +32,8 @@ import logging
from pathlib import Path
from typing import List
import resampy
import torch
import torchaudio
import numpy
from towhee.operator import NNOperator
@ -85,22 +85,23 @@ class ClmrMagnatagatune(NNOperator):
layout = data[0].layout
if layout == 'stereo':
frames = [frame.reshape(-1, 2) for frame in data]
audio = numpy.vstack(frames).transpose()
# audio = numpy.mean(audio, axis=0)
# audio = numpy.expand_dims(audio, 0)
audio = numpy.vstack(frames)
audio = numpy.mean(audio, axis=1)
else:
audio = numpy.hstack(data)
audio = numpy.expand_dims(audio, 0)
if len(audio.shape) != 1:
audio = audio.squeeze()
audio = self.int2float(audio, dtype='float32')
audio = torch.from_numpy(audio)
audio = self.int2float(audio).astype('float32')
if sr != _sr:
audio = resampy.resample(audio, sr, _sr)
resampler = torchaudio.transforms.Resample(sr, _sr, dtype=audio.dtype)
audio = resampler(audio)
with torch.no_grad():
audio = torch.from_numpy(audio)
batch = torch.split(audio, audio_length, dim=1)
batch = torch.cat(batch[:-1])
batch = batch.unsqueeze(dim=1)
batch = batch.to(self.device)
batch = torch.split(audio, audio_length)
batch = [x for x in batch if len(x) == audio_length]
batch = torch.vstack(batch)
batch = batch.unsqueeze(dim=1).to(self.device)
features = numpy.squeeze(self.model(batch))
return features.to('cpu').detach().numpy()
@ -116,22 +117,26 @@ class ClmrMagnatagatune(NNOperator):
dtype = numpy.dtype(dtype)
assert dtype.kind == 'f'
if wav.dtype.kind in 'iu':
ii = numpy.iinfo(wav.dtype)
abs_max = 2 ** (ii.bits - 1)
offset = ii.min + abs_max
return (wav.astype(dtype) - offset) / abs_max
# ii = numpy.iinfo(wav.dtype)
# abs_max = 2 ** (ii.bits - 1)
# offset = ii.min + abs_max
# return (wav.astype(dtype) - offset) / abs_max
if wav.dtype != 'int16':
wav = (wav >> 16).astype(numpy.int16)
assert wav.dtype == 'int16'
wav = (wav / 32768.0).astype(dtype)
return wav
else:
return wav.astype(dtype)
# if __name__ == "__main__":
# encoder = ClmrMagnatagatune()
# import towhee
#
# # audio_path = "/audio/path/or/link"
# # vec = encoder(audio_path)
# audio_path = "path/to/audio.wav"
# frames = towhee.glob(audio_path).audio_decode.ffmpeg(99999).flatten()[0]
#
# audio_data = numpy.zeros((2, 441344))
# sample_rate = 44100
# vec = encoder(audio_data, sample_rate)
# encoder = ClmrMagnatagatune()
# vec = encoder(frames)
#
# print(vec.shape)

2
requirements.txt

@ -1,5 +1,3 @@
torchaudio
torch
numpy>=1.19.5
resampy
towhee>=0.7.0

Loading…
Cancel
Save