diff --git a/README.md b/README.md
index 8f2ab36..9908899 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,10 @@
 # Audio Embedding with CLMR
 
-*Author: Jael Gu*
+*Author: [Jael Gu](https://github.com/jaelgu)*
 
-## Desription
+## Description
 
-The audio embedding operator converts an input audio into a dense vector which can be used to represent the audio clip's semantics. Each vector represents for an audio clip with a fixed length of around 2s.
+The audio embedding operator converts an input audio into a dense vector which can be used to represent the audio clip's semantics. Each vector represents an audio clip with a fixed length of around 2s.
@@ -22,11 +22,13 @@
 Generate embeddings for the audio "test.wav".
 
 ```python
 import towhee
 
-towhee.glob('test.wav') \
-      .audio_decode() \
-      .time_window(range=10) \
-      .audio_embedding.clmr() \
-      .show()
+(
+    towhee.glob('test.wav')
+          .audio_decode.ffmpeg()
+          .runas_op(func=lambda x: [y[0] for y in x])
+          .audio_embedding.clmr()
+          .show()
+)
 ```
 
 | [-2.1045141, 0.55381, 0.4537212, ...] shape=(6, 512) |
 
@@ -35,12 +37,13 @@
 ```python
 import towhee
 
-towhee.glob['path']('test.wav') \
-      .audio_decode['path', 'audio']() \
-      .time_window['audio', 'frames'](range=10) \
-      .audio_embedding.clmr['frames', 'vecs']() \
-      .select('vecs') \
-      .to_vec()
+(
+    towhee.glob['path']('test.wav')
+          .audio_decode.ffmpeg['path', 'frames']()
+          .runas_op['frames', 'frames'](func=lambda x: [y[0] for y in x])
+          .audio_embedding.clmr['frames', 'vecs']()
+          .show()
+)
 ```
 
 [array([[-2.1045141 ,  0.55381   ,  0.4537212 , ...,  0.18805158,
          0.3079657 , -1.216063  ],
@@ -74,14 +77,13 @@
-Default value is "pytorch" since the model is implemented in Pytorch.
+Default value is "pytorch" since the model is implemented in PyTorch.
 
 ## Interface
 
-An audio embedding operator generates vectors in numpy.ndarray given an audio file path or a [towhee audio](link/to/AudioFrame/api/doc).
+An audio embedding operator generates vectors in numpy.ndarray given towhee audio frames.
 
 **Parameters:**
 
-*Union[str, towhee.types.Audio (a sub-class of numpy.ndarray]*
+*data: List[towhee.types.audio_frame.AudioFrame]*
 
-The audio path or link in string.
-Or audio input data in towhee audio frames.
+Input audio data is a list of towhee audio frames. The input data should represent an audio clip longer than 2s.
 
 **Returns**:
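For readers following the interface change above, here is what the new frame-based signature implies when the operator is driven outside a pipeline. This is a minimal sketch, assuming `ops.audio_decode.ffmpeg()` and `ops.audio_embedding.clmr()` can be instantiated and called as plain operators and that each decoded row carries the `AudioFrame` as its first element (as the `runas_op` lambda in the pipelines suggests); `decoder`, `frames`, and `vecs` are illustrative names, not part of the repo.

```python
from towhee import ops

# Sketch only: decode "test.wav" into AudioFrames, then embed them directly.
decoder = ops.audio_decode.ffmpeg()
frames = [row[0] for row in decoder('test.wav')]  # List[AudioFrame]

emb_op = ops.audio_embedding.clmr()
vecs = emb_op(frames)  # numpy.ndarray of per-window embeddings
print(vecs.shape)      # e.g. (6, 512) for the sample clip
```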
diff --git a/clmr_magnatagatune.py b/clmr_magnatagatune.py
index b90c0e9..dd43e4a 100644
--- a/clmr_magnatagatune.py
+++ b/clmr_magnatagatune.py
@@ -11,19 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
 import os
 import sys
 import logging
 from pathlib import Path
-from typing import Union
+from typing import List
 
-import torchaudio
+import resampy
 import torch
 import numpy
 
 from towhee.operator import NNOperator
 from towhee import register
+from towhee.types.audio_frame import AudioFrame
 
 sys.path.append(str(Path(__file__).parent))
 from clmr_checkpoint import load_encoder_checkpoint
@@ -56,30 +57,50 @@ class ClmrMagnatagatune(NNOperator):
         self.model.eval()
         self.model.to(self.device)
 
-    def __call__(self, audio: Union[str, numpy.ndarray], sample_rate: int = None) -> numpy.ndarray:
+    def __call__(self, data: List[AudioFrame]) -> numpy.ndarray:
         _sr = 22050
         audio_length = 59049
-        if isinstance(audio, str):
-            source = os.path.abspath(audio)
-            audio, sr = torchaudio.load(source)
-        elif isinstance(audio, numpy.ndarray):
-            sr = sample_rate
-            audio = torch.tensor(audio).to(torch.float32)
-
+        sr = data[0].sample_rate
+        layout = data[0].layout
+        if layout == 'stereo':
+            # Interleaved stereo frames -> shape (2, num_samples)
+            frames = [frame.reshape(-1, 2) for frame in data]
+            audio = numpy.vstack(frames).transpose()
+        else:
+            audio = numpy.hstack(data)
+            audio = numpy.expand_dims(audio, 0)
+
+        audio = self.int2float(audio).astype('float32')
         if sr != _sr:
-            transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=_sr)
-            audio = transform(audio)
-
+            audio = resampy.resample(audio, sr, _sr)
         with torch.no_grad():
+            audio = torch.from_numpy(audio)
             batch = torch.split(audio, audio_length, dim=1)
             batch = torch.cat(batch[:-1])
             batch = batch.unsqueeze(dim=1)
             batch = batch.to(self.device)
             features = numpy.squeeze(self.model(batch))
-            embeddings = features.to("cpu")
-            return embeddings.detach().numpy()
+        return features.to('cpu').detach().numpy()
+
+    def int2float(self, wav: numpy.ndarray, dtype: str = 'float64') -> numpy.ndarray:
+        """
+        Convert audio data from int to float.
+
+        Integer input is scaled from the full integer range to [-1.0, 1.0];
+        float input is only cast to the target dtype.
+        The output dtype is controlled by the parameter `dtype`, which defaults to 'float64'.
+
+        The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
+        """
+        dtype = numpy.dtype(dtype)
+        assert dtype.kind == 'f'
+        if wav.dtype.kind in 'iu':
+            ii = numpy.iinfo(wav.dtype)
+            abs_max = 2 ** (ii.bits - 1)
+            offset = ii.min + abs_max
+            return (wav.astype(dtype) - offset) / abs_max
+        else:
+            return wav.astype(dtype)
 
 
 # if __name__ == "__main__":
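Since `int2float` is pure numpy, its scaling is easy to sanity-check in isolation. The following standalone snippet (not part of the repo) reproduces its arithmetic for int16 input, where the offset works out to zero and samples are simply divided by 2**15:

```python
import numpy

# Reproduce int2float's arithmetic for signed 16-bit samples.
wav = numpy.array([-32768, 0, 16384, 32767], dtype=numpy.int16)

ii = numpy.iinfo(wav.dtype)
abs_max = 2 ** (ii.bits - 1)  # 32768 for int16
offset = ii.min + abs_max     # 0 for signed dtypes, non-zero for unsigned

out = (wav.astype('float64') - offset) / abs_max
print(out)  # [-1.          0.          0.5         0.99996948]
```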
diff --git a/clmr_model.py b/clmr_model.py
index e4201c6..12bd2f1 100644
--- a/clmr_model.py
+++ b/clmr_model.py
@@ -1,5 +1,20 @@
-import torch.nn as nn
-import numpy as np
+# Original implementation by https://github.com/Spijkervet/CLMR
+# Copyright 2021 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from torch import nn
 
 
 class Model(nn.Module):
@@ -7,17 +22,9 @@ class Model(nn.Module):
         super(Model, self).__init__()
 
     def initialize(self, m):
-        if isinstance(m, (nn.Conv1d)):
+        if isinstance(m, nn.Conv1d):
             # nn.init.xavier_uniform_(m.weight)
             # if m.bias is not None:
             #     nn.init.xavier_uniform_(m.bias)
             nn.init.kaiming_uniform_(m.weight, mode="fan_in", nonlinearity="relu")
-
-
-class Identity(nn.Module):
-    def __init__(self):
-        super(Identity, self).__init__()
-
-    def forward(self, x):
-        return x
diff --git a/sample_cnn.py b/sample_cnn.py
index 355cebf..e774600 100644
--- a/sample_cnn.py
+++ b/sample_cnn.py
@@ -1,3 +1,18 @@
+# Original implementation by https://github.com/Spijkervet/CLMR
+# Copyright 2021 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from torch import nn
 
 from clmr_model import Model
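For context on `Model.initialize` in `clmr_model.py`: it has the `(module) -> None` shape that `nn.Module.apply` expects, which is how such hooks are normally wired up so every `nn.Conv1d` receives Kaiming-uniform weights. A self-contained sketch follows; the small conv stack is illustrative only, the real network lives in `sample_cnn.py`.

```python
import torch
from torch import nn

# Illustrative stand-in for the SampleCNN front end.
net = nn.Sequential(
    nn.Conv1d(1, 128, kernel_size=3, stride=3),
    nn.BatchNorm1d(128),
    nn.ReLU(),
)

def initialize(m: nn.Module) -> None:
    # Same rule as Model.initialize: Kaiming-uniform for every Conv1d.
    if isinstance(m, nn.Conv1d):
        nn.init.kaiming_uniform_(m.weight, mode="fan_in", nonlinearity="relu")

net.apply(initialize)  # apply() recurses through all submodules

x = torch.randn(2, 1, 59049)  # batch of 59049-sample windows, as in __call__
print(net(x).shape)           # torch.Size([2, 128, 19683])
```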