diff --git a/README.md b/README.md index 9ba730b..6ab3a16 100644 --- a/README.md +++ b/README.md @@ -6,46 +6,9 @@ ## Desription The audio embedding operator converts an input audio into a dense vector which can be used to represent the audio clip's semantics. +Each vector represents an audio clip with a fixed length of around 2s. This operator is built on top of the original implementation of [CLMR](https://github.com/Spijkervet/CLMR). -The [default model weight](./checkpoints/clmr_checkpoint_10000.pt) provided is pretrained on [Magnatagatune Dataset](https://paperswithcode.com/dataset/magnatagatune) with [SampleCNN](./models/sample_cnn.py). - -```python -import numpy as np -from towhee import ops - -audio_encoder = ops.audio_embedding.clmr() - -# Path or url as input -audio_embedding = audio_encoder("/audio/path/or/url/") - -# Audio data as input -audio_data = np.zeros((2, 441344)) -sample_rate = 44100 -audio_embedding = audio_encoder(audio_data, sample_rate) -``` - -## Factory Constructor - -Create the operator via the following factory method - -***ops.audio_embedding.clmr()*** - - -## Interface - -An audio embedding operator generates vectors in numpy.ndarray given an audio file path or audio data in numpy.ndarray. - - -**Parameters:** - -​ None. - - -**Returns**: *numpy.ndarray* - -​ Audio embeddings in shape (num_clips, 512). - - +The [default model weight](clmr_checkpoint_10000.pt) provided is pretrained on [Magnatagatune Dataset](https://paperswithcode.com/dataset/magnatagatune) with [SampleCNN](sample_cnn.py). ## Code Example @@ -57,9 +20,12 @@ Generate embeddings for the audio "test.wav". 
from towhee import dc dc.glob('test.wav') + .audio_decode() + .time_window(range=10) .audio_embedding.clmr() .show() ``` + | | *Write a same pipeline with explicit inputs/outputs name specifications:* @@ -67,9 +33,41 @@ dc.glob('test.wav') from towhee import dc dc.glob['path']('test.wav') - .audio_embedding.clmr['path', 'vecs']() + .audio_decode['path', 'audio']() + .time_window['audio', 'frames'](range=10) + .audio_embedding.clmr['frames', 'vecs']() .select('vecs') - .show() + .to_vec() ``` +## Factory Constructor + +Create the operator via the following factory method + +***audio_embedding.clmr(framework="pytorch")*** + +**Parameters:** + +​ *framework: str* + +​ The framework of model implementation. +Default value is "pytorch" since the model is implemented in PyTorch. + +## Interface + +An audio embedding operator generates vectors in numpy.ndarray given an audio file path or a [towhee audio](link/to/AudioFrame/api/doc). + +**Parameters:** + +​ *Union[str, towhee.types.Audio]* + +​ The audio path or link in string. +Or audio input data in towhee audio frames. +The input data should represent an audio clip longer than 2s. + +**Returns**: + +​ *numpy.ndarray* +​ Audio embeddings in shape (num_clips, 512). +Each embedding represents the features of an audio clip with a length of 2s. 
\ No newline at end of file diff --git a/utils/checkpoint.py b/clmr_checkpoint.py similarity index 100% rename from utils/checkpoint.py rename to clmr_checkpoint.py diff --git a/checkpoints/clmr_checkpoint_10000.pt b/clmr_checkpoint_10000.pt similarity index 100% rename from checkpoints/clmr_checkpoint_10000.pt rename to clmr_checkpoint_10000.pt diff --git a/clmr_magnatagatune.py b/clmr_magnatagatune.py index 7328bf9..b90c0e9 100644 --- a/clmr_magnatagatune.py +++ b/clmr_magnatagatune.py @@ -26,14 +26,13 @@ from towhee.operator import NNOperator from towhee import register sys.path.append(str(Path(__file__).parent)) - -from utils.checkpoint import load_encoder_checkpoint -from models.sample_cnn import SampleCNN +from clmr_checkpoint import load_encoder_checkpoint +from sample_cnn import SampleCNN log = logging.getLogger() -@register(output_schema=['vec']) +@register(output_schema=['vecs']) class ClmrMagnatagatune(NNOperator): """ Pretrained clmr @@ -44,7 +43,7 @@ class ClmrMagnatagatune(NNOperator): self.device = "cuda" if torch.cuda.is_available() else "cpu" weight_path = os.path.join(str(Path(__file__).parent), - 'checkpoints/clmr_checkpoint_10000.pt') + 'clmr_checkpoint_10000.pt') state_dict = load_encoder_checkpoint(weight_path, 1) encoder = SampleCNN(strides=[3, 3, 3, 3, 3, 3, 3, 3, 3], supervised=False, out_dim=1) encoder.load_state_dict(state_dict) @@ -86,11 +85,11 @@ class ClmrMagnatagatune(NNOperator): # if __name__ == "__main__": # encoder = ClmrMagnatagatune() # -# audio_path = "/audio/path/or/link" -# vec = encoder(audio_path) +# # audio_path = "/audio/path/or/link" +# # vec = encoder(audio_path) # -# # audio_data = numpy.zeros((2, 441344)) -# # sample_rate = 44100 -# # vec = encoder(audio_data, sample_rate) +# audio_data = numpy.zeros((2, 441344)) +# sample_rate = 44100 +# vec = encoder(audio_data, sample_rate) # # print(vec.shape) diff --git a/models/model.py b/clmr_model.py similarity index 100% rename from models/model.py rename to clmr_model.py 
diff --git a/models/__init__.py b/models/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/models/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/requirements.txt b/requirements.txt index 2af2d5f..16049ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ torchaudio==0.9.0 torch==1.9.0 -soundfile numpy diff --git a/models/sample_cnn.py b/sample_cnn.py similarity index 96% rename from models/sample_cnn.py rename to sample_cnn.py index 7d619c9..355cebf 100644 --- a/models/sample_cnn.py +++ b/sample_cnn.py @@ -1,6 +1,5 @@ -import torch -import torch.nn as nn -from .model import Model +from torch import nn +from clmr_model import Model class SampleCNN(Model): diff --git a/utils/__init__.py b/utils/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -