logo
Browse Source

Refactor

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
main
Jael Gu 3 years ago
parent
commit
0a39a6d1ae
  1. 80
      README.md
  2. 0
      clmr_checkpoint.py
  3. 0
      clmr_checkpoint_10000.pt
  4. 19
      clmr_magnatagatune.py
  5. 0
      clmr_model.py
  6. 1
      models/__init__.py
  7. 1
      requirements.txt
  8. 5
      sample_cnn.py
  9. 1
      utils/__init__.py

80
README.md

@ -6,46 +6,9 @@
## Description ## Description
The audio embedding operator converts an input audio into a dense vector which can be used to represent the audio clip's semantics. The audio embedding operator converts an input audio into a dense vector which can be used to represent the audio clip's semantics.
Each vector represents an audio clip with a fixed length of around 2s.
This operator is built on top of the original implementation of [CLMR](https://github.com/Spijkervet/CLMR). This operator is built on top of the original implementation of [CLMR](https://github.com/Spijkervet/CLMR).
The [default model weight](./checkpoints/clmr_checkpoint_10000.pt) provided is pretrained on [Magnatagatune Dataset](https://paperswithcode.com/dataset/magnatagatune) with [SampleCNN](./models/sample_cnn.py).
```python
import numpy as np
from towhee import ops
audio_encoder = ops.audio_embedding.clmr()
# Path or url as input
audio_embedding = audio_encoder("/audio/path/or/url/")
# Audio data as input
audio_data = np.zeros((2, 441344))
sample_rate = 44100
audio_embedding = audio_encoder(audio_data, sample_rate)
```
## Factory Constructor
Create the operator via the following factory method
***ops.audio_embedding.clmr()***
## Interface
An audio embedding operator generates vectors in numpy.ndarray given an audio file path or audio data in numpy.ndarray.
**Parameters:**
​ None.
**Returns**: *numpy.ndarray*
​ Audio embeddings in shape (num_clips, 512).
The [default model weight](clmr_checkpoint_10000.pt) provided is pretrained on [Magnatagatune Dataset](https://paperswithcode.com/dataset/magnatagatune) with [SampleCNN](sample_cnn.py).
## Code Example ## Code Example
@ -57,9 +20,12 @@ Generate embeddings for the audio "test.wav".
from towhee import dc from towhee import dc
dc.glob('test.wav') dc.glob('test.wav')
.audio_decode()
.time_window(range=10)
.audio_embedding.clmr() .audio_embedding.clmr()
.show() .show()
``` ```
| |
*Write the same pipeline with explicit input/output name specifications:* *Write the same pipeline with explicit input/output name specifications:*
@ -67,9 +33,41 @@ dc.glob('test.wav')
from towhee import dc from towhee import dc
dc.glob['path']('test.wav') dc.glob['path']('test.wav')
.audio_embedding.clmr['path', 'vecs']()
.audio_decode['path', 'audio']()
.time_window['audio', 'frames'](range=10)
.audio_embedding.clmr['frames', 'vecs']()
.select('vecs') .select('vecs')
.show()
.to_vec()
``` ```
## Factory Constructor
Create the operator via the following factory method
***audio_embedding.clmr(framework="pytorch")***
**Parameters:**
​ *framework: str*
​ The framework of model implementation.
The default value is "pytorch" since the model is implemented in PyTorch.
## Interface
An audio embedding operator generates vectors in numpy.ndarray given an audio file path or a [towhee audio](link/to/AudioFrame/api/doc).
**Parameters:**
​ *Union[str, towhee.types.Audio]*
​ The audio path or link in string.
Or audio input data in towhee audio frames.
The input data should represent an audio clip longer than 2s.
**Returns**:
​ *numpy.ndarray*
​ Audio embeddings in shape (num_clips, 512).
Each embedding represents the features of an audio clip with a length of 2s.

0
utils/checkpoint.py → clmr_checkpoint.py

0
checkpoints/clmr_checkpoint_10000.pt → clmr_checkpoint_10000.pt

19
clmr_magnatagatune.py

@ -26,14 +26,13 @@ from towhee.operator import NNOperator
from towhee import register from towhee import register
sys.path.append(str(Path(__file__).parent)) sys.path.append(str(Path(__file__).parent))
from utils.checkpoint import load_encoder_checkpoint
from models.sample_cnn import SampleCNN
from clmr_checkpoint import load_encoder_checkpoint
from sample_cnn import SampleCNN
log = logging.getLogger() log = logging.getLogger()
@register(output_schema=['vec'])
@register(output_schema=['vecs'])
class ClmrMagnatagatune(NNOperator): class ClmrMagnatagatune(NNOperator):
""" """
Pretrained clmr Pretrained clmr
@ -44,7 +43,7 @@ class ClmrMagnatagatune(NNOperator):
self.device = "cuda" if torch.cuda.is_available() else "cpu" self.device = "cuda" if torch.cuda.is_available() else "cpu"
weight_path = os.path.join(str(Path(__file__).parent), weight_path = os.path.join(str(Path(__file__).parent),
'checkpoints/clmr_checkpoint_10000.pt')
'clmr_checkpoint_10000.pt')
state_dict = load_encoder_checkpoint(weight_path, 1) state_dict = load_encoder_checkpoint(weight_path, 1)
encoder = SampleCNN(strides=[3, 3, 3, 3, 3, 3, 3, 3, 3], supervised=False, out_dim=1) encoder = SampleCNN(strides=[3, 3, 3, 3, 3, 3, 3, 3, 3], supervised=False, out_dim=1)
encoder.load_state_dict(state_dict) encoder.load_state_dict(state_dict)
@ -86,11 +85,11 @@ class ClmrMagnatagatune(NNOperator):
# if __name__ == "__main__": # if __name__ == "__main__":
# encoder = ClmrMagnatagatune() # encoder = ClmrMagnatagatune()
# #
# audio_path = "/audio/path/or/link"
# vec = encoder(audio_path)
# # audio_path = "/audio/path/or/link"
# # vec = encoder(audio_path)
# #
# # audio_data = numpy.zeros((2, 441344))
# # sample_rate = 44100
# # vec = encoder(audio_data, sample_rate)
# audio_data = numpy.zeros((2, 441344))
# sample_rate = 44100
# vec = encoder(audio_data, sample_rate)
# #
# print(vec.shape) # print(vec.shape)

0
models/model.py → clmr_model.py

1
models/__init__.py

@ -1 +0,0 @@

1
requirements.txt

@ -1,4 +1,3 @@
torchaudio==0.9.0 torchaudio==0.9.0
torch==1.9.0 torch==1.9.0
soundfile
numpy numpy

5
models/sample_cnn.py → sample_cnn.py

@ -1,6 +1,5 @@
import torch
import torch.nn as nn
from .model import Model
from torch import nn
from clmr_model import Model
class SampleCNN(Model): class SampleCNN(Model):

1
utils/__init__.py

@ -1 +0,0 @@
Loading…
Cancel
Save