Refactor

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
3 years ago · 0a39a6d1ae
9 changed files with 50 additions and 57 deletions
--- a/README.md
+++ b/README.md
@ -6,46 +6,9 @@
 ## Description

 The audio embedding operator converts an input audio into a dense vector which can be used to represent the audio clip's semantics.
+Each vector represents an audio clip with a fixed length of around 2s.
 This operator is built on top of the original implementation of [CLMR](https://github.com/Spijkervet/CLMR).
-The [default model weight](./checkpoints/clmr_checkpoint_10000.pt) provided is pretrained on [Magnatagatune Dataset](https://paperswithcode.com/dataset/magnatagatune) with [SampleCNN](./models/sample_cnn.py).
-
-```python
-import numpy as np
-from towhee import ops
-
-audio_encoder = ops.audio_embedding.clmr()
-
-# Path or url as input
-audio_embedding = audio_encoder("/audio/path/or/url/")
-
-# Audio data as input
-audio_data = np.zeros((2, 441344))
-sample_rate = 44100
-audio_embedding = audio_encoder(audio_data, sample_rate)
-```
-
-## Factory Constructor
-
-Create the operator via the following factory method
-
-***ops.audio_embedding.clmr()***
-
-
-## Interface
-
-An audio embedding operator generates vectors in numpy.ndarray given an audio file path or audio data in numpy.ndarray.
-
-
-**Parameters:**
-
-	None.
-
-
-**Returns**: *numpy.ndarray*
-
-	Audio embeddings in shape (num_clips, 512).
-
-
+The [default model weight](clmr_checkpoint_10000.pt) provided is pretrained on [Magnatagatune Dataset](https://paperswithcode.com/dataset/magnatagatune) with [SampleCNN](sample_cnn.py).

 ## Code Example

@ -57,9 +20,12 @@ Generate embeddings for the audio "test.wav".
 from towhee import dc

 dc.glob('test.wav')
+  .audio_decode()
+  .time_window(range=10)
  .audio_embedding.clmr()
  .show()
 ```
+    |  |

 *Write the same pipeline with explicit input/output name specifications:*

@ -67,9 +33,41 @@ dc.glob('test.wav')
 from towhee import dc

 dc.glob['path']('test.wav')
-  .audio_embedding.clmr['path', 'vecs']()
+  .audio_decode['path', 'audio']()
+  .time_window['audio', 'frames'](range=10)
+  .audio_embedding.clmr['frames', 'vecs']()
  .select('vecs')
-  .show()
+  .to_vec()
 ```

+## Factory Constructor
+
+Create the operator via the following factory method:
+
+***audio_embedding.clmr(framework="pytorch")***
+
+**Parameters:**
+
+   *framework: str*
+
+   The framework of model implementation.
+Default value is "pytorch" since the model is implemented in PyTorch.
+
+## Interface
+
+An audio embedding operator generates vectors in numpy.ndarray given an audio file path or a [towhee audio](link/to/AudioFrame/api/doc).
+
+**Parameters:**
+
+   *Union[str, towhee.types.Audio]*
+
+   The audio path or link as a string,
+or audio input data as towhee audio frames.
+The input data should represent an audio clip longer than 2s.
+
+**Returns**:
+
+   *numpy.ndarray*

+   Audio embeddings in shape (num_clips, 512).
+Each embedding represents the features of an audio clip with a length of 2s.
--- a/utils/checkpoint.py
+++ b/utils/checkpoint.py
--- a/checkpoints/clmr_checkpoint_10000.pt
+++ b/checkpoints/clmr_checkpoint_10000.pt
--- a/clmr_magnatagatune.py
+++ b/clmr_magnatagatune.py
@ -26,14 +26,13 @@ from towhee.operator import NNOperator
 from towhee import register

 sys.path.append(str(Path(__file__).parent))
-
-from utils.checkpoint import load_encoder_checkpoint
-from models.sample_cnn import SampleCNN
+from clmr_checkpoint import load_encoder_checkpoint
+from sample_cnn import SampleCNN

 log = logging.getLogger()


-@register(output_schema=['vec'])
+@register(output_schema=['vecs'])
 class ClmrMagnatagatune(NNOperator):
    """
    Pretrained clmr
@ -44,7 +43,7 @@ class ClmrMagnatagatune(NNOperator):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        weight_path = os.path.join(str(Path(__file__).parent),
-                                   'checkpoints/clmr_checkpoint_10000.pt')
+                                   'clmr_checkpoint_10000.pt')
        state_dict = load_encoder_checkpoint(weight_path, 1)
        encoder = SampleCNN(strides=[3, 3, 3, 3, 3, 3, 3, 3, 3], supervised=False, out_dim=1)
        encoder.load_state_dict(state_dict)
@ -86,11 +85,11 @@ class ClmrMagnatagatune(NNOperator):
 # if __name__ == "__main__":
 #     encoder = ClmrMagnatagatune()
 #
-#     audio_path = "/audio/path/or/link"
-#     vec = encoder(audio_path)
+#     # audio_path = "/audio/path/or/link"
+#     # vec = encoder(audio_path)
 #
-#     # audio_data = numpy.zeros((2, 441344))
-#     # sample_rate = 44100
-#     # vec = encoder(audio_data, sample_rate)
+#     audio_data = numpy.zeros((2, 441344))
+#     sample_rate = 44100
+#     vec = encoder(audio_data, sample_rate)
 #
 #     print(vec.shape)
--- a/models/model.py
+++ b/models/model.py
--- a/models/init.py
+++ b/models/init.py
@ -1 +0,0 @@
-
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,3 @@
 torchaudio==0.9.0
 torch==1.9.0
-soundfile
 numpy
--- a/models/sample_cnn.py
+++ b/models/sample_cnn.py
@ -1,6 +1,5 @@
-import torch
-import torch.nn as nn
-from .model import Model
+from torch import nn
+from clmr_model import Model


 class SampleCNN(Model):
--- a/utils/init.py
+++ b/utils/init.py
@ -1 +0,0 @@
-