towhee / audio-embedding
3 changed files with 72 additions and 1 deletion
@@ -1,2 +1,37 @@
# audio-embedding
# Audio Embedding

## **Description**

The audio embedding pipeline converts input audio into dense vectors that represent the semantics of the audio. Each vector represents an audio segment with a fixed length of around 0.9s. This operator is built on top of VGGish with PyTorch.

## Code Example

- Create an audio embedding pipeline with the default configuration.

```python
from towhee import AutoPipes

p = AutoPipes.pipeline('audio-embedding')
res = p('test.wav')
res.get()
```
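
- Compare two audio clips using their embeddings. This is a rough sketch, not part of the original pipeline: it assumes the result of `res.get()[0]` is a 2-D array with one 128-dimensional row per ~0.9s frame (as produced by VGGish), and the file names are placeholders.

```python
import numpy as np
from towhee import AutoPipes

p = AutoPipes.pipeline('audio-embedding')

def clip_embedding(path):
    # Run the pipeline and average the per-frame vectors into one clip-level vector.
    vecs = np.asarray(p(path).get()[0])  # assumed shape: (num_frames, 128)
    return vecs.mean(axis=0)

emb_a = clip_embedding('clip_a.wav')  # hypothetical audio file
emb_b = clip_embedding('clip_b.wav')  # hypothetical audio file

# Cosine similarity between the two clip-level embeddings.
score = float(np.dot(emb_a, emb_b) / (np.linalg.norm(emb_a) * np.linalg.norm(emb_b)))
print(score)
```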

## **Interface**

**AudioEmbeddingConfig**

> You can find some of these parameters in the [audio_decode.ffmpeg](https://towhee.io/audio-decode/ffmpeg) and [audio_embedding.vggish](https://towhee.io/audio-embedding/vggish) operators.

***weights_path:*** str

The path to the model weights. If None, the default model weights are loaded.

***framework:*** str

The framework of the model implementation. The default value is "pytorch", since the model is implemented in PyTorch.

***device:*** int

The GPU device ID to use. Defaults to -1, which means running on the CPU.
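
The snippet below sketches how these fields might be overridden. It assumes the registered config can be retrieved with `AutoConfig.load_configs` and passed back through the `config` argument, as with other Towhee AutoPipes pipelines; the weights path is a placeholder.

```python
from towhee import AutoPipes, AutoConfig

# Load the registered AudioEmbeddingConfig and override a few fields (assumed API).
config = AutoConfig.load_configs('audio-embedding')
config.weights_path = '/path/to/vggish_weights.pth'  # hypothetical local weights file
config.device = 0                                    # run the embedding operator on GPU 0

p = AutoPipes.pipeline('audio-embedding', config=config)
res = p('test.wav')
res.get()
```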
@@ -0,0 +1,36 @@
from towhee import pipe, ops, AutoPipes, AutoConfig


@AutoConfig.register
class AudioEmbeddingConfig:
    def __init__(self):
        # config for audio_decode.ffmpeg
        self.batch_size = -1
        self.sample_rate = None
        self.layout = None

        # config for audio_embedding.vggish
        self.weights_path: str = None
        self.framework: str = 'pytorch'

        # config for triton
        self.device = -1


@AutoPipes.register
def AudioEmbedding(config=None):
    if not config:
        config = AudioEmbeddingConfig()

    if config.device >= 0:
        op_config = AutoConfig.TritonGPUConfig(device_ids=[config.device], max_batch_size=128)
    else:
        op_config = AutoConfig.TritonCPUConfig()

    return (
        pipe.input('path')
            .map('path', 'frame', ops.audio_decode.ffmpeg(config.batch_size, config.sample_rate, config.layout))
            .map('frame', 'vec', ops.audio_embedding.vggish(config.weights_path, config.framework), conf=op_config)
            .output('vec')
    )
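
Note that `conf=op_config` is attached only to the `audio_embedding.vggish` step, so the Triton GPU/CPU configuration applies to the embedding operator; the `audio_decode.ffmpeg` step keeps the default operator configuration.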
Binary file not shown.