
Adapt audio-decode/ffmpeg

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
Branch: main
Jael Gu, 3 years ago
Commit: 49c8aab5ac
Changed files:
  1. README.md (28)
  2. requirements.txt (4)
  3. vggish.py (22)
  4. vggish_input.py (17)

README.md (28)

@@ -1,6 +1,6 @@
# Audio Embedding with Vggish
-*Author: Jael Gu*
+*Author: [Jael Gu](https://github.com/jaelgu)*
<br />
@@ -23,11 +23,12 @@ Generate embeddings for the audio "test.wav".
```python
import towhee
-towhee.glob('test.wav') \
-      .audio_decode() \
-      .time_window(range=10) \
-      .audio_embedding.vggish() \
+(
+    towhee.glob('test.wav')
+          .audio_decode.ffmpeg()
+          .audio_embedding.vggish()
           .show()
+)
```
| [-0.4931737, -0.40068552, -0.032327592, ...] shape=(10, 128) |
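The result above has shape=(10, 128) because VGGish emits one 128-dimensional vector per example window of roughly 0.96 s. A back-of-the-envelope sketch of that relationship; the 10 s clip length is an assumption about test.wav, not something stated in the diff:

```python
# Rough sketch: why a ~10 s clip maps to an embedding of shape (10, 128).
# VGGish produces one 128-d vector per ~0.96 s non-overlapping example window.
clip_seconds = 10.0       # assumed duration of test.wav
window_seconds = 0.96     # VGGish example window length
num_vectors = int(clip_seconds // window_seconds)
print(num_vectors, 128)   # -> 10 128
```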
@@ -36,12 +37,12 @@ towhee.glob('test.wav') \
```python
import towhee
-towhee.glob['path']('test.wav') \
-      .audio_decode['path', 'audio']() \
-      .time_window['audio', 'frames'](range=10) \
-      .audio_embedding.vggish['frames', 'vecs']() \
-      .select('vecs') \
-      .to_vec()
+(
+    towhee.glob['path']('test.wav')
+          .audio_decode.ffmpeg['path', 'frames']()
+          .audio_embedding.vggish['frames', 'vecs']()
+          .show()
+)
```
[array([[-0.4931737 , -0.40068552, -0.03232759, ..., -0.33428153,
0.1333081 , -0.25221825],
@@ -84,10 +85,9 @@ An audio embedding operator generates vectors in numpy.ndarray given an audio file
**Parameters:**
-*Union[str, towhee.types.Audio (a sub-class of numpy.ndarray)]*
+*data: List[towhee.types.audio_frame.AudioFrame]*
-    The audio path or link in string.
-    Or audio input data in towhee audio frames.
+    Input audio data is a list of towhee audio frames.
+    The input data should represent audio longer than 0.9s.
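A minimal sketch of the new input contract (a list of towhee audio frames covering more than 0.9 s of audio). Plain numpy int16 chunks stand in for AudioFrame objects here, which is an assumption made purely for illustration:

```python
import numpy as np

sr = 16000                                    # assumed sample rate
frames = [np.zeros(1024, dtype=np.int16)      # stand-ins for towhee AudioFrame chunks
          for _ in range(32)]

waveform = np.hstack(frames)                  # the operator concatenates the frames
duration = waveform.shape[0] / sr             # 32 * 1024 / 16000 = 2.048 s
assert duration > 0.9, "input should represent audio longer than 0.9 s"
```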

requirements.txt (4)

@@ -1,4 +1,4 @@
-torch==1.9.0
-numpy==1.19.5
+torch>=1.9.0
+numpy>=1.19.5
resampy
torchaudio

vggish.py (22)

@@ -19,13 +19,14 @@ import os
import sys
import numpy
from pathlib import Path
-from typing import Union
+from typing import List
import torch
from towhee.operator.base import NNOperator
from towhee.models.vggish.torch_vggish import VGG
from towhee import register
+from towhee.types.audio_frame import AudioFrame
sys.path.append(str(Path(__file__).parent))
import vggish_input
@@ -51,25 +52,26 @@ class Vggish(NNOperator):
        self.model.eval()
        self.model.to(self.device)
-    def __call__(self, audio: Union[str, numpy.ndarray], sr: int = None) -> numpy.ndarray:
-        audio_tensors = self.preprocess(audio, sr).to(self.device)
+    def __call__(self, data: List[AudioFrame]) -> numpy.ndarray:
+        audio_tensors = self.preprocess(data).to(self.device)
        features = self.model(audio_tensors)
        outs = features.to("cpu")
        return outs.detach().numpy()
-    def preprocess(self, audio: Union[str, numpy.ndarray], sr: int = None):
-        if isinstance(audio, str):
-            audio_tensors = vggish_input.wavfile_to_examples(audio)
-        elif isinstance(audio, numpy.ndarray):
+    def preprocess(self, frames: List[AudioFrame]):
+        sr = frames[0].sample_rate
+        audio = numpy.hstack(frames)
+        if audio.dtype == numpy.int32:
+            audio = audio / 2147483648.0
+        elif audio.dtype == numpy.int16:
+            audio = audio / 32768.0
        try:
            audio = audio.transpose()
            audio_tensors = vggish_input.waveform_to_examples(audio, sr, return_tensor=True)
+            return audio_tensors
        except Exception as e:
            log.error("Fail to load audio data.")
            raise e
-        else:
-            log.error(f"Invalid input audio: {type(audio)}")
-        return audio_tensors
# if __name__ == '__main__':
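For reference, the new preprocess path boils down to concatenating the frames and scaling integer PCM into [-1.0, 1.0): int32 by 2**31, int16 by 2**15. A standalone sketch of that step, with plain numpy arrays in place of AudioFrame objects (an assumption for illustration only):

```python
import numpy as np

def normalize_pcm(frames):
    """Concatenate frame chunks and scale integer PCM to float in [-1.0, 1.0)."""
    audio = np.hstack(frames)
    if audio.dtype == np.int32:
        audio = audio / 2147483648.0   # 2**31
    elif audio.dtype == np.int16:
        audio = audio / 32768.0        # 2**15
    return audio

chunks = [np.full(8, 16384, dtype=np.int16), np.full(8, -16384, dtype=np.int16)]
print(normalize_pcm(chunks))           # values land at +0.5 / -0.5
```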

vggish_input.py (17)

@@ -44,9 +44,9 @@ def waveform_to_examples(data, sample_rate, return_tensor=True):
      bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
  """
  # Convert to mono.
-  if len(data.shape) > 1:
-    data = np.mean(data, axis=1)
+  # Todo: convert stereo to mono.
+  # if len(data.shape) > 1:
+  #   data = np.mean(data, axis=1)
  # Resample to the rate assumed by VGGish.
  if sample_rate != vggish_params.SAMPLE_RATE:
    data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)
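With the mono averaging commented out, callers are now expected to hand in mono audio, while the resample to VGGish's 16 kHz sample rate stays in this function. A small sketch of that remaining step; the 2 s random signal is just a placeholder input:

```python
import numpy as np
import resampy

sr_in = 44100
mono = np.random.randn(sr_in * 2).astype(np.float32)   # 2 s of placeholder mono audio
mono_16k = resampy.resample(mono, sr_in, 16000)         # VGGish expects 16 kHz input
print(mono_16k.shape)                                    # roughly (32000,)
```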
@@ -81,12 +81,15 @@ def waveform_to_examples(data, sample_rate, return_tensor=True):
def wavfile_to_examples(wav_file, return_tensor=True):
-  """Convenience wrapper around waveform_to_examples() for a common WAV format.
+  """
+  Convenience wrapper around waveform_to_examples() for a common WAV format.
  Args:
-    wav_file: String path to a file, or a file-like object. The file
-      is assumed to contain WAV audio data with signed 16-bit PCM samples.
-    torch: Return data as a Pytorch tensor ready for VGGish
+    wav_file:
+      String path to a file, or a file-like object.
+      The file is assumed to contain WAV audio data with signed 16-bit PCM samples.
+    return_tensor:
+      Return data as a Pytorch tensor ready for VGGish
  Returns:
    See waveform_to_examples.
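A usage sketch for the contract described in the rewritten docstring, assuming it is run next to vggish_input.py and that a local 16-bit PCM file named test.wav exists; the exact shape of the returned tensor depends on the clip length:

```python
import vggish_input

# One 96x64 log-mel example per ~0.96 s window, returned as a torch tensor.
examples = vggish_input.wavfile_to_examples('test.wav', return_tensor=True)
print(type(examples), examples.shape)
```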
