Adapt audio-decode/ffmpeg

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
3 years ago · 49c8aab5ac
4 changed files with 47 additions and 42 deletions
--- a/README.md
+++ b/README.md
@ -1,6 +1,6 @@
 # Audio Embedding with Vggish
 *Author: Jael Gu*
 *Author: [Jael Gu](https://github.com/jaelgu)*
 <br />
@ -23,11 +23,12 @@ Generate embeddings for the audio "test.wav".
 ```python
 import towhee
 towhee.glob('test.wav') \
      .audio_decode() \
      .time_window(range=10) \
      .audio_embedding.vggish() \
      .show()
 (
    towhee.glob('test.wav')
          .audio_decode.ffmpeg()
          .audio_embedding.vggish()
          .show()
 )
 ```
    | [-0.4931737, -0.40068552, -0.032327592, ...] shape=(10, 128) |
@ -36,12 +37,12 @@ towhee.glob('test.wav') \
 ```python
 import towhee
 towhee.glob['path']('test.wav') \
      .audio_decode['path', 'audio']() \
      .time_window['audio', 'frames'](range=10) \
      .audio_embedding.vggish['frames', 'vecs']() \
      .select('vecs') \
      .to_vec()
 (
    towhee.glob['path']('test.wav')
          .audio_decode.ffmpeg['path', 'frames']()
          .audio_embedding.vggish['frames', 'vecs']()
          .show()
 )
 ```
    [array([[-0.4931737 , -0.40068552, -0.03232759, ..., -0.33428153,
          0.1333081 , -0.25221825],
@ -84,10 +85,9 @@ An audio embedding operator generates vectors in numpy.ndarray given an audio fi
 **Parameters:**
 *Union[str, towhee.types.Audio (a sub-class of numpy.ndarray)]*
 *data: List[towhee.types.audio_frame.AudioFrame]*
 The audio path or link in string.
 Or audio input data in towhee audio frames.
 Input audio data is a list of towhee audio frames.
 The input data should represent audio longer than 0.9s.
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,4 @@
 torch==1.9.0
 numpy==1.19.5
 torch>=1.9.0
 numpy>=1.19.5
 resampy
 torchaudio
--- a/vggish.py
+++ b/vggish.py
@ -19,13 +19,14 @@ import os
 import sys
 import numpy
 from pathlib import Path
 from typing import Union
 from typing import List
 import torch
 from towhee.operator.base import NNOperator
 from towhee.models.vggish.torch_vggish import VGG
 from towhee import register
 from towhee.types.audio_frame import AudioFrame
 sys.path.append(str(Path(__file__).parent))
 import vggish_input
@ -51,25 +52,26 @@ class Vggish(NNOperator):
        self.model.eval()
        self.model.to(self.device)
    # NOTE(review): two `__call__` signatures appear back to back below. This looks like
    # the before/after lines of a diff whose +/- markers were lost -- confirm which one
    # is current. If both really live in the same class body, the second definition
    # rebinds the name and the first (which has no return statement) is never used.
    def __call__(self, audio: Union[str, numpy.ndarray], sr: int = None) -> numpy.ndarray:
        audio_tensors = self.preprocess(audio, sr).to(self.device)
    def __call__(self, data: List[AudioFrame]) -> numpy.ndarray:
        """Run the model on the given audio frames and return its output as a numpy array."""
        # Build the model input via preprocess() and move it to the operator's device.
        audio_tensors = self.preprocess(data).to(self.device)
        features = self.model(audio_tensors)
        # Move the result to CPU and detach it before converting to numpy.
        outs = features.to("cpu")
        return outs.detach().numpy()
    def preprocess(self, audio: Union[str, numpy.ndarray], sr: int = None):
        """Convert an audio path or a waveform array into VGGish input examples.

        Args:
            audio: Either a string, which is passed to
                `vggish_input.wavfile_to_examples`, or a numpy.ndarray of samples,
                which is transposed and passed to `vggish_input.waveform_to_examples`.
            sr: Sample rate forwarded with the array input; not used for string input.

        Returns:
            The value returned by the `vggish_input` helper that handled the input
            (the array path requests `return_tensor=True`).

        Raises:
            TypeError: If `audio` is neither a str nor a numpy.ndarray.
            Exception: Any error raised while converting an array input is logged
                and re-raised unchanged.
        """
        if isinstance(audio, str):
            return vggish_input.wavfile_to_examples(audio)

        if isinstance(audio, numpy.ndarray):
            try:
                return vggish_input.waveform_to_examples(audio.transpose(), sr, return_tensor=True)
            except Exception:
                log.error("Fail to load audio data.")
                # Bare raise keeps the original exception and traceback intact.
                raise

        # Unsupported input type: fail explicitly instead of falling through to a
        # return of a variable that was never assigned (UnboundLocalError).
        log.error(f"Invalid input audio: {type(audio)}")
        raise TypeError(f"Invalid input audio: {type(audio)}")
    def preprocess(self, frames: List[AudioFrame]):
        """Merge audio frames into one waveform and convert it to VGGish input examples.

        Args:
            frames: Audio frames to embed. The sample rate is read from the first
                frame and applied to the whole merged waveform.

        Returns:
            The value returned by `vggish_input.waveform_to_examples` when called
            with `return_tensor=True`.

        Raises:
            ValueError: If `frames` is empty.
            Exception: Any error raised while converting the waveform is logged
                and re-raised unchanged.
        """
        # Guard up front: indexing frames[0] on an empty list would otherwise
        # surface as an unexplained IndexError.
        if len(frames) == 0:
            log.error("Fail to load audio data.")
            raise ValueError("Input audio frames are empty.")

        sr = frames[0].sample_rate
        audio = numpy.hstack(frames)

        # Scale signed integer samples to floats in [-1.0, 1.0) by dividing by
        # 2**31 (int32) or 2**15 (int16); other dtypes are passed through as-is.
        if audio.dtype == numpy.int32:
            audio = audio / 2147483648.0
        elif audio.dtype == numpy.int16:
            audio = audio / 32768.0

        try:
            audio = audio.transpose()
            return vggish_input.waveform_to_examples(audio, sr, return_tensor=True)
        except Exception:
            log.error("Fail to load audio data.")
            # Bare raise keeps the original exception and traceback intact.
            raise
 # if __name__ == '__main__':
--- a/vggish_input.py
+++ b/vggish_input.py
@ -44,9 +44,9 @@ def waveform_to_examples(data, sample_rate, return_tensor=True):
    bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
  """
    # Convert to mono.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)
    # Todo: convert stereo to mono.
    # if len(data.shape) > 1:
    #     data = np.mean(data, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != vggish_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)
@ -81,12 +81,15 @@ def waveform_to_examples(data, sample_rate, return_tensor=True):
 def wavfile_to_examples(wav_file, return_tensor=True):
    """Convenience wrapper around waveform_to_examples() for a common WAV format.
  Args:
    wav_file: String path to a file, or a file-like object. The file
    is assumed to contain WAV audio data with signed 16-bit PCM samples.
    torch: Return data as a Pytorch tensor ready for VGGish
    """
    Convenience wrapper around waveform_to_examples() for a common WAV format.
    Args:
        wav_file:
            String path to a file, or a file-like object.
            The file is assumed to contain WAV audio data with signed 16-bit PCM samples.
        return_tensor:
            Return data as a Pytorch tensor ready for VGGish
  Returns:
    See waveform_to_examples.