From 61ece2d7569d506d2aba216893f7de5b4e51bdd4 Mon Sep 17 00:00:00 2001
From: Jael Gu
Date: Fri, 17 Jun 2022 12:41:22 +0800
Subject: [PATCH] Update

Signed-off-by: Jael Gu
---
 README.md        |  4 ++--
 requirements.txt |  3 +--
 vggish.py        | 23 ++++++++++++++---------
 vggish_input.py  |  5 ++---
 4 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 8eead77..d0223d9 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
-## Desription
+## Description
 
 The audio embedding operator converts an input audio into a dense vector which can be used to represent the audio clip's semantics. Each vector represents for an audio clip with a fixed length of around 0.9s.
@@ -83,7 +83,7 @@
 Default value is "pytorch" since the model is implemented in Pytorch.
 
 ## Interface
 
-An audio embedding operator generates vectors in numpy.ndarray given an audio file path or towhee audio frames.
+An audio embedding operator generates vectors in numpy.ndarray given towhee audio frames.
 
 **Parameters:**
diff --git a/requirements.txt b/requirements.txt
index b2263e3..33cc5c0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
 torch>=1.9.0
 numpy>=1.19.5
 resampy
-torchaudio
 towhee
-towhee.models
\ No newline at end of file
+towhee.models
diff --git a/vggish.py b/vggish.py
index 365d5c0..c27066d 100644
--- a/vggish.py
+++ b/vggish.py
@@ -60,13 +60,15 @@ class Vggish(NNOperator):
 
     def preprocess(self, frames: List[AudioFrame]):
         sr = frames[0].sample_rate
-        layout = frames[0].lay_out
-        audio = numpy.hstack(frames)
+        layout = frames[0].layout
         if layout == 'stereo':
-            audio = audio.reshape(-1, 2)
+            frames = [frame.reshape(-1, 2) for frame in frames]
+            audio = numpy.vstack(frames)
+        else:
+            audio = numpy.hstack(frames)
+            audio = audio.transpose()
         audio = self.int2float(audio)
         try:
-            audio = audio.transpose()
             audio_tensors = vggish_input.waveform_to_examples(audio, sr, return_tensor=True)
             return audio_tensors
         except Exception as e:
@@ -81,11 +83,14 @@ class Vggish(NNOperator):
         The code is inspired by
         https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
         """
-        assert wav.dtype.kind in 'iu'
         dtype = numpy.dtype(dtype)
         assert dtype.kind == 'f'
-        ii = numpy.iinfo(wav.dtype)
-        abs_max = 2 ** (ii.bits - 1)
-        offset = ii.min + abs_max
-        return (wav.astype(dtype) - offset) / abs_max
+        if wav.dtype.kind in 'iu':
+            ii = numpy.iinfo(wav.dtype)
+            abs_max = 2 ** (ii.bits - 1)
+            offset = ii.min + abs_max
+            return (wav.astype(dtype) - offset) / abs_max
+        else:
+            log.warning('Converting float dtype from %s to %s.', wav.dtype, dtype)
+            return wav.astype(dtype)
 
diff --git a/vggish_input.py b/vggish_input.py
index 5c1ea6d..09e1bf2 100644
--- a/vggish_input.py
+++ b/vggish_input.py
@@ -42,9 +42,8 @@ def waveform_to_examples(data, sample_rate, return_tensor=True):
     bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
   """
 
-  # Todo: convert stereo to mono.
-  # if len(data.shape) > 1:
-  #   data = np.mean(data, axis=1)
+  if len(data.shape) > 1:
+    data = np.mean(data, axis=1)
   # Resample to the rate assumed by VGGish.
   if sample_rate != vggish_params.SAMPLE_RATE:
     data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)
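
Reviewer note: a quick sanity check of the patched stereo path, runnable outside the operator. This is a minimal sketch under assumptions, not the operator itself: plain numpy int16 arrays stand in for towhee AudioFrames, and a packed stereo frame is assumed to arrive as a (1, 2n) array of interleaved samples.

import numpy as np

# Hypothetical stand-in for a towhee AudioFrame (an assumption): a packed
# stereo frame is a (1, 2n) int16 array of interleaved samples
# [L0, R0, L1, R1, ...].
frames = [np.array([[0, 100, -200, 300]], dtype=np.int16),
          np.array([[400, -500, 600, 700]], dtype=np.int16)]

# Patched stereo path from Vggish.preprocess: de-interleave each frame
# into (n, 2), then stack along the time axis, one column per channel.
audio = np.vstack([f.reshape(-1, 2) for f in frames])
assert audio.shape == (4, 2)

# int2float: shift and scale the signed-int range onto [-1.0, 1.0).
ii = np.iinfo(audio.dtype)
abs_max = 2 ** (ii.bits - 1)
offset = ii.min + abs_max
wav = (audio.astype(np.float64) - offset) / abs_max
assert wav.max() < 1.0 and wav.min() >= -1.0

# waveform_to_examples now downmixes 2-D input to mono by averaging the
# channel columns, matching the block re-enabled in vggish_input.py.
if len(wav.shape) > 1:
    wav = np.mean(wav, axis=1)
assert wav.shape == (4,)

If the assumed frame layout matches what the decoder produces, the vstack/mean pipeline hands waveform_to_examples the same kind of 1-D mono waveform the hstack branch produces for mono input.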