Update

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
3 years ago · 61ece2d756
4 changed files with 19 additions and 16 deletions
--- a/README.md
+++ b/README.md
@ -4,7 +4,7 @@

 <br />

-## Desription
+## Description

 The audio embedding operator converts an input audio into a dense vector which can be used to represent the audio clip's semantics.
 Each vector represents an audio clip with a fixed length of around 0.9s.
@ -83,7 +83,7 @@ Default value is "pytorch" since the model is implemented in Pytorch.

 ## Interface

-An audio embedding operator generates vectors in numpy.ndarray given an audio file path or towhee audio frames.
+An audio embedding operator generates vectors in numpy.ndarray given towhee audio frames.

 **Parameters:**

--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +1,6 @@
 torch>=1.9.0
 numpy>=1.19.5
 resampy
-torchaudio

 towhee
-towhee.models
+towhee.models
--- a/vggish.py
+++ b/vggish.py
@ -60,13 +60,15 @@ class Vggish(NNOperator):

    def preprocess(self, frames: List[AudioFrame]):
        sr = frames[0].sample_rate
-        layout = frames[0].lay_out
-        audio = numpy.hstack(frames)
+        layout = frames[0].layout
        if layout == 'stereo':
-            audio = audio.reshape(-1, 2)
+            frames = [frame.reshape(-1, 2) for frame in frames]
+            audio = numpy.vstack(frames)
+        else:
+            audio = numpy.hstack(frames)
+            audio = audio.transpose()
        audio = self.int2float(audio)
        try:
-            audio = audio.transpose()
            audio_tensors = vggish_input.waveform_to_examples(audio, sr, return_tensor=True)
            return audio_tensors
        except Exception as e:
@ -81,11 +83,14 @@ class Vggish(NNOperator):

        The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
        """
-        assert wav.dtype.kind in 'iu'
        dtype = numpy.dtype(dtype)
        assert dtype.kind == 'f'

-        ii = numpy.iinfo(wav.dtype)
-        abs_max = 2 ** (ii.bits - 1)
-        offset = ii.min + abs_max
-        return (wav.astype(dtype) - offset) / abs_max
+        if wav.dtype.kind in 'iu':
+            ii = numpy.iinfo(wav.dtype)
+            abs_max = 2 ** (ii.bits - 1)
+            offset = ii.min + abs_max
+            return (wav.astype(dtype) - offset) / abs_max
+        else:
+            log.warning('Converting float dtype from %s to %s.', wav.dtype, dtype)
+            return wav.astype(dtype)
--- a/vggish_input.py
+++ b/vggish_input.py
@ -42,9 +42,8 @@ def waveform_to_examples(data, sample_rate, return_tensor=True):
    bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.

  """
-    # Todo: convert stereo to mono.
-    # if len(data.shape) > 1:
-    #     data = np.mean(data, axis=1)
+    if len(data.shape) > 1:
+        data = np.mean(data, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != vggish_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)