From 61ece2d7569d506d2aba216893f7de5b4e51bdd4 Mon Sep 17 00:00:00 2001
From: Jael Gu
Date: Fri, 17 Jun 2022 12:41:22 +0800
Subject: [PATCH] Update

Signed-off-by: Jael Gu
---
 README.md        |  4 ++--
 requirements.txt |  3 +--
 vggish.py        | 23 ++++++++++++++---------
 vggish_input.py  |  5 ++---
 4 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 8eead77..d0223d9 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
-## Desription
+## Description
 
 The audio embedding operator converts an input audio into a dense vector which can be used to represent the audio clip's semantics. Each vector represents for an audio clip with a fixed length of around 0.9s.
@@ -83,7 +83,7 @@
 Default value is "pytorch" since the model is implemented in Pytorch.
 
 ## Interface
 
-An audio embedding operator generates vectors in numpy.ndarray given an audio file path or towhee audio frames.
+An audio embedding operator generates vectors in numpy.ndarray given towhee audio frames.
 
 **Parameters:**
diff --git a/requirements.txt b/requirements.txt
index b2263e3..33cc5c0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
 torch>=1.9.0
 numpy>=1.19.5
 resampy
-torchaudio
 towhee
-towhee.models
\ No newline at end of file
+towhee.models
diff --git a/vggish.py b/vggish.py
index 365d5c0..c27066d 100644
--- a/vggish.py
+++ b/vggish.py
@@ -60,13 +60,15 @@ class Vggish(NNOperator):
 
     def preprocess(self, frames: List[AudioFrame]):
         sr = frames[0].sample_rate
-        layout = frames[0].lay_out
-        audio = numpy.hstack(frames)
+        layout = frames[0].layout
         if layout == 'stereo':
-            audio = audio.reshape(-1, 2)
+            frames = [frame.reshape(-1, 2) for frame in frames]
+            audio = numpy.vstack(frames)
+        else:
+            audio = numpy.hstack(frames)
+            audio = audio.transpose()
         audio = self.int2float(audio)
         try:
-            audio = audio.transpose()
             audio_tensors = vggish_input.waveform_to_examples(audio, sr, return_tensor=True)
             return audio_tensors
         except Exception as e:
@@ -81,11 +83,14 @@ class Vggish(NNOperator):
         The code is inspired by
         https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
         """
-        assert wav.dtype.kind in 'iu'
         dtype = numpy.dtype(dtype)
         assert dtype.kind == 'f'
-        ii = numpy.iinfo(wav.dtype)
-        abs_max = 2 ** (ii.bits - 1)
-        offset = ii.min + abs_max
-        return (wav.astype(dtype) - offset) / abs_max
+        if wav.dtype.kind in 'iu':
+            ii = numpy.iinfo(wav.dtype)
+            abs_max = 2 ** (ii.bits - 1)
+            offset = ii.min + abs_max
+            return (wav.astype(dtype) - offset) / abs_max
+        else:
+            log.warning('Converting float dtype from %s to %s.', wav.dtype, dtype)
+            return wav.astype(dtype)
 
diff --git a/vggish_input.py b/vggish_input.py
index 5c1ea6d..09e1bf2 100644
--- a/vggish_input.py
+++ b/vggish_input.py
@@ -42,9 +42,8 @@ def waveform_to_examples(data, sample_rate, return_tensor=True):
     bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
   """
 
-  # Todo: convert stereo to mono.
-  # if len(data.shape) > 1:
-  #   data = np.mean(data, axis=1)
+  if len(data.shape) > 1:
+    data = np.mean(data, axis=1)
   # Resample to the rate assumed by VGGish.
   if sample_rate != vggish_params.SAMPLE_RATE:
     data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)
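
Reviewer note: a quick sanity check of the patched stereo path, runnable outside the operator. This is a minimal sketch under assumptions, not the operator itself: plain numpy int16 arrays stand in for towhee AudioFrames, and a packed stereo frame is assumed to arrive as a (1, 2n) array of interleaved samples.

import numpy as np

# Hypothetical stand-in for a towhee AudioFrame (an assumption): a packed
# stereo frame is a (1, 2n) int16 array of interleaved samples
# [L0, R0, L1, R1, ...].
frames = [np.array([[0, 100, -200, 300]], dtype=np.int16),
          np.array([[400, -500, 600, 700]], dtype=np.int16)]

# Patched stereo path from Vggish.preprocess: de-interleave each frame
# into (n, 2), then stack along the time axis, one column per channel.
audio = np.vstack([f.reshape(-1, 2) for f in frames])
assert audio.shape == (4, 2)

# int2float: shift and scale the signed-int range onto [-1.0, 1.0).
ii = np.iinfo(audio.dtype)
abs_max = 2 ** (ii.bits - 1)
offset = ii.min + abs_max
wav = (audio.astype(np.float64) - offset) / abs_max
assert wav.max() < 1.0 and wav.min() >= -1.0

# waveform_to_examples now downmixes 2-D input to mono by averaging the
# channel columns, matching the block re-enabled in vggish_input.py.
if len(wav.shape) > 1:
    wav = np.mean(wav, axis=1)
assert wav.shape == (4,)

If the assumed frame layout matches what the decoder produces, the vstack/mean pipeline hands waveform_to_examples the same kind of 1-D mono waveform the hstack branch produces for mono input.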