From 15bf4a3908c5344c5666d046e8b35856b1a6dfb1 Mon Sep 17 00:00:00 2001
From: Jael Gu <mengjia.gu@zilliz.com>
Date: Tue, 7 Jun 2022 14:41:58 +0800
Subject: [PATCH] Allow stereo input & fix dtype conversion

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
---
 vggish.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/vggish.py b/vggish.py
index c68775c..365d5c0 100644
--- a/vggish.py
+++ b/vggish.py
@@ -60,9 +60,11 @@ class Vggish(NNOperator):
 
     def preprocess(self, frames: List[AudioFrame]):
         sr = frames[0].sample_rate
+        layout = frames[0].lay_out
         audio = numpy.hstack(frames)
-        ii = numpy.iinfo(audio.dtype)
-        audio = 2 * audio / (ii.max - ii.min + 1)
+        if layout == 'stereo':
+            audio = audio.reshape(-1, 2)
+        audio = self.int2float(audio)
         try:
             audio = audio.transpose()
             audio_tensors = vggish_input.waveform_to_examples(audio, sr, return_tensor=True)
@@ -71,3 +73,19 @@ class Vggish(NNOperator):
             log.error("Fail to load audio data.")
             raise e
 
+    def int2float(self, wav: numpy.ndarray, dtype: str = 'float64'):
+        """
+        Convert integer audio samples to floating point in the range [-1.0, 1.0).
+        `wav` must have a signed or unsigned integer dtype and `dtype` (default 'float64') must name a
+        float dtype; either check failing raises AssertionError. Unsigned input is re-centred around zero.
+
+        The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
+        """
+        assert wav.dtype.kind in 'iu'  # signed ('i') or unsigned ('u') integer input only
+        dtype = numpy.dtype(dtype)
+        assert dtype.kind == 'f'  # the requested output must be a floating-point dtype
+
+        ii = numpy.iinfo(wav.dtype)
+        abs_max = 2 ** (ii.bits - 1)  # half of the integer type's full value range
+        offset = ii.min + abs_max  # 0 for signed input, abs_max for unsigned input
+        return (wav.astype(dtype) - offset) / abs_max