From 15bf4a3908c5344c5666d046e8b35856b1a6dfb1 Mon Sep 17 00:00:00 2001
From: Jael Gu
Date: Tue, 7 Jun 2022 14:41:58 +0800
Subject: [PATCH] Allow stereo & debug dtype convention

Signed-off-by: Jael Gu
---
 vggish.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/vggish.py b/vggish.py
index c68775c..365d5c0 100644
--- a/vggish.py
+++ b/vggish.py
@@ -60,9 +60,11 @@ class Vggish(NNOperator):
 
     def preprocess(self, frames: List[AudioFrame]):
         sr = frames[0].sample_rate
+        layout = frames[0].lay_out
         audio = numpy.hstack(frames)
-        ii = numpy.iinfo(audio.dtype)
-        audio = 2 * audio / (ii.max - ii.min + 1)
+        if layout == 'stereo':
+            audio = audio.reshape(-1, 2)
+        audio = self.int2float(audio)
         try:
             audio = audio.transpose()
             audio_tensors = vggish_input.waveform_to_examples(audio, sr, return_tensor=True)
@@ -71,3 +73,19 @@ class Vggish(NNOperator):
             log.error("Fail to load audio data.")
             raise e
 
+    def int2float(self, wav: numpy.ndarray, dtype: str = 'float64'):
+        """
+        Convert audio data from int to float.
+        The input dtype must be integers.
+        The output dtype is controlled by the parameter `dtype`, defaults to 'float64'.
+
+        The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
+        """
+        assert wav.dtype.kind in 'iu'
+        dtype = numpy.dtype(dtype)
+        assert dtype.kind == 'f'
+
+        ii = numpy.iinfo(wav.dtype)
+        abs_max = 2 ** (ii.bits - 1)
+        offset = ii.min + abs_max
+        return (wav.astype(dtype) - offset) / abs_max