logo
Browse Source

Allow stereo input & fix dtype conversion

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
main
Jael Gu 3 years ago
parent
commit
15bf4a3908
  1. 22
      vggish.py

22
vggish.py

@ -60,9 +60,11 @@ class Vggish(NNOperator):
def preprocess(self, frames: List[AudioFrame]):
sr = frames[0].sample_rate
layout = frames[0].lay_out
audio = numpy.hstack(frames)
ii = numpy.iinfo(audio.dtype)
audio = 2 * audio / (ii.max - ii.min + 1)
if layout == 'stereo':
audio = audio.reshape(-1, 2)
audio = self.int2float(audio)
try:
audio = audio.transpose()
audio_tensors = vggish_input.waveform_to_examples(audio, sr, return_tensor=True)
@ -71,3 +73,19 @@ class Vggish(NNOperator):
log.error("Fail to load audio data.")
raise e
def int2float(self, wav: numpy.ndarray, dtype: str = 'float64'):
    """
    Convert integer audio samples to floating point.

    Samples are scaled by ``2 ** (bits - 1)`` of the input integer type.
    Unsigned input is first shifted down by that same amount so that the
    midpoint of its range maps to 0. In both cases the result lies in the
    half-open interval [-1.0, 1.0).

    The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py

    Args:
        wav: Array of audio samples with a signed or unsigned integer dtype.
        dtype: Floating-point dtype (name or numpy dtype) used for the
            conversion, defaults to 'float64'.

    Returns:
        A numpy array with the same shape as `wav`, holding the scaled samples.

    Raises:
        TypeError: If `wav` does not have an integer dtype, or if `dtype`
            is not a floating-point dtype.
    """
    # Explicit raises instead of `assert`: assertions are removed when Python
    # runs with -O, which would let invalid arguments through unchecked.
    if wav.dtype.kind not in 'iu':
        raise TypeError(
            f"int2float expects an integer array, got dtype '{wav.dtype}'"
        )
    out_dtype = numpy.dtype(dtype)
    if out_dtype.kind != 'f':
        raise TypeError(
            f"int2float expects a floating-point output dtype, got '{out_dtype}'"
        )

    int_info = numpy.iinfo(wav.dtype)
    # Magnitude of the most negative value of a signed integer of this width.
    abs_max = 2 ** (int_info.bits - 1)
    # 0 for signed types; abs_max for unsigned types (re-centres them on 0).
    offset = int_info.min + abs_max
    return (wav.astype(out_dtype) - offset) / abs_max

Loading…
Cancel
Save