logo
Browse Source

Update

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
main
Jael Gu 3 years ago
parent
commit
61ece2d756
  1. 4
      README.md
  2. 1
      requirements.txt
  3. 15
      vggish.py
  4. 5
      vggish_input.py

4
README.md

@@ -4,7 +4,7 @@
<br />
## Desription
## Description
The audio embedding operator converts an input audio into a dense vector which can be used to represent the audio clip's semantics.
Each vector represents for an audio clip with a fixed length of around 0.9s.
@@ -83,7 +83,7 @@ Default value is "pytorch" since the model is implemented in Pytorch.
## Interface
An audio embedding operator generates vectors in numpy.ndarray given an audio file path or towhee audio frames.
An audio embedding operator generates vectors in numpy.ndarray given towhee audio frames.
**Parameters:**

1
requirements.txt

@@ -1,7 +1,6 @@
torch>=1.9.0
numpy>=1.19.5
resampy
torchaudio
towhee
towhee.models

15
vggish.py

@@ -60,13 +60,15 @@ class Vggish(NNOperator):
def preprocess(self, frames: List[AudioFrame]):
sr = frames[0].sample_rate
layout = frames[0].lay_out
audio = numpy.hstack(frames)
layout = frames[0].layout
if layout == 'stereo':
audio = audio.reshape(-1, 2)
frames = [frame.reshape(-1, 2) for frame in frames]
audio = numpy.vstack(frames)
else:
audio = numpy.hstack(frames)
audio = audio.transpose()
audio = self.int2float(audio)
try:
audio = audio.transpose()
audio_tensors = vggish_input.waveform_to_examples(audio, sr, return_tensor=True)
return audio_tensors
except Exception as e:
@@ -81,11 +83,14 @@ class Vggish(NNOperator):
The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
"""
assert wav.dtype.kind in 'iu'
dtype = numpy.dtype(dtype)
assert dtype.kind == 'f'
if wav.dtype.kind in 'iu':
ii = numpy.iinfo(wav.dtype)
abs_max = 2 ** (ii.bits - 1)
offset = ii.min + abs_max
return (wav.astype(dtype) - offset) / abs_max
else:
log.warning('Converting float dtype from %s to %s.', wav.dtype, dtype)
return wav.astype(dtype)

5
vggish_input.py

@@ -42,9 +42,8 @@ def waveform_to_examples(data, sample_rate, return_tensor=True):
bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
"""
# Todo: convert stereo to mono.
# if len(data.shape) > 1:
# data = np.mean(data, axis=1)
if len(data.shape) > 1:
data = np.mean(data, axis=1)
# Resample to the rate assumed by VGGish.
if sample_rate != vggish_params.SAMPLE_RATE:
data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

Loading…
Cancel
Save