Update

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
3 years ago · 61ece2d756
4 changed files with 19 additions and 16 deletions
--- a/README.md
+++ b/README.md
@ -4,7 +4,7 @@
 <br />
 ## Desription
 ## Description
 The audio embedding operator converts an input audio into a dense vector which can be used to represent the audio clip's semantics.
 Each vector represents an audio clip with a fixed length of around 0.9s.
@ -83,7 +83,7 @@ Default value is "pytorch" since the model is implemented in Pytorch.
 ## Interface
 An audio embedding operator generates vectors in numpy.ndarray given an audio file path or towhee audio frames.
 An audio embedding operator generates vectors in numpy.ndarray given towhee audio frames.
 **Parameters:**
--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +1,6 @@
 torch>=1.9.0
 numpy>=1.19.5
 resampy
 torchaudio
 towhee
 towhee.models
 towhee.models
--- a/vggish.py
+++ b/vggish.py
@ -60,13 +60,15 @@ class Vggish(NNOperator):
    def preprocess(self, frames: List[AudioFrame]):
        sr = frames[0].sample_rate
        layout = frames[0].lay_out
        audio = numpy.hstack(frames)
        layout = frames[0].layout
        if layout == 'stereo':
            audio = audio.reshape(-1, 2)
            frames = [frame.reshape(-1, 2) for frame in frames]
            audio = numpy.vstack(frames)
        else:
            audio = numpy.hstack(frames)
            audio = audio.transpose()
        audio = self.int2float(audio)
        try:
            audio = audio.transpose()
            audio_tensors = vggish_input.waveform_to_examples(audio, sr, return_tensor=True)
            return audio_tensors
        except Exception as e:
@ -81,11 +83,14 @@ class Vggish(NNOperator):
        The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
        """
        assert wav.dtype.kind in 'iu'
        dtype = numpy.dtype(dtype)
        assert dtype.kind == 'f'
        ii = numpy.iinfo(wav.dtype)
        abs_max = 2 ** (ii.bits - 1)
        offset = ii.min + abs_max
        return (wav.astype(dtype) - offset) / abs_max
        if wav.dtype.kind in 'iu':
            ii = numpy.iinfo(wav.dtype)
            abs_max = 2 ** (ii.bits - 1)
            offset = ii.min + abs_max
            return (wav.astype(dtype) - offset) / abs_max
        else:
            log.warning('Converting float dtype from %s to %s.', wav.dtype, dtype)
            return wav.astype(dtype)
--- a/vggish_input.py
+++ b/vggish_input.py
@ -42,9 +42,8 @@ def waveform_to_examples(data, sample_rate, return_tensor=True):
    bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
  """
    # Todo: convert stereo to mono.
    # if len(data.shape) > 1:
    #     data = np.mean(data, axis=1)
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != vggish_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)