# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import warnings

import os
import sys
import numpy
from pathlib import Path
from typing import List

import torch

from towhee.operator.base import NNOperator
from towhee.models.vggish.torch_vggish import VGG
from towhee import register
from towhee.types.audio_frame import AudioFrame

# `vggish_input` lives next to this file, so its directory must be on
# sys.path before the import below can succeed.
sys.path.append(str(Path(__file__).parent))
import vggish_input

warnings.filterwarnings('ignore')
log = logging.getLogger()


@register(output_schema=['vec'])
class Vggish(NNOperator):
    """
    Audio operator that runs a list of audio frames through a `VGG` model.

    Args:
        weights_path (`str`):
            Path to the model state dict. When empty or None, the file
            `vggish.pth` located next to this module is loaded.
        framework (`str`):
            Framework name forwarded to `NNOperator`, defaults to 'pytorch'.
    """

    def __init__(self, weights_path: str = None, framework: str = 'pytorch') -> None:
        super().__init__(framework=framework)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = VGG()
        if not weights_path:
            path = str(Path(__file__).parent)
            weights_path = os.path.join(path, 'vggish.pth')
        # Load onto CPU first so the checkpoint can be read on machines
        # without a GPU; the model is moved to the target device afterwards.
        state_dict = torch.load(weights_path, map_location=torch.device('cpu'))
        self.model.load_state_dict(state_dict)
        self.model.eval()
        self.model.to(self.device)

    def __call__(self, data: List[AudioFrame]) -> numpy.ndarray:
        """
        Run the model on the given audio frames.

        Args:
            data (`List[AudioFrame]`):
                Audio frames; sample rate and layout are read from the first one.

        Returns:
            `numpy.ndarray`: the model output moved to CPU.
        """
        audio_tensors = self.preprocess(data).to(self.device)
        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            features = self.model(audio_tensors)
        outs = features.to("cpu")
        return outs.detach().numpy()

    def preprocess(self, frames: List[AudioFrame]):
        """
        Merge audio frames into one waveform and convert it to model inputs.

        Stereo frames are reshaped to two columns and stacked vertically,
        other layouts are concatenated horizontally; the merged array is then
        transposed and converted to float before being handed to
        `vggish_input.waveform_to_examples`.

        Args:
            frames (`List[AudioFrame]`):
                Audio frames; `sample_rate` and `layout` of the first frame
                are applied to all of them.

        Returns:
            The value returned by `vggish_input.waveform_to_examples(...,
            return_tensor=True)` (presumably a torch tensor - `__call__`
            calls `.to(device)` on it).

        Raises:
            Exception: whatever `waveform_to_examples` raises, after logging.
        """
        sr = frames[0].sample_rate
        layout = frames[0].layout
        if layout == 'stereo':
            frames = [frame.reshape(-1, 2) for frame in frames]
            audio = numpy.vstack(frames)
        else:
            audio = numpy.hstack(frames)
        audio = audio.transpose()
        audio = self.int2float(audio)
        try:
            audio_tensors = vggish_input.waveform_to_examples(audio, sr, return_tensor=True)
        except Exception:
            log.error("Fail to load audio data.")
            # Bare raise keeps the original traceback intact.
            raise
        return audio_tensors

    def int2float(self, wav: numpy.ndarray, dtype: str = 'float64'):
        """
        Convert audio data from int to float.

        Integer input is scaled into the range [-1.0, 1.0):
          - int16 is divided by 32768;
          - int32 is reduced to its upper 16 bits and then divided by 32768;
          - any other integer dtype is centred (for unsigned types) and
            divided by half of its value range.
        Non-integer input is only cast to `dtype`, with a warning.

        The output dtype is controlled by the parameter `dtype`, defaults to 'float64'.

        The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py

        Args:
            wav (`numpy.ndarray`):
                Audio samples.
            dtype (`str`):
                Target floating point dtype.

        Returns:
            `numpy.ndarray`: the samples as an array of `dtype`.

        Raises:
            AssertionError: if `dtype` is not a floating point dtype.
        """
        dtype = numpy.dtype(dtype)
        assert dtype.kind == 'f'

        if wav.dtype.kind not in 'iu':
            log.warning('Converting float dtype from %s to %s.', wav.dtype, dtype)
            return wav.astype(dtype)

        if wav.dtype == numpy.int16:
            return (wav / 32768.0).astype(dtype)

        if wav.dtype == numpy.int32:
            # Retained as-is so existing int32 results stay unchanged:
            # keep the 16 most significant bits, then scale like int16.
            wav = (wav >> 16).astype(numpy.int16)
            return (wav / 32768.0).astype(dtype)

        # Remaining integer widths (int8/uint8/uint16/uint32/int64/uint64):
        # a 16-bit right shift is meaningless for these, so scale by the
        # dtype's own range instead. Computed in float64 to avoid overflow
        # when the requested output dtype is narrow.
        info = numpy.iinfo(wav.dtype)
        abs_max = 2 ** (info.bits - 1)
        offset = info.min + abs_max  # 0 for signed, mid-range for unsigned
        scaled = (wav.astype(numpy.float64) - float(offset)) / float(abs_max)
        return scaled.astype(dtype)