# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import warnings

import os
import sys
import numpy
from pathlib import Path
from typing import List

import torch

from towhee.operator.base import NNOperator
from towhee.models.vggish.torch_vggish import VGG
from towhee import register
from towhee.types.audio_frame import AudioFrame

# `vggish_input` lives next to this file; make it importable regardless of
# the working directory the operator is loaded from.
sys.path.append(str(Path(__file__).parent))
import vggish_input

warnings.filterwarnings('ignore')
log = logging.getLogger()


@register(output_schema=['vec'])
class Vggish(NNOperator):
    """
    Audio embedding operator backed by the towhee `VGG` (VGGish) model.

    The operator takes a list of audio frames, converts them to model input
    with `vggish_input.waveform_to_examples`, runs the model, and returns
    the result as a numpy array.

    Args:
        weights_path (`str`):
            Path to the model state dict. When empty or None, `vggish.pth`
            in this file's directory is used.
        framework (`str`):
            Framework name forwarded to `NNOperator`, defaults to 'pytorch'.
    """

    def __init__(self, weights_path: str = None, framework: str = 'pytorch') -> None:
        super().__init__(framework=framework)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = VGG()
        if not weights_path:
            path = str(Path(__file__).parent)
            weights_path = os.path.join(path, 'vggish.pth')
        # NOTE: torch.load unpickles the file; only pass weight files from a
        # trusted source. Weights are loaded on CPU first and moved to the
        # target device afterwards.
        state_dict = torch.load(weights_path, map_location=torch.device('cpu'))
        self.model.load_state_dict(state_dict)
        self.model.eval()
        self.model.to(self.device)

    def __call__(self, data: List[AudioFrame]) -> numpy.ndarray:
        """
        Compute embeddings for a list of audio frames.

        Args:
            data (`List[AudioFrame]`):
                Audio frames; sample rate and layout are read from the
                first frame.

        Returns:
            (`numpy.ndarray`)
                The model output moved to CPU and converted to numpy.
        """
        audio_tensors = self.preprocess(data).to(self.device)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            features = self.model(audio_tensors)
        outs = features.to("cpu")
        return outs.detach().numpy()

    def preprocess(self, frames: List[AudioFrame]):
        """
        Convert audio frames into the input expected by the model.

        Frames are concatenated with `numpy.hstack`; when the first frame's
        layout is 'stereo' the samples are reshaped to two columns. Integer
        samples are converted to float with `int2float`, the array is
        transposed, and `vggish_input.waveform_to_examples` is called with
        `return_tensor=True`.

        Args:
            frames (`List[AudioFrame]`):
                Audio frames holding integer samples.

        Returns:
            The value returned by `vggish_input.waveform_to_examples`
            (presumably a torch tensor given `return_tensor=True` - confirm
            against `vggish_input`).

        Raises:
            Exception: any error from the transpose / example extraction is
            logged and re-raised unchanged.
        """
        sr = frames[0].sample_rate
        # NOTE(review): attribute is spelled `lay_out` here - confirm it
        # matches the AudioFrame API in the towhee version in use.
        layout = frames[0].lay_out
        audio = numpy.hstack(frames)
        if layout == 'stereo':
            audio = audio.reshape(-1, 2)
        audio = self.int2float(audio)
        try:
            audio = audio.transpose()
            audio_tensors = vggish_input.waveform_to_examples(audio, sr, return_tensor=True)
            return audio_tensors
        except Exception:
            log.error("Fail to load audio data.")
            # Bare raise keeps the original exception and traceback intact.
            raise

    def int2float(self, wav: numpy.ndarray, dtype: str = 'float64'):
        """
        Convert audio data from int to float.

        The input dtype must be integers. Values are scaled by
        2 ** (bits - 1) of the input dtype; unsigned input is first shifted
        so that the middle of its range maps to zero.
        The output dtype is controlled by the parameter `dtype`, defaults to 'float64'.
        The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py

        Args:
            wav (`numpy.ndarray`):
                Audio samples with a signed or unsigned integer dtype.
            dtype (`str`):
                Floating point dtype of the result.

        Returns:
            (`numpy.ndarray`)
                The scaled samples as an array of `dtype`.

        Raises:
            AssertionError: if `wav` is not an integer array or `dtype` is
            not a floating point dtype.
        """
        assert wav.dtype.kind in 'iu', 'input audio must have an integer dtype'
        dtype = numpy.dtype(dtype)
        assert dtype.kind == 'f', 'output dtype must be a floating point dtype'

        ii = numpy.iinfo(wav.dtype)
        abs_max = 2 ** (ii.bits - 1)
        # 0 for signed dtypes; abs_max for unsigned ones (re-centres on zero).
        offset = ii.min + abs_max
        return (wav.astype(dtype) - offset) / abs_max