# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import torch
import towhee

from PIL import Image as PILImage

from transformers import Data2VecAudioModel, Wav2Vec2Processor
from towhee.operator.base import NNOperator


class Data2VecAudio(NNOperator):
    """Audio embedding operator backed by a pretrained Data2Vec audio model.

    Args:
        model_name: Hugging Face model identifier (or local path) passed to
            ``from_pretrained`` for both the model and its processor.
    """

    def __init__(self, model_name="facebook/data2vec-audio-base-960h"):
        super().__init__()
        # Honour the requested checkpoint instead of a hard-coded one.
        self.model_name = model_name
        self.model = Data2VecAudioModel.from_pretrained(model_name)
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        # Inference only: switch off dropout and other train-time behaviour.
        self.model.eval()

    def __call__(self, data):
        """Compute an embedding for a sequence of audio frames.

        Args:
            data: Non-empty sequence of array-like audio frames. The first
                frame must expose a ``_sample_rate`` attribute.
                NOTE(review): the 32768 scaling suggests 16-bit PCM
                samples -- confirm against the upstream decoder.

        Returns:
            A 1-D ``numpy.ndarray`` holding the hidden state of the last
            time step produced by the model.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if len(data) == 0:
            raise ValueError("Data2VecAudio expects at least one audio frame")

        sampling_rate = data[0]._sample_rate

        # Concatenate all frames into one flat float32 waveform and scale
        # by 32768 (presumably int16 full scale -> [-1.0, 1.0)).
        waveform = np.hstack(data).astype(np.float32, order='C').reshape(-1)
        waveform = waveform / 32768.0

        inputs = self.processor(
            waveform,
            sampling_rate=sampling_rate,
            return_tensors="pt",
        )

        with torch.no_grad():
            outputs = self.model(**inputs)

        last_hidden_states = outputs.last_hidden_state
        # Keep only the final time step as the embedding.
        feat = last_hidden_states[:, -1, :].flatten().detach().cpu().numpy()
        return feat