| 
					
					
						
							
						
					
					
				 | 
				@ -19,13 +19,14 @@ import os | 
			
		
		
	
		
			
				 | 
				 | 
				import sys | 
				 | 
				 | 
				import sys | 
			
		
		
	
		
			
				 | 
				 | 
				import numpy | 
				 | 
				 | 
				import numpy | 
			
		
		
	
		
			
				 | 
				 | 
				from pathlib import Path | 
				 | 
				 | 
				from pathlib import Path | 
			
		
		
	
		
			
				 | 
				 | 
				from typing import Union | 
				 | 
				 | 
				 | 
			
		
		
	
		
			
				 | 
				 | 
				 | 
				 | 
				 | 
				from typing import List | 
			
		
		
	
		
			
				 | 
				 | 
				
 | 
				 | 
				 | 
				
 | 
			
		
		
	
		
			
				 | 
				 | 
				import torch | 
				 | 
				 | 
				import torch | 
			
		
		
	
		
			
				 | 
				 | 
				
 | 
				 | 
				 | 
				
 | 
			
		
		
	
		
			
				 | 
				 | 
				from towhee.operator.base import NNOperator | 
				 | 
				 | 
				from towhee.operator.base import NNOperator | 
			
		
		
	
		
			
				 | 
				 | 
				from towhee.models.vggish.torch_vggish import VGG | 
				 | 
				 | 
				from towhee.models.vggish.torch_vggish import VGG | 
			
		
		
	
		
			
				 | 
				 | 
				from towhee import register | 
				 | 
				 | 
				from towhee import register | 
			
		
		
	
		
			
				 | 
				 | 
				 | 
				 | 
				 | 
				from towhee.types.audio_frame import AudioFrame | 
			
		
		
	
		
			
				 | 
				 | 
				
 | 
				 | 
				 | 
				
 | 
			
		
		
	
		
			
				 | 
				 | 
				sys.path.append(str(Path(__file__).parent)) | 
				 | 
				 | 
				sys.path.append(str(Path(__file__).parent)) | 
			
		
		
	
		
			
				 | 
				 | 
				import vggish_input | 
				 | 
				 | 
				import vggish_input | 
			
		
		
	
	
		
			
				| 
					
					
					
						
							
						
					
				 | 
				@ -51,25 +52,26 @@ class Vggish(NNOperator): | 
			
		
		
	
		
			
				 | 
				 | 
				        self.model.eval() | 
				 | 
				 | 
				        self.model.eval() | 
			
		
		
	
		
			
				 | 
				 | 
				        self.model.to(self.device) | 
				 | 
				 | 
				        self.model.to(self.device) | 
			
		
		
	
		
			
				 | 
				 | 
				
 | 
				 | 
				 | 
				
 | 
			
		
		
	
		
			
				 | 
				 | 
				    def __call__(self, audio: Union[str, numpy.ndarray], sr: int = None) -> numpy.ndarray: | 
				 | 
				 | 
				 | 
			
		
		
	
		
			
				 | 
				 | 
				        audio_tensors = self.preprocess(audio, sr).to(self.device) | 
				 | 
				 | 
				 | 
			
		
		
	
		
			
				 | 
				 | 
				 | 
				 | 
				 | 
				    def __call__(self, data: List[AudioFrame]) -> numpy.ndarray: | 
			
		
		
	
		
			
				 | 
				 | 
				 | 
				 | 
				 | 
				        audio_tensors = self.preprocess(data).to(self.device) | 
			
		
		
	
		
			
				 | 
				 | 
				        features = self.model(audio_tensors) | 
				 | 
				 | 
				        features = self.model(audio_tensors) | 
			
		
		
	
		
			
				 | 
				 | 
				        outs = features.to("cpu") | 
				 | 
				 | 
				        outs = features.to("cpu") | 
			
		
		
	
		
			
				 | 
				 | 
				        return outs.detach().numpy() | 
				 | 
				 | 
				        return outs.detach().numpy() | 
			
		
		
	
		
			
				 | 
				 | 
				
 | 
				 | 
				 | 
				
 | 
			
		
		
	
		
			
				 | 
				 | 
				    def preprocess(self, audio: Union[str, numpy.ndarray], sr: int = None): | 
				 | 
				 | 
				 | 
			
		
		
	
		
			
				 | 
				 | 
				        if isinstance(audio, str): | 
				 | 
				 | 
				 | 
			
		
		
	
		
			
				 | 
				 | 
				            audio_tensors = vggish_input.wavfile_to_examples(audio) | 
				 | 
				 | 
				 | 
			
		
		
	
		
			
				 | 
				 | 
				        elif isinstance(audio, numpy.ndarray): | 
				 | 
				 | 
				 | 
			
		
		
	
		
			
				 | 
				 | 
				 | 
				 | 
				 | 
				    def preprocess(self, frames: List[AudioFrame]): | 
			
		
		
	
		
			
				 | 
				 | 
				 | 
				 | 
				 | 
				        sr = frames[0].sample_rate | 
			
		
		
	
		
			
				 | 
				 | 
				 | 
				 | 
				 | 
				        audio = numpy.hstack(frames) | 
			
		
		
	
		
			
				 | 
				 | 
				 | 
				 | 
				 | 
				        if audio.dtype == numpy.int32: | 
			
		
		
	
		
			
				 | 
				 | 
				 | 
				 | 
				 | 
				            audio = audio / 2147483648.0 | 
			
		
		
	
		
			
				 | 
				 | 
				 | 
				 | 
				 | 
				        elif audio.dtype == numpy.int16: | 
			
		
		
	
		
			
				 | 
				 | 
				 | 
				 | 
				 | 
				            audio = audio / 32768.0 | 
			
		
		
	
		
			
				 | 
				 | 
				        try: | 
				 | 
				 | 
				        try: | 
			
		
		
	
		
			
				 | 
				 | 
				            audio = audio.transpose() | 
				 | 
				 | 
				            audio = audio.transpose() | 
			
		
		
	
		
			
				 | 
				 | 
				            audio_tensors = vggish_input.waveform_to_examples(audio, sr, return_tensor=True) | 
				 | 
				 | 
				            audio_tensors = vggish_input.waveform_to_examples(audio, sr, return_tensor=True) | 
			
		
		
	
		
			
				 | 
				 | 
				 | 
				 | 
				 | 
				            return audio_tensors | 
			
		
		
	
		
			
				 | 
				 | 
				        except Exception as e: | 
				 | 
				 | 
				        except Exception as e: | 
			
		
		
	
		
			
				 | 
				 | 
				            log.error("Fail to load audio data.") | 
				 | 
				 | 
				            log.error("Fail to load audio data.") | 
			
		
		
	
		
			
				 | 
				 | 
				            raise e | 
				 | 
				 | 
				            raise e | 
			
		
		
	
		
			
				 | 
				 | 
				        else: | 
				 | 
				 | 
				 | 
			
		
		
	
		
			
				 | 
				 | 
				            log.error(f"Invalid input audio: {type(audio)}") | 
				 | 
				 | 
				 | 
			
		
		
	
		
			
				 | 
				 | 
				        return audio_tensors | 
				 | 
				 | 
				 | 
			
		
		
	
		
			
				 | 
				 | 
				
 | 
				 | 
				 | 
				
 | 
			
		
		
	
		
			
				 | 
				 | 
				
 | 
				 | 
				 | 
				
 | 
			
		
		
	
		
			
				 | 
				 | 
				# if __name__ == '__main__': | 
				 | 
				 | 
				# if __name__ == '__main__': | 
			
		
		
	
	
		
			
				| 
					
						
							
						
					
					
					
				 | 
				
  |