# Copyright 2017 The TensorFlow Authors All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Compute input examples for VGGish from audio waveform.""" # Modification: Return torch tensors rather than numpy arrays import torch import torchaudio import numpy as np import mel_features import vggish_params def waveform_to_examples(data, sample_rate, return_tensor=True): """Converts audio waveform into an array of examples for VGGish. Args: data: np.array of either one dimension (mono) or two dimensions (multi-channel, with the outer dimension representing channels). Each sample is generally expected to lie in the range [-1.0, +1.0], although this is not required. sample_rate: Sample rate of data. return_tensor: Return data as a Pytorch tensor ready for VGGish Returns: 3-D np.array of shape [num_examples, num_frames, num_bands] which represents a sequence of examples, each of which contains a patch of log mel spectrogram, covering num_frames frames of audio and num_bands mel frequency bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS. """ if len(data.shape) > 1: data = np.mean(data, axis=1) # Resample to the rate assumed by VGGish. if sample_rate != vggish_params.SAMPLE_RATE: data = torch.from_numpy(data) resampler = torchaudio.transforms.Resample(sample_rate, vggish_params.SAMPLE_RATE, dtype=data.dtype) data = resampler(data).cpu().detach().numpy() # Compute log mel spectrogram features. log_mel = mel_features.log_mel_spectrogram( data, audio_sample_rate=vggish_params.SAMPLE_RATE, log_offset=vggish_params.LOG_OFFSET, window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS, hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS, num_mel_bins=vggish_params.NUM_MEL_BINS, lower_edge_hertz=vggish_params.MEL_MIN_HZ, upper_edge_hertz=vggish_params.MEL_MAX_HZ) # Frame features into examples. features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS example_window_length = int(round( vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) example_hop_length = int(round( vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) log_mel_examples = mel_features.frame( log_mel, window_length=example_window_length, hop_length=example_hop_length) if return_tensor: log_mel_examples = torch.tensor( log_mel_examples, requires_grad=True)[:, None, :, :].float() return log_mel_examples