# vggish/vggish_input.py

# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Compute input examples for VGGish from audio waveform."""

# Modification: Return torch tensors rather than numpy arrays
import torch
import torchaudio
import numpy as np

import mel_features
import vggish_params


def waveform_to_examples(data, sample_rate, return_tensor=True):
    """Turn an audio waveform into a batch of VGGish input examples.

    The waveform is mixed down to one channel, resampled to
    vggish_params.SAMPLE_RATE when its rate differs, converted into a log mel
    spectrogram and finally cut into fixed-length example patches.

    Args:
      data: np.array containing the waveform. A one-dimensional array is used
        as is; an array with more dimensions is averaged over axis 1 first.
        Each sample is generally expected to lie in the range [-1.0, +1.0],
        although this is not required.
      sample_rate: Sample rate of data.
      return_tensor: If true, return a PyTorch tensor ready for VGGish instead
        of the framed array.

    Returns:
      When return_tensor is false, the result of mel_features.frame: a 3-D
      np.array of shape [num_examples, num_frames, num_bands] holding a
      sequence of log mel spectrogram patches, each covering num_frames frames
      of audio and num_bands mel frequency bands, where the frame length is
      vggish_params.STFT_HOP_LENGTH_SECONDS.
      When return_tensor is true, the same data as a float32 torch tensor that
      is tracked by autograd and has a singleton axis inserted at position 1.
    """
    target_rate = vggish_params.SAMPLE_RATE

    # Mix multi-channel audio down to mono by averaging over axis 1.
    if len(data.shape) >= 2:
        data = np.mean(data, axis=1)

    # VGGish assumes a fixed sample rate; resample only when the input differs.
    if sample_rate != target_rate:
        waveform = torch.from_numpy(data)
        resample = torchaudio.transforms.Resample(
            orig_freq=sample_rate,
            new_freq=target_rate,
            dtype=waveform.dtype,
        )
        data = resample(waveform).cpu().detach().numpy()

    # Log mel spectrogram of the (mono, resampled) waveform.
    spectrogram = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=target_rate,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ,
    )

    # Express the example window and hop in spectrogram frames, not seconds.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(
        vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(
        vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    examples = mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames,
    )

    if not return_tensor:
        return examples

    # Insert a singleton axis at position 1 and cast to float32.
    batch = torch.tensor(examples, requires_grad=True)
    return batch[:, None, :, :].float()
Refactor Signed-off-by: Jael Gu <mengjia.gu@zilliz.com> 3 years ago			`# Copyright 2017 The TensorFlow Authors All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`# ==============================================================================`

			`"""Compute input examples for VGGish from audio waveform."""`

			`# Modification: Return torch tensors rather than numpy arrays`
			`import torch`
Replace resampy with torchaudio Signed-off-by: Jael Gu <mengjia.gu@zilliz.com> 3 years ago			`import torchaudio`
Refactor Signed-off-by: Jael Gu <mengjia.gu@zilliz.com> 3 years ago			`import numpy as np`

Update Signed-off-by: Jael Gu <mengjia.gu@zilliz.com> 3 years ago			`import mel_features`
			`import vggish_params`
Refactor Signed-off-by: Jael Gu <mengjia.gu@zilliz.com> 3 years ago

			`def waveform_to_examples(data, sample_rate, return_tensor=True):`
			`"""Converts audio waveform into an array of examples for VGGish.`

			`Args:`
			`data: np.array of either one dimension (mono) or two dimensions`
			`(multi-channel, with the outer dimension representing channels).`
			`Each sample is generally expected to lie in the range [-1.0, +1.0],`
			`although this is not required.`
			`sample_rate: Sample rate of data.`
			`return_tensor: Return data as a Pytorch tensor ready for VGGish`

			`Returns:`
			`3-D np.array of shape [num_examples, num_frames, num_bands] which represents`
			`a sequence of examples, each of which contains a patch of log mel`
			`spectrogram, covering num_frames frames of audio and num_bands mel frequency`
			`bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.`

			`"""`
Update Signed-off-by: Jael Gu <mengjia.gu@zilliz.com> 3 years ago			`if len(data.shape) > 1:`
			`data = np.mean(data, axis=1)`
Refactor Signed-off-by: Jael Gu <mengjia.gu@zilliz.com> 3 years ago			`# Resample to the rate assumed by VGGish.`
			`if sample_rate != vggish_params.SAMPLE_RATE:`
Replace resampy with torchaudio Signed-off-by: Jael Gu <mengjia.gu@zilliz.com> 3 years ago			`data = torch.from_numpy(data)`
			`resampler = torchaudio.transforms.Resample(sample_rate, vggish_params.SAMPLE_RATE, dtype=data.dtype)`
			`data = resampler(data).cpu().detach().numpy()`
Refactor Signed-off-by: Jael Gu <mengjia.gu@zilliz.com> 3 years ago
			`# Compute log mel spectrogram features.`
			`log_mel = mel_features.log_mel_spectrogram(`
			`data,`
			`audio_sample_rate=vggish_params.SAMPLE_RATE,`
			`log_offset=vggish_params.LOG_OFFSET,`
			`window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,`
			`hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,`
			`num_mel_bins=vggish_params.NUM_MEL_BINS,`
			`lower_edge_hertz=vggish_params.MEL_MIN_HZ,`
			`upper_edge_hertz=vggish_params.MEL_MAX_HZ)`

			`# Frame features into examples.`
			`features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS`
			`example_window_length = int(round(`
			`vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))`
			`example_hop_length = int(round(`
			`vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))`
			`log_mel_examples = mel_features.frame(`
			`log_mel,`
			`window_length=example_window_length,`
			`hop_length=example_hop_length)`

			`if return_tensor:`
			`log_mel_examples = torch.tensor(`
			`log_mel_examples, requires_grad=True)[:, None, :, :].float()`

			`return log_mel_examples`