vggish
copied
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Readme
Files and versions
224 lines
9.6 KiB
224 lines
9.6 KiB
2 years ago
|
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
# ==============================================================================
|
||
|
|
||
|
"""Defines routines to compute mel spectrogram features from audio waveform."""
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
|
||
|
def frame(data, window_length, hop_length):
|
||
|
"""Convert array into a sequence of successive possibly overlapping frames.
|
||
|
|
||
|
An n-dimensional array of shape (num_samples, ...) is converted into an
|
||
|
(n+1)-D array of shape (num_frames, window_length, ...), where each frame
|
||
|
starts hop_length points after the preceding one.
|
||
|
|
||
|
This is accomplished using stride_tricks, so the original data is not
|
||
|
copied. However, there is no zero-padding, so any incomplete frames at the
|
||
|
end are not included.
|
||
|
|
||
|
Args:
|
||
|
data: np.array of dimension N >= 1.
|
||
|
window_length: Number of samples in each frame.
|
||
|
hop_length: Advance (in samples) between each window.
|
||
|
|
||
|
Returns:
|
||
|
(N+1)-D np.array with as many rows as there are complete frames that can be
|
||
|
extracted.
|
||
|
"""
|
||
|
num_samples = data.shape[0]
|
||
|
num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))
|
||
|
shape = (num_frames, window_length) + data.shape[1:]
|
||
|
strides = (data.strides[0] * hop_length,) + data.strides
|
||
|
return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)
|
||
|
|
||
|
|
||
|
def periodic_hann(window_length):
|
||
|
"""Calculate a "periodic" Hann window.
|
||
|
|
||
|
The classic Hann window is defined as a raised cosine that starts and
|
||
|
ends on zero, and where every value appears twice, except the middle
|
||
|
point for an odd-length window. Matlab calls this a "symmetric" window
|
||
|
and np.hanning() returns it. However, for Fourier analysis, this
|
||
|
actually represents just over one cycle of a period N-1 cosine, and
|
||
|
thus is not compactly expressed on a length-N Fourier basis. Instead,
|
||
|
it's better to use a raised cosine that ends just before the final
|
||
|
zero value - i.e. a complete cycle of a period-N cosine. Matlab
|
||
|
calls this a "periodic" window. This routine calculates it.
|
||
|
|
||
|
Args:
|
||
|
window_length: The number of points in the returned window.
|
||
|
|
||
|
Returns:
|
||
|
A 1D np.array containing the periodic hann window.
|
||
|
"""
|
||
|
return 0.5 - (0.5 * np.cos(2 * np.pi / window_length *
|
||
|
np.arange(window_length)))
|
||
|
|
||
|
|
||
|
def stft_magnitude(signal, fft_length,
|
||
|
hop_length=None,
|
||
|
window_length=None):
|
||
|
"""Calculate the short-time Fourier transform magnitude.
|
||
|
|
||
|
Args:
|
||
|
signal: 1D np.array of the input time-domain signal.
|
||
|
fft_length: Size of the FFT to apply.
|
||
|
hop_length: Advance (in samples) between each frame passed to FFT.
|
||
|
window_length: Length of each block of samples to pass to FFT.
|
||
|
|
||
|
Returns:
|
||
|
2D np.array where each row contains the magnitudes of the fft_length/2+1
|
||
|
unique values of the FFT for the corresponding frame of input samples.
|
||
|
"""
|
||
|
frames = frame(signal, window_length, hop_length)
|
||
|
# Apply frame window to each frame. We use a periodic Hann (cosine of period
|
||
|
# window_length) instead of the symmetric Hann of np.hanning (period
|
||
|
# window_length-1).
|
||
|
window = periodic_hann(window_length)
|
||
|
windowed_frames = frames * window
|
||
|
return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))
|
||
|
|
||
|
|
||
|
# Mel spectrum constants and functions.
|
||
|
_MEL_BREAK_FREQUENCY_HERTZ = 700.0
|
||
|
_MEL_HIGH_FREQUENCY_Q = 1127.0
|
||
|
|
||
|
|
||
|
def hertz_to_mel(frequencies_hertz):
|
||
|
"""Convert frequencies to mel scale using HTK formula.
|
||
|
|
||
|
Args:
|
||
|
frequencies_hertz: Scalar or np.array of frequencies in hertz.
|
||
|
|
||
|
Returns:
|
||
|
Object of same size as frequencies_hertz containing corresponding values
|
||
|
on the mel scale.
|
||
|
"""
|
||
|
return _MEL_HIGH_FREQUENCY_Q * np.log(
|
||
|
1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))
|
||
|
|
||
|
|
||
|
def spectrogram_to_mel_matrix(num_mel_bins=20,
|
||
|
num_spectrogram_bins=129,
|
||
|
audio_sample_rate=8000,
|
||
|
lower_edge_hertz=125.0,
|
||
|
upper_edge_hertz=3800.0):
|
||
|
"""Return a matrix that can post-multiply spectrogram rows to make mel.
|
||
|
|
||
|
Returns a np.array matrix A that can be used to post-multiply a matrix S of
|
||
|
spectrogram values (STFT magnitudes) arranged as frames x bins to generate a
|
||
|
"mel spectrogram" M of frames x num_mel_bins. M = S A.
|
||
|
|
||
|
The classic HTK algorithm exploits the complementarity of adjacent mel bands
|
||
|
to multiply each FFT bin by only one mel weight, then add it, with positive
|
||
|
and negative signs, to the two adjacent mel bands to which that bin
|
||
|
contributes. Here, by expressing this operation as a matrix multiply, we go
|
||
|
from num_fft multiplies per frame (plus around 2*num_fft adds) to around
|
||
|
num_fft^2 multiplies and adds. However, because these are all presumably
|
||
|
accomplished in a single call to np.dot(), it's not clear which approach is
|
||
|
faster in Python. The matrix multiplication has the attraction of being more
|
||
|
general and flexible, and much easier to read.
|
||
|
|
||
|
Args:
|
||
|
num_mel_bins: How many bands in the resulting mel spectrum. This is
|
||
|
the number of columns in the output matrix.
|
||
|
num_spectrogram_bins: How many bins there are in the source spectrogram
|
||
|
data, which is understood to be fft_size/2 + 1, i.e. the spectrogram
|
||
|
only contains the nonredundant FFT bins.
|
||
|
audio_sample_rate: Samples per second of the audio at the input to the
|
||
|
spectrogram. We need this to figure out the actual frequencies for
|
||
|
each spectrogram bin, which dictates how they are mapped into mel.
|
||
|
lower_edge_hertz: Lower bound on the frequencies to be included in the mel
|
||
|
spectrum. This corresponds to the lower edge of the lowest triangular
|
||
|
band.
|
||
|
upper_edge_hertz: The desired top edge of the highest frequency band.
|
||
|
|
||
|
Returns:
|
||
|
An np.array with shape (num_spectrogram_bins, num_mel_bins).
|
||
|
|
||
|
Raises:
|
||
|
ValueError: if frequency edges are incorrectly ordered or out of range.
|
||
|
"""
|
||
|
nyquist_hertz = audio_sample_rate / 2.
|
||
|
if lower_edge_hertz < 0.0:
|
||
|
raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz)
|
||
|
if lower_edge_hertz >= upper_edge_hertz:
|
||
|
raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" %
|
||
|
(lower_edge_hertz, upper_edge_hertz))
|
||
|
if upper_edge_hertz > nyquist_hertz:
|
||
|
raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" %
|
||
|
(upper_edge_hertz, nyquist_hertz))
|
||
|
spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins)
|
||
|
spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)
|
||
|
# The i'th mel band (starting from i=1) has center frequency
|
||
|
# band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge
|
||
|
# band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in
|
||
|
# the band_edges_mel arrays.
|
||
|
band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),
|
||
|
hertz_to_mel(upper_edge_hertz), num_mel_bins + 2)
|
||
|
# Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins
|
||
|
# of spectrogram values.
|
||
|
mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))
|
||
|
for i in range(num_mel_bins):
|
||
|
lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]
|
||
|
# Calculate lower and upper slopes for every spectrogram bin.
|
||
|
# Line segments are linear in the *mel* domain, not hertz.
|
||
|
lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /
|
||
|
(center_mel - lower_edge_mel))
|
||
|
upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /
|
||
|
(upper_edge_mel - center_mel))
|
||
|
# .. then intersect them with each other and zero.
|
||
|
mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope,
|
||
|
upper_slope))
|
||
|
# HTK excludes the spectrogram DC bin; make sure it always gets a zero
|
||
|
# coefficient.
|
||
|
mel_weights_matrix[0, :] = 0.0
|
||
|
return mel_weights_matrix
|
||
|
|
||
|
|
||
|
def log_mel_spectrogram(data,
|
||
|
audio_sample_rate=8000,
|
||
|
log_offset=0.0,
|
||
|
window_length_secs=0.025,
|
||
|
hop_length_secs=0.010,
|
||
|
**kwargs):
|
||
|
"""Convert waveform to a log magnitude mel-frequency spectrogram.
|
||
|
|
||
|
Args:
|
||
|
data: 1D np.array of waveform data.
|
||
|
audio_sample_rate: The sampling rate of data.
|
||
|
log_offset: Add this to values when taking log to avoid -Infs.
|
||
|
window_length_secs: Duration of each window to analyze.
|
||
|
hop_length_secs: Advance between successive analysis windows.
|
||
|
**kwargs: Additional arguments to pass to spectrogram_to_mel_matrix.
|
||
|
|
||
|
Returns:
|
||
|
2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank
|
||
|
magnitudes for successive frames.
|
||
|
"""
|
||
|
window_length_samples = int(round(audio_sample_rate * window_length_secs))
|
||
|
hop_length_samples = int(round(audio_sample_rate * hop_length_secs))
|
||
|
fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))
|
||
|
spectrogram = stft_magnitude(
|
||
|
data,
|
||
|
fft_length=fft_length,
|
||
|
hop_length=hop_length_samples,
|
||
|
window_length=window_length_samples)
|
||
|
mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix(
|
||
|
num_spectrogram_bins=spectrogram.shape[1],
|
||
|
audio_sample_rate=audio_sample_rate, **kwargs))
|
||
|
return np.log(mel_spectrogram + log_offset)
|