# Audio Embedding with Neural Network Fingerprint

*Author: [Jael Gu](https://github.com/jaelgu)*

<br />

## Description
The audio embedding operator converts an input audio into a dense vector which represents the semantics of the audio clip. Each vector represents an audio clip with a fixed length of around 1s.

This operator generates audio embeddings with the fingerprinting method introduced by [Neural Audio Fingerprint](https://arxiv.org/abs/2010.11910). The model is implemented in PyTorch. We have also trained the nnfp model on the [FMA dataset](https://github.com/mdeff/fma) (plus some noise audio) and share the weights with this operator. The nnfp operator is suitable for generating audio fingerprints.

<br />
## Code Example

Generate embeddings for the audio "test.wav".

*Write the pipeline in simplified style:*
```python
import towhee

(
    towhee.glob('test.wav')
          .audio_decode.ffmpeg()
          .runas_op(func=lambda x: [y[0] for y in x])
          .audio_embedding.nnfp()  # use default model
          .show()
)
```

<img src="./result1.png" width="800px"/>

*Write the same pipeline with explicit input/output name specifications:*
```python
import towhee

(
    towhee.glob['path']('test.wav')
          .audio_decode.ffmpeg['path', 'frames']()
          .runas_op['frames', 'frames'](func=lambda x: [y[0] for y in x])
          .audio_embedding.nnfp['frames', 'vecs']()
          .select['path', 'vecs']()
          .show()
)
```

<img src="./result2.png" width="800px"/>

<br />

## Factory Constructor

Create the operator via the following factory method:

***audio_embedding.nnfp(params=None, checkpoint_path=None, framework='pytorch')***

**Parameters:**

*params: dict*

A dictionary of model parameters. If None, the default parameters are used to create the model.

*checkpoint_path: str*

The path to model weights. If None, the default model weights are loaded.

*framework: str*

The framework of the model implementation. The default value is "pytorch", since the model is implemented in PyTorch.
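
Below is a minimal sketch of using the factory with custom weights. It assumes the towhee 0.x `ops` entry point, and the checkpoint path is a hypothetical placeholder:

```python
from towhee import ops

# Construct the operator with custom weights (hypothetical path);
# omitting checkpoint_path loads the bundled default weights instead.
op = ops.audio_embedding.nnfp(checkpoint_path='path/to/my_weights.pt')
```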

<br />

## Interface

An audio embedding operator generates vectors in numpy.ndarray given towhee audio frames.

**Parameters:**

*data: List[towhee.types.audio_frame.AudioFrame]*

Input audio data as a list of towhee audio frames. The input data should represent an audio clip longer than 1s.

**Returns:**

*numpy.ndarray*

Audio embeddings in shape (num_clips, 128). Each embedding stands for the features of a 1s audio clip.
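
As a concrete illustration, the operator can also be called directly on a list of audio frames. This is a sketch only: it assumes the import path and that `AudioFrame` accepts `(data, sample_rate, timestamp, layout)`, and it feeds 2s of silent mono PCM:

```python
import numpy
from towhee.types.audio_frame import AudioFrame
from nn_fingerprint import NNFingerprint  # assumed import path; class defined below

# 2 s of int16 silence at 8 kHz, wrapped as a single mono frame.
frames = [AudioFrame(numpy.zeros(16000, dtype=numpy.int16),
                     sample_rate=8000, timestamp=0, layout='mono')]
op = NNFingerprint()
vecs = op(frames)
print(vecs.shape)  # expected (num_clips, 128), e.g. 2 clips with the default 1 s segments
```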

<br />

## Source Files

*The operator's package entry point (its `__init__.py`), which exposes the `nnfp` factory:*

```python
# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .nn_fingerprint import NNFingerprint


def nnfp():
    return NNFingerprint()
```

*Default model parameters (`configs.py`):*

```python
# Parameter configs for nnfp, inspired by https://github.com/stdio2016/pfann
#
# Copyright 2021 Zilliz. All rights reserved.
# Licensed under the Apache License, Version 2.0 (same license header as above).

default_params = {
    "dim": 128,
    "h": 1024,
    "u": 32,
    "fuller": True,
    "activation": "relu",
    "sample_rate": 8000,
    "window_length": 1024,
    "hop_length": 256,
    "n_mels": 256,
    "f_min": 300,
    "f_max": 4000,
    "segment_size": 1,
    "hop_size": 1,
    "frame_shift_mul": 1,
    "naf_mode": False,
    "mel_log": "log",
    "spec_norm": "l2"
}
```
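
If you need non-default parameters, here is a hedged sketch of overriding them, using the `NNFingerprint` class from `nn_fingerprint.py` below (the checkpoint path is a hypothetical placeholder):

```python
from copy import deepcopy

# Copy the defaults and tweak one value; keys must match what NNFingerprint reads.
my_params = deepcopy(default_params)
my_params['f_max'] = 3800  # illustration only

# The bundled weights were trained with the defaults, so parameter changes that
# alter the spectrogram or model shape require matching custom weights.
op = NNFingerprint(params=my_params, checkpoint_path='path/to/custom.pt')
```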

*The operator implementation (`nn_fingerprint.py`):*

```python
# Copyright 2021 Zilliz. All rights reserved.
# Licensed under the Apache License, Version 2.0 (same license header as above).

import logging
import warnings

import os
from pathlib import Path
from typing import List

import torch
import numpy
import resampy

from towhee.operator.base import NNOperator
from towhee import register
from towhee.types.audio_frame import AudioFrame
from towhee.models.nnfp import NNFp
from towhee.models.utils.audio_preprocess import preprocess_wav, MelSpec

from .configs import default_params

warnings.filterwarnings('ignore')
log = logging.getLogger()

@register(output_schema=['vecs'])
class NNFingerprint(NNOperator):
    """
    Audio embedding operator using Neural Network Fingerprint
    """

    def __init__(self,
                 params: dict = None,
                 checkpoint_path: str = None,
                 framework: str = 'pytorch'):
        super().__init__(framework=framework)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if params is None:
            self.params = default_params
        else:
            self.params = params

        dim = self.params['dim']
        h = self.params['h']
        u = self.params['u']
        f_bin = self.params['n_mels']
        n_seg = int(self.params['segment_size'] * self.params['sample_rate'])
        # Number of STFT frames per segment, i.e. ceil(n_seg / hop_length);
        # with the defaults this is (8000 + 255) // 256 = 32.
        t = (n_seg + self.params['hop_length'] - 1) // self.params['hop_length']

        log.info('Creating model...')
        self.model = NNFp(
            dim=dim, h=h, u=u,
            in_f=f_bin, in_t=t,
            fuller=self.params['fuller'],
            activation=self.params['activation']
        ).to(self.device)

        log.info('Loading weights...')
        if checkpoint_path is None:
            path = str(Path(__file__).parent)
            checkpoint_path = os.path.join(path, './checkpoints/pfann_fma_m.pt')
        state_dict = torch.load(checkpoint_path, map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.model.eval()
        log.info('Model is loaded.')
    def __call__(self, data: List[AudioFrame]) -> numpy.ndarray:
        # Convert frames to mel-spectrogram tensors, run the model, and
        # return the embeddings as a numpy array on the host.
        audio_tensors = self.preprocess(data).to(self.device)
        features = self.model(audio_tensors)
        return features.detach().cpu().numpy()
    def preprocess(self, frames: List[AudioFrame]):
        sr = frames[0].sample_rate
        layout = frames[0].layout
        if layout == 'stereo':
            # Interleaved stereo samples -> shape (channels, n_samples)
            frames = [frame.reshape(-1, 2) for frame in frames]
            audio = numpy.vstack(frames).transpose()
        else:
            audio = numpy.hstack(frames)
            audio = audio[None, :]

        audio = self.int2float(audio)

        if sr != self.params['sample_rate']:
            audio = resampy.resample(audio, sr, self.params['sample_rate'])

        # Split the waveform into fixed-length segments, then convert each
        # segment into a (normalized) log-mel spectrogram on the target device.
        wav = preprocess_wav(audio,
                             segment_size=int(self.params['sample_rate'] * self.params['segment_size']),
                             hop_size=int(self.params['sample_rate'] * self.params['hop_size']),
                             frame_shift_mul=self.params['frame_shift_mul']).to(self.device)
        wav = wav.to(torch.float32)
        mel = MelSpec(sample_rate=self.params['sample_rate'],
                      window_length=self.params['window_length'],
                      hop_length=self.params['hop_length'],
                      f_min=self.params['f_min'],
                      f_max=self.params['f_max'],
                      n_mels=self.params['n_mels'],
                      naf_mode=self.params['naf_mode'],
                      mel_log=self.params['mel_log'],
                      spec_norm=self.params['spec_norm']).to(self.device)
        wav = mel(wav)
        return wav
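
    # Preprocessing shape sketch (an illustration with the default params): a
    # 3 s mono clip at 8 kHz is stacked to (1, 24000) samples; preprocess_wav
    # with 8000-sample segments and hops yields 3 segments; MelSpec then
    # produces roughly (3, 256, 32) log-mel inputs, from which NNFp returns
    # (3, 128) embeddings.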
    @staticmethod
    def int2float(wav: numpy.ndarray, dtype: str = 'float64'):
        """
        Convert audio data from int to float.
        The input dtype must be an integer type.
        The output dtype is controlled by the parameter `dtype`, defaulting to 'float64'.

        The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
        """
        dtype = numpy.dtype(dtype)
        assert dtype.kind == 'f'

        if wav.dtype.kind in 'iu':
            # Map signed/unsigned integer samples onto [-1.0, 1.0).
            ii = numpy.iinfo(wav.dtype)
            abs_max = 2 ** (ii.bits - 1)
            offset = ii.min + abs_max
            return (wav.astype(dtype) - offset) / abs_max
        else:
            log.warning('Converting float dtype from %s to %s.', wav.dtype, dtype)
            return wav.astype(dtype)
```
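
To make the integer-to-float scaling concrete, here is a small check (a sketch; `int2float` is a staticmethod, so no model weights are needed):

```python
import numpy
from nn_fingerprint import NNFingerprint  # assumed import path, as above

x = numpy.array([-32768, 0, 32767], dtype=numpy.int16)
print(NNFingerprint.int2float(x))  # approx. [-1.0, 0.0, 0.99997], dtype float64
```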