@@ -1,2 +1,95 @@
# nnfp

# Audio Embedding with Neural Network Fingerprint

*Author: [Jael Gu](https://github.com/jaelgu)*

<br />

## Description

The audio embedding operator converts an input audio into a dense vector which can be used to represent the audio clip's semantics.
Each vector represents an audio clip with a fixed length of around 1s.
This operator generates audio embeddings with the fingerprinting method introduced by [Neural Audio Fingerprint](https://arxiv.org/abs/2010.11910).
The model is implemented in PyTorch.
We have also trained the nnfp model on the [FMA dataset](https://github.com/mdeff/fma) (plus some noise audio) and share the pretrained weights in this operator.
The nnfp operator is suitable for generating audio fingerprints.

<br />
## Code Example

Generate embeddings for the audio "test.wav".

*Write the pipeline in simplified style:*

```python
import towhee

(
    towhee.glob('test.wav')
          .audio_decode.ffmpeg()
          .runas_op(func=lambda x: [y[0] for y in x])
          .audio_embedding.nnfp()  # use default model
          .show()
)
```
<img src="./result1.png" width="800px"/>

*Write the same pipeline with explicit input and output names:*

```python
import towhee

(
    towhee.glob['path']('test.wav')
          .audio_decode.ffmpeg['path', 'frames']()
          .runas_op['frames', 'frames'](func=lambda x: [y[0] for y in x])
          .audio_embedding.nnfp['frames', 'vecs']()
          .select['path', 'vecs']()
          .show()
)
```
<img src="./result2.png" width="800px"/>

<br />

## Factory Constructor

Create the operator via the following factory method:

***audio_embedding.nnfp(params=None, checkpoint_path=None, framework='pytorch')***

**Parameters:**

*params: dict*

A dictionary of model parameters. If None, the operator uses the default parameters to create the model.

*checkpoint_path: str*

The path to model weights. If None, the operator loads the default pretrained weights.

*framework: str*

The framework of the model implementation. The default value is 'pytorch' since the model is implemented in PyTorch.
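
For example, the snippet below is a minimal sketch of customizing the constructor via the towhee `ops` interface (assuming the towhee 0.x API shown in the pipelines above); `my_weights.pt` is a hypothetical placeholder for your own checkpoint file:

```python
import towhee

# A minimal sketch, assuming the towhee 0.x ops interface;
# 'my_weights.pt' is a hypothetical placeholder checkpoint.
op = towhee.ops.audio_embedding.nnfp(checkpoint_path='my_weights.pt')
```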

<br />

## Interface

An audio embedding operator generates vectors in numpy.ndarray given towhee audio frames.

**Parameters:**

*data: List[towhee.types.audio_frame.AudioFrame]*

Input audio data as a list of towhee audio frames.
The input data should represent an audio clip longer than 1s.

**Returns:**

*numpy.ndarray*

Audio embeddings in shape (num_clips, 128).
Each embedding stands for the features of an audio clip with a length of 1s.
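
As a sketch of this interface, the operator can also be called directly on decoded frames outside a pipeline. This assumes the towhee 0.x `ops` interface and the same local `test.wav` used above; the decoder is assumed to yield items whose first element is an AudioFrame, matching the `runas_op` lambda in the pipelines above:

```python
import towhee

# A minimal sketch, assuming the towhee 0.x ops interface and a local 'test.wav'.
decoder = towhee.ops.audio_decode.ffmpeg()
frames = [item[0] for item in decoder('test.wav')]  # List[AudioFrame]

op = towhee.ops.audio_embedding.nnfp()
vecs = op(frames)
print(vecs.shape)  # (num_clips, 128): roughly one clip per second of audio
```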
@@ -0,0 +1,19 @@
# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .nn_fingerprint import NNFingerprint


def nnfp(params: dict = None, checkpoint_path: str = None, framework: str = 'pytorch'):
    # Forward the factory arguments documented in the README to the operator.
    return NNFingerprint(params=params, checkpoint_path=checkpoint_path, framework=framework)
@@ -0,0 +1,36 @@
# Parameter configs for nnfp, inspired by https://github.com/stdio2016/pfann
#
# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


default_params = {
    "dim": 128,             # output embedding dimension
    "h": 1024,              # projection hidden size (see the NNFp paper)
    "u": 32,                # projection split units (see the NNFp paper)
    "fuller": True,
    "activation": "relu",
    "sample_rate": 8000,    # Hz; audio is resampled to this rate
    "window_length": 1024,  # STFT window length in samples
    "hop_length": 256,      # STFT hop length in samples
    "n_mels": 256,          # mel frequency bins
    "f_min": 300,           # lower bound of the mel filterbank (Hz)
    "f_max": 4000,          # upper bound of the mel filterbank (Hz)
    "segment_size": 1,      # seconds of audio per clip
    "hop_size": 1,          # seconds between consecutive clips
    "frame_shift_mul": 1,
    "naf_mode": False,
    "mel_log": "log",
    "spec_norm": "l2"
}
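

# Illustrative only: how the model's expected input shape follows from the
# defaults above, mirroring the arithmetic in NNFingerprint.__init__.
if __name__ == '__main__':
    n_seg = int(default_params['segment_size'] * default_params['sample_rate'])    # 8000 samples per 1s clip
    t = (n_seg + default_params['hop_length'] - 1) // default_params['hop_length']  # 32 time frames
    print(default_params['n_mels'], t)  # input mel spectrogram: 256 bins x 32 frames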
@@ -0,0 +1,135 @@
# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import warnings
from pathlib import Path
from typing import List

import torch
import numpy
import resampy

from towhee.operator.base import NNOperator
from towhee import register
from towhee.types.audio_frame import AudioFrame
from towhee.models.nnfp import NNFp
from towhee.models.utils.audio_preprocess import preprocess_wav, MelSpec

from .configs import default_params

warnings.filterwarnings('ignore')
log = logging.getLogger()

@register(output_schema=['vecs'])
class NNFingerprint(NNOperator):
    """
    Audio embedding operator using Neural Network Fingerprint.
    """

    def __init__(self,
                 params: dict = None,
                 checkpoint_path: str = None,
                 framework: str = 'pytorch'):
        super().__init__(framework=framework)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if params is None:
            self.params = default_params
        else:
            self.params = params

        dim = self.params['dim']
        h = self.params['h']
        u = self.params['u']
        f_bin = self.params['n_mels']  # frequency bins of the input spectrogram
        # Samples per segment, and the resulting number of spectrogram time frames.
        n_seg = int(self.params['segment_size'] * self.params['sample_rate'])
        t = (n_seg + self.params['hop_length'] - 1) // self.params['hop_length']

        log.info('Creating model...')
        self.model = NNFp(
            dim=dim, h=h, u=u,
            in_f=f_bin, in_t=t,
            fuller=self.params['fuller'],
            activation=self.params['activation']
        ).to(self.device)

        log.info('Loading weights...')
        if checkpoint_path is None:
            path = str(Path(__file__).parent)
            checkpoint_path = os.path.join(path, 'checkpoints/pfann_fma_m.pt')
        state_dict = torch.load(checkpoint_path, map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.model.eval()
        log.info('Model is loaded.')

    def __call__(self, data: List[AudioFrame]) -> numpy.ndarray:
        audio_tensors = self.preprocess(data).to(self.device)
        features = self.model(audio_tensors)
        return features.detach().cpu().numpy()

    def preprocess(self, frames: List[AudioFrame]):
        sr = frames[0].sample_rate
        layout = frames[0].layout
        if layout == 'stereo':
            # Interleaved stereo samples: reshape each frame to (n, 2),
            # stack all frames, then transpose to (2, num_samples).
            frames = [frame.reshape(-1, 2) for frame in frames]
            audio = numpy.vstack(frames).transpose()
        else:
            # Mono: concatenate frames into a single (1, num_samples) array.
            audio = numpy.hstack(frames)
            audio = audio[None, :]

        audio = self.int2float(audio)

        # Resample to the model's expected sample rate if necessary.
        if sr != self.params['sample_rate']:
            audio = resampy.resample(audio, sr, self.params['sample_rate'])

        # Split the waveform into fixed-length segments.
        wav = preprocess_wav(audio,
                             segment_size=int(self.params['sample_rate'] * self.params['segment_size']),
                             hop_size=int(self.params['sample_rate'] * self.params['hop_size']),
                             frame_shift_mul=self.params['frame_shift_mul']).to(self.device)
        wav = wav.to(torch.float32)
        # Convert each segment into a mel spectrogram.
        mel = MelSpec(sample_rate=self.params['sample_rate'],
                      window_length=self.params['window_length'],
                      hop_length=self.params['hop_length'],
                      f_min=self.params['f_min'],
                      f_max=self.params['f_max'],
                      n_mels=self.params['n_mels'],
                      naf_mode=self.params['naf_mode'],
                      mel_log=self.params['mel_log'],
                      spec_norm=self.params['spec_norm']).to(self.device)
        wav = mel(wav)
        return wav

    @staticmethod
    def int2float(wav: numpy.ndarray, dtype: str = 'float64'):
        """
        Convert integer audio samples to float.
        The input dtype is expected to be an integer type.
        The output dtype is controlled by the parameter `dtype`, which defaults to 'float64'.

        The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
        """
        dtype = numpy.dtype(dtype)
        assert dtype.kind == 'f'

        if wav.dtype.kind in 'iu':
            # Scale signed/unsigned integers into [-1.0, 1.0) around the type's midpoint.
            ii = numpy.iinfo(wav.dtype)
            abs_max = 2 ** (ii.bits - 1)
            offset = ii.min + abs_max
            return (wav.astype(dtype) - offset) / abs_max
        else:
            log.warning('Converting float dtype from %s to %s.', wav.dtype, dtype)
            return wav.astype(dtype)
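

# Illustrative only: int2float scales signed-integer audio into [-1.0, 1.0);
# for int16, abs_max is 32768 and the offset is 0.
if __name__ == '__main__':
    pcm = numpy.array([-32768, 0, 32767], dtype=numpy.int16)
    print(NNFingerprint.int2float(pcm))  # [-1.  0.  0.99996948]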