clmr
Jael Gu
3 years ago
10 changed files with 323 additions and 1 deletion
README.md
@@ -1,2 +1,75 @@
# clmr

# Audio Embedding with CLMR

*Author: Jael Gu*

## Description

The audio embedding operator converts an input audio clip into a dense vector that represents the clip's semantics.
This operator is built on top of the original implementation of [CLMR](https://github.com/Spijkervet/CLMR).
The [default model weight](./checkpoints/clmr_checkpoint_10000.pt) is pretrained on the [MagnaTagATune dataset](https://paperswithcode.com/dataset/magnatagatune) with [SampleCNN](./models/sample_cnn.py).

```python
import numpy as np
from towhee import ops

audio_encoder = ops.audio_embedding.clmr()

# Path or URL as input
audio_embedding = audio_encoder("/audio/path/or/url/")

# Audio data as input
audio_data = np.zeros((2, 441344))
sample_rate = 44100
audio_embedding = audio_encoder(audio_data, sample_rate)
```

## Factory Constructor

Create the operator via the following factory method:

***ops.audio_embedding.clmr()***

## Interface

An audio embedding operator generates vectors in numpy.ndarray given an audio file path or audio data in numpy.ndarray.

**Parameters:**

None.

**Returns**: *numpy.ndarray*

Audio embeddings.
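
The returned array is two-dimensional: the operator splits the (resampled) audio into fixed-length chunks and emits one 512-dimensional vector per chunk (see `clmr_magnatagatune.py` below). A minimal sanity-check sketch, reusing the call from the example above:

```python
import numpy as np
from towhee import ops

audio_encoder = ops.audio_embedding.clmr()
audio_embedding = audio_encoder(np.zeros((2, 441344)), 44100)

assert isinstance(audio_embedding, np.ndarray)
print(audio_embedding.shape)  # (N, 512): one row per 59049-sample chunk per channel
```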

## Code Example

Generate embeddings for the audio "test.wav".

*Write the pipeline in simplified style:*

```python
from towhee import dc

dc.glob('test.wav') \
  .audio_embedding.clmr() \
  .show()
```

*Write the same pipeline with explicit input/output name specifications:*

```python
from towhee import dc

dc.glob['path']('test.wav') \
  .audio_embedding.clmr['path', 'vecs']() \
  .select('vecs') \
  .show()
```

__init__.py
@@ -0,0 +1,19 @@
# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .clmr_magnatagatune import ClmrMagnatagatune


# Factory method exposed as ops.audio_embedding.clmr().
def clmr():
    return ClmrMagnatagatune()
checkpoints/clmr_checkpoint_10000.pt
Binary file not shown.
clmr_magnatagatune.py
@@ -0,0 +1,95 @@
# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
from pathlib import Path
from typing import Optional, Union

import torchaudio
import torch
import numpy

from towhee.operator import NNOperator
from towhee import register

sys.path.append(str(Path(__file__).parent))

from utils.checkpoint import load_encoder_checkpoint
from models.sample_cnn import SampleCNN


@register(output_schema=['vec'])
class ClmrMagnatagatune(NNOperator):
    """
    Pretrained CLMR operator.
    """

    def __init__(self, framework="pytorch") -> None:
        super().__init__(framework=framework)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        weight_path = os.path.join(str(Path(__file__).parent),
                                   'checkpoints/clmr_checkpoint_10000.pt')
        state_dict = load_encoder_checkpoint(weight_path, 1)
        encoder = SampleCNN(strides=[3, 3, 3, 3, 3, 3, 3, 3, 3], supervised=False, out_dim=1)
        encoder.load_state_dict(state_dict)

        # Keep only the convolutional feature extractor: drop the final fc
        # layer, then rebuild the last conv block without its trailing ReLU
        # so the operator outputs raw 512-dim activations.
        new_encoder = torch.nn.Sequential(*(list(encoder.children())[:-1]))
        x = list(new_encoder[0][:10].children())
        y = torch.nn.Sequential(*list(new_encoder[0][10].children())[:-1])
        x.append(y)
        self.model = torch.nn.Sequential(*x)
        self.model.eval()
        self.model.to(self.device)

    def __call__(self, audio: Union[str, numpy.ndarray], sample_rate: Optional[int] = None) -> numpy.ndarray:
        _sr = 22050           # sample rate the encoder was trained at
        audio_length = 59049  # samples per embedding frame (3 ** 10)

        if isinstance(audio, str):
            source = os.path.abspath(audio)
            audio, sr = torchaudio.load(source)
        elif isinstance(audio, numpy.ndarray):
            sr = sample_rate
            audio = torch.tensor(audio).to(torch.float32)
        else:
            raise TypeError('audio must be a file path or a numpy.ndarray')

        if sr != _sr:
            transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=_sr)
            audio = transform(audio)

        with torch.no_grad():
            # Split the waveform into fixed-length chunks, drop the last
            # (partial) chunk, and encode each chunk to a 512-dim vector.
            batch = torch.split(audio, audio_length, dim=1)
            batch = torch.cat(batch[:-1])
            batch = batch.unsqueeze(dim=1)
            batch = batch.to(self.device)
            features = self.model(batch).squeeze()

        embeddings = features.to("cpu")
        return embeddings.detach().numpy()


# if __name__ == "__main__":
#     encoder = ClmrMagnatagatune()
#
#     audio_path = "/audio/path/or/link"
#     vec = encoder(audio_path)
#
#     # audio_data = numpy.zeros((2, 441344))
#     # sample_rate = 44100
#     # vec = encoder(audio_data, sample_rate)
#
#     print(vec.shape)
models/__init__.py (empty)
@@ -0,0 +1 @@

models/model.py
@@ -0,0 +1,23 @@
import torch.nn as nn


class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()

    def initialize(self, m):
        # Kaiming (He) initialization for conv layers; intended to be
        # passed to nn.Module.apply().
        if isinstance(m, nn.Conv1d):
            # nn.init.xavier_uniform_(m.weight)
            # if m.bias is not None:
            #     nn.init.xavier_uniform_(m.bias)
            nn.init.kaiming_uniform_(m.weight, mode="fan_in", nonlinearity="relu")


class Identity(nn.Module):
    # Pass-through module, handy when replacing a layer in a pretrained net.
    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, x):
        return x
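
`initialize` is written as an `nn.Module.apply` hook, which visits every submodule; it is not invoked anywhere in this commit. A hypothetical usage sketch (`TinyNet` is illustrative only):

```python
import torch.nn as nn
from models.model import Model

# apply() walks all submodules, and initialize() re-initializes
# each Conv1d it encounters.
class TinyNet(Model):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv1d(1, 8, kernel_size=3)

net = TinyNet()
net.apply(net.initialize)
```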
models/sample_cnn.py
@@ -0,0 +1,67 @@
import torch
import torch.nn as nn

from .model import Model


class SampleCNN(Model):
    def __init__(self, strides, supervised, out_dim):
        super(SampleCNN, self).__init__()

        self.strides = strides
        self.supervised = supervised

        # Stem: stride-3 conv turns the raw waveform into 128 channels.
        self.sequential = [
            nn.Sequential(
                nn.Conv1d(1, 128, kernel_size=3, stride=3, padding=0),
                nn.BatchNorm1d(128),
                nn.ReLU(),
            )
        ]

        # [in_channels, out_channels] for each of the nine hidden blocks.
        self.hidden = [
            [128, 128],
            [128, 128],
            [128, 256],
            [256, 256],
            [256, 256],
            [256, 256],
            [256, 256],
            [256, 256],
            [256, 512],
        ]

        assert len(self.hidden) == len(
            self.strides
        ), "Number of hidden layers and strides are not equal"
        for stride, (h_in, h_out) in zip(self.strides, self.hidden):
            self.sequential.append(
                nn.Sequential(
                    nn.Conv1d(h_in, h_out, kernel_size=stride, stride=1, padding=1),
                    nn.BatchNorm1d(h_out),
                    nn.ReLU(),
                    nn.MaxPool1d(stride, stride=stride),  # downsample by `stride`
                )
            )

        # 1 x 512
        self.sequential.append(
            nn.Sequential(
                nn.Conv1d(512, 512, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm1d(512),
                nn.ReLU(),
            )
        )

        self.sequential = nn.Sequential(*self.sequential)

        if self.supervised:
            self.dropout = nn.Dropout(0.5)
        # fc must exist in both modes: load_encoder_checkpoint() always
        # supplies (zeroed) fc weights, and strict state-dict loading in
        # the operator would otherwise fail for the unsupervised encoder.
        self.fc = nn.Linear(512, out_dim)

    def forward(self, x):
        out = self.sequential(x)
        if self.supervised:
            out = self.dropout(out)

        out = out.reshape(x.shape[0], out.size(1) * out.size(2))
        logit = self.fc(out)
        return logit
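
With the default `strides=[3] * 9`, the stem conv (stride 3) and the nine max-pooled blocks each downsample by a factor of 3, so one output frame covers 3^10 = 59049 input samples; that is exactly the `audio_length` the operator chops audio into. A quick shape check (a sketch, not part of the commit):

```python
import torch
from models.sample_cnn import SampleCNN

# One 59049-sample mono chunk collapses to a single 512-channel frame.
# eval() keeps BatchNorm happy with a batch of one.
encoder = SampleCNN(strides=[3] * 9, supervised=False, out_dim=1).eval()
with torch.no_grad():
    frame = encoder.sequential(torch.zeros(1, 1, 59049))
print(frame.shape)  # torch.Size([1, 512, 1])
```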
requirements.txt
@@ -0,0 +1,4 @@
torchaudio==0.9.0
torch==1.9.0
soundfile
numpy
utils/__init__.py (empty)
@@ -0,0 +1 @@

utils/checkpoint.py
@@ -0,0 +1,36 @@
import torch
from collections import OrderedDict


def load_encoder_checkpoint(checkpoint_path: str, output_dim: int) -> OrderedDict:
    state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))
    # PyTorch-Lightning checkpoints nest weights under "state_dict" and
    # prefix encoder keys with "model.encoder."; strip that prefix so the
    # keys match a bare SampleCNN.
    if "pytorch-lightning_version" in state_dict.keys():
        new_state_dict = OrderedDict(
            {
                k.replace("model.encoder.", ""): v
                for k, v in state_dict["state_dict"].items()
                if "model.encoder." in k
            }
        )
    else:
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if "encoder." in k:
                new_state_dict[k.replace("encoder.", "")] = v

    # Zeroed placeholder fc weights so strict state-dict loading succeeds;
    # the operator discards the fc layer afterwards anyway.
    new_state_dict["fc.weight"] = torch.zeros(output_dim, 512)
    new_state_dict["fc.bias"] = torch.zeros(output_dim)
    return new_state_dict


def load_finetuner_checkpoint(checkpoint_path: str) -> OrderedDict:
    state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))
    if "pytorch-lightning_version" in state_dict.keys():
        state_dict = OrderedDict(
            {
                k.replace("model.", ""): v
                for k, v in state_dict["state_dict"].items()
                if "model." in k
            }
        )
    return state_dict
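
Putting the helpers together, mirroring what `ClmrMagnatagatune.__init__` does (a sketch, assuming it runs from the operator's root directory):

```python
from models.sample_cnn import SampleCNN
from utils.checkpoint import load_encoder_checkpoint

# Strip the Lightning prefixes from the bundled checkpoint, inject the
# zeroed fc placeholders, then load into an unsupervised SampleCNN.
state_dict = load_encoder_checkpoint('checkpoints/clmr_checkpoint_10000.pt', 1)
encoder = SampleCNN(strides=[3] * 9, supervised=False, out_dim=1)
encoder.load_state_dict(state_dict)
```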