transformers

copied (Jael Gu, 2 years ago)

6 changed files with 535 additions and 1 deletions
README.md
@@ -1,2 +1,149 @@

# Sentence Embedding with Transformers

*author: [Jael Gu](https://github.com/jaelgu)*

<br />

## Description

A sentence embedding operator generates one embedding vector in ndarray for each input text.
The embedding represents the semantic information of the whole input text as one vector.
This operator is implemented with pre-trained models from [Huggingface Transformers](https://huggingface.co/docs/transformers).

<br />

## Code Example

Use the pre-trained model 'sentence-transformers/paraphrase-albert-small-v2'
to generate an embedding for the sentence "Hello, world.".

*Write a pipeline with explicit input/output name specifications:*

- **option 1 (towhee>=0.9.0):**

```python
from towhee.dc2 import pipe, ops, DataCollection

p = (
    pipe.input('text')
        .map('text', 'vec',
             ops.sentence_embedding.transformers(model_name='sentence-transformers/paraphrase-albert-small-v2'))
        .output('text', 'vec')
)

DataCollection(p('Hello, world.')).show()
```

<img src="./result.png" width="800px"/>

- **option 2:**

```python
import towhee

(
    towhee.dc['text'](['Hello, world.'])
          .sentence_embedding.transformers['text', 'vec'](
              model_name='sentence-transformers/paraphrase-albert-small-v2')
          .show()
)
```

<br />

## Factory Constructor

Create the operator via the following factory method:

***sentence_embedding.transformers(model_name=None, checkpoint_path=None, tokenizer=None, device=None, norm=False)***

**Parameters:**

***model_name***: *str*

The model name in string, defaults to None.
If None, the operator is initialized without a specified model.

Supported model names: refer to `supported_model_names` below.

***checkpoint_path***: *str*

The path to a local checkpoint, defaults to None.
If None, the operator downloads and loads the pretrained model given by `model_name` from Huggingface transformers.

***tokenizer***: *object*

The method used to tokenize input text, defaults to None.
If None, the operator uses the default tokenizer for `model_name` from Huggingface transformers.

***device***: *str*

The device to run the model on, defaults to None.
If None, the device is resolved automatically (CPU when no GPU is assigned).

***norm***: *bool*

Whether to normalize the output embedding, defaults to False.
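
As a quick sketch, the optional arguments can be combined when constructing the operator. The checkpoint path below is a hypothetical placeholder, not a file shipped with this operator:

```python
from towhee import ops

# A minimal sketch assuming default device resolution.
# 'path/to/finetuned.pt' is a hypothetical placeholder for your own weights.
op = ops.sentence_embedding.transformers(
    model_name='sentence-transformers/paraphrase-albert-small-v2',
    checkpoint_path=None,  # or e.g. 'path/to/finetuned.pt'
    norm=True,             # normalize the output embedding
).get_op()
```

<br />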

## Interface

The operator takes a piece of text in string as input.
It loads the tokenizer and pre-trained model using the model name,
and then returns a text embedding in numpy.ndarray.

***\_\_call\_\_(data)***

**Parameters:**

***data***: *Union[str, list]*

The text in string or a list of texts.

**Returns**:

*numpy.ndarray or list*

The text embedding (or token embeddings) extracted by the model.
If `data` is a string, the operator returns an embedding in numpy.ndarray with shape (dim,).
If `data` is a list, the operator returns a list of embeddings with the same length as the input list.
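
For example, a minimal sketch of both call patterns:

```python
from towhee import ops

op = ops.sentence_embedding.transformers(
    model_name='sentence-transformers/paraphrase-albert-small-v2').get_op()

vec = op('Hello, world.')            # numpy.ndarray with shape (dim,)
vecs = op(['Hello, world.', 'Hi.'])  # list of ndarrays, one per input text
```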

<br />

***save_model(model_type='pytorch', output_file='default')***

Save the model to a local file in the specified format.

**Parameters:**

***model_type***: *str*

The format to export the model as, such as 'pytorch', 'torchscript', 'onnx',
defaults to 'pytorch'.

***output_file***: *str*

The path where the exported model is saved.
By default, the model is saved to the `saved` directory under the operator cache.

```python
from towhee import ops

op = ops.sentence_embedding.transformers(model_name='sentence-transformers/paraphrase-albert-small-v2').get_op()
op.save_model('onnx', 'test.onnx')
```

PosixPath('/Home/.towhee/operators/sentence-embedding/transformers/main/test.onnx')
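
To run the exported file with onnxruntime, a minimal sketch along the lines of the bundled `test_onnx.py` script (the path assumes the export above):

```python
import onnxruntime
import torch
from towhee import ops

op = ops.sentence_embedding.transformers(
    model_name='sentence-transformers/paraphrase-albert-small-v2').get_op()

sess = onnxruntime.InferenceSession(
    'test.onnx', providers=onnxruntime.get_available_providers())
inputs = op.tokenizer('Hello, world.', return_tensors='np')
token_embs = sess.run(output_names=['last_hidden_state'], input_feed=dict(inputs))[0]

# Mean-pool token embeddings into one sentence vector, as the operator does.
pt_inputs = op.tokenizer('Hello, world.', return_tensors='pt')
vec = op.post_proc(torch.from_numpy(token_embs), pt_inputs)
```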

<br />

***supported_model_names(format=None)***

Get a list of all supported model names, or of the model names supported for a specified model format.

**Parameters:**

***format***: *str*

The model format such as 'pytorch', 'torchscript', 'onnx'.

```python
from towhee import ops

op = ops.sentence_embedding.transformers().get_op()
full_list = op.supported_model_names()
onnx_list = op.supported_model_names(format='onnx')
```

__init__.py
@@ -0,0 +1,19 @@

# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from .auto_transformers import AutoTransformers


def transformers(*args, **kwargs):
    return AutoTransformers(*args, **kwargs)
auto_transformers.py
@@ -0,0 +1,256 @@

# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import numpy
import os
import torch
import shutil
from pathlib import Path
from typing import Union
from collections import OrderedDict

from transformers import AutoModel

from towhee.operator import NNOperator
from towhee import register
# from towhee.dc2 import accelerate

import warnings
import logging
from transformers import logging as t_logging

log = logging.getLogger('run_op')
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
t_logging.set_verbosity_error()

# @accelerate
class Model:
    """Thin wrapper that runs the transformer and keeps only the token embeddings."""
    def __init__(self, model):
        self.model = model

    def __call__(self, *args, **kwargs):
        outs = self.model(*args, **kwargs, return_dict=True)
        return outs['last_hidden_state']

@register(output_schema=['vec'])
class AutoTransformers(NNOperator):
    """
    NLP embedding operator that uses a pretrained transformers model from Huggingface.
    Args:
        model_name (`str`):
            The model name to load a pretrained model from transformers.
        checkpoint_path (`str`):
            The local checkpoint path.
        tokenizer (`object`):
            The tokenizer to tokenize input text as model inputs.
    """

    def __init__(self,
                 model_name: str = None,
                 checkpoint_path: str = None,
                 tokenizer: object = None,
                 device: str = None,
                 norm: bool = False
                 ):
        super().__init__()
        self._device = device
        self.model_name = model_name
        self.user_tokenizer = tokenizer
        self.norm = norm
        self.checkpoint_path = checkpoint_path

        if self.model_name:
            model_list = self.supported_model_names()
            # assert model_name in model_list, f"Invalid model name: {model_name}. Supported model names: {model_list}"
            self.model = Model(self._model)
        else:
            log.warning('The operator is initialized without a specified model.')

    def __call__(self, data: Union[str, list]) -> numpy.ndarray:
        # Accept either a single string or a list of strings.
        if isinstance(data, str):
            txt = [data]
        else:
            txt = data
        try:
            inputs = self.tokenizer(txt, padding=True, truncation=True, return_tensors='pt').to(self.device)
        except Exception as e:
            log.error(f'Fail to tokenize inputs: {e}')
            raise e
        try:
            outs = self.model(**inputs)
        except Exception as e:
            log.error(f'Invalid input for the model: {self.model_name}')
            raise e
        # Mean-pool token embeddings into sentence embeddings.
        outs = self.post_proc(outs, inputs)
        if self.norm:
            outs = torch.nn.functional.normalize(outs)
        features = outs.cpu().detach().numpy()
        if isinstance(data, str):
            # Single input: return one embedding with shape (dim,).
            features = features.squeeze(0)
        else:
            features = list(features)
        return features

    @property
    def _model(self):
        model = AutoModel.from_pretrained(self.model_name).to(self.device)
        # Drop the pooler head; sentence embeddings are pooled in post_proc instead.
        if hasattr(model, 'pooler') and model.pooler:
            model.pooler = None
        if self.checkpoint_path:
            try:
                state_dict = torch.load(self.checkpoint_path, map_location=self.device)
                model.load_state_dict(state_dict)
            except Exception:
                log.error(f'Fail to load weights from {self.checkpoint_path}')
        model.eval()
        return model

    @property
    def device(self):
        if self._device is None:
            if self._device_id < 0:
                self._device = torch.device('cpu')
            else:
                self._device = torch.device(self._device_id)
        return self._device

    @property
    def model_config(self):
        from transformers import AutoConfig
        configs = AutoConfig.from_pretrained(self.model_name)
        return configs

    @property
    def onnx_config(self):
        from transformers.onnx.features import FeaturesManager
        model_kind, model_onnx_config = FeaturesManager.check_supported_model_or_raise(
            self._model, feature='default')
        old_config = model_onnx_config(self.model_config)
        # Keep only the last_hidden_state output for export.
        onnx_config = {
            'inputs': dict(old_config.inputs),
            'outputs': {'last_hidden_state': old_config.outputs['last_hidden_state']}
        }
        return onnx_config

    @property
    def tokenizer(self):
        from transformers import AutoTokenizer
        try:
            if self.user_tokenizer:
                t = self.user_tokenizer
            else:
                t = AutoTokenizer.from_pretrained(self.model_name)
            if not t.pad_token:
                t.pad_token = '[PAD]'
        except Exception as e:
            log.error('Fail to load tokenizer.')
            raise e
        return t

    def post_proc(self, token_embeddings, inputs):
        # Mean pooling: average token embeddings, ignoring padded positions.
        token_embeddings = token_embeddings.to(self.device)
        attention_mask = inputs['attention_mask'].to(self.device)
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sentence_embs = torch.sum(
            token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sentence_embs

    def save_model(self, model_type: str = 'pytorch', output_file: str = 'default'):
        if output_file == 'default':
            output_file = str(Path(__file__).parent)
            output_file = os.path.join(output_file, 'saved', model_type)
            os.makedirs(output_file, exist_ok=True)
            name = self.model_name.replace('/', '-')
            output_file = os.path.join(output_file, name)
            if model_type in ['pytorch', 'torchscript']:
                output_file = output_file + '.pt'
            elif model_type == 'onnx':
                output_file = output_file + '.onnx'
            else:
                raise AttributeError('Unsupported model_type.')

        dummy_input = 'test sentence'
        inputs = self.tokenizer(dummy_input, padding=True, truncation=True, return_tensors='pt')  # a dictionary
        if model_type == 'pytorch':
            torch.save(self._model, output_file)
        elif model_type == 'torchscript':
            inputs = list(inputs.values())
            try:
                # Prefer scripting; fall back to tracing for models that cannot be scripted.
                try:
                    jit_model = torch.jit.script(self._model)
                except Exception:
                    jit_model = torch.jit.trace(self._model, inputs, strict=False)
                torch.jit.save(jit_model, output_file)
            except Exception as e:
                log.error(f'Fail to save as torchscript: {e}.')
                raise RuntimeError(f'Fail to save as torchscript: {e}.')
        elif model_type == 'onnx':
            # Mark all tokenizer inputs and the model output as dynamic axes.
            dynamic_axes = {}
            for k, v in self.onnx_config['inputs'].items():
                dynamic_axes[k] = v
            for k, v in self.onnx_config['outputs'].items():
                dynamic_axes[k] = v
            torch.onnx.export(
                self._model,
                tuple(inputs.values()),
                output_file,
                input_names=list(self.onnx_config['inputs'].keys()),
                output_names=list(self.onnx_config['outputs'].keys()),
                dynamic_axes=dynamic_axes,
                opset_version=torch.onnx.constant_folding_opset_versions[-1],
                do_constant_folding=True,
            )
        # todo: elif model_type == 'tensorrt':
        else:
            log.error(f'Unsupported model_type "{model_type}".')
        return Path(output_file).resolve()

    @property
    def supported_formats(self):
        onnxes = self.supported_model_names(format='onnx')
        if self.model_name in onnxes:
            return ['onnx']
        else:
            return ['pytorch']

    @staticmethod
    def supported_model_names(format: str = None):
        full_list = [
        ]
        full_list.sort()
        if format is None:
            model_list = full_list
        elif format == 'pytorch':
            to_remove = []
            assert set(to_remove).issubset(set(full_list))
            model_list = list(set(full_list) - set(to_remove))
        elif format == 'torchscript':
            to_remove = [
            ]
            assert set(to_remove).issubset(set(full_list))
            model_list = list(set(full_list) - set(to_remove))
        elif format == 'onnx':
            to_remove = [
            ]
            assert set(to_remove).issubset(set(full_list))
            model_list = list(set(full_list) - set(to_remove))
        # todo: elif format == 'tensorrt':
        else:
            log.error(f'Invalid format "{format}". Currently supported formats: "pytorch", "torchscript", "onnx".')
            raise ValueError(f'Invalid format "{format}".')
        return model_list

requirements.txt
@@ -0,0 +1,7 @@

numpy
transformers
sentencepiece
protobuf

towhee
torch
result.png (new image: 5.7 KiB)
test_onnx.py
@@ -0,0 +1,105 @@

from towhee import ops
import torch
import numpy
import onnx
import onnxruntime

import os
from pathlib import Path
import logging
import platform
import psutil

import warnings
from transformers import logging as t_logging

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
warnings.filterwarnings("ignore")
t_logging.set_verbosity_error()

# full_models = AutoTransformers.supported_model_names()
# checked_models = AutoTransformers.supported_model_names(format='onnx')
# models = [x for x in full_models if x not in checked_models]
models = ['distilbert-base-cased', 'sentence-transformers/paraphrase-albert-small-v2']
test_txt = 'hello, world.'
atol = 1e-3
log_path = 'transformers_onnx.log'
f = open('onnx.csv', 'w+')
f.write('model,load_op,save_onnx,check_onnx,run_onnx,accuracy\n')

logger = logging.getLogger('transformers_onnx')
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh = logging.FileHandler(log_path)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)
ch = logging.StreamHandler()
ch.setLevel(logging.ERROR)
ch.setFormatter(formatter)
logger.addHandler(ch)

logger.debug(f'machine: {platform.platform()}-{platform.processor()}')
logger.debug(f'free/available/total mem: {round(psutil.virtual_memory().free / (1024.0 ** 3))}'
             f'/{round(psutil.virtual_memory().available / (1024.0 ** 3))}'
             f'/{round(psutil.virtual_memory().total / (1024.0 ** 3))} GB')
logger.debug(f'cpu: {psutil.cpu_count()}')

status = None
for name in models:
    logger.info(f'***{name}***')
    saved_name = name.replace('/', '-')
    onnx_path = f'saved/onnx/{saved_name}.onnx'
    # Flush the status row of the previous model before starting a new one.
    if status:
        f.write(','.join(status) + '\n')
    status = [name] + ['fail'] * 5
    try:
        op = ops.sentence_embedding.transformers(model_name=name).get_op()
        out1 = op(test_txt)
        logger.info('OP LOADED.')
        status[1] = 'success'
    except Exception as e:
        logger.error(f'FAIL TO LOAD OP: {e}')
        continue
    try:
        op.save_model(model_type='onnx')
        logger.info('ONNX SAVED.')
        status[2] = 'success'
    except Exception as e:
        logger.error(f'FAIL TO SAVE ONNX: {e}')
        continue
    try:
        try:
            onnx_model = onnx.load(onnx_path)
            onnx.checker.check_model(onnx_model)
        except Exception:
            # Large models keep weights as external data; skip loading them for the check.
            saved_onnx = onnx.load(onnx_path, load_external_data=False)
            onnx.checker.check_model(saved_onnx)
        logger.info('ONNX CHECKED.')
        status[3] = 'success'
    except Exception as e:
        logger.error(f'FAIL TO CHECK ONNX: {e}')
        continue
    try:
        sess = onnxruntime.InferenceSession(onnx_path,
                                            providers=onnxruntime.get_available_providers())
        inputs = op.tokenizer(test_txt, return_tensors='np')
        out2 = sess.run(output_names=['last_hidden_state'], input_feed=dict(inputs))[0]
        new_inputs = op.tokenizer(test_txt, return_tensors='pt')
        out2 = op.post_proc(torch.from_numpy(out2), new_inputs)
        logger.info('ONNX WORKED.')
        status[4] = 'success'
        if numpy.allclose(out1, out2, atol=atol):
            logger.info('Check accuracy: OK')
            status[5] = 'success'
        else:
            logger.info(f'Check accuracy: atol is larger than {atol}.')
    except Exception as e:
        logger.error(f'FAIL TO RUN ONNX: {e}')
        continue

if status:
    f.write(','.join(status) + '\n')
f.close()

print('Finished.')