Add files

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
3 years ago · b56e5e1fa4
6 changed files with 535 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -1,2 +1,149 @@
-# transformers
+# Sentence Embedding with Transformers

+*author: [Jael Gu](https://github.com/jaelgu)*
+
+<br />
+
+## Description
+
+A sentence embedding operator generates one embedding vector in ndarray for each input text.
+The embedding represents the semantic information of the whole input text as one vector.
+This operator is implemented with pre-trained models from [Huggingface Transformers](https://huggingface.co/docs/transformers).
+
+<br />
+
+## Code Example
+
+Use the pre-trained model 'sentence-transformers/paraphrase-albert-small-v2'
+to generate an embedding for the sentence "Hello, world.".
+
+*Write a pipeline with explicit input/output name specifications:*
+
+- **option 1 (towhee>=0.9.0):**
+```python
+from towhee.dc2 import pipe, ops, DataCollection
+
+p = (
+    pipe.input('text')
+        .map('text', 'vec', 
+             ops.sentence_embedding.transformers(model_name='sentence-transformers/paraphrase-albert-small-v2'))
+        .output('text', 'vec')
+)
+
+DataCollection(p('Hello, world.')).show()
+```
+
+<img src="./result.png" width="800px"/>
+
+- **option 2:**
+
+```python
+import towhee
+
+(
+    towhee.dc['text'](['Hello, world.'])
+          .sentence_embedding.transformers['text', 'vec'](
+                model_name='sentence-transformers/paraphrase-albert-small-v2')
+          .show()
+)
+```
+
+<br />
+
+## Factory Constructor
+
+Create the operator via the following factory method:
+
+***sentence_embedding.transformers(model_name=None, checkpoint_path=None, tokenizer=None)***
+
+**Parameters:**
+
+***model_name***: *str*
+
+The model name in string, defaults to None.
+If None, the operator will be initialized without specified model.
+
+Supported model names: refer to `supported_model_names` below. 
+
+***checkpoint_path***: *str*
+
+The path to local checkpoint, defaults to None.
+If None, the operator will download and load pretrained model by `model_name` from Huggingface transformers.
+
+<br /> 
+
+***tokenizer***: *object*
+
+The method to tokenize input text, defaults to None.
+If None, the operator will use default tokenizer by `model_name` from Huggingface transformers.
+
+<br />
+
+## Interface
+
+The operator takes a text string or a list of text strings as input.
+It loads the tokenizer and the pre-trained model by model name,
+and then returns a text embedding as a numpy.ndarray.
+
+***\_\_call\_\_(data)***
+
+**Parameters:**
+
+***data***: *Union[str, list]*
+
+	The text in string or a list of texts.
+
+**Returns**:
+
+*numpy.ndarray or list*
+
+	The text embedding (or token embeddings) extracted by model.
+If `data` is string, the operator returns an embedding in numpy.ndarray with shape of (dim,).
+If `data` is a list, the operator returns a list of embedding(s) with length of input list.
+
+<br />
+
+***save_model(model_type='pytorch', output_file='default')***
+
+Save model to local with specified format.
+
+**Parameters:**
+
+***model_type***: *str*
+
+	The format to export model as, such as 'pytorch', 'torchscript', 'onnx',
+defaults to 'pytorch'.
+
+***output_file***: *str*
+
+	The path where exported model is saved to.
+By default, it will save model to `saved` directory under the operator cache.
+
+```python
+from towhee import ops
+
+op = ops.sentence_embedding.transformers(model_name='sentence-transformers/paraphrase-albert-small-v2').get_op()
+op.save_model('onnx', 'test.onnx')
+```
+PosixPath('/Home/.towhee/operators/sentence-embedding/transformers/main/test.onnx')
+
+<br />
+
+***supported_model_names(format=None)***
+
+Get a list of all supported model names or supported model names for specified model format.
+
+**Parameters:**
+
+***format***: *str*
+
+	The model format such as 'pytorch', 'torchscript', 'onnx'.
+
+```python
+from towhee import ops
+
+
+op = ops.sentence_embedding.transformers().get_op()
+full_list = op.supported_model_names()
+onnx_list = op.supported_model_names(format='onnx')
+```
--- a/init.py
+++ b/init.py
@ -0,0 +1,19 @@
+# Copyright 2021 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .auto_transformers import AutoTransformers
+
+
+def transformers(*args, **kwargs):
+    """Create an `AutoTransformers` operator, forwarding every argument to its constructor."""
+    operator = AutoTransformers(*args, **kwargs)
+    return operator
--- a/auto_transformers.py
+++ b/auto_transformers.py
@ -0,0 +1,256 @@
+# Copyright 2021 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy
+import os
+import torch
+import shutil
+from pathlib import Path
+from typing import Union
+from collections import OrderedDict
+
+from transformers import AutoModel
+
+from towhee.operator import NNOperator
+from towhee import register
+# from towhee.dc2 import accelerate
+
+import warnings
+import logging
+from transformers import logging as t_logging
+
+log = logging.getLogger('run_op')
+warnings.filterwarnings('ignore')
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+t_logging.set_verbosity_error()
+
+
+# @accelerate
+class Model:
+    """Callable wrapper that returns only the `last_hidden_state` of the wrapped model."""
+
+    def __init__(self, model):
+        # The wrapped callable; it is always invoked with `return_dict=True`.
+        self.model = model
+
+    def __call__(self, *args, **kwargs):
+        """Forward all arguments to the wrapped model and return its `last_hidden_state`."""
+        result = self.model(*args, return_dict=True, **kwargs)
+        return result['last_hidden_state']
+
+
+@register(output_schema=['vec'])
+class AutoTransformers(NNOperator):
+    """
+    NLP embedding operator that uses the pretrained transformers model gathered by huggingface.
+    Args:
+        model_name (`str`):
+            The model name to load a pretrained model from transformers.
+        checkpoint_path (`str`):
+            The local checkpoint path.
+        tokenizer (`object`):
+            The tokenizer to tokenize input text as model inputs.
+    """
+
+    def __init__(self,
+                 model_name: str = None,
+                 checkpoint_path: str = None,
+                 tokenizer: object = None,
+                 device: str = None,
+                 norm: bool = False
+                 ):
+        """
+        Store the operator configuration and, when a model name is given, load the model.
+
+        Args:
+            model_name (`str`):
+                Name handed to `AutoModel.from_pretrained`. If empty, no model is loaded
+                and only a warning is logged.
+            checkpoint_path (`str`):
+                Optional local state dict loaded on top of the pretrained weights.
+            tokenizer (`object`):
+                Optional user tokenizer, kept as `self.user_tokenizer`.
+            device (`str`):
+                Device to run on. If None, it is derived lazily from `self._device_id`.
+            norm (`bool`):
+                Whether `__call__` normalizes the sentence embeddings.
+        """
+        super().__init__()
+        self._device = device
+        self.model_name = model_name
+        self.user_tokenizer = tokenizer
+        self.norm = norm
+        self.checkpoint_path = checkpoint_path
+
+        if self.model_name:
+            # The name is not validated against `supported_model_names()`;
+            # an unknown name fails inside `self._model` when it is loaded.
+            self.model = Model(self._model)
+        else:
+            log.warning('The operator is initialized without specified model.')
+
+    def __call__(self, data: Union[str, list]) -> Union[numpy.ndarray, list]:
+        """
+        Embed one text or a list of texts.
+
+        Args:
+            data (`Union[str, list]`):
+                A single text or a list of texts.
+
+        Returns:
+            For a string input, a numpy.ndarray with the leading batch axis squeezed away.
+            For a list input, a list holding one numpy.ndarray per text (empty for an empty list).
+
+        Raises:
+            Exception: whatever the tokenizer or the model raises, re-raised after being logged.
+        """
+        is_single = isinstance(data, str)
+        if isinstance(data, list) and not data:
+            # Nothing to embed; keep the "one embedding per input text" contract.
+            return []
+        txt = [data] if is_single else data
+        try:
+            inputs = self.tokenizer(txt, padding=True, truncation=True, return_tensors='pt').to(self.device)
+        except Exception as e:
+            log.error(f'Fail to tokenize inputs: {e}')
+            raise
+        try:
+            # Inference only: the result is converted to numpy, so no autograd graph is needed.
+            with torch.no_grad():
+                outs = self.model(**inputs)
+        except Exception:
+            log.error(f'Invalid input for the model: {self.model_name}')
+            raise
+        outs = self.post_proc(outs, inputs)
+        if self.norm:
+            outs = torch.nn.functional.normalize(outs)
+        features = outs.cpu().detach().numpy()
+        if is_single:
+            return features.squeeze(0)
+        return list(features)
+
+    @property
+    def _model(self):
+        """
+        Build a fresh model for `self.model_name`, in eval mode, on `self.device`.
+
+        Every access calls `AutoModel.from_pretrained` again, so callers should keep the
+        returned object rather than read this property repeatedly. A truthy `pooler`
+        attribute is replaced with None. If `self.checkpoint_path` is set, its state dict
+        is loaded on top; a failure there is logged and the pretrained weights are kept.
+        """
+        model = AutoModel.from_pretrained(self.model_name).to(self.device)
+        if getattr(model, 'pooler', None):
+            model.pooler = None
+        if self.checkpoint_path:
+            try:
+                state_dict = torch.load(self.checkpoint_path, map_location=self.device)
+                model.load_state_dict(state_dict)
+            except Exception as e:
+                # Best effort: continue with the pretrained weights, but report the cause.
+                log.error(f'Fail to load weights from {self.checkpoint_path}: {e}')
+        model.eval()
+        return model
+
+    @property
+    def device(self):
+        """
+        Device used by the operator.
+
+        When no device was given, it is derived from `self._device_id` on first access
+        (a negative id selects the CPU) and cached in `self._device`.
+        """
+        if self._device is not None:
+            return self._device
+        if self._device_id < 0:
+            chosen = torch.device('cpu')
+        else:
+            chosen = torch.device(self._device_id)
+        self._device = chosen
+        return self._device
+
+    @property
+    def model_config(self):
+        """Configuration returned by `AutoConfig.from_pretrained` for `self.model_name`."""
+        from transformers import AutoConfig
+        return AutoConfig.from_pretrained(self.model_name)
+
+    @property
+    def onnx_config(self):
+        """
+        Input/output specification used for the ONNX export.
+
+        Returns a dict with 'inputs' (a copy of the 'default' feature config inputs) and
+        'outputs' restricted to the 'last_hidden_state' entry. Reading this property
+        loads the model through `self._model`.
+        """
+        from transformers.onnx.features import FeaturesManager
+        _, config_factory = FeaturesManager.check_supported_model_or_raise(
+            self._model, feature='default')
+        feature_config = config_factory(self.model_config)
+        input_spec = dict(feature_config.inputs)
+        output_spec = {'last_hidden_state': feature_config.outputs['last_hidden_state']}
+        return {'inputs': input_spec, 'outputs': output_spec}
+
+    @property
+    def tokenizer(self):
+        """
+        Tokenizer used to turn text into model inputs.
+
+        Returns the tokenizer handed to the constructor when there is one, otherwise the
+        one loaded by `AutoTokenizer.from_pretrained(self.model_name)`. A missing pad
+        token is set to '[PAD]'.
+
+        Raises:
+            Exception: any error raised while loading or configuring the tokenizer,
+                re-raised after being logged.
+        """
+        from transformers import AutoTokenizer
+        try:
+            if self.user_tokenizer:
+                # Use the tokenizer stored by the constructor.
+                t = self.user_tokenizer
+            else:
+                t = AutoTokenizer.from_pretrained(self.model_name)
+            if not t.pad_token:
+                t.pad_token = '[PAD]'
+        except Exception:
+            log.error('Fail to load tokenizer.')
+            raise
+        return t
+
+    def post_proc(self, token_embeddings, inputs):
+        """
+        Pool token embeddings into sentence embeddings with an attention-mask weighted mean.
+
+        The mask from `inputs['attention_mask']` is expanded to the size of
+        `token_embeddings`, the masked embeddings are summed over dimension 1, and the sum
+        is divided by the mask total (clamped to at least 1e-9 to avoid division by zero).
+        """
+        device = self.device
+        embeddings = token_embeddings.to(device)
+        mask = inputs['attention_mask'].to(device)
+        weights = mask.unsqueeze(-1).expand(embeddings.size()).float()
+        weighted_sum = torch.sum(embeddings * weights, 1)
+        mask_total = torch.clamp(weights.sum(1), min=1e-9)
+        return weighted_sum / mask_total
+
+    def save_model(self, model_type: str = 'pytorch', output_file: str = 'default'):
+        """
+        Export the model to a local file.
+
+        Args:
+            model_type (`str`):
+                Export format: 'pytorch', 'torchscript' or 'onnx'.
+            output_file (`str`):
+                Target path. With 'default', the file is written next to this source file
+                as `saved/<model_type>/<model name with '/' replaced by '-'>` plus a
+                `.pt` or `.onnx` suffix.
+
+        Returns:
+            The resolved `Path` of the output file. For an unsupported `model_type` with an
+            explicit `output_file`, an error is logged and the path is returned although
+            nothing was written.
+
+        Raises:
+            AttributeError: unsupported `model_type` together with the default output file.
+            RuntimeError: the torchscript export failed.
+        """
+        suffixes = {'pytorch': '.pt', 'torchscript': '.pt', 'onnx': '.onnx'}
+        if model_type not in suffixes:
+            if output_file == 'default':
+                raise AttributeError('Unsupported model_type.')
+            # todo: 'tensorrt'
+            log.error(f'Unsupported format "{model_type}".')
+            return Path(output_file).resolve()
+
+        if output_file == 'default':
+            save_dir = Path(__file__).parent / 'saved' / model_type
+            save_dir.mkdir(parents=True, exist_ok=True)
+            file_name = self.model_name.replace('/', '-') + suffixes[model_type]
+            output_file = str(save_dir / file_name)
+
+        # `self._model` reloads the weights on every access, so load it once.
+        model = self._model
+        if model_type == 'pytorch':
+            torch.save(model, output_file)
+            return Path(output_file).resolve()
+
+        dummy_input = 'test sentence'
+        # Example inputs must live on the same device as the model for tracing/export.
+        inputs = self.tokenizer(
+            dummy_input, padding=True, truncation=True, return_tensors='pt').to(self.device)  # a dictionary
+        # NOTE(review): the example tensors are passed positionally in tokenizer-output
+        # order; confirm this matches the model's forward() argument order for models
+        # whose tokenizer also emits `token_type_ids`.
+        if model_type == 'torchscript':
+            example_inputs = list(inputs.values())
+            try:
+                try:
+                    jit_model = torch.jit.script(model)
+                except Exception:
+                    # Scripting failed; fall back to tracing with the example inputs.
+                    jit_model = torch.jit.trace(model, example_inputs, strict=False)
+                torch.jit.save(jit_model, output_file)
+            except Exception as e:
+                log.error(f'Fail to save as torchscript: {e}.')
+                raise RuntimeError(f'Fail to save as torchscript: {e}.') from e
+        else:  # model_type == 'onnx'
+            # Read the property once: each access loads the model again.
+            onnx_config = self.onnx_config
+            dynamic_axes = {**onnx_config['inputs'], **onnx_config['outputs']}
+            torch.onnx.export(
+                model,
+                tuple(inputs.values()),
+                output_file,
+                input_names=list(onnx_config['inputs'].keys()),
+                output_names=list(onnx_config['outputs'].keys()),
+                dynamic_axes=dynamic_axes,
+                opset_version=torch.onnx.constant_folding_opset_versions[-1],
+                do_constant_folding=True,
+            )
+        return Path(output_file).resolve()
+
+    @property
+    def supported_formats(self):
+        """Return ['onnx'] when `self.model_name` is listed for the 'onnx' format, else ['pytorch']."""
+        onnx_names = self.supported_model_names(format='onnx')
+        return ['onnx'] if self.model_name in onnx_names else ['pytorch']
+
+    @staticmethod
+    def supported_model_names(format: str = None):
+        """
+        List the supported model names, optionally restricted to one export format.
+
+        Args:
+            format (`str`):
+                None for the full list, or one of 'pytorch', 'torchscript', 'onnx'.
+
+        Returns:
+            A sorted list of model names. For an unknown format an error is logged and an
+            empty list is returned.
+        """
+        full_list = [
+
+        ]
+        full_list.sort()
+        if format is None:
+            return full_list
+        # Model names removed from the full list for each format.
+        removed_by_format = {
+            'pytorch': [],
+            'torchscript': [],
+            'onnx': [],
+            # todo: 'tensorrt'
+        }
+        if format not in removed_by_format:
+            log.error(f'Invalid format "{format}". Currently supported formats: "pytorch", "torchscript", "onnx".')
+            return []
+        to_remove = set(removed_by_format[format])
+        assert to_remove.issubset(full_list)
+        # Filter instead of using a set difference so the sorted order is kept.
+        return [name for name in full_list if name not in to_remove]
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,7 @@
+numpy
+transformers
+sentencepiece
+protobuf
+
+towhee
+torch
--- a/result.png
+++ b/result.png
--- a/test_onnx.py
+++ b/test_onnx.py
@ -0,0 +1,105 @@
+from towhee import ops
+import torch
+import numpy
+import onnx
+import onnxruntime
+
+import os
+from pathlib import Path
+import logging
+import platform
+import psutil
+
+import warnings
+from transformers import logging as t_logging
+
+# Quiet third-party output: set TF_CPP_MIN_LOG_LEVEL, ignore all Python warnings,
+# and limit transformers logging to errors.
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+warnings.filterwarnings("ignore")
+t_logging.set_verbosity_error()
+
+# full_models = AutoTransformers.supported_model_names()
+# checked_models = AutoTransformers.supported_model_names(format='onnx')
+# models = [x for x in full_models if x not in checked_models]
+# Model names to check, the sample text fed to each of them, and the absolute
+# tolerance used when comparing operator output with ONNX output.
+models = ['distilbert-base-cased', 'sentence-transformers/paraphrase-albert-small-v2']
+test_txt = 'hello, world.'
+atol = 1e-3
+log_path = 'transformers_onnx.log'
+# CSV report: one header row here, then one status row per model.
+# NOTE(review): the handle is opened without a context manager; whoever finishes
+# writing the rows is responsible for closing it.
+f = open('onnx.csv', 'w+')
+f.write('model,load_op,save_onnx,check_onnx,run_onnx,accuracy\n')
+
+# Logger with two handlers sharing one format: everything (DEBUG and up) goes to
+# the log file, only ERROR and up goes to the stream handler.
+logger = logging.getLogger('transformers_onnx')
+logger.setLevel(logging.DEBUG)
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+fh = logging.FileHandler(log_path)
+fh.setLevel(logging.DEBUG)
+fh.setFormatter(formatter)
+logger.addHandler(fh)
+ch = logging.StreamHandler()
+ch.setLevel(logging.ERROR)
+ch.setFormatter(formatter)
+logger.addHandler(ch)
+
+# Record the machine the check ran on: platform/processor, memory in GB
+# (free/available/total, rounded), and CPU count.
+logger.debug(f'machine: {platform.platform()}-{platform.processor()}')
+logger.debug(f'free/available/total mem: {round(psutil.virtual_memory().free / (1024.0 ** 3))}'
+             f'/{round(psutil.virtual_memory().available / (1024.0 ** 3))}'
+             f'/{round(psutil.virtual_memory().total / (1024.0 ** 3))} GB')
+logger.debug(f'cpu: {psutil.cpu_count()}')
+
+
+# For every model: load the operator, export it to ONNX, validate the ONNX file, run it
+# with onnxruntime and compare the pooled output against the operator's own output.
+# `status` holds [model, load_op, save_onnx, check_onnx, run_onnx, accuracy].
+status = None
+try:
+    for name in models:
+        logger.info(f'***{name}***')
+        # Write the previous model's row here, because the steps below `continue`
+        # out of the iteration as soon as one of them fails.
+        if status:
+            f.write(','.join(status) + '\n')
+        status = [name] + ['fail'] * 5
+        try:
+            op = ops.sentence_embedding.transformers(model_name=name).get_op()
+            out1 = op(test_txt)
+            logger.info('OP LOADED.')
+            status[1] = 'success'
+        except Exception as e:
+            logger.error(f'FAIL TO LOAD OP: {e}')
+            continue
+        try:
+            # Use the path reported by `save_model`: the default export location is
+            # next to the operator source, not relative to the working directory.
+            onnx_path = str(op.save_model(model_type='onnx'))
+            logger.info('ONNX SAVED.')
+            status[2] = 'success'
+        except Exception as e:
+            logger.error(f'FAIL TO SAVE ONNX: {e}')
+            continue
+        try:
+            try:
+                onnx_model = onnx.load(onnx_path)
+                onnx.checker.check_model(onnx_model)
+            except Exception:
+                # Retry without loading external data before giving up.
+                saved_onnx = onnx.load(onnx_path, load_external_data=False)
+                onnx.checker.check_model(saved_onnx)
+            logger.info('ONNX CHECKED.')
+            status[3] = 'success'
+        except Exception as e:
+            logger.error(f'FAIL TO CHECK ONNX: {e}')
+            continue
+        try:
+            sess = onnxruntime.InferenceSession(onnx_path,
+                                                providers=onnxruntime.get_available_providers())
+            inputs = op.tokenizer(test_txt, return_tensors='np')
+            out2 = sess.run(output_names=['last_hidden_state'], input_feed=dict(inputs))[0]
+            # Pool the ONNX token embeddings the same way the operator does.
+            new_inputs = op.tokenizer(test_txt, return_tensors='pt')
+            out2 = op.post_proc(torch.from_numpy(out2), new_inputs)
+            logger.info('ONNX WORKED.')
+            status[4] = 'success'
+            if numpy.allclose(out1, out2, atol=atol):
+                logger.info('Check accuracy: OK')
+                status[5] = 'success'
+            else:
+                logger.info(f'Check accuracy: atol is larger than {atol}.')
+        except Exception as e:
+            logger.error(f'FAIL TO RUN ONNX: {e}')
+            continue
+
+    # Row of the last model.
+    if status:
+        f.write(','.join(status) + '\n')
+finally:
+    # Always release the CSV handle so the report is flushed to disk.
+    f.close()
+
+print('Finished.')