Add files

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
3 years ago · 46757653c9
6 changed files with 459 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -1,2 +1,115 @@
 # sbert
 # Sentence Embedding with Sentence Transformers
 *author: [Jael Gu](https://github.com/jaelgu)*
 <br />
 ## Description
 This operator takes a sentence or a list of sentences in string as input.
 It generates an embedding vector in numpy.ndarray for each sentence, which captures the input sentence's core semantic elements.
 This operator is implemented with pre-trained models from [Sentence Transformers](https://www.sbert.net/).
 <br />
 ## Code Example
 Use the pre-trained model "all-MiniLM-L12-v2"
 to generate a text embedding for the sentence "This is a sentence.".
 *Write the same pipeline with explicit input/output name specifications:*
 - **option 1 (towhee>=0.9.0):**
 ```python
 from towhee.dc2 import pipe, ops, DataCollection
 p = (
    pipe.input('sentence')
        .map('sentence', 'vec', ops.sentence_embedding.sbert(model_name='all-MiniLM-L12-v2'))
        .output('sentence', 'vec')
 )
 DataCollection(p('This is a sentence.')).show()
 ```
 <img src="./result.png" width="800px"/>
 - **option 2:**
 ```python
 import towhee
 (
    towhee.dc['sentence'](['This is a sentence.'])
          .sentence_embedding.sbert['sentence', 'vec'](model_name='all-MiniLM-L12-v2')
          .show()
 )
 ```
 <br />
 ## Factory Constructor
 Create the operator via the following factory method:
 ***sentence_embedding.sbert(model_name='all-MiniLM-L12-v2')***
 **Parameters:**
 ***model_name***: *str*
 The model name in string. Supported model names:
 Refer to [SBert Doc](https://www.sbert.net/docs/pretrained_models.html).
 Please note that only models listed in `supported_model_names` are tested.
 You can refer to [Towhee Pipeline]() for model performance.
 ***device***: *str*
 The device on which to run the model; defaults to None.
 If None, it will use 'cuda' automatically when cuda is available.
 <br />
 ## Interface
 The operator takes a sentence or a list of sentences in string as input.
 It loads tokenizer and pre-trained model using model name,
 and then returns text embedding in numpy.ndarray.
 ***__call__(txt)***
 **Parameters:**
 ***txt***: *Union[List[str], str]*
 	A sentence or a list of sentences in string.
 **Returns**:
 *Union[List[numpy.ndarray], numpy.ndarray]*
 	If input is a sentence in string, then it returns an embedding vector of shape (dim,) in numpy.ndarray.
 If input is a list of sentences, then it returns a list of embedding vectors, each of which is a numpy.ndarray of shape (dim,).
 <br/>
 ***supported_model_names(format=None)***
 Get a list of all supported model names or supported model names for specified model format.
 **Parameters:**
 ***format***: *str*
 	The model format such as 'pytorch', defaults to None.
 If None, it will return a full list of supported model names.
 ```python
 from towhee import ops
 op = ops.sentence_embedding.sbert().get_op()
 full_list = op.supported_model_names()
 onnx_list = op.supported_model_names(format='onnx')
 ```
--- a/init.py
+++ b/init.py
@ -0,0 +1,19 @@
 # Copyright 2021 Zilliz. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .s_bert import STransformers
 def sbert(*args, **kwargs):
    """Build an ``STransformers`` operator, forwarding all arguments unchanged."""
    operator = STransformers(*args, **kwargs)
    return operator
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
 sentence_transformers
 torch
--- a/result.png
+++ b/result.png
--- a/s_bert.py
+++ b/s_bert.py
@ -0,0 +1,221 @@
 # Copyright 2021 Zilliz. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
 import numpy
 from typing import Union, List
 from pathlib import Path
 import torch
 from sentence_transformers import SentenceTransformer
 from towhee.operator import NNOperator
 # from towhee.dc2 import accelerate
 import os
 import warnings
 # Install an 'ignore' filter for every warning as soon as this module is imported.
 warnings.filterwarnings('ignore')
 # Raise the 'sentence_transformers' logger threshold so only ERROR and above pass.
 logging.getLogger('sentence_transformers').setLevel(logging.ERROR)
 # Logger used for this operator's own messages.
 log = logging.getLogger('op_sbert')
 class ConvertModel(torch.nn.Module):
    """
    Wrap a model so it accepts positional or keyword inputs and returns only the embedding.

    The wrapped model is called with a single dict of features and must return a
    mapping that contains a ``'sentence_embedding'`` entry.

    Args:
        model: the model to wrap. Its input names are read from
            ``model.tokenizer.model_input_names``; when that attribute chain is
            missing, the keys of ``model.tokenize(['test'])`` are used instead.
    """
    def __init__(self, model):
        super().__init__()
        self.net = model
        try:
            self.input_names = self.net.tokenizer.model_input_names
        except AttributeError:
            # No usable tokenizer attribute: derive the names from a sample tokenization.
            self.input_names = list(self.net.tokenize(['test']).keys())

    def forward(self, *args, **kwargs):
        """
        Run the wrapped model and return its ``'sentence_embedding'`` output.

        Args:
            *args: inputs given positionally, in the order of ``self.input_names``.
            **kwargs: inputs given by name. Must be empty when ``args`` is used.

        Raises:
            AssertionError: if positional and keyword inputs are mixed, or the number
                of positional inputs differs from ``len(self.input_names)``. The
                exception type is unchanged from the former ``assert`` statements,
                but the checks now also run under ``python -O``.
        """
        if args:
            if kwargs:
                raise AssertionError('Only accept either args or kwargs as inputs.')
            if len(args) != len(self.input_names):
                # Without this check zip() below would silently drop the extra items.
                raise AssertionError(
                    f'Expected {len(self.input_names)} positional inputs, got {len(args)}.'
                )
            kwargs = dict(zip(self.input_names, args))
        outs = self.net(kwargs)
        return outs['sentence_embedding']
 # @accelerate
 class Model:
    """Callable adapter: takes keyword features, returns the ``'sentence_embedding'`` entry."""
    def __init__(self, model):
        # The wrapped callable; it is invoked with one dict of features.
        self.model = model

    def __call__(self, **features):
        """Pass ``features`` as a single dict to the wrapped model and pick its embedding."""
        return self.model(features)['sentence_embedding']
 class STransformers(NNOperator):
    """
    Operator using pretrained Sentence Transformers

    Args:
        model_name (`str`): name passed to ``SentenceTransformer``. When it is empty
            or None no model is loaded and only a warning is logged.
        device (`str`): device the model is created on. When falsy, 'cuda' is used if
            ``torch.cuda.is_available()`` and 'cpu' otherwise.
    """
    # NOTE(review): NNOperator.__init__ is not invoked here - confirm that the base
    # class needs no initialization of its own.
    def __init__(self, model_name: str = None, device: str = None):
        self.model_name = model_name
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        if self.model_name:
            self.model = Model(self._model)
        else:
            log.warning('The operator is initialized without specified model.')

    def __call__(self, txt: Union[List[str], str]):
        """
        Embed one sentence or a list of sentences.

        Args:
            txt: a single string, or a list of strings.

        Returns:
            For a single string, the model output converted to numpy with its first
            axis squeezed out. Otherwise a list holding one numpy row per sentence.
        """
        single = isinstance(txt, str)
        sentences = [txt] if single else txt
        inputs = self.tokenize(sentences)
        embs = self.model(**inputs).cpu().detach().numpy()
        return embs.squeeze(0) if single else list(embs)

    @property
    def _model(self):
        """
        The ``SentenceTransformer`` for the current model name and device, in eval mode.

        The instance is cached per (model_name, device). It used to be rebuilt on
        every access, which reloaded the model on every ``tokenize()`` call. Changing
        ``model_name`` or ``device`` afterwards still yields a freshly built model.
        """
        key = (self.model_name, self.device)
        cached = getattr(self, '_model_cache', None)
        if cached is None or cached[0] != key:
            m = SentenceTransformer(model_name_or_path=self.model_name, device=self.device)
            m.eval()
            cached = (key, m)
            self._model_cache = cached
        return cached[1]

    @property
    def supported_formats(self):
        """Formats reported by this operator: only 'onnx'."""
        return ['onnx']

    def tokenize(self, x):
        """
        Tokenize ``x`` with the model's own ``tokenize``; fall back to ``AutoTokenizer``.

        The fallback first tries the name 'sentence-transformers/<model_name>' and then
        the bare model name. It pads, truncates to ``self.max_seq_length`` and returns
        PyTorch tensors.
        """
        try:
            return self._model.tokenize(x)
        except Exception as e:
            # Deliberate best-effort: any failure here leads to the AutoTokenizer path.
            log.debug(f'Model tokenize failed, falling back to AutoTokenizer: {e}')
        from transformers import AutoTokenizer
        try:
            tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/' + self.model_name)
        except Exception as e:
            log.error(e)
            log.warning(f'Fail to load tokenizer with sentence-transformers/{self.model_name}.'
                        f'Trying to load tokenizer with self.model_name...')
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        return tokenizer(
            x,
            padding=True, truncation='longest_first', max_length=self.max_seq_length,
            return_tensors='pt',
        )

    @property
    def max_seq_length(self):
        """
        Maximum sequence length read from the locally cached model configuration.

        Looks under ``<torch home>/sentence_transformers`` for
        'sentence-transformers_<model_name>/sentence_bert_config.json' (key
        'max_seq_length'), otherwise for '<model_name>/config.json' (key
        'max_position_embeddings'). Returns None when the key is absent. Opening the
        file raises if neither configuration exists.
        """
        import json
        from torch.hub import _get_torch_home
        sbert_cache = os.path.join(_get_torch_home(), 'sentence_transformers')
        cfg_path = os.path.join(sbert_cache, 'sentence-transformers_' + self.model_name, 'sentence_bert_config.json')
        k = 'max_seq_length'
        if not os.path.exists(cfg_path):
            cfg_path = os.path.join(sbert_cache, self.model_name, 'config.json')
            k = 'max_position_embeddings'
        with open(cfg_path, encoding='utf-8') as f:
            cfg = json.load(f)
        return cfg.get(k)

    def save_model(self, format: str = 'pytorch', path: str = 'default'):
        """
        Save the model as 'pytorch', 'torchscript' or 'onnx'.

        Args:
            format: one of 'pytorch', 'torchscript', 'onnx'.
            path: target file. 'default' resolves to
                ``<this file's directory>/saved/<format>/<model name with '/' -> '-'>``
                plus '.pt' or '.onnx'.

        Returns:
            The resolved ``pathlib.Path`` of the saved file.

        Raises:
            AttributeError: for any other format. This is now raised for a custom
                ``path`` as well, and before any directory is created; previously a
                custom path with an unknown format only logged an error and returned
                a path to a file that was never written.
            RuntimeError: when the torchscript or onnx export fails (chained to the cause).
        """
        suffixes = {'pytorch': '.pt', 'torchscript': '.pt', 'onnx': '.onnx'}
        if format not in suffixes:
            log.error(f'Unsupported format "{format}".')
            raise AttributeError(f'Invalid format {format}.')
        if path == 'default':
            save_dir = os.path.join(str(Path(__file__).parent), 'saved', format)
            os.makedirs(save_dir, exist_ok=True)
            path = os.path.join(save_dir, self.model_name.replace('/', '-')) + suffixes[format]
        if format == 'pytorch':
            torch.save(self._model, path)
            return Path(path).resolve()
        # Sample input used for tracing (torchscript fallback) and for the onnx export.
        dummy_input = self.tokenize(['[CLS]'])
        if format == 'torchscript':
            try:
                try:
                    jit_model = torch.jit.script(self._model)
                except Exception:
                    # Scripting failed: fall back to tracing with the sample input.
                    jit_model = torch.jit.trace(self._model, dummy_input, strict=False)
                torch.jit.save(jit_model, path)
            except Exception as e:
                log.error(f'Fail to save as torchscript: {e}.')
                raise RuntimeError(f'Fail to save as torchscript: {e}.') from e
        else:  # format == 'onnx'
            new_model = ConvertModel(self._model)
            input_names = list(dummy_input.keys())
            # 1-D inputs get a dynamic batch axis only; all others also get a dynamic
            # sequence axis.
            dynamic_axes = {
                i_n: {0: 'batch_size'} if len(i_v.shape) == 1
                else {0: 'batch_size', 1: 'sequence_length'}
                for i_n, i_v in dummy_input.items()
            }
            dynamic_axes['output_0'] = {0: 'batch_size', 1: 'emb_dim'}
            try:
                torch.onnx.export(new_model,
                                  tuple(dummy_input.values()),
                                  path,
                                  input_names=input_names,
                                  output_names=['output_0'],
                                  opset_version=13,
                                  dynamic_axes=dynamic_axes,
                                  do_constant_folding=True
                                  )
            except Exception as e:
                log.error(f'Fail to save as onnx: {e}.')
                raise RuntimeError(f'Fail to save as onnx: {e}.') from e
        # todo: format == 'tensorrt'
        return Path(path).resolve()

    @staticmethod
    def supported_model_names(format: str = None):
        """
        List model names scraped from the sbert.net sentence-embedding model page.

        Args:
            format: None for the full sorted list, or 'pytorch' / 'onnx' for the
                sorted, de-duplicated list minus that format's exclusions (both
                exclusion lists are currently empty).

        Returns:
            A sorted list of model names.

        Raises:
            AttributeError: for any other format. The previous code failed with an
                UnboundLocalError here; AttributeError matches ``save_model``.
        """
        import requests
        # Model names known not to work per format; none are recorded so far.
        to_remove = {'pytorch': [], 'onnx': []}
        if format is not None and format not in to_remove:
            log.error(f'Invalid or unsupported format "{format}".')
            raise AttributeError(f'Invalid or unsupported format "{format}".')
        # Bounded wait so an unresponsive server cannot hang the caller forever.
        req = requests.get("https://www.sbert.net/_static/html/models_en_sentence_embeddings.html", timeout=30)
        full_list = []
        # splitlines() handles LF as well as CRLF line endings.
        for line in req.text.splitlines():
            line = line.replace(' ', '').strip()
            if line.startswith('"name":'):
                full_list.append(line.split(':')[-1].replace('"', '').replace(',', ''))
        full_list.sort()
        if format is None:
            return full_list
        return sorted(set(full_list) - set(to_remove[format]))
--- a/test_onnx.py
+++ b/test_onnx.py
@ -0,0 +1,103 @@
 from towhee import ops
 import numpy
 import onnx
 import onnxruntime
 import os
 from pathlib import Path
 import logging
 import platform
 import psutil
 # Script: export each listed model to ONNX, validate the file, run it with
 # onnxruntime and compare its output with the operator's own output.
 # One result row per model is written to onnx.csv; details go to sbert.log.
 op = ops.sentence_embedding.sbert().get_op()
 # full_models = op.supported_model_names()
 # checked_models = AutoTransformers.supported_model_names(format='onnx')
 # models = [x for x in full_models if x not in checked_models]
 models = ['all-MiniLM-L12-v2']
 test_txt = 'hello, world.'
 atol = 1e-3
 log_path = 'sbert.log'

 # DEBUG and above go to the log file; only ERROR and above reach the console.
 logger = logging.getLogger('sbert_onnx')
 logger.setLevel(logging.DEBUG)
 formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 fh = logging.FileHandler(log_path)
 fh.setLevel(logging.DEBUG)
 fh.setFormatter(formatter)
 logger.addHandler(fh)
 ch = logging.StreamHandler()
 ch.setLevel(logging.ERROR)
 ch.setFormatter(formatter)
 logger.addHandler(ch)
 logger.debug(f'machine: {platform.platform()}-{platform.processor()}')
 logger.debug(f'free/available/total mem: {round(psutil.virtual_memory().free / (1024.0 ** 3))}'
             f'/{round(psutil.virtual_memory().available / (1024.0 ** 3))}'
             f'/{round(psutil.virtual_memory().total / (1024.0 ** 3))} GB')
 logger.debug(f'cpu: {psutil.cpu_count()}')


 def _check_model(name):
    """Run load/save/check/run/accuracy for one model and return its CSV row as a list."""
    logger.info(f'***{name}***')
    saved_name = name.replace('/', '-')
    onnx_path = f'saved/onnx/{saved_name}.onnx'
    # Columns: model, load_op, save_onnx, check_onnx, run_onnx, accuracy.
    status = [name] + ['fail'] * 5
    try:
        model_op = ops.sentence_embedding.sbert(model_name=name, device='cpu').get_op()
        out1 = model_op(test_txt)
        logger.info('OP LOADED.')
        status[1] = 'success'
    except Exception as e:
        logger.error(f'FAIL TO LOAD OP: {e}')
        return status
    try:
        model_op.save_model('onnx')
        logger.info('ONNX SAVED.')
        status[2] = 'success'
    except Exception as e:
        logger.error(f'FAIL TO SAVE ONNX: {e}')
        return status
    try:
        try:
            onnx_model = onnx.load(onnx_path)
            onnx.checker.check_model(onnx_model)
        except Exception:
            # Retry the check without loading external data.
            saved_onnx = onnx.load(onnx_path, load_external_data=False)
            onnx.checker.check_model(saved_onnx)
        logger.info('ONNX CHECKED.')
        status[3] = 'success'
    except Exception as e:
        # A failed check is recorded, but the run step below is still attempted.
        logger.error(f'FAIL TO CHECK ONNX: {e}')
    try:
        inputs = model_op._model.tokenize([test_txt])
        sess = onnxruntime.InferenceSession(onnx_path, providers=onnxruntime.get_available_providers())
        # Feed only the inputs the ONNX session declares.
        onnx_inputs = {
            n.name: inputs[n.name].cpu().detach().numpy()
            for n in sess.get_inputs()
            if n.name in inputs
        }
        out2 = sess.run(None, input_feed=onnx_inputs)[0].squeeze(0)
        logger.info('ONNX WORKED.')
        status[4] = 'success'
        if numpy.allclose(out1, out2, atol=atol):
            logger.info('Check accuracy: OK')
            status[5] = 'success'
        else:
            logger.info(f'Check accuracy: atol is larger than {atol}.')
    except Exception as e:
        logger.error(f'FAIL TO RUN ONNX: {e}')
    return status


 # The with-block closes onnx.csv even when a model check raises unexpectedly.
 with open('onnx.csv', 'w+') as f:
    f.write('model,load_op,save_onnx,check_onnx,run_onnx,accuracy\n')
    for name in models:
        f.write(','.join(_check_model(name)) + '\n')
 print('Finished.')