sbert/s_bert.py


								# Copyright 2021 Zilliz. All rights reserved.

								#

								# Licensed under the Apache License, Version 2.0 (the "License");

								# you may not use this file except in compliance with the License.

								# You may obtain a copy of the License at

								#

								#     http://www.apache.org/licenses/LICENSE-2.0

								#

								# Unless required by applicable law or agreed to in writing, software

								# distributed under the License is distributed on an "AS IS" BASIS,

								# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

								# See the License for the specific language governing permissions and

								# limitations under the License.


								import logging

								import numpy

								from typing import Union, List

								from pathlib import Path

								from functools import partial


								import torch

								from sentence_transformers import SentenceTransformer

								from transformers import AutoTokenizer, AutoConfig


								from towhee.operator import NNOperator

								try:

								    from towhee import accelerate

								except:

								    def accelerate(func):

								        return func


								import os

								import warnings


								warnings.filterwarnings('ignore')

								logging.getLogger('sentence_transformers').setLevel(logging.ERROR)

								log = logging.getLogger('op_s_transformers')

								log.setLevel(logging.ERROR)


								class ConvertModel(torch.nn.Module):

								    def __init__(self, model):

								        super().__init__()

								        self.net = model

								        try:

								            self.input_names = self.net.tokenizer.model_input_names

								        except AttributeError:

								            self.input_names = list(self.net.tokenize(['test']).keys())


								    def forward(self, *args, **kwargs):

								        if args:

								            assert kwargs == {}, 'Only accept neither args or kwargs as inputs.'

								            assert len(args) == len(self.input_names)

								            for k, v in zip(self.input_names, args):

								                kwargs[k] = v

								        outs = self.net(kwargs)

								        return outs['sentence_embedding']


								@accelerate

								class Model:

								    def __init__(self, model_name, device):

								        self.device = device

								        self.model = SentenceTransformer(model_name_or_path=model_name, device=self.device)

								        self.model.eval()


								    def __call__(self, *_, **kwargs):

								        new_kwargs = {}

								        for k, v in kwargs.items():

								            new_kwargs[k] = v.to(self.device)

								        outs = self.model(new_kwargs)

								        return outs['sentence_embedding']


								class STransformers(NNOperator):

								    """

								    Operator using pretrained Sentence Transformers

								    """


								    def __init__(self, model_name: str = None, device: str = None, return_usage: bool = False):

								        self.model_name = model_name

								        if device:

								            self.device = device

								        else:

								            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'


								        if self.model_name:

								            self.model = Model(model_name=self.model_name, device=self.device)

								        else:

								            log.warning('The operator is initialized without specified model.')

								        self._tokenize = self.get_tokenizer()

								        self.return_usage = return_usage


								    def __call__(self, txt: Union[List[str], str]):

								        if isinstance(txt, str):

								            sentences = [txt]

								        else:

								            sentences = txt

								        inputs = self._tokenize(sentences)

								        num_tokens = int(torch.count_nonzero(inputs['input_ids']))

								        embs = self.model(**inputs).cpu().detach().numpy()

								        if isinstance(txt, str):

								            embs = embs.squeeze(0)

								        else:

								            embs = list(embs)

								        if self.return_usage:

								            return {'data': embs, 'token_usage': num_tokens}

								        return embs


								    @property

								    def supported_formats(self):

								        return ['onnx']


								    def get_tokenizer(self):

								        try:

								            tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/' + self.model_name)

								            conf = AutoConfig.from_pretrained('sentence-transformers/' + self.model_name)

								        except Exception:

								            tokenizer = AutoTokenizer.from_pretrained(self.model_name)

								            conf = AutoConfig.from_pretrained(self.model_name)

								        return partial(tokenizer,

								                       padding=True,

								                       truncation='longest_first',

								                       max_length=conf.max_position_embeddings,

								                       return_tensors='pt')

								    # @property

								    # def max_seq_length(self):

								    #     import json

								    #     from torch.hub import _get_torch_home

								    #     torch_cache = _get_torch_home()

								    #     sbert_cache = os.path.join(torch_cache, 'sentence_transformers')

								    #     cfg_path = os.path.join(sbert_cache, 'sentence-transformers_' + self.model_name, 'sentence_bert_config.json')

								    #     if not os.path.exists(cfg_path):

								    #         cfg_path = os.path.join(sbert_cache, self.model_name, 'config.json')

								    #         k = 'max_position_embeddings'

								    #     else:

								    #         k = 'max_seq_length'

								    #     with open(cfg_path) as f:

								    #         cfg = json.load(f)

								    #         if k in cfg:

								    #             max_seq_len = cfg[k]

								    #         else:

								    #             max_seq_len = None

								    #     return max_seq_len


								    @property

								    def _model(self):

								        return self.model.model


								    def save_model(self, format: str = 'pytorch', path: str = 'default'):

								        if path == 'default':

								            path = str(Path(__file__).parent)

								            path = os.path.join(path, 'saved', format)

								            os.makedirs(path, exist_ok=True)

								            name = self.model_name.replace('/', '-')

								            path = os.path.join(path, name)

								            if format in ['pytorch', 'torchscript']:

								                path = path + '.pt'

								            elif format == 'onnx':

								                path = path + '.onnx'

								            else:

								                raise AttributeError(f'Invalid format {format}.')

								        dummy_text = ['[CLS]']

								        dummy_input = self._tokenize(dummy_text)

								        if format == 'pytorch':

								            torch.save(self._model, path)

								        elif format == 'torchscript':

								            try:

								                try:

								                    jit_model = torch.jit.script(self._model)

								                except Exception:

								                    jit_model = torch.jit.trace(self._model, dummy_input, strict=False)

								                torch.jit.save(jit_model, path)

								            except Exception as e:

								                log.error(f'Fail to save as torchscript: {e}.')

								                raise RuntimeError(f'Fail to save as torchscript: {e}.')

								        elif format == 'onnx':

								            new_model = ConvertModel(self._model)

								            input_names = list(dummy_input.keys())

								            dynamic_axes = {}

								            for i_n, i_v in dummy_input.items():

								                if len(i_v.shape) == 1:

								                    dynamic_axes[i_n] = {0: 'batch_size'}

								                else:

								                    dynamic_axes[i_n] = {0: 'batch_size', 1: 'sequence_length'}

								            dynamic_axes['output_0'] = {0: 'batch_size', 1: 'emb_dim'}

								            try:

								                torch.onnx.export(new_model.to('cpu'),

								                                  tuple(dummy_input.values()),

								                                  path,

								                                  input_names=input_names,

								                                  output_names=['output_0'],

								                                  opset_version=13,

								                                  dynamic_axes=dynamic_axes,

								                                  do_constant_folding=True

								                                  )

								            except Exception as e:

								                log.error(f'Fail to save as onnx: {e}.')

								                raise RuntimeError(f'Fail to save as onnx: {e}.')

								        # todo: elif format == 'tensorrt':

								        else:

								            log.error(f'Unsupported format "{format}".')

								        return Path(path).resolve()


								    @staticmethod

								    def supported_model_names(format: str = None):

								        full_list = [

								            'clip-ViT-B-32-multilingual-v1',

								            'sentence-t5-xxl',

								            'sentence-t5-xl',

								            'sentence-t5-large',

								            'sentence-t5-base',

								            'all-distilroberta-v1',

								            'gtr-t5-xxl',

								            'gtr-t5-large',

								            'gtr-t5-xl',

								            'all-MiniLM-L12-v1',

								            'all-MiniLM-L12-v2',

								            'all-MiniLM-L6-v1',

								            'all-MiniLM-L6-v2',

								            'all-mpnet-base-v1',

								            'all-mpnet-base-v2',

								            'all-roberta-large-v1',

								            'bert-base-nli-mean-tokens',

								            'gtr-t5-base',

								            'distiluse-base-multilingual-cased-v1',

								            'distiluse-base-multilingual-cased-v2',

								            'msmarco-bert-base-dot-v5',

								            'msmarco-distilbert-base-tas-b',

								            'msmarco-distilbert-base-v4',

								            'msmarco-distilbert-dot-v5',

								            'multi-qa-distilbert-cos-v1',

								            'multi-qa-distilbert-dot-v1',

								            'multi-qa-MiniLM-L6-cos-v1',

								            'multi-qa-MiniLM-L6-dot-v1',

								            'multi-qa-mpnet-base-cos-v1',

								            'multi-qa-mpnet-base-dot-v1',

								            'paraphrase-albert-small-v2',

								            'paraphrase-distilroberta-base-v2',

								            'average_word_embeddings_komninos',

								            'paraphrase-MiniLM-L12-v2',

								            'paraphrase-MiniLM-L3-v2',

								            'average_word_embeddings_glove.6B.300d',

								            'paraphrase-MiniLM-L6-v2',

								            'paraphrase-mpnet-base-v2',

								            'paraphrase-multilingual-MiniLM-L12-v2',

								            'paraphrase-multilingual-mpnet-base-v2',

								            'paraphrase-TinyBERT-L6-v2'

								        ]

								        full_list.sort()

								        if format is None:

								            model_list = full_list

								        elif format == 'pytorch':

								            to_remove = []

								            assert set(to_remove).issubset(set(full_list))

								            model_list = list(set(full_list) - set(to_remove))

								        elif format == 'onnx':

								            to_remove = ['gtr-t5-xxl', 'sentence-t5-xxl']

								            assert set(to_remove).issubset(set(full_list))

								            model_list = list(set(full_list) - set(to_remove))

								        else:

								            log.error(f'Invalid or unsupported format "{format}".')

								        return model_list


								    def train(self, training_config=None, **kwargs):

								        from .train_sts_task import train_sts

								        train_sts(self._model, training_config)


								if __name__ == '__main__':

								    from sentence_transformers import util

								    op = STransformers(model_name='nli-distilroberta-base-v2')

								    # Check if dataset exsist. If not, download and extract  it

								    sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'


								    if not os.path.exists(sts_dataset_path):

								        util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)


								    training_config = {

								        'sts_dataset_path': sts_dataset_path,

								        'train_batch_size': 16,

								        'num_epochs': 4,

								        'model_save_path': './output'

								    }

								    op.train(training_config)