From 923b3c6ff4cba0972c4d074f0c8fad000bf897f5 Mon Sep 17 00:00:00 2001 From: Jael Gu Date: Fri, 25 Mar 2022 18:01:15 +0800 Subject: [PATCH] Refactor operator Signed-off-by: Jael Gu --- README.md | 74 +++++++++++++++++++++++++++++++++++++++++++- __init__.py | 19 ++++++++++++ auto_transformers.py | 70 +++++++++++++++++++++++++++++++++++++++++ requirements.txt | 4 +++ 4 files changed, 166 insertions(+), 1 deletion(-) create mode 100644 __init__.py create mode 100644 auto_transformers.py create mode 100644 requirements.txt diff --git a/README.md b/README.md index e2cfae6..3e61838 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,74 @@ -# transformers +# Text Embedding with Transformers + +*author: Jael Gu* + + + +## Description + +A text embedding operator implemented with pretrained models from [Huggingface Transformers](https://huggingface.co/docs/transformers). + + + +```python +from towhee import ops + +text_encoder = ops.text_embedding.transformers("bert-base-cased") +text_embedding = text_encoder("Hello, world.") +``` + +## Factory Constructor + +Create the operator via the following factory method + +***ops.text_embedding.transformers(model_name)*** + + + +## Interface + +A text embedding operator takes a sentence, paragraph, or document in string as an input +and outputs an embedding vector in ndarray which captures the input's core semantic elements. + + +**Parameters:** + +​ ***text***: *str* + +​ The text in string. + + + +**Returns**: *numpy.ndarray* + +​ The text embedding extracted by model. + + + +## Code Example + +Use the pretrained Bert-Base-Cased model ('bert-base-cased') +to generate a text embedding for the sentence "Hello, world.". 
def transformers(model_name: str):
    """
    Factory method that builds the text embedding operator.

    The original factory took no arguments and called ``AutoTransformers()``
    without the ``model_name`` its constructor requires, so every call failed
    with a ``TypeError``. The name is now accepted here and forwarded.

    Args:
        model_name (`str`):
            Name of the pretrained model to load, e.g. 'bert-base-cased'.

    Returns:
        A new ``AutoTransformers`` operator constructed with ``model_name``.
    """
    return AutoTransformers(model_name)
@register(output_schema=['vec'])
class AutoTransformers(NNOperator):
    """
    NLP embedding operator backed by a pretrained Hugging Face Transformers model.

    The model and its tokenizer are both loaded by name, through
    ``AutoModel.from_pretrained`` and ``AutoTokenizer.from_pretrained``, when
    the operator is constructed. Calling the operator on a string returns the
    model's ``last_hidden_state`` for that string as a numpy array.

    Args:
        model_name (`str`):
            Which pretrained model (and matching tokenizer) to use for the
            embeddings, e.g. 'bert-base-cased'.
    """

    def __init__(self, model_name: str) -> None:
        """
        Load the pretrained model and tokenizer identified by ``model_name``.

        Raises:
            Exception: whatever ``from_pretrained`` raises; the failure is
                logged and then re-raised unchanged.
        """
        super().__init__()
        self.model_name = model_name
        try:
            self.model = AutoModel.from_pretrained(model_name)
        except Exception:
            log.error('Fail to load model by name: %s', self.model_name)
            # Bare raise keeps the original exception and traceback intact.
            raise
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        except Exception:
            log.error('Fail to load tokenizer by name: %s', self.model_name)
            raise

    def __call__(self, txt: str) -> numpy.ndarray:
        """
        Embed ``txt`` with the loaded model.

        Args:
            txt (`str`):
                The text to embed.

        Returns:
            (`numpy.ndarray`)
                ``last_hidden_state`` with the leading dimension squeezed out.
                For a single input string this is expected to be one row per
                token, i.e. shape (num_tokens, hidden_size); the token rows
                are not pooled into a single vector.

        Raises:
            Exception: any error from the tokenizer, the model forward pass or
                the feature extraction is logged and re-raised unchanged.
        """
        # Local import: this file's module-level imports do not include torch,
        # which return_tensors="pt" below already requires at runtime.
        import torch

        try:
            inputs = self.tokenizer(txt, return_tensors="pt")
        except Exception:
            log.error('Invalid input for the tokenizer: %s', self.model_name)
            raise
        try:
            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                outs = self.model(**inputs)
        except Exception:
            log.error('Invalid input for the model: %s', self.model_name)
            raise
        try:
            # squeeze(0) drops the leading dimension only when its size is 1.
            features = outs.last_hidden_state.squeeze(0)
        except Exception:
            log.error('Fail to extract features by model: %s', self.model_name)
            raise
        # .cpu() is a no-op on CPU tensors and keeps .numpy() valid otherwise.
        feature_vector = features.detach().cpu().numpy()
        return feature_vector