diff --git a/README.md b/README.md
index 7616723..7ecd753 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,53 @@
-# nlp-longformer
+# Operator: nlp-longformer
 
-2
\ No newline at end of file
+Author:
+
+## Overview
+
+This operator embeds text with a pretrained Longformer model from Hugging Face Transformers.
+
+## Interface
+
+```python
+__init__(self, model_name: str, framework: str = 'pytorch')
+```
+
+Args:
+
+- model_name:
+  - the model name for embedding
+  - supported types: str, for example 'allenai/longformer-base-4096' or 'allenai/longformer-large-4096'
+- framework:
+  - the framework of the model
+  - supported types: str, default is 'pytorch'
+
+```python
+__call__(self, txt: str)
+```
+
+Args:
+
+- txt:
+  - the text to embed
+  - supported types: str
+
+Returns:
+
+The Operator returns a tuple Tuple[('feature_vector', numpy.ndarray)] containing the following fields:
+
+- feature_vector:
+  - the embedding of the input text (the model's pooled output)
+  - data type: numpy.ndarray
+  - shape: (hidden_size,), e.g. (768,) for 'allenai/longformer-base-4096'
+
+## Requirements
+
+numpy, torch, transformers, towhee
+
+## How it works
+
+The text is tokenized with `LongformerTokenizer`, run through `LongformerModel`, and the pooled output is returned as the embedding.
+
+## Reference
+
+https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/longformer
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/nlp_longformer.py b/nlp_longformer.py
new file mode 100644
index 0000000..955060c
--- /dev/null
+++ b/nlp_longformer.py
@@ -0,0 +1,54 @@
+import numpy
+from typing import NamedTuple
+import torch
+from transformers import LongformerTokenizer, LongformerModel
+
+from towhee.operator import NNOperator
+
+import warnings
+warnings.filterwarnings('ignore')
+
+class NlpLongformer(NNOperator):
+    """
+    NLP embedding operator that uses the pretrained longformer model gathered by huggingface.
+    The Longformer model was presented in Longformer: The Long-Document Transformer by Iz Beltagy,
+    Matthew E. Peters, Arman Cohan.
+    Ref: https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/longformer#transformers.LongformerConfig
+
+    Args:
+        model_name (`str`):
+            Which model to use for the embeddings.
+        framework (`str`):
+            Framework passed on to `NNOperator`; defaults to 'pytorch'.
+    """
+    def __init__(self, model_name: str, framework: str = 'pytorch') -> None:
+        # Initialise the NNOperator base class before setting any attributes.
+        super().__init__(framework=framework)
+        self.model = LongformerModel.from_pretrained(model_name)
+        self.tokenizer = LongformerTokenizer.from_pretrained(model_name)
+
+    def __call__(self, txt: str) -> NamedTuple('Outputs', [('feature_vector', numpy.ndarray)]):
+        """
+        Embed one text with the Longformer model.
+
+        Args:
+            txt (`str`):
+                The text to embed.
+
+        Returns:
+            A named tuple whose `feature_vector` field is the model's pooled
+            output with singleton dimensions squeezed out, as a numpy array.
+        """
+        # Add a batch dimension: the token ids go in as a (1, sequence) tensor.
+        input_ids = torch.tensor(self.tokenizer.encode(txt)).unsqueeze(0)
+        # Inference only, so skip building the autograd graph.
+        with torch.no_grad():
+            outs = self.model(input_ids)
+        # Index 1 of the model output is the pooled output.
+        feature_vector = outs[1].squeeze()
+        Outputs = NamedTuple('Outputs', [('feature_vector', numpy.ndarray)])
+        return Outputs(feature_vector.numpy())
+
+    def get_model(self):
+        """Return the underlying `LongformerModel` instance."""
+        return self.model