Add operator

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
3 years ago · 638a038911
3 changed files with 84 additions and 2 deletions
--- a/README.md
+++ b/README.md
@ -1,3 +1,50 @@
 # nlp-longformer
 # Operator: nlp-longformer
 2
 Author:
 ## Overview
 ## Interface
 ```python
 __init__(self, model_name: str, framework: str = 'pytorch')
 ```
 Args:
 - model_name:
  - the model name for embedding
  - supported types: str, for example 'xxx' or 'xxx'
 - framework:
  - the framework of the model
  - supported types: str, default is 'pytorch'
 ```python
 __call__(self, txt: str)
 ```
 Args:
 - txt:
  - the input text to compute an embedding for
  - supported types: str
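 Returns:
 The Operator returns a tuple Tuple[('feature_vector', numpy.ndarray)] containing the following fields:
 - feature_vector:
  - the embedding of the input text (the model's pooled output)
  - data type: numpy.ndarray
  - shape: (hidden_size,), where hidden_size depends on the chosen model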
 ## Requirements
 ## How it works
 ## Reference
--- a/init.py
+++ b/init.py
--- a/nlp_longformer.py
+++ b/nlp_longformer.py
@ -0,0 +1,35 @@
 import numpy
 from typing import NamedTuple
 import torch
 from transformers import LongformerTokenizer, LongformerModel
 from towhee.operator import NNOperator
 import warnings
 warnings.filterwarnings('ignore')
 class NlpLongformer(NNOperator):
     """
     NLP embedding operator that uses the pretrained longformer model gathered by huggingface.
     The Longformer model was presented in Longformer: The Long-Document Transformer by Iz Beltagy,
     Matthew E. Peters, Arman Cohan.
     Ref: https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/longformer#transformers.LongformerConfig

     Args:
         model_name (`str`):
             Which model to use for the embeddings. Passed unchanged to
             `LongformerModel.from_pretrained` and `LongformerTokenizer.from_pretrained`.
     """

     def __init__(self, model_name: str) -> None:
         # Let the NNOperator base class set up its own state before we add ours.
         super().__init__()
         self.model = LongformerModel.from_pretrained(model_name)
         self.tokenizer = LongformerTokenizer.from_pretrained(model_name)

     def __call__(self, txt: str) -> NamedTuple('Outputs', [('feature_vector', numpy.ndarray)]):
         """
         Compute the embedding of a single piece of text.

         Args:
             txt (`str`):
                 The text to embed. Text that tokenizes to more tokens than the
                 tokenizer's configured maximum length is truncated to that length.

         Returns:
             A named tuple `Outputs` with one field, `feature_vector`, holding the
             second element of the model output (squeezed) as a `numpy.ndarray`.
         """
         # truncation=True caps the sequence at the tokenizer's model_max_length so
         # over-long inputs do not overflow the model's position embeddings.
         token_ids = self.tokenizer.encode(txt, truncation=True)
         # The model expects a batch dimension; we always embed one text at a time.
         input_ids = torch.tensor(token_ids).unsqueeze(0)
         # Inference only: skip autograd bookkeeping for the forward pass.
         with torch.no_grad():
             outs = self.model(input_ids)
         # Index 1 of the model output is the pooled output; drop the batch dimension.
         feature_vector = outs[1].squeeze()
         Outputs = NamedTuple('Outputs', [('feature_vector', numpy.ndarray)])
         return Outputs(feature_vector.numpy())

     def get_model(self):
         """Return the underlying `LongformerModel` instance."""
         return self.model