Add operator

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
4 years ago · 638a038911
3 changed files with 84 additions and 2 deletions
--- a/README.md
+++ b/README.md
@ -1,3 +1,50 @@
-# nlp-longformer
+# Operator: nlp-longformer

-2
+Author:
+
+## Overview
+
+
+
+## Interface
+
+```python
+__init__(self, model_name: str, framework: str = 'pytorch')
+```
+
+Args:
+
+- model_name:
+  - the model name for embedding
+  - supported types: str, for example 'xxx' or 'xxx'
+- framework:
+  - the framework of the model
+  - supported types: str, default is 'pytorch'
+
+```python
+__call__(self, txt: str)
+```
+
+Args:
+
+- txt:
+  - the input text to embed
+  - supported types: str
+
+Returns:
+
+The Operator returns a tuple Tuple[('feature_vector', numpy.ndarray)] containing the following fields:
+
+- feature_vector:
+  - the embedding of the input text
+  - data type: numpy.ndarray
+  - shape: (dim,), where dim is presumably the hidden size of the chosen model
+
+## Requirements
+
+
+
+## How it works
+
+
+
+## Reference
--- a/init.py
+++ b/init.py
--- a/nlp_longformer.py
+++ b/nlp_longformer.py
@ -0,0 +1,35 @@
+import numpy
+from typing import NamedTuple
+import torch
+from transformers import LongformerTokenizer, LongformerModel
+
+from towhee.operator import NNOperator
+
+import warnings
+warnings.filterwarnings('ignore')
+
+class NlpLongformer(NNOperator):
+    """
+    NLP embedding operator that uses the pretrained longformer model gathered by huggingface.
+    The Longformer model was presented in Longformer: The Long-Document Transformer by Iz Beltagy,
+    Matthew E. Peters, Arman Cohan.
+    Ref: https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/longformer#transformers.LongformerConfig
+
+    Args:
+        model_name (`str`):
+            Which model to use for the embeddings.
+    """
+    def __init__(self, model_name: str) -> None:
+        self.model = LongformerModel.from_pretrained(model_name)
+        self.tokenizer = LongformerTokenizer.from_pretrained(model_name)
+
+    def __call__(self, txt: str) -> NamedTuple('Outputs', [('feature_vector', numpy.ndarray)]):
+        input_ids = torch.tensor(self.tokenizer.encode(txt)).unsqueeze(0)
+        attention_mask = None
+        outs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
+        feature_vector = outs[1].squeeze()
+        Outputs = NamedTuple('Outputs', [('feature_vector', numpy.ndarray)])
+        return Outputs(feature_vector.detach().numpy())
+
+    def get_model(self):
+        return self.model