towhee
/
nlp-longformer
copied
3 changed files with 84 additions and 2 deletions
@ -1,3 +1,50 @@ |
|||
# nlp-longformer |
|||
# Operator: nlp-longformer |
|||
|
|||
2 |
|||
Author: |
|||
|
|||
## Overview |
|||
|
|||
|
|||
|
|||
## Interface |
|||
|
|||
```python |
|||
__init__(self, model_name: str, framework: str = 'pytorch') |
|||
``` |
|||
|
|||
Args: |
|||
|
|||
- model_name: |
|||
- the model name for embedding |
|||
- supported types: str, for example 'allenai/longformer-base-4096' or 'allenai/longformer-large-4096' |
|||
- framework: |
|||
- the framework of the model |
|||
- supported types: str, default is 'pytorch' |
|||
|
|||
```python |
|||
__call__(self, txt: str) |
|||
``` |
|||
|
|||
Args: |
|||
|
|||
- txt: |
|||
- the input text to embed |
|||
- supported types: str |
|||
Returns: |
|||
|
|||
The Operator returns a tuple Tuple[('feature_vector', numpy.ndarray)] containing the following fields: |
|||
|
|||
- feature_vector: |
|||
- the embedding of the input text (the pooled output of the Longformer model) |
|||
- data type: numpy.ndarray |
|||
- shape: (hidden_size,) |
|||
|
|||
## Requirements |
|||
|
|||
|
|||
|
|||
## How it works |
|||
|
|||
|
|||
|
|||
## Reference |
|||
|
@ -0,0 +1,35 @@ |
|||
import numpy |
|||
from typing import NamedTuple |
|||
import torch |
|||
from transformers import LongformerTokenizer, LongformerModel |
|||
|
|||
from towhee.operator import NNOperator |
|||
|
|||
import warnings |
|||
warnings.filterwarnings('ignore') |
|||
|
|||
class NlpLongformer(NNOperator):
    """
    NLP embedding operator that uses the pretrained longformer model gathered by huggingface.
    The Longformer model was presented in Longformer: The Long-Document Transformer by Iz Beltagy,
    Matthew E. Peters, Arman Cohan.
    Ref: https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/longformer#transformers.LongformerConfig

    Args:
        model_name (`str`):
            Which model to use for the embeddings.
        framework (`str`):
            The framework of the model, forwarded to the base operator.
            Defaults to 'pytorch'.
    """

    def __init__(self, model_name: str, framework: str = 'pytorch') -> None:
        # Initialise the base operator first: it sets up its own attributes
        # and must not overwrite the model assigned below.
        super().__init__(framework=framework)
        self.model = LongformerModel.from_pretrained(model_name)
        # Inference only: make sure dropout layers are disabled.
        self.model.eval()
        self.tokenizer = LongformerTokenizer.from_pretrained(model_name)

    def __call__(self, txt: str) -> NamedTuple('Outputs', [('feature_vector', numpy.ndarray)]):
        """
        Embed a single piece of text.

        Args:
            txt (`str`):
                The text to embed. Text longer than the tokenizer's maximum
                model length is truncated instead of failing inside the model.

        Returns:
            A named tuple `Outputs` with one field, `feature_vector`, holding
            the model's pooled output as a numpy array with the batch
            dimension squeezed away.
        """
        token_ids = self.tokenizer.encode(txt, truncation=True)
        # Add the batch dimension and keep the input on the model's device so
        # the call still works if the model was moved via get_model().
        input_ids = torch.tensor(token_ids).unsqueeze(0).to(self.model.device)

        # No gradients are needed for embedding extraction.
        with torch.no_grad():
            outs = self.model(input_ids)

        # Index 1 of the model output is the pooled output.
        feature_vector = outs[1].squeeze()

        Outputs = NamedTuple('Outputs', [('feature_vector', numpy.ndarray)])
        return Outputs(feature_vector.detach().cpu().numpy())

    def get_model(self):
        """Return the underlying Longformer model instance."""
        return self.model
Loading…
Reference in new issue