towhee
/
nlp-longformer
copied
3 changed files with 84 additions and 2 deletions
@ -1,3 +1,50 @@ |
|||||
# nlp-longformer |
|
||||
|
# Operator: nlp-longformer |
||||
|
|
||||
2 |
|
||||
|
Author: |
||||
|
|
||||
|
## Overview |
||||
|
|
||||
|
|
||||
|
|
||||
|
## Interface |
||||
|
|
||||
|
```python |
||||
|
__init__(self, model_name: str, framework: str = 'pytorch') |
||||
|
``` |
||||
|
|
||||
|
Args: |
||||
|
|
||||
|
- model_name: |
||||
|
- the model name for embedding |
||||
|
- supported types: str, for example 'allenai/longformer-base-4096' or 'allenai/longformer-large-4096' |
||||
|
- framework: |
||||
|
- the framework of the model |
||||
|
- supported types: str, default is 'pytorch' |
||||
|
|
||||
|
```python |
||||
|
__call__(self, txt: str) |
||||
|
``` |
||||
|
|
||||
|
Args: |
||||
|
|
||||
|
- txt: |
||||
|
- the input text to compute an embedding for |
||||
|
- supported types: str |
||||
|
Returns: |
||||
|
|
||||
|
The Operator returns a tuple Tuple[('feature_vector', numpy.ndarray)] containing the following fields: |
||||
|
|
||||
|
- feature_vector: |
||||
|
- the embedding of the input text (the model's pooled output) |
||||
|
- data type: numpy.ndarray |
||||
|
- shape: (hidden_size,) |
||||
|
|
||||
|
## Requirements |
||||
|
|
||||
|
|
||||
|
|
||||
|
## How it works |
||||
|
|
||||
|
|
||||
|
|
||||
|
## Reference |
||||
|
@ -0,0 +1,35 @@ |
|||||
|
import numpy |
||||
|
from typing import NamedTuple |
||||
|
import torch |
||||
|
from transformers import LongformerTokenizer, LongformerModel |
||||
|
|
||||
|
from towhee.operator import NNOperator |
||||
|
|
||||
|
import warnings |
||||
|
warnings.filterwarnings('ignore') |
||||
|
|
||||
|
class NlpLongformer(NNOperator):
    """
    NLP embedding operator that uses the pretrained longformer model gathered by huggingface.
    The Longformer model was presented in Longformer: The Long-Document Transformer by Iz Beltagy,
    Matthew E. Peters, Arman Cohan.
    Ref: https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/longformer#transformers.LongformerConfig

    Args:
        model_name (`str`):
            Which model to use for the embeddings.
    """

    # Result type, built once here instead of being re-created on every call.
    _Outputs = NamedTuple('Outputs', [('feature_vector', numpy.ndarray)])

    def __init__(self, model_name: str) -> None:
        """
        Load the pretrained Longformer model and its tokenizer.

        Args:
            model_name (`str`):
                Name or path handed to `from_pretrained` for both the model
                and the tokenizer.
        """
        # Run the NNOperator initialiser so any state the base class sets up
        # exists on this instance (the original skipped it).
        super().__init__()
        self.model = LongformerModel.from_pretrained(model_name)
        self.tokenizer = LongformerTokenizer.from_pretrained(model_name)

    def __call__(self, txt: str) -> NamedTuple('Outputs', [('feature_vector', numpy.ndarray)]):
        """
        Embed a single piece of text.

        Args:
            txt (`str`):
                The text to tokenize and feed to the model.

        Returns:
            An `Outputs` named tuple whose `feature_vector` field holds the
            second element of the model output as a numpy array with the
            batch axis removed.
        """
        # Encode to token ids and add a leading batch axis of size 1.
        input_ids = torch.tensor(self.tokenizer.encode(txt)).unsqueeze(0)
        # Pure inference: no autograd graph is needed. The attention mask is
        # left at the model's default (the original passed None explicitly),
        # and per-layer hidden states are not requested because only the
        # second output element is read below.
        with torch.no_grad():
            outs = self.model(input_ids)
        # Drop only the batch axis; a bare squeeze() would also collapse any
        # other axis that happens to have size 1.
        feature_vector = outs[1].squeeze(0)
        return self._Outputs(feature_vector.detach().numpy())

    def get_model(self):
        """Return the underlying `LongformerModel` instance."""
        return self.model
Loading…
Reference in new issue