towhee
/
            
              nlp-longformer
              
                 
                
            
          copied
				 3 changed files with 84 additions and 2 deletions
			
			
		| @ -1,3 +1,50 @@ | |||||
| # nlp-longformer |  | ||||
|  | # Operator: nlp-longformer | ||||
| 
 | 
 | ||||
| 2 |  | ||||
|  | Author: | ||||
|  | 
 | ||||
|  | ## Overview | ||||
|  | 
 | ||||
|  | 
 | ||||
|  | 
 | ||||
|  | ## Interface | ||||
|  | 
 | ||||
|  | ```python | ||||
|  | __init__(self, model_name: str, framework: str = 'pytorch') | ||||
|  | ``` | ||||
|  | 
 | ||||
|  | Args: | ||||
|  | 
 | ||||
|  | - model_name: | ||||
|  |   - the model name for embedding | ||||
|  |   - supported types: str, for example 'allenai/longformer-base-4096' or 'allenai/longformer-large-4096' | ||||
|  | - framework: | ||||
|  |   - the framework of the model | ||||
|  |   - supported types: str, default is 'pytorch' | ||||
|  | 
 | ||||
|  | ```python | ||||
|  | __call__(self, txt: str) | ||||
|  | ``` | ||||
|  | 
 | ||||
|  | Args: | ||||
|  | 
 | ||||
|  | - txt: | ||||
|  |   - the input text to compute the embedding for | ||||
|  |   - supported types: str | ||||
|  | 
 | ||||
|  | Returns: | ||||
|  | 
 | ||||
|  | The Operator returns a tuple Tuple[('feature_vector', numpy.ndarray)] containing the following fields: | ||||
|  | 
 | ||||
|  | - feature_vector: | ||||
|  |   - the embedding of the input text | ||||
|  |   - data type: numpy.ndarray | ||||
|  |   - shape: (hidden_size,) | ||||
|  | 
 | ||||
|  | ## Requirements | ||||
|  | 
 | ||||
|  | 
 | ||||
|  | 
 | ||||
|  | ## How it works | ||||
|  | 
 | ||||
|  | 
 | ||||
|  | 
 | ||||
|  | ## Reference | ||||
|  | |||||
| @ -0,0 +1,35 @@ | |||||
|  | import numpy | ||||
|  | from typing import NamedTuple | ||||
|  | import torch | ||||
|  | from transformers import LongformerTokenizer, LongformerModel | ||||
|  | 
 | ||||
|  | from towhee.operator import NNOperator | ||||
|  | 
 | ||||
|  | import warnings | ||||
|  | warnings.filterwarnings('ignore') | ||||
|  | 
 | ||||
class NlpLongformer(NNOperator):
    """
    NLP embedding operator that uses the pretrained Longformer model hosted by Hugging Face.

    The Longformer model was presented in "Longformer: The Long-Document Transformer"
    by Iz Beltagy, Matthew E. Peters and Arman Cohan.
    Ref: https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/longformer#transformers.LongformerConfig

    Args:
        model_name (`str`):
            Which pretrained model to use for the embeddings. The same name is
            used to load both the model weights and the tokenizer.
    """

    def __init__(self, model_name: str) -> None:
        # Initialise the NNOperator base before setting our own attributes so
        # that anything the base class sets up is not skipped or overwritten.
        super().__init__()
        self.model = LongformerModel.from_pretrained(model_name)
        self.tokenizer = LongformerTokenizer.from_pretrained(model_name)

    def __call__(self, txt: str) -> NamedTuple('Outputs', [('feature_vector', numpy.ndarray)]):
        """
        Compute the embedding of a single piece of text.

        Args:
            txt (`str`):
                The text to embed.

        Returns:
            A named tuple ``Outputs`` with a single field ``feature_vector``
            holding the embedding as a ``numpy.ndarray``.
        """
        # Encode the text and add a leading batch dimension of size 1.
        input_ids = torch.tensor(self.tokenizer.encode(txt)).unsqueeze(0)
        # Inference only: skip autograd bookkeeping. The per-layer hidden
        # states are not requested because only index 1 of the output is used.
        with torch.no_grad():
            outs = self.model(input_ids)
        # Index 1 of the model output is the pooled output according to the
        # Hugging Face Longformer docs; drop the size-1 batch dimension.
        feature_vector = outs[1].squeeze(0)
        Outputs = NamedTuple('Outputs', [('feature_vector', numpy.ndarray)])
        return Outputs(feature_vector.numpy())

    def get_model(self):
        """Return the underlying model loaded in ``__init__``."""
        return self.model
					Loading…
					
					
				
		Reference in new issue
	
	