import warnings
from typing import NamedTuple

import numpy
import torch
from transformers import LongformerTokenizer, LongformerModel
from towhee.operator import NNOperator

warnings.filterwarnings('ignore')


class NlpLongformer(NNOperator):
    """
    NLP embedding operator that uses the pretrained Longformer model
    distributed through Hugging Face.

    The Longformer model was presented in "Longformer: The Long-Document
    Transformer" by Iz Beltagy, Matthew E. Peters, Arman Cohan.
    Ref: https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/longformer#transformers.LongformerConfig

    Args:
        model_name (`str`):
            Name or path of the pretrained model; passed unchanged to
            ``LongformerModel.from_pretrained`` and
            ``LongformerTokenizer.from_pretrained``.
    """

    def __init__(self, model_name: str) -> None:
        # Initialise the base operator before attaching model state.
        super().__init__()
        self.model = LongformerModel.from_pretrained(model_name)
        self.tokenizer = LongformerTokenizer.from_pretrained(model_name)

    def __call__(self, txt: str) -> NamedTuple('Outputs', [('feature_vector', numpy.ndarray)]):
        """
        Embed a single piece of text.

        Args:
            txt (`str`):
                The text to embed.

        Returns:
            A one-field named tuple ``Outputs(feature_vector)`` holding the
            second element of the model output (the pooled output according
            to the Hugging Face Longformer docs) with the batch axis
            squeezed out, converted to a ``numpy.ndarray``.
        """
        # Truncate so that over-long text is clipped to the tokenizer's
        # maximum length instead of failing inside the model.
        token_ids = self.tokenizer.encode(txt, truncation=True)
        # Add a leading batch axis: the model is called on a batch of one.
        input_ids = torch.tensor(token_ids).unsqueeze(0)

        # Pure inference: skip autograd bookkeeping. Per-layer hidden states
        # are not requested because only element 1 of the output is used.
        with torch.no_grad():
            outs = self.model(input_ids)

        feature_vector = outs[1].squeeze(0)
        Outputs = NamedTuple('Outputs', [('feature_vector', numpy.ndarray)])
        return Outputs(feature_vector.detach().numpy())

    def get_model(self):
        """Return the underlying ``LongformerModel`` instance."""
        return self.model