import numpy
import torch
from transformers import LongformerTokenizer, LongformerModel

import logging
from towhee.operator import NNOperator
from towhee import register

import warnings

warnings.filterwarnings('ignore')
log = logging.getLogger()


@register(output_schema=['vec'])
class Longformer(NNOperator):
    """
    NLP embedding operator that uses the pretrained Longformer model gathered by huggingface.

    The Longformer model was presented in "Longformer: The Long-Document Transformer"
    by Iz Beltagy, Matthew E. Peters, Arman Cohan.
    Ref: https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/longformer#transformers.LongformerConfig

    Args:
        model_name (`str`):
            Which model to use for the embeddings. It is passed unchanged to
            `LongformerModel.from_pretrained` and `LongformerTokenizer.from_pretrained`.
    """

    def __init__(self, model_name: str) -> None:
        """Load the pretrained model and its tokenizer.

        Raises:
            Exception: whatever `from_pretrained` raises; the failure is logged
                and the original exception is re-raised.
        """
        super().__init__()
        self.model_name = model_name
        try:
            self.model = LongformerModel.from_pretrained(model_name)
        except Exception:
            log.error('Fail to load model by name: %s', model_name)
            raise
        try:
            self.tokenizer = LongformerTokenizer.from_pretrained(model_name)
        except Exception:
            log.error('Fail to load tokenizer by name: %s', model_name)
            raise

    def __call__(self, txt: str) -> numpy.ndarray:
        """Embed a single text.

        Args:
            txt (`str`): The text to embed.

        Returns:
            numpy.ndarray: Element 1 of the model output with size-1 dimensions
            squeezed out. Per the Hugging Face documentation this is the pooled
            output of `LongformerModel` when the pooling layer is enabled.

        Raises:
            Exception: any error raised while tokenizing, running the model, or
                extracting the feature; it is logged and then re-raised.
        """
        try:
            # Add a leading batch dimension of size 1.
            input_ids = torch.tensor(self.tokenizer.encode(txt)).unsqueeze(0)
        except Exception:
            log.error('Invalid input for the tokenizer: %s', self.model_name)
            raise
        try:
            # The base `LongformerModel.forward` has no `labels` parameter (only
            # the task-specific heads do), so it must not be passed here.
            # `attention_mask=None` lets the model use its default mask.
            # Gradients are not needed for embedding extraction.
            with torch.no_grad():
                outs = self.model(input_ids, attention_mask=None)
        except Exception:
            log.error('Invalid input for the model: %s', self.model_name)
            raise
        try:
            feature_vector = outs[1].squeeze()
        except Exception:
            log.error('Fail to extract features by model: %s', self.model_name)
            raise
        return feature_vector.detach().numpy()