refactor longformer operator

3 years ago · 9f04ad607c
4 changed files with 148 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -1,2 +1,85 @@
 # longformer
 # Text Embedding with longformer
 *author: Kyle He*
 ## Description
 This operator uses Longformer to convert long text to embeddings.
 The Longformer model was presented in Longformer: The Long-Document Transformer by Iz Beltagy, Matthew E. Peters, Arman Cohan[1].
 **Longformer** models were proposed in "[Longformer: The Long-Document Transformer][2]".
 Transformer-based models are unable to process long sequences due to their self-attention
 operation, which scales quadratically with the sequence length. To address this limitation,
 we introduce the Longformer with an attention mechanism that scales linearly with sequence
 length, making it easy to process documents of thousands of tokens or longer[2].
 ## Reference
 [1].https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/longformer#transformers.LongformerConfig
 [2].https://arxiv.org/pdf/2004.05150.pdf
 ```python
 from towhee import ops
 text_encoder = ops.text_embedding.longformer(model_name="allenai/longformer-base-4096")
 text_embedding = text_encoder("Hello, world.")
 ```
 ## Factory Constructor
 Create the operator via the following factory method
 ***ops.text_embedding.longformer(model_name)***
 ## Interface
 A text embedding operator takes a sentence, paragraph, or document as a string
 and outputs an embedding vector as an ndarray that captures the input's core semantic elements.
 **Parameters:**
 	***text***: *str*
 	The text in string.
 **Returns**: *numpy.ndarray*
 	The text embedding extracted by model.
 ## Code Example
 Use the pretrained model ('allenai/longformer-base-4096')
 to generate a text embedding for the sentence "Hello, world.". 
 *Write the pipeline in simplified style*:
 ```python
 import towhee.DataCollection as dc
 dc.glob("Hello, world.")
  .text_embedding.longformer('longformer-base-4096')
  .show()
 ```
 *Write the same pipeline with explicit input/output name specifications:*
 ```python
 from towhee import DataCollection as dc
 dc.glob['text']('Hello, world.')
  .text_embedding.longformer['text', 'vec']('longformer-base-4096')
  .select('vec')
  .show()
 ```
--- a/init.py
+++ b/init.py
--- a/longformer.py
+++ b/longformer.py
@ -0,0 +1,60 @@
 import numpy
 from typing import NamedTuple
 import torch
 from transformers import LongformerTokenizer, LongformerModel
 import logging
 from towhee.operator import NNOperator
 from towhee import register
 import warnings
 warnings.filterwarnings('ignore')
 log = logging.getLogger()
@register(output_schema=['vec'])
class LongformerEmbedding(NNOperator):
    """
    NLP embedding operator that uses the pretrained Longformer model hosted by Hugging Face.

    The Longformer model was presented in "Longformer: The Long-Document Transformer"
    by Iz Beltagy, Matthew E. Peters, Arman Cohan.
    Ref: https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/longformer#transformers.LongformerConfig

    Args:
        model_name (`str`):
            Name or path of the pretrained model, passed to `from_pretrained`
            for both the model and the tokenizer.
    """

    def __init__(self, model_name: str) -> None:
        """
        Load the Longformer model and its tokenizer.

        Args:
            model_name (`str`):
                Name or path handed to `LongformerModel.from_pretrained` and
                `LongformerTokenizer.from_pretrained`.

        Raises:
            Exception: whatever `from_pretrained` raises; it is logged and re-raised.
        """
        super().__init__()
        self.model_name = model_name
        try:
            self.model = LongformerModel.from_pretrained(model_name)
        except Exception:
            log.error(f'Fail to load model by name: {model_name}')
            raise
        try:
            self.tokenizer = LongformerTokenizer.from_pretrained(model_name)
        except Exception:
            log.error(f'Fail to load tokenizer by name: {model_name}')
            raise

    def __call__(self, txt: str) -> numpy.ndarray:
        """
        Convert a piece of text into an embedding vector.

        Args:
            txt (`str`):
                The text to embed.

        Returns:
            `numpy.ndarray`: the second element of the model output (`outs[1]`),
            squeezed and converted to a numpy array.

        Raises:
            Exception: any failure in tokenization, the forward pass or feature
            extraction is logged and re-raised unchanged.
        """
        try:
            # Add a leading batch dimension: the model expects (batch, seq_len).
            input_ids = torch.tensor(self.tokenizer.encode(txt)).unsqueeze(0)
        except Exception:
            log.error(f'Invalid input for the tokenizer: {self.model_name}')
            raise
        try:
            # Inference only, so skip building the autograd graph.
            # NOTE: `labels` must not be passed here; the base `LongformerModel`
            # forward does not accept it and would raise a TypeError.
            # Hidden states of every layer are not requested because only
            # `outs[1]` is consumed below.
            with torch.no_grad():
                outs = self.model(input_ids)
        except Exception:
            log.error(f'Invalid input for the model: {self.model_name}')
            raise
        try:
            feature_vector = outs[1].squeeze()
        except Exception:
            log.error(f'Fail to extract features by model: {self.model_name}')
            raise
        return feature_vector.detach().numpy()
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,4 @@
 numpy
 transformers
 sentencepiece
 protobuf