diff --git a/README.md b/README.md index eaa361b..85c9cea 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,85 @@ -# longformer +# Text Embedding with longformer +*author: Kyle He* + + + +## Description + +This operator uses Longformer to convert long text to embeddings. + +The Longformer model was presented in Longformer: The Long-Document Transformer by Iz Beltagy, Matthew E. Peters, Arman Cohan[1]. + +**Longformer** models were proposed in “[Longformer: The Long-Document Transformer][2]”. + +Transformer-based models are unable to process long sequences due to their self-attention +operation, which scales quadratically with the sequence length. To address this limitation, +we introduce the Longformer with an attention mechanism that scales linearly with sequence +length, making it easy to process documents of thousands of tokens or longer[2]. + +## Reference + +[1]. https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/longformer#transformers.LongformerConfig + +[2]. https://arxiv.org/pdf/2004.05150.pdf + +```python +from towhee import ops + +text_encoder = ops.text_embedding.longformer(model_name="allenai/longformer-base-4096") +text_embedding = text_encoder("Hello, world.") +``` + +## Factory Constructor + +Create the operator via the following factory method: + +***ops.text_embedding.longformer(model_name)*** + + + +## Interface + +A text embedding operator takes a sentence, paragraph, or document as a string input +and outputs an embedding vector as an ndarray which captures the input's core semantic elements. + + +**Parameters:** + + ***text***: *str* + + The input text as a string. + + + +**Returns**: *numpy.ndarray* + + The text embedding extracted by the model. + + + +## Code Example + +Use the pretrained model ('allenai/longformer-base-4096') +to generate a text embedding for the sentence "Hello, world.". 
import numpy
from typing import NamedTuple
import torch
from transformers import LongformerTokenizer, LongformerModel
import logging

from towhee.operator import NNOperator
from towhee import register


import warnings
warnings.filterwarnings('ignore')
log = logging.getLogger()


@register(output_schema=['vec'])
class LongformerEmbedding(NNOperator):
    """
    NLP embedding operator that uses a pretrained Longformer model hosted by huggingface.

    The Longformer model was presented in "Longformer: The Long-Document Transformer" by
    Iz Beltagy, Matthew E. Peters, Arman Cohan.
    Ref: https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/longformer#transformers.LongformerConfig

    Args:
        model_name (`str`):
            Name (or path) passed to `from_pretrained` for both the model and the
            tokenizer, e.g. 'allenai/longformer-base-4096'.

    Raises:
        Exception: whatever `from_pretrained` raises is logged and re-raised unchanged.
    """

    def __init__(self, model_name: str) -> None:
        super().__init__()
        self.model_name = model_name
        try:
            self.model = LongformerModel.from_pretrained(model_name)
        except Exception:
            log.error('Fail to load model by name: %s', model_name)
            raise
        # This operator only runs inference; make evaluation mode explicit
        # (disables dropout) rather than relying on the loader's default.
        self.model.eval()
        try:
            self.tokenizer = LongformerTokenizer.from_pretrained(model_name)
        except Exception:
            log.error('Fail to load tokenizer by name: %s', model_name)
            raise

    def __call__(self, txt: str) -> numpy.ndarray:
        """
        Embed a single piece of text.

        Args:
            txt (`str`):
                The text to embed.

        Returns:
            `numpy.ndarray`: the second element of the model output with the batch
            axis removed. For `LongformerModel` with its default pooling layer this
            is the pooled output (see the transformers reference above).

        Raises:
            Exception: tokenizer, model and extraction failures are logged and
            re-raised unchanged.
        """
        try:
            # Encode to token ids and add a leading batch axis of size 1.
            input_ids = torch.tensor(self.tokenizer.encode(txt)).unsqueeze(0)
        except Exception:
            log.error('Invalid input for the tokenizer: %s', self.model_name)
            raise
        try:
            # The base LongformerModel.forward() has no `labels` parameter, so
            # passing one raises TypeError on every call; it is not passed here.
            # Hidden states are not requested either: only element [1] is used.
            # no_grad() avoids building an autograd graph for pure inference.
            with torch.no_grad():
                outs = self.model(input_ids)
        except Exception:
            log.error('Invalid input for the model: %s', self.model_name)
            raise
        try:
            # Drop only the batch axis; a bare squeeze() would also collapse any
            # other axis that happens to have size 1.
            feature_vector = outs[1].squeeze(0)
        except Exception:
            log.error('Fail to extract features by model: %s', self.model_name)
            raise
        # .cpu() is a no-op for CPU tensors and keeps .numpy() valid otherwise.
        return feature_vector.detach().cpu().numpy()