realm
              
                
                
            
          copied
				 4 changed files with 166 additions and 1 deletions
			
			
		@ -1,2 +1,74 @@ | 
				
			|||
# realm | 
				
			|||
# Text Embedding with Transformers | 
				
			|||
 | 
				
			|||
*author: Jael Gu and David Wang* | 
				
			|||
 | 
				
			|||
 | 
				
			|||
 | 
				
			|||
## Description | 
				
			|||
 | 
				
			|||
A REALM text embedding operator implemented with pretrained models from [Huggingface Transformers](https://huggingface.co/docs/transformers). | 
				
			|||
 | 
				
			|||
 | 
				
			|||
 | 
				
			|||
```python | 
				
			|||
from towhee import ops | 
				
			|||
 | 
				
			|||
text_encoder = ops.text_embedding.realm('google/realm-cc-news-pretrained-encoder') | 
				
			|||
text_embedding = text_encoder("Hello, world.") | 
				
			|||
``` | 
				
			|||
 | 
				
			|||
## Factory Constructor | 
				
			|||
 | 
				
			|||
Create the operator via the following factory method | 
				
			|||
 | 
				
			|||
***ops.text_embedding.realm(model_name)*** | 
				
			|||
 | 
				
			|||
 | 
				
			|||
 | 
				
			|||
## Interface | 
				
			|||
 | 
				
			|||
A text embedding operator takes a sentence, paragraph, or document in string as an input | 
				
			|||
and outputs an embedding vector in ndarray which captures the input's core semantic elements. | 
				
			|||
 | 
				
			|||
 | 
				
			|||
**Parameters:** | 
				
			|||
 | 
				
			|||
	***text***: *str* | 
				
			|||
 | 
				
			|||
	The text in string. | 
				
			|||
 | 
				
			|||
 | 
				
			|||
 | 
				
			|||
**Returns**: *numpy.ndarray* | 
				
			|||
 | 
				
			|||
	The text embedding extracted by model. | 
				
			|||
 | 
				
			|||
 | 
				
			|||
 | 
				
			|||
## Code Example | 
				
			|||
 | 
				
			|||
Use the pretrained model ('google/realm-cc-news-pretrained-encoder') | 
				
			|||
to generate a text embedding for the sentence "Hello, world.".  | 
				
			|||
 | 
				
			|||
 *Write the pipeline in simplified style*: | 
				
			|||
 | 
				
			|||
```python | 
				
			|||
from towhee import DataCollection as dc | 
				
			|||
 | 
				
			|||
dc.glob("Hello, world.") | 
				
			|||
  .text_embedding.realm('google/realm-cc-news-pretrained-encoder') | 
				
			|||
  .show() | 
				
			|||
``` | 
				
			|||
 | 
				
			|||
*Write the same pipeline with explicit input/output name specifications:* | 
				
			|||
 | 
				
			|||
```python | 
				
			|||
from towhee import DataCollection as dc | 
				
			|||
 | 
				
			|||
dc.glob['text']('Hello, world.') | 
				
			|||
  .text_embedding.realm['text', 'vec']('google/realm-cc-news-pretrained-encoder') | 
				
			|||
  .select('vec') | 
				
			|||
  .show() | 
				
			|||
``` | 
				
			|||
 | 
				
			|||
 | 
				
			|||
 | 
				
			|||
@ -0,0 +1,19 @@ | 
				
			|||
# Copyright 2021 Zilliz. All rights reserved. | 
				
			|||
# | 
				
			|||
# Licensed under the Apache License, Version 2.0 (the "License"); | 
				
			|||
# you may not use this file except in compliance with the License. | 
				
			|||
# You may obtain a copy of the License at | 
				
			|||
# | 
				
			|||
#     http://www.apache.org/licenses/LICENSE-2.0 | 
				
			|||
# | 
				
			|||
# Unless required by applicable law or agreed to in writing, software | 
				
			|||
# distributed under the License is distributed on an "AS IS" BASIS, | 
				
			|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
				
			|||
# See the License for the specific language governing permissions and | 
				
			|||
# limitations under the License. | 
				
			|||
 | 
				
			|||
from .realm import Realm | 
				
			|||
 | 
				
			|||
 | 
				
			|||
def realm(model_name: str):
    """Factory entry point: build a ``Realm`` operator for ``model_name``.

    Args:
        model_name: Name of the pretrained model, forwarded unchanged to
            the ``Realm`` constructor.

    Returns:
        The newly constructed ``Realm`` operator instance.
    """
    operator = Realm(model_name)
    return operator
				
			|||
@ -0,0 +1,70 @@ | 
				
			|||
# Copyright 2021 Zilliz. All rights reserved. | 
				
			|||
# | 
				
			|||
# Licensed under the Apache License, Version 2.0 (the "License"); | 
				
			|||
# you may not use this file except in compliance with the License. | 
				
			|||
# You may obtain a copy of the License at | 
				
			|||
# | 
				
			|||
#     http://www.apache.org/licenses/LICENSE-2.0 | 
				
			|||
# | 
				
			|||
# Unless required by applicable law or agreed to in writing, software | 
				
			|||
# distributed under the License is distributed on an "AS IS" BASIS, | 
				
			|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
				
			|||
# See the License for the specific language governing permissions and | 
				
			|||
# limitations under the License. | 
				
			|||
 | 
				
			|||
import logging | 
				
			|||
import numpy | 
				
			|||
 | 
				
			|||
from transformers import RealmTokenizer, RealmEmbedder | 
				
			|||
 | 
				
			|||
from towhee.operator import NNOperator | 
				
			|||
from towhee import register | 
				
			|||
 | 
				
			|||
import warnings | 
				
			|||
 | 
				
			|||
warnings.filterwarnings('ignore') | 
				
			|||
log = logging.getLogger() | 
				
			|||
 | 
				
			|||
 | 
				
			|||
@register(output_schema=['vec'])
class Realm(NNOperator):
    """
    NLP embedding operator that uses a pretrained REALM model from Hugging Face Transformers.

    Args:
        model_name (`str`):
            Name of the pretrained checkpoint; it is passed to `from_pretrained`
            for both the `RealmEmbedder` model and the `RealmTokenizer`.
    """

    def __init__(self, model_name: str) -> None:
        super().__init__()
        self.model_name = model_name
        # Model and tokenizer are loaded separately so the log message says
        # which of the two failed; the original exception is re-raised as-is.
        try:
            self.model = RealmEmbedder.from_pretrained(model_name)
        except Exception:
            log.error(f'Fail to load model by name: {self.model_name}')
            raise
        try:
            self.tokenizer = RealmTokenizer.from_pretrained(model_name)
        except Exception:
            log.error(f'Fail to load tokenizer by name: {self.model_name}')
            raise

    def __call__(self, txt: str) -> numpy.ndarray:
        """
        Compute the REALM embedding of a text.

        Args:
            txt (`str`):
                The text to embed.

        Returns:
            `numpy.ndarray`: the model's `projected_score` output with its
            leading axis squeezed, converted to a numpy array.

        Raises:
            Exception: any error raised by the tokenizer, the model, or the
            feature extraction is logged and re-raised unchanged.
        """
        # Local import: the tokenizer is asked for PyTorch tensors
        # (return_tensors="pt"), so torch is already a runtime requirement.
        import torch

        try:
            inputs = self.tokenizer(txt, return_tensors="pt")
        except Exception:
            log.error(f'Invalid input for the tokenizer: {self.model_name}')
            raise
        try:
            # Inference only: skip autograd bookkeeping so no gradient graph
            # is built for a result that is immediately detached below.
            with torch.no_grad():
                outs = self.model(**inputs)
        except Exception:
            log.error(f'Invalid input for the model: {self.model_name}')
            raise
        try:
            features = outs.projected_score.squeeze(0)
        except Exception:
            log.error(f'Fail to extract features by model: {self.model_name}')
            raise
        return features.detach().numpy()
				
			|||
 | 
				
			|||
@ -0,0 +1,4 @@ | 
				
			|||
numpy | 
				
			|||
transformers | 
				
			|||
sentencepiece | 
				
			|||
protobuf | 
				
			|||
					Loading…
					
					
				
		Reference in new issue