@@ -15,6 +15,16 @@ import torch
 from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
 from transformers import CLIPProcessor, CLIPModel
 
+import sys
+from pathlib import Path
+
+import torch
+from torchvision import transforms
+
+from towhee.types.image_utils import to_pil
+from towhee.operator.base import NNOperator, OperatorFlag
+from towhee.types.arg import arg, to_image_color
+from towhee import register
+
 
 @register(output_schema=['vec'])
 class Taiyi(NNOperator):
     """
@@ -23,10 +33,13 @@ class Taiyi(NNOperator):
     def __init__(self, model_name: str, modality: str):
         self.modality = modality
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.text_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese")
-        self.text_encoder = BertForSequenceClassification.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese").eval()
-        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
-        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+        config = self._configs()[model_name]
+
+        self.text_tokenizer = BertTokenizer.from_pretrained(config['tokenizer'])
+        self.text_encoder = BertForSequenceClassification.from_pretrained(config['text_encoder']).eval()
+
+        self.clip_model = CLIPModel.from_pretrained(config['clip_model'])
+        self.processor = CLIPProcessor.from_pretrained(config['processor'])
 
     def inference_single_data(self, data):
         if self.modality == 'image':
@@ -52,14 +65,29 @@ class Taiyi(NNOperator):
         return results
 
     def _inference_from_text(self, text):
-        self.text = self.text_tokenizer(text, return_tensors='pt', padding=True)['input_ids'].to(self.device)
-        text_features = text_encoder(text).logits
+        tokens = self.text_tokenizer(text, return_tensors='pt', padding=True)['input_ids'].to(self.device)
+        text_features = self.text_encoder(tokens).logits
         return text_features
 
     @arg(1, to_image_color('RGB'))
     def _inference_from_image(self, img):
-        image = to_pil(image)
-        image = self.processor(images=image.raw), return_tensors="pt")
-        image_features = clip_model.get_image_features(**image)
+        image = to_pil(img)
+        image = self.processor(images=image, return_tensors="pt")
+        image_features = self.clip_model.get_image_features(**image)
         return image_features
 
+    def _configs(self):
+        config = {}
+        config['taiyi-clip-roberta-102m-chinese'] = {}
+        config['taiyi-clip-roberta-102m-chinese']['tokenizer'] = 'IDEA-CCNL/Taiyi-CLIP-Roberta-102M-Chinese'
+        config['taiyi-clip-roberta-102m-chinese']['text_encoder'] = 'IDEA-CCNL/Taiyi-CLIP-Roberta-102M-Chinese'
+        config['taiyi-clip-roberta-102m-chinese']['clip_model'] = 'openai/clip-vit-base-patch32'
+        config['taiyi-clip-roberta-102m-chinese']['processor'] = 'openai/clip-vit-base-patch32'
+
+        config['taiyi-clip-roberta-large-326m-chinese'] = {}
+        config['taiyi-clip-roberta-large-326m-chinese']['tokenizer'] = 'IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese'
+        config['taiyi-clip-roberta-large-326m-chinese']['text_encoder'] = 'IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese'
+        config['taiyi-clip-roberta-large-326m-chinese']['clip_model'] = 'openai/clip-vit-large-patch14'
+        config['taiyi-clip-roberta-large-326m-chinese']['processor'] = 'openai/clip-vit-large-patch14'
+        return config
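For reference, a minimal usage sketch of the operator after this patch (not part of the diff). It assumes `taiyi.py` is importable directly, that the IDEA-CCNL and OpenAI checkpoints named in `_configs()` can be downloaded from the Hugging Face hub, and that it runs on a CPU-only machine (the visible hunks move the tokens, but not the encoders, to `self.device`).

```python
import torch

from taiyi import Taiyi  # hypothetical direct import of the operator module

# Text branch: the 102M config pairs the Chinese RoBERTa text encoder with
# openai/clip-vit-base-patch32 on the image side.
text_op = Taiyi(model_name='taiyi-clip-roberta-102m-chinese', modality='text')

with torch.no_grad():
    # _inference_from_text tokenizes the input and returns the
    # BertForSequenceClassification logits as the text embedding;
    # inference_single_data is the public entry point that dispatches on modality.
    vec = text_op._inference_from_text(['一只可爱的小猫'])  # "a cute kitten"
print(vec.shape)  # embedding dimension matches the paired CLIP image features

# Image branch: modality='image' routes through _inference_from_image(), which
# converts a towhee image with to_pil() and calls CLIPModel.get_image_features().
image_op = Taiyi(model_name='taiyi-clip-roberta-large-326m-chinese', modality='image')
```

The new `_configs()` table is what lets one operator wrap either pairing by name: `taiyi-clip-roberta-102m-chinese` with `clip-vit-base-patch32`, or `taiyi-clip-roberta-large-326m-chinese` with `clip-vit-large-patch14`.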