clip/clip.py


								# Copyright 2021 Zilliz. All rights reserved.

								#

								# Licensed under the Apache License, Version 2.0 (the "License");

								# you may not use this file except in compliance with the License.

								# You may obtain a copy of the License at

								#

								#     http://www.apache.org/licenses/LICENSE-2.0

								#

								# Unless required by applicable law or agreed to in writing, software

								# distributed under the License is distributed on an "AS IS" BASIS,

								# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

								# See the License for the specific language governing permissions and

								# limitations under the License.

								import sys

								import os

								from pathlib import Path

								import torch

								from torch import nn

								from torchvision import transforms


								import logging

								import warnings

								from towhee.types.image_utils import to_pil

								from towhee.operator.base import NNOperator, OperatorFlag

								from towhee.types.arg import arg, to_image_color

								from towhee import register

								from transformers import CLIPTokenizer, CLIPTextModel ,CLIPModel,CLIPProcessor

								from transformers import logging as t_logging

								# from towhee.dc2 import accelerate


								# Module-level logger, registered under the name 'run_op'.
								log = logging.getLogger('run_op')

								# Install an 'ignore' filter for every Python warning (process-wide, not only this module).
								warnings.filterwarnings('ignore')

								# Presumably quiets TensorFlow's native logging if TensorFlow gets imported — TODO confirm.
								os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

								# Restrict the transformers library's own logging to error level.
								t_logging.set_verbosity_error()


								def create_model(model_name, modality, checkpoint_path, device):
								    """Build an eval-mode CLIP wrapper for a single modality.

								    Args:
								        model_name: Identifier passed to ``CLIPModel.from_pretrained``.
								        modality: ``'image'`` or ``'text'``; selects the wrapper class.
								        checkpoint_path: Optional path to a state dict loaded on top of the
								            pretrained weights. Falsy values skip this step. A checkpoint
								            that fails to load is logged and the pretrained weights are kept.
								        device: Device the checkpoint is mapped to and the model is moved to.

								    Returns:
								        A ``CLIPModelVision`` or ``CLIPModelText`` wrapping the loaded model.

								    Raises:
								        ValueError: If ``modality`` is neither ``'image'`` nor ``'text'``.
								    """
								    # Fail fast: reject an unknown modality before loading any weights.
								    if modality not in ('image', 'text'):
								        raise ValueError("modality[{}] not implemented.".format(modality))

								    hf_clip_model = CLIPModel.from_pretrained(model_name)
								    if checkpoint_path:
								        try:
								            state_dict = torch.load(checkpoint_path, map_location=device)
								            hf_clip_model.load_state_dict(state_dict)
								        except Exception as e:
								            # Best effort by design: keep the pretrained weights and report the failure.
								            log.error(f"Fail to load state dict from {checkpoint_path}: {e}")
								    hf_clip_model.to(device)
								    hf_clip_model.eval()

								    if modality == 'image':
								        return CLIPModelVision(hf_clip_model)
								    return CLIPModelText(hf_clip_model)


								class CLIPModelVision(nn.Module):
								    """Image-side adapter: exposes the backbone's ``get_image_features`` as ``forward``."""

								    def __init__(self, model):
								        super().__init__()
								        # Object whose ``get_image_features`` is invoked in ``forward``.
								        self.backbone = model

								    def forward(self, pixel_values):
								        """Return ``self.backbone.get_image_features(pixel_values)``."""
								        return self.backbone.get_image_features(pixel_values)


								class CLIPModelText(nn.Module):
								    """Text-side adapter: exposes the backbone's ``get_text_features`` as ``forward``."""

								    def __init__(self, model):
								        super().__init__()
								        # Object whose ``get_text_features`` is invoked in ``forward``.
								        self.backbone = model

								    def forward(self, input_ids, attention_mask):
								        """Return ``self.backbone.get_text_features(input_ids, attention_mask)``."""
								        return self.backbone.get_text_features(input_ids, attention_mask)


								# @accelerate

								class Model:
								    """Callable that moves its inputs to one device and runs the created CLIP wrapper."""

								    def __init__(self, model_name, modality, checkpoint_path, device):
								        self.model = create_model(model_name, modality, checkpoint_path, device)
								        self.device = device

								    def __call__(self, *args, **kwargs):
								        """Call ``.to(self.device)`` on every argument, then forward them to ``self.model``."""
								        target = self.device
								        moved_args = [item.to(target) for item in args]
								        moved_kwargs = {key: item.to(target) for key, item in kwargs.items()}
								        return self.model(*moved_args, **moved_kwargs)


								@register(output_schema=['vec'])

								class Clip(NNOperator):

								    """

								    CLIP multi-modal embedding operator

								    """

								    def __init__(self, model_name: str, modality: str, device: str = 'cpu', checkpoint_path: str = None):
								        """Store the settings and load model, tokenizer and processor.

								        Args:
								            model_name: Alias that must be a key of ``self._configs()``;
								                an unknown alias raises ``KeyError``.
								            modality: Forwarded to ``Model``; also read later by the
								                inference and export methods.
								            device: Device string forwarded to ``Model``.
								            checkpoint_path: Optional state-dict path forwarded to ``Model``.
								        """
								        self.model_name = model_name
								        self.modality = modality
								        self.device = device
								        self.checkpoint_path = checkpoint_path

								        # Translate the operator alias into the name passed to ``from_pretrained``.
								        hub_name = self._configs()[model_name]

								        self.model = Model(hub_name, modality, checkpoint_path, device)
								        self.tokenizer = CLIPTokenizer.from_pretrained(hub_name)
								        self.processor = CLIPProcessor.from_pretrained(hub_name)


								    def inference_single_data(self, data):

								        if self.modality == 'image':

								            vec = self._inference_from_image(data)

								        elif self.modality == 'text':

								            vec = self._inference_from_text(data)

								        else:

								            raise ValueError("modality[{}] not implemented.".format(self.modality))

								        return vec.detach().cpu().numpy().flatten()


								    def __call__(self, data):

								        if not isinstance(data, list):

								            data = [data]

								        else:

								            data = data

								        results = []

								        for single_data in data:

								            result = self.inference_single_data(single_data)

								            results.append(result)

								        if len(data) == 1:

								            return results[0]

								        else:

								            return results


								    def _inference_from_text(self, text):

								        tokens = self.tokenizer([text], padding=True, return_tensors="pt")

								        text_features = self.model(tokens['input_ids'], tokens['attention_mask'])

								        return text_features


								    @arg(1, to_image_color('RGB'))
								    def _inference_from_image(self, img):
								        """Convert ``img`` with ``to_pil``, preprocess it and return the model's image features."""
								        pil_image = to_pil(img)
								        batch = self.processor(images=pil_image, return_tensors="pt")
								        return self.model(batch['pixel_values'])


								    def train(self, **kwargs):

								        import sys

								        import pathlib

								        path = str(pathlib.Path(__file__).parent)

								        print(path)

								        sys.path.append(path)

								        from train_clip_with_hf_trainer import train_with_hf_trainer

								        data_args = kwargs.pop('data_args', None)

								        training_args = kwargs.pop('training_args', None)

								        train_with_hf_trainer(self._model.backbone, self.tokenizer, data_args, training_args)


								    def _configs(self):

								        config = {}

								        config['clip_vit_base_patch16'] = 'openai/clip-vit-base-patch16'

								        config['clip_vit_base_patch32'] = 'openai/clip-vit-base-patch32'

								        config['clip_vit_large_patch14'] = 'openai/clip-vit-large-patch14'

								        config['clip_vit_large_patch14_336'] ='openai/clip-vit-large-patch14-336'

								        return config


								    @property

								    def supported_formats(self):

								        onnxes = self.supported_model_names(format='onnx')

								        if self.model_name in onnxes:

								            return ['onnx']

								        else:

								            return ['pytorch']


								    @staticmethod

								    def supported_model_names(format: str = None):

								        full_list = [

								            'clip_vit_base_patch16',

								            'clip_vit_base_patch32',

								            'clip_vit_large_patch14',

								            'clip_vit_large_patch14_336'

								        ]

								        if format == None:

								            model_list = full_list

								        elif format == 'pytorch' or format == 'torchscript' or format == 'onnx':

								            model_list = full_list

								        else:

								            log.error(f'Invalid format "{format}". Currently supported formats: "pytorch", "torchscript".')

								        return model_list


								    @property

								    def _model(self):

								        return self.model.model


								    def save_model(self, model_type: str = 'pytorch', output_file: str = 'default'):

								        import os

								        from PIL import Image

								        from torch.onnx import export as onnx_export


								        if output_file == 'default':

								            output_file = str(Path(__file__).parent)

								            output_file = os.path.join(output_file, 'saved', model_type)

								            os.makedirs(output_file, exist_ok=True)

								            name = self.model_name.replace('/', '-')

								            output_file = os.path.join(output_file, name)

								            if model_type in ['pytorch', 'torchscript']:

								                output_file = output_file + '.pt'

								            elif model_type == 'onnx':

								                output_file = output_file + '.onnx'

								            else:

								                raise AttributeError('Unsupported model_type.')

								        if self.modality == 'image':

								            sz = self.processor.feature_extractor.crop_size

								            if isinstance(sz, int):

								                h = sz

								                w = sz

								            elif isinstance(sz, dict):

								                h = sz['height']

								                w = sz['width']

								            dummy_input = Image.new('RGB', (w, h), color = 'red')

								            inputs = self.processor(images=dummy_input, return_tensors='pt').to(self.device)   # a dictionary

								        elif self.modality == 'text':

								            dummy_input = 'dummy'

								            inputs = self.tokenizer(dummy_input, padding=True, truncation=True, return_tensors='pt').to(self.device)  # a dictionary

								        else:

								            raise ValueError("modality[{}] not implemented.".format(self.modality))


								        if model_type == 'pytorch':

								            torch.save(self._model, output_file)

								        elif model_type == 'torchscript':

								            inputs = list(inputs.values())

								            try:

								                try:

								                    jit_model = torch.jit.script(self._model)

								                except Exception:

								                    jit_model = torch.jit.trace(self._model, inputs, strict=False)

								                torch.jit.save(jit_model, output_file)

								            except Exception as e:

								                log.error(f'Fail to save as torchscript: {e}.')

								                raise RuntimeError(f'Fail to save as torchscript: {e}.')

								        elif model_type == 'onnx':

								            if self.modality == 'image':

								                input_names= ['pixel_values']

								                output_names=['image_embeds']

								                dynamic_axes={'pixel_values': {0: 'batch'}, 'image_embeds': {0: 'batch'}}

								            elif self.modality == 'text':

								                input_names= ['input_ids', 'attention_mask']

								                output_names=['text_embeds']

								                dynamic_axes={'input_ids': {0: 'batch', 1: 'sequence'}, 'attention_mask': {0: 'batch', 1: 'sequence'}, 'text_embeds': {0: 'batch'}}

								            else:

								                raise ValueError("modality[{}] not implemented.".format(self.modality))


								            onnx_export(self._model,

								                (dict(inputs),),

								                f=Path(output_file),

								                input_names= input_names,

								                output_names=output_names,

								                dynamic_axes=dynamic_axes,

								                do_constant_folding=True,

								                opset_version=14,

								            )

								        else:

								            raise NotImplementedError