magic/magic.py


								# Copyright 2021 Zilliz. All rights reserved.

								#

								# Licensed under the Apache License, Version 2.0 (the "License");

								# you may not use this file except in compliance with the License.

								# You may obtain a copy of the License at

								#

								#     http://www.apache.org/licenses/LICENSE-2.0

								#

								# Unless required by applicable law or agreed to in writing, software

								# distributed under the License is distributed on an "AS IS" BASIS,

								# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

								# See the License for the specific language governing permissions and

								# limitations under the License.


								from re import I

								import sys

								import os

								import pathlib

								import pickle

								from argparse import Namespace


								import torch

								import torchvision

								from torchvision import transforms

								from transformers import GPT2Tokenizer


								from towhee.types.arg import arg, to_image_color

								from towhee.types.image_utils import to_pil

								from towhee.operator.base import NNOperator, OperatorFlag

								from towhee import register


								class Magic(NNOperator):
								    """
								    Magic image captioning operator.

								    Builds a ``SimCTG`` language model and a ``CLIP`` model (both imported from
								    the ``language_model`` and ``clip`` directories that sit next to this file)
								    and runs ``SimCTG.magic_search`` on every input image.

								    Args:
								        model_name (str): Key into the table returned by ``_configs()``.
								            ``'magic_mscoco'`` is the only key defined there.

								    Raises:
								        KeyError: If ``model_name`` is not a key of ``_configs()``.
								    """

								    def __init__(self, model_name: str):
								        super().__init__()

								        # ``clip`` and ``simctg`` are resolved through sys.path entries that
								        # are only needed while the two imports below run.
								        path = str(pathlib.Path(__file__).parent)
								        extra_paths = [path + '/clip', path + '/language_model']
								        sys.path.extend(extra_paths)
								        try:
								            from clip import CLIP
								            from simctg import SimCTG
								        finally:
								            # Remove exactly the entries added above, even when an import
								            # fails or an imported module modifies sys.path itself.
								            for extra_path in extra_paths:
								                if extra_path in sys.path:
								                    sys.path.remove(extra_path)

								        self.device = "cuda" if torch.cuda.is_available() else "cpu"

								        cfg = self._configs()[model_name]
								        sos_token, pad_token = r'<-start_of_text->', r'<-pad->'

								        # Load the language model. The configured value is a model id; a
								        # local path such as r'/path/to/downloaded/cambridgeltl/magic_mscoco'
								        # is presumably accepted as well (confirm against SimCTG).
								        language_model_name = cfg['language_model']
								        self.generation_model = SimCTG(language_model_name, sos_token, pad_token).to(self.device)
								        self.generation_model.eval()

								        # Load the CLIP model, e.g. from a local path like
								        # r"/path/to/downloaded/openai/clip-vit-base-patch32" (confirm against CLIP).
								        clip_model_name = cfg['clip_model']
								        self.clip = CLIP(clip_model_name).to(self.device)
								        self.clip.eval()

								        # Token ids of the start-of-text marker, shaped (1, n) by the view()
								        # below; they are the first argument of every magic_search call.
								        start_token = self.generation_model.tokenizer.tokenize(sos_token)
								        start_token_id = self.generation_model.tokenizer.convert_tokens_to_ids(start_token)
								        self.input_ids = torch.LongTensor(start_token_id).view(1, -1).to(self.device)

								    def _preprocess(self, img):
								        """
								        Convert ``img`` with ``to_pil``, apply ``self.transf_1`` and then
								        ``self.transf_2``, and move the result to ``self.device``.

								        NOTE(review): ``transf_1`` and ``transf_2`` are not assigned anywhere
								        in this class, and no method of this class calls ``_preprocess``.
								        Unless the base class provides those attributes, calling this method
								        raises AttributeError -- confirm whether it is still needed.
								        """
								        img = to_pil(img)
								        processed_img = self.transf_1(img)
								        processed_img = self.transf_2(processed_img)
								        processed_img = processed_img.to(self.device)
								        return processed_img

								    @arg(1, to_image_color('RGB'))
								    def inference_single_data(self, data):
								        """
								        Run captioning on a single input.

								        Args:
								            data: One image; it is passed through the
								                ``arg(1, to_image_color('RGB'))`` decorator before use.

								        Returns:
								            Whatever ``_inference_from_image`` returns for ``data``.
								        """
								        return self._inference_from_image(data)

								    def __call__(self, data):
								        """
								        Run captioning on one input or on a list of inputs.

								        Args:
								            data: A single input, or a ``list`` of inputs.

								        Returns:
								            The single result when exactly one input was processed, otherwise
								            the list of results in input order (``[]`` for an empty list).
								        """
								        batch = data if isinstance(data, list) else [data]
								        results = [self.inference_single_data(single_data) for single_data in batch]
								        # NOTE: a one-element list is unwrapped too, exactly like a bare
								        # input; this return shape is kept because callers may rely on it.
								        if len(batch) == 1:
								            return results[0]
								        return results

								    @arg(1, to_image_color('RGB'))
								    def _inference_from_image(self, img):
								        """
								        Call ``magic_search`` on the language model for one image.

								        Args:
								            img: The image forwarded to ``magic_search`` together with
								                ``self.clip``.

								        Returns:
								            The value returned by ``magic_search`` (presumably the generated
								            caption text -- confirm against SimCTG).
								        """
								        # Fixed decoding arguments, forwarded positionally. The trailing 60
								        # is presumably a maximum text length for CLIP -- TODO confirm
								        # against the magic_search signature.
								        k, alpha, beta, decoding_len = 45, 0.1, 2.0, 16
								        with torch.no_grad():
								            output = self.generation_model.magic_search(
								                self.input_ids, k, alpha, decoding_len, beta, img, self.clip, 60)
								        return output

								    def _configs(self):
								        """
								        Return the table of supported models.

								        Returns:
								            dict: Maps a model name to a dict with the keys
								            ``'language_model'`` and ``'clip_model'``.
								        """
								        return {
								            'magic_mscoco': {
								                'language_model': 'cambridgeltl/magic_mscoco',
								                'clip_model': 'openai/clip-vit-base-patch32',
								            },
								        }