# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import pathlib
import json
from pathlib import Path

import numpy as np
import torch
from torch import nn
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from torchvision.transforms.functional import InterpolationMode
from timm.models.vision_transformer import resize_pos_embed

from towhee.types.image_utils import to_pil
from towhee.types.arg import arg, to_image_color
from towhee.operator.base import NNOperator, OperatorFlag


class ClipCaptionReward(NNOperator):
    """
    CLIP-Caption-Reward image captioning operator.
    """
    def __init__(self, model_name: str):
        super().__init__()
        sys.path.append(str(Path(__file__).parent))
        from utils import opts
        from transformer_model import TransformerModel
        from captioning.models.model_utils import decode_sequence
        self.decode_sequence = decode_sequence
        import mclip
        sys.path.pop()

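        # Parse the captioning config for the requested model and load the
        # COCO vocabulary (ix_to_word) used to decode generated token ids.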
        path = pathlib.Path(__file__).parent

        cfg = self._configs()[model_name]
        config = str(path) + cfg['config']
        opt = opts.parse_opt(parse=False, cfg=config)
        with open("{}/data/cocotalk.json".format(path)) as f:
            dict_json = json.load(f)
        ix_to_word = dict_json["ix_to_word"]
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        clip_model, clip_transform = mclip.load("RN50", jit=False, device=self.device)
        self.clip_model = clip_model
        self.clip_transform = clip_transform

        vocab_size = len(ix_to_word)
        seq_length = 1
        opt.vocab_size = vocab_size
        opt.seq_length = seq_length
        opt.batch_size = 1
        opt.vocab = ix_to_word

        num_patches = 196  # 14 * 14 grid for a 448 x 448 input with stride-32 features
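        # Build a new positional embedding of the right length and interpolate
        # CLIP's original attnpool positional embedding into it.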
        pos_embed = nn.Parameter(
            torch.zeros(
                1,
                num_patches + 1,
                clip_model.visual.attnpool.positional_embedding.shape[-1],
                device=self.device,
            ),
        )
        pos_embed.weight = resize_pos_embed(
            clip_model.visual.attnpool.positional_embedding.unsqueeze(0), pos_embed
        )
        self.clip_model.visual.attnpool.positional_embedding = pos_embed

        ckpt_path = str(path) + cfg['weights']
        raw_state_dict = torch.load(ckpt_path, map_location=torch.device('cpu'))

        self.model = TransformerModel(opt)
        self.model.load_state_dict(raw_state_dict)
        self.model.to(self.device)

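        # CLIP's RGB normalization statistics, applied manually in
        # _inference_from_image since the Compose pipeline below omits Normalize.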
        self.image_mean = (
            torch.Tensor([0.48145466, 0.4578275, 0.40821073])
            .to(self.device)
            .reshape(3, 1, 1)
        )
        self.image_std = (
            torch.Tensor([0.26862954, 0.26130258, 0.27577711])
            .to(self.device)
            .reshape(3, 1, 1)
        )
        self._preprocess = Compose(
            [
                Resize((448, 448), interpolation=InterpolationMode.BILINEAR),
                CenterCrop((448, 448)),
                ToTensor(),
            ]
        )
        self.eval_kwargs = {}
        self.eval_kwargs.update(vars(opt))

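    # The @arg decorator converts the first positional argument to an RGB
    # towhee image before it reaches inference_single_data.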
    @arg(1, to_image_color('RGB'))
    def inference_single_data(self, data):
        text = self._inference_from_image(data)
        return text

    def _inference_from_image(self, img):
        img = to_pil(img)
        img = self._preprocess(img)
        img = torch.tensor(np.stack([img])).to(self.device)
        img -= self.image_mean
        img /= self.image_std
        tmp_att, tmp_fc = self.clip_model.encode_image(img)
        tmp_att = tmp_att[0].permute(1, 2, 0)

        att_feat = tmp_att

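        # Sample a caption from the Transformer captioning model using the CLIP
        # grid features, then decode the token ids back into a sentence.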
        with torch.no_grad():
            fc_feats = torch.zeros((1, 0)).to(self.device)
            att_feats = att_feat.view(1, 196, 2048).float().to(self.device)
            att_masks = None

            # Forward the model to also get generated samples for each image.
            # Only keep one sample per image, in case of duplicates.
            tmp_eval_kwargs = self.eval_kwargs.copy()
            tmp_eval_kwargs.update({"sample_n": 1})
            seq, seq_logprobs = self.model(
                fc_feats, att_feats, att_masks, opt=tmp_eval_kwargs, mode="sample"
            )
            seq = seq.data

        sents = self.decode_sequence(self.model.vocab, seq)

        return sents[0]

    def __call__(self, data):
        results = []
        if not isinstance(data, list):
            data = [data]
        for single_data in data:
            result = self.inference_single_data(single_data)
            results.append(result)
        if len(data) == 1:
            return results[0]
        else:
            return results

    def _configs(self):
        config = {}
        config['clipRN50_clips_grammar'] = {}
        config['clipRN50_clips_grammar']['weights'] = '/weights/clipRN50_clips_grammar-last.pth'
        config['clipRN50_clips_grammar']['config'] = '/configs/phase2/clipRN50_clips_grammar.yml'
        return config
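

# A minimal usage sketch (assumes the 'clipRN50_clips_grammar' weights and config
# listed in _configs() are present under this operator's directory, and that `img`
# is a towhee image, e.g. produced by an image-decode operator):
#
#   op = ClipCaptionReward('clipRN50_clips_grammar')
#   caption = op(img)           # single image -> single caption string
#   captions = op([img, img])   # list of images -> list of caption strings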