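"""CLIP-guided, zero-shot text generation.

A frozen GPT-2 (or GPT-Neo) language model proposes continuations while its
cached key/value context is shifted by gradients of a CLIP image-text loss,
steering generation toward a given image.
"""
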
import numpy as np
from torch import nn
from transformers.models.gpt2 import GPT2LMHeadModel, GPT2Tokenizer
from transformers.models.gpt_neo import GPTNeoForCausalLM
import torch
import clip
from PIL import Image
from datetime import datetime
import sys


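# Thin nn.Module wrappers around CLIP's encoders: torch.nn.DataParallel
# dispatches through forward(), so each encoder needs its own wrapper module.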
class TextCLIP(nn.Module):
    def __init__(self, model):
        super(TextCLIP, self).__init__()
        self.model = model

    def forward(self, text):
        return self.model.encode_text(text)


class ImageCLIP(nn.Module):
    def __init__(self, model):
        super(ImageCLIP, self).__init__()
        self.model = model

    def forward(self, image):
        return self.model.encode_image(image)


def log_info(text, verbose=True):
    if verbose:
        dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
        print(f'{dt_string} | {text}')
        sys.stdout.flush()


def add_context(x, y):
    # Element-wise sum of two (key, value) pairs from a transformer cache.
    return (x[0] + y[0], x[1] + y[1])


def convert_models_to_fp32(model):
    for p in model.parameters():
        p.data = p.data.float()


class CLIPTextGenerator:
    def __init__(self,
                 seed=0,
                 lm_model='gpt-2',
                 forbidden_tokens_file_path='./forbidden_tokens.npy',
                 clip_checkpoints='./clip_checkpoints',
                 target_seq_length=15,
                 reset_context_delta=True,
                 num_iterations=5,
                 clip_loss_temperature=0.01,
                 clip_scale=1.,
                 ce_scale=0.2,
                 stepsize=0.3,
                 grad_norm_factor=0.9,
                 fusion_factor=0.99,
                 repetition_penalty=1.,
                 end_token='.',
                 end_factor=1.01,
                 forbidden_factor=20,
                 **kwargs):

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Set random seeds for reproducibility
        torch.manual_seed(seed)
        np.random.seed(seed)

        # Initialize the language model
        self.context_prefix = ''

        if lm_model == 'gpt-neo':
            self.lm_tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M')
            self.lm_model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M', output_hidden_states=True)
        elif lm_model == 'gpt-2':
            self.lm_tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
            self.lm_model = GPT2LMHeadModel.from_pretrained('gpt2-medium', output_hidden_states=True)
            self.context_prefix = self.lm_tokenizer.bos_token

        self.lm_model.to(self.device)
        self.lm_model.eval()

        self.forbidden_tokens = np.load(forbidden_tokens_file_path)
        # Tokens that start a new word ('Ġ' prefix) with a capital letter
        self.capital_letter_tokens = [self.lm_tokenizer.encoder[x] for x in self.lm_tokenizer.encoder.keys() if
                                      (x[0] == 'Ġ' and len(x) > 1 and x[1].isupper())]

        # Freeze LM weights
        for param in self.lm_model.parameters():
            param.requires_grad = False

        # Initialize CLIP
        self.clip, self.clip_preprocess = clip.load("ViT-B/32", device=self.device,
                                                    download_root=clip_checkpoints, jit=False)
        self.clip_image = ImageCLIP(self.clip)
        self.clip_image = torch.nn.DataParallel(self.clip_image)
        self.clip_text = TextCLIP(self.clip)
        self.clip_text = torch.nn.DataParallel(self.clip_text)

        # Generation hyperparameters
        self.target_seq_length = target_seq_length
        self.reset_context_delta = reset_context_delta
        self.num_iterations = num_iterations
        self.clip_loss_temperature = clip_loss_temperature
        self.clip_scale = clip_scale
        self.ce_scale = ce_scale
        self.stepsize = stepsize
        self.grad_norm_factor = grad_norm_factor
        self.fusion_factor = fusion_factor
        self.repetition_penalty = repetition_penalty
        self.end_token = self.lm_tokenizer.encode(end_token)[0]
        self.end_factor = end_factor
        self.ef_idx = 1
        self.forbidden_factor = forbidden_factor

    def get_img_feature(self, img_path, weights):
        imgs = [Image.open(x) for x in img_path]
        clip_imgs = [self.clip_preprocess(x).unsqueeze(0).to(self.device) for x in imgs]

        with torch.no_grad():
            image_fts = [self.clip_image(x) for x in clip_imgs]

            if weights is not None:
                image_features = sum([x * weights[i] for i, x in enumerate(image_fts)])
            else:
                image_features = sum(image_fts)

            image_features = torch.nn.functional.normalize(image_features, dim=-1)
            return image_features.detach()

    def get_txt_features(self, text):
        clip_texts = clip.tokenize(text).to(self.device)

        with torch.no_grad():
            text_features = self.clip_text(clip_texts)

        text_features = torch.nn.functional.normalize(text_features, dim=-1)
        return text_features.detach()

    def get_combined_feature(self, img_path, texts, weights_i, weights_t):
        imgs = [Image.open(x) for x in img_path]
        clip_imgs = [self.clip_preprocess(x).unsqueeze(0).to(self.device) for x in imgs]
        clip_texts = [clip.tokenize(x).to(self.device) for x in texts]

        with torch.no_grad():
            image_fts = [self.clip.encode_image(x) for x in clip_imgs]
            text_fts = [self.clip.encode_text(x) for x in clip_texts]

            # Weighted sum of image features, optionally plus weighted text features
            features = sum([x * weights_i[i] for i, x in enumerate(image_fts)])
            if weights_t is not None:
                features += sum([x * weights_t[i] for i, x in enumerate(text_fts)])

            features = features / features.norm(dim=-1, keepdim=True)
            return features.detach()

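    # Example of CLIP-space arithmetic with the method above (hypothetical
    # arguments): passing weights_i=[1.0, -1.0] for two images and
    # weights_t=[1.0] for one text yields
    #   features ∝ img_A - img_B + txt_C,
    # which is then L2-normalized before being used as the target feature.
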
    def run(self, image_features, cond_text, beam_size):
        self.image_features = image_features

        context_tokens = self.lm_tokenizer.encode(self.context_prefix + cond_text)

        output_tokens, output_text = self.generate_text(context_tokens, beam_size)

        return output_text

    def generate_text(self, context_tokens, beam_size):
        context_tokens = torch.tensor(context_tokens, device=self.device, dtype=torch.long).unsqueeze(0)

        gen_tokens = None
        scores = None
        seq_lengths = torch.ones(beam_size, device=self.device)
        is_stopped = torch.zeros(beam_size, device=self.device, dtype=torch.bool)

        for i in range(self.target_seq_length):
            probs = self.get_next_probs(i, context_tokens)
            logits = probs.log()

            if scores is None:
                # First step: expand the single context into beam_size beams
                scores, next_tokens = logits.topk(beam_size, -1)
                context_tokens = context_tokens.expand(beam_size, *context_tokens.shape[1:])
                next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0)

                if gen_tokens is None:
                    gen_tokens = next_tokens
                else:
                    gen_tokens = gen_tokens.expand(beam_size, *gen_tokens.shape[1:])
                    gen_tokens = torch.cat((gen_tokens, next_tokens), dim=1)
            else:
                # Finished beams only propose a single zero-score continuation
                logits[is_stopped] = -float(np.inf)
                logits[is_stopped, 0] = 0
                scores_sum = scores[:, None] + logits
                seq_lengths[~is_stopped] += 1
                scores_sum_average = scores_sum / seq_lengths[:, None]
                # Global top-k over the flattened [beam, vocab] candidates
                scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(
                    beam_size, -1)
                next_tokens_source = next_tokens // scores_sum.shape[1]
                seq_lengths = seq_lengths[next_tokens_source]
                next_tokens = next_tokens % scores_sum.shape[1]
                next_tokens = next_tokens.unsqueeze(1)
                gen_tokens = gen_tokens[next_tokens_source]
                gen_tokens = torch.cat((gen_tokens, next_tokens), dim=-1)
                context_tokens = context_tokens[next_tokens_source]
                scores = scores_sum_average * seq_lengths
                is_stopped = is_stopped[next_tokens_source]

            context_tokens = torch.cat((context_tokens, next_tokens), dim=1)
            is_stopped = is_stopped + next_tokens.eq(self.end_token).squeeze()

            # Debug logging: current beams with their length-normalized scores
            tmp_scores = scores / seq_lengths
            tmp_output_list = gen_tokens.cpu().numpy()
            tmp_output_texts = [
                self.lm_tokenizer.decode(tmp_output)
                for tmp_output in tmp_output_list
            ]
            tmp_order = tmp_scores.argsort(descending=True)
            tmp_output_texts = [tmp_output_texts[i] + ' %% ' + str(tmp_scores[i].cpu().numpy()) for i in tmp_order]
            log_info(tmp_output_texts, verbose=True)

            if is_stopped.all():
                break

        scores = scores / seq_lengths
        output_list = gen_tokens.cpu().numpy()
        output_texts = [
            self.lm_tokenizer.decode(output[: int(length)])
            for output, length in zip(output_list, seq_lengths)
        ]
        order = scores.argsort(descending=True)
        output_texts = [output_texts[i] for i in order]

        return context_tokens, output_texts

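    # A minimal sketch (illustration only, not part of the class) of the
    # flattened top-k bookkeeping used above: candidate scores have shape
    # [beam_size, vocab_size]; one topk over the flattened view picks the best
    # beam/token pairs, and integer division / modulo recover their origins:
    #
    #   flat_scores, flat_idx = scores_sum.view(-1).topk(beam_size)
    #   src_beam = flat_idx // vocab_size   # which beam each winner extends
    #   token_id = flat_idx % vocab_size    # which token extends it
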
    def get_next_probs(self, i, context_tokens):
        last_token = context_tokens[:, -1:]

        # Default to no cached context (fixes a NameError when the condition
        # below is false and `context` would otherwise be undefined)
        context = None
        if self.reset_context_delta and context_tokens.size(1) > 1:
            context = self.lm_model(context_tokens[:, :-1])["past_key_values"]

        # Logits of the LM with the unshifted context
        logits_before_shift = self.lm_model(context_tokens)["logits"]
        logits_before_shift = logits_before_shift[:, -1, :]
        probs_before_shift = nn.functional.softmax(logits_before_shift, dim=-1)

        if context:
            context = self.shift_context(i, context, last_token, context_tokens, probs_before_shift)

        lm_output = self.lm_model(last_token, past_key_values=context)
        logits, past = (
            lm_output["logits"],
            lm_output["past_key_values"],
        )
        logits = logits[:, -1, :]

        logits = self.update_special_tokens_logits(context_tokens, i, logits)

        probs = nn.functional.softmax(logits, dim=-1)
        probs = (probs ** self.fusion_factor) * (probs_before_shift ** (1 - self.fusion_factor))
        probs = probs / probs.sum()

        return probs

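    # Note on the fusion step above (illustration): with f = fusion_factor,
    #   p_fused(w) ∝ p_shifted(w)**f * p_original(w)**(1 - f),
    # a per-token geometric interpolation. The default f = 0.99 keeps
    # generation close to the CLIP-shifted distribution while anchoring it
    # to the unshifted LM prior.
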
    def shift_context(self, i, context, last_token, context_tokens, probs_before_shift):
        # Accumulated perturbation of the transformer key/value cache
        context_delta = [tuple([np.zeros(x.shape).astype("float32") for x in p]) for p in context]

        for iteration in range(self.num_iterations):
            curr_shift = [tuple([torch.from_numpy(x).requires_grad_(True).to(device=self.device) for x in p_])
                          for p_ in context_delta]

            for p0, p1 in curr_shift:
                p0.retain_grad()
                p1.retain_grad()

            shifted_context = list(map(add_context, context, curr_shift))

            shifted_outputs = self.lm_model(last_token, past_key_values=shifted_context)
            logits = shifted_outputs["logits"][:, -1, :]
            probs = nn.functional.softmax(logits, dim=-1)

            loss = 0.0

            # CLIP loss: pull the next-token distribution toward the image
            clip_loss, clip_losses = self.clip_loss(probs, context_tokens)
            loss += self.clip_scale * clip_loss

            # CE/fluency loss: KL(probs || probs_before_shift), keeping the
            # shifted distribution close to the unshifted LM
            ce_loss = self.ce_scale * ((probs * probs.log()) - (probs * probs_before_shift.log())).sum(-1)
            loss += ce_loss.sum()

            loss.backward()

            # Normalize gradients per beam and turn them into an update step
            final_grads = self.norm_grad(context, context_tokens, curr_shift)

            # Accumulate the update into the running delta
            context_delta = list(map(add_context, final_grads, context_delta))

            for p0, p1 in curr_shift:
                p0.grad.data.zero_()
                p1.grad.data.zero_()

            new_context = []
            for p0, p1 in context:
                new_context.append((p0.detach(), p1.detach()))
            context = new_context

        # Apply the final accumulated delta to the cache and detach
        context_delta = [tuple([torch.from_numpy(x).requires_grad_(True).to(device=self.device) for x in p_])
                         for p_ in context_delta]
        context = list(map(add_context, context, context_delta))

        new_context = []
        for p0, p1 in context:
            new_context.append((p0.detach(), p1.detach()))
        context = new_context

        return context

    def norm_grad(self, context, context_tokens, curr_shift):
        factor = 1
        sep_grads = None
        window_mask = torch.ones_like(context[0][0]).to(self.device)

        for b in range(context_tokens.shape[0]):
            # Per-beam gradient norms (epsilon avoids division by zero)
            tmp_sep_norms = [[(torch.norm(x.grad[b:(b + 1)] * window_mask[b:(b + 1)]) + 1e-15) for x in p_]
                             for p_ in curr_shift]

            # Normalized step: -stepsize * grad / ||grad||**grad_norm_factor
            tmp_grad = [tuple([-self.stepsize * factor *
                               (x.grad[b:(b + 1)] * window_mask[b:(b + 1)] /
                                tmp_sep_norms[i][j] ** self.grad_norm_factor).data.cpu().numpy()
                               for j, x in enumerate(p_)])
                        for i, p_ in enumerate(curr_shift)]
            if sep_grads is None:
                sep_grads = tmp_grad
            else:
                # Concatenate this beam's gradients onto the previous beams'
                for l_index in range(len(sep_grads)):
                    sep_grads[l_index] = list(sep_grads[l_index])
                    for k_index in range(len(sep_grads[0])):
                        sep_grads[l_index][k_index] = np.concatenate(
                            (sep_grads[l_index][k_index], tmp_grad[l_index][k_index]), axis=0)
                    sep_grads[l_index] = tuple(sep_grads[l_index])
        final_grads = sep_grads

        return final_grads

    def update_special_tokens_logits(self, context_tokens, i, logits):
        for beam_id in range(context_tokens.shape[0]):
            # Repetition penalty over the last 4 generated tokens
            for token_idx in set(context_tokens[beam_id][-4:].tolist()):
                factor = self.repetition_penalty if logits[beam_id, token_idx] > 0 else (1 / self.repetition_penalty)
                logits[beam_id, token_idx] /= factor

            # Encourage the end token after ef_idx steps...
            if i >= self.ef_idx:
                factor = self.end_factor if logits[beam_id, self.end_token] > 0 else (1 / self.end_factor)
                logits[beam_id, self.end_token] *= factor
            # ...but suppress it on the very first step
            if i == 0:
                start_factor = 1.6
                factor = start_factor if logits[beam_id, self.end_token] > 0 else (1 / start_factor)
                logits[beam_id, self.end_token] /= factor

            # Penalize forbidden tokens
            for token_idx in list(self.forbidden_tokens):
                factor = self.forbidden_factor if logits[beam_id, token_idx] > 0 else (1 / self.forbidden_factor)
                logits[beam_id, token_idx] /= factor

        return logits

    def clip_loss(self, probs, context_tokens):
        # Cross-entropy between the CLIP-induced distribution over the top-k
        # candidate continuations and the LM's probabilities for them
        for p_ in self.clip.transformer.parameters():
            if p_.grad is not None:
                p_.grad.data.zero_()

        top_size = 512
        top_probs, top_indices = probs.topk(top_size, -1)

        prefix_texts = [self.lm_tokenizer.decode(x, skip_special_tokens=True) for x in context_tokens]

        clip_loss = 0
        losses = []

        # Build candidate texts: each beam's prefix extended by each top token
        top_texts = []
        for idx_p in range(probs.shape[0]):
            prefix_text = prefix_texts[idx_p]
            for x in top_indices[idx_p]:
                top_texts.append(prefix_text + self.lm_tokenizer.decode(x))

        text_features = self.get_txt_features(top_texts)

        with torch.no_grad():
            similarities = (self.image_features @ text_features.T).reshape(probs.size(0), -1)
            target_probs = nn.functional.softmax(similarities / self.clip_loss_temperature, dim=-1).detach()
            target_probs = target_probs.type(torch.float32)

        clip_loss += torch.sum(-(target_probs * torch.log(top_probs)))

        return clip_loss, losses

    def clip_loss_old(self, probs, context_tokens):
        # Older per-beam variant of clip_loss, kept for reference
        for p_ in self.clip.transformer.parameters():
            if p_.grad is not None:
                p_.grad.data.zero_()

        top_size = 512
        _, top_indices = probs.topk(top_size, -1)

        prefix_texts = [self.lm_tokenizer.decode(x).replace(self.lm_tokenizer.bos_token, '') for x in context_tokens]

        clip_loss = 0
        losses = []
        for idx_p in range(probs.shape[0]):
            top_texts = []
            prefix_text = prefix_texts[idx_p]
            for x in top_indices[idx_p]:
                top_texts.append(prefix_text + self.lm_tokenizer.decode(x))
            text_features = self.get_txt_features(top_texts)

            with torch.no_grad():
                similarities = (self.image_features @ text_features.T)
                target_probs = nn.functional.softmax(similarities / self.clip_loss_temperature, dim=-1).detach()
                target_probs = target_probs.type(torch.float32)

            # Scatter the CLIP targets back into full-vocabulary space
            target = torch.zeros_like(probs[idx_p])
            target[top_indices[idx_p]] = target_probs[0]
            target = target.unsqueeze(0)
            cur_clip_loss = torch.sum(-(target * torch.log(probs[idx_p:(idx_p + 1)])))

            clip_loss += cur_clip_loss
            losses.append(cur_clip_loss)

        return clip_loss, losses
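

# A minimal usage sketch (not from the original file): assumes an image at
# the hypothetical path './example.jpg', the './forbidden_tokens.npy' file
# expected by the constructor defaults, and an arbitrary example prompt.
if __name__ == '__main__':
    generator = CLIPTextGenerator(lm_model='gpt-2')
    image_features = generator.get_img_feature(['./example.jpg'], weights=None)
    captions = generator.run(image_features, cond_text='Image of a', beam_size=5)
    log_info(f'Best caption: {captions[0]}')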