lightningdot
"""
|
|
Bert for VCR model
|
|
"""
|
|
from torch import nn
|
|
from torch.nn import functional as F
|
|
from pytorch_pretrained_bert.modeling import (
|
|
BertPreTrainedModel, BertEmbeddings, BertEncoder, BertLayerNorm,
|
|
BertPooler, BertOnlyMLMHead)
|
|
from .model import (BertTextEmbeddings, BertImageEmbeddings,
|
|
BertForImageTextMaskedLM,
|
|
BertVisionLanguageEncoder,
|
|
BertForImageTextPretraining,
|
|
_get_image_hidden,
|
|
mask_img_feat,
|
|
RegionFeatureRegression,
|
|
mask_img_feat_for_mrc,
|
|
RegionClassification)
|
|
import torch
|
|
import random
|
|
|
|
|
|
class BertVisionLanguageEncoderForVCR(BertVisionLanguageEncoder):
    """ Modification for Joint Vision-Language Encoding,
        with an extra embedding table for VCR region tokens
    """
    def __init__(self, config, img_dim, num_region_toks):
        BertPreTrainedModel.__init__(self, config)
        self.embeddings = BertTextEmbeddings(config)
        self.img_embeddings = BertImageEmbeddings(config, img_dim)
        self.num_region_toks = num_region_toks
        self.region_token_embeddings = nn.Embedding(
            num_region_toks,
            config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, position_ids, txt_lens,
                img_feat, img_pos_feat, num_bbs,
                attention_mask, output_all_encoded_layers=True,
                txt_type_ids=None, img_type_ids=None, region_tok_ids=None):
        # broadcastable self-attention mask: 1 -> keep (0.0), 0 -> mask (-10000.0)
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(
            dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        embedding_output = self._compute_img_txt_embeddings(
            input_ids, position_ids, txt_lens,
            img_feat, img_pos_feat, num_bbs, attention_mask.size(1),
            txt_type_ids, img_type_ids)
        if region_tok_ids is not None:
            # add region-token embeddings on top of the joint
            # text/image embeddings, then re-normalize
            region_tok_embeddings = self.region_token_embeddings(
                region_tok_ids)
            embedding_output += region_tok_embeddings
            embedding_output = self.LayerNorm(embedding_output)
            embedding_output = self.dropout(embedding_output)
        encoded_layers = self.encoder(
            embedding_output, extended_attention_mask,
            output_all_encoded_layers=output_all_encoded_layers)
        if not output_all_encoded_layers:
            encoded_layers = encoded_layers[-1]
        return encoded_layers
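
# A minimal usage sketch (not from the original file; shapes, the `config`
# object, and the batch tensors are illustrative assumptions). `txt_lens`
# and `num_bbs` are per-sample text lengths and region counts used to
# gather the packed text and image segments:
#
#   encoder = BertVisionLanguageEncoderForVCR(config, img_dim=2048,
#                                             num_region_toks=81)
#   seq_out = encoder(input_ids, position_ids, txt_lens,
#                     img_feat, img_pos_feat, num_bbs,
#                     attention_mask, output_all_encoded_layers=False,
#                     region_tok_ids=region_tok_ids)
#   # seq_out: (batch, max_txt_len + max_num_bbs, hidden_size)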


class BertForVisualCommonsenseReasoning(BertPreTrainedModel):
    """ Finetune multi-modal BERT for VCR
    """
    def __init__(self, config, img_dim, obj_cls=True, img_label_dim=81):
        super().__init__(config, img_dim)
        self.bert = BertVisionLanguageEncoder(
            config, img_dim)
        # self.vcr_output = nn.Linear(config.hidden_size, 1)
        # self.vcr_output = nn.Linear(config.hidden_size, 2)
        self.vcr_output = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size*2),
            nn.ReLU(),
            BertLayerNorm(config.hidden_size*2, eps=1e-12),
            nn.Linear(config.hidden_size*2, 2)
        )
        self.apply(self.init_bert_weights)
        self.obj_cls = obj_cls
        if self.obj_cls:
            self.region_classifier = RegionClassification(
                config.hidden_size, img_label_dim)

    def init_type_embedding(self):
        # expand token-type embeddings from 2 to 4 types:
        # types 0/1 are copied, new types 2/3 are initialized from type 0
        new_emb = nn.Embedding(4, self.bert.config.hidden_size)
        new_emb.apply(self.init_bert_weights)
        for i in [0, 1]:
            emb = self.bert.embeddings.token_type_embeddings.weight.data[i, :]
            new_emb.weight.data[i, :].copy_(emb)
        emb = self.bert.embeddings.token_type_embeddings.weight.data[0, :]
        new_emb.weight.data[2, :].copy_(emb)
        new_emb.weight.data[3, :].copy_(emb)
        self.bert.embeddings.token_type_embeddings = new_emb

    def init_word_embedding(self, num_special_tokens):
        # grow the vocabulary to make room for special tokens,
        # keeping the pretrained rows intact
        orig_word_num = self.bert.embeddings.word_embeddings.weight.size(0)
        new_emb = nn.Embedding(
            orig_word_num + num_special_tokens, self.bert.config.hidden_size)
        new_emb.apply(self.init_bert_weights)
        emb = self.bert.embeddings.word_embeddings.weight.data
        new_emb.weight.data[:orig_word_num, :].copy_(emb)
        self.bert.embeddings.word_embeddings = new_emb
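
    # Note (editorial): the two init_* helpers above are meant to be called
    # once, right after loading pretrained weights. VCR inputs use four
    # token-type ids (see txt_type_ids in forward) and extra special tokens,
    # so the pretrained 2-type and original-vocab tables are grown while
    # every pretrained row is copied over unchanged; only the new rows get
    # fresh BERT-style initialization.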

    def masked_predict_labels(self, sequence_output, mask):
        # only compute masked outputs
        mask = mask.unsqueeze(-1).expand_as(sequence_output)
        sequence_output_masked = sequence_output[mask].contiguous().view(
            -1, self.config.hidden_size)
        prediction_soft_label = self.region_classifier(sequence_output_masked)

        return prediction_soft_label

    def forward(self, input_ids, position_ids, txt_lens, txt_type_ids,
                img_feat, img_pos_feat, num_bbs,
                attention_mask, targets, obj_targets=None, img_masks=None,
                region_tok_ids=None, compute_loss=True):
        sequence_output = self.bert(input_ids, position_ids, txt_lens,
                                    img_feat, img_pos_feat, num_bbs,
                                    attention_mask,
                                    output_all_encoded_layers=False,
                                    txt_type_ids=txt_type_ids)
        pooled_output = self.bert.pooler(sequence_output)
        rank_scores = self.vcr_output(pooled_output)
        # rank_scores = rank_scores.reshape((-1, 4))

        if self.obj_cls and img_masks is not None:
            img_feat = mask_img_feat_for_mrc(img_feat, img_masks)
            masked_sequence_output = self.bert(
                input_ids, position_ids, txt_lens,
                img_feat, img_pos_feat, num_bbs,
                attention_mask,
                output_all_encoded_layers=False,
                txt_type_ids=txt_type_ids)
            # get only the image part
            img_sequence_output = _get_image_hidden(
                masked_sequence_output, txt_lens, num_bbs)
            # only compute masked tokens for better efficiency
            predicted_obj_label = self.masked_predict_labels(
                img_sequence_output, img_masks)

        if compute_loss:
            vcr_loss = F.cross_entropy(
                rank_scores, targets.squeeze(-1),
                reduction='mean')
            # guard on img_masks as well: predicted_obj_label only exists
            # when the masked forward pass above has run
            if self.obj_cls and img_masks is not None:
                obj_cls_loss = F.cross_entropy(
                    predicted_obj_label, obj_targets.long(),
                    ignore_index=0, reduction='mean')
            else:
                obj_cls_loss = torch.tensor([0.], device=vcr_loss.device)
            return vcr_loss, obj_cls_loss
        else:
            # keep the logit of class 1 (the "correct" class) as the
            # ranking score
            rank_scores = rank_scores[:, 1:]
            return rank_scores
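
# A minimal fine-tuning sketch (not from the original file; BERT_MODEL,
# NUM_SPECIAL_TOKENS, and the batch tensors are illustrative assumptions):
#
#   model = BertForVisualCommonsenseReasoning.from_pretrained(
#       BERT_MODEL, img_dim=2048, obj_cls=True)
#   model.init_type_embedding()
#   model.init_word_embedding(NUM_SPECIAL_TOKENS)
#   vcr_loss, obj_cls_loss = model(input_ids, position_ids, txt_lens,
#                                  txt_type_ids, img_feat, img_pos_feat,
#                                  num_bbs, attention_mask, targets,
#                                  obj_targets=obj_targets,
#                                  img_masks=img_masks, compute_loss=True)
#   loss = vcr_loss + obj_cls_loss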


class BertForImageTextPretrainingForVCR(BertForImageTextPretraining):
    def init_type_embedding(self):
        # same 2-to-4 token-type expansion as in the finetuning model
        new_emb = nn.Embedding(4, self.bert.config.hidden_size)
        new_emb.apply(self.init_bert_weights)
        for i in [0, 1]:
            emb = self.bert.embeddings.token_type_embeddings.weight.data[i, :]
            new_emb.weight.data[i, :].copy_(emb)
        emb = self.bert.embeddings.token_type_embeddings.weight.data[0, :]
        new_emb.weight.data[2, :].copy_(emb)
        new_emb.weight.data[3, :].copy_(emb)
        self.bert.embeddings.token_type_embeddings = new_emb

    def init_word_embedding(self, num_special_tokens):
        orig_word_num = self.bert.embeddings.word_embeddings.weight.size(0)
        new_emb = nn.Embedding(
            orig_word_num + num_special_tokens, self.bert.config.hidden_size)
        new_emb.apply(self.init_bert_weights)
        emb = self.bert.embeddings.word_embeddings.weight.data
        new_emb.weight.data[:orig_word_num, :].copy_(emb)
        self.bert.embeddings.word_embeddings = new_emb
        # the MLM head is tied to the word embeddings, so it has to be
        # rebuilt after resizing
        self.cls = BertOnlyMLMHead(
            self.bert.config, self.bert.embeddings.word_embeddings.weight)

    def forward(self, input_ids, position_ids, txt_type_ids, txt_lens,
                img_feat, img_pos_feat, num_bbs,
                attention_mask, labels, task, compute_loss=True):
        # dispatch on the pretraining task; `labels` is task-dependent
        if task == 'mlm':
            txt_labels = labels
            return self.forward_mlm(input_ids, position_ids, txt_type_ids,
                                    txt_lens,
                                    img_feat, img_pos_feat, num_bbs,
                                    attention_mask, txt_labels, compute_loss)
        elif task == 'mrm':
            img_mask = labels
            return self.forward_mrm(input_ids, position_ids, txt_type_ids,
                                    txt_lens,
                                    img_feat, img_pos_feat, num_bbs,
                                    attention_mask, img_mask, compute_loss)
        elif task.startswith('mrc'):
            img_mask, mrc_label_target = labels
            return self.forward_mrc(input_ids, position_ids, txt_type_ids,
                                    txt_lens,
                                    img_feat, img_pos_feat, num_bbs,
                                    attention_mask, img_mask,
                                    mrc_label_target, task, compute_loss)
        else:
            raise ValueError(f'invalid task: {task}')
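
    # Labels contract per task, as read off the branches above (editorial
    # note, not from the original file):
    #   'mlm'  -> txt_labels, with -1 marking unmasked text positions
    #   'mrm'  -> img_mask over image regions
    #   'mrc*' -> (img_mask, mrc_label_target) with soft region labels;
    #             a task name containing 'kl' selects the KL-divergence
    #             loss over soft labels instead of hard-label cross-entropy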

    # MLM
    def forward_mlm(self, input_ids, position_ids, txt_type_ids, txt_lens,
                    img_feat, img_pos_feat, num_bbs,
                    attention_mask, txt_labels, compute_loss=True):
        sequence_output = self.bert(input_ids, position_ids, txt_lens,
                                    img_feat, img_pos_feat, num_bbs,
                                    attention_mask,
                                    output_all_encoded_layers=False,
                                    txt_type_ids=txt_type_ids)
        # get only the text part
        sequence_output = sequence_output[:, :input_ids.size(1), :]
        # only compute masked tokens for better efficiency
        prediction_scores = self.masked_compute_scores(
            sequence_output, txt_labels != -1)
        if self.vocab_pad:
            prediction_scores = prediction_scores[:, :-self.vocab_pad]

        if compute_loss:
            masked_lm_loss = F.cross_entropy(prediction_scores,
                                             txt_labels[txt_labels != -1],
                                             reduction='none')
            return masked_lm_loss
        else:
            return prediction_scores

    # MRM
    def forward_mrm(self, input_ids, position_ids, txt_type_ids, txt_lens,
                    img_feat, img_pos_feat, num_bbs,
                    attention_mask, img_masks, compute_loss=True):
        img_feat, feat_targets = mask_img_feat(img_feat, img_masks)
        sequence_output = self.bert(input_ids, position_ids, txt_lens,
                                    img_feat, img_pos_feat, num_bbs,
                                    attention_mask,
                                    output_all_encoded_layers=False,
                                    txt_type_ids=txt_type_ids)
        # get only the image part
        sequence_output = _get_image_hidden(sequence_output, txt_lens, num_bbs)
        # only compute masked regions for better efficiency
        prediction_feat = self.masked_compute_feat(
            sequence_output, img_masks)

        if compute_loss:
            mrm_loss = F.mse_loss(prediction_feat, feat_targets,
                                  reduction='none')
            return mrm_loss
        else:
            return prediction_feat

    # MRC
    def forward_mrc(self, input_ids, position_ids, txt_type_ids, txt_lens,
                    img_feat, img_pos_feat, num_bbs,
                    attention_mask, img_masks,
                    label_targets, task, compute_loss=True):
        img_feat = mask_img_feat_for_mrc(img_feat, img_masks)
        sequence_output = self.bert(input_ids, position_ids, txt_lens,
                                    img_feat, img_pos_feat, num_bbs,
                                    attention_mask,
                                    output_all_encoded_layers=False,
                                    txt_type_ids=txt_type_ids)
        # get only the image part
        sequence_output = _get_image_hidden(sequence_output, txt_lens, num_bbs)
        # only compute masked regions for better efficiency
        prediction_soft_label = self.masked_predict_labels(
            sequence_output, img_masks)

        if compute_loss:
            if "kl" in task:
                prediction_soft_label = F.log_softmax(
                    prediction_soft_label, dim=-1)
                mrc_loss = F.kl_div(
                    prediction_soft_label, label_targets, reduction='none')
            else:
                # hard-label variant: reduce soft labels to their argmax
                label_targets = torch.max(
                    label_targets, -1)[1]  # argmax
                mrc_loss = F.cross_entropy(
                    prediction_soft_label, label_targets,
                    ignore_index=0, reduction='none')
            return mrc_loss
        else:
            return prediction_soft_label
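
# A minimal pretraining sketch (not from the original file; BERT_MODEL,
# NUM_SPECIAL_TOKENS, and the batch tensors and label formats are
# illustrative assumptions):
#
#   model = BertForImageTextPretrainingForVCR.from_pretrained(
#       BERT_MODEL, img_dim=2048)
#   model.init_type_embedding()
#   model.init_word_embedding(NUM_SPECIAL_TOKENS)
#   mlm_loss = model(input_ids, position_ids, txt_type_ids, txt_lens,
#                    img_feat, img_pos_feat, num_bbs, attention_mask,
#                    labels=txt_labels, task='mlm').mean()
#   mrc_loss = model(input_ids, position_ids, txt_type_ids, txt_lens,
#                    img_feat, img_pos_feat, num_bbs, attention_mask,
#                    labels=(img_mask, mrc_label_target),
#                    task='mrc-kl').mean()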