lightningdot/uniter_model/model/re.py

"""
Bert for Referring Expression Comprehension
"""
import sys
import torch
import torch.nn as nn
from torch.nn import functional as F
from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertLayerNorm

from .model import BertVisionLanguageEncoder

import numpy as np
import random


class BertForReferringExpressionComprehension(BertPreTrainedModel):
    """Finetune multi-model BERT for Referring Expression Comprehension
    """
    def __init__(self, config, img_dim, loss="cls",
                 margin=0.2, hard_ratio=0.3, mlp=1):
        super().__init__(config)
        self.bert = BertVisionLanguageEncoder(config, img_dim)
        if mlp == 1:
            self.re_output = nn.Linear(config.hidden_size, 1)
        elif mlp == 2:
            self.re_output = nn.Sequential(
                nn.Linear(config.hidden_size, config.hidden_size), 
                nn.ReLU(), 
                BertLayerNorm(config.hidden_size, eps=1e-12),
                nn.Linear(config.hidden_size, 1)
            )
        else:
            sys.exit("MLP restricted to be 1 or 2 layers.")
        self.loss = loss
        assert self.loss in ['cls', 'rank']
        if self.loss == 'rank':
            self.margin = margin
            self.hard_ratio = hard_ratio
        else:
            self.crit = nn.CrossEntropyLoss(reduction='none')
        # initialize
        self.apply(self.init_bert_weights)
    
    def forward(self, input_ids, position_ids, txt_lens, img_feat,
                img_pos_feat, num_bbs, attn_masks, obj_masks,
                targets, compute_loss=True):
        sequence_output = self.bert(input_ids, position_ids, txt_lens,
                                    img_feat, img_pos_feat, num_bbs,
                                    attn_masks, 
                                    output_all_encoded_layers=False)
        # get only the region part
        sequence_output = self._get_image_hidden(sequence_output, txt_lens, 
                                                 num_bbs)

        # re score (n, max_num_bb)
        scores = self.re_output(sequence_output).squeeze(2)
        scores = scores.masked_fill(obj_masks, -1e4)  # mask out non-objects

        # loss
        if compute_loss:
            if self.loss == 'cls':
                ce_loss = self.crit(scores, targets)  # (n, ) as no reduction
                return ce_loss
            else:
                # ranking
                _n = len(num_bbs)
                # positive (target)
                pos_ix = targets
                pos_sc = scores.gather(1, pos_ix.view(_n, 1)) # (n, 1)
                pos_sc = torch.sigmoid(pos_sc).view(-1)  # (n, ) sc[0, 1]
                # negative
                neg_ix = self.sample_neg_ix(scores, targets, num_bbs)
                neg_sc = scores.gather(1, neg_ix.view(_n, 1)) # (n, 1)
                neg_sc = torch.sigmoid(neg_sc).view(-1)  # (n, ) sc[0, 1]
                # ranking
                mm_loss = torch.clamp(self.margin + neg_sc - pos_sc, 0) # (n, )
                return mm_loss
        else:
            # (n, max_num_bb)
            return scores
    
    def sample_neg_ix(self, scores, targets, num_bbs):
        """
        Inputs:
        :scores    (n, max_num_bb)
        :targets   (n, )
        :num_bbs   list of [num_bb]
        return:
        :neg_ix    (n, ) easy/hard negative (!= target)
        """
        neg_ix = []
        cand_ixs = torch.argsort(scores, dim=-1, descending=True) # (n, num_bb)
        for i in range(len(num_bbs)):
            num_bb = num_bbs[i]
            if np.random.uniform(0, 1, 1) < self.hard_ratio:
                # sample hard negative, w/ highest score
                for ix in cand_ixs[i].tolist():
                    if ix != targets[i]:
                        assert ix < num_bb, f'ix={ix}, num_bb={num_bb}'
                        neg_ix.append(ix)
                        break
            else:
                # sample easy negative, i.e., random one 
                ix = random.randint(0, num_bb-1)  # [0, num_bb-1]
                while ix == targets[i]:
                    ix = random.randint(0, num_bb-1)
                neg_ix.append(ix)
        neg_ix = torch.tensor(neg_ix).type(targets.type())
        assert neg_ix.numel() == targets.numel()
        return neg_ix

    def _get_image_hidden(self, sequence_output, txt_lens, num_bbs):
        """
        Extracting the img_hidden part from sequence_output.
        Inputs:
        - sequence_output: (n, txt_len+num_bb, hid_size)
        - txt_lens       : [txt_len]
        - num_bbs        : [num_bb]
        Output:
        - img_hidden     : (n, max_num_bb, hid_size)
        """
        outputs = []
        max_bb = max(num_bbs)
        hid_size = sequence_output.size(-1)
        for seq_out, len_, nbb in zip(sequence_output.split(1, dim=0),
                                    txt_lens, num_bbs):
            img_hid = seq_out[:, len_:len_+nbb, :]
            if nbb < max_bb:
                img_hid = torch.cat(
                        [img_hid, self._get_pad(img_hid, max_bb-nbb, hid_size)],
                        dim=1)
            outputs.append(img_hid)

        img_hidden = torch.cat(outputs, dim=0)
        return img_hidden

    def _get_pad(self, t, len_, hidden_size):
        pad = torch.zeros(1, len_, hidden_size, dtype=t.dtype, device=t.device)
        return pad
update the operator. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago			`"""`
			`Bert for Referring Expression Comprehension`
			`"""`
			`import sys`
			`import torch`
			`import torch.nn as nn`
			`from torch.nn import functional as F`
			`from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertLayerNorm`

			`from .model import BertVisionLanguageEncoder`

			`import numpy as np`
			`import random`


			`class BertForReferringExpressionComprehension(BertPreTrainedModel):`
			`"""Finetune multi-model BERT for Referring Expression Comprehension`
			`"""`
			`def __init__(self, config, img_dim, loss="cls",`
			`margin=0.2, hard_ratio=0.3, mlp=1):`
			`super().__init__(config)`
			`self.bert = BertVisionLanguageEncoder(config, img_dim)`
			`if mlp == 1:`
			`self.re_output = nn.Linear(config.hidden_size, 1)`
			`elif mlp == 2:`
			`self.re_output = nn.Sequential(`
			`nn.Linear(config.hidden_size, config.hidden_size),`
			`nn.ReLU(),`
			`BertLayerNorm(config.hidden_size, eps=1e-12),`
			`nn.Linear(config.hidden_size, 1)`
			`)`
			`else:`
			`sys.exit("MLP restricted to be 1 or 2 layers.")`
			`self.loss = loss`
			`assert self.loss in ['cls', 'rank']`
			`if self.loss == 'rank':`
			`self.margin = margin`
			`self.hard_ratio = hard_ratio`
			`else:`
			`self.crit = nn.CrossEntropyLoss(reduction='none')`
			`# initialize`
			`self.apply(self.init_bert_weights)`

			`def forward(self, input_ids, position_ids, txt_lens, img_feat,`
			`img_pos_feat, num_bbs, attn_masks, obj_masks,`
			`targets, compute_loss=True):`
			`sequence_output = self.bert(input_ids, position_ids, txt_lens,`
			`img_feat, img_pos_feat, num_bbs,`
			`attn_masks,`
			`output_all_encoded_layers=False)`
			`# get only the region part`
			`sequence_output = self._get_image_hidden(sequence_output, txt_lens,`
			`num_bbs)`

			`# re score (n, max_num_bb)`
			`scores = self.re_output(sequence_output).squeeze(2)`
			`scores = scores.masked_fill(obj_masks, -1e4) # mask out non-objects`

			`# loss`
			`if compute_loss:`
			`if self.loss == 'cls':`
			`ce_loss = self.crit(scores, targets) # (n, ) as no reduction`
			`return ce_loss`
			`else:`
			`# ranking`
			`_n = len(num_bbs)`
			`# positive (target)`
			`pos_ix = targets`
			`pos_sc = scores.gather(1, pos_ix.view(_n, 1)) # (n, 1)`
			`pos_sc = torch.sigmoid(pos_sc).view(-1) # (n, ) sc[0, 1]`
			`# negative`
			`neg_ix = self.sample_neg_ix(scores, targets, num_bbs)`
			`neg_sc = scores.gather(1, neg_ix.view(_n, 1)) # (n, 1)`
			`neg_sc = torch.sigmoid(neg_sc).view(-1) # (n, ) sc[0, 1]`
			`# ranking`
			`mm_loss = torch.clamp(self.margin + neg_sc - pos_sc, 0) # (n, )`
			`return mm_loss`
			`else:`
			`# (n, max_num_bb)`
			`return scores`

			`def sample_neg_ix(self, scores, targets, num_bbs):`
			`"""`
			`Inputs:`
			`:scores (n, max_num_bb)`
			`:targets (n, )`
			`:num_bbs list of [num_bb]`
			`return:`
			`:neg_ix (n, ) easy/hard negative (!= target)`
			`"""`
			`neg_ix = []`
			`cand_ixs = torch.argsort(scores, dim=-1, descending=True) # (n, num_bb)`
			`for i in range(len(num_bbs)):`
			`num_bb = num_bbs[i]`
			`if np.random.uniform(0, 1, 1) < self.hard_ratio:`
			`# sample hard negative, w/ highest score`
			`for ix in cand_ixs[i].tolist():`
			`if ix != targets[i]:`
			`assert ix < num_bb, f'ix={ix}, num_bb={num_bb}'`
			`neg_ix.append(ix)`
			`break`
			`else:`
			`# sample easy negative, i.e., random one`
			`ix = random.randint(0, num_bb-1) # [0, num_bb-1]`
			`while ix == targets[i]:`
			`ix = random.randint(0, num_bb-1)`
			`neg_ix.append(ix)`
			`neg_ix = torch.tensor(neg_ix).type(targets.type())`
			`assert neg_ix.numel() == targets.numel()`
			`return neg_ix`

			`def _get_image_hidden(self, sequence_output, txt_lens, num_bbs):`
			`"""`
			`Extracting the img_hidden part from sequence_output.`
			`Inputs:`
			`- sequence_output: (n, txt_len+num_bb, hid_size)`
			`- txt_lens : [txt_len]`
			`- num_bbs : [num_bb]`
			`Output:`
			`- img_hidden : (n, max_num_bb, hid_size)`
			`"""`
			`outputs = []`
			`max_bb = max(num_bbs)`
			`hid_size = sequence_output.size(-1)`
			`for seq_out, len_, nbb in zip(sequence_output.split(1, dim=0),`
			`txt_lens, num_bbs):`
			`img_hid = seq_out[:, len_:len_+nbb, :]`
			`if nbb < max_bb:`
			`img_hid = torch.cat(`
			`[img_hid, self._get_pad(img_hid, max_bb-nbb, hid_size)],`
			`dim=1)`
			`outputs.append(img_hid)`

			`img_hidden = torch.cat(outputs, dim=0)`
			`return img_hidden`

			`def _get_pad(self, t, len_, hidden_size):`
			`pad = torch.zeros(1, len_, hidden_size, dtype=t.dtype, device=t.device)`
			`return pad`