lightningdot
"""
|
||
|
Referring Expression Comprehension dataset
|
||
|
"""
|
||
|
import sys
|
||
|
import json
|
||
|
import random
|
||
|
import numpy as np
|
||
|
|
||
|
import torch
|
||
|
from torch.utils.data import Dataset
|
||
|
from torch.nn.utils.rnn import pad_sequence
|
||
|
from toolz.sandbox import unzip
|
||
|
|
||
|
from .data import TxtLmdb
|
||
|
|
||
|
|
||
|
class ReImageFeatDir(object):
    def __init__(self, img_dir):
        self.img_dir = img_dir

    def __getitem__(self, file_name):
        img_dump = np.load(f'{self.img_dir}/{file_name}', allow_pickle=True)
        img_feat = torch.tensor(img_dump['features'])
        img_bb = torch.tensor(img_dump['norm_bb'])
        return img_feat, img_bb

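# Illustrative sketch, not from the original file: `ReImageFeatDir` expects
# one .npz dump per image containing at least a `features` array of shape
# (num_bb, feat_dim) and a `norm_bb` array of normalized box coordinates
# (six columns here, so `_get_img_feat` below can build a 7-dim position
# feature). A hypothetical dump could be written as:
#
#   np.savez('visual_grounding_coco_gt_000000000042.npz',
#            features=np.random.rand(36, 2048).astype(np.float32),
#            norm_bb=np.random.rand(36, 6).astype(np.float32))
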
class ReDetectFeatDir(object):
    def __init__(self, img_dir, conf_th=0.2, max_bb=100, min_bb=10, num_bb=36,
                 format_='npz'):
        assert format_ == 'npz', 'only support npz for now.'
        assert isinstance(img_dir, str), 'img_dir is path, not db.'
        self.img_dir = img_dir
        self.conf_th = conf_th
        self.max_bb = max_bb
        self.min_bb = min_bb
        self.num_bb = num_bb

    def _compute_num_bb(self, img_dump):
        num_bb = max(self.min_bb, (img_dump['conf'] > self.conf_th).sum())
        num_bb = min(self.max_bb, num_bb)
        return num_bb

    def __getitem__(self, file_name):
        # image input features
        img_dump = np.load(f'{self.img_dir}/{file_name}', allow_pickle=True)
        num_bb = self._compute_num_bb(img_dump)
        img_feat = torch.tensor(img_dump['features'][:num_bb, :])
        img_bb = torch.tensor(img_dump['norm_bb'][:num_bb, :])
        return img_feat, img_bb

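# Worked example of the clamping in `_compute_num_bb` (values assumed):
# with the defaults conf_th=0.2, min_bb=10, max_bb=100, an image with 57
# detections above 0.2 keeps 57 boxes, one with only 3 confident detections
# is floored to min_bb=10, and one with 150 is capped at max_bb=100.
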
class ReferringExpressionDataset(Dataset):
    def __init__(self, db_dir, img_dir, max_txt_len=60):
        assert isinstance(img_dir, ReImageFeatDir) or \
            isinstance(img_dir, ReDetectFeatDir)
        self.img_dir = img_dir

        # load refs = [{ref_id, sent_ids, ann_id, image_id, sentences, split}]
        refs = json.load(open(f'{db_dir}/refs.json', 'r'))
        self.ref_ids = [ref['ref_id'] for ref in refs]
        self.Refs = {ref['ref_id']: ref for ref in refs}

        # load annotations = [{id, area, bbox, image_id, category_id}]
        anns = json.load(open(f'{db_dir}/annotations.json', 'r'))
        self.Anns = {ann['id']: ann for ann in anns}

        # load categories = [{id, name, supercategory}]
        categories = json.load(open(f'{db_dir}/categories.json', 'r'))
        self.Cats = {cat['id']: cat['name'] for cat in categories}

        # load images = [{id, file_name, ann_ids, height, width}]
        images = json.load(open(f'{db_dir}/images.json', 'r'))
        self.Images = {img['id']: img for img in images}

        # id2len: sent_id -> sent_len
        id2len = json.load(open(f'{db_dir}/id2len.json', 'r'))
        self.id2len = {int(_id): _len for _id, _len in id2len.items()}
        self.max_txt_len = max_txt_len
        self.sent_ids = self._get_sent_ids()

        # db[str(sent_id)] =
        # {sent_id, sent, ref_id, ann_id, image_id,
        #  bbox, input_ids, toked_sent}
        self.db = TxtLmdb(db_dir, readonly=True)

        # meta
        meta = json.load(open(f'{db_dir}/meta.json', 'r'))
        self.cls_ = meta['CLS']
        self.sep = meta['SEP']
        self.mask = meta['MASK']
        self.v_range = meta['v_range']

    def shuffle(self):
        # we shuffle ref_ids and make sent_ids according to ref_ids
        random.shuffle(self.ref_ids)
        self.sent_ids = self._get_sent_ids()

    def _get_sent_ids(self):
        sent_ids = []
        for ref_id in self.ref_ids:
            for sent_id in self.Refs[ref_id]['sent_ids']:
                sent_len = self.id2len[sent_id]
                if self.max_txt_len == -1 or sent_len < self.max_txt_len:
                    sent_ids.append(sent_id)
        return sent_ids

    def _get_img_feat(self, fname):
        img_feat, bb = self.img_dir[fname]
        img_bb = torch.cat([bb, bb[:, 4:5]*bb[:, 5:]], dim=-1)
        num_bb = img_feat.size(0)
        return img_feat, img_bb, num_bb

    def __len__(self):
        return len(self.sent_ids)

    def __getitem__(self, i):
        """
        Return:
        :input_ids    : (L, ), i.e., [cls, wd, wd, ..., sep, 0, 0]
        :position_ids : range(L)
        :img_feat     : (num_bb, d)
        :img_pos_feat : (num_bb, 7)
        :attn_masks   : (L+num_bb, ), i.e., [1, 1, ..., 0, 0, 1, 1]
        :obj_masks    : (num_bb, ) all 0's
        :target       : (1, )
        """
        # {sent_id, sent, ref_id, ann_id, image_id,
        #  bbox, input_ids, toked_sent}
        sent_id = self.sent_ids[i]
        txt_dump = self.db[str(sent_id)]
        image_id = txt_dump['image_id']
        fname = f'visual_grounding_coco_gt_{int(image_id):012}.npz'
        img_feat, img_pos_feat, num_bb = self._get_img_feat(fname)

        # text input
        input_ids = txt_dump['input_ids']
        input_ids = [self.cls_] + input_ids + [self.sep]
        attn_masks = [1] * len(input_ids)
        position_ids = list(range(len(input_ids)))
        attn_masks += [1] * num_bb

        input_ids = torch.tensor(input_ids)
        position_ids = torch.tensor(position_ids)
        attn_masks = torch.tensor(attn_masks)

        # target bbox
        img = self.Images[image_id]
        assert len(img['ann_ids']) == num_bb, \
            'Please use visual_grounding_coco_gt'
        target = img['ann_ids'].index(txt_dump['ann_id'])
        target = torch.tensor([target])

        # obj_masks, to be padded with 1, for masking out non-object prob.
        obj_masks = torch.tensor([0]*len(img['ann_ids'])).bool()

        return (input_ids, position_ids, img_feat, img_pos_feat, attn_masks,
                obj_masks, target)

def re_collate(inputs):
    """
    Return:
    :input_ids    : (n, max_L) padded with 0
    :position_ids : (n, max_L) padded with 0
    :txt_lens     : list of [txt_len]
    :img_feat     : (n, max_num_bb, feat_dim)
    :img_pos_feat : (n, max_num_bb, 7)
    :num_bbs      : list of [num_bb]
    :attn_masks   : (n, max_{L+num_bb}) padded with 0
    :obj_masks    : (n, max_num_bb) padded with 1
    :targets      : (n, )
    """
    (input_ids, position_ids, img_feats, img_pos_feats, attn_masks, obj_masks,
     targets) = map(list, unzip(inputs))

    txt_lens = [i.size(0) for i in input_ids]
    num_bbs = [f.size(0) for f in img_feats]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    position_ids = pad_sequence(position_ids,
                                batch_first=True, padding_value=0)
    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)
    targets = torch.cat(targets, dim=0)
    obj_masks = pad_sequence(obj_masks,
                             batch_first=True, padding_value=1).bool()

    batch_size = len(img_feats)
    num_bb = max(num_bbs)
    feat_dim = img_feats[0].size(1)
    pos_dim = img_pos_feats[0].size(1)
    img_feat = torch.zeros(batch_size, num_bb, feat_dim)
    img_pos_feat = torch.zeros(batch_size, num_bb, pos_dim)
    for i, (im, pos) in enumerate(zip(img_feats, img_pos_feats)):
        len_ = im.size(0)
        img_feat.data[i, :len_, :] = im.data
        img_pos_feat.data[i, :len_, :] = pos.data

    return (input_ids, position_ids, txt_lens,
            img_feat, img_pos_feat, num_bbs,
            attn_masks, obj_masks, targets)

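# Usage sketch (illustrative; the paths and batch size are assumptions, not
# part of the original file): pair the dataset with `re_collate` in a
# standard PyTorch DataLoader. `dataset.shuffle()` can be called between
# epochs to reshuffle ref_ids.
#
#   from torch.utils.data import DataLoader
#   img_dir = ReImageFeatDir('/data/visual_grounding_coco_gt')
#   dataset = ReferringExpressionDataset('/data/refcoco_train_db', img_dir)
#   loader = DataLoader(dataset, batch_size=32, shuffle=False,
#                       collate_fn=re_collate)
#   (input_ids, position_ids, txt_lens, img_feat, img_pos_feat,
#    num_bbs, attn_masks, obj_masks, targets) = next(iter(loader))
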
class ReferringExpressionEvalDataset(ReferringExpressionDataset):
    def __getitem__(self, i):
        """
        Return:
        :input_ids    : (L, ), i.e., [cls, wd, wd, ..., sep, 0, 0]
        :position_ids : range(L)
        :img_feat     : (num_bb, d)
        :img_pos_feat : (num_bb, 7)
        :attn_masks   : (L+num_bb, ), i.e., [1, 1, ..., 0, 0, 1, 1]
        :obj_masks    : (num_bb, ) all 0's
        :tgt_box      : ndarray (4, ) xywh
        :obj_boxes    : ndarray (num_bb, 4) xywh
        :sent_id
        """
        # {sent_id, sent, ref_id, ann_id, image_id,
        #  bbox, input_ids, toked_sent}
        sent_id = self.sent_ids[i]
        txt_dump = self.db[str(sent_id)]
        image_id = txt_dump['image_id']
        if isinstance(self.img_dir, ReImageFeatDir):
            if '_gt' in self.img_dir.img_dir:
                fname = f'visual_grounding_coco_gt_{int(image_id):012}.npz'
            elif '_det' in self.img_dir.img_dir:
                fname = f'visual_grounding_det_coco_{int(image_id):012}.npz'
        elif isinstance(self.img_dir, ReDetectFeatDir):
            fname = f'coco_train2014_{int(image_id):012}.npz'
        else:
            sys.exit('%s not supported.' % self.img_dir)
        img_feat, img_pos_feat, num_bb = self._get_img_feat(fname)

        # image info
        img = self.Images[image_id]
        im_width, im_height = img['width'], img['height']

        # object boxes, img_pos_feat (xyxywha) -> xywh
        obj_boxes = np.stack([img_pos_feat[:, 0]*im_width,
                              img_pos_feat[:, 1]*im_height,
                              img_pos_feat[:, 4]*im_width,
                              img_pos_feat[:, 5]*im_height], axis=1)
        obj_masks = torch.tensor([0]*num_bb).bool()

        # target box
        tgt_box = np.array(txt_dump['bbox'])  # xywh

        # text input
        input_ids = txt_dump['input_ids']
        input_ids = [self.cls_] + input_ids + [self.sep]
        attn_masks = [1] * len(input_ids)
        position_ids = list(range(len(input_ids)))
        attn_masks += [1] * num_bb

        input_ids = torch.tensor(input_ids)
        position_ids = torch.tensor(position_ids)
        attn_masks = torch.tensor(attn_masks)

        return (input_ids, position_ids, img_feat, img_pos_feat, attn_masks,
                obj_masks, tgt_box, obj_boxes, sent_id)

    # IoU function
    def computeIoU(self, box1, box2):
        # each box is of [x1, y1, w, h]
        inter_x1 = max(box1[0], box2[0])
        inter_y1 = max(box1[1], box2[1])
        inter_x2 = min(box1[0]+box1[2]-1, box2[0]+box2[2]-1)
        inter_y2 = min(box1[1]+box1[3]-1, box2[1]+box2[3]-1)

        if inter_x1 < inter_x2 and inter_y1 < inter_y2:
            inter = (inter_x2-inter_x1+1)*(inter_y2-inter_y1+1)
        else:
            inter = 0
        union = box1[2]*box1[3] + box2[2]*box2[3] - inter
        return float(inter)/union

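# Worked example for `computeIoU` above: box1 = [0, 0, 10, 10] and
# box2 = [5, 5, 10, 10] (xywh, inclusive pixel convention) intersect in a
# 5x5 = 25-pixel region; the union is 100 + 100 - 25 = 175, giving
# IoU = 25/175 ~= 0.143. REC accuracy conventionally counts a prediction
# as correct when IoU >= 0.5.
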
def re_eval_collate(inputs):
    """
    Return:
    :input_ids    : (n, max_L)
    :position_ids : (n, max_L)
    :txt_lens     : list of [txt_len]
    :img_feat     : (n, max_num_bb, d)
    :img_pos_feat : (n, max_num_bb, 7)
    :num_bbs      : list of [num_bb]
    :attn_masks   : (n, max{L+num_bb})
    :obj_masks    : (n, max_num_bb)
    :tgt_box      : list of n [xywh]
    :obj_boxes    : list of n [[xywh, xywh, ...]]
    :sent_ids     : list of n [sent_id]
    """
    (input_ids, position_ids, img_feats, img_pos_feats, attn_masks, obj_masks,
     tgt_box, obj_boxes, sent_ids) = map(list, unzip(inputs))

    txt_lens = [i.size(0) for i in input_ids]
    num_bbs = [f.size(0) for f in img_feats]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    position_ids = pad_sequence(position_ids,
                                batch_first=True, padding_value=0)
    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)
    obj_masks = pad_sequence(obj_masks,
                             batch_first=True, padding_value=1).bool()

    batch_size = len(img_feats)
    num_bb = max(num_bbs)
    feat_dim = img_feats[0].size(1)
    pos_dim = img_pos_feats[0].size(1)
    img_feat = torch.zeros(batch_size, num_bb, feat_dim)
    img_pos_feat = torch.zeros(batch_size, num_bb, pos_dim)
    for i, (im, pos) in enumerate(zip(img_feats, img_pos_feats)):
        len_ = im.size(0)
        img_feat.data[i, :len_, :] = im.data
        img_pos_feat.data[i, :len_, :] = pos.data

    return (input_ids, position_ids, txt_lens,
            img_feat, img_pos_feat, num_bbs,
            attn_masks, obj_masks, tgt_box, obj_boxes, sent_ids)
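
# Evaluation sketch (illustrative; the model call is hypothetical and the
# paths are assumptions): score the candidate boxes for each sentence, take
# the argmax, and check it against the ground-truth box via `computeIoU`
# at the usual 0.5 threshold.
#
#   from torch.utils.data import DataLoader
#   eval_set = ReferringExpressionEvalDataset(
#       '/data/refcoco_val_db',
#       ReImageFeatDir('/data/visual_grounding_det_coco'))
#   loader = DataLoader(eval_set, batch_size=1, collate_fn=re_eval_collate)
#   n_correct = 0
#   for batch in loader:
#       (input_ids, position_ids, txt_lens, img_feat, img_pos_feat,
#        num_bbs, attn_masks, obj_masks, tgt_box, obj_boxes,
#        sent_ids) = batch
#       scores = model(input_ids, position_ids, img_feat, img_pos_feat,
#                      attn_masks)             # (1, num_bb); hypothetical
#       pred = scores[0].argmax().item()
#       if eval_set.computeIoU(obj_boxes[0][pred], tgt_box[0]) >= 0.5:
#           n_correct += 1
#   print('accuracy:', n_correct / len(eval_set))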