"""
Referring Expression Comprehension dataset
"""
import sys
import json
import random
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from toolz.sandbox import unzip
from .data import TxtLmdb


class ReImageFeatDir(object):
    def __init__(self, img_dir):
        self.img_dir = img_dir

    def __getitem__(self, file_name):
        img_dump = np.load(f'{self.img_dir}/{file_name}', allow_pickle=True)
        img_feat = torch.tensor(img_dump['features'])
        img_bb = torch.tensor(img_dump['norm_bb'])
        return img_feat, img_bb


class ReDetectFeatDir(object):
    def __init__(self, img_dir, conf_th=0.2, max_bb=100, min_bb=10, num_bb=36,
                 format_='npz'):
        assert format_ == 'npz', 'only support npz for now.'
        assert isinstance(img_dir, str), 'img_dir is path, not db.'
        self.img_dir = img_dir
        self.conf_th = conf_th
        self.max_bb = max_bb
        self.min_bb = min_bb
        self.num_bb = num_bb

    def _compute_num_bb(self, img_dump):
        num_bb = max(self.min_bb, (img_dump['conf'] > self.conf_th).sum())
        num_bb = min(self.max_bb, num_bb)
        return num_bb

    def __getitem__(self, file_name):
        # image input features
        img_dump = np.load(f'{self.img_dir}/{file_name}', allow_pickle=True)
        num_bb = self._compute_num_bb(img_dump)
        img_feat = torch.tensor(img_dump['features'][:num_bb, :])
        img_bb = torch.tensor(img_dump['norm_bb'][:num_bb, :])
        return img_feat, img_bb
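

# Illustrative sketch (not part of the original file): both feature readers
# above expose the same `feat_dir[fname] -> (img_feat, img_bb)` interface, so
# either can be passed to the datasets below. The paths and file names here
# are hypothetical.
def _example_feat_dir_usage():
    gt_dir = ReImageFeatDir('/db/visual_grounding_coco_gt')      # hypothetical path
    det_dir = ReDetectFeatDir('/db/coco_det_feat', conf_th=0.2)  # hypothetical path
    img_feat, img_bb = gt_dir['visual_grounding_coco_gt_000000000009.npz']
    det_feat, det_bb = det_dir['coco_train2014_000000000009.npz']
    return img_feat.shape, img_bb.shape, det_feat.shape, det_bb.shape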


class ReferringExpressionDataset(Dataset):
    def __init__(self, db_dir, img_dir, max_txt_len=60):
        assert isinstance(img_dir, ReImageFeatDir) or \
            isinstance(img_dir, ReDetectFeatDir)
        self.img_dir = img_dir

        # load refs = [{ref_id, sent_ids, ann_id, image_id, sentences, split}]
        refs = json.load(open(f'{db_dir}/refs.json', 'r'))
        self.ref_ids = [ref['ref_id'] for ref in refs]
        self.Refs = {ref['ref_id']: ref for ref in refs}

        # load annotations = [{id, area, bbox, image_id, category_id}]
        anns = json.load(open(f'{db_dir}/annotations.json', 'r'))
        self.Anns = {ann['id']: ann for ann in anns}

        # load categories = [{id, name, supercategory}]
        categories = json.load(open(f'{db_dir}/categories.json', 'r'))
        self.Cats = {cat['id']: cat['name'] for cat in categories}

        # load images = [{id, file_name, ann_ids, height, width}]
        images = json.load(open(f'{db_dir}/images.json', 'r'))
        self.Images = {img['id']: img for img in images}

        # id2len: sent_id -> sent_len
        id2len = json.load(open(f'{db_dir}/id2len.json', 'r'))
        self.id2len = {int(_id): _len for _id, _len in id2len.items()}

        self.max_txt_len = max_txt_len
        self.sent_ids = self._get_sent_ids()

        # db[str(sent_id)] =
        # {sent_id, sent, ref_id, ann_id, image_id,
        #  bbox, input_ids, toked_sent}
        self.db = TxtLmdb(db_dir, readonly=True)

        # meta
        meta = json.load(open(f'{db_dir}/meta.json', 'r'))
        self.cls_ = meta['CLS']
        self.sep = meta['SEP']
        self.mask = meta['MASK']
        self.v_range = meta['v_range']

    def shuffle(self):
        # we shuffle ref_ids and make sent_ids according to ref_ids
        random.shuffle(self.ref_ids)
        self.sent_ids = self._get_sent_ids()

    def _get_sent_ids(self):
        sent_ids = []
        for ref_id in self.ref_ids:
            for sent_id in self.Refs[ref_id]['sent_ids']:
                sent_len = self.id2len[sent_id]
                if self.max_txt_len == -1 or sent_len < self.max_txt_len:
                    sent_ids.append(sent_id)
        return sent_ids

    def _get_img_feat(self, fname):
        img_feat, bb = self.img_dir[fname]
        # bb holds normalized (x1, y1, x2, y2, w, h); appending w*h (the box
        # area) yields the 7-dim img_pos_feat used below
        img_bb = torch.cat([bb, bb[:, 4:5]*bb[:, 5:]], dim=-1)
        num_bb = img_feat.size(0)
        return img_feat, img_bb, num_bb

    def __len__(self):
        return len(self.sent_ids)

    def __getitem__(self, i):
        """
        Return:
        :input_ids : (L, ), i.e., [cls, wd, wd, ..., sep, 0, 0]
        :position_ids : range(L)
        :img_feat : (num_bb, d)
        :img_pos_feat : (num_bb, 7)
        :attn_masks : (L+num_bb, ), i.e., [1, 1, ..., 0, 0, 1, 1]
        :obj_masks : (num_bb, ) all 0's
        :target : (1, )
        """
        # {sent_id, sent, ref_id, ann_id, image_id,
        #  bbox, input_ids, toked_sent}
        sent_id = self.sent_ids[i]
        txt_dump = self.db[str(sent_id)]
        image_id = txt_dump['image_id']
        fname = f'visual_grounding_coco_gt_{int(image_id):012}.npz'
        img_feat, img_pos_feat, num_bb = self._get_img_feat(fname)

        # text input
        input_ids = txt_dump['input_ids']
        input_ids = [self.cls_] + input_ids + [self.sep]
        attn_masks = [1] * len(input_ids)
        position_ids = list(range(len(input_ids)))
        attn_masks += [1] * num_bb
        input_ids = torch.tensor(input_ids)
        position_ids = torch.tensor(position_ids)
        attn_masks = torch.tensor(attn_masks)

        # target bbox
        img = self.Images[image_id]
        assert len(img['ann_ids']) == num_bb, \
            'Please use visual_grounding_coco_gt'
        target = img['ann_ids'].index(txt_dump['ann_id'])
        target = torch.tensor([target])

        # obj_masks, to be padded with 1, for masking out non-object prob.
        obj_masks = torch.tensor([0]*len(img['ann_ids'])).bool()

        return (input_ids, position_ids, img_feat, img_pos_feat, attn_masks,
                obj_masks, target)


def re_collate(inputs):
    """
    Return:
    :input_ids : (n, max_L) padded with 0
    :position_ids : (n, max_L) padded with 0
    :txt_lens : list of [txt_len]
    :img_feat : (n, max_num_bb, feat_dim)
    :img_pos_feat : (n, max_num_bb, 7)
    :num_bbs : list of [num_bb]
    :attn_masks : (n, max_{L+num_bb}) padded with 0
    :obj_masks : (n, max_num_bb) padded with 1
    :targets : (n, )
    """
    (input_ids, position_ids, img_feats, img_pos_feats, attn_masks, obj_masks,
     targets) = map(list, unzip(inputs))

    txt_lens = [i.size(0) for i in input_ids]
    num_bbs = [f.size(0) for f in img_feats]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    position_ids = pad_sequence(position_ids,
                                batch_first=True, padding_value=0)
    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)
    targets = torch.cat(targets, dim=0)
    obj_masks = pad_sequence(obj_masks,
                             batch_first=True, padding_value=1).bool()

    batch_size = len(img_feats)
    num_bb = max(num_bbs)
    feat_dim = img_feats[0].size(1)
    pos_dim = img_pos_feats[0].size(1)
    img_feat = torch.zeros(batch_size, num_bb, feat_dim)
    img_pos_feat = torch.zeros(batch_size, num_bb, pos_dim)
    for i, (im, pos) in enumerate(zip(img_feats, img_pos_feats)):
        len_ = im.size(0)
        img_feat.data[i, :len_, :] = im.data
        img_pos_feat.data[i, :len_, :] = pos.data

    return (input_ids, position_ids, txt_lens,
            img_feat, img_pos_feat, num_bbs,
            attn_masks, obj_masks, targets)
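

# Illustrative sketch (not part of the original file): typical wiring of
# ReferringExpressionDataset with re_collate for training. The db/feature
# paths, batch size, and DataLoader settings are assumptions.
def _example_train_loader(db_dir='/db/refcoco_train',
                          feat_dir='/db/visual_grounding_coco_gt'):
    from torch.utils.data import DataLoader
    dataset = ReferringExpressionDataset(db_dir, ReImageFeatDir(feat_dir),
                                         max_txt_len=60)
    dataset.shuffle()  # reshuffle ref_ids (and hence sent_ids) between epochs
    return DataLoader(dataset, batch_size=32, collate_fn=re_collate)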


class ReferringExpressionEvalDataset(ReferringExpressionDataset):
    def __getitem__(self, i):
        """
        Return:
        :input_ids : (L, ), i.e., [cls, wd, wd, ..., sep, 0, 0]
        :position_ids : range(L)
        :img_feat : (num_bb, d)
        :img_pos_feat : (num_bb, 7)
        :attn_masks : (L+num_bb, ), i.e., [1, 1, ..., 0, 0, 1, 1]
        :obj_masks : (num_bb, ) all 0's
        :tgt_box : ndarray (4, ) xywh
        :obj_boxes : ndarray (num_bb, 4) xywh
        :sent_id
        """
        # {sent_id, sent, ref_id, ann_id, image_id,
        #  bbox, input_ids, toked_sent}
        sent_id = self.sent_ids[i]
        txt_dump = self.db[str(sent_id)]
        image_id = txt_dump['image_id']
        if isinstance(self.img_dir, ReImageFeatDir):
            if '_gt' in self.img_dir.img_dir:
                fname = f'visual_grounding_coco_gt_{int(image_id):012}.npz'
            elif '_det' in self.img_dir.img_dir:
                fname = f'visual_grounding_det_coco_{int(image_id):012}.npz'
        elif isinstance(self.img_dir, ReDetectFeatDir):
            fname = f'coco_train2014_{int(image_id):012}.npz'
        else:
            sys.exit('%s not supported.' % self.img_dir)
        img_feat, img_pos_feat, num_bb = self._get_img_feat(fname)

        # image info
        img = self.Images[image_id]
        im_width, im_height = img['width'], img['height']

        # object boxes, img_pos_feat (xyxywha) -> xywh
        obj_boxes = np.stack([img_pos_feat[:, 0]*im_width,
                              img_pos_feat[:, 1]*im_height,
                              img_pos_feat[:, 4]*im_width,
                              img_pos_feat[:, 5]*im_height], axis=1)
        obj_masks = torch.tensor([0]*num_bb).bool()

        # target box
        tgt_box = np.array(txt_dump['bbox'])  # xywh

        # text input
        input_ids = txt_dump['input_ids']
        input_ids = [self.cls_] + input_ids + [self.sep]
        attn_masks = [1] * len(input_ids)
        position_ids = list(range(len(input_ids)))
        attn_masks += [1] * num_bb
        input_ids = torch.tensor(input_ids)
        position_ids = torch.tensor(position_ids)
        attn_masks = torch.tensor(attn_masks)

        return (input_ids, position_ids, img_feat, img_pos_feat, attn_masks,
                obj_masks, tgt_box, obj_boxes, sent_id)

    # IoU function
    def computeIoU(self, box1, box2):
        # each box is of [x1, y1, w, h]
        inter_x1 = max(box1[0], box2[0])
        inter_y1 = max(box1[1], box2[1])
        inter_x2 = min(box1[0]+box1[2]-1, box2[0]+box2[2]-1)
        inter_y2 = min(box1[1]+box1[3]-1, box2[1]+box2[3]-1)
        if inter_x1 < inter_x2 and inter_y1 < inter_y2:
            inter = (inter_x2-inter_x1+1)*(inter_y2-inter_y1+1)
        else:
            inter = 0
        union = box1[2]*box1[3] + box2[2]*box2[3] - inter
        return float(inter)/union
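

# Worked example for computeIoU above (illustrative, not part of the original
# file). With the +1/-1 pixel convention, two 10x10 boxes offset by 5 pixels
# in both x and y, e.g. [0, 0, 10, 10] and [5, 5, 10, 10] in xywh, overlap in
# a 5x5 region:
#   inter = 5 * 5              = 25
#   union = 10*10 + 10*10 - 25 = 175
#   IoU   = 25 / 175           ≈ 0.14
# so under the usual IoU >= 0.5 criterion such a prediction would not count
# as correct.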


def re_eval_collate(inputs):
    """
    Return:
    :input_ids : (n, max_L)
    :position_ids : (n, max_L)
    :txt_lens : list of [txt_len]
    :img_feat : (n, max_num_bb, d)
    :img_pos_feat : (n, max_num_bb, 7)
    :num_bbs : list of [num_bb]
    :attn_masks : (n, max{L+num_bb})
    :obj_masks : (n, max_num_bb)
    :tgt_box : list of n [xywh]
    :obj_boxes : list of n [[xywh, xywh, ...]]
    :sent_ids : list of n [sent_id]
    """
    (input_ids, position_ids, img_feats, img_pos_feats, attn_masks, obj_masks,
     tgt_box, obj_boxes, sent_ids) = map(list, unzip(inputs))

    txt_lens = [i.size(0) for i in input_ids]
    num_bbs = [f.size(0) for f in img_feats]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    position_ids = pad_sequence(position_ids,
                                batch_first=True, padding_value=0)
    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)
    obj_masks = pad_sequence(obj_masks,
                             batch_first=True, padding_value=1).bool()

    batch_size = len(img_feats)
    num_bb = max(num_bbs)
    feat_dim = img_feats[0].size(1)
    pos_dim = img_pos_feats[0].size(1)
    img_feat = torch.zeros(batch_size, num_bb, feat_dim)
    img_pos_feat = torch.zeros(batch_size, num_bb, pos_dim)
    for i, (im, pos) in enumerate(zip(img_feats, img_pos_feats)):
        len_ = im.size(0)
        img_feat.data[i, :len_, :] = im.data
        img_pos_feat.data[i, :len_, :] = pos.data

    return (input_ids, position_ids, txt_lens,
            img_feat, img_pos_feat, num_bbs,
            attn_masks, obj_masks, tgt_box, obj_boxes, sent_ids)
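

# Illustrative sketch (not part of the original file): an evaluation loop over
# ReferringExpressionEvalDataset using re_eval_collate. `model` and the paths
# are hypothetical; `scores` is assumed to be one score per region, with
# padded regions suppressed via obj_masks before taking the argmax.
def _example_eval_accuracy(model, db_dir='/db/refcoco_val',
                           feat_dir='/db/visual_grounding_coco_gt'):
    from torch.utils.data import DataLoader
    dataset = ReferringExpressionEvalDataset(db_dir, ReImageFeatDir(feat_dir))
    loader = DataLoader(dataset, batch_size=32, collate_fn=re_eval_collate)
    n_correct, n_total = 0, 0
    for (input_ids, position_ids, txt_lens, img_feat, img_pos_feat, num_bbs,
         attn_masks, obj_masks, tgt_boxes, obj_boxes, sent_ids) in loader:
        # hypothetical model interface returning (n, max_num_bb) region scores
        scores = model(input_ids, position_ids, img_feat, img_pos_feat,
                       attn_masks)
        scores = scores.masked_fill(obj_masks, -1e4)  # mask padded regions
        preds = scores.argmax(dim=1).tolist()
        for pred, boxes, tgt in zip(preds, obj_boxes, tgt_boxes):
            n_correct += int(dataset.computeIoU(boxes[pred], tgt) >= 0.5)
            n_total += 1
    return n_correct / n_total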