""" MLM datasets """ import math import random import torch from torch.utils.data import Dataset from torch.nn.utils.rnn import pad_sequence from toolz.sandbox import unzip from .data import (DetectFeatTxtTokDataset, TxtTokLmdb, get_ids_and_lens, pad_tensors, get_gather_index) def random_word(tokens, vocab_range, mask): """ Masking some random tokens for Language Model task with probabilities as in the original BERT paper. :param tokens: list of int, tokenized sentence. :param vocab_range: for choosing a random word :return: (list of int, list of int), masked tokens and related labels for LM prediction """ output_label = [] for i, token in enumerate(tokens): prob = random.random() # mask token with 15% probability if prob < 0.15: prob /= 0.15 # 80% randomly change token to mask token if prob < 0.8: tokens[i] = mask # 10% randomly change token to random token elif prob < 0.9: tokens[i] = random.choice(list(range(*vocab_range))) # -> rest 10% randomly keep current token # append current token to output (we will predict these later) output_label.append(token) else: # no masking token (will be ignored by loss function later) output_label.append(-1) if all(o == -1 for o in output_label): # at least mask 1 output_label[0] = tokens[0] tokens[0] = mask return tokens, output_label class MlmDataset(DetectFeatTxtTokDataset): def __init__(self, txt_db, img_db): assert isinstance(txt_db, TxtTokLmdb) super().__init__(txt_db, img_db) def __getitem__(self, i): """ Return: - input_ids : (L, ), i.e., [cls, wd, wd, ..., sep, 0, 0], 0s padded - img_feat : (num_bb, d) - img_pos_feat : (num_bb, 7) - attn_masks : (L + num_bb, ), ie., [1, 1, ..., 0, 0, 1, 1] - txt_labels : (L, ), [-1, -1, wid, -1, -1, -1] 0's padded so that (L + num_bb) % 8 == 0 """ example = super().__getitem__(i) # text input input_ids, txt_labels = self.create_mlm_io(example['input_ids']) # img input img_feat, img_pos_feat, num_bb = self._get_img_feat( example['img_fname']) attn_masks = torch.ones(len(input_ids) + num_bb, dtype=torch.long) return input_ids, img_feat, img_pos_feat, attn_masks, txt_labels def create_mlm_io(self, input_ids): input_ids, txt_labels = random_word(input_ids, self.txt_db.v_range, self.txt_db.mask) input_ids = torch.tensor([self.txt_db.cls_] + input_ids + [self.txt_db.sep]) txt_labels = torch.tensor([-1] + txt_labels + [-1]) return input_ids, txt_labels def mlm_collate(inputs): """ Return: :input_ids (n, max_L) padded with 0 :position_ids (n, max_L) padded with 0 :txt_lens list of [txt_len] :img_feat (n, max_num_bb, feat_dim) :img_pos_feat (n, max_num_bb, 7) :num_bbs list of [num_bb] :attn_masks (n, max_{L + num_bb}) padded with 0 :txt_labels (n, max_L) padded with -1 """ (input_ids, img_feats, img_pos_feats, attn_masks, txt_labels ) = map(list, unzip(inputs)) # text batches txt_lens = [i.size(0) for i in input_ids] input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0) txt_labels = pad_sequence(txt_labels, batch_first=True, padding_value=-1) position_ids = torch.arange(0, input_ids.size(1), dtype=torch.long ).unsqueeze(0) # image batches num_bbs = [f.size(0) for f in img_feats] img_feat = pad_tensors(img_feats, num_bbs) img_pos_feat = pad_tensors(img_pos_feats, num_bbs) attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0) bs, max_tl = input_ids.size() out_size = attn_masks.size(1) gather_index = get_gather_index(txt_lens, num_bbs, bs, max_tl, out_size) batch = {'input_ids': input_ids, 'position_ids': position_ids, 'img_feat': img_feat, 'img_pos_feat': img_pos_feat, 'attn_masks': 


class BlindMlmDataset(Dataset):
    def __init__(self, txt_db):
        assert isinstance(txt_db, TxtTokLmdb)
        self.txt_db = txt_db
        self.lens, self.ids = get_ids_and_lens(txt_db)

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, i):
        id_ = self.ids[i]
        example = self.txt_db[id_]
        # text-only masking: reuse MlmDataset.create_mlm_io, which only
        # depends on self.txt_db
        input_ids, txt_labels = MlmDataset.create_mlm_io(
            self, example['input_ids'])
        attn_masks = torch.ones(len(input_ids), dtype=torch.long)

        return input_ids, attn_masks, txt_labels


def mlm_blind_collate(inputs):
    input_ids, attn_masks, txt_labels = map(list, unzip(inputs))

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    position_ids = torch.arange(0, input_ids.size(1), dtype=torch.long
                                ).unsqueeze(0)
    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)
    txt_labels = pad_sequence(txt_labels, batch_first=True, padding_value=-1)

    batch = {'input_ids': input_ids,
             'position_ids': position_ids,
             'attn_masks': attn_masks,
             'txt_labels': txt_labels}
    return batch


def eval_mask(len_, num_samples=7):
    """ build the masks for evaluating MLM:
    circularly mask 1 word out of every `num_samples` words
    """
    # build the random masks
    if len_ <= num_samples:
        masks = torch.eye(len_).bool()
        num_samples = len_
    else:
        mask_inds = [list(range(i, len_, num_samples))
                     for i in range(num_samples)]
        masks = torch.zeros(num_samples, len_).bool()
        for i, indices in enumerate(mask_inds):
            for j in indices:
                masks.data[i, j] = 1
    assert (masks.sum(dim=0) != torch.ones(len_).long()).sum().item() == 0
    assert masks.sum().item() == len_
    return masks


def eval_gather_inds(len_, num_samples=7):
    """ get the gather indices """
    inds = torch.arange(0, num_samples, dtype=torch.long)
    mul = math.ceil(len_ / num_samples)
    output = inds.repeat(mul)[:len_]
    return output
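

# Illustrative sketch (not part of the original module): what eval_mask and
# eval_gather_inds produce for a 10-token sentence with the default
# num_samples=7.  Every position is masked in exactly one of the 7 copies,
# and the gather indices record which copy masks each position.
def _demo_eval_mask():
    masks = eval_mask(10)                            # (7, 10) boolean
    assert masks.shape == (7, 10)
    assert masks.sum(dim=0).tolist() == [1] * 10     # each position masked once
    inds = eval_gather_inds(10)                      # copy index per position
    assert inds.tolist() == [0, 1, 2, 3, 4, 5, 6, 0, 1, 2]
    return masks, inds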


def stack_pad_tensors(tensors, lens=None, ns=None, pad=0):
    """N x [B_i, T, ...]"""
    if ns is None:
        ns = [t.size(0) for t in tensors]
    if lens is None:
        lens = [t.size(1) for t in tensors]
    max_len = max(lens)
    bs = sum(ns)
    hid_dims = tensors[0].size()[2:]
    dtype = tensors[0].dtype
    output = torch.zeros(bs, max_len, *hid_dims, dtype=dtype)
    if pad:
        output.data.fill_(pad)
    i = 0
    for t, l, n in zip(tensors, lens, ns):
        output.data[i:i+n, :l, ...] = t.data
        i += n
    return output


def expand_tensors(tensors, ns):
    return [t.unsqueeze(0).expand(n, *tuple([-1]*t.dim()))
            for t, n in zip(tensors, ns)]


class MlmEvalDataset(DetectFeatTxtTokDataset):
    """ For evaluating MLM training task """
    def __init__(self, txt_db, img_db):
        assert isinstance(txt_db, TxtTokLmdb)
        super().__init__(txt_db, img_db)

    def __getitem__(self, i):
        example = super().__getitem__(i)

        # text input
        (input_ids, txt_labels, gather_inds
         ) = self.create_mlm_eval_io(example['input_ids'])

        # img input
        img_feat, img_pos_feat, num_bb = self._get_img_feat(
            example['img_fname'])

        attn_masks = torch.ones(input_ids.size(1) + num_bb, dtype=torch.long)

        return (input_ids, img_feat, img_pos_feat, attn_masks,
                txt_labels, gather_inds)

    def create_mlm_eval_io(self, input_ids):
        txt_labels = torch.tensor(input_ids)
        masks = eval_mask(len(input_ids))
        n_mask = masks.size(0)
        masks = torch.cat([torch.zeros(n_mask, 1).bool(),
                           masks,
                           torch.zeros(n_mask, 1).bool()],
                          dim=1)
        input_ids = torch.tensor([[self.txt_db.cls_]
                                  + input_ids
                                  + [self.txt_db.sep]
                                  for _ in range(n_mask)])
        input_ids.data.masked_fill_(masks, self.txt_db.mask)

        gather_inds = eval_gather_inds(len(txt_labels))
        return input_ids, txt_labels, gather_inds


def _batch_gather_tgt(gather_inds, n_masks):
    gather_tgts = []
    offset = 0
    for g, n in zip(gather_inds, n_masks):
        gather_tgts.append(g + offset)
        offset += n
    gather_tgt = pad_sequence(gather_tgts,
                              batch_first=True, padding_value=0)
    return gather_tgt


def mlm_eval_collate(inputs):
    (input_ids, img_feats, img_pos_feats, attn_masks,
     txt_labels, gather_inds) = map(list, unzip(inputs))

    # sizes
    n_masks, txt_lens = map(list, unzip(i.size() for i in input_ids))

    # text batches
    input_ids = stack_pad_tensors(input_ids, txt_lens, n_masks)
    position_ids = torch.arange(0, input_ids.size(1), dtype=torch.long
                                ).unsqueeze(0)
    txt_labels = pad_sequence(txt_labels, batch_first=True, padding_value=-1)
    gather_tgt = _batch_gather_tgt(gather_inds, n_masks)

    # image batches
    num_bbs = [f.size(0) for f in img_feats]
    img_feat = stack_pad_tensors(expand_tensors(img_feats, n_masks),
                                 num_bbs, n_masks)
    img_pos_feat = stack_pad_tensors(expand_tensors(img_pos_feats, n_masks),
                                     num_bbs, n_masks)

    bs, max_tl = input_ids.size()
    attn_masks = stack_pad_tensors(expand_tensors(attn_masks, n_masks),
                                   None, n_masks)
    out_size = attn_masks.size(1)
    # repeat txt_lens, num_bbs
    txt_lens = [l for l, n in zip(txt_lens, n_masks) for _ in range(n)]
    num_bbs = [b for b, n in zip(num_bbs, n_masks) for _ in range(n)]
    gather_index = get_gather_index(txt_lens, num_bbs, bs, max_tl, out_size)

    batch = {'input_ids': input_ids,
             'position_ids': position_ids,
             'img_feat': img_feat,
             'img_pos_feat': img_pos_feat,
             'attn_masks': attn_masks,
             'gather_index': gather_index,
             'gather_tgt': gather_tgt,
             'txt_labels': txt_labels}
    return batch
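

# Illustrative sketch (not part of the original module): how expand_tensors
# and stack_pad_tensors cooperate in mlm_eval_collate.  Each per-example
# feature tensor is replicated once per masked copy, then the variable-length
# copies are stacked into one zero-padded batch.  The shapes below are toy
# numbers, not taken from a real database.
def _demo_stack_pad():
    feats = [torch.ones(3, 4), torch.full((5, 4), 2.)]   # 3 and 5 boxes
    n_masks = [2, 3]                                     # masked copies per example
    expanded = expand_tensors(feats, n_masks)            # (2, 3, 4) and (3, 5, 4)
    out = stack_pad_tensors(expanded, lens=[3, 5], ns=n_masks)
    assert out.shape == (5, 5, 4)       # sum(n_masks) x max_num_bb x feat_dim
    assert out[0, 3:].eq(0).all()       # first example zero-padded past 3 boxes
    return out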


class BlindMlmEvalDataset(Dataset):
    def __init__(self, txt_db):
        assert isinstance(txt_db, TxtTokLmdb)
        self.txt_db = txt_db
        self.lens, self.ids = get_ids_and_lens(txt_db)

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, i):
        id_ = self.ids[i]
        example = self.txt_db[id_]

        # text input: reuse MlmEvalDataset's masking, which only depends on
        # self.txt_db
        (input_ids, txt_labels, gather_inds
         ) = MlmEvalDataset.create_mlm_eval_io(self, example['input_ids'])
        # input_ids is (n_mask, L+2); the attention mask covers the text length
        attn_masks = torch.ones(input_ids.size(1), dtype=torch.long)

        return input_ids, attn_masks, txt_labels, gather_inds


def mlm_blind_eval_collate(inputs):
    (input_ids, attn_masks, txt_labels, gather_inds
     ) = map(list, unzip(inputs))

    # sizes
    n_masks, txt_lens = map(list, unzip(i.size() for i in input_ids))

    # text batches
    input_ids = stack_pad_tensors(input_ids, txt_lens, n_masks)
    position_ids = torch.arange(0, input_ids.size(1), dtype=torch.long
                                ).unsqueeze(0)
    attn_masks = stack_pad_tensors(expand_tensors(attn_masks, n_masks),
                                   None, n_masks)
    txt_labels = pad_sequence(txt_labels, batch_first=True, padding_value=-1)
    gather_tgt = _batch_gather_tgt(gather_inds, n_masks)

    batch = {'input_ids': input_ids,
             'position_ids': position_ids,
             'attn_masks': attn_masks,
             'gather_tgt': gather_tgt,
             'txt_labels': txt_labels}
    return batch
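

# Illustrative sketch (not part of the original module): how the circular
# evaluation pieces fit together for a toy batch of two sentences with 5 and
# 10 tokens.  gather_tgt maps each token position to the row, within the
# stacked batch of sum(n_masks) masked copies, whose mask covers it; rows of
# the second example are offset by the 5 copies of the first example.
def _demo_eval_gather():
    lens = [5, 10]
    n_masks = [eval_mask(l).size(0) for l in lens]      # [5, 7]
    gather_inds = [eval_gather_inds(l) for l in lens]
    gather_tgt = _batch_gather_tgt(gather_inds, n_masks)
    assert gather_tgt.tolist() == [
        [0, 1, 2, 3, 4, 0, 0, 0, 0, 0],                 # padded with 0
        [5, 6, 7, 8, 9, 10, 11, 5, 6, 7],
    ]
    return gather_tgt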