"""
preprocess COCO annotations into LMDB
"""
import argparse
from collections import defaultdict
import json
import os
from os.path import basename, exists
import pickle
import re
from cytoolz import curry
from tqdm import tqdm
from pytorch_pretrained_bert import BertTokenizer
from utils.vqa import compute_target
from utils.visual_entailment import compute_target as compute_target_ve
from data.data import open_lmdb


IN_WORD = '@@'


@curry
def bert_tokenize(tokenizer, text):
""" reconstructable tokenization for possible generation """
if text == ('this house is leaning out to wards '
'the road taken in cambridge@ @@@@'):
# SBU special case
text = text.replace('@@', '')
assert IN_WORD not in text
ids = []
words = []
for word in text.strip().split():
ws = tokenizer.tokenize(word)
if not ws:
# some special char in conceptual caption
continue
words.append(ws[0])
for w in ws[1:]:
words.append(f'{IN_WORD}{w}')
ids.extend(tokenizer.convert_tokens_to_ids(ws))
return ids, words


@curry
def bert_tokenize_for_vcr(tokenizer, special_tokens, text, txt_region_tokens):
""" reconstructable tokenization for possible generation """
assert IN_WORD not in text
ids = []
words = []
special_tokens_dict = {val: ind for ind, val in enumerate(special_tokens)}
toked_txt_region_tokens = []
index = 0
for word in text.strip().split():
if word in special_tokens_dict:
words.append(word)
            ids.append(len(tokenizer.vocab) + special_tokens_dict[word])
toked_txt_region_tokens.append(txt_region_tokens[index])
else:
ws = tokenizer.tokenize(word)
words.append(ws[0])
toked_txt_region_tokens.append(txt_region_tokens[index])
for w in ws[1:]:
words.append(f'{IN_WORD}{w}')
toked_txt_region_tokens.append(txt_region_tokens[index])
ids.extend(tokenizer.convert_tokens_to_ids(ws))
index += 1
return ids, words, toked_txt_region_tokens


def _norm_text(text):
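    """ lowercase, strip punctuation and split words on '-' and '/' """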
norm_text = re.sub(r"([.,'!?\"()*#:;])", '', text.lower()
).replace('-', ' ').replace('/', ' ')
return norm_text


def make_word2id(texts):
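    """ build word -> id vocab over normalized texts (0: PAD, 1: UNK) """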
word2id = {'PAD': 0, 'UNK': 1}
for text in texts:
for w in _norm_text(text).split():
if w not in word2id:
word2id[w] = len(word2id)
return word2id


def gen_vqa_texts(annotation):
questions = json.load(open(annotation))['questions']
for q in questions:
yield q['question']


def gen_ve_texts(annotation):
contents = open(annotation, "r").read()
hypotheses = [json.loads(str(item))
for item in contents.strip().split('\n')]
for h in hypotheses:
yield h['sentence2']


def gen_itm_texts(annotation):
data = json.load(open(annotation))
for q in data:
for s in q["sentences"]:
yield s['raw']


@curry
def _get_coco_fname(id_, split):
fname = f'coco_{split}_{id_:012}.npz'
return fname


def _get_vg_fname(id_):
fname = f'vg_{int(id_):012}.npz'
return fname


def _get_gqa_fname(id_):
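    # GQA image ids are either plain integers or 'n'-prefixed strings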
if "n" not in id_:
fname = f'gqa_{int(id_):012}.npz'
else:
fname = f'gqa_{id_}.npz'
return fname


def _get_flickr_fname(id_):
fname = f'flickr30k_{id_:012}.npz'
return fname


def _get_vcr_fname(id_, split):
fname_gt = f'vcr_gt_{split}_{id_}.npz'
fname = f'vcr_{split}_{id_}.npz'
return fname_gt, fname


def process_vqa(questions, answers, ans2label, db, tokenizer, split):
"""
Inputs:
- questions : [{image_id, question, question_id}]
- answers : [{answers, image_id, question_id,
question_type, answer_type}]
- ans2label : ans -> ans_id
- db
- tokenizer
- split
Return:
- id2len : qid -> tokenized question length
- txt2img : qid -> img(feature) filename
- img2txts : img(feature) filename -> [qid]
Besides, we write into db[qid]:
- toked_question : [tokens]
- input_ids : [wd_ids]
- img_fname : img(feature) filename
- target : {labels, scores}
"""
id2len = {}
txt2img = {}
img2txts = defaultdict(list)
if split == 'vg':
get_img_fname = _get_vg_fname
elif split == 'gqa':
get_img_fname = _get_gqa_fname
else:
get_img_fname = _get_coco_fname(split=split)
for q in tqdm(questions, desc='processing VQA questions'):
qid = str(q['question_id'])
input_ids, toked_question = tokenizer(q['question'])
id2len[qid] = len(input_ids)
img_fname = get_img_fname(q['image_id'])
txt2img[qid] = img_fname
img2txts[img_fname].append(qid)
q['toked_question'] = toked_question
q['input_ids'] = input_ids
q['img_fname'] = img_fname
db[qid] = q
if answers is not None:
for a in tqdm(answers, desc='processing VQA answers'):
qid = str(a['question_id'])
q = db[qid]
assert q['question_id'] == a['question_id']
assert q['image_id'] == a['image_id']
for k, v in a.items():
q[k] = v
q['target'] = compute_target(a['answers'], ans2label)
db[qid] = q
return id2len, txt2img, img2txts


def process_referring_expressions(refs, instances, iid_to_ann_ids,
db, tokenizer, split):
"""
Inputs:
- refs: [ref_id, ann_id, image_id, split, sent_ids, sentences]
- instances: {images, annotations, categories}
- iid_to_ann_ids: image_id -> ann_ids ordered by extracted butd features
Return:
- id2len : sent_id -> tokenized question length
- images : [{id, file_name, ann_ids, height, width} ]
- annotations: [{id, area, bbox, image_id, category_id, iscrowd}]
- categories : [{id, name, supercategory}]
"""
# images within split
    image_set = {ref['image_id'] for ref in refs if ref['split'] == split}
images = []
for img in instances['images']:
if img['id'] in image_set:
images.append({'id': img['id'], 'file_name': img['file_name'],
'ann_ids': iid_to_ann_ids[str(img['id'])],
'height': img['height'], 'width': img['width']})
# anns within split
annotations = []
for ann in instances['annotations']:
if ann['image_id'] in image_set:
annotations.append({
'id': ann['id'], 'area': ann['area'], 'bbox': ann['bbox'],
'image_id': ann['image_id'], 'category_id': ann['category_id'],
'iscrowd': ann['iscrowd']
})
Anns = {ann['id']: ann for ann in annotations}
# category info
categories = instances['categories']
# refs within split
refs = [ref for ref in refs if ref['split'] == split]
id2len = {}
for ref in tqdm(refs, desc='processing referring expressions'):
ref_id = ref['ref_id']
ann_id = ref['ann_id']
image_id = ref['image_id']
for sent in ref['sentences']:
sent_id = sent['sent_id']
input_ids, toked_sent = tokenizer(sent['sent'])
id2len[str(sent_id)] = len(input_ids)
db[str(sent_id)] = {
'sent_id': sent_id, 'sent': sent['sent'],
'ref_id': ref_id, 'ann_id': ann_id, 'image_id': image_id,
'bbox': Anns[ann_id]['bbox'],
'input_ids': input_ids, 'toked_sent': toked_sent}
return id2len, images, annotations, categories, refs


def process_gqa(questions, db, tokenizer, split):
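    """
    Inputs:
    - questions : qid -> {question, imageId, fullAnswer, ...}
    - db
    - tokenizer
    - split
    Return:
    - id2len : qid -> tokenized question + full answer length
    - txt2img : qid -> img(feature) filename
    - img2txts : img(feature) filename -> [qid]
    """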
id2len = {}
txt2img = {}
img2txts = defaultdict(list)
get_img_fname = _get_gqa_fname
for qid, q in tqdm(questions.items(),
desc=f'processing GQA_{split} questions'):
input_ids, toked_question = tokenizer(q['question'])
id2len[qid] = len(input_ids)
img_fname = get_img_fname(q['imageId'])
txt2img[qid] = img_fname
img2txts[img_fname].append(qid)
q['toked_question'] = toked_question
q['input_ids'] = input_ids
q['img_fname'] = img_fname
input_ids_a, toked_a = tokenizer(q['fullAnswer'])
id2len[qid] += len(input_ids_a)
q['input_ids_a'] = input_ids_a
q['toked_answers'] = toked_a
db[qid] = q
return id2len, txt2img, img2txts


def process_nlvr2(jsonl, db, tokenizer, imgs=None):
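    """
    Each JSONL example pairs one sentence with two images (img0/img1);
    target is 1 iff label == 'True'. When `imgs` is given, examples
    whose image features were not extracted are skipped.
    """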
id2len = {}
txt2img = {} # not sure if useful
img2txts = defaultdict(list) # not sure if useful
for line in tqdm(jsonl, desc='processing NLVR2'):
example = json.loads(line)
id_ = example['identifier']
img_id = '-'.join(id_.split('-')[:-1])
img_fname = (f'nlvr2_{img_id}-img0.npz', f'nlvr2_{img_id}-img1.npz')
if imgs is not None:
if not all(img in imgs for img in img_fname):
continue
input_ids, toked_question = tokenizer(example['sentence'])
target = 1 if example['label'] == 'True' else 0
id2len[id_] = len(input_ids)
txt2img[id_] = img_fname
for fname in img_fname:
img2txts[fname].append(id_)
example['toked_question'] = toked_question
example['input_ids'] = input_ids
example['img_fname'] = img_fname
example['target'] = target
db[id_] = example
return id2len, txt2img, img2txts


def process_visual_entailment(hypotheses, ans2label, db, tokenizer):
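    """
    Inputs:
    - hypotheses : [{pairID, Flikr30kID, sentence2, gold_label, ...}]
    - ans2label : entailment label -> label id
    Return:
    - id2len : pairID -> tokenized hypothesis length
    - txt2img : pairID -> img(feature) filename
    - img2txts : img(feature) filename -> [pairID]
    """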
id2len = {}
txt2img = {}
img2txts = defaultdict(list)
    for h in tqdm(hypotheses,
                  desc='processing visual entailment hypotheses'):
hid = h['pairID']
h['image_id'] = int(h["Flikr30kID"].split(".")[0])
input_ids, toked_hypothesis = tokenizer(h['sentence2'])
id2len[hid] = len(input_ids)
img_fname = _get_flickr_fname(h['image_id'])
txt2img[hid] = img_fname
img2txts[img_fname].append(hid)
h['toked_hypothesis'] = toked_hypothesis
h['input_ids'] = input_ids
h['target'] = compute_target_ve([h['gold_label']], ans2label)
h['img_fname'] = img_fname
db[hid] = h
return id2len, txt2img, img2txts


def process_caption(data, db, tokenizer, split):
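    """ COCO captions: data['annotations'] = [{id, image_id, caption}] """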
id2len = {}
txt2img = {}
img2txts = defaultdict(list)
for q in tqdm(data['annotations'], desc='processing COCO captions'):
id_ = str(q['id'])
input_ids, toked_caption = tokenizer(q['caption'])
id2len[id_] = len(input_ids)
img_fname = _get_coco_fname(q['image_id'], split)
txt2img[id_] = img_fname
img2txts[img_fname].append(id_)
q['toked_caption'] = toked_caption
q['input_ids'] = input_ids
q['img_fname'] = img_fname
db[id_] = q
return id2len, txt2img, img2txts


def process_conceptual_caption(tsv, imgs, db, tokenizer, split):
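    """
    Each TSV row holds (id, <unused>, caption, download status); rows
    marked 'fail' and rows without extracted image features are skipped.
    """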
id2len = {}
txt2img = {}
img2txts = defaultdict(list)
for line in tqdm(tsv, desc='processing conceptual captions'):
fields = line.strip().split('\t')
assert len(fields) == 4
id_, _, caption, success = fields
if success == 'fail':
continue
assert success == 'success'
input_ids, toked_caption = tokenizer(caption)
assert input_ids # safeguard for empty text
img_fname = f'gcc_{split}_{int(id_):012}.npz'
if img_fname not in imgs:
continue
id2len[id_] = len(input_ids)
txt2img[id_] = img_fname
img2txts[img_fname].append(id_)
db[id_] = {'id': id_,
'toked_caption': toked_caption,
'input_ids': input_ids,
'img_fname': img_fname}
return id2len, txt2img, img2txts


def process_sbu_caption(data, db, tokenizer):
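    """
    Inputs:
    - data : [{iid, sent, file_path, ...}]
    One known-corrupted image (0347/565.jpg) is skipped and numeric ids
    are normalized to match the extracted feature filenames.
    """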
id2len = {}
txt2img = {}
img2txts = defaultdict(list)
for ex in tqdm(data, desc='processing SBU captions'):
if ex['file_path'] == '0347/565.jpg':
# special case for corrupted image
continue
id_ = ex['iid']
input_ids, toked_caption = tokenizer(ex['sent'])
assert input_ids # safeguard for empty text
try:
# FIXME sbu feature extraction bug
id_ = str(int(id_))
except ValueError:
pass
img_fname = f'sbu_{id_}.npz'
id2len[id_] = len(input_ids)
txt2img[id_] = img_fname
img2txts[img_fname].append(id_)
db[id_] = {'id': id_,
'toked_caption': toked_caption,
'input_ids': input_ids,
'img_fname': img_fname}
return id2len, txt2img, img2txts


def process_image_text_retrieval(data, db, tokenizer, dataset, split):
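    """
    Inputs:
    - data : [{filename, sentences: [{raw, sentid, ...}], ...}]
    image_id is parsed from the image filename (COCO or Flickr30k).
    """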
id2len = {}
txt2img = {}
img2txts = defaultdict(list)
if dataset == 'coco':
_get_img_fname = _get_coco_fname(split=split)
elif dataset == 'flickr':
_get_img_fname = _get_flickr_fname
else:
raise ValueError('unrecognized data')
for q in tqdm(data, desc=f'processing image_text_retrieval for {split}'):
filename = q["filename"].split(".jpg")[0]
        if re.search('[a-zA-Z]', filename):
            image_id = int(filename.split("_")[-1])
        else:
            image_id = int(filename)
img_fname = _get_img_fname(image_id)
for s in q["sentences"]:
s['image_id'] = image_id
id_ = str(s['sentid'])
txt2img[id_] = img_fname
img2txts[img_fname].append(id_)
input_ids, toked_caption = tokenizer(s['raw'])
id2len[id_] = len(input_ids)
s['toked_caption'] = toked_caption
s['input_ids'] = input_ids
s['img_fname'] = img_fname
db[id_] = s
return id2len, txt2img, img2txts


def process_caption_licheng_cleaned(data, db, tokenizer, split="COCO"):
"""
Inputs:
- data : [{id, dataset, split, sent, bbox,
dataset_image_id, file_path}]
- db
- tokenizer
- split
Return:
- id2len : id -> tokenized caption length
    - txt2img : id -> img(feature) filename
- img2txts : img(feature) filename -> id(s)
We will also write to db[id]:
- image_id
- toked_caption : [tokens]
- input_ids : [wd_ids]
- img_fname : img(feature) filename
"""
id2len = {}
txt2img = {}
img2txts = defaultdict(list)
for q in tqdm(data, desc='processing licheng collected captions '
f'for split: {split}'):
id_ = str(q['id'])
input_ids, toked_caption = tokenizer(q['sent'])
id2len[id_] = len(input_ids)
if q['dataset'] == 'vg':
img_fname = _get_vg_fname(q['dataset_image_id'])
else:
assert q['dataset'] == 'coco'
img_split = basename(q['file_path']).split('_')[1]
img_fname = _get_coco_fname(q['dataset_image_id'], img_split)
txt2img[id_] = img_fname
img2txts[img_fname].append(id_)
q['image_id'] = q['dataset_image_id']
q['toked_caption'] = toked_caption
q['input_ids'] = input_ids
q['img_fname'] = img_fname
db[id_] = q
return id2len, txt2img, img2txts


def process_vcr_text(tokened_txt, objects, special_tokens):
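    """
    VCR text mixes plain words (str) with lists of object indices; each
    referenced object is replaced by its category name ('person' gets a
    per-index tag such as person_3, accumulated into special_tokens).
    Returns the joined text plus per-region and per-word region tokens
    (0 = no region, index + 1 otherwise).
    """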
text_region_tokens = []
image_region_tokens = [0]*len(objects)
words = []
for w in tokened_txt:
if isinstance(w, str):
word_splits = w.split(" ")
for splited_w in word_splits:
words.append(splited_w)
text_region_tokens.append(0)
else:
for index in w:
text_region_tokens.append(index+1)
image_region_tokens[index] = index+1
object_name = objects[index]
if "person" in object_name:
object_name = f"{object_name}_{index}"
if object_name not in special_tokens:
special_tokens.append(object_name)
words.append(object_name)
return " ".join(words), image_region_tokens, text_region_tokens


def process_vcr_obj_categories(objects, object2ids):
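    # category ids are shifted by 1, keeping 0 for the null region
    # (consistent with the 0 used in process_vcr_text)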
output_ids = []
for obj in objects:
output_ids.append(object2ids[obj]+1)
return output_ids


def process_vcr(data, db, tokenizer, split, object2ids):
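    """
    Return:
    - id2len_qa : annot_id -> (question + longest answer) * 4 token length
    - id2len_qar : annot_id -> id2len_qa + longest rationale token length
    - txt2img / img2txts : mappings to the vcr_gt_* and vcr_* feature files
    - special_tokens : accumulated object-name tokens
    """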
id2len_qa = {}
id2len_qar = {}
txt2img = {}
img2txts = defaultdict(list)
special_tokens = [f"person_{i}" for i in range(81)]
    for q in tqdm(data, desc=f'processing VCR {split} questions'):
        filename, _ = os.path.splitext(basename(q["img_fn"]))
        q["image_id"] = filename
        q['qa_target'] = q.get('answer_label', -1)
        q['qar_target'] = q.get('rationale_label', -1)
qid = str(q['annot_id'])
q["raw_q"], image_region_tokens, txt_region_tokens = process_vcr_text(
q["question"], q["objects"], special_tokens)
q["image_region_tokens"] = image_region_tokens
input_ids, toked_question, toked_txt_region_tokens = tokenizer(
special_tokens, q["raw_q"], txt_region_tokens)
object_ids = process_vcr_obj_categories(q["objects"], object2ids)
q["object_ids"] = object_ids
q['toked_question'] = toked_question
q['input_ids'] = input_ids
q['toked_txt_region_tokens'] = toked_txt_region_tokens
q["raw_as"] = []
q["raw_rs"] = []
img_fname_gt, img_fname = _get_vcr_fname(q['image_id'], split)
txt2img[qid] = [img_fname_gt, img_fname]
img2txts[img_fname].append(qid)
img2txts[img_fname_gt].append(qid)
input_ids_as = []
toked_as = []
input_ids_rs = []
toked_rs = []
toked_txt_region_tokens_a = []
toked_txt_region_tokens_r = []
max_qa_len = 0
for ans in q["answer_choices"]:
raw_ans, _, txt_region_tokens = process_vcr_text(
ans, q["objects"], special_tokens)
q["raw_as"].append(raw_ans)
input_ids_a, toked_a, toked_txt_region_tokens = tokenizer(
special_tokens, raw_ans, txt_region_tokens)
if len(input_ids_a) > max_qa_len:
max_qa_len = len(input_ids_a)
input_ids_as.append(input_ids_a)
toked_as.append(toked_a)
toked_txt_region_tokens_a.append(toked_txt_region_tokens)
id2len_qa[qid] = (len(input_ids)+max_qa_len)*4
max_r_len = 0
for r in q["rationale_choices"]:
raw_r, _, txt_region_tokens = process_vcr_text(
r, q["objects"], special_tokens)
q["raw_rs"].append(raw_r)
input_ids_r, toked_r, toked_txt_region_tokens = tokenizer(
special_tokens, raw_r, txt_region_tokens)
if len(input_ids_r) > max_r_len:
max_r_len = len(input_ids_r)
input_ids_rs.append(input_ids_r)
toked_rs.append(toked_r)
toked_txt_region_tokens_r.append(toked_txt_region_tokens)
id2len_qar[qid] = id2len_qa[qid]+max_r_len
q['img_fname'] = [img_fname_gt, img_fname]
q['toked_as'] = toked_as
q['toked_txt_region_tokens_a'] = toked_txt_region_tokens_a
q['input_ids_as'] = input_ids_as
q['toked_rs'] = toked_rs
q['input_ids_rs'] = input_ids_rs
q['toked_txt_region_tokens_r'] = toked_txt_region_tokens_r
db[qid] = q
return id2len_qa, id2len_qar, txt2img, img2txts, special_tokens


def _get_img_split(annotation):
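    """ infer the image-feature split from the annotation filename """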
for split in ['train2014', 'val2014', 'test2015', 'test-dev2015']:
if split in annotation:
img_split = split
break
else:
if ('vg' in annotation.lower()
or 'genome' in annotation.lower()):
img_split = 'vg'
elif 'gqa' in annotation.lower():
if ('test' in annotation.lower()
or 'submission' in annotation.lower()):
img_split = 'gqa'
else:
img_split = 'vg'
elif 'val' in annotation.lower():
img_split = 'val2014'
elif 'train' in annotation.lower():
img_split = 'train2014'
else:
raise ValueError('cannot identify split')
if img_split == 'test-dev2015':
img_split = 'test2015'
return img_split


def main(opts):
if not exists(opts.output):
os.makedirs(opts.output)
else:
raise ValueError('Found existing DB. Please explicitly remove '
'for re-processing')
meta = vars(opts)
toker = BertTokenizer.from_pretrained(
opts.bert, do_lower_case='uncased' in opts.bert)
tokenizer = bert_tokenize(toker)
meta['UNK'] = toker.convert_tokens_to_ids(['[UNK]'])[0]
meta['CLS'] = toker.convert_tokens_to_ids(['[CLS]'])[0]
meta['SEP'] = toker.convert_tokens_to_ids(['[SEP]'])[0]
meta['MASK'] = toker.convert_tokens_to_ids(['[MASK]'])[0]
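    # '!' is the first regular wordpiece in the BERT vocab (special and
    # unused tokens sit below it), so v_range spans the real word ids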
    meta['v_range'] = (toker.convert_tokens_to_ids(['!'])[0],
                       len(toker.vocab))
    with open(f'{opts.output}/meta.json', 'w') as f:
        json.dump(meta, f, indent=4)
output_field_name = ['id2len', 'txt2img', 'img2txts']
with open_lmdb(opts.output, readonly=False) as db:
if opts.task == 'vqa':
questions = json.load(open(opts.annotations[0]))['questions']
if len(opts.annotations) == 3:
answers = json.load(open(opts.annotations[1]))['annotations']
ans2label = pickle.load(open(opts.annotations[2], 'rb'))
with open(f'{opts.output}/ans2label.pkl', 'wb') as f:
pickle.dump(ans2label, f)
else:
answers = None
ans2label = None
# train2014, val2014
img_split = _get_img_split(opts.annotations[0])
jsons = process_vqa(questions, answers, ans2label,
db, tokenizer, img_split)
elif opts.task == 've':
contents = open(opts.annotations[0], "r").read()
hypotheses = [json.loads(str(item))
for item in contents.strip().split('\n')]
from utils.misc import VE_ENT2IDX
ans2label = VE_ENT2IDX
jsons = process_visual_entailment(
hypotheses, ans2label, db, tokenizer)
elif opts.task == 'caption':
data = json.load(open(opts.annotations[0]))
img_split = _get_img_split(opts.annotations[0])
jsons = process_caption(data, db, tokenizer, img_split)
elif opts.task == 'conceptual':
split = 'train' if 'train' in opts.annotations[0] else 'val'
imgs = set(json.load(open(opts.annotations[1])))
with open(opts.annotations[0]) as tsv:
jsons = process_conceptual_caption(tsv, imgs,
db, tokenizer, split)
elif opts.task == 'sbu':
data = json.load(open(opts.annotations[0]))
jsons = process_sbu_caption(data, db, tokenizer)
elif opts.task == 'itm':
data = json.load(open(opts.annotations[0]))
if 'coco' in opts.annotations[0].lower():
dataset = 'coco'
if 'train' in opts.annotations[0].lower():
split = 'train2014'
elif ('val' in opts.annotations[0].lower()
or 'test' in opts.annotations[0].lower()):
split = 'val2014'
else:
raise ValueError()
elif 'flickr' in opts.annotations[0].lower():
dataset = 'flickr'
split = None
else:
raise ValueError()
jsons = process_image_text_retrieval(
data, db, tokenizer, dataset, split)
elif opts.task == 'licheng_cleaned':
data = json.load(open(opts.annotations[0]))
jsons = process_caption_licheng_cleaned(
data, db, tokenizer,
split=opts.annotations[0].split(".")[0].split("/")[-1])
elif opts.task == 'gqa':
data = json.load(open(opts.annotations[0]))
data_split = opts.annotations[0].split(".")[0].split("/")[-1]
data_split = data_split.split("_")[0]
jsons = process_gqa(
data, db, tokenizer,
split=data_split)
elif opts.task == 'vcr':
data = []
with open(opts.annotations[0], "r") as f:
for line in f:
data.append(json.loads(line))
img_split = opts.annotations[0].split("/")[-1].split(".")[0]
tokenizer = bert_tokenize_for_vcr(toker)
ann_folder = "/".join(opts.annotations[0].split("/")[:-1])
object_categories_path = ann_folder+"/object_categories.json"
object_categories = json.load(open(object_categories_path, "r"))
jsons = process_vcr(data, db, tokenizer,
img_split, object_categories)
output_field_name = ['id2len_qa', 'id2len_qar', 'txt2img',
'img2txts', 'special_tokens']
elif opts.task == 'nlvr2':
with open(opts.annotations[0]) as ann:
if len(opts.annotations) == 2:
imgs = set(json.load(open(opts.annotations[1])))
else:
imgs = None
jsons = process_nlvr2(ann, db, tokenizer, imgs)
        elif opts.task == 're':
refs = pickle.load(open(opts.annotations[0], 'rb'))
instances = json.load(open(opts.annotations[1], 'r'))
iid_to_ann_ids = json.load(open(opts.annotations[2],
'r'))['iid_to_ann_ids']
# dirs/refcoco_testA_bert-base-cased.db -> testA
img_split = opts.output.split('/')[-1].split('_')[1]
jsons = process_referring_expressions(
refs, instances, iid_to_ann_ids, db, tokenizer, img_split)
output_field_name = ['id2len', 'images', 'annotations',
'categories', 'refs']
else:
raise ValueError()
for dump, name in zip(jsons, output_field_name):
with open(f'{opts.output}/{name}.json', 'w') as f:
json.dump(dump, f)


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--annotations', required=True, nargs='+',
help='annotation JSON')
parser.add_argument('--output', required=True,
help='output dir of DB')
parser.add_argument('--task', required=True,
choices=['vqa', 'caption',
've', "itm", "licheng_cleaned",
'vcr', 'nlvr2', 're', 'gqa',
'conceptual', 'sbu'])
parser.add_argument('--bert', default='bert-base-cased')
args = parser.parse_args()
if args.task == 'vqa':
assert len(args.annotations) == 3 or len(args.annotations) == 1
elif args.task == 'gqa':
assert len(args.annotations) == 1
elif args.task == 've':
assert len(args.annotations) == 1
elif args.task == 'itm':
assert len(args.annotations) == 1
elif args.task == 'licheng_cleaned':
assert len(args.annotations) == 1
elif args.task == 'caption':
assert len(args.annotations) == 1
elif args.task == 'vcr':
assert len(args.annotations) == 1
elif args.task == 'nlvr2':
assert len(args.annotations) == 1 or len(args.annotations) == 2
elif args.task == 'conceptual':
assert len(args.annotations) == 2 or len(args.annotations) == 1
elif args.task == 'sbu':
assert len(args.annotations) == 1
elif args.task == 're':
assert len(args.annotations) == 3
main(args)