lightningdot/uniter_model/scripts/prepro_all.sh

TOKER=$1
TXT_DB=$2
FORMAT=$3
#TXT_DB='/ssd2/yenchun/TXT_DB_test'

ANNOTATIONS='/ssd2/yenchun/ANNOTATIONS'
VQA_ANN=$ANNOTATIONS/VQA/
CAP_ANN=$ANNOTATIONS/COCO_annotation/
CONCEPT_ANN=$ANNOTATIONS/conceptual_captions/
SBU_ANN=$ANNOTATIONS/sbu_caption/
PRETRAIN_ANN=$ANNOTATIONS/latest_cleaned/
ITM_ANN=$ANNOTATIONS/Image-Text-Matching
VE_ANN=$ANNOTATIONS/visual_entailment/
GQA_ANN=$ANNOTATIONS/GQA/
VCR_ANN=$ANNOTATIONS/VCR/
NLVR2_ANN=$ANNOTATIONS/NLVR2/


# process licheng's split
#python scripts/split_annotations.py --format $FORMAT \
#    $PRETRAIN_ANN/collected\(coco+vg\).json $PRETRAIN_ANN


if [ $TOKER = 'bert-large-cased' ]; then
    SUFFIX='large-cased'
elif [ $TOKER = 'bert-base-cased' ]; then
    SUFFIX='base-cased'
else
    echo "invalid tokenizer specified"
    exit(1)
fi

# Image Text Retrieval
for DSET in 'flickr30k' 'coco'; do
    for SPLIT in 'train' 'val' 'test'; do
        python prepro.py --task itm --bert $TOKER --format $FORMAT \
            --annotations $ITM_ANN/${DSET}_$SPLIT.json \
            --output $TXT_DB/itm_${DSET}_${SPLIT}_$SUFFIX.db

    done
done
# coco 1k splits
for SPLIT in 'val' 'test'; do
    for i in 0 1 2 3 4; do
        python prepro.py --task itm --bert $TOKER --format $FORMAT \
            --annotations $ITM_ANN/coco_${SPLIT}_1k_$i.json \
            --output $TXT_DB/itm_coco_${SPLIT}_1k_${i}_$SUFFIX.db
    done
done
# coco val rest
python prepro.py --task itm --bert $TOKER --format $FORMAT \
    --annotations $ITM_ANN/coco_restval.json \
    --output $TXT_DB/itm_coco_restval_$SUFFIX.db


# COCO
for SPLIT in 'train' 'val'; do
    # VQA
    python prepro.py --task vqa --bert $TOKER --format $FORMAT \
        --annotations $VQA_ANN/v2_OpenEnded_mscoco_${SPLIT}2014_questions.json \
            $VQA_ANN/v2_mscoco_${SPLIT}2014_annotations.json \
            $VQA_ANN/ans2label.pkl \
        --output $TXT_DB/vqa_${SPLIT}_$SUFFIX.db
    if [ $SPLIT = 'val' ]; then
        for SP in 'train' 'dev'; do
            python prepro.py --task vqa --bert $TOKER --format $FORMAT \
                --annotations $VQA_ANN/v2_OpenEnded_mscoco_${SP}val2014_questions.json \
                    $VQA_ANN/v2_mscoco_${SP}val2014_annotations.json \
                    $VQA_ANN/ans2label.pkl \
                --output $TXT_DB/vqa_${SP}val_$SUFFIX.db
        done
    fi

    # Caption
    python prepro.py --task caption --bert $TOKER --format $FORMAT \
        --annotations $CAP_ANN/captions_${SPLIT}2014.json \
        --output $TXT_DB/caption_${SPLIT}_$SUFFIX.db
done

# COCO VQA test
python prepro.py --task vqa --bert $TOKER --format $FORMAT \
    --annotations $VQA_ANN/v2_OpenEnded_mscoco_test2015_questions.json \
    --output $TXT_DB/vqa_test_$SUFFIX.db

# VG VQA
python prepro.py --task vqa --bert $TOKER --format $FORMAT \
    --annotations $VQA_ANN/VG_questions.json.mapped \
        $VQA_ANN/VG_annotations.json.mapped \
        $VQA_ANN/ans2label.pkl \
    --output $TXT_DB/vqa_vg_$SUFFIX.db

# all pretraining

# coco trainval
python prepro.py --task licheng_cleaned --bert $TOKER --format $FORMAT \
    --annotations $PRETRAIN_ANN/pretrain_caption_coco_trainval.json \
    --output $TXT_DB/pretrain_caption_coco_trainval_$SUFFIX.db

for DSET in 'coco' 'vg'; do
    for SPLIT in 'val' 'train'; do
        python prepro.py --task licheng_cleaned --bert $TOKER --format $FORMAT \
            --annotations $PRETRAIN_ANN/pretrain_caption_${DSET}_$SPLIT.json \
            --output $TXT_DB/pretrain_caption_${DSET}_${SPLIT}_$SUFFIX.db
    done
done

# pretrain VQA
for DSET in 'genome_vqa' 'gqa'; do
    if [ $DSET = 'genome_vqa' ]; then
        DS='vg'
    else
        DS='gqa'
    fi
    for SPLIT in 'val' 'train'; do
        python prepro.py --task vqa --bert $TOKER --format $FORMAT \
            --annotations $PRETRAIN_ANN/${DSET}_${SPLIT}_questions.json \
                $PRETRAIN_ANN/${DSET}_${SPLIT}_annotations.json \
                $PRETRAIN_ANN/ans2label.pkl \
            --output $TXT_DB/pretrain_vqa_${DS}_${SPLIT}_$SUFFIX.db
    done
done
# Pretrain VQA COCO
for SPLIT in 'val' 'trainsplit' 'valsplit' ; do
    python prepro.py --task vqa --bert $TOKER --format $FORMAT \
        --annotations $PRETRAIN_ANN/coco_vqa_${SPLIT}_questions.json \
            $PRETRAIN_ANN/coco_vqa_${SPLIT}_annotations.json \
            $PRETRAIN_ANN/ans2label.pkl \
        --output $TXT_DB/pretrain_vqa_coco_${SPLIT}_$SUFFIX.db
done


# Visual Entailment
for SPLIT in 'train' 'dev' 'test'; do
    python prepro.py --task ve --bert $TOKER --format $FORMAT \
        --annotations $VE_ANN/snli_ve_$SPLIT.jsonl \
        --output $TXT_DB/ve_${SPLIT}_$SUFFIX.db
done

# GQA
for SPLIT in 'train' 'val' 'testdev'; do
    for VER in 'all' 'balanced'; do
        python prepro.py --task vqa --bert $TOKER --format $FORMAT \
            --annotations $GQA_ANN/gqa_${SPLIT}_${VER}_questions.vqa.json \
                $GQA_ANN/gqa_${SPLIT}_${VER}_annotations.vqa.json \
                $GQA_ANN/ans2label.pkl \
            --output $TXT_DB/gqa_${SPLIT}_${VER}_$SUFFIX.db
    done
done
# GQA test
python prepro.py --task vqa --bert $TOKER --format $FORMAT \
    --annotations $GQA_ANN/gqa_submission_questions.vqa.json \
    --output $TXT_DB/gqa_submission_$SUFFIX.db


# Conceptual Captions
for SPLIT in 'train' 'val'; do
    python prepro.py --task conceptual --bert $TOKER --format $FORMAT \
        --annotations $CONCEPT_ANN/${SPLIT}_imageId2Ann.tsv \
            $CONCEPT_ANN/${SPLIT}_imgs.json \
        --output $TXT_DB/conceptual_caption_${SPLIT}_$SUFFIX.db
done

# SBU captions
for SPLIT in 'train' 'val'; do
    python prepro.py --task sbu --bert $TOKER --format $FORMAT \
        --annotations $SBU_ANN/sbu_${SPLIT}_captions.json \
        --output $TXT_DB/sbu_caption_${SPLIT}_$SUFFIX.db
done

# VCR
for SPLIT in 'train' 'val'; do
    python prepro.py --task vcr --bert $TOKER --format $FORMAT \
        --annotations $VCR_ANN/$SPLIT.jsonl \
        --output $TXT_DB/vcr_${SPLIT}_$SUFFIX.db
done

# NLVR2
for SPLIT in 'dev' 'test1'; do
    python prepro.py --task nlvr2 --bert $TOKER --format $FORMAT \
        --annotations $NLVR2_ANN/$SPLIT.json \
        --output $TXT_DB/nlvr2_${SPLIT}_$SUFFIX.db
done
# some corrupted train features
python prepro.py --task nlvr2 --bert $TOKER --format $FORMAT \
    --annotations $NLVR2_ANN/train.json $NLVR2_ANN/train_imgs.json \
    --output $TXT_DB/nlvr2_train_$SUFFIX.db
update the operator. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago			`TOKER=$1`
			`TXT_DB=$2`
			`FORMAT=$3`
			`#TXT_DB='/ssd2/yenchun/TXT_DB_test'`

			`ANNOTATIONS='/ssd2/yenchun/ANNOTATIONS'`
			`VQA_ANN=$ANNOTATIONS/VQA/`
			`CAP_ANN=$ANNOTATIONS/COCO_annotation/`
			`CONCEPT_ANN=$ANNOTATIONS/conceptual_captions/`
			`SBU_ANN=$ANNOTATIONS/sbu_caption/`
			`PRETRAIN_ANN=$ANNOTATIONS/latest_cleaned/`
			`ITM_ANN=$ANNOTATIONS/Image-Text-Matching`
			`VE_ANN=$ANNOTATIONS/visual_entailment/`
			`GQA_ANN=$ANNOTATIONS/GQA/`
			`VCR_ANN=$ANNOTATIONS/VCR/`
			`NLVR2_ANN=$ANNOTATIONS/NLVR2/`


			`# process licheng's split`
			`#python scripts/split_annotations.py --format $FORMAT \`
			`# $PRETRAIN_ANN/collected\(coco+vg\).json $PRETRAIN_ANN`


			`if [ $TOKER = 'bert-large-cased' ]; then`
			`SUFFIX='large-cased'`
			`elif [ $TOKER = 'bert-base-cased' ]; then`
			`SUFFIX='base-cased'`
			`else`
			`echo "invalid tokenizer specified"`
			`exit(1)`
			`fi`

			`# Image Text Retrieval`
			`for DSET in 'flickr30k' 'coco'; do`
			`for SPLIT in 'train' 'val' 'test'; do`
			`python prepro.py --task itm --bert $TOKER --format $FORMAT \`
			`--annotations $ITM_ANN/${DSET}_$SPLIT.json \`
			`--output $TXT_DB/itm_${DSET}_${SPLIT}_$SUFFIX.db`

			`done`
			`done`
			`# coco 1k splits`
			`for SPLIT in 'val' 'test'; do`
			`for i in 0 1 2 3 4; do`
			`python prepro.py --task itm --bert $TOKER --format $FORMAT \`
			`--annotations $ITM_ANN/coco_${SPLIT}_1k_$i.json \`
			`--output $TXT_DB/itm_coco_${SPLIT}_1k_${i}_$SUFFIX.db`
			`done`
			`done`
			`# coco val rest`
			`python prepro.py --task itm --bert $TOKER --format $FORMAT \`
			`--annotations $ITM_ANN/coco_restval.json \`
			`--output $TXT_DB/itm_coco_restval_$SUFFIX.db`


			`# COCO`
			`for SPLIT in 'train' 'val'; do`
			`# VQA`
			`python prepro.py --task vqa --bert $TOKER --format $FORMAT \`
			`--annotations $VQA_ANN/v2_OpenEnded_mscoco_${SPLIT}2014_questions.json \`
			`$VQA_ANN/v2_mscoco_${SPLIT}2014_annotations.json \`
			`$VQA_ANN/ans2label.pkl \`
			`--output $TXT_DB/vqa_${SPLIT}_$SUFFIX.db`
			`if [ $SPLIT = 'val' ]; then`
			`for SP in 'train' 'dev'; do`
			`python prepro.py --task vqa --bert $TOKER --format $FORMAT \`
			`--annotations $VQA_ANN/v2_OpenEnded_mscoco_${SP}val2014_questions.json \`
			`$VQA_ANN/v2_mscoco_${SP}val2014_annotations.json \`
			`$VQA_ANN/ans2label.pkl \`
			`--output $TXT_DB/vqa_${SP}val_$SUFFIX.db`
			`done`
			`fi`

			`# Caption`
			`python prepro.py --task caption --bert $TOKER --format $FORMAT \`
			`--annotations $CAP_ANN/captions_${SPLIT}2014.json \`
			`--output $TXT_DB/caption_${SPLIT}_$SUFFIX.db`
			`done`

			`# COCO VQA test`
			`python prepro.py --task vqa --bert $TOKER --format $FORMAT \`
			`--annotations $VQA_ANN/v2_OpenEnded_mscoco_test2015_questions.json \`
			`--output $TXT_DB/vqa_test_$SUFFIX.db`

			`# VG VQA`
			`python prepro.py --task vqa --bert $TOKER --format $FORMAT \`
			`--annotations $VQA_ANN/VG_questions.json.mapped \`
			`$VQA_ANN/VG_annotations.json.mapped \`
			`$VQA_ANN/ans2label.pkl \`
			`--output $TXT_DB/vqa_vg_$SUFFIX.db`

			`# all pretraining`

			`# coco trainval`
			`python prepro.py --task licheng_cleaned --bert $TOKER --format $FORMAT \`
			`--annotations $PRETRAIN_ANN/pretrain_caption_coco_trainval.json \`
			`--output $TXT_DB/pretrain_caption_coco_trainval_$SUFFIX.db`

			`for DSET in 'coco' 'vg'; do`
			`for SPLIT in 'val' 'train'; do`
			`python prepro.py --task licheng_cleaned --bert $TOKER --format $FORMAT \`
			`--annotations $PRETRAIN_ANN/pretrain_caption_${DSET}_$SPLIT.json \`
			`--output $TXT_DB/pretrain_caption_${DSET}_${SPLIT}_$SUFFIX.db`
			`done`
			`done`

			`# pretrain VQA`
			`for DSET in 'genome_vqa' 'gqa'; do`
			`if [ $DSET = 'genome_vqa' ]; then`
			`DS='vg'`
			`else`
			`DS='gqa'`
			`fi`
			`for SPLIT in 'val' 'train'; do`
			`python prepro.py --task vqa --bert $TOKER --format $FORMAT \`
			`--annotations $PRETRAIN_ANN/${DSET}_${SPLIT}_questions.json \`
			`$PRETRAIN_ANN/${DSET}_${SPLIT}_annotations.json \`
			`$PRETRAIN_ANN/ans2label.pkl \`
			`--output $TXT_DB/pretrain_vqa_${DS}_${SPLIT}_$SUFFIX.db`
			`done`
			`done`
			`# Pretrain VQA COCO`
			`for SPLIT in 'val' 'trainsplit' 'valsplit' ; do`
			`python prepro.py --task vqa --bert $TOKER --format $FORMAT \`
			`--annotations $PRETRAIN_ANN/coco_vqa_${SPLIT}_questions.json \`
			`$PRETRAIN_ANN/coco_vqa_${SPLIT}_annotations.json \`
			`$PRETRAIN_ANN/ans2label.pkl \`
			`--output $TXT_DB/pretrain_vqa_coco_${SPLIT}_$SUFFIX.db`
			`done`


			`# Visual Entailment`
			`for SPLIT in 'train' 'dev' 'test'; do`
			`python prepro.py --task ve --bert $TOKER --format $FORMAT \`
			`--annotations $VE_ANN/snli_ve_$SPLIT.jsonl \`
			`--output $TXT_DB/ve_${SPLIT}_$SUFFIX.db`
			`done`

			`# GQA`
			`for SPLIT in 'train' 'val' 'testdev'; do`
			`for VER in 'all' 'balanced'; do`
			`python prepro.py --task vqa --bert $TOKER --format $FORMAT \`
			`--annotations $GQA_ANN/gqa_${SPLIT}_${VER}_questions.vqa.json \`
			`$GQA_ANN/gqa_${SPLIT}_${VER}_annotations.vqa.json \`
			`$GQA_ANN/ans2label.pkl \`
			`--output $TXT_DB/gqa_${SPLIT}_${VER}_$SUFFIX.db`
			`done`
			`done`
			`# GQA test`
			`python prepro.py --task vqa --bert $TOKER --format $FORMAT \`
			`--annotations $GQA_ANN/gqa_submission_questions.vqa.json \`
			`--output $TXT_DB/gqa_submission_$SUFFIX.db`


			`# Conceptual Captions`
			`for SPLIT in 'train' 'val'; do`
			`python prepro.py --task conceptual --bert $TOKER --format $FORMAT \`
			`--annotations $CONCEPT_ANN/${SPLIT}_imageId2Ann.tsv \`
			`$CONCEPT_ANN/${SPLIT}_imgs.json \`
			`--output $TXT_DB/conceptual_caption_${SPLIT}_$SUFFIX.db`
			`done`

			`# SBU captions`
			`for SPLIT in 'train' 'val'; do`
			`python prepro.py --task sbu --bert $TOKER --format $FORMAT \`
			`--annotations $SBU_ANN/sbu_${SPLIT}_captions.json \`
			`--output $TXT_DB/sbu_caption_${SPLIT}_$SUFFIX.db`
			`done`

			`# VCR`
			`for SPLIT in 'train' 'val'; do`
			`python prepro.py --task vcr --bert $TOKER --format $FORMAT \`
			`--annotations $VCR_ANN/$SPLIT.jsonl \`
			`--output $TXT_DB/vcr_${SPLIT}_$SUFFIX.db`
			`done`

			`# NLVR2`
			`for SPLIT in 'dev' 'test1'; do`
			`python prepro.py --task nlvr2 --bert $TOKER --format $FORMAT \`
			`--annotations $NLVR2_ANN/$SPLIT.json \`
			`--output $TXT_DB/nlvr2_${SPLIT}_$SUFFIX.db`
			`done`
			`# some corrupted train features`
			`python prepro.py --task nlvr2 --bert $TOKER --format $FORMAT \`
			`--annotations $NLVR2_ANN/train.json $NLVR2_ANN/train_imgs.json \`
			`--output $TXT_DB/nlvr2_train_$SUFFIX.db`