# clip-caption-reward/configs/phase2/clipRN50_clips_grammar.yml
#
# Flat training configuration. The original file listed several keys twice
# (an inlined base value followed by an override). Duplicate keys are not
# allowed in YAML, and loaders that tolerate them keep the last occurrence,
# so each such key is written once here with its overriding (last) value.
---
# Model type and optimizer warm-up.
caption_model: transformer
# NOTE(review): presumably unused while `noamopt` is false (see the
# overrides near the end of this file) -- confirm against the trainer.
noamopt_warmup: 20000
label_smoothing: 0.0

# Dataset inputs.
input_json: data/cocotalk.json
input_label_h5: data/cocotalk_label.h5
input_fc_dir: data/cocotalk_clip_RN50_fc
input_att_dir: data/cocotalk_clip_RN50_att
input_clipscore_vis_dir: data/cocotalk_clipscore_vis
seq_per_img: 5
batch_size: 160

checkpoint_path: save/clipRN50_clips_grammar/clipRN50_clips_grammar

# Reward configuration.
use_multi_rewards: true
use_grammar: true
use_grammar_baseline: true
# clip_load_path: '/scratch-space/retrieval/save/clip_negative_text/clip_negative_text-epoch=10.ckpt'
clip_load_path: 'retrieval/save/clip_negative_text/clip_negative_text-epoch=12.ckpt'

# Notice: because I'm too lazy, I reuse the option names for RNNs to set the
# hyperparameters for the transformer:
#   N = num_layers
#   d_model = input_encoding_size
#   d_ff = rnn_size

# will be ignored
num_layers: 6
input_encoding_size: 512
rnn_size: 2048

# Transformer config
N_enc: 6
N_dec: 6
d_model: 512
d_ff: 2048
num_att_heads: 8
dropout: 0.1

scheduled_sampling_start: -1
save_checkpoint_every: 3000
language_eval: 1
val_images_use: 5000
train_sample_n: 5

REFORWARD: false

# _BASE_: transformer.yml
# Overrides of the inlined base settings. The shadowed base values that
# appeared earlier in the original file were:
#   noamopt: true
#   learning_rate: 0.0005
#   learning_rate_decay_start: 0
#   max_epochs: 15
reduce_on_plateau: false
noamopt: false
learning_rate: 0.000005
learning_rate_decay_start: -1

self_critical_after: 15
max_epochs: 40

verbose: false
precision: 32

use_clipscore: true
clipscore_reward_weight: 2.0