#!/usr/bin/env bash

# Strict mode:
#   -e           exit as soon as a command fails,
#   -u           treat expansion of an unset variable as an error,
#   -o pipefail  make a pipeline fail when any stage fails, not just the last.
set -euo pipefail

# Everything below uses paths relative to the ec-game checkout. The checkout
# itself is resolved against the caller's current working directory.
cd repo/ec-game
# Disabled manual step, kept for reference.
# NOTE(review): presumably a one-time download of the image features that
# train.py expects under ./data -- confirm before re-enabling.
# gdown --folder https://drive.google.com/drive/folders/1u5JHzt5U-w8Y0vBoT8-I9lacROXWUXs5
# mv image_features data

# For each dataset: run ec-game's train.py with --extract, then export the
# result into ../../data/<dataset>/ (relative to the current directory):
#   metadata.json  existing JSON document (or {}) with .metrics.system.acc set
#   corpus.jsonl   one JSON-encoded row of the extracted tensor per line
for dataset in cc coco_2014; do
  python train.py --dataset "$dataset" --name default --extract "$dataset"

  # Expand the glob into an array so the match count is known. Without
  # nullglob an unmatched pattern stays literal, hence the explicit -f check.
  ckpt_matches=(./ckpt/"${dataset}"_vocab_4035_seq_15_reset_-1_nlayers_1/run-default/model_*_999_4035.pt-"${dataset}".pt)
  pt_path=${ckpt_matches[0]}
  if [[ ! -f "$pt_path" ]]; then
    printf 'error: no extracted checkpoint found at %s\n' "$pt_path" >&2
    exit 1
  fi
  if ((${#ckpt_matches[@]} > 1)); then
    # Several checkpoints match; keep using the first one in glob order, but
    # make the ambiguity visible.
    printf 'warning: %d checkpoints match for %s; using %s\n' \
      "${#ckpt_matches[@]}" "$dataset" "$pt_path" >&2
  fi

  out_path=../../data/${dataset}
  mkdir -p -- "$out_path"

  # Start from an empty JSON object when the metadata file is missing or empty
  # so that jq always has a document to update.
  out_metadata=$out_path/metadata.json
  if ! [[ -s "$out_metadata" ]]; then
    echo '{}' >"$out_metadata"
  fi

  # The value stored as "acc" is the second underscore-separated field of the
  # checkpoint file name: model_<acc>_999_4035.pt-<dataset>.pt
  ckpt_name=${pt_path##*/}
  acc=${ckpt_name#*_}
  acc=${acc%%_*}

  # --argjson passes the value as JSON data instead of splicing it into the jq
  # program text; a field that is not valid JSON now fails loudly.
  # Write to a temporary file first: jq cannot safely update a file in place.
  jq --argjson acc "$acc" '.metrics.system.acc = $acc' \
    <"$out_metadata" >"$out_metadata.tmp"
  mv -- "$out_metadata.tmp" "$out_metadata"

  # Paths are handed to Python through argv and the heredoc delimiter is
  # quoted, so the shell never rewrites the Python source.
  python - "$pt_path" "$out_path/corpus.jsonl" <<'PY'
import json
import sys

import torch

src_path, dst_path = sys.argv[1], sys.argv[2]
corpus = torch.load(src_path).tolist()
with open(dst_path, "w") as fo:
    for row in corpus:
        fo.write(json.dumps(row, indent=None))
        fo.write("\n")
PY

done
