"""Scripts that convert T5-model outputs to format that can be directly compared with `documents.jsonl`"""

from glob import glob
import json
import os
from collections import defaultdict
import fire


def _read_predictions(generations_path):
    """Group the predictions stored in a generations file by document name.

    Every non-blank line of the file is parsed as a JSON object from which
    the keys ``doc_id``, ``label_name`` and ``preds`` are read.  The document
    name is the part of ``doc_id`` that precedes the first ``__``.

    Args:
        generations_path: Path to a ``test_generations.txt`` file.

    Returns:
        A dict mapping document name to a list of ``(label_name, preds)``
        tuples, in the order they appear in the file.
    """
    predictions = defaultdict(list)
    with open(generations_path, encoding='utf-8') as raw_out:
        for raw_line in raw_out:
            if not raw_line.strip():
                # Tolerate blank lines instead of crashing in json.loads.
                continue
            record = json.loads(raw_line)
            doc_name = record['doc_id'].split('__')[0]
            predictions[doc_name].append((record['label_name'], record['preds']))
    return predictions


def _to_annotations(predictions):
    """Convert ``(label_name, preds)`` pairs into a list of annotation dicts.

    Trailing ``=`` characters are removed from the label, and ``preds`` is
    split on ``' | '`` into individual, whitespace-stripped values.
    """
    annotations = []
    for label, pred in predictions:
        values = [{'value': part.strip()} for part in pred.split(' | ')]
        annotations.append({'key': label.rstrip('='), 'values': values})
    return annotations


def main(data_dir, result_dir, out_dir_in_our_format='./baselines'):
    """Convert every ``test_generations.txt`` under ``result_dir`` to JSONL.

    Files are discovered with the pattern
    ``{result_dir}/<challenge>/<run>/test_generations.txt``.  For each one,
    the documents listed in ``{data_dir}/<challenge>/test/document.jsonl``
    are written, in the same order, to
    ``{out_dir_in_our_format}/<run>.jsonl`` as one JSON object per line with
    the keys ``name`` and ``annotations``.  Documents without any prediction
    get an empty ``annotations`` list.

    NOTE: the output file name depends on ``<run>`` only, so runs that share
    a name across different challenges overwrite each other.

    Args:
        data_dir: Directory holding one sub-directory per challenge.
        result_dir: Directory holding the model outputs.
        out_dir_in_our_format: Directory the converted files are written to;
            created if it does not exist.

    Raises:
        FileNotFoundError: If the ``document.jsonl`` of a discovered
            challenge does not exist.
        KeyError: If an input record lacks one of the expected keys.
    """
    for out_file in glob(f'{result_dir}/*/*/test_generations.txt'):
        # Take the two directories that directly contain the file rather than
        # fixed positions in the split path, so that the result does not
        # depend on how deep `result_dir` itself is nested.
        run_dir = os.path.dirname(out_file)
        out_stem = os.path.basename(run_dir)
        challenge = os.path.basename(os.path.dirname(run_dir))

        predictions = _read_predictions(out_file)

        # Created lazily so that nothing is touched when no results exist.
        os.makedirs(out_dir_in_our_format, exist_ok=True)
        with open(f'{data_dir}/{challenge}/test/document.jsonl', encoding='utf-8') as expected, \
             open(f'{out_dir_in_our_format}/{out_stem}.jsonl', 'w', encoding='utf-8') as output:
            for raw_line in expected:
                if not raw_line.strip():
                    continue
                name = json.loads(raw_line)['name']
                ans_doc = {
                    'name': name,
                    # .get avoids inserting empty entries into the defaultdict.
                    'annotations': _to_annotations(predictions.get(name, ())),
                }
                output.write(json.dumps(ans_doc) + '\n')


# Script entry point: hand `main` to python-fire so it can be invoked from the
# command line with `data_dir`, `result_dir` and (optionally)
# `out_dir_in_our_format` supplied as arguments.
if __name__ == "__main__":
    fire.Fire(main)
