import glob


# Create ascii files (for experiments in appendix with T5-Base) from prepro files.

import random
# Symbols are assigned consecutive code points starting at "!" (33). The last
# printable ASCII character is "~" (126), so at most 94 distinct characters
# can be encoded before the output stops being printable ASCII.
_FIRST_CODE_POINT = 33
_LAST_CODE_POINT = 126


def _read_pairs(path):
    """Read tab-separated (input, output) string pairs from *path*.

    Each line must contain exactly one tab. The trailing newline is removed;
    all other characters (including spaces) are kept as part of the strings.

    Raises:
        ValueError: if a line does not split into exactly two fields.
    """
    pairs = []
    with open(path, encoding="utf-8") as f_in:
        for lineno, line in enumerate(f_in, start=1):
            fields = line.rstrip("\n").split("\t")
            if len(fields) != 2:
                raise ValueError(
                    f"{path}:{lineno}: expected 2 tab-separated fields, "
                    f"got {len(fields)}"
                )
            pairs.append((fields[0], fields[1]))
    return pairs


def _build_symbol_map(vocab):
    """Return a dict mapping each character in *vocab* to an ASCII character.

    The characters are sorted first so that the result depends only on the
    state of the global ``random`` generator, not on set iteration order,
    and are then shuffled so the assignment is random.

    Raises:
        ValueError: if *vocab* has more characters than fit between
            "!" and "~".
    """
    symbols = sorted(vocab)
    capacity = _LAST_CODE_POINT - _FIRST_CODE_POINT + 1
    if len(symbols) > capacity:
        raise ValueError(
            f"vocabulary has {len(symbols)} distinct characters but only "
            f"{capacity} printable ASCII characters are available"
        )
    random.shuffle(symbols)
    return {
        symbol: chr(_FIRST_CODE_POINT + index)
        for index, symbol in enumerate(symbols)
    }


def _convert_file(src, dst):
    """Re-encode the pairs in *src* with random ASCII symbols and write *dst*.

    Input and output sides get independent character mappings, each built
    from the characters that occur on that side of the file.
    """
    pairs = _read_pairs(src)
    input_vocab = set()
    output_vocab = set()
    for source_text, target_text in pairs:
        input_vocab.update(source_text)
        output_vocab.update(target_text)

    # Keep this order (input mapping first, then output mapping): both draw
    # from the same seeded generator.
    input_map = _build_symbol_map(input_vocab)
    output_map = _build_symbol_map(output_vocab)

    with open(dst, "w", encoding="ascii") as f_out:
        for source_text, target_text in pairs:
            encoded_source = "".join(input_map[ch] for ch in source_text)
            encoded_target = "".join(output_map[ch] for ch in target_text)
            f_out.write(encoded_source + "\t" + encoded_target + "\n")


random.seed(23453)
# glob returns matches in arbitrary order, and all files share one seeded
# generator, so process them in sorted order to make the mappings reproducible.
for x in sorted(glob.glob("*prepro.tsv")):
    _convert_file(x, x.replace("prepro", "ascii"))




