import sys

CDS_PATH = './counterfactual-data-substitution/'

sys.path.append(CDS_PATH)

from src.substitutor import Substitutor
from src.utils import load_json_pairs

# Load the substitution pair lists from the CDS checkout. The paths must be
# real f-strings (prefix outside the quotes) so that CDS_PATH is interpolated;
# with the `f` inside the quotes the literal text "f{CDS_PATH}data/..." would
# be passed as the path instead.
base_pairs = load_json_pairs(f'{CDS_PATH}data/cda_default_pairs.json')
name_pairs = load_json_pairs(f'{CDS_PATH}data/names_pairs_1000_scaled.json')

# One substitutor instance is built up front and reused for the whole corpus.
substitutor = Substitutor(base_pairs, name_pairs=name_pairs)

# Stream the corpus through the substitutor one line at a time, writing the
# first item yielded for each line to the output file.
with open('./data/corpus/umbc30m', 'r') as corpus, \
        open('./data/corpus/umbc30m_cds', 'w') as augmented:
    for line_no, sentence in enumerate(corpus):
        substituted = next(substitutor.probablistic_substitute([sentence]))
        augmented.write(substituted)

        # Report progress only on every 30000th line (including line 0).
        if line_no % 30000 != 0:
            continue
        # NOTE(review): dividing by 300000 yields a percentage only if the
        # corpus has about 30,000,000 lines -- confirm against the input file.
        print(f'{line_no/300000:3.1f}% done', flush=True)
