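# This script strips trailing punctuation (e.g. "," or ";") and any other non-alphanumeric characters from the end of each line, then re-appends a single newline, because punctuation right before the newline token interferes with tokenization in some models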

import json
import os

# The data directory sits next to the directory that contains this script,
# i.e. at <parent of the script's directory>/data.
script_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(script_dir)

# Every JSON file to clean, expressed as path components below "data".
file_paths = [
    os.path.join(os.path.join(parent_dir, "data"), *relative_parts)
    for relative_parts in (
        ("rhyme_families_word.json",),
        ("rhyme_families.json",),
        ("train", "specific_word_lines.json"),
        ("train", "rhyme_family_lines.json"),
        ("test", "specific_word_lines.json"),
        ("test", "rhyme_family_lines.json"),
        ("test", "specific_word_pairs.json"),
    )
]


def remove_non_alphanumeric_characters_from_right(text):
    """Return *text* with every trailing non-alphanumeric character removed.

    Characters are dropped from the right-hand end until the first one for
    which ``isalnum()`` is true; everything up to and including that
    character is kept. If no character is alphanumeric (or *text* is empty)
    the result is empty.
    """
    keep = len(text)
    # Walk backwards, shrinking the kept prefix for each trailing character
    # that is not a letter or digit.
    for char in reversed(text):
        if char.isalnum():
            break
        keep -= 1
    return text[:keep]


def _strip_trailing_punctuation_in_file(file_path):
    """Rewrite one JSON file so each line ends in an alphanumeric + newline.

    The file is assumed to contain a JSON object whose values are lists of
    strings. Every string has its trailing non-alphanumeric characters
    removed and a single "\n" appended; each string that changes is printed
    before and after. The file is always written back in place, even when
    nothing changed.
    """
    # Explicit UTF-8 so reading does not depend on the platform's default
    # locale encoding; the context manager guarantees the handle is closed.
    with open(file_path, encoding="utf-8") as in_file:
        data = json.load(in_file)

    for lines in data.values():
        for i, line in enumerate(lines):
            new_line = remove_non_alphanumeric_characters_from_right(line) + "\n"
            if new_line != line:
                print("Line:" + line)
                print("New line:" + new_line)
                lines[i] = new_line

    # Closing via the context manager ensures the output is flushed to disk
    # before the next file is processed or the interpreter exits.
    with open(file_path, "w", encoding="utf-8") as out_file:
        json.dump(data, out_file)


for file_path in file_paths:
    _strip_trailing_punctuation_in_file(file_path)
