import json
import re
import unicodedata
from pathlib import Path

# Typographic Unicode characters paired with the plain-ASCII text that
# replaces them. Keys are single characters; values are ASCII strings.
CONFUSABLES = {
    '\N{LEFT SINGLE QUOTATION MARK}': "'",
    '\N{RIGHT SINGLE QUOTATION MARK}': "'",
    '\N{LEFT DOUBLE QUOTATION MARK}': '"',
    '\N{RIGHT DOUBLE QUOTATION MARK}': '"',
    '\N{EN DASH}': '-',
    '\N{EM DASH}': '-',
    '\N{HORIZONTAL ELLIPSIS}': '...',
    '\N{NO-BREAK SPACE}': ' ',
}

def normalize_line(line: str) -> str:
    """Return *line* in NFKC form with every ``CONFUSABLES`` key replaced.

    The text is first passed through ``unicodedata.normalize('NFKC', ...)``;
    each key of ``CONFUSABLES`` still present afterwards is then swapped for
    its mapped replacement string.
    """
    text = unicodedata.normalize('NFKC', line)
    for confusable, ascii_equivalent in CONFUSABLES.items():
        if confusable in text:
            text = text.replace(confusable, ascii_equivalent)
    return text

def _normalize_json_value(value):
    """Return *value* with ``normalize_line`` applied to every string in it.

    Recurses through lists and dicts (dict keys included); any other decoded
    JSON scalar is returned unchanged.
    """
    if isinstance(value, str):
        return normalize_line(value)
    if isinstance(value, list):
        return [_normalize_json_value(item) for item in value]
    if isinstance(value, dict):
        return {
            normalize_line(key): _normalize_json_value(item)
            for key, item in value.items()
        }
    return value

def _normalize_record(raw: str) -> str:
    """Normalize one line of a JSONL file without breaking its JSON syntax.

    Normalizing the serialized text directly is unsafe: a curly double quote
    inside a string value would become a bare ``"`` and end the string early.
    The line is therefore decoded, its strings are normalized, and the record
    is serialized again so that such characters come out escaped.
    """
    try:
        record = json.loads(raw)
    except ValueError:
        # Blank or non-JSON line: there is no JSON syntax to protect, so fall
        # back to plain text normalization.
        return normalize_line(raw)

    cleaned = _normalize_json_value(record)
    if cleaned == record:
        # Nothing to fix; keep the record exactly as it was written.
        return raw

    text = json.dumps(cleaned, ensure_ascii=False)
    try:
        text.encode('utf-8')
    except UnicodeEncodeError:
        # Lone surrogates cannot be written as UTF-8; keep them escaped.
        text = json.dumps(cleaned)
    line_end = "\n" if raw.endswith("\n") else ""
    return text + line_end

def fix_jsonl(input_path: str, output_path: str) -> None:
    """Write a normalized copy of the JSONL file at *input_path* to *output_path*.

    Both files are handled as UTF-8 text, one line at a time. A line that
    parses as JSON has ``normalize_line`` applied to its decoded strings and
    is re-serialized only if something changed (re-serialized records use the
    default ``json.dumps`` layout). A line that does not parse is passed
    through ``normalize_line`` as plain text.

    When both paths resolve to the same file, the result is first written to
    a sibling file with ``.tmp`` appended to the name and then renamed over
    the original, so the input is never read and truncated at the same time.

    Args:
        input_path: Path of the JSONL file to read.
        output_path: Path of the file to write; may equal *input_path*.

    Raises:
        OSError: If either file cannot be opened, read, written or renamed.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    input_file = Path(input_path)
    output_file = Path(output_path)
    in_place = input_file.resolve() == output_file.resolve()
    write_file = output_file
    if in_place:
        write_file = output_file.with_suffix(output_file.suffix + ".tmp")

    try:
        with input_file.open('r', encoding='utf-8') as fin, \
             write_file.open('w', encoding='utf-8') as fout:
            for raw in fin:
                fout.write(_normalize_record(raw))
    except BaseException:
        # Do not leave a half-written temporary file next to the original.
        if in_place and write_file.exists():
            write_file.unlink()
        raise

    if in_place:
        write_file.replace(output_file)

if __name__ == "__main__":
    # Clean judge/train_combined.jsonl, located relative to this script's
    # own directory, overwriting the file in place.
    sft_data_root = Path(__file__).resolve().parent
    train_combined_path = sft_data_root.joinpath("judge", "train_combined.jsonl")
    jsonl_path = str(train_combined_path)
    fix_jsonl(input_path=jsonl_path, output_path=jsonl_path)
    print(f"Wrote cleaned file with ASCII-only equivalents to {train_combined_path}.")
