import json
from simalign import SentenceAligner
from tqdm import tqdm

# === File paths ===
# Placeholders: fill these in before running. The source and target files are
# read line by line and paired by line number; results are written to output_path.
src_text_path = ""
tgt_text_path = ""
output_path = ""

# === Load data ===
# One sentence per line; surrounding whitespace (including the newline) is removed.
with open(src_text_path, 'r', encoding='utf-8') as f:
    en_lines = [line.strip() for line in f]

with open(tgt_text_path, 'r', encoding='utf-8') as f:
    de_lines = [line.strip() for line in f]

# Raise explicitly instead of using `assert`: assertions are stripped under
# `python -O`, and the later zip() over both lists would then silently drop
# the unmatched tail of the longer file.
if len(en_lines) != len(de_lines):
    raise ValueError("Number of English and German lines do not match.")

# === Initialize aligner ===
# The empty strings are placeholders and must be filled in.
# NOTE(review): per the simalign README a typical call is
# SentenceAligner(model="bert", token_type="bpe", matching_methods="mai");
# confirm the accepted values against the installed simalign version.
aligner = SentenceAligner(model="", token_type="", matching_methods="") # Choose models or methods of aligner

# === Pronoun filter (for noisy cross-span alignments) ===
PRONOUNS = {"it", "he", "she", "they", "we", "i", "you"}
# A pronoun link is dropped when the German index lies more than this many
# positions before the English index.
# NOTE(review): the check is one-sided (German index earlier than English
# index only); confirm whether a symmetric distance was intended.
PRONOUN_MAX_OFFSET = 5

# === Process alignments ===
# Writes one JSON object per sentence pair, one per line (JSON Lines).
with open(output_path, "w", encoding="utf-8") as fout:
    for en, de in tqdm(zip(en_lines, de_lines), total=len(en_lines), desc="Aligning"):
        # split() with no arguments already ignores leading/trailing whitespace.
        en_tokens = en.split()
        de_tokens = de.split()

        if en_tokens and de_tokens:
            raw_alignment = aligner.get_word_aligns(en_tokens, de_tokens)[""] # Choose an alignment method
        else:
            # An empty side has nothing to align. Skip the aligner call (it may
            # not accept empty input -- unverified) and still emit a record so
            # output lines stay in step with the input lines.
            raw_alignment = []

        # Build de_idx → earliest en_idx mapping
        de2en = {}
        for en_idx, de_idx in raw_alignment:
            # Ignore links whose English index falls outside the token list.
            if en_idx >= len(en_tokens):
                continue

            en_word = en_tokens[en_idx].lower()

            # Skip pronoun alignments if far away
            if en_word in PRONOUNS and de_idx < en_idx - PRONOUN_MAX_OFFSET:
                continue

            # Keep only the smallest English index for each German index.
            if de_idx not in de2en or en_idx < de2en[de_idx]:
                de2en[de_idx] = en_idx

        # Sorted [en_idx, de_idx] pairs derived from the filtered mapping
        alignment_pairs = sorted([e, d] for d, e in de2en.items())

        # json.dumps writes the integer keys of de2en as JSON strings.
        record = {
            "en_tokens": en_tokens,
            "de_tokens": de_tokens,
            "alignment": alignment_pairs,
            "de2en": de2en,
        }
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")
