import jieba
from simalign import SentenceAligner
from tqdm import tqdm
import json

# File paths (placeholders: fill these in before running the script)
source_text_path = ""
target_text_path = ""
output_path = ""


def _read_stripped_lines(path):
    """Return every line of the UTF-8 text file at *path*, whitespace-stripped."""
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


# === Load data ===
source_lines = _read_stripped_lines(source_text_path)
target_lines = _read_stripped_lines(target_text_path)

# The two files are consumed pairwise (zipped line by line) further down, so a
# length mismatch would silently drop the unmatched tail.  Raise explicitly
# rather than `assert`, because assertions are removed under `python -O`.
if len(source_lines) != len(target_lines):
    raise ValueError("Number of lines is inconsistent!")

# === Initialize aligner ===
# A single matching method ("i") is requested, so the aligner returns exactly
# one alignment set per sentence pair.
aligner = SentenceAligner(
    model="bert",
    token_type="bpe",
    matching_methods="i",
)

# === Pronoun filter (for noisy cross-span alignments) ===
# Lower-cased English pronouns; source tokens are lower-cased before lookup.
PRONOUNS = {
    "i",
    "you",
    "he",
    "she",
    "it",
    "we",
    "they",
}

# === Process alignments ===
def _earliest_source_by_target(raw_alignment, src_tokens):
    """Map each aligned target index to the earliest source index aligned to it.

    Args:
        raw_alignment: iterable of ``(src_idx, tgt_idx)`` pairs.
        src_tokens: the source-side tokens, used to look up the aligned word.

    Returns:
        A dict ``{tgt_idx: src_idx}``.  Pairs whose source index is out of
        range are ignored, and so are pronoun pairs whose target index lies
        more than five positions before the source index.
    """
    tgt2src = {}
    for src_idx, tgt_idx in raw_alignment:
        if src_idx >= len(src_tokens):
            continue

        # Skip pronoun alignments if far away.  The test is one-directional:
        # only targets well *before* the pronoun's own position are dropped.
        is_pronoun = src_tokens[src_idx].lower() in PRONOUNS
        if is_pronoun and tgt_idx < src_idx - 5:
            continue

        # Keep the smallest source index seen for this target index.
        if tgt_idx not in tgt2src or src_idx < tgt2src[tgt_idx]:
            tgt2src[tgt_idx] = src_idx
    return tgt2src


with open(output_path, "w", encoding="utf-8") as fout:
    for src, tgt in tqdm(zip(source_lines, target_lines), total=len(source_lines), desc="Aligning"):
        src_tokens = src.strip().split()
        tgt_tokens = list(jieba.cut(tgt.strip()))

        if src_tokens and tgt_tokens:
            # The aligner is built with matching_methods="i"; simalign exposes
            # that method's result under the "itermax" key.
            raw_alignment = aligner.get_word_aligns(src_tokens, tgt_tokens)["itermax"]
        else:
            # Nothing to align when either side is empty; still emit a record
            # so output lines stay in step with input lines.
            raw_alignment = []

        # Build zh_idx → earliest en_idx mapping
        tgt2src = _earliest_source_by_target(raw_alignment, src_tokens)

        # Sorted [src_idx, tgt_idx] pairs.  tgt2src is keyed by target index,
        # so each item is swapped to put the source index first.
        alignment_pairs = sorted(
            [src_idx, tgt_idx] for tgt_idx, src_idx in tgt2src.items()
        )

        # Note: json.dumps writes the integer keys of tgt2src as strings.
        fout.write(json.dumps({
            "src_tokens": src_tokens,
            "tgt_tokens": tgt_tokens,
            "alignment": alignment_pairs,
            "tgt2src": tgt2src,
        }, ensure_ascii=False) + "\n")
