import csv
import re
import difflib

def clean_text(s):
    """Normalize a phrase so that two phrases can be compared.

    The string is lowercased and trimmed, one leading article ("a" or
    "the") is dropped, trailing periods are removed, and every run of
    whitespace is collapsed to a single space.

    Args:
        s: The raw phrase.

    Returns:
        The normalized phrase, without leading or trailing whitespace.
    """
    s = s.lower().strip()
    # Use an alternation group here. A character class such as ``[a|the]``
    # matches any ONE of the characters a, |, t, h, e, so it never removes
    # the word "the" and wrongly removes stray single letters like "e ".
    s = re.sub(r'^(?:a|the)\s+', '', s)
    s = s.rstrip('.')
    # Removing trailing periods can expose whitespace ("shaft ." -> "shaft "),
    # so strip once more after collapsing whitespace runs.
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def find_best_match(target, candidates):
    """Return the index of the candidate that best matches ``target``.

    Both the target and the candidates are normalized with ``clean_text``
    before comparison. An exact match on the normalized text wins; otherwise
    ``difflib.get_close_matches`` (cutoff 0.6) picks the closest candidate.

    Args:
        target: The phrase to look up.
        candidates: Sequence of phrases to search.

    Returns:
        The position in ``candidates`` of the first entry whose normalized
        text equals the chosen match, or -1 when nothing is close enough.
    """
    normalized = [clean_text(candidate) for candidate in candidates]
    needle = clean_text(target)

    # Prefer an exact hit on the normalized text.
    try:
        return normalized.index(needle)
    except ValueError:
        pass

    # Fall back to the single closest fuzzy match, if any clears the cutoff.
    closest = difflib.get_close_matches(needle, normalized, n=1, cutoff=0.6)
    if not closest:
        return -1
    return normalized.index(closest[0])

def _split_semicolon_field(row, column):
    """Return the stripped, non-empty ';'-separated items of ``row[column]``."""
    # csv.DictReader fills cells missing from short rows with None, so a
    # plain ``row.get(column, "")`` can still yield None; treat that as empty.
    raw = row.get(column) or ""
    return [item.strip() for item in raw.split(";") if item.strip()]


def main(input_csv="../result/s3_output.csv", output_csv="../result/s4_output.csv"):
    """Add a ``Matched_Part_Names`` column to the input CSV.

    For each row, every ';'-separated entry of ``Descriptive_Pair`` is
    matched against the ';'-separated ``Descriptive`` entries with
    ``find_best_match``. The ``Part_names`` entry at the matched position is
    emitted, or ``[NOT FOUND]`` when there is no match or no part name at
    that position. Results are joined with ';' and written to the output
    file alongside the original columns; each row's match is also printed.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to write.
    """
    with open(input_csv, "r", encoding="utf-8", newline='') as fin:
        reader = csv.DictReader(fin)
        rows = list(reader)
        # Copy so that appending the output column does not mutate the
        # reader's own header list.
        fieldnames = list(reader.fieldnames or [])

    out_col = "Matched_Part_Names"
    if out_col not in fieldnames:
        fieldnames.append(out_col)

    with open(output_csv, "w", encoding="utf-8", newline='') as fout:
        writer = csv.DictWriter(fout, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            descriptive = _split_semicolon_field(row, "Descriptive")
            part_names = _split_semicolon_field(row, "Part_names")
            descriptive_pair = _split_semicolon_field(row, "Descriptive_Pair")

            matched_names = []
            for target in descriptive_pair:
                idx = find_best_match(target, descriptive)
                # idx is -1 on no match; it may also point past the end of
                # part_names when that list is shorter than descriptive.
                if 0 <= idx < len(part_names):
                    matched_names.append(part_names[idx])
                else:
                    matched_names.append("[NOT FOUND]")

            row[out_col] = ";".join(matched_names)
            writer.writerow(row)
            print(f"[MATCH] {descriptive_pair} -> {matched_names}")

# Run the matching step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
