# generate_top20_pairs_for_dscript.py

import pandas as pd
import os
from config import *

# === BUILD NAME -> EXTERNAL ID LOOKUP ===
# Each row of the alias table contributes one preferred_name -> protein_external_id
# entry; when a name occurs more than once, the last row read wins.
alias_df = pd.read_csv(ALIAS_V11, sep="\t")
name_to_id = {
    name: external_id
    for name, external_id in zip(
        alias_df["preferred_name"], alias_df["protein_external_id"]
    )
}

# === READ THE EXPANSION TABLE ===
# Only the "protein1" and "similar_protein" columns are used below.
df = pd.read_csv(INPUT_FILE, sep="\t")

# Translate each protein1 name through the lookup; names that are absent from
# the alias table become NaN and are filtered out in the next step.
df = df.assign(protein1_id=df["protein1"].map(name_to_id))

# === KEEP COMPLETE, UNIQUE PAIRS ===
pair_columns = ["protein1_id", "similar_protein"]
valid_pairs = df.loc[:, pair_columns]
valid_pairs = valid_pairs.dropna()  # discard rows missing either member
valid_pairs = valid_pairs.drop_duplicates()  # emit each pair only once

# === WRITE TWO-COLUMN TSV (NO HEADER, NO INDEX) ===
valid_pairs.to_csv(OUTPUT_PAIRS_FILE, sep="\t", header=False, index=False)

print(f"Saved {len(valid_pairs)} STRING ID pairs to: {OUTPUT_PAIRS_FILE}")