import os

import numpy as np
import pandas as pd

# Star-import order is kept as-is: names from config override same-named ones from utils.
from utils import *
from config import *

def main():
    """Build the combined positive/negative PPI dataset and write it to disk.

    Steps, in order:
      1. Load PPIs from ``STRING_FILE`` with ``load_physical_ppis``, passing
         ``EXPERIMENT_THRESHOLD`` as the experiment threshold.
      2. Call ``create_filtered_fasta`` to produce
         ``PROCESSED_DIR/filtered_sequences.fasta``; the keys of its return
         value are used as the set of valid protein IDs.
      3. Parse clusters from ``CDHIT_CLUSTER_FILE`` and pass them to
         ``remove_sequence_redundant_ppis``.
      4. Keep only pairs whose two proteins are both valid IDs and label
         them ``1``.
      5. Generate negatives with ``generate_negative_ppis``, concatenate them
         with the positives and write the result, tab-separated and without
         the index, to ``OUTPUT_COMBINED``.

    NOTE(review): the summary line counts ``label == 0`` rows as negatives, so
    ``generate_negative_ppis`` presumably returns ``protein1``/``protein2``/
    ``label`` columns with ``label`` set to 0 -- confirm against utils.
    """
    ppi_df = load_physical_ppis(STRING_FILE, experiment_threshold=EXPERIMENT_THRESHOLD)

    fasta_out = os.path.join(PROCESSED_DIR, 'filtered_sequences.fasta')
    seq_lengths = create_filtered_fasta(ppi_df, FASTA_FILE, fasta_out)
    valid_ids = set(seq_lengths.keys())

    clusters = parse_cdhit_clusters(CDHIT_CLUSTER_FILE)
    non_redundant_df = remove_sequence_redundant_ppis(ppi_df, clusters)

    # Keep only interactions where both proteins are within length range
    both_valid = (
        non_redundant_df['protein1'].isin(valid_ids)
        & non_redundant_df['protein2'].isin(valid_ids)
    )
    # Explicit copy: the 'label' column is added below, and assigning onto a
    # filtered selection would otherwise raise SettingWithCopyWarning.
    non_redundant_df = non_redundant_df[both_valid].copy()
    print(f"Valid non-redundant PPIs after length filter: {len(non_redundant_df)}")

    non_redundant_df['label'] = 1
    negative_df = generate_negative_ppis(non_redundant_df, valid_ids)

    all_df = pd.concat(
        [non_redundant_df[['protein1', 'protein2', 'label']], negative_df],
        ignore_index=True,
    )

    # Vectorised counts instead of Python-level sum() over a boolean Series.
    n_positive = int((all_df['label'] == 1).sum())
    n_negative = int((all_df['label'] == 0).sum())
    print(f"Final dataset: {len(all_df)} samples | Positives: {n_positive} | Negatives: {n_negative}")
    all_df.to_csv(OUTPUT_COMBINED, sep='\t', index=False)


# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()