import pandas as pd
import numpy as np
import os
import re

def process_dream4_network_with_repeats_to_csv(instance_num: int, timeseries_filepath: str, gt_filepath: str, output_instance_dir: str) -> None:
    """
    Split a concatenated DREAM4 time series file into experiments and export
    them, together with the ground truth adjacency matrix, as CSV files.

    The time series file is read as tab-separated text with a header row that
    must contain a 'Time' column; every other column is treated as a gene.
    A new experiment starts at each row whose 'Time' value equals 0. The first
    row always starts an experiment, even if its 'Time' value is not 0, so no
    leading rows are dropped.

    The ground truth file is read as tab-separated text without a header and
    with three columns (source gene, target gene, edge flag). Rows whose flag
    equals 1 set ``matrix[source, target] = 1``; rows and columns of the matrix
    follow the gene column order of the time series file.

    Files written to ``output_instance_dir``:
        timeseries_data_<k>.csv : expression values of experiment k
                                  ('Time' column removed, no index column).
        ground_truth_matrix.csv : adjacency matrix with gene names as both
                                  index and header.
        gene_names.txt          : one gene name per line (only if at least
                                  one gene column exists).

    Args:
        instance_num: The network instance number (e.g., 1, 2, etc.).
                      Only used in progress/error messages.
        timeseries_filepath: Full path to the concatenated time series file
                             for this instance (e.g., insilico_size100_1_timeseries.tsv).
        gt_filepath: Full path to the ground truth file for this instance
                     (e.g., DREAM4_GoldStandard_InSilico_Size100_1.tsv).
        output_instance_dir: Directory to save the processed data for this
                             instance. Created if it does not exist.

    Raises:
        KeyError: If the time series file has no 'Time' column.
        Exception: Whatever ``pandas.read_csv`` raises when an input file is
                   missing or unreadable. Errors while *writing* outputs are
                   reported on stdout and do not abort processing.
    """
    print(f"Processing DREAM4 Size 100 Instance {instance_num} and saving to CSV...")
    os.makedirs(output_instance_dir, exist_ok=True)

    # --- Load and Split Time Series Data ---
    # Read the time series data, assuming tab separation and header
    df_timeseries = pd.read_csv(timeseries_filepath, sep='\t')
    print(f"  Loaded time series data: {timeseries_filepath} (Shape: {df_timeseries.shape})")

    num_rows = len(df_timeseries)

    # Positional (not label-based) row numbers where an experiment starts,
    # so they can be used safely with .iloc below.
    start_positions = np.flatnonzero((df_timeseries['Time'] == 0.0).to_numpy()).tolist()
    # The first row is always the start of the first experiment; without this,
    # rows preceding the first Time == 0 row would be silently discarded and a
    # file without any Time == 0 row would yield no experiments at all.
    if num_rows > 0 and (not start_positions or start_positions[0] != 0):
        start_positions.insert(0, 0)
    # Each experiment ends (exclusively) where the next one starts.
    end_positions = start_positions[1:] + [num_rows]

    # Gene columns are identical for every experiment, so derive them once
    # from the full table instead of from the first slice.
    df_expression = df_timeseries.drop(columns=['Time'])
    gene_names = df_expression.columns.tolist()
    num_genes = len(gene_names)
    print(f"  Identified {num_genes} genes from time series: {gene_names[:5]}...{gene_names[-5:]}")

    for experiment_idx, (start, end) in enumerate(zip(start_positions, end_positions)):
        timeseries_output_path = os.path.join(output_instance_dir, f'timeseries_data_{experiment_idx}.csv')
        try:
            # Save without the pandas index so the CSV holds expression values only
            df_expression.iloc[start:end].to_csv(timeseries_output_path, index=False)
            print(f"    Saved Experiment {experiment_idx} data to {timeseries_output_path}")
        except Exception as e:
            # Best effort: report the failure and continue with the next experiment
            print(f"    Error saving Experiment {experiment_idx} data to CSV: {e}")

    print(f"  Split time series file into {len(start_positions)} independent experiments and saved as CSVs.")

    # --- Build Ground Truth Adjacency Matrix ---
    # Map gene name -> row/column position, following the time series column order
    gene_to_index = {name: position for position, name in enumerate(gene_names)}
    static_adj_matrix_np = np.zeros((num_genes, num_genes), dtype=int)

    # Read the ground truth data, assuming tab separation and no header
    df_gt = pd.read_csv(gt_filepath, sep='\t', header=None, names=['Source', 'Target', 'Type'])
    print(f"  Loaded ground truth data: {gt_filepath}")

    # Keep only existing edges (Type == 1)
    existing_edges_df = df_gt.loc[df_gt['Type'] == 1, ['Source', 'Target']]

    edge_count = 0
    for source_gene, target_gene in existing_edges_df.itertuples(index=False, name=None):
        source_idx = gene_to_index.get(source_gene)
        target_idx = gene_to_index.get(target_gene)
        if source_idx is None or target_idx is None:
            # This shouldn't happen with standard DREAM4 files, but good for robustness
            print(f"  Warning: Gene in GT file '{source_gene}' or '{target_gene}' not found in time series data columns.")
            continue
        static_adj_matrix_np[source_idx, target_idx] = 1
        edge_count += 1

    print(f"  Built static adjacency matrix (Shape: {static_adj_matrix_np.shape}) with {edge_count} existing edges.")

    # Label rows and columns with gene names so the CSV is self-describing
    static_adj_matrix_df = pd.DataFrame(static_adj_matrix_np, index=gene_names, columns=gene_names)

    # The ground truth is shared by all experiments of this instance, so it is saved once
    gt_matrix_output_path = os.path.join(output_instance_dir, 'ground_truth_matrix.csv')
    try:
        static_adj_matrix_df.to_csv(gt_matrix_output_path)  # Save with index (gene names) and header
        print(f"  Saved ground truth matrix to {gt_matrix_output_path}")
    except Exception as e:
        print(f"  Error saving ground truth matrix to CSV: {e}")

    # --- Save Gene Names ---
    if gene_names:
        gene_names_output_path = os.path.join(output_instance_dir, 'gene_names.txt')
        try:
            with open(gene_names_output_path, 'w', encoding='utf-8') as f:
                f.writelines(f"{gene}\n" for gene in gene_names)
            print(f"  Saved gene names to {gene_names_output_path}")
        except Exception as e:
            print(f"  Error saving gene names for instance {instance_num}: {e}")

    print(f"Processing for Instance {instance_num} complete.")


if __name__ == "__main__":
    # Script entry point: convert DREAM4 Size 100 instances 1 through 5.

    # IMPORTANT: point this at the directory holding the raw DREAM4 .tsv files
    # (resolved relative to the current working directory).
    DREAM4_BASE_DIR = 'dream/'

    # Root directory under which one 'instance_<n>' folder is created per network.
    PROCESSED_OUTPUT_BASE_DIR = './'

    # Make sure the output root exists before any instance is processed.
    os.makedirs(PROCESSED_OUTPUT_BASE_DIR, exist_ok=True)

    separator_line = "-" * 30

    for instance_id in range(1, 6):
        # Input files and output folder follow the DREAM4 naming scheme for this instance.
        input_timeseries = os.path.join(
            DREAM4_BASE_DIR, f'insilico_size100_{instance_id}_timeseries.tsv'
        )
        input_gold_standard = os.path.join(
            DREAM4_BASE_DIR, f'DREAM4_GoldStandard_InSilico_Size100_{instance_id}.tsv'
        )
        instance_output_dir = os.path.join(
            PROCESSED_OUTPUT_BASE_DIR, f'instance_{instance_id}'
        )

        # Split the concatenated time series and export it together with the ground truth.
        process_dream4_network_with_repeats_to_csv(
            instance_id,
            input_timeseries,
            input_gold_standard,
            instance_output_dir,
        )
        print(separator_line)

    print("\nAll specified DREAM4 Size 100 instances processed and saved as CSV files.")
