import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from glob import glob
import os

def generate_confusion_matrix(csv_paths, validity_col='validity', manual_col='manual_annotation'):
    """
    Print a confusion matrix, classification report and accuracy for CSV data.

    All files in ``csv_paths`` are read and concatenated. Rows that lack
    either the prediction or the manual annotation are discarded before
    scoring. Results are written to stdout; nothing is plotted or returned.

    Args:
        csv_paths (list): List of paths to CSV files
        validity_col (str): Name of the column containing validity predictions
        manual_col (str): Name of the column containing manual annotations

    Raises:
        ValueError: If ``csv_paths`` is empty, or if no row has both a
            prediction and a manual annotation.
        KeyError: If either column is missing from the combined data.
    """
    csv_paths = list(csv_paths)
    if not csv_paths:
        raise ValueError("No CSV files provided")

    # Combine all CSV files
    combined_df = pd.concat(
        [pd.read_csv(path) for path in csv_paths],
        ignore_index=True,
    )

    # A row can only be scored when both the prediction and the annotation
    # are present, so drop rows missing either one.
    combined_df = combined_df.dropna(subset=[validity_col, manual_col])
    if combined_df.empty:
        raise ValueError(
            f"No rows with both '{validity_col}' and '{manual_col}' values"
        )

    y_true = combined_df[manual_col]
    y_pred = combined_df[validity_col]

    # The matrix has one row/column per label seen in EITHER column. Build
    # that union explicitly and hand it to confusion_matrix so the printed
    # index/columns are guaranteed to line up with the matrix axes.
    labels = sorted(set(y_true.unique()).union(y_pred.unique()))
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Print confusion matrix with labels
    print("\nConfusion Matrix:")
    cm_df = pd.DataFrame(cm, index=labels, columns=labels)
    print("\nFormat: True (rows) vs Predicted (columns)")
    print(cm_df)

    # Print classification report
    print("\nCombined Classification Report:")
    print(classification_report(y_true, y_pred))

    # Accuracy = correctly classified (diagonal) / all scored rows
    accuracy = (cm.diagonal().sum() / cm.sum()) * 100
    print(f"\nOverall Accuracy: {accuracy:.2f}%")


if __name__ == "__main__":
    # Directory with the annotated CSV files; the relative path is resolved
    # against the current working directory.
    data_dir = "../../data/annotated_data"
    csv_pattern = os.path.join(data_dir, "*.csv")
    annotated_csvs = glob(csv_pattern)

    # Abort early when the directory contains no CSV files to evaluate.
    if len(annotated_csvs) == 0:
        raise ValueError(f"No CSV files found in {data_dir}")

    generate_confusion_matrix(annotated_csvs)
