import os
import csv
from statistics import mean, median
from collections import Counter

def read_arff_file(file_path):
    """Parse the attribute declarations and data rows of a dense ARFF file.

    Args:
        file_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(attributes, class_attribute, data)``:

        * ``attributes`` -- attribute names in declaration order (the second
          whitespace-separated token of each ``@attribute`` line).
        * ``class_attribute`` -- the attribute literally named ``class``
          (case-insensitive, surrounding quotes ignored) if there is one;
          otherwise the last attribute whose declaration line contains the
          text ``class``; ``None`` if neither exists.
        * ``data`` -- one list of stripped string values per data row.

    Blank lines and ``%`` comment lines are ignored everywhere, including
    inside the data section. Rows are split on plain commas, so quoted
    values containing commas and attribute names containing spaces are not
    supported.
    """
    data = []
    attributes = []
    exact_class = None     # attribute whose name is exactly "class"
    fallback_class = None  # last declaration mentioning "class" anywhere
    data_section = False
    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            # '%' introduces a comment line in ARFF; it must never be
            # counted as an instance.
            if not line or line.startswith('%'):
                continue
            lowered = line.lower()
            if lowered.startswith('@attribute'):
                attr_name = line.split()[1]
                attributes.append(attr_name)
                if exact_class is None and attr_name.strip('\'"').lower() == 'class':
                    exact_class = attr_name
                if 'class' in lowered:
                    fallback_class = attr_name
            elif lowered.startswith('@data'):
                data_section = True
            elif data_section:
                # Strip each value so "1, 0" and "1,0" yield the same labels.
                data.append([value.strip() for value in line.split(',')])
    class_attribute = exact_class if exact_class is not None else fallback_class
    return attributes, class_attribute, data

def compute_statistics(file_path):
    """Compute per-dataset summary statistics for one ARFF file.

    Args:
        file_path: Path of the ARFF file to analyse.

    Returns:
        A dict with the keys ``'Dataset'`` (file name without a trailing
        ``_data.arff`` or ``.arff``), ``'Instances'`` (number of data rows),
        ``'Attributes'`` (declared attributes minus the class attribute),
        ``'Classes'`` (number of distinct class values) and
        ``'Class Distribution'`` (class value -> row count).

    Raises:
        ValueError: If the file declares no attributes at all.
    """
    attributes, class_attribute, data = read_arff_file(file_path)

    if not attributes:
        raise ValueError(f"No @attribute declarations found in {file_path}")

    num_instances = len(data)
    num_attributes = len(attributes) - 1  # Excluding the class attribute

    if class_attribute is None:
        # No attribute was recognised as the class; fall back to the common
        # ARFF convention that the class is the last declared attribute
        # instead of failing on attributes.index(None).
        class_index = len(attributes) - 1
    else:
        class_index = attributes.index(class_attribute)

    class_distribution = Counter(row[class_index] for row in data)
    num_classes = len(class_distribution)

    dataset_name = os.path.basename(file_path)
    lowered_name = dataset_name.lower()
    # Try the longer suffix first so 'foo_data.arff' becomes 'foo'.
    for suffix in ('_data.arff', '.arff'):
        if lowered_name.endswith(suffix):
            dataset_name = dataset_name[:-len(suffix)]
            break

    return {
        'Dataset': dataset_name,
        'Instances': num_instances,
        'Attributes': num_attributes,
        'Classes': num_classes,
        'Class Distribution': dict(class_distribution)
    }

def generate_latex_file(total_datasets, min_instances, max_instances, mean_instances, median_instances, output_file):
    """Write the aggregate statistics as LaTeX ``\\newcommand`` macros.

    The output starts with a comment line and then defines, in order,
    ``\\totalDatasets``, ``\\minInstances``, ``\\maxInstances``,
    ``\\meanInstances`` and ``\\medianInstances``. Mean and median are
    rendered with two decimal places; the other values use their default
    formatting. A confirmation message is printed after the file is written.

    Args:
        total_datasets: Number of datasets processed.
        min_instances: Smallest instance count.
        max_instances: Largest instance count.
        mean_instances: Mean instance count.
        median_instances: Median instance count.
        output_file: Path of the ``.tex`` file to (over)write.
    """
    macros = (
        ("totalDatasets", format(total_datasets)),
        ("minInstances", format(min_instances)),
        ("maxInstances", format(max_instances)),
        ("meanInstances", format(mean_instances, ".2f")),
        ("medianInstances", format(median_instances, ".2f")),
    )

    pieces = ["% Dataset statistics - auto-generated\n"]
    for macro_name, rendered in macros:
        pieces.append("\\newcommand{\\" + macro_name + "}{" + rendered + "}\n")

    with open(output_file, 'w') as handle:
        handle.write("".join(pieces))

    print(f"LaTeX commands have been written to {output_file}")

def main():
    """Collect statistics for every ARFF file in ``data/weka/``.

    Writes three files into ``results/`` (creating the directory if needed):

    * ``dataset_statistics.csv`` -- one row per dataset with its instance
      count, attribute count and the number of rows labelled ``'0'`` and
      ``'1'``.
    * ``dataset_summary_statistics.csv`` -- total datasets plus min, max,
      mean and median instance counts.
    * ``dataset_statistics.tex`` -- the same aggregates as LaTeX macros.

    Prints a message and returns early if the data directory is missing or
    contains no ARFF files.
    """
    data_dir = 'data/weka/'
    results_dir = 'results/'
    output_csv = os.path.join(results_dir, 'dataset_statistics.csv')
    output_latex = os.path.join(results_dir, 'dataset_statistics.tex')
    output_summary_csv = os.path.join(results_dir, 'dataset_summary_statistics.csv')

    # Create results directory if it doesn't exist
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
        print(f"Created directory {results_dir}")

    # isdir (not exists) so a stray file at this path is reported cleanly
    # instead of making os.listdir raise.
    if not os.path.isdir(data_dir):
        print(f"Error: Directory {data_dir} does not exist.")
        return

    # os.listdir returns entries in arbitrary order; sort so the CSV rows are
    # reproducible. Match the extension case-insensitively, consistent with
    # the suffix handling in compute_statistics.
    arff_filenames = sorted(
        filename for filename in os.listdir(data_dir)
        if filename.lower().endswith('.arff')
    )
    statistics = [
        compute_statistics(os.path.join(data_dir, filename))
        for filename in arff_filenames
    ]

    # Check if any ARFF files were found
    if not statistics:
        print(f"No ARFF files found in {data_dir}")
        return

    # Calculate aggregate statistics
    total_datasets = len(statistics)
    instances_list = [stat['Instances'] for stat in statistics]
    min_instances = min(instances_list)
    max_instances = max(instances_list)
    mean_instances = mean(instances_list)
    median_instances = median(instances_list)

    # Write individual statistics to CSV
    with open(output_csv, 'w', newline='') as csvfile:
        fieldnames = ['Dataset', 'Instances', 'Attributes', 'Class 0 Examples', 'Class 1 Examples']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for stats in statistics:
            class_dist = stats['Class Distribution']
            writer.writerow({
                'Dataset': stats['Dataset'],
                'Instances': stats['Instances'],
                'Attributes': stats['Attributes'],
                # Only the labels '0' and '1' are reported; any other class
                # value is not written to this file.
                'Class 0 Examples': str(class_dist.get('0', 0)),
                'Class 1 Examples': str(class_dist.get('1', 0))
            })

    # Write summary statistics to CSV
    with open(output_summary_csv, 'w', newline='') as csvfile:
        fieldnames = ['Metric', 'Value']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerow({'Metric': 'Total Datasets', 'Value': total_datasets})
        writer.writerow({'Metric': 'Min Instances', 'Value': min_instances})
        writer.writerow({'Metric': 'Max Instances', 'Value': max_instances})
        writer.writerow({'Metric': 'Mean Instances', 'Value': f'{mean_instances:.2f}'})
        writer.writerow({'Metric': 'Median Instances', 'Value': f'{median_instances:.2f}'})

    print(f"Dataset statistics written to {output_csv}")
    print(f"Summary statistics written to {output_summary_csv}")

    # Generate LaTeX file
    generate_latex_file(total_datasets, min_instances, max_instances, mean_instances, median_instances, output_latex)

# Run the statistics pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
