'''
Load per-model results from the summary folder, compute the mean and standard deviation of each metric, save them to CSV files, and generate LaTeX tables summarising the target model's advantage.
'''
import os
import sys
# Make modules located in the current working directory importable.
sys.path.append('.')
# Directory of the invoked script; the 'summary' and 'result' folders and the
# LaTeX output files are all resolved relative to it.
path0 = os.path.dirname(sys.argv[0])

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Folder holding the per-model summary CSV files read by save_model_data.
path_summary = os.path.join(path0, 'summary')

def _summary_column_names() -> list:
    """Return the column names of a summary CSV, in file order."""
    levels = (10, 20, 30, 40, 50, 60, 70, 80, 90)
    base_metrics = ("mse", "nll", "mae", "crps", "coverage_95", "width_95", "ace")

    columns = ["n_sample", "seed"]
    for split in ("train", "test"):
        columns.extend(f"{split}_{metric}" for metric in base_metrics)
        columns.extend(f"{split}_coverage_{conf}" for conf in levels)
        columns.extend(f"{split}_width_{conf}" for conf in levels)
    columns.extend(["training_time", "epoch"])
    return columns


def read_summary_results(fname: str) -> "pd.DataFrame | None":
    """
    Read a summary CSV file and return its content as a pandas DataFrame.

    The file is first read assuming it has a header row. If the header does
    not contain both "n_sample" and "seed", the file is re-read as headerless
    and the known column names are assigned positionally. Every known column
    that is present is converted to a numeric dtype; values that cannot be
    parsed become NaN.

    Args:
        fname: Path of the summary file

    Returns:
        pandas DataFrame with the results, or None when the file does not
        exist or cannot be read (a message is printed in both cases)
    """
    if not os.path.exists(fname):
        print(f"Warning: Summary file {fname} does not exist.")
        return None

    expected_columns = _summary_column_names()

    # Best-effort boundary: any read/parse failure is reported and mapped to None
    try:
        df = pd.read_csv(fname, sep=',', skipinitialspace=True)

        # Only the two key columns are used to decide whether a header exists
        if not all(col in df.columns for col in expected_columns[:2]):
            print(f"Warning: CSV file {fname} does not have expected headers. Assigning default column names.")
            n_columns = len(df.columns)
            names = expected_columns[:n_columns]
            # Give every column a name: with fewer names than columns pandas
            # would silently use the leading columns as the index, shifting
            # all labels by the number of surplus columns.
            names += [f"extra_{i}" for i in range(n_columns - len(names))]
            df = pd.read_csv(fname, sep=',', skipinitialspace=True, header=None,
                             names=names)

        # Every known column is numeric; coerce unparsable entries to NaN
        for col in expected_columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        return df

    except Exception as e:
        print(f"Error reading summary file {fname}: {e}")
        return None

def calculate_summary_statistics(df: pd.DataFrame, threshold=100.0):
    """
    Calculate average and standard deviation of results across different seeds,
    grouped by n_sample

    Args:
        df: DataFrame with results for a specific model and dataset
        threshold: Values greater than this are discarded before the mean and
            standard deviation of a metric are computed (NaN values are
            discarded as well)

    Returns:
        Dictionary keyed by sample size in ascending order, each value holding
        '<metric>_mean' and '<metric>_std' for every metric column present in
        df plus 'n_seeds' (number of rows for that sample size, counted before
        threshold filtering). Returns None when df is None or empty.
    """
    if df is None or len(df) == 0:
        print("No results found")
        return None

    levels = (10, 20, 30, 40, 50, 60, 70, 80, 90)
    base_metrics = ('mse', 'nll', 'mae', 'crps', 'coverage_95', 'width_95', 'ace')

    # Full metric list in output order: train block, test block, training metadata
    metrics = []
    for split in ('train', 'test'):
        metrics.extend(f'{split}_{name}' for name in base_metrics)
        metrics.extend(f'{split}_coverage_{conf}' for conf in levels)
        metrics.extend(f'{split}_width_{conf}' for conf in levels)
    metrics.extend(['training_time', 'epoch'])
    present_metrics = [metric for metric in metrics if metric in df.columns]

    # Sort the sample sizes so the result does not depend on the row order of
    # the summary file (callers number the groups S1, S2, ... in this order).
    # Rows whose n_sample is NaN cannot form a group and are ignored.
    sample_sizes = sorted(df['n_sample'].dropna().unique())

    stats_by_sample = {}
    for sample_size in sample_sizes:
        df_sample = df[df['n_sample'] == sample_size]

        stats = {}
        for metric in present_metrics:
            data = df_sample[metric]
            # NOTE(review): the same threshold is applied to every metric,
            # including 'training_time' and 'epoch' - confirm this is intended.
            data = data[data <= threshold]

            stats[f'{metric}_mean'] = data.mean()
            stats[f'{metric}_std'] = data.std()

        # Number of rows (seeds) for this sample size
        stats['n_seeds'] = len(df_sample)

        stats_by_sample[sample_size] = stats

    return stats_by_sample

def _first_value_per_case(df, column):
    """Map each dataset case to its first value in *column*, keeping row order."""
    first_rows = df.drop_duplicates('dataset_case')
    return dict(zip(first_rows['dataset_case'], first_rows[column]))


def calculate_advantage_metric(model_data, metric_name, target_model='HVBLL', 
                             use_test_data=True, aggregation_method='mean'):
    """
    Calculate a single number describing the target model's advantage relative to other models.

    For every dataset case present for both the target model and a comparison
    model, an improvement is computed from the '<train|test>_<metric>_mean'
    columns:
      - coverage metrics (higher is better): (target - other) / other
      - 'nll' (lower is better): other - target (absolute difference)
      - any other metric (lower is better): (other - target) / other
    Relative improvements are set to 0 when the comparison value is 0. Cases
    where either value is NaN are skipped so that one missing result does not
    turn the aggregated statistics into NaN.

    Parameters:
    - model_data: Dictionary mapping model name to a DataFrame that has a
      'dataset_case' column (and optionally 'sample_size')
    - metric_name: The metric to compare (e.g., 'nll', 'mse', 'mae', 'crps')
    - target_model: The model to compare others against (default: 'HVBLL')
    - use_test_data: Whether to use test data (True) or train data (False)
    - aggregation_method: How to aggregate across datasets ('mean', 'median',
      'weighted_mean'); any other value falls back to the mean

    Returns:
    - Dictionary keyed by comparison model with 'avg_improvement',
      'std_improvement', 'win_rate', 'n_cases' and the per-case 'improvements'
      (ordered as the cases appear in the target model's data), or None when
      the target model, any comparison model, or the metric column of the
      target model is missing
    """
    if target_model not in model_data:
        print(f"Error: Target model '{target_model}' not found in data")
        return None

    target_data = model_data[target_model]
    other_models = [name for name in model_data if name != target_model]

    if not other_models:
        print("Error: No other models found for comparison")
        return None

    # Determine which data to use (train or test)
    data_type = 'test' if use_test_data else 'train'
    metric_col = f'{data_type}_{metric_name}_mean'

    if metric_col not in target_data.columns:
        print(f"Error: Column '{metric_col}' not found in {target_model} data")
        return None

    coverage_metrics = {f'coverage_{conf}' for conf in (10, 20, 30, 40, 50, 60, 70, 80, 90, 95)}
    higher_is_better = metric_name in coverage_metrics

    # Look-up tables built once instead of filtering the DataFrame per case
    target_values = _first_value_per_case(target_data, metric_col)
    if 'sample_size' in target_data.columns:
        target_weights = _first_value_per_case(target_data, 'sample_size')
    else:
        target_weights = {}

    advantages = {}

    for model_name in other_models:
        model_data_df = model_data[model_name]
        if metric_col not in model_data_df.columns:
            print(f"Warning: Column '{metric_col}' not found in {model_name} data")
            continue

        model_values = _first_value_per_case(model_data_df, metric_col)

        # Common cases in the target's row order, so the result is deterministic
        common_cases = [case for case in target_values if case in model_values]

        if not common_cases:
            print(f"Warning: No common dataset cases between {target_model} and {model_name}")
            continue

        improvements = []
        weights = []

        for case in common_cases:
            target_value = target_values[case]
            model_value = model_values[case]

            # A missing mean on either side makes the comparison meaningless
            if pd.isna(target_value) or pd.isna(model_value):
                continue

            if higher_is_better:
                improvement = (target_value - model_value) / model_value if model_value != 0 else 0
            elif metric_name == 'nll':
                # Rescaling the data shifts every model's NLL by the same
                # constant, so the plain difference is used instead of a ratio.
                improvement = (model_value - target_value)
            else:
                improvement = (model_value - target_value) / model_value if model_value != 0 else 0

            improvements.append(improvement)

            # Sample size of the target row is the weight for 'weighted_mean'
            weights.append(target_weights.get(case, 1))

        if not improvements:
            print(f"Warning: No valid '{metric_col}' values to compare between {target_model} and {model_name}")
            continue

        # Aggregate improvements
        if aggregation_method == 'median':
            avg_improvement = np.median(improvements)
        elif aggregation_method == 'weighted_mean' and sum(weights) > 0:
            avg_improvement = np.average(improvements, weights=weights)
        else:
            avg_improvement = np.mean(improvements)

        std_improvement = np.std(improvements)
        # Fraction of cases where the target model wins
        win_rate = np.mean([imp > 0 for imp in improvements])

        advantages[model_name] = {
            'avg_improvement': avg_improvement,
            'std_improvement': std_improvement,
            'win_rate': win_rate,
            'n_cases': len(improvements),
            'improvements': improvements
        }

    return advantages

def _comparison_metric_names():
    """Return the metric names written to the comparison CSV, in column order."""
    levels = (10, 20, 30, 40, 50, 60, 70, 80, 90)
    base_metrics = ('mse', 'nll', 'mae', 'crps', 'coverage_95', 'width_95', 'ace')

    names = []
    for split in ('train', 'test'):
        names.extend(f'{split}_{metric}' for metric in base_metrics)
        names.extend(f'{split}_coverage_{conf}' for conf in levels)
        names.extend(f'{split}_width_{conf}' for conf in levels)
    names.extend(['training_time', 'epoch'])
    return names


def save_model_data(case_list, model_list=None):
    '''
    Save data for comparison of each model across all datasets and cases
    Args:
        case_list: List of tuples (fname_suffix, dataset_prefix, threshold).
            The summary file 'summary-<model>-<fname_suffix>.csv' is read from
            the summary folder, dataset_prefix is used to build the
            'dataset_case' labels and threshold is forwarded to
            calculate_summary_statistics.
        model_list: List of models to compare (default: HVBLL and VBLL)
    Returns:
        None (writes 'result/results_<model>_comparison.csv' for every model)
    '''
    if model_list is None:
        model_list = ['HVBLL', 'VBLL']

    # Column layout: identifiers first, then mean/std for every metric
    stat_keys = [f'{metric}_{stat}'
                 for metric in _comparison_metric_names()
                 for stat in ('mean', 'std')]
    all_results = {
        model_name: {key: [] for key in ['dataset_case', 'sample_size'] + stat_keys}
        for model_name in model_list
    }

    # Collect data across all datasets and cases
    for fname_suffix, dataset_prefix, threshold in case_list:
        for model_name in model_list:

            fname = os.path.join(path_summary, f'summary-{model_name}-{fname_suffix}.csv')

            df = read_summary_results(fname)
            if df is None:
                continue

            stats_by_sample = calculate_summary_statistics(df, threshold=threshold)
            if stats_by_sample is None:
                continue

            # Number the sample sizes in ascending order so that 'S1' is always
            # the smallest one, whatever the row order of the summary file.
            # A NaN sample size only carries empty statistics and is skipped.
            sample_sizes = sorted(size for size in stats_by_sample if not pd.isna(size))

            results = all_results[model_name]
            for i_sample, sample_size in enumerate(sample_sizes, start=1):
                stats = stats_by_sample[sample_size]
                if 'test_nll_mean' not in stats or 'train_nll_mean' not in stats:
                    continue

                results['dataset_case'].append(f"{dataset_prefix}-S{i_sample}")
                results['sample_size'].append(sample_size)
                for key in stat_keys:
                    results[key].append(stats.get(key, np.nan))

    # Save the data to CSV files; create the output folder on first use
    output_dir = os.path.join(path0, 'result')
    os.makedirs(output_dir, exist_ok=True)

    for model_name in model_list:
        df = pd.DataFrame(all_results[model_name])
        output_file = os.path.join(output_dir, f'results_{model_name}_comparison.csv')

        # Format all numeric columns to 6 decimal places
        float_columns = df.select_dtypes(include=[np.number]).columns
        for col in float_columns:
            df[col] = df[col].apply(lambda x: f"{x:.6f}" if pd.notna(x) else x)

        df.to_csv(output_file, index=False)
        print(f"Data for {model_name} saved to {output_file}")

def load_model_data_for_advantage(model_name, path0):
    """Read the saved comparison CSV of one model; return None (with a warning) if it is absent."""
    csv_path = os.path.join(path0, 'result', f'results_{model_name}_comparison.csv')

    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)

    print(f"Warning: File {csv_path} not found. Skipping {model_name}.")
    return None

def generate_latex_table(model_list=None, metrics_to_test=None, 
                        target_model='HVBLL', dataset_size=None, output_file='table.tex'):
    """
    Generate a LaTeX table from advantage analysis results

    The comparison CSV of every model is loaded, optionally filtered by
    dataset size, and the advantage of the target model over each baseline is
    computed on test data for every requested metric. The table lists, per
    baseline, the average improvement (AI) and win rate (WR) for each metric
    and the number of compared dataset cases, followed by an overall row.

    Parameters:
    -----------
    model_list : list of str or None
        List of model names to compare (default: ['HVBLL', 'VBLL'])
    metrics_to_test : list of str or None
        List of metrics to include in the table (default: ['nll', 'mae', 'crps'])
    target_model : str
        The model to compare others against (default: 'HVBLL')
    dataset_size : str or None
        Filter by dataset size:
        - 'small': Use only dataset cases whose name ends with 'S1'
        - 'large': Use only dataset cases whose name ends with 'S3'
        - None: Use all datasets
        Any other value prints a warning and uses all data.
    output_file : str
        Output file path for the LaTeX table, relative to the script
        directory (default: 'table.tex')

    Returns:
    --------
    None
        Saves the LaTeX table to the specified output file. Nothing is
        written when no model data or no advantages are available.

    Example:
    --------
    >>> # Generate table for all datasets with NLL and MAE metrics
    >>> generate_latex_table(
    ...     model_list=['HVBLL', 'VBLL', 'BLL'],
    ...     metrics_to_test=['nll', 'mae'],
    ...     target_model='HVBLL',
    ...     dataset_size=None,
    ...     output_file='advantage_table.tex'
    ... )

    Notes:
    ------
    - The function requires that comparison data files exist in the 'result/' directory
    - Generated tables use booktabs rules; the emitted \\multirow and
      tablenotes constructs additionally need the multirow and
      threeparttable packages
    - Underscores in model and metric names are escaped for LaTeX
    - Win rate shows percentage of cases where target model outperforms baseline
    - Average improvement is positive when the target model is better
    """
    if model_list is None:
        model_list = ['HVBLL', 'VBLL']
    if metrics_to_test is None:
        metrics_to_test = ['nll', 'mae', 'crps']

    def tex(text):
        # A bare underscore is a math-mode character in LaTeX text
        return str(text).replace('_', '\\_')

    # Load data for multiple models
    model_data = {}

    print("Loading model data for LaTeX table generation...")
    for model_name in model_list:
        data = load_model_data_for_advantage(model_name, path0)
        if data is not None:
            model_data[model_name] = data
            print(f"Loaded {len(data)} records for {model_name}")

    if not model_data:
        print("Error: No valid model data found!")
        return

    # Filter data by dataset size if specified
    if dataset_size is not None:
        print(f"Filtering data for {dataset_size} datasets...")
        if dataset_size == 'small':
            sample_suffix = 'S1'
        elif dataset_size == 'large':
            sample_suffix = 'S3'
        else:
            print(f"Warning: Unknown dataset_size '{dataset_size}'. Using all data.")
            sample_suffix = None

        if sample_suffix is not None:
            for model_name in model_data:
                frame = model_data[model_name]
                original_count = len(frame)
                # Keep the rows whose dataset case name ends with the sample suffix
                frame = frame[frame['dataset_case'].str.endswith(sample_suffix)]
                model_data[model_name] = frame
                print(f"  {model_name}: {original_count} -> {len(frame)} records")

    # Calculate advantages for all metrics
    all_advantages = {}
    for metric in metrics_to_test:
        advantages = calculate_advantage_metric(
            model_data, 
            metric_name=metric, 
            target_model=target_model, 
            use_test_data=True, 
            aggregation_method='mean'
        )
        if advantages:
            all_advantages[metric] = advantages

    if not all_advantages:
        print("Error: No advantages calculated!")
        return

    dataset_info = f" ({dataset_size} datasets)" if dataset_size else " (all datasets)"
    table_title = f"{target_model} Advantage Summary{dataset_info}"

    # Number of cases of the first metric / first baseline: used for the
    # overall row and the table note
    first_metric = list(all_advantages.keys())[0]
    n_cases = list(all_advantages[first_metric].values())[0]['n_cases']

    n_metrics = len(metrics_to_test)

    # Column layout: model name | (AI, WR) per metric | number of cases
    lines = [
        "\\begin{table}[htbp]",
        "\\centering",
        "\\caption{" + tex(table_title) + "}",
        "\\label{tab:" + target_model.lower() + "_advantage}",
        "\\begin{tabular}{l" + "c" * (n_metrics * 2) + "c}",
        "\\toprule",
    ]

    # Header rows: exactly one cell per tabular column (1 + 2 * n_metrics + 1)
    metric_headers = ["\\multirow{2}{*}{Model}"]
    metric_headers.extend(
        "\\multicolumn{2}{c}{" + tex(metric.upper()) + "}" for metric in metrics_to_test
    )
    metric_headers.append("")
    lines.append(" & ".join(metric_headers) + " \\\\")

    sub_headers = [""] + ["AI", "WR"] * n_metrics + [""]
    lines.append(" & ".join(sub_headers) + " \\\\")

    # One partial rule under each metric's pair of columns
    if n_metrics:
        lines.append(" ".join(
            f"\\cmidrule(lr){{{2 + 2 * i}-{3 + 2 * i}}}" for i in range(n_metrics)
        ))

    # Data rows: one per baseline model
    other_models = [name for name in model_list if name != target_model]

    lines.append("\\midrule")
    for model_name in other_models:
        row_data = [tex(model_name)]
        row_cases = None
        for metric in metrics_to_test:
            stats = all_advantages.get(metric, {}).get(model_name)
            if stats is None:
                row_data.extend(["N/A", "N/A"])
                continue
            row_data.append(f"{stats['avg_improvement']:.3f}")
            row_data.append(f"{stats['win_rate']*100:.1f} \\%")
            if row_cases is None:
                # Report the cases actually compared for this baseline
                row_cases = stats['n_cases']
        row_data.append("N/A" if row_cases is None else str(row_cases))
        lines.append(" & ".join(row_data) + " \\\\")

    # Overall summary row: plain average over the baselines of each metric
    lines.append("\\midrule")
    overall_data = ["\\textbf{Overall}"]
    for metric in metrics_to_test:
        if metric in all_advantages:
            per_model = list(all_advantages[metric].values())
            overall_improvement = np.mean([stats['avg_improvement'] for stats in per_model])
            overall_win_rate = np.mean([stats['win_rate'] for stats in per_model])
            overall_data.append(f"\\textbf{{{overall_improvement:.3f}}}")
            overall_data.append(f"\\textbf{{{overall_win_rate*100:.1f} \\%}}")
        else:
            overall_data.extend(["\\textbf{N/A}", "\\textbf{N/A}"])
    overall_data.append(f"\\textbf{{{n_cases}}}")
    lines.append(" & ".join(overall_data) + " \\\\")

    # Close the table
    shown_target = tex(target_model)
    lines.extend([
        "\\bottomrule",
        "\\end{tabular}",
        "\\begin{tablenotes}",
        "\\small",
        f"\\item Note: Average Improvement represents the relative improvement of {shown_target} over each baseline model (positive values indicate {shown_target} performs better). Win Rate shows the percentage of cases where {shown_target} outperforms the baseline model. All metrics are calculated on test data across {n_cases} dataset cases. Win Rate is calculated as the percentage of cases where the target model outperforms the baseline model. ",
        "\\end{tablenotes}",
        "\\end{table}",
    ])
    latex_content = "\n".join(lines)

    # Write to file
    output_path = os.path.join(path0, output_file)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(latex_content)

    print(f"LaTeX table saved to {output_path}")



if __name__ == "__main__":

    # One entry per UCI dataset: (UCI id, number of cases, threshold passed to
    # the statistics calculation)
    uci_case_list = [
        (165, 5, 100),
        (186, 4, 100),
        (291, 5, 100),
        (294, 5, 100),
        # (464, 2),   # disabled; would also need a threshold value
    ]

    # Expand every UCI dataset into one (file suffix, case label, threshold)
    # entry per case, then add the two datasets that have a single case
    case_list = [
        (f'{id_UCI}-{i_case}', f'DS{id_UCI}-C{i_case}', threshold)
        for id_UCI, n_case, threshold in uci_case_list
        for i_case in range(n_case)
    ]
    case_list += [
        ('era5', 'era5', 100),
        ('laminate', 'laminate', 100),
    ]

    model_list = [
        'HVBLL',
        'VBLL',
        'BLL',
        'MC-Dropout',
        # 'Deep-GP',    #! Very bad results
        'PNN',
        'SWAG',
        'DVI',
        'MDN',
    ]

    save_model_data(case_list, model_list=model_list)

    # Generate LaTeX tables
    banner = "=" * 60
    print("\n" + banner)
    print("Generating LaTeX tables...")
    print(banner)

    # First the table restricted to small datasets, then the one for all datasets
    table_metrics = ['nll', 'mae', 'crps']
    for size, tex_name in (('small', 'table_small.tex'), (None, 'table_all.tex')):
        generate_latex_table(model_list=model_list, target_model='HVBLL',
                             metrics_to_test=table_metrics,
                             dataset_size=size, output_file=tex_name)
