#!/usr/bin/env python3
"""
评估BP问题的solver性能
支持评估pool solvers、population bests和EoH optimal heuristic
"""

import os
import sys
import json
import argparse
import hashlib
import re
from typing import Dict, Any, List, Tuple, Optional
import numpy as np
import pandas as pd

# Add project root to path for imports.
# The script directory, the project root and the heupsro directory are each
# prepended to sys.path (unless already present) so that the `heupsro.*` and
# `load_test_data` imports further down can be resolved.
_script_dir = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): this is two levels above the script directory. Going by the
# path chain noted on the next line (testing -> bp_online -> problems), that
# would be `problems` rather than `bp_online` -- confirm against the repo layout.
_problem_dir = os.path.dirname(os.path.dirname(_script_dir))  # bp_online
_heupsro_dir = os.path.dirname(os.path.dirname(os.path.dirname(_script_dir)))  # heupsro (testing -> bp_online -> problems -> heupsro)
_project_root = os.path.dirname(_heupsro_dir)  # project root
# Each insert goes to position 0, so the directory inserted last ends up first
# on sys.path.
if _script_dir not in sys.path:
    sys.path.insert(0, _script_dir)
if _project_root not in sys.path:
    sys.path.insert(0, _project_root)
if _heupsro_dir not in sys.path:
    sys.path.insert(0, _heupsro_dir)

# Now that bp_online/__init__.py uses lazy imports for adapter, we can safely import directly
from heupsro.core.config import HeuPSROConfig
from heupsro.problems.bp_online.evaluation import evaluate_solvers_on_instances_with_lbs
from heupsro.problems.bp_online.oracle import create_bp_online_oracle
from heupsro.problems.bp_online.initialization import get_eoh_optimal_code, get_best_fit_code, get_first_fit_code, get_worst_fit_code
from load_test_data import load_all_test_data

# Limit BLAS intra-op threads.
# setdefault() only fills in a variable that is not already set, so values
# exported by the caller's shell take precedence over these defaults.
# NOTE(review): numpy and pandas are imported above, before these variables
# are set. Thread-count variables are usually read when the BLAS/OpenMP
# runtime is loaded, so they may be applied too late for this process --
# confirm, or move this section above the numpy import.
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
os.environ.setdefault("OMP_DYNAMIC", "FALSE")
os.environ.setdefault("MKL_DYNAMIC", "FALSE")
os.environ.setdefault("VECLIB_MAXIMUM_THREADS", "1")
os.environ.setdefault("BLIS_NUM_THREADS", "1")
# Presumably picked up by joblib when it starts worker processes -- TODO confirm.
os.environ.setdefault("JOBLIB_START_METHOD", "spawn")
# Default temp directory; /dev/shm is assumed to exist on the host (Linux).
os.environ.setdefault("TMPDIR", "/dev/shm")


def _family_instance_part(clean_name: str, family: str, pattern: str) -> str:
    """Return the instance identifier of *clean_name* for a dataset family.

    The regex is tried first so that long file-style names (which carry extra
    ``_n..._lb..._<order>_seed...`` parts) collapse to the core identifier.
    If it does not match, a leading ``<family>_`` prefix is removed when
    present and the remainder is returned unchanged.
    """
    match = re.search(pattern, clean_name)
    if match:
        return match.group()
    prefix = f"{family}_"
    if clean_name.startswith(prefix):
        return clean_name[len(prefix):]
    return clean_name


def generate_column_name(dataset_label: str, instance_name: str) -> str:
    """Build the per-instance result column name in the reference-file format.

    Columns in the reference file are called ``mean_gap`` but hold
    per-instance results, for example::

        Falkenauer_t120_00_random_mean_gap
        Hard28_BPP119_random_mean_gap
        Hard28_BPP119_size_asc_mean_gap
        Hard28_BPP119_size_desc_mean_gap
        Weibull_shape1p4_scale30p0_1000_00_random_mean_gap
        HeavyTail_alpha1.2_1000_00_random_mean_gap
        Mixture2_1000_00_random_mean_gap

    Args:
        dataset_label: Dataset label, optionally ending in an order suffix
            (``_random``, ``_size_asc`` or ``_size_desc``), e.g.
            ``'Hard28_size_desc'`` or ``'Falkenauer_T120_120_random'``.
            Without a suffix the order defaults to ``random``.
        instance_name: Instance name, either short (``'BPP119'``) or a full
            file-style name such as
            ``'dataset_Hard28_BPP119_n200_lb76_size_desc'``.

    Returns:
        ``{base_name}_{order}_mean_gap``; for ``g<digit>`` datasets the order
        is omitted (``{instance}_mean_gap``).
    """
    order_suffixes = ('_random', '_size_asc', '_size_desc')

    # Split the order off the dataset label; default to 'random' when absent.
    order = 'random'
    base_dataset = dataset_label
    for suffix in order_suffixes:
        if dataset_label.endswith(suffix):
            order = suffix[1:]
            base_dataset = dataset_label[:-len(suffix)]
            break

    # Normalise the instance name: drop the 'dataset_' prefix and a trailing
    # order suffix, e.g. dataset_Hard28_BPP119_n200_lb76_size_desc
    # -> Hard28_BPP119_n200_lb76.
    clean_instance_name = instance_name
    if clean_instance_name.startswith('dataset_'):
        clean_instance_name = clean_instance_name[len('dataset_'):]
    for suffix in order_suffixes:
        if clean_instance_name.endswith(suffix):
            clean_instance_name = clean_instance_name[:-len(suffix)]
            break

    if base_dataset.startswith('Falkenauer_'):
        # Core identifier looks like t120_00 / u250_00; the 'Falkenauer_'
        # prefix is always re-added so the format is uniform.
        match = re.search(r'([tu]\d+_\d+)', clean_instance_name)
        if match:
            instance_part = match.group(1)
        else:
            parts = clean_instance_name.split('_')
            if len(parts) >= 3 and parts[0] == 'Falkenauer':
                instance_part = '_'.join(parts[1:3])
            else:
                instance_part = clean_instance_name
        return f"Falkenauer_{instance_part}_{order}_mean_gap"

    if base_dataset.startswith('Hard28'):
        # Prefer an underscore-delimited part starting with 'BPP', then any
        # 'BPP<digits>' substring, then the cleaned name itself.
        instance_part = next(
            (part for part in clean_instance_name.split('_') if part.startswith('BPP')),
            None,
        )
        if instance_part is None:
            match = re.search(r'BPP\d+', clean_instance_name)
            instance_part = match.group() if match else clean_instance_name
        return f"Hard28_{instance_part}_{order}_mean_gap"

    if base_dataset.startswith('Weibull_'):
        # Weibull_shape1p4_scale30p0_1000_00[_n1000_lbXXX_random_seed0]
        # -> shape1p4_scale30p0_1000_00
        instance_part = _family_instance_part(
            clean_instance_name, 'Weibull', r'shape[\dp]+_scale[\dp]+_\d+_\d+'
        )
        return f"Weibull_{instance_part}_{order}_mean_gap"

    if base_dataset.startswith('HeavyTail_'):
        # HeavyTail_alpha1.2_1000_00[_n1000_lbXXX_random_seed0]
        # -> alpha1.2_1000_00
        instance_part = _family_instance_part(
            clean_instance_name, 'HeavyTail', r'alpha[\d.]+_\d+_\d+'
        )
        return f"HeavyTail_{instance_part}_{order}_mean_gap"

    if base_dataset.startswith('g') and len(base_dataset) >= 2 and base_dataset[1].isdigit():
        # g2/g4 datasets: g2_50_100_1_mean_gap (no order component).
        return f"{clean_instance_name}_mean_gap"

    # Mixture datasets (e.g. Mixture2_1000_00) and anything unrecognised use
    # the cleaned instance name directly.
    return f"{clean_instance_name}_{order}_mean_gap"

def natural_key(text):
    """Return a sort key that orders embedded digit runs numerically.

    The text is split on runs of digits; digit runs become ints and every
    other piece is lower-cased, so 'g2' sorts before 'g10'.
    """
    key = []
    for token in re.split(r'(\d+)', text):
        if token.isdigit():
            key.append(int(token))
        else:
            key.append(token.lower())
    return key


def load_config_from_experiment(exp_dir: str) -> Optional[HeuPSROConfig]:
    """Build a BPOnlineConfig from ``<exp_dir>/config.json``.

    Only keys that already exist as attributes on a default BPOnlineConfig
    are copied over; unknown keys are ignored.

    Returns:
        The populated config, or None when config.json is absent or cannot be
        loaded (a warning is printed in the latter case).
    """
    config_path = os.path.join(exp_dir, "config.json")
    if not os.path.exists(config_path):
        return None

    try:
        with open(config_path, "r") as handle:
            stored = json.load(handle)
        # Imported lazily, inside the try, so an import failure is reported
        # through the same warning path as a bad file.
        from heupsro.problems.bp_online.config import BPOnlineConfig
        cfg = BPOnlineConfig()
        for name, stored_value in stored.items():
            if not hasattr(cfg, name):
                continue
            setattr(cfg, name, stored_value)
    except Exception as e:
        print(f"Warning: Failed to load config from {config_path}: {e}")
        return None
    return cfg


def list_population_files(pops_dir: str) -> List[str]:
    """Return the population_generation_*.json paths in *pops_dir*, naturally sorted.

    The natural sort key is applied to the full joined path.
    """
    matches = []
    for entry in os.listdir(pops_dir):
        if not entry.startswith('population_generation_'):
            continue
        if not entry.endswith('.json'):
            continue
        matches.append(os.path.join(pops_dir, entry))
    return sorted(matches, key=natural_key)


def load_population_best_individual(pop_file: str, *, select: str = 'min') -> Optional[Dict[str, Any]]:
    """Return the best individual stored in a population JSON file.

    Args:
        pop_file: Path to a JSON file holding a list of individuals (dicts).
        select: ``'min'`` picks the smallest ``objective``; any other value
            picks the largest.

    Returns:
        The selected individual, or None when the file cannot be read or
        parsed, or does not contain a non-empty list.

    An individual whose ``objective`` is missing or ``None`` is ranked worst
    for the chosen direction. Previously an explicit ``None`` made the
    comparison raise, and the whole population was silently discarded.
    """
    pick_min = select == 'min'
    worst = float('inf') if pick_min else float('-inf')

    def objective_of(individual):
        value = individual.get('objective')
        return worst if value is None else value

    try:
        with open(pop_file, 'r', encoding='utf-8') as f:
            population = json.load(f)
        if not isinstance(population, list) or not population:
            return None
        chooser = min if pick_min else max
        return chooser(population, key=objective_of)
    except Exception:
        # Deliberate best-effort: an unreadable or malformed population file
        # is skipped by the caller rather than aborting the evaluation.
        return None


def load_pools(exp_dir: str) -> Dict[str, Any]:
    """Read and return the parsed ``psro_results/pools.json`` of an experiment.

    Errors from opening or parsing the file propagate to the caller.
    """
    pools_file = os.path.join(exp_dir, 'psro_results', 'pools.json')
    with open(pools_file, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def generate_test_instances(n_instances: int, capacity: int, num_items: int, seed: int = 42) -> Tuple[List[Dict], List[float]]:
    """Generate synthetic instances with Weibull-distributed item sizes.

    Seeds NumPy's global RNG with *seed*, then for each instance draws
    *num_items* Weibull(shape=2.0) samples scaled by 30.0 and clipped to
    ``[1, capacity - 1]``. Each instance's bound comes from the 'lb' oracle's
    ``solve_exact``.

    Returns:
        ``(instances, lower_bounds)`` as two parallel lists.
    """
    weibull_shape = 2.0
    weibull_scale = 30.0

    np.random.seed(seed)
    oracle = create_bp_online_oracle(oracle_type='lb')

    instances = []
    lower_bounds = []
    for _ in range(n_instances):
        raw_sizes = np.random.weibull(weibull_shape, num_items) * weibull_scale
        sizes = np.clip(raw_sizes, 1, capacity - 1)
        instance = {
            'items': sizes.tolist(),
            'capacity': capacity,
            'num_items': num_items,
        }
        instances.append(instance)
        lower_bounds.append(oracle.solve_exact(instance))

    return instances, lower_bounds


def save_table(out_dir: str, table: Dict[str, Dict[str, float]], basename: str) -> None:
    """Write *table* to ``<out_dir>/<basename>.csv``, creating *out_dir* if needed.

    The header is ``solver`` followed by the sorted union of all row keys.
    Rows are written in the table's iteration order; missing or None cells
    are left empty and other values are formatted with six decimals.
    """
    os.makedirs(out_dir, exist_ok=True)
    csv_path = os.path.join(out_dir, f"{basename}.csv")

    column_set = set()
    for row in table.values():
        column_set.update(row.keys())
    columns = sorted(column_set)

    with open(csv_path, 'w') as out:
        header = ["solver"]
        header.extend(columns)
        out.write(",".join(header) + "\n")
        for solver, row in table.items():
            cells = []
            for column in columns:
                value = row.get(column)
                if value is None:
                    cells.append("")
                else:
                    cells.append(f"{value:.6f}")
            out.write(",".join([solver] + cells) + "\n")


def load_existing_table(out_dir: str, basename: str) -> Optional[Dict[str, Dict[str, float]]]:
    """Load a previously saved results table from ``<out_dir>/<basename>.csv``.

    Columns whose names start with csAA, csAB, csBA or csBB (the Irnich_BPP
    datasets) are dropped. Empty/NaN cells are omitted from each row's dict.

    Returns:
        ``{solver: {column: value}}``, or None when the file does not exist
        or cannot be read. A read failure prints a warning, because the
        caller treats None as "no existing results" and will later overwrite
        the file.
    """
    cp = os.path.join(out_dir, f"{basename}.csv")
    if not os.path.exists(cp):
        return None

    excluded_prefixes = ('csAA', 'csAB', 'csBA', 'csBB')
    try:
        df = pd.read_csv(cp)
        valid_cols = [
            col for col in df.columns
            if col == 'solver' or not col.startswith(excluded_prefixes)
        ]
        df_filtered = df[valid_cols]
        value_cols = [col for col in valid_cols if col != 'solver']
        table = {}
        for _, row in df_filtered.iterrows():
            table[row['solver']] = {
                col: row[col] for col in value_cols if pd.notna(row[col])
            }
        return table
    except Exception as e:
        # Best-effort load: report the failure instead of hiding it, then
        # fall back to "no existing table" as before.
        print(f"Warning: Failed to load table from {cp}: {e}")
        return None


def parse_args() -> argparse.Namespace:
    """Define and parse the command-line options of the evaluation script.

    ``--mode`` is required and selects what is evaluated: population bests,
    pool solvers, the EoH optimal heuristic, or one of the Best/First/Worst
    Fit baselines. The default ``--test_data_dir`` is the ``TestingData``
    directory next to this script.

    Returns:
        The parsed ``argparse.Namespace`` (arguments are read from sys.argv).
    """
    # Only the script directory is needed here (for the default data path).
    script_dir = os.path.dirname(os.path.abspath(__file__))
    _default_test_data_dir = os.path.join(script_dir, 'TestingData')

    p = argparse.ArgumentParser(description="Evaluate pool solvers, population-bests, EoH optimal heuristic, or baseline algorithms (Best Fit, First Fit, Worst Fit)")
    p.add_argument('--mode', type=str, required=True, choices=['population', 'pool', 'optimal', 'best_fit', 'first_fit', 'worst_fit'],
                   help='Evaluation mode: population (from exp_without_pool), pool (from exp_with_pool), optimal (EoH optimal heuristic), best_fit (Best Fit baseline), first_fit (First Fit baseline), or worst_fit (Worst Fit baseline)')

    # Experiment locations and I/O.
    p.add_argument('--exp_with_pool', type=str, default=None,
                   help='Experiment directory with pool (for pool mode)')
    p.add_argument('--exp_without_pool', type=str, default=None,
                   help='Experiment directory without pool (for population mode)')
    p.add_argument('--exp_dir', type=str, default=None,
                   help='Experiment directory to load config from (optional)')
    p.add_argument('--test_data_dir', type=str, default=_default_test_data_dir,
                   help=f'Directory containing test instances (default: {_default_test_data_dir})')
    p.add_argument('--out_dir', type=str, default=None,
                   help='Output directory for results (default: based on mode)')

    # Problem and execution settings.
    p.add_argument('--capacity', type=int, default=None,
                   help='Bin capacity (default: from config or 100)')
    p.add_argument('--num_items', type=int, default=None,
                   help='Number of items per instance (default: from config or 100)')
    p.add_argument('--time_limit', type=int, default=5,
                   help='Time limit per instance (default: 5)')
    p.add_argument('--n_jobs', type=int, default=-1,
                   help='Number of parallel jobs (default: -1 for all cores)')
    p.add_argument('--backend', type=str, default='threading', choices=['threading', 'loky', 'multiprocessing'],
                   help='Parallel backend (default: threading)')
    p.add_argument('--prefer', type=str, default='threads', choices=['threads', 'processes'],
                   help='Parallel preference (default: threads)')

    # Size limits.
    p.add_argument('--n_test_instances', type=int, default=0,
                   help='Number of test instances to generate if no data loaded (default: 0)')
    p.add_argument('--limit_test_instances', type=int, default=None,
                   help='Limit number of instances per dataset (default: None)')
    p.add_argument('--limit_pool_solvers', type=int, default=None,
                   help='Limit number of pool solvers to evaluate (default: None)')
    p.add_argument('--limit_populations', type=int, default=None,
                   help='Limit number of population files to evaluate (default: None)')

    # Behaviour switches.
    p.add_argument('--select', type=str, default='min', choices=['min', 'max'],
                   help='Select best individual by min or max objective (default: min)')
    p.add_argument('--resume', action='store_true',
                   help='Resume evaluation (skip already computed results)')
    p.add_argument('--debug_ident', action='store_true',
                   help='Print solver identification info for debugging')
    # NOTE: the next two flags combine store_true with default=True, so they
    # are True whether or not they are passed; there is currently no
    # command-line way to turn them off.
    p.add_argument('--use_lower_bounds_json', action='store_true', default=True,
                   help='Use lower_bounds.json if available (default: True)')
    p.add_argument('--only_ordered_data_outputs', action='store_true', default=True,
                   help='Only load data from ordered_data/outputs directory (excluding group_B) and Synthetic/Weibull (default: True)')

    return p.parse_args()


def main() -> None:
    args = parse_args()

    # Validate mode-specific arguments
    if args.mode == 'pool' and not args.exp_with_pool:
        raise ValueError("--exp_with_pool is required for pool mode")
    if args.mode == 'population' and not args.exp_without_pool:
        raise ValueError("--exp_without_pool is required for population mode")
    # optimal, best_fit, first_fit, and worst_fit modes don't require experiment directories
    
    exp_with_pool = os.path.abspath(args.exp_with_pool) if args.exp_with_pool else None
    exp_without_pool = os.path.abspath(args.exp_without_pool) if args.exp_without_pool else None
    
    # Set default output directory
    if args.out_dir:
        out_dir = os.path.abspath(args.out_dir)
    else:
        # Baseline modes (optimal, best_fit, first_fit, worst_fit) save to baseline_results/
        if args.mode in ['optimal', 'best_fit', 'first_fit', 'worst_fit']:
            baseline_results_dir = os.path.join(_script_dir, 'baseline_results')
            os.makedirs(baseline_results_dir, exist_ok=True)
            out_dir = baseline_results_dir
        elif args.mode == 'pool' and exp_with_pool:
            out_dir = os.path.join(exp_with_pool, 'test')
        elif args.mode == 'population' and exp_without_pool:
            out_dir = os.path.join(exp_without_pool, 'test')
        else:
            out_dir = _script_dir

    # Load config
    config_exp_dir = args.exp_dir if args.exp_dir else (exp_with_pool if args.mode != 'optimal' else None)
    config = load_config_from_experiment(config_exp_dir) if config_exp_dir else None
    
    if config:
        backend = args.backend if args.backend != 'threading' else getattr(config, 'parallel_backend', 'loky')
        prefer = args.prefer if args.prefer != 'threads' else getattr(config, 'parallel_prefer', 'processes')
        n_jobs = args.n_jobs if args.n_jobs != -1 else (
            getattr(config, 'test_n_jobs', None) or 
            getattr(config, 'eval_n_jobs', -1)
        )
        capacity = args.capacity if args.capacity is not None else getattr(config, 'capacity', 100)
        num_items = args.num_items if args.num_items is not None else getattr(config, 'num_items', 100)
        time_limit = args.time_limit if args.time_limit != 5 else (
            getattr(config, 'test_time_limit', None) or 
            getattr(config, 'instance_solver_time_limit', 5)
        )
        config.capacity = capacity
        config.num_items = num_items
        config.instance_solver_time_limit = time_limit
        print(f"✅ Loaded configuration from {config_exp_dir}/config.json")
    else:
        backend = args.backend
        prefer = args.prefer
        n_jobs = args.n_jobs
        capacity = args.capacity or 100
        num_items = args.num_items or 100
        time_limit = args.time_limit
        from heupsro.problems.bp_online.config import BPOnlineConfig
        config = BPOnlineConfig(
            capacity=capacity,
            num_items=num_items,
            instance_solver_time_limit=time_limit,
            parallel_backend=backend,
            parallel_prefer=prefer,
            eval_n_jobs=n_jobs
        )
        print(f"⚠️  No config.json found, using command-line arguments")

    # Load test instances using load_test_data module
    # 默认只使用ordered_data/outputs目录中的数据
    datasets_dict = load_all_test_data(
        test_data_dir=args.test_data_dir,
        use_lower_bounds_json=args.use_lower_bounds_json,
        only_ordered_data_outputs=args.only_ordered_data_outputs
    )
    
    # Filter out HeavyTail and Mixture datasets (no longer evaluating heavy tail mixture and mixture)
    heavy_tail_keys = [key for key in datasets_dict.keys() if key.startswith('HeavyTail_')]
    mixture_keys = [key for key in datasets_dict.keys() if key.startswith('Mixture')]
    filtered_keys = heavy_tail_keys + mixture_keys
    if filtered_keys:
        for key in filtered_keys:
            del datasets_dict[key]
        if heavy_tail_keys:
            print(f"\n⚠️  Filtered out {len(heavy_tail_keys)} HeavyTail datasets: {', '.join(heavy_tail_keys)}")
        if mixture_keys:
            print(f"⚠️  Filtered out {len(mixture_keys)} Mixture datasets: {', '.join(mixture_keys)}")
    
    # If no datasets loaded, generate test instances
    if not datasets_dict:
        if args.mode == 'optimal' or args.n_test_instances > 0:
            print(f"No datasets found, generating {args.n_test_instances} test instances...")
            instances, lower_bounds = generate_test_instances(args.n_test_instances, capacity, num_items)
            dataset_label = f"Generated_{num_items}"
            datasets_dict[dataset_label] = (instances, lower_bounds)
        else:
            raise RuntimeError(
                f"No test instances loaded. Please check:\n"
                f"  - test_data_dir: {args.test_data_dir}"
            )
    
    # Limit instances per dataset if requested
    if args.limit_test_instances is not None and args.limit_test_instances > 0:
        for label in datasets_dict:
            instances, lbs = datasets_dict[label]
            if len(instances) > args.limit_test_instances:
                datasets_dict[label] = (instances[:args.limit_test_instances], lbs[:args.limit_test_instances])
    
    dataset_labels = list(datasets_dict.keys())
    print(f"\n✅ Loaded {len(dataset_labels)} datasets:")
    for label in dataset_labels:
        instances, lbs = datasets_dict[label]
        print(f"  - {label}: {len(instances)} instances")

    # Load sources based on mode
    all_row_keys = []
    all_solver_codes = []
    
    # Build result tables (only gap_table)
    if args.mode in ['optimal', 'best_fit', 'first_fit', 'worst_fit']:
        # For all baseline modes, load existing baseline_gap_table.csv if it exists
        # This allows all baseline algorithms (including eoh_optimal) to be saved in the same file
        gap_table: Dict[str, Dict[str, float]] = load_existing_table(out_dir, 'baseline_gap_table') or {}
    else:
        gap_table: Dict[str, Dict[str, float]] = load_existing_table(out_dir, 'trend_gap_table') or {}
    
    # A: pool solvers
    if args.mode == 'pool':
        pools = load_pools(exp_with_pool)
        pool_solvers = pools.get('solvers', [])
        if args.limit_pool_solvers is not None and args.limit_pool_solvers > 0:
            pool_solvers = pool_solvers[:args.limit_pool_solvers]
        
        for idx, solver in enumerate(pool_solvers):
            row_key = f"h{idx}"
            
            code = solver.get('code', '')
            algo = solver.get('algorithm', '')
            if args.debug_ident:
                h = hashlib.sha256((code or '').encode('utf-8', errors='ignore')).hexdigest()
                algo_preview = (algo or '')[:32].replace('\n', ' ')
                print(f"Queue {row_key} (program_id={program_id}, idx={idx}) algo_preview={algo_preview} code_len={len(code)} sha256={h}")
            all_row_keys.append(row_key)
            all_solver_codes.append(code)
            if row_key not in gap_table:
                gap_table[row_key] = {}
    
    # B: population bests
    elif args.mode == 'population':
        pop_best = []
        if exp_without_pool:
            pops_dir = os.path.join(exp_without_pool, 'solver_eoh', 'results', 'pops')
            if os.path.isdir(pops_dir):
                pop_files = list_population_files(pops_dir)
                if args.limit_populations is not None and args.limit_populations > 0:
                    pop_files = pop_files[:args.limit_populations]
                for pf in pop_files:
                    ind = load_population_best_individual(pf, select=args.select)
                    if ind is not None:
                        # 从文件名提取generation number: population_generation_{i}.json -> i
                        filename = os.path.splitext(os.path.basename(pf))[0]
                        # 提取generation number
                        match = re.search(r'population_generation_(\d+)', filename)
                        if match:
                            gen_num = int(match.group(1))
                            pop_best.append((gen_num, ind))
                        else:
                            # 如果无法提取，使用文件名
                            pop_best.append((filename, ind))
        
        print(f"Total population bests: {len(pop_best)}")
        for gen_or_name, ind in pop_best:
            # 使用h{i}格式，与pool模式保持一致
            if isinstance(gen_or_name, int):
                row_key = f"h{gen_or_name}"
            else:
                # 如果无法提取generation number，使用原格式作为fallback
                row_key = f"popbest_{gen_or_name}"
            code = ind.get('code', '')
            algo = ind.get('algorithm', '')
            if args.debug_ident:
                h = hashlib.sha256((code or '').encode('utf-8', errors='ignore')).hexdigest()
                algo_preview = (algo or '')[:32].replace('\n', ' ')
                print(f"Queue {row_key} algo_preview={algo_preview} code_len={len(code)} sha256={h}")
            all_row_keys.append(row_key)
            all_solver_codes.append(code)
            if row_key not in gap_table:
                gap_table[row_key] = {}
    
    # C: EoH optimal heuristic
    elif args.mode == 'optimal':
        eoh_code = get_eoh_optimal_code()
        print(f"Using EoH optimal heuristic code (length: {len(eoh_code)} chars)")
        row_key = "eoh_optimal"
        all_row_keys.append(row_key)
        all_solver_codes.append(eoh_code)
        gap_table[row_key] = {}
    
    # D: Best Fit baseline
    elif args.mode == 'best_fit':
        best_fit_code = get_best_fit_code()
        print(f"Using Best Fit baseline algorithm (length: {len(best_fit_code)} chars)")
        row_key = "best_fit"
        all_row_keys.append(row_key)
        all_solver_codes.append(best_fit_code)
        gap_table[row_key] = {}
    
    # E: First Fit baseline
    elif args.mode == 'first_fit':
        first_fit_code = get_first_fit_code()
        print(f"Using First Fit baseline algorithm (length: {len(first_fit_code)} chars)")
        row_key = "first_fit"
        all_row_keys.append(row_key)
        all_solver_codes.append(first_fit_code)
        gap_table[row_key] = {}
    
    # F: Worst Fit baseline
    elif args.mode == 'worst_fit':
        worst_fit_code = get_worst_fit_code()
        print(f"Using Worst Fit baseline algorithm (length: {len(worst_fit_code)} chars)")
        row_key = "worst_fit"
        all_row_keys.append(row_key)
        all_solver_codes.append(worst_fit_code)
        gap_table[row_key] = {}

    # Check if all results already exist before starting evaluation (only for non-resume mode)
    # Also check for anomalous gap values (> 1000%) that should be recomputed
    if not args.resume:
        all_results_exist = True
        missing_results = []
        anomalous_results = []  # Results with gap > 1000% that should be recomputed
        
        for dataset_label in dataset_labels:
            for row_key in all_row_keys:
                mean_col_gap = f"{dataset_label}_mean_gap"
                if row_key not in gap_table or mean_col_gap not in gap_table[row_key]:
                    all_results_exist = False
                    missing_results.append((row_key, dataset_label))
                else:
                    # Check for anomalous gap values (> 1000%)
                    gap_value = gap_table[row_key].get(mean_col_gap)
                    if gap_value is not None:
                        try:
                            gap_float = float(gap_value)
                            if gap_float > 1000.0:  # Gap > 1000% is considered anomalous
                                all_results_exist = False
                                anomalous_results.append((row_key, dataset_label, gap_float))
                        except (ValueError, TypeError):
                            pass
        
        if all_results_exist:
            print(f"\n✅ All results already exist in CSV file!")
            print(f"   Found results for {len(all_row_keys)} solvers across {len(dataset_labels)} datasets")
            print(f"   Skipping evaluation...")
            
            # Final save (to ensure CSV is up to date)
            if args.mode in ['optimal', 'best_fit', 'first_fit', 'worst_fit']:
                # All baseline algorithms (including eoh_optimal) save to the same file
                save_table(out_dir, gap_table, 'baseline_gap_table')
                print(f"\n✅ Results saved to: {out_dir}")
                print(f"   - baseline_gap_table.csv")
            else:
                save_table(out_dir, gap_table, 'trend_gap_table')
                print(f"\n✅ Results saved to: {out_dir}")
                print(f"   - trend_gap_table.csv")
            return
        
        # If some results are missing or anomalous, show what needs to be evaluated
        if anomalous_results:
            print(f"\n⚠️  发现 {len(anomalous_results)} 个异常gap值（>1000%），将重新计算:")
            anomalous_by_solver = {}
            for row_key, dataset_label, gap_val in anomalous_results:
                if row_key not in anomalous_by_solver:
                    anomalous_by_solver[row_key] = []
                anomalous_by_solver[row_key].append((dataset_label, gap_val))
            for row_key, datasets in anomalous_by_solver.items():
                print(f"   {row_key}: {len(datasets)} 个异常数据集")
                for dataset_label, gap_val in datasets[:3]:  # 只显示前3个
                    print(f"      - {dataset_label}: gap={gap_val:.1f}%")
                if len(datasets) > 3:
                    print(f"      ... 还有 {len(datasets) - 3} 个")
        
        # If some results are missing, show what needs to be evaluated
        if missing_results:
            print(f"\n⚠️  Some results are missing, will evaluate:")
            missing_by_solver = {}
            for row_key, dataset_label in missing_results:
                if row_key not in missing_by_solver:
                    missing_by_solver[row_key] = []
                missing_by_solver[row_key].append(dataset_label)
            for row_key, datasets in missing_by_solver.items():
                print(f"   {row_key}: missing {len(datasets)} dataset(s) - {', '.join(datasets)}")

    # Evaluate each dataset separately
    for dataset_label in dataset_labels:
        test_instances, test_lbs = datasets_dict[dataset_label]
        
        # Check which solvers need evaluation
        eval_row_keys = []
        eval_solver_codes = []
        for solver_idx, (row_key, code) in enumerate(zip(all_row_keys, all_solver_codes)):
            need_eval = False
            # 检查是否所有实例的结果都已存在
            for inst_idx in range(len(test_instances)):
                inst_name = test_instances[inst_idx].get('instance_name', f'inst_{inst_idx}')
                col_name_gap = generate_column_name(dataset_label, inst_name)
                if row_key not in gap_table or col_name_gap not in gap_table[row_key]:
                    need_eval = True
                    break
                else:
                    # 检查异常值（gap > 1000%）
                    gap_value = gap_table[row_key].get(col_name_gap)
                    if gap_value is not None:
                        try:
                            gap_float = float(gap_value)
                            if gap_float > 1000.0:  # Gap > 1000% is considered anomalous
                                need_eval = True
                                # Clear the anomalous value
                                del gap_table[row_key][col_name_gap]
                                break
                        except (ValueError, TypeError):
                            pass
            
            if need_eval:
                eval_row_keys.append(row_key)
                eval_solver_codes.append(code)
        
        # Initialize table rows
        for row_key in all_row_keys:
            if row_key not in gap_table:
                gap_table[row_key] = {}

        # Batch evaluate
        if eval_row_keys and test_instances:
            print(f"\nEvaluating {len(eval_row_keys)} solvers on {len(test_instances)} instances from dataset '{dataset_label}'")
            
            raw_results = evaluate_solvers_on_instances_with_lbs(
                solver_codes=eval_solver_codes,
                instances=test_instances,
                lower_bounds=test_lbs,
                config=config,
                return_format="raw",
            )
            
            # Save detailed results (only per-instance, no mean_gap)
            for solver_idx, row_key in enumerate(eval_row_keys):
                saved_count = 0
                skipped_duplicates = 0
                for inst_idx in range(len(test_instances)):
                    key = (solver_idx, inst_idx)
                    if key in raw_results:
                        num_bins, gap = raw_results[key]
                        if gap is not None:
                            # Save per-instance gap using reference file format
                            inst_name = test_instances[inst_idx].get('instance_name', f'inst_{inst_idx}')
                            col_name_gap = generate_column_name(dataset_label, inst_name)
                            
                            # 检查列名是否已存在，避免重复列
                            if col_name_gap in gap_table[row_key]:
                                existing_gap = gap_table[row_key][col_name_gap]
                                # 如果值相同，跳过；如果值不同，警告并保留现有值
                                if abs(existing_gap - gap) > 0.01:  # 允许0.01%的误差
                                    print(f"    ⚠️  警告: 列 {col_name_gap} 已存在，值不同 (现有: {existing_gap:.2f}%, 新值: {gap:.2f}%)，保留现有值")
                                skipped_duplicates += 1
                            else:
                                gap_table[row_key][col_name_gap] = gap
                                saved_count += 1
                
                if skipped_duplicates > 0:
                    print(f"  {row_key}: saved {saved_count} instance results, skipped {skipped_duplicates} duplicates")
                else:
                    print(f"  {row_key}: saved {saved_count} instance results")
        else:
            print(f"\nDataset '{dataset_label}': All results already computed, skipping evaluation")

    # Validate: 确保列数和实例数一致
    total_instances = sum(len(instances) for instances, _ in datasets_dict.values())
    
    # 统计所有唯一的列名（跨所有solver）
    all_unique_cols = set()
    for row in gap_table.values():
        all_unique_cols.update(row.keys())
    total_columns = len(all_unique_cols)
    
    # 统计每个solver的列数
    cols_per_solver = {solver: len(row) for solver, row in gap_table.items()}
    
    print(f"\n📊 验证结果:")
    print(f"  总实例数: {total_instances}")
    print(f"  总唯一列数: {total_columns}")
    print(f"  每个solver的列数: {cols_per_solver}")
    
    if total_columns != total_instances:
        diff = abs(total_columns - total_instances)
        print(f"  ⚠️  警告: 列数 ({total_columns}) 与实例数 ({total_instances}) 不一致！")
        print(f"  差异: {diff}")
        
        # 检查是否有重复列（同一solver内不应该有重复，但跨solver可能有）
        # 检查是否有相似的列名（可能是同一实例的不同命名）
        if total_columns < total_instances:
            print(f"  可能的原因: 某些实例生成了相同的列名（去重后）")
        else:
            print(f"  可能的原因: 某些实例生成了多个不同的列名")
            
        # 检查每个solver的列数是否一致
        if len(set(cols_per_solver.values())) > 1:
            print(f"  ⚠️  不同solver的列数不一致！")
    else:
        print(f"  ✅ 列数与实例数一致")
        
    # 检查每个solver的列数是否一致
    if len(set(cols_per_solver.values())) == 1:
        expected_cols = list(cols_per_solver.values())[0]
        if expected_cols == total_instances:
            print(f"  ✅ 所有solver的列数一致，且等于实例数")
        else:
            print(f"  ⚠️  所有solver的列数一致 ({expected_cols})，但与实例数 ({total_instances}) 不一致")
    
    # Final save
    if args.mode in ['optimal', 'best_fit', 'first_fit', 'worst_fit']:
        # All baseline algorithms (including eoh_optimal) save to the same file
        save_table(out_dir, gap_table, 'baseline_gap_table')
        print(f"\n✅ Saved {args.mode} results to: {out_dir}")
        print(f"   - baseline_gap_table.csv")
    else:
        save_table(out_dir, gap_table, 'trend_gap_table')
        print(f"Saved trend tables under: {out_dir}")


# Script entry point: run main() only when this file is executed directly,
# so importing the module does not trigger it.
if __name__ == "__main__":
    main()

