import torch
import os

class Config:
    """Static container for experiment settings, stored as class attributes.

    Settings are read directly from the class (e.g. ``Config.BATCH_SIZE``);
    no instance is required. Use :meth:`to_dict` to obtain a plain-dict
    snapshot of every setting.
    """

    # -------------------------- Basic Env Config --------------------------
    DEVICE = "cuda"  # Set to "cpu" manually if CUDA is unavailable
    NUM_WORKERS = 4  # DataLoader workers
    BATCH_SIZE = 64  # Train/val batch size

    # -------------------------- Data Config --------------------------
    PROCESSED_DATA_DIR = "./data/processed"
    DATASET_NAME = "big_kaggle"
    MAX_PUZZLES = 6250  # Max puzzles to load
    TRAIN_SPLIT = 0.8   # Train set ratio
    VAL_SPLIT = 0.1     # Val set ratio (test = 1 - train - val)

    # -------------------------- Path Config --------------------------
    # All output directories live under RESULT_DIR.
    RESULT_DIR = "sudoku-5k"
    VIS_DIR = os.path.join(RESULT_DIR, "vis_tlad_dynamics")
    SAVE_DIR = os.path.join(RESULT_DIR, "checkpoints")
    LOG_DIR = os.path.join(RESULT_DIR, "logs")
    CSV_DIR = os.path.join(RESULT_DIR, "training_records")

    # -------------------------- Model Architecture --------------------------
    GRID_SIZE = 9                       # Fixed 9x9 Sudoku grid
    SEQ_LEN = GRID_SIZE * GRID_SIZE     # Total tokens (9*9 = 81)
    NUM_HEADS = 4                       # Attention heads
    NUM_LAYERS_S1 = 4                   # Transformer layers (Stage 1)

    # -------------------------- Dynamics Engine --------------------------
    EBA_STEP_SIZE = 1.0         # EBA optimizer step size
    EBA_MOMENTUM = 0.9          # EBA optimizer momentum
    TEMP_START = 5.0            # Initial annealing temperature
    TEMP_END = 1.0              # Final annealing temperature
    W_TASK = 2.5                # Task loss weight
    W_CELL_UNIQ = 1.0           # Cell uniqueness loss weight
    W_ROW = 1.0                 # Row constraint loss weight
    W_COL = 1.0                 # Column constraint loss weight
    W_BOX = 1.0                 # Box constraint loss weight
    W_ENTROPY = 0.1             # Entropy regularization weight

    # -------------------------- Training Strategy --------------------------
    LR_PHASE1 = 1e-3            # Learning rate (Phase 1)
    LR_PHASE2 = 1e-4            # Learning rate (Phase 2)

    # -------------------------- Tunable Core Parameters --------------------------
    EPOCHS_PHASE1 = 1           # Train epochs (Phase 1)
    EPOCHS_PHASE2 = 1           # Train epochs (Phase 2)
    EBA_STEPS = 24              # EBA iteration steps
    HIDDEN_DIM = 256            # Transformer hidden dimension
    TRANSITION_CENTER = 0.8     # Annealing transition center
    ANNEALING_SLPOE = 2.0       # Annealing slope (typo in the name preserved on purpose)
    LAMBDA_INIT = 10            # Initial Lagrangian multiplier
    LAMBDA_MAX = 10             # Max Lagrangian multiplier

    # -------------------------- Utility --------------------------
    @classmethod
    def to_dict(cls):
        """Return the configuration as a plain ``{name: value}`` dict.

        Dunder attributes and callables (such as this method) are excluded.
        Attributes inherited from base classes are included, so a subclass
        that overrides only a few settings still yields the full
        configuration; the most-derived definition of each name wins.

        Returns:
            dict: Setting names mapped to their values, base-class settings
            first, each class's own settings in definition order.
        """
        settings = {}
        # Walk the MRO from the most basic class to ``cls`` so that values
        # defined further down the hierarchy overwrite inherited ones.
        for klass in reversed(cls.__mro__):
            for name, value in vars(klass).items():
                if name.startswith('__'):
                    continue
                # Resolve through ``cls`` so classmethods/staticmethods are
                # seen as the bound callables they are, and so a name that a
                # subclass redefined as a method is dropped entirely.
                if callable(getattr(cls, name)):
                    continue
                settings[name] = value
        return settings

