##########################################################################################
# Machine Environment Config
# Debug switch: when True, main() calls _set_debug_mode() to shrink the run.
DEBUG_MODE = False
# CUDA is only requested outside debug mode.
USE_CUDA = not DEBUG_MODE
# Forwarded to the trainer as trainer_params['cuda_device_num'].
CUDA_DEVICE_NUM = 0

##########################################################################################
# Path Config
import os
import sys

# Switch the working directory to the folder containing this script. The
# sys.path entries added below are relative, so the project imports that
# follow depend on this working directory; keep the chdir first.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, "..")  # for problem_def
sys.path.insert(0, "../..")  # for utils

##########################################################################################
# import
import logging
from utils.utils import create_logger, copy_all_src

import torch

from MGMOTSPTrainer import TSPTrainer

##########################################################################################
# parameters
# Environment settings, passed to TSPTrainer as env_params.
# NOTE: main() adds 1 to 'problem_size' before constructing the trainer
# (the depot is treated like any other node); 'pomo_size' is left as is.
env_params = {
    'problem_size': 10,
    'pomo_size': 10,
}

# Passed to TSPTrainer and also copied into head1_params / head2_params
# under the key "training_method" after the config section.
training_method = "Chb" # Either Linear or Chb
# Passed to TSPTrainer together with curriculum_function.
# NOTE(review): presumably toggles the per-epoch schedule defined by
# curriculum_function() - confirm in the trainer.
curriculum_learning = True

distribution = "MG_flex" # MG_fix, MG_flex
# Logged as "Max Edges" by _print_config() and returned unchanged by
# curriculum_function() as its last tuple element.
emax = 2

### Shared encoder ###

# Passed to TSPTrainer as encoder_params.
# NOTE(review): head_num * qkv_dim (8 * 16) equals embedding_dim (128);
# presumably the attention layers rely on this - confirm before changing
# one of them in isolation.
encoder_params = {
    'embedding_dim': 128,
    'encoder_layer_num': 4,
    'qkv_dim': 16,
    'head_num': 8, 
    'ff_hidden_dim': 512,
    "great_nodeless": False,
    "great_asymmetric": True,
    "dropout": 0.1, 
}

### Shared encoder end ###

### Head 2 - roll-out ###

# Passed to TSPTrainer as head2_params. A "training_method" key is added to
# this dict after the config section, so it is not listed here.
# 'embedding_dim', 'qkv_dim', 'head_num' and 'ff_hidden_dim' currently carry
# the same values as in encoder_params.
head2_params = {
    'edge_to_node_layers': 1,
    'attention_layers': 2, # Transformer layers
    'ff_hidden_dim': 512,
    'embedding_dim': 128,
    'qkv_dim': 16, 
    'head_num': 8,
    'logit_clipping': 10,
    'eval_type': 'softmax',
    'pref_cond_time_emb': True
}

### Head 2 end ###

### Head 1 - sparsification ###

# Passed to TSPTrainer as head1_params. A "training_method" key is added to
# this dict after the config section, so it is not listed here.
head1_params = {
    'separate_encoder': False,
    'only_by_distance': False, # Only use scalarized distance -> not trainable
    'freeze_encoder': True, # Freeze h2 encoder during h1 training
    'attention_layers': 2, # Set to 0 for no layers
    'decoder_dropout': 0.1, 
    'eval_type': 'softmax',
    'tanh': False, 
    'logit_clipping': 10, # If tanh  
    'aug_query': False,
    'aug_query_weights': False # If aug_query
}

### Head 1 end ###

# Passed to TSPTrainer as optimizer_params: one sub-dict for the optimizer
# and one for the learning-rate scheduler.
# NOTE(review): the only milestone (180) lies beyond trainer_params['epochs']
# (20). If milestones are epoch indices of a MultiStepLR-style scheduler -
# to be confirmed in the trainer - the learning rate never decays in this
# configuration.
optimizer_params = {
    'optimizer': {
        'lr': 1e-4, 
        'weight_decay': 1e-6
    },
    'scheduler': {
        'milestones': [180,],
        'gamma': 0.1
    }
}

# Passed to TSPTrainer as trainer_params.
# _set_debug_mode() overwrites 'epochs', 'train_episodes' and
# 'train_batch_size' when DEBUG_MODE is on; curriculum_function() reads
# 'train_batch_size' on every call.
trainer_params = {
    'use_cuda': USE_CUDA,
    'cuda_device_num': CUDA_DEVICE_NUM,
    'initial_dist': 'XASY', # Distribution for first e_0 epochs, TMAT or XASY
    'e_0': 5, # Initial epochs to train head 2 on simple graph 
    'epochs': 20,
    'train_episodes': 1*1000,
    'train_batch_size': 64,
    'train_batch_size_init': 128,
    'fwd_batch_size': 64, # To allow training for very large problems, gradient accumulates over train_batch_size instances but we run with fewer
    'fwd_batch_size_init': 128, 
    'samples_per_instance_h1': 4,
    'pomo_size_h1': 5,
    'h1_baseline': True,
    'logging': {
        'model_save_interval': 5,
        'img_save_interval': 10,
        'log_image_params_1': {
            'json_foldername': 'log_image_style',
            'filename': 'style_tsp.json'
        },
        'log_image_params_2': {
            'json_foldername': 'log_image_style',
            'filename': 'style_loss_1.json'
        },
    },
    'model_load': {
        'enable': False,  # enable loading pre-trained model
        'path': './result/20250304_110849_train_test_tsp',  # directory path of pre-trained model and log files saved.
        'epoch': 70,  # epoch version of pre-trained model to load.
    }
}

# Unpacked as keyword arguments into create_logger() at the start of main().
logger_params = {
    'log_file': {
        'desc': 'train__tsp',
        'filename': 'run_log'
    }
}

def curriculum_function(epoch):
    """Return the training setup to use for the given epoch.

    The schedule is a list of stages, each with the last epoch it covers and
    a base problem size. Epochs beyond the last listed stage keep the final
    stage's problem size. Previously no branch matched for ``epoch > 20``,
    so the function failed with ``UnboundLocalError`` as soon as
    ``trainer_params['epochs']`` was raised above 20.

    Reads the module-level ``trainer_params['train_batch_size']`` (at call
    time, so debug-mode overrides are honoured) and ``emax``, and logs the
    chosen problem size through the 'root' logger.

    Args:
        epoch: Epoch number used to select the curriculum stage.

    Returns:
        Tuple ``(problem_size, pomo_size, batch_size, fwd_batch_size,
        emax_new)``. ``problem_size`` already includes the extra node added
        for the depot, ``pomo_size`` is ``problem_size - 1``, both batch
        sizes equal ``trainer_params['train_batch_size']`` and ``emax_new``
        is the module-level ``emax``.
    """
    # (last epoch covered by the stage, base problem size without depot)
    stages = ((10, 5), (20, 10))

    # Default to the final stage so later epochs stay on the largest size.
    problem_size = stages[-1][1]
    for last_epoch, base_size in stages:
        if epoch <= last_epoch:
            problem_size = base_size
            break

    # Same adjustment main() applies to env_params: the depot is treated
    # like any other node, so it counts towards the problem size.
    problem_size += 1

    # Every stage currently uses the full training batch for the forward
    # pass as well.
    batch_size = trainer_params['train_batch_size']
    fwd_batch_size = trainer_params['train_batch_size']
    pomo_size = problem_size - 1
    emax_new = emax

    logger = logging.getLogger('root')
    logger.info(f"Training on problem size: {problem_size}")

    return problem_size, pomo_size, batch_size, fwd_batch_size, emax_new

### Config end

# Both heads receive the scalarization method chosen in the config section,
# so it only has to be set in one place.
head2_params.update(training_method=training_method)
head1_params.update(training_method=training_method)

##########################################################################################
# main
def main():
    """Set up logging, build the TSPTrainer from the module config and run it.

    Steps, in order: apply the debug overrides when DEBUG_MODE is set, create
    the logger, log the configuration, add the depot node to the environment
    problem size, construct the trainer, copy the sources into the trainer's
    result folder and start training.
    """
    if DEBUG_MODE:
        _set_debug_mode()

    create_logger(**logger_params)
    _print_config()

    # The depot is handled like any other node, so it counts as one more node.
    env_params['problem_size'] = env_params['problem_size'] + 1

    trainer_kwargs = dict(
        distribution=distribution,
        emax=emax,
        training_method=training_method,
        curriculum_learning=curriculum_learning,
        curriculum_function=curriculum_function,
        env_params=env_params,
        encoder_params=encoder_params,
        head1_params=head1_params,
        head2_params=head2_params,
        optimizer_params=optimizer_params,
        trainer_params=trainer_params,
    )
    trainer = TSPTrainer(**trainer_kwargs)

    # Copy the sources next to the results of this run.
    copy_all_src(trainer.result_folder)

    trainer.run()


def _set_debug_mode():
    """Shrink the run for debugging: 1 epoch, 10 episodes, batch size 10.

    Mutates the module-level ``trainer_params`` in place; all other entries
    (including the forward batch sizes) are left untouched.
    """
    debug_overrides = {
        'epochs': 1,
        'train_episodes': 10,
        'train_batch_size': 10,
    }
    trainer_params.update(debug_overrides)


def _print_config():
    """Log the main configuration values through the 'root' logger.

    Covers the debug/CUDA switches, distribution, max edges, training method
    and the encoder / head 1 / head 2 parameter dicts, one INFO record each.
    env_params, optimizer_params and trainer_params are not logged here.
    """
    # (message template, values substituted into it), in output order.
    report = (
        ('DEBUG_MODE: {}', (DEBUG_MODE,)),
        ('USE_CUDA: {}, CUDA_DEVICE_NUM: {}', (USE_CUDA, CUDA_DEVICE_NUM)),
        ('Distribution: {}', (distribution,)),
        ('Max Edges: {}', (emax,)),
        ('Training Method: {}', (training_method,)),
        ('Encoder params: {}', (encoder_params,)),
        ('Head 1 params: {}', (head1_params,)),
        ('Head 2 params: {}', (head2_params,)),
    )

    root_logger = logging.getLogger('root')
    for template, values in report:
        root_logger.info(template.format(*values))

##########################################################################################

# Start training only when this file is executed as a script.
if __name__ == "__main__":
    main()
