# --------------------------------------------------------
# Swin Transformer
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu
# --------------------------------------------------------
# Adapted for DUPS from AutoFocusFormer

import os
import yaml
from yacs.config import CfgNode as CN

# Root node holding the default value of every option.
# get_config() hands out clones of this node, so these defaults are never
# modified by a run.
_C = CN()

# Base config files: paths of other YAML files to merge before the file that
# lists them. Paths are resolved relative to the directory of the listing
# file; empty strings are skipped (see _update_config_from_file).
_C.BASE = ['']

# -----------------------------------------------------------------------------
# Data settings
# -----------------------------------------------------------------------------
# Options describing the dataset and how it is loaded.
_C.DATA = CN()
# Batch size for a single GPU; overridden by args.batch_size in
# update_config() when that argument is set
_C.DATA.BATCH_SIZE = 128
# Path to dataset; overridden by args.data_path in update_config() when that
# argument is set
_C.DATA.DATA_PATH = 'imagenet'
# Dataset name
_C.DATA.DATASET = 'imagenet'
# Input image size (a single integer; presumably the side length in pixels
# of a square input -- TODO confirm against the data pipeline)
_C.DATA.IMG_SIZE = 224
# Number of input channels
_C.DATA.IN_CHANS = 3
# Interpolation used to resize images (random, bilinear, bicubic)
_C.DATA.INTERPOLATION = 'bicubic'
# Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.
_C.DATA.PIN_MEMORY = True
# Number of data loading workers
_C.DATA.NUM_WORKERS = 32

# -----------------------------------------------------------------------------
# Model settings
# -----------------------------------------------------------------------------
# Options describing the network and its loss-related settings.
_C.MODEL = CN()
# Model type
_C.MODEL.TYPE = 'DUPS'
# Model name; also used as a component of the output folder path built in
# update_config()
_C.MODEL.NAME = 'DUPS_tiny'
# Checkpoint to resume from; overridden by args.resume in update_config()
# when that argument is set
_C.MODEL.RESUME = ''
# Number of classes, overwritten in data preparation
_C.MODEL.NUM_CLASSES = 1000
# Dropout rate
_C.MODEL.DROP_RATE = 0.0
# Drop path rate
# NOTE(review): MODEL.DUPS.DROP_PATH_RATE below is a separate key with a
# different default (0.3); which of the two the model reads is not visible
# in this file.
_C.MODEL.DROP_PATH_RATE = 0.0
# Label Smoothing
_C.MODEL.LABEL_SMOOTHING = 0.1


# DUPS-specific model parameters
# NOTE(review): every list-valued option below has four entries, matching
# N_RESOLUTION_SCALES = 4 -- presumably one entry per resolution scale.
# Confirm the meaning and ordering of the entries against the model builder.
_C.MODEL.DUPS = CN()
# Name of the block/layer type for each entry
_C.MODEL.DUPS.NAME = ["MixResViT", "MixResNeighbour", "MixResNeighbour", "MixResNeighbour"]
# Embedding dimension for each entry
_C.MODEL.DUPS.EMBED_DIM = [512, 256, 128, 64]
# Depth (presumably the number of transformer blocks) for each entry
_C.MODEL.DUPS.DEPTHS = [4, 4, 4, 4]
# Number of attention heads for each entry
_C.MODEL.DUPS.NUM_HEADS = [32, 16, 8, 4]
# Patch size for each entry
_C.MODEL.DUPS.PATCH_SIZES = [32, 16, 8, 4]
_C.MODEL.DUPS.SPLIT_RATIO = [4, 4, 4, 4]
# MLP ratio for each entry (presumably hidden width relative to the
# embedding dimension -- TODO confirm)
_C.MODEL.DUPS.MLP_RATIO = [4., 4., 4., 4.]
_C.MODEL.DUPS.UPSCALE_RATIO = [0.25, 0.25, 0.25, 0.25]
# Dropout rate for each entry
_C.MODEL.DUPS.DROP_RATE = [0.0, 0.0, 0.0, 0.0]
# Drop path rate (a single value, unlike the per-entry lists around it)
_C.MODEL.DUPS.DROP_PATH_RATE = 0.3
# Attention dropout rate for each entry
_C.MODEL.DUPS.ATTN_DROP_RATE = [0.0, 0.0, 0.0, 0.0]
# Names of the feature maps exposed by the model
_C.MODEL.DUPS.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
_C.MODEL.DUPS.CLUSTER_SIZE = [8, 8, 8, 8]
# Neighbourhood size for each entry
_C.MODEL.DUPS.NBHD_SIZE = [48, 48, 48, 48]
_C.MODEL.DUPS.KEEP_OLD_SCALE = False
_C.MODEL.DUPS.ADD_IMAGE_DATA_TO_ALL = False
# Output feature dimension
_C.MODEL.DUPS.OUT_DIM = 256
# Number of resolution scales
_C.MODEL.DUPS.N_RESOLUTION_SCALES = 4
# Number of register tokens
_C.MODEL.DUPS.NUM_REGISTER_TOKENS = 0
# Layer scale value (presumably 0.0 disables layer scale -- TODO confirm)
_C.MODEL.DUPS.LAYER_SCALE = 0.0
# Whether to use an auxiliary loss
_C.MODEL.DUPS.AUX_LOSS = False

# -----------------------------------------------------------------------------
# Training settings
# -----------------------------------------------------------------------------
# Options controlling the optimisation schedule.
_C.TRAIN = CN()
# Epoch to start training from
_C.TRAIN.START_EPOCH = 0
# Total number of epochs; overridden by args.epochs in update_config() when
# that argument is set
_C.TRAIN.EPOCHS = 300
# Number of warmup epochs
_C.TRAIN.WARMUP_EPOCHS = 20
# Number of cooldown epochs
_C.TRAIN.COOLDOWN_EPOCHS = 0
# Weight decay
_C.TRAIN.WEIGHT_DECAY = 0.05
# Base learning rate; overridden by args.blr in update_config() when that
# argument is set
_C.TRAIN.BASE_LR = 5e-4
# Learning rate used during warmup
_C.TRAIN.WARMUP_LR = 5e-7
# Minimum learning rate
_C.TRAIN.MIN_LR = 5e-6
# EMA (presumably an exponential moving average of the model weights --
# TODO confirm against the training loop)
_C.TRAIN.USE_EMA = False
_C.TRAIN.EMA_DECAY = 0.9998

# Clip gradient norm
_C.TRAIN.CLIP_GRAD = 5.0
# Auto resume from latest checkpoint
_C.TRAIN.AUTO_RESUME = True
# Gradient accumulation steps; overridden by args.accumulation_steps in
# update_config() when that argument is set
_C.TRAIN.ACCUMULATION_STEPS = 0

# LR scheduler
_C.TRAIN.LR_SCHEDULER = CN()
# Scheduler name
_C.TRAIN.LR_SCHEDULER.NAME = 'cosine'
# Epoch interval to decay LR, used in StepLRScheduler
_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30
# LR decay rate, used in StepLRScheduler
_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1

# Optimizer
_C.TRAIN.OPTIMIZER = CN()
# Optimizer name
_C.TRAIN.OPTIMIZER.NAME = 'adamw'
# Optimizer Epsilon
_C.TRAIN.OPTIMIZER.EPS = 1e-8
# Optimizer Betas
_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999)
# SGD momentum
_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9

# -----------------------------------------------------------------------------
# Augmentation settings
# -----------------------------------------------------------------------------
# Options controlling training-time data augmentation.
_C.AUG = CN()
# Color jitter factor
_C.AUG.COLOR_JITTER = 0.4
# Auto-augmentation policy string, e.g. "v0", "original", or a "rand-..."
# spec such as the default below
# NOTE(review): the accepted syntax is defined by whatever consumes this
# value (not visible in this file) -- confirm there before changing it.
_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1'
# Random erase prob
_C.AUG.REPROB = 0.25
# Random erase mode
_C.AUG.REMODE = 'pixel'
# Random erase count
_C.AUG.RECOUNT = 1
# Mixup alpha, mixup enabled if > 0 (disabled by default; the trailing value
# is a previously used setting kept for reference)
_C.AUG.MIXUP = 0.0  # 0.8
# Cutmix alpha, cutmix enabled if > 0 (disabled by default; the trailing
# value is a previously used setting kept for reference)
_C.AUG.CUTMIX = 0.0  # 1.0
# Cutmix min/max ratio, overrides alpha and enables cutmix if set
_C.AUG.CUTMIX_MINMAX = None
# Probability of performing mixup or cutmix when either/both is enabled
_C.AUG.MIXUP_PROB = 1.0
# Probability of switching to cutmix when both mixup and cutmix enabled
_C.AUG.MIXUP_SWITCH_PROB = 0.5
# How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
_C.AUG.MIXUP_MODE = 'batch'

# -----------------------------------------------------------------------------
# Testing settings
# -----------------------------------------------------------------------------
# Options that only affect evaluation.
_C.TEST = CN()
# Whether to use center crop when testing
_C.TEST.CROP = True

# -----------------------------------------------------------------------------
# Misc
# -----------------------------------------------------------------------------
# Enable PyTorch native AMP (automatic mixed precision). update_config() has
# no dedicated argument for this option; change it through a YAML file or
# the args.opts override list.
_C.AMP_ENABLE = True
# Path to output folder; overridden by args.output when set. update_config()
# then appends MODEL.NAME and TAG to it as sub-directories.
_C.OUTPUT = ''
# Tag of experiment; overridden by args.tag in update_config() when set
_C.TAG = 'default'
# Frequency to save checkpoint (epochs)
_C.SAVE_FREQ = 5
# Frequency of logging
# NOTE(review): the unit (presumably iterations) is not visible in this
# file -- confirm against the training loop.
_C.PRINT_FREQ = 20
# Frequency to validate (epochs)
_C.EVAL_FREQ = 1
# Fixed random seed
_C.SEED = 0
# Perform evaluation only; set to True by update_config() when args.eval is
# truthy
_C.EVAL_MODE = False
# Test throughput only; set to True by update_config() when args.throughput
# is truthy
_C.THROUGHPUT_MODE = False
# Local rank for DistributedDataParallel. update_config() always overwrites
# it: with the LOCAL_RANK environment variable when that is set, otherwise
# with args.local_rank.
_C.LOCAL_RANK = 0


def _update_config_from_file(config, _C_file):
    """Merge the YAML file ``_C_file`` into ``config``, base files first.

    Every non-empty entry of the file's top-level ``BASE`` list is treated as
    a path relative to the directory of ``_C_file`` and is merged recursively
    before ``_C_file`` itself, so values in the including file are applied
    after (and therefore on top of) values from its bases.

    Args:
        config: yacs ``CfgNode`` to update in place. It is frozen on return.
        _C_file: Path of the YAML config file to merge.
    """
    with open(_C_file, 'r') as f:
        # safe_load only constructs plain YAML types, which is all a config
        # file needs. An empty file parses to None, so fall back to an empty
        # mapping instead of failing on the BASE lookup below.
        yaml_cfg = yaml.safe_load(f) or {}

    # A missing (or null) BASE entry simply means "no base files".
    for base_file in yaml_cfg.get('BASE') or ():
        if base_file:
            _update_config_from_file(
                config, os.path.join(os.path.dirname(_C_file), base_file)
            )

    print('=> merge config from {}'.format(_C_file))
    # Each recursive call above leaves the node frozen, so unfreeze it here,
    # immediately before merging this file.
    config.defrost()
    config.merge_from_file(_C_file)
    config.freeze()


def update_config(config, args):
    """Apply the config file and command-line overrides to ``config`` in place.

    Sources are applied in this order, later ones winning: the values already
    in ``config``, the YAML file ``args.cfg`` (including its BASE files), the
    ``args.opts`` key/value list, and finally the dedicated arguments listed
    in the table below. A dedicated argument is only applied when its value
    is truthy.

    After the overrides, ``LOCAL_RANK`` is taken from the ``LOCAL_RANK``
    environment variable if present (otherwise from ``args.local_rank``), and
    ``OUTPUT`` is extended to ``OUTPUT/MODEL.NAME/TAG``.

    Args:
        config: yacs ``CfgNode`` to update; it is frozen on return.
        args: Parsed argument namespace providing ``cfg``, ``opts``,
            ``local_rank`` and every attribute named in the table below.
    """
    _update_config_from_file(config, args.cfg)

    config.defrost()

    extra_opts = args.opts
    if extra_opts:
        config.merge_from_list(extra_opts)

    # merge from specific arguments:
    # (argument attribute, dotted config key, store True instead of the value)
    overrides = (
        ('batch_size', 'DATA.BATCH_SIZE', False),
        ('data_path', 'DATA.DATA_PATH', False),
        ('blr', 'TRAIN.BASE_LR', False),
        ('resume', 'MODEL.RESUME', False),
        ('accumulation_steps', 'TRAIN.ACCUMULATION_STEPS', False),
        ('output', 'OUTPUT', False),
        ('tag', 'TAG', False),
        ('eval', 'EVAL_MODE', True),
        ('throughput', 'THROUGHPUT_MODE', True),
        ('epochs', 'TRAIN.EPOCHS', False),
    )
    for arg_name, dotted_key, is_flag in overrides:
        value = getattr(args, arg_name)
        if not value:
            continue
        # Walk down to the node that owns the leaf option, then assign it.
        *parents, leaf = dotted_key.split('.')
        node = config
        for parent in parents:
            node = getattr(node, parent)
        setattr(node, leaf, True if is_flag else value)

    # set local rank for distributed training; the environment variable
    # takes priority over the command-line value
    env_rank = os.environ.get('LOCAL_RANK')
    config.LOCAL_RANK = args.local_rank if env_rank is None else int(env_rank)

    # output folder: <OUTPUT>/<MODEL.NAME>/<TAG>
    output_parts = (config.OUTPUT, config.MODEL.NAME, config.TAG)
    config.OUTPUT = os.path.join(*output_parts)

    config.freeze()


def get_config(args):
    """Return a fresh yacs CfgNode built from the defaults and ``args``.

    The module-level defaults are cloned first, so they are never altered;
    the clone is then updated by ``update_config`` and returned.
    """
    cfg = _C.clone()
    update_config(cfg, args)
    return cfg
