import os
import re
from datetime import timedelta

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from configs import create_workshop, get_config
from engine import Engine

# Weights & Biases agent knobs, exported at import time so that every process
# importing this module carries them in its environment.
# NOTE(review): going by the variable names, these turn off the agent's
# "flapping" check and raise the initial-failure budget to 20 — confirm
# against the wandb documentation.
os.environ.update(
    {
        "WANDB_AGENT_DISABLE_FLAPPING": "True",
        "WANDB_AGENT_MAX_INITIAL_FAILURES": "20",
    }
)


def main_worker(local_rank, cfg, mode, world_size, dist_url):
    """Set up one distributed rank on one GPU and run the engine on it.

    Args:
        local_rank: Index of this process. It is used both as the CUDA device
            index and as the ``rank`` passed to ``dist.init_process_group``.
        cfg: Project config object. This function reads ``cfg.train.seed``,
            ``cfg.model.init_with_ckpt``, ``cfg.model.path_to_vesper`` and
            ``cfg.mark``, and may overwrite ``cfg.mark``.
        mode: Run-mode string forwarded to ``create_workshop`` and ``Engine``.
        world_size: Total number of processes in the process group.
        dist_url: ``init_method`` URL used for the process-group rendezvous.
    """
    torch.cuda.empty_cache()
    torch.cuda.set_device(local_rank)
    # NOTE(review): `sv` is not imported in the visible part of this file —
    # confirm it is brought into scope somewhere, otherwise this is a NameError.
    sv.utils.environment.set_seed(cfg.train.seed)
    # To enable deterministic behavior in this case, an environment variable
    # must be set before running the PyTorch application:
    # CUBLAS_WORKSPACE_CONFIG=:4096:8 or CUBLAS_WORKSPACE_CONFIG=:16:8.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    # os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "0"  # for distributed
    # os.environ["NCCL_IB_GID_INDEX"] = "3"
    dist.init_process_group(
        backend="nccl",
        init_method=dist_url,
        timeout=timedelta(seconds=6000),
        world_size=world_size,
        rank=local_rank,
    )

    if cfg.model.init_with_ckpt:
        # Pull the word characters that follow "_mark_" in the checkpoint path
        # and prepend them to the current mark (or use them as the mark when
        # none is set). Raw string: "\w" is an invalid escape in a plain
        # literal and triggers a SyntaxWarning on recent Python versions.
        mark = re.search(r"(?<=_mark_)\w+", cfg.model.path_to_vesper)
        if mark is not None:
            ckpt_mark = mark.group()
            if cfg.mark is None:
                cfg.mark = ckpt_mark
            else:
                cfg.mark = ckpt_mark + "_" + cfg.mark

    cfg = create_workshop(cfg, mode, local_rank)
    engine = Engine(cfg, mode, local_rank, world_size)
    engine.run()

    # Release the process group after a clean run. This is deliberately not in
    # a `finally`: after a failure the other ranks may be stuck in a
    # collective, and tearing down here could block instead of letting the
    # exception propagate. The guard covers the case where the engine has
    # already destroyed the group itself.
    if dist.is_initialized():
        dist.destroy_process_group()


def main(cfg, mode):
    """Spawn one ``main_worker`` process per visible CUDA device.

    Args:
        cfg: Project config object; ``cfg.train.seed`` is read here and the
            whole object is forwarded to every worker.
        mode: Run-mode string forwarded to every worker.

    Raises:
        RuntimeError: If no CUDA device is visible. Without this check
            ``mp.spawn`` would be asked for zero processes and return without
            running anything.
    """
    world_size = torch.cuda.device_count()  # num_gpus
    if world_size < 1:
        raise RuntimeError(
            "No CUDA GPU is visible to this process; cannot start NCCL workers."
        )

    # device_id = device_id if device_id is not None else cfg.train.device_id
    # utils.environment.visible_gpus(device_id)
    # NOTE(review): `sv` is not imported in the visible part of this file —
    # confirm it is brought into scope somewhere, otherwise this is a NameError.
    sv.utils.environment.set_seed(cfg.train.seed)
    free_port = sv.utils.distributed.find_free_port()
    # Rendezvous address on the loopback interface, so all workers are
    # expected to run on this machine.
    dist_url = f"tcp://127.0.0.1:{free_port}"
    print(f"world_size={world_size} Using dist_url={dist_url}")
    # mp.spawn passes the process index as the first positional argument,
    # which main_worker receives as `local_rank`.
    mp.spawn(fn=main_worker, args=(cfg, mode, world_size, dist_url), nprocs=world_size)


if __name__ == "__main__":
    # Script entry point: build the config for the fine-tuning mode and hand
    # both the config and the mode string to the launcher.
    mode = "_finetune"
    main(get_config(mode=mode), mode)
