import gc
import logging

from utils.dataset import ShardingLMDB_SFT_Dataset, cycle
from utils.distributed import EMA_FSDP, fsdp_wrap, fsdp_state_dict, launch_distributed_job, reduce_mean, reduce_dict
from utils.misc import set_seed, merge_step_dicts
import torch.distributed as dist
from omegaconf import OmegaConf
from model import T2V_FLOW_DPO
import torch
import wandb
import time
import os

import datetime
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP


class Trainer:
    def __init__(self, config):
        """Set up distributed state, FSDP-wrapped models, optimizer, data and EMA.

        Args:
            config: Run configuration. It is read by attribute
                (``config.step``, ``config.lr``, ...), through
                ``config.get(...)``, and is handed to
                ``OmegaConf.to_container`` when wandb logging is enabled.
                ``config.seed`` is overwritten in place when it is ``0``.

        Raises:
            KeyError: If the ``LOCAL_RANK`` environment variable is not set.
        """
        self.config = config
        self.step = config.step
        # Number of micro-batches whose gradients are accumulated per optimizer step.
        self.GAS = config.GAS

        # Step 1: Initialize the distributed training environment (rank, seed, dtype, logging etc.)
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

        launch_distributed_job()
        global_rank = dist.get_rank()
        self.world_size = dist.get_world_size()
        print(f"---------- global_rank={global_rank}")
        print(f"---------- world_size={self.world_size}")
        # Process index within a single node (local rank, 0-7 per the original author's note).
        local_rank = int(os.environ["LOCAL_RANK"])
        print(f"---------- local_rank={local_rank}")

        self.dtype = torch.bfloat16 if config.mixed_precision else torch.float32
        self.device = torch.cuda.current_device()
        self.is_main_process = global_rank == 0
        self.disable_wandb = config.disable_wandb

        # seed == 0 means "pick a random seed": draw one on this device and
        # broadcast rank 0's value so every rank agrees on the base seed.
        if config.seed == 0:
            random_seed = torch.randint(0, 10000000, (1,), device=self.device)
            dist.broadcast(random_seed, src=0)
            config.seed = random_seed.item()

        # Each rank is seeded with a different offset from the shared base seed.
        set_seed(config.seed + global_rank)

        if self.is_main_process and not self.disable_wandb:
            # The proxy must be set from Python: child processes started by
            # slurm/docker do not inherit it automatically.
            os.environ["https_proxy"] = "http://oversea-squid2.ko.txyun:11080"
            os.environ["http_proxy"] = "http://oversea-squid2.ko.txyun:11080"
            wandb.login(key=config.wandb_key)

            logged_config = OmegaConf.to_container(config, resolve=True)
            # The API key is a credential: keep it out of the run config that
            # wandb stores alongside the run.
            if isinstance(logged_config, dict):
                logged_config.pop("wandb_key", None)

            wandb.init(
                config=logged_config,
                name=config.config_name,
                mode="online",
                entity=config.wandb_entity,
                project=config.wandb_project,
                dir=config.wandb_save_dir,
            )

        self.output_path = config.logdir

        # Step 2: Initialize the model and optimizer
        self.model = T2V_FLOW_DPO(config, device=self.device)

        # (Optional) Load pretrained generator weights before FSDP wrapping.
        if getattr(config, "generator_ckpt", False):
            print(f"Loading pretrained generator from {config.generator_ckpt}")
            # NOTE(review): torch.load unpickles the file; only point
            # generator_ckpt at checkpoints from a trusted source.
            state_dict = torch.load(config.generator_ckpt, map_location="cpu", mmap=True)
            # Accept both a {"generator": ...} checkpoint (the layout save()
            # writes) and a bare generator state dict. Previously the bare
            # form left state_dict_gen unbound and crashed with NameError.
            if "generator" in state_dict:
                state_dict_gen = state_dict["generator"]
            else:
                state_dict_gen = state_dict
            print("----------- LOADING PRE GEN STATE DICT")
            self.model.generator.load_state_dict(
                state_dict_gen, strict=True
            )

            # Drop the CPU copies of the checkpoint before the models are wrapped.
            del state_dict_gen
            del state_dict
            gc.collect()

        print("------------ We begin to fsdp all the models")
        # The generator is cast to float32 before being wrapped.
        self.model.generator = self.model.generator.float()
        self.model.generator = fsdp_wrap(
            self.model.generator,
            sharding_strategy=config.sharding_strategy,
            mixed_precision=config.mixed_precision,
            wrap_strategy=config.generator_fsdp_wrap_strategy,
            cpu_offload=False,
        )
        # Print the memory currently allocated on this process's GPU (MB).
        allocated = torch.cuda.memory_allocated() / 1024**2
        print(f"[after gen model R{os.environ.get('RANK', '?')}] allocated: {allocated:.1f} MB")

        # real_score is wrapped as well, but no optimizer is created for it here.
        self.model.real_score = fsdp_wrap(
            self.model.real_score,
            sharding_strategy=config.sharding_strategy,
            mixed_precision=config.mixed_precision,
            wrap_strategy=config.real_score_fsdp_wrap_strategy,
            cpu_offload=False,
        )
        # Print the memory currently allocated on this process's GPU (MB).
        allocated = torch.cuda.memory_allocated() / 1024**2
        print(f"[after real model R{os.environ.get('RANK', '?')}] allocated: {allocated:.1f} MB")

        # Only generator parameters that require gradients are optimised.
        self.generator_optimizer = torch.optim.AdamW(
            [param for param in self.model.generator.parameters() if param.requires_grad],
            lr=config.lr,  # e.g. 2.0e-06
            betas=(config.beta1, config.beta2),
            foreach=False,
            weight_decay=config.weight_decay,
        )
        print(f"config.lr = {config.lr} weight_decay = {config.weight_decay}")

        print("------------ We begin to initialize all the dataset")
        # Step 3: Initialize the dataloader
        train_dataset = ShardingLMDB_SFT_Dataset(config.train_data_path, max_pair=int(1e8))
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, shuffle=True, drop_last=True
        )
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=config.batch_size,  # e.g. 1
            sampler=train_sampler,
            num_workers=0,
        )

        if dist.get_rank() == 0:
            print("DATASET SIZE %d" % len(train_dataset))
        # train() pulls batches from this with next(); presumably cycle() never
        # exhausts and uses the sampler to advance epochs - confirm in utils.dataset.
        self.dataloader = cycle(train_dataloader, train_sampler)

        # Step 4: Set up the EMA container. It is created here only when the
        # starting step has already reached ema_start_step; otherwise train()
        # creates it once that step is reached.
        self.ema_weight = config.get("ema_weight", 0.99)
        self.ema_start_step = config.get("ema_start_step", 0)
        self.generator_ema = None
        if (self.ema_weight > 0.0) and (self.step >= self.ema_start_step):
            print(f"Setting up EMA with weight {self.ema_weight}")
            self.generator_ema = EMA_FSDP(self.model.generator, decay=self.ema_weight)

        self.max_grad_norm_generator = getattr(config, "max_grad_norm_generator", 5.0)
        print(f"self.max_grad_norm_generator {self.max_grad_norm_generator}")
        # Stored and printed only; nothing else in this class reads it.
        self.max_grad_norm_critic = getattr(config, "max_grad_norm_critic", 5.0)
        print(f"self.max_grad_norm_critic {self.max_grad_norm_critic}")
        # Wall-clock time of the previous iteration, used by train() for timing logs.
        self.previous_time = None

    def save(self):
        """Write the generator weights, plus EMA weights once active, to disk.

        The state dicts are built on every rank that calls this method; only
        the main process creates
        ``<output_path>/checkpoint_model_<step>/model.pt``.
        """
        print("Start gathering distributed model states...")
        checkpoint = {"generator": fsdp_state_dict(self.model.generator)}

        # EMA weights are included only once the step counter has moved
        # strictly past ema_start_step (and EMA is enabled at all).
        ema_active = (self.ema_weight > 0.0) and (self.ema_start_step < self.step)
        if ema_active:
            checkpoint["generator_ema"] = self.generator_ema.state_dict()

        if not self.is_main_process:
            return

        # Step-stamped directory, zero-padded to six digits.
        checkpoint_dir = os.path.join(self.output_path, f"checkpoint_model_{self.step:06d}")
        os.makedirs(checkpoint_dir, exist_ok=True)
        checkpoint_file = os.path.join(checkpoint_dir, "model.pt")
        torch.save(checkpoint, checkpoint_file)
        print("Model saved to", checkpoint_file)

    def fwdbwd_one_step(self, batch):
        """Run forward and backward for one micro-batch and return its log values.

        The loss is divided by ``self.GAS`` before ``backward()``, so the
        gradients accumulated over ``self.GAS`` calls are those of the mean
        loss across the micro-batches.

        Args:
            batch: Mapping with the keys ``noise_shape``, ``text_feature``,
                ``neg_text_feature``, ``prompt`` and ``latent``. Only element
                0 of each tensor entry is used (presumably the batch size is
                1 - confirm against the dataloader).

        Returns:
            The log dict returned by ``self.model.generator_loss`` with an
            added ``"generator_loss"`` entry holding the scaled loss, detached
            from the autograd graph.
        """
        self.model.eval()  # prevent any randomness (e.g. dropout)

        if self.step % 5 == 0:
            torch.cuda.empty_cache()

        # Step 1: Unpack the batch.
        # Tensor.tolist() yields the same Python list as .numpy().tolist()
        # without the numpy round trip.
        # Original author's note on the layout: 1, 21, 16, tgt_h // 8, tgt_w // 8
        noise_shape = tuple(int(dim) for dim in batch["noise_shape"][0].tolist())
        text_feature = batch['text_feature'][0].to(device=self.device, dtype=self.dtype)
        neg_text_feature = batch['neg_text_feature'][0].to(device=self.device, dtype=self.dtype)
        prompt = batch['prompt']
        latent = batch['latent'][0].to(device=self.device, dtype=self.dtype)

        conditional_dict = {'prompt_embeds': text_feature}
        unconditional_dict = {'prompt_embeds': neg_text_feature}

        # Step 2: Compute the generator loss and accumulate its gradients.
        generator_loss, generator_log_dict = self.model.generator_loss(
            prompt=prompt,
            latent=latent,
            image_or_video_shape=noise_shape,
            conditional_dict=conditional_dict,
            unconditional_dict=unconditional_dict,
        )
        generator_loss /= self.GAS
        generator_loss.backward()
        # Log a detached copy so the entries collected across micro-steps (and
        # later merged/reduced/logged) do not keep the autograd graph alive.
        generator_log_dict["generator_loss"] = generator_loss.detach()

        return generator_log_dict


    def train(self):
        """Run the optimisation loop; this method does not return on its own.

        Each iteration accumulates gradients over ``self.GAS`` micro-batches,
        clips the generator gradient norm, steps the optimizer, updates (or
        lazily creates) the EMA, optionally saves a checkpoint, reduces the
        log values with ``reduce_dict`` and logs them on the main process.
        """
        start_step = self.step

        if self.is_main_process:
            print(f"Update Gen Every {self.config.dfake_gen_update_ratio} steps")

        while True:
            if self.is_main_process:
                print(f"training step {self.step} ...")
            self.model.step = self.step

            # Train the generator (the only model with an optimizer in this class).
            if self.is_main_process:
                print("------------[TRAIN] We begin to train geneartor + fake model")
            self.generator_optimizer.zero_grad(set_to_none=True)
            step_dicts = []
            for _ in range(self.GAS):
                batch = next(self.dataloader)
                step_dicts.append(self.fwdbwd_one_step(batch))
            generator_grad_norm = self.model.generator.clip_grad_norm_(self.max_grad_norm_generator)
            self.generator_optimizer.step()
            generator_log_dict = merge_step_dicts(step_dicts)
            generator_log_dict["generator_grad_norm"] = generator_grad_norm.item()
            if self.generator_ema is not None:
                self.generator_ema.update(self.model.generator)

            # Increment the step since we finished gradient update
            self.step += 1

            # Create EMA params (if not already created)
            if (self.step >= self.ema_start_step) and (self.generator_ema is None) and (self.ema_weight > 0):
                self.generator_ema = EMA_FSDP(self.model.generator, decay=self.ema_weight)

            # Save the model every config.log_iters steps, never at the starting step.
            if (not self.config.no_save) and (self.step - start_step) > 0 and self.step % self.config.log_iters == 0:
                torch.cuda.empty_cache()
                self.save()
                torch.cuda.empty_cache()

            # Called on every rank, not just the main process.
            generator_log_dict = reduce_dict(generator_log_dict, self.device)

            # Logging
            if self.is_main_process:
                wandb_loss_dict = dict(generator_log_dict)

                time_str = datetime.datetime.now().strftime('%m-%d %H:%M:%S')
                # The status line is left open (end='') so the iteration time
                # can be appended to it below.
                print(f"[{time_str}] step={self.step}", end='')
                for k, v in wandb_loss_dict.items():
                    print(f' | {k}: {v:.6f}', end='')
                if not self.disable_wandb:
                    wandb.log(wandb_loss_dict, step=self.step)

            if self.step % self.config.gc_interval == 0:
                if dist.get_rank() == 0:
                    logging.info("DistGarbageCollector: Running GC.")
                gc.collect()
                torch.cuda.empty_cache()

            if self.is_main_process:
                current_time = time.time()
                if self.previous_time is not None:
                    iteration_time = current_time - self.previous_time
                    if not self.disable_wandb:
                        wandb.log({"per iteration time": iteration_time}, step=self.step)
                    print(f' | per iteration time: {iteration_time:.6f}', end='')
                # Always terminate the status line. Previously the first
                # iteration (no previous timestamp yet) skipped the newline
                # and its line ran into the next "training step" message.
                print()
                self.previous_time = current_time
    