"""
Train a diffusion model on images.
"""

import argparse

from cm import dist_util, logger
from cm.image_datasets import load_data
from cm.script_util import (
    model_and_diffusion_defaults,
    create_model_and_diffusion,
    cm_train_defaults,
    ctm_train_defaults,
    ctm_eval_defaults,
    ctm_loss_defaults,
    ctm_data_defaults,
    add_dict_to_argparser,
    create_ema_and_scales_fn,
)
from cm.train_util import CMTrainLoop
import torch.distributed as dist
import copy
import cm.enc_dec_lib as enc_dec_lib
import torch as th


def main():
    """Parse the command-line arguments, build every network and run training.

    The steps, in order:

    1. Set up the distributed backend and the logger.
    2. Build the EMA / scale schedule function.
    3. Optionally load a pretrained classifier, a discriminator (with its
       feature networks for the ``stylegan-xl`` architecture) and, for the
       ``church`` dataset, a latent decoder.
    4. Create the trainable model, the data loader, an optional frozen
       teacher model and the target model.
    5. Hand everything to ``CMTrainLoop`` and run its loop.

    Raises:
        ValueError: if ``args.edm_nn_ncsn`` is set but
            ``args.teacher_model_path`` is empty. The noise-embedding
            frequencies of the target model are taken from the teacher, so
            the teacher is mandatory in that configuration.
    """
    args = create_argparser().parse_args()

    # Fail fast with a clear message: further down the target model reads
    # `teacher_model.model.model.map_noise.freqs`, which would otherwise blow
    # up with an opaque AttributeError on `None` after all models were built.
    has_teacher = bool(args.teacher_model_path)
    if args.edm_nn_ncsn and not has_teacher:
        raise ValueError(
            "args.edm_nn_ncsn requires args.teacher_model_path to be set: "
            "the map_noise frequencies are copied from the teacher model."
        )

    dist_util.setup_dist(args.device_id)
    logger.configure(dir=args.out_dir)

    logger.log("creating model and diffusion...")
    ema_scale_fn = create_ema_and_scales_fn(
        target_ema_mode=args.target_ema_mode,
        start_ema=args.start_ema,
        scale_mode=args.scale_mode,
        start_scales=args.start_scales,
        end_scales=args.end_scales,
        total_steps=args.total_training_steps,
        distill_steps_per_iter=args.distill_steps_per_iter,
    )

    # Load Feature Extractor
    classifier_vpsde = enc_dec_lib.vpsde(beta_min=0.1, beta_max=20., multiplier=1., cos_t_classifier=args.cos_t_classifier)
    pretrained_classifier = None
    if args.time_conditioned_classifier:
        pretrained_classifier = enc_dec_lib.load_classifier(args, eval=True)
        pretrained_classifier.to(dist_util.dev())
        if args.use_fp16:
            pretrained_classifier.convert_to_fp16()

    # Load Discriminator
    discriminator = None
    # Must exist even when discriminator training is disabled, because it is
    # passed to `create_model_and_diffusion` unconditionally below.
    feature_networks = None
    if args.discriminator_training:
        if args.d_architecture == 'stylegan-xl':
            discriminator, feature_networks = enc_dec_lib.load_discriminator(args, ckpt_path=None)
            feature_networks.to(dist_util.dev())
        else:
            discriminator = enc_dec_lib.load_discriminator(args, ckpt_path=None)
        discriminator.to(dist_util.dev())
        discriminator.train()
        if args.use_d_fp16:
            discriminator.convert_to_fp16()

    # Load Decoder
    latent_decoder = None
    if args.data_name in ['church']:
        latent_decoder = enc_dec_lib.decoder_()
        latent_decoder.to(dist_util.dev())
        latent_decoder.eval()

    # Load Model
    model, diffusion = create_model_and_diffusion(args, feature_networks)
    model.to(dist_util.dev())
    model.train()
    if args.use_fp16:
        model.convert_to_fp16()

    logger.log("creating data loader...")
    if args.batch_size == -1:
        # Derive the per-process batch size from the global one.
        batch_size = args.global_batch_size // dist.get_world_size()
        if args.global_batch_size % dist.get_world_size() != 0:
            logger.log(
                f"warning, using smaller global_batch_size of {dist.get_world_size()*batch_size} instead of {args.global_batch_size}"
            )
    else:
        batch_size = args.batch_size

    data = load_data(
        data_name=args.data_name,
        data_dir=args.data_dir,
        batch_size=batch_size,
        image_size=args.image_size,
        class_cond=args.class_cond,
        train_classes=args.train_classes,
        num_workers=args.num_workers,
        type=args.type,
        deterministic=args.deterministic,
    )

    if has_teacher:  # path to the teacher score model.
        logger.log(f"loading the teacher model from {args.teacher_model_path}")
        teacher_model, _ = create_model_and_diffusion(args, teacher=True)

        if args.data_name in ['church']:
            sd = th.load(args.teacher_model_path, map_location="cpu")
            if "state_dict" in sd:
                sd = sd["state_dict"]
            # Expose the `model.diffusion_model.*` weights under their bare
            # names so that the teacher can load them; the untouched `sd` is
            # loaded into the latent decoder. Both loads are non-strict, so
            # keys that do not belong to a module are ignored.
            sd_ = copy.deepcopy(sd)
            prefix = 'model.diffusion_model.'
            for name in sd:
                if name.startswith(prefix):
                    sd_[name[len(prefix):]] = sd[name]
            teacher_model.load_state_dict(sd_, strict=False)
            latent_decoder.load_state_dict(sd, strict=False)
            del sd, sd_
            latent_decoder.eval()
        else:
            # NOTE(review): for the edm_nn_ncsn / edm_nn_ddpm variants no
            # checkpoint is loaded here; presumably the weights are loaded
            # elsewhere (e.g. inside create_model_and_diffusion) -- confirm.
            if not args.edm_nn_ncsn and not args.edm_nn_ddpm:
                teacher_model.load_state_dict(
                    dist_util.load_state_dict(args.teacher_model_path, map_location="cpu"),
                )
        teacher_model.to(dist_util.dev())
        teacher_model.eval()

        # Initialise the student from the teacher. A student parameter
        # matches a teacher parameter when its name equals the teacher name
        # either verbatim or with the first dotted component stripped; the
        # first teacher parameter (in iteration order) that matches wins.
        teacher_params = {}
        for src_name, src in teacher_model.named_parameters():
            _, _, stripped_name = src_name.partition('.')
            teacher_params.setdefault(stripped_name, src)
            teacher_params.setdefault(src_name, src)
        for dst_name, dst in model.named_parameters():
            src = teacher_params.get(dst_name)
            if src is not None:
                dst.data.copy_(src.data)
        teacher_model.requires_grad_(False)
        if args.edm_nn_ncsn:
            model.model.map_noise.freqs = teacher_model.model.model.map_noise.freqs

        if args.use_fp16:
            teacher_model.convert_to_fp16()
    else:
        teacher_model = None

    # Create the target model; it starts as a copy of the model's parameters.

    logger.log("creating the target model")
    target_model, _ = create_model_and_diffusion(args)

    target_model.to(dist_util.dev())
    target_model.train()

    dist_util.sync_params(target_model.parameters())
    dist_util.sync_params(target_model.buffers())

    for dst, src in zip(target_model.parameters(), model.parameters()):
        dst.data.copy_(src.data)

    if args.use_fp16:
        target_model.convert_to_fp16()
    if args.edm_nn_ncsn:
        # The teacher is guaranteed to exist here (checked right after
        # argument parsing).
        target_model.model.map_noise.freqs = teacher_model.model.model.map_noise.freqs

    logger.log("training...")
    CMTrainLoop(
        model=model,
        target_model=target_model,
        teacher_model=teacher_model,
        latent_decoder=latent_decoder,
        discriminator=discriminator,
        ema_scale_fn=ema_scale_fn,
        diffusion=diffusion,
        data=data,
        batch_size=batch_size,
        args=args,
        pretrained_classifier=pretrained_classifier,
        classifier_vpsde=classifier_vpsde,
    ).run_loop()


def create_argparser():
    """Build the command-line parser for the training script.

    The script-level defaults below are merged with the default dictionaries
    provided by ``cm.script_util``. Providers are applied in the listed order,
    so a later provider overrides any key it shares with an earlier one. The
    merged dictionary is registered on a new ``argparse.ArgumentParser``
    through ``add_dict_to_argparser``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    merged_defaults = {
        "out_dir": "",
        "data_dir": "",
        "schedule_sampler": "uniform",
        "lr": 1e-4,
        "weight_decay": 0.0,
        "lr_anneal_steps": 0,
        "global_batch_size": 2048,
        "batch_size": -1,
        "microbatch": -1,  # -1 disables microbatches
        "ema_rate": "0.9999",  # comma-separated list of EMA values
        "log_interval": 10,
        "save_interval": 10000,
        "resume_checkpoint": "",
        "use_fp16": False,
        "fp16_scale_growth": 1e-3,
        "device_id": 0,
        "num_workers": 1,
    }

    # Order matters: each provider may override keys set before it.
    default_providers = (
        model_and_diffusion_defaults,
        cm_train_defaults,
        ctm_train_defaults,
        ctm_eval_defaults,
        ctm_loss_defaults,
        ctm_data_defaults,
    )
    for provide_defaults in default_providers:
        merged_defaults.update(provide_defaults())

    arg_parser = argparse.ArgumentParser()
    add_dict_to_argparser(arg_parser, merged_defaults)
    return arg_parser


if __name__ == "__main__":
    main()
