# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time

import torch
from torch import Tensor

from imaginaire.callbacks.every_n import EveryN
from imaginaire.model import ImaginaireModel
from imaginaire.trainer import ImaginaireTrainer
from imaginaire.utils import log
from imaginaire.utils.distributed import rank0_only, is_rank0
from imaginaire.utils.easy_io import easy_io


class IterSpeed(EveryN):
    """Callback that logs training loss and wall-clock iteration time.

    The first ``hit_thres`` training steps are each logged individually
    (loss, mask loss and time since the previous logged step). After that,
    handling is delegated to ``EveryN.on_training_step_end``, and
    ``every_n_impl`` logs the average seconds per iteration measured between
    two consecutive ``every_n_impl`` calls.

    Args:
        hit_thres (int): Number of initial training steps that are logged on
            every step before falling back to the ``EveryN`` schedule.
    """

    def __init__(self, *args, hit_thres: int = 5, **kwargs):
        super().__init__(*args, **kwargs)
        # Timestamp of the previous ``every_n_impl`` call; None until the first call.
        self.time = None
        # Number of warm-up steps logged so far.
        self.hit_counter = 0
        self.hit_thres = hit_thres
        self.name = self.__class__.__name__
        # Timestamp of the previous warm-up log, used for the "Time" field.
        self.last_hit_time = time.time()

    @staticmethod
    def _reduced_losses(loss: Tensor, output_batch: dict[str, Tensor]) -> tuple[float, float]:
        """Return ``(loss, mask_loss)`` as Python floats for logging.

        When ``torch.distributed`` is initialized, each value is averaged
        across ranks with ``all_reduce``; this is a collective call, so every
        rank has to reach it. ``mask_loss`` is read from
        ``output_batch["mask_loss"]`` and reported as ``0.0`` when the key is
        absent.

        NOTE(review): the presence of ``"mask_loss"`` is assumed to be the same
        on every rank; otherwise the collective calls would not match up.
        """
        distributed = torch.distributed.is_initialized()

        def _as_float(value: Tensor) -> float:
            value = value.detach()
            if distributed:
                # ``detach()`` shares storage with the source tensor and
                # ``all_reduce`` writes in place, so reduce a private copy to
                # avoid overwriting the caller's loss / output_batch entries.
                value = value.clone()
                torch.distributed.all_reduce(value, op=torch.distributed.ReduceOp.AVG)
            return value.item()

        log_loss = _as_float(loss)
        if "mask_loss" in output_batch:
            log_mask_loss = _as_float(output_batch["mask_loss"])
        else:
            log_mask_loss = 0.0
        return log_loss, log_mask_loss

    def on_training_step_end(
        self,
        model: ImaginaireModel,
        data_batch: dict[str, torch.Tensor],
        output_batch: dict[str, torch.Tensor],
        loss: torch.Tensor,
        iteration: int = 0,
    ) -> None:
        """Log every step during warm-up, then defer to ``EveryN``.

        While fewer than ``hit_thres`` steps have been logged, this logs the
        (rank-averaged, if distributed) loss values and the time since the
        previous warm-up log, and returns without calling the parent hook.
        Afterwards the call is forwarded unchanged to
        ``EveryN.on_training_step_end``.
        """
        if self.hit_counter >= self.hit_thres:
            super().on_training_step_end(model, data_batch, output_batch, loss, iteration)
            return

        log_loss, log_mask_loss = self._reduced_losses(loss, output_batch)
        log.info(
            f"Iteration {iteration}: "
            f"Hit counter: {self.hit_counter + 1}/{self.hit_thres} | "
            f"Loss: {log_loss:.4f} | "
            f"Mask Loss: {log_mask_loss:.4f} "
            f"Time: {time.time() - self.last_hit_time:.2f}s"
        )
        self.hit_counter += 1
        self.last_hit_time = time.time()
        # Per the original author's note: synchronizing here is useful for
        # large-scale training and avoids OOM crashes in the first iterations.
        # Guarded so the callback also works when CUDA is not available.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    def every_n_impl(
        self,
        trainer: ImaginaireTrainer,
        model: ImaginaireModel,
        data_batch: dict[str, Tensor],
        output_batch: dict[str, Tensor],
        loss: Tensor,
        iteration: int,
    ) -> None:
        """Log the average seconds per iteration since the previous call.

        The first call only records the current time and returns. Each later
        call divides the elapsed wall-clock time by ``self.every_n`` and
        ``self.step_size`` (attributes not set in this class; presumably
        provided by ``EveryN``), logs the result together with the
        (rank-averaged, if distributed) loss values on rank 0, and stores the
        current time for the next measurement.
        """
        if self.time is None:
            # No reference point yet; start timing from here.
            self.time = time.time()
            return

        cur_time = time.time()
        iter_speed = (cur_time - self.time) / self.every_n / self.step_size

        # Must run on every rank (collective), even though only rank 0 logs.
        log_loss, log_mask_loss = self._reduced_losses(loss, output_batch)
        if is_rank0():
            log.info(f"{iteration} : iter_speed {iter_speed:.2f} seconds per iteration | Loss: {log_loss:.4f} | Mask Loss {log_mask_loss:.4f}")

        self.time = cur_time
