# coding=utf-8
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass, field
from typing import Any, Literal, Optional

import trl


@dataclass
class DatasetConfig:
    """
    Description of a single dataset entry inside a dataset mixture.

    Args:
        id (`str`):
            Identifier of the dataset.
        config (`str` or `None`, *optional*, defaults to `None`):
            Name of the dataset configuration, if any.
        split (`str`, *optional*, defaults to `"train"`):
            Name of the split.
        columns (`list[str]` or `None`, *optional*, defaults to `None`):
            Column names attached to this entry.
        weight (`float` or `None`, *optional*, defaults to `None`):
            Weight attached to this entry. How the weight is consumed is decided by the code that loads the
            mixture, not by this class.
    """

    id: str
    config: Optional[str] = field(default=None)
    split: str = field(default="train")
    columns: Optional[list[str]] = field(default=None)
    weight: Optional[float] = field(default=None)


@dataclass
class DatasetMixtureConfig:
    """
    Description of a mixture made of several datasets.

    Args:
        datasets (`list[DatasetConfig]`):
            The dataset entries that make up the mixture.
        seed (`int`, *optional*, defaults to `0`):
            Seed value stored with the mixture (presumably used when shuffling; confirm against the loader).
        test_split_size (`float` or `None`, *optional*, defaults to `None`):
            Size of the test split, or `None` when no value was given.
    """

    datasets: list[DatasetConfig]
    seed: int = field(default=0)
    test_split_size: Optional[float] = field(default=None)


@dataclass
class ScriptArguments(trl.ScriptArguments):
    """
    Variant of `trl.ScriptArguments` that additionally accepts a mixture of datasets.

    Args:
        dataset_name (`str` or `None`, *optional*, defaults to `None`):
            Dataset name. May be left unset when `dataset_mixture` is provided.
        dataset_mixture (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
            Dictionary describing the mixture. It must contain a `datasets` list; each entry may set `id`,
            `config`, `split` (defaults to `"train"`), `columns` and `weight` (defaults to `1.0`). `seed`
            (defaults to `0`) and `test_split_size` are optional. Once `__post_init__` has run, this attribute
            holds a `DatasetMixtureConfig` built from the dictionary.
            Format:
              dataset_mixture:
                datasets:
                  - id: dataset_id1
                    config: config_name
                    columns:
                      - col1
                      - col2
                    weight: 0.5
                  - id: dataset_id2
                    config: config_name
                    columns:
                      - col1
                      - col2
                    weight: 0.5
                seed: 42
                test_split_size: 0.1
    """

    # Redeclared here so that the dataset name becomes optional.
    dataset_name: Optional[str] = field(
        default=None,
        metadata={"help": "Dataset name. Can be omitted if using dataset_mixture."},
    )
    dataset_mixture: Optional[dict[str, Any]] = field(
        default=None,
        metadata={"help": "Configuration for creating dataset mixtures with advanced options like shuffling."},
    )

    def __post_init__(self):
        """
        Validate the dataset arguments and convert `dataset_mixture` into a `DatasetMixtureConfig`.

        Raises:
            ValueError: If neither `dataset_name` nor `dataset_mixture` is set, if `dataset_mixture` is not a
                dictionary with a `datasets` key, if `datasets` is not a list, or if the entries that declare
                `columns` do not all declare the same set of column names.
        """
        mixture = self.dataset_mixture

        # Without a mixture only the presence of a dataset name has to be verified.
        if mixture is None:
            if self.dataset_name is None:
                raise ValueError("Either `dataset_name` or `dataset_mixture` must be provided")
            return

        if not (isinstance(mixture, dict) and "datasets" in mixture):
            raise ValueError(
                "dataset_mixture must be a dictionary with a 'datasets' key. "
                "Expected format: {'datasets': [...], 'seed': int}"
            )

        raw_entries = mixture.get("datasets", [])
        if not isinstance(raw_entries, list):
            raise ValueError("'datasets' must be a list of dataset configurations")

        # Turn every raw entry into a typed DatasetConfig, filling in the per-entry defaults.
        parsed_entries = [
            DatasetConfig(
                id=entry.get("id"),
                config=entry.get("config"),
                split=entry.get("split", "train"),
                columns=entry.get("columns"),
                weight=entry.get("weight", 1.0),
            )
            for entry in raw_entries
        ]

        self.dataset_mixture = DatasetMixtureConfig(
            datasets=parsed_entries,
            seed=mixture.get("seed", 0),
            test_split_size=mixture.get("test_split_size", None),
        )

        # Entries that declare columns must all declare the same set; entries without columns are ignored.
        declared_columns = [set(item.columns) for item in parsed_entries if item.columns is not None]
        if declared_columns:
            reference = declared_columns[0]
            if any(candidate != reference for candidate in declared_columns):
                raise ValueError(
                    "Column names must be consistent across all dataset configurations in a mixture. "
                    f"Found different column sets: {[list(cols) for cols in declared_columns]}"
                )


# TODO: add the shared options with a mixin to reduce code duplication
@dataclass
class GRPOConfig(trl.GRPOConfig):
    """
    `trl.GRPOConfig` extended with options for callbacks, benchmarks, chat templating, Hub revisions and wandb.

    Args:
        benchmarks (`list[str]`, *optional*, defaults to `[]`):
            The benchmarks to run after training.
        callbacks (`list[str]`, *optional*, defaults to `[]`):
            The callbacks to run during training.
        chat_template (`str` or `None`, *optional*, defaults to `None`):
            The chat template to use.
        hub_model_revision (`str` or `None`, *optional*, defaults to `"main"`):
            The Hub model branch to push the model to.
        num_completions_to_print (`int`, *optional*, defaults to `0`):
            Number of completions to print.
        overwrite_hub_revision (`bool`, *optional*, defaults to `False`):
            Whether to overwrite the Hub revision.
        push_to_hub_revision (`bool`, *optional*, defaults to `False`):
            Whether to push to a Hub revision/branch.
        system_prompt (`str` or `None`, *optional*, defaults to `None`):
            The optional system prompt to use.
        wandb_log_unique_prompts (`bool`, *optional*, defaults to `True`):
            Whether to log the unique prompts to wandb.
        wandb_entity (`str` or `None`, *optional*, defaults to `None`):
            The wandb entity to store runs under.
        wandb_project (`str` or `None`, *optional*, defaults to `None`):
            The wandb project to store runs under.
        wandb_run_group (`str` or `None`, *optional*, defaults to `None`):
            The wandb group to store runs under.
    """

    benchmarks: list[str] = field(
        default_factory=list,
        metadata={"help": "The benchmarks to run after training."},
    )
    callbacks: list[str] = field(
        default_factory=list,
        metadata={"help": "The callbacks to run during training."},
    )
    chat_template: Optional[str] = field(
        default=None,
        metadata={"help": "The chat template to use."},
    )
    hub_model_revision: Optional[str] = field(
        default="main",
        metadata={"help": "The Hub model branch to push the model to."},
    )
    num_completions_to_print: int = field(
        default=0,
        metadata={"help": "Number of completions to print."},
    )
    overwrite_hub_revision: bool = field(
        default=False,
        metadata={"help": "Whether to overwrite the Hub revision."},
    )
    push_to_hub_revision: bool = field(
        default=False,
        metadata={"help": "Whether to push to a Hub revision/branch."},
    )
    system_prompt: Optional[str] = field(
        default=None,
        metadata={"help": "The optional system prompt to use."},
    )
    wandb_log_unique_prompts: bool = field(
        default=True,
        metadata={
            "help": "Whether to log the unique prompts to wandb. This will create a new run for each unique prompt."
        },
    )
    wandb_entity: Optional[str] = field(
        default=None,
        metadata={"help": "The entity to store runs under."},
    )
    wandb_project: Optional[str] = field(
        default=None,
        metadata={"help": "The project to store runs under."},
    )
    wandb_run_group: Optional[str] = field(
        default=None,
        metadata={"help": "The group to store runs under."},
    )


@dataclass
class SFTConfig(trl.SFTConfig):
    """
    `trl.SFTConfig` extended with options for callbacks, benchmarks, chat templating, Hub revisions and wandb.

    Args:
        benchmarks (`list[str]`, *optional*, defaults to `[]`):
            The benchmarks to run after training.
        callbacks (`list[str]`, *optional*, defaults to `[]`):
            The callbacks to run during training.
        chat_template (`str` or `None`, *optional*, defaults to `None`):
            The chat template to use.
        system_prompt (`str` or `None`, *optional*, defaults to `None`):
            The optional system prompt to use for benchmarking.
        hub_model_revision (`str` or `None`, *optional*, defaults to `"main"`):
            The Hub model branch to push the model to.
        overwrite_hub_revision (`bool`, *optional*, defaults to `False`):
            Whether to overwrite the Hub revision.
        push_to_hub_revision (`bool`, *optional*, defaults to `False`):
            Whether to push to a Hub revision/branch.
        wandb_entity (`str` or `None`, *optional*, defaults to `None`):
            The wandb entity to store runs under.
        wandb_project (`str` or `None`, *optional*, defaults to `None`):
            The wandb project to store runs under.
        wandb_run_group (`str` or `None`, *optional*, defaults to `None`):
            The wandb group to store runs under.
    """

    benchmarks: list[str] = field(
        default_factory=list,
        metadata={"help": "The benchmarks to run after training."},
    )
    callbacks: list[str] = field(
        default_factory=list,
        metadata={"help": "The callbacks to run during training."},
    )
    chat_template: Optional[str] = field(
        default=None,
        metadata={"help": "The chat template to use."},
    )
    system_prompt: Optional[str] = field(
        default=None,
        metadata={"help": "The optional system prompt to use for benchmarking."},
    )
    hub_model_revision: Optional[str] = field(
        default="main",
        metadata={"help": "The Hub model branch to push the model to."},
    )
    overwrite_hub_revision: bool = field(
        default=False,
        metadata={"help": "Whether to overwrite the Hub revision."},
    )
    push_to_hub_revision: bool = field(
        default=False,
        metadata={"help": "Whether to push to a Hub revision/branch."},
    )
    wandb_entity: Optional[str] = field(
        default=None,
        metadata={"help": "The entity to store runs under."},
    )
    wandb_project: Optional[str] = field(
        default=None,
        metadata={"help": "The project to store runs under."},
    )
    wandb_run_group: Optional[str] = field(
        default=None,
        metadata={"help": "The group to store runs under."},
    )


@dataclass
class GRPOScriptArguments(ScriptArguments):
    """
    Script arguments for the GRPO training script.

    Args:
        reward_funcs (`list[str]`):
            List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', 'tag_count', 'code', 'ioi_code', 'code_format', 'soft_overlong_punishment'.
        cosine_min_value_wrong (`float`):
            Minimum reward for cosine scaling for wrong answers.
        cosine_max_value_wrong (`float`):
            Maximum reward for cosine scaling for wrong answers.
        cosine_min_value_correct (`float`):
            Minimum reward for cosine scaling for correct answers.
        cosine_max_value_correct (`float`):
            Maximum reward for cosine scaling for correct answers.
        cosine_max_len (`int`):
            Maximum length for cosine scaling.
        repetition_n_grams (`int`):
            Number of n-grams for the repetition penalty reward.
        repetition_max_penalty (`float`):
            Maximum (negative) penalty for the repetition penalty reward.
        code_language (`str`):
            Language for code format reward.
        code_eval_test_batch_size (`int`):
            Number of test cases evaluated in parallel for each generation before checking for failures.
        code_eval_scoring_mode (`"pass_fail"`, `"partial"` or `"weighted_sum"`):
            Scoring mode for code evaluation.
        parallel_code_exec_per_proc (`int`):
            Number of parallel E2B code executions per process.
        dataset_prompt_column (`str`):
            Column to use as prompts for training.
        e2b_router_url (`str` or `None`):
            URL for the E2B router.
        morph_router_url (`str` or `None`):
            URL for the MorphCloud router.
        code_provider (`str` or `None`):
            Provider for code execution: 'e2b', 'local' or 'morph'.
        ioi_provider (`str` or `None`):
            Provider for IOI code execution: 'piston' or 'morph'.
        max_completion_len (`int`):
            Maximum completion length. NOTE(review): the field's help text says characters while this docstring
            previously said tokens; confirm the unit against the reward implementation.
        soft_punish_cache (`int`):
            Completion-length setting for the soft overlong punishment. NOTE(review): the help text describes it
            as a minimum number of characters; confirm the exact meaning and unit against the reward
            implementation.
        method (`str`):
            Training method: 'grpo', 'dr.grpo', 'rloo', 'cv_grpo' or 'psppo'.
        project_name (`str` or `None`):
            Project name for wandb logging.
        tau (`float`):
            Tau value for CV-GRPO.
        cutoff_budget (`float` or `None`):
            Expected backpropagation budget over completion tokens. `None` disables the cutoff.
        uhat_topk (`int`):
            Top-k vocabulary approximation used for Û(t) estimation.
        uhat_prob_mode (`"topk_softmax"` or `"full_softmax"`):
            How top-k logits are converted into probabilities for Û(t) estimation.
        cutoff_per_prompt (`bool`):
            Whether Û(t)/xi is designed per prompt group and broadcast within that group.
        cutoff_use_other_bucket (`bool`):
            Whether the merged OTHER bucket is included in the L1 distance when estimating Û(t).
        logits_to_keep_chunk_size (`int`):
            Chunk size along the batch dimension when computing logits for Û(t).
        logits_to_keep_vocab_chunk_size (`int`):
            Optional vocab-chunk size for chunked top-k selection; `0` disables it.
        cutoff_block_size (`int`):
            Block size (in completion tokens) used to aggregate Û(t).
        cutoff_q_min (`float`):
            Minimum floor for xi_t.
        cutoff_bucket_size (`int`):
            Quantization bucket size for sampled H (in tokens).
        cutoff_min_H (`int`):
            Minimum cutoff H (in tokens).
        cutoff_strategy (`str`):
            Cutoff strategy: none, psppo, fixed, uniform, fixed_plus_uniform or auto.
        cutoff_fixed_h (`int`):
            Fixed prefix length H0 used by 'fixed' and 'fixed_plus_uniform'.
        cutoff_suffix_frac (`float`):
            Suffix fraction for 'fixed_plus_uniform'.
        cutoff_max_H (`int` or `None`):
            Optional cap on the effective horizon used for cutoff design and execution.
        cutoff_distributed_gather (`bool`):
            Whether per-process statistics are gathered so xi is designed from the global batch.
    """

    reward_funcs: list[str] = field(
        default_factory=lambda: ["accuracy", "format", "tag_count"],
        metadata={
            "help": "List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', 'tag_count', 'code', 'ioi_code', 'code_format', 'soft_overlong_punishment'"
        },
    )
    cosine_min_value_wrong: float = field(
        default=0.0,
        metadata={"help": "Minimum reward for wrong answers"},
    )
    cosine_max_value_wrong: float = field(
        default=-0.5,
        metadata={"help": "Maximum reward for wrong answers"},
    )
    cosine_min_value_correct: float = field(
        default=0.5,
        metadata={"help": "Minimum reward for correct answers"},
    )
    cosine_max_value_correct: float = field(
        default=1.0,
        metadata={"help": "Maximum reward for correct answers"},
    )
    cosine_max_len: int = field(
        default=1000,
        metadata={"help": "Maximum length for scaling"},
    )
    repetition_n_grams: int = field(
        default=3,
        metadata={"help": "Number of n-grams for repetition penalty reward"},
    )
    repetition_max_penalty: float = field(
        default=-1.0,
        metadata={"help": "Maximum (negative) penalty for repetition penalty reward"},
    )
    code_language: str = field(
        default="python",
        # '(?:python|cpp)'
        metadata={
            "help": "Language for code format reward. Based on E2B supported languages https://e2b.dev/docs/code-interpreting/supported-languages",
            "choices": ["python", "javascript", "r", "java", "bash", "cpp"],
        },
    )
    code_eval_test_batch_size: int = field(
        default=1,
        metadata={
            "help": "for each generation, evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. Useful to avoid overloading the eval server + save time on wrong solutions"
        },
    )
    # NOTE(review): the help text below reads like a leftover from a boolean flag ("If false, ...") although the
    # field is a three-way choice; it should be reworded once the meaning of each mode is confirmed.
    code_eval_scoring_mode: Literal["pass_fail", "partial", "weighted_sum"] = field(
        default="weighted_sum",
        metadata={"help": "use fraction of passed test cases as reward. If false, use 0/1 scoring."},
    )
    parallel_code_exec_per_proc: int = field(
        default=2,
        metadata={
            "help": "Number of parallel E2B code executions per process. Default of 2 is suitable for the Free Hobby tier of E2B with 8 GPUs used for training."
        },
    )

    dataset_prompt_column: str = field(
        default="prompt",
        metadata={"help": "Column to use as prompts for training."},
    )

    e2b_router_url: Optional[str] = field(
        default=None,
        metadata={"help": "URL for the E2B router. See scripts/e2b_router.py"},
    )

    morph_router_url: Optional[str] = field(
        default=None,
        metadata={"help": "URL for the MorphCloud router. See scripts/morph_router.py"},
    )

    code_provider: Optional[str] = field(
        default="e2b",
        metadata={
            "help": "Provider for code execution. Options: 'e2b', 'local', 'morph'.",
            "choices": ["e2b", "local", "morph"],
        },
    )

    ioi_provider: Optional[str] = field(
        default="piston",
        metadata={
            "help": "Provider for IOI code execution. Options: 'piston', 'morph'.",
            "choices": ["piston", "morph"],
        },
    )

    max_completion_len: int = field(
        default=16384,
        metadata={"help": "Maximum number of characters in completion."},
    )
    soft_punish_cache: int = field(
        default=4096,
        metadata={"help": "Minimum number of characters in completion."},
    )
    method: str = field(
        default="grpo",
        metadata={
            "help": "Method to use for training. Options: 'grpo', 'dr.grpo', 'rloo', 'cv_grpo', 'psppo'",
            "choices": ["grpo", "dr.grpo", "rloo", "cv_grpo", "psppo"],
        },
    )
    project_name: Optional[str] = field(
        default=None,
        metadata={"help": "Project name for wandb logging. If not provided, defaults to the script name."},
    )
    tau: float = field(
        default=0.1,
        metadata={
            "help": "Tau value for CV-GRPO. Used to control the effect of the covariance term in the reward function."
        },
    )

    # -------------------------
    # PS-PPO / stochastic cutoff
    # -------------------------
    cutoff_budget: Optional[float] = field(
        default=None,
        metadata={
            "help": (
                "Expected backpropagation budget B over completion tokens (sum_t xi_t = B). "
                "If None, cutoff is disabled."
            )
        },
    )
    uhat_topk: int = field(
        default=32,
        metadata={
            "help": (
                "Top-k vocabulary approximation for Û(t) estimation. "
                "Uses top-k token probs plus a residual 'other' bucket."
            )
        },
    )

    uhat_prob_mode: Literal["topk_softmax", "full_softmax"] = field(
        default="topk_softmax",
        metadata={
            "help": (
                "How to convert top-k logits into probabilities for Û(t) estimation. "
                "topk_softmax normalizes only within the top-k set (fast; no full-vocab logsumexp). "
                "full_softmax computes exact probabilities under the full softmax (slower; enables optional OTHER bucket)."
            )
        },
    )

    cutoff_per_prompt: bool = field(
        default=True,
        metadata={
            "help": (
                "If True, design Û(t)/xi per prompt-group (size=num_generations) and broadcast within that group. "
                "This matches the intended setting where a batch corresponds to one prompt with multiple completions."
            )
        },
    )

    cutoff_use_other_bucket: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to include the merged OTHER bucket (all non-topk probability mass) in the L1 distance when estimating Û(t). "
                "Set False to ignore residual mass for speed / pure top-k approximation."
            )
        },
    )
    # Memory controls for Û(t) estimation (top-k token probs)
    logits_to_keep_chunk_size: int = field(
        default=2,
        metadata={
            "help": (
                "Chunk size along the batch dimension when computing logits for Û(t). "
                "Smaller values reduce peak GPU memory at the cost of extra forward passes."
            )
        },
    )
    logits_to_keep_vocab_chunk_size: int = field(
        default=0,
        metadata={
            "help": (
                "Optional vocab-chunk size for chunked top-k selection during Û(t) estimation. "
                "Set to a positive value (e.g., 8192) to reduce peak memory of torch.topk; 0 disables."
            )
        },
    )
    cutoff_block_size: int = field(
        default=32,
        metadata={
            "help": (
                "Block size (in completion tokens) used to aggregate Û(t) and design a blockwise-constant xi via PAV. "
                "Example: 32 => 2048-token completion -> 64 blocks."
            )
        },
    )
    cutoff_q_min: float = field(
        default=1e-4,
        metadata={"help": "Minimum floor for xi_t to avoid overly aggressive early truncation."},
    )
    cutoff_bucket_size: int = field(
        default=64,
        metadata={
            "help": (
                "Quantization bucket size for sampled H (in tokens). "
                "Sampled H_i are rounded up to multiples of this size before taking batch max, "
                "stabilizing compute under the 'execute up to max(H)' implementation."
            )
        },
    )
    cutoff_min_H: int = field(
        default=1,
        metadata={"help": "Minimum cutoff H (in tokens) to guarantee at least this many completion tokens are backpropagated."},
    )

    cutoff_strategy: str = field(
        default="auto",
        metadata={
            "help": (
                "Cutoff strategy. "
                "'auto' enables PS-PPO when cutoff_budget>0, otherwise disables cutoff. "
                "Options: none, psppo, fixed, uniform, fixed_plus_uniform, auto."
            )
        },
    )
    cutoff_fixed_h: int = field(
        default=512,
        metadata={"help": "Fixed prefix length H0 used by 'fixed' and as the base prefix for 'fixed_plus_uniform'."},
    )
    cutoff_suffix_frac: float = field(
        default=0.25,
        metadata={
            "help": (
                "Suffix fraction for 'fixed_plus_uniform'. "
                "We sample H = H0 + U with U ~ Unif{0..M}, M=floor(frac*(T_keep-H0))."
            )
        },
    )
    cutoff_max_H: Optional[int] = field(
        default=None,
        metadata={"help": "Optional cap on the effective horizon T_keep used for cutoff design and execution."},
    )
    # This declaration must stay indented inside the class body: at column 0 it becomes a module-level variable
    # and is no longer a dataclass field.
    cutoff_distributed_gather: bool = field(
        default=True,
        metadata={
            "help": (
                "If True, gather per-process statistics for Û(t) estimation so xi is designed from the global batch. "
                "Recommended for multi-GPU training."
            )
        },
    )