import argparse
import json
import logging
import os
import sys
from functools import partial
from typing import Union

from lm_eval import evaluator, utils
from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.loggers import EvaluationTracker, WandbLogger
from lm_eval.tasks import TaskManager
from lm_eval.utils import handle_non_serializable, make_table, simple_parse_args_string


import gc
import torch


def _int_or_none_list_arg_type(
    min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","
):
    """Convert a delimited string into a list of ``int`` / ``None`` entries.

    The function is shaped to be bound with ``functools.partial`` and handed
    to argparse as a ``type=`` callable (``value`` is the raw CLI string).

    Args:
        min_len: Fewest entries accepted when more than one entry is given.
        max_len: Most entries accepted; also the length a single entry is
            repeated to.
        defaults: Delimited string of fallback entries, used to pad a list
            that is shorter than ``max_len``.
        value: The delimited string to parse.
        split_char: Delimiter between entries.

    Returns:
        A list whose elements are integers or ``None``.

    Raises:
        argparse.ArgumentTypeError: If an entry is neither an integer nor
            ``none`` (case-insensitive), or if the number of entries is
            outside ``[min_len, max_len]`` (a single entry is always allowed).
    """

    def to_int_or_none(token):
        normalized = token.strip().lower()
        if normalized == "none":
            return None
        try:
            return int(normalized)
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"{normalized} is not an integer or None"
            )

    parsed = [to_int_or_none(token) for token in value.split(split_char)]
    count = len(parsed)

    # A lone entry is repeated so callers can always index up to max_len.
    if count == 1:
        return parsed * max_len

    if not (min_len <= count <= max_len):
        raise argparse.ArgumentTypeError(
            f"Argument requires {max_len} integers or None, separated by '{split_char}'"
        )

    if count == max_len:
        return parsed

    # Between min_len and max_len entries: pad the tail from the defaults.
    logging.warning(
        f"Argument requires {max_len} integers or None, separated by '{split_char}'. "
        "Missing values will be filled with defaults."
    )
    fallback = [to_int_or_none(token) for token in defaults.split(split_char)]
    return parsed + fallback[count:]


def check_argument_types(parser: argparse.ArgumentParser):
    """
    Verify that every CLI argument on ``parser`` declares a ``type``.

    The ``help`` action and any action with a truthy ``const`` (for example
    ``store_true`` flags) are exempt from the check.

    Raises:
        ValueError: naming the first offending argument, in registration order.
    """
    first_untyped = next(
        (
            action
            for action in parser._actions
            if action.dest != "help" and not action.const and action.type is None
        ),
        None,
    )
    if first_untyped is not None:
        raise ValueError(
            f"Argument '{first_untyped.dest}' doesn't have a type specified."
        )


def setup_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the evaluation CLI.

    Besides the standard evaluation options, the parser registers the flags
    for the two sweep modes handled by ``cli_evaluate``:
    ``--lora_scaling_experiment`` and ``--multiple_checkpoints`` (with
    ``--begin``/``--end``/``--step``/``--final``/``--list_num``).

    Returns:
        The configured ``argparse.ArgumentParser``. Help text is rendered with
        ``RawTextHelpFormatter``, so explicit newlines in help strings are kept.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
    )
    parser.add_argument(
        "--tasks",
        "-t",
        default=None,
        type=str,
        metavar="task1,task2",
        help="To get full list of tasks, use the command lm-eval --tasks list",
    )
    parser.add_argument(
        "--model_args",
        "-a",
        default="",
        type=str,
        help="Comma separated string arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
    )
    parser.add_argument(
        "--num_fewshot",
        "-f",
        type=int,
        default=None,
        metavar="N",
        help="Number of examples in few-shot context",
    )
    # NOTE: argparse only runs `type` on string defaults, so the default stays
    # the int 1 while user-supplied values arrive as strings.
    parser.add_argument(
        "--batch_size",
        "-b",
        type=str,
        default=1,
        metavar="auto|auto:N|N",
        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
    )
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=None,
        metavar="N",
        help="Maximal batch size to try with --batch_size auto.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default=None,
        help="Device to use (e.g. cuda, cuda:0, cpu).",
    )
    parser.add_argument(
        "--output_path",
        "-o",
        default=None,
        type=str,
        metavar="DIR|DIR/file.json",
        help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
    )
    parser.add_argument(
        "--limit",
        "-L",
        type=float,
        default=None,
        metavar="N|0<N<1",
        help="Limit the number of examples per task. "
        "If <1, limit is a percentage of the total number of examples.",
    )
    parser.add_argument(
        "--use_cache",
        "-c",
        type=str,
        default=None,
        metavar="DIR",
        help="A path to a sqlite db file for caching model responses. `None` if not caching.",
    )
    parser.add_argument(
        "--cache_requests",
        type=str,
        default=None,
        choices=["true", "refresh", "delete"],
        help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
    )
    parser.add_argument(
        "--check_integrity",
        action="store_true",
        help="Whether to run the relevant part of the test suite for the tasks.",
    )
    parser.add_argument(
        "--write_out",
        "-w",
        action="store_true",
        default=False,
        help="Prints the prompt for the first few documents.",
    )
    parser.add_argument(
        "--log_samples",
        "-s",
        action="store_true",
        default=False,
        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
    )
    parser.add_argument(
        "--system_instruction",
        type=str,
        default=None,
        help="System instruction to be used in the prompt",
    )
    parser.add_argument(
        "--apply_chat_template",
        action="store_true",
        default=False,
        help="If True, applies the chat template to the prompt",
    )
    parser.add_argument(
        "--fewshot_as_multiturn",
        action="store_true",
        default=False,
        help="If True, uses the fewshot as a multi-turn conversation",
    )
    parser.add_argument(
        "--show_config",
        action="store_true",
        default=False,
        help="If True, shows the full config of all tasks at the end of the evaluation.",
    )
    parser.add_argument(
        "--include_path",
        type=str,
        default=None,
        metavar="DIR",
        help="Additional path to include if there are external tasks to include.",
    )
    parser.add_argument(
        "--gen_kwargs",
        type=str,
        default=None,
        help=(
            "String arguments for model generation on greedy_until tasks,"
            " e.g. `temperature=0,top_k=0,top_p=0`."
        ),
    )
    parser.add_argument(
        "--verbosity",
        "-v",
        type=str.upper,
        default="INFO",
        metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
        help="Controls the reported logging error level. Set to DEBUG when testing + adding new task configurations for comprehensive log output.",
    )
    parser.add_argument(
        "--wandb_args",
        type=str,
        default="",
        help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`",
    )
    parser.add_argument(
        "--hf_hub_log_args",
        type=str,
        default="",
        help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
    )
    parser.add_argument(
        "--predict_only",
        "-x",
        action="store_true",
        default=False,
        help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
    )
    # The default is given as a string so that argparse passes it through the
    # same `type` callable as user input, yielding a 4-element list either way.
    default_seed_string = "0,1234,1234,1234"
    parser.add_argument(
        "--seed",
        type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
        default=default_seed_string,  # for backward compatibility
        help=(
            "Set seed for python's random, numpy, torch, and fewshot sampling.\n"
            "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
            "respectively, or a single integer to set the same seed for all four.\n"
            f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
            "(for backward compatibility).\n"
            "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
            "Here numpy's seed is not set since the second value is `None`.\n"
            "E.g, `--seed 42` sets all four seeds to 42."
        ),
    )
    parser.add_argument(
        "--trust_remote_code",
        action="store_true",
        help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
    )
    # Sweep modes: the options below select and configure repeated evaluation
    # over LoRA scales or over a series of checkpoints.
    parser.add_argument(
        "--lora_scaling_experiment",
        action="store_true",
        help="Whether doing lora scaling experiment or not",
    )
    parser.add_argument(
        "--multiple_checkpoints",
        action="store_true",
        help="Whether doing multiple checkpoint experiments or not",
    )
    parser.add_argument(
        "--begin",
        type=int,
        help="Starting of checkpoint",
    )
    parser.add_argument(
        "--end",
        type=int,
        help="Ending of checkpoint",
    )
    parser.add_argument(
        "--step",
        type=int,
        help="Checkpoint examination step",
    )
    parser.add_argument(
        "--final",
        type=int,
        help="Final checkpoint number",
    )
    parser.add_argument(
        "--list_num",
        default=None,
        type=str,
        help="Numbers separated with commas. If provided this will be the interval",
    )

    return parser


def parse_eval_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
    """Validate ``parser`` and parse the process command line with it.

    ``check_argument_types`` runs first, so a parser with an untyped argument
    fails with its ``ValueError`` before any argument is parsed.
    """
    check_argument_types(parser)
    namespace = parser.parse_args()
    return namespace


def _tracker_arg_string(
    hf_hub_log_args: str, output_path: Union[str, None], sub_dir: str = ""
) -> str:
    """Compose the comma-separated argument string for an ``EvaluationTracker``.

    Starting from ``hf_hub_log_args``, appends ``output_path`` (suffixed with
    ``sub_dir``) when an output path is given, and then the ``HF_TOKEN``
    environment variable when it is set. The two additions accumulate, and
    ``sub_dir`` is attached to the output path only, never to the token.
    """
    tracker_args = hf_hub_log_args
    if output_path:
        tracker_args += f",output_path={output_path}{sub_dir}"
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        tracker_args += f",token={hf_token}"
    return tracker_args


def _checkpoint_numbers(args: argparse.Namespace) -> list:
    """Return the checkpoint numbers to evaluate in ``--multiple_checkpoints`` mode.

    An explicit ``--list_num`` is used as-is. Otherwise the list is
    ``begin, begin + step, ...`` up to and including ``end``, followed by
    ``final`` when one was given.

    Raises:
        ValueError: If neither ``--list_num`` nor all of ``--begin``, ``--end``
            and ``--step`` are supplied, or if ``--step`` is not positive.
    """
    if args.list_num:
        return [int(num) for num in args.list_num.split(",")]
    if args.begin is None or args.end is None or args.step is None:
        raise ValueError(
            "--multiple_checkpoints requires --list_num or all of --begin, --end and --step."
        )
    if args.step <= 0:
        # A non-positive step would never reach --end.
        raise ValueError("--step must be a positive integer.")
    numbers = list(range(args.begin, args.end + 1, args.step))
    if args.final is not None:
        numbers.append(args.final)
    return numbers


def _model_args_for_checkpoint(model_args: str, checkpoint: int) -> str:
    """Point the ``peft`` entry of ``model_args`` at one checkpoint.

    The last comma-separated entry containing ``peft`` gets
    ``/trainer_<checkpoint>.0_epoch`` appended to it.

    Raises:
        ValueError: If no entry of ``model_args`` contains ``peft``.
    """
    entries = model_args.split(",")
    peft_positions = [pos for pos, entry in enumerate(entries) if "peft" in entry]
    if not peft_positions:
        raise ValueError("Can not find peft.")
    entries[peft_positions[-1]] += f"/trainer_{checkpoint}.0_epoch"
    return ",".join(entries)


def _evaluate_and_report(
    args: argparse.Namespace,
    model_args: str,
    task_names,
    task_manager,
    evaluation_tracker,
    request_caching_args: dict,
    wandb_logger=None,
    echo_results: bool = False,
) -> None:
    """Run one evaluation and persist/report its results.

    Calls ``evaluator.simple_evaluate`` with ``model_args`` and the remaining
    settings from ``args``. When it returns results, they are optionally
    printed, logged to Weights & Biases (best effort), saved through
    ``evaluation_tracker`` and summarised on stdout. Nothing is reported when
    ``simple_evaluate`` returns ``None``.

    Args:
        args: Parsed CLI namespace.
        model_args: Model argument string for this particular run.
        task_names: Resolved tasks to evaluate.
        task_manager: Task manager forwarded to ``simple_evaluate``.
        evaluation_tracker: Tracker used to save results and samples.
        request_caching_args: Extra keyword arguments for ``simple_evaluate``.
        wandb_logger: Optional W&B logger; skipped when ``None``.
        echo_results: Also print the raw results dict (debug output kept for
            the single-run mode).
    """
    results = evaluator.simple_evaluate(
        model=args.model,
        model_args=model_args,
        tasks=task_names,
        num_fewshot=args.num_fewshot,
        batch_size=args.batch_size,
        max_batch_size=args.max_batch_size,
        device=args.device,
        use_cache=args.use_cache,
        limit=args.limit,
        check_integrity=args.check_integrity,
        write_out=args.write_out,
        log_samples=args.log_samples,
        evaluation_tracker=evaluation_tracker,
        system_instruction=args.system_instruction,
        apply_chat_template=args.apply_chat_template,
        fewshot_as_multiturn=args.fewshot_as_multiturn,
        gen_kwargs=args.gen_kwargs,
        task_manager=task_manager,
        verbosity=args.verbosity,
        predict_only=args.predict_only,
        random_seed=args.seed[0],
        numpy_random_seed=args.seed[1],
        torch_random_seed=args.seed[2],
        fewshot_random_seed=args.seed[3],
        **request_caching_args,
    )
    if results is None:
        return

    # Samples are split off first so they are not part of the dumped/printed
    # results and can be saved per task below.
    samples = results.pop("samples") if args.log_samples else None

    if args.show_config:
        print(
            json.dumps(
                results, indent=2, default=handle_non_serializable, ensure_ascii=False
            )
        )
    if echo_results:
        print("results")
        print(results)
    batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))

    # W&B logging is best effort: a failure is logged and the run continues.
    if wandb_logger is not None:
        try:
            wandb_logger.post_init(results)
            wandb_logger.log_eval_result()
            if args.log_samples:
                wandb_logger.log_eval_samples(samples)
        except Exception as e:
            utils.eval_logger.info(
                f"Logging to Weights and Biases failed due to {e}"
            )

    evaluation_tracker.save_results_aggregated(results=results, samples=samples)

    if args.log_samples:
        for task_name in results["configs"]:
            evaluation_tracker.save_results_samples(
                task_name=task_name, samples=samples[task_name]
            )

    if (
        evaluation_tracker.push_results_to_hub
        or evaluation_tracker.push_samples_to_hub
    ):
        evaluation_tracker.recreate_metadata_card()

    print(
        f"{args.model} ({model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
        f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
    )


def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
    """Command-line entry point: validate options, resolve tasks and evaluate.

    Three mutually exclusive modes are supported:

    * default: a single evaluation with ``args.model_args``;
    * ``--lora_scaling_experiment``: one evaluation per LoRA scale, each with
      ``lora_scale=<value>`` appended to the model args and results written
      under ``<output_path>/sub_lora_<value>``;
    * ``--multiple_checkpoints``: one evaluation per checkpoint number, each
      with the ``peft`` model arg pointed at that checkpoint and results
      written under ``<output_path>/sub_checkpoint_<number>``.

    Args:
        args: Pre-parsed namespace; when falsy, ``sys.argv`` is parsed with
            ``setup_parser``.

    Raises:
        ValueError: On inconsistent options (e.g. ``--log_samples`` without
            ``--output_path``, both sweep modes at once), unknown tasks, or
            incomplete checkpoint-sweep options.
        SystemExit: When no task is given or ``--tasks list`` is requested.
    """
    if not args:
        # we allow for args to be passed externally, else we parse them ourselves
        parser = setup_parser()
        args = parse_eval_args(parser)

    wandb_logger = None
    if args.wandb_args:
        wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args))

    eval_logger = utils.eval_logger
    eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
    eval_logger.info(f"Verbosity set to {args.verbosity}")
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # update the evaluation tracker args with the output path and the HF token
    evaluation_tracker_args = simple_parse_args_string(
        _tracker_arg_string(args.hf_hub_log_args, args.output_path)
    )
    evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)

    if args.predict_only:
        args.log_samples = True
    if (args.log_samples or args.predict_only) and not args.output_path:
        raise ValueError(
            "Specify --output_path if providing --log_samples or --predict_only"
        )

    if args.fewshot_as_multiturn and args.apply_chat_template is False:
        raise ValueError(
            "If fewshot_as_multiturn is set, apply_chat_template must be set to True."
        )

    if (
        args.num_fewshot is None or args.num_fewshot == 0
    ) and args.fewshot_as_multiturn:
        raise ValueError(
            "If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
        )

    if args.include_path is not None:
        eval_logger.info(f"Including path: {args.include_path}")
    task_manager = TaskManager(args.verbosity, include_path=args.include_path)

    if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
        eval_logger.warning(
            "Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
        )

    if args.limit:
        eval_logger.warning(
            " --limit SHOULD ONLY BE USED FOR TESTING."
            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if args.tasks is None:
        eval_logger.error("Need to specify task to evaluate.")
        sys.exit()
    elif args.tasks == "list":
        eval_logger.info(
            "Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks))
        )
        sys.exit()
    else:
        if os.path.isdir(args.tasks):
            # A directory: every YAML file in it is loaded as a task config.
            import glob

            task_names = []
            yaml_path = os.path.join(args.tasks, "*.yaml")
            for yaml_file in glob.glob(yaml_path):
                config = utils.load_yaml_config(yaml_file)
                task_names.append(config)
        else:
            task_list = args.tasks.split(",")
            task_names = task_manager.match_tasks(task_list)
            # Names that did not match a known task may be paths to YAML configs.
            for task in [task for task in task_list if task not in task_names]:
                if os.path.isfile(task):
                    config = utils.load_yaml_config(task)
                    task_names.append(config)
            task_missing = [
                task for task in task_list if task not in task_names and "*" not in task
            ]  # we don't want errors if a wildcard ("*") task name was used

            if task_missing:
                missing = ", ".join(task_missing)
                eval_logger.error(
                    f"Tasks were not found: {missing}\n"
                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
                )
                raise ValueError(
                    f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues."
                )

    # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
    if args.trust_remote_code:
        eval_logger.info(
            "Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
        )
        # HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
        # because it's already been determined based on the prior env var before launching our
        # script--`datasets` gets imported by lm_eval internally before these lines can update the env.
        import datasets

        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True

        args.model_args = args.model_args + ",trust_remote_code=True"

    eval_logger.info(f"Selected Tasks: {task_names}")

    request_caching_args = request_caching_arg_to_dict(
        cache_requests=args.cache_requests
    )
    # NOTE(review): presumably the simple_evaluate used here does not accept
    # this keyword -- confirm before removing the deletion.
    del request_caching_args["delete_requests_cache"]

    # Raised explicitly (not asserted) so the check survives `python -O`.
    if args.lora_scaling_experiment and args.multiple_checkpoints:
        raise ValueError(
            "--lora_scaling_experiment and --multiple_checkpoints are mutually exclusive."
        )

    if args.lora_scaling_experiment:
        print("RANGE")
        for lam in (0.0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5):
            # Each scale gets its own tracker writing to its own sub-directory.
            run_tracker = EvaluationTracker(
                **simple_parse_args_string(
                    _tracker_arg_string(
                        args.hf_hub_log_args, args.output_path, f"/sub_lora_{lam}"
                    )
                )
            )
            _evaluate_and_report(
                args,
                args.model_args + f",lora_scale={lam}",
                task_names,
                task_manager,
                run_tracker,
                request_caching_args,
                wandb_logger,
            )
            # Drop references first, then collect, then release cached CUDA
            # blocks, so memory freed by the collector can be returned too.
            del run_tracker
            gc.collect()
            torch.cuda.empty_cache()
    elif args.multiple_checkpoints:
        for checkpoint in _checkpoint_numbers(args):
            # Each checkpoint gets its own tracker writing to its own sub-directory.
            run_tracker = EvaluationTracker(
                **simple_parse_args_string(
                    _tracker_arg_string(
                        args.hf_hub_log_args,
                        args.output_path,
                        f"/sub_checkpoint_{checkpoint}",
                    )
                )
            )
            _evaluate_and_report(
                args,
                _model_args_for_checkpoint(args.model_args, checkpoint),
                task_names,
                task_manager,
                run_tracker,
                request_caching_args,
                wandb_logger,
            )
            del run_tracker
            gc.collect()
            torch.cuda.empty_cache()
    else:
        _evaluate_and_report(
            args,
            args.model_args,
            task_names,
            task_manager,
            evaluation_tracker,
            request_caching_args,
            wandb_logger,
            echo_results=True,
        )

    if wandb_logger is not None:
        # Tear down wandb run once all the logging is done.
        wandb_logger.run.finish()
        

# Script entry point: run the CLI with arguments parsed from sys.argv
# (cli_evaluate parses them itself when called without a namespace).
if __name__ == "__main__":
    cli_evaluate()