from utils.base_prompts import Prompts
import random
import json
import numpy as np
import itertools
from typing import Dict, List
import shutil
from utils.discoverybench_utils.dataset import (
    get_datasets_fpaths,
    get_dataset_description,
)


class DiscoveryBenchPrompts(Prompts):
    """Prompt templates and prompt builders for DiscoveryBench experiment planning.

    The instance exposes three families of chat prompts, each returned as a
    two-message list (``system`` then ``user``):

    * fresh plans           -> ``build_prompt``
    * plans similar to a set of prior plans -> ``get_neighborhood_samples_prompt``
    * plans that build on prior guesses     -> ``get_opro_samples_prompt``

    All prompt text below is runtime input to a language model. It is kept
    verbatim (including typos such as "modela" / "encooded") so that the model
    sees exactly the same text as before.
    """

    # --- Text shared verbatim by ``system_prompt`` and ``system_prompt_for_neighbors``.
    # The two system prompts differ only in the "Be creative ..." sentence and in
    # instruction #3, which are spelled out separately in ``__init__``.
    _PLANNER_ROLE = (
        "You are a research scientist who is interested in data-driven research using the provided dataset(s) and query. "
    )
    _PLANNER_RULES = (
        "Explain in natural language the experiment plan that the programmer should follow (do not provide the code yourself). "
        "Here are a few instructions that you must follow:\n"
        "1. Strictly use only the dataset(s) provided and do not simulate dummy/synthetic data or columns that cannot be derived from the existing columns.\n"
        "2. The experiment plan should be creative, independent, and self-contained.\n"
    )
    _PLANNER_GUIDANCE = (
        "Here is a possible approach to coming up with a new experiment plan:\n"
        "1. Find an interesting context: this could be a specific subset of the data. E.g., if the dataset has multiple categorical variables, you could split the data based on specific values of such variables, which would then allow you to validate a hypothesis in the specific contexts defined by the values of those variables.\n"
        "2. Find interesting variables: these could be the columns in the dataset that you find interesting or relevant to the context. You are allowed and encouraged to create composite variables derived from the existing variables.\n"
        "3. Find interesting relationships: these are interactions between the variables that you find interesting or relevant to the context. You are encouraged to propose experiments involving complex predictive or causal models.\n"
        "4. You must require that your proposed experiment plan is based on robust statistical tests. Remember, your programmer can install python packages via pip which can allow it to write code for complex statistical analyses.\n"
        "5. Multiple datasets: If you are provided with more than one dataset, then try to also propose an experiment that utilize contexts, variables, and relationships across datasets, e.g., this may involve using join or similar operations.\n\n"
        "Generally, in typical data-driven research, you will need to explore and visualize the data for possible high-level insights, clean, transform, or derive new variables from the dataset to be suited for the investigation, deep-dive into specific parts of the data for fine-grained analysis, perform data modeling, and run statistical tests.\n\n"
        "Examples of valid experiment plans:\n"
        "Experiment plan #1:\n"
        "1. Merge the datasets offshore, immigration, and native_employment on the common columns 'year' and 'beaind'.\n2. Replace infinite values with NaNs and drop rows with NaNs in any column.\n3. Independent variables: 'iv_offshoring_1', 'penetration'\n4. Fit the OLS regression modela\n\n"
        "Experiment plan #2:\n"
        "1. Chose BMI as dependent variable.\n2. Time preference (independent) variables as 'DISSAVED' and 'SAMESAVE'.\n3. Fit an OLS regression model and returned the model summary.\n\n"
        "Experiment plan #3:\n"
        "1. Selected appropriate variables from the raw data\ne.g.: Height: Height of respondent on 1985 instead of 1981.\n      Income: Total net family income, 1989 (There are many other income variables in the raw data)\n      Age:    Age of respondent at 1989 (Derived from Age at 1979).\nData Transformation:\n2. Replaced -1 to -5 values (unavailable data) with NaN\n3. Imputed the missing values in the AGE and INCOME variable with mean.\n4. AGE_1989 had missing values, hence derived the variable as [AGE_1979 + 10]\n5. Created a BMI variable using: bmi = (weight) * 0.453592 / (height) * 0.0254\n6. Divided the Family income variable by 1000$ (Mentioned in the paper)\n7. Created an AGE^2 variable (From the paper)\n8. One-hot encoded RACE variable into BLACK and HISPANIC\n9. One-hot encooded GENDER variable into MALE and FEMALE\n10. Selected 'Was more money put into or taken out of R/spouse savings since last interview, 1989' as the Time Preference variable.\n  DISSAVED = 1 if 'TOOK MORE MONEY OUT' else 0\n  SAMESAVE = 1 if 'NO SAVINGS' or 'NO CHANGE' else 0\n11. Dropped the unimportant columns for replication\n12. DISSAVED and SAMESAVE as independent variables and BMI as dependent variable\n13. Fit an OLS Regression Model\n\n"
    )
    # Output-format reminder appended to every user prompt.
    _RESPONSE_FORMAT_NOTE = """(Note: give only a list of experiment plans in the provided JSON format, e.g. {"response": ["experiment_plan_1", "experiment_plan_2", ...]})"""

    def __init__(self, task, **kwargs):
        """Store the task (via the base class) and define all prompt templates.

        Args:
            task: Task object forwarded to ``Prompts.__init__``. The builders in
                this class read ``dataset_metadata``, ``qid``, ``work_dir``,
                ``migrate_alpha`` and ``target`` from ``self.task``.
            **kwargs: Accepted for signature compatibility; not used here.
        """
        super().__init__(task)

        # System prompt for proposing brand-new plans (must not repeat prior ones).
        self.system_prompt = (
            self._PLANNER_ROLE
            + "Be creative and think of an interesting new experiment to help answer the provided scientific query. "
            + self._PLANNER_RULES
            + "3. Use the prior experiments (if any) as inspiration to think of an interesting and creative new experiment. However, do not repeat the same experiments.\n\n"
            + self._PLANNER_GUIDANCE
        )
        # Placeholders: (dataset/query description, number of plans).
        self.user_prompt = (
            "Plan an experiment to answer the question about the following dataset.\n"
            "%s"
            " Now create exactly %s new experiment plans that could answer the scientific question. "
            + self._RESPONSE_FORMAT_NOTE
        )

        # System prompt for proposing plans in the neighborhood of prior plans
        # (same as above minus the "new" wording and the no-repeat clause).
        self.system_prompt_for_neighbors = (
            self._PLANNER_ROLE
            + "Be creative and think of an interesting experiment to help answer the provided scientific query. "
            + self._PLANNER_RULES
            + "3. Use the prior experiments (if any) as inspiration to think of an interesting and creative new experiment.\n\n"
            + self._PLANNER_GUIDANCE
        )
        # Placeholders: (dataset/query description, prior plans, number of plans).
        self.user_prompt_for_neighbors = (
            "Plan an experiment to answer the question about the following dataset.\n"
            "%s"
            "\n\n##### PRIOR EXPERIMENTS #####\n"
            "%s"
            "\nNow create exactly %s new experiment plans that could answer the scientific question and are **similar** to the prior experiments. "
            + self._RESPONSE_FORMAT_NOTE
        )

        # No dedicated OPRO system prompt; ``get_opro_samples_prompt`` uses
        # ``system_prompt_for_neighbors`` as its system message.
        self.system_prompt_for_opro = ""

        # Placeholders: (dataset/query description, prior plans, number of plans).
        # This used to be an empty string, which made the %-formatting in
        # ``get_opro_samples_prompt`` raise TypeError on every call.
        self.user_prompt_for_opro = (
            "Plan an experiment to answer the question about the following dataset.\n"
            "%s"
            "\n\n##### PRIOR EXPERIMENTS #####\n"
            "%s"
            "\nNow create exactly %s new experiment plans that could answer the scientific question and **improve upon** the prior experiments. "
            + self._RESPONSE_FORMAT_NOTE
        )

    def _stage_datasets_and_describe(self):
        """Copy the task's dataset files into the work dir and return its description.

        Side effect: every path returned by ``get_datasets_fpaths`` is copied
        into ``self.task.work_dir`` with ``shutil.copy``.

        Returns:
            The value of ``get_dataset_description(dataset_metadata, qid)``,
            which callers interpolate into a user prompt.
        """
        for dataset_fpath in get_datasets_fpaths(self.task.dataset_metadata):
            shutil.copy(dataset_fpath, self.task.work_dir)
        return get_dataset_description(self.task.dataset_metadata, self.task.qid)

    @staticmethod
    def _format_prior_plans(plans) -> str:
        """Render plans as consecutive ``Plan #<n>:`` sections (numbered from 1)."""
        return "".join(
            f"\n\nPlan #{number}:\n{plan}\n"
            for number, plan in enumerate(plans, start=1)
        )

    def build_prompt(
        self, use_alternate=False, alt_num_guesses=2
    ) -> List[Dict[str, str]]:
        """
        Constructs a single prompt of the following format:
            [
                {"content": system_prompt, "role": "system"},
                {"content": user_prompt, "role": "user"},
            ]

        Side effect: the task's dataset files are copied into the work dir.

        Args:
            use_alternate: When true, ask for exactly 1 plan instead of
                ``self.task.migrate_alpha`` plans.
            alt_num_guesses: Currently unused; the alternate prompt always
                requests 1 plan. Kept for interface compatibility.

        Returns:
        - List[Dict[str, str]]: A single prompt
        """
        exp_objective = self._stage_datasets_and_describe()
        # Only touch ``migrate_alpha`` when it is actually needed.
        num_plans = 1 if use_alternate else self.task.migrate_alpha
        return [
            {"content": self.system_prompt, "role": "system"},
            {
                "content": self.user_prompt % (exp_objective, num_plans),
                "role": "user",
            },
        ]

    def build_dataset(
        self,
        task_id,
        min_training_size=50,
        max_training_size=80,
        max_seq_len=2048,
    ):
        """Build the (train, validation, test) datasets for this task.

        The training set starts as a single example (the default prompt paired
        with ``self.task.target``). If it is smaller than ``min_training_size``
        each example is repeated until the minimum is exceeded; the result is
        then limited to the last ``max_training_size`` entries. Repeated
        entries are the same dict object, not copies.

        Args:
            task_id: Unused here.
            min_training_size: Size below which the training set is expanded.
            max_training_size: Upper bound applied by slicing from the end.
            max_seq_len: Unused here.

        Returns:
            Tuple ``(training_dataset, validation_dataset, test_dataset)``;
            the validation and test lists are always empty.
        """
        training_dataset = [
            {
                "prompt": self.build_prompt(),
                "problem": [self.task.target],
                "solution": [self.task.target],
            }
        ]
        print("Training dataset size:", len(training_dataset))
        if len(training_dataset) < min_training_size:
            repeats = (min_training_size // len(training_dataset)) + 1
            expanded_training_dataset = [
                item for item in training_dataset for _ in range(repeats)
            ]
            training_dataset = expanded_training_dataset[-max_training_size:]
            print("Extending training dataset to:", len(training_dataset))
        else:
            training_dataset = training_dataset[-max_training_size:]
            print("Clipping training dataset size to:", len(training_dataset))
        print("Example of training prompt:\n", training_dataset[-1]["prompt"])

        validation_dataset = []
        test_dataset = []

        return training_dataset, validation_dataset, test_dataset

    def get_neighborhood_samples_prompt(self, target_input, target_output, alt=False):
        """Build a prompt asking for plans similar to the given prior plans.

        Side effect: the task's dataset files are copied into the work dir.

        Args:
            target_input: Unused here.
            target_output: Iterable of prior plans; each is rendered with
                f-string formatting under a ``Plan #<n>:`` heading.
            alt: Unused here.

        Returns:
            ``[system_message, user_message]`` requesting
            ``self.task.migrate_alpha`` new plans.
        """
        exp_objective = self._stage_datasets_and_describe()
        prev_exp = self._format_prior_plans(target_output)
        return [
            {"content": self.system_prompt_for_neighbors, "role": "system"},
            {
                "content": self.user_prompt_for_neighbors
                % (exp_objective, prev_exp, self.task.migrate_alpha),
                "role": "user",
            },
        ]

    def get_opro_samples_prompt(self, target_input, target_output, alt=False):
        """Build a prompt asking for one new plan that improves on prior guesses.

        Each element of ``target_output`` is expected to be a JSON string of
        the form ``{"response": [plan, ...]}``; the first plan of each guess is
        used. Guesses that cannot be parsed into that shape are skipped
        (best-effort), so the prior-plans section may be empty.

        Side effect: the task's dataset files are copied into the work dir.

        Args:
            target_input: Unused here.
            target_output: Iterable of raw model responses (JSON strings).
            alt: Unused here.

        Returns:
            ``[system_message, user_message]`` requesting exactly 1 new plan.
        """
        prior_plans = []
        for guess in target_output:
            try:
                prior_plans.append(json.loads(guess)["response"][0])
            except (ValueError, KeyError, IndexError, TypeError):
                # Malformed JSON (JSONDecodeError is a ValueError), a missing
                # "response" key, an empty list, or a non-string/non-dict
                # payload: ignore this guess and keep the rest.
                continue

        exp_objective = self._stage_datasets_and_describe()
        return [
            {"content": self.system_prompt_for_neighbors, "role": "system"},
            {
                "content": self.user_prompt_for_opro
                % (exp_objective, self._format_prior_plans(prior_plans), 1),
                "role": "user",
            },
        ]