# coding: utf-8
import re
import time
# import prompts

from prompts.psyqa.prompt import *
from prompts.tutoring.cima.prompt import *
from run_strategy.dataset import *
from tools.retriever import *
from tools.utils import *
from run_strategy.evaluate import *

from app.sample_resp_cralwer import SampleRespCrawler
from app.chatgpt_resp_cralwer import ChatgptRespCrawler
from app.huggingface_resp_crawler import HuggingFaceRespCrawlerBase


class ThinkPlanDo:
    """Reflect -> plan -> do generation pipeline with built-in evaluation.

    For every dataset sample the pipeline (1) asks the model for a
    reflection ("Thought"), (2) asks it for a plan conditioned on that
    reflection, (3) concatenates the ``Do`` lines of the plan into the final
    generation and (4) scores the generation against the reference.
    Aggregated metrics and the raw generations are written to JSON files by
    :meth:`predict_responses`.
    """

    # Model names that are served through ChatgptRespCrawler; every other
    # model type is handed to HuggingFaceRespCrawlerBase.
    _OPENAI_MODEL_TYPES = ("gpt-3.5-turbo", "gpt-3.5-turbo-0613", "text-davinci-003", "gpt-4-0613")

    # Datasets for which predict_responses knows how to build prompts.
    _SUPPORTED_EVAL_DATASETS = ("psyqa", "cima")

    def __init__(self, key_path, dataset, model_type="chatgpt", model_path="", language="", setting="zero-shot",
            temperature=0.7, persona="", top_p=0.95):
        """Load the dataset wrapper, the retriever and the model backend.

        Args:
            key_path: Forwarded to ``ChatgptRespCrawler`` for the OpenAI-style
                model types.
            dataset: One of ``"hotpotqa"``, ``"psyqa"`` or ``"cima"``. Any
                other value leaves ``self.prompt_constructor`` unset.
            model_type: Model identifier; selects the backend and the token
                unit price.
            model_path: Forwarded to ``HuggingFaceRespCrawlerBase``.
            language: Stored as ``self.language``; overwritten with ``"cn"``
                for psyqa and ``"en"`` for cima.
            setting: Stored as ``self.setting``; not otherwise used here.
            temperature: Sampling temperature forwarded to the backend.
            persona: Forwarded to ``ChatgptRespCrawler``.
            top_p: Nucleus-sampling value forwarded to the backend.
        """
        self.key_path = key_path
        self.dataset = dataset
        self.model_type = model_type
        self.model_path = model_path
        self.language = language
        self.token_unit_price = get_token_unit_price(model_type)

        self.setting = setting
        if self.dataset == "hotpotqa":
            data_paths = ["./dataset/hotpotQA/hotpot_dev_distractor_v1.json"]
            demo_path = "./cot_retrieval/config/prompt_en.json"
            self.prompt_constructor = HotpotQA(data_paths, demo_path=demo_path, mode="dev")
        elif self.dataset == "psyqa":
            data_paths = ["./dataset/PsyQA/psyqa_test.json"]
            self.prompt_constructor = PsyQA(data_paths, mode="dev")
            self.language = "cn"
        elif self.dataset == "cima":
            data_paths = ["./dataset/tutoring/CIMA_test.json"]
            self.prompt_constructor = StrategyTutoring(data_paths, mode="dev")
            self.language = "en"

        self.retriever = Retriever('bm25')
        self.call_retrieval_times, self.persona_right_count, self.knowledge_right_count = 0, 0, 0
        self.init_model_type(temperature, persona, top_p, model_type)

    def init_model_type(self, temperature, persona, top_p, model):
        """Create ``self.sample_crawler`` for the configured model type.

        The backend is chosen from ``self.model_type``; ``model`` is only
        passed through to ``ChatgptRespCrawler``.
        """
        if self.model_type in self._OPENAI_MODEL_TYPES:
            self.sample_crawler = ChatgptRespCrawler(self.key_path, temperature, persona=persona, top_p=top_p, model=model)
        else:
            self.sample_crawler = HuggingFaceRespCrawlerBase(self.model_type, self.model_path, top_p=top_p, temperature=temperature)

    # NOTE: a @retry decorator (re-run on exception, return once no exception
    # is raised) was used here at some point and is currently disabled.
    def get_api_result(self, input_path, output_path):
        """Call the crawler repeatedly until ``is_finished_all_prompts`` is truthy."""
        while not is_finished_all_prompts(input_path, output_path):
            self.sample_crawler.get_all_result(input_path, output_path)

    def _call_model(self, prompt):
        """Send ``prompt`` to the backend, charge its cost to the cache, return the text.

        Requires ``self.cache["prices"]`` to exist (``predict_responses``
        resets it for every sample).
        """
        raw_response = self.sample_crawler.call_openai_each(prompt)
        total_tokens, text = get_response_according_to_model_type(raw_response, self.model_type)

        self.cache["prices"] += total_tokens * self.token_unit_price
        return text

    def reflect_global(self, prompt):
        """Return the model's reflection for ``prompt`` and record its cost."""
        return self._call_model(prompt)

    def plan(self, prompt):
        """Return the model's plan for ``prompt`` and record its cost."""
        return self._call_model(prompt)

    def solve(self, prompt):
        """Return the model's generation for ``prompt`` and record its cost."""
        return self._call_model(prompt)

    def _parse_plans(self, response):
        """Return every line of ``response`` that starts with ``Plan:`` or contains ``plan``."""
        return [
            line for line in response.splitlines()
            if line.startswith("Plan:") or "plan" in line
        ]

    def _parse_planner_evidences(self, response):
        """Extract the payload of every ``Do`` line in ``response``.

        A line starting with ``Do`` contributes the text after its first
        colon. Without a colon it contributes the text after the ``Do``
        prefix, provided the line is longer than 10 characters; shorter
        colon-less lines are ignored.
        """
        evidences = []
        for line in response.splitlines():
            if not line.startswith("Do"):
                continue
            if ":" in line:
                _, _, tool_call = line.partition(":")
                evidences.append(tool_call.strip())
            elif len(line) > 10:
                # Was `line["2:"]`, which raises TypeError (str indexed by a
                # str); the intent is to drop the leading "Do".
                evidences.append(line[2:].strip())
        return evidences

    @staticmethod
    def _mean(values):
        """Return the float64 mean of ``values``, or NaN when ``values`` is empty.

        ``np.mean`` of an empty list also yields NaN but emits
        RuntimeWarnings; several metric lists are legitimately empty because
        each dataset only fills a subset of them.
        """
        if len(values) == 0:
            return float("nan")
        return float(np.mean(values, dtype=np.float64))

    def predict_responses(self):
        """Run the pipeline on up to the first 200 examples and save the results.

        Writes aggregated metrics to
        ``./exp_output/<dataset>/<model_type>_ours_result.json`` and the
        per-sample generations to
        ``./response_output/<dataset>/<model_type>_ours_result.json``.
        Metrics that are not computed for the current dataset are written as
        NaN.

        Raises:
            ValueError: If ``self.dataset`` is neither ``"psyqa"`` nor
                ``"cima"`` (no prompts are defined for other datasets).
        """
        import os

        if self.dataset not in self._SUPPORTED_EVAL_DATASETS:
            raise ValueError(
                f"predict_responses supports datasets {self._SUPPORTED_EVAL_DATASETS}, got {self.dataset!r}"
            )

        # Per-sample metric accumulators.
        f1_scores, avg_bleu_scores = [], []
        d1_scores, d2_scores = [], []
        prices, evaluation_elo = [], []
        bleu_scores, bert_scores = [], []

        for sample in self.prompt_constructor.examples[:200]:
            self._reinitialize()

            # Fresh per-sample cache; "prices" accumulates the API cost.
            self.cache = {"prices": 0}

            if self.dataset == "psyqa":
                question, desc = sample["question"], sample["desc"]
                self.cache["question"] = question
                self.cache["description"] = desc
                self.cache["response"] = sample["answer"]

                sample_prompt = f"Question: {question}\nDescription: {desc}\nThought: "
                reflection_demo, planner_demo = ours_refection_prompt, psyqa_modules_prompt
            else:  # cima
                context = sample["context"]
                self.cache["context"] = context
                self.cache["response"] = sample["references"]

                sample_prompt = f"Dialogue: {context}\nThought: "
                reflection_demo, planner_demo = cima_refection_prompt, cima_modules_prompt

            # [1] Reflect: the sample prompt ends with an open "Thought: ".
            reflection = self.reflect_global(reflection_demo + "\n\n" + sample_prompt)

            # [2] Plan: same sample prompt with the reflection filled in.
            plan = self.plan(planner_demo + "\n\n" + f"{sample_prompt}{reflection}")
            self.plans = self._parse_plans(plan)
            self.planner_evidences = self._parse_planner_evidences(plan)

            # [3] Do: the final generation is the concatenated "Do" payloads.
            generations = " ".join(self.planner_evidences)

            # [4] Evaluate against the reference.
            reference = self.cache["response"]
            if self.dataset == "psyqa":
                f1_scores.append(f1(generations, reference, "cn"))
                d1_scores.append(distinct_n_sentence_level(generations, 1, lang="cn"))
                prices.append(self.cache["prices"])
                d2_scores.append(distinct_n_sentence_level(generations, 2, lang="cn"))
                avg_bleu_scores.append(avg_bleu(generations, reference, lang="cn"))

                evaluation_elo.append({
                    "question": sample["question"],
                    "desc": sample["desc"],
                    "answer": sample["answer"],
                    "generation": generations
                })
            else:  # cima
                f1_scores.append(f1(generations, reference, "en"))
                d1_scores.append(distinct_n_sentence_level(generations, 1, lang="en"))
                prices.append(self.cache["prices"])
                bert_score, bleu_score = tutoring_scores(generations, reference, self.language)
                bleu_scores.append(bleu_score)
                bert_scores.append(bert_score)

                evaluation_elo.append({
                    "context": self.cache["context"],
                    # NOTE(review): key is misspelled; kept as-is because
                    # consumers of the output file may rely on it.
                    "referneces": sample["references"],
                    "plan": self.plans,
                    "generations": generations
                })

        # Aggregate and save the final evaluation results.
        result = {
            "method": "ours",
            "model_type": self.model_type,
            "f1_score": self._mean(f1_scores),
            "d1_score": self._mean(d1_scores),
            "d2_score": self._mean(d2_scores),
            "avg_bleu_score": self._mean(avg_bleu_scores),
            "price": sum(prices),
            "bleu_score": self._mean(bleu_scores),
            "bert_score": self._mean(bert_scores)
        }

        result_path = "./exp_output/" + self.dataset + "/" + self.model_type + "_ours_result.json"
        response_path = "./response_output/" + self.dataset + "/" + self.model_type + "_ours_result.json"

        # Create the target directories on demand; model_type may itself
        # contain "/" and therefore introduce an extra directory level.
        for path in (result_path, response_path):
            os.makedirs(os.path.dirname(path), exist_ok=True)

        with open(result_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=4, ensure_ascii=False)

        with open(response_path, "w", encoding="utf-8") as f:
            json.dump(evaluation_elo, f, indent=4, ensure_ascii=False)

    def _reinitialize(self):
        """Reset the per-sample planning state."""
        self.plans = []
        # A list, matching what _parse_planner_evidences assigns later.
        self.planner_evidences = []
        self.worker_evidences = {}