import json
import os
import pprint
import asyncio
from datetime import datetime
from time import sleep
from tqdm import tqdm
import argparse
from collections import Counter
from distutils.util import strtobool
import multiprocessing as mp

from src.evol.data_utils import load_data
from src.evol.openai_backend import call_chatgpt, LLM
from src.utils.data_utils import extract_answer_math, extract_answer_number
from src.evol.openai_utils import num_tokens_from_messages
from src.utils.code_utils import execute_tora, execute_code_interactive
from src.utils.math_utils import compare_ans, vote
from src.utils.file_utils import load_jsonl_ml


def parse_args():
    """Build the command-line parser for this script and parse ``sys.argv``.

    Returns:
        argparse.Namespace: one attribute per option registered below.
    """
    # (flag, default, type) for every option that takes a value, in the
    # order they should appear in ``--help``.
    valued_options = (
        ("--prompt_path", None, str),
        ("--strategy_path", None, str),
        ("--dataset", "gsm", str),
        ("--data_path", None, str),
        ("--model", "gpt-3.5-turbo", str),
        ("--temperature", 0.0, float),
        ("--top_p", 1.0, float),
        ("--max_tokens", 1024, int),
        ("--num_seqs", 1, int),
        ("--num_skips", 0, int),
        ("--input_col", "question", str),
        ("--output_col", "answer", str),
        ("--max_iter", 3, int),
        ("--num_process", 1, int),
        ("--output_path", None, str),
        ("--batch_size", 10, int),
    )

    cli = argparse.ArgumentParser()
    cli.add_argument("--verbose", action="store_true")
    for flag, default_value, value_type in valued_options:
        cli.add_argument(flag, default=default_value, type=value_type)
    cli.add_argument("--do_recover", action="store_true")
    return cli.parse_args()


def load_prompt(prompt_path):
    """Read the UTF-8 text file at ``prompt_path`` and return it stripped.

    Leading and trailing whitespace is removed from the file contents.
    """
    with open(prompt_path, encoding="utf-8") as handle:
        return handle.read().strip()


def stop_tora(result):
    """Tell whether ``result`` contains one of the two stop markers.

    Returns True when the text contains ``\\boxed`` or the literal
    ``\\underline{correct}``, and False otherwise.
    """
    return "\\boxed" in result or "\\underline{correct}" in result


def clean_strategy(strategy):
    """Extract the body of the last ``## Strategy`` section.

    Keeps the text after the final ``## Strategy`` header (or the whole
    input when the header is absent), cuts it at the next ``##`` and
    strips surrounding whitespace.
    """
    _, _, section = strategy.rpartition("## Strategy")
    body, _, _ = section.strip().partition("##")
    return body.strip()


def get_max_tokens_model(batch_messages, max_tokens, model):
    """Pick a completion budget and model that fit every prompt in the batch.

    For each conversation the completion budget is lowered in steps of 100
    while prompt + budget exceeds 4000 tokens and the budget is still above
    512. If a prompt cannot be made to fit that way, the whole batch is
    moved to ``gpt-3.5-turbo-16k`` with a 1024-token budget.

    Args:
        batch_messages: iterable of conversations, each passed as-is to
            ``num_tokens_from_messages``.
        max_tokens: requested completion budget.
        model: requested model name.

    Returns:
        tuple: ``(max_tokens, model)`` to use for the whole batch. The
        budget is shared, so a reduction caused by one conversation also
        applies to the others.
    """
    for m in batch_messages:
        num_tokens = num_tokens_from_messages(m)
        while num_tokens + max_tokens > 4000 and max_tokens > 512:
            max_tokens -= 100
        # The loop can stop at exactly 512 with the prompt still overflowing
        # (e.g. a starting budget of 512 or 612); checking only "< 512" would
        # miss that case and send an over-long request to the small model.
        if max_tokens < 512 or num_tokens + max_tokens > 4000:
            model = "gpt-3.5-turbo-16k"
            max_tokens = 1024
            break
    return max_tokens, model


def extract_judgement(s):
    """Return the argument of the last ``\\underline`` marker in ``s``.

    Behaviour:
      * no ``\\underline`` in ``s``: ``s`` is returned unchanged;
      * nothing follows the last marker: ``""``;
      * the marker is followed by ``{``: the text up to the matching ``}``
        (nested braces are kept; an unclosed brace yields everything that
        follows);
      * otherwise: the remaining text, stripped.

    The previous bare ``except`` was removed: nothing in the body raises an
    ordinary exception for string input, so it could only hide
    ``KeyboardInterrupt``/``SystemExit``.
    """
    _, marker, tail = s.rpartition("\\underline")
    if not marker:
        return s
    if not tail:
        return ""
    if tail[0] != "{":
        return tail.strip()

    # Walk the braces, tracking nesting depth, and stop at the brace that
    # closes the one opened right after the marker.
    depth = 1
    collected = []
    for c in tail[1:]:
        if c == "{":
            depth += 1
        elif c == "}":
            depth -= 1
            if depth == 0:
                break
        collected.append(c)
    return "".join(collected)


def clean_feedback(feedback):
    """Extract the body of the last ``## Feedback`` section.

    Keeps the text after the final ``## Feedback`` header (or the whole
    input when the header is absent), cuts it at the next ``##`` and
    strips surrounding whitespace.
    """
    _, _, section = feedback.rpartition("## Feedback")
    body, _, _ = section.strip().partition("##")
    return body.strip()


def parse_output(output):
    """Split a model response into its feedback and refined solution.

    Args:
        output: full response text.

    Returns:
        tuple: ``(feedback, solution)``.
          * With exactly one ``## Refined Solution`` header: the cleaned
            text before it and the text after it (stripped, with
            surrounding ``:`` removed).
          * Without that header but with ``\\underline{correct}`` present
            (spaces ignored): the cleaned response and ``""``.
          * Otherwise, including a repeated header: ``(None, None)``.
    """
    marker = "## Refined Solution"
    if marker in output:
        parts = output.split(marker)
        # A repeated header makes the split ambiguous. This explicit check
        # replaces a bare ``except`` around tuple unpacking, which also hid
        # unrelated errors.
        if len(parts) != 2:
            return None, None
        feedback, solution = parts
        solution = solution.strip().strip(":").strip()
    else:
        if "\\underline{correct}" not in output.replace(" ", ""):
            return None, None
        feedback, solution = output, ""
    feedback = clean_feedback(feedback)
    return feedback, solution


def batch_tora(llm, batch_messages_base):
    """Run the iterative generate / execute-code loop for a batch of prompts.

    Round 0 queries ``llm.achat`` with the base messages. After every round,
    each output that does not satisfy ``stop_tora`` is queued for another
    round; if it ends with a code fence, the code is run through
    ``execute_code_interactive`` and the result is appended as an
    ``output`` block. Follow-up prompts are the original user content
    followed by everything accumulated so far for that sequence.

    The loop starts with a budget of 3 rounds, which can grow up to 5
    (see the comments below).

    NOTE: sampling settings are read from the module-level ``args``
    (``args.max_tokens``, ``args.model``, ``args.temperature``,
    ``args.num_seqs``), not from a parameter, so ``args`` must exist as a
    global in the process that calls this function.

    Args:
        llm: object exposing an ``achat(...)`` coroutine.
        batch_messages_base: list of conversations; element ``[1]`` of each
            conversation is read as the user message (``["content"]``).

    Returns:
        list[list[str]]: for each conversation, one accumulated text per
        sequence returned in round 0 (model output plus appended code
        results).
    """
    max_iters = 3
    itr = 0
    while itr < max_iters:
        if itr == 0:
            # First round: sample from the base prompts with the CLI settings.
            max_tokens, model = get_max_tokens_model(
                batch_messages_base, args.max_tokens, args.model
            )
            print(max_tokens)
            batch_outputs = asyncio.run(
                llm.achat(
                    batch_messages_base,
                    model=model,
                    stop=["```output", "---"],
                    max_tokens=max_tokens,
                    temperature=args.temperature,
                    num_beams=args.num_seqs,
                )
            )
            # One accumulator string per (conversation, sequence) pair.
            batch_full_outputs = [["" for o in outputs] for outputs in batch_outputs]
        else:
            # Later rounds: continue only the unfinished sequences, one
            # deterministic completion each (temperature 0, num_beams 1).
            # `batch_messages` and `remain_ids` were built at the end of the
            # previous iteration.
            max_tokens, model = get_max_tokens_model(
                batch_messages, args.max_tokens, args.model
            )
            print(max_tokens)
            batch_outputs = asyncio.run(
                llm.achat(
                    batch_messages,
                    model=model,
                    stop=["```output", "---"],
                    max_tokens=max_tokens,
                    temperature=0,
                    num_beams=1,
                )
            )
            # Scatter the flat continuation results back into the
            # (conversation, sequence) grid; finished slots stay None.
            temp_outputs = [[None for _ in outputs] for outputs in batch_full_outputs]
            for outputs, idx in zip(batch_outputs, remain_ids):
                temp_outputs[idx[0]][idx[1]] = outputs[0]
            batch_outputs = temp_outputs
        # print(batch_outputs)
        remain_ids = []
        should_continue = False
        for i, outputs in enumerate(batch_outputs):
            for j, output in enumerate(outputs):
                if output is not None:
                    batch_full_outputs[i][j] += output
                    if not stop_tora(output):
                        if output.strip().endswith("```"):
                            # The model stopped right after a code block:
                            # run it and feed the result back.
                            code_output = execute_code_interactive(output)
                            print("code output", code_output)
                            if (
                                len(code_output) > 0
                                and "SyntaxError" not in code_output
                            ):
                                code_output = f"```output\n{code_output}\n```\n"
                                batch_full_outputs[i][j] += code_output
                            # From the third round on, a transcript without
                            # the word "error" earns the batch an extra round.
                            if itr >= 2 and "error" not in batch_full_outputs[i][j].lower():
                                should_continue = True
                        elif output == "" and max_iters < 5:
                            # An empty completion grants one more round
                            # (capped at 5 in total).
                            max_iters += 1
                        # Every sequence without a stop marker is retried.
                        remain_ids.append((i, j))
        if len(remain_ids) == 0:
            break
        if should_continue and max_iters < 5:
            max_iters += 1
        # Next prompt = original user content + everything generated so far.
        user_messages = [
            batch_messages_base[idx[0]][1]["content"]
            + batch_full_outputs[idx[0]][idx[1]]
            for idx in remain_ids
        ]
        batch_messages = [
            [
                {
                    "role": "system",
                    "content": "You are a helpful expert for math problem solving.",
                },
                {
                    "role": "user",
                    "content": user_message,
                },
            ]
            for user_message in user_messages
        ]
        itr += 1
    return batch_full_outputs


def need_recover(sample):
    """Decide whether a previously saved sample has to be generated again.

    A sample needs recovery when any saved generation starts with a code
    fence or mentions ``SyntaxError``, or when it has no usable feedback:
    the feedback list is empty or its first entry is empty.

    Args:
        sample: dict with ``"generation"`` and ``"feedback"`` lists.

    Returns:
        bool: True when the sample should be re-run.
    """
    for generation in sample["generation"]:
        if generation.startswith("```") or "SyntaxError" in generation:
            return True
    feedback = sample["feedback"]
    # An empty feedback list is what gets saved when no output could be
    # parsed; indexing [0] directly would raise IndexError on exactly the
    # samples that most need a retry.
    if not feedback or not feedback[0]:
        return True
    return False


def _render_prompt(prompt, sample):
    """Fill the ``{question}``/``{solution}``/``{gold}`` slots from one sample."""
    rendered = prompt.replace("{question}", sample["question"]).replace(
        "{solution}", sample["generation"][0]
    )
    # ``{gold}`` is only substituted for samples that carry reference steps.
    if "steps" in sample:
        rendered = rendered.replace("{gold}", sample["steps"])
    return rendered


def main(args, samples, idx):
    """Critique/refine the first generation of every sample and save results.

    For each batch, the prompt template is filled from the sample, sent
    through ``batch_tora``, and every returned text is parsed into feedback
    and a refined solution. A sample scores 1 when any predicted answer
    matches ``sample["answer"]`` according to ``compare_ans``. Each sample
    dict is updated in place (``old_generation``, ``old_score``,
    ``generation``, ``feedback``, ``pred_answers``, ``label_answer``,
    ``score``) and written to the output file as indented JSON.

    With ``args.do_recover`` the existing output file is loaded first and
    backed up to ``*.cache.jsonl``; entries for which ``need_recover`` is
    false are copied through without calling the model.

    NOTE: ``args.num_skips`` only selects append mode for the output file;
    no samples are actually skipped.

    Args:
        args: parsed command-line namespace.
        samples: list of sample dicts with at least ``question``,
            ``generation``, ``score`` and ``answer``.
        idx: worker index; ``-1`` means single-process, otherwise ``_{idx}``
            is inserted before ``.jsonl`` in the output path.
    """
    # load prompt
    prompt = load_prompt(args.prompt_path)
    if idx <= 0:
        print(prompt)
    os.makedirs(f"result/{args.model}/{args.dataset}", exist_ok=True)
    if args.output_path is None:
        output_path = f"result/{args.model}/{args.dataset}/t{args.temperature}_n{args.num_seqs}-train_specific.jsonl"
    else:
        output_path = args.output_path
    # Apply the per-worker suffix before logging so the message names the
    # file that is really written.
    if idx != -1:
        output_path = output_path.replace(".jsonl", f"_{idx}.jsonl")
    print("%" * 30, "Tora", "%" * 30)
    print("Start PID %d and save to %s" % (os.getpid(), output_path))

    scores = []
    if args.do_recover:
        ref_samples = load_jsonl_ml(output_path)
        # Keep a backup: the output file is reopened for writing below.
        with open(output_path.replace(".jsonl", ".cache.jsonl"), "w") as f:
            for s in ref_samples:
                f.write(json.dumps(s, ensure_ascii=False, indent=4) + "\n")
    else:
        ref_samples = []
    llm = LLM()
    batch_size = args.batch_size
    # An empty split would otherwise fail on samples[0].
    if len(samples) > 0:
        print(samples[0]["question"])
    with open(output_path, "w" if args.num_skips == 0 else "a") as f:
        for i in tqdm(range(0, len(samples), batch_size)):
            batch_samples = samples[i : i + batch_size]
            run_samples = []
            run_ids = []
            batch_save_samples = [{} for _ in range(len(batch_samples))]
            print(len(ref_samples))
            for j, sample in enumerate(batch_samples):
                if i + j < len(ref_samples):
                    if not need_recover(ref_samples[i + j]):
                        # Previously saved result is usable: copy it through.
                        batch_save_samples[j] = ref_samples[i + j]
                        continue
                    print("Need", j)
                run_samples.append(sample)
                run_ids.append(j)
            print(run_ids)
            print(run_samples)
            batch_messages = [
                [
                    {
                        "role": "system",
                        "content": "You are a helpful expert for math problem solving.",
                    },
                    {
                        "role": "user",
                        "content": _render_prompt(prompt, s),
                    },
                ]
                for s in run_samples
            ]
            # Nothing to generate when every sample of the batch was recovered.
            batch_outputs = batch_tora(llm, batch_messages) if batch_messages else []
            for s, outputs, run_id in zip(run_samples, batch_outputs, run_ids):
                feedbacks, solutions, pred_anss = [], [], []
                for o in outputs:
                    feedback, solution = parse_output(o)
                    if feedback is None:
                        continue
                    judgement = extract_judgement(feedback)
                    if len(judgement.strip()) == 0:
                        continue
                    feedbacks.append(feedback)
                    solutions.append(solution)
                    if judgement == "correct":
                        # Judged correct: the answer comes from the original
                        # generation, not from a refined solution.
                        pred_anss.append(extract_answer_number(s["generation"][0]))
                    else:
                        pred_anss.append(extract_answer_math(solution))
                label_ans = s["answer"]
                score = 0
                for p in pred_anss:
                    if compare_ans(p, label_ans):
                        score = 1
                        break
                scores.append(score)
                # The sample dict is updated in place; the previous
                # generation and score are kept under "old_*" keys.
                save_sample = s
                save_sample["old_generation"] = save_sample["generation"]
                save_sample["old_score"] = save_sample["score"]
                save_sample["generation"] = solutions
                save_sample["feedback"] = feedbacks
                save_sample["pred_answers"] = pred_anss
                save_sample["label_answer"] = label_ans
                save_sample["score"] = score
                batch_save_samples[run_id] = save_sample
            print(batch_save_samples)
            print("save", len(batch_save_samples))
            for save_sample in batch_save_samples:
                f.write(json.dumps(save_sample, ensure_ascii=False, indent=4) + "\n")
                f.flush()
    # `scores` is empty when there were no samples or all were recovered.
    if scores:
        print(f"Accuracy - {sum(scores) / len(scores)}")
    else:
        print("Accuracy - n/a (no samples were scored)")


if __name__ == "__main__":
    # Script entry point: parse the CLI, load the dataset, then either run
    # in-process or fan the samples out over a pool of worker processes.
    # `args` is deliberately a module-level name: batch_tora reads it as a
    # global.
    args = parse_args()
    samples = load_data(args.dataset, args.data_path)
    if args.num_process == 1:
        main(args, samples, idx=-1)
    else:
        # Equal contiguous slices; the last worker also takes the remainder.
        num_each_split = len(samples) // args.num_process
        p = mp.Pool(args.num_process)
        pending = []
        for idx in range(args.num_process):
            start = idx * num_each_split
            if idx == args.num_process - 1:
                end = len(samples)
            else:
                end = start + num_each_split
            split_data = samples[start:end]
            # More workers than samples leaves some slices empty; skip them.
            if len(split_data) == 0:
                continue
            pending.append(
                (
                    idx,
                    p.apply_async(
                        main,
                        args=(
                            args,
                            split_data,
                            idx,
                        ),
                    ),
                )
            )
        p.close()
        p.join()
        # apply_async stores a worker's exception in its AsyncResult; without
        # calling get() a crashed worker goes completely unnoticed.
        for idx, result in pending:
            try:
                result.get()
            except Exception as exc:
                print("Child process %d failed: %r" % (idx, exc))
        print("All of the child processes over!")
