from datasets import load_dataset, DatasetDict, concatenate_datasets
import openai
import os

# API key for the OpenAI client, read from the environment. This is None
# when OPENAI_API_KEY is unset, in which case API calls are expected to fail.
openai.api_key = os.environ.get('OPENAI_API_KEY')
# NOTE(review): neither constant is referenced in the visible part of this
# file -- presumably sharding parameters (number of parts / examples per
# part). Confirm before relying on them.
multi_part = 3
length_per_part = 20000

#######################
# Runtime configuration
#######################

# Leftover debugging output; `data_args` / `training_args` are not defined
# anywhere in the visible part of this file.
# print(data_args)
# print(training_args)

# Set CUDA_VISIBLE_DEVICES to the empty string for this process and its
# children -- presumably to hide all GPUs, since the visible code only calls
# the OpenAI API (TODO confirm).
os.environ["CUDA_VISIBLE_DEVICES"] = ""


def get_eval(sys_prompt, user_prompt, max_retries=10):
    """Send one system/user prompt pair to GPT-4 and return the reply text.

    Args:
        sys_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        max_retries: Maximum number of API attempts before giving up.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.

    Raises:
        Exception: ``"API Error"`` once every attempt has failed; the last
            underlying error is attached as ``__cause__``.
    """
    request = {
        "model": "gpt-4",
        "messages": [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "temperature": 0,
        "max_tokens": 1024,
        "top_p": 0.6,
        "presence_penalty": 0,
        "frequency_penalty": 0
    }
    last_error = None
    # A bounded loop: the previous `while` never advanced its counter, so a
    # persistent failure (bad key, exhausted quota, ...) retried forever.
    for _ in range(max_retries):
        try:
            response = openai.ChatCompletion.create(**request)
            return response["choices"][0]["message"]["content"].strip()
        except Exception as e:
            # KeyboardInterrupt is not an Exception subclass, so Ctrl-C still
            # propagates without needing a dedicated handler.
            print(e)
            last_error = e
    raise Exception("API Error") from last_error


def get_score(prompt, answer):
    """Ask the model to critique ``answer`` and return its overall score.

    The model is asked to end its reply with ``Overall Score: [1-10]``. The
    reply is split on that marker and the text after it is parsed as a
    number; a reply that cannot be parsed is retried.

    Args:
        prompt: The instruction the answer was written for.
        answer: The answer to be scored.

    Returns:
        The score as a float in the closed range [1, 10].

    Raises:
        Exception: ``"API Error: not a float score"`` when no attempt
            produced a parsable, in-range score; the last underlying error is
            attached as ``__cause__``.
    """
    max_attempts = 4

    system_prompt = "A chat between a curious user and an artificial intelligence expert. The expert gives helpful, specific, and concise answers to the user's questions."

    feedback_prompt = \
        """Given my answer to an instruction, your role is to provide specific and constructive feedback for me. You should find the best way for me to learn from your feedback and improve my performance. 

    You should consider multiple aspects of my answer, including helpfulness, truthfulness, honesty, and to what extent the answer follows instructions.
    ---

    ### Instruction
    {instruction}

    ### Answer
    {completion}
    ---

    Please act as a teacher and provide specific and constructive feedback. Besides describing the weaknesses of the answer, you should also provide specific suggestions to guide me toward understanding how to improve. Please note, however, that your suggestions should help me better complete the instructions, but you should not introduce new requirements that are not mentioned in the instructions. Your feedback should focus on enhancing my ability to think critically and respond accurately. However, never explicitly provide the reference answer, nor do polite phrases be required. Only respond with concise feedback in chat style. Finally, score the overall quality of the answer from 1 to 10, where 1 is the worst and 10 is the best.

    You should follow this format:
    *Format*
    ### Feedback
    [Your feedback]
    Overall Score: [1-10]

    ---

    ### Feedback
    """
    user_prompt = feedback_prompt.format(instruction=prompt, completion=answer)

    last_error = None
    # A bounded loop: the previous `while` never advanced its counter, so an
    # unparsable reply was retried forever.
    for _ in range(max_attempts):
        try:
            response = get_eval(system_prompt, user_prompt)
            parts = response.split("\nOverall Score: ")
            # Explicit checks instead of `assert`, which is stripped when
            # Python runs with -O.
            if len(parts) != 2:
                raise ValueError(
                    "expected exactly one 'Overall Score: ' marker in the reply")
            # Keep the text before the first "." so a sentence-ending period
            # ("Overall Score: 8.") does not break parsing.
            score_text = parts[1].split(".")[0].strip()
            if "/" in score_text:
                # "8/10" -> "8". Parsed with float() below rather than eval(),
                # so model-generated text is never executed as code.
                score_text = score_text.split("/")[0].strip()
            final_score = float(score_text)
            if not 1 <= final_score <= 10:
                raise ValueError(
                    "score {} is outside the range 1-10".format(final_score))
            return final_score
        except Exception as e:
            # get_eval signals API failure with a bare Exception, so the
            # broad catch is needed to retry on it as well.
            print(e)
            last_error = e
    raise Exception("API Error: not a float score") from last_error


def annotate(batch, rank):  # , model, tokenizer):
    """Re-rank each chosen/rejected pair in ``batch`` by its model score.

    Both responses of every example are scored with ``get_score``; the
    higher-scoring one becomes ``chosen`` (ties keep the original order).

    Args:
        batch: Mapping of column name to list of values; reads the
            ``"prompt"``, ``"chosen"`` and ``"rejected"`` columns.
        rank: Unused here. NOTE(review): presumably the process rank passed
            by ``Dataset.map(..., with_rank=True)`` -- confirm against the
            caller.

    Returns:
        The same ``batch`` object with ``"chosen"`` and ``"rejected"``
        rewritten and ``"score_chosen"`` / ``"score_rejected"`` set to the
        corresponding scores.
    """
    chosen = []
    rejected = []
    score_chosen = []
    score_rejected = []
    for prompt, resp_a, resp_b in zip(
            batch["prompt"], batch["chosen"], batch["rejected"]):
        score_a = get_score(prompt, resp_a)
        score_b = get_score(prompt, resp_b)
        print(score_a, score_b)
        if score_a < score_b:
            # The originally rejected response scored higher: swap the pair.
            resp_a, resp_b = resp_b, resp_a
            score_a, score_b = score_b, score_a
        chosen.append(resp_a)
        rejected.append(resp_b)
        score_chosen.append(score_a)
        score_rejected.append(score_b)

    # Write the results back; previously these lists were built and then
    # discarded, so the returned batch was identical to the input.
    batch["chosen"] = chosen
    batch["rejected"] = rejected
    batch["score_chosen"] = score_chosen
    batch["score_rejected"] = score_rejected
    return batch


if __name__ == "__main__":
    # Script entry point. The annotation pipeline below (load dataset -> map
    # `annotate` over it -> push to the Hub) is currently commented out, so
    # running the script only reports whether an API key is configured.
    #  """
    # set_start_method("spawn")

    # Repeats the value already assigned at module level.
    multi_part = 3

    # train_dataset = load_dataset(
    #     "YYYYYYibo/ultrafeedback_binarized_with_response_full_labeled_without_ranked", split="train_prefs",
    #     download_mode="force_redownload", ignore_verifications=True)

    # train_dataset = train_dataset.select(range(100))

    # Report only whether the key is present: printing the secret itself
    # would leak it into terminal scrollback and captured logs.
    if os.environ.get('OPENAI_API_KEY'):
        print("OPENAI_API_KEY is set")
    else:
        print("OPENAI_API_KEY is not set")

    # new_train_dataset = train_dataset.map(
    #     annotate,
    #     batched=True,
    #     batch_size=4,
    #     with_rank=True,
    #     num_proc=1,  # one process per GPU
    # )

    # new_train_dataset.push_to_hub(
    #     "YYYYYYibo/ultrafeedback_binarized_with_response_full_labeled_and_ranked", split="train_prefs", private=False)
