import datasets
from langchain.llms import OpenAIChat
from langchain import PromptTemplate
from langchain.chains import LLMChain


# Chat model used as the helpfulness judge; temperature is pinned to 0.
lm = OpenAIChat(temperature=0, model_name="gpt-3.5-turbo")

# Prompt asking the model for a bare 1-10 rating of a reply to a post.
# The {history} and {reply} placeholders are filled in on every chain call.
# NOTE(review): the first sentence repeats "helpful" ("How helpful is this
# reply helpful"); the wording is left untouched here because editing the
# prompt changes what the model sees and therefore the measured results.
initial_solution_prompt_template = """
You will be given a Reddit post and a reply. How helpful is this reply helpful for the original poster? Score it on a scale from 1 to 10 where 1 means very unhelpful and 10 means very helpful.

POST:
{history}

Reply:
{reply}

Answer by outputting a number from 1 to 10 (and nothing else).

Answer:"""

helpfulness_prompt = PromptTemplate(
    template=initial_solution_prompt_template,
    input_variables=["history", "reply"],
)

# The chain's result dict carries the model's raw text under the "score" key.
helpfulness_chain = LLMChain(
    prompt=helpfulness_prompt,
    llm=lm,
    output_key="score",
)
# Measure how often the LLM judge agrees with the SHP preference labels on a
# reproducibly shuffled (fixed seed) sample of the train split.
shp = datasets.load_dataset('stanfordnlp/SHP', split='train').shuffle(seed=2137)
num_examples = min(300, len(shp))  # never index past the end of the split
num_correct, total = 0, 0
for i in range(num_examples):
    inputs = shp[i]
    # Reset per example so the error report below can never show stale scores
    # from an earlier iteration or fail with NameError on the very first one.
    score_A = score_B = None
    try:
        # int() raises ValueError when the model answers with anything other
        # than a bare integer; the API call itself may also fail.
        score_A = int(helpfulness_chain(dict(history=inputs['history'], reply=inputs['human_ref_A']))['score'])
        score_B = int(helpfulness_chain(dict(history=inputs['history'], reply=inputs['human_ref_B']))['score'])
    except Exception as exc:
        # Best-effort evaluation: skip examples that could not be scored, but
        # let KeyboardInterrupt/SystemExit propagate so the run can be stopped.
        print(f"Error: {score_A}, {score_B} ({exc!r})")
        continue
    # Correct when the judge rates A at least as high as B and labels == 1, or
    # rates B strictly higher and labels == 0. Ties therefore count toward A.
    is_correct = (
        (score_A >= score_B and inputs['labels'] == 1)
        or (score_A < score_B and inputs['labels'] == 0)
    )
    num_correct += int(is_correct)
    total += 1
    print(f"Score A: {score_A}, Score B: {score_B}, Correct: {is_correct}")
# Accuracy over the successfully scored examples only.
if total:
    print(num_correct / total)
else:
    # Avoid ZeroDivisionError when every example failed to score.
    print("No examples were scored successfully; accuracy is undefined.")
