from huggingface_hub import hf_hub_download
import pandas as pd
import requests
import json
import time
import numpy as np
import pickle


# To ensure anonymity for the double-blind review process, the specific source URL is masked.
# The data is sourced from a publicly available LLM ranking benchmark dataset.
# The reviewer can run this script by downloading the Nectar dataset mentioned in the paper and saving it locally as 'rlaif_data.parquet'.
# Load the ranking dataset from the local parquet file. Everything below
# depends on `dataset`, so a failed load stops the script right here with the
# underlying cause instead of continuing and failing later on an undefined name.
try:
    dataset = pd.read_parquet("rlaif_data.parquet")
except Exception as exc:
    raise SystemExit(f"Could not load 'rlaif_data.parquet': {exc}") from exc

print('Downloaded Dataset')


# Chat endpoint of the locally running model server that every query is sent to.
# NOTE(review): port 11434 and the /api/chat path look like a default Ollama
# install -- confirm against the deployment used for the experiments.
url = "http://localhost:11434/api/chat"


def chat(model, prompt, retries=3, backoff=2.0):
    """Send ``prompt`` as a single user message to ``model`` and return the reply.

    Args:
        model: Model tag forwarded in the request payload.
        prompt: Text sent as the only message (role ``user``).
        retries: Total number of attempts (not additional retries); must be >= 1.
        backoff: Base delay in seconds. After failed attempt ``k`` (0-based)
            the function sleeps ``backoff * 2**k`` before trying again.

    Returns:
        The ``message.content`` field of the decoded JSON reply. When the
        reply does not contain that field, the whole decoded payload is
        returned instead, so callers that need text should check the type.

    Raises:
        ValueError: If ``retries`` is smaller than 1 (previously this silently
            returned ``None`` without sending anything).
        requests.ConnectionError, requests.Timeout: If the last attempt still
            fails to reach the server.
        requests.HTTPError: If the server answers with an error status. The
            response body, when present, is appended to the message.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
    }
    for attempt in range(retries):
        try:
            r = requests.post(url, headers={"Content-Type": "application/json"},
                              json=data, timeout=120)
            r.raise_for_status()
        except (requests.ConnectionError, requests.Timeout):
            # Only transport-level failures are retried; give up on the last attempt.
            if attempt == retries - 1:
                raise
            time.sleep(backoff * (2 ** attempt))
            continue
        except requests.HTTPError as err:
            # Surface the server's own error text if present. `is not None` is
            # required: an error Response object evaluates as falsy.
            response = err.response
            detail = response.text.strip() if response is not None else ""
            if not detail:
                raise
            raise requests.HTTPError(
                f"{err} | server response: {detail}", response=response
            ) from err

        j = r.json()
        # Defensive parsing: tolerate payloads whose "message" is missing or
        # not a mapping by falling back to the raw payload.
        message = j.get("message") if isinstance(j, dict) else None
        if isinstance(message, dict) and "content" in message:
            return message["content"]
        return j

def llama3_1b(prompt):
    """Query the ``llama3.2:1b`` model with ``prompt`` through ``chat`` and return its reply."""
    model_tag = "llama3.2:1b"
    return chat(model_tag, prompt)

def llama3_3b(prompt):
    """Query the ``llama3.2:3b`` model with ``prompt`` through ``chat`` and return its reply."""
    model_tag = "llama3.2:3b"
    return chat(model_tag, prompt)


def parse_response(response):
    """Parse a chatbot reply into a list of seven rankings.

    Only the first 20 characters of ``response`` are scanned. Every decimal
    digit between 1 and 7 found there is collected in order of appearance.
    Duplicate ranks are accepted; the list is not checked to be a permutation.

    Args:
        response: Text returned by the model, e.g. ``'[3,4,5,2,1,6,7]'``.

    Returns:
        A list of exactly seven ints, or ``None`` when the scanned prefix does
        not contain exactly seven usable digits or ``response`` is not a string.
    """
    if not isinstance(response, str):
        return None

    # isdecimal() rather than isnumeric(): characters such as '²' or '½' are
    # "numeric" but make int() raise ValueError.
    rankings = [
        int(ch)
        for ch in response[:20]
        if ch.isdecimal() and 1 <= int(ch) <= 7
    ]
    return rankings if len(rankings) == 7 else None

def pairwise_order():
    """Draw 21 random ordered pairs of distinct model labels.

    Each pair is sampled independently from ``'Model 1'`` .. ``'Model 7'``
    without replacement inside the pair, so the two labels of a pair always
    differ, but the same pair may appear more than once across the list.

    Returns:
        A list of 21 ``(label, label)`` tuples.
    """
    model_numbers = [1, 2, 3, 4, 5, 6, 7]
    pairs = []
    for _ in range(21):
        first, second = np.random.choice(model_numbers, 2, replace=False)
        pairs.append((f"Model {first}", f"Model {second}"))
    return pairs



# Target number of successfully ranked datapoints.
T = 15000

# Models we keep rankings for, mapped to their column in the output array.
model_indices = {
    'gpt-4-0613': 0,
    'gpt-4': 1,
    'gpt-3.5-turbo-instruct': 2,
    'anthropic': 3,
    'mistral-7b-instruct-v0.1': 4,
    'gpt-3.5-turbo': 5,
}

# Output array indexed as [datapoint, model column, judge]; the loop below
# stores the 3b judge's rank at judge index 0 and the 1b judge's at index 1.
new_dataset = np.zeros((T, 6, 2))

# Progress counters: prompts sent to the judges / datapoints actually stored.
tried = 0
count = 0
lengths = []

# Number of answers shown to the judges per datapoint; the prompts below and
# parse_response are written for exactly this many.
_NUM_ANSWERS = 7
# Only look at short prompts to save time.
_MAX_PROMPT_CHARS = 3000
# Write the collected rows to disk every this many stored datapoints.
_CHECKPOINT_EVERY = 100
_OUTPUT_PATH = 'llama3-1bvs3b_LLMdataset.pickle'

_JUDGE_INSTRUCTIONS = "You are an impartial judge. Given a conversation and 7 assistant responses, rank them from best to worst using pairwise comparisons in the provided order. For each comparison, select the better response according to this rubric: relevance to the request, factual accuracy, creativity when appropriate (or analytical correctness when expected), and sufficient detail. Break ties randomly. Avoid bias from response order, length, or assistant names. Output only the final ranking in the required format. Conversation:"
_EXTRACTION_INSTRUCTIONS = "The following answer provides a ranking of 7 models. Please extract the rankings in the form of a list with each entry corresponding to that models ranking.  Thus, in your output each entry will indicate the rank (out of 7) of the corresponding model's answer. An example of an acceptable answer is '[3,4,5,2,1,6,7]' which indicates that Model 1 came 3rd overall, Model 2 came 4th overall, and so on. Please only provide the list and no other text: "


def _save_checkpoint(rows_collected):
    """Pickle the first ``rows_collected`` rows of ``new_dataset`` to the output file."""
    with open(_OUTPUT_PATH, 'wb') as handle:
        pickle.dump(new_dataset[:rows_collected, :, :], handle)


def _rankings_from(ask, query):
    """Have one judge rank the answers, then have it extract the ranking list.

    ``ask`` is one of the model query functions. Returns the parsed list of
    rankings, or ``None`` if either reply is not text or cannot be parsed.
    """
    verdict = ask(query)
    if not isinstance(verdict, str):
        # chat() can hand back a raw payload instead of text; treat it as unparsable.
        return None
    extracted = ask(_EXTRACTION_INSTRUCTIONS + 'Answer: ' + verdict)
    if not isinstance(extracted, str):
        return None
    return parse_response(extracted)


# Iterate through the data until T datapoints have been stored.
for _, row in dataset.iterrows():
    answers = row['answers']
    models = [answer['model'] for answer in answers]

    # Skip rows that cannot fill all answer slots (previously an IndexError).
    if len(answers) < _NUM_ANSWERS:
        continue
    # Ensure the datapoint contains all models that we want, otherwise skip it.
    if not all(name in models for name in model_indices):
        continue

    # Build prompt and answers for the LLM query, presenting answers in random order.
    prompt = row['prompt'].split('\n\nAssistant:')[0]
    shuffle_order = list(range(_NUM_ANSWERS))
    np.random.shuffle(shuffle_order)
    answers_strings = [answers[i]['answer'] for i in shuffle_order]
    # Labels are 1-based so they match the 'Model 1'..'Model 7' names used in
    # the pairwise order and in the extraction prompt; 0-based labels shifted
    # every rank onto the wrong answer.
    only_ans = "".join(
        " [MODEL " + str(label) + " RESPONSE START]: " + text
        + "[MODEL " + str(label) + " RESPONSE END], "
        for label, text in enumerate(answers_strings, start=1)
    )
    full_prompt = ("['INPUT: " + prompt + "']," + only_ans
                   + " PAIRWISE EVALUATION ORDER:" + str(pairwise_order())
                   + " OUTPUT:")

    if len(full_prompt) >= _MAX_PROMPT_CHARS:
        continue

    tried += 1
    query = _JUDGE_INSTRUCTIONS + full_prompt

    # Query the larger model first; the smaller one is only asked when the
    # first ranking is usable, since the datapoint is discarded otherwise.
    rankings_3b = _rankings_from(llama3_3b, query)
    rankings_1b = _rankings_from(llama3_1b, query) if rankings_3b is not None else None

    # If an LLM returned something unparsable we discard the datapoint.
    if rankings_3b is None or rankings_1b is None:
        print(tried, count)
        continue

    # Map each shuffled position back to its original answer and store both ranks.
    for position, original_index in enumerate(shuffle_order):
        column = model_indices.get(models[original_index])
        if column is not None:
            new_dataset[count, column, 0] = rankings_3b[position]
            new_dataset[count, column, 1] = rankings_1b[position]
    count += 1
    print(rankings_3b, rankings_1b)
    print(tried, count)

    if count % _CHECKPOINT_EVERY == 0:
        _save_checkpoint(count)

    if count == T:
        break

# Persist the tail that did not land on a checkpoint boundary, e.g. when the
# dataset is exhausted before T datapoints were collected.
if count % _CHECKPOINT_EVERY != 0:
    _save_checkpoint(count)