import json
import pandas as pd
from datasets import Dataset
from huggingface_hub import login

# System prompt used verbatim as the "system" turn of every conversation built
# by convert_to_sharegpt_format. Because the text sits on its own lines between
# the triple quotes, the resulting string begins and ends with a newline.
# NOTE(review): the prompt asks for the final answer wrapped in asterisks
# inside <answer> tags, while the assistant turn inserts entry['response']
# between the tags as-is -- presumably the stored responses already carry the
# asterisks; confirm against the input data.
# NOTE(review): "occuring" is misspelled; left untouched here because the text
# is emitted into the dataset and changing it changes the generated rows.
SYSTEM_PROMPT = """
Your role as a forecasting assistant involves thoroughly exploring questions through a systematic long thinking process before providing the final probability of the event in the question occuring. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution.

In the Thought section, detail your reasoning process using the specified format: <think> {your detailed reasoning} </think>

In the Solution section, provide your final answer wrapped in asterisks: <answer> *{your final answer}* </answer>

Now, try to make predictions about the following question through the above guidelines:
"""

def convert_to_sharegpt_format(input_files, output_file):
    """Merge curated forecasting JSON files into a chat-style HF dataset.

    Every input file must contain a JSON list of dicts, each providing the
    keys "prompt", "reasoning" and "response". Each entry becomes one
    three-turn conversation (system, user, assistant) and is tagged with the
    source it came from.

    Args:
        input_files: Iterable of paths (str or path-like) to the JSON files.
            A path containing "metaculus" or "manifold" labels all of its
            entries with that source; for any other path an existing
            "source" value on the entry is kept, else "unknown" is used.
        output_file: Directory path handed to ``Dataset.save_to_disk``.

    Returns:
        The ``datasets.Dataset`` with the columns "messages" and "source".

    Raises:
        KeyError: If an entry lacks "prompt", "reasoning" or "response".
    """
    all_data = []

    for input_file in input_files:
        # JSON is UTF-8 by specification; do not rely on the platform's
        # default text encoding.
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # The source depends only on the file name, so decide it once per
        # file. str() lets pathlib.Path inputs work with the substring test.
        path_text = str(input_file)
        if "metaculus" in path_text:
            file_source = "metaculus"
        elif "manifold" in path_text:
            file_source = "manifold"
        else:
            file_source = None

        for entry in data:
            if file_source is not None:
                entry["source"] = file_source
            else:
                # Unrecognised file: keep a source the entry already has,
                # otherwise fall back to a placeholder so building the
                # "source" column below cannot raise KeyError.
                entry.setdefault("source", "unknown")
        all_data.extend(data)

    # One conversation per entry: system prompt, question, then the
    # reasoning and answer wrapped in <think>/<answer> tags.
    messages_list = [
        [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": entry["prompt"]},
            {"role": "assistant", "content": f"<think>{entry['reasoning']}</think>\n\n<answer>{entry['response']}</answer>"},
        ]
        for entry in all_data
    ]
    sources = [entry["source"] for entry in all_data]

    df = pd.DataFrame({
        "messages": messages_list,
        "source": sources
    })

    # Convert to a HuggingFace dataset and persist it locally; uploading to
    # the Hub is left to the caller.
    dataset = Dataset.from_pandas(df)
    dataset.save_to_disk(output_file)

    print(f"Converted {len(all_data)} entries to ShareGPT format")
    return dataset

if __name__ == "__main__":
    # Script entry point: convert the two curated DeepSeek-R1 files, show one
    # converted row for a quick sanity check, then upload the dataset.
    input_file1 = "data/deepseek-r1__metaculus__curated.json"
    input_file2 = "data/deepseek-r1__manifold__curated.json"
    output_folder = "data/shareGPT/r1__curated"

    # Pass the folder defined above; the previous code referenced an undefined
    # name (output_file) here and failed with NameError before doing any work.
    dataset = convert_to_sharegpt_format([input_file1, input_file2], output_folder)

    # Print a sample entry to verify
    print("\nSample entry:")
    sample = dataset[0]
    print(f"Source: {sample['source']}")
    print(f"System message: {sample['messages'][0]['content']}")
    print(f"User message: {sample['messages'][1]['content']}")
    print(f"Assistant message start: {sample['messages'][2]['content']}")

    login()  # This will prompt for your token
    dataset.push_to_hub("shash42/deepseek-r1-forecasting-sft")