# import pandas as pd

# # Specify the path to your CSV file
# csv_file_path = '/home/miria/utopia/papers_reports/distilgpt2_hd_sft.csv'

# # Read the CSV file into a DataFrame
# df = pd.read_csv(csv_file_path)

# # Print the contents of the DataFrame
# print(df)


import pandas as pd

def get_dpo_dataset_dicts(conversations=None):
    """Build prompt/chosen/rejected lists from two-party conversations.

    Each conversation is a dict whose ``'utterances'`` value is a list of
    ``[speaker, text]`` entries. Entries whose speaker is ``'agent1'`` are
    collected as responses; every other entry is collected as a prompt.

    Args:
        conversations: Optional iterable of conversation dicts. When omitted
            (or ``None``), a built-in sample of two conversations is used,
            so calling the function without arguments behaves as before.

    Returns:
        A list with one dict per conversation, each holding three lists:

        * ``'prompt'``: the non-agent1 utterances, with the last one dropped
          when both speakers have the same number of utterances.
        * ``'chosen'``: the agent1 utterances after the opening one.
        * ``'rejected'``: the agent1 utterances from the third one onward,
          followed by agent1's opening utterance; empty when ``'chosen'``
          is empty.
    """
    if conversations is None:
        # Sample data that simulates the structure of the dataset
        conversations = [
            {
                "utterances": [
                    ["agent1", "Hello! Welcome to the Four Seasons."],
                    ["agent2", "Hi! I'd like to book a room."],
                    ["agent1", "Sure, when would you like to check in?"],
                    ["agent2", "I'd like to check in tomorrow."],
                ]
            },
            {
                "utterances": [
                    ["agent1", "Hello, how can I assist you today?"],
                    ["agent2", "I have a reservation inquiry."],
                    ["agent1", "Can you please provide more details?"],
                ]
            },
        ]

    # each item in this list is a dataset dict
    list_of_dpo_dataset_dicts = []

    for conversation in conversations:
        agent1 = []
        agent2 = []
        for utterance in conversation['utterances']:
            speaker, text = utterance[0], utterance[1]
            if speaker == 'agent1':
                agent1.append(text)
            else:
                agent2.append(text)

        # Sometimes agent2 has the final say; that last prompt has no
        # agent1 reply after it, so it is ignored.
        if len(agent1) == len(agent2):
            agent2 = agent2[:-1]

        # agent1's opening utterance is not a reply to any prompt, so it is
        # excluded from the chosen responses.
        chosen = agent1[1:]

        # Rejected responses are the chosen ones shifted by one position,
        # with the opening utterance filling the last slot. Without any
        # chosen response there is nothing to pair a rejection with, so the
        # list stays empty and keeps the same length as `chosen`.
        rejected = agent1[2:] + agent1[:1] if chosen else []

        list_of_dpo_dataset_dicts.append(
            {
                'prompt': agent2,
                'chosen': chosen,
                'rejected': rejected,
            }
        )

    return list_of_dpo_dataset_dicts

# Build the dataset dicts, load them into a DataFrame (one row per
# conversation) and attach the number of entries in each list column.
dataset_dicts = get_dpo_dataset_dicts()
df = pd.DataFrame(dataset_dicts).assign(
    prompt_length=lambda frame: frame['prompt'].apply(len),
    chosen_length=lambda frame: frame['chosen'].apply(len),
    rejected_length=lambda frame: frame['rejected'].apply(len),
)

# Show the prompt, chosen and rejected lists next to their lengths
print(
    df[
        [
            'prompt',
            'chosen',
            'rejected',
            'prompt_length',
            'chosen_length',
            'rejected_length',
        ]
    ]
)
