# import pandas as pd

# csv_file_path = '/home/miria/utopia/papers_reports/distilgpt2_hd_sft.csv'

# # Read the CSV file into a DataFrame
# df = pd.read_csv(csv_file_path)

# # Print the contents of the DataFrame
# print(df)


import pandas as pd

# def get_dpo_dataset_dicts():
#     # each item in this list is a dataset dict
#     list_of_dpo_dataset_dicts = []

#     # Simulate the structure of the dataset for demonstration purposes
#     json_data = [
#         {
#             "utterances": [
#                 ["agent1", "Hello! Welcome to the Four Seasons."],
#                 ["agent2", "Hi! I'd like to book a room."],
#                 ["agent1", "Sure, when would you like to check in?"],
#                 ["agent2", "I'd like to check in tomorrow."]
#             ]
#         },
#         {
#             "utterances": [
#                 ["agent1", "Hello, how can I assist you today?"],
#                 ["agent2", "I have a reservation inquiry."],
#                 ["agent1", "Can you please provide more details?"]
#             ]
#         }
#     ]
    
#     for conversation in json_data:
#         dpo_dataset_dict = {}
#         agent1 = []
#         agent2 = []
#         for utterance in conversation['utterances']:
#             if utterance[0] == 'agent1':
#                 agent1.append(utterance[1])
#             else:
#                 agent2.append(utterance[1])

#         # Sometimes agent2 has the final say... ignore the final prompt
#         if len(agent1) == len(agent2):
#             agent2 = agent2[:-1]

#         dpo_dataset_dict['prompt'] = agent2
#         dpo_dataset_dict['chosen'] = agent1[1:]
#         dpo_dataset_dict['rejected'] = agent1[2:] + agent1[:1]
#         list_of_dpo_dataset_dicts.append(dpo_dataset_dict)

#     return list_of_dpo_dataset_dicts

# # Generate dataset and convert to DataFrame
# dataset_dicts = get_dpo_dataset_dicts()
# df = pd.DataFrame(dataset_dicts)

# # Adding the lengths of the prompts, chosen, and rejected sets
# df['prompt_length'] = df['prompt'].apply(len)
# df['chosen_length'] = df['chosen'].apply(len)
# df['rejected_length'] = df['rejected'].apply(len)

# # Display the DataFrame with prompt, chosen, rejected, and their respective lengths
# print(df[['prompt', 'chosen', 'rejected', 'prompt_length', 'chosen_length', 'rejected_length']])

# print(len(ALLpref_dataset_alternate.json))

import json

# Environment sanity check: the import itself is the test. If flash_attn is
# missing or broken, the ImportError surfaces here and the success message
# below is never printed.
import flash_attn  # noqa: F401 -- imported only to verify that it loads

print("Flash Attention is installed correctly!")

# Report how many top-level entries each dataset file contains.
# The earlier version called len() directly on the path string, which prints
# the number of characters in the path rather than the size of the dataset.
for dataset_path in (
    "/home/miria/cvxdpo/datasets/test_dataset.json",
    "/home/miria/cvxdpo/datasets/test_edu_dataset.json",
):
    try:
        with open(dataset_path, encoding="utf-8") as dataset_file:
            # NOTE(review): assumes each file holds a single JSON document
            # (list or dict), not JSON Lines -- confirm against the data.
            print(len(json.load(dataset_file)))
    except (OSError, json.JSONDecodeError) as err:
        # Keep the script usable as a quick diagnostic even when one of the
        # files is absent or malformed: report and move on to the next one.
        print(f"Could not read {dataset_path}: {err}")

