import pandas as pd
from datasets import Dataset
from sklearn.utils import shuffle

def read_data(data_path):
    """Load a CSV file and discard incomplete rows.

    Args:
        data_path: Path (or any source accepted by ``pandas.read_csv``)
            of the CSV file to load.

    Returns:
        A DataFrame holding only the rows that have no missing values.
        The surviving rows keep their original index labels.
    """
    return pd.read_csv(data_path).dropna()

# Columns (presumably of the input CSV): alias, context, options, options_info, label

def prepare_dataset(df):
    """Turn raw rows into prompt form, modifying *df* in place.

    The ``label`` column is renamed to ``correct answer``, a ``pretext``
    column holding the prompt text is added, and the ``context`` and
    ``alias`` columns it was built from are removed. The resulting row
    count is printed.

    Args:
        df: DataFrame with at least ``context`` and ``alias`` columns.
            It is mutated in place.

    Returns:
        The same (mutated) DataFrame object, so calls can be chained.

    Raises:
        KeyError: If ``context`` or ``alias`` is missing from *df*.
    """
    df.rename(columns={'label': 'correct answer'}, inplace=True)
    # Zip the two columns directly rather than using a row-wise
    # ``DataFrame.apply``: no per-row Series is constructed, and an empty
    # frame simply yields an empty column.
    # The context is followed immediately by "Question:" with no separator,
    # so any line break between them must come from the context itself.
    df['pretext'] = [
        f'Conversation:\n{context}Question: What does the "{alias}" refer to?\n'
        for context, alias in zip(df['context'], df['alias'])
    ]
    df.drop(['context', 'alias'], axis=1, inplace=True)
    print(len(df))
    return df

if __name__ == "__main__":
    # Script entry point: load the cleaned CSV, then reshape it in place.
    frame = read_data('./data/esd_clean-yes-2.csv')
    prepare_dataset(frame)