import os
import pandas as pd



datasets = ['wikitq', 'tabfact', 'finqa', 'hitab', 'multihiertt', 'aitqa', 'tablebench']

# No train.parquet is read for these datasets; only their test split is loaded.
datasets_without_train = ('aitqa', 'tablebench')

# Per-dataset frames, collected in the order given by `datasets`.
train_frames = []
test_frames = []

# Row counts per dataset, recorded before merging (used for reporting).
train_counts = {}
test_counts = {}

for dataset in datasets:
    if dataset not in datasets_without_train:
        train_split = pd.read_parquet(f'data/processed_data/formula/qwen/{dataset}/train.parquet')
        train_frames.append(train_split)
        train_counts[dataset] = len(train_split)

    test_split = pd.read_parquet(f'data/processed_data/formula/qwen/{dataset}/test.parquet')
    test_frames.append(test_split)
    test_counts[dataset] = len(test_split)


def _merge_and_shuffle(frames):
    """Concatenate *frames*, shuffle every row with a fixed seed, and renumber the index."""
    merged = pd.concat(frames)
    # frac=1 samples all rows, i.e. a full shuffle; the seed keeps it reproducible.
    shuffled = merged.sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)


# merge and shuffle
train = _merge_and_shuffle(train_frames)
test = _merge_and_shuffle(test_frames)


def transform_data(df):
    """Convert a prompt DataFrame to its "without reasoning" variant.

    Every ``data_source`` value gets the suffix ``_wo_reason``, and every
    ``prompt`` value has a fixed, ordered series of literal substring
    replacements applied to it (the reasoning instructions are removed or
    reworded, and the trailing ``<think>`` opener becomes ``<answer>``).

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame with ``data_source`` and ``prompt`` columns. Values in
            both columns are presumably plain strings (they must support
            ``+ str`` and ``.replace(old, new)`` respectively).

    Returns:
        The same DataFrame object, with both columns rewritten.
    """
    # (old, new) pairs; applied to each prompt in exactly this order.
    substitutions = (
        (
            'You first think about the reasoning process in the mind and then provides the user with the answer.',
            'You need to provide the user with the answer directly.',
        ),
        ('Show your reasoning within <think> </think> tags. ', ''),
        ('<think>\n[step-by-step reasoning]\n</think>\n', ''),
        (
            'Let me write the spreadsheet formula with reasoning.\n<think>',
            'Let me write the spreadsheet formula.\n<answer>',
        ),
    )

    def strip_reasoning(text):
        # Chain the replacements so one pass over the column is enough.
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    df['data_source'] = df['data_source'].apply(lambda source: source + '_wo_reason')
    df['prompt'] = df['prompt'].apply(strip_reasoning)
    return df

# Rewrite both merged splits into their no-reasoning form.
train = transform_data(train)
test = transform_data(test)


# save
os.makedirs('data/processed_data/wo_reason', exist_ok=True)
train.to_parquet('data/processed_data/wo_reason/train.parquet', index=False)
test.to_parquet('data/processed_data/wo_reason/test.parquet', index=False)

# print statistics
# Each entry: (header line, per-dataset row counts, merged frame for the total).
report_sections = (
    ("Train set sizes:", train_counts, train),
    ("Test set sizes:", test_counts, test),
)
for header, counts, merged in report_sections:
    print(header)
    for k, v in counts.items():
        print(f"  {k}: {v}")
    print(f"  Total: {len(merged)}")