import json
import pandas as pd
import os

### ------------- GSM8k and SVAMP ------------- ###

file_path = 'data/gsm8k_test.jsonl'

# Parsed records, one entry per non-blank line of the JSONL file.
data_list = []
# svamp_seeds_ls = []

# Read the JSONL file one record per line. Blank lines (for example an
# extra trailing newline at the end of the file) are skipped, because
# json.loads raises JSONDecodeError on whitespace-only input.
with open(file_path, 'r', encoding='utf-8') as f:
	for line in f:
		if not line.strip():
			continue
		data_list.append(json.loads(line))

# svamp_data = []

# with open('data/SVAMP.json', 'r', encoding='utf-8') as f:
# 	svamp_data = json.load(f)

# for i in range(len(svamp_data)):
# 	cur_ex = svamp_data[i]
# 	ques = cur_ex['Body'] + ' ' + cur_ex['Question']
# 	ans = cur_ex['Equation'] + "\n#### " + str(cur_ex['Answer'])
# 	data_list.append({'question': ques, 'answer': ans})
# 	svamp_seeds_ls.append({'question': ques, 'answer': ans})

# Build a DataFrame with one row per loaded record.
df = pd.DataFrame(data_list)

# Draw 150 rows at random. A fixed random_state makes the draw
# reproducible, so re-running the script regenerates the same seed file
# instead of silently overwriting it with a different sample.
seed_df = df.sample(150, random_state=0)

# Write the sampled rows as tab-separated values without the index column.
seed_df.to_csv('data/gsm8k_seeds_150.tsv', sep='\t', index=False)

# svamp_seeds_df = pd.DataFrame(svamp_seeds_ls)
# svamp_seeds_df = svamp_seeds_df.sample(100)
# svamp_seeds_df.to_csv('data/svamp_seeds.tsv', sep='\t', index=False)


### ------------- GSM8k train ------------- ###

# file_path = 'data/gsm8k_train.jsonl'

# # Initialize an empty list to store the data
# data_list = []

# # Reading the JSONL file and storing the data
# with open(file_path, 'r', encoding='utf-8') as f:
# 	for line in f:
# 		data_list.append(json.loads(line))

# # convert data_list to a pandas DataFrame
# df = pd.DataFrame(data_list)

# # seed_df = df.sample(100)

# df.to_csv('data/train_seeds.tsv', sep='\t', index=False)


### ------------- MATH ------------- ###

# file_path = 'data/MATH/test'

# The path above contains multiple directories, each holding multiple JSON files. The loop below loads each JSON file as a separate dictionary.

# data = []

# directories = os.listdir('data/MATH/test')
# for directory in directories:
# 	files = os.listdir(f'data/MATH/test/{directory}')
# 	for file in files:
# 		with open(f'data/MATH/test/{directory}/{file}', 'r', encoding='utf-8') as f:
# 			cur_dict = json.load(f)
# 			if cur_dict['level'] != 'Level 1':
# 				continue
# 			type1 = cur_dict['type']
# 			sol = cur_dict['solution']
# 			ans = sol.split("\\boxed{")[1].split("}")[0].strip()
# 			try:
# 				ans = float(ans)
# 			except:
# 				continue
# 			prob_words = cur_dict['problem'].split()
# 			number_found = False
# 			for word in prob_words:
# 				if word.isnumeric():
# 					number_found = True
# 					break
# 			if not number_found:
# 				continue
# 			data.append(cur_dict)

# df = pd.DataFrame(data)

# df.to_csv('data/math_seeds.tsv', sep='\t', index=False)