import json
import os
import re
# Per-area entry counts (presumably for the generated test set -- TODO confirm):
#   Genetics 34
#   Medical  23
#   Ecology   1
#   BioEng   17
#   Total    75
def _problem_labels(file_path, folder_path):
    """Return the first two path components of *file_path* below *folder_path*.

    For ``<folder_path>/<big>/<small>/.../x.json`` this yields
    ``(big, small)``.  Components are taken relative to *folder_path* and
    split on the OS separator, so the result does not depend on how deep
    *folder_path* is or on whether the platform uses ``/`` or ``\\``.

    Raises:
        ValueError: if the file sits directly inside *folder_path*, so there
            is no second component to use.
    """
    parts = os.path.relpath(file_path, folder_path).split(os.sep)
    if len(parts) < 2:
        raise ValueError(
            f"Expected {file_path!r} to be at least one directory below "
            f"{folder_path!r}"
        )
    return parts[0], parts[1]


def collect_testset(folder_path, file_paths):
    """Collect one summary record per ``.json`` file found under *folder_path*.

    Each JSON file is expected to contain a ``"bigAreas"`` list (only its
    first element is used) and a ``"procedures"`` list of strings.

    Args:
        folder_path: Root directory that is walked recursively.
        file_paths: Mapping from full area name to the path of that area's
            protocol list file.  The short area label stored in each record
            is the part of that file's base name before the first ``_``.

    Returns:
        A list of dicts with the keys ``"procedures"`` (the procedure strings
        joined with single spaces), ``"bigAreas"`` (short area label),
        ``"bigProb"`` and ``"smallProb"`` (first and second path component of
        the file below *folder_path*).

    Raises:
        ValueError: if a file's first area is not a key of *file_paths*, or
            if a JSON file lies directly in *folder_path*.
    """
    records = []
    for dirpath, _dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            if not filename.endswith('.json'):
                continue
            file_path = os.path.join(dirpath, filename)
            big_prob, small_prob = _problem_labels(file_path, folder_path)
            print(big_prob, small_prob)
            with open(file_path, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
            area = data["bigAreas"][0]
            if area not in file_paths:
                print("Error!")
                # Raising a bare string is itself a TypeError in Python 3;
                # raise a real exception that names the offending input.
                raise ValueError(
                    f"Area does not match! ({area!r} in {file_path})"
                )
            records.append({
                "procedures": ' '.join(data["procedures"]),
                # e.g. ".../Genetics_testset.json" -> "Genetics"
                "bigAreas": os.path.basename(file_paths[area]).split("_")[0],
                "bigProb": big_prob,
                "smallProb": small_prob,
            })
    return records


if __name__ == "__main__":
    folder_path = "data/testset"
    file_paths = {
        "Molecular Biology & Genetics": "data/protocol_list/Genetics_testset.json",
        "Biomedical & Clinical Research": "data/protocol_list/Medical_testset.json",
        "Ecology & Environmental Biology": "data/protocol_list/Ecology_testset.json",
        "Bioengineering & Technology": "data/protocol_list/BioEng_testset.json"
    }

    tot_data = collect_testset(folder_path, file_paths)

    print(len(tot_data))

    # Explicit encoding so the output does not depend on the platform default.
    with open("data/protocol_list/testset.json", "w", encoding="utf-8") as f:
        json.dump(tot_data, f, indent=2)
