import os
import json
import pickle


def get_cate(global_idx_dict):
    """Return one category label per selected sample.

    Each key of ``global_idx_dict`` is repeated once for every element of its
    value, following the dict's iteration order, so the result lines up with
    the flattened sequence of all values.

    Args:
        global_idx_dict: Mapping from a category key to a sized collection.

    Returns:
        A flat list of keys, each repeated ``len(value)`` times.
    """
    return [
        category
        for category, members in global_idx_dict.items()
        for _ in range(len(members))
    ]

def get_syn(global_idx_list):
    """Load the raw bytes of the rendered PNG for each selected sample.

    Files are read from ``visual/graphviz_test/<position>.png`` where
    ``position`` runs from 0 to ``len(global_idx_list) - 1``.

    NOTE(review): only the length of ``global_idx_list`` is used; the file
    name is the position in the list, not the index value stored there.
    Presumably the images were rendered in that order -- confirm.

    Args:
        global_idx_list: Sized collection of the selected samples.

    Returns:
        A list of ``bytes`` objects, one per position.
    """
    image_root = "visual/graphviz_test"

    def _read_png(position):
        # Binary mode: the image content is kept as-is, no decoding.
        with open(os.path.join(image_root, f"{position}.png"), 'rb') as handle:
            return handle.read()

    sample_count = len(global_idx_list)
    return [_read_png(position) for position in range(sample_count)]


def get_triple(global_idx_list):
    """Render the triples of the selected samples as text, one string each.

    Reads ``evaluate/raw/triple.json``, a list of records that each carry an
    ``"idx"`` and a ``"triple"`` list whose entries have ``"Source"``,
    ``"Relationship"`` and ``"Target"`` keys. For every index in
    ``global_idx_list`` the record's triples are written as
    ``"<Source> <Relationship> <Target>"`` lines. Each line is terminated by
    ``;`` and a newline, except the last one, which is terminated by ``.``
    and a newline. A record with no triples yields an empty string.

    Args:
        global_idx_list: Values matched against the records' ``"idx"`` field,
            in the order the output should follow.

    Returns:
        A list of strings aligned with ``global_idx_list``.

    Raises:
        KeyError: If an index is not present in the JSON file, or a record
            or triple lacks one of the keys read here.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    with open("evaluate/raw/triple.json", "r", encoding="utf-8") as f:
        all_chunk_data = json.load(f)

    # When an "idx" occurs more than once, the last record wins.
    record_by_idx = {item["idx"]: item for item in all_chunk_data}
    # Resolve every index first so a missing one fails before any formatting.
    selected_records = [record_by_idx[idx] for idx in global_idx_list]

    triple_list = []
    for record in selected_records:
        statements = [
            f'{triple["Source"]} {triple["Relationship"]} {triple["Target"]}'
            for triple in record["triple"]
        ]
        if statements:
            triple_text = ";\n".join(statements) + ".\n"
        else:
            triple_text = ""
        triple_list.append(triple_text)
    return triple_list


def get_QA(global_idx_list):
    """Return the ``"annotation"`` entry of each selected sample.

    Reads ``evaluate/raw/annotation.json``, a list of records that each carry
    an ``"idx"`` and an ``"annotation"`` field, and picks the annotation of
    every index in ``global_idx_list``.

    Args:
        global_idx_list: Values matched against the records' ``"idx"`` field,
            in the order the output should follow.

    Returns:
        A list of annotation values aligned with ``global_idx_list``.

    Raises:
        KeyError: If an index is not present in the JSON file, or a record
            lacks ``"idx"`` or ``"annotation"``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    with open("evaluate/raw/annotation.json", "r", encoding="utf-8") as f:
        all_chunk_data = json.load(f)

    # When an "idx" occurs more than once, the last record wins.
    record_by_idx = {item["idx"]: item for item in all_chunk_data}
    # Resolve every index first so a missing one fails before any field access.
    selected_records = [record_by_idx[idx] for idx in global_idx_list]
    return [record["annotation"] for record in selected_records]


def main():
    """Assemble the test-split records and pickle them to ``pkl/data_test.pkl``.

    Loads the category-to-indices selection from
    ``evaluate/selection/global_idx_test.json``, flattens it into one index
    list, and gathers for every selected sample: its category, the rendered
    image bytes, the triple text and the annotation. These are merged with
    the ``"img_url"`` and ``"real_bytes"`` fields of the entry at the same
    position in ``pkl/test_(url_real).pkl``. The resulting list of dicts is
    written with ``pickle.dump``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    with open("evaluate/selection/global_idx_test.json", "r", encoding="utf-8") as f:
        global_idx_dict = json.load(f)
    global_idx_list = [idx for cate_idx_list in global_idx_dict.values() for idx in cate_idx_list]

    # Each helper returns one entry per element of global_idx_list, so the
    # four lists below are aligned position by position.
    cate_data = get_cate(global_idx_dict)
    syn_data = get_syn(global_idx_list)
    triple_data = get_triple(global_idx_list)
    QA_data = get_QA(global_idx_list)

    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # this is only safe if the pickle is a trusted, project-generated artifact.
    with open("pkl/test_(url_real).pkl", 'rb') as file:
        url_real_data = pickle.load(file)

    data_test = []
    per_sample = zip(cate_data, syn_data, triple_data, QA_data)
    for position, (category, syn_bytes, triples, qas) in enumerate(per_sample):
        # The url/real-image container is indexed by position, like the
        # other per-sample lists.
        real_entry = url_real_data[position]
        data_test.append({
            "img_url": real_entry["img_url"],
            "category": category,
            "real_bytes": real_entry["real_bytes"],
            "syn_bytes": syn_bytes,
            "triples": triples,
            "QAs": qas,
        })

    with open("pkl/data_test.pkl", 'wb') as file:
        pickle.dump(data_test, file)


# Build the dataset only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

