import os
import re
import json
import pickle
import pandas as pd
from collections import defaultdict
from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment
from openpyxl.worksheet.datavalidation import DataValidation


def parse_triple(text):
    """Split a block of triple text into punctuation-free lines.

    The text is trimmed and split on newlines. Lines that contain only
    whitespace are skipped. From each remaining line, every character that
    is neither a word character nor whitespace is removed, and the result
    is stripped of surrounding whitespace.

    Args:
        text: Multi-line string, one triple per line.

    Returns:
        A list of cleaned lines in their original order. A line made up
        solely of punctuation is kept as an empty string.
    """
    cleaned_lines = []
    for raw_line in text.strip().split('\n'):
        if not raw_line.strip():
            # Blank / whitespace-only line: nothing to keep.
            continue
        without_punctuation = re.sub(r'[^\w\s]', '', raw_line)
        cleaned_lines.append(without_punctuation.strip())
    return cleaned_lines


def assign_segments(num_classes=10, items_per_class=150, num_chunks=6, sampled_per_class=30):
    """Assign evenly sampled item indices to raters so each item is rated twice.

    Items are assumed to be laid out class by class, ``items_per_class``
    consecutive indices per class. From every class, ``sampled_per_class``
    indices are taken at a fixed stride, split into ``num_chunks``
    consecutive chunks, and each chunk is handed to one pair of raters
    (pairs of 'A'..'D', cycled if there are more chunks than pairs).

    When ``sampled_per_class`` is not a multiple of ``num_chunks`` the
    leftover samples are spread one each over the leading chunks, so every
    sampled index is still assigned to exactly two raters.

    Args:
        num_classes: Number of classes to sample from.
        items_per_class: Number of consecutive indices belonging to a class.
        num_chunks: Number of chunks each class's sample is divided into.
        sampled_per_class: Number of indices sampled from each class.

    Returns:
        A ``defaultdict(list)`` mapping rater label to the list of assigned
        indices, in class order.

    Raises:
        ValueError: If ``items_per_class``, ``num_chunks`` or
            ``sampled_per_class`` is not positive, or if more samples are
            requested than a class contains.
    """
    if items_per_class <= 0 or num_chunks <= 0 or sampled_per_class <= 0:
        raise ValueError(
            "items_per_class, num_chunks and sampled_per_class must be positive"
        )
    if sampled_per_class > items_per_class:
        # A stride of 0 would sample the same index over and over.
        raise ValueError("sampled_per_class cannot exceed items_per_class")

    # All unordered pairs of the four raters.
    combinations = [
        ('A', 'B'),
        ('A', 'C'),
        ('A', 'D'),
        ('B', 'C'),
        ('B', 'D'),
        ('C', 'D'),
    ]

    assignment = defaultdict(list)
    step = items_per_class // sampled_per_class
    # The first `leftover` chunks carry one extra sample each.
    base_size, leftover = divmod(sampled_per_class, num_chunks)

    for class_idx in range(num_classes):
        class_start = class_idx * items_per_class
        sampled_indices = [class_start + i * step for i in range(sampled_per_class)]

        chunk_start = 0
        for chunk_idx in range(num_chunks):
            chunk_end = chunk_start + base_size + (1 if chunk_idx < leftover else 0)
            indices = sampled_indices[chunk_start:chunk_end]
            chunk_start = chunk_end

            first_rater, second_rater = combinations[chunk_idx % len(combinations)]
            assignment[first_rater].extend(indices)
            assignment[second_rater].extend(indices)

    return assignment


def generate_data_json(idx_list, filename, data=None):
    """Write the records selected by ``idx_list`` to a JSON file.

    For each index the output holds the index itself, the record's
    ``img_url``, ``category`` and ``QAs`` fields unchanged, and its
    ``triples`` field passed through ``parse_triple``.

    Args:
        idx_list: Indices (or keys) of the records to export, in output order.
        filename: Path of the JSON file to create or overwrite.
        data: Indexable collection of records. When omitted, the
            module-level ``data_test`` is used, which is only defined when
            this file is run as a script.
    """
    if data is None:
        # Backward-compatible fallback for callers that rely on the global
        # loaded by the script entry point.
        data = data_test

    data_segment = [
        {
            "idx": idx,
            "img_url": data[idx]["img_url"],
            "category": data[idx]["category"],
            "triples": parse_triple(data[idx]["triples"]),
            "QAs": data[idx]["QAs"],
        }
        for idx in idx_list
    ]

    # ensure_ascii=False keeps non-ASCII text readable, hence explicit UTF-8.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data_segment, f, indent=4, ensure_ascii=False)


def _style_answer_cells(ws, header, centered_columns):
    """Set Times New Roman on every cell and centre the named columns."""
    times_new_roman = Font(name='Times New Roman')
    center_align = Alignment(horizontal='center')
    centered_indices = {i for i, name in enumerate(header) if name in centered_columns}
    for row in ws.iter_rows():
        for idx, cell in enumerate(row):
            cell.font = times_new_roman
            if idx in centered_indices:
                cell.alignment = center_align


def _autosize_columns(ws):
    """Widen each column to its longest cell text plus a small margin."""
    for col in ws.columns:
        max_length = 0
        for cell in col:
            if cell.value:
                max_length = max(max_length, len(str(cell.value)))
        ws.column_dimensions[col[0].column_letter].width = max_length + 4


def _attach_dropdowns(wb, ws, header, dropdown_options, last_row):
    """Store option lists on a hidden sheet and bind them as list validation."""
    if "DropdownOptions" in wb.sheetnames:
        opt_ws = wb["DropdownOptions"]
    else:
        opt_ws = wb.create_sheet("DropdownOptions")
        wb.move_sheet(opt_ws, offset=1)
        opt_ws.sheet_state = 'hidden'

    for col_idx, col_name in enumerate(header, start=1):
        options = dropdown_options.get(col_name)
        if options is None:
            continue

        # Options go in the same column position as the answer column.
        for row_idx, value in enumerate(options, start=1):
            opt_ws.cell(row=row_idx, column=col_idx, value=value)

        if last_row < 2:
            # With no data rows the target range would end above its start,
            # so there is nothing to attach validation to.
            continue

        # Use the worksheet's own column letter so the reference stays
        # correct beyond column Z (chr(65 + n) only covers A-Z).
        col_letter = ws.cell(row=1, column=col_idx).column_letter
        option_range = f"'DropdownOptions'!${col_letter}$1:${col_letter}${len(options)}"
        dv = DataValidation(type="list", formula1=option_range, allow_blank=True)
        dv.add(f"{col_letter}2:{col_letter}{last_row}")
        ws.add_data_validation(dv)


def generate_answer_sheet(idx_list, filename):
    """Create an Excel answer sheet with one blank rating row per index.

    The sheet has an ``Idx`` column filled from ``idx_list`` followed by
    four rating columns and a notes column, all initially empty. After
    pandas writes the file it is reopened with openpyxl to set the font,
    centre the rating columns, size the columns to their content, and add
    drop-down list validation to the rating columns. The drop-down options
    are stored on a hidden ``DropdownOptions`` sheet.

    Args:
        idx_list: Sequence of item indices, one row per entry.
        filename: Path of the ``.xlsx`` file to create or overwrite.
    """
    choice_columns = [
        "Knowledge Triples Quality",
        "QA Pair Correctness",
        "Negative Verification",
        "Question Difficulty Level",
    ]
    dropdown_options = {
        "Knowledge Triples Quality": ["Correct", "Partially Correct", "Unusable"],
        "QA Pair Correctness": ["Correct", "Partially Correct", "Unusable"],
        "Negative Verification": ["All require image", "Some require image", "All guessable without image"],
        "Question Difficulty Level": ["High School Level", "College-Level Expert"]
    }

    df = pd.DataFrame({
        "Idx": idx_list,
        "Knowledge Triples Quality": ["" for _ in idx_list],
        "QA Pair Correctness": ["" for _ in idx_list],
        "Negative Verification": ["" for _ in idx_list],
        "Question Difficulty Level": ["" for _ in idx_list],
        "Notes (if any)": ["" for _ in idx_list],
    })
    df.to_excel(filename, index=False)

    # Reopen the freshly written workbook to apply formatting and validation.
    wb = load_workbook(filename)
    ws = wb.active
    header = [cell.value for cell in ws[1]]

    _style_answer_cells(ws, header, choice_columns)
    _autosize_columns(ws)
    # Row 1 is the header, so data occupies rows 2 .. len(idx_list) + 1.
    _attach_dropdowns(wb, ws, header, dropdown_options, len(idx_list) + 1)

    wb.save(filename)


if __name__ == "__main__":
    # NOTE: unpickling can execute arbitrary code; only load a trusted
    # pkl/data_test.pkl here.
    # data_test is kept as a module-level name because generate_data_json
    # reads it as a global.
    with open("pkl/data_test.pkl", 'rb') as pickle_fh:
        data_test = pickle.load(pickle_fh)

    os.makedirs("human_eval", exist_ok=True)

    # Record which indices each rater received before producing their files.
    assignment = assign_segments()
    with open("human_eval/assignment.json", 'w') as assignment_fh:
        json.dump(assignment, assignment_fh)

    # One data segment and one answer sheet per rater.
    for person in assignment:
        idx_list = assignment[person]
        generate_data_json(idx_list, f"human_eval/segment_{person}.json")
        generate_answer_sheet(idx_list, f"human_eval/sheet_{person}.xlsx")

