import pandas as pd
import numpy as np
from tqdm import tqdm

# Load the raw records into a DataFrame.
data = pd.read_csv('unit-bio.csv')

# Collect every distinct student, exercise and concept identifier.
# The student id is the DataFrame index label, the exercise id comes from the
# 'exam_id' column and the concept id from the 'question_id' column.
# NOTE(review): confirm this column mapping matches the CSV layout.
concept_set = set()
student_set = set()
exercise_set = set()
# `total` lets tqdm render a real progress bar; iterrows() has no length.
for student_id, record in tqdm(data.iterrows(), total=len(data), desc='extracting'):
    student_set.add(student_id)
    concept_set.add(record['question_id'])
    exercise_set.add(record['exam_id'])
    # NOTE(review): the previous version ended the loop body with
    # `if score == 'n.a.': continue`, which had no effect because nothing
    # followed it. Rows whose 'score' is 'n.a.' are therefore still counted
    # here -- confirm whether they should be excluded from the id sets.


def get_sorted_list_from_set(tmp_set):
    """Return the elements of *tmp_set* as a new list in ascending order.

    Args:
        tmp_set: Collection of mutually comparable items.

    Returns:
        A freshly built sorted list; the input is not modified.
    """
    # sorted() already accepts any iterable and returns a new list, so the
    # intermediate list() copy is unnecessary.
    return sorted(tmp_set)


def get_transfer_dict_from_list(tmp_list):
    """Map each item of *tmp_list* to its position in the list.

    If an item occurs more than once, the position of its last occurrence
    is kept.
    """
    position_of = {}
    for position, item in enumerate(tmp_list):
        position_of[item] = position
    return position_of


# For each entity: a sorted list of raw ids, a raw-id -> contiguous-index
# lookup built from that list, and the number of distinct ids.
student_list = get_sorted_list_from_set(student_set)
student_transfer_dict = get_transfer_dict_from_list(student_list)
student_num = len(student_list)

exercise_list = get_sorted_list_from_set(exercise_set)
exercise_transfer_dict = get_transfer_dict_from_list(exercise_list)
exercise_num = len(exercise_list)

concept_list = get_sorted_list_from_set(concept_set)
concept_transfer_dict = get_transfer_dict_from_list(concept_list)
concept_num = len(concept_list)

# Student-by-exercise matrix of binarized responses; -1 marks cells for which
# no response was recorded.
response_matrix = np.full((student_num, exercise_num), -1.0)
# `total` lets tqdm render a real progress bar; iterrows() has no length.
for student_id, record in tqdm(data.iterrows(), total=len(data), desc='extracting'):
    exercise_id = record['exam_id']
    # NOTE(review): the obtained score is read from the 'concept' column and
    # the full score from the 'score' column -- confirm against the CSV header.
    score = record['concept']
    full_score = record['score']
    if full_score == 'n.a.':
        continue
    # A response counts as correct (1) only when the full score is reached.
    true_score = 0 if int(score) / int(full_score) < 1.0 else 1
    response_matrix[student_transfer_dict[student_id], exercise_transfer_dict[exercise_id]] = true_score

# Flatten the filled cells into (student index, exercise index, response) rows.
response_data = [
    [student_idx, exercise_idx, response_matrix[student_idx, exercise_idx]]
    for student_idx, exercise_idx in zip(*np.where(response_matrix != -1))
]

np.savetxt('response.csv', np.array(response_data), delimiter=',')

# Exercise-by-concept indicator matrix: a cell is 1 when at least one row with
# a usable score links that exercise to that concept.
q_matrix = np.zeros(shape=(exercise_num, concept_num))
# `total` lets tqdm render a real progress bar; iterrows() has no length.
for _, record in tqdm(data.iterrows(), total=len(data), desc='extracting'):
    # Rows without a usable score are skipped.
    if record['score'] == 'n.a.':
        continue
    exercise_id = record['exam_id']
    concept_id = record['question_id']
    q_matrix[exercise_transfer_dict[exercise_id], concept_transfer_dict[concept_id]] = 1
np.savetxt('q_matrix.csv', q_matrix, delimiter=',')
print(student_num, exercise_num, concept_num)