# %% [markdown]
# # Evaluation Notebook
# Notebook for evaluating and visualizing inference results for Kaleidoscope.

# %%
import pandas as pd
import numpy as np
from eval_utils import compute_accuracy, get_summary
import sys

# %%
# The results location is taken from the first command-line argument.
result_path = sys.argv[1]

# compute_accuracy() output is handed straight to pandas' JSON reader.
# NOTE(review): presumably a JSON string or a path to a JSON file -- confirm
# against eval_utils.
accuracy_json = compute_accuracy(result_path)
full_acc = pd.read_json(accuracy_json)

# %% [markdown]
# You can group the results by any column of the dataset (`language` is used here as an example).

# %%
# Column used to group the results; swap in another column name to regroup.
group_column = "language"
summary = get_summary(full_acc, group_column)
# Bare expression so the notebook renders the summary as the cell output.
summary

# %%
# Headline metrics derived from the grouped `summary`.
# Both accuracies are plain means over the rows of `summary`, so every group
# contributes equally regardless of how many questions it holds.
accuracy = np.mean(summary["accuracy"])
valid_acc = np.mean(summary["valid_acc"])

# The failure rate, by contrast, is pooled over all rows: one minus the share
# of questions counted in `valid_count`.
valid_total = sum(summary["valid_count"])
question_total = sum(summary["total_questions"])
rate = 1 - valid_total / question_total

# Ratios are reported as percentages; counts are reported as-is.
print(f"Accuracy: {accuracy*100}")
print(f"Valid Accuracy: {valid_acc*100}")
print(f"Failure Rate: {rate*100}")
print(f"Total questions:{np.sum(summary['total_questions'])}")
print(f"Missing questions:{np.sum(summary['none_count'])}")


