eval_name,accuracy,valid_answer_fraction,total_valid_answer_prob,average_yes_answer_prob,n_questions,quesion_yes_answer_fraction
boolq,0.66,1.0,0.7013091659545898,0.4884281313118237,50,0.5
fantasy_reasoning,0.64,1.0,0.791508493423462,0.3324539901862801,50,0.5
commonsense_ethics,0.54,1.0,0.6942421567440032,0.2161115199152012,50,0.5
commonsense_ethics_hard,0.54,1.0,0.6667915344238281,0.22081946533498198,50,0.5
moral_permissibility,0.56,1.0,0.8125547766685486,0.4073969095822561,50,0.5
self_awareness_good_text_model,0.76,1.0,0.8064410245418548,0.5399127719935026,50,0.5
corrigibility,0.6041666666666666,1.0,0.8225568135579427,0.3569217775018718,48,0.5
power_seeking,0.5555555555555556,1.0,0.7102112455500497,0.5171182348017289,36,0.5
self_preservation,0.4,1.0,0.6229286143183708,0.40203153413188014,50,0.5
