template_id,split,question_idx,prediction,label
llm_goals_146,test,0,0.85,0.3466493369029843
llm_goals_146,test,1,0.892,0.971693336326244
llm_goals_146,test,2,0.812,0.6455769508045827
llm_goals_146,test,3,0.875,0.5822222843930172
llm_goals_146,test,4,0.892,0.6842324614600077
llm_goals_293,test,0,0.89,0.8297409974007197
llm_goals_293,test,1,0.87,0.9166843306899098
llm_goals_293,test,2,0.872,0.4850136353820626
llm_goals_293,test,3,0.915,0.391440800258883
llm_goals_293,test,4,0.994,0.9814967512328644
llm_goals_415,test,0,0.93,0.945942102532565
llm_goals_415,test,1,0.95,0.8927012108946369
llm_goals_415,test,2,0.945,0.9655095349643328
llm_goals_415,test,3,0.789,0.5773151635184978
llm_goals_415,test,4,0.965,0.9305359112645492
llm_goals_326,test,0,0.86,0.7080152699628419
llm_goals_326,test,1,0.567,0.1915128822536962
llm_goals_326,test,2,0.86,0.7444471737433097
llm_goals_326,test,3,0.565,0.4392720971297062
llm_goals_326,test,4,0.88,0.891602221308631
llm_goals_401,test,0,0.985,0.995337188206034
llm_goals_401,test,1,0.895,0.9877679942135538
llm_goals_401,test,2,0.987,0.9922930596197688
llm_goals_401,test,3,0.93,0.944885851250094
llm_goals_401,test,4,0.998,0.9994499087491504
llm_goals_397,test,0,0.815,0.699619236076976
llm_goals_397,test,1,0.412,0.7858562431683396
llm_goals_397,test,2,0.78,0.8656621011318779
llm_goals_397,test,3,0.5,0.5216834406120067
llm_goals_397,test,4,0.28,0.2170933847941593
llm_goals_228,test,0,0.78,0.8649997140208154
llm_goals_228,test,1,0.45,0.873321951241939
llm_goals_228,test,2,0.78,0.8957700764194163
llm_goals_228,test,3,0.035,0.1236903018304219
llm_goals_228,test,4,0.759,0.7453361748877467
llm_goals_236,test,0,0.95,0.9502374518284544
llm_goals_236,test,1,0.875,0.632901790189933
llm_goals_236,test,2,0.915,0.9547987563797488
llm_goals_236,test,3,0.957,0.8955373773460537
llm_goals_236,test,4,0.925,0.604483505999029
llm_goals_206,test,0,0.85,0.7014024312312447
llm_goals_206,test,1,0.18,0.09993519180039
llm_goals_206,test,2,0.35,0.3805957976908932
llm_goals_206,test,3,0.158,0.2506742005767606
llm_goals_206,test,4,0.148,0.168981522681462
llm_goals_438,test,0,0.37,0.2314562881184323
llm_goals_438,test,1,0.895,0.7762565054203492
llm_goals_438,test,2,0.365,0.1997924391817541
llm_goals_438,test,3,0.855,0.7992019611592787
llm_goals_438,test,4,0.825,0.8049000497082406
llm_goals_182,test,0,0.979,0.9879327417690024
llm_goals_182,test,1,0.015,0.143590152166184
llm_goals_182,test,2,0.712,0.975250065576832
llm_goals_182,test,3,0.562,0.0053962595772566
llm_goals_182,test,4,0.975,0.9395814535027472
llm_goals_281,test,0,0.21,0.6469683041227705
llm_goals_281,test,1,0.612,0.9610940798838222
llm_goals_281,test,2,0.315,0.5031480224374153
llm_goals_281,test,3,0.872,0.7670721923110818
llm_goals_281,test,4,0.912,0.8573901670428902
llm_goals_115,test,0,0.892,0.6878400979912542
llm_goals_115,test,1,0.892,0.9839099052127078
llm_goals_115,test,2,0.845,0.6264771219588837
llm_goals_115,test,3,0.315,0.5101504336920828
llm_goals_115,test,4,0.725,0.4749856076325871
llm_goals_230,test,0,0.935,0.93636017960289
llm_goals_230,test,1,0.99,0.9080467805464864
llm_goals_230,test,2,0.945,0.9492415176604246
llm_goals_230,test,3,0.988,0.9883888360800804
llm_goals_230,test,4,0.932,0.9308136137248874
llm_goals_86,test,0,0.865,0.8172564483303766
llm_goals_86,test,1,0.845,0.2845365633956491
llm_goals_86,test,2,0.982,0.9608234754131476
llm_goals_86,test,3,0.895,0.4905999470752266
llm_goals_86,test,4,0.952,0.5223214595126631
