template_id,split,question_idx,prediction,label
llm_goals_146,test,0,0.5,0.3466493369029843
llm_goals_146,test,1,0.5,0.971693336326244
llm_goals_146,test,2,0.5,0.6455769508045827
llm_goals_146,test,3,0.35,0.5822222843930172
llm_goals_146,test,4,0.5,0.6842324614600077
llm_goals_146,test,5,0.3,0.3860201177799698
llm_goals_146,test,6,0.35,0.2341590231269344
llm_goals_146,test,7,0.5,0.3643683725782807
llm_goals_146,test,8,0.5,0.7055765903024079
llm_goals_146,test,9,0.3,0.5084888304786936
llm_goals_293,test,0,0.9,0.8297409974007197
llm_goals_293,test,1,0.75,0.9166843306899098
llm_goals_293,test,2,0.5,0.4850136353820626
llm_goals_293,test,3,1.0,0.391440800258883
llm_goals_293,test,4,1.0,0.9814967512328644
llm_goals_293,test,5,0.85,0.9737153059180352
llm_goals_293,test,6,0.85,0.985194742988862
llm_goals_293,test,7,0.85,0.8402554946174373
llm_goals_293,test,8,0.9,0.9863079191276464
llm_goals_293,test,9,0.5,0.9549403173664608
llm_goals_415,test,0,0.7,0.945942102532565
llm_goals_415,test,1,1.0,0.8927012108946369
llm_goals_415,test,2,0.85,0.9655095349643328
llm_goals_415,test,3,0.5,0.5773151635184978
llm_goals_415,test,4,1.0,0.9305359112645492
llm_goals_415,test,5,0.5,0.2698413337656404
llm_goals_415,test,6,0.5,0.1924170964690193
llm_goals_415,test,7,0.85,0.964020489204398
llm_goals_415,test,8,1.0,0.9764932994994092
llm_goals_415,test,9,0.5,0.3491418165522914
llm_goals_326,test,0,1.0,0.7080152699628419
llm_goals_326,test,1,0.5,0.1915128822536962
llm_goals_326,test,2,1.0,0.7444471737433097
llm_goals_326,test,3,0.5,0.4392720971297062
llm_goals_326,test,4,1.0,0.891602221308631
llm_goals_326,test,5,0.0,0.0516888735952717
llm_goals_326,test,6,0.0,0.0431335229249402
llm_goals_326,test,7,1.0,0.7478485077290631
llm_goals_326,test,8,1.0,0.8572844895817008
llm_goals_326,test,9,0.0,0.0757120509371778
llm_goals_401,test,0,1.0,0.995337188206034
llm_goals_401,test,1,1.0,0.9877679942135538
llm_goals_401,test,2,1.0,0.9922930596197688
llm_goals_401,test,3,1.0,0.944885851250094
llm_goals_401,test,4,1.0,0.9994499087491504
llm_goals_401,test,5,1.0,0.9963147043479246
llm_goals_401,test,6,1.0,0.9937722087866844
llm_goals_401,test,7,1.0,0.9823669190452718
llm_goals_401,test,8,1.0,0.999488890156012
llm_goals_401,test,9,1.0,0.9958814380530916
llm_goals_397,test,0,0.5,0.699619236076976
llm_goals_397,test,1,0.5,0.7858562431683396
llm_goals_397,test,2,1.0,0.8656621011318779
llm_goals_397,test,3,0.5,0.5216834406120067
llm_goals_397,test,4,0.2,0.2170933847941593
llm_goals_397,test,5,1.0,0.6203395193658312
llm_goals_397,test,6,0.5,0.7062451920643317
llm_goals_397,test,7,0.5,0.7818592804776917
llm_goals_397,test,8,0.2,0.3722692260862962
llm_goals_397,test,9,1.0,0.7525421946277221
llm_goals_228,test,0,0.85,0.8649997140208154
llm_goals_228,test,1,0.7,0.873321951241939
llm_goals_228,test,2,0.85,0.8957700764194163
llm_goals_228,test,3,0.2,0.1236903018304219
llm_goals_228,test,4,0.5,0.7453361748877467
llm_goals_228,test,5,0.4,0.1192843072628729
llm_goals_228,test,6,0.35,0.1326971908799849
llm_goals_228,test,7,0.85,0.871489699671336
llm_goals_228,test,8,0.7,0.8243934448117555
llm_goals_228,test,9,0.35,0.0467115483491378
llm_goals_236,test,0,0.65,0.9502374518284544
llm_goals_236,test,1,0.6,0.632901790189933
llm_goals_236,test,2,0.85,0.9547987563797488
llm_goals_236,test,3,1.0,0.8955373773460537
llm_goals_236,test,4,0.7,0.604483505999029
llm_goals_236,test,5,1.0,0.9645155057204132
llm_goals_236,test,6,0.9,0.8400796085543216
llm_goals_236,test,7,0.85,0.9809643632475058
llm_goals_236,test,8,0.9,0.9551138894715042
llm_goals_236,test,9,1.0,0.9323794813876984
llm_goals_206,test,0,0.5,0.7014024312312447
llm_goals_206,test,1,0.5,0.09993519180039
llm_goals_206,test,2,0.5,0.3805957976908932
llm_goals_206,test,3,0.5,0.2506742005767606
llm_goals_206,test,4,0.5,0.168981522681462
llm_goals_206,test,5,0.5,0.1396780009361202
llm_goals_206,test,6,0.5,0.1710164544129008
llm_goals_206,test,7,0.5,0.6071092597712212
llm_goals_206,test,8,0.5,0.1978085524873241
llm_goals_206,test,9,0.5,0.1702414804858378
llm_goals_438,test,0,1.0,0.2314562881184323
llm_goals_438,test,1,1.0,0.7762565054203492
llm_goals_438,test,2,0.85,0.1997924391817541
llm_goals_438,test,3,1.0,0.7992019611592787
llm_goals_438,test,4,1.0,0.8049000497082406
llm_goals_438,test,5,1.0,0.7174028107354763
llm_goals_438,test,6,0.9,0.5982684312746689
llm_goals_438,test,7,1.0,0.3291190451199206
llm_goals_438,test,8,1.0,0.7413057021464516
llm_goals_438,test,9,0.9,0.5655428700196011
llm_goals_182,test,0,0.0,0.9879327417690024
llm_goals_182,test,1,0.0,0.143590152166184
llm_goals_182,test,2,0.0,0.975250065576832
llm_goals_182,test,3,0.2,0.0053962595772566
llm_goals_182,test,4,1.0,0.9395814535027472
llm_goals_182,test,5,0.4,0.7582924861360835
llm_goals_182,test,6,0.5,0.5490946196305377
llm_goals_182,test,7,0.0,0.9749513260519144
llm_goals_182,test,8,1.0,0.9392208432034008
llm_goals_182,test,9,0.5,0.644100014754752
llm_goals_281,test,0,0.5,0.6469683041227705
llm_goals_281,test,1,1.0,0.9610940798838222
llm_goals_281,test,2,0.9,0.5031480224374153
llm_goals_281,test,3,0.9,0.7670721923110818
llm_goals_281,test,4,0.5,0.8573901670428902
llm_goals_281,test,5,1.0,0.9842762939932492
llm_goals_281,test,6,1.0,0.9899150137032138
llm_goals_281,test,7,0.8,0.6044210331647685
llm_goals_281,test,8,0.5,0.8588374254971683
llm_goals_281,test,9,1.0,0.9818143245858572
llm_goals_115,test,0,1.0,0.6878400979912542
llm_goals_115,test,1,1.0,0.9839099052127078
llm_goals_115,test,2,1.0,0.6264771219588837
llm_goals_115,test,3,0.85,0.5101504336920828
llm_goals_115,test,4,0.85,0.4749856076325871
llm_goals_115,test,5,0.9,0.8824023049105343
llm_goals_115,test,6,0.9,0.8532372770314965
llm_goals_115,test,7,0.9,0.6852873624151903
llm_goals_115,test,8,0.85,0.7160879332925761
llm_goals_115,test,9,0.85,0.8154454841155164
llm_goals_230,test,0,0.9,0.93636017960289
llm_goals_230,test,1,0.9,0.9080467805464864
llm_goals_230,test,2,0.85,0.9492415176604246
llm_goals_230,test,3,0.9,0.9883888360800804
llm_goals_230,test,4,0.6,0.9308136137248874
llm_goals_230,test,5,0.3,0.7461662385047432
llm_goals_230,test,6,0.35,0.717943379234597
llm_goals_230,test,7,0.85,0.9498506183856972
llm_goals_230,test,8,0.65,0.903114856201669
llm_goals_230,test,9,0.3,0.82763987275553
llm_goals_86,test,0,0.5,0.8172564483303766
llm_goals_86,test,1,0.5,0.2845365633956491
llm_goals_86,test,2,0.5,0.9608234754131476
llm_goals_86,test,3,0.5,0.4905999470752266
llm_goals_86,test,4,0.5,0.5223214595126631
llm_goals_86,test,5,0.5,0.9513769165530092
llm_goals_86,test,6,0.5,0.951058446008376
llm_goals_86,test,7,0.5,0.7200026044712197
llm_goals_86,test,8,0.5,0.8539416814851118
llm_goals_86,test,9,0.5,0.8409760614216922
