template_id,split,question_idx,prediction,label
llm_goals_75,test,0,0.275,0.0759730864369293
llm_goals_75,test,1,0.712,0.8120127334654941
llm_goals_75,test,2,0.35,0.114709127325878
llm_goals_75,test,3,0.712,0.1496870071208148
llm_goals_75,test,4,0.552,0.321632167436288
llm_goals_78,test,0,0.685,0.710685793210421
llm_goals_78,test,1,0.721,0.4521352090792044
llm_goals_78,test,2,0.712,0.6984783435600235
llm_goals_78,test,3,0.562,0.2113029164438998
llm_goals_78,test,4,0.372,0.59362413330766
llm_goals_53,test,0,0.102,0.0313037944392233
llm_goals_53,test,1,0.197,0.8240056677766623
llm_goals_53,test,2,0.287,0.1449565288533016
llm_goals_53,test,3,0.32,0.3918474727299789
llm_goals_305,test,0,0.512,0.0438240495208372
llm_goals_305,test,1,0.612,0.1352509650243347
llm_goals_305,test,2,0.315,0.1723536643548477
llm_goals_305,test,3,0.68,0.250094549889143
llm_goals_305,test,4,0.785,0.8419536365764244
llm_goals_81,test,0,0.165,0.2060992725380174
llm_goals_81,test,1,0.52,0.1581010259271764
llm_goals_81,test,2,0.72,0.6899052301996583
llm_goals_81,test,3,0.652,0.7805136989347726
llm_goals_81,test,4,0.42,0.0972643506873081
llm_goals_133,test,0,0.623,0.4440529323069227
llm_goals_133,test,1,0.612,0.3955848959165448
llm_goals_133,test,2,0.512,0.5130030466159122
llm_goals_133,test,3,0.512,0.220245744335209
llm_goals_133,test,4,0.721,0.5879895213269322
llm_goals_157,test,0,0.105,0.6355320308190959
llm_goals_157,test,1,0.345,0.7424853921670768
llm_goals_157,test,2,0.085,0.073314241900373
llm_goals_157,test,3,0.22,0.4675850432855137
llm_goals_157,test,4,0.682,0.5458627288071527
llm_goals_186,test,0,0.045,0.0367510709036243
llm_goals_186,test,1,0.875,0.9589190483469818
llm_goals_186,test,2,0.045,0.0667798014228945
llm_goals_186,test,3,0.015,0.0340377111776925
llm_goals_186,test,4,0.42,0.4002710909538108
llm_goals_401,test,0,0.945,0.9617876405899952
llm_goals_401,test,1,0.675,0.9404490002093446
llm_goals_401,test,2,0.945,0.935046673136648
llm_goals_401,test,3,0.945,0.9543637645078082
llm_goals_401,test,4,0.89,0.9660480618061604
llm_goals_420,test,0,0.428,0.9711560599789952
llm_goals_420,test,1,0.145,0.5978053926454786
llm_goals_420,test,2,0.321,0.0817351976918742
llm_goals_420,test,3,0.315,0.4730792485055278
llm_goals_263,test,0,0.695,0.5209935935366439
llm_goals_263,test,1,0.732,0.7968068746445729
llm_goals_263,test,2,0.685,0.5467470232144288
llm_goals_263,test,3,0.725,0.4733215007319302
llm_goals_263,test,4,0.782,0.0057083956844275
llm_goals_427,test,0,0.512,0.2826418194423331
llm_goals_427,test,1,0.615,0.380977464557011
llm_goals_427,test,2,0.52,0.1487359361961987
llm_goals_427,test,3,0.57,0.3576580711889061
llm_goals_427,test,4,0.715,0.2270212997591984
llm_goals_449,test,0,0.007,0.0338418398422384
llm_goals_449,test,1,0.215,0.5657697432527007
llm_goals_449,test,2,0.008,0.0110135668657682
llm_goals_449,test,3,0.115,0.2352946829974157
llm_goals_449,test,4,0.415,0.4842577903424374
llm_goals_93,test,0,0.152,0.0729546648027557
llm_goals_93,test,1,0.732,0.6981648167380458
llm_goals_93,test,2,0.192,0.3318565449695651
llm_goals_93,test,3,0.52,0.5505450337981316
llm_goals_93,test,4,0.895,0.9547701486411684
llm_goals_358,test,0,0.857,0.9198604804287828
llm_goals_358,test,1,0.92,0.8623428941495346
llm_goals_358,test,2,0.87,0.8146633538972576
llm_goals_358,test,3,0.925,0.6635203414505186
llm_goals_358,test,4,0.945,0.9873765704706148
