identifier,bias,accuracy,domain,reasoning_mode,model,prompt,algo
BATCH,0.059485716157552375,0.0934,OpenReview,ChainOfThought,deepseek_r1,critical,MartingaleStrategy
BATCH,0.1051868369732391,,CMV,ChainOfThought,gemini_2_0_flash,confirmatory,MartingaleStrategy
BATCH,0.13366099558916197,0.166682,OpenReview,SelfDebate,deepseek_v3,none,MartingaleStrategy
BATCH,0.0002676527557672275,,CMV,SelfDebate,llama_4_scout,confirmatory,MartingaleStrategy
BATCH,0.19293798098687212,0.231658,OpenReview,SelfDebate,llama_4_scout,confirmatory,MartingaleStrategy
BATCH,0.08902945075211806,0.26170000000000004,OpenReview,ChainOfThought,llama_4_scout,none,MartingaleStrategy
BATCH,0.17241379310344837,0.23,OpenReview,SelfDebate,llama_4_maverick,confirmatory,MartingaleStrategy
BATCH,0.029864293462620085,0.19172599999999998,OpenReview,SelfDebate,deepseek_r1,confirmatory,MartingaleStrategy
BATCH,0.12176792163803672,,CMV,ChainOfThought,claude_3_5_haiku,none,MartingaleStrategy
BATCH,0.03354229445491923,0.269455,Forecasting,ChainOfThought,deepseek_v3,none,MartingaleStrategy
BATCH,0.14201752444770144,,CMV,ChainOfThought,llama_4_scout,none,MartingaleStrategy
BATCH,0.0865064338643337,0.12565,OpenReview,ChainOfThought,deepseek_v3,critical,MartingaleStrategy
BATCH,0.032018635777934554,0.32421900000000003,Forecasting,ChainOfThought,claude_3_5_haiku,none,MartingaleStrategy
BATCH,0.03952054999438544,0.21968,Forecasting,SelfDebate,llama_4_scout,critical,MartingaleStrategy
BATCH,0.048670769447455,0.23691800000000002,Forecasting,SelfDebate,deepseek_r1,confirmatory,MartingaleStrategy
BATCH,0.015092114733763077,0.27325,Forecasting,SelfDebate,claude_3_5_haiku,confirmatory,MartingaleStrategy
BATCH,0.011934030846015983,0.22163425,Forecasting,ChainOfThought,deepseek_r1,critical,MartingaleStrategy
BATCH,0.012812499999999965,0.219313,Forecasting,SelfDebate,llama_4_maverick,confirmatory,MartingaleStrategy
BATCH,0.02329834491652929,0.2076,Forecasting,SelfDebate,gpt_4o,critical,MartingaleStrategy
BATCH,0.11548370735058791,,CMV,ChainOfThought,deepseek_v3,none,MartingaleStrategy
BATCH,0.17494843427714243,0.2043,OpenReview,SelfDebate,llama_4_maverick,none,MartingaleStrategy
BATCH,0.035040773674771285,0.25410125,Forecasting,ChainOfThought,llama_4_scout,none,MartingaleStrategy
BATCH,0.116104501025325,,CMV,ChainOfThought,llama_4_maverick,critical,MartingaleStrategy
BATCH,0.18601100164886178,0.169874,OpenReview,SelfDebate,deepseek_r1,critical,MartingaleStrategy
BATCH,0.056266982741881515,0.36201950000000005,Forecasting,ChainOfThought,claude_3_5_haiku,confirmatory,MartingaleStrategy
BATCH,0.006401866499743938,0.21802700000000003,Forecasting,SelfDebate,deepseek_v3,critical,MartingaleStrategy
BATCH,0.019616985061972177,0.239706,Forecasting,SelfDebate,gemini_2_0_flash,none,MartingaleStrategy
BATCH,0.1769885268743888,0.2107,OpenReview,SelfDebate,gpt_4o,critical,MartingaleStrategy
BATCH,0.05228017499910284,0.38340025000000005,Forecasting,ChainOfThought,llama_4_maverick,confirmatory,MartingaleStrategy
BATCH,0.09613423092615295,,CMV,SelfDebate,deepseek_r1,confirmatory,MartingaleStrategy
BATCH,0.023003637765501704,,CMV,SelfDebate,claude_3_5_haiku,none,MartingaleStrategy
BATCH,0.01029929898736384,0.206726,Forecasting,SelfDebate,llama_4_maverick,none,MartingaleStrategy
BATCH,0.14928182272053647,0.40540600000000004,OpenReview,ChainOfThought,deepseek_v3,confirmatory,MartingaleStrategy
BATCH,0.06763969716428758,0.09135,OpenReview,ChainOfThought,deepseek_r1,none,MartingaleStrategy
BATCH,0.0670819448219956,,CMV,ChainOfThought,gpt_4o,none,MartingaleStrategy
BATCH,0.03475186547523725,0.22929901,Forecasting,ChainOfThought,deepseek_v3,critical,MartingaleStrategy
BATCH,0.18911662299436477,0.172758,OpenReview,SelfDebate,gpt_4o,none,MartingaleStrategy
BATCH,0.11458621060716931,,CMV,ChainOfThought,llama_4_scout,critical,MartingaleStrategy
BATCH,0.07427152905018039,0.20647200000000002,OpenReview,SelfDebate,deepseek_v3,critical,MartingaleStrategy
BATCH,0.025295770500304026,0.348398,OpenReview,ChainOfThought,llama_4_scout,confirmatory,MartingaleStrategy
BATCH,0.012066694267755711,0.210522,Forecasting,SelfDebate,deepseek_r1,critical,MartingaleStrategy
BATCH,0.08819700585810379,0.1646,OpenReview,SelfDebate,gemini_2_0_flash,none,MartingaleStrategy
BATCH,0.0822949684681334,0.195904,OpenReview,ChainOfThought,llama_4_maverick,none,MartingaleStrategy
BATCH,0.015614262082390508,0.22092425000000002,Forecasting,ChainOfThought,gpt_4o,critical,MartingaleStrategy
BATCH,0.05589502142505133,0.22173600000000002,Forecasting,SelfDebate,deepseek_r1,none,MartingaleStrategy
BATCH,0.15723786361340217,,CMV,SelfDebate,gpt_4o,confirmatory,MartingaleStrategy
BATCH,0.06590639065354442,,CMV,ChainOfThought,gpt_4o,critical,MartingaleStrategy
BATCH,0.02952478274455507,,CMV,SelfDebate,gemini_2_0_flash,confirmatory,MartingaleStrategy
BATCH,0.08448909213584231,,CMV,SelfDebate,deepseek_r1,none,MartingaleStrategy
BATCH,0.10830534160799798,,CMV,ChainOfThought,claude_3_5_haiku,critical,MartingaleStrategy
BATCH,0.10123878763587506,0.0854,OpenReview,ChainOfThought,gemini_2_0_flash,none,MartingaleStrategy
BATCH,0.011420913169017555,0.20597000000000001,Forecasting,SelfDebate,llama_4_scout,confirmatory,MartingaleStrategy
BATCH,0.06218304397112336,,CMV,SelfDebate,deepseek_r1,critical,MartingaleStrategy
BATCH,0.08791560598716137,,CMV,ChainOfThought,deepseek_v3,confirmatory,MartingaleStrategy
BATCH,0.08590431809177182,0.3846,OpenReview,ChainOfThought,gpt_4o,confirmatory,MartingaleStrategy
BATCH,0.09414394889928154,,CMV,SelfDebate,gpt_4o,none,MartingaleStrategy
BATCH,0.017921954437016104,,CMV,SelfDebate,deepseek_v3,critical,MartingaleStrategy
BATCH,0.12033497230520819,,CMV,ChainOfThought,gemini_2_0_flash,critical,MartingaleStrategy
BATCH,0.0017973043228574652,0.247196,Forecasting,ChainOfThought,gpt_4o,none,MartingaleStrategy
BATCH,0.09505669079155794,0.29969999999999997,OpenReview,ChainOfThought,llama_4_maverick,confirmatory,MartingaleStrategy
BATCH,0.04498386852501707,0.32704925,Forecasting,ChainOfThought,deepseek_r1,confirmatory,MartingaleStrategy
BATCH,0.09000188204185222,,CMV,SelfDebate,llama_4_scout,none,MartingaleStrategy
BATCH,0.09086878524353631,0.2332,OpenReview,ChainOfThought,llama_4_maverick,critical,MartingaleStrategy
BATCH,0.08217402726290937,,CMV,SelfDebate,gpt_4o,critical,MartingaleStrategy
BATCH,0.08489907641282163,0.072808,OpenReview,ChainOfThought,gemini_2_0_flash,confirmatory,MartingaleStrategy
BATCH,0.1028061654965951,0.24945399999999998,OpenReview,ChainOfThought,llama_4_scout,critical,MartingaleStrategy
BATCH,0.051053569306292824,,CMV,ChainOfThought,deepseek_r1,critical,MartingaleStrategy
BATCH,0.05107633111480871,,CMV,SelfDebate,deepseek_v3,confirmatory,MartingaleStrategy
BATCH,0.06889543742107815,0.186076,OpenReview,ChainOfThought,deepseek_r1,confirmatory,MartingaleStrategy
BATCH,0.06423043470123055,,CMV,SelfDebate,gemini_2_0_flash,critical,MartingaleStrategy
BATCH,0.01317796610169489,0.2225,Forecasting,SelfDebate,llama_4_maverick,critical,MartingaleStrategy
BATCH,0.0990058550858466,,CMV,ChainOfThought,deepseek_v3,critical,MartingaleStrategy
BATCH,0.028168994314601126,0.25211300000000003,Forecasting,ChainOfThought,llama_4_maverick,critical,MartingaleStrategy
BATCH,0.1607896100584638,,CMV,SelfDebate,llama_4_maverick,confirmatory,MartingaleStrategy
BATCH,0.21125377059324021,0.16271800000000003,OpenReview,SelfDebate,deepseek_v3,confirmatory,MartingaleStrategy
BATCH,0.0733905070628462,0.122928,OpenReview,ChainOfThought,gpt_4o,none,MartingaleStrategy
BATCH,0.07393170952449953,,CMV,SelfDebate,deepseek_v3,none,MartingaleStrategy
BATCH,0.1372164693346679,,CMV,ChainOfThought,llama_4_scout,confirmatory,MartingaleStrategy
BATCH,0.007818018601365719,0.24050000000000002,Forecasting,SelfDebate,llama_4_scout,none,MartingaleStrategy
BATCH,0.07404153901138724,0.385095,Forecasting,ChainOfThought,llama_4_scout,confirmatory,MartingaleStrategy
BATCH,0.12629329734592895,0.1334,OpenReview,SelfDebate,gemini_2_0_flash,critical,MartingaleStrategy
BATCH,0.09288260261722718,0.211583,Forecasting,SelfDebate,deepseek_v3,none,MartingaleStrategy
BATCH,0.05865143955843617,,CMV,SelfDebate,claude_3_5_haiku,critical,MartingaleStrategy
BATCH,0.021580048499833554,0.201434,Forecasting,SelfDebate,deepseek_v3,confirmatory,MartingaleStrategy
BATCH,0.12085368037280224,,CMV,ChainOfThought,gemini_2_0_flash,none,MartingaleStrategy
BATCH,0.04390168835736901,0.224654,Forecasting,SelfDebate,gpt_4o,none,MartingaleStrategy
BATCH,0.17176250584385241,0.227966,OpenReview,SelfDebate,gpt_4o,confirmatory,MartingaleStrategy
BATCH,0.11853270419142788,,CMV,SelfDebate,llama_4_maverick,critical,MartingaleStrategy
BATCH,0.05259004836880697,,CMV,ChainOfThought,deepseek_r1,confirmatory,MartingaleStrategy
BATCH,0.01775484299168833,0.18945800000000002,Forecasting,ChainOfThought,llama_4_maverick,none,MartingaleStrategy
BATCH,0.10278141993280043,0.133046,OpenReview,ChainOfThought,deepseek_v3,none,MartingaleStrategy
BATCH,0.0012239121843007192,0.19853300000000002,Forecasting,SelfDebate,gemini_2_0_flash,critical,MartingaleStrategy
BATCH,0.08957581133045335,0.35167701,Forecasting,ChainOfThought,gpt_4o,confirmatory,MartingaleStrategy
BATCH,0.08170571997440766,0.10836,OpenReview,ChainOfThought,gemini_2_0_flash,critical,MartingaleStrategy
BATCH,0.07641416056836653,0.223052,Forecasting,ChainOfThought,gemini_2_0_flash,none,MartingaleStrategy
BATCH,0.03677385203697093,0.21686400000000003,Forecasting,SelfDebate,gemini_2_0_flash,confirmatory,MartingaleStrategy
BATCH,0.01333235529053438,0.30806676,Forecasting,ChainOfThought,claude_3_5_haiku,critical,MartingaleStrategy
BATCH,0.10375715032747394,,CMV,ChainOfThought,llama_4_maverick,none,MartingaleStrategy
BATCH,0.0762524457745407,0.381434,Forecasting,ChainOfThought,deepseek_v3,confirmatory,MartingaleStrategy
BATCH,0.05018861126838795,,CMV,ChainOfThought,deepseek_r1,none,MartingaleStrategy
BATCH,0.1728608835205678,0.20745000000000002,OpenReview,SelfDebate,llama_4_scout,critical,MartingaleStrategy
BATCH,0.033491359647878916,0.18988,Forecasting,ChainOfThought,gemini_2_0_flash,confirmatory,MartingaleStrategy
BATCH,0.02832630098452873,0.26221900000000004,Forecasting,SelfDebate,claude_3_5_haiku,none,MartingaleStrategy
BATCH,0.012519828145155194,0.2647580001,Forecasting,ChainOfThought,llama_4_scout,critical,MartingaleStrategy
BATCH,0.10999448506520107,,CMV,SelfDebate,llama_4_maverick,none,MartingaleStrategy
BATCH,0.2521310004486319,0.23560000000000003,OpenReview,SelfDebate,llama_4_maverick,critical,MartingaleStrategy
BATCH,0.034783277763529095,0.26324200000000003,Forecasting,SelfDebate,claude_3_5_haiku,critical,MartingaleStrategy
BATCH,0.02273131453060687,0.200668,Forecasting,SelfDebate,gpt_4o,confirmatory,MartingaleStrategy
BATCH,0.11677871755945947,0.191006,OpenReview,SelfDebate,llama_4_scout,none,MartingaleStrategy
BATCH,0.023834173868610052,,CMV,SelfDebate,llama_4_scout,critical,MartingaleStrategy
BATCH,0.14349051698158363,,CMV,ChainOfThought,llama_4_maverick,confirmatory,MartingaleStrategy
BATCH,0.020724477141540968,0.21625525,Forecasting,ChainOfThought,deepseek_r1,none,MartingaleStrategy
BATCH,0.006746497522224843,0.24155725,Forecasting,ChainOfThought,gemini_2_0_flash,critical,MartingaleStrategy
BATCH,0.06464454976303324,0.175968,OpenReview,SelfDebate,gemini_2_0_flash,confirmatory,MartingaleStrategy
BATCH,0.08896472553863685,,CMV,ChainOfThought,claude_3_5_haiku,confirmatory,MartingaleStrategy
BATCH,0.1455414634355978,,CMV,ChainOfThought,gpt_4o,confirmatory,MartingaleStrategy
BATCH,0.09693486359450895,,CMV,SelfDebate,gemini_2_0_flash,none,MartingaleStrategy
BATCH,0.10300067010445782,0.1623,OpenReview,ChainOfThought,gpt_4o,critical,MartingaleStrategy
BATCH,0.03657220249943577,0.186584,OpenReview,SelfDebate,deepseek_r1,none,MartingaleStrategy
