{"subject": "TruthfulQA", "method": "standard", "iterations": 30, "random_seed": 42, "calibration_size": 300, "total_examples": 684, "small_model": {"avg_accuracy": 0.5573099415204678, "std_accuracy": 0.011981170320788234, "avg_cost": 3.203571150097469e-05, "std_cost": 4.349684142149015e-08, "x_position": 1.0, "x_std": 0.0}, "large_model": {"avg_accuracy": 0.8335282651072122, "std_accuracy": 0.00984887844289096, "avg_cost": 0.0006150619883040945, "std_cost": 1.3416954616486923e-06, "x_position": 0.0, "x_std": 0.0}, "hybrid_models": {"0.05": {"avg_accuracy": 0.8011284722222223, "std_accuracy": 0.021500633929159164, "avg_large_model_usage": 0.7828125, "std_large_model_usage": 0.0764567249301004, "avg_lambda": 0.8610000000000007, "std_lambda": 0.05912416313713307, "avg_cost": 0.0005132336458333332, "std_cost": 4.776312330838144e-05}, "0.1": {"avg_accuracy": 0.7566840277777778, "std_accuracy": 0.030663501914058804, "avg_large_model_usage": 0.6028645833333334, "std_large_model_usage": 0.06713415060007821, "avg_lambda": 0.7003333333333298, "std_lambda": 0.04819981558286365, "avg_cost": 0.00039438156249999983, "std_cost": 4.111896367474812e-05}, "0.15": {"avg_accuracy": 0.7103298611111111, "std_accuracy": 0.033845373680030405, "avg_large_model_usage": 0.45642361111111107, "std_large_model_usage": 0.0651768113226023, "avg_lambda": 0.5416666666666676, "std_lambda": 0.06266755318521885, "avg_cost": 0.00030400204861111095, "std_cost": 3.9622113328864916e-05}, "0.2": {"avg_accuracy": 0.6673611111111111, "std_accuracy": 0.032705771455462, "avg_large_model_usage": 0.35026041666666674, "std_large_model_usage": 0.042588277650418865, "avg_lambda": 0.40533333333333477, "std_lambda": 0.047450559064731244, "avg_cost": 0.00023566281249999993, "std_cost": 2.7345435784046303e-05}, "0.25": {"avg_accuracy": 0.6191840277777776, "std_accuracy": 0.03823003786599406, "avg_large_model_usage": 0.27326388888888886, "std_large_model_usage": 0.05137510372537665, "avg_lambda": 0.27966666666666734, "std_lambda": 0.07867584692196769, "avg_cost": 0.00018407548611111104, "std_cost": 3.0326159127935378e-05}}, "all_trials": [{"trial": 1, "trial_seed": 42, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5321637426900585, "large_model_accuracy": 0.8245614035087719, "small_model_cost": 3.2059795321637445e-05, "large_model_cost": 0.0006154649122807024, "hybrid_results": {"0.05": {"accuracy": 0.8125, "large_model_usage": 0.8203125, "small_model_usage": 0.1796875, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005386778645833334, "unrestricted": {"accuracy": 0.8125, "large_model_usage": 0.8203125, "avg_cost": 0.0005386778645833334}, "random_baseline": {"accuracy": 0.7966145833333333, "accuracy_std": 0.009493607361228222, "avg_cost": 0.0005424924739583333, "cost_std": 9.267536493510888e-06, "small_model_fraction": 0.13161874221743722}, "unrestricted_random_baseline": {"accuracy": 0.7942708333333334, "accuracy_std": 0.0078125, "avg_cost": 0.0005368509114583335, "cost_std": 9.178873088319103e-06, "small_model_fraction": 0.13161874221743722}}, "0.1": {"accuracy": 0.7864583333333334, "large_model_usage": 0.671875, "small_model_usage": 0.328125, "avg_lambda": 0.7599999999999959, "avg_cost": 0.0004436674479166664, "unrestricted": {"accuracy": 0.7994791666666666, "large_model_usage": 0.671875, "avg_cost": 0.00044780286458333306}, "random_baseline": {"accuracy": 0.7455729166666666, "accuracy_std": 0.01035463299587033, "avg_cost": 0.00044118132812499995, "cost_std": 9.529625299858931e-06, "small_model_fraction": 0.29447370166979575}, "unrestricted_random_baseline": {"accuracy": 0.7536458333333333, "accuracy_std": 0.016395912748642903, "avg_cost": 0.0004421926041666666, "cost_std": 1.4180262941338643e-05, "small_model_fraction": 0.2873852882389673}}, "0.15": {"accuracy": 0.7682291666666666, "large_model_usage": 0.6588541666666666, "small_model_usage": 0.34114583333333337, "avg_lambda": 0.699999999999995, "avg_cost": 0.00042205807291666625, "unrestricted": {"accuracy": 0.7994791666666666, "large_model_usage": 0.6588541666666666, "avg_cost": 0.0004395007812499996}, "random_baseline": {"accuracy": 0.7380208333333333, "accuracy_std": 0.020937240877335183, "avg_cost": 0.00041214388020833337, "cost_std": 1.553374795395819e-05, "small_model_fraction": 0.3315137864613667}, "unrestricted_random_baseline": {"accuracy": 0.7411458333333333, "accuracy_std": 0.01142276677055277, "avg_cost": 0.0004379324218749999, "cost_std": 1.1320776573266252e-05, "small_model_fraction": 0.3016156799376332}}, "0.2": {"accuracy": 0.7135416666666666, "large_model_usage": 0.4739583333333333, "small_model_usage": 0.5260416666666667, "avg_lambda": 0.5399999999999987, "avg_cost": 0.0003131570312499998, "unrestricted": {"accuracy": 0.734375, "large_model_usage": 0.4739583333333333, "avg_cost": 0.0003264539062499997}, "random_baseline": {"accuracy": 0.6859375000000001, "accuracy_std": 0.013288907106430279, "avg_cost": 0.00031255260416666663, "cost_std": 1.213562751403926e-05, "small_model_fraction": 0.5181783159641266}, "unrestricted_random_baseline": {"accuracy": 0.69453125, "accuracy_std": 0.01774085555234488, "avg_cost": 0.00033816940104166665, "cost_std": 1.2983097299985691e-05, "small_model_fraction": 0.4953864778168912}}, "0.25": {"accuracy": 0.6901041666666666, "large_model_usage": 0.3776041666666667, "small_model_usage": 0.6223958333333333, "avg_lambda": 0.44999999999999857, "avg_cost": 0.00025318307291666644, "unrestricted": {"accuracy": 0.7161458333333334, "large_model_usage": 0.3776041666666667, "avg_cost": 0.0002661153645833331}, "random_baseline": {"accuracy": 0.6591145833333333, "accuracy_std": 0.01657076903118025, "avg_cost": 0.0002569141666666665, "cost_std": 8.9616132116518e-06, "small_model_fraction": 0.6209781656568084}, "unrestricted_random_baseline": {"accuracy": 0.6643229166666667, "accuracy_std": 0.014237434305038155, "avg_cost": 0.00026661932291666645, "cost_std": 9.823165034058509e-06, "small_model_fraction": 0.5988112506080087}}}}, {"trial": 2, "trial_seed": 43, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5555555555555556, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.2054824561403506e-05, "large_model_cost": 0.0006174707602339186, "hybrid_results": {"0.05": {"accuracy": 0.7838541666666666, "large_model_usage": 0.7265625, "small_model_usage": 0.2734375, "avg_lambda": 0.8000000000000057, "avg_cost": 0.00047770416666666687, "unrestricted": {"accuracy": 0.7916666666666666, "large_model_usage": 0.7265625, "avg_cost": 0.0004832458333333336}, "random_baseline": {"accuracy": 0.7791666666666667, "accuracy_std": 0.018333037402914615, "avg_cost": 0.0004864808333333334, "cost_std": 1.4120076118914955e-05, "small_model_fraction": 0.23874750421115623}, "unrestricted_random_baseline": {"accuracy": 0.77265625, "accuracy_std": 0.01473369287480796, "avg_cost": 0.00048567119791666674, "cost_std": 1.1538081401062203e-05, "small_model_fraction": 0.22928130022014162}}, "0.1": {"accuracy": 0.7057291666666666, "large_model_usage": 0.4713541666666667, "small_model_usage": 0.5286458333333333, "avg_lambda": 0.610000000000006, "avg_cost": 0.0003155062499999997, "unrestricted": {"accuracy": 0.7265625, "large_model_usage": 0.4713541666666667, "avg_cost": 0.0003249749999999998}, "random_baseline": {"accuracy": 0.6950520833333333, "accuracy_std": 0.012567633866680883, "avg_cost": 0.0003205762239583334, "cost_std": 1.4437397446504936e-05, "small_model_fraction": 0.5158119071135766}, "unrestricted_random_baseline": {"accuracy": 0.6875, "accuracy_std": 0.016304662335935968, "avg_cost": 0.0003305525260416667, "cost_std": 1.7548171644687845e-05, "small_model_fraction": 0.49963750969283927}}, "0.15": {"accuracy": 0.6875, "large_model_usage": 0.4088541666666667, "small_model_usage": 0.5911458333333333, "avg_lambda": 0.5, "avg_cost": 0.0002738864583333332, "unrestricted": {"accuracy": 0.71875, "large_model_usage": 0.4088541666666667, "avg_cost": 0.0002858031249999997}, "random_baseline": {"accuracy": 0.6776041666666667, "accuracy_std": 0.015131082339931184, "avg_cost": 0.00028132231770833336, "cost_std": 1.331589770948585e-05, "small_model_fraction": 0.5869063019370699}, "unrestricted_random_baseline": {"accuracy": 0.6807291666666667, "accuracy_std": 0.011945150980395546, "avg_cost": 0.00028711437500000006, "cost_std": 1.2061184645221007e-05, "small_model_fraction": 0.566550404633084}}, "0.2": {"accuracy": 0.640625, "large_model_usage": 0.3020833333333333, "small_model_usage": 0.6979166666666667, "avg_lambda": 0.3899999999999985, "avg_cost": 0.00020685520833333318, "unrestricted": {"accuracy": 0.6796875, "large_model_usage": 0.3020833333333333, "avg_cost": 0.00021918854166666664}, "random_baseline": {"accuracy": 0.6463541666666667, "accuracy_std": 0.01274712317762545, "avg_cost": 0.00021035078124999996, "cost_std": 1.2614777605681321e-05, "small_model_fraction": 0.7014082242719919}, "unrestricted_random_baseline": {"accuracy": 0.6518229166666667, "accuracy_std": 0.017038893544745432, "avg_cost": 0.00022938625000000007, "cost_std": 1.6026346624546152e-05, "small_model_fraction": 0.680340582307027}}, "0.25": {"accuracy": 0.5729166666666666, "large_model_usage": 0.21875, "small_model_usage": 0.78125, "avg_lambda": 0.10999999999999958, "avg_cost": 0.00014874583333333323, "unrestricted": {"accuracy": 0.6458333333333334, "large_model_usage": 0.21875, "avg_cost": 0.0001687093749999999}, "random_baseline": {"accuracy": 0.6057291666666667, "accuracy_std": 0.017044862679118304, "avg_cost": 0.00015355630208333323, "cost_std": 1.3851528589879824e-05, "small_model_fraction": 0.8006699140537108}, "unrestricted_random_baseline": {"accuracy": 0.6088541666666666, "accuracy_std": 0.012035645841970165, "avg_cost": 0.0001675689062499999, "cost_std": 9.294856981610515e-06, "small_model_fraction": 0.7665684479845768}}}}, {"trial": 3, "trial_seed": 44, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5628654970760234, "large_model_accuracy": 0.8333333333333334, "small_model_cost": 3.208538011695906e-05, "large_model_cost": 0.000615087719298246, "hybrid_results": {"0.05": {"accuracy": 0.8125, "large_model_usage": 0.8125, "small_model_usage": 0.1875, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005350263020833334, "unrestricted": {"accuracy": 0.8125, "large_model_usage": 0.8125, "avg_cost": 0.00053531796875}, "random_baseline": {"accuracy": 0.8013020833333334, "accuracy_std": 0.0067958272659895255, "avg_cost": 0.0005315677864583333, "cost_std": 1.111475323136425e-05, "small_model_fraction": 0.13732606515326043}, "unrestricted_random_baseline": {"accuracy": 0.8020833333333334, "accuracy_std": 0.009461407356554655, "avg_cost": 0.0005349819270833333, "cost_std": 9.10718770805563e-06, "small_model_fraction": 0.13682578128291395}}, "0.1": {"accuracy": 0.7708333333333334, "large_model_usage": 0.6145833333333334, "small_model_usage": 0.38541666666666663, "avg_lambda": 0.7100000000000012, "avg_cost": 0.00039864088541666665, "unrestricted": {"accuracy": 0.7890625, "large_model_usage": 0.6145833333333334, "avg_cost": 0.00041369817708333313}, "random_baseline": {"accuracy": 0.74296875, "accuracy_std": 0.012382805654134117, "avg_cost": 0.00040213924479166657, "cost_std": 1.820410267255952e-05, "small_model_fraction": 0.37126237638349224}, "unrestricted_random_baseline": {"accuracy": 0.7609375, "accuracy_std": 0.010974639325888286, "avg_cost": 0.00042295468750000003, "cost_std": 1.3313269236484743e-05, "small_model_fraction": 0.3454352215768554}}, "0.15": {"accuracy": 0.7317708333333334, "large_model_usage": 0.4557291666666667, "small_model_usage": 0.5442708333333333, "avg_lambda": 0.5600000000000018, "avg_cost": 0.00030501588541666653, "unrestricted": {"accuracy": 0.7578125, "large_model_usage": 0.4557291666666667, "avg_cost": 0.00031468255208333325}, "random_baseline": {"accuracy": 0.7005208333333333, "accuracy_std": 0.008945606269436277, "avg_cost": 0.0003035476041666665, "cost_std": 9.236463462515284e-06, "small_model_fraction": 0.5318534987647131}, "unrestricted_random_baseline": {"accuracy": 0.7067708333333333, "accuracy_std": 0.010031958481384485, "avg_cost": 0.00031759841145833316, "cost_std": 1.3112681056628624e-05, "small_model_fraction": 0.5152726619189439}}, "0.2": {"accuracy": 0.7057291666666666, "large_model_usage": 0.3697916666666667, "small_model_usage": 0.6302083333333333, "avg_lambda": 0.4600000000000002, "avg_cost": 0.00024693776041666667, "unrestricted": {"accuracy": 0.7473958333333334, "large_model_usage": 0.3697916666666667, "avg_cost": 0.00026167213541666655}, "random_baseline": {"accuracy": 0.6708333333333333, "accuracy_std": 0.013541666666666667, "avg_cost": 0.00024172528645833316, "cost_std": 1.359820197166206e-05, "small_model_fraction": 0.631472524447456}, "unrestricted_random_baseline": {"accuracy": 0.6888020833333333, "accuracy_std": 0.008005430285905949, "avg_cost": 0.00028256585937499985, "cost_std": 1.4998831881532226e-05, "small_model_fraction": 0.6061992553544171}}, "0.25": {"accuracy": 0.6796875, "large_model_usage": 0.2916666666666667, "small_model_usage": 0.7083333333333333, "avg_lambda": 0.3499999999999975, "avg_cost": 0.0001988856770833333, "unrestricted": {"accuracy": 0.7239583333333334, "large_model_usage": 0.2916666666666667, "avg_cost": 0.0002131981770833334}, "random_baseline": {"accuracy": 0.6565104166666667, "accuracy_std": 0.014889367040544582, "avg_cost": 0.00020345101562499983, "cost_std": 1.0110470726222348e-05, "small_model_fraction": 0.713894292087039}, "unrestricted_random_baseline": {"accuracy": 0.65703125, "accuracy_std": 0.012491859328345255, "avg_cost": 0.00021661768229166655, "cost_std": 9.36707697571276e-06, "small_model_fraction": 0.6893446478778936}}}}, {"trial": 4, "trial_seed": 45, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5453216374269005, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.205482456140352e-05, "large_model_cost": 0.0006153888888888898, "hybrid_results": {"0.05": {"accuracy": 0.7786458333333334, "large_model_usage": 0.7239583333333334, "small_model_usage": 0.27604166666666663, "avg_lambda": 0.8000000000000057, "avg_cost": 0.00047669374999999986, "unrestricted": {"accuracy": 0.7786458333333334, "large_model_usage": 0.7239583333333334, "avg_cost": 0.00048207395833333317}, "random_baseline": {"accuracy": 0.7497395833333333, "accuracy_std": 0.012939852342505056, "avg_cost": 0.0004808900260416668, "cost_std": 1.2359898199288383e-05, "small_model_fraction": 0.23776279728972918}, "unrestricted_random_baseline": {"accuracy": 0.7526041666666666, "accuracy_std": 0.010925092168439095, "avg_cost": 0.0004859759635416668, "cost_std": 1.4100862111362681e-05, "small_model_fraction": 0.22853959456191308}}, "0.1": {"accuracy": 0.7526041666666666, "large_model_usage": 0.640625, "small_model_usage": 0.359375, "avg_lambda": 0.699999999999995, "avg_cost": 0.0004160791666666664, "unrestricted": {"accuracy": 0.7682291666666666, "large_model_usage": 0.640625, "avg_cost": 0.00043034999999999975}, "random_baseline": {"accuracy": 0.7205729166666667, "accuracy_std": 0.017664237377199594, "avg_cost": 0.00042485093750000016, "cost_std": 1.4447340679608661e-05, "small_model_fraction": 0.341673381361679}, "unrestricted_random_baseline": {"accuracy": 0.7291666666666667, "accuracy_std": 0.016053161466065034, "avg_cost": 0.00043728218750000027, "cost_std": 1.4927313120287307e-05, "small_model_fraction": 0.31720912630435455}}, "0.15": {"accuracy": 0.6927083333333334, "large_model_usage": 0.4635416666666667, "small_model_usage": 0.5364583333333333, "avg_lambda": 0.5, "avg_cost": 0.0003071156250000001, "unrestricted": {"accuracy": 0.7239583333333334, "large_model_usage": 0.4635416666666667, "avg_cost": 0.00032034479166666665}, "random_baseline": {"accuracy": 0.6661458333333334, "accuracy_std": 0.013420936173672016, "avg_cost": 0.00031080216145833335, "cost_std": 1.2708510084073291e-05, "small_model_fraction": 0.5284677901406144}, "unrestricted_random_baseline": {"accuracy": 0.6760416666666667, "accuracy_std": 0.012336165919090647, "avg_cost": 0.00032306390625, "cost_std": 9.977880139810213e-06, "small_model_fraction": 0.50578924713127}}, "0.2": {"accuracy": 0.6536458333333334, "large_model_usage": 0.3723958333333333, "small_model_usage": 0.6276041666666667, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00025007916666666685, "unrestricted": {"accuracy": 0.6979166666666666, "large_model_usage": 0.3723958333333333, "avg_cost": 0.0002628760416666668}, "random_baseline": {"accuracy": 0.64375, "accuracy_std": 0.01231415668493426, "avg_cost": 0.0002606841145833333, "cost_std": 8.054509123132651e-06, "small_model_fraction": 0.626244453327753}, "unrestricted_random_baseline": {"accuracy": 0.63984375, "accuracy_std": 0.00894939597783311, "avg_cost": 0.00025950265624999987, "cost_std": 1.185687065365862e-05, "small_model_fraction": 0.6043069808183202}}, "0.25": {"accuracy": 0.6015625, "large_model_usage": 0.3151041666666667, "small_model_usage": 0.6848958333333333, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00020769895833333327, "unrestricted": {"accuracy": 0.6796875, "large_model_usage": 0.3151041666666667, "avg_cost": 0.00022677187500000004}, "random_baseline": {"accuracy": 0.6145833333333334, "accuracy_std": 0.007547592055307003, "avg_cost": 0.00020182111979166655, "cost_std": 1.0351749365091394e-05, "small_model_fraction": 0.6988961479998153}, "unrestricted_random_baseline": {"accuracy": 0.6309895833333334, "accuracy_std": 0.015928050711846907, "avg_cost": 0.00023143513020833323, "cost_std": 1.9891873300221465e-05, "small_model_fraction": 0.6661997604013032}}}}, {"trial": 5, "trial_seed": 46, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5760233918128655, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.2044005847953286e-05, "large_model_cost": 0.0006139795321637432, "hybrid_results": {"0.05": {"accuracy": 0.828125, "large_model_usage": 0.8151041666666666, "small_model_usage": 0.18489583333333337, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005350078124999994, "unrestricted": {"accuracy": 0.828125, "large_model_usage": 0.8151041666666666, "avg_cost": 0.0005350078124999994}, "random_baseline": {"accuracy": 0.7963541666666667, "accuracy_std": 0.009081039465709734, "avg_cost": 0.0005293816406249999, "cost_std": 5.490875131217643e-06, "small_model_fraction": 0.13570527333794266}, "unrestricted_random_baseline": {"accuracy": 0.7955729166666666, "accuracy_std": 0.007920263711325785, "avg_cost": 0.0005266174218749999, "cost_std": 8.078704583806372e-06, "small_model_fraction": 0.13570527333794266}}, "0.1": {"accuracy": 0.78125, "large_model_usage": 0.6328125, "small_model_usage": 0.3671875, "avg_lambda": 0.699999999999995, "avg_cost": 0.00040510156249999986, "unrestricted": {"accuracy": 0.8072916666666666, "large_model_usage": 0.6328125, "avg_cost": 0.00042244010416666607}, "random_baseline": {"accuracy": 0.7385416666666667, "accuracy_std": 0.014414950524060464, "avg_cost": 0.00041109411458333314, "cost_std": 1.0052703033170399e-05, "small_model_fraction": 0.3589366179207879}, "unrestricted_random_baseline": {"accuracy": 0.74140625, "accuracy_std": 0.01597057094423122, "avg_cost": 0.0004231751302083332, "cost_std": 1.4425909237560085e-05, "small_model_fraction": 0.329142008582472}}, "0.15": {"accuracy": 0.7161458333333334, "large_model_usage": 0.4322916666666667, "small_model_usage": 0.5677083333333333, "avg_lambda": 0.5, "avg_cost": 0.0002855182291666667, "unrestricted": {"accuracy": 0.7447916666666666, "large_model_usage": 0.4322916666666667, "avg_cost": 0.00029849739583333345}, "random_baseline": {"accuracy": 0.6804687500000001, "accuracy_std": 0.015052443767310872, "avg_cost": 0.00028086296875, "cost_std": 1.525385232767179e-05, "small_model_fraction": 0.5644290271751435}, "unrestricted_random_baseline": {"accuracy": 0.6919270833333332, "accuracy_std": 0.014360745498918996, "avg_cost": 0.0002943583072916666, "cost_std": 8.276547281113772e-06, "small_model_fraction": 0.5421255827560731}}, "0.2": {"accuracy": 0.6822916666666666, "large_model_usage": 0.3411458333333333, "small_model_usage": 0.6588541666666667, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00022985156250000002, "unrestricted": {"accuracy": 0.7213541666666666, "large_model_usage": 0.3411458333333333, "avg_cost": 0.00024242968750000005}, "random_baseline": {"accuracy": 0.6710937499999999, "accuracy_std": 0.011590735981389511, "avg_cost": 0.00022935859374999994, "cost_std": 1.0079368715651559e-05, "small_model_fraction": 0.6600868176851853}, "unrestricted_random_baseline": {"accuracy": 0.6723958333333333, "accuracy_std": 0.013420936173671991, "avg_cost": 0.0002473309375, "cost_std": 1.3760603464481652e-05, "small_model_fraction": 0.6384725246386143}}, "0.25": {"accuracy": 0.65625, "large_model_usage": 0.2838541666666667, "small_model_usage": 0.7161458333333333, "avg_lambda": 0.3100000000000015, "avg_cost": 0.0001909244791666668, "unrestricted": {"accuracy": 0.7005208333333334, "large_model_usage": 0.2838541666666667, "avg_cost": 0.00020708593750000005}, "random_baseline": {"accuracy": 0.6471354166666666, "accuracy_std": 0.016397980720240807, "avg_cost": 0.00018963463541666665, "cost_std": 1.321364003595047e-05, "small_model_fraction": 0.7269792509067468}, "unrestricted_random_baseline": {"accuracy": 0.6598958333333332, "accuracy_std": 0.014320549046737012, "avg_cost": 0.0002098677864583333, "cost_std": 1.3600773773147102e-05, "small_model_fraction": 0.699207345596805}}}}, {"trial": 6, "trial_seed": 47, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.827485380116959, "small_model_cost": 3.213333333333337e-05, "large_model_cost": 0.0006172397660818714, "hybrid_results": {"0.05": {"accuracy": 0.8151041666666666, "large_model_usage": 0.7890625, "small_model_usage": 0.2109375, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005190002604166664, "unrestricted": {"accuracy": 0.8151041666666666, "large_model_usage": 0.7890625, "avg_cost": 0.0005193492187499997}, "random_baseline": {"accuracy": 0.7903645833333333, "accuracy_std": 0.011891093809198817, "avg_cost": 0.0005172108854166666, "cost_std": 1.403003192250554e-05, "small_model_fraction": 0.16790023176420213}, "unrestricted_random_baseline": {"accuracy": 0.7859375, "accuracy_std": 0.011036260468967244, "avg_cost": 0.0005174908072916666, "cost_std": 1.0827444461294634e-05, "small_model_fraction": 0.16730383029978121}}, "0.1": {"accuracy": 0.7916666666666666, "large_model_usage": 0.6380208333333334, "small_model_usage": 0.36197916666666663, "avg_lambda": 0.7599999999999959, "avg_cost": 0.0004227763020833331, "unrestricted": {"accuracy": 0.796875, "large_model_usage": 0.6380208333333334, "avg_cost": 0.00042602109374999974}, "random_baseline": {"accuracy": 0.7380208333333333, "accuracy_std": 0.018929981088944014, "avg_cost": 0.00042263455729166656, "cost_std": 1.017426515953828e-05, "small_model_fraction": 0.3323557101996743}, "unrestricted_random_baseline": {"accuracy": 0.7361979166666666, "accuracy_std": 0.01022280543041173, "avg_cost": 0.0004218805208333333, "cost_std": 1.1344119872800925e-05, "small_model_fraction": 0.32681006673199914}}, "0.15": {"accuracy": 0.7526041666666666, "large_model_usage": 0.4947916666666667, "small_model_usage": 0.5052083333333333, "avg_lambda": 0.610000000000006, "avg_cost": 0.00032964609374999963, "unrestricted": {"accuracy": 0.7630208333333334, "large_model_usage": 0.4947916666666667, "avg_cost": 0.00033870859374999975}, "random_baseline": {"accuracy": 0.6906249999999999, "accuracy_std": 0.012532509807829129, "avg_cost": 0.0003305981770833332, "cost_std": 1.2408665410968134e-05, "small_model_fraction": 0.4915236890849075}, "unrestricted_random_baseline": {"accuracy": 0.7067708333333333, "accuracy_std": 0.015459189666308163, "avg_cost": 0.00033737622395833323, "cost_std": 1.0185800068294913e-05, "small_model_fraction": 0.4760350540387519}}, "0.2": {"accuracy": 0.6848958333333334, "large_model_usage": 0.3489583333333333, "small_model_usage": 0.6510416666666667, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00023642734375000003, "unrestricted": {"accuracy": 0.7213541666666666, "large_model_usage": 0.3489583333333333, "avg_cost": 0.0002484690104166665}, "random_baseline": {"accuracy": 0.6455729166666666, "accuracy_std": 0.012072215610936722, "avg_cost": 0.00022998242187499996, "cost_std": 1.0067762500831024e-05, "small_model_fraction": 0.6508429937148437}, "unrestricted_random_baseline": {"accuracy": 0.6625000000000001, "accuracy_std": 0.008652733190539654, "avg_cost": 0.00024836942708333324, "cost_std": 1.3815831699249673e-05, "small_model_fraction": 0.6302626924351248}}, "0.25": {"accuracy": 0.6484375, "large_model_usage": 0.2890625, "small_model_usage": 0.7109375, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00019506796874999986, "unrestricted": {"accuracy": 0.7057291666666666, "large_model_usage": 0.2890625, "avg_cost": 0.00021131276041666666}, "random_baseline": {"accuracy": 0.6330729166666667, "accuracy_std": 0.012294866285534247, "avg_cost": 0.00019495682291666657, "cost_std": 1.0753319138834652e-05, "small_model_fraction": 0.7215299195203154}, "unrestricted_random_baseline": {"accuracy": 0.6388020833333334, "accuracy_std": 0.012813452708481813, "avg_cost": 0.0002087271614583333, "cost_std": 9.257077797008657e-06, "small_model_fraction": 0.6937660961243619}}}}, {"trial": 7, "trial_seed": 48, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5453216374269005, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.201330409356728e-05, "large_model_cost": 0.0006156111111111118, "hybrid_results": {"0.05": {"accuracy": 0.8151041666666666, "large_model_usage": 0.8020833333333334, "small_model_usage": 0.19791666666666663, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005285929687499996, "unrestricted": {"accuracy": 0.8151041666666666, "large_model_usage": 0.8020833333333334, "avg_cost": 0.0005285929687499996}, "random_baseline": {"accuracy": 0.7869791666666667, "accuracy_std": 0.00854230180565455, "avg_cost": 0.0005292125000000001, "cost_std": 4.150774116678241e-06, "small_model_fraction": 0.1491063559779555}, "unrestricted_random_baseline": {"accuracy": 0.7864583333333333, "accuracy_std": 0.008869472065586652, "avg_cost": 0.0005350349218749999, "cost_std": 1.2026095707486768e-05, "small_model_fraction": 0.1491063559779555}}, "0.1": {"accuracy": 0.7473958333333334, "large_model_usage": 0.6171875, "small_model_usage": 0.3828125, "avg_lambda": 0.699999999999995, "avg_cost": 0.0003982648437499999, "unrestricted": {"accuracy": 0.7682291666666666, "large_model_usage": 0.6171875, "avg_cost": 0.0004144992187499999}, "random_baseline": {"accuracy": 0.7247395833333334, "accuracy_std": 0.015842667889849105, "avg_cost": 0.0003980176041666663, "cost_std": 1.3858142753179185e-05, "small_model_fraction": 0.3724247499692505}, "unrestricted_random_baseline": {"accuracy": 0.7283854166666667, "accuracy_std": 0.012161765030037096, "avg_cost": 0.00041041656249999963, "cost_std": 8.243693828091611e-06, "small_model_fraction": 0.34460700493185015}}, "0.15": {"accuracy": 0.6822916666666666, "large_model_usage": 0.4192708333333333, "small_model_usage": 0.5807291666666667, "avg_lambda": 0.5, "avg_cost": 0.00028273880208333317, "unrestricted": {"accuracy": 0.7057291666666666, "large_model_usage": 0.4192708333333333, "avg_cost": 0.00029330651041666663}, "random_baseline": {"accuracy": 0.6617187499999999, "accuracy_std": 0.016406249999999997, "avg_cost": 0.0002845196093749998, "cost_std": 1.5065113079373858e-05, "small_model_fraction": 0.5703796433521753}, "unrestricted_random_baseline": {"accuracy": 0.6651041666666667, "accuracy_std": 0.02037916411095194, "avg_cost": 0.0002952119531249999, "cost_std": 9.877867490509963e-06, "small_model_fraction": 0.5522717817285352}}, "0.2": {"accuracy": 0.6432291666666666, "large_model_usage": 0.3385416666666667, "small_model_usage": 0.6614583333333333, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00023209817708333324, "unrestricted": {"accuracy": 0.6875, "large_model_usage": 0.3385416666666667, "avg_cost": 0.00024282734374999997}, "random_baseline": {"accuracy": 0.6390625, "accuracy_std": 0.013740527039867625, "avg_cost": 0.00022608466145833321, "cost_std": 1.6837010814850363e-05, "small_model_fraction": 0.6571528018374633}, "unrestricted_random_baseline": {"accuracy": 0.6442708333333333, "accuracy_std": 0.01260804003844812, "avg_cost": 0.00024320989583333311, "cost_std": 1.1437328210278189e-05, "small_model_fraction": 0.6387682799327328}}, "0.25": {"accuracy": 0.5963541666666666, "large_model_usage": 0.2786458333333333, "small_model_usage": 0.7213541666666667, "avg_lambda": 0.26999999999999935, "avg_cost": 0.00018674921875000002, "unrestricted": {"accuracy": 0.6666666666666666, "large_model_usage": 0.2786458333333333, "avg_cost": 0.0002054315104166667}, "random_baseline": {"accuracy": 0.6151041666666666, "accuracy_std": 0.01151736686848749, "avg_cost": 0.00018537148437499995, "cost_std": 1.3394532228809196e-05, "small_model_fraction": 0.7348586427231366}, "unrestricted_random_baseline": {"accuracy": 0.6236979166666667, "accuracy_std": 0.017671914143045364, "avg_cost": 0.00021237492187499993, "cost_std": 1.5547278351852563e-05, "small_model_fraction": 0.702846370843395}}}}, {"trial": 8, "trial_seed": 49, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5628654970760234, "large_model_accuracy": 0.8523391812865497, "small_model_cost": 3.202690058479531e-05, "large_model_cost": 0.0006141052631578953, "hybrid_results": {"0.05": {"accuracy": 0.8489583333333334, "large_model_usage": 0.7864583333333334, "small_model_usage": 0.21354166666666663, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005188328125000001, "unrestricted": {"accuracy": 0.8489583333333334, "large_model_usage": 0.7864583333333334, "avg_cost": 0.0005188328125000001}, "random_baseline": {"accuracy": 0.8182291666666666, "accuracy_std": 0.010206207261596567, "avg_cost": 0.0005245832031250002, "cost_std": 7.640574618218018e-06, "small_model_fraction": 0.16367633085816768}, "unrestricted_random_baseline": {"accuracy": 0.8151041666666666, "accuracy_std": 0.01440553821452014, "avg_cost": 0.0005205245833333333, "cost_std": 1.1327211012098589e-05, "small_model_fraction": 0.16367633085816768}}, "0.1": {"accuracy": 0.7942708333333334, "large_model_usage": 0.6067708333333334, "small_model_usage": 0.39322916666666663, "avg_lambda": 0.699999999999995, "avg_cost": 0.00039555677083333323, "unrestricted": {"accuracy": 0.8177083333333334, "large_model_usage": 0.6067708333333334, "avg_cost": 0.0004088536458333333}, "random_baseline": {"accuracy": 0.7583333333333333, "accuracy_std": 0.01326847833630936, "avg_cost": 0.0004032686718750001, "cost_std": 1.69592764553542e-05, "small_model_fraction": 0.3754623198128513}, "unrestricted_random_baseline": {"accuracy": 0.7630208333333334, "accuracy_std": 0.011529137302697223, "avg_cost": 0.00040358669270833334, "cost_std": 1.4541299736842778e-05, "small_model_fraction": 0.3526185313215205}}, "0.15": {"accuracy": 0.7421875, "large_model_usage": 0.4609375, "small_model_usage": 0.5390625, "avg_lambda": 0.5399999999999987, "avg_cost": 0.00030720260416666626, "unrestricted": {"accuracy": 0.7760416666666666, "large_model_usage": 0.4609375, "avg_cost": 0.00031821822916666627}, "random_baseline": {"accuracy": 0.7075520833333333, "accuracy_std": 0.011051612069751535, "avg_cost": 0.0003084176041666665, "cost_std": 1.3737289232875066e-05, "small_model_fraction": 0.5272531650799627}, "unrestricted_random_baseline": {"accuracy": 0.7236979166666668, "accuracy_std": 0.01125831579230065, "avg_cost": 0.00031667216145833313, "cost_std": 1.5936859502316583e-05, "small_model_fraction": 0.5083285224402586}}, "0.2": {"accuracy": 0.6979166666666666, "large_model_usage": 0.3567708333333333, "small_model_usage": 0.6432291666666667, "avg_lambda": 0.40000000000000285, "avg_cost": 0.0002394578124999997, "unrestricted": {"accuracy": 0.7421875, "large_model_usage": 0.3567708333333333, "avg_cost": 0.00025316614583333297}, "random_baseline": {"accuracy": 0.6822916666666667, "accuracy_std": 0.01067390704787458, "avg_cost": 0.00023657874999999983, "cost_std": 1.7853441441735437e-05, "small_model_fraction": 0.6436374803587477}, "unrestricted_random_baseline": {"accuracy": 0.6919270833333334, "accuracy_std": 0.01343356289221763, "avg_cost": 0.00025453874999999983, "cost_std": 1.1026453244512938e-05, "small_model_fraction": 0.6200868139626716}}, "0.25": {"accuracy": 0.6510416666666666, "large_model_usage": 0.2786458333333333, "small_model_usage": 0.7213541666666667, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00018702552083333312, "unrestricted": {"accuracy": 0.7265625, "large_model_usage": 0.2786458333333333, "avg_cost": 0.00020453072916666639}, "random_baseline": {"accuracy": 0.6611979166666666, "accuracy_std": 0.008022355104556967, "avg_cost": 0.00018477700520833318, "cost_std": 1.0235326421564296e-05, "small_model_fraction": 0.733715200195107}, "unrestricted_random_baseline": {"accuracy": 0.6713541666666666, "accuracy_std": 0.014014191715701793, "avg_cost": 0.00020594026041666653, "cost_std": 1.2016452305899774e-05, "small_model_fraction": 0.7036415718678991}}}}, {"trial": 9, "trial_seed": 50, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5614035087719298, "large_model_accuracy": 0.8333333333333334, "small_model_cost": 3.204590643274857e-05, "large_model_cost": 0.000615818713450294, "hybrid_results": {"0.05": {"accuracy": 0.7760416666666666, "large_model_usage": 0.6822916666666666, "small_model_usage": 0.31770833333333337, "avg_lambda": 0.8000000000000057, "avg_cost": 0.0004478492187500001, "unrestricted": {"accuracy": 0.7838541666666666, "large_model_usage": 0.6822916666666666, "avg_cost": 0.00045373984375000003}, "random_baseline": {"accuracy": 0.7499999999999999, "accuracy_std": 0.013380450602776185, "avg_cost": 0.0004485393229166668, "cost_std": 1.3601901791810486e-05, "small_model_fraction": 0.2877309334746823}, "unrestricted_random_baseline": {"accuracy": 0.76328125, "accuracy_std": 0.014934844748895476, "avg_cost": 0.0004535207031250001, "cost_std": 1.569626431295989e-05, "small_model_fraction": 0.2776403212892763}}, "0.1": {"accuracy": 0.7473958333333334, "large_model_usage": 0.59375, "small_model_usage": 0.40625, "avg_lambda": 0.699999999999995, "avg_cost": 0.00038308880208333325, "unrestricted": {"accuracy": 0.7682291666666666, "large_model_usage": 0.59375, "avg_cost": 0.0003985263020833332}, "random_baseline": {"accuracy": 0.7210937500000001, "accuracy_std": 0.01657076903118024, "avg_cost": 0.00038622169270833347, "cost_std": 1.086110185238787e-05, "small_model_fraction": 0.39866521456516907}, "unrestricted_random_baseline": {"accuracy": 0.73125, "accuracy_std": 0.012958182608984091, "avg_cost": 0.00040157690104166663, "cost_std": 1.5657914667947586e-05, "small_model_fraction": 0.3722208515965186}}, "0.15": {"accuracy": 0.6979166666666666, "large_model_usage": 0.4192708333333333, "small_model_usage": 0.5807291666666667, "avg_lambda": 0.5399999999999987, "avg_cost": 0.00028141171874999975, "unrestricted": {"accuracy": 0.7213541666666666, "large_model_usage": 0.4192708333333333, "avg_cost": 0.00029191692708333315}, "random_baseline": {"accuracy": 0.67578125, "accuracy_std": 0.013291458494973792, "avg_cost": 0.00027733552083333314, "cost_std": 1.3698107206275361e-05, "small_model_fraction": 0.5728375674241428}, "unrestricted_random_baseline": {"accuracy": 0.68828125, "accuracy_std": 0.015231592516551842, "avg_cost": 0.00030131304687499975, "cost_std": 1.4819949769349512e-05, "small_model_fraction": 0.5548421962676756}}, "0.2": {"accuracy": 0.6875, "large_model_usage": 0.390625, "small_model_usage": 0.609375, "avg_lambda": 0.5, "avg_cost": 0.0002633752604166665, "unrestricted": {"accuracy": 0.7161458333333334, "large_model_usage": 0.390625, "avg_cost": 0.00027389609374999977}, "random_baseline": {"accuracy": 0.6645833333333333, "accuracy_std": 0.014859731893424086, "avg_cost": 0.00026036729166666647, "cost_std": 1.6039944782621796e-05, "small_model_fraction": 0.603733933470174}, "unrestricted_random_baseline": {"accuracy": 0.6729166666666667, "accuracy_std": 0.012608040038448128, "avg_cost": 0.00027728432291666653, "cost_std": 1.0910501257498993e-05, "small_model_fraction": 0.5857117967641438}}, "0.25": {"accuracy": 0.6197916666666666, "large_model_usage": 0.2682291666666667, "small_model_usage": 0.7317708333333333, "avg_lambda": 0.31999999999999684, "avg_cost": 0.00018258880208333318, "unrestricted": {"accuracy": 0.6796875, "large_model_usage": 0.2682291666666667, "avg_cost": 0.0001976252604166665}, "random_baseline": {"accuracy": 0.6257812500000001, "accuracy_std": 0.011413857812922465, "avg_cost": 0.00018390124999999985, "cost_std": 1.3576519956719346e-05, "small_model_fraction": 0.7421207465628663}, "unrestricted_random_baseline": {"accuracy": 0.6307291666666667, "accuracy_std": 0.015572829570547965, "avg_cost": 0.0001989905989583332, "cost_std": 1.2218739342143532e-05, "small_model_fraction": 0.7163633660330099}}}}, {"trial": 10, "trial_seed": 51, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5730994152046783, "large_model_accuracy": 0.8318713450292398, "small_model_cost": 3.1980847953216366e-05, "large_model_cost": 0.0006141900584795336, "hybrid_results": {"0.05": {"accuracy": 0.7942708333333334, "large_model_usage": 0.6927083333333334, "small_model_usage": 0.30729166666666663, "avg_lambda": 0.8000000000000057, "avg_cost": 0.0004576731770833331, "unrestricted": {"accuracy": 0.8072916666666666, "large_model_usage": 0.6927083333333334, "avg_cost": 0.00046217317708333305}, "random_baseline": {"accuracy": 0.7651041666666666, "accuracy_std": 0.008854166666666665, "avg_cost": 0.00046076773437500016, "cost_std": 1.042205270744942e-05, "small_model_fraction": 0.2688327126510232}, "unrestricted_random_baseline": {"accuracy": 0.76484375, "accuracy_std": 0.011936631912881078, "avg_cost": 0.0004565917447916667, "cost_std": 7.975529981030984e-06, "small_model_fraction": 0.2611035322831415}}, "0.1": {"accuracy": 0.7708333333333334, "large_model_usage": 0.6223958333333334, "small_model_usage": 0.37760416666666663, "avg_lambda": 0.699999999999995, "avg_cost": 0.0004016940104166666, "unrestricted": {"accuracy": 0.7942708333333334, "large_model_usage": 0.6223958333333334, "avg_cost": 0.0004189179687499999}, "random_baseline": {"accuracy": 0.73828125, "accuracy_std": 0.012610729180003371, "avg_cost": 0.000399698151041667, "cost_std": 1.12218154956431e-05, "small_model_fraction": 0.36498228509777536}, "unrestricted_random_baseline": {"accuracy": 0.7481770833333333, "accuracy_std": 0.011294400230313266, "avg_cost": 0.0004148740364583336, "cost_std": 6.079155865846771e-06, "small_model_fraction": 0.3353984894072832}}, "0.15": {"accuracy": 0.7135416666666666, "large_model_usage": 0.3880208333333333, "small_model_usage": 0.6119791666666667, "avg_lambda": 0.5, "avg_cost": 0.0002648658854166667, "unrestricted": {"accuracy": 0.734375, "large_model_usage": 0.3880208333333333, "avg_cost": 0.00027477734375}, "random_baseline": {"accuracy": 0.6950520833333333, "accuracy_std": 0.01602990978803376, "avg_cost": 0.0002722142968750002, "cost_std": 1.240519168496386e-05, "small_model_fraction": 0.5999976756586825}, "unrestricted_random_baseline": {"accuracy": 0.6861979166666667, "accuracy_std": 0.010169595932170133, "avg_cost": 0.0002698165104166668, "cost_std": 1.3555347554474222e-05, "small_model_fraction": 0.5829737980659984}}, "0.2": {"accuracy": 0.6875, "large_model_usage": 0.3255208333333333, "small_model_usage": 0.6744791666666667, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00022272526041666656, "unrestricted": {"accuracy": 0.7161458333333334, "large_model_usage": 0.3255208333333333, "avg_cost": 0.00023517317708333335}, "random_baseline": {"accuracy": 0.6583333333333332, "accuracy_std": 0.013770108402502387, "avg_cost": 0.0002227102604166668, "cost_std": 1.1372728354495801e-05, "small_model_fraction": 0.6723782293120764}, "unrestricted_random_baseline": {"accuracy": 0.67421875, "accuracy_std": 0.020750159491522258, "avg_cost": 0.00023489635416666681, "cost_std": 1.1582145559324417e-05, "small_model_fraction": 0.6509977419518475}}, "0.25": {"accuracy": 0.6432291666666666, "large_model_usage": 0.2760416666666667, "small_model_usage": 0.7239583333333333, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00018720963541666644, "unrestricted": {"accuracy": 0.7005208333333334, "large_model_usage": 0.2760416666666667, "avg_cost": 0.0002041158854166665}, "random_baseline": {"accuracy": 0.6502604166666666, "accuracy_std": 0.010677083333333335, "avg_cost": 0.00018350604166666654, "cost_std": 1.3715835604107492e-05, "small_model_fraction": 0.7333797118683106}, "unrestricted_random_baseline": {"accuracy": 0.6588541666666667, "accuracy_std": 0.01035135776789156, "avg_cost": 0.00021185218750000004, "cost_std": 1.0215589256176701e-05, "small_model_fraction": 0.7043416106250878}}}}, {"trial": 11, "trial_seed": 52, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.554093567251462, "large_model_accuracy": 0.827485380116959, "small_model_cost": 3.2059356725146234e-05, "large_model_cost": 0.0006148479532163749, "hybrid_results": {"0.05": {"accuracy": 0.8203125, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.000646898958333333, "unrestricted": {"accuracy": 0.8203125, "large_model_usage": 1.0, "avg_cost": 0.000646898958333333}, "random_baseline": {"accuracy": 0.8203125, "accuracy_std": 0.0, "avg_cost": 0.0006148541666666665, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8203125, "accuracy_std": 0.0, "avg_cost": 0.0006148541666666665, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.7473958333333334, "large_model_usage": 0.6302083333333334, "small_model_usage": 0.36979166666666663, "avg_lambda": 0.699999999999995, "avg_cost": 0.0004069718749999996, "unrestricted": {"accuracy": 0.7760416666666666, "large_model_usage": 0.6302083333333334, "avg_cost": 0.00042228958333333276}, "random_baseline": {"accuracy": 0.7260416666666666, "accuracy_std": 0.010974639325888288, "avg_cost": 0.0004059272916666666, "cost_std": 9.240079933984094e-06, "small_model_fraction": 0.3566920826315516}, "unrestricted_random_baseline": {"accuracy": 0.7341145833333332, "accuracy_std": 0.008514470168891567, "avg_cost": 0.0004195126041666666, "cost_std": 1.2931078444715395e-05, "small_model_fraction": 0.33040860964399515}}, "0.15": {"accuracy": 0.7057291666666666, "large_model_usage": 0.4817708333333333, "small_model_usage": 0.5182291666666667, "avg_lambda": 0.5800000000000045, "avg_cost": 0.0003172114583333333, "unrestricted": {"accuracy": 0.7291666666666666, "large_model_usage": 0.4817708333333333, "avg_cost": 0.0003305083333333332}, "random_baseline": {"accuracy": 0.6802083333333333, "accuracy_std": 0.017422634045497626, "avg_cost": 0.0003213349739583333, "cost_std": 1.4915574958527513e-05, "small_model_fraction": 0.5107109107402057}, "unrestricted_random_baseline": {"accuracy": 0.6872395833333333, "accuracy_std": 0.013096136417375163, "avg_cost": 0.0003269692187499999, "cost_std": 1.6155355312509785e-05, "small_model_fraction": 0.4878949615605961}}, "0.2": {"accuracy": 0.65625, "large_model_usage": 0.3515625, "small_model_usage": 0.6484375, "avg_lambda": 0.4300000000000041, "avg_cost": 0.00023539375, "unrestricted": {"accuracy": 0.6953125, "large_model_usage": 0.3515625, "avg_cost": 0.00024905000000000017}, "random_baseline": {"accuracy": 0.6424479166666667, "accuracy_std": 0.010865961481783896, "avg_cost": 0.0002330732291666667, "cost_std": 1.4842148801066064e-05, "small_model_fraction": 0.6511009403769038}, "unrestricted_random_baseline": {"accuracy": 0.6520833333333333, "accuracy_std": 0.011657827752916362, "avg_cost": 0.0002449935416666666, "cost_std": 9.06655587955181e-06, "small_model_fraction": 0.6276683439221691}}, "0.25": {"accuracy": 0.609375, "large_model_usage": 0.2760416666666667, "small_model_usage": 0.7239583333333333, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00018524791666666663, "unrestricted": {"accuracy": 0.6666666666666666, "large_model_usage": 0.2760416666666667, "avg_cost": 0.00020240937499999999}, "random_baseline": {"accuracy": 0.61015625, "accuracy_std": 0.010089255535317976, "avg_cost": 0.00017254049479166665, "cost_std": 1.684527261104474e-05, "small_model_fraction": 0.737145577549361}, "unrestricted_random_baseline": {"accuracy": 0.62734375, "accuracy_std": 0.012404693087306826, "avg_cost": 0.00020189880208333326, "cost_std": 1.6547602783904377e-05, "small_model_fraction": 0.7076984359329042}}}}, {"trial": 12, "trial_seed": 53, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5614035087719298, "large_model_accuracy": 0.8523391812865497, "small_model_cost": 3.209298245614035e-05, "large_model_cost": 0.0006173918128654981, "hybrid_results": {"0.05": {"accuracy": 0.8151041666666666, "large_model_usage": 0.7421875, "small_model_usage": 0.2578125, "avg_lambda": 0.8000000000000057, "avg_cost": 0.0004888049479166665, "unrestricted": {"accuracy": 0.8255208333333334, "large_model_usage": 0.7421875, "avg_cost": 0.0004949716145833333}, "random_baseline": {"accuracy": 0.7932291666666667, "accuracy_std": 0.013441133229754099, "avg_cost": 0.0004914751562499999, "cost_std": 1.1619308106977641e-05, "small_model_fraction": 0.21969438219942797}, "unrestricted_random_baseline": {"accuracy": 0.79609375, "accuracy_std": 0.01061337701683064, "avg_cost": 0.0004958372916666666, "cost_std": 1.0804685418549383e-05, "small_model_fraction": 0.20915845363392266}}, "0.1": {"accuracy": 0.765625, "large_model_usage": 0.6640625, "small_model_usage": 0.3359375, "avg_lambda": 0.699999999999995, "avg_cost": 0.0004276903645833331, "unrestricted": {"accuracy": 0.8072916666666666, "large_model_usage": 0.6640625, "avg_cost": 0.0004469039062499997}, "random_baseline": {"accuracy": 0.7630208333333333, "accuracy_std": 0.014215983399954347, "avg_cost": 0.0004357658072916664, "cost_std": 1.3893699300879667e-05, "small_model_fraction": 0.32411041749304015}, "unrestricted_random_baseline": {"accuracy": 0.7744791666666666, "accuracy_std": 0.013541666666666681, "avg_cost": 0.0004457771354166664, "cost_std": 7.786064520071841e-06, "small_model_fraction": 0.29128352519730694}}, "0.15": {"accuracy": 0.734375, "large_model_usage": 0.4895833333333333, "small_model_usage": 0.5104166666666667, "avg_lambda": 0.5800000000000045, "avg_cost": 0.0003250497395833333, "unrestricted": {"accuracy": 0.7604166666666666, "large_model_usage": 0.4895833333333333, "avg_cost": 0.00033705494791666643}, "random_baseline": {"accuracy": 0.6984375, "accuracy_std": 0.014768173832039405, "avg_cost": 0.00032355726562499984, "cost_std": 1.4176348540079854e-05, "small_model_fraction": 0.4994748974258173}, "unrestricted_random_baseline": {"accuracy": 0.6955729166666667, "accuracy_std": 0.008514470168891581, "avg_cost": 0.0003283005208333333, "cost_std": 1.1677081595673345e-05, "small_model_fraction": 0.4789636513586815}}, "0.2": {"accuracy": 0.6770833333333334, "large_model_usage": 0.3541666666666667, "small_model_usage": 0.6458333333333333, "avg_lambda": 0.4300000000000041, "avg_cost": 0.00023860182291666674, "unrestricted": {"accuracy": 0.7213541666666666, "large_model_usage": 0.3541666666666667, "avg_cost": 0.0002522528645833334}, "random_baseline": {"accuracy": 0.6536458333333333, "accuracy_std": 0.009389456446520792, "avg_cost": 0.00023735203124999996, "cost_std": 1.7972426132413766e-05, "small_model_fraction": 0.6471736662858284}, "unrestricted_random_baseline": {"accuracy": 0.6708333333333333, "accuracy_std": 0.01956420830311764, "avg_cost": 0.0002580514322916667, "cost_std": 1.0848504169443697e-05, "small_model_fraction": 0.6238504663110068}}, "0.25": {"accuracy": 0.625, "large_model_usage": 0.28125, "small_model_usage": 0.71875, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00018837265625, "unrestricted": {"accuracy": 0.6901041666666666, "large_model_usage": 0.28125, "avg_cost": 0.00020624765625000007}, "random_baseline": {"accuracy": 0.6341145833333333, "accuracy_std": 0.00918868685644655, "avg_cost": 0.0001917734374999999, "cost_std": 8.818962942499834e-06, "small_model_fraction": 0.7329916520001284}, "unrestricted_random_baseline": {"accuracy": 0.6486979166666667, "accuracy_std": 0.015644519058203736, "avg_cost": 0.00020731177083333326, "cost_std": 9.421558761046264e-06, "small_model_fraction": 0.7024516969014682}}}}, {"trial": 13, "trial_seed": 54, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5672514619883041, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.1965058479532194e-05, "large_model_cost": 0.0006119795321637431, "hybrid_results": {"0.05": {"accuracy": 0.8203125, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.000644392447916666, "unrestricted": {"accuracy": 0.8203125, "large_model_usage": 1.0, "avg_cost": 0.000644392447916666}, "random_baseline": {"accuracy": 0.8203125, "accuracy_std": 0.0, "avg_cost": 0.0006124531250000004, "cost_std": 0.0, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8203125, "accuracy_std": 0.0, "avg_cost": 0.0006124531250000004, "cost_std": 0.0, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.7682291666666666, "large_model_usage": 0.609375, "small_model_usage": 0.390625, "avg_lambda": 0.7599999999999959, "avg_cost": 0.0004054497395833328, "unrestricted": {"accuracy": 0.7708333333333334, "large_model_usage": 0.609375, "avg_cost": 0.0004089289062499995}, "random_baseline": {"accuracy": 0.7302083333333333, "accuracy_std": 0.015282709115751253, "avg_cost": 0.00041047330729166684, "cost_std": 1.4738703630278946e-05, "small_model_fraction": 0.35607696350842366}, "unrestricted_random_baseline": {"accuracy": 0.7335937499999999, "accuracy_std": 0.019936762547315856, "avg_cost": 0.00041298104166666684, "cost_std": 1.8238789324001614e-05, "small_model_fraction": 0.3500785499782107}}, "0.15": {"accuracy": 0.7291666666666666, "large_model_usage": 0.4869791666666667, "small_model_usage": 0.5130208333333333, "avg_lambda": 0.620000000000003, "avg_cost": 0.0003270539062499999, "unrestricted": {"accuracy": 0.7447916666666666, "large_model_usage": 0.4869791666666667, "avg_cost": 0.0003345226562499998}, "random_baseline": {"accuracy": 0.7028645833333333, "accuracy_std": 0.012072215610936722, "avg_cost": 0.0003249854427083332, "cost_std": 2.0403525949728256e-05, "small_model_fraction": 0.49123882047962664}, "unrestricted_random_baseline": {"accuracy": 0.7002604166666666, "accuracy_std": 0.018245900057920143, "avg_cost": 0.0003363032812499999, "cost_std": 1.632440131325556e-05, "small_model_fraction": 0.4783619866438106}}, "0.2": {"accuracy": 0.671875, "large_model_usage": 0.359375, "small_model_usage": 0.640625, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00024183515625000014, "unrestricted": {"accuracy": 0.7109375, "large_model_usage": 0.359375, "avg_cost": 0.00025598098958333346}, "random_baseline": {"accuracy": 0.6559895833333333, "accuracy_std": 0.011730318364119998, "avg_cost": 0.0002390303645833333, "cost_std": 1.2181602437148406e-05, "small_model_fraction": 0.638164033325948}, "unrestricted_random_baseline": {"accuracy": 0.6700520833333334, "accuracy_std": 0.012272782987446013, "avg_cost": 0.0002591751562499999, "cost_std": 1.3106936111345285e-05, "small_model_fraction": 0.6137752741222682}}, "0.25": {"accuracy": 0.6302083333333334, "large_model_usage": 0.3046875, "small_model_usage": 0.6953125, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00020261119791666667, "unrestricted": {"accuracy": 0.6927083333333334, "large_model_usage": 0.3046875, "avg_cost": 0.00022187161458333345}, "random_baseline": {"accuracy": 0.6434895833333333, "accuracy_std": 0.013851131403096056, "avg_cost": 0.00019707190104166652, "cost_std": 1.3004051515873843e-05, "small_model_fraction": 0.705789860116417}, "unrestricted_random_baseline": {"accuracy": 0.6546875, "accuracy_std": 0.01081278098543056, "avg_cost": 0.00022500281249999998, "cost_std": 1.1328676446587542e-05, "small_model_fraction": 0.6725830738368851}}}}, {"trial": 14, "trial_seed": 55, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5511695906432749, "large_model_accuracy": 0.8245614035087719, "small_model_cost": 3.201418128654974e-05, "large_model_cost": 0.0006144122807017549, "hybrid_results": {"0.05": {"accuracy": 0.8229166666666666, "large_model_usage": 0.796875, "small_model_usage": 0.203125, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005253890624999997, "unrestricted": {"accuracy": 0.8229166666666666, "large_model_usage": 0.796875, "avg_cost": 0.0005260140624999996}, "random_baseline": {"accuracy": 0.7903645833333334, "accuracy_std": 0.006298638865858245, "avg_cost": 0.0005246122656249996, "cost_std": 4.709519241393224e-06, "small_model_fraction": 0.15285629930994757}, "unrestricted_random_baseline": {"accuracy": 0.7947916666666667, "accuracy_std": 0.009729969631390315, "avg_cost": 0.000531682942708333, "cost_std": 1.1667677035077946e-05, "small_model_fraction": 0.15178315020347288}}, "0.1": {"accuracy": 0.7265625, "large_model_usage": 0.4453125, "small_model_usage": 0.5546875, "avg_lambda": 0.610000000000006, "avg_cost": 0.00030132656249999963, "unrestricted": {"accuracy": 0.7369791666666666, "large_model_usage": 0.4453125, "avg_cost": 0.00030956093749999984}, "random_baseline": {"accuracy": 0.6901041666666667, "accuracy_std": 0.007725206757391322, "avg_cost": 0.0002999928645833333, "cost_std": 1.527973627543031e-05, "small_model_fraction": 0.5375802539811332}, "unrestricted_random_baseline": {"accuracy": 0.6903645833333333, "accuracy_std": 0.013604122798551008, "avg_cost": 0.0003126419270833332, "cost_std": 1.4236336831872023e-05, "small_model_fraction": 0.5234415145033284}}, "0.15": {"accuracy": 0.6927083333333334, "large_model_usage": 0.3880208333333333, "small_model_usage": 0.6119791666666667, "avg_lambda": 0.5, "avg_cost": 0.0002620973958333331, "unrestricted": {"accuracy": 0.7239583333333334, "large_model_usage": 0.3880208333333333, "avg_cost": 0.00027403489583333306}, "random_baseline": {"accuracy": 0.6666666666666666, "accuracy_std": 0.022341261030757523, "avg_cost": 0.00025347906249999994, "cost_std": 1.6719442383362165e-05, "small_model_fraction": 0.6049382462308626}, "unrestricted_random_baseline": {"accuracy": 0.68359375, "accuracy_std": 0.014511074703911513, "avg_cost": 0.00026934697916666665, "cost_std": 1.5249773803678793e-05, "small_model_fraction": 0.5844410982971956}}, "0.2": {"accuracy": 0.6458333333333334, "large_model_usage": 0.265625, "small_model_usage": 0.734375, "avg_lambda": 0.36000000000000054, "avg_cost": 0.00018355052083333337, "unrestricted": {"accuracy": 0.6953125, "large_model_usage": 0.265625, "avg_cost": 0.0001974776041666667}, "random_baseline": {"accuracy": 0.6408854166666667, "accuracy_std": 0.01465986244274747, "avg_cost": 0.00018504934895833327, "cost_std": 1.7211265677922386e-05, "small_model_fraction": 0.739806260187072}, "unrestricted_random_baseline": {"accuracy": 0.6408854166666667, "accuracy_std": 0.014332383235226051, "avg_cost": 0.00020370059895833325, "cost_std": 1.1960091812790737e-05, "small_model_fraction": 0.7158929209311271}}, "0.25": {"accuracy": 0.6354166666666666, "large_model_usage": 0.25, "small_model_usage": 0.75, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00017186822916666672, "unrestricted": {"accuracy": 0.6875, "large_model_usage": 0.25, "avg_cost": 0.00018742552083333332}, "random_baseline": {"accuracy": 0.6338541666666668, "accuracy_std": 0.012281068877657907, "avg_cost": 0.00017458260416666657, "cost_std": 1.2599210017122299e-05, "small_model_fraction": 0.7598652055689286}, "unrestricted_random_baseline": {"accuracy": 0.6419270833333333, "accuracy_std": 0.015195931730099121, "avg_cost": 0.0001921688020833332, "cost_std": 8.409103693424055e-06, "small_model_fraction": 0.7331527357269289}}}}, {"trial": 15, "trial_seed": 56, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5570175438596491, "large_model_accuracy": 0.8333333333333334, "small_model_cost": 3.199035087719302e-05, "large_model_cost": 0.00061342105263158, "hybrid_results": {"0.05": {"accuracy": 0.7604166666666666, "large_model_usage": 0.71875, "small_model_usage": 0.28125, "avg_lambda": 0.8000000000000057, "avg_cost": 0.00047040781250000055, "unrestricted": {"accuracy": 0.7760416666666666, "large_model_usage": 0.71875, "avg_cost": 0.0004755380208333339}, "random_baseline": {"accuracy": 0.7421874999999999, "accuracy_std": 0.013278696649981199, "avg_cost": 0.0004689535677083333, "cost_std": 1.281902565496072e-05, "small_model_fraction": 0.24596781645698576}, "unrestricted_random_baseline": {"accuracy": 0.7502604166666667, "accuracy_std": 0.014934844748895466, "avg_cost": 0.0004806095312499999, "cost_std": 1.1892502741185668e-05, "small_model_fraction": 0.2371443946496171}}, "0.1": {"accuracy": 0.6640625, "large_model_usage": 0.4635416666666667, "small_model_usage": 0.5364583333333333, "avg_lambda": 0.5999999999999962, "avg_cost": 0.00030752239583333314, "unrestricted": {"accuracy": 0.6979166666666666, "large_model_usage": 0.4635416666666667, "avg_cost": 0.0003180276041666666}, "random_baseline": {"accuracy": 0.6591145833333333, "accuracy_std": 0.012834605760782915, "avg_cost": 0.0003129500260416666, "cost_std": 1.2846075087267123e-05, "small_model_fraction": 0.5261136982881018}, "unrestricted_random_baseline": {"accuracy": 0.6544270833333334, "accuracy_std": 0.014641346663406865, "avg_cost": 0.00031374023437499984, "cost_std": 9.739068157226343e-06, "small_model_fraction": 0.5080458386074289}}, "0.15": {"accuracy": 0.6015625, "large_model_usage": 0.3411458333333333, "small_model_usage": 0.6588541666666667, "avg_lambda": 0.4300000000000041, "avg_cost": 0.00022737656249999995, "unrestricted": {"accuracy": 0.6380208333333334, "large_model_usage": 0.3411458333333333, "avg_cost": 0.00024203281249999991}, "random_baseline": {"accuracy": 0.6091145833333333, "accuracy_std": 0.016611644201892768, "avg_cost": 0.0002262993489583332, "cost_std": 1.868905827665631e-05, "small_model_fraction": 0.6639561498330653}, "unrestricted_random_baseline": {"accuracy": 0.6169270833333333, "accuracy_std": 0.015025387225782985, "avg_cost": 0.0002378403385416665, "cost_std": 1.2077669675194819e-05, "small_model_fraction": 0.6387489326087654}}, "0.2": {"accuracy": 0.5755208333333334, "large_model_usage": 0.2708333333333333, "small_model_usage": 0.7291666666666667, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00018130364583333327, "unrestricted": {"accuracy": 0.6197916666666666, "large_model_usage": 0.2708333333333333, "avg_cost": 0.0001984546874999999}, "random_baseline": {"accuracy": 0.5802083333333333, "accuracy_std": 0.010468620438667586, "avg_cost": 0.0001851717447916666, "cost_std": 1.2560250526434997e-05, "small_model_fraction": 0.7431967481152819}, "unrestricted_random_baseline": {"accuracy": 0.590625, "accuracy_std": 0.014158622103196024, "avg_cost": 0.0001980295312499999, "cost_std": 9.316964513613788e-06, "small_model_fraction": 0.7136987501338962}}, "0.25": {"accuracy": 0.4947916666666667, "large_model_usage": 0.041666666666666664, "small_model_usage": 0.9583333333333334, "avg_lambda": 0.0, "avg_cost": 5.301197916666666e-05, "unrestricted": {"accuracy": 0.5104166666666666, "large_model_usage": 0.041666666666666664, "avg_cost": 5.763697916666666e-05}, "random_baseline": {"accuracy": 0.5046875, "accuracy_std": 0.0063575810498613106, "avg_cost": 5.599716145833335e-05, "cost_std": 7.5758453181283355e-06, "small_model_fraction": 0.9638449978199574}, "unrestricted_random_baseline": {"accuracy": 0.5093750000000001, "accuracy_std": 0.004830009633070667, "avg_cost": 5.936177083333338e-05, "cost_std": 6.025579378740371e-06, "small_model_fraction": 0.95589048151037}}}}, {"trial": 16, "trial_seed": 57, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5818713450292398, "large_model_accuracy": 0.8552631578947368, "small_model_cost": 3.2012719298245625e-05, "large_model_cost": 0.0006140789473684211, "hybrid_results": {"0.05": {"accuracy": 0.7994791666666666, "large_model_usage": 0.7421875, "small_model_usage": 0.2578125, "avg_lambda": 0.8000000000000057, "avg_cost": 0.0004872768229166662, "unrestricted": {"accuracy": 0.8046875, "large_model_usage": 0.7421875, "avg_cost": 0.0004904695312499994}, "random_baseline": {"accuracy": 0.7911458333333334, "accuracy_std": 0.00893043135389701, "avg_cost": 0.00048616296875, "cost_std": 1.0252041641180592e-05, "small_model_fraction": 0.21784827625571726}, "unrestricted_random_baseline": {"accuracy": 0.7880208333333334, "accuracy_std": 0.012608040038448141, "avg_cost": 0.0004891108072916666, "cost_std": 1.3623948944895104e-05, "small_model_fraction": 0.2123631472800704}}, "0.1": {"accuracy": 0.7760416666666666, "large_model_usage": 0.6484375, "small_model_usage": 0.3515625, "avg_lambda": 0.699999999999995, "avg_cost": 0.00041704765624999963, "unrestricted": {"accuracy": 0.7916666666666666, "large_model_usage": 0.6484375, "avg_cost": 0.0004316778645833329}, "random_baseline": {"accuracy": 0.7598958333333334, "accuracy_std": 0.013770108402502375, "avg_cost": 0.00041896460937499997, "cost_std": 1.5270502965394097e-05, "small_model_fraction": 0.33850321770371955}, "unrestricted_random_baseline": {"accuracy": 0.75703125, "accuracy_std": 0.011590735981389518, "avg_cost": 0.0004325529166666666, "cost_std": 1.3222310607968043e-05, "small_model_fraction": 0.31336826290340525}}, "0.15": {"accuracy": 0.7213541666666666, "large_model_usage": 0.4453125, "small_model_usage": 0.5546875, "avg_lambda": 0.5, "avg_cost": 0.00029477682291666673, "unrestricted": {"accuracy": 0.7604166666666666, "large_model_usage": 0.4453125, "avg_cost": 0.00030661536458333346}, "random_baseline": {"accuracy": 0.6932291666666667, "accuracy_std": 0.010272439022560422, "avg_cost": 0.0002910801562499999, "cost_std": 2.2947141821147805e-05, "small_model_fraction": 0.5485666562555807}, "unrestricted_random_baseline": {"accuracy": 0.7036458333333333, "accuracy_std": 0.011097539454511723, "avg_cost": 0.0003069346874999999, "cost_std": 1.4047768004189195e-05, "small_model_fraction": 0.5282278338059136}}, "0.2": {"accuracy": 0.6927083333333334, "large_model_usage": 0.3671875, "small_model_usage": 0.6328125, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00024240182291666653, "unrestricted": {"accuracy": 0.7447916666666666, "large_model_usage": 0.3671875, "avg_cost": 0.00025800078125000006}, "random_baseline": {"accuracy": 0.6684895833333333, "accuracy_std": 0.015627169988205715, "avg_cost": 0.00024674234374999997, "cost_std": 1.457688222790146e-05, "small_model_fraction": 0.6385478258789208}, "unrestricted_random_baseline": {"accuracy": 0.6776041666666666, "accuracy_std": 0.012958182608984115, "avg_cost": 0.0002610002083333333, "cost_std": 1.041760814482348e-05, "small_model_fraction": 0.6117485415688665}}, "0.25": {"accuracy": 0.625, "large_model_usage": 0.2630208333333333, "small_model_usage": 0.7369791666666667, "avg_lambda": 0.20000000000000143, "avg_cost": 0.00017398515625, "unrestricted": {"accuracy": 0.6927083333333334, "large_model_usage": 0.2630208333333333, "avg_cost": 0.00019474557291666657}, "random_baseline": {"accuracy": 0.63984375, "accuracy_std": 0.015540134112328859, "avg_cost": 0.00017792562499999997, "cost_std": 1.1334049453669635e-05, "small_model_fraction": 0.7560888605022489}, "unrestricted_random_baseline": {"accuracy": 0.6528645833333334, "accuracy_std": 0.00790312026327787, "avg_cost": 0.0001943890625, "cost_std": 1.653219953607664e-05, "small_model_fraction": 0.7204221001483676}}}}, {"trial": 17, "trial_seed": 58, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5453216374269005, "large_model_accuracy": 0.8216374269005848, "small_model_cost": 3.197076023391814e-05, "large_model_cost": 0.0006144093567251471, "hybrid_results": {"0.05": {"accuracy": 0.7890625, "large_model_usage": 0.7447916666666666, "small_model_usage": 0.25520833333333337, "avg_lambda": 0.8000000000000057, "avg_cost": 0.0004862598958333331, "unrestricted": {"accuracy": 0.7942708333333334, "large_model_usage": 0.7447916666666666, "avg_cost": 0.0004927911458333332}, "random_baseline": {"accuracy": 0.76796875, "accuracy_std": 0.00934964743282096, "avg_cost": 0.00048729914062499956, "cost_std": 9.528207835065597e-06, "small_model_fraction": 0.2200222678644956}, "unrestricted_random_baseline": {"accuracy": 0.7697916666666667, "accuracy_std": 0.011657827752916346, "avg_cost": 0.0004969879166666664, "cost_std": 7.049146807905302e-06, "small_model_fraction": 0.20880863944195263}}, "0.1": {"accuracy": 0.75, "large_model_usage": 0.640625, "small_model_usage": 0.359375, "avg_lambda": 0.699999999999995, "avg_cost": 0.0004117807291666667, "unrestricted": {"accuracy": 0.7786458333333334, "large_model_usage": 0.640625, "avg_cost": 0.0004288692708333333}, "random_baseline": {"accuracy": 0.73515625, "accuracy_std": 0.010990077003129805, "avg_cost": 0.0004197056510416664, "cost_std": 1.4896773451107022e-05, "small_model_fraction": 0.34789697794612384}, "unrestricted_random_baseline": {"accuracy": 0.7424479166666667, "accuracy_std": 0.013997244427475726, "avg_cost": 0.00043139624999999983, "cost_std": 1.2051384577234641e-05, "small_model_fraction": 0.3185573329266957}}, "0.15": {"accuracy": 0.703125, "large_model_usage": 0.4348958333333333, "small_model_usage": 0.5651041666666667, "avg_lambda": 0.5, "avg_cost": 0.0002922130208333333, "unrestricted": {"accuracy": 0.7317708333333334, "large_model_usage": 0.4348958333333333, "avg_cost": 0.0003023067708333334}, "random_baseline": {"accuracy": 0.6736979166666666, "accuracy_std": 0.011707170204681783, "avg_cost": 0.0002942627604166664, "cost_std": 8.51643067671861e-06, "small_model_fraction": 0.5531850702079387}, "unrestricted_random_baseline": {"accuracy": 0.6789062499999999, "accuracy_std": 0.013534152562694052, "avg_cost": 0.0003023619010416665, "cost_std": 1.51432987324865e-05, "small_model_fraction": 0.5358549171912815}}, "0.2": {"accuracy": 0.6927083333333334, "large_model_usage": 0.3723958333333333, "small_model_usage": 0.6276041666666667, "avg_lambda": 0.40000000000000285, "avg_cost": 0.0002495307291666667, "unrestricted": {"accuracy": 0.71875, "large_model_usage": 0.3723958333333333, "avg_cost": 0.0002631140625000001}, "random_baseline": {"accuracy": 0.64453125, "accuracy_std": 0.012283829599032046, "avg_cost": 0.0002468484374999999, "cost_std": 1.4043219981623069e-05, "small_model_fraction": 0.6264671156008721}, "unrestricted_random_baseline": {"accuracy": 0.6536458333333334, "accuracy_std": 0.009743899444723812, "avg_cost": 0.00025586041666666645, "cost_std": 1.0559741165893638e-05, "small_model_fraction": 0.6031456300139567}}, "0.25": {"accuracy": 0.5911458333333334, "large_model_usage": 0.28125, "small_model_usage": 0.71875, "avg_lambda": 0.25, "avg_cost": 0.00018439010416666663, "unrestricted": {"accuracy": 0.6927083333333334, "large_model_usage": 0.28125, "avg_cost": 0.0002064734375}, "random_baseline": {"accuracy": 0.62265625, "accuracy_std": 0.013147818293087851, "avg_cost": 0.00018898117187499995, "cost_std": 1.6544314258483117e-05, "small_model_fraction": 0.7383083043414967}, "unrestricted_random_baseline": {"accuracy": 0.6333333333333334, "accuracy_std": 0.012532509807829127, "avg_cost": 0.0002111436458333333, "cost_std": 1.2486746376001792e-05, "small_model_fraction": 0.7003930056879228}}}}, {"trial": 18, "trial_seed": 59, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.577485380116959, "large_model_accuracy": 0.847953216374269, "small_model_cost": 3.1978801169590635e-05, "large_model_cost": 0.0006127222222222229, "hybrid_results": {"0.05": {"accuracy": 0.7942708333333334, "large_model_usage": 0.828125, "small_model_usage": 0.171875, "avg_lambda": 0.8799999999999967, "avg_cost": 0.0005366333333333335, "unrestricted": {"accuracy": 0.8177083333333334, "large_model_usage": 0.828125, "avg_cost": 0.0005449718750000003}, "random_baseline": {"accuracy": 0.8052083333333334, "accuracy_std": 0.008051887934239738, "avg_cost": 0.0005411442187500001, "cost_std": 1.0241022798400411e-05, "small_model_fraction": 0.13101980346324676}, "unrestricted_random_baseline": {"accuracy": 0.8104166666666666, "accuracy_std": 0.00885416666666666, "avg_cost": 0.0005501469010416668, "cost_std": 9.648948171968305e-06, "small_model_fraction": 0.1166614115049656}}, "0.1": {"accuracy": 0.7526041666666666, "large_model_usage": 0.6432291666666666, "small_model_usage": 0.35677083333333337, "avg_lambda": 0.699999999999995, "avg_cost": 0.00041404999999999974, "unrestricted": {"accuracy": 0.7786458333333334, "large_model_usage": 0.6432291666666666, "avg_cost": 0.00043098749999999976}, "random_baseline": {"accuracy": 0.7505208333333333, "accuracy_std": 0.016379359576579565, "avg_cost": 0.0004114230989583333, "cost_std": 1.5434204200051184e-05, "small_model_fraction": 0.3420998241566265}, "unrestricted_random_baseline": {"accuracy": 0.7518229166666667, "accuracy_std": 0.011879682008288598, "avg_cost": 0.0004317701302083334, "cost_std": 1.4912070593751838e-05, "small_model_fraction": 0.3129346207535474}}, "0.15": {"accuracy": 0.6927083333333334, "large_model_usage": 0.4322916666666667, "small_model_usage": 0.5677083333333333, "avg_lambda": 0.5, "avg_cost": 0.0002911749999999997, "unrestricted": {"accuracy": 0.7265625, "large_model_usage": 0.4322916666666667, "avg_cost": 0.00030012812499999965}, "random_baseline": {"accuracy": 0.68828125, "accuracy_std": 0.010021813231715549, "avg_cost": 0.00029143013020833307, "cost_std": 1.533435948929266e-05, "small_model_fraction": 0.5536820746749048}, "unrestricted_random_baseline": {"accuracy": 0.6919270833333333, "accuracy_std": 0.012217400110145741, "avg_cost": 0.00030350447916666643, "cost_std": 1.035253354185021e-05, "small_model_fraction": 0.5382654127284434}}, "0.2": {"accuracy": 0.6432291666666666, "large_model_usage": 0.3229166666666667, "small_model_usage": 0.6770833333333333, "avg_lambda": 0.37000000000000194, "avg_cost": 0.00021781562499999986, "unrestricted": {"accuracy": 0.6901041666666666, "large_model_usage": 0.3229166666666667, "avg_cost": 0.0002323833333333332}, "random_baseline": {"accuracy": 0.64765625, "accuracy_std": 0.00823921459351895, "avg_cost": 0.00021891703124999983, "cost_std": 1.4550060759660274e-05, "small_model_fraction": 0.6800018440268013}, "unrestricted_random_baseline": {"accuracy": 0.66328125, "accuracy_std": 0.01643103286911183, "avg_cost": 0.00023506330729166643, "cost_std": 1.7401970282752895e-05, "small_model_fraction": 0.6549172579510287}}, "0.25": {"accuracy": 0.6041666666666666, "large_model_usage": 0.28125, "small_model_usage": 0.71875, "avg_lambda": 0.26999999999999935, "avg_cost": 0.00018612812499999997, "unrestricted": {"accuracy": 0.6770833333333334, "large_model_usage": 0.28125, "avg_cost": 0.0002067427083333333}, "random_baseline": {"accuracy": 0.6390625000000001, "accuracy_std": 0.007103219633846804, "avg_cost": 0.00018989716145833326, "cost_std": 1.3182464481187465e-05, "small_model_fraction": 0.7345655271462147}, "unrestricted_random_baseline": {"accuracy": 0.64921875, "accuracy_std": 0.015670506649302487, "avg_cost": 0.00020496973958333318, "cost_std": 1.3029600829376698e-05, "small_model_fraction": 0.6990686405935127}}}}, {"trial": 19, "trial_seed": 60, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5511695906432749, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.2065058479532224e-05, "large_model_cost": 0.0006162017543859654, "hybrid_results": {"0.05": {"accuracy": 0.7864583333333334, "large_model_usage": 0.7291666666666666, "small_model_usage": 0.27083333333333337, "avg_lambda": 0.8000000000000057, "avg_cost": 0.00047829114583333287, "unrestricted": {"accuracy": 0.796875, "large_model_usage": 0.7291666666666666, "avg_cost": 0.0004866609374999996}, "random_baseline": {"accuracy": 0.7539062499999999, "accuracy_std": 0.009899259275389694, "avg_cost": 0.0004769892447916667, "cost_std": 1.1212943240927793e-05, "small_model_fraction": 0.23609304041176524}, "unrestricted_random_baseline": {"accuracy": 0.7661458333333334, "accuracy_std": 0.013471372037893309, "avg_cost": 0.0004874775000000001, "cost_std": 6.957402994193457e-06, "small_model_fraction": 0.22176455920981145}}, "0.1": {"accuracy": 0.734375, "large_model_usage": 0.6041666666666666, "small_model_usage": 0.39583333333333337, "avg_lambda": 0.699999999999995, "avg_cost": 0.00038955677083333287, "unrestricted": {"accuracy": 0.7682291666666666, "large_model_usage": 0.6041666666666666, "avg_cost": 0.0004097546874999996}, "random_baseline": {"accuracy": 0.7205729166666666, "accuracy_std": 0.010354632995870345, "avg_cost": 0.0003944856249999999, "cost_std": 1.42162738766597e-05, "small_model_fraction": 0.38799990677000107}, "unrestricted_random_baseline": {"accuracy": 0.7296875, "accuracy_std": 0.013010412496663317, "avg_cost": 0.00040753822916666664, "cost_std": 1.1557818599270986e-05, "small_model_fraction": 0.3534225264954668}}, "0.15": {"accuracy": 0.6875, "large_model_usage": 0.390625, "small_model_usage": 0.609375, "avg_lambda": 0.5099999999999983, "avg_cost": 0.00026490572916666673, "unrestricted": {"accuracy": 0.7109375, "large_model_usage": 0.390625, "avg_cost": 0.0002749786458333333}, "random_baseline": {"accuracy": 0.66015625, "accuracy_std": 0.011602431984346595, "avg_cost": 0.00026424127604166654, "cost_std": 1.8857591562150145e-05, "small_model_fraction": 0.6013935225798058}, "unrestricted_random_baseline": {"accuracy": 0.66953125, "accuracy_std": 0.011437599612498814, "avg_cost": 0.00027723825520833327, "cost_std": 1.2125374771293431e-05, "small_model_fraction": 0.584149413902408}}, "0.2": {"accuracy": 0.65625, "large_model_usage": 0.3229166666666667, "small_model_usage": 0.6770833333333333, "avg_lambda": 0.4099999999999973, "avg_cost": 0.00021959322916666665, "unrestricted": {"accuracy": 0.6875, "large_model_usage": 0.3229166666666667, "avg_cost": 0.0002329213541666667}, "random_baseline": {"accuracy": 0.6455729166666667, "accuracy_std": 0.01267509742045621, "avg_cost": 0.00022355851562499987, "cost_std": 1.0316363198200226e-05, "small_model_fraction": 0.6789652627521751}, "unrestricted_random_baseline": {"accuracy": 0.64375, "accuracy_std": 0.011097539454511718, "avg_cost": 0.00022574096354166656, "cost_std": 1.8053029990735093e-05, "small_model_fraction": 0.6561484715911299}}, "0.25": {"accuracy": 0.6145833333333334, "large_model_usage": 0.24479166666666666, "small_model_usage": 0.7552083333333334, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00017010364583333325, "unrestricted": {"accuracy": 0.6588541666666666, "large_model_usage": 0.24479166666666666, "avg_cost": 0.0001837234375}, "random_baseline": {"accuracy": 0.6231770833333334, "accuracy_std": 0.013023437239635416, "avg_cost": 0.00017793690104166653, "cost_std": 1.2481376787873215e-05, "small_model_fraction": 0.7636878690875604}, "unrestricted_random_baseline": {"accuracy": 0.6268229166666667, "accuracy_std": 0.009464990544472016, "avg_cost": 0.0001809695833333332, "cost_std": 9.316815668269218e-06, "small_model_fraction": 0.7403717655759803}}}}, {"trial": 20, "trial_seed": 61, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5628654970760234, "large_model_accuracy": 0.8216374269005848, "small_model_cost": 3.207514619883046e-05, "large_model_cost": 0.0006158654970760243, "hybrid_results": {"0.05": {"accuracy": 0.7916666666666666, "large_model_usage": 0.7135416666666666, "small_model_usage": 0.28645833333333337, "avg_lambda": 0.8000000000000057, "avg_cost": 0.0004676960937499996, "unrestricted": {"accuracy": 0.7942708333333334, "large_model_usage": 0.7135416666666666, "avg_cost": 0.00047337317708333305}, "random_baseline": {"accuracy": 0.7526041666666667, "accuracy_std": 0.013072812914594927, "avg_cost": 0.00046353234374999993, "cost_std": 1.172721579556529e-05, "small_model_fraction": 0.25380584503218967}, "unrestricted_random_baseline": {"accuracy": 0.7705729166666667, "accuracy_std": 0.007407011798608268, "avg_cost": 0.000473695, "cost_std": 1.2321306844650245e-05, "small_model_fraction": 0.24408132093753282}}, "0.1": {"accuracy": 0.7265625, "large_model_usage": 0.484375, "small_model_usage": 0.515625, "avg_lambda": 0.610000000000006, "avg_cost": 0.00032517005208333337, "unrestricted": {"accuracy": 0.7447916666666666, "large_model_usage": 0.484375, "avg_cost": 0.00033252421875}, "random_baseline": {"accuracy": 0.6893229166666666, "accuracy_std": 0.016554390703450845, "avg_cost": 0.0003244118749999998, "cost_std": 1.1096417318109279e-05, "small_model_fraction": 0.49794492929850026}, "unrestricted_random_baseline": {"accuracy": 0.6911458333333333, "accuracy_std": 0.015014099276457886, "avg_cost": 0.00032857898437499986, "cost_std": 1.628909325580385e-05, "small_model_fraction": 0.48534765588413775}}, "0.15": {"accuracy": 0.6744791666666666, "large_model_usage": 0.3723958333333333, "small_model_usage": 0.6276041666666667, "avg_lambda": 0.44999999999999857, "avg_cost": 0.00025049817708333344, "unrestricted": {"accuracy": 0.7135416666666666, "large_model_usage": 0.3723958333333333, "avg_cost": 0.00026334713541666674}, "random_baseline": {"accuracy": 0.65390625, "accuracy_std": 0.013703460538461337, "avg_cost": 0.00024778773437499985, "cost_std": 9.813971295317419e-06, "small_model_fraction": 0.6258536466793189}, "unrestricted_random_baseline": {"accuracy": 0.6557291666666667, "accuracy_std": 0.015086196206208583, "avg_cost": 0.00026292950520833325, "cost_std": 9.241621108350722e-06, "small_model_fraction": 0.6038441045311373}}, "0.2": {"accuracy": 0.5885416666666666, "large_model_usage": 0.28125, "small_model_usage": 0.71875, "avg_lambda": 0.2999999999999981, "avg_cost": 0.00018647734375000014, "unrestricted": {"accuracy": 0.6770833333333334, "large_model_usage": 0.28125, "avg_cost": 0.00020600338541666677}, "random_baseline": {"accuracy": 0.6192708333333334, "accuracy_std": 0.01544163237538916, "avg_cost": 0.00018426377604166669, "cost_std": 1.149178529848089e-05, "small_model_fraction": 0.7355177294054835}, "unrestricted_random_baseline": {"accuracy": 0.6346354166666666, "accuracy_std": 0.009536371574791964, "avg_cost": 0.00020096312500000006, "cost_std": 1.1332945951655397e-05, "small_model_fraction": 0.7020707194689078}}, "0.25": {"accuracy": 0.5729166666666666, "large_model_usage": 0.23697916666666666, "small_model_usage": 0.7630208333333334, "avg_lambda": 0.14000000000000046, "avg_cost": 0.00015840963541666672, "unrestricted": {"accuracy": 0.65625, "large_model_usage": 0.23697916666666666, "avg_cost": 0.00017926380208333335}, "random_baseline": {"accuracy": 0.61015625, "accuracy_std": 0.00924754213779411, "avg_cost": 0.00015433856770833334, "cost_std": 9.973408535386524e-06, "small_model_fraction": 0.7835961333927357}, "unrestricted_random_baseline": {"accuracy": 0.6247395833333333, "accuracy_std": 0.01598754737309462, "avg_cost": 0.00018287182291666673, "cost_std": 1.272895912056319e-05, "small_model_fraction": 0.7478741201129144}}}}, {"trial": 21, "trial_seed": 62, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5614035087719298, "large_model_accuracy": 0.8421052631578947, "small_model_cost": 3.1927192982456195e-05, "large_model_cost": 0.0006130935672514634, "hybrid_results": {"0.05": {"accuracy": 0.7916666666666666, "large_model_usage": 0.7708333333333334, "small_model_usage": 0.22916666666666663, "avg_lambda": 0.8399999999999972, "avg_cost": 0.00050420234375, "unrestricted": {"accuracy": 0.796875, "large_model_usage": 0.7708333333333334, "avg_cost": 0.0005086658854166668}, "random_baseline": {"accuracy": 0.7919270833333333, "accuracy_std": 0.009421902351627884, "avg_cost": 0.0005090430468750002, "cost_std": 1.618837138955373e-05, "small_model_fraction": 0.18736669622090077}, "unrestricted_random_baseline": {"accuracy": 0.7958333333333333, "accuracy_std": 0.007198059875565242, "avg_cost": 0.0005100494270833336, "cost_std": 1.171374092818601e-05, "small_model_fraction": 0.17968637976714671}}, "0.1": {"accuracy": 0.7552083333333334, "large_model_usage": 0.6171875, "small_model_usage": 0.3828125, "avg_lambda": 0.699999999999995, "avg_cost": 0.0003994783854166667, "unrestricted": {"accuracy": 0.7864583333333334, "large_model_usage": 0.6171875, "avg_cost": 0.0004137596354166667}, "random_baseline": {"accuracy": 0.7361979166666666, "accuracy_std": 0.009173914038441939, "avg_cost": 0.00039598994791666673, "cost_std": 4.854331500632631e-06, "small_model_fraction": 0.36756287234181856}, "unrestricted_random_baseline": {"accuracy": 0.7434895833333334, "accuracy_std": 0.014132253699182437, "avg_cost": 0.00041332812499999996, "cost_std": 1.3657497144219429e-05, "small_model_fraction": 0.3429894444349425}}, "0.15": {"accuracy": 0.7083333333333334, "large_model_usage": 0.4765625, "small_model_usage": 0.5234375, "avg_lambda": 0.5800000000000045, "avg_cost": 0.00031784296874999964, "unrestricted": {"accuracy": 0.734375, "large_model_usage": 0.4765625, "avg_cost": 0.0003279523437499997}, "random_baseline": {"accuracy": 0.6903645833333332, "accuracy_std": 0.018504248878635953, "avg_cost": 0.00032082424479166637, "cost_std": 1.3993700872510297e-05, "small_model_fraction": 0.5080311104936702}, "unrestricted_random_baseline": {"accuracy": 0.68984375, "accuracy_std": 0.011318392610469125, "avg_cost": 0.00032447416666666647, "cost_std": 1.1182111685886633e-05, "small_model_fraction": 0.490636134721516}}, "0.2": {"accuracy": 0.625, "large_model_usage": 0.296875, "small_model_usage": 0.703125, "avg_lambda": 0.3499999999999975, "avg_cost": 0.00020372838541666652, "unrestricted": {"accuracy": 0.6901041666666666, "large_model_usage": 0.296875, "avg_cost": 0.0002158429687499999}, "random_baseline": {"accuracy": 0.63984375, "accuracy_std": 0.0079884696094677, "avg_cost": 0.00020590427083333327, "cost_std": 1.0060274402608133e-05, "small_model_fraction": 0.7043855253148422}, "unrestricted_random_baseline": {"accuracy": 0.6354166666666667, "accuracy_std": 0.011587810136086965, "avg_cost": 0.00021431434895833319, "cost_std": 1.6715014472853927e-05, "small_model_fraction": 0.6835402323493448}}, "0.25": {"accuracy": 0.5833333333333334, "large_model_usage": 0.2760416666666667, "small_model_usage": 0.7239583333333333, "avg_lambda": 0.26999999999999935, "avg_cost": 0.00018199401041666676, "unrestricted": {"accuracy": 0.6770833333333334, "large_model_usage": 0.2760416666666667, "avg_cost": 0.0002030773437499999}, "random_baseline": {"accuracy": 0.6223958333333334, "accuracy_std": 0.013431038498153771, "avg_cost": 0.00018089052083333337, "cost_std": 1.1334387480056747e-05, "small_model_fraction": 0.741783378945547}, "unrestricted_random_baseline": {"accuracy": 0.63203125, "accuracy_std": 0.0138315330467599, "avg_cost": 0.0001985913541666666, "cost_std": 1.536036745647361e-05, "small_model_fraction": 0.7055057581698237}}}}, {"trial": 22, "trial_seed": 63, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5394736842105263, "large_model_accuracy": 0.8187134502923976, "small_model_cost": 3.205672514619887e-05, "large_model_cost": 0.0006147017543859664, "hybrid_results": {"0.05": {"accuracy": 0.7682291666666666, "large_model_usage": 0.78125, "small_model_usage": 0.21875, "avg_lambda": 0.8600000000000082, "avg_cost": 0.0005041570312499997, "unrestricted": {"accuracy": 0.7864583333333334, "large_model_usage": 0.78125, "avg_cost": 0.0005133861979166662}, "random_baseline": {"accuracy": 0.7645833333333333, "accuracy_std": 0.012281068877657921, "avg_cost": 0.0005022455208333334, "cost_std": 1.0923082749821056e-05, "small_model_fraction": 0.18972911050181768}, "unrestricted_random_baseline": {"accuracy": 0.7656249999999999, "accuracy_std": 0.007725206757391317, "avg_cost": 0.0005140864062499999, "cost_std": 6.671321715446618e-06, "small_model_fraction": 0.17388899138382113}}, "0.1": {"accuracy": 0.7395833333333334, "large_model_usage": 0.5989583333333334, "small_model_usage": 0.40104166666666663, "avg_lambda": 0.699999999999995, "avg_cost": 0.00038461536458333297, "unrestricted": {"accuracy": 0.7630208333333334, "large_model_usage": 0.5989583333333334, "avg_cost": 0.00040253723958333295}, "random_baseline": {"accuracy": 0.7075520833333334, "accuracy_std": 0.013634000117404612, "avg_cost": 0.0003899009635416665, "cost_std": 1.1149157948871531e-05, "small_model_fraction": 0.3948997730279259}, "unrestricted_random_baseline": {"accuracy": 0.7091145833333332, "accuracy_std": 0.010677083333333335, "avg_cost": 0.00038598937499999974, "cost_std": 1.3358942289024603e-05, "small_model_fraction": 0.36414026406346367}}, "0.15": {"accuracy": 0.6927083333333334, "large_model_usage": 0.4348958333333333, "small_model_usage": 0.5651041666666667, "avg_lambda": 0.5500000000000033, "avg_cost": 0.0002919591145833331, "unrestricted": {"accuracy": 0.7213541666666666, "large_model_usage": 0.4348958333333333, "avg_cost": 0.0003011882812499998}, "random_baseline": {"accuracy": 0.6640625, "accuracy_std": 0.012214624374540173, "avg_cost": 0.0002905817187499998, "cost_std": 1.8941218089135564e-05, "small_model_fraction": 0.5539267025477697}, "unrestricted_random_baseline": {"accuracy": 0.6617187500000001, "accuracy_std": 0.0074070117986082895, "avg_cost": 0.0003009061197916664, "cost_std": 1.166251102063108e-05, "small_model_fraction": 0.5380865834297729}}, "0.2": {"accuracy": 0.6432291666666666, "large_model_usage": 0.3359375, "small_model_usage": 0.6640625, "avg_lambda": 0.40000000000000285, "avg_cost": 0.0002253549479166665, "unrestricted": {"accuracy": 0.6927083333333334, "large_model_usage": 0.3359375, "avg_cost": 0.00024041744791666655}, "random_baseline": {"accuracy": 0.6330729166666667, "accuracy_std": 0.011845380641191823, "avg_cost": 0.00022522940104166656, "cost_std": 1.080979098952032e-05, "small_model_fraction": 0.6682401581239229}, "unrestricted_random_baseline": {"accuracy": 0.63515625, "accuracy_std": 0.011258315792300662, "avg_cost": 0.00023317322916666653, "cost_std": 1.1115377716603632e-05, "small_model_fraction": 0.6423882255679143}}, "0.25": {"accuracy": 0.609375, "large_model_usage": 0.2734375, "small_model_usage": 0.7265625, "avg_lambda": 0.3100000000000015, "avg_cost": 0.0001836309895833332, "unrestricted": {"accuracy": 0.6666666666666666, "large_model_usage": 0.2734375, "avg_cost": 0.00020124036458333332}, "random_baseline": {"accuracy": 0.6177083333333334, "accuracy_std": 0.013868257245514961, "avg_cost": 0.00019000802083333327, "cost_std": 9.184525805176336e-06, "small_model_fraction": 0.7398514415631285}, "unrestricted_random_baseline": {"accuracy": 0.6098958333333333, "accuracy_std": 0.013370310055201501, "avg_cost": 0.00020154075520833326, "cost_std": 1.2163099162847142e-05, "small_model_fraction": 0.7096282797471309}}}}, {"trial": 23, "trial_seed": 64, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.554093567251462, "large_model_accuracy": 0.8187134502923976, "small_model_cost": 3.206476608187141e-05, "large_model_cost": 0.0006161257309941526, "hybrid_results": {"0.05": {"accuracy": 0.8020833333333334, "large_model_usage": 0.7942708333333334, "small_model_usage": 0.20572916666666663, "avg_lambda": 0.8799999999999967, "avg_cost": 0.0005181622395833331, "unrestricted": {"accuracy": 0.8125, "large_model_usage": 0.7942708333333334, "avg_cost": 0.0005255945312499998}, "random_baseline": {"accuracy": 0.7859375, "accuracy_std": 0.00966001926614137, "avg_cost": 0.0005223729947916669, "cost_std": 9.167405434578355e-06, "small_model_fraction": 0.1677281949933642}, "unrestricted_random_baseline": {"accuracy": 0.7950520833333333, "accuracy_std": 0.007183913658402973, "avg_cost": 0.0005302334635416668, "cost_std": 9.919203782651956e-06, "small_model_fraction": 0.15500299657545075}}, "0.1": {"accuracy": 0.7708333333333334, "large_model_usage": 0.640625, "small_model_usage": 0.359375, "avg_lambda": 0.699999999999995, "avg_cost": 0.00041451119791666684, "unrestricted": {"accuracy": 0.7942708333333334, "large_model_usage": 0.640625, "avg_cost": 0.00043078203125000014}, "random_baseline": {"accuracy": 0.7434895833333333, "accuracy_std": 0.010302104398360663, "avg_cost": 0.00041334052083333354, "cost_std": 1.1223658903464345e-05, "small_model_fraction": 0.345194329341571}, "unrestricted_random_baseline": {"accuracy": 0.7536458333333333, "accuracy_std": 0.010364452469860628, "avg_cost": 0.00043652354166666695, "cost_std": 1.3242888510923225e-05, "small_model_fraction": 0.31733622152267754}}, "0.15": {"accuracy": 0.703125, "large_model_usage": 0.4427083333333333, "small_model_usage": 0.5572916666666667, "avg_lambda": 0.5, "avg_cost": 0.0002960164062499997, "unrestricted": {"accuracy": 0.7421875, "large_model_usage": 0.4427083333333333, "avg_cost": 0.0003085580729166665}, "random_baseline": {"accuracy": 0.6755208333333333, "accuracy_std": 0.013441133229754113, "avg_cost": 0.0002883014062499999, "cost_std": 1.2706217033273924e-05, "small_model_fraction": 0.5480751907332643}, "unrestricted_random_baseline": {"accuracy": 0.690625, "accuracy_std": 0.009374999999999996, "avg_cost": 0.0003101099218749999, "cost_std": 1.0049357431732488e-05, "small_model_fraction": 0.5266019757435407}}, "0.2": {"accuracy": 0.6666666666666666, "large_model_usage": 0.3619791666666667, "small_model_usage": 0.6380208333333333, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00024234453124999985, "unrestricted": {"accuracy": 0.7161458333333334, "large_model_usage": 0.3619791666666667, "avg_cost": 0.0002578341145833331}, "random_baseline": {"accuracy": 0.6544270833333333, "accuracy_std": 0.01613953170036058, "avg_cost": 0.0002373253125, "cost_std": 1.2488438808719195e-05, "small_model_fraction": 0.6399694932536539}, "unrestricted_random_baseline": {"accuracy": 0.6591145833333332, "accuracy_std": 0.009912951203476182, "avg_cost": 0.0002489136197916666, "cost_std": 1.8099057924280085e-05, "small_model_fraction": 0.6134490026475755}}, "0.25": {"accuracy": 0.5911458333333334, "large_model_usage": 0.2708333333333333, "small_model_usage": 0.7291666666666667, "avg_lambda": 0.25, "avg_cost": 0.00017780286458333325, "unrestricted": {"accuracy": 0.671875, "large_model_usage": 0.2708333333333333, "avg_cost": 0.00020065182291666674}, "random_baseline": {"accuracy": 0.6268229166666666, "accuracy_std": 0.011590735981389525, "avg_cost": 0.0001792299739583333, "cost_std": 1.1374604755067818e-05, "small_model_fraction": 0.750474509928343}, "unrestricted_random_baseline": {"accuracy": 0.6382812499999999, "accuracy_std": 0.016406250000000004, "avg_cost": 0.00020203390624999994, "cost_std": 1.2548777786350583e-05, "small_model_fraction": 0.711353665177547}}}}, {"trial": 24, "trial_seed": 65, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5599415204678363, "large_model_accuracy": 0.8245614035087719, "small_model_cost": 3.207353801169599e-05, "large_model_cost": 0.0006163713450292402, "hybrid_results": {"0.05": {"accuracy": 0.8203125, "large_model_usage": 0.828125, "small_model_usage": 0.171875, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005451763020833336, "unrestricted": {"accuracy": 0.8203125, "large_model_usage": 0.828125, "avg_cost": 0.0005454888020833335}, "random_baseline": {"accuracy": 0.79765625, "accuracy_std": 0.006593223386027174, "avg_cost": 0.0005452766927083331, "cost_std": 7.992243234615047e-06, "small_model_fraction": 0.12184718493007951}, "unrestricted_random_baseline": {"accuracy": 0.79765625, "accuracy_std": 0.009677554258365657, "avg_cost": 0.0005482965624999998, "cost_std": 7.428040521864896e-06, "small_model_fraction": 0.12131235492345158}}, "0.1": {"accuracy": 0.7890625, "large_model_usage": 0.671875, "small_model_usage": 0.328125, "avg_lambda": 0.7599999999999959, "avg_cost": 0.0004452231770833332, "unrestricted": {"accuracy": 0.796875, "large_model_usage": 0.671875, "avg_cost": 0.0004498013020833333}, "random_baseline": {"accuracy": 0.7567708333333333, "accuracy_std": 0.012554136242910378, "avg_cost": 0.0004468746874999998, "cost_std": 1.0040893790899064e-05, "small_model_fraction": 0.29291256255009024}, "unrestricted_random_baseline": {"accuracy": 0.7505208333333333, "accuracy_std": 0.011576099359840048, "avg_cost": 0.0004455920572916666, "cost_std": 9.435630608298712e-06, "small_model_fraction": 0.2850773029529879}}, "0.15": {"accuracy": 0.75, "large_model_usage": 0.5130208333333334, "small_model_usage": 0.48697916666666663, "avg_lambda": 0.610000000000006, "avg_cost": 0.000343082552083333, "unrestricted": {"accuracy": 0.7708333333333334, "large_model_usage": 0.5130208333333334, "avg_cost": 0.0003503585937499997}, "random_baseline": {"accuracy": 0.7018229166666666, "accuracy_std": 0.012981712063140124, "avg_cost": 0.00034225526041666666, "cost_std": 1.5187699314696488e-05, "small_model_fraction": 0.46772175021649764}, "unrestricted_random_baseline": {"accuracy": 0.7132812499999999, "accuracy_std": 0.011437599612498812, "avg_cost": 0.00035724624999999984, "cost_std": 1.189539782990854e-05, "small_model_fraction": 0.4552691248955058}}, "0.2": {"accuracy": 0.6848958333333334, "large_model_usage": 0.3802083333333333, "small_model_usage": 0.6197916666666667, "avg_lambda": 0.4099999999999973, "avg_cost": 0.00025133776041666656, "unrestricted": {"accuracy": 0.7291666666666666, "large_model_usage": 0.3802083333333333, "avg_cost": 0.0002684315104166665}, "random_baseline": {"accuracy": 0.6606770833333333, "accuracy_std": 0.018672074760670156, "avg_cost": 0.000247093984375, "cost_std": 1.1793344904734134e-05, "small_model_fraction": 0.6247389263290749}, "unrestricted_random_baseline": {"accuracy": 0.6645833333333334, "accuracy_std": 0.011865401822806793, "avg_cost": 0.00026884921875000003, "cost_std": 9.334129714413643e-06, "small_model_fraction": 0.5954837249665158}}, "0.25": {"accuracy": 0.6536458333333334, "large_model_usage": 0.2942708333333333, "small_model_usage": 0.7057291666666667, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00019715546875000002, "unrestricted": {"accuracy": 0.7005208333333334, "large_model_usage": 0.2942708333333333, "avg_cost": 0.0002147596354166667}, "random_baseline": {"accuracy": 0.6380208333333334, "accuracy_std": 0.012597277731716474, "avg_cost": 0.00019679333333333333, "cost_std": 1.7168453466248654e-05, "small_model_fraction": 0.7174695356449501}, "unrestricted_random_baseline": {"accuracy": 0.64140625, "accuracy_std": 0.009677554258365657, "avg_cost": 0.00021264299479166671, "cost_std": 1.6506792623771723e-05, "small_model_fraction": 0.6873407786048984}}}}, {"trial": 25, "trial_seed": 66, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5614035087719298, "large_model_accuracy": 0.827485380116959, "small_model_cost": 3.202061403508776e-05, "large_model_cost": 0.0006139795321637437, "hybrid_results": {"0.05": {"accuracy": 0.8125, "large_model_usage": 0.8151041666666666, "small_model_usage": 0.18489583333333337, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005355890624999996, "unrestricted": {"accuracy": 0.8125, "large_model_usage": 0.8151041666666666, "avg_cost": 0.0005355890624999996}, "random_baseline": {"accuracy": 0.7963541666666666, "accuracy_std": 0.007967217989988737, "avg_cost": 0.000535201171875, "cost_std": 9.644177955992852e-06, "small_model_fraction": 0.1347010368288815}, "unrestricted_random_baseline": {"accuracy": 0.7927083333333333, "accuracy_std": 0.010937500000000008, "avg_cost": 0.0005378204947916667, "cost_std": 1.4391101728049773e-05, "small_model_fraction": 0.1347010368288815}}, "0.1": {"accuracy": 0.7786458333333334, "large_model_usage": 0.625, "small_model_usage": 0.375, "avg_lambda": 0.7599999999999959, "avg_cost": 0.0004148598958333329, "unrestricted": {"accuracy": 0.7838541666666666, "large_model_usage": 0.625, "avg_cost": 0.00041807343749999953}, "random_baseline": {"accuracy": 0.7416666666666666, "accuracy_std": 0.012640271978657908, "avg_cost": 0.0004160726041666663, "cost_std": 1.3743415665165167e-05, "small_model_fraction": 0.34215411110237615}, "unrestricted_random_baseline": {"accuracy": 0.7447916666666667, "accuracy_std": 0.01917204408261038, "avg_cost": 0.00042501093749999975, "cost_std": 1.2628705385233697e-05, "small_model_fraction": 0.3366321720675727}}, "0.15": {"accuracy": 0.703125, "large_model_usage": 0.4609375, "small_model_usage": 0.5390625, "avg_lambda": 0.5399999999999987, "avg_cost": 0.00030552656249999995, "unrestricted": {"accuracy": 0.7395833333333334, "large_model_usage": 0.4609375, "avg_cost": 0.00031734947916666644}, "random_baseline": {"accuracy": 0.6854166666666666, "accuracy_std": 0.016544146014863138, "avg_cost": 0.0003016979166666665, "cost_std": 1.613104130124452e-05, "small_model_fraction": 0.5300253334988035}, "unrestricted_random_baseline": {"accuracy": 0.6908854166666666, "accuracy_std": 0.014026284384890857, "avg_cost": 0.0003138164322916664, "cost_std": 9.916437786258423e-06, "small_model_fraction": 0.509709609659251}}, "0.2": {"accuracy": 0.6666666666666666, "large_model_usage": 0.3515625, "small_model_usage": 0.6484375, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00023443281249999984, "unrestricted": {"accuracy": 0.7109375, "large_model_usage": 0.3515625, "avg_cost": 0.00024967760416666647}, "random_baseline": {"accuracy": 0.6653645833333334, "accuracy_std": 0.013742994585373721, "avg_cost": 0.00023805604166666654, "cost_std": 1.3735409478742068e-05, "small_model_fraction": 0.6521881662784932}, "unrestricted_random_baseline": {"accuracy": 0.6690104166666666, "accuracy_std": 0.01581696316568773, "avg_cost": 0.0002514273177083331, "cost_std": 1.2996920698823332e-05, "small_model_fraction": 0.625992517080973}}, "0.25": {"accuracy": 0.6328125, "large_model_usage": 0.265625, "small_model_usage": 0.734375, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00018164114583333322, "unrestricted": {"accuracy": 0.6848958333333334, "large_model_usage": 0.265625, "avg_cost": 0.00019682864583333323}, "random_baseline": {"accuracy": 0.63125, "accuracy_std": 0.012391017972260748, "avg_cost": 0.00018672606770833318, "cost_std": 1.4441461531884241e-05, "small_model_fraction": 0.7429019005682318}, "unrestricted_random_baseline": {"accuracy": 0.6338541666666667, "accuracy_std": 0.008652733190539672, "avg_cost": 0.00018999898437499987, "cost_std": 1.3767334875286427e-05, "small_model_fraction": 0.7168046976095815}}}}, {"trial": 26, "trial_seed": 67, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5467836257309941, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.206505847953217e-05, "large_model_cost": 0.0006165058479532175, "hybrid_results": {"0.05": {"accuracy": 0.7578125, "large_model_usage": 0.640625, "small_model_usage": 0.359375, "avg_lambda": 0.779999999999997, "avg_cost": 0.00042660078124999966, "unrestricted": {"accuracy": 0.765625, "large_model_usage": 0.640625, "avg_cost": 0.0004299132812499997}, "random_baseline": {"accuracy": 0.7515625, "accuracy_std": 0.013691082737598862, "avg_cost": 0.00042392161458333313, "cost_std": 1.2133343211858165e-05, "small_model_fraction": 0.32493465569751845}, "unrestricted_random_baseline": {"accuracy": 0.75625, "accuracy_std": 0.009827063679225628, "avg_cost": 0.00043592966145833314, "cost_std": 1.02702917180514e-05, "small_model_fraction": 0.31926684458703264}}, "0.1": {"accuracy": 0.7161458333333334, "large_model_usage": 0.4947916666666667, "small_model_usage": 0.5052083333333333, "avg_lambda": 0.6399999999999937, "avg_cost": 0.0003315174479166665, "unrestricted": {"accuracy": 0.7317708333333334, "large_model_usage": 0.4947916666666667, "avg_cost": 0.00033965286458333344}, "random_baseline": {"accuracy": 0.6984375, "accuracy_std": 0.017959312134965266, "avg_cost": 0.0003321938541666666, "cost_std": 1.8272869438880734e-05, "small_model_fraction": 0.48762578719598876}, "unrestricted_random_baseline": {"accuracy": 0.6950520833333333, "accuracy_std": 0.012459243451985074, "avg_cost": 0.00033390359375, "cost_std": 1.636878630267946e-05, "small_model_fraction": 0.47370578569507843}}, "0.15": {"accuracy": 0.6901041666666666, "large_model_usage": 0.4322916666666667, "small_model_usage": 0.5677083333333333, "avg_lambda": 0.5, "avg_cost": 0.00029056432291666665, "unrestricted": {"accuracy": 0.7109375, "large_model_usage": 0.4322916666666667, "avg_cost": 0.0003008351562500001}, "random_baseline": {"accuracy": 0.67890625, "accuracy_std": 0.012049724272145082, "avg_cost": 0.0002943108854166667, "cost_std": 1.46800953976577e-05, "small_model_fraction": 0.5576981122930785}, "unrestricted_random_baseline": {"accuracy": 0.6760416666666667, "accuracy_std": 0.012979099785809498, "avg_cost": 0.0003044869010416667, "cost_std": 1.716735132684862e-05, "small_model_fraction": 0.5401243331894968}}, "0.2": {"accuracy": 0.6692708333333334, "large_model_usage": 0.3671875, "small_model_usage": 0.6328125, "avg_lambda": 0.40000000000000285, "avg_cost": 0.0002479028645833333, "unrestricted": {"accuracy": 0.7005208333333334, "large_model_usage": 0.3671875, "avg_cost": 0.00026070494791666677}, "random_baseline": {"accuracy": 0.6632812499999999, "accuracy_std": 0.013584168078901, "avg_cost": 0.0002452234635416667, "cost_std": 7.81162331057625e-06, "small_model_fraction": 0.6306934594723058}, "unrestricted_random_baseline": {"accuracy": 0.6682291666666667, "accuracy_std": 0.015014099276457903, "avg_cost": 0.00025903408854166673, "cost_std": 1.6710722072079132e-05, "small_model_fraction": 0.6087886171616548}}, "0.25": {"accuracy": 0.6458333333333334, "large_model_usage": 0.3072916666666667, "small_model_usage": 0.6927083333333333, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00020686119791666667, "unrestricted": {"accuracy": 0.6901041666666666, "large_model_usage": 0.3072916666666667, "avg_cost": 0.00022315286458333332}, "random_baseline": {"accuracy": 0.6401041666666667, "accuracy_std": 0.015353544763524647, "avg_cost": 0.0002139715625, "cost_std": 1.396022055941421e-05, "small_model_fraction": 0.7009172826651162}, "unrestricted_random_baseline": {"accuracy": 0.6382812500000001, "accuracy_std": 0.011076130374029571, "avg_cost": 0.0002121331510416667, "cost_std": 8.293007354912557e-06, "small_model_fraction": 0.6730416330525387}}}}, {"trial": 27, "trial_seed": 68, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8260233918128655, "small_model_cost": 3.200994152046785e-05, "large_model_cost": 0.0006160818713450303, "hybrid_results": {"0.05": {"accuracy": 0.7916666666666666, "large_model_usage": 0.7135416666666666, "small_model_usage": 0.28645833333333337, "avg_lambda": 0.8000000000000057, "avg_cost": 0.0004691822916666668, "unrestricted": {"accuracy": 0.8020833333333334, "large_model_usage": 0.7135416666666666, "avg_cost": 0.0004761770833333333}, "random_baseline": {"accuracy": 0.7500000000000001, "accuracy_std": 0.011470164346638152, "avg_cost": 0.00047361018229166675, "cost_std": 1.127581574990805e-05, "small_model_fraction": 0.251509398375792}, "unrestricted_random_baseline": {"accuracy": 0.7575520833333333, "accuracy_std": 0.008593749999999995, "avg_cost": 0.00048008515625000013, "cost_std": 1.0786500119376262e-05, "small_model_fraction": 0.23953349042766048}}, "0.1": {"accuracy": 0.71875, "large_model_usage": 0.5, "small_model_usage": 0.5, "avg_lambda": 0.6699999999999946, "avg_cost": 0.00033775520833333335, "unrestricted": {"accuracy": 0.7369791666666666, "large_model_usage": 0.5, "avg_cost": 0.0003427083333333334}, "random_baseline": {"accuracy": 0.6825520833333334, "accuracy_std": 0.013604122798551029, "avg_cost": 0.00033491869791666643, "cost_std": 1.4276457485412565e-05, "small_model_fraction": 0.47652805895893313}, "unrestricted_random_baseline": {"accuracy": 0.6791666666666667, "accuracy_std": 0.019962258225767502, "avg_cost": 0.00033357072916666644, "cost_std": 1.6039859270306696e-05, "small_model_fraction": 0.4680477250358703}}, "0.15": {"accuracy": 0.6770833333333334, "large_model_usage": 0.4375, "small_model_usage": 0.5625, "avg_lambda": 0.5, "avg_cost": 0.00029198958333333307, "unrestricted": {"accuracy": 0.7213541666666666, "large_model_usage": 0.4375, "avg_cost": 0.0003042968749999998}, "random_baseline": {"accuracy": 0.6515625, "accuracy_std": 0.013868257245514947, "avg_cost": 0.00028923841145833316, "cost_std": 9.686484215513106e-06, "small_model_fraction": 0.5548842042606718}, "unrestricted_random_baseline": {"accuracy": 0.6684895833333333, "accuracy_std": 0.010677083333333332, "avg_cost": 0.00029714986979166634, "cost_std": 1.2947000934097722e-05, "small_model_fraction": 0.5338126700228194}}, "0.2": {"accuracy": 0.6614583333333334, "large_model_usage": 0.3489583333333333, "small_model_usage": 0.6510416666666667, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00023498958333333323, "unrestricted": {"accuracy": 0.6875, "large_model_usage": 0.3489583333333333, "avg_cost": 0.0002492083333333333}, "random_baseline": {"accuracy": 0.6205729166666667, "accuracy_std": 0.012437452016937037, "avg_cost": 0.0002265286197916664, "cost_std": 1.2580302383597335e-05, "small_model_fraction": 0.6524749239809653}, "unrestricted_random_baseline": {"accuracy": 0.6395833333333334, "accuracy_std": 0.013134916888456761, "avg_cost": 0.0002481712499999997, "cost_std": 9.271909870782296e-06, "small_model_fraction": 0.6281307477349488}}, "0.25": {"accuracy": 0.5989583333333334, "large_model_usage": 0.2552083333333333, "small_model_usage": 0.7447916666666667, "avg_lambda": 0.29000000000000226, "avg_cost": 0.00017289583333333325, "unrestricted": {"accuracy": 0.6510416666666666, "large_model_usage": 0.2552083333333333, "avg_cost": 0.00019152083333333323}, "random_baseline": {"accuracy": 0.59296875, "accuracy_std": 0.017740855552344895, "avg_cost": 0.00016778609374999993, "cost_std": 1.607523030526785e-05, "small_model_fraction": 0.7587867442025792}, "unrestricted_random_baseline": {"accuracy": 0.5973958333333333, "accuracy_std": 0.016764038478574043, "avg_cost": 0.00019067119791666657, "cost_std": 1.532888763428707e-05, "small_model_fraction": 0.7268985485045008}}}}, {"trial": 28, "trial_seed": 69, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5467836257309941, "large_model_accuracy": 0.8464912280701754, "small_model_cost": 3.20296783625731e-05, "large_model_cost": 0.0006146754385964921, "hybrid_results": {"0.05": {"accuracy": 0.8359375, "large_model_usage": 0.8229166666666666, "small_model_usage": 0.17708333333333337, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005431283854166662, "unrestricted": {"accuracy": 0.8359375, "large_model_usage": 0.8229166666666666, "avg_cost": 0.0005434252604166661}, "random_baseline": {"accuracy": 0.8234375, "accuracy_std": 0.008699631816921904, "avg_cost": 0.0005443438020833331, "cost_std": 7.340530082186138e-06, "small_model_fraction": 0.12279683139048572}, "unrestricted_random_baseline": {"accuracy": 0.8260416666666668, "accuracy_std": 0.005915529526875281, "avg_cost": 0.0005466302083333331, "cost_std": 7.810257697310353e-06, "small_model_fraction": 0.12228730223870611}}, "0.1": {"accuracy": 0.8098958333333334, "large_model_usage": 0.6692708333333334, "small_model_usage": 0.33072916666666663, "avg_lambda": 0.779999999999997, "avg_cost": 0.00044450859374999945, "unrestricted": {"accuracy": 0.8151041666666666, "large_model_usage": 0.6692708333333334, "avg_cost": 0.00044845651041666625}, "random_baseline": {"accuracy": 0.7721354166666666, "accuracy_std": 0.010878436757398663, "avg_cost": 0.0004455664583333332, "cost_std": 1.3436153787237329e-05, "small_model_fraction": 0.2920588399685163}, "unrestricted_random_baseline": {"accuracy": 0.7679687499999999, "accuracy_std": 0.013199297809868952, "avg_cost": 0.0004455192447916665, "cost_std": 1.1062197169800931e-05, "small_model_fraction": 0.28528299616063924}}, "0.15": {"accuracy": 0.7838541666666666, "large_model_usage": 0.6380208333333334, "small_model_usage": 0.36197916666666663, "avg_lambda": 0.699999999999995, "avg_cost": 0.00041137838541666603, "unrestricted": {"accuracy": 0.8098958333333334, "large_model_usage": 0.6380208333333334, "avg_cost": 0.0004285554687499995}, "random_baseline": {"accuracy": 0.7489583333333333, "accuracy_std": 0.008168951636124007, "avg_cost": 0.00041672471354166644, "cost_std": 1.1847706401610424e-05, "small_model_fraction": 0.3489205054855405}, "unrestricted_random_baseline": {"accuracy": 0.7606770833333334, "accuracy_std": 0.010638905320737252, "avg_cost": 0.00042543958333333325, "cost_std": 1.2357869873959883e-05, "small_model_fraction": 0.31943932754572807}}, "0.2": {"accuracy": 0.7265625, "large_model_usage": 0.4375, "small_model_usage": 0.5625, "avg_lambda": 0.5, "avg_cost": 0.00029343046874999993, "unrestricted": {"accuracy": 0.7578125, "large_model_usage": 0.4375, "avg_cost": 0.0003044460937499998}, "random_baseline": {"accuracy": 0.6958333333333333, "accuracy_std": 0.012640271978657903, "avg_cost": 0.00029436518229166667, "cost_std": 1.4407808716577359e-05, "small_model_fraction": 0.5513555435768032}, "unrestricted_random_baseline": {"accuracy": 0.70546875, "accuracy_std": 0.01611430052542186, "avg_cost": 0.00030916856770833327, "cost_std": 1.3501743254482576e-05, "small_model_fraction": 0.5324493303134006}}, "0.25": {"accuracy": 0.6458333333333334, "large_model_usage": 0.3098958333333333, "small_model_usage": 0.6901041666666667, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00020774296874999994, "unrestricted": {"accuracy": 0.7161458333333334, "large_model_usage": 0.3098958333333333, "avg_cost": 0.0002248210937499999}, "random_baseline": {"accuracy": 0.65078125, "accuracy_std": 0.014520418603594416, "avg_cost": 0.00021131260416666658, "cost_std": 1.6431384506044597e-05, "small_model_fraction": 0.6984217471746779}, "unrestricted_random_baseline": {"accuracy": 0.6562500000000001, "accuracy_std": 0.01302083333333335, "avg_cost": 0.00022966070312499992, "cost_std": 1.6394183061277104e-05, "small_model_fraction": 0.669110412285459}}}}, {"trial": 29, "trial_seed": 70, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5760233918128655, "large_model_accuracy": 0.8391812865497076, "small_model_cost": 3.2075000000000034e-05, "large_model_cost": 0.0006160730994152059, "hybrid_results": {"0.05": {"accuracy": 0.8046875, "large_model_usage": 0.8255208333333334, "small_model_usage": 0.17447916666666663, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005435916666666665, "unrestricted": {"accuracy": 0.8046875, "large_model_usage": 0.8255208333333334, "avg_cost": 0.0005435916666666665}, "random_baseline": {"accuracy": 0.7906250000000001, "accuracy_std": 0.010875319280115169, "avg_cost": 0.0005463092708333334, "cost_std": 1.2328831272346662e-05, "small_model_fraction": 0.1241124462924102}, "unrestricted_random_baseline": {"accuracy": 0.79296875, "accuracy_std": 0.006715519249076891, "avg_cost": 0.0005432308333333332, "cost_std": 6.218493089725731e-06, "small_model_fraction": 0.1241124462924102}}, "0.1": {"accuracy": 0.7890625, "large_model_usage": 0.6692708333333334, "small_model_usage": 0.33072916666666663, "avg_lambda": 0.779999999999997, "avg_cost": 0.00044563333333333307, "unrestricted": {"accuracy": 0.7890625, "large_model_usage": 0.6692708333333334, "avg_cost": 0.0004473833333333329}, "random_baseline": {"accuracy": 0.74921875, "accuracy_std": 0.013683650682728796, "avg_cost": 0.00044269395833333307, "cost_std": 1.565568919522607e-05, "small_model_fraction": 0.2918498643275464}, "unrestricted_random_baseline": {"accuracy": 0.7479166666666667, "accuracy_std": 0.007529600153542177, "avg_cost": 0.000446820520833333, "cost_std": 9.823580587153644e-06, "small_model_fraction": 0.2888532792329164}}, "0.15": {"accuracy": 0.7473958333333334, "large_model_usage": 0.5026041666666666, "small_model_usage": 0.49739583333333337, "avg_lambda": 0.610000000000006, "avg_cost": 0.00033506041666666654, "unrestricted": {"accuracy": 0.7552083333333334, "large_model_usage": 0.5026041666666666, "avg_cost": 0.000344414583333333}, "random_baseline": {"accuracy": 0.7067708333333333, "accuracy_std": 0.013491493590613144, "avg_cost": 0.0003246267447916665, "cost_std": 1.4535628319009532e-05, "small_model_fraction": 0.48118766658647527}, "unrestricted_random_baseline": {"accuracy": 0.7028645833333333, "accuracy_std": 0.018831204364365545, "avg_cost": 0.0003391020572916665, "cost_std": 1.6816053712288306e-05, "small_model_fraction": 0.46517020578303536}}, "0.2": {"accuracy": 0.6979166666666666, "large_model_usage": 0.3645833333333333, "small_model_usage": 0.6354166666666667, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00024559166666666656, "unrestricted": {"accuracy": 0.7317708333333334, "large_model_usage": 0.3645833333333333, "avg_cost": 0.0002580187499999998}, "random_baseline": {"accuracy": 0.66640625, "accuracy_std": 0.01418972158940133, "avg_cost": 0.0002463576041666665, "cost_std": 1.5489008054790847e-05, "small_model_fraction": 0.6343880795494468}, "unrestricted_random_baseline": {"accuracy": 0.6677083333333333, "accuracy_std": 0.009895833333333314, "avg_cost": 0.00025035109374999984, "cost_std": 7.435960730618759e-06, "small_model_fraction": 0.6131087580143642}}, "0.25": {"accuracy": 0.6692708333333334, "large_model_usage": 0.2994791666666667, "small_model_usage": 0.7005208333333333, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00020026354166666663, "unrestricted": {"accuracy": 0.71875, "large_model_usage": 0.2994791666666667, "avg_cost": 0.00021692499999999995}, "random_baseline": {"accuracy": 0.6505208333333334, "accuracy_std": 0.011158481919598198, "avg_cost": 0.00019860294270833325, "cost_std": 1.321265809534032e-05, "small_model_fraction": 0.7120049845451819}, "unrestricted_random_baseline": {"accuracy": 0.6528645833333333, "accuracy_std": 0.012217400110145734, "avg_cost": 0.00022239289062499982, "cost_std": 1.244120885453574e-05, "small_model_fraction": 0.6834749972900563}}}}, {"trial": 30, "trial_seed": 71, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5614035087719298, "large_model_accuracy": 0.8318713450292398, "small_model_cost": 3.2025292397660845e-05, "large_model_cost": 0.000614564327485381, "hybrid_results": {"0.05": {"accuracy": 0.7838541666666666, "large_model_usage": 0.8255208333333334, "small_model_usage": 0.17447916666666663, "avg_lambda": 0.8899999999999917, "avg_cost": 0.0005401104166666665, "unrestricted": {"accuracy": 0.7994791666666666, "large_model_usage": 0.8255208333333334, "avg_cost": 0.0005452614583333333}, "random_baseline": {"accuracy": 0.7799479166666667, "accuracy_std": 0.012981712063140123, "avg_cost": 0.0005457570312499998, "cost_std": 1.2179826482195622e-05, "small_model_fraction": 0.12780930776167315}, "unrestricted_random_baseline": {"accuracy": 0.790625, "accuracy_std": 0.007654655446197428, "avg_cost": 0.0005532514843749998, "cost_std": 8.96585484876265e-06, "small_model_fraction": 0.11896691033178898}}, "0.1": {"accuracy": 0.7734375, "large_model_usage": 0.65625, "small_model_usage": 0.34375, "avg_lambda": 0.699999999999995, "avg_cost": 0.00042640208333333333, "unrestricted": {"accuracy": 0.7864583333333334, "large_model_usage": 0.65625, "avg_cost": 0.0004407093750000001}, "random_baseline": {"accuracy": 0.7239583333333333, "accuracy_std": 0.012434725402409724, "avg_cost": 0.00042394968750000003, "cost_std": 1.310503171557687e-05, "small_model_fraction": 0.3230036663958729}, "unrestricted_random_baseline": {"accuracy": 0.7354166666666667, "accuracy_std": 0.011060812802914221, "avg_cost": 0.00044450695312499993, "cost_std": 1.0204026382073883e-05, "small_model_fraction": 0.2984434381452247}}, "0.15": {"accuracy": 0.7265625, "large_model_usage": 0.4895833333333333, "small_model_usage": 0.5104166666666667, "avg_lambda": 0.5399999999999987, "avg_cost": 0.0003248239583333333, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.4895833333333333, "avg_cost": 0.0003369854166666666}, "random_baseline": {"accuracy": 0.6807291666666667, "accuracy_std": 0.008808090898587367, "avg_cost": 0.000339475234375, "cost_std": 1.6057785689017138e-05, "small_model_fraction": 0.4973750284535317}, "unrestricted_random_baseline": {"accuracy": 0.68828125, "accuracy_std": 0.014916670303770497, "avg_cost": 0.00033371979166666664, "cost_std": 2.0710900285563787e-05, "small_model_fraction": 0.47649838740319933}}, "0.2": {"accuracy": 0.6822916666666666, "large_model_usage": 0.375, "small_model_usage": 0.625, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00025330312499999996, "unrestricted": {"accuracy": 0.71875, "large_model_usage": 0.375, "avg_cost": 0.0002653135416666666}, "random_baseline": {"accuracy": 0.6411458333333333, "accuracy_std": 0.014490028899212037, "avg_cost": 0.00025923781249999984, "cost_std": 1.6010511640444995e-05, "small_model_fraction": 0.6201493474698764}, "unrestricted_random_baseline": {"accuracy": 0.6473958333333334, "accuracy_std": 0.020178511070635945, "avg_cost": 0.00027243518229166653, "cost_std": 8.69652128781148e-06, "small_model_fraction": 0.5995319880428672}}, "0.25": {"accuracy": 0.5833333333333334, "large_model_usage": 0.3072916666666667, "small_model_usage": 0.6927083333333333, "avg_lambda": 0.2600000000000014, "avg_cost": 0.00020006874999999987, "unrestricted": {"accuracy": 0.6875, "large_model_usage": 0.3072916666666667, "avg_cost": 0.00022353229166666651}, "random_baseline": {"accuracy": 0.6190104166666666, "accuracy_std": 0.01549643286126234, "avg_cost": 0.00019659309895833326, "cost_std": 1.4076883034825371e-05, "small_model_fraction": 0.7115327085728519}, "unrestricted_random_baseline": {"accuracy": 0.6328125, "accuracy_std": 0.010085894130748467, "avg_cost": 0.0002247750520833332, "cost_std": 1.3359764273994727e-05, "small_model_fraction": 0.6712546495014396}}}}], "unrestricted_hybrid": {"0.05": {"avg_accuracy": 0.8067708333333333, "std_accuracy": 0.018097240875747187, "avg_cost": 0.0005166728819444442, "std_cost": 4.6075804493456236e-05}, "0.1": {"avg_accuracy": 0.7756944444444446, "std_accuracy": 0.027778320307201922, "avg_cost": 0.00040598243055555534, "std_cost": 4.164271737074089e-05}, "0.15": {"avg_accuracy": 0.7380208333333332, "std_accuracy": 0.031102080472570032, "avg_cost": 0.00031539267361111096, "std_cost": 4.03691967373523e-05}, "0.2": {"avg_accuracy": 0.7086805555555554, "std_accuracy": 0.026699447263606395, "avg_cost": 0.00024938954861111106, "std_cost": 2.66248648313958e-05}, "0.25": {"avg_accuracy": 0.6811631944444444, "std_accuracy": 0.03799872914668487, "avg_cost": 0.00020159822916666667, "std_cost": 3.187344018091947e-05}}, "random_baseline": {"0.05": {"avg_accuracy": 0.7834375, "std_accuracy": 0.023309313808927458, "avg_cost": 0.0005122227977430555, "std_cost": 4.227793489740391e-05, "avg_small_model_fraction": 0.17827498469740852, "std_small_model_fraction": 0.0730640152513437}, "0.1": {"avg_accuracy": 0.7286371527777779, "std_accuracy": 0.026200598642723318, "avg_cost": 0.0003961761354166666, "std_cost": 4.111389204203667e-05, "avg_small_model_fraction": 0.3784952133693971, "std_small_model_fraction": 0.07057969337139776}, "0.15": {"avg_accuracy": 0.6834722222222223, "std_accuracy": 0.025604025168746818, "avg_cost": 0.0003036086276041666, "std_cost": 3.9736099237553464e-05, "avg_small_model_fraction": 0.5335335914918403, "std_small_model_fraction": 0.06789277088386617}, "0.2": {"avg_accuracy": 0.6515711805555555, "std_accuracy": 0.021805287742184983, "avg_cost": 0.00023519077604166656, "std_cost": 2.7095350860754472e-05, "avg_small_model_fraction": 0.6507553607898162, "std_small_model_fraction": 0.04678371774578696}, "0.25": {"avg_accuracy": 0.6273090277777779, "std_accuracy": 0.028167705917163412, "avg_cost": 0.00018469496961805545, "std_cost": 3.024730140067756e-05, "avg_small_model_fraction": 0.7392350070969836, "std_small_model_fraction": 0.05200623938174398}}, "unrestricted_random_baseline": {"0.05": {"avg_accuracy": 0.786328125, "std_accuracy": 0.020791380284642603, "avg_cost": 0.00051619096875, "std_cost": 4.037717447600102e-05, "avg_small_model_fraction": 0.17237749641821226, "std_small_model_fraction": 0.0701025041438952}, "0.1": {"avg_accuracy": 0.7321961805555557, "std_accuracy": 0.0284889551631099, "avg_cost": 0.0004051915460069444, "std_cost": 4.259915550267286e-05, "avg_small_model_fraction": 0.35859537215298515, "std_small_model_fraction": 0.0715000494209171}, "0.15": {"avg_accuracy": 0.6897569444444444, "std_accuracy": 0.02570945507487301, "avg_cost": 0.00031462125260416654, "std_cost": 4.019508384678577e-05, "avg_small_model_fraction": 0.513996853131344, "std_small_model_fraction": 0.06916880521617609}, "0.2": {"avg_accuracy": 0.6594097222222223, "std_accuracy": 0.022437253095807862, "avg_cost": 0.00025045566840277775, "std_cost": 2.8744322731320913e-05, "avg_small_model_fraction": 0.6272104221958569, "std_small_model_fraction": 0.045542905201839554}, "0.25": {"avg_accuracy": 0.6348871527777777, "std_accuracy": 0.02873301784115658, "avg_cost": 0.00020248409201388884, "std_cost": 3.210851465247836e-05, "avg_small_model_fraction": 0.709180331464419, "std_small_model_fraction": 0.054674772655106146}}}