{"subject": "TruthfulQA", "method": "standard", "iterations": 30, "random_seed": 42, "calibration_size": 400, "total_examples": 684, "small_model": {"avg_accuracy": 0.556335282651072, "std_accuracy": 0.012042461547327405, "avg_cost": 3.203621832358677e-05, "std_cost": 4.1788899969096624e-08, "x_position": 1.0, "x_std": 0.0}, "large_model": {"avg_accuracy": 0.8313840155945417, "std_accuracy": 0.009469752671727963, "avg_cost": 0.0006146627680311897, "std_cost": 1.1901210604054217e-06, "x_position": 0.0, "x_std": 0.0}, "hybrid_models": {"0.05": {"avg_accuracy": 0.7980046948356808, "std_accuracy": 0.02711470837656699, "avg_large_model_usage": 0.7752347417840377, "std_large_model_usage": 0.05011132331490825, "avg_lambda": 0.866333333333337, "std_lambda": 0.04012342070273758, "avg_cost": 0.0005080243192488261, "std_cost": 3.204357926047236e-05}, "0.1": {"avg_accuracy": 0.7509389671361503, "std_accuracy": 0.03496313194725045, "avg_large_model_usage": 0.6036384976525824, "std_large_model_usage": 0.05019866744739142, "avg_lambda": 0.7093333333333313, "std_lambda": 0.04040901989517757, "avg_cost": 0.0003941323004694835, "std_cost": 2.9686936095276506e-05}, "0.15": {"avg_accuracy": 0.7028169014084507, "std_accuracy": 0.030785201400555486, "avg_large_model_usage": 0.4444835680751174, "std_large_model_usage": 0.025580662127030286, "avg_lambda": 0.5416666666666665, "std_lambda": 0.047193455290137444, "avg_cost": 0.0002969675117370891, "std_cost": 1.6070091982805298e-05}, "0.2": {"avg_accuracy": 0.6562206572769953, "std_accuracy": 0.03625772842087319, "avg_large_model_usage": 0.3363849765258216, "std_large_model_usage": 0.0334134130329858, "avg_lambda": 0.4013333333333346, "std_lambda": 0.03575223380744501, "avg_cost": 0.00022773417840375577, "std_cost": 2.0490719309474145e-05}, "0.25": {"avg_accuracy": 0.6096244131455398, "std_accuracy": 0.03233221908313068, "avg_large_model_usage": 0.2748826291079812, "std_large_model_usage": 0.024847171121876514, "avg_lambda": 0.2893333333333338, "std_lambda": 0.03119116684077217, "avg_cost": 0.00018467619718309854, "std_cost": 1.3896282840994987e-05}}, "all_trials": [{"trial": 1, "trial_seed": 42, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5350877192982456, "large_model_accuracy": 0.8128654970760234, "small_model_cost": 3.2059795321637445e-05, "large_model_cost": 0.000615394736842106, "hybrid_results": {"0.05": {"accuracy": 0.7922535211267606, "large_model_usage": 0.7992957746478874, "small_model_usage": 0.20070422535211263, "avg_lambda": 0.870000000000003, "avg_cost": 0.0005135605633802813, "unrestricted": {"accuracy": 0.8133802816901409, "large_model_usage": 0.7992957746478874, "avg_cost": 0.0005251169014084504}, "random_baseline": {"accuracy": 0.7834507042253521, "accuracy_std": 0.011704767703235692, "avg_cost": 0.0005133415492957747, "cost_std": 1.572054338133769e-05, "small_model_fraction": 0.1745723875144405}, "unrestricted_random_baseline": {"accuracy": 0.7922535211267606, "accuracy_std": 0.0060987704491862246, "avg_cost": 0.0005286793309859153, "cost_std": 1.1724841153323516e-05, "small_model_fraction": 0.15476157693956313}}, "0.1": {"accuracy": 0.7711267605633803, "large_model_usage": 0.5950704225352113, "small_model_usage": 0.40492957746478875, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0003965535211267604, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.5950704225352113, "avg_cost": 0.0004001309859154928}, "random_baseline": {"accuracy": 0.7355633802816901, "accuracy_std": 0.01273947360220107, "avg_cost": 0.00039385137323943667, "cost_std": 1.508991162455852e-05, "small_model_fraction": 0.37515533553490504}, "unrestricted_random_baseline": {"accuracy": 0.7475352112676056, "accuracy_std": 0.016062668630925394, "avg_cost": 0.0004036594014084508, "cost_std": 1.1930229617268017e-05, "small_model_fraction": 0.3690225556616341}}, "0.15": {"accuracy": 0.7429577464788732, "large_model_usage": 0.4647887323943662, "small_model_usage": 0.5352112676056338, "avg_lambda": 0.6500000000000034, "avg_cost": 0.00031302535211267596, "unrestricted": {"accuracy": 0.7570422535211268, "large_model_usage": 0.4647887323943662, "avg_cost": 0.0003198140845070421}, "random_baseline": {"accuracy": 0.7056338028169014, "accuracy_std": 0.016074242550018797, "avg_cost": 0.0003093927464788733, "cost_std": 1.534006366409889e-05, "small_model_fraction": 0.5183460876547205}, "unrestricted_random_baseline": {"accuracy": 0.7130281690140845, "accuracy_std": 0.009990324619092356, "avg_cost": 0.00032093672535211286, "cost_std": 1.2029830667097621e-05, "small_model_fraction": 0.5067082927770962}}, "0.2": {"accuracy": 0.7147887323943662, "large_model_usage": 0.3873239436619718, "small_model_usage": 0.6126760563380282, "avg_lambda": 0.5, "avg_cost": 0.0002594760563380282, "unrestricted": {"accuracy": 0.7359154929577465, "large_model_usage": 0.3873239436619718, "avg_cost": 0.00027163802816901393}, "random_baseline": {"accuracy": 0.6746478873239437, "accuracy_std": 0.013564338228350842, "avg_cost": 0.0002471068309859154, "cost_std": 1.1363250650197759e-05, "small_model_fraction": 0.6101446273325786}, "unrestricted_random_baseline": {"accuracy": 0.6746478873239437, "accuracy_std": 0.015283474939584983, "avg_cost": 0.00026723327464788735, "cost_std": 1.587082049226973e-05, "small_model_fraction": 0.5892955902437228}}, "0.25": {"accuracy": 0.6654929577464789, "large_model_usage": 0.2535211267605634, "small_model_usage": 0.7464788732394366, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00017956760563380274, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.2535211267605634, "avg_cost": 0.00018873661971830982}, "random_baseline": {"accuracy": 0.6507042253521127, "accuracy_std": 0.009018484841454705, "avg_cost": 0.00017625147887323934, "cost_std": 1.1765711119106478e-05, "small_model_fraction": 0.7471301651711714}, "unrestricted_random_baseline": {"accuracy": 0.6478873239436621, "accuracy_std": 0.015348235716692516, "avg_cost": 0.00018948447183098578, "cost_std": 1.6683671844212414e-05, "small_model_fraction": 0.7314118986455832}}}}, {"trial": 2, "trial_seed": 43, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5555555555555556, "large_model_accuracy": 0.8260233918128655, "small_model_cost": 3.205482456140351e-05, "large_model_cost": 0.0006160906432748544, "hybrid_results": {"0.05": {"accuracy": 0.7676056338028169, "large_model_usage": 0.8169014084507042, "small_model_usage": 0.18309859154929575, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005383820422535209, "unrestricted": {"accuracy": 0.7676056338028169, "large_model_usage": 0.8169014084507042, "avg_cost": 0.0005392552816901405}, "random_baseline": {"accuracy": 0.7735915492957746, "accuracy_std": 0.014263823310231759, "avg_cost": 0.000549854366197183, "cost_std": 9.227782317797831e-06, "small_model_fraction": 0.1330545122943224}, "unrestricted_random_baseline": {"accuracy": 0.7626760563380282, "accuracy_std": 0.009342605043254635, "avg_cost": 0.0005416085211267605, "cost_std": 1.4828565964809484e-05, "small_model_fraction": 0.1315593309909852}}, "0.1": {"accuracy": 0.721830985915493, "large_model_usage": 0.6056338028169014, "small_model_usage": 0.3943661971830986, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0004049242957746478, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.6056338028169014, "avg_cost": 0.0004085510563380281}, "random_baseline": {"accuracy": 0.7035211267605633, "accuracy_std": 0.014755159746307742, "avg_cost": 0.0004055917253521126, "cost_std": 1.0398462080729351e-05, "small_model_fraction": 0.36156403551648}, "unrestricted_random_baseline": {"accuracy": 0.7007042253521126, "accuracy_std": 0.01673924552676722, "avg_cost": 0.0004107480281690141, "cost_std": 1.7300886385192845e-05, "small_model_fraction": 0.35535420994213496}}, "0.15": {"accuracy": 0.6725352112676056, "large_model_usage": 0.43309859154929575, "small_model_usage": 0.5669014084507042, "avg_lambda": 0.5999999999999969, "avg_cost": 0.0002905158450704226, "unrestricted": {"accuracy": 0.6971830985915493, "large_model_usage": 0.43309859154929575, "avg_cost": 0.00030099471830985914}, "random_baseline": {"accuracy": 0.6524647887323944, "accuracy_std": 0.009188724190633712, "avg_cost": 0.00028778908450704214, "cost_std": 1.921969810194567e-05, "small_model_fraction": 0.5574569020811557}, "unrestricted_random_baseline": {"accuracy": 0.6577464788732394, "accuracy_std": 0.014066890391153627, "avg_cost": 0.0003167222887323943, "cost_std": 1.5102977819828607e-05, "small_model_fraction": 0.5395147264411068}}, "0.2": {"accuracy": 0.6443661971830986, "large_model_usage": 0.31690140845070425, "small_model_usage": 0.6830985915492958, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00021803697183098592, "unrestricted": {"accuracy": 0.6654929577464789, "large_model_usage": 0.31690140845070425, "avg_cost": 0.00022919894366197187}, "random_baseline": {"accuracy": 0.6098591549295773, "accuracy_std": 0.013156015276246082, "avg_cost": 0.00021884961267605623, "cost_std": 1.0078965661763085e-05, "small_model_fraction": 0.6815569502581622}, "unrestricted_random_baseline": {"accuracy": 0.6359154929577465, "accuracy_std": 0.016605388905002266, "avg_cost": 0.000234149542253521, "cost_std": 1.931792909252889e-05, "small_model_fraction": 0.6624451569856636}}, "0.25": {"accuracy": 0.5880281690140845, "large_model_usage": 0.24647887323943662, "small_model_usage": 0.7535211267605634, "avg_lambda": 0.2700000000000005, "avg_cost": 0.00016824119718309848, "unrestricted": {"accuracy": 0.6408450704225352, "large_model_usage": 0.24647887323943662, "avg_cost": 0.0001852623239436619}, "random_baseline": {"accuracy": 0.5950704225352113, "accuracy_std": 0.014172261829224, "avg_cost": 0.0001730623239436619, "cost_std": 9.295319349712105e-06, "small_model_fraction": 0.7668184582896055}, "unrestricted_random_baseline": {"accuracy": 0.6042253521126759, "accuracy_std": 0.009984117520251976, "avg_cost": 0.0001879010211267605, "cost_std": 1.4223143525777286e-05, "small_model_fraction": 0.7376744807882623}}}}, {"trial": 3, "trial_seed": 44, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5760233918128655, "large_model_accuracy": 0.8362573099415205, "small_model_cost": 3.208538011695908e-05, "large_model_cost": 0.0006144093567251462, "hybrid_results": {"0.05": {"accuracy": 0.8204225352112676, "large_model_usage": 0.8028169014084507, "small_model_usage": 0.19718309859154926, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005267799295774644, "unrestricted": {"accuracy": 0.823943661971831, "large_model_usage": 0.8028169014084507, "avg_cost": 0.0005280757042253517}, "random_baseline": {"accuracy": 0.782394366197183, "accuracy_std": 0.012676056338028202, "avg_cost": 0.000522896936619718, "cost_std": 1.0567271401938057e-05, "small_model_fraction": 0.15048225844673177}, "unrestricted_random_baseline": {"accuracy": 0.7873239436619718, "accuracy_std": 0.008212608302599043, "avg_cost": 0.0005278647535211263, "cost_std": 9.456832945469194e-06, "small_model_fraction": 0.14825708019555503}}, "0.1": {"accuracy": 0.75, "large_model_usage": 0.6161971830985915, "small_model_usage": 0.3838028169014085, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00040104049295774637, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.6161971830985915, "avg_cost": 0.0004152728873239436}, "random_baseline": {"accuracy": 0.7345070422535211, "accuracy_std": 0.009859154929577452, "avg_cost": 0.0003980781690140842, "cost_std": 1.3927448995552051e-05, "small_model_fraction": 0.3664092023313058}, "unrestricted_random_baseline": {"accuracy": 0.7394366197183098, "accuracy_std": 0.013361736592260741, "avg_cost": 0.0004239042957746477, "cost_std": 1.7649899214114427e-05, "small_model_fraction": 0.3419685216485432}}, "0.15": {"accuracy": 0.704225352112676, "large_model_usage": 0.4788732394366197, "small_model_usage": 0.5211267605633803, "avg_lambda": 0.5500000000000005, "avg_cost": 0.0003179982394366198, "unrestricted": {"accuracy": 0.7359154929577465, "large_model_usage": 0.4788732394366197, "avg_cost": 0.0003302517605633801}, "random_baseline": {"accuracy": 0.704225352112676, "accuracy_std": 0.024496517085811402, "avg_cost": 0.00032898521126760547, "cost_std": 1.7012511693783637e-05, "small_model_fraction": 0.5090141041675924}, "unrestricted_random_baseline": {"accuracy": 0.7007042253521127, "accuracy_std": 0.02864907831944788, "avg_cost": 0.000331395598591549, "cost_std": 1.461076535722184e-05, "small_model_fraction": 0.4879716576618992}}, "0.2": {"accuracy": 0.6725352112676056, "large_model_usage": 0.34507042253521125, "small_model_usage": 0.6549295774647887, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002329771126760561, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.34507042253521125, "avg_cost": 0.00024743485915492954}, "random_baseline": {"accuracy": 0.642605633802817, "accuracy_std": 0.015528927514929548, "avg_cost": 0.0002253021478873239, "cost_std": 1.1396875133396806e-05, "small_model_fraction": 0.6550172401809489}, "unrestricted_random_baseline": {"accuracy": 0.6626760563380282, "accuracy_std": 0.016197183098591528, "avg_cost": 0.0002574478873239436, "cost_std": 1.397927887799724e-05, "small_model_fraction": 0.6301895719762421}}, "0.25": {"accuracy": 0.6338028169014085, "large_model_usage": 0.2746478873239437, "small_model_usage": 0.7253521126760563, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00018878697183098583, "unrestricted": {"accuracy": 0.6866197183098591, "large_model_usage": 0.2746478873239437, "avg_cost": 0.00020379401408450695}, "random_baseline": {"accuracy": 0.6295774647887324, "accuracy_std": 0.013156015276246082, "avg_cost": 0.00018293088028169006, "cost_std": 1.5226791074014027e-05, "small_model_fraction": 0.7309030745621138}, "unrestricted_random_baseline": {"accuracy": 0.652112676056338, "accuracy_std": 0.018883222081397064, "avg_cost": 0.0002185759154929577, "cost_std": 1.4982070211484821e-05, "small_model_fraction": 0.7051321242726695}}}}, {"trial": 4, "trial_seed": 45, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.533625730994152, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.2054824561403526e-05, "large_model_cost": 0.0006148274853801175, "hybrid_results": {"0.05": {"accuracy": 0.7464788732394366, "large_model_usage": 0.7112676056338029, "small_model_usage": 0.28873239436619713, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004684260563380277, "unrestricted": {"accuracy": 0.7605633802816901, "large_model_usage": 0.7112676056338029, "avg_cost": 0.0004742852112676051}, "random_baseline": {"accuracy": 0.7200704225352113, "accuracy_std": 0.01401833044886929, "avg_cost": 0.00046810982394366213, "cost_std": 1.3257352492909592e-05, "small_model_fraction": 0.25121533470086993}, "unrestricted_random_baseline": {"accuracy": 0.7355633802816902, "accuracy_std": 0.014470931400024289, "avg_cost": 0.0004910786971830987, "cost_std": 1.4196799096928274e-05, "small_model_fraction": 0.2411614057445148}}, "0.1": {"accuracy": 0.6549295774647887, "large_model_usage": 0.4964788732394366, "small_model_usage": 0.5035211267605634, "avg_lambda": 0.5999999999999969, "avg_cost": 0.00032970774647887323, "unrestricted": {"accuracy": 0.6901408450704225, "large_model_usage": 0.4964788732394366, "avg_cost": 0.00034156690140845056}, "random_baseline": {"accuracy": 0.652112676056338, "accuracy_std": 0.011112488618351745, "avg_cost": 0.0003309788028169014, "cost_std": 9.452079266742762e-06, "small_model_fraction": 0.48924693636227023}, "unrestricted_random_baseline": {"accuracy": 0.6499999999999999, "accuracy_std": 0.027055454524364145, "avg_cost": 0.00033671137323943665, "cost_std": 2.5637700179195556e-05, "small_model_fraction": 0.46889739746503223}}, "0.15": {"accuracy": 0.6267605633802817, "large_model_usage": 0.43309859154929575, "small_model_usage": 0.5669014084507042, "avg_lambda": 0.5, "avg_cost": 0.0002880669014084506, "unrestricted": {"accuracy": 0.6690140845070423, "large_model_usage": 0.43309859154929575, "avg_cost": 0.00030281338028169}, "random_baseline": {"accuracy": 0.6193661971830986, "accuracy_std": 0.018122689059799697, "avg_cost": 0.0002828635563380282, "cost_std": 2.105011858560464e-05, "small_model_fraction": 0.5606999194379058}, "unrestricted_random_baseline": {"accuracy": 0.6200704225352113, "accuracy_std": 0.022460814695517004, "avg_cost": 0.0002942625704225352, "cost_std": 1.3344034294990075e-05, "small_model_fraction": 0.5353959203578482}}, "0.2": {"accuracy": 0.5809859154929577, "large_model_usage": 0.29577464788732394, "small_model_usage": 0.704225352112676, "avg_lambda": 0.3700000000000011, "avg_cost": 0.00020301760563380276, "unrestricted": {"accuracy": 0.6161971830985915, "large_model_usage": 0.29577464788732394, "avg_cost": 0.00021710915492957757}, "random_baseline": {"accuracy": 0.5806338028169014, "accuracy_std": 0.01140432023993072, "avg_cost": 0.0002055824647887324, "cost_std": 7.062198998997996e-06, "small_model_fraction": 0.7066389819449999}, "unrestricted_random_baseline": {"accuracy": 0.5838028169014085, "accuracy_std": 0.015652190683727352, "avg_cost": 0.00021542809859154922, "cost_std": 1.4348966408133677e-05, "small_model_fraction": 0.6824587994429961}}, "0.25": {"accuracy": 0.5422535211267606, "large_model_usage": 0.2746478873239437, "small_model_usage": 0.7253521126760563, "avg_lambda": 0.29999999999999843, "avg_cost": 0.00018438380281690138, "unrestricted": {"accuracy": 0.6056338028169014, "large_model_usage": 0.2746478873239437, "avg_cost": 0.00020316549295774661}, "random_baseline": {"accuracy": 0.5535211267605634, "accuracy_std": 0.007352328527401786, "avg_cost": 0.00017072672535211263, "cost_std": 1.2387893475775421e-05, "small_model_fraction": 0.7386133761980238}, "unrestricted_random_baseline": {"accuracy": 0.5788732394366197, "accuracy_std": 0.022992715094809133, "avg_cost": 0.00020693922535211265, "cost_std": 1.597298549355337e-05, "small_model_fraction": 0.7063852169112453}}}}, {"trial": 5, "trial_seed": 46, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5716374269005848, "large_model_accuracy": 0.8362573099415205, "small_model_cost": 3.204400584795328e-05, "large_model_cost": 0.0006141900584795328, "hybrid_results": {"0.05": {"accuracy": 0.8098591549295775, "large_model_usage": 0.6936619718309859, "small_model_usage": 0.3063380281690141, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004558711267605633, "unrestricted": {"accuracy": 0.8133802816901409, "large_model_usage": 0.6936619718309859, "avg_cost": 0.00046091338028169006}, "random_baseline": {"accuracy": 0.7612676056338028, "accuracy_std": 0.014838948947679894, "avg_cost": 0.0004550111267605631, "cost_std": 1.2760954359468978e-05, "small_model_fraction": 0.27195740828833576}, "unrestricted_random_baseline": {"accuracy": 0.7552816901408451, "accuracy_std": 0.01384031268100952, "avg_cost": 0.00045635552816901387, "cost_std": 1.5122563308640971e-05, "small_model_fraction": 0.26329591604195984}}, "0.1": {"accuracy": 0.75, "large_model_usage": 0.4859154929577465, "small_model_usage": 0.5140845070422535, "avg_lambda": 0.6500000000000034, "avg_cost": 0.0003273922535211267, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.4859154929577465, "avg_cost": 0.0003329204225352113}, "random_baseline": {"accuracy": 0.7052816901408451, "accuracy_std": 0.015271301719860195, "avg_cost": 0.0003184916901408451, "cost_std": 1.0952411683831315e-05, "small_model_fraction": 0.4926561017839808}, "unrestricted_random_baseline": {"accuracy": 0.7133802816901409, "accuracy_std": 0.017120064480483425, "avg_cost": 0.00033650214788732383, "cost_std": 1.298747041694551e-05, "small_model_fraction": 0.4831599126591132}}, "0.15": {"accuracy": 0.704225352112676, "large_model_usage": 0.4225352112676056, "small_model_usage": 0.5774647887323944, "avg_lambda": 0.5, "avg_cost": 0.0002825964788732394, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.4225352112676056, "avg_cost": 0.0002928922535211266}, "random_baseline": {"accuracy": 0.6869718309859155, "accuracy_std": 0.014641283722740699, "avg_cost": 0.00028520838028169005, "cost_std": 1.6947491466015666e-05, "small_model_fraction": 0.5696054763359321}, "unrestricted_random_baseline": {"accuracy": 0.6809859154929577, "accuracy_std": 0.013564338228350844, "avg_cost": 0.00028529295774647884, "cost_std": 1.4437107823249712e-05, "small_model_fraction": 0.5519195801568797}}, "0.2": {"accuracy": 0.647887323943662, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00019608239436619718, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00020967394366197182}, "random_baseline": {"accuracy": 0.6426056338028169, "accuracy_std": 0.008942552886760784, "avg_cost": 0.00019915591549295777, "cost_std": 1.2177923599481255e-05, "small_model_fraction": 0.7182178118760547}, "unrestricted_random_baseline": {"accuracy": 0.648943661971831, "accuracy_std": 0.011789203551255763, "avg_cost": 0.00020760147887323948, "cost_std": 1.2647175785178354e-05, "small_model_fraction": 0.6948704933907118}}, "0.25": {"accuracy": 0.5880281690140845, "large_model_usage": 0.23943661971830985, "small_model_usage": 0.7605633802816901, "avg_lambda": 0.2400000000000003, "avg_cost": 0.00015946971830985905, "unrestricted": {"accuracy": 0.6654929577464789, "large_model_usage": 0.23943661971830985, "avg_cost": 0.00017916690140845065}, "random_baseline": {"accuracy": 0.620774647887324, "accuracy_std": 0.011469364418945329, "avg_cost": 0.0001533645422535211, "cost_std": 1.514470009067171e-05, "small_model_fraction": 0.7811104071119602}, "unrestricted_random_baseline": {"accuracy": 0.6316901408450704, "accuracy_std": 0.009342605043254683, "avg_cost": 0.00018396721830985917, "cost_std": 1.5280996662276732e-05, "small_model_fraction": 0.7472749408925281}}}}, {"trial": 6, "trial_seed": 47, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5482456140350878, "large_model_accuracy": 0.8333333333333334, "small_model_cost": 3.213333333333336e-05, "large_model_cost": 0.0006180818713450298, "hybrid_results": {"0.05": {"accuracy": 0.7887323943661971, "large_model_usage": 0.7922535211267606, "small_model_usage": 0.20774647887323938, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005251028169014084, "unrestricted": {"accuracy": 0.7887323943661971, "large_model_usage": 0.7922535211267606, "avg_cost": 0.0005251028169014084}, "random_baseline": {"accuracy": 0.7598591549295775, "accuracy_std": 0.012177194694218704, "avg_cost": 0.0005278061267605631, "cost_std": 1.655161712382833e-05, "small_model_fraction": 0.15868126364668111}, "unrestricted_random_baseline": {"accuracy": 0.7700704225352112, "accuracy_std": 0.013179554417051322, "avg_cost": 0.0005316513732394363, "cost_std": 8.633011462610133e-06, "small_model_fraction": 0.15868126364668111}}, "0.1": {"accuracy": 0.7429577464788732, "large_model_usage": 0.6338028169014085, "small_model_usage": 0.3661971830985915, "avg_lambda": 0.75, "avg_cost": 0.00041191267605633755, "unrestricted": {"accuracy": 0.7535211267605634, "large_model_usage": 0.6338028169014085, "avg_cost": 0.00042568028169014067}, "random_baseline": {"accuracy": 0.704225352112676, "accuracy_std": 0.014346161117931973, "avg_cost": 0.0004132294366197181, "cost_std": 1.7607861350066013e-05, "small_model_fraction": 0.35185546496674897}, "unrestricted_random_baseline": {"accuracy": 0.7098591549295774, "accuracy_std": 0.018777875529539995, "avg_cost": 0.0004272019014084505, "cost_std": 8.955814913492226e-06, "small_model_fraction": 0.32835919397933283}}, "0.15": {"accuracy": 0.6866197183098591, "large_model_usage": 0.44014084507042256, "small_model_usage": 0.5598591549295775, "avg_lambda": 0.5, "avg_cost": 0.00029342676056338, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.44014084507042256, "avg_cost": 0.000305419718309859}, "random_baseline": {"accuracy": 0.6443661971830985, "accuracy_std": 0.024293222972761045, "avg_cost": 0.0002948897535211267, "cost_std": 1.4891870089721574e-05, "small_model_fraction": 0.5540676180937397}, "unrestricted_random_baseline": {"accuracy": 0.647887323943662, "accuracy_std": 0.013727879358885847, "avg_cost": 0.0003108024295774647, "cost_std": 2.0077312655008967e-05, "small_model_fraction": 0.5336000224458782}}, "0.2": {"accuracy": 0.6514084507042254, "large_model_usage": 0.3732394366197183, "small_model_usage": 0.6267605633802817, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002468492957746477, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.3732394366197183, "avg_cost": 0.0002629267605633802}, "random_baseline": {"accuracy": 0.6278169014084508, "accuracy_std": 0.016292584931351103, "avg_cost": 0.0002528123591549294, "cost_std": 1.4916420343871846e-05, "small_model_fraction": 0.6335583272041062}, "unrestricted_random_baseline": {"accuracy": 0.6288732394366197, "accuracy_std": 0.016455382317673156, "avg_cost": 0.0002620659507042252, "cost_std": 1.4878752753558927e-05, "small_model_fraction": 0.606119970854096}}, "0.25": {"accuracy": 0.5633802816901409, "large_model_usage": 0.29577464788732394, "small_model_usage": 0.704225352112676, "avg_lambda": 0.29999999999999843, "avg_cost": 0.00019099014084507037, "unrestricted": {"accuracy": 0.6690140845070423, "large_model_usage": 0.29577464788732394, "avg_cost": 0.0002147154929577464}, "random_baseline": {"accuracy": 0.5936619718309858, "accuracy_std": 0.019169940265938306, "avg_cost": 0.0001905971126760563, "cost_std": 1.925708225409766e-05, "small_model_fraction": 0.7288894890824593}, "unrestricted_random_baseline": {"accuracy": 0.5999999999999999, "accuracy_std": 0.0115930124177143, "avg_cost": 0.0002096036971830985, "cost_std": 1.697167958866057e-05, "small_model_fraction": 0.6883989842453221}}}}, {"trial": 7, "trial_seed": 48, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.201330409356728e-05, "large_model_cost": 0.0006151432748538018, "hybrid_results": {"0.05": {"accuracy": 0.7922535211267606, "large_model_usage": 0.7992957746478874, "small_model_usage": 0.20070422535211263, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005273017605633803, "unrestricted": {"accuracy": 0.7922535211267606, "large_model_usage": 0.7992957746478874, "avg_cost": 0.0005277735915492957}, "random_baseline": {"accuracy": 0.771830985915493, "accuracy_std": 0.013061434500698174, "avg_cost": 0.0005325990140845071, "cost_std": 1.2234927838141327e-05, "small_model_fraction": 0.1506379687120203}, "unrestricted_random_baseline": {"accuracy": 0.768661971830986, "accuracy_std": 0.010685909088375703, "avg_cost": 0.0005347665845070423, "cost_std": 1.6078855485318755e-05, "small_model_fraction": 0.14982883351133716}}, "0.1": {"accuracy": 0.7323943661971831, "large_model_usage": 0.6161971830985915, "small_model_usage": 0.3838028169014085, "avg_lambda": 0.75, "avg_cost": 0.00039996373239436616, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.6161971830985915, "avg_cost": 0.0004133088028169013}, "random_baseline": {"accuracy": 0.7007042253521127, "accuracy_std": 0.01157160332757396, "avg_cost": 0.0004034511619718306, "cost_std": 1.3511082314886144e-05, "small_model_fraction": 0.36900785973820405}, "unrestricted_random_baseline": {"accuracy": 0.7056338028169014, "accuracy_std": 0.013004355853957351, "avg_cost": 0.00041362950704225323, "cost_std": 9.240975603428157e-06, "small_model_fraction": 0.3461226178681334}}, "0.15": {"accuracy": 0.6971830985915493, "large_model_usage": 0.4647887323943662, "small_model_usage": 0.5352112676056338, "avg_lambda": 0.5999999999999969, "avg_cost": 0.000314625704225352, "unrestricted": {"accuracy": 0.7077464788732394, "large_model_usage": 0.4647887323943662, "avg_cost": 0.00032134401408450683}, "random_baseline": {"accuracy": 0.6552816901408451, "accuracy_std": 0.01706566312120089, "avg_cost": 0.00032054204225352077, "cost_std": 2.316211347545104e-05, "small_model_fraction": 0.5153526412587933}, "unrestricted_random_baseline": {"accuracy": 0.6591549295774648, "accuracy_std": 0.02021507055550327, "avg_cost": 0.00031996760563380254, "cost_std": 1.7160737034907117e-05, "small_model_fraction": 0.5038315221326471}}, "0.2": {"accuracy": 0.6161971830985915, "large_model_usage": 0.352112676056338, "small_model_usage": 0.647887323943662, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00023706232394366179, "unrestricted": {"accuracy": 0.6725352112676056, "large_model_usage": 0.352112676056338, "avg_cost": 0.0002514919014084506}, "random_baseline": {"accuracy": 0.6211267605633802, "accuracy_std": 0.019930946053640725, "avg_cost": 0.00023424771126760545, "cost_std": 1.634708752083554e-05, "small_model_fraction": 0.6483648069352888}, "unrestricted_random_baseline": {"accuracy": 0.6246478873239437, "accuracy_std": 0.014013907564881993, "avg_cost": 0.00025769781690140826, "cost_std": 1.0029215112246073e-05, "small_model_fraction": 0.6236197617681251}}, "0.25": {"accuracy": 0.5950704225352113, "large_model_usage": 0.2992957746478873, "small_model_usage": 0.7007042253521127, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00019931584507042245, "unrestricted": {"accuracy": 0.6584507042253521, "large_model_usage": 0.2992957746478873, "avg_cost": 0.00021843556338028164}, "random_baseline": {"accuracy": 0.6003521126760563, "accuracy_std": 0.016155027812611897, "avg_cost": 0.00019681320422535196, "cost_std": 9.229727039260261e-06, "small_model_fraction": 0.7130956229899477}, "unrestricted_random_baseline": {"accuracy": 0.6109154929577465, "accuracy_std": 0.011810218191020018, "avg_cost": 0.00021561088028169002, "cost_std": 1.2546010933181057e-05, "small_model_fraction": 0.6803075323950969}}}}, {"trial": 8, "trial_seed": 49, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5745614035087719, "large_model_accuracy": 0.8464912280701754, "small_model_cost": 3.2026900584795304e-05, "large_model_cost": 0.0006131929824561406, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.8345070422535211, "small_model_usage": 0.16549295774647887, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005476352112676051, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.8345070422535211, "avg_cost": 0.0005476352112676051}, "random_baseline": {"accuracy": 0.7964788732394367, "accuracy_std": 0.009154929577464813, "avg_cost": 0.0005512717957746478, "cost_std": 1.2879050820843717e-05, "small_model_fraction": 0.11280384942190802}, "unrestricted_random_baseline": {"accuracy": 0.7978873239436621, "accuracy_std": 0.01022946411713659, "avg_cost": 0.0005501953169014084, "cost_std": 7.364406191412989e-06, "small_model_fraction": 0.11280384942190802}}, "0.1": {"accuracy": 0.7570422535211268, "large_model_usage": 0.6690140845070423, "small_model_usage": 0.33098591549295775, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004282056338028167, "unrestricted": {"accuracy": 0.7887323943661971, "large_model_usage": 0.6690140845070423, "avg_cost": 0.00044607887323943604}, "random_baseline": {"accuracy": 0.7390845070422535, "accuracy_std": 0.016016289317667808, "avg_cost": 0.00043220559859154917, "cost_std": 1.2054755134022896e-05, "small_model_fraction": 0.31830375932757066}, "unrestricted_random_baseline": {"accuracy": 0.7429577464788732, "accuracy_std": 0.015746957588026668, "avg_cost": 0.00045180943661971815, "cost_std": 1.4110173284760387e-05, "small_model_fraction": 0.2875496599502157}}, "0.15": {"accuracy": 0.6936619718309859, "large_model_usage": 0.4612676056338028, "small_model_usage": 0.5387323943661972, "avg_lambda": 0.5, "avg_cost": 0.00030782535211267595, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.4612676056338028, "avg_cost": 0.00031726197183098564}, "random_baseline": {"accuracy": 0.677112676056338, "accuracy_std": 0.011789203551255763, "avg_cost": 0.0003051073239436618, "cost_std": 1.5598162047627116e-05, "small_model_fraction": 0.5254395255830929}, "unrestricted_random_baseline": {"accuracy": 0.672887323943662, "accuracy_std": 0.017210351281478763, "avg_cost": 0.00030334225352112663, "cost_std": 8.297196071940053e-06, "small_model_fraction": 0.5092021366289339}}, "0.2": {"accuracy": 0.6373239436619719, "large_model_usage": 0.3380281690140845, "small_model_usage": 0.6619718309859155, "avg_lambda": 0.3900000000000005, "avg_cost": 0.00022990985915492945, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.3380281690140845, "avg_cost": 0.0002417408450704224}, "random_baseline": {"accuracy": 0.6485915492957746, "accuracy_std": 0.019718309859154938, "avg_cost": 0.00023641200704225342, "cost_std": 1.3822950589137383e-05, "small_model_fraction": 0.6595070415448985}, "unrestricted_random_baseline": {"accuracy": 0.6563380281690141, "accuracy_std": 0.014013907564881951, "avg_cost": 0.00024530172535211254, "cost_std": 9.248234215523946e-06, "small_model_fraction": 0.6391497180799821}}, "0.25": {"accuracy": 0.6056338028169014, "large_model_usage": 0.31690140845070425, "small_model_usage": 0.6830985915492958, "avg_lambda": 0.2700000000000005, "avg_cost": 0.0002065859154929578, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.31690140845070425, "avg_cost": 0.0002282549295774648}, "random_baseline": {"accuracy": 0.6176056338028169, "accuracy_std": 0.01319365774309523, "avg_cost": 0.0001995528169014084, "cost_std": 1.595446261880574e-05, "small_model_fraction": 0.6996400506614472}, "unrestricted_random_baseline": {"accuracy": 0.6443661971830987, "accuracy_std": 0.015021640146268672, "avg_cost": 0.00023035123239436607, "cost_std": 1.3787454856053268e-05, "small_model_fraction": 0.6623546433390978}}}}, {"trial": 9, "trial_seed": 50, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5526315789473685, "large_model_accuracy": 0.8230994152046783, "small_model_cost": 3.204590643274858e-05, "large_model_cost": 0.0006138538011695924, "hybrid_results": {"0.05": {"accuracy": 0.7288732394366197, "large_model_usage": 0.7112676056338029, "small_model_usage": 0.28873239436619713, "avg_lambda": 0.8300000000000048, "avg_cost": 0.00046642183098591486, "unrestricted": {"accuracy": 0.7359154929577465, "large_model_usage": 0.7112676056338029, "avg_cost": 0.0004709429577464783}, "random_baseline": {"accuracy": 0.7397887323943663, "accuracy_std": 0.013858217362080564, "avg_cost": 0.000467107464788732, "cost_std": 1.561190200216408e-05, "small_model_fraction": 0.25340317915480015}, "unrestricted_random_baseline": {"accuracy": 0.7338028169014085, "accuracy_std": 0.011157027829404842, "avg_cost": 0.0004677513380281686, "cost_std": 1.3051908724774581e-05, "small_model_fraction": 0.24563235513975587}}, "0.1": {"accuracy": 0.7112676056338029, "large_model_usage": 0.5880281690140845, "small_model_usage": 0.4119718309859155, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0003779288732394363, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.5880281690140845, "avg_cost": 0.00039594999999999984}, "random_baseline": {"accuracy": 0.696830985915493, "accuracy_std": 0.010259719918544703, "avg_cost": 0.00038750010563380247, "cost_std": 1.4794139351989645e-05, "small_model_fraction": 0.40550313954895156}, "unrestricted_random_baseline": {"accuracy": 0.7007042253521127, "accuracy_std": 0.019350185442301545, "avg_cost": 0.0003964511971830982, "cost_std": 1.309456722993544e-05, "small_model_fraction": 0.3745287802740321}}, "0.15": {"accuracy": 0.6725352112676056, "large_model_usage": 0.4014084507042254, "small_model_usage": 0.5985915492957746, "avg_lambda": 0.540000000000001, "avg_cost": 0.0002692950704225354, "unrestricted": {"accuracy": 0.7077464788732394, "large_model_usage": 0.4014084507042254, "avg_cost": 0.00028009084507042264}, "random_baseline": {"accuracy": 0.6380281690140845, "accuracy_std": 0.015731202749076544, "avg_cost": 0.0002697220774647886, "cost_std": 1.4634657236279151e-05, "small_model_fraction": 0.592220789480527}, "unrestricted_random_baseline": {"accuracy": 0.6528169014084507, "accuracy_std": 0.015444867746099523, "avg_cost": 0.0002796913028169013, "cost_std": 1.8297214482731148e-05, "small_model_fraction": 0.5736652237249776}}, "0.2": {"accuracy": 0.647887323943662, "large_model_usage": 0.2992957746478873, "small_model_usage": 0.7007042253521127, "avg_lambda": 0.4099999999999976, "avg_cost": 0.0002073161971830987, "unrestricted": {"accuracy": 0.6830985915492958, "large_model_usage": 0.2992957746478873, "avg_cost": 0.00021754859154929593}, "random_baseline": {"accuracy": 0.6161971830985915, "accuracy_std": 0.016739245526767216, "avg_cost": 0.0002036345422535211, "cost_std": 2.2198666094385354e-05, "small_model_fraction": 0.6987488613752376}, "unrestricted_random_baseline": {"accuracy": 0.6264084507042254, "accuracy_std": 0.01975913781211016, "avg_cost": 0.00022307552816901413, "cost_std": 1.0104814193894258e-05, "small_model_fraction": 0.6811616225997559}}, "0.25": {"accuracy": 0.6161971830985915, "large_model_usage": 0.25, "small_model_usage": 0.75, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0001743654929577465, "unrestricted": {"accuracy": 0.6654929577464789, "large_model_usage": 0.25, "avg_cost": 0.00018721760563380292}, "random_baseline": {"accuracy": 0.6144366197183099, "accuracy_std": 0.014539311846522967, "avg_cost": 0.00017228866197183107, "cost_std": 1.2532662302955466e-05, "small_model_fraction": 0.755383885621954}, "unrestricted_random_baseline": {"accuracy": 0.6126760563380281, "accuracy_std": 0.018961847912445446, "avg_cost": 0.00018587017605633805, "cost_std": 2.117717284363348e-05, "small_model_fraction": 0.7332939263891569}}}}, {"trial": 10, "trial_seed": 51, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5716374269005848, "large_model_accuracy": 0.8362573099415205, "small_model_cost": 3.198552631578946e-05, "large_model_cost": 0.0006132543859649138, "hybrid_results": {"0.05": {"accuracy": 0.8415492957746479, "large_model_usage": 0.8028169014084507, "small_model_usage": 0.19718309859154926, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005260309859154925, "unrestricted": {"accuracy": 0.8415492957746479, "large_model_usage": 0.8028169014084507, "avg_cost": 0.0005260309859154925}, "random_baseline": {"accuracy": 0.813380281690141, "accuracy_std": 0.006863939679442923, "avg_cost": 0.0005268667957746477, "cost_std": 1.5282650519937522e-05, "small_model_fraction": 0.1500568946736156}, "unrestricted_random_baseline": {"accuracy": 0.8098591549295774, "accuracy_std": 0.01399620205236042, "avg_cost": 0.0005249398239436615, "cost_std": 1.1388305356606354e-05, "small_model_fraction": 0.1500568946736156}}, "0.1": {"accuracy": 0.8098591549295775, "large_model_usage": 0.6443661971830986, "small_model_usage": 0.3556338028169014, "avg_lambda": 0.75, "avg_cost": 0.00041578450704225344, "unrestricted": {"accuracy": 0.8309859154929577, "large_model_usage": 0.6443661971830986, "avg_cost": 0.00042931267605633797}, "random_baseline": {"accuracy": 0.7588028169014084, "accuracy_std": 0.0181773373227328, "avg_cost": 0.00041541623239436595, "cost_std": 1.3832497858693627e-05, "small_model_fraction": 0.33972210216432475}, "unrestricted_random_baseline": {"accuracy": 0.7711267605633803, "accuracy_std": 0.019856157999819286, "avg_cost": 0.00042385031690140825, "cost_std": 1.3887787238588457e-05, "small_model_fraction": 0.31644858804170234}}, "0.15": {"accuracy": 0.7535211267605634, "large_model_usage": 0.44014084507042256, "small_model_usage": 0.5598591549295775, "avg_lambda": 0.540000000000001, "avg_cost": 0.0002956366197183097, "unrestricted": {"accuracy": 0.7676056338028169, "large_model_usage": 0.44014084507042256, "avg_cost": 0.0003054676056338025}, "random_baseline": {"accuracy": 0.7102112676056337, "accuracy_std": 0.015985295018050967, "avg_cost": 0.00029591035211267585, "cost_std": 1.123923463870363e-05, "small_model_fraction": 0.5464214381591508}, "unrestricted_random_baseline": {"accuracy": 0.7038732394366197, "accuracy_std": 0.01609351391834711, "avg_cost": 0.00030668197183098575, "cost_std": 1.90559878900148e-05, "small_model_fraction": 0.5295084627738409}}, "0.2": {"accuracy": 0.7253521126760564, "large_model_usage": 0.3485915492957746, "small_model_usage": 0.6514084507042254, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00023429154929577453, "unrestricted": {"accuracy": 0.7570422535211268, "large_model_usage": 0.3485915492957746, "avg_cost": 0.000247432394366197}, "random_baseline": {"accuracy": 0.6869718309859154, "accuracy_std": 0.017777331776569493, "avg_cost": 0.00023694742957746468, "cost_std": 2.21672733777167e-05, "small_model_fraction": 0.651957919951011}, "unrestricted_random_baseline": {"accuracy": 0.6855633802816901, "accuracy_std": 0.01308514378595917, "avg_cost": 0.0002458308098591548, "cost_std": 1.846326657146129e-05, "small_model_fraction": 0.6293507479818214}}, "0.25": {"accuracy": 0.6830985915492958, "large_model_usage": 0.30985915492957744, "small_model_usage": 0.6901408450704225, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0002053338028169014, "unrestricted": {"accuracy": 0.7359154929577465, "large_model_usage": 0.30985915492957744, "avg_cost": 0.00022338309859154935}, "random_baseline": {"accuracy": 0.670774647887324, "accuracy_std": 0.017623230642594623, "avg_cost": 0.00020591165492957745, "cost_std": 1.5828528843307953e-05, "small_model_fraction": 0.7017760824040161}, "unrestricted_random_baseline": {"accuracy": 0.6753521126760563, "accuracy_std": 0.01133343446438807, "avg_cost": 0.0002264426056338027, "cost_std": 1.7390227860629976e-05, "small_model_fraction": 0.6707245380540519}}}}, {"trial": 11, "trial_seed": 52, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5511695906432749, "large_model_accuracy": 0.8362573099415205, "small_model_cost": 3.205467836257314e-05, "large_model_cost": 0.0006146374269005851, "hybrid_results": {"0.05": {"accuracy": 0.8415492957746479, "large_model_usage": 0.8098591549295775, "small_model_usage": 0.1901408450704225, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005340795774647886, "unrestricted": {"accuracy": 0.8380281690140845, "large_model_usage": 0.8098591549295775, "avg_cost": 0.0005350232394366197}, "random_baseline": {"accuracy": 0.8183098591549296, "accuracy_std": 0.00973258800076425, "avg_cost": 0.0005401327112676052, "cost_std": 9.593662463504208e-06, "small_model_fraction": 0.13827709392005855}, "unrestricted_random_baseline": {"accuracy": 0.816549295774648, "accuracy_std": 0.014893159142645473, "avg_cost": 0.0005374063028169009, "cost_std": 1.3256824904648918e-05, "small_model_fraction": 0.13665730347106364}}, "0.1": {"accuracy": 0.75, "large_model_usage": 0.6056338028169014, "small_model_usage": 0.3943661971830986, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00038812183098591556, "unrestricted": {"accuracy": 0.7922535211267606, "large_model_usage": 0.6056338028169014, "avg_cost": 0.00040623450704225345}, "random_baseline": {"accuracy": 0.7390845070422535, "accuracy_std": 0.014893159142645473, "avg_cost": 0.00038716619718309867, "cost_std": 2.0052652239366433e-05, "small_model_fraction": 0.3888127420235308}, "unrestricted_random_baseline": {"accuracy": 0.7369718309859155, "accuracy_std": 0.011360750566832152, "avg_cost": 0.0003995847535211267, "cost_std": 1.876176216143229e-05, "small_model_fraction": 0.35772243579357194}}, "0.15": {"accuracy": 0.7183098591549296, "large_model_usage": 0.4647887323943662, "small_model_usage": 0.5352112676056338, "avg_lambda": 0.5999999999999969, "avg_cost": 0.0003095091549295775, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.4647887323943662, "avg_cost": 0.00031974154929577484}, "random_baseline": {"accuracy": 0.6883802816901408, "accuracy_std": 0.016231592354388886, "avg_cost": 0.000311055457746479, "cost_std": 1.4768722443825102e-05, "small_model_fraction": 0.5237509568155344}, "unrestricted_random_baseline": {"accuracy": 0.6985915492957746, "accuracy_std": 0.026027772693769487, "avg_cost": 0.0003169536971830986, "cost_std": 1.997876422151121e-05, "small_model_fraction": 0.5061871096335238}}, "0.2": {"accuracy": 0.6373239436619719, "large_model_usage": 0.3485915492957746, "small_model_usage": 0.6514084507042254, "avg_lambda": 0.4800000000000006, "avg_cost": 0.00023534014084507067, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.3485915492957746, "avg_cost": 0.00024812887323943684}, "random_baseline": {"accuracy": 0.6507042253521128, "accuracy_std": 0.011971830985915477, "avg_cost": 0.00023346468309859165, "cost_std": 1.530156092329792e-05, "small_model_fraction": 0.6510616509111518}, "unrestricted_random_baseline": {"accuracy": 0.6725352112676056, "accuracy_std": 0.01774591995542836, "avg_cost": 0.0002574848591549297, "cost_std": 2.0102557271215317e-05, "small_model_fraction": 0.6291098639307452}}, "0.25": {"accuracy": 0.6161971830985915, "large_model_usage": 0.2605633802816901, "small_model_usage": 0.7394366197183099, "avg_lambda": 0.3299999999999986, "avg_cost": 0.0001782063380281691, "unrestricted": {"accuracy": 0.6690140845070423, "large_model_usage": 0.2605633802816901, "avg_cost": 0.00019305845070422548}, "random_baseline": {"accuracy": 0.6211267605633803, "accuracy_std": 0.014535047493345303, "avg_cost": 0.00017792000000000003, "cost_std": 1.4128428573958653e-05, "small_model_fraction": 0.7491315010058868}, "unrestricted_random_baseline": {"accuracy": 0.6278169014084507, "accuracy_std": 0.01460737219203632, "avg_cost": 0.000192528309859155, "cost_std": 1.289992116577143e-05, "small_model_fraction": 0.7236379334168573}}}}, {"trial": 12, "trial_seed": 53, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5526315789473685, "large_model_accuracy": 0.8435672514619883, "small_model_cost": 3.209298245614036e-05, "large_model_cost": 0.0006161286549707615, "hybrid_results": {"0.05": {"accuracy": 0.8028169014084507, "large_model_usage": 0.6901408450704225, "small_model_usage": 0.3098591549295775, "avg_lambda": 0.8100000000000018, "avg_cost": 0.00045846795774647853, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.6901408450704225, "avg_cost": 0.0004618411971830983}, "random_baseline": {"accuracy": 0.7746478873239437, "accuracy_std": 0.015746957588026692, "avg_cost": 0.0004636305985915491, "cost_std": 1.635090112734794e-05, "small_model_fraction": 0.2699504578983333}, "unrestricted_random_baseline": {"accuracy": 0.7711267605633803, "accuracy_std": 0.015588692690970865, "avg_cost": 0.0004635942605633801, "cost_std": 1.3770343320480701e-05, "small_model_fraction": 0.26417471577269225}}, "0.1": {"accuracy": 0.7711267605633803, "large_model_usage": 0.5950704225352113, "small_model_usage": 0.40492957746478875, "avg_lambda": 0.7100000000000012, "avg_cost": 0.0003916721830985913, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.5950704225352113, "avg_cost": 0.00040259471830985896}, "random_baseline": {"accuracy": 0.7323943661971831, "accuracy_std": 0.014771955608030286, "avg_cost": 0.00038680690140845055, "cost_std": 2.277049111518807e-05, "small_model_fraction": 0.3843197983194272}, "unrestricted_random_baseline": {"accuracy": 0.7387323943661972, "accuracy_std": 0.012773490948744387, "avg_cost": 0.00041085102112676044, "cost_std": 1.4475624983681015e-05, "small_model_fraction": 0.3656179694324353}}, "0.15": {"accuracy": 0.7253521126760564, "large_model_usage": 0.4894366197183099, "small_model_usage": 0.5105633802816901, "avg_lambda": 0.5999999999999969, "avg_cost": 0.00032501021126760555, "unrestricted": {"accuracy": 0.7535211267605634, "large_model_usage": 0.4894366197183099, "avg_cost": 0.00033735528169014073}, "random_baseline": {"accuracy": 0.6982394366197183, "accuracy_std": 0.019417346026707395, "avg_cost": 0.00032449869718309847, "cost_std": 1.382229182392954e-05, "small_model_fraction": 0.4984600383221761}, "unrestricted_random_baseline": {"accuracy": 0.6989436619718309, "accuracy_std": 0.00934923806151534, "avg_cost": 0.00033847320422535196, "cost_std": 1.1960875312769605e-05, "small_model_fraction": 0.4773225102506761}}, "0.2": {"accuracy": 0.6619718309859155, "large_model_usage": 0.3626760563380282, "small_model_usage": 0.6373239436619718, "avg_lambda": 0.4699999999999991, "avg_cost": 0.00024760880281690146, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.3626760563380282, "avg_cost": 0.00025866514084507046}, "random_baseline": {"accuracy": 0.6609154929577465, "accuracy_std": 0.015271301719860148, "avg_cost": 0.00024006728873239444, "cost_std": 1.2847495796677123e-05, "small_model_fraction": 0.6309886013762871}, "unrestricted_random_baseline": {"accuracy": 0.6647887323943662, "accuracy_std": 0.010656863345367279, "avg_cost": 0.00024820035211267605, "cost_std": 8.700785648355597e-06, "small_model_fraction": 0.6120576720709505}}, "0.25": {"accuracy": 0.6091549295774648, "large_model_usage": 0.2640845070422535, "small_model_usage": 0.7359154929577465, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00018062288732394364, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2640845070422535, "avg_cost": 0.0001966580985915493}, "random_baseline": {"accuracy": 0.6288732394366197, "accuracy_std": 0.009604353307736554, "avg_cost": 0.00018238588028169017, "cost_std": 1.5536835141600394e-05, "small_model_fraction": 0.745683505549766}, "unrestricted_random_baseline": {"accuracy": 0.6373239436619718, "accuracy_std": 0.013174849953429381, "avg_cost": 0.00019950045774647885, "cost_std": 1.3138877506800649e-05, "small_model_fraction": 0.7182276290986505}}}}, {"trial": 13, "trial_seed": 54, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.564327485380117, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.1960380116959094e-05, "large_model_cost": 0.0006127748538011701, "hybrid_results": {"0.05": {"accuracy": 0.7887323943661971, "large_model_usage": 0.7359154929577465, "small_model_usage": 0.2640845070422535, "avg_lambda": 0.8300000000000048, "avg_cost": 0.0004805989436619715, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.7359154929577465, "avg_cost": 0.00048597218309859145}, "random_baseline": {"accuracy": 0.7771126760563379, "accuracy_std": 0.012797733693933984, "avg_cost": 0.0004869827112676057, "cost_std": 1.347987376473545e-05, "small_model_fraction": 0.22756993175597537}, "unrestricted_random_baseline": {"accuracy": 0.7728873239436619, "accuracy_std": 0.009480923956222706, "avg_cost": 0.0004920943661971832, "cost_std": 6.4283724265288694e-06, "small_model_fraction": 0.21831871698762334}}, "0.1": {"accuracy": 0.7535211267605634, "large_model_usage": 0.6197183098591549, "small_model_usage": 0.3802816901408451, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00040337359154929544, "unrestricted": {"accuracy": 0.7711267605633803, "large_model_usage": 0.6197183098591549, "avg_cost": 0.00041571866197183063}, "random_baseline": {"accuracy": 0.7323943661971831, "accuracy_std": 0.013637265303547244, "avg_cost": 0.00039696348591549275, "cost_std": 1.3909654487084217e-05, "small_model_fraction": 0.3605303788722838}, "unrestricted_random_baseline": {"accuracy": 0.7397887323943662, "accuracy_std": 0.011512523045261826, "avg_cost": 0.00041666397887323914, "cost_std": 1.272825662703999e-05, "small_model_fraction": 0.3392756220060711}}, "0.15": {"accuracy": 0.704225352112676, "large_model_usage": 0.45422535211267606, "small_model_usage": 0.545774647887324, "avg_lambda": 0.5, "avg_cost": 0.0003017327464788732, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.45422535211267606, "avg_cost": 0.00031513415492957726}, "random_baseline": {"accuracy": 0.6859154929577465, "accuracy_std": 0.01155015455412444, "avg_cost": 0.0002888061971830983, "cost_std": 1.5523235595854742e-05, "small_model_fraction": 0.5355274729111014}, "unrestricted_random_baseline": {"accuracy": 0.6975352112676056, "accuracy_std": 0.014124064169106769, "avg_cost": 0.00031675422535211244, "cost_std": 1.75663823927687e-05, "small_model_fraction": 0.512453997545213}}, "0.2": {"accuracy": 0.6690140845070423, "large_model_usage": 0.3732394366197183, "small_model_usage": 0.6267605633802817, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002496904929577463, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.3732394366197183, "avg_cost": 0.000264120070422535}, "random_baseline": {"accuracy": 0.657394366197183, "accuracy_std": 0.01760915457753517, "avg_cost": 0.00024283669014084483, "cost_std": 1.5984307336589468e-05, "small_model_fraction": 0.6251296709951358}, "unrestricted_random_baseline": {"accuracy": 0.6697183098591548, "accuracy_std": 0.018817449599841364, "avg_cost": 0.00025779158450704205, "cost_std": 2.0795712880489215e-05, "small_model_fraction": 0.6002859762895627}}, "0.25": {"accuracy": 0.6126760563380281, "large_model_usage": 0.30633802816901406, "small_model_usage": 0.693661971830986, "avg_lambda": 0.29000000000000087, "avg_cost": 0.00019924683098591537, "unrestricted": {"accuracy": 0.6901408450704225, "large_model_usage": 0.30633802816901406, "avg_cost": 0.00022168345070422532}, "random_baseline": {"accuracy": 0.6316901408450704, "accuracy_std": 0.01462009823382163, "avg_cost": 0.00020018003521126752, "cost_std": 1.2681031706532591e-05, "small_model_fraction": 0.7119795417496604}, "unrestricted_random_baseline": {"accuracy": 0.6496478873239437, "accuracy_std": 0.01584507042253521, "avg_cost": 0.0002207986971830984, "cost_std": 1.4582978516900054e-05, "small_model_fraction": 0.6733499608165434}}}}, {"trial": 14, "trial_seed": 55, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8304093567251462, "small_model_cost": 3.202353801169593e-05, "large_model_cost": 0.0006144824561403515, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.7570422535211268, "small_model_usage": 0.24295774647887325, "avg_lambda": 0.900000000000004, "avg_cost": 0.0004983362676056333, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.7570422535211268, "avg_cost": 0.0004988080985915488}, "random_baseline": {"accuracy": 0.7848591549295775, "accuracy_std": 0.013947396000238307, "avg_cost": 0.0005010283098591551, "cost_std": 1.1898636152235124e-05, "small_model_fraction": 0.19940666186016462}, "unrestricted_random_baseline": {"accuracy": 0.7876760563380282, "accuracy_std": 0.014859822426767061, "avg_cost": 0.0004923553873239437, "cost_std": 1.2378197872035547e-05, "small_model_fraction": 0.19859659445243852}}, "0.1": {"accuracy": 0.7992957746478874, "large_model_usage": 0.602112676056338, "small_model_usage": 0.397887323943662, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0003989348591549294, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.602112676056338, "avg_cost": 0.00040316725352112663}, "random_baseline": {"accuracy": 0.7348591549295775, "accuracy_std": 0.018434704384096383, "avg_cost": 0.0004049470070422535, "cost_std": 1.8779834884927682e-05, "small_model_fraction": 0.3700648926072605}, "unrestricted_random_baseline": {"accuracy": 0.7299295774647887, "accuracy_std": 0.021822465397670666, "avg_cost": 0.0003966633098591549, "cost_std": 1.678487740784282e-05, "small_model_fraction": 0.3627984670543731}}, "0.15": {"accuracy": 0.7359154929577465, "large_model_usage": 0.4154929577464789, "small_model_usage": 0.5845070422535211, "avg_lambda": 0.5999999999999969, "avg_cost": 0.00028079401408450706, "unrestricted": {"accuracy": 0.7394366197183099, "large_model_usage": 0.4154929577464789, "avg_cost": 0.00028857570422535214}, "random_baseline": {"accuracy": 0.6591549295774647, "accuracy_std": 0.018618738121693335, "avg_cost": 0.00027896619718309837, "cost_std": 1.280045702638424e-05, "small_model_fraction": 0.5728960990552439}, "unrestricted_random_baseline": {"accuracy": 0.6686619718309859, "accuracy_std": 0.014211574697613055, "avg_cost": 0.0002874423943661971, "cost_std": 1.2589938766854443e-05, "small_model_fraction": 0.5595360321069235}}, "0.2": {"accuracy": 0.6690140845070423, "large_model_usage": 0.30633802816901406, "small_model_usage": 0.693661971830986, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00020899119718309855, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.30633802816901406, "avg_cost": 0.00022084330985915488}, "random_baseline": {"accuracy": 0.6250000000000001, "accuracy_std": 0.019173173788819713, "avg_cost": 0.00020784633802816885, "cost_std": 1.5738269255319772e-05, "small_model_fraction": 0.6961714317295191}, "unrestricted_random_baseline": {"accuracy": 0.6345070422535212, "accuracy_std": 0.016650127348643518, "avg_cost": 0.0002201715492957745, "cost_std": 1.3464117813777218e-05, "small_model_fraction": 0.6758230220697696}}, "0.25": {"accuracy": 0.6267605633802817, "large_model_usage": 0.2323943661971831, "small_model_usage": 0.7676056338028169, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00016410387323943651, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2323943661971831, "avg_cost": 0.0001756954225352112}, "random_baseline": {"accuracy": 0.6098591549295775, "accuracy_std": 0.02203448992558031, "avg_cost": 0.0001647034154929577, "cost_std": 7.665305409032144e-06, "small_model_fraction": 0.7732366504884277}, "unrestricted_random_baseline": {"accuracy": 0.6130281690140845, "accuracy_std": 0.01654929577464791, "avg_cost": 0.000171046690140845, "cost_std": 1.3890016051235835e-05, "small_model_fraction": 0.7533355914866762}}}}, {"trial": 15, "trial_seed": 56, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5453216374269005, "large_model_accuracy": 0.8245614035087719, "small_model_cost": 3.199970760233923e-05, "large_model_cost": 0.0006134912280701765, "hybrid_results": {"0.05": {"accuracy": 0.7676056338028169, "large_model_usage": 0.6830985915492958, "small_model_usage": 0.31690140845070425, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004483130281690138, "unrestricted": {"accuracy": 0.7711267605633803, "large_model_usage": 0.6830985915492958, "avg_cost": 0.0004532778169014082}, "random_baseline": {"accuracy": 0.7404929577464788, "accuracy_std": 0.015432821831557135, "avg_cost": 0.00044743908450704226, "cost_std": 1.2804806482673585e-05, "small_model_fraction": 0.2840595160670083}, "unrestricted_random_baseline": {"accuracy": 0.7366197183098591, "accuracy_std": 0.014500887522516883, "avg_cost": 0.00045419003521126754, "cost_std": 1.5812784764166923e-05, "small_model_fraction": 0.27552149176632723}}, "0.1": {"accuracy": 0.6901408450704225, "large_model_usage": 0.5, "small_model_usage": 0.5, "avg_lambda": 0.6699999999999969, "avg_cost": 0.00033820035211267624, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.5, "avg_cost": 0.0003420102112676058}, "random_baseline": {"accuracy": 0.6834507042253521, "accuracy_std": 0.014725720891779221, "avg_cost": 0.0003423017957746478, "cost_std": 1.863157354579453e-05, "small_model_fraction": 0.47342199545062297}, "unrestricted_random_baseline": {"accuracy": 0.6721830985915493, "accuracy_std": 0.016549295774647865, "avg_cost": 0.00034825978873239424, "cost_std": 1.3258059051218345e-05, "small_model_fraction": 0.4668701214837174}}, "0.15": {"accuracy": 0.6584507042253521, "large_model_usage": 0.43309859154929575, "small_model_usage": 0.5669014084507042, "avg_lambda": 0.5, "avg_cost": 0.0002899397887323945, "unrestricted": {"accuracy": 0.6971830985915493, "large_model_usage": 0.43309859154929575, "avg_cost": 0.0003014468309859157}, "random_baseline": {"accuracy": 0.6598591549295775, "accuracy_std": 0.012908663929453053, "avg_cost": 0.0003027919014084507, "cost_std": 1.370244997945249e-05, "small_model_fraction": 0.5564164359223496}, "unrestricted_random_baseline": {"accuracy": 0.644718309859155, "accuracy_std": 0.021035601452456122, "avg_cost": 0.00030452207746478876, "cost_std": 1.3678895585494134e-05, "small_model_fraction": 0.5366275966211965}}, "0.2": {"accuracy": 0.5669014084507042, "large_model_usage": 0.29577464788732394, "small_model_usage": 0.704225352112676, "avg_lambda": 0.3400000000000022, "avg_cost": 0.0001975947183098591, "unrestricted": {"accuracy": 0.6373239436619719, "large_model_usage": 0.29577464788732394, "avg_cost": 0.00021585528169014074}, "random_baseline": {"accuracy": 0.5954225352112676, "accuracy_std": 0.01624686196487227, "avg_cost": 0.00018662257042253516, "cost_std": 1.3503697565229966e-05, "small_model_fraction": 0.7152236879150174}, "unrestricted_random_baseline": {"accuracy": 0.6070422535211268, "accuracy_std": 0.021830985915492974, "avg_cost": 0.00021150985915492957, "cost_std": 1.3371828420358602e-05, "small_model_fraction": 0.683820713430385}}, "0.25": {"accuracy": 0.5422535211267606, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00017122852112676047, "unrestricted": {"accuracy": 0.6161971830985915, "large_model_usage": 0.25704225352112675, "avg_cost": 0.000192756690140845}, "random_baseline": {"accuracy": 0.5876760563380281, "accuracy_std": 0.014976176280377963, "avg_cost": 0.0001711763028169014, "cost_std": 9.615172193131055e-06, "small_model_fraction": 0.760566046754379}, "unrestricted_random_baseline": {"accuracy": 0.6035211267605635, "accuracy_std": 0.010586828435473863, "avg_cost": 0.00020276887323943663, "cost_std": 1.5001887207870166e-05, "small_model_fraction": 0.7235437201058939}}}}, {"trial": 16, "trial_seed": 57, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5804093567251462, "large_model_accuracy": 0.8538011695906432, "small_model_cost": 3.2017397660818725e-05, "large_model_cost": 0.00061403216374269, "hybrid_results": {"0.05": {"accuracy": 0.8133802816901409, "large_model_usage": 0.8169014084507042, "small_model_usage": 0.18309859154929575, "avg_lambda": 0.900000000000004, "avg_cost": 0.000537653169014084, "unrestricted": {"accuracy": 0.8133802816901409, "large_model_usage": 0.8169014084507042, "avg_cost": 0.000537653169014084}, "random_baseline": {"accuracy": 0.7830985915492958, "accuracy_std": 0.005500175828103308, "avg_cost": 0.0005346044366197182, "cost_std": 1.1959304404641886e-05, "small_model_fraction": 0.13123205660706874}, "unrestricted_random_baseline": {"accuracy": 0.7887323943661972, "accuracy_std": 0.011571603327573957, "avg_cost": 0.0005430151408450702, "cost_std": 1.4041671882157834e-05, "small_model_fraction": 0.13123205660706874}}, "0.1": {"accuracy": 0.778169014084507, "large_model_usage": 0.6690140845070423, "small_model_usage": 0.33098591549295775, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0004428996478873237, "unrestricted": {"accuracy": 0.7887323943661971, "large_model_usage": 0.6690140845070423, "avg_cost": 0.0004462588028169012}, "random_baseline": {"accuracy": 0.7352112676056338, "accuracy_std": 0.01737882067500432, "avg_cost": 0.0004368698239436616, "cost_std": 1.0527898502165584e-05, "small_model_fraction": 0.29403466342861356}, "unrestricted_random_baseline": {"accuracy": 0.7429577464788732, "accuracy_std": 0.010795570223067515, "avg_cost": 0.00044822204225352085, "cost_std": 1.1440932756026079e-05, "small_model_fraction": 0.28826306599614404}}, "0.15": {"accuracy": 0.7077464788732394, "large_model_usage": 0.4788732394366197, "small_model_usage": 0.5211267605633803, "avg_lambda": 0.540000000000001, "avg_cost": 0.0003142095070422535, "unrestricted": {"accuracy": 0.7570422535211268, "large_model_usage": 0.4788732394366197, "avg_cost": 0.0003281390845070422}, "random_baseline": {"accuracy": 0.6827464788732394, "accuracy_std": 0.02195839694037867, "avg_cost": 0.00031503954225352103, "cost_std": 1.5063078845407582e-05, "small_model_fraction": 0.5151461340385664}, "unrestricted_random_baseline": {"accuracy": 0.6971830985915493, "accuracy_std": 0.013080405366906153, "avg_cost": 0.0003233932042253519, "cost_std": 1.6204905059969528e-05, "small_model_fraction": 0.49121275936052733}}, "0.2": {"accuracy": 0.6443661971830986, "large_model_usage": 0.30985915492957744, "small_model_usage": 0.6901408450704225, "avg_lambda": 0.3799999999999988, "avg_cost": 0.0002109559859154929, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.30985915492957744, "avg_cost": 0.00022321654929577462}, "random_baseline": {"accuracy": 0.6408450704225352, "accuracy_std": 0.015267241823013242, "avg_cost": 0.0002091289788732394, "cost_std": 1.736473701642851e-05, "small_model_fraction": 0.6925531813234046}, "unrestricted_random_baseline": {"accuracy": 0.6380281690140845, "accuracy_std": 0.005409257568921548, "avg_cost": 0.00022063345070422535, "cost_std": 1.3234896720306635e-05, "small_model_fraction": 0.6714874556841396}}, "0.25": {"accuracy": 0.6091549295774648, "large_model_usage": 0.2746478873239437, "small_model_usage": 0.7253521126760563, "avg_lambda": 0.2400000000000003, "avg_cost": 0.0001813996478873239, "unrestricted": {"accuracy": 0.6795774647887324, "large_model_usage": 0.2746478873239437, "avg_cost": 0.00020203345070422528}, "random_baseline": {"accuracy": 0.6362676056338029, "accuracy_std": 0.016216308365731652, "avg_cost": 0.0001884255985915493, "cost_std": 1.845103308763075e-05, "small_model_fraction": 0.7433359788581518}, "unrestricted_random_baseline": {"accuracy": 0.6309859154929577, "accuracy_std": 0.02039823712388768, "avg_cost": 0.00019468883802816898, "cost_std": 1.3238609045765495e-05, "small_model_fraction": 0.7078836088851214}}}}, {"trial": 17, "trial_seed": 58, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5497076023391813, "large_model_accuracy": 0.8187134502923976, "small_model_cost": 3.1980116959064336e-05, "large_model_cost": 0.0006142923976608198, "hybrid_results": {"0.05": {"accuracy": 0.8204225352112676, "large_model_usage": 0.7922535211267606, "small_model_usage": 0.20774647887323938, "avg_lambda": 0.8899999999999935, "avg_cost": 0.0005151309859154927, "unrestricted": {"accuracy": 0.8309859154929577, "large_model_usage": 0.7922535211267606, "avg_cost": 0.0005220535211267604}, "random_baseline": {"accuracy": 0.7982394366197183, "accuracy_std": 0.011997693688310468, "avg_cost": 0.0005150582042253523, "cost_std": 1.0158276109820901e-05, "small_model_fraction": 0.17028906143938052}, "unrestricted_random_baseline": {"accuracy": 0.8021126760563382, "accuracy_std": 0.011867816581938553, "avg_cost": 0.0005218322183098594, "cost_std": 1.6406646305800072e-05, "small_model_fraction": 0.15840104973039643}}, "0.1": {"accuracy": 0.7852112676056338, "large_model_usage": 0.6302816901408451, "small_model_usage": 0.3697183098591549, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00040680704225352124, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.6302816901408451, "avg_cost": 0.0004228845070422537}, "random_baseline": {"accuracy": 0.7411971830985916, "accuracy_std": 0.02125839274383602, "avg_cost": 0.00041093619718309834, "cost_std": 1.6327284438615458e-05, "small_model_fraction": 0.3563128621592078}, "unrestricted_random_baseline": {"accuracy": 0.7503521126760563, "accuracy_std": 0.01084712802869672, "avg_cost": 0.0004175614436619715, "cost_std": 1.1918727227548661e-05, "small_model_fraction": 0.32870316660314425}}, "0.15": {"accuracy": 0.7288732394366197, "large_model_usage": 0.426056338028169, "small_model_usage": 0.573943661971831, "avg_lambda": 0.5, "avg_cost": 0.00028647605633802824, "unrestricted": {"accuracy": 0.7605633802816901, "large_model_usage": 0.426056338028169, "avg_cost": 0.00029738450704225365}, "random_baseline": {"accuracy": 0.6757042253521126, "accuracy_std": 0.015058735760804978, "avg_cost": 0.0002825490492957745, "cost_std": 1.3401280688564111e-05, "small_model_fraction": 0.5629562559246285}, "unrestricted_random_baseline": {"accuracy": 0.683450704225352, "accuracy_std": 0.013217129844453123, "avg_cost": 0.00029196757042253496, "cost_std": 1.631271144651932e-05, "small_model_fraction": 0.5442232649406852}}, "0.2": {"accuracy": 0.7112676056338029, "large_model_usage": 0.3626760563380282, "small_model_usage": 0.6373239436619718, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00024458873239436625, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.3626760563380282, "avg_cost": 0.00025777183098591545}, "random_baseline": {"accuracy": 0.6528169014084507, "accuracy_std": 0.016074242550018752, "avg_cost": 0.0002455246126760562, "cost_std": 1.9342793391352906e-05, "small_model_fraction": 0.6348890063264967}, "unrestricted_random_baseline": {"accuracy": 0.6669014084507042, "accuracy_std": 0.011909531355836436, "avg_cost": 0.0002602820070422533, "cost_std": 2.059858250418119e-05, "small_model_fraction": 0.6122497815867024}}, "0.25": {"accuracy": 0.6443661971830986, "large_model_usage": 0.2992957746478873, "small_model_usage": 0.7007042253521127, "avg_lambda": 0.29999999999999843, "avg_cost": 0.0001991802816901408, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.2992957746478873, "avg_cost": 0.00021850422535211276}, "random_baseline": {"accuracy": 0.6362676056338028, "accuracy_std": 0.015352274194782871, "avg_cost": 0.00019747728873239435, "cost_std": 1.2424892041976122e-05, "small_model_fraction": 0.712868558208011}, "unrestricted_random_baseline": {"accuracy": 0.6454225352112675, "accuracy_std": 0.01802665596643692, "avg_cost": 0.00021533644366197183, "cost_std": 1.6998658265932217e-05, "small_model_fraction": 0.6796837116877138}}}}, {"trial": 18, "trial_seed": 59, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5701754385964912, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.198230994152046e-05, "large_model_cost": 0.0006123947368421063, "hybrid_results": {"0.05": {"accuracy": 0.778169014084507, "large_model_usage": 0.7183098591549296, "small_model_usage": 0.2816901408450704, "avg_lambda": 0.8300000000000048, "avg_cost": 0.00047179507042253516, "unrestricted": {"accuracy": 0.7887323943661971, "large_model_usage": 0.7183098591549296, "avg_cost": 0.00047577394366197167}, "random_baseline": {"accuracy": 0.7612676056338027, "accuracy_std": 0.013978474113576889, "avg_cost": 0.0004680513380281688, "cost_std": 1.427655495018968e-05, "small_model_fraction": 0.2422409650502767}, "unrestricted_random_baseline": {"accuracy": 0.7658450704225352, "accuracy_std": 0.013291962033927381, "avg_cost": 0.0004759088380281686, "cost_std": 6.658309867813643e-06, "small_model_fraction": 0.23538571341363657}}, "0.1": {"accuracy": 0.7570422535211268, "large_model_usage": 0.5669014084507042, "small_model_usage": 0.43309859154929575, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00036868239436619704, "unrestricted": {"accuracy": 0.7746478873239436, "large_model_usage": 0.5669014084507042, "avg_cost": 0.00038376690140845055}, "random_baseline": {"accuracy": 0.7119718309859154, "accuracy_std": 0.017307332010485243, "avg_cost": 0.00036470274647887297, "cost_std": 1.9021056444014305e-05, "small_model_fraction": 0.4198951145435291}, "unrestricted_random_baseline": {"accuracy": 0.7200704225352113, "accuracy_std": 0.015368417494926212, "avg_cost": 0.0003893307394366194, "cost_std": 1.3080157426467738e-05, "small_model_fraction": 0.39390582426798304}}, "0.15": {"accuracy": 0.6901408450704225, "large_model_usage": 0.3873239436619718, "small_model_usage": 0.6126760563380282, "avg_lambda": 0.5, "avg_cost": 0.00026392887323943657, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.3873239436619718, "avg_cost": 0.00027296408450704204}, "random_baseline": {"accuracy": 0.6690140845070423, "accuracy_std": 0.012498760105140018, "avg_cost": 0.0002620965492957744, "cost_std": 1.3355595603039236e-05, "small_model_fraction": 0.6003762970125993}, "unrestricted_random_baseline": {"accuracy": 0.6700704225352114, "accuracy_std": 0.01559266889118627, "avg_cost": 0.0002765148239436619, "cost_std": 1.0558642859579685e-05, "small_model_fraction": 0.5848094158624942}}, "0.2": {"accuracy": 0.6443661971830986, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.3700000000000011, "avg_cost": 0.0002002950704225351, "unrestricted": {"accuracy": 0.6830985915492958, "large_model_usage": 0.2887323943661972, "avg_cost": 0.0002126401408450704}, "random_baseline": {"accuracy": 0.6341549295774648, "accuracy_std": 0.013028169014084524, "avg_cost": 0.00018964679577464794, "cost_std": 1.1001143993877326e-05, "small_model_fraction": 0.7100117904438948}, "unrestricted_random_baseline": {"accuracy": 0.6440140845070422, "accuracy_std": 0.018122689059799694, "avg_cost": 0.00021200654929577454, "cost_std": 1.0641010908374805e-05, "small_model_fraction": 0.6887423105872035}}, "0.25": {"accuracy": 0.6056338028169014, "large_model_usage": 0.2676056338028169, "small_model_usage": 0.7323943661971831, "avg_lambda": 0.28000000000000086, "avg_cost": 0.0001784359154929577, "unrestricted": {"accuracy": 0.6725352112676056, "large_model_usage": 0.2676056338028169, "avg_cost": 0.00019942183098591547}, "random_baseline": {"accuracy": 0.6295774647887324, "accuracy_std": 0.008302694452501156, "avg_cost": 0.00017675950704225347, "cost_std": 1.1727378312221385e-05, "small_model_fraction": 0.7476732082848354}, "unrestricted_random_baseline": {"accuracy": 0.6376760563380282, "accuracy_std": 0.01224319623289699, "avg_cost": 0.00019761436619718307, "cost_std": 1.068774759437845e-05, "small_model_fraction": 0.7115163058473343}}}}, {"trial": 19, "trial_seed": 60, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5526315789473685, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.2065058479532224e-05, "large_model_cost": 0.0006145643274853809, "hybrid_results": {"0.05": {"accuracy": 0.7746478873239436, "large_model_usage": 0.7359154929577465, "small_model_usage": 0.2640845070422535, "avg_lambda": 0.8000000000000037, "avg_cost": 0.00048371584507042256, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.7359154929577465, "avg_cost": 0.0004888355633802815}, "random_baseline": {"accuracy": 0.7556338028169015, "accuracy_std": 0.014535047493345258, "avg_cost": 0.0004810196478873237, "cost_std": 1.0324456699490606e-05, "small_model_fraction": 0.22463286973437308}, "unrestricted_random_baseline": {"accuracy": 0.7464788732394367, "accuracy_std": 0.014771955608030286, "avg_cost": 0.0004838719014084505, "cost_std": 9.858027825161975e-06, "small_model_fraction": 0.21584364272195675}}, "0.1": {"accuracy": 0.7288732394366197, "large_model_usage": 0.6373239436619719, "small_model_usage": 0.36267605633802813, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00041286373239436604, "unrestricted": {"accuracy": 0.7535211267605634, "large_model_usage": 0.6373239436619719, "avg_cost": 0.00042938485915492943}, "random_baseline": {"accuracy": 0.7112676056338028, "accuracy_std": 0.013080405366906162, "avg_cost": 0.0004145777112676052, "cost_std": 7.411344642511732e-06, "small_model_fraction": 0.3462675505760843}, "unrestricted_random_baseline": {"accuracy": 0.7204225352112676, "accuracy_std": 0.01427685558641783, "avg_cost": 0.0004299439436619715, "cost_std": 1.3895861125792e-05, "small_model_fraction": 0.3179050656089186}}, "0.15": {"accuracy": 0.6690140845070423, "large_model_usage": 0.44014084507042256, "small_model_usage": 0.5598591549295775, "avg_lambda": 0.5, "avg_cost": 0.0002963144366197183, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.44014084507042256, "avg_cost": 0.00030736373239436613}, "random_baseline": {"accuracy": 0.6426056338028169, "accuracy_std": 0.01968369698503337, "avg_cost": 0.0002838916901408448, "cost_std": 1.7975220713263516e-05, "small_model_fraction": 0.546352429607027}, "unrestricted_random_baseline": {"accuracy": 0.647887323943662, "accuracy_std": 0.021126760563380278, "avg_cost": 0.00030631545774647856, "cost_std": 1.4331904422108791e-05, "small_model_fraction": 0.5273836576916464}}, "0.2": {"accuracy": 0.6373239436619719, "large_model_usage": 0.36619718309859156, "small_model_usage": 0.6338028169014085, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00024867359154929597, "unrestricted": {"accuracy": 0.6795774647887324, "large_model_usage": 0.36619718309859156, "avg_cost": 0.0002605538732394367}, "random_baseline": {"accuracy": 0.6200704225352113, "accuracy_std": 0.016016289317667846, "avg_cost": 0.0002449714436619716, "cost_std": 2.096887924538234e-05, "small_model_fraction": 0.6281393907335722}, "unrestricted_random_baseline": {"accuracy": 0.630281690140845, "accuracy_std": 0.010795570223067558, "avg_cost": 0.0002649859859154927, "cost_std": 1.4823209640085409e-05, "small_model_fraction": 0.6077440317652822}}, "0.25": {"accuracy": 0.5669014084507042, "large_model_usage": 0.29225352112676056, "small_model_usage": 0.7077464788732395, "avg_lambda": 0.2600000000000001, "avg_cost": 0.00019361021126760558, "unrestricted": {"accuracy": 0.647887323943662, "large_model_usage": 0.29225352112676056, "avg_cost": 0.0002146735915492958}, "random_baseline": {"accuracy": 0.6042253521126761, "accuracy_std": 0.023207404300400126, "avg_cost": 0.00019657633802816886, "cost_std": 1.6810542699715425e-05, "small_model_fraction": 0.7226689175700042}, "unrestricted_random_baseline": {"accuracy": 0.6109154929577464, "accuracy_std": 0.01615502781261193, "avg_cost": 0.00021314225352112655, "cost_std": 1.3704786176877621e-05, "small_model_fraction": 0.686508562694986}}}}, {"trial": 20, "trial_seed": 61, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5482456140350878, "large_model_accuracy": 0.8318713450292398, "small_model_cost": 3.20786549707603e-05, "large_model_cost": 0.0006157017543859657, "hybrid_results": {"0.05": {"accuracy": 0.8345070422535211, "large_model_usage": 0.7887323943661971, "small_model_usage": 0.21126760563380287, "avg_lambda": 0.8600000000000062, "avg_cost": 0.0005104447183098591, "unrestricted": {"accuracy": 0.8450704225352113, "large_model_usage": 0.7887323943661971, "avg_cost": 0.000521332042253521}, "random_baseline": {"accuracy": 0.8, "accuracy_std": 0.010772576437167863, "avg_cost": 0.000512012042253521, "cost_std": 1.0522315581344877e-05, "small_model_fraction": 0.18035104536056734}, "unrestricted_random_baseline": {"accuracy": 0.8140845070422535, "accuracy_std": 0.013619070144939219, "avg_cost": 0.0005279155633802814, "cost_std": 1.257112631556522e-05, "small_model_fraction": 0.1616963280360284}}, "0.1": {"accuracy": 0.7887323943661971, "large_model_usage": 0.6197183098591549, "small_model_usage": 0.3802816901408451, "avg_lambda": 0.7299999999999969, "avg_cost": 0.0003989799295774646, "unrestricted": {"accuracy": 0.8098591549295775, "large_model_usage": 0.6197183098591549, "avg_cost": 0.0004174517605633801}, "random_baseline": {"accuracy": 0.751056338028169, "accuracy_std": 0.012894248889859529, "avg_cost": 0.0003982439436619717, "cost_std": 1.86841876111368e-05, "small_model_fraction": 0.37133866878411415}, "unrestricted_random_baseline": {"accuracy": 0.7478873239436621, "accuracy_std": 0.015841157576340813, "avg_cost": 0.00041640570422535187, "cost_std": 1.8029113261198465e-05, "small_model_fraction": 0.33968839482404567}}, "0.15": {"accuracy": 0.7464788732394366, "large_model_usage": 0.4612676056338028, "small_model_usage": 0.5387323943661972, "avg_lambda": 0.6100000000000032, "avg_cost": 0.00031064190140845055, "unrestricted": {"accuracy": 0.7640845070422535, "large_model_usage": 0.4612676056338028, "avg_cost": 0.0003186559859154929}, "random_baseline": {"accuracy": 0.6950704225352112, "accuracy_std": 0.015364383260050315, "avg_cost": 0.0003144180985915491, "cost_std": 1.4807024483928062e-05, "small_model_fraction": 0.5227001009438923}, "unrestricted_random_baseline": {"accuracy": 0.7, "accuracy_std": 0.014922267676350083, "avg_cost": 0.0003191949295774646, "cost_std": 1.1821719294122746e-05, "small_model_fraction": 0.5089684914255704}}, "0.2": {"accuracy": 0.6690140845070423, "large_model_usage": 0.3380281690140845, "small_model_usage": 0.6619718309859155, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00022961373239436606, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.3380281690140845, "avg_cost": 0.00024276161971830977}, "random_baseline": {"accuracy": 0.6556338028169014, "accuracy_std": 0.017801724872907775, "avg_cost": 0.00022729274647887316, "cost_std": 1.5782632885499523e-05, "small_model_fraction": 0.6615365676555}, "unrestricted_random_baseline": {"accuracy": 0.6588028169014085, "accuracy_std": 0.017707452057295976, "avg_cost": 0.00023314753521126754, "cost_std": 1.6564256920247764e-05, "small_model_fraction": 0.6390085228657754}}, "0.25": {"accuracy": 0.6373239436619719, "large_model_usage": 0.28169014084507044, "small_model_usage": 0.7183098591549295, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00019193767605633793, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.28169014084507044, "avg_cost": 0.00020800809859154924}, "random_baseline": {"accuracy": 0.6366197183098592, "accuracy_std": 0.012773490948744411, "avg_cost": 0.0001916569718309859, "cost_std": 1.2507519152697215e-05, "small_model_fraction": 0.7260920254085941}, "unrestricted_random_baseline": {"accuracy": 0.643661971830986, "accuracy_std": 0.014838948947679917, "avg_cost": 0.00021547218309859152, "cost_std": 1.3689426245079067e-05, "small_model_fraction": 0.6985564077277415}}}}, {"trial": 21, "trial_seed": 62, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5570175438596491, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.1931871345029295e-05, "large_model_cost": 0.0006127192982456147, "hybrid_results": {"0.05": {"accuracy": 0.8063380281690141, "large_model_usage": 0.795774647887324, "small_model_usage": 0.204225352112676, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005237482394366195, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.795774647887324, "avg_cost": 0.0005241566901408449}, "random_baseline": {"accuracy": 0.7922535211267606, "accuracy_std": 0.009316025743185215, "avg_cost": 0.0005246828169014083, "cost_std": 1.2372386652055592e-05, "small_model_fraction": 0.1531904009764739}, "unrestricted_random_baseline": {"accuracy": 0.7862676056338028, "accuracy_std": 0.00971346072122095, "avg_cost": 0.0005216822535211266, "cost_std": 8.705657689308307e-06, "small_model_fraction": 0.15248713040740336}}, "0.1": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6232394366197183, "small_model_usage": 0.37676056338028174, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00041346654929577453, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6232394366197183, "avg_cost": 0.0004168679577464788}, "random_baseline": {"accuracy": 0.7397887323943663, "accuracy_std": 0.01677254276593597, "avg_cost": 0.0004096519718309858, "cost_std": 1.5409902013750687e-05, "small_model_fraction": 0.3430734546255022}, "unrestricted_random_baseline": {"accuracy": 0.7507042253521126, "accuracy_std": 0.02099725565616278, "avg_cost": 0.0004229163732394364, "cost_std": 1.1277303988859234e-05, "small_model_fraction": 0.3372169083347945}}, "0.15": {"accuracy": 0.7359154929577465, "large_model_usage": 0.4859154929577465, "small_model_usage": 0.5140845070422535, "avg_lambda": 0.5999999999999969, "avg_cost": 0.0003276989436619719, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.4859154929577465, "avg_cost": 0.0003341848591549296}, "random_baseline": {"accuracy": 0.7024647887323943, "accuracy_std": 0.01776337782483022, "avg_cost": 0.00033469926056338016, "cost_std": 2.2823420616255338e-05, "small_model_fraction": 0.49074814877566275}, "unrestricted_random_baseline": {"accuracy": 0.7021126760563381, "accuracy_std": 0.016074242550018783, "avg_cost": 0.00032718109154929577, "cost_std": 1.4516857668496905e-05, "small_model_fraction": 0.47958069715300916}}, "0.2": {"accuracy": 0.6901408450704225, "large_model_usage": 0.40492957746478875, "small_model_usage": 0.5950704225352113, "avg_lambda": 0.450000000000002, "avg_cost": 0.00026941021126760555, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.40492957746478875, "avg_cost": 0.00028402992957746485}, "random_baseline": {"accuracy": 0.677112676056338, "accuracy_std": 0.015352274194782852, "avg_cost": 0.00027632936619718316, "cost_std": 1.3437696452709745e-05, "small_model_fraction": 0.5911097091238755}, "unrestricted_random_baseline": {"accuracy": 0.6859154929577465, "accuracy_std": 0.01176288245668313, "avg_cost": 0.00028537454225352113, "cost_std": 1.1486187295522848e-05, "small_model_fraction": 0.5659374728930079}}, "0.25": {"accuracy": 0.6373239436619719, "large_model_usage": 0.30633802816901406, "small_model_usage": 0.693661971830986, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00020454401408450702, "unrestricted": {"accuracy": 0.6866197183098591, "large_model_usage": 0.30633802816901406, "avg_cost": 0.00022260035211267614}, "random_baseline": {"accuracy": 0.6595070422535212, "accuracy_std": 0.011789203551255763, "avg_cost": 0.00021070577464788742, "cost_std": 1.7462845080964384e-05, "small_model_fraction": 0.7027963507050505}, "unrestricted_random_baseline": {"accuracy": 0.6661971830985915, "accuracy_std": 0.01144230761216331, "avg_cost": 0.00021811507042253527, "cost_std": 1.767887078401714e-05, "small_model_fraction": 0.6717069414102796}}}}, {"trial": 22, "trial_seed": 63, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.554093567251462, "large_model_accuracy": 0.8157894736842105, "small_model_cost": 3.205672514619885e-05, "large_model_cost": 0.000614912280701756, "hybrid_results": {"0.05": {"accuracy": 0.7922535211267606, "large_model_usage": 0.7007042253521126, "small_model_usage": 0.29929577464788737, "avg_lambda": 0.8300000000000048, "avg_cost": 0.00045758697183098527, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.7007042253521126, "avg_cost": 0.00046274190140845013}, "random_baseline": {"accuracy": 0.7531690140845071, "accuracy_std": 0.012932653290286571, "avg_cost": 0.0004559642605633801, "cost_std": 1.5438067877810733e-05, "small_model_fraction": 0.26992160814322835}, "unrestricted_random_baseline": {"accuracy": 0.7588028169014084, "accuracy_std": 0.0151244588696581, "avg_cost": 0.00045861760563380263, "cost_std": 1.542697447525038e-05, "small_model_fraction": 0.2610773421354155}}, "0.1": {"accuracy": 0.75, "large_model_usage": 0.5633802816901409, "small_model_usage": 0.43661971830985913, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0003634179577464784, "unrestricted": {"accuracy": 0.7852112676056338, "large_model_usage": 0.5633802816901409, "avg_cost": 0.00037835457746478835}, "random_baseline": {"accuracy": 0.7179577464788732, "accuracy_std": 0.007293068724369034, "avg_cost": 0.0003653122183098588, "cost_std": 2.0866471870800375e-05, "small_model_fraction": 0.43148653308376234}, "unrestricted_random_baseline": {"accuracy": 0.7369718309859155, "accuracy_std": 0.012894248889859527, "avg_cost": 0.0003829143661971829, "cost_std": 1.530655598071867e-05, "small_model_fraction": 0.4058599098562066}}, "0.15": {"accuracy": 0.7077464788732394, "large_model_usage": 0.4119718309859155, "small_model_usage": 0.5880281690140845, "avg_lambda": 0.5500000000000005, "avg_cost": 0.0002762489436619718, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.4119718309859155, "avg_cost": 0.00028588978873239423}, "random_baseline": {"accuracy": 0.6869718309859155, "accuracy_std": 0.01489315914264549, "avg_cost": 0.0002639438380281688, "cost_std": 1.364434990026285e-05, "small_model_fraction": 0.5810416213962006}, "unrestricted_random_baseline": {"accuracy": 0.6862676056338028, "accuracy_std": 0.013947396000238263, "avg_cost": 0.00028362366197183086, "cost_std": 1.5809876934492315e-05, "small_model_fraction": 0.5645009107886932}}, "0.2": {"accuracy": 0.6549295774647887, "large_model_usage": 0.3028169014084507, "small_model_usage": 0.6971830985915493, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00020525598591549283, "unrestricted": {"accuracy": 0.7077464788732394, "large_model_usage": 0.3028169014084507, "avg_cost": 0.0002186503521126759}, "random_baseline": {"accuracy": 0.6443661971830986, "accuracy_std": 0.012597566070421333, "avg_cost": 0.00020243218309859136, "cost_std": 1.2759666390487962e-05, "small_model_fraction": 0.7028435962934134}, "unrestricted_random_baseline": {"accuracy": 0.6580985915492958, "accuracy_std": 0.02000855302137376, "avg_cost": 0.00021043633802816887, "cost_std": 1.639530560083866e-05, "small_model_fraction": 0.6798630034698345}}, "0.25": {"accuracy": 0.6232394366197183, "large_model_usage": 0.24647887323943662, "small_model_usage": 0.7535211267605634, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00016822781690140838, "unrestricted": {"accuracy": 0.6725352112676056, "large_model_usage": 0.24647887323943662, "avg_cost": 0.0001839390845070421}, "random_baseline": {"accuracy": 0.6373239436619718, "accuracy_std": 0.011355292603237411, "avg_cost": 0.0001625114788732393, "cost_std": 1.5864110926177944e-05, "small_model_fraction": 0.7663724906500787}, "unrestricted_random_baseline": {"accuracy": 0.6373239436619718, "accuracy_std": 0.018228421275415203, "avg_cost": 0.00018230690140845057, "cost_std": 1.01575886472411e-05, "small_model_fraction": 0.7394168110552289}}}}, {"trial": 23, "trial_seed": 64, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5511695906432749, "large_model_accuracy": 0.8230994152046783, "small_model_cost": 3.2060087719298305e-05, "large_model_cost": 0.0006151900584795328, "hybrid_results": {"0.05": {"accuracy": 0.7887323943661971, "large_model_usage": 0.8028169014084507, "small_model_usage": 0.19718309859154926, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005265140845070429, "unrestricted": {"accuracy": 0.7887323943661971, "large_model_usage": 0.8028169014084507, "avg_cost": 0.000526908450704226}, "random_baseline": {"accuracy": 0.7735915492957746, "accuracy_std": 0.012602486033924828, "avg_cost": 0.0005304736971830984, "cost_std": 1.1852150629107894e-05, "small_model_fraction": 0.15206896990199595}, "unrestricted_random_baseline": {"accuracy": 0.7707746478873239, "accuracy_std": 0.009242538555215963, "avg_cost": 0.0005270331338028167, "cost_std": 9.963772055897283e-06, "small_model_fraction": 0.15139267779396226}}, "0.1": {"accuracy": 0.7394366197183099, "large_model_usage": 0.6267605633802817, "small_model_usage": 0.37323943661971826, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004042112676056335, "unrestricted": {"accuracy": 0.7570422535211268, "large_model_usage": 0.6267605633802817, "avg_cost": 0.0004188873239436619}, "random_baseline": {"accuracy": 0.7014084507042253, "accuracy_std": 0.016197183098591566, "avg_cost": 0.0004018773591549293, "cost_std": 1.718056224501974e-05, "small_model_fraction": 0.36180405990596465}, "unrestricted_random_baseline": {"accuracy": 0.7161971830985915, "accuracy_std": 0.011909531355836485, "avg_cost": 0.0004184310915492955, "cost_std": 1.1334772207726144e-05, "small_model_fraction": 0.3366363321712797}}, "0.15": {"accuracy": 0.7077464788732394, "large_model_usage": 0.4859154929577465, "small_model_usage": 0.5140845070422535, "avg_lambda": 0.5900000000000027, "avg_cost": 0.0003205563380281687, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.4859154929577465, "avg_cost": 0.0003320140845070418}, "random_baseline": {"accuracy": 0.6654929577464789, "accuracy_std": 0.018896349105632, "avg_cost": 0.00032349376760563346, "cost_std": 1.839137559541581e-05, "small_model_fraction": 0.5052625233226241}, "unrestricted_random_baseline": {"accuracy": 0.6693661971830985, "accuracy_std": 0.013122989544777605, "avg_cost": 0.0003390395422535208, "cost_std": 1.5553804708360617e-05, "small_model_fraction": 0.48561382225528654}}, "0.2": {"accuracy": 0.6690140845070423, "large_model_usage": 0.38028169014084506, "small_model_usage": 0.619718309859155, "avg_lambda": 0.4300000000000031, "avg_cost": 0.0002508309859154928, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.38028169014084506, "avg_cost": 0.00026596478873239417}, "random_baseline": {"accuracy": 0.642605633802817, "accuracy_std": 0.017693442994931162, "avg_cost": 0.00025209112676056317, "cost_std": 1.7758509853247717e-05, "small_model_fraction": 0.6248333833519483}, "unrestricted_random_baseline": {"accuracy": 0.6387323943661972, "accuracy_std": 0.016530555766627472, "avg_cost": 0.00025882528169014063, "cost_std": 1.6184488638985624e-05, "small_model_fraction": 0.5988806737061532}}, "0.25": {"accuracy": 0.6232394366197183, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00019181690140845063, "unrestricted": {"accuracy": 0.6654929577464789, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00020933802816901413}, "random_baseline": {"accuracy": 0.6091549295774648, "accuracy_std": 0.011783944035691206, "avg_cost": 0.00019411447183098585, "cost_std": 1.4736602380586809e-05, "small_model_fraction": 0.7260356666612844}, "unrestricted_random_baseline": {"accuracy": 0.6137323943661971, "accuracy_std": 0.01767942283187775, "avg_cost": 0.00020267059859154917, "cost_std": 1.6002481493098658e-05, "small_model_fraction": 0.6959889744329276}}}}, {"trial": 24, "trial_seed": 65, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5599415204678363, "large_model_accuracy": 0.8201754385964912, "small_model_cost": 3.206885964912288e-05, "large_model_cost": 0.0006151549707602342, "hybrid_results": {"0.05": {"accuracy": 0.8274647887323944, "large_model_usage": 0.8169014084507042, "small_model_usage": 0.18309859154929575, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005396876760563375, "unrestricted": {"accuracy": 0.8274647887323944, "large_model_usage": 0.8169014084507042, "avg_cost": 0.0005405538732394362}, "random_baseline": {"accuracy": 0.8158450704225352, "accuracy_std": 0.009322677672446274, "avg_cost": 0.0005404630281690139, "cost_std": 1.1536734551994725e-05, "small_model_fraction": 0.1294273577535374}, "unrestricted_random_baseline": {"accuracy": 0.816549295774648, "accuracy_std": 0.01624686196487227, "avg_cost": 0.0005492013380281689, "cost_std": 8.676442524158359e-06, "small_model_fraction": 0.1279418187112026}}, "0.1": {"accuracy": 0.778169014084507, "large_model_usage": 0.6091549295774648, "small_model_usage": 0.39084507042253525, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0003966031690140844, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.6091549295774648, "avg_cost": 0.0004139693661971829}, "random_baseline": {"accuracy": 0.7492957746478874, "accuracy_std": 0.012478905032865724, "avg_cost": 0.0004013346126760563, "cost_std": 1.4030916704301045e-05, "small_model_fraction": 0.37481908346210485}, "unrestricted_random_baseline": {"accuracy": 0.7563380281690141, "accuracy_std": 0.010301928759385756, "avg_cost": 0.00042080137323943664, "cost_std": 1.3375190086146723e-05, "small_model_fraction": 0.3450358372962752}}, "0.15": {"accuracy": 0.7147887323943662, "large_model_usage": 0.4084507042253521, "small_model_usage": 0.5915492957746479, "avg_lambda": 0.5, "avg_cost": 0.0002770468309859155, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.4084507042253521, "avg_cost": 0.00028931443661971845}, "random_baseline": {"accuracy": 0.6788732394366196, "accuracy_std": 0.016349558825695613, "avg_cost": 0.0002701686267605634, "cost_std": 8.757058213187085e-06, "small_model_fraction": 0.579859703963845}, "unrestricted_random_baseline": {"accuracy": 0.6887323943661972, "accuracy_std": 0.011593012417714325, "avg_cost": 0.00028465830985915496, "cost_std": 1.7979695721768226e-05, "small_model_fraction": 0.5588206063073665}}, "0.2": {"accuracy": 0.6584507042253521, "large_model_usage": 0.2852112676056338, "small_model_usage": 0.7147887323943662, "avg_lambda": 0.3400000000000022, "avg_cost": 0.00019908908450704216, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.2852112676056338, "avg_cost": 0.00021156091549295785}, "random_baseline": {"accuracy": 0.6454225352112676, "accuracy_std": 0.010211267605633787, "avg_cost": 0.0002036428873239436, "cost_std": 1.6757943381593047e-05, "small_model_fraction": 0.7135582177739569}, "unrestricted_random_baseline": {"accuracy": 0.6556338028169014, "accuracy_std": 0.013709804460515323, "avg_cost": 0.00021633549295774646, "cost_std": 1.511233255043895e-05, "small_model_fraction": 0.6921688710749766}}, "0.25": {"accuracy": 0.6302816901408451, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.25, "avg_cost": 0.00017663133802816897, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.25704225352112675, "avg_cost": 0.00019427218309859155}, "random_baseline": {"accuracy": 0.6376760563380282, "accuracy_std": 0.0134034269793481, "avg_cost": 0.00017744397887323936, "cost_std": 1.155721062254145e-05, "small_model_fraction": 0.7520735348959484}, "unrestricted_random_baseline": {"accuracy": 0.6485915492957748, "accuracy_std": 0.011657003772709033, "avg_cost": 0.0001923295422535211, "cost_std": 1.7618118926893652e-05, "small_model_fraction": 0.7218192641557197}}}}, {"trial": 25, "trial_seed": 66, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5599415204678363, "large_model_accuracy": 0.827485380116959, "small_model_cost": 3.2012426900584826e-05, "large_model_cost": 0.000614564327485381, "hybrid_results": {"0.05": {"accuracy": 0.7922535211267606, "large_model_usage": 0.8274647887323944, "small_model_usage": 0.17253521126760563, "avg_lambda": 0.879999999999997, "avg_cost": 0.0005319352112676058, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.8274647887323944, "avg_cost": 0.0005415408450704226}, "random_baseline": {"accuracy": 0.7820422535211267, "accuracy_std": 0.00883094098872145, "avg_cost": 0.0005354057746478874, "cost_std": 9.98836288668563e-06, "small_model_fraction": 0.14183992213368068}, "unrestricted_random_baseline": {"accuracy": 0.7827464788732394, "accuracy_std": 0.013085143785959211, "avg_cost": 0.0005390804225352111, "cost_std": 1.184599067283224e-05, "small_model_fraction": 0.12535103282927001}}, "0.1": {"accuracy": 0.7323943661971831, "large_model_usage": 0.6338028169014085, "small_model_usage": 0.3661971830985915, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00040844929577464796, "unrestricted": {"accuracy": 0.7640845070422535, "large_model_usage": 0.6338028169014085, "avg_cost": 0.00042439295774647897}, "random_baseline": {"accuracy": 0.7161971830985915, "accuracy_std": 0.01682789175401201, "avg_cost": 0.000404284190140845, "cost_std": 1.5372947104872225e-05, "small_model_fraction": 0.35381402327213074}, "unrestricted_random_baseline": {"accuracy": 0.718661971830986, "accuracy_std": 0.010847128028696749, "avg_cost": 0.00042381450704225357, "cost_std": 1.3469838886602366e-05, "small_model_fraction": 0.3264453682976539}}, "0.15": {"accuracy": 0.6901408450704225, "large_model_usage": 0.43309859154929575, "small_model_usage": 0.5669014084507042, "avg_lambda": 0.5, "avg_cost": 0.00028675915492957745, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.43309859154929575, "avg_cost": 0.0003002732394366197}, "random_baseline": {"accuracy": 0.6517605633802818, "accuracy_std": 0.019121371806176707, "avg_cost": 0.0002724026760563379, "cost_std": 1.661486618133187e-05, "small_model_fraction": 0.5627055241373954}, "unrestricted_random_baseline": {"accuracy": 0.673943661971831, "accuracy_std": 0.023260766992836246, "avg_cost": 0.00029764954225352093, "cost_std": 1.6758982234753447e-05, "small_model_fraction": 0.5395074460031105}}, "0.2": {"accuracy": 0.6725352112676056, "large_model_usage": 0.352112676056338, "small_model_usage": 0.647887323943662, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00023749154929577444, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.352112676056338, "avg_cost": 0.00024995633802816895}, "random_baseline": {"accuracy": 0.6440140845070423, "accuracy_std": 0.013768461123955742, "avg_cost": 0.00024124077464788722, "cost_std": 1.3599263800569457e-05, "small_model_fraction": 0.6472775692793743}, "unrestricted_random_baseline": {"accuracy": 0.6447183098591549, "accuracy_std": 0.011184774770611668, "avg_cost": 0.0002514309507042253, "cost_std": 1.481090424846038e-05, "small_model_fraction": 0.6258806967949112}}, "0.25": {"accuracy": 0.6302816901408451, "large_model_usage": 0.31690140845070425, "small_model_usage": 0.6830985915492958, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00020896338028168993, "unrestricted": {"accuracy": 0.6830985915492958, "large_model_usage": 0.31690140845070425, "avg_cost": 0.0002273366197183097}, "random_baseline": {"accuracy": 0.6235915492957746, "accuracy_std": 0.014384998743532067, "avg_cost": 0.0002130414788732393, "cost_std": 1.205605994521012e-05, "small_model_fraction": 0.696248603423193}, "unrestricted_random_baseline": {"accuracy": 0.6387323943661972, "accuracy_std": 0.013287297369093808, "avg_cost": 0.0002188251408450703, "cost_std": 1.7007639327139082e-05, "small_model_fraction": 0.6647093716085242}}}}, {"trial": 26, "trial_seed": 67, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5482456140350878, "large_model_accuracy": 0.8245614035087719, "small_model_cost": 3.2065058479532176e-05, "large_model_cost": 0.0006158976608187144, "hybrid_results": {"0.05": {"accuracy": 0.8028169014084507, "large_model_usage": 0.8204225352112676, "small_model_usage": 0.17957746478873238, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005375672535211268, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.8204225352112676, "avg_cost": 0.0005388278169014085}, "random_baseline": {"accuracy": 0.7845070422535211, "accuracy_std": 0.014154754395944928, "avg_cost": 0.0005313477816901407, "cost_std": 1.233372885075173e-05, "small_model_fraction": 0.13416586703748506}, "unrestricted_random_baseline": {"accuracy": 0.7922535211267606, "accuracy_std": 0.008907824394840533, "avg_cost": 0.0005427785211267606, "cost_std": 9.946787610235863e-06, "small_model_fraction": 0.13200674920947908}}, "0.1": {"accuracy": 0.7676056338028169, "large_model_usage": 0.6549295774647887, "small_model_usage": 0.34507042253521125, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00042313063380281716, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6549295774647887, "avg_cost": 0.00043789823943661993}, "random_baseline": {"accuracy": 0.7341549295774648, "accuracy_std": 0.013568907653519485, "avg_cost": 0.00042483404929577457, "cost_std": 1.1372380616693452e-05, "small_model_fraction": 0.3301751670659661}, "unrestricted_random_baseline": {"accuracy": 0.7408450704225353, "accuracy_std": 0.013472624274442933, "avg_cost": 0.0004373982746478872, "cost_std": 2.205799891297219e-05, "small_model_fraction": 0.30488092077921386}}, "0.15": {"accuracy": 0.704225352112676, "large_model_usage": 0.4507042253521127, "small_model_usage": 0.5492957746478873, "avg_lambda": 0.5, "avg_cost": 0.0002970320422535211, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.4507042253521127, "avg_cost": 0.000311855985915493}, "random_baseline": {"accuracy": 0.6785211267605634, "accuracy_std": 0.014088907763198965, "avg_cost": 0.00029506419014084496, "cost_std": 2.2155867212666187e-05, "small_model_fraction": 0.5461593225311967}, "unrestricted_random_baseline": {"accuracy": 0.6897887323943662, "accuracy_std": 0.01161971830985914, "avg_cost": 0.00031578362676056324, "cost_std": 1.7037633900749745e-05, "small_model_fraction": 0.5207685793582764}}, "0.2": {"accuracy": 0.6654929577464789, "large_model_usage": 0.352112676056338, "small_model_usage": 0.647887323943662, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002370954225352112, "unrestricted": {"accuracy": 0.6971830985915493, "large_model_usage": 0.352112676056338, "avg_cost": 0.0002508911971830985}, "random_baseline": {"accuracy": 0.6323943661971831, "accuracy_std": 0.013193657743095229, "avg_cost": 0.00023659852112676042, "cost_std": 1.2326502604350852e-05, "small_model_fraction": 0.6488199473030369}, "unrestricted_random_baseline": {"accuracy": 0.65, "accuracy_std": 0.017968099749539515, "avg_cost": 0.0002541259154929576, "cost_std": 1.4361469275358761e-05, "small_model_fraction": 0.6251902723026804}}, "0.25": {"accuracy": 0.5985915492957746, "large_model_usage": 0.2676056338028169, "small_model_usage": 0.7323943661971831, "avg_lambda": 0.2700000000000005, "avg_cost": 0.00017772218309859144, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2676056338028169, "avg_cost": 0.00019766584507042238}, "random_baseline": {"accuracy": 0.6109154929577464, "accuracy_std": 0.00986544065219853, "avg_cost": 0.00017572144366197174, "cost_std": 1.067991610573048e-05, "small_model_fraction": 0.7505156032131988}, "unrestricted_random_baseline": {"accuracy": 0.6144366197183099, "accuracy_std": 0.016534305467428036, "avg_cost": 0.00019556105633802804, "cost_std": 1.2599409835533143e-05, "small_model_fraction": 0.7163557055097737}}}}, {"trial": 27, "trial_seed": 68, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5482456140350878, "large_model_accuracy": 0.8260233918128655, "small_model_cost": 3.200994152046785e-05, "large_model_cost": 0.0006154736842105275, "hybrid_results": {"0.05": {"accuracy": 0.778169014084507, "large_model_usage": 0.7112676056338029, "small_model_usage": 0.28873239436619713, "avg_lambda": 0.8000000000000037, "avg_cost": 0.000467727112676056, "unrestricted": {"accuracy": 0.7922535211267606, "large_model_usage": 0.7112676056338029, "avg_cost": 0.00047278345070422503}, "random_baseline": {"accuracy": 0.7559859154929578, "accuracy_std": 0.012100591744675745, "avg_cost": 0.00047594207746478836, "cost_std": 1.1156173633157517e-05, "small_model_fraction": 0.2532232266109388}, "unrestricted_random_baseline": {"accuracy": 0.7584507042253522, "accuracy_std": 0.011377108747467246, "avg_cost": 0.0004727194014084505, "cost_std": 1.245544480529743e-05, "small_model_fraction": 0.24455715593985178}}, "0.1": {"accuracy": 0.6936619718309859, "large_model_usage": 0.4859154929577465, "small_model_usage": 0.5140845070422535, "avg_lambda": 0.6100000000000032, "avg_cost": 0.00032693838028168996, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.4859154929577465, "avg_cost": 0.00033429049295774635}, "random_baseline": {"accuracy": 0.6908450704225352, "accuracy_std": 0.012177194694218704, "avg_cost": 0.00033737478873239405, "cost_std": 1.5940468020424506e-05, "small_model_fraction": 0.49452139493457725}, "unrestricted_random_baseline": {"accuracy": 0.6943661971830986, "accuracy_std": 0.018948766263484098, "avg_cost": 0.00032865792253521114, "cost_std": 1.23730511012485e-05, "small_model_fraction": 0.4819205902261999}}, "0.15": {"accuracy": 0.6584507042253521, "large_model_usage": 0.43309859154929575, "small_model_usage": 0.5669014084507042, "avg_lambda": 0.5, "avg_cost": 0.00028881161971830964, "unrestricted": {"accuracy": 0.7077464788732394, "large_model_usage": 0.43309859154929575, "avg_cost": 0.0003012341549295773}, "random_baseline": {"accuracy": 0.6609154929577464, "accuracy_std": 0.01889962943706992, "avg_cost": 0.00028123813380281687, "cost_std": 1.5805884101465833e-05, "small_model_fraction": 0.5598669473207407}, "unrestricted_random_baseline": {"accuracy": 0.6683098591549296, "accuracy_std": 0.023292725927522847, "avg_cost": 0.0003027047535211267, "cost_std": 2.3748919701179424e-05, "small_model_fraction": 0.5385759324686549}}, "0.2": {"accuracy": 0.6408450704225352, "large_model_usage": 0.35563380281690143, "small_model_usage": 0.6443661971830985, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002387834507042254, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.35563380281690143, "avg_cost": 0.0002531637323943662}, "random_baseline": {"accuracy": 0.6359154929577465, "accuracy_std": 0.020961797276249604, "avg_cost": 0.00023591813380281694, "cost_std": 2.4835100108766758e-05, "small_model_fraction": 0.6456103540719972}, "unrestricted_random_baseline": {"accuracy": 0.6461267605633803, "accuracy_std": 0.012323943661971839, "avg_cost": 0.000259542323943662, "cost_std": 1.3301471771180361e-05, "small_model_fraction": 0.6209639525255352}}, "0.25": {"accuracy": 0.5774647887323944, "large_model_usage": 0.2535211267605634, "small_model_usage": 0.7464788732394366, "avg_lambda": 0.29000000000000087, "avg_cost": 0.00017303697183098594, "unrestricted": {"accuracy": 0.6408450704225352, "large_model_usage": 0.2535211267605634, "avg_cost": 0.00019029049295774648}, "random_baseline": {"accuracy": 0.601056338028169, "accuracy_std": 0.013179554417051322, "avg_cost": 0.00016545549295774652, "cost_std": 9.351200699796332e-06, "small_model_fraction": 0.7582934122687506}, "unrestricted_random_baseline": {"accuracy": 0.6109154929577464, "accuracy_std": 0.016831575215094548, "avg_cost": 0.00018630014084507042, "cost_std": 1.4246718276691835e-05, "small_model_fraction": 0.7287225583075203}}}}, {"trial": 28, "trial_seed": 69, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5497076023391813, "large_model_accuracy": 0.8508771929824561, "small_model_cost": 3.2026169590643274e-05, "large_model_cost": 0.0006146286549707607, "hybrid_results": {"0.05": {"accuracy": 0.7605633802816901, "large_model_usage": 0.823943661971831, "small_model_usage": 0.176056338028169, "avg_lambda": 0.8600000000000062, "avg_cost": 0.0005324834507042254, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.823943661971831, "avg_cost": 0.0005420538732394365}, "random_baseline": {"accuracy": 0.7697183098591549, "accuracy_std": 0.013745930490100802, "avg_cost": 0.0005323802816901413, "cost_std": 9.673330425121887e-06, "small_model_fraction": 0.14099700280705094}, "unrestricted_random_baseline": {"accuracy": 0.7732394366197184, "accuracy_std": 0.009208941430015495, "avg_cost": 0.0005455860915492963, "cost_std": 9.124009802779964e-06, "small_model_fraction": 0.12456998305452291}}, "0.1": {"accuracy": 0.7112676056338029, "large_model_usage": 0.6443661971830986, "small_model_usage": 0.3556338028169014, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004135045774647886, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.6443661971830986, "avg_cost": 0.0004323144366197182}, "random_baseline": {"accuracy": 0.702112676056338, "accuracy_std": 0.014102101686268178, "avg_cost": 0.0004079953169014084, "cost_std": 1.3882559168571447e-05, "small_model_fraction": 0.34521664866353813}, "unrestricted_random_baseline": {"accuracy": 0.7295774647887324, "accuracy_std": 0.016120455129943107, "avg_cost": 0.00043587137323943665, "cost_std": 1.2964435094412982e-05, "small_model_fraction": 0.31293072536773003}}, "0.15": {"accuracy": 0.6549295774647887, "large_model_usage": 0.44014084507042256, "small_model_usage": 0.5598591549295775, "avg_lambda": 0.5, "avg_cost": 0.0002907158450704222, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.44014084507042256, "avg_cost": 0.0003055961267605631}, "random_baseline": {"accuracy": 0.6394366197183099, "accuracy_std": 0.014102101686268171, "avg_cost": 0.00028781249999999993, "cost_std": 2.2164946376624687e-05, "small_model_fraction": 0.5559756747158441}, "unrestricted_random_baseline": {"accuracy": 0.6475352112676056, "accuracy_std": 0.019250615398361464, "avg_cost": 0.00030273077464788715, "cost_std": 1.617019100911844e-05, "small_model_fraction": 0.530434620457498}}, "0.2": {"accuracy": 0.5880281690140845, "large_model_usage": 0.29225352112676056, "small_model_usage": 0.7077464788732395, "avg_lambda": 0.35999999999999954, "avg_cost": 0.00020007499999999984, "unrestricted": {"accuracy": 0.6443661971830986, "large_model_usage": 0.29225352112676056, "avg_cost": 0.00021257499999999987}, "random_baseline": {"accuracy": 0.5904929577464788, "accuracy_std": 0.014088907763198964, "avg_cost": 0.00020127813380281675, "cost_std": 1.0787150655216228e-05, "small_model_fraction": 0.7115549029975153}, "unrestricted_random_baseline": {"accuracy": 0.606338028169014, "accuracy_std": 0.009934321112440777, "avg_cost": 0.00021681468309859143, "cost_std": 1.15274576858696e-05, "small_model_fraction": 0.6900994504141911}}, "0.25": {"accuracy": 0.5704225352112676, "large_model_usage": 0.2852112676056338, "small_model_usage": 0.7147887323943662, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00019156795774647874, "unrestricted": {"accuracy": 0.6408450704225352, "large_model_usage": 0.2852112676056338, "avg_cost": 0.00020819471830985897}, "random_baseline": {"accuracy": 0.5890845070422535, "accuracy_std": 0.012990047004011756, "avg_cost": 0.00018886725352112665, "cost_std": 1.1209741216028614e-05, "small_model_fraction": 0.7261566983330962}, "unrestricted_random_baseline": {"accuracy": 0.5982394366197183, "accuracy_std": 0.012038958448787843, "avg_cost": 0.00021182616197183086, "cost_std": 1.5641428111259535e-05, "small_model_fraction": 0.6976179245025449}}}}, {"trial": 29, "trial_seed": 70, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5760233918128655, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.206681286549711e-05, "large_model_cost": 0.0006161666666666675, "hybrid_results": {"0.05": {"accuracy": 0.8309859154929577, "large_model_usage": 0.823943661971831, "small_model_usage": 0.176056338028169, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005399309859154926, "unrestricted": {"accuracy": 0.8309859154929577, "large_model_usage": 0.823943661971831, "avg_cost": 0.0005412619718309856}, "random_baseline": {"accuracy": 0.812323943661972, "accuracy_std": 0.012202622148706644, "avg_cost": 0.0005438326056338026, "cost_std": 1.1993422481960203e-05, "small_model_fraction": 0.13051823289297695}, "unrestricted_random_baseline": {"accuracy": 0.815492957746479, "accuracy_std": 0.007077377197972449, "avg_cost": 0.0005367974295774644, "cost_std": 1.4902596356850469e-05, "small_model_fraction": 0.12823953703843888}}, "0.1": {"accuracy": 0.8063380281690141, "large_model_usage": 0.647887323943662, "small_model_usage": 0.352112676056338, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00042814225352112646, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.647887323943662, "avg_cost": 0.0004323183098591546}, "random_baseline": {"accuracy": 0.7799295774647886, "accuracy_std": 0.01917317378881971, "avg_cost": 0.00043326369718309845, "cost_std": 1.2384301718025739e-05, "small_model_fraction": 0.3219045714905199}, "unrestricted_random_baseline": {"accuracy": 0.7714788732394365, "accuracy_std": 0.010138154964005597, "avg_cost": 0.00042634873239436597, "cost_std": 1.858049940057072e-05, "small_model_fraction": 0.31475501253950927}}, "0.15": {"accuracy": 0.7535211267605634, "large_model_usage": 0.4471830985915493, "small_model_usage": 0.5528169014084507, "avg_lambda": 0.5800000000000017, "avg_cost": 0.0002983887323943661, "unrestricted": {"accuracy": 0.7676056338028169, "large_model_usage": 0.4471830985915493, "avg_cost": 0.00030907183098591545}, "random_baseline": {"accuracy": 0.719718309859155, "accuracy_std": 0.015683843275577546, "avg_cost": 0.0003003427816901408, "cost_std": 1.8236833669594454e-05, "small_model_fraction": 0.5440472758284136}, "unrestricted_random_baseline": {"accuracy": 0.723943661971831, "accuracy_std": 0.00907330896248247, "avg_cost": 0.0003002292253521127, "cost_std": 1.844770742314085e-05, "small_model_fraction": 0.525757426032995}}, "0.2": {"accuracy": 0.7112676056338029, "large_model_usage": 0.31338028169014087, "small_model_usage": 0.6866197183098591, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00021524084507042258, "unrestricted": {"accuracy": 0.7394366197183099, "large_model_usage": 0.31338028169014087, "avg_cost": 0.00022611408450704234}, "random_baseline": {"accuracy": 0.6816901408450703, "accuracy_std": 0.007746478873239427, "avg_cost": 0.00021285873239436617, "cost_std": 1.3362641668756404e-05, "small_model_fraction": 0.6863994554820099}, "unrestricted_random_baseline": {"accuracy": 0.6869718309859155, "accuracy_std": 0.016170369722823992, "avg_cost": 0.00022375454225352114, "cost_std": 1.0634776893696715e-05, "small_model_fraction": 0.6677840777073716}}, "0.25": {"accuracy": 0.6408450704225352, "large_model_usage": 0.2323943661971831, "small_model_usage": 0.7676056338028169, "avg_lambda": 0.2700000000000005, "avg_cost": 0.0001594943661971831, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.2323943661971831, "avg_cost": 0.00017622676056338032}, "random_baseline": {"accuracy": 0.6595070422535212, "accuracy_std": 0.014522246920634686, "avg_cost": 0.00015873834507042252, "cost_std": 1.6909437117024655e-05, "small_model_fraction": 0.7818394363524995}, "unrestricted_random_baseline": {"accuracy": 0.6591549295774648, "accuracy_std": 0.011223505246837469, "avg_cost": 0.00017184845070422535, "cost_std": 1.5970180947880138e-05, "small_model_fraction": 0.753192974181165}}}}, {"trial": 30, "trial_seed": 71, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.564327485380117, "large_model_accuracy": 0.8260233918128655, "small_model_cost": 3.202997076023393e-05, "large_model_cost": 0.0006142368421052644, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.8415492957746479, "small_model_usage": 0.15845070422535212, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005495007042253524, "unrestricted": {"accuracy": 0.8133802816901409, "large_model_usage": 0.8415492957746479, "avg_cost": 0.0005509161971830989}, "random_baseline": {"accuracy": 0.7883802816901408, "accuracy_std": 0.006386745474372239, "avg_cost": 0.0005479576056338026, "cost_std": 6.174367132926547e-06, "small_model_fraction": 0.1111909547380587}, "unrestricted_random_baseline": {"accuracy": 0.7876760563380281, "accuracy_std": 0.007224748073480024, "avg_cost": 0.0005583145070422533, "cost_std": 8.381575788943983e-06, "small_model_fraction": 0.10875970044097974}}, "0.1": {"accuracy": 0.7711267605633803, "large_model_usage": 0.6232394366197183, "small_model_usage": 0.37676056338028174, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004021556338028168, "unrestricted": {"accuracy": 0.7922535211267606, "large_model_usage": 0.6232394366197183, "avg_cost": 0.00041674718309859155}, "random_baseline": {"accuracy": 0.7077464788732395, "accuracy_std": 0.020471044864434716, "avg_cost": 0.00041307890845070403, "cost_std": 7.164885106671139e-06, "small_model_fraction": 0.36427122169219983}, "unrestricted_random_baseline": {"accuracy": 0.721830985915493, "accuracy_std": 0.010795570223067532, "avg_cost": 0.00040805394366197155, "cost_std": 1.122471967906713e-05, "small_model_fraction": 0.339208739584998}}, "0.15": {"accuracy": 0.7183098591549296, "large_model_usage": 0.4471830985915493, "small_model_usage": 0.5528169014084507, "avg_lambda": 0.5, "avg_cost": 0.00029419788732394344, "unrestricted": {"accuracy": 0.7464788732394366, "large_model_usage": 0.4471830985915493, "avg_cost": 0.0003084021126760562}, "random_baseline": {"accuracy": 0.6640845070422535, "accuracy_std": 0.01789896482695608, "avg_cost": 0.00029658271126760556, "cost_std": 1.0034763443738988e-05, "small_model_fraction": 0.5496997210664281}, "unrestricted_random_baseline": {"accuracy": 0.6764084507042253, "accuracy_std": 0.013858217362080564, "avg_cost": 0.00030850929577464777, "cost_std": 1.7679461258247304e-05, "small_model_fraction": 0.5253025075479792}}, "0.2": {"accuracy": 0.6866197183098591, "large_model_usage": 0.35563380281690143, "small_model_usage": 0.6443661971830985, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00024038098591549277, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.35563380281690143, "avg_cost": 0.0002525077464788731}, "random_baseline": {"accuracy": 0.6461267605633803, "accuracy_std": 0.016757752160972954, "avg_cost": 0.00025123936619718304, "cost_std": 1.4148129248872781e-05, "small_model_fraction": 0.6421357675255179}, "unrestricted_random_baseline": {"accuracy": 0.6461267605633803, "accuracy_std": 0.017552737437485227, "avg_cost": 0.0002440120774647887, "cost_std": 1.6716260230870385e-05, "small_model_fraction": 0.6213068128012896}}, "0.25": {"accuracy": 0.6056338028169014, "large_model_usage": 0.29577464788732394, "small_model_usage": 0.704225352112676, "avg_lambda": 0.25, "avg_cost": 0.00019326830985915488, "unrestricted": {"accuracy": 0.6901408450704225, "large_model_usage": 0.29577464788732394, "avg_cost": 0.0002160570422535211}, "random_baseline": {"accuracy": 0.6232394366197183, "accuracy_std": 0.015668024972737345, "avg_cost": 0.00020417130281690128, "cost_std": 1.3063795855591376e-05, "small_model_fraction": 0.7230566195029207}, "unrestricted_random_baseline": {"accuracy": 0.6264084507042254, "accuracy_std": 0.018925851620248877, "avg_cost": 0.00021735052816901406, "cost_std": 1.6219830215359433e-05, "small_model_fraction": 0.6839146348992023}}}}], "unrestricted_hybrid": {"0.05": {"avg_accuracy": 0.8030516431924883, "std_accuracy": 0.024797222546628545, "avg_cost": 0.0005115815962441312, "std_cost": 3.100009310773766e-05}, "0.1": {"avg_accuracy": 0.7719483568075117, "std_accuracy": 0.03173813961925331, "avg_cost": 0.0004061428638497652, "std_cost": 3.0934190474947276e-05}, "0.15": {"avg_accuracy": 0.730281690140845, "std_accuracy": 0.02433741413762024, "avg_cost": 0.00030803159624413144, "std_cost": 1.586715144980458e-05}, "0.2": {"avg_accuracy": 0.6985915492957747, "std_accuracy": 0.03086564366311427, "avg_cost": 0.0002408722065727699, "std_cost": 2.0741230977426262e-05}, "0.25": {"avg_accuracy": 0.6740610328638499, "std_accuracy": 0.02783967293605384, "avg_cost": 0.00020268488262910797, "std_cost": 1.535803221334111e-05}}, "random_baseline": {"0.05": {"avg_accuracy": 0.7774530516431924, "std_accuracy": 0.023148471016873, "avg_cost": 0.0005094424671361501, "std_cost": 3.2576759554179306e-05, "avg_small_model_fraction": 0.18304727531807866, "std_small_model_fraction": 0.05473399504976769}, "0.1": {"avg_accuracy": 0.721431924882629, "std_accuracy": 0.025335109930500564, "avg_cost": 0.0003947105739436619, "std_cost": 2.949642523218521e-05, "avg_small_model_fraction": 0.37851695874118946, "std_small_model_fraction": 0.05087434031744665}, "0.15": {"avg_accuracy": 0.6732863849765258, "std_accuracy": 0.024078900957332127, "avg_cost": 0.0002956757464788731, "std_cost": 1.950516723481829e-05, "avg_small_model_fraction": 0.545285772862136, "std_small_model_fraction": 0.02750551437574078}, "0.2": {"avg_accuracy": 0.6394718309859154, "std_accuracy": 0.024931359515627978, "avg_cost": 0.00022670274647887317, "std_cost": 2.1628934531986468e-05, "avg_small_model_fraction": 0.6641186817071971, "std_small_model_fraction": 0.035114366164181836}, "0.25": {"avg_accuracy": 0.6206572769953053, "std_accuracy": 0.02418005600808857, "avg_cost": 0.0001839843920187793, "std_cost": 1.5616017451437584e-05, "avg_small_model_fraction": 0.7379994987325477, "std_small_model_fraction": 0.024038958116386382}}, "unrestricted_random_baseline": {"0.05": {"avg_accuracy": 0.7785915492957746, "std_accuracy": 0.02377380374755772, "avg_cost": 0.0005132961995305162, "std_cost": 3.243572837597377e-05, "avg_small_model_fraction": 0.1769416415608545, "std_small_model_fraction": 0.052931283562416255}, "0.1": {"avg_accuracy": 0.7272535211267604, "std_accuracy": 0.02629468452820097, "avg_cost": 0.00040677207629107964, "std_cost": 3.122032990102917e-05, "avg_small_model_fraction": 0.3579017305004713, "std_small_model_fraction": 0.053023363669192754}, "0.15": {"avg_accuracy": 0.678086854460094, "std_accuracy": 0.023473124976363186, "avg_cost": 0.00030709123708920174, "std_cost": 1.6667742812272763e-05, "avg_small_model_fraction": 0.5262968309637477, "std_small_model_fraction": 0.027131475814750738}, "0.2": {"avg_accuracy": 0.6477699530516431, "std_accuracy": 0.02389447677624292, "avg_cost": 0.00024075659976525814, "std_cost": 2.11170192419602e-05, "avg_small_model_fraction": 0.6415688689097861, "std_small_model_fraction": 0.035540223429936874}, "0.25": {"avg_accuracy": 0.6290610328638498, "std_accuracy": 0.02213735751640375, "avg_cost": 0.0002024925715962441, "std_cost": 1.5700343178196305e-05, "avg_small_model_fraction": 0.7070882292587805, "std_small_model_fraction": 0.026568105680107503}}}