{"subject": "TruthfulQA", "method": "standard", "iterations": 10, "random_seed": 42, "calibration_size": 400, "total_examples": 684, "small_model": {"avg_accuracy": 0.5558479532163743, "std_accuracy": 0.016408172919997903, "avg_cost": 3.224903508771932e-05, "std_cost": 4.067309268448036e-08, "x_position": 1.0, "x_std": 0.0}, "large_model": {"avg_accuracy": 0.8337719298245615, "std_accuracy": 0.008989742186925852, "avg_cost": 0.0006202976608187139, "std_cost": 1.3052290967908494e-06, "x_position": 0.0, "x_std": 0.0}, "hybrid_models": {"0.05": {"avg_accuracy": 0.801056338028169, "std_accuracy": 0.01897818729325616, "avg_large_model_usage": 0.778169014084507, "std_large_model_usage": 0.05679829591951891, "avg_lambda": 0.8620000000000028, "std_lambda": 0.04237924020083371, "avg_cost": 0.0005141432394366196, "std_cost": 3.598244025204188e-05}, "0.1": {"avg_accuracy": 0.7633802816901408, "std_accuracy": 0.02225842342151986, "avg_large_model_usage": 0.6221830985915493, "std_large_model_usage": 0.031219112884954745, "avg_lambda": 0.7149999999999979, "std_lambda": 0.02202271554554561, "avg_cost": 0.0004061228169014084, "std_cost": 1.9116102356824867e-05}, "0.15": {"avg_accuracy": 0.7024647887323945, "std_accuracy": 0.04432286056939193, "avg_large_model_usage": 0.4440140845070422, "std_large_model_usage": 0.07541214536346103, "avg_lambda": 0.5400000000000007, "std_lambda": 0.0753657747256665, "avg_cost": 0.00029755098591549296, "std_cost": 4.6809812868485e-05}, "0.2": {"avg_accuracy": 0.66443661971831, "std_accuracy": 0.04674088547434261, "avg_large_model_usage": 0.34225352112676055, "std_large_model_usage": 0.06368860841233463, "avg_lambda": 0.42700000000000077, "std_lambda": 0.07225648759800053, "avg_cost": 0.00023289042253521117, "std_cost": 4.056176850701634e-05}, "0.25": {"avg_accuracy": 0.6211267605633802, "std_accuracy": 0.04482215955202694, "avg_large_model_usage": 0.2683098591549296, "std_large_model_usage": 0.03824274361235343, "avg_lambda": 0.29700000000000004, "std_lambda": 0.05568662316930335, "avg_cost": 0.00018296577464788723, "std_cost": 2.3790837744147213e-05}}, "all_trials": [{"trial": 1, "trial_seed": 42, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5380116959064327, "large_model_accuracy": 0.8157894736842105, "small_model_cost": 3.2259795321637416e-05, "large_model_cost": 0.0006210087719298245, "hybrid_results": {"0.05": {"accuracy": 0.795774647887324, "large_model_usage": 0.7887323943661971, "small_model_usage": 0.21126760563380287, "avg_lambda": 0.8600000000000062, "avg_cost": 0.0005120799295774648, "unrestricted": {"accuracy": 0.8133802816901409, "large_model_usage": 0.7887323943661971, "avg_cost": 0.0005230095070422536}, "random_baseline": {"accuracy": 0.7869718309859155, "accuracy_std": 0.012424139974913661, "avg_cost": 0.0005099547887323942, "cost_std": 9.78453134240759e-06, "small_model_fraction": 0.18501746360546434}, "unrestricted_random_baseline": {"accuracy": 0.7901408450704226, "accuracy_std": 0.007250443761258484, "avg_cost": 0.0005186211971830984, "cost_std": 1.4276519837373907e-05, "small_model_fraction": 0.1664533931798058}}, "0.1": {"accuracy": 0.7852112676056338, "large_model_usage": 0.5845070422535211, "small_model_usage": 0.4154929577464789, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00039310105633802797, "unrestricted": {"accuracy": 0.7922535211267606, "large_model_usage": 0.5845070422535211, "avg_cost": 0.000395657394366197}, "random_baseline": {"accuracy": 0.7471830985915492, "accuracy_std": 0.014154754395944899, "avg_cost": 0.0003883797183098591, "cost_std": 1.7909366496525955e-05, "small_model_fraction": 0.3871050730394208}, "unrestricted_random_baseline": {"accuracy": 0.7517605633802817, "accuracy_std": 0.02090553184337659, "avg_cost": 0.00039405788732394355, "cost_std": 1.7695122070089936e-05, "small_model_fraction": 0.38276309007259474}}, "0.15": {"accuracy": 0.778169014084507, "large_model_usage": 0.5774647887323944, "small_model_usage": 0.4225352112676056, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00037251654929577454, "unrestricted": {"accuracy": 0.7922535211267606, "large_model_usage": 0.5774647887323944, "avg_cost": 0.0003914461267605632}, "random_baseline": {"accuracy": 0.7369718309859156, "accuracy_std": 0.013179554417051322, "avg_cost": 0.0003699444014084506, "cost_std": 1.3780222499932212e-05, "small_model_fraction": 0.42206820310011633}, "unrestricted_random_baseline": {"accuracy": 0.7373239436619718, "accuracy_std": 0.014788732394366176, "avg_cost": 0.0003908897183098591, "cost_std": 1.2182154615997834e-05, "small_model_fraction": 0.38991599865155335}}, "0.2": {"accuracy": 0.7464788732394366, "large_model_usage": 0.39436619718309857, "small_model_usage": 0.6056338028169015, "avg_lambda": 0.5800000000000017, "avg_cost": 0.0002677982394366194, "unrestricted": {"accuracy": 0.7605633802816901, "large_model_usage": 0.39436619718309857, "avg_cost": 0.0002771151408450701}, "random_baseline": {"accuracy": 0.6943661971830986, "accuracy_std": 0.014154754395944932, "avg_cost": 0.00026659063380281674, "cost_std": 1.5902230086433838e-05, "small_model_fraction": 0.5999340067273985}, "unrestricted_random_baseline": {"accuracy": 0.7007042253521126, "accuracy_std": 0.015103951119385388, "avg_cost": 0.0002705962676056336, "cost_std": 1.3344173126770167e-05, "small_model_fraction": 0.5841090936003713}}, "0.25": {"accuracy": 0.7112676056338029, "large_model_usage": 0.28169014084507044, "small_model_usage": 0.7183098591549295, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00019601654929577456, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.28169014084507044, "avg_cost": 0.0002071573943661971}, "random_baseline": {"accuracy": 0.671830985915493, "accuracy_std": 0.01287018794542045, "avg_cost": 0.0001949327112676055, "cost_std": 1.768085556477763e-05, "small_model_fraction": 0.7218564099804501}, "unrestricted_random_baseline": {"accuracy": 0.6725352112676056, "accuracy_std": 0.015668024972737296, "avg_cost": 0.0002063189084507041, "cost_std": 2.2062017288772068e-05, "small_model_fraction": 0.7029334979872853}}}}, {"trial": 2, "trial_seed": 43, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5687134502923976, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.22454678362573e-05, "large_model_cost": 0.000621143274853802, "hybrid_results": {"0.05": {"accuracy": 0.7640845070422535, "large_model_usage": 0.676056338028169, "small_model_usage": 0.323943661971831, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004490116197183096, "unrestricted": {"accuracy": 0.7746478873239436, "large_model_usage": 0.676056338028169, "avg_cost": 0.0004531031690140842}, "random_baseline": {"accuracy": 0.7380281690140844, "accuracy_std": 0.01328729736909379, "avg_cost": 0.00045562926056338026, "cost_std": 1.0656262175971104e-05, "small_model_fraction": 0.29229461051527444}, "unrestricted_random_baseline": {"accuracy": 0.7528169014084507, "accuracy_std": 0.01144230761216331, "avg_cost": 0.00045698200704225364, "cost_std": 1.7663134685146472e-05, "small_model_fraction": 0.2853468018343487}}, "0.1": {"accuracy": 0.7323943661971831, "large_model_usage": 0.5774647887323944, "small_model_usage": 0.4225352112676056, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00037748345070422504, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.5774647887323944, "avg_cost": 0.00039258908450704203}, "random_baseline": {"accuracy": 0.7098591549295775, "accuracy_std": 0.021079759925487013, "avg_cost": 0.00037191042253521134, "cost_std": 1.4845394619225004e-05, "small_model_fraction": 0.4137556996239889}, "unrestricted_random_baseline": {"accuracy": 0.7158450704225352, "accuracy_std": 0.018026655966436923, "avg_cost": 0.00039672827464788744, "cost_std": 1.2686507074052445e-05, "small_model_fraction": 0.3881050118088669}}, "0.15": {"accuracy": 0.6866197183098591, "large_model_usage": 0.397887323943662, "small_model_usage": 0.602112676056338, "avg_lambda": 0.5100000000000009, "avg_cost": 0.00026746936619718314, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.397887323943662, "avg_cost": 0.00028176514084507026}, "random_baseline": {"accuracy": 0.6584507042253522, "accuracy_std": 0.01666501347352002, "avg_cost": 0.00027085549295774653, "cost_std": 1.425190633267196e-05, "small_model_fraction": 0.600569240438829}, "unrestricted_random_baseline": {"accuracy": 0.6753521126760564, "accuracy_std": 0.011000351656206539, "avg_cost": 0.000280038133802817, "cost_std": 1.2980481519089246e-05, "small_model_fraction": 0.5762937643247513}}, "0.2": {"accuracy": 0.6197183098591549, "large_model_usage": 0.2676056338028169, "small_model_usage": 0.7323943661971831, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00018615246478873224, "unrestricted": {"accuracy": 0.6584507042253521, "large_model_usage": 0.2676056338028169, "avg_cost": 0.0001993707746478872}, "random_baseline": {"accuracy": 0.6295774647887324, "accuracy_std": 0.005633802816901433, "avg_cost": 0.00018361397887323936, "cost_std": 1.3710811455741638e-05, "small_model_fraction": 0.7386524535862474}, "unrestricted_random_baseline": {"accuracy": 0.6235915492957748, "accuracy_std": 0.013947396000238312, "avg_cost": 0.0001975654225352111, "cost_std": 1.4014988053716073e-05, "small_model_fraction": 0.7162066069526885}}, "0.25": {"accuracy": 0.5809859154929577, "large_model_usage": 0.2323943661971831, "small_model_usage": 0.7676056338028169, "avg_lambda": 0.20000000000000093, "avg_cost": 0.0001569482394366196, "unrestricted": {"accuracy": 0.6373239436619719, "large_model_usage": 0.2323943661971831, "avg_cost": 0.00017682147887323931}, "random_baseline": {"accuracy": 0.6070422535211267, "accuracy_std": 0.011267605633802802, "avg_cost": 0.00015244707746478862, "cost_std": 1.0800822916138847e-05, "small_model_fraction": 0.7882437833621495}, "unrestricted_random_baseline": {"accuracy": 0.6242957746478872, "accuracy_std": 0.010801311021252124, "avg_cost": 0.00018255626760563368, "cost_std": 1.684266999109943e-05, "small_model_fraction": 0.7544972840547958}}}}, {"trial": 3, "trial_seed": 44, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5716374269005848, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.227251461988303e-05, "large_model_cost": 0.0006202105263157896, "hybrid_results": {"0.05": {"accuracy": 0.823943661971831, "large_model_usage": 0.795774647887324, "small_model_usage": 0.204225352112676, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005317274647887322, "unrestricted": {"accuracy": 0.823943661971831, "large_model_usage": 0.795774647887324, "avg_cost": 0.0005321992957746476}, "random_baseline": {"accuracy": 0.8045774647887324, "accuracy_std": 0.012424139974913628, "avg_cost": 0.0005291285915492955, "cost_std": 1.2139423552650093e-05, "small_model_fraction": 0.15049726292033427}, "unrestricted_random_baseline": {"accuracy": 0.7996478873239437, "accuracy_std": 0.01302816901408453, "avg_cost": 0.0005328900704225349, "cost_std": 1.038480488886728e-05, "small_model_fraction": 0.14969474466750957}}, "0.1": {"accuracy": 0.778169014084507, "large_model_usage": 0.6232394366197183, "small_model_usage": 0.37676056338028174, "avg_lambda": 0.7299999999999969, "avg_cost": 0.00040516408450704206, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.6232394366197183, "avg_cost": 0.0004250161971830986}, "random_baseline": {"accuracy": 0.7566901408450704, "accuracy_std": 0.014641283722740678, "avg_cost": 0.000403003028169014, "cost_std": 1.4623994276343428e-05, "small_model_fraction": 0.36576380082731225}, "unrestricted_random_baseline": {"accuracy": 0.7721830985915493, "accuracy_std": 0.01613967131882512, "avg_cost": 0.0004347490492957745, "cost_std": 1.6423194024009375e-05, "small_model_fraction": 0.3319981447868172}}, "0.15": {"accuracy": 0.7359154929577465, "large_model_usage": 0.43309859154929575, "small_model_usage": 0.5669014084507042, "avg_lambda": 0.5800000000000017, "avg_cost": 0.0002954457746478874, "unrestricted": {"accuracy": 0.7570422535211268, "large_model_usage": 0.43309859154929575, "avg_cost": 0.0003058190140845071}, "random_baseline": {"accuracy": 0.7225352112676057, "accuracy_std": 0.012177194694218737, "avg_cost": 0.0003075032394366197, "cost_std": 1.2351766089938974e-05, "small_model_fraction": 0.5523792393200069}, "unrestricted_random_baseline": {"accuracy": 0.7147887323943662, "accuracy_std": 0.009707076585979055, "avg_cost": 0.0003045097887323944, "cost_std": 1.1683202380279115e-05, "small_model_fraction": 0.5347358156422316}}, "0.2": {"accuracy": 0.7112676056338029, "large_model_usage": 0.323943661971831, "small_model_usage": 0.676056338028169, "avg_lambda": 0.45999999999999747, "avg_cost": 0.00022533309859154924, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.323943661971831, "avg_cost": 0.00023649507042253508}, "random_baseline": {"accuracy": 0.6936619718309859, "accuracy_std": 0.017745919955428388, "avg_cost": 0.00021807975352112672, "cost_std": 1.2655045692063328e-05, "small_model_fraction": 0.6716310561129001}, "unrestricted_random_baseline": {"accuracy": 0.6961267605633803, "accuracy_std": 0.01674294849009684, "avg_cost": 0.00024657820422535213, "cost_std": 1.4380105247715707e-05, "small_model_fraction": 0.6526461093856267}}, "0.25": {"accuracy": 0.6725352112676056, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00017735422535211277, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.25704225352112675, "avg_cost": 0.00019434014084507043}, "random_baseline": {"accuracy": 0.6746478873239437, "accuracy_std": 0.016074242550018752, "avg_cost": 0.00016871714788732384, "cost_std": 1.8186352027569182e-05, "small_model_fraction": 0.7532363823292498}, "unrestricted_random_baseline": {"accuracy": 0.6841549295774648, "accuracy_std": 0.01045128315468719, "avg_cost": 0.00020709665492957738, "cost_std": 1.193701486762333e-05, "small_model_fraction": 0.7243457252275568}}}}, {"trial": 4, "trial_seed": 45, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.533625730994152, "large_model_accuracy": 0.8362573099415205, "small_model_cost": 3.225190058479537e-05, "large_model_cost": 0.0006206286549707611, "hybrid_results": {"0.05": {"accuracy": 0.795774647887324, "large_model_usage": 0.7605633802816901, "small_model_usage": 0.23943661971830987, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0005066084507042255, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.7605633802816901, "avg_cost": 0.0005106225352112676}, "random_baseline": {"accuracy": 0.7440140845070422, "accuracy_std": 0.01823182178309805, "avg_cost": 0.0005082648239436622, "cost_std": 1.2344524312870964e-05, "small_model_fraction": 0.1937877447002948}, "unrestricted_random_baseline": {"accuracy": 0.7482394366197183, "accuracy_std": 0.010938186314795104, "avg_cost": 0.0005236319366197184, "cost_std": 1.294443648356544e-05, "small_model_fraction": 0.18696544168250953}}, "0.1": {"accuracy": 0.7570422535211268, "large_model_usage": 0.676056338028169, "small_model_usage": 0.323943661971831, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00044293943661971846, "unrestricted": {"accuracy": 0.7711267605633803, "large_model_usage": 0.676056338028169, "avg_cost": 0.0004577492957746482}, "random_baseline": {"accuracy": 0.7070422535211268, "accuracy_std": 0.01580981994411527, "avg_cost": 0.0004379432042253519, "cost_std": 1.3534310215763336e-05, "small_model_fraction": 0.30199904572450426}, "unrestricted_random_baseline": {"accuracy": 0.7130281690140845, "accuracy_std": 0.015528927514929566, "avg_cost": 0.0004575738380281689, "cost_std": 2.022702076984208e-05, "small_model_fraction": 0.27682833827467407}}, "0.15": {"accuracy": 0.6654929577464789, "large_model_usage": 0.5035211267605634, "small_model_usage": 0.4964788732394366, "avg_lambda": 0.5, "avg_cost": 0.00033211549295774647, "unrestricted": {"accuracy": 0.7359154929577465, "large_model_usage": 0.5035211267605634, "avg_cost": 0.0003492000000000001}, "random_baseline": {"accuracy": 0.6323943661971831, "accuracy_std": 0.022502176491085164, "avg_cost": 0.0003274713732394364, "cost_std": 2.00850547391704e-05, "small_model_fraction": 0.4903544537787002}, "unrestricted_random_baseline": {"accuracy": 0.6584507042253521, "accuracy_std": 0.022927916332534794, "avg_cost": 0.0003464370070422532, "cost_std": 1.7827453865412186e-05, "small_model_fraction": 0.4613177746187916}}, "0.2": {"accuracy": 0.6161971830985915, "large_model_usage": 0.4084507042253521, "small_model_usage": 0.5915492957746479, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002703901408450703, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.4084507042253521, "avg_cost": 0.0002880521126760563}, "random_baseline": {"accuracy": 0.6183098591549296, "accuracy_std": 0.02626486788587539, "avg_cost": 0.0002774628521126758, "cost_std": 1.912501719251029e-05, "small_model_fraction": 0.595262323867982}, "unrestricted_random_baseline": {"accuracy": 0.6154929577464788, "accuracy_std": 0.014500887522516923, "avg_cost": 0.00028730940140845045, "cost_std": 1.8652906323250465e-05, "small_model_fraction": 0.5652441905897253}}, "0.25": {"accuracy": 0.5774647887323944, "large_model_usage": 0.3274647887323944, "small_model_usage": 0.6725352112676056, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00021804507042253523, "unrestricted": {"accuracy": 0.6584507042253521, "large_model_usage": 0.3274647887323944, "avg_cost": 0.00023765774647887316}, "random_baseline": {"accuracy": 0.5616197183098592, "accuracy_std": 0.007086130914612, "avg_cost": 0.0002061217957746478, "cost_std": 1.477061360005228e-05, "small_model_fraction": 0.6842275490104381}, "unrestricted_random_baseline": {"accuracy": 0.591549295774648, "accuracy_std": 0.009707076585979017, "avg_cost": 0.00024363422535211257, "cost_std": 1.8428747276075024e-05, "small_model_fraction": 0.6508940158445913}}}}, {"trial": 5, "trial_seed": 46, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5730994152046783, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.223815789473688e-05, "large_model_cost": 0.0006194181286549714, "hybrid_results": {"0.05": {"accuracy": 0.7922535211267606, "large_model_usage": 0.6795774647887324, "small_model_usage": 0.3204225352112676, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004543961267605635, "unrestricted": {"accuracy": 0.7922535211267606, "large_model_usage": 0.6795774647887324, "avg_cost": 0.000459029929577465}, "random_baseline": {"accuracy": 0.775, "accuracy_std": 0.011184774770611703, "avg_cost": 0.00045181517605633796, "cost_std": 1.740565091156539e-05, "small_model_fraction": 0.28104160583125876}, "unrestricted_random_baseline": {"accuracy": 0.772887323943662, "accuracy_std": 0.01576662899805501, "avg_cost": 0.0004560499647887323, "cost_std": 1.44425562591804e-05, "small_model_fraction": 0.27314998307903515}}, "0.1": {"accuracy": 0.7711267605633803, "large_model_usage": 0.5915492957746479, "small_model_usage": 0.4084507042253521, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00038583978873239436, "unrestricted": {"accuracy": 0.7887323943661971, "large_model_usage": 0.5915492957746479, "avg_cost": 0.00040348063380281693}, "random_baseline": {"accuracy": 0.7577464788732394, "accuracy_std": 0.01773194128452698, "avg_cost": 0.0004016902816901408, "cost_std": 1.5473833770152816e-05, "small_model_fraction": 0.39779684518216474}, "unrestricted_random_baseline": {"accuracy": 0.7591549295774647, "accuracy_std": 0.015919231768249716, "avg_cost": 0.0004150496478873239, "cost_std": 1.494048944500082e-05, "small_model_fraction": 0.3677535093245356}}, "0.15": {"accuracy": 0.7077464788732394, "large_model_usage": 0.38028169014084506, "small_model_usage": 0.619718309859155, "avg_lambda": 0.5, "avg_cost": 0.00025901584507042247, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.38028169014084506, "avg_cost": 0.000271213028169014}, "random_baseline": {"accuracy": 0.7024647887323944, "accuracy_std": 0.015287530525417209, "avg_cost": 0.0002578299295774647, "cost_std": 2.1466231959424247e-05, "small_model_fraction": 0.6137850429706047}, "unrestricted_random_baseline": {"accuracy": 0.7035211267605633, "accuracy_std": 0.0126760563380282, "avg_cost": 0.0002718010915492956, "cost_std": 1.4938750714115405e-05, "small_model_fraction": 0.593012564844691}}, "0.2": {"accuracy": 0.676056338028169, "large_model_usage": 0.29577464788732394, "small_model_usage": 0.704225352112676, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00020430457746478865, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.29577464788732394, "avg_cost": 0.00021753697183098586}, "random_baseline": {"accuracy": 0.681338028169014, "accuracy_std": 0.020726848806706243, "avg_cost": 0.00021269936619718304, "cost_std": 2.0718751974151252e-05, "small_model_fraction": 0.7069613608460219}, "unrestricted_random_baseline": {"accuracy": 0.6806338028169014, "accuracy_std": 0.011251088245542561, "avg_cost": 0.0002170459154929577, "cost_std": 1.1875785346816022e-05, "small_model_fraction": 0.6844258606158881}}, "0.25": {"accuracy": 0.6514084507042254, "large_model_usage": 0.21830985915492956, "small_model_usage": 0.7816901408450705, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0001553045774647886, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.21830985915492956, "avg_cost": 0.00016891725352112665}, "random_baseline": {"accuracy": 0.657394366197183, "accuracy_std": 0.013273293538711787, "avg_cost": 0.00015172447183098582, "cost_std": 1.612661632665832e-05, "small_model_fraction": 0.790411073779109}, "unrestricted_random_baseline": {"accuracy": 0.6640845070422536, "accuracy_std": 0.012812257323000162, "avg_cost": 0.00016701116197183092, "cost_std": 1.6513788505297896e-05, "small_model_fraction": 0.7672279327759963}}}}, {"trial": 6, "trial_seed": 47, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5526315789473685, "large_model_accuracy": 0.8406432748538012, "small_model_cost": 3.234736842105265e-05, "large_model_cost": 0.0006228888888888894, "hybrid_results": {"0.05": {"accuracy": 0.823943661971831, "large_model_usage": 0.8274647887323944, "small_model_usage": 0.17253521126760563, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005456257042253515, "unrestricted": {"accuracy": 0.8204225352112676, "large_model_usage": 0.8274647887323944, "avg_cost": 0.0005465693661971825}, "random_baseline": {"accuracy": 0.8095070422535212, "accuracy_std": 0.013028169014084514, "avg_cost": 0.000541901514084507, "cost_std": 1.6322490386683704e-05, "small_model_fraction": 0.13083446630870035}, "unrestricted_random_baseline": {"accuracy": 0.8024647887323944, "accuracy_std": 0.010498627828081402, "avg_cost": 0.0005463147887323943, "cost_std": 6.257875211068922e-06, "small_model_fraction": 0.12923650589588573}}, "0.1": {"accuracy": 0.7570422535211268, "large_model_usage": 0.6514084507042254, "small_model_usage": 0.3485915492957746, "avg_lambda": 0.75, "avg_cost": 0.0004173158450704225, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6514084507042254, "avg_cost": 0.00043723133802816904}, "random_baseline": {"accuracy": 0.7366197183098592, "accuracy_std": 0.01565219068372735, "avg_cost": 0.0004154259507042252, "cost_std": 1.7463622068917145e-05, "small_model_fraction": 0.3481093821406639}, "unrestricted_random_baseline": {"accuracy": 0.7447183098591549, "accuracy_std": 0.013291962033927358, "avg_cost": 0.00043496257042253516, "cost_std": 7.990573976068326e-06, "small_model_fraction": 0.3143852623836498}}, "0.15": {"accuracy": 0.6901408450704225, "large_model_usage": 0.4225352112676056, "small_model_usage": 0.5774647887323944, "avg_lambda": 0.540000000000001, "avg_cost": 0.00028603415492957753, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.4225352112676056, "avg_cost": 0.0002954073943661972}, "random_baseline": {"accuracy": 0.6661971830985915, "accuracy_std": 0.01996823504050395, "avg_cost": 0.0002856027464788732, "cost_std": 1.5597070019106676e-05, "small_model_fraction": 0.5704166807652237}, "unrestricted_random_baseline": {"accuracy": 0.6778169014084506, "accuracy_std": 0.015206213296964192, "avg_cost": 0.00030001563380281686, "cost_std": 1.972251026136685e-05, "small_model_fraction": 0.5545444023364452}}, "0.2": {"accuracy": 0.6725352112676056, "large_model_usage": 0.34507042253521125, "small_model_usage": 0.6549295774647887, "avg_lambda": 0.4200000000000014, "avg_cost": 0.00023341443661971834, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.34507042253521125, "avg_cost": 0.00024655528169014086}, "random_baseline": {"accuracy": 0.6443661971830986, "accuracy_std": 0.016590449280830626, "avg_cost": 0.00024036802816901419, "cost_std": 1.4031794588297688e-05, "small_model_fraction": 0.6595208613962028}, "unrestricted_random_baseline": {"accuracy": 0.6605633802816901, "accuracy_std": 0.021197066117313257, "avg_cost": 0.00024520584507042257, "cost_std": 1.776763001594904e-05, "small_model_fraction": 0.6372686663938733}}, "0.25": {"accuracy": 0.6232394366197183, "large_model_usage": 0.27816901408450706, "small_model_usage": 0.721830985915493, "avg_lambda": 0.3199999999999981, "avg_cost": 0.0001878721830985915, "unrestricted": {"accuracy": 0.6795774647887324, "large_model_usage": 0.27816901408450706, "avg_cost": 0.0002043792253521126}, "random_baseline": {"accuracy": 0.619718309859155, "accuracy_std": 0.02193297394703865, "avg_cost": 0.00018488390845070425, "cost_std": 1.9575765134226798e-05, "small_model_fraction": 0.7366403389310721}, "unrestricted_random_baseline": {"accuracy": 0.6299295774647887, "accuracy_std": 0.026182128795804657, "avg_cost": 0.0001988295422535212, "cost_std": 1.6057504481389974e-05, "small_model_fraction": 0.708687956784523}}}}, {"trial": 7, "trial_seed": 48, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5350877192982456, "large_model_accuracy": 0.8260233918128655, "small_model_cost": 3.2208040935672557e-05, "large_model_cost": 0.000620862573099416, "hybrid_results": {"0.05": {"accuracy": 0.778169014084507, "large_model_usage": 0.8098591549295775, "small_model_usage": 0.1901408450704225, "avg_lambda": 0.879999999999997, "avg_cost": 0.0005299172535211266, "unrestricted": {"accuracy": 0.7922535211267606, "large_model_usage": 0.8098591549295775, "avg_cost": 0.0005386707746478871}, "random_baseline": {"accuracy": 0.7669014084507043, "accuracy_std": 0.011657003772709033, "avg_cost": 0.0005310804929577464, "cost_std": 8.238987298450398e-06, "small_model_fraction": 0.15449693259643768}, "unrestricted_random_baseline": {"accuracy": 0.7619718309859155, "accuracy_std": 0.014013907564881951, "avg_cost": 0.0005402567253521127, "cost_std": 1.60070737627336e-05, "small_model_fraction": 0.13962654487584245}}, "0.1": {"accuracy": 0.7253521126760564, "large_model_usage": 0.6338028169014085, "small_model_usage": 0.3661971830985915, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004137341549295772, "unrestricted": {"accuracy": 0.7535211267605634, "large_model_usage": 0.6338028169014085, "avg_cost": 0.0004291003521126758}, "random_baseline": {"accuracy": 0.7091549295774648, "accuracy_std": 0.011909531355836478, "avg_cost": 0.0004228588028169014, "cost_std": 1.389968537222111e-05, "small_model_fraction": 0.3518675332516132}, "unrestricted_random_baseline": {"accuracy": 0.7098591549295775, "accuracy_std": 0.016455382317673156, "avg_cost": 0.0004265857042253521, "cost_std": 1.752082685663917e-05, "small_model_fraction": 0.325763602433963}}, "0.15": {"accuracy": 0.6654929577464789, "large_model_usage": 0.4753521126760563, "small_model_usage": 0.5246478873239437, "avg_lambda": 0.5500000000000005, "avg_cost": 0.00032243838028168985, "unrestricted": {"accuracy": 0.6971830985915493, "large_model_usage": 0.4753521126760563, "avg_cost": 0.00033047359154929546}, "random_baseline": {"accuracy": 0.6524647887323944, "accuracy_std": 0.012202622148706644, "avg_cost": 0.00032226573943661965, "cost_std": 1.0340591400689735e-05, "small_model_fraction": 0.5069598151581288}, "unrestricted_random_baseline": {"accuracy": 0.6588028169014084, "accuracy_std": 0.018794374852228873, "avg_cost": 0.00032687936619718306, "cost_std": 2.199776449402478e-05, "small_model_fraction": 0.49330968451516877}}, "0.2": {"accuracy": 0.6338028169014085, "large_model_usage": 0.4119718309859155, "small_model_usage": 0.5880281690140845, "avg_lambda": 0.450000000000002, "avg_cost": 0.0002745017605633802, "unrestricted": {"accuracy": 0.6619718309859155, "large_model_usage": 0.4119718309859155, "avg_cost": 0.000290994718309859}, "random_baseline": {"accuracy": 0.6383802816901408, "accuracy_std": 0.013458813277194654, "avg_cost": 0.0002756028873239436, "cost_std": 8.859379944073629e-06, "small_model_fraction": 0.5883940301332635}, "unrestricted_random_baseline": {"accuracy": 0.6443661971830986, "accuracy_std": 0.014771955608030319, "avg_cost": 0.00029575746478873236, "cost_std": 1.5351222720845292e-05, "small_model_fraction": 0.5603759705663816}}, "0.25": {"accuracy": 0.6197183098591549, "large_model_usage": 0.31690140845070425, "small_model_usage": 0.6830985915492958, "avg_lambda": 0.3199999999999981, "avg_cost": 0.00021381866197183087, "unrestricted": {"accuracy": 0.6549295774647887, "large_model_usage": 0.31690140845070425, "avg_cost": 0.00023063556338028156}, "random_baseline": {"accuracy": 0.5961267605633803, "accuracy_std": 0.01666873292752764, "avg_cost": 0.0002072104577464788, "cost_std": 1.6773076495421326e-05, "small_model_fraction": 0.6914818265840846}, "unrestricted_random_baseline": {"accuracy": 0.6169014084507042, "accuracy_std": 0.01627354930745262, "avg_cost": 0.00022898214788732394, "cost_std": 1.567318664260946e-05, "small_model_fraction": 0.6629134550017984}}}}, {"trial": 8, "trial_seed": 49, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5628654970760234, "large_model_accuracy": 0.8508771929824561, "small_model_cost": 3.223859649122807e-05, "large_model_cost": 0.0006186198830409359, "hybrid_results": {"0.05": {"accuracy": 0.8204225352112676, "large_model_usage": 0.8063380281690141, "small_model_usage": 0.19366197183098588, "avg_lambda": 0.879999999999997, "avg_cost": 0.0005282165492957745, "unrestricted": {"accuracy": 0.8345070422535211, "large_model_usage": 0.8063380281690141, "avg_cost": 0.000537674295774648}, "random_baseline": {"accuracy": 0.8084507042253521, "accuracy_std": 0.01936299589417961, "avg_cost": 0.0005294668661971831, "cost_std": 1.644405820650126e-05, "small_model_fraction": 0.15417158735930744}, "unrestricted_random_baseline": {"accuracy": 0.8031690140845071, "accuracy_std": 0.0076254957139816, "avg_cost": 0.0005430282394366197, "cost_std": 8.433190750180054e-06, "small_model_fraction": 0.13804258274095882}}, "0.1": {"accuracy": 0.8028169014084507, "large_model_usage": 0.6267605633802817, "small_model_usage": 0.37323943661971826, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00041355457746478894, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.6267605633802817, "avg_cost": 0.00042613908450704244}, "random_baseline": {"accuracy": 0.7447183098591549, "accuracy_std": 0.017833038397324766, "avg_cost": 0.0004182877112676054, "cost_std": 1.4321043740833616e-05, "small_model_fraction": 0.34971325020066696}, "unrestricted_random_baseline": {"accuracy": 0.7602112676056338, "accuracy_std": 0.01956999046702852, "avg_cost": 0.0004358315492957746, "cost_std": 1.3599813004675902e-05, "small_model_fraction": 0.3282519462148231}}, "0.15": {"accuracy": 0.778169014084507, "large_model_usage": 0.5246478873239436, "small_model_usage": 0.47535211267605637, "avg_lambda": 0.6100000000000032, "avg_cost": 0.00035210387323943686, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.5246478873239436, "avg_cost": 0.0003620334507042256}, "random_baseline": {"accuracy": 0.7257042253521127, "accuracy_std": 0.018794374852228894, "avg_cost": 0.00035330323943661964, "cost_std": 9.57727744845766e-06, "small_model_fraction": 0.45450974632852026}, "unrestricted_random_baseline": {"accuracy": 0.7225352112676057, "accuracy_std": 0.013156015276246064, "avg_cost": 0.0003536400352112675, "cost_std": 1.071590598801006e-05, "small_model_fraction": 0.43757609293174354}}, "0.2": {"accuracy": 0.7253521126760564, "large_model_usage": 0.4295774647887324, "small_model_usage": 0.5704225352112676, "avg_lambda": 0.5, "avg_cost": 0.00029177288732394374, "unrestricted": {"accuracy": 0.7570422535211268, "large_model_usage": 0.4295774647887324, "avg_cost": 0.000302343309859155}, "random_baseline": {"accuracy": 0.6869718309859154, "accuracy_std": 0.011725933665737133, "avg_cost": 0.0002946467957746478, "cost_std": 1.307669601186927e-05, "small_model_fraction": 0.557396702818014}, "unrestricted_random_baseline": {"accuracy": 0.7024647887323943, "accuracy_std": 0.013659975169909064, "avg_cost": 0.000310123838028169, "cost_std": 1.4124460232553606e-05, "small_model_fraction": 0.539370168244566}}, "0.25": {"accuracy": 0.6232394366197183, "large_model_usage": 0.3028169014084507, "small_model_usage": 0.6971830985915493, "avg_lambda": 0.29999999999999843, "avg_cost": 0.00020249119718309847, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.3028169014084507, "avg_cost": 0.000222519366197183}, "random_baseline": {"accuracy": 0.6373239436619719, "accuracy_std": 0.015668024972737334, "avg_cost": 0.00019513028169014084, "cost_std": 1.6264531362305856e-05, "small_model_fraction": 0.7096554671898828}, "unrestricted_random_baseline": {"accuracy": 0.6588028169014086, "accuracy_std": 0.01367811591517958, "avg_cost": 0.00022455869718309855, "cost_std": 1.57765128315114e-05, "small_model_fraction": 0.6754999279980864}}}}, {"trial": 9, "trial_seed": 50, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.2244736842105294e-05, "large_model_cost": 0.0006201461988304101, "hybrid_results": {"0.05": {"accuracy": 0.8028169014084507, "large_model_usage": 0.8626760563380281, "small_model_usage": 0.13732394366197187, "avg_lambda": 0.900000000000004, "avg_cost": 0.000571788028169014, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.8626760563380281, "avg_cost": 0.0005731190140845071}, "random_baseline": {"accuracy": 0.8003521126760564, "accuracy_std": 0.008189931936347207, "avg_cost": 0.0005756131338028167, "cost_std": 6.981211314849663e-06, "small_model_fraction": 0.08225557136368886}, "unrestricted_random_baseline": {"accuracy": 0.803169014084507, "accuracy_std": 0.011184774770611687, "avg_cost": 0.000577929612676056, "cost_std": 1.1152120230878495e-05, "small_model_fraction": 0.07999161047644834}}, "0.1": {"accuracy": 0.7535211267605634, "large_model_usage": 0.6549295774647887, "small_model_usage": 0.34507042253521125, "avg_lambda": 0.7100000000000012, "avg_cost": 0.00042418943661971803, "unrestricted": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6549295774647887, "avg_cost": 0.0004427316901408448}, "random_baseline": {"accuracy": 0.7239436619718311, "accuracy_std": 0.013564338228350875, "avg_cost": 0.0004361020422535211, "cost_std": 1.4507786251775064e-05, "small_model_fraction": 0.33331565726671086}, "unrestricted_random_baseline": {"accuracy": 0.7295774647887323, "accuracy_std": 0.02152212223934667, "avg_cost": 0.0004530685915492956, "cost_std": 1.4914136801618666e-05, "small_model_fraction": 0.30177592702277484}}, "0.15": {"accuracy": 0.6619718309859155, "large_model_usage": 0.426056338028169, "small_model_usage": 0.573943661971831, "avg_lambda": 0.5100000000000009, "avg_cost": 0.000284675352112676, "unrestricted": {"accuracy": 0.7077464788732394, "large_model_usage": 0.426056338028169, "avg_cost": 0.00030094295774647865}, "random_baseline": {"accuracy": 0.6426056338028169, "accuracy_std": 0.011490963994423049, "avg_cost": 0.00028919862676056343, "cost_std": 1.507407367698108e-05, "small_model_fraction": 0.5706242770398275}, "unrestricted_random_baseline": {"accuracy": 0.6538732394366198, "accuracy_std": 0.012100591744675745, "avg_cost": 0.00030319380281690144, "cost_std": 1.6704366866963432e-05, "small_model_fraction": 0.5429536439735565}}, "0.2": {"accuracy": 0.6267605633802817, "large_model_usage": 0.31338028169014087, "small_model_usage": 0.6866197183098591, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00021473873239436615, "unrestricted": {"accuracy": 0.6830985915492958, "large_model_usage": 0.31338028169014087, "avg_cost": 0.00022984436619718303}, "random_baseline": {"accuracy": 0.603169014084507, "accuracy_std": 0.012894248889859556, "avg_cost": 0.00021243327464788735, "cost_std": 2.1695235232775104e-05, "small_model_fraction": 0.6895840419667281}, "unrestricted_random_baseline": {"accuracy": 0.629225352112676, "accuracy_std": 0.01210059174467579, "avg_cost": 0.0002396171478873239, "cost_std": 8.319957781256967e-06, "small_model_fraction": 0.6638898826909048}}, "0.25": {"accuracy": 0.5563380281690141, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.29999999999999843, "avg_cost": 0.00017598521126760552, "unrestricted": {"accuracy": 0.6584507042253521, "large_model_usage": 0.25704225352112675, "avg_cost": 0.00019411197183098587}, "random_baseline": {"accuracy": 0.5929577464788732, "accuracy_std": 0.008362212737350634, "avg_cost": 0.00017446644366197182, "cost_std": 1.2977508455990422e-05, "small_model_fraction": 0.7555024375354255}, "unrestricted_random_baseline": {"accuracy": 0.5922535211267606, "accuracy_std": 0.021865034783464812, "avg_cost": 0.00019451415492957743, "cost_std": 2.1687599304522693e-05, "small_model_fraction": 0.7246694464044374}}}}, {"trial": 10, "trial_seed": 51, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5789473684210527, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.2183771929824545e-05, "large_model_cost": 0.0006180497076023403, "hybrid_results": {"0.05": {"accuracy": 0.8133802816901409, "large_model_usage": 0.7746478873239436, "small_model_usage": 0.22535211267605637, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005120612676056342, "unrestricted": {"accuracy": 0.8133802816901409, "large_model_usage": 0.7746478873239436, "avg_cost": 0.0005120612676056342}, "random_baseline": {"accuracy": 0.794718309859155, "accuracy_std": 0.01767942283187776, "avg_cost": 0.000511898661971831, "cost_std": 1.2663262001473198e-05, "small_model_fraction": 0.1809090331818693}, "unrestricted_random_baseline": {"accuracy": 0.7922535211267605, "accuracy_std": 0.018698476123030657, "avg_cost": 0.0005136688380281691, "cost_std": 1.5604906383401637e-05, "small_model_fraction": 0.1809090331818693}}, "0.1": {"accuracy": 0.7711267605633803, "large_model_usage": 0.602112676056338, "small_model_usage": 0.397887323943662, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0003879063380281693, "unrestricted": {"accuracy": 0.7922535211267606, "large_model_usage": 0.602112676056338, "avg_cost": 0.00040491338028169043}, "random_baseline": {"accuracy": 0.7327464788732394, "accuracy_std": 0.010379861248580065, "avg_cost": 0.0003889290492957745, "cost_std": 2.312647894510486e-05, "small_model_fraction": 0.3928259957800573}, "unrestricted_random_baseline": {"accuracy": 0.742605633802817, "accuracy_std": 0.022680539317166078, "avg_cost": 0.000407639542253521, "cost_std": 1.630797868201226e-05, "small_model_fraction": 0.363797098180816}}, "0.15": {"accuracy": 0.6549295774647887, "large_model_usage": 0.2992957746478873, "small_model_usage": 0.7007042253521127, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00020369507042253515, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.2992957746478873, "avg_cost": 0.00021730774647887312}, "random_baseline": {"accuracy": 0.6380281690140845, "accuracy_std": 0.017378820675004323, "avg_cost": 0.00020460165492957731, "cost_std": 1.4875658065698219e-05, "small_model_fraction": 0.7072516286582992}, "unrestricted_random_baseline": {"accuracy": 0.6471830985915493, "accuracy_std": 0.018417882860031008, "avg_cost": 0.00021627281690140835, "cost_std": 1.1894423248889597e-05, "small_model_fraction": 0.6840164903314532}}, "0.2": {"accuracy": 0.6161971830985915, "large_model_usage": 0.2323943661971831, "small_model_usage": 0.7676056338028169, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0001604978873239436, "unrestricted": {"accuracy": 0.6725352112676056, "large_model_usage": 0.2323943661971831, "avg_cost": 0.00017584999999999994}, "random_baseline": {"accuracy": 0.6246478873239437, "accuracy_std": 0.015604591412788998, "avg_cost": 0.00016024028169014077, "cost_std": 1.6222199347386077e-05, "small_model_fraction": 0.7809838265356267}, "unrestricted_random_baseline": {"accuracy": 0.6225352112676056, "accuracy_std": 0.018009453313442867, "avg_cost": 0.00017170535211267598, "cost_std": 1.4469126510257875e-05, "small_model_fraction": 0.7547796870878644}}, "0.25": {"accuracy": 0.5950704225352113, "large_model_usage": 0.2112676056338028, "small_model_usage": 0.7887323943661972, "avg_lambda": 0.20000000000000093, "avg_cost": 0.0001458218309859155, "unrestricted": {"accuracy": 0.6584507042253521, "large_model_usage": 0.2112676056338028, "avg_cost": 0.00016296971830985913}, "random_baseline": {"accuracy": 0.6112676056338028, "accuracy_std": 0.01682789175401201, "avg_cost": 0.00014319433098591543, "cost_std": 1.4044303031499875e-05, "small_model_fraction": 0.8060340222278911}, "unrestricted_random_baseline": {"accuracy": 0.6154929577464789, "accuracy_std": 0.01207494943625511, "avg_cost": 0.00016637380281690132, "cost_std": 1.3879859121878909e-05, "small_model_fraction": 0.7767647196795878}}}}], "unrestricted_hybrid": {"0.05": {"avg_accuracy": 0.8066901408450704, "std_accuracy": 0.01706566312120089, "avg_cost": 0.0005186059154929577, "std_cost": 3.559561836743289e-05}, "0.1": {"avg_accuracy": 0.7816901408450705, "std_accuracy": 0.02100906181714973, "avg_cost": 0.00042146084507042257, "std_cost": 2.0511949414451026e-05}, "0.15": {"avg_accuracy": 0.7366197183098592, "std_accuracy": 0.036416009785487057, "avg_cost": 0.0003105608450704225, "std_cost": 4.743370020398642e-05}, "0.2": {"avg_accuracy": 0.7063380281690141, "std_accuracy": 0.03615633141688292, "avg_cost": 0.0002464157746478872, "std_cost": 4.023804714881832e-05}, "0.25": {"avg_accuracy": 0.6813380281690141, "std_accuracy": 0.03228126305545471, "avg_cost": 0.00019995098591549292, "std_cost": 2.4239374922072036e-05}}, "random_baseline": {"0.05": {"avg_accuracy": 0.7828521126760564, "std_accuracy": 0.024769844156309777, "avg_cost": 0.0005144753309859155, "std_cost": 3.558092462688962e-05, "avg_small_model_fraction": 0.180530627838263, "std_small_model_fraction": 0.0609851793973727}, "0.1": {"avg_accuracy": 0.7325704225352113, "std_accuracy": 0.018393801352326537, "avg_cost": 0.0004084530211267604, "std_cost": 2.044976829552247e-05, "avg_small_model_fraction": 0.36422522830371035, "std_small_model_fraction": 0.0322097958126191}, "0.15": {"avg_accuracy": 0.6777816901408451, "std_accuracy": 0.03799830922119965, "avg_cost": 0.00029885764436619704, "std_cost": 4.5912585318634114e-05, "avg_small_model_fraction": 0.5488918327558256, "std_small_model_fraction": 0.07942570017995546}, "0.2": {"avg_accuracy": 0.6514788732394365, "std_accuracy": 0.032597267201441606, "avg_cost": 0.00023417378521126754, "std_cost": 4.1894862247681894e-05, "avg_small_model_fraction": 0.6588320663990385, "std_small_model_fraction": 0.06890536395161738}, "0.25": {"avg_accuracy": 0.6229929577464789, "std_accuracy": 0.03497918232026706, "avg_cost": 0.00017788286267605627, "std_cost": 2.2192223130965443e-05, "avg_small_model_fraction": 0.7437289290929752, "std_small_model_fraction": 0.040288406077818036}}, "unrestricted_random_baseline": {"0.05": {"avg_accuracy": 0.7826760563380282, "std_accuracy": 0.02066142457295365, "avg_cost": 0.000520937338028169, "std_cost": 3.641752444463115e-05, "avg_small_model_fraction": 0.17294166416142134, "std_small_model_fraction": 0.06032991031236455}, "0.1": {"avg_accuracy": 0.7398943661971831, "std_accuracy": 0.02075198692456216, "avg_cost": 0.00042562466549295766, "std_cost": 2.0775543715808586e-05, "avg_small_model_fraction": 0.33814219305035154, "std_small_model_fraction": 0.034579464400469044}, "0.15": {"avg_accuracy": 0.6849647887323944, "std_accuracy": 0.030515413941128396, "avg_cost": 0.0003093677394366197, "std_cost": 4.623200119545173e-05, "avg_small_model_fraction": 0.5267676232170386, "std_small_model_fraction": 0.0804686762251619}, "0.2": {"avg_accuracy": 0.6575704225352113, "std_accuracy": 0.03321721700047872, "avg_cost": 0.00024815048591549294, "std_cost": 4.1950586224265816e-05, "avg_small_model_fraction": 0.635831623612789, "std_small_model_fraction": 0.06833623585449654}, "0.25": {"avg_accuracy": 0.635, "std_accuracy": 0.031306836827384554, "avg_cost": 0.00020198755633802813, "std_cost": 2.4397869328215846e-05, "avg_small_model_fraction": 0.7148433961758659, "std_small_model_fraction": 0.041050641841503116}}}