{"subject": "MMLU", "method": "standard", "iterations": 30, "random_seed": 42, "calibration_size": 500, "total_examples": 1000, "small_model": {"avg_accuracy": 0.5909, "std_accuracy": 0.010891433942935775, "avg_cost": 3.4603579999999994e-05, "std_cost": 1.8090888756498073e-07, "x_position": 1.0, "x_std": 0.0}, "large_model": {"avg_accuracy": 0.8279, "std_accuracy": 0.00797642359289759, "avg_cost": 0.000676155333333333, "std_cost": 3.7276903066406785e-06, "x_position": 0.0, "x_std": 0.0}, "hybrid_models": {"0.05": {"avg_accuracy": 0.8018666666666667, "std_accuracy": 0.026107129720101766, "avg_large_model_usage": 0.8689333333333331, "std_large_model_usage": 0.09772578415591705, "avg_lambda": 0.924333333333331, "std_lambda": 0.05754322047141809, "avg_cost": 0.0006237009933333333, "std_cost": 6.521609236635754e-05}, "0.1": {"avg_accuracy": 0.7530666666666667, "std_accuracy": 0.019532764496836837, "avg_large_model_usage": 0.5583333333333333, "std_large_model_usage": 0.041350802759908885, "avg_lambda": 0.7709999999999975, "std_lambda": 0.021346350195451298, "avg_cost": 0.00041560392666666675, "std_cost": 2.90067533136444e-05}, "0.15": {"avg_accuracy": 0.7055333333333333, "std_accuracy": 0.02171133856357598, "avg_large_model_usage": 0.42886666666666673, "std_large_model_usage": 0.09482852360386557, "avg_lambda": 0.6513333333333303, "std_lambda": 0.06515281182641698, "avg_cost": 0.0003165435266666667, "std_cost": 5.7003390046381976e-05}, "0.2": {"avg_accuracy": 0.6634, "std_accuracy": 0.020682681966643815, "avg_large_model_usage": 0.2450666666666667, "std_large_model_usage": 0.032959503771884814, "avg_lambda": 0.4226666666666673, "std_lambda": 0.05893122168162461, "avg_cost": 0.00019388112666666665, "std_cost": 2.3809499128456162e-05}, "0.25": {"avg_accuracy": 0.6174666666666667, "std_accuracy": 0.018754081037351742, "avg_large_model_usage": 0.13033333333333333, "std_large_model_usage": 0.05637749748101827, "avg_lambda": 0.12599999999999975, "std_lambda": 0.11137324633860686, "avg_cost": 0.00011115939333333332, "std_cost": 3.407500740967328e-05}}, "all_trials": [{"trial": 1, "trial_seed": 42, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.599, "large_model_accuracy": 0.837, "small_model_cost": 3.481870000000001e-05, "large_model_cost": 0.0006812459999999995, "hybrid_results": {"0.05": {"accuracy": 0.82, "large_model_usage": 0.81, "small_model_usage": 0.18999999999999995, "avg_lambda": 0.8999999999999925, "avg_cost": 0.0005947121999999996, "unrestricted": {"accuracy": 0.822, "large_model_usage": 0.81, "avg_cost": 0.0005956641999999996}, "random_baseline": {"accuracy": 0.8236000000000001, "accuracy_std": 0.007578918128598553, "avg_cost": 0.0005950598199999996, "cost_std": 9.445405880723154e-06, "small_model_fraction": 0.13386470528085678}, "unrestricted_random_baseline": {"accuracy": 0.8140000000000001, "accuracy_std": 0.007694153624668496, "avg_cost": 0.0005921676799999997, "cost_std": 6.48857909696725e-06, "small_model_fraction": 0.13239199520193526}}, "0.1": {"accuracy": 0.784, "large_model_usage": 0.542, "small_model_usage": 0.45799999999999996, "avg_lambda": 0.7599999999999948, "avg_cost": 0.0004086962, "unrestricted": {"accuracy": 0.788, "large_model_usage": 0.542, "avg_cost": 0.0004140682000000001}, "random_baseline": {"accuracy": 0.75, "accuracy_std": 0.01258570617804183, "avg_cost": 0.00040636325999999996, "cost_std": 6.9568976724112e-06, "small_model_fraction": 0.42162482927314443}, "unrestricted_random_baseline": {"accuracy": 0.7486, "accuracy_std": 0.012459534501738027, "avg_cost": 0.0004024597799999999, "cost_std": 1.7229580202071084e-05, "small_model_fraction": 0.41331453668494456}}, "0.15": {"accuracy": 0.718, "large_model_usage": 0.346, "small_model_usage": 0.654, "avg_lambda": 0.6100000000000078, "avg_cost": 0.00026924420000000004, "unrestricted": {"accuracy": 0.726, "large_model_usage": 0.346, "avg_cost": 0.00027845220000000023}, "random_baseline": {"accuracy": 0.6940000000000001, "accuracy_std": 0.012521980673998782, "avg_cost": 0.00026882512000000016, "cost_std": 1.2092850237789207e-05, "small_model_fraction": 0.6373521044052437}, "unrestricted_random_baseline": {"accuracy": 0.7022, "accuracy_std": 0.009610411021387155, "avg_cost": 0.0002841587600000001, "cost_std": 1.3857361838329767e-05, "small_model_fraction": 0.6231076564990365}}, "0.2": {"accuracy": 0.684, "large_model_usage": 0.238, "small_model_usage": 0.762, "avg_lambda": 0.4000000000000035, "avg_cost": 0.00018983219999999987, "unrestricted": {"accuracy": 0.7, "large_model_usage": 0.238, "avg_cost": 0.00020116820000000004}, "random_baseline": {"accuracy": 0.6611999999999999, "accuracy_std": 0.008400000000000008, "avg_cost": 0.00018795226000000012, "cost_std": 1.4072018773452512e-05, "small_model_fraction": 0.7601996388456986}, "unrestricted_random_baseline": {"accuracy": 0.6708000000000001, "accuracy_std": 0.008207313811473283, "avg_cost": 0.00020202626000000016, "cost_std": 6.793353972847324e-06, "small_model_fraction": 0.7426632507630787}}, "0.25": {"accuracy": 0.63, "large_model_usage": 0.12, "small_model_usage": 0.88, "avg_lambda": 0.05000000000000044, "avg_cost": 0.00010418019999999991, "unrestricted": {"accuracy": 0.666, "large_model_usage": 0.12, "avg_cost": 0.00011909219999999996}, "random_baseline": {"accuracy": 0.6306, "accuracy_std": 0.008101851640211646, "avg_cost": 9.974824e-05, "cost_std": 8.172993397672611e-06, "small_model_fraction": 0.8927002309463108}, "unrestricted_random_baseline": {"accuracy": 0.639, "accuracy_std": 0.0050000000000000044, "avg_cost": 0.00011887375999999994, "cost_std": 8.779841719666692e-06, "small_model_fraction": 0.8696318982815238}}}}, {"trial": 2, "trial_seed": 43, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.584, "large_model_accuracy": 0.833, "small_model_cost": 3.459219999999998e-05, "large_model_cost": 0.0006757880000000007, "hybrid_results": {"0.05": {"accuracy": 0.832, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0007069269999999994, "unrestricted": {"accuracy": 0.832, "large_model_usage": 1.0, "avg_cost": 0.0007069269999999994}, "random_baseline": {"accuracy": 0.8320000000000001, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006725079999999998, "cost_std": 0.0, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8320000000000001, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006725079999999998, "cost_std": 0.0, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.73, "large_model_usage": 0.516, "small_model_usage": 0.484, "avg_lambda": 0.7599999999999948, "avg_cost": 0.0003822630000000002, "unrestricted": {"accuracy": 0.734, "large_model_usage": 0.516, "avg_cost": 0.00038847900000000017}, "random_baseline": {"accuracy": 0.7219999999999999, "accuracy_std": 0.012296340919151529, "avg_cost": 0.0003839882400000001, "cost_std": 1.2809270210531207e-05, "small_model_fraction": 0.4577774838824587}, "unrestricted_random_baseline": {"accuracy": 0.7181999999999998, "accuracy_std": 0.013782597723216054, "avg_cost": 0.00039514814000000026, "cost_std": 1.4269901062880546e-05, "small_model_fraction": 0.44808309723800477}}, "0.15": {"accuracy": 0.704, "large_model_usage": 0.492, "small_model_usage": 0.508, "avg_lambda": 0.6999999999999935, "avg_cost": 0.00034987100000000013, "unrestricted": {"accuracy": 0.728, "large_model_usage": 0.492, "avg_cost": 0.00036964700000000013}, "random_baseline": {"accuracy": 0.6998, "accuracy_std": 0.009357350052231653, "avg_cost": 0.00034444266, "cost_std": 1.64563086879288e-05, "small_model_fraction": 0.5082955939511772}, "unrestricted_random_baseline": {"accuracy": 0.7156, "accuracy_std": 0.013320660644277383, "avg_cost": 0.0003751863400000001, "cost_std": 1.4955090210105756e-05, "small_model_fraction": 0.4774532209973931}}, "0.2": {"accuracy": 0.66, "large_model_usage": 0.264, "small_model_usage": 0.736, "avg_lambda": 0.44999999999999624, "avg_cost": 0.00020369899999999987, "unrestricted": {"accuracy": 0.666, "large_model_usage": 0.264, "avg_cost": 0.00021323099999999984}, "random_baseline": {"accuracy": 0.6472, "accuracy_std": 0.00976524449258697, "avg_cost": 0.00019231151999999995, "cost_std": 1.169458957371315e-05, "small_model_fraction": 0.7362634003529036}, "unrestricted_random_baseline": {"accuracy": 0.6498, "accuracy_std": 0.005688585061331161, "avg_cost": 0.00021353294, "cost_std": 1.2584670699243565e-05, "small_model_fraction": 0.7213974264959322}}, "0.25": {"accuracy": 0.606, "large_model_usage": 0.132, "small_model_usage": 0.868, "avg_lambda": 0.079999999999999, "avg_cost": 0.00011013899999999997, "unrestricted": {"accuracy": 0.636, "large_model_usage": 0.132, "avg_cost": 0.00012387499999999988}, "random_baseline": {"accuracy": 0.6134000000000001, "accuracy_std": 0.00759210115844093, "avg_cost": 0.00011254231999999987, "cost_std": 8.560659901759904e-06, "small_model_fraction": 0.8821782675432374}, "unrestricted_random_baseline": {"accuracy": 0.614, "accuracy_std": 0.005865151319446077, "avg_cost": 0.0001198124199999999, "cost_std": 8.250546128081466e-06, "small_model_fraction": 0.8607557940959693}}}}, {"trial": 3, "trial_seed": 44, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.596, "large_model_accuracy": 0.829, "small_model_cost": 3.436219999999997e-05, "large_model_cost": 0.0006723399999999992, "hybrid_results": {"0.05": {"accuracy": 0.762, "large_model_usage": 0.654, "small_model_usage": 0.346, "avg_lambda": 0.810000000000002, "avg_cost": 0.0004896269999999999, "unrestricted": {"accuracy": 0.768, "large_model_usage": 0.654, "avg_cost": 0.0004931149999999998}, "random_baseline": {"accuracy": 0.7476, "accuracy_std": 0.011892854997854814, "avg_cost": 0.0004956801400000001, "cost_std": 1.913354905187209e-05, "small_model_fraction": 0.2863939779722736}, "unrestricted_random_baseline": {"accuracy": 0.7567999999999999, "accuracy_std": 0.009558242516278825, "avg_cost": 0.00049836402, "cost_std": 9.080441526908177e-06, "small_model_fraction": 0.280926703092176}}, "0.1": {"accuracy": 0.706, "large_model_usage": 0.524, "small_model_usage": 0.476, "avg_lambda": 0.6999999999999935, "avg_cost": 0.00037719100000000003, "unrestricted": {"accuracy": 0.738, "large_model_usage": 0.524, "avg_cost": 0.0004014349999999999}, "random_baseline": {"accuracy": 0.7102, "accuracy_std": 0.015606408939919516, "avg_cost": 0.00038040921999999994, "cost_std": 1.204492525388175e-05, "small_model_fraction": 0.4626320853170746}, "unrestricted_random_baseline": {"accuracy": 0.7214, "accuracy_std": 0.010992724866929047, "avg_cost": 0.0004122920199999999, "cost_std": 1.0282695406341546e-05, "small_model_fraction": 0.4246307630140103}}, "0.15": {"accuracy": 0.66, "large_model_usage": 0.296, "small_model_usage": 0.704, "avg_lambda": 0.5, "avg_cost": 0.00022938299999999991, "unrestricted": {"accuracy": 0.672, "large_model_usage": 0.296, "avg_cost": 0.00023962299999999995}, "random_baseline": {"accuracy": 0.6514, "accuracy_std": 0.010734989520255723, "avg_cost": 0.0002393385199999999, "cost_std": 1.1435369458727582e-05, "small_model_fraction": 0.6943141281718578}, "unrestricted_random_baseline": {"accuracy": 0.66, "accuracy_std": 0.01019803902718558, "avg_cost": 0.00025014329999999987, "cost_std": 1.073056164578528e-05, "small_model_fraction": 0.6782634129275341}}, "0.2": {"accuracy": 0.61, "large_model_usage": 0.198, "small_model_usage": 0.802, "avg_lambda": 0.31000000000000166, "avg_cost": 0.000155087, "unrestricted": {"accuracy": 0.64, "large_model_usage": 0.198, "avg_cost": 0.00017060299999999988}, "random_baseline": {"accuracy": 0.6224, "accuracy_std": 0.009414881836751864, "avg_cost": 0.00015601678, "cost_std": 1.5110411931499403e-05, "small_model_fraction": 0.8107695910421958}, "unrestricted_random_baseline": {"accuracy": 0.6312000000000001, "accuracy_std": 0.012592060990957766, "avg_cost": 0.00017531803999999986, "cost_std": 7.295078909950199e-06, "small_model_fraction": 0.7864489955606605}}, "0.25": {"accuracy": 0.582, "large_model_usage": 0.02, "small_model_usage": 0.98, "avg_lambda": 0.0, "avg_cost": 4.500699999999997e-05, "unrestricted": {"accuracy": 0.588, "large_model_usage": 0.02, "avg_cost": 4.670299999999998e-05}, "random_baseline": {"accuracy": 0.5786, "accuracy_std": 0.004476605857119883, "avg_cost": 4.573991999999999e-05, "cost_std": 4.433926345080622e-06, "small_model_fraction": 0.9833147799186742}, "unrestricted_random_baseline": {"accuracy": 0.5818000000000001, "accuracy_std": 0.004044749683231341, "avg_cost": 4.814955999999999e-05, "cost_std": 5.053094873283108e-06, "small_model_fraction": 0.9806563802063333}}}}, {"trial": 4, "trial_seed": 45, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.587, "large_model_accuracy": 0.828, "small_model_cost": 3.466070000000005e-05, "large_model_cost": 0.0006758779999999994, "hybrid_results": {"0.05": {"accuracy": 0.834, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0007080287999999999, "unrestricted": {"accuracy": 0.834, "large_model_usage": 1.0, "avg_cost": 0.0007080287999999999}, "random_baseline": {"accuracy": 0.834, "accuracy_std": 0.0, "avg_cost": 0.0006734719999999997, "cost_std": 0.0, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.834, "accuracy_std": 0.0, "avg_cost": 0.0006734719999999997, "cost_std": 0.0, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.748, "large_model_usage": 0.512, "small_model_usage": 0.488, "avg_lambda": 0.7799999999999914, "avg_cost": 0.0003872888, "unrestricted": {"accuracy": 0.754, "large_model_usage": 0.512, "avg_cost": 0.0003899687999999999}, "random_baseline": {"accuracy": 0.7234, "accuracy_std": 0.0139441744108427, "avg_cost": 0.0003832515600000001, "cost_std": 2.018956540984469e-05, "small_model_fraction": 0.450064588088936}, "unrestricted_random_baseline": {"accuracy": 0.728, "accuracy_std": 0.015491933384829681, "avg_cost": 0.00038630650000000017, "cost_std": 1.3239534336901885e-05, "small_model_fraction": 0.44588503772434057}}, "0.15": {"accuracy": 0.71, "large_model_usage": 0.474, "small_model_usage": 0.526, "avg_lambda": 0.6999999999999935, "avg_cost": 0.00034173679999999993, "unrestricted": {"accuracy": 0.744, "large_model_usage": 0.474, "avg_cost": 0.0003620167999999999}, "random_baseline": {"accuracy": 0.7108, "accuracy_std": 0.010205880657738467, "avg_cost": 0.00034088896000000003, "cost_std": 1.3269428891342695e-05, "small_model_fraction": 0.5211044680173161}, "unrestricted_random_baseline": {"accuracy": 0.72, "accuracy_std": 0.008099382692526642, "avg_cost": 0.0003604185600000001, "cost_std": 1.329204628213427e-05, "small_model_fraction": 0.4894771242135854}}, "0.2": {"accuracy": 0.658, "large_model_usage": 0.236, "small_model_usage": 0.764, "avg_lambda": 0.44999999999999624, "avg_cost": 0.0001864928000000001, "unrestricted": {"accuracy": 0.666, "large_model_usage": 0.236, "avg_cost": 0.00019516080000000005}, "random_baseline": {"accuracy": 0.6534000000000001, "accuracy_std": 0.008901685233707162, "avg_cost": 0.00018330250000000005, "cost_std": 1.12861499866872e-05, "small_model_fraction": 0.7632127205551065}, "unrestricted_random_baseline": {"accuracy": 0.6598, "accuracy_std": 0.0086925255248403, "avg_cost": 0.00018677422000000005, "cost_std": 1.3651518829624803e-05, "small_model_fraction": 0.7496946822863324}}, "0.25": {"accuracy": 0.614, "large_model_usage": 0.112, "small_model_usage": 0.888, "avg_lambda": 0.05999999999999949, "avg_cost": 9.87488e-05, "unrestricted": {"accuracy": 0.632, "large_model_usage": 0.112, "avg_cost": 0.00011232880000000004}, "random_baseline": {"accuracy": 0.6202, "accuracy_std": 0.0071805292284065035, "avg_cost": 9.698729999999997e-05, "cost_std": 8.35226589866487e-06, "small_model_fraction": 0.9000524471189408}, "unrestricted_random_baseline": {"accuracy": 0.625, "accuracy_std": 0.008497058314499208, "avg_cost": 0.00011068114000000002, "cost_std": 6.456818979063941e-06, "small_model_fraction": 0.878873979226699}}}}, {"trial": 5, "trial_seed": 46, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.589, "large_model_accuracy": 0.817, "small_model_cost": 3.444450000000002e-05, "large_model_cost": 0.0006728339999999998, "hybrid_results": {"0.05": {"accuracy": 0.764, "large_model_usage": 0.808, "small_model_usage": 0.19199999999999995, "avg_lambda": 0.8999999999999925, "avg_cost": 0.0005813904000000003, "unrestricted": {"accuracy": 0.764, "large_model_usage": 0.808, "avg_cost": 0.0005820064000000003}, "random_baseline": {"accuracy": 0.7716000000000001, "accuracy_std": 0.008138795979750328, "avg_cost": 0.0005708618199999998, "cost_std": 6.78575748134279e-06, "small_model_fraction": 0.1432410777432891}, "unrestricted_random_baseline": {"accuracy": 0.776, "accuracy_std": 0.004098780306383843, "avg_cost": 0.0005840612399999997, "cost_std": 9.361023443107026e-06, "small_model_fraction": 0.14227614959205862}}, "0.1": {"accuracy": 0.74, "large_model_usage": 0.544, "small_model_usage": 0.45599999999999996, "avg_lambda": 0.7599999999999948, "avg_cost": 0.0004073664, "unrestricted": {"accuracy": 0.744, "large_model_usage": 0.544, "avg_cost": 0.0004110024}, "random_baseline": {"accuracy": 0.7106, "accuracy_std": 0.012362847568420484, "avg_cost": 0.0004051166599999999, "cost_std": 1.6552860511718136e-05, "small_model_fraction": 0.41583954623313796}, "unrestricted_random_baseline": {"accuracy": 0.7076, "accuracy_std": 0.013469966592386175, "avg_cost": 0.00039939545999999993, "cost_std": 1.2277557075265338e-05, "small_model_fraction": 0.4101439638339914}}, "0.15": {"accuracy": 0.68, "large_model_usage": 0.37, "small_model_usage": 0.63, "avg_lambda": 0.6299999999999967, "avg_cost": 0.00028419439999999996, "unrestricted": {"accuracy": 0.684, "large_model_usage": 0.37, "avg_cost": 0.0002910103999999999}, "random_baseline": {"accuracy": 0.6676000000000001, "accuracy_std": 0.013047605144240065, "avg_cost": 0.00028103242, "cost_std": 1.0968301245480113e-05, "small_model_fraction": 0.6087813161087391}, "unrestricted_random_baseline": {"accuracy": 0.676, "accuracy_std": 0.00903327183250898, "avg_cost": 0.00029225734000000014, "cost_std": 1.4789742182485845e-05, "small_model_fraction": 0.5981044487730452}}, "0.2": {"accuracy": 0.644, "large_model_usage": 0.276, "small_model_usage": 0.724, "avg_lambda": 0.4000000000000035, "avg_cost": 0.0002090864000000001, "unrestricted": {"accuracy": 0.668, "large_model_usage": 0.276, "avg_cost": 0.00022379440000000003}, "random_baseline": {"accuracy": 0.6368, "accuracy_std": 0.009260669522232189, "avg_cost": 0.00020741375999999996, "cost_std": 1.5044545324282855e-05, "small_model_fraction": 0.7264336271194934}, "unrestricted_random_baseline": {"accuracy": 0.648, "accuracy_std": 0.011832159566199242, "avg_cost": 0.00022127912000000002, "cost_std": 1.5963101232830674e-05, "small_model_fraction": 0.7033944010670599}}, "0.25": {"accuracy": 0.602, "large_model_usage": 0.146, "small_model_usage": 0.854, "avg_lambda": 0.05999999999999949, "avg_cost": 0.00011787439999999992, "unrestricted": {"accuracy": 0.636, "large_model_usage": 0.146, "avg_cost": 0.00013556240000000002}, "random_baseline": {"accuracy": 0.6092, "accuracy_std": 0.010127191120937738, "avg_cost": 0.00011677328000000002, "cost_std": 1.2423344726505838e-05, "small_model_fraction": 0.8693119169409897}, "unrestricted_random_baseline": {"accuracy": 0.6164, "accuracy_std": 0.007364781055808794, "avg_cost": 0.00013493488000000003, "cost_std": 1.2640156793711074e-05, "small_model_fraction": 0.8416046943127979}}}}, {"trial": 6, "trial_seed": 47, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.596, "large_model_accuracy": 0.833, "small_model_cost": 3.464059999999996e-05, "large_model_cost": 0.0006767080000000006, "hybrid_results": {"0.05": {"accuracy": 0.776, "large_model_usage": 0.794, "small_model_usage": 0.20599999999999996, "avg_lambda": 0.8600000000000094, "avg_cost": 0.0005568913999999995, "unrestricted": {"accuracy": 0.792, "large_model_usage": 0.794, "avg_cost": 0.0005741753999999996}, "random_baseline": {"accuracy": 0.7942000000000001, "accuracy_std": 0.009357350052231676, "avg_cost": 0.0005541260399999999, "cost_std": 1.1608155156371674e-05, "small_model_fraction": 0.18661062685942467}, "unrestricted_random_baseline": {"accuracy": 0.7960000000000002, "accuracy_std": 0.006985699678629199, "avg_cost": 0.0005662482399999997, "cost_std": 1.138713261319099e-05, "small_model_fraction": 0.15969133458574736}}, "0.1": {"accuracy": 0.752, "large_model_usage": 0.522, "small_model_usage": 0.478, "avg_lambda": 0.7599999999999948, "avg_cost": 0.0003851273999999998, "unrestricted": {"accuracy": 0.756, "large_model_usage": 0.522, "avg_cost": 0.0003927313999999998}, "random_baseline": {"accuracy": 0.7308, "accuracy_std": 0.010205880657738468, "avg_cost": 0.00038428645999999975, "cost_std": 8.954430216177866e-06, "small_model_fraction": 0.45412771307186833}, "unrestricted_random_baseline": {"accuracy": 0.7390000000000001, "accuracy_std": 0.01132254388377454, "avg_cost": 0.00039320269999999996, "cost_std": 1.1233363458644035e-05, "small_model_fraction": 0.44228471964158356}}, "0.15": {"accuracy": 0.69, "large_model_usage": 0.298, "small_model_usage": 0.702, "avg_lambda": 0.5800000000000031, "avg_cost": 0.0002302834, "unrestricted": {"accuracy": 0.692, "large_model_usage": 0.298, "avg_cost": 0.00023944339999999983}, "random_baseline": {"accuracy": 0.6688000000000001, "accuracy_std": 0.012399999999999993, "avg_cost": 0.00022646939999999984, "cost_std": 1.8048285236664465e-05, "small_model_fraction": 0.6952924256861509}, "unrestricted_random_baseline": {"accuracy": 0.6744000000000001, "accuracy_std": 0.010762899237658944, "avg_cost": 0.00023655475999999977, "cost_std": 1.6286593007206927e-05, "small_model_fraction": 0.6810260106649245}}, "0.2": {"accuracy": 0.652, "large_model_usage": 0.188, "small_model_usage": 0.812, "avg_lambda": 0.3899999999999957, "avg_cost": 0.0001534114000000001, "unrestricted": {"accuracy": 0.66, "large_model_usage": 0.188, "avg_cost": 0.00016349540000000005}, "random_baseline": {"accuracy": 0.6489999999999999, "accuracy_std": 0.008306623862918082, "avg_cost": 0.0001518997800000001, "cost_std": 1.0769317712260043e-05, "small_model_fraction": 0.8150181741044631}, "unrestricted_random_baseline": {"accuracy": 0.647, "accuracy_std": 0.009808159868191392, "avg_cost": 0.00015381171999999998, "cost_std": 1.334963539403221e-05, "small_model_fraction": 0.7993126578300035}}, "0.25": {"accuracy": 0.61, "large_model_usage": 0.122, "small_model_usage": 0.878, "avg_lambda": 0.05999999999999949, "avg_cost": 0.0001022314000000001, "unrestricted": {"accuracy": 0.64, "large_model_usage": 0.122, "avg_cost": 0.00011641540000000011}, "random_baseline": {"accuracy": 0.6266, "accuracy_std": 0.007748548251124214, "avg_cost": 0.00010068308000000011, "cost_std": 1.3284523558246296e-05, "small_model_fraction": 0.8947294318322343}, "unrestricted_random_baseline": {"accuracy": 0.6314000000000001, "accuracy_std": 0.006575712889109444, "avg_cost": 0.00011525478000000014, "cost_std": 7.89489314352007e-06, "small_model_fraction": 0.8726382931137757}}}}, {"trial": 7, "trial_seed": 48, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.604, "large_model_accuracy": 0.836, "small_model_cost": 3.457159999999999e-05, "large_model_cost": 0.0006758320000000003, "hybrid_results": {"0.05": {"accuracy": 0.838, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0007152458000000006, "unrestricted": {"accuracy": 0.838, "large_model_usage": 1.0, "avg_cost": 0.0007152458000000006}, "random_baseline": {"accuracy": 0.8379999999999999, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006804359999999998, "cost_std": 0.0, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8379999999999999, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006804359999999998, "cost_std": 0.0, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.768, "large_model_usage": 0.576, "small_model_usage": 0.42400000000000004, "avg_lambda": 0.7900000000000055, "avg_cost": 0.0004309218, "unrestricted": {"accuracy": 0.772, "large_model_usage": 0.576, "avg_cost": 0.00043473779999999993}, "random_baseline": {"accuracy": 0.752, "accuracy_std": 0.012585706178041832, "avg_cost": 0.0004363408199999998, "cost_std": 8.119878438228048e-06, "small_model_fraction": 0.3819200437139112}, "unrestricted_random_baseline": {"accuracy": 0.7542, "accuracy_std": 0.010675204916066024, "avg_cost": 0.0004459048, "cost_std": 1.559119377648796e-05, "small_model_fraction": 0.3759692630326156}}, "0.15": {"accuracy": 0.746, "large_model_usage": 0.55, "small_model_usage": 0.44999999999999996, "avg_lambda": 0.6999999999999935, "avg_cost": 0.0003951217999999999, "unrestricted": {"accuracy": 0.768, "large_model_usage": 0.55, "avg_cost": 0.0004174497999999999}, "random_baseline": {"accuracy": 0.74, "accuracy_std": 0.010733126291999, "avg_cost": 0.00040156385999999994, "cost_std": 1.7570388247286896e-05, "small_model_fraction": 0.437747598323552}, "unrestricted_random_baseline": {"accuracy": 0.7490000000000001, "accuracy_std": 0.006884765791223408, "avg_cost": 0.00042485929999999986, "cost_std": 1.773996575131973e-05, "small_model_fraction": 0.402928669850813}}, "0.2": {"accuracy": 0.698, "large_model_usage": 0.302, "small_model_usage": 0.698, "avg_lambda": 0.44999999999999624, "avg_cost": 0.0002371858, "unrestricted": {"accuracy": 0.708, "large_model_usage": 0.302, "avg_cost": 0.0002475057999999999}, "random_baseline": {"accuracy": 0.6895999999999999, "accuracy_std": 0.008236504112789563, "avg_cost": 0.00023541144000000002, "cost_std": 1.0862698815966531e-05, "small_model_fraction": 0.6840375610282501}, "unrestricted_random_baseline": {"accuracy": 0.6904, "accuracy_std": 0.009286549412995082, "avg_cost": 0.00025214114, "cost_std": 1.1853094352125896e-05, "small_model_fraction": 0.6679442547832365}}, "0.25": {"accuracy": 0.65, "large_model_usage": 0.174, "small_model_usage": 0.8260000000000001, "avg_lambda": 0.079999999999999, "avg_cost": 0.0001362098, "unrestricted": {"accuracy": 0.676, "large_model_usage": 0.174, "avg_cost": 0.0001568577999999999}, "random_baseline": {"accuracy": 0.6504000000000001, "accuracy_std": 0.006499230723708773, "avg_cost": 0.000133117, "cost_std": 5.696465087753975e-06, "small_model_fraction": 0.8415024536054307}, "unrestricted_random_baseline": {"accuracy": 0.6557999999999999, "accuracy_std": 0.005963220606350235, "avg_cost": 0.00015904919999999996, "cost_std": 9.590881607443588e-06, "small_model_fraction": 0.8093033656842059}}}}, {"trial": 8, "trial_seed": 49, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.605, "large_model_accuracy": 0.835, "small_model_cost": 3.4565800000000014e-05, "large_model_cost": 0.0006763799999999996, "hybrid_results": {"0.05": {"accuracy": 0.782, "large_model_usage": 0.792, "small_model_usage": 0.20799999999999996, "avg_lambda": 0.8999999999999925, "avg_cost": 0.0005720950000000002, "unrestricted": {"accuracy": 0.782, "large_model_usage": 0.792, "avg_cost": 0.0005728710000000004}, "random_baseline": {"accuracy": 0.7904000000000001, "accuracy_std": 0.008187795796183495, "avg_cost": 0.0005662060799999996, "cost_std": 9.074434027177601e-06, "small_model_fraction": 0.16248471909783158}, "unrestricted_random_baseline": {"accuracy": 0.7902000000000001, "accuracy_std": 0.01129424632279641, "avg_cost": 0.0005697064799999996, "cost_std": 8.71383390899784e-06, "small_model_fraction": 0.16127564644097825}}, "0.1": {"accuracy": 0.75, "large_model_usage": 0.556, "small_model_usage": 0.44399999999999995, "avg_lambda": 0.7599999999999948, "avg_cost": 0.0004098110000000001, "unrestricted": {"accuracy": 0.756, "large_model_usage": 0.556, "avg_cost": 0.0004165790000000001}, "random_baseline": {"accuracy": 0.7328, "accuracy_std": 0.010476640682967045, "avg_cost": 0.0004034586199999998, "cost_std": 7.089592451163917e-06, "small_model_fraction": 0.4153367127121832}, "unrestricted_random_baseline": {"accuracy": 0.7338, "accuracy_std": 0.009357350052231676, "avg_cost": 0.00040843385999999964, "cost_std": 1.0433892528505325e-05, "small_model_fraction": 0.4047916047977744}}, "0.15": {"accuracy": 0.678, "large_model_usage": 0.31, "small_model_usage": 0.69, "avg_lambda": 0.5399999999999986, "avg_cost": 0.00023978299999999984, "unrestricted": {"accuracy": 0.682, "large_model_usage": 0.31, "avg_cost": 0.00024748299999999984}, "random_baseline": {"accuracy": 0.6668000000000001, "accuracy_std": 0.012718490476467729, "avg_cost": 0.0002337922200000001, "cost_std": 1.464879429071214e-05, "small_model_fraction": 0.6802545035619344}, "unrestricted_random_baseline": {"accuracy": 0.6764, "accuracy_std": 0.009991996797437409, "avg_cost": 0.00024152122, "cost_std": 1.1746169011707669e-05, "small_model_fraction": 0.6682572619926453}}, "0.2": {"accuracy": 0.668, "large_model_usage": 0.224, "small_model_usage": 0.776, "avg_lambda": 0.37000000000000255, "avg_cost": 0.00017666699999999995, "unrestricted": {"accuracy": 0.67, "large_model_usage": 0.224, "avg_cost": 0.00019011899999999996}, "random_baseline": {"accuracy": 0.6476, "accuracy_std": 0.009707728879609285, "avg_cost": 0.00017394283999999993, "cost_std": 1.3649710975636103e-05, "small_model_fraction": 0.7785944904304081}, "unrestricted_random_baseline": {"accuracy": 0.6578, "accuracy_std": 0.009693296652842116, "avg_cost": 0.0001887725, "cost_std": 1.4495466738880867e-05, "small_model_fraction": 0.7576351536005279}}, "0.25": {"accuracy": 0.63, "large_model_usage": 0.142, "small_model_usage": 0.858, "avg_lambda": 0.05000000000000044, "avg_cost": 0.00011571900000000004, "unrestricted": {"accuracy": 0.648, "large_model_usage": 0.142, "avg_cost": 0.00013309100000000006}, "random_baseline": {"accuracy": 0.6255999999999999, "accuracy_std": 0.007364781055808794, "avg_cost": 0.00011321641999999998, "cost_std": 1.1101949275852412e-05, "small_model_fraction": 0.8735565526596326}, "unrestricted_random_baseline": {"accuracy": 0.6277999999999999, "accuracy_std": 0.009734474818910375, "avg_cost": 0.00012600279999999995, "cost_std": 4.790000745720177e-06, "small_model_fraction": 0.8464895292126597}}}}, {"trial": 9, "trial_seed": 50, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.6, "large_model_accuracy": 0.809, "small_model_cost": 3.444400000000001e-05, "large_model_cost": 0.000673792, "hybrid_results": {"0.05": {"accuracy": 0.766, "large_model_usage": 0.814, "small_model_usage": 0.18600000000000005, "avg_lambda": 0.8999999999999925, "avg_cost": 0.0005883783999999997, "unrestricted": {"accuracy": 0.766, "large_model_usage": 0.814, "avg_cost": 0.0005894223999999997}, "random_baseline": {"accuracy": 0.7849999999999999, "accuracy_std": 0.006016643582596535, "avg_cost": 0.0005873767200000001, "cost_std": 5.170324368315751e-06, "small_model_fraction": 0.13359484975318656}, "unrestricted_random_baseline": {"accuracy": 0.7882, "accuracy_std": 0.007236021006050223, "avg_cost": 0.0005946289599999999, "cost_std": 1.1255047743941372e-05, "small_model_fraction": 0.13196193622252708}}, "0.1": {"accuracy": 0.74, "large_model_usage": 0.524, "small_model_usage": 0.476, "avg_lambda": 0.7799999999999914, "avg_cost": 0.0003971784000000002, "unrestricted": {"accuracy": 0.74, "large_model_usage": 0.524, "avg_cost": 0.0004002064000000001}, "random_baseline": {"accuracy": 0.7172, "accuracy_std": 0.009431860898041286, "avg_cost": 0.00039890864000000005, "cost_std": 7.112644385768201e-06, "small_model_fraction": 0.43264951169003396}, "unrestricted_random_baseline": {"accuracy": 0.7198, "accuracy_std": 0.009651942809610932, "avg_cost": 0.00039884614, "cost_std": 1.4927440590549973e-05, "small_model_fraction": 0.4279134368137539}}, "0.15": {"accuracy": 0.712, "large_model_usage": 0.504, "small_model_usage": 0.496, "avg_lambda": 0.6999999999999935, "avg_cost": 0.0003639904000000001, "unrestricted": {"accuracy": 0.74, "large_model_usage": 0.504, "avg_cost": 0.00038530240000000005}, "random_baseline": {"accuracy": 0.7061999999999999, "accuracy_std": 0.008506468127254703, "avg_cost": 0.00036849174000000017, "cost_std": 1.3313970046548887e-05, "small_model_fraction": 0.48455864411869576}, "unrestricted_random_baseline": {"accuracy": 0.7184000000000001, "accuracy_std": 0.006800000000000006, "avg_cost": 0.0003873431800000001, "cost_std": 1.3813482087424485e-05, "small_model_fraction": 0.45122468514799446}}, "0.2": {"accuracy": 0.672, "large_model_usage": 0.248, "small_model_usage": 0.752, "avg_lambda": 0.4000000000000035, "avg_cost": 0.0001945464, "unrestricted": {"accuracy": 0.676, "large_model_usage": 0.248, "avg_cost": 0.0002056424}, "random_baseline": {"accuracy": 0.6514, "accuracy_std": 0.008765842800324459, "avg_cost": 0.00019302103999999996, "cost_std": 1.0839335792492029e-05, "small_model_fraction": 0.7495848896062864}, "unrestricted_random_baseline": {"accuracy": 0.6626, "accuracy_std": 0.005063595560468869, "avg_cost": 0.00020907153999999997, "cost_std": 1.3311264338762092e-05, "small_model_fraction": 0.7322297090160601}}, "0.25": {"accuracy": 0.622, "large_model_usage": 0.176, "small_model_usage": 0.8240000000000001, "avg_lambda": 0.10999999999999954, "avg_cost": 0.00013687840000000005, "unrestricted": {"accuracy": 0.662, "large_model_usage": 0.176, "avg_cost": 0.00015769040000000013}, "random_baseline": {"accuracy": 0.6374, "accuracy_std": 0.0078000000000000074, "avg_cost": 0.0001419558800000001, "cost_std": 1.3007624641017314e-05, "small_model_fraction": 0.8397830289607537}, "unrestricted_random_baseline": {"accuracy": 0.6406, "accuracy_std": 0.008765842800324459, "avg_cost": 0.00016222076000000006, "cost_std": 1.3892727644577159e-05, "small_model_fraction": 0.8072311167001381}}}}, {"trial": 10, "trial_seed": 51, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.586, "large_model_accuracy": 0.846, "small_model_cost": 3.482450000000001e-05, "large_model_cost": 0.0006813299999999989, "hybrid_results": {"0.05": {"accuracy": 0.788, "large_model_usage": 0.808, "small_model_usage": 0.19199999999999995, "avg_lambda": 0.8999999999999925, "avg_cost": 0.0005917324000000003, "unrestricted": {"accuracy": 0.788, "large_model_usage": 0.808, "avg_cost": 0.0005923284000000003}, "random_baseline": {"accuracy": 0.8074, "accuracy_std": 0.008581375181169958, "avg_cost": 0.0005901812199999994, "cost_std": 9.902982396207653e-06, "small_model_fraction": 0.13858752941776784}, "unrestricted_random_baseline": {"accuracy": 0.8118000000000001, "accuracy_std": 0.0072360210060501765, "avg_cost": 0.0005913463999999993, "cost_std": 7.055481344883437e-06, "small_model_fraction": 0.13766565017621465}}, "0.1": {"accuracy": 0.734, "large_model_usage": 0.522, "small_model_usage": 0.478, "avg_lambda": 0.7599999999999948, "avg_cost": 0.0003916124000000005, "unrestricted": {"accuracy": 0.738, "large_model_usage": 0.522, "avg_cost": 0.0003998244000000007}, "random_baseline": {"accuracy": 0.7134, "accuracy_std": 0.011934822998268564, "avg_cost": 0.00038787732000000025, "cost_std": 1.1485937723042026e-05, "small_model_fraction": 0.4481285928735314}, "unrestricted_random_baseline": {"accuracy": 0.7227999999999999, "accuracy_std": 0.014675149062275326, "avg_cost": 0.0003986583200000002, "cost_std": 1.8570588585653317e-05, "small_model_fraction": 0.43542645808890823}}, "0.15": {"accuracy": 0.698, "large_model_usage": 0.362, "small_model_usage": 0.638, "avg_lambda": 0.6699999999999996, "avg_cost": 0.0002820803999999998, "unrestricted": {"accuracy": 0.698, "large_model_usage": 0.362, "avg_cost": 0.0002866683999999998}, "random_baseline": {"accuracy": 0.6732, "accuracy_std": 0.010476640682967022, "avg_cost": 0.00027268997999999994, "cost_std": 1.15210807581408e-05, "small_model_fraction": 0.617550198722207}, "unrestricted_random_baseline": {"accuracy": 0.6758, "accuracy_std": 0.00750732975164937, "avg_cost": 0.00028918912, "cost_std": 1.316901754154816e-05, "small_model_fraction": 0.610453584694948}}, "0.2": {"accuracy": 0.646, "large_model_usage": 0.248, "small_model_usage": 0.752, "avg_lambda": 0.4000000000000035, "avg_cost": 0.00019496839999999974, "unrestricted": {"accuracy": 0.654, "large_model_usage": 0.248, "avg_cost": 0.00020637239999999975}, "random_baseline": {"accuracy": 0.628, "accuracy_std": 0.00824621125123533, "avg_cost": 0.00019392963999999985, "cost_std": 1.3347028572023095e-05, "small_model_fraction": 0.7522930586050699}, "unrestricted_random_baseline": {"accuracy": 0.6474, "accuracy_std": 0.011629273408085316, "avg_cost": 0.00020938465999999987, "cost_std": 1.3141954267474907e-05, "small_model_fraction": 0.734653610835484}}, "0.25": {"accuracy": 0.584, "large_model_usage": 0.146, "small_model_usage": 0.854, "avg_lambda": 0.079999999999999, "avg_cost": 0.00011785239999999996, "unrestricted": {"accuracy": 0.612, "large_model_usage": 0.146, "avg_cost": 0.0001343564}, "random_baseline": {"accuracy": 0.6002, "accuracy_std": 0.005095095681142804, "avg_cost": 0.00011208653999999994, "cost_std": 8.823957183508998e-06, "small_model_fraction": 0.8715743330876534}, "unrestricted_random_baseline": {"accuracy": 0.6038, "accuracy_std": 0.007400000000000006, "avg_cost": 0.00013096557999999994, "cost_std": 1.0130913012932257e-05, "small_model_fraction": 0.8460463213383332}}}}, {"trial": 11, "trial_seed": 52, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.589, "large_model_accuracy": 0.846, "small_model_cost": 3.44052e-05, "large_model_cost": 0.0006724879999999981, "hybrid_results": {"0.05": {"accuracy": 0.778, "large_model_usage": 0.822, "small_model_usage": 0.17800000000000005, "avg_lambda": 0.8999999999999925, "avg_cost": 0.0005978198000000005, "unrestricted": {"accuracy": 0.778, "large_model_usage": 0.822, "avg_cost": 0.0005981638000000005}, "random_baseline": {"accuracy": 0.7962, "accuracy_std": 0.008553361912137245, "avg_cost": 0.0006018236199999999, "cost_std": 9.690870148526416e-06, "small_model_fraction": 0.1170196093673076}, "unrestricted_random_baseline": {"accuracy": 0.7938000000000001, "accuracy_std": 0.00922821759604747, "avg_cost": 0.0006016121799999998, "cost_std": 1.2316982692997593e-05, "small_model_fraction": 0.11648049438097668}}, "0.1": {"accuracy": 0.76, "large_model_usage": 0.62, "small_model_usage": 0.38, "avg_lambda": 0.800000000000007, "avg_cost": 0.0004586397999999996, "unrestricted": {"accuracy": 0.762, "large_model_usage": 0.62, "avg_cost": 0.0004625717999999997}, "random_baseline": {"accuracy": 0.7412, "accuracy_std": 0.01085172797300044, "avg_cost": 0.00046288599999999976, "cost_std": 1.39056348552665e-05, "small_model_fraction": 0.3351417715694564}, "unrestricted_random_baseline": {"accuracy": 0.7442, "accuracy_std": 0.011847362575695918, "avg_cost": 0.0004723587199999998, "cost_std": 1.1870634436356107e-05, "small_model_fraction": 0.32897956190011557}}, "0.15": {"accuracy": 0.712, "large_model_usage": 0.502, "small_model_usage": 0.498, "avg_lambda": 0.6999999999999935, "avg_cost": 0.00036035579999999976, "unrestricted": {"accuracy": 0.744, "large_model_usage": 0.502, "avg_cost": 0.00038052379999999985}, "random_baseline": {"accuracy": 0.7030000000000001, "accuracy_std": 0.013122499761859367, "avg_cost": 0.00035965809999999966, "cost_std": 1.4926717233270018e-05, "small_model_fraction": 0.4891719381873313}, "unrestricted_random_baseline": {"accuracy": 0.712, "accuracy_std": 0.009465727652959393, "avg_cost": 0.0003829235799999998, "cost_std": 1.4800807142706828e-05, "small_model_fraction": 0.4575647549189527}}, "0.2": {"accuracy": 0.672, "large_model_usage": 0.324, "small_model_usage": 0.6759999999999999, "avg_lambda": 0.5800000000000031, "avg_cost": 0.00024726379999999985, "unrestricted": {"accuracy": 0.678, "large_model_usage": 0.324, "avg_cost": 0.0002584477999999999}, "random_baseline": {"accuracy": 0.6534000000000001, "accuracy_std": 0.01200166655094201, "avg_cost": 0.00023956056, "cost_std": 1.69708194776328e-05, "small_model_fraction": 0.6664091243330795}, "unrestricted_random_baseline": {"accuracy": 0.6578, "accuracy_std": 0.01167732846159601, "avg_cost": 0.00025333116, "cost_std": 1.564976925984533e-05, "small_model_fraction": 0.6488816184984134}}, "0.25": {"accuracy": 0.636, "large_model_usage": 0.182, "small_model_usage": 0.8180000000000001, "avg_lambda": 0.319999999999996, "avg_cost": 0.00014551580000000003, "unrestricted": {"accuracy": 0.642, "large_model_usage": 0.182, "avg_cost": 0.0001612238}, "random_baseline": {"accuracy": 0.6217999999999999, "accuracy_std": 0.006029925372672539, "avg_cost": 0.00015312694, "cost_std": 7.82953619573985e-06, "small_model_fraction": 0.8258680534877285}, "unrestricted_random_baseline": {"accuracy": 0.6178, "accuracy_std": 0.010934349546269326, "avg_cost": 0.00016090463999999998, "cost_std": 9.201445635464003e-06, "small_model_fraction": 0.8012505587049199}}}}, {"trial": 12, "trial_seed": 53, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.578, "large_model_accuracy": 0.831, "small_model_cost": 3.448620000000002e-05, "large_model_cost": 0.00067474, "hybrid_results": {"0.05": {"accuracy": 0.766, "large_model_usage": 0.812, "small_model_usage": 0.18799999999999994, "avg_lambda": 0.8899999999999905, "avg_cost": 0.0005862446000000004, "unrestricted": {"accuracy": 0.772, "large_model_usage": 0.812, "avg_cost": 0.0005937686000000004}, "random_baseline": {"accuracy": 0.7868, "accuracy_std": 0.010127191120937738, "avg_cost": 0.00058802474, "cost_std": 9.48005337539838e-06, "small_model_fraction": 0.13821924992869947}, "unrestricted_random_baseline": {"accuracy": 0.7904, "accuracy_std": 0.008754427451295723, "avg_cost": 0.00059511654, "cost_std": 1.0791416333753385e-05, "small_model_fraction": 0.1264676601685138}}, "0.1": {"accuracy": 0.746, "large_model_usage": 0.558, "small_model_usage": 0.44199999999999995, "avg_lambda": 0.7599999999999948, "avg_cost": 0.00041418460000000015, "unrestricted": {"accuracy": 0.746, "large_model_usage": 0.558, "avg_cost": 0.0004203486000000001}, "random_baseline": {"accuracy": 0.7234, "accuracy_std": 0.010001999800039999, "avg_cost": 0.00042010782000000016, "cost_std": 1.6891241370473682e-05, "small_model_fraction": 0.4069564288411874}, "unrestricted_random_baseline": {"accuracy": 0.7188, "accuracy_std": 0.009303762679690413, "avg_cost": 0.0004248642000000001, "cost_std": 9.982838799459765e-06, "small_model_fraction": 0.3973289967197381}}, "0.15": {"accuracy": 0.682, "large_model_usage": 0.338, "small_model_usage": 0.6619999999999999, "avg_lambda": 0.5999999999999999, "avg_cost": 0.0002605006000000001, "unrestricted": {"accuracy": 0.692, "large_model_usage": 0.338, "avg_cost": 0.0002685366000000002}, "random_baseline": {"accuracy": 0.6568, "accuracy_std": 0.009846826900072944, "avg_cost": 0.0002603401800000001, "cost_std": 1.4815401738042763e-05, "small_model_fraction": 0.6469924895408663}, "unrestricted_random_baseline": {"accuracy": 0.6594000000000001, "accuracy_std": 0.010001999800039999, "avg_cost": 0.00026862880000000003, "cost_std": 1.1619468209173772e-05, "small_model_fraction": 0.6344412169049208}}, "0.2": {"accuracy": 0.646, "large_model_usage": 0.258, "small_model_usage": 0.742, "avg_lambda": 0.4000000000000035, "avg_cost": 0.00020021260000000006, "unrestricted": {"accuracy": 0.664, "large_model_usage": 0.258, "avg_cost": 0.00021372460000000008}, "random_baseline": {"accuracy": 0.6356, "accuracy_std": 0.010947145746723216, "avg_cost": 0.0002081083200000001, "cost_std": 1.208742251506081e-05, "small_model_fraction": 0.741155148161557}, "unrestricted_random_baseline": {"accuracy": 0.6432, "accuracy_std": 0.013029197979921876, "avg_cost": 0.0002186584400000001, "cost_std": 1.830464611333417e-05, "small_model_fraction": 0.7200510172684643}}, "0.25": {"accuracy": 0.59, "large_model_usage": 0.14, "small_model_usage": 0.86, "avg_lambda": 0.05999999999999949, "avg_cost": 0.00011441260000000004, "unrestricted": {"accuracy": 0.616, "large_model_usage": 0.14, "avg_cost": 0.00013173660000000007}, "random_baseline": {"accuracy": 0.6026, "accuracy_std": 0.005868560300448489, "avg_cost": 0.00011402336, "cost_std": 4.345613884182528e-06, "small_model_fraction": 0.8751645050759558}, "unrestricted_random_baseline": {"accuracy": 0.612, "accuracy_std": 0.008763560920082665, "avg_cost": 0.00013446486000000003, "cost_std": 7.868607685251552e-06, "small_model_fraction": 0.8481064852719343}}}}, {"trial": 13, "trial_seed": 54, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.563, "large_model_accuracy": 0.826, "small_model_cost": 3.4402799999999955e-05, "large_model_cost": 0.0006713359999999993, "hybrid_results": {"0.05": {"accuracy": 0.848, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0007077655999999996, "unrestricted": {"accuracy": 0.848, "large_model_usage": 1.0, "avg_cost": 0.0007077655999999996}, "random_baseline": {"accuracy": 0.8480000000000001, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006732640000000005, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8480000000000001, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006732640000000005, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.73, "large_model_usage": 0.528, "small_model_usage": 0.472, "avg_lambda": 0.7599999999999948, "avg_cost": 0.00039556559999999995, "unrestricted": {"accuracy": 0.74, "large_model_usage": 0.528, "avg_cost": 0.00040322160000000004}, "random_baseline": {"accuracy": 0.7404, "accuracy_std": 0.011164228589562302, "avg_cost": 0.00040674640000000007, "cost_std": 1.519289725602067e-05, "small_model_fraction": 0.4329659688017513}, "unrestricted_random_baseline": {"accuracy": 0.7338, "accuracy_std": 0.008784076502399107, "avg_cost": 0.0004033206600000001, "cost_std": 1.2251401527678472e-05, "small_model_fraction": 0.4209458699907612}}, "0.15": {"accuracy": 0.678, "large_model_usage": 0.328, "small_model_usage": 0.6719999999999999, "avg_lambda": 0.639999999999992, "avg_cost": 0.00025862159999999994, "unrestricted": {"accuracy": 0.676, "large_model_usage": 0.328, "avg_cost": 0.00026476559999999986}, "random_baseline": {"accuracy": 0.6794, "accuracy_std": 0.007485986908885149, "avg_cost": 0.00025822089999999996, "cost_std": 7.010911027391489e-06, "small_model_fraction": 0.6479712472202733}, "unrestricted_random_baseline": {"accuracy": 0.6834, "accuracy_std": 0.009509994742374954, "avg_cost": 0.00026683505999999997, "cost_std": 1.7062216388511768e-05, "small_model_fraction": 0.6383250237230527}}, "0.2": {"accuracy": 0.648, "large_model_usage": 0.24, "small_model_usage": 0.76, "avg_lambda": 0.44999999999999624, "avg_cost": 0.00019140159999999997, "unrestricted": {"accuracy": 0.662, "large_model_usage": 0.24, "avg_cost": 0.00020291360000000008}, "random_baseline": {"accuracy": 0.6519999999999999, "accuracy_std": 0.00638748776906853, "avg_cost": 0.00018457930000000008, "cost_std": 7.344016139551938e-06, "small_model_fraction": 0.7535082171882386}, "unrestricted_random_baseline": {"accuracy": 0.6524, "accuracy_std": 0.01341044369139218, "avg_cost": 0.00020517818000000002, "cost_std": 1.2294098276961984e-05, "small_model_fraction": 0.7354341083177949}}, "0.25": {"accuracy": 0.606, "large_model_usage": 0.154, "small_model_usage": 0.846, "avg_lambda": 0.2699999999999993, "avg_cost": 0.0001271296, "unrestricted": {"accuracy": 0.644, "large_model_usage": 0.154, "avg_cost": 0.00014200560000000005}, "random_baseline": {"accuracy": 0.6266, "accuracy_std": 0.00938296328459193, "avg_cost": 0.00013437552000000004, "cost_std": 1.364532384465832e-05, "small_model_fraction": 0.8544167583036962}, "unrestricted_random_baseline": {"accuracy": 0.6292, "accuracy_std": 0.005810335618533585, "avg_cost": 0.00014448884000000008, "cost_std": 7.3676717140763154e-06, "small_model_fraction": 0.8310610908647874}}}}, {"trial": 14, "trial_seed": 55, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.593, "large_model_accuracy": 0.834, "small_model_cost": 3.476130000000004e-05, "large_model_cost": 0.0006796259999999994, "hybrid_results": {"0.05": {"accuracy": 0.838, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0007145521999999996, "unrestricted": {"accuracy": 0.838, "large_model_usage": 1.0, "avg_cost": 0.0007145521999999996}, "random_baseline": {"accuracy": 0.8379999999999999, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006796839999999996, "cost_std": 0.0, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8379999999999999, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006796839999999996, "cost_std": 0.0, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.776, "large_model_usage": 0.62, "small_model_usage": 0.38, "avg_lambda": 0.800000000000007, "avg_cost": 0.00045944019999999994, "unrestricted": {"accuracy": 0.784, "large_model_usage": 0.62, "avg_cost": 0.0004648361999999999}, "random_baseline": {"accuracy": 0.7696, "accuracy_std": 0.014051334456200248, "avg_cost": 0.00045901184000000017, "cost_std": 1.904796258013973e-05, "small_model_fraction": 0.34144495736857616}, "unrestricted_random_baseline": {"accuracy": 0.7657999999999999, "accuracy_std": 0.014379151574414965, "avg_cost": 0.0004581168200000002, "cost_std": 1.4996843861679698e-05, "small_model_fraction": 0.3330773106358585}}, "0.15": {"accuracy": 0.74, "large_model_usage": 0.512, "small_model_usage": 0.488, "avg_lambda": 0.6999999999999935, "avg_cost": 0.0003657122000000003, "unrestricted": {"accuracy": 0.766, "large_model_usage": 0.512, "avg_cost": 0.0003896282000000002}, "random_baseline": {"accuracy": 0.7243999999999999, "accuracy_std": 0.012289833196589784, "avg_cost": 0.00036748504000000024, "cost_std": 1.5494209211392562e-05, "small_model_fraction": 0.486790174745182}, "unrestricted_random_baseline": {"accuracy": 0.7484, "accuracy_std": 0.012611106216347567, "avg_cost": 0.00039711880000000023, "cost_std": 1.6216719593801997e-05, "small_model_fraction": 0.4497033253642189}}, "0.2": {"accuracy": 0.71, "large_model_usage": 0.28, "small_model_usage": 0.72, "avg_lambda": 0.5, "avg_cost": 0.0002263362, "unrestricted": {"accuracy": 0.718, "large_model_usage": 0.28, "avg_cost": 0.00023455619999999998}, "random_baseline": {"accuracy": 0.6812, "accuracy_std": 0.01223764683262266, "avg_cost": 0.00022859344, "cost_std": 1.2396523784609952e-05, "small_model_fraction": 0.7029223339407473}, "unrestricted_random_baseline": {"accuracy": 0.6848, "accuracy_std": 0.009967948635501666, "avg_cost": 0.00023359004000000002, "cost_std": 1.3216775284402766e-05, "small_model_fraction": 0.6901754740180379}}, "0.25": {"accuracy": 0.662, "large_model_usage": 0.188, "small_model_usage": 0.812, "avg_lambda": 0.31000000000000166, "avg_cost": 0.00014924419999999997, "unrestricted": {"accuracy": 0.69, "large_model_usage": 0.188, "avg_cost": 0.0001650161999999998}, "random_baseline": {"accuracy": 0.6616000000000001, "accuracy_std": 0.00542586398650022, "avg_cost": 0.00014910267999999996, "cost_std": 9.647884355525824e-06, "small_model_fraction": 0.822469891746284}, "unrestricted_random_baseline": {"accuracy": 0.6608, "accuracy_std": 0.007110555533852477, "avg_cost": 0.00016130401999999992, "cost_std": 1.3484896113637694e-05, "small_model_fraction": 0.7980120481086964}}}}, {"trial": 15, "trial_seed": 56, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.579, "large_model_accuracy": 0.821, "small_model_cost": 3.4900799999999976e-05, "large_model_cost": 0.0006828799999999996, "hybrid_results": {"0.05": {"accuracy": 0.786, "large_model_usage": 0.828, "small_model_usage": 0.17200000000000004, "avg_lambda": 0.8999999999999925, "avg_cost": 0.0006079966000000002, "unrestricted": {"accuracy": 0.786, "large_model_usage": 0.828, "avg_cost": 0.0006085326000000002}, "random_baseline": {"accuracy": 0.796, "accuracy_std": 0.009165151389911667, "avg_cost": 0.00060534692, "cost_std": 1.3197585617134662e-05, "small_model_fraction": 0.1155645119472963}, "unrestricted_random_baseline": {"accuracy": 0.7942, "accuracy_std": 0.007400000000000006, "avg_cost": 0.0006046956599999999, "cost_std": 9.181007864521228e-06, "small_model_fraction": 0.11473732490178623}}, "0.1": {"accuracy": 0.754, "large_model_usage": 0.612, "small_model_usage": 0.388, "avg_lambda": 0.800000000000007, "avg_cost": 0.0004575326, "unrestricted": {"accuracy": 0.762, "large_model_usage": 0.612, "avg_cost": 0.00046184059999999986}, "random_baseline": {"accuracy": 0.7414, "accuracy_std": 0.005370288632839025, "avg_cost": 0.00046277370000000044, "cost_std": 1.763203085846905e-05, "small_model_fraction": 0.3477694963048193}, "unrestricted_random_baseline": {"accuracy": 0.7412000000000001, "accuracy_std": 0.01590471628165685, "avg_cost": 0.00045759454000000025, "cost_std": 1.2942538608495715e-05, "small_model_fraction": 0.34112113475247335}}, "0.15": {"accuracy": 0.724, "large_model_usage": 0.502, "small_model_usage": 0.498, "avg_lambda": 0.6999999999999935, "avg_cost": 0.0003646765999999998, "unrestricted": {"accuracy": 0.74, "large_model_usage": 0.502, "avg_cost": 0.00038812859999999983}, "random_baseline": {"accuracy": 0.7188, "accuracy_std": 0.012106196760337256, "avg_cost": 0.00036881512000000026, "cost_std": 1.3268522556848503e-05, "small_model_fraction": 0.4910703923829654}, "unrestricted_random_baseline": {"accuracy": 0.7182, "accuracy_std": 0.013310146505579854, "avg_cost": 0.00038785910000000025, "cost_std": 1.866156138403222e-05, "small_model_fraction": 0.4548778726230718}}, "0.2": {"accuracy": 0.668, "large_model_usage": 0.246, "small_model_usage": 0.754, "avg_lambda": 0.44999999999999624, "avg_cost": 0.00019328860000000003, "unrestricted": {"accuracy": 0.688, "large_model_usage": 0.246, "avg_cost": 0.0002017726000000001}, "random_baseline": {"accuracy": 0.6482, "accuracy_std": 0.012975361266646885, "avg_cost": 0.00019081188, "cost_std": 1.1502907892598245e-05, "small_model_fraction": 0.7555665367036472}, "unrestricted_random_baseline": {"accuracy": 0.6531999999999999, "accuracy_std": 0.009474175425861616, "avg_cost": 0.00020345220000000003, "cost_std": 1.1800865001515793e-05, "small_model_fraction": 0.7424735238415057}}, "0.25": {"accuracy": 0.652, "large_model_usage": 0.184, "small_model_usage": 0.8160000000000001, "avg_lambda": 0.29999999999999993, "avg_cost": 0.0001430165999999999, "unrestricted": {"accuracy": 0.676, "large_model_usage": 0.184, "avg_cost": 0.0001583326}, "random_baseline": {"accuracy": 0.6325999999999999, "accuracy_std": 0.009718024490605083, "avg_cost": 0.00014697445999999997, "cost_std": 1.1104246915140224e-05, "small_model_fraction": 0.8331492739273113}, "unrestricted_random_baseline": {"accuracy": 0.6442, "accuracy_std": 0.008874683092933523, "avg_cost": 0.00016293336, "cost_std": 9.302387999132247e-06, "small_model_fraction": 0.8095127127537427}}}}, {"trial": 16, "trial_seed": 57, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.608, "large_model_accuracy": 0.831, "small_model_cost": 3.456690000000001e-05, "large_model_cost": 0.000675418, "hybrid_results": {"0.05": {"accuracy": 0.814, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0007114249999999998, "unrestricted": {"accuracy": 0.814, "large_model_usage": 1.0, "avg_cost": 0.0007114249999999998}, "random_baseline": {"accuracy": 0.8139999999999998, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.000676772, "cost_std": 0.0, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8139999999999998, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.000676772, "cost_std": 0.0, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.758, "large_model_usage": 0.644, "small_model_usage": 0.356, "avg_lambda": 0.800000000000007, "avg_cost": 0.0004744530000000001, "unrestricted": {"accuracy": 0.764, "large_model_usage": 0.644, "avg_cost": 0.0004784010000000002}, "random_baseline": {"accuracy": 0.7492000000000001, "accuracy_std": 0.01357055636294991, "avg_cost": 0.0004807402199999999, "cost_std": 9.172878173158039e-06, "small_model_fraction": 0.3135907857535079}, "unrestricted_random_baseline": {"accuracy": 0.7534, "accuracy_std": 0.007158212067269319, "avg_cost": 0.00048582865999999985, "cost_std": 1.07474065329456e-05, "small_model_fraction": 0.3074302283322909}}, "0.15": {"accuracy": 0.716, "large_model_usage": 0.554, "small_model_usage": 0.44599999999999995, "avg_lambda": 0.7300000000000023, "avg_cost": 0.00039318900000000006, "unrestricted": {"accuracy": 0.754, "large_model_usage": 0.554, "avg_cost": 0.00041689300000000043}, "random_baseline": {"accuracy": 0.7272000000000001, "accuracy_std": 0.005528109984434106, "avg_cost": 0.0003934219599999999, "cost_std": 9.283937081992629e-06, "small_model_fraction": 0.44039715309843414}, "unrestricted_random_baseline": {"accuracy": 0.7268, "accuracy_std": 0.010205880657738468, "avg_cost": 0.00042558323999999985, "cost_std": 1.2417501360193268e-05, "small_model_fraction": 0.4034088417730726}}, "0.2": {"accuracy": 0.676, "large_model_usage": 0.266, "small_model_usage": 0.734, "avg_lambda": 0.47999999999999593, "avg_cost": 0.00021030100000000009, "unrestricted": {"accuracy": 0.684, "large_model_usage": 0.266, "avg_cost": 0.0002176370000000001}, "random_baseline": {"accuracy": 0.6676, "accuracy_std": 0.010688311372709921, "avg_cost": 0.00021179796000000002, "cost_std": 1.3797050776974014e-05, "small_model_fraction": 0.725780138319182}, "unrestricted_random_baseline": {"accuracy": 0.6656000000000001, "accuracy_std": 0.01310877568653914, "avg_cost": 0.00021590280000000002, "cost_std": 1.2279534481404384e-05, "small_model_fraction": 0.7143328614088356}}, "0.25": {"accuracy": 0.626, "large_model_usage": 0.172, "small_model_usage": 0.8280000000000001, "avg_lambda": 0.10999999999999954, "avg_cost": 0.00013308500000000002, "unrestricted": {"accuracy": 0.664, "large_model_usage": 0.172, "avg_cost": 0.00015252100000000016}, "random_baseline": {"accuracy": 0.6378000000000001, "accuracy_std": 0.007560423268574327, "avg_cost": 0.00013171249999999998, "cost_std": 8.247104539291336e-06, "small_model_fraction": 0.8462699057550185}, "unrestricted_random_baseline": {"accuracy": 0.6426, "accuracy_std": 0.004820788317277584, "avg_cost": 0.00015278820000000004, "cost_std": 1.7072716505817126e-05, "small_model_fraction": 0.8159414878120671}}}}, {"trial": 17, "trial_seed": 58, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.594, "large_model_accuracy": 0.826, "small_model_cost": 3.466269999999994e-05, "large_model_cost": 0.0006769980000000016, "hybrid_results": {"0.05": {"accuracy": 0.824, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0007129683999999999, "unrestricted": {"accuracy": 0.824, "large_model_usage": 1.0, "avg_cost": 0.0007129683999999999}, "random_baseline": {"accuracy": 0.8240000000000001, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.000678248, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8240000000000001, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.000678248, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.76, "large_model_usage": 0.55, "small_model_usage": 0.44999999999999996, "avg_lambda": 0.7799999999999914, "avg_cost": 0.0004114604000000002, "unrestricted": {"accuracy": 0.76, "large_model_usage": 0.55, "avg_cost": 0.00041476840000000007}, "random_baseline": {"accuracy": 0.728, "accuracy_std": 0.00885437744847147, "avg_cost": 0.0004174977199999999, "cost_std": 1.1130441309921203e-05, "small_model_fraction": 0.41339406381682703}, "unrestricted_random_baseline": {"accuracy": 0.7310000000000001, "accuracy_std": 0.009219544457292894, "avg_cost": 0.00041318137999999993, "cost_std": 7.145235520512939e-06, "small_model_fraction": 0.4082441055318007}}, "0.15": {"accuracy": 0.73, "large_model_usage": 0.534, "small_model_usage": 0.46599999999999997, "avg_lambda": 0.6999999999999935, "avg_cost": 0.0003796084000000001, "unrestricted": {"accuracy": 0.758, "large_model_usage": 0.534, "avg_cost": 0.00040273640000000014}, "random_baseline": {"accuracy": 0.72, "accuracy_std": 0.011593101396951562, "avg_cost": 0.00037737644000000004, "cost_std": 9.61689094408364e-06, "small_model_fraction": 0.4629818725516109}, "unrestricted_random_baseline": {"accuracy": 0.731, "accuracy_std": 0.01780449381476487, "avg_cost": 0.00040219306, "cost_std": 1.3968543175592745e-05, "small_model_fraction": 0.4269757554971691}}, "0.2": {"accuracy": 0.664, "large_model_usage": 0.24, "small_model_usage": 0.76, "avg_lambda": 0.4000000000000035, "avg_cost": 0.0001913203999999997, "unrestricted": {"accuracy": 0.674, "large_model_usage": 0.24, "avg_cost": 0.00020193639999999973}, "random_baseline": {"accuracy": 0.6518, "accuracy_std": 0.007871467461661778, "avg_cost": 0.00019393117999999985, "cost_std": 9.292226978049957e-06, "small_model_fraction": 0.7561122672224314}, "unrestricted_random_baseline": {"accuracy": 0.6634, "accuracy_std": 0.009961927524329825, "avg_cost": 0.0002052435799999999, "cost_std": 1.3105796507942612e-05, "small_model_fraction": 0.7395850734032533}}, "0.25": {"accuracy": 0.606, "large_model_usage": 0.046, "small_model_usage": 0.954, "avg_lambda": 0.0399999999999995, "avg_cost": 6.267239999999999e-05, "unrestricted": {"accuracy": 0.618, "large_model_usage": 0.046, "avg_cost": 6.82684e-05}, "random_baseline": {"accuracy": 0.6096, "accuracy_std": 0.004079215610874231, "avg_cost": 6.262975999999997e-05, "cost_std": 3.901184492740631e-06, "small_model_fraction": 0.9563939581087947}, "unrestricted_random_baseline": {"accuracy": 0.6098, "accuracy_std": 0.004512205669071395, "avg_cost": 6.636134e-05, "cost_std": 2.7608568656125554e-06, "small_model_fraction": 0.9476819972372685}}}}, {"trial": 18, "trial_seed": 59, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.575, "large_model_accuracy": 0.823, "small_model_cost": 3.434260000000001e-05, "large_model_cost": 0.0006703319999999992, "hybrid_results": {"0.05": {"accuracy": 0.798, "large_model_usage": 0.804, "small_model_usage": 0.19599999999999995, "avg_lambda": 0.939999999999998, "avg_cost": 0.0005785057999999999, "unrestricted": {"accuracy": 0.798, "large_model_usage": 0.804, "avg_cost": 0.0005785057999999999}, "random_baseline": {"accuracy": 0.8054, "accuracy_std": 0.0058685603004484735, "avg_cost": 0.0005793072600000001, "cost_std": 1.3218251571535352e-05, "small_model_fraction": 0.1443832239971285}, "unrestricted_random_baseline": {"accuracy": 0.8032, "accuracy_std": 0.006144916598294892, "avg_cost": 0.0005734541600000002, "cost_std": 6.947731301540077e-06, "small_model_fraction": 0.1443832239971285}}, "0.1": {"accuracy": 0.76, "large_model_usage": 0.624, "small_model_usage": 0.376, "avg_lambda": 0.800000000000007, "avg_cost": 0.0004528458000000003, "unrestricted": {"accuracy": 0.766, "large_model_usage": 0.624, "avg_cost": 0.0004572738000000002}, "random_baseline": {"accuracy": 0.7514000000000001, "accuracy_std": 0.008765842800324459, "avg_cost": 0.0004490494600000003, "cost_std": 7.487255511627801e-06, "small_model_fraction": 0.3419651333811526}, "unrestricted_random_baseline": {"accuracy": 0.7462, "accuracy_std": 0.011981652640600136, "avg_cost": 0.0004483482000000003, "cost_std": 1.3571859251849063e-05, "small_model_fraction": 0.33500275319053946}}, "0.15": {"accuracy": 0.712, "large_model_usage": 0.514, "small_model_usage": 0.486, "avg_lambda": 0.6999999999999935, "avg_cost": 0.0003634458000000003, "unrestricted": {"accuracy": 0.742, "large_model_usage": 0.514, "avg_cost": 0.00038405380000000014}, "random_baseline": {"accuracy": 0.7142, "accuracy_std": 0.0103324730824716, "avg_cost": 0.0003689364000000001, "cost_std": 1.1275495009266778e-05, "small_model_fraction": 0.4825335139233442}, "unrestricted_random_baseline": {"accuracy": 0.7252, "accuracy_std": 0.006823488843692799, "avg_cost": 0.00037914342000000024, "cost_std": 8.71989685464232e-06, "small_model_fraction": 0.4501304581491443}}, "0.2": {"accuracy": 0.672, "large_model_usage": 0.3, "small_model_usage": 0.7, "avg_lambda": 0.5, "avg_cost": 0.0002293617999999999, "unrestricted": {"accuracy": 0.686, "large_model_usage": 0.3, "avg_cost": 0.00023872979999999984}, "random_baseline": {"accuracy": 0.6656000000000002, "accuracy_std": 0.013705473359209444, "avg_cost": 0.0002282635399999999, "cost_std": 1.3871334200011227e-05, "small_model_fraction": 0.6933609270846336}, "unrestricted_random_baseline": {"accuracy": 0.6674, "accuracy_std": 0.011629273408085315, "avg_cost": 0.00023994343999999982, "cost_std": 1.2358630450596038e-05, "small_model_fraction": 0.6786311218394518}}, "0.25": {"accuracy": 0.62, "large_model_usage": 0.198, "small_model_usage": 0.802, "avg_lambda": 0.29999999999999993, "avg_cost": 0.00015213779999999994, "unrestricted": {"accuracy": 0.656, "large_model_usage": 0.198, "avg_cost": 0.00016832979999999994}, "random_baseline": {"accuracy": 0.6307999999999999, "accuracy_std": 0.008109253973085324, "avg_cost": 0.00015016721999999992, "cost_std": 1.4637985193174646e-05, "small_model_fraction": 0.8147843344558886}, "unrestricted_random_baseline": {"accuracy": 0.6364000000000001, "accuracy_std": 0.006858571279792905, "avg_cost": 0.00016875657999999995, "cost_std": 1.2633837802172392e-05, "small_model_fraction": 0.789324790633303}}}}, {"trial": 19, "trial_seed": 60, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.578, "large_model_accuracy": 0.824, "small_model_cost": 3.501389999999993e-05, "large_model_cost": 0.0006840139999999998, "hybrid_results": {"0.05": {"accuracy": 0.826, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0007159555999999992, "unrestricted": {"accuracy": 0.826, "large_model_usage": 1.0, "avg_cost": 0.0007159555999999992}, "random_baseline": {"accuracy": 0.826, "accuracy_std": 0.0, "avg_cost": 0.0006811440000000001, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.826, "accuracy_std": 0.0, "avg_cost": 0.0006811440000000001, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.732, "large_model_usage": 0.54, "small_model_usage": 0.45999999999999996, "avg_lambda": 0.7799999999999914, "avg_cost": 0.00041077560000000036, "unrestricted": {"accuracy": 0.734, "large_model_usage": 0.54, "avg_cost": 0.0004130396000000004}, "random_baseline": {"accuracy": 0.7202, "accuracy_std": 0.01521709564930182, "avg_cost": 0.00040833744000000024, "cost_std": 1.3805845474667535e-05, "small_model_fraction": 0.42101441895001174}, "unrestricted_random_baseline": {"accuracy": 0.7222, "accuracy_std": 0.009856977224281294, "avg_cost": 0.00040704416000000013, "cost_std": 1.2203586336417681e-05, "small_model_fraction": 0.417525975727892}}, "0.15": {"accuracy": 0.702, "large_model_usage": 0.512, "small_model_usage": 0.488, "avg_lambda": 0.6999999999999935, "avg_cost": 0.0003681796000000003, "unrestricted": {"accuracy": 0.728, "large_model_usage": 0.512, "avg_cost": 0.00039203160000000035}, "random_baseline": {"accuracy": 0.713, "accuracy_std": 0.013030732903409552, "avg_cost": 0.0003724332800000002, "cost_std": 1.2417879378766758e-05, "small_model_fraction": 0.486647690809292}, "unrestricted_random_baseline": {"accuracy": 0.7212, "accuracy_std": 0.018269099594670796, "avg_cost": 0.0003892150400000001, "cost_std": 1.10325587664149e-05, "small_model_fraction": 0.44989577043208395}}, "0.2": {"accuracy": 0.66, "large_model_usage": 0.28, "small_model_usage": 0.72, "avg_lambda": 0.5, "avg_cost": 0.00022114759999999981, "unrestricted": {"accuracy": 0.668, "large_model_usage": 0.28, "avg_cost": 0.00023018359999999979}, "random_baseline": {"accuracy": 0.655, "accuracy_std": 0.006526867548832294, "avg_cost": 0.00022187159999999994, "cost_std": 9.003344853997394e-06, "small_model_fraction": 0.7131992737751506}, "unrestricted_random_baseline": {"accuracy": 0.6542, "accuracy_std": 0.01086093918590838, "avg_cost": 0.00022050663999999986, "cost_std": 1.0370379547075434e-05, "small_model_fraction": 0.6992763175229096}}, "0.25": {"accuracy": 0.606, "large_model_usage": 0.18, "small_model_usage": 0.8200000000000001, "avg_lambda": 0.28000000000000097, "avg_cost": 0.00014329559999999998, "unrestricted": {"accuracy": 0.648, "large_model_usage": 0.18, "avg_cost": 0.00016021159999999983}, "random_baseline": {"accuracy": 0.6214000000000001, "accuracy_std": 0.007851114570556216, "avg_cost": 0.00014380363999999993, "cost_std": 7.598999170311843e-06, "small_model_fraction": 0.8331561119944357}, "unrestricted_random_baseline": {"accuracy": 0.6295999999999999, "accuracy_std": 0.010910545357588694, "avg_cost": 0.00015397875999999987, "cost_std": 1.6575650823252698e-05, "small_model_fraction": 0.8070914010644992}}}}, {"trial": 20, "trial_seed": 61, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.573, "large_model_accuracy": 0.816, "small_model_cost": 3.486559999999997e-05, "large_model_cost": 0.0006818879999999995, "hybrid_results": {"0.05": {"accuracy": 0.776, "large_model_usage": 0.838, "small_model_usage": 0.16200000000000003, "avg_lambda": 0.8899999999999905, "avg_cost": 0.0006025840000000003, "unrestricted": {"accuracy": 0.778, "large_model_usage": 0.838, "avg_cost": 0.0006110120000000003}, "random_baseline": {"accuracy": 0.785, "accuracy_std": 0.004312771730569568, "avg_cost": 0.0005967136599999999, "cost_std": 9.29930610832884e-06, "small_model_fraction": 0.12256762671585912}, "unrestricted_random_baseline": {"accuracy": 0.79, "accuracy_std": 0.005059644256269411, "avg_cost": 0.0006063483199999998, "cost_std": 7.618368071023995e-06, "small_model_fraction": 0.10954180257128546}}, "0.1": {"accuracy": 0.736, "large_model_usage": 0.56, "small_model_usage": 0.43999999999999995, "avg_lambda": 0.7599999999999948, "avg_cost": 0.00041486399999999987, "unrestricted": {"accuracy": 0.744, "large_model_usage": 0.56, "avg_cost": 0.0004229679999999997}, "random_baseline": {"accuracy": 0.7118, "accuracy_std": 0.008364209466530602, "avg_cost": 0.00040382746000000016, "cost_std": 7.346849839244111e-06, "small_model_fraction": 0.4126966856170665}, "unrestricted_random_baseline": {"accuracy": 0.7134, "accuracy_std": 0.012745195173083868, "avg_cost": 0.00042297544000000016, "cost_std": 1.199346007315652e-05, "small_model_fraction": 0.4001716169332005}}, "0.15": {"accuracy": 0.696, "large_model_usage": 0.528, "small_model_usage": 0.472, "avg_lambda": 0.6999999999999935, "avg_cost": 0.0003714079999999997, "unrestricted": {"accuracy": 0.732, "large_model_usage": 0.528, "avg_cost": 0.0003992119999999997}, "random_baseline": {"accuracy": 0.6981999999999999, "accuracy_std": 0.013159027319676761, "avg_cost": 0.0003726028800000001, "cost_std": 1.574321774027151e-05, "small_model_fraction": 0.4798597390136725}, "unrestricted_random_baseline": {"accuracy": 0.7038, "accuracy_std": 0.01086093918590838, "avg_cost": 0.00038505514, "cost_std": 1.7492504179573624e-05, "small_model_fraction": 0.43688750188556075}}, "0.2": {"accuracy": 0.632, "large_model_usage": 0.246, "small_model_usage": 0.754, "avg_lambda": 0.44999999999999624, "avg_cost": 0.00019755999999999976, "unrestricted": {"accuracy": 0.646, "large_model_usage": 0.246, "avg_cost": 0.00020591599999999978}, "random_baseline": {"accuracy": 0.6300000000000001, "accuracy_std": 0.013145341380123998, "avg_cost": 0.00019332339999999988, "cost_std": 1.1830200316816307e-05, "small_model_fraction": 0.7485490455971852}, "unrestricted_random_baseline": {"accuracy": 0.6394000000000001, "accuracy_std": 0.005730619512757765, "avg_cost": 0.00020468689999999994, "cost_std": 1.0992653995555437e-05, "small_model_fraction": 0.7356345004438797}}, "0.25": {"accuracy": 0.606, "large_model_usage": 0.188, "small_model_usage": 0.812, "avg_lambda": 0.2699999999999993, "avg_cost": 0.00015010000000000007, "unrestricted": {"accuracy": 0.632, "large_model_usage": 0.188, "avg_cost": 0.0001687799999999999}, "random_baseline": {"accuracy": 0.6185999999999999, "accuracy_std": 0.010547037498748177, "avg_cost": 0.00015325746000000002, "cost_std": 1.0630294307139347e-05, "small_model_fraction": 0.8219004473415445}, "unrestricted_random_baseline": {"accuracy": 0.6247999999999999, "accuracy_std": 0.007600000000000007, "avg_cost": 0.00016136713999999994, "cost_std": 8.902239462539726e-06, "small_model_fraction": 0.7930297312736004}}}}, {"trial": 21, "trial_seed": 62, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.593, "large_model_accuracy": 0.823, "small_model_cost": 3.4484e-05, "large_model_cost": 0.0006737199999999989, "hybrid_results": {"0.05": {"accuracy": 0.798, "large_model_usage": 0.796, "small_model_usage": 0.20399999999999996, "avg_lambda": 0.8799999999999963, "avg_cost": 0.0005618943999999998, "unrestricted": {"accuracy": 0.808, "large_model_usage": 0.796, "avg_cost": 0.0005736183999999999}, "random_baseline": {"accuracy": 0.8016, "accuracy_std": 0.011892854997854786, "avg_cost": 0.0005570530400000001, "cost_std": 1.3125401118990542e-05, "small_model_fraction": 0.17493633024422794}, "unrestricted_random_baseline": {"accuracy": 0.7974, "accuracy_std": 0.006814690014960334, "avg_cost": 0.0005676233800000002, "cost_std": 1.7192972875323234e-05, "small_model_fraction": 0.1565956861002808}}, "0.1": {"accuracy": 0.764, "large_model_usage": 0.526, "small_model_usage": 0.474, "avg_lambda": 0.7599999999999948, "avg_cost": 0.0003868144, "unrestricted": {"accuracy": 0.77, "large_model_usage": 0.526, "avg_cost": 0.0003944384000000002}, "random_baseline": {"accuracy": 0.738, "accuracy_std": 0.0065115282384398876, "avg_cost": 0.00038293206000000036, "cost_std": 1.2619855136426926e-05, "small_model_fraction": 0.448825785781776}, "unrestricted_random_baseline": {"accuracy": 0.735, "accuracy_std": 0.009348796714016203, "avg_cost": 0.0003912001200000005, "cost_std": 1.4444685944166518e-05, "small_model_fraction": 0.4368990482388338}}, "0.15": {"accuracy": 0.724, "large_model_usage": 0.496, "small_model_usage": 0.504, "avg_lambda": 0.6999999999999935, "avg_cost": 0.00034775440000000017, "unrestricted": {"accuracy": 0.766, "large_model_usage": 0.496, "avg_cost": 0.0003703264000000001}, "random_baseline": {"accuracy": 0.7196, "accuracy_std": 0.010268398122394759, "avg_cost": 0.00034086272000000026, "cost_std": 1.1016508017770428e-05, "small_model_fraction": 0.5099299789123254}, "unrestricted_random_baseline": {"accuracy": 0.7335999999999998, "accuracy_std": 0.008380930735902792, "avg_cost": 0.00036730396000000037, "cost_std": 1.0996387496555323e-05, "small_model_fraction": 0.4746190765225977}}, "0.2": {"accuracy": 0.684, "large_model_usage": 0.234, "small_model_usage": 0.766, "avg_lambda": 0.4000000000000035, "avg_cost": 0.00018441840000000005, "unrestricted": {"accuracy": 0.698, "large_model_usage": 0.234, "avg_cost": 0.00019726239999999992}, "random_baseline": {"accuracy": 0.6648000000000001, "accuracy_std": 0.009431860898041286, "avg_cost": 0.00018165088, "cost_std": 1.3260401620825775e-05, "small_model_fraction": 0.7654475029566541}, "unrestricted_random_baseline": {"accuracy": 0.6648000000000001, "accuracy_std": 0.012464349160706318, "avg_cost": 0.00019332936000000003, "cost_std": 1.7287336446266083e-05, "small_model_fraction": 0.7453547672534083}}, "0.25": {"accuracy": 0.622, "large_model_usage": 0.108, "small_model_usage": 0.892, "avg_lambda": 0.05000000000000044, "avg_cost": 9.503439999999997e-05, "unrestricted": {"accuracy": 0.644, "large_model_usage": 0.108, "avg_cost": 0.00010797040000000003}, "random_baseline": {"accuracy": 0.6308, "accuracy_std": 0.006079473661428271, "avg_cost": 9.569077999999996e-05, "cost_std": 9.602617027644104e-06, "small_model_fraction": 0.905276924328417}, "unrestricted_random_baseline": {"accuracy": 0.633, "accuracy_std": 0.006942621983083919, "avg_cost": 0.00010527953999999999, "cost_std": 1.1077286378188473e-05, "small_model_fraction": 0.8850402668185143}}}}, {"trial": 22, "trial_seed": 63, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.588, "large_model_accuracy": 0.832, "small_model_cost": 3.4621799999999994e-05, "large_model_cost": 0.0006774519999999993, "hybrid_results": {"0.05": {"accuracy": 0.774, "large_model_usage": 0.8, "small_model_usage": 0.19999999999999996, "avg_lambda": 0.8600000000000094, "avg_cost": 0.0005671730000000001, "unrestricted": {"accuracy": 0.786, "large_model_usage": 0.8, "avg_cost": 0.000587377}, "random_baseline": {"accuracy": 0.78, "accuracy_std": 0.009165151389911688, "avg_cost": 0.0005713678600000003, "cost_std": 4.455228230337861e-06, "small_model_fraction": 0.17155230105866104}, "unrestricted_random_baseline": {"accuracy": 0.789, "accuracy_std": 0.0042190046219458014, "avg_cost": 0.0005903406800000002, "cost_std": 1.0742358134115667e-05, "small_model_fraction": 0.14012253935798186}}, "0.1": {"accuracy": 0.742, "large_model_usage": 0.522, "small_model_usage": 0.478, "avg_lambda": 0.7599999999999948, "avg_cost": 0.0003902490000000002, "unrestricted": {"accuracy": 0.744, "large_model_usage": 0.522, "avg_cost": 0.0003952570000000002}, "random_baseline": {"accuracy": 0.7156, "accuracy_std": 0.01519999999999999, "avg_cost": 0.0003832557400000002, "cost_std": 1.2560105225371336e-05, "small_model_fraction": 0.4467789472243206}, "unrestricted_random_baseline": {"accuracy": 0.716, "accuracy_std": 0.011207140580897529, "avg_cost": 0.0003897497000000003, "cost_std": 1.660874132624142e-05, "small_model_fraction": 0.43898839849154475}}, "0.15": {"accuracy": 0.67, "large_model_usage": 0.3, "small_model_usage": 0.7, "avg_lambda": 0.5500000000000052, "avg_cost": 0.00023576499999999987, "unrestricted": {"accuracy": 0.676, "large_model_usage": 0.3, "avg_cost": 0.0002454289999999998}, "random_baseline": {"accuracy": 0.6714, "accuracy_std": 0.008627861844049177, "avg_cost": 0.00023108665999999983, "cost_std": 1.2134471270244974e-05, "small_model_fraction": 0.6870974636848112}, "unrestricted_random_baseline": {"accuracy": 0.6682, "accuracy_std": 0.011847362575695896, "avg_cost": 0.00024404503999999987, "cost_std": 1.0912983435816246e-05, "small_model_fraction": 0.6720639447244389}}, "0.2": {"accuracy": 0.646, "large_model_usage": 0.216, "small_model_usage": 0.784, "avg_lambda": 0.4000000000000035, "avg_cost": 0.0001752609999999999, "unrestricted": {"accuracy": 0.656, "large_model_usage": 0.216, "avg_cost": 0.00018861699999999986}, "random_baseline": {"accuracy": 0.6444000000000001, "accuracy_std": 0.010836973747315263, "avg_cost": 0.00017110541999999996, "cost_std": 1.885742556945668e-05, "small_model_fraction": 0.7812187417454873}, "unrestricted_random_baseline": {"accuracy": 0.6466000000000001, "accuracy_std": 0.010547037498748177, "avg_cost": 0.0001864030199999999, "cost_std": 5.508947615978929e-06, "small_model_fraction": 0.7604418709637475}}, "0.25": {"accuracy": 0.614, "large_model_usage": 0.128, "small_model_usage": 0.872, "avg_lambda": 0.10999999999999954, "avg_cost": 0.00011158499999999997, "unrestricted": {"accuracy": 0.634, "large_model_usage": 0.128, "avg_cost": 0.00012659299999999996}, "random_baseline": {"accuracy": 0.6218, "accuracy_std": 0.004044749683231341, "avg_cost": 0.00010850859999999999, "cost_std": 1.0632654244354982e-05, "small_model_fraction": 0.880274448835789}, "unrestricted_random_baseline": {"accuracy": 0.6298, "accuracy_std": 0.007400000000000006, "avg_cost": 0.00013002281999999999, "cost_std": 6.491688889310705e-06, "small_model_fraction": 0.8569276925695153}}}}, {"trial": 23, "trial_seed": 64, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.588, "large_model_accuracy": 0.829, "small_model_cost": 3.463520000000001e-05, "large_model_cost": 0.0006764160000000001, "hybrid_results": {"0.05": {"accuracy": 0.786, "large_model_usage": 0.818, "small_model_usage": 0.18200000000000005, "avg_lambda": 0.8600000000000094, "avg_cost": 0.0005826700000000005, "unrestricted": {"accuracy": 0.796, "large_model_usage": 0.818, "avg_cost": 0.000602274}, "random_baseline": {"accuracy": 0.8036000000000001, "accuracy_std": 0.008380930735902792, "avg_cost": 0.0005912076000000003, "cost_std": 4.848683883282127e-06, "small_model_fraction": 0.1460716805488721}, "unrestricted_random_baseline": {"accuracy": 0.8134, "accuracy_std": 0.0057306195127577175, "avg_cost": 0.0006038043200000003, "cost_std": 8.967983396594844e-06, "small_model_fraction": 0.1155254255035366}}, "0.1": {"accuracy": 0.774, "large_model_usage": 0.562, "small_model_usage": 0.43799999999999994, "avg_lambda": 0.7599999999999948, "avg_cost": 0.00042403400000000043, "unrestricted": {"accuracy": 0.78, "large_model_usage": 0.562, "avg_cost": 0.00043007800000000035}, "random_baseline": {"accuracy": 0.7426, "accuracy_std": 0.00885663593019382, "avg_cost": 0.00042568360000000026, "cost_std": 1.0098509512200286e-05, "small_model_fraction": 0.393252649502758}, "unrestricted_random_baseline": {"accuracy": 0.7444, "accuracy_std": 0.010190191362285607, "avg_cost": 0.0004378134600000002, "cost_std": 1.3123008166742928e-05, "small_model_fraction": 0.3838351038236104}}, "0.15": {"accuracy": 0.678, "large_model_usage": 0.304, "small_model_usage": 0.696, "avg_lambda": 0.5, "avg_cost": 0.0002449659999999998, "unrestricted": {"accuracy": 0.684, "large_model_usage": 0.304, "avg_cost": 0.00025373399999999976}, "random_baseline": {"accuracy": 0.6644, "accuracy_std": 0.010725670142233548, "avg_cost": 0.00024383707999999998, "cost_std": 1.0508220003483025e-05, "small_model_fraction": 0.672270033631421}, "unrestricted_random_baseline": {"accuracy": 0.6744000000000001, "accuracy_std": 0.0065604877867426956, "avg_cost": 0.00025427048000000005, "cost_std": 1.4805722007710458e-05, "small_model_fraction": 0.6586080481061451}}, "0.2": {"accuracy": 0.634, "large_model_usage": 0.214, "small_model_usage": 0.786, "avg_lambda": 0.31000000000000166, "avg_cost": 0.00016978199999999978, "unrestricted": {"accuracy": 0.666, "large_model_usage": 0.214, "avg_cost": 0.00018922199999999985}, "random_baseline": {"accuracy": 0.641, "accuracy_std": 0.010246950765959608, "avg_cost": 0.00017612885999999998, "cost_std": 1.1714746434831608e-05, "small_model_fraction": 0.7894190664476101}, "unrestricted_random_baseline": {"accuracy": 0.6529999999999999, "accuracy_std": 0.007000000000000006, "avg_cost": 0.00019616249999999995, "cost_std": 1.1399736923718906e-05, "small_model_fraction": 0.7591283503651094}}, "0.25": {"accuracy": 0.61, "large_model_usage": 0.14, "small_model_usage": 0.86, "avg_lambda": 0.05000000000000044, "avg_cost": 0.00012002200000000004, "unrestricted": {"accuracy": 0.638, "large_model_usage": 0.14, "avg_cost": 0.0001387619999999999}, "random_baseline": {"accuracy": 0.6164000000000001, "accuracy_std": 0.00902441133814279, "avg_cost": 0.00012035628000000007, "cost_std": 9.896980742509262e-06, "small_model_fraction": 0.8669533273666025}, "unrestricted_random_baseline": {"accuracy": 0.6256, "accuracy_std": 0.007631513611335572, "avg_cost": 0.000136564, "cost_std": 8.213511334867676e-06, "small_model_fraction": 0.8377533263693775}}}}, {"trial": 24, "trial_seed": 65, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.594, "large_model_accuracy": 0.821, "small_model_cost": 3.442690000000002e-05, "large_model_cost": 0.0006723380000000003, "hybrid_results": {"0.05": {"accuracy": 0.822, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0007076456, "unrestricted": {"accuracy": 0.822, "large_model_usage": 1.0, "avg_cost": 0.0007076456}, "random_baseline": {"accuracy": 0.8219999999999998, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006731199999999993, "cost_std": 0.0, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8219999999999998, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006731199999999993, "cost_std": 0.0, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.764, "large_model_usage": 0.616, "small_model_usage": 0.384, "avg_lambda": 0.800000000000007, "avg_cost": 0.0004576056000000001, "unrestricted": {"accuracy": 0.768, "large_model_usage": 0.616, "avg_cost": 0.00046116560000000013}, "random_baseline": {"accuracy": 0.7504, "accuracy_std": 0.013230268326832993, "avg_cost": 0.00045486288000000055, "cost_std": 1.0217700349178398e-05, "small_model_fraction": 0.33661806480558204}, "unrestricted_random_baseline": {"accuracy": 0.7485999999999999, "accuracy_std": 0.010584894897919401, "avg_cost": 0.00046053710000000037, "cost_std": 1.5478563318150725e-05, "small_model_fraction": 0.33103734987524136}}, "0.15": {"accuracy": 0.696, "large_model_usage": 0.36, "small_model_usage": 0.64, "avg_lambda": 0.6499999999999982, "avg_cost": 0.00028539760000000026, "unrestricted": {"accuracy": 0.704, "large_model_usage": 0.36, "avg_cost": 0.00028930160000000026}, "random_baseline": {"accuracy": 0.6952, "accuracy_std": 0.012843675486401826, "avg_cost": 0.00029122910000000014, "cost_std": 1.3354681571419091e-05, "small_model_fraction": 0.6065741762449343}, "unrestricted_random_baseline": {"accuracy": 0.6934, "accuracy_std": 0.01743674281510165, "avg_cost": 0.0002849654800000001, "cost_std": 1.0594916865818257e-05, "small_model_fraction": 0.6004542012202012}}, "0.2": {"accuracy": 0.654, "large_model_usage": 0.198, "small_model_usage": 0.802, "avg_lambda": 0.34999999999999676, "avg_cost": 0.00016398959999999998, "unrestricted": {"accuracy": 0.668, "large_model_usage": 0.198, "avg_cost": 0.00017472559999999983}, "random_baseline": {"accuracy": 0.6477999999999999, "accuracy_std": 0.009937806599043884, "avg_cost": 0.00016374053999999993, "cost_std": 1.8658892676265657e-05, "small_model_fraction": 0.7968953667681908}, "unrestricted_random_baseline": {"accuracy": 0.649, "accuracy_std": 0.007113367697511502, "avg_cost": 0.00017736911999999988, "cost_std": 1.1668623125699095e-05, "small_model_fraction": 0.7800654354501753}}, "0.25": {"accuracy": 0.608, "large_model_usage": 0.02, "small_model_usage": 0.98, "avg_lambda": 0.0, "avg_cost": 4.5421600000000006e-05, "unrestricted": {"accuracy": 0.612, "large_model_usage": 0.02, "avg_cost": 4.7777599999999995e-05}, "random_baseline": {"accuracy": 0.6052, "accuracy_std": 0.004749736834815172, "avg_cost": 4.632865999999999e-05, "cost_std": 3.1328094867706143e-06, "small_model_fraction": 0.9827645262796023}, "unrestricted_random_baseline": {"accuracy": 0.603, "accuracy_std": 0.002720294101747091, "avg_cost": 4.6956e-05, "cost_std": 5.355623426642315e-06, "small_model_fraction": 0.9790712216796353}}}}, {"trial": 25, "trial_seed": 66, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.587, "large_model_accuracy": 0.819, "small_model_cost": 3.4313799999999936e-05, "large_model_cost": 0.0006700359999999993, "hybrid_results": {"0.05": {"accuracy": 0.788, "large_model_usage": 0.788, "small_model_usage": 0.21199999999999997, "avg_lambda": 0.8999999999999925, "avg_cost": 0.0005692657999999998, "unrestricted": {"accuracy": 0.788, "large_model_usage": 0.788, "avg_cost": 0.0005697337999999998}, "random_baseline": {"accuracy": 0.7872000000000001, "accuracy_std": 0.006079473661428271, "avg_cost": 0.0005699367200000001, "cost_std": 6.524211648436825e-06, "small_model_fraction": 0.15851294795116413}, "unrestricted_random_baseline": {"accuracy": 0.7894000000000001, "accuracy_std": 0.008345058418010034, "avg_cost": 0.00056731268, "cost_std": 7.661508570614537e-06, "small_model_fraction": 0.15777677734079376}}, "0.1": {"accuracy": 0.738, "large_model_usage": 0.532, "small_model_usage": 0.46799999999999997, "avg_lambda": 0.7599999999999948, "avg_cost": 0.00038677380000000014, "unrestricted": {"accuracy": 0.744, "large_model_usage": 0.532, "avg_cost": 0.00039466980000000005}, "random_baseline": {"accuracy": 0.7274, "accuracy_std": 0.015621779668142823, "avg_cost": 0.0003785339800000002, "cost_std": 1.0779912167527292e-05, "small_model_fraction": 0.4455754416001194}, "unrestricted_random_baseline": {"accuracy": 0.7272000000000001, "accuracy_std": 0.011872657663724674, "avg_cost": 0.00039381352000000016, "cost_std": 1.2779273123053566e-05, "small_model_fraction": 0.4331549220713065}}, "0.15": {"accuracy": 0.712, "large_model_usage": 0.498, "small_model_usage": 0.502, "avg_lambda": 0.6999999999999935, "avg_cost": 0.00035147779999999994, "unrestricted": {"accuracy": 0.742, "large_model_usage": 0.498, "avg_cost": 0.00037202580000000017}, "random_baseline": {"accuracy": 0.6981999999999999, "accuracy_std": 0.010332473082471559, "avg_cost": 0.00034802396000000005, "cost_std": 9.901419751853788e-06, "small_model_fraction": 0.501096548146344}, "unrestricted_random_baseline": {"accuracy": 0.7162000000000001, "accuracy_std": 0.01366601624468522, "avg_cost": 0.00036891476000000013, "cost_std": 1.6102277039425237e-05, "small_model_fraction": 0.46877425391153466}}, "0.2": {"accuracy": 0.664, "large_model_usage": 0.266, "small_model_usage": 0.734, "avg_lambda": 0.46000000000000196, "avg_cost": 0.00021024180000000006, "unrestricted": {"accuracy": 0.68, "large_model_usage": 0.266, "avg_cost": 0.0002194698000000001}, "random_baseline": {"accuracy": 0.6546, "accuracy_std": 0.00885663593019382, "avg_cost": 0.00021266710000000005, "cost_std": 9.969148653019487e-06, "small_model_fraction": 0.723262771065726}, "unrestricted_random_baseline": {"accuracy": 0.6524, "accuracy_std": 0.015919798993705932, "avg_cost": 0.00021642854000000002, "cost_std": 1.3073806309732412e-05, "small_model_fraction": 0.7087469967227819}}, "0.25": {"accuracy": 0.61, "large_model_usage": 0.192, "small_model_usage": 0.808, "avg_lambda": 0.25, "avg_cost": 0.0001465618, "unrestricted": {"accuracy": 0.644, "large_model_usage": 0.192, "avg_cost": 0.00016507780000000007}, "random_baseline": {"accuracy": 0.6324000000000001, "accuracy_std": 0.011723480711802285, "avg_cost": 0.00014766504, "cost_std": 8.780894736096091e-06, "small_model_fraction": 0.823432310528089}, "unrestricted_random_baseline": {"accuracy": 0.6352, "accuracy_std": 0.008255906976220124, "avg_cost": 0.00016156963999999996, "cost_std": 9.306674168487929e-06, "small_model_fraction": 0.7943063809947171}}}}, {"trial": 26, "trial_seed": 67, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.597, "large_model_accuracy": 0.823, "small_model_cost": 3.469199999999999e-05, "large_model_cost": 0.0006775519999999998, "hybrid_results": {"0.05": {"accuracy": 0.804, "large_model_usage": 0.854, "small_model_usage": 0.14600000000000002, "avg_lambda": 0.8999999999999925, "avg_cost": 0.0006237943999999997, "unrestricted": {"accuracy": 0.804, "large_model_usage": 0.854, "avg_cost": 0.0006240343999999998}, "random_baseline": {"accuracy": 0.8168, "accuracy_std": 0.004489988864128722, "avg_cost": 0.0006276428400000002, "cost_std": 6.947143249307622e-06, "small_model_fraction": 0.0836225616775038}, "unrestricted_random_baseline": {"accuracy": 0.8151999999999999, "accuracy_std": 0.005946427498927364, "avg_cost": 0.0006304392200000003, "cost_std": 9.518777474634088e-06, "small_model_fraction": 0.08324923000342221}}, "0.1": {"accuracy": 0.758, "large_model_usage": 0.554, "small_model_usage": 0.44599999999999995, "avg_lambda": 0.7599999999999948, "avg_cost": 0.00041391839999999975, "unrestricted": {"accuracy": 0.768, "large_model_usage": 0.554, "avg_cost": 0.0004203223999999997}, "random_baseline": {"accuracy": 0.7418, "accuracy_std": 0.01063766891757777, "avg_cost": 0.00041208741999999995, "cost_std": 1.3951650089491249e-05, "small_model_fraction": 0.4100948884671626}, "unrestricted_random_baseline": {"accuracy": 0.7404, "accuracy_std": 0.01953048898517394, "avg_cost": 0.0004280350600000001, "cost_std": 1.307427943316199e-05, "small_model_fraction": 0.400133154963756}}, "0.15": {"accuracy": 0.732, "large_model_usage": 0.526, "small_model_usage": 0.474, "avg_lambda": 0.6999999999999935, "avg_cost": 0.0003767624, "unrestricted": {"accuracy": 0.758, "large_model_usage": 0.526, "avg_cost": 0.0004012103999999999}, "random_baseline": {"accuracy": 0.7356, "accuracy_std": 0.011959933110180853, "avg_cost": 0.00038198713999999985, "cost_std": 1.4892934129996072e-05, "small_model_fraction": 0.46789285380953843}, "unrestricted_random_baseline": {"accuracy": 0.7377999999999999, "accuracy_std": 0.00695413546028549, "avg_cost": 0.0004061361599999999, "cost_std": 1.4410931717220824e-05, "small_model_fraction": 0.429862800609775}}, "0.2": {"accuracy": 0.68, "large_model_usage": 0.266, "small_model_usage": 0.734, "avg_lambda": 0.5, "avg_cost": 0.00021633839999999995, "unrestricted": {"accuracy": 0.688, "large_model_usage": 0.266, "avg_cost": 0.00022322639999999998}, "random_baseline": {"accuracy": 0.6782, "accuracy_std": 0.008738420909981368, "avg_cost": 0.00022429225999999998, "cost_std": 9.394576784954181e-06, "small_model_fraction": 0.7174401891547149}, "unrestricted_random_baseline": {"accuracy": 0.673, "accuracy_std": 0.009305912099305463, "avg_cost": 0.00021713163999999996, "cost_std": 7.377991948924833e-06, "small_model_fraction": 0.7067255701085772}}, "0.25": {"accuracy": 0.65, "large_model_usage": 0.174, "small_model_usage": 0.8260000000000001, "avg_lambda": 0.31000000000000166, "avg_cost": 0.00014206239999999998, "unrestricted": {"accuracy": 0.672, "large_model_usage": 0.174, "avg_cost": 0.00015709440000000015}, "random_baseline": {"accuracy": 0.653, "accuracy_std": 0.008955445270895252, "avg_cost": 0.00014614238000000004, "cost_std": 9.419397383463574e-06, "small_model_fraction": 0.8329801200883551}, "unrestricted_random_baseline": {"accuracy": 0.6572000000000001, "accuracy_std": 0.010628264204469148, "avg_cost": 0.00015397332000000003, "cost_std": 1.2635256455236664e-05, "small_model_fraction": 0.8095971129017202}}}}, {"trial": 27, "trial_seed": 68, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.598, "large_model_accuracy": 0.824, "small_model_cost": 3.4399400000000015e-05, "large_model_cost": 0.0006710759999999995, "hybrid_results": {"0.05": {"accuracy": 0.812, "large_model_usage": 0.794, "small_model_usage": 0.20599999999999996, "avg_lambda": 0.8799999999999963, "avg_cost": 0.0005694784000000004, "unrestricted": {"accuracy": 0.822, "large_model_usage": 0.794, "avg_cost": 0.0005824824000000003}, "random_baseline": {"accuracy": 0.8102, "accuracy_std": 0.004044749683231304, "avg_cost": 0.00058120422, "cost_std": 5.343034901214926e-06, "small_model_fraction": 0.15957489249644047}, "unrestricted_random_baseline": {"accuracy": 0.807, "accuracy_std": 0.009767292357659798, "avg_cost": 0.0005922154199999999, "cost_std": 9.580509062654367e-06, "small_model_fraction": 0.1391500802762333}}, "0.1": {"accuracy": 0.78, "large_model_usage": 0.538, "small_model_usage": 0.46199999999999997, "avg_lambda": 0.7599999999999948, "avg_cost": 0.00040855439999999986, "unrestricted": {"accuracy": 0.786, "large_model_usage": 0.538, "avg_cost": 0.0004139063999999997}, "random_baseline": {"accuracy": 0.7454, "accuracy_std": 0.008765842800324459, "avg_cost": 0.00040707393999999997, "cost_std": 1.703422511323601e-05, "small_model_fraction": 0.4123311583934447}, "unrestricted_random_baseline": {"accuracy": 0.744, "accuracy_std": 0.009633275663033845, "avg_cost": 0.00041511299999999994, "cost_std": 9.576650538053467e-06, "small_model_fraction": 0.40392500682450083}}, "0.15": {"accuracy": 0.704, "large_model_usage": 0.348, "small_model_usage": 0.652, "avg_lambda": 0.6100000000000078, "avg_cost": 0.00027280240000000013, "unrestricted": {"accuracy": 0.72, "large_model_usage": 0.348, "avg_cost": 0.0002810104}, "random_baseline": {"accuracy": 0.6932, "accuracy_std": 0.008304215796810656, "avg_cost": 0.00027237805999999996, "cost_std": 1.559439532551361e-05, "small_model_fraction": 0.62555086836865}, "unrestricted_random_baseline": {"accuracy": 0.6936, "accuracy_std": 0.015067846561469863, "avg_cost": 0.00028295017999999995, "cost_std": 1.2922965437220609e-05, "small_model_fraction": 0.6126589229131395}}, "0.2": {"accuracy": 0.662, "large_model_usage": 0.198, "small_model_usage": 0.802, "avg_lambda": 0.3600000000000036, "avg_cost": 0.00015831440000000008, "unrestricted": {"accuracy": 0.676, "large_model_usage": 0.198, "avg_cost": 0.00017313840000000004}, "random_baseline": {"accuracy": 0.6446000000000001, "accuracy_std": 0.008901685233707162, "avg_cost": 0.00015852728000000001, "cost_std": 1.2000181153699282e-05, "small_model_fraction": 0.805372146549755}, "unrestricted_random_baseline": {"accuracy": 0.6588, "accuracy_std": 0.008634813257969168, "avg_cost": 0.00017588918000000002, "cost_std": 1.274537551285169e-05, "small_model_fraction": 0.7820887401861477}}, "0.25": {"accuracy": 0.616, "large_model_usage": 0.048, "small_model_usage": 0.952, "avg_lambda": 0.0399999999999995, "avg_cost": 6.128640000000004e-05, "unrestricted": {"accuracy": 0.626, "large_model_usage": 0.048, "avg_cost": 6.669840000000002e-05}, "random_baseline": {"accuracy": 0.6113999999999999, "accuracy_std": 0.004294182110716781, "avg_cost": 6.186388000000001e-05, "cost_std": 3.903400424450448e-06, "small_model_fraction": 0.9577697688276904}, "unrestricted_random_baseline": {"accuracy": 0.6122, "accuracy_std": 0.0048538644398046435, "avg_cost": 6.764524000000001e-05, "cost_std": 7.572910753891131e-06, "small_model_fraction": 0.9492693778913814}}}}, {"trial": 28, "trial_seed": 69, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.615, "large_model_accuracy": 0.832, "small_model_cost": 3.461159999999998e-05, "large_model_cost": 0.0006758480000000004, "hybrid_results": {"0.05": {"accuracy": 0.822, "large_model_usage": 0.83, "small_model_usage": 0.17000000000000004, "avg_lambda": 0.8600000000000094, "avg_cost": 0.0005937162, "unrestricted": {"accuracy": 0.828, "large_model_usage": 0.83, "avg_cost": 0.0006141081999999996}, "random_baseline": {"accuracy": 0.8248, "accuracy_std": 0.005878775382679633, "avg_cost": 0.0006019565, "cost_std": 1.3327257923293977e-05, "small_model_fraction": 0.12808349619578735}, "unrestricted_random_baseline": {"accuracy": 0.8309999999999998, "accuracy_std": 0.007912016177941, "avg_cost": 0.0006227279800000001, "cost_std": 8.508079915680165e-06, "small_model_fraction": 0.096282431876919}}, "0.1": {"accuracy": 0.804, "large_model_usage": 0.53, "small_model_usage": 0.47, "avg_lambda": 0.7599999999999948, "avg_cost": 0.0004026521999999997, "unrestricted": {"accuracy": 0.81, "large_model_usage": 0.53, "avg_cost": 0.00040853619999999983}, "random_baseline": {"accuracy": 0.7575999999999999, "accuracy_std": 0.0123223374405995, "avg_cost": 0.0004038992800000003, "cost_std": 1.2774704697393258e-05, "small_model_fraction": 0.4260453710987095}, "unrestricted_random_baseline": {"accuracy": 0.7607999999999999, "accuracy_std": 0.015025311976794372, "avg_cost": 0.0004202403800000003, "cost_std": 1.4396361346520845e-05, "small_model_fraction": 0.4168693480282785}}, "0.15": {"accuracy": 0.742, "large_model_usage": 0.31, "small_model_usage": 0.69, "avg_lambda": 0.5800000000000031, "avg_cost": 0.00024520020000000007, "unrestricted": {"accuracy": 0.746, "large_model_usage": 0.31, "avg_cost": 0.0002561442000000001}, "random_baseline": {"accuracy": 0.6994, "accuracy_std": 0.005517245689653494, "avg_cost": 0.00024385287999999995, "cost_std": 1.4798790667402544e-05, "small_model_fraction": 0.6715897600323376}, "unrestricted_random_baseline": {"accuracy": 0.7006, "accuracy_std": 0.008856635930193818, "avg_cost": 0.00026078506000000006, "cost_std": 1.2255047890579637e-05, "small_model_fraction": 0.65452273139828}}, "0.2": {"accuracy": 0.7, "large_model_usage": 0.218, "small_model_usage": 0.782, "avg_lambda": 0.4000000000000035, "avg_cost": 0.00017626820000000008, "unrestricted": {"accuracy": 0.726, "large_model_usage": 0.218, "avg_cost": 0.0001892322000000001}, "random_baseline": {"accuracy": 0.669, "accuracy_std": 0.013921206844235867, "avg_cost": 0.00017551138, "cost_std": 1.1180994731579093e-05, "small_model_fraction": 0.779088336220464}, "unrestricted_random_baseline": {"accuracy": 0.6788000000000001, "accuracy_std": 0.009217374897442319, "avg_cost": 0.00019599812, "cost_std": 1.3141152500203282e-05, "small_model_fraction": 0.758871143310018}}, "0.25": {"accuracy": 0.626, "large_model_usage": 0.014, "small_model_usage": 0.986, "avg_lambda": 0.0, "avg_cost": 4.3436199999999976e-05, "unrestricted": {"accuracy": 0.632, "large_model_usage": 0.014, "avg_cost": 4.511219999999999e-05}, "random_baseline": {"accuracy": 0.6279999999999999, "accuracy_std": 0.00268328157299975, "avg_cost": 4.4781059999999994e-05, "cost_std": 3.7229481157276565e-06, "small_model_fraction": 0.9862381486765257}, "unrestricted_random_baseline": {"accuracy": 0.6268, "accuracy_std": 0.0031240998703626643, "avg_cost": 4.6664660000000005e-05, "cost_std": 2.3906020514506414e-06, "small_model_fraction": 0.9836244480194825}}}}, {"trial": 29, "trial_seed": 70, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.597, "large_model_accuracy": 0.825, "small_model_cost": 3.478389999999999e-05, "large_model_cost": 0.0006786059999999991, "hybrid_results": {"0.05": {"accuracy": 0.792, "large_model_usage": 0.804, "small_model_usage": 0.19599999999999995, "avg_lambda": 0.8999999999999925, "avg_cost": 0.0005872314, "unrestricted": {"accuracy": 0.792, "large_model_usage": 0.804, "avg_cost": 0.0005877114000000001}, "random_baseline": {"accuracy": 0.7857999999999999, "accuracy_std": 0.00729109045342328, "avg_cost": 0.0005843124799999998, "cost_std": 7.877998952119728e-06, "small_model_fraction": 0.14192523058776518}, "unrestricted_random_baseline": {"accuracy": 0.7892000000000001, "accuracy_std": 0.012998461447417549, "avg_cost": 0.0005843374399999999, "cost_std": 1.0000151212376775e-05, "small_model_fraction": 0.14117968302113137}}, "0.1": {"accuracy": 0.738, "large_model_usage": 0.522, "small_model_usage": 0.478, "avg_lambda": 0.7599999999999948, "avg_cost": 0.0003922754000000007, "unrestricted": {"accuracy": 0.744, "large_model_usage": 0.522, "avg_cost": 0.0003978074000000008}, "random_baseline": {"accuracy": 0.7212, "accuracy_std": 0.01203993355463394, "avg_cost": 0.00038978812000000046, "cost_std": 1.9497804348223585e-05, "small_model_fraction": 0.44473558767243127}, "unrestricted_random_baseline": {"accuracy": 0.7209999999999999, "accuracy_std": 0.007000000000000006, "avg_cost": 0.0003966599000000003, "cost_std": 1.0950436951373232e-05, "small_model_fraction": 0.43614315196697767}}, "0.15": {"accuracy": 0.7, "large_model_usage": 0.36, "small_model_usage": 0.64, "avg_lambda": 0.6499999999999982, "avg_cost": 0.0002819674000000001, "unrestricted": {"accuracy": 0.706, "large_model_usage": 0.36, "avg_cost": 0.00028645940000000017}, "random_baseline": {"accuracy": 0.6838, "accuracy_std": 0.014709180806557486, "avg_cost": 0.00027795158000000014, "cost_std": 1.0950801250118744e-05, "small_model_fraction": 0.6160686313812459}, "unrestricted_random_baseline": {"accuracy": 0.683, "accuracy_std": 0.008955445270895212, "avg_cost": 0.0002857493800000002, "cost_std": 1.1109273921710682e-05, "small_model_fraction": 0.609091548736832}}, "0.2": {"accuracy": 0.666, "large_model_usage": 0.206, "small_model_usage": 0.794, "avg_lambda": 0.37000000000000255, "avg_cost": 0.00016693539999999992, "unrestricted": {"accuracy": 0.678, "large_model_usage": 0.206, "avg_cost": 0.00017864339999999998}, "random_baseline": {"accuracy": 0.6428, "accuracy_std": 0.01121427661510096, "avg_cost": 0.00016142828, "cost_std": 1.1831011873276107e-05, "small_model_fraction": 0.794739105725013}, "unrestricted_random_baseline": {"accuracy": 0.6504, "accuracy_std": 0.007031358332498784, "avg_cost": 0.00017485683999999996, "cost_std": 7.213292644444699e-06, "small_model_fraction": 0.7765539579955393}}, "0.25": {"accuracy": 0.62, "large_model_usage": 0.136, "small_model_usage": 0.864, "avg_lambda": 0.05999999999999949, "avg_cost": 0.00011478739999999997, "unrestricted": {"accuracy": 0.662, "large_model_usage": 0.136, "avg_cost": 0.00013150739999999997}, "random_baseline": {"accuracy": 0.6294, "accuracy_std": 0.006069596362197412, "avg_cost": 0.00011307771999999996, "cost_std": 1.3198644111332038e-05, "small_model_fraction": 0.8757366359433761}, "unrestricted_random_baseline": {"accuracy": 0.6344000000000001, "accuracy_std": 0.01003194896318757, "avg_cost": 0.00013049982, "cost_std": 7.155506950845602e-06, "small_model_fraction": 0.8497667290389688}}}}, {"trial": 30, "trial_seed": 71, "subject": "MMLU", "method": "standard", "calibration_size": 500, "test_size": 500, "small_model_accuracy": 0.594, "large_model_accuracy": 0.828, "small_model_cost": 3.480599999999999e-05, "large_model_cost": 0.0006797680000000004, "hybrid_results": {"0.05": {"accuracy": 0.842, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0007073146000000001, "unrestricted": {"accuracy": 0.842, "large_model_usage": 1.0, "avg_cost": 0.0007073146000000001}, "random_baseline": {"accuracy": 0.842, "accuracy_std": 0.0, "avg_cost": 0.0006728360000000005, "cost_std": 0.0, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.842, "accuracy_std": 0.0, "avg_cost": 0.0006728360000000005, "cost_std": 0.0, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.766, "large_model_usage": 0.654, "small_model_usage": 0.346, "avg_lambda": 0.800000000000007, "avg_cost": 0.0004780226000000001, "unrestricted": {"accuracy": 0.774, "large_model_usage": 0.654, "avg_cost": 0.0004844225999999999}, "random_baseline": {"accuracy": 0.7678, "accuracy_std": 0.0124402572320672, "avg_cost": 0.0004696962000000001, "cost_std": 1.1070624801879948e-05, "small_model_fraction": 0.3128019945361125}, "unrestricted_random_baseline": {"accuracy": 0.771, "accuracy_std": 0.01230447073221763, "avg_cost": 0.00048345494000000006, "cost_std": 8.940564257383257e-06, "small_model_fraction": 0.3028789293012617}}, "0.15": {"accuracy": 0.72, "large_model_usage": 0.538, "small_model_usage": 0.46199999999999997, "avg_lambda": 0.6999999999999935, "avg_cost": 0.00038282660000000005, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.538, "avg_cost": 0.00040641859999999996}, "random_baseline": {"accuracy": 0.7354, "accuracy_std": 0.011315476127852518, "avg_cost": 0.00037389567999999995, "cost_std": 1.2434767785913843e-05, "small_model_fraction": 0.46040138798874997}, "unrestricted_random_baseline": {"accuracy": 0.7402, "accuracy_std": 0.010017983829094563, "avg_cost": 0.00040350573999999994, "cost_std": 1.1960200706024992e-05, "small_model_fraction": 0.42382248876678047}}, "0.2": {"accuracy": 0.672, "large_model_usage": 0.234, "small_model_usage": 0.766, "avg_lambda": 0.4000000000000035, "avg_cost": 0.00018571460000000003, "unrestricted": {"accuracy": 0.678, "large_model_usage": 0.234, "avg_cost": 0.00019780260000000022}, "random_baseline": {"accuracy": 0.6608, "accuracy_std": 0.01085172797300044, "avg_cost": 0.00018726742000000008, "cost_std": 1.005138935290047e-05, "small_model_fraction": 0.7660193933906184}, "unrestricted_random_baseline": {"accuracy": 0.6658000000000001, "accuracy_std": 0.00524976189936268, "avg_cost": 0.00019543830000000008, "cost_std": 1.3181617441346169e-05, "small_model_fraction": 0.7472772039282932}}, "0.25": {"accuracy": 0.608, "large_model_usage": 0.028, "small_model_usage": 0.972, "avg_lambda": 0.01999999999999975, "avg_cost": 4.913460000000001e-05, "unrestricted": {"accuracy": 0.618, "large_model_usage": 0.028, "avg_cost": 5.23186e-05}, "random_baseline": {"accuracy": 0.6044, "accuracy_std": 0.0024979991993593614, "avg_cost": 4.639247999999999e-05, "cost_std": 3.2910948818896024e-06, "small_model_fraction": 0.9777838074181113}, "unrestricted_random_baseline": {"accuracy": 0.6060000000000001, "accuracy_std": 0.0040987803063838426, "avg_cost": 5.1352599999999993e-05, "cost_std": 3.978906080821711e-06, "small_model_fraction": 0.972847082463773}}}}], "unrestricted_hybrid": {"0.05": {"avg_accuracy": 0.8045333333333332, "std_accuracy": 0.024942244396917338, "avg_cost": 0.0006279577933333334, "std_cost": 6.230269182650093e-05}, "0.1": {"avg_accuracy": 0.759, "std_accuracy": 0.018437281072146563, "avg_cost": 0.0004216301933333333, "std_cost": 2.787008029039898e-05}, "0.15": {"avg_accuracy": 0.7239333333333332, "std_accuracy": 0.03035120352729946, "avg_cost": 0.00033218885999999995, "std_cost": 6.402907302862042e-05}, "0.2": {"avg_accuracy": 0.6763333333333333, "std_accuracy": 0.019176953761104884, "avg_cost": 0.00020514165999999996, "std_cost": 2.256596723819004e-05}, "0.25": {"avg_accuracy": 0.6421333333333333, "std_accuracy": 0.022294593265832177, "avg_cost": 0.00012504366000000002, "std_cost": 3.9287621204077326e-05}}, "random_baseline": {"0.05": {"avg_accuracy": 0.80724, "std_accuracy": 0.02327478750350542, "avg_cost": 0.0006125624433333335, "std_cost": 4.974807368911833e-05, "avg_small_model_fraction": 0.09956037162804478, "std_small_model_fraction": 0.07722133229317336}, "0.1": {"avg_accuracy": 0.7348933333333334, "std_accuracy": 0.016553084975986267, "avg_cost": 0.0004149597360000001, "std_cost": 3.00105909777769e-05, "avg_small_model_fraction": 0.406136690211435, "std_small_model_fraction": 0.04494255454140348}, "0.15": {"avg_accuracy": 0.6976600000000002, "std_accuracy": 0.024440357335085473, "avg_cost": 0.00031606433466666663, "std_cost": 5.806645542534841e-05, "avg_small_model_fraction": 0.5606046298246735, "std_small_model_fraction": 0.08840230008956725}, "0.2": {"avg_accuracy": 0.6525000000000001, "std_accuracy": 0.015106797586958433, "avg_cost": 0.00019294540533333328, "std_cost": 2.4052969258160718e-05, "avg_small_model_fraction": 0.751729092801332, "std_small_model_fraction": 0.03717200045102517}, "0.25": {"avg_accuracy": 0.6229466666666668, "std_accuracy": 0.016601922244795113, "avg_cost": 0.00011142768, "std_cost": 3.4871348287690096e-05, "avg_small_model_fraction": 0.8807162233701025, "std_small_model_fraction": 0.05303286100581611}}, "unrestricted_random_baseline": {"0.05": {"avg_accuracy": 0.8084733333333334, "std_accuracy": 0.021598363106700644, "avg_cost": 0.0006166011666666665, "std_cost": 4.7399085411645765e-05, "avg_small_model_fraction": 0.09292272582705421, "std_small_model_fraction": 0.07279917949112286}, "0.1": {"avg_accuracy": 0.7357266666666666, "std_accuracy": 0.01592796143753354, "avg_cost": 0.0004216965893333334, "std_cost": 2.8985284461336208e-05, "avg_small_model_fraction": 0.3967378282723303, "std_small_model_fraction": 0.043223682651681285}, "0.15": {"avg_accuracy": 0.7046066666666667, "std_accuracy": 0.026281627718913344, "avg_cost": 0.000332827112, "std_cost": 6.405018694489579e-05, "avg_small_model_fraction": 0.5362328204647632, "std_small_model_fraction": 0.09929052415633781}, "0.2": {"avg_accuracy": 0.65796, "std_accuracy": 0.01287720466560969, "avg_cost": 0.00020472040466666664, "std_cost": 2.2473610253301498e-05, "avg_small_model_fraction": 0.7341701265028238, "std_small_model_fraction": 0.035292057340449386}, "0.25": {"avg_accuracy": 0.6268666666666667, "std_accuracy": 0.017080541235244533, "avg_cost": 0.000124127342, "std_cost": 3.891290265883141e-05, "avg_small_model_fraction": 0.8590815771548113, "std_small_model_fraction": 0.06115537958396471}}}