{"trial": 20, "trial_seed": 61, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5628654970760234, "large_model_accuracy": 0.8216374269005848, "small_model_cost": 3.207514619883046e-05, "large_model_cost": 0.0006158654970760243, "hybrid_results": {"0.05": {"accuracy": 0.7916666666666666, "large_model_usage": 0.7135416666666666, "small_model_usage": 0.28645833333333337, "avg_lambda": 0.8000000000000057, "avg_cost": 0.0004676960937499996, "unrestricted": {"accuracy": 0.7942708333333334, "large_model_usage": 0.7135416666666666, "avg_cost": 0.00047337317708333305}, "random_baseline": {"accuracy": 0.7526041666666667, "accuracy_std": 0.013072812914594927, "avg_cost": 0.00046353234374999993, "cost_std": 1.172721579556529e-05, "small_model_fraction": 0.25380584503218967}, "unrestricted_random_baseline": {"accuracy": 0.7705729166666667, "accuracy_std": 0.007407011798608268, "avg_cost": 0.000473695, "cost_std": 1.2321306844650245e-05, "small_model_fraction": 0.24408132093753282}}, "0.1": {"accuracy": 0.7265625, "large_model_usage": 0.484375, "small_model_usage": 0.515625, "avg_lambda": 0.610000000000006, "avg_cost": 0.00032517005208333337, "unrestricted": {"accuracy": 0.7447916666666666, "large_model_usage": 0.484375, "avg_cost": 0.00033252421875}, "random_baseline": {"accuracy": 0.6893229166666666, "accuracy_std": 0.016554390703450845, "avg_cost": 0.0003244118749999998, "cost_std": 1.1096417318109279e-05, "small_model_fraction": 0.49794492929850026}, "unrestricted_random_baseline": {"accuracy": 0.6911458333333333, "accuracy_std": 0.015014099276457886, "avg_cost": 0.00032857898437499986, "cost_std": 1.628909325580385e-05, "small_model_fraction": 0.48534765588413775}}, "0.15": {"accuracy": 0.6744791666666666, "large_model_usage": 0.3723958333333333, "small_model_usage": 0.6276041666666667, "avg_lambda": 0.44999999999999857, "avg_cost": 0.00025049817708333344, "unrestricted": {"accuracy": 0.7135416666666666, "large_model_usage": 0.3723958333333333, "avg_cost": 0.00026334713541666674}, "random_baseline": {"accuracy": 0.65390625, "accuracy_std": 0.013703460538461337, "avg_cost": 0.00024778773437499985, "cost_std": 9.813971295317419e-06, "small_model_fraction": 0.6258536466793189}, "unrestricted_random_baseline": {"accuracy": 0.6557291666666667, "accuracy_std": 0.015086196206208583, "avg_cost": 0.00026292950520833325, "cost_std": 9.241621108350722e-06, "small_model_fraction": 0.6038441045311373}}, "0.2": {"accuracy": 0.5885416666666666, "large_model_usage": 0.28125, "small_model_usage": 0.71875, "avg_lambda": 0.2999999999999981, "avg_cost": 0.00018647734375000014, "unrestricted": {"accuracy": 0.6770833333333334, "large_model_usage": 0.28125, "avg_cost": 0.00020600338541666677}, "random_baseline": {"accuracy": 0.6192708333333334, "accuracy_std": 0.01544163237538916, "avg_cost": 0.00018426377604166669, "cost_std": 1.149178529848089e-05, "small_model_fraction": 0.7355177294054835}, "unrestricted_random_baseline": {"accuracy": 0.6346354166666666, "accuracy_std": 0.009536371574791964, "avg_cost": 0.00020096312500000006, "cost_std": 1.1332945951655397e-05, "small_model_fraction": 0.7020707194689078}}, "0.25": {"accuracy": 0.5729166666666666, "large_model_usage": 0.23697916666666666, "small_model_usage": 0.7630208333333334, "avg_lambda": 0.14000000000000046, "avg_cost": 0.00015840963541666672, "unrestricted": {"accuracy": 0.65625, "large_model_usage": 0.23697916666666666, "avg_cost": 0.00017926380208333335}, "random_baseline": {"accuracy": 0.61015625, "accuracy_std": 0.00924754213779411, "avg_cost": 0.00015433856770833334, "cost_std": 9.973408535386524e-06, "small_model_fraction": 0.7835961333927357}, "unrestricted_random_baseline": {"accuracy": 0.6247395833333333, "accuracy_std": 0.01598754737309462, "avg_cost": 0.00018287182291666673, "cost_std": 1.272895912056319e-05, "small_model_fraction": 0.7478741201129144}}}}