{"trial": 2, "trial_seed": 43, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5555555555555556, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.2054824561403506e-05, "large_model_cost": 0.0006174707602339186, "hybrid_results": {"0.05": {"accuracy": 0.7838541666666666, "large_model_usage": 0.7265625, "small_model_usage": 0.2734375, "avg_lambda": 0.8000000000000057, "avg_cost": 0.00047770416666666687, "unrestricted": {"accuracy": 0.7916666666666666, "large_model_usage": 0.7265625, "avg_cost": 0.0004832458333333336}, "random_baseline": {"accuracy": 0.7791666666666667, "accuracy_std": 0.018333037402914615, "avg_cost": 0.0004864808333333334, "cost_std": 1.4120076118914955e-05, "small_model_fraction": 0.23874750421115623}, "unrestricted_random_baseline": {"accuracy": 0.77265625, "accuracy_std": 0.01473369287480796, "avg_cost": 0.00048567119791666674, "cost_std": 1.1538081401062203e-05, "small_model_fraction": 0.22928130022014162}}, "0.1": {"accuracy": 0.7057291666666666, "large_model_usage": 0.4713541666666667, "small_model_usage": 0.5286458333333333, "avg_lambda": 0.610000000000006, "avg_cost": 0.0003155062499999997, "unrestricted": {"accuracy": 0.7265625, "large_model_usage": 0.4713541666666667, "avg_cost": 0.0003249749999999998}, "random_baseline": {"accuracy": 0.6950520833333333, "accuracy_std": 0.012567633866680883, "avg_cost": 0.0003205762239583334, "cost_std": 1.4437397446504936e-05, "small_model_fraction": 0.5158119071135766}, "unrestricted_random_baseline": {"accuracy": 0.6875, "accuracy_std": 0.016304662335935968, "avg_cost": 0.0003305525260416667, "cost_std": 1.7548171644687845e-05, "small_model_fraction": 0.49963750969283927}}, "0.15": {"accuracy": 0.6875, "large_model_usage": 0.4088541666666667, "small_model_usage": 0.5911458333333333, "avg_lambda": 0.5, "avg_cost": 0.0002738864583333332, "unrestricted": {"accuracy": 0.71875, "large_model_usage": 0.4088541666666667, "avg_cost": 0.0002858031249999997}, "random_baseline": {"accuracy": 0.6776041666666667, "accuracy_std": 0.015131082339931184, "avg_cost": 0.00028132231770833336, "cost_std": 1.331589770948585e-05, "small_model_fraction": 0.5869063019370699}, "unrestricted_random_baseline": {"accuracy": 0.6807291666666667, "accuracy_std": 0.011945150980395546, "avg_cost": 0.00028711437500000006, "cost_std": 1.2061184645221007e-05, "small_model_fraction": 0.566550404633084}}, "0.2": {"accuracy": 0.640625, "large_model_usage": 0.3020833333333333, "small_model_usage": 0.6979166666666667, "avg_lambda": 0.3899999999999985, "avg_cost": 0.00020685520833333318, "unrestricted": {"accuracy": 0.6796875, "large_model_usage": 0.3020833333333333, "avg_cost": 0.00021918854166666664}, "random_baseline": {"accuracy": 0.6463541666666667, "accuracy_std": 0.01274712317762545, "avg_cost": 0.00021035078124999996, "cost_std": 1.2614777605681321e-05, "small_model_fraction": 0.7014082242719919}, "unrestricted_random_baseline": {"accuracy": 0.6518229166666667, "accuracy_std": 0.017038893544745432, "avg_cost": 0.00022938625000000007, "cost_std": 1.6026346624546152e-05, "small_model_fraction": 0.680340582307027}}, "0.25": {"accuracy": 0.5729166666666666, "large_model_usage": 0.21875, "small_model_usage": 0.78125, "avg_lambda": 0.10999999999999958, "avg_cost": 0.00014874583333333323, "unrestricted": {"accuracy": 0.6458333333333334, "large_model_usage": 0.21875, "avg_cost": 0.0001687093749999999}, "random_baseline": {"accuracy": 0.6057291666666667, "accuracy_std": 0.017044862679118304, "avg_cost": 0.00015355630208333323, "cost_std": 1.3851528589879824e-05, "small_model_fraction": 0.8006699140537108}, "unrestricted_random_baseline": {"accuracy": 0.6088541666666666, "accuracy_std": 0.012035645841970165, "avg_cost": 0.0001675689062499999, "cost_std": 9.294856981610515e-06, "small_model_fraction": 0.7665684479845768}}}}