{"trial": 19, "trial_seed": 60, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5511695906432749, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.2065058479532224e-05, "large_model_cost": 0.0006162017543859654, "hybrid_results": {"0.05": {"accuracy": 0.7864583333333334, "large_model_usage": 0.7291666666666666, "small_model_usage": 0.27083333333333337, "avg_lambda": 0.8000000000000057, "avg_cost": 0.00047829114583333287, "unrestricted": {"accuracy": 0.796875, "large_model_usage": 0.7291666666666666, "avg_cost": 0.0004866609374999996}, "random_baseline": {"accuracy": 0.7539062499999999, "accuracy_std": 0.009899259275389694, "avg_cost": 0.0004769892447916667, "cost_std": 1.1212943240927793e-05, "small_model_fraction": 0.23609304041176524}, "unrestricted_random_baseline": {"accuracy": 0.7661458333333334, "accuracy_std": 0.013471372037893309, "avg_cost": 0.0004874775000000001, "cost_std": 6.957402994193457e-06, "small_model_fraction": 0.22176455920981145}}, "0.1": {"accuracy": 0.734375, "large_model_usage": 0.6041666666666666, "small_model_usage": 0.39583333333333337, "avg_lambda": 0.699999999999995, "avg_cost": 0.00038955677083333287, "unrestricted": {"accuracy": 0.7682291666666666, "large_model_usage": 0.6041666666666666, "avg_cost": 0.0004097546874999996}, "random_baseline": {"accuracy": 0.7205729166666666, "accuracy_std": 0.010354632995870345, "avg_cost": 0.0003944856249999999, "cost_std": 1.42162738766597e-05, "small_model_fraction": 0.38799990677000107}, "unrestricted_random_baseline": {"accuracy": 0.7296875, "accuracy_std": 0.013010412496663317, "avg_cost": 0.00040753822916666664, "cost_std": 1.1557818599270986e-05, "small_model_fraction": 0.3534225264954668}}, "0.15": {"accuracy": 0.6875, "large_model_usage": 0.390625, "small_model_usage": 0.609375, "avg_lambda": 0.5099999999999983, "avg_cost": 0.00026490572916666673, "unrestricted": {"accuracy": 0.7109375, "large_model_usage": 0.390625, "avg_cost": 0.0002749786458333333}, "random_baseline": {"accuracy": 0.66015625, "accuracy_std": 0.011602431984346595, "avg_cost": 0.00026424127604166654, "cost_std": 1.8857591562150145e-05, "small_model_fraction": 0.6013935225798058}, "unrestricted_random_baseline": {"accuracy": 0.66953125, "accuracy_std": 0.011437599612498814, "avg_cost": 0.00027723825520833327, "cost_std": 1.2125374771293431e-05, "small_model_fraction": 0.584149413902408}}, "0.2": {"accuracy": 0.65625, "large_model_usage": 0.3229166666666667, "small_model_usage": 0.6770833333333333, "avg_lambda": 0.4099999999999973, "avg_cost": 0.00021959322916666665, "unrestricted": {"accuracy": 0.6875, "large_model_usage": 0.3229166666666667, "avg_cost": 0.0002329213541666667}, "random_baseline": {"accuracy": 0.6455729166666667, "accuracy_std": 0.01267509742045621, "avg_cost": 0.00022355851562499987, "cost_std": 1.0316363198200226e-05, "small_model_fraction": 0.6789652627521751}, "unrestricted_random_baseline": {"accuracy": 0.64375, "accuracy_std": 0.011097539454511718, "avg_cost": 0.00022574096354166656, "cost_std": 1.8053029990735093e-05, "small_model_fraction": 0.6561484715911299}}, "0.25": {"accuracy": 0.6145833333333334, "large_model_usage": 0.24479166666666666, "small_model_usage": 0.7552083333333334, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00017010364583333325, "unrestricted": {"accuracy": 0.6588541666666666, "large_model_usage": 0.24479166666666666, "avg_cost": 0.0001837234375}, "random_baseline": {"accuracy": 0.6231770833333334, "accuracy_std": 0.013023437239635416, "avg_cost": 0.00017793690104166653, "cost_std": 1.2481376787873215e-05, "small_model_fraction": 0.7636878690875604}, "unrestricted_random_baseline": {"accuracy": 0.6268229166666667, "accuracy_std": 0.009464990544472016, "avg_cost": 0.0001809695833333332, "cost_std": 9.316815668269218e-06, "small_model_fraction": 0.7403717655759803}}}}