{"trial": 16, "trial_seed": 57, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5818713450292398, "large_model_accuracy": 0.8552631578947368, "small_model_cost": 3.2012719298245625e-05, "large_model_cost": 0.0006140789473684211, "hybrid_results": {"0.05": {"accuracy": 0.7994791666666666, "large_model_usage": 0.7421875, "small_model_usage": 0.2578125, "avg_lambda": 0.8000000000000057, "avg_cost": 0.0004872768229166662, "unrestricted": {"accuracy": 0.8046875, "large_model_usage": 0.7421875, "avg_cost": 0.0004904695312499994}, "random_baseline": {"accuracy": 0.7911458333333334, "accuracy_std": 0.00893043135389701, "avg_cost": 0.00048616296875, "cost_std": 1.0252041641180592e-05, "small_model_fraction": 0.21784827625571726}, "unrestricted_random_baseline": {"accuracy": 0.7880208333333334, "accuracy_std": 0.012608040038448141, "avg_cost": 0.0004891108072916666, "cost_std": 1.3623948944895104e-05, "small_model_fraction": 0.2123631472800704}}, "0.1": {"accuracy": 0.7760416666666666, "large_model_usage": 0.6484375, "small_model_usage": 0.3515625, "avg_lambda": 0.699999999999995, "avg_cost": 0.00041704765624999963, "unrestricted": {"accuracy": 0.7916666666666666, "large_model_usage": 0.6484375, "avg_cost": 0.0004316778645833329}, "random_baseline": {"accuracy": 0.7598958333333334, "accuracy_std": 0.013770108402502375, "avg_cost": 0.00041896460937499997, "cost_std": 1.5270502965394097e-05, "small_model_fraction": 0.33850321770371955}, "unrestricted_random_baseline": {"accuracy": 0.75703125, "accuracy_std": 0.011590735981389518, "avg_cost": 0.0004325529166666666, "cost_std": 1.3222310607968043e-05, "small_model_fraction": 0.31336826290340525}}, "0.15": {"accuracy": 0.7213541666666666, "large_model_usage": 0.4453125, "small_model_usage": 0.5546875, "avg_lambda": 0.5, "avg_cost": 0.00029477682291666673, "unrestricted": {"accuracy": 0.7604166666666666, "large_model_usage": 0.4453125, "avg_cost": 0.00030661536458333346}, "random_baseline": {"accuracy": 0.6932291666666667, "accuracy_std": 0.010272439022560422, "avg_cost": 0.0002910801562499999, "cost_std": 2.2947141821147805e-05, "small_model_fraction": 0.5485666562555807}, "unrestricted_random_baseline": {"accuracy": 0.7036458333333333, "accuracy_std": 0.011097539454511723, "avg_cost": 0.0003069346874999999, "cost_std": 1.4047768004189195e-05, "small_model_fraction": 0.5282278338059136}}, "0.2": {"accuracy": 0.6927083333333334, "large_model_usage": 0.3671875, "small_model_usage": 0.6328125, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00024240182291666653, "unrestricted": {"accuracy": 0.7447916666666666, "large_model_usage": 0.3671875, "avg_cost": 0.00025800078125000006}, "random_baseline": {"accuracy": 0.6684895833333333, "accuracy_std": 0.015627169988205715, "avg_cost": 0.00024674234374999997, "cost_std": 1.457688222790146e-05, "small_model_fraction": 0.6385478258789208}, "unrestricted_random_baseline": {"accuracy": 0.6776041666666666, "accuracy_std": 0.012958182608984115, "avg_cost": 0.0002610002083333333, "cost_std": 1.041760814482348e-05, "small_model_fraction": 0.6117485415688665}}, "0.25": {"accuracy": 0.625, "large_model_usage": 0.2630208333333333, "small_model_usage": 0.7369791666666667, "avg_lambda": 0.20000000000000143, "avg_cost": 0.00017398515625, "unrestricted": {"accuracy": 0.6927083333333334, "large_model_usage": 0.2630208333333333, "avg_cost": 0.00019474557291666657}, "random_baseline": {"accuracy": 0.63984375, "accuracy_std": 0.015540134112328859, "avg_cost": 0.00017792562499999997, "cost_std": 1.1334049453669635e-05, "small_model_fraction": 0.7560888605022489}, "unrestricted_random_baseline": {"accuracy": 0.6528645833333334, "accuracy_std": 0.00790312026327787, "avg_cost": 0.0001943890625, "cost_std": 1.653219953607664e-05, "small_model_fraction": 0.7204221001483676}}}}