{"trial": 18, "trial_seed": 59, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.577485380116959, "large_model_accuracy": 0.847953216374269, "small_model_cost": 3.1978801169590635e-05, "large_model_cost": 0.0006127222222222229, "hybrid_results": {"0.05": {"accuracy": 0.7942708333333334, "large_model_usage": 0.828125, "small_model_usage": 0.171875, "avg_lambda": 0.8799999999999967, "avg_cost": 0.0005366333333333335, "unrestricted": {"accuracy": 0.8177083333333334, "large_model_usage": 0.828125, "avg_cost": 0.0005449718750000003}, "random_baseline": {"accuracy": 0.8052083333333334, "accuracy_std": 0.008051887934239738, "avg_cost": 0.0005411442187500001, "cost_std": 1.0241022798400411e-05, "small_model_fraction": 0.13101980346324676}, "unrestricted_random_baseline": {"accuracy": 0.8104166666666666, "accuracy_std": 0.00885416666666666, "avg_cost": 0.0005501469010416668, "cost_std": 9.648948171968305e-06, "small_model_fraction": 0.1166614115049656}}, "0.1": {"accuracy": 0.7526041666666666, "large_model_usage": 0.6432291666666666, "small_model_usage": 0.35677083333333337, "avg_lambda": 0.699999999999995, "avg_cost": 0.00041404999999999974, "unrestricted": {"accuracy": 0.7786458333333334, "large_model_usage": 0.6432291666666666, "avg_cost": 0.00043098749999999976}, "random_baseline": {"accuracy": 0.7505208333333333, "accuracy_std": 0.016379359576579565, "avg_cost": 0.0004114230989583333, "cost_std": 1.5434204200051184e-05, "small_model_fraction": 0.3420998241566265}, "unrestricted_random_baseline": {"accuracy": 0.7518229166666667, "accuracy_std": 0.011879682008288598, "avg_cost": 0.0004317701302083334, "cost_std": 1.4912070593751838e-05, "small_model_fraction": 0.3129346207535474}}, "0.15": {"accuracy": 0.6927083333333334, "large_model_usage": 0.4322916666666667, "small_model_usage": 0.5677083333333333, "avg_lambda": 0.5, "avg_cost": 0.0002911749999999997, "unrestricted": {"accuracy": 0.7265625, "large_model_usage": 0.4322916666666667, "avg_cost": 0.00030012812499999965}, "random_baseline": {"accuracy": 0.68828125, "accuracy_std": 0.010021813231715549, "avg_cost": 0.00029143013020833307, "cost_std": 1.533435948929266e-05, "small_model_fraction": 0.5536820746749048}, "unrestricted_random_baseline": {"accuracy": 0.6919270833333333, "accuracy_std": 0.012217400110145741, "avg_cost": 0.00030350447916666643, "cost_std": 1.035253354185021e-05, "small_model_fraction": 0.5382654127284434}}, "0.2": {"accuracy": 0.6432291666666666, "large_model_usage": 0.3229166666666667, "small_model_usage": 0.6770833333333333, "avg_lambda": 0.37000000000000194, "avg_cost": 0.00021781562499999986, "unrestricted": {"accuracy": 0.6901041666666666, "large_model_usage": 0.3229166666666667, "avg_cost": 0.0002323833333333332}, "random_baseline": {"accuracy": 0.64765625, "accuracy_std": 0.00823921459351895, "avg_cost": 0.00021891703124999983, "cost_std": 1.4550060759660274e-05, "small_model_fraction": 0.6800018440268013}, "unrestricted_random_baseline": {"accuracy": 0.66328125, "accuracy_std": 0.01643103286911183, "avg_cost": 0.00023506330729166643, "cost_std": 1.7401970282752895e-05, "small_model_fraction": 0.6549172579510287}}, "0.25": {"accuracy": 0.6041666666666666, "large_model_usage": 0.28125, "small_model_usage": 0.71875, "avg_lambda": 0.26999999999999935, "avg_cost": 0.00018612812499999997, "unrestricted": {"accuracy": 0.6770833333333334, "large_model_usage": 0.28125, "avg_cost": 0.0002067427083333333}, "random_baseline": {"accuracy": 0.6390625000000001, "accuracy_std": 0.007103219633846804, "avg_cost": 0.00018989716145833326, "cost_std": 1.3182464481187465e-05, "small_model_fraction": 0.7345655271462147}, "unrestricted_random_baseline": {"accuracy": 0.64921875, "accuracy_std": 0.015670506649302487, "avg_cost": 0.00020496973958333318, "cost_std": 1.3029600829376698e-05, "small_model_fraction": 0.6990686405935127}}}}