{"trial": 17, "trial_seed": 58, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5453216374269005, "large_model_accuracy": 0.8216374269005848, "small_model_cost": 3.197076023391814e-05, "large_model_cost": 0.0006144093567251471, "hybrid_results": {"0.05": {"accuracy": 0.7890625, "large_model_usage": 0.7447916666666666, "small_model_usage": 0.25520833333333337, "avg_lambda": 0.8000000000000057, "avg_cost": 0.0004862598958333331, "unrestricted": {"accuracy": 0.7942708333333334, "large_model_usage": 0.7447916666666666, "avg_cost": 0.0004927911458333332}, "random_baseline": {"accuracy": 0.76796875, "accuracy_std": 0.00934964743282096, "avg_cost": 0.00048729914062499956, "cost_std": 9.528207835065597e-06, "small_model_fraction": 0.2200222678644956}, "unrestricted_random_baseline": {"accuracy": 0.7697916666666667, "accuracy_std": 0.011657827752916346, "avg_cost": 0.0004969879166666664, "cost_std": 7.049146807905302e-06, "small_model_fraction": 0.20880863944195263}}, "0.1": {"accuracy": 0.75, "large_model_usage": 0.640625, "small_model_usage": 0.359375, "avg_lambda": 0.699999999999995, "avg_cost": 0.0004117807291666667, "unrestricted": {"accuracy": 0.7786458333333334, "large_model_usage": 0.640625, "avg_cost": 0.0004288692708333333}, "random_baseline": {"accuracy": 0.73515625, "accuracy_std": 0.010990077003129805, "avg_cost": 0.0004197056510416664, "cost_std": 1.4896773451107022e-05, "small_model_fraction": 0.34789697794612384}, "unrestricted_random_baseline": {"accuracy": 0.7424479166666667, "accuracy_std": 0.013997244427475726, "avg_cost": 0.00043139624999999983, "cost_std": 1.2051384577234641e-05, "small_model_fraction": 0.3185573329266957}}, "0.15": {"accuracy": 0.703125, "large_model_usage": 0.4348958333333333, "small_model_usage": 0.5651041666666667, "avg_lambda": 0.5, "avg_cost": 0.0002922130208333333, "unrestricted": {"accuracy": 0.7317708333333334, "large_model_usage": 0.4348958333333333, "avg_cost": 0.0003023067708333334}, "random_baseline": {"accuracy": 0.6736979166666666, "accuracy_std": 0.011707170204681783, "avg_cost": 0.0002942627604166664, "cost_std": 8.51643067671861e-06, "small_model_fraction": 0.5531850702079387}, "unrestricted_random_baseline": {"accuracy": 0.6789062499999999, "accuracy_std": 0.013534152562694052, "avg_cost": 0.0003023619010416665, "cost_std": 1.51432987324865e-05, "small_model_fraction": 0.5358549171912815}}, "0.2": {"accuracy": 0.6927083333333334, "large_model_usage": 0.3723958333333333, "small_model_usage": 0.6276041666666667, "avg_lambda": 0.40000000000000285, "avg_cost": 0.0002495307291666667, "unrestricted": {"accuracy": 0.71875, "large_model_usage": 0.3723958333333333, "avg_cost": 0.0002631140625000001}, "random_baseline": {"accuracy": 0.64453125, "accuracy_std": 0.012283829599032046, "avg_cost": 0.0002468484374999999, "cost_std": 1.4043219981623069e-05, "small_model_fraction": 0.6264671156008721}, "unrestricted_random_baseline": {"accuracy": 0.6536458333333334, "accuracy_std": 0.009743899444723812, "avg_cost": 0.00025586041666666645, "cost_std": 1.0559741165893638e-05, "small_model_fraction": 0.6031456300139567}}, "0.25": {"accuracy": 0.5911458333333334, "large_model_usage": 0.28125, "small_model_usage": 0.71875, "avg_lambda": 0.25, "avg_cost": 0.00018439010416666663, "unrestricted": {"accuracy": 0.6927083333333334, "large_model_usage": 0.28125, "avg_cost": 0.0002064734375}, "random_baseline": {"accuracy": 0.62265625, "accuracy_std": 0.013147818293087851, "avg_cost": 0.00018898117187499995, "cost_std": 1.6544314258483117e-05, "small_model_fraction": 0.7383083043414967}, "unrestricted_random_baseline": {"accuracy": 0.6333333333333334, "accuracy_std": 0.012532509807829127, "avg_cost": 0.0002111436458333333, "cost_std": 1.2486746376001792e-05, "small_model_fraction": 0.7003930056879228}}}}