{"trial": 12, "trial_seed": 53, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5614035087719298, "large_model_accuracy": 0.8523391812865497, "small_model_cost": 3.209298245614035e-05, "large_model_cost": 0.0006173918128654981, "hybrid_results": {"0.05": {"accuracy": 0.8151041666666666, "large_model_usage": 0.7421875, "small_model_usage": 0.2578125, "avg_lambda": 0.8000000000000057, "avg_cost": 0.0004888049479166665, "unrestricted": {"accuracy": 0.8255208333333334, "large_model_usage": 0.7421875, "avg_cost": 0.0004949716145833333}, "random_baseline": {"accuracy": 0.7932291666666667, "accuracy_std": 0.013441133229754099, "avg_cost": 0.0004914751562499999, "cost_std": 1.1619308106977641e-05, "small_model_fraction": 0.21969438219942797}, "unrestricted_random_baseline": {"accuracy": 0.79609375, "accuracy_std": 0.01061337701683064, "avg_cost": 0.0004958372916666666, "cost_std": 1.0804685418549383e-05, "small_model_fraction": 0.20915845363392266}}, "0.1": {"accuracy": 0.765625, "large_model_usage": 0.6640625, "small_model_usage": 0.3359375, "avg_lambda": 0.699999999999995, "avg_cost": 0.0004276903645833331, "unrestricted": {"accuracy": 0.8072916666666666, "large_model_usage": 0.6640625, "avg_cost": 0.0004469039062499997}, "random_baseline": {"accuracy": 0.7630208333333333, "accuracy_std": 0.014215983399954347, "avg_cost": 0.0004357658072916664, "cost_std": 1.3893699300879667e-05, "small_model_fraction": 0.32411041749304015}, "unrestricted_random_baseline": {"accuracy": 0.7744791666666666, "accuracy_std": 0.013541666666666681, "avg_cost": 0.0004457771354166664, "cost_std": 7.786064520071841e-06, "small_model_fraction": 0.29128352519730694}}, "0.15": {"accuracy": 0.734375, "large_model_usage": 0.4895833333333333, "small_model_usage": 0.5104166666666667, "avg_lambda": 0.5800000000000045, "avg_cost": 0.0003250497395833333, "unrestricted": {"accuracy": 0.7604166666666666, "large_model_usage": 0.4895833333333333, "avg_cost": 0.00033705494791666643}, "random_baseline": {"accuracy": 0.6984375, "accuracy_std": 0.014768173832039405, "avg_cost": 0.00032355726562499984, "cost_std": 1.4176348540079854e-05, "small_model_fraction": 0.4994748974258173}, "unrestricted_random_baseline": {"accuracy": 0.6955729166666667, "accuracy_std": 0.008514470168891581, "avg_cost": 0.0003283005208333333, "cost_std": 1.1677081595673345e-05, "small_model_fraction": 0.4789636513586815}}, "0.2": {"accuracy": 0.6770833333333334, "large_model_usage": 0.3541666666666667, "small_model_usage": 0.6458333333333333, "avg_lambda": 0.4300000000000041, "avg_cost": 0.00023860182291666674, "unrestricted": {"accuracy": 0.7213541666666666, "large_model_usage": 0.3541666666666667, "avg_cost": 0.0002522528645833334}, "random_baseline": {"accuracy": 0.6536458333333333, "accuracy_std": 0.009389456446520792, "avg_cost": 0.00023735203124999996, "cost_std": 1.7972426132413766e-05, "small_model_fraction": 0.6471736662858284}, "unrestricted_random_baseline": {"accuracy": 0.6708333333333333, "accuracy_std": 0.01956420830311764, "avg_cost": 0.0002580514322916667, "cost_std": 1.0848504169443697e-05, "small_model_fraction": 0.6238504663110068}}, "0.25": {"accuracy": 0.625, "large_model_usage": 0.28125, "small_model_usage": 0.71875, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00018837265625, "unrestricted": {"accuracy": 0.6901041666666666, "large_model_usage": 0.28125, "avg_cost": 0.00020624765625000007}, "random_baseline": {"accuracy": 0.6341145833333333, "accuracy_std": 0.00918868685644655, "avg_cost": 0.0001917734374999999, "cost_std": 8.818962942499834e-06, "small_model_fraction": 0.7329916520001284}, "unrestricted_random_baseline": {"accuracy": 0.6486979166666667, "accuracy_std": 0.015644519058203736, "avg_cost": 0.00020731177083333326, "cost_std": 9.421558761046264e-06, "small_model_fraction": 0.7024516969014682}}}}