{"trial": 26, "trial_seed": 67, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5467836257309941, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.206505847953217e-05, "large_model_cost": 0.0006165058479532175, "hybrid_results": {"0.05": {"accuracy": 0.7578125, "large_model_usage": 0.640625, "small_model_usage": 0.359375, "avg_lambda": 0.779999999999997, "avg_cost": 0.00042660078124999966, "unrestricted": {"accuracy": 0.765625, "large_model_usage": 0.640625, "avg_cost": 0.0004299132812499997}, "random_baseline": {"accuracy": 0.7515625, "accuracy_std": 0.013691082737598862, "avg_cost": 0.00042392161458333313, "cost_std": 1.2133343211858165e-05, "small_model_fraction": 0.32493465569751845}, "unrestricted_random_baseline": {"accuracy": 0.75625, "accuracy_std": 0.009827063679225628, "avg_cost": 0.00043592966145833314, "cost_std": 1.02702917180514e-05, "small_model_fraction": 0.31926684458703264}}, "0.1": {"accuracy": 0.7161458333333334, "large_model_usage": 0.4947916666666667, "small_model_usage": 0.5052083333333333, "avg_lambda": 0.6399999999999937, "avg_cost": 0.0003315174479166665, "unrestricted": {"accuracy": 0.7317708333333334, "large_model_usage": 0.4947916666666667, "avg_cost": 0.00033965286458333344}, "random_baseline": {"accuracy": 0.6984375, "accuracy_std": 0.017959312134965266, "avg_cost": 0.0003321938541666666, "cost_std": 1.8272869438880734e-05, "small_model_fraction": 0.48762578719598876}, "unrestricted_random_baseline": {"accuracy": 0.6950520833333333, "accuracy_std": 0.012459243451985074, "avg_cost": 0.00033390359375, "cost_std": 1.636878630267946e-05, "small_model_fraction": 0.47370578569507843}}, "0.15": {"accuracy": 0.6901041666666666, "large_model_usage": 0.4322916666666667, "small_model_usage": 0.5677083333333333, "avg_lambda": 0.5, "avg_cost": 0.00029056432291666665, "unrestricted": {"accuracy": 0.7109375, "large_model_usage": 0.4322916666666667, "avg_cost": 0.0003008351562500001}, "random_baseline": {"accuracy": 0.67890625, "accuracy_std": 0.012049724272145082, "avg_cost": 0.0002943108854166667, "cost_std": 1.46800953976577e-05, "small_model_fraction": 0.5576981122930785}, "unrestricted_random_baseline": {"accuracy": 0.6760416666666667, "accuracy_std": 0.012979099785809498, "avg_cost": 0.0003044869010416667, "cost_std": 1.716735132684862e-05, "small_model_fraction": 0.5401243331894968}}, "0.2": {"accuracy": 0.6692708333333334, "large_model_usage": 0.3671875, "small_model_usage": 0.6328125, "avg_lambda": 0.40000000000000285, "avg_cost": 0.0002479028645833333, "unrestricted": {"accuracy": 0.7005208333333334, "large_model_usage": 0.3671875, "avg_cost": 0.00026070494791666677}, "random_baseline": {"accuracy": 0.6632812499999999, "accuracy_std": 0.013584168078901, "avg_cost": 0.0002452234635416667, "cost_std": 7.81162331057625e-06, "small_model_fraction": 0.6306934594723058}, "unrestricted_random_baseline": {"accuracy": 0.6682291666666667, "accuracy_std": 0.015014099276457903, "avg_cost": 0.00025903408854166673, "cost_std": 1.6710722072079132e-05, "small_model_fraction": 0.6087886171616548}}, "0.25": {"accuracy": 0.6458333333333334, "large_model_usage": 0.3072916666666667, "small_model_usage": 0.6927083333333333, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00020686119791666667, "unrestricted": {"accuracy": 0.6901041666666666, "large_model_usage": 0.3072916666666667, "avg_cost": 0.00022315286458333332}, "random_baseline": {"accuracy": 0.6401041666666667, "accuracy_std": 0.015353544763524647, "avg_cost": 0.0002139715625, "cost_std": 1.396022055941421e-05, "small_model_fraction": 0.7009172826651162}, "unrestricted_random_baseline": {"accuracy": 0.6382812500000001, "accuracy_std": 0.011076130374029571, "avg_cost": 0.0002121331510416667, "cost_std": 8.293007354912557e-06, "small_model_fraction": 0.6730416330525387}}}}