{"trial": 28, "trial_seed": 69, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5467836257309941, "large_model_accuracy": 0.8464912280701754, "small_model_cost": 3.20296783625731e-05, "large_model_cost": 0.0006146754385964921, "hybrid_results": {"0.05": {"accuracy": 0.8359375, "large_model_usage": 0.8229166666666666, "small_model_usage": 0.17708333333333337, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005431283854166662, "unrestricted": {"accuracy": 0.8359375, "large_model_usage": 0.8229166666666666, "avg_cost": 0.0005434252604166661}, "random_baseline": {"accuracy": 0.8234375, "accuracy_std": 0.008699631816921904, "avg_cost": 0.0005443438020833331, "cost_std": 7.340530082186138e-06, "small_model_fraction": 0.12279683139048572}, "unrestricted_random_baseline": {"accuracy": 0.8260416666666668, "accuracy_std": 0.005915529526875281, "avg_cost": 0.0005466302083333331, "cost_std": 7.810257697310353e-06, "small_model_fraction": 0.12228730223870611}}, "0.1": {"accuracy": 0.8098958333333334, "large_model_usage": 0.6692708333333334, "small_model_usage": 0.33072916666666663, "avg_lambda": 0.779999999999997, "avg_cost": 0.00044450859374999945, "unrestricted": {"accuracy": 0.8151041666666666, "large_model_usage": 0.6692708333333334, "avg_cost": 0.00044845651041666625}, "random_baseline": {"accuracy": 0.7721354166666666, "accuracy_std": 0.010878436757398663, "avg_cost": 0.0004455664583333332, "cost_std": 1.3436153787237329e-05, "small_model_fraction": 0.2920588399685163}, "unrestricted_random_baseline": {"accuracy": 0.7679687499999999, "accuracy_std": 0.013199297809868952, "avg_cost": 0.0004455192447916665, "cost_std": 1.1062197169800931e-05, "small_model_fraction": 0.28528299616063924}}, "0.15": {"accuracy": 0.7838541666666666, "large_model_usage": 0.6380208333333334, "small_model_usage": 0.36197916666666663, "avg_lambda": 0.699999999999995, "avg_cost": 0.00041137838541666603, "unrestricted": {"accuracy": 0.8098958333333334, "large_model_usage": 0.6380208333333334, "avg_cost": 0.0004285554687499995}, "random_baseline": {"accuracy": 0.7489583333333333, "accuracy_std": 0.008168951636124007, "avg_cost": 0.00041672471354166644, "cost_std": 1.1847706401610424e-05, "small_model_fraction": 0.3489205054855405}, "unrestricted_random_baseline": {"accuracy": 0.7606770833333334, "accuracy_std": 0.010638905320737252, "avg_cost": 0.00042543958333333325, "cost_std": 1.2357869873959883e-05, "small_model_fraction": 0.31943932754572807}}, "0.2": {"accuracy": 0.7265625, "large_model_usage": 0.4375, "small_model_usage": 0.5625, "avg_lambda": 0.5, "avg_cost": 0.00029343046874999993, "unrestricted": {"accuracy": 0.7578125, "large_model_usage": 0.4375, "avg_cost": 0.0003044460937499998}, "random_baseline": {"accuracy": 0.6958333333333333, "accuracy_std": 0.012640271978657903, "avg_cost": 0.00029436518229166667, "cost_std": 1.4407808716577359e-05, "small_model_fraction": 0.5513555435768032}, "unrestricted_random_baseline": {"accuracy": 0.70546875, "accuracy_std": 0.01611430052542186, "avg_cost": 0.00030916856770833327, "cost_std": 1.3501743254482576e-05, "small_model_fraction": 0.5324493303134006}}, "0.25": {"accuracy": 0.6458333333333334, "large_model_usage": 0.3098958333333333, "small_model_usage": 0.6901041666666667, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00020774296874999994, "unrestricted": {"accuracy": 0.7161458333333334, "large_model_usage": 0.3098958333333333, "avg_cost": 0.0002248210937499999}, "random_baseline": {"accuracy": 0.65078125, "accuracy_std": 0.014520418603594416, "avg_cost": 0.00021131260416666658, "cost_std": 1.6431384506044597e-05, "small_model_fraction": 0.6984217471746779}, "unrestricted_random_baseline": {"accuracy": 0.6562500000000001, "accuracy_std": 0.01302083333333335, "avg_cost": 0.00022966070312499992, "cost_std": 1.6394183061277104e-05, "small_model_fraction": 0.669110412285459}}}}