{"trial": 6, "trial_seed": 47, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5482456140350878, "large_model_accuracy": 0.8333333333333334, "small_model_cost": 3.213333333333336e-05, "large_model_cost": 0.0006180818713450298, "hybrid_results": {"0.05": {"accuracy": 0.8380281690140845, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0006479834507042255, "unrestricted": {"accuracy": 0.8380281690140845, "large_model_usage": 1.0, "avg_cost": 0.0006479834507042255}, "random_baseline": {"accuracy": 0.8380281690140844, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006159788732394368, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8380281690140844, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006159788732394368, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.7852112676056338, "large_model_usage": 0.6302816901408451, "small_model_usage": 0.3697183098591549, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00041759612676056315, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.6302816901408451, "avg_cost": 0.00042308908450704207}, "random_baseline": {"accuracy": 0.748943661971831, "accuracy_std": 0.015108054896425992, "avg_cost": 0.00041861732394366175, "cost_std": 1.5334333422474074e-05, "small_model_fraction": 0.34215589182076706}, "unrestricted_random_baseline": {"accuracy": 0.7517605633802817, "accuracy_std": 0.015448880963718502, "avg_cost": 0.00042243042253521104, "cost_std": 1.8706591940605724e-05, "small_model_fraction": 0.33278142053167026}}, "0.15": {"accuracy": 0.721830985915493, "large_model_usage": 0.43309859154929575, "small_model_usage": 0.5669014084507042, "avg_lambda": 0.5500000000000005, "avg_cost": 0.0002926172535211266, "unrestricted": {"accuracy": 0.7359154929577465, "large_model_usage": 0.43309859154929575, "avg_cost": 0.0003000538732394364}, "random_baseline": {"accuracy": 0.7017605633802818, "accuracy_std": 0.012100591744675789, "avg_cost": 0.00029343380281690124, "cost_std": 1.270751538392503e-05, "small_model_fraction": 0.5554491507535878}, "unrestricted_random_baseline": {"accuracy": 0.6936619718309859, "accuracy_std": 0.012889440294171217, "avg_cost": 0.0003086661267605633, "cost_std": 2.0422400433503835e-05, "small_model_fraction": 0.5427575588545032}}, "0.2": {"accuracy": 0.6795774647887324, "large_model_usage": 0.35563380281690143, "small_model_usage": 0.6443661971830985, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00023547640845070423, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.35563380281690143, "avg_cost": 0.00025159612676056334}, "random_baseline": {"accuracy": 0.6725352112676057, "accuracy_std": 0.014084507042253502, "avg_cost": 0.00024185841549295765, "cost_std": 1.2549321530468264e-05, "small_model_fraction": 0.6529676892660634}, "unrestricted_random_baseline": {"accuracy": 0.6757042253521127, "accuracy_std": 0.011404320239930722, "avg_cost": 0.0002511526408450702, "cost_std": 1.5736304477124072e-05, "small_model_fraction": 0.6254572215984449}}, "0.25": {"accuracy": 0.6197183098591549, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.29999999999999843, "avg_cost": 0.00018955387323943654, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00020989894366197175}, "random_baseline": {"accuracy": 0.643661971830986, "accuracy_std": 0.017450016469526562, "avg_cost": 0.00018768144366197186, "cost_std": 1.5415419445846414e-05, "small_model_fraction": 0.7313406729534994}, "unrestricted_random_baseline": {"accuracy": 0.6598591549295774, "accuracy_std": 0.019105154882043004, "avg_cost": 0.00020406257042253516, "cost_std": 1.563258231662902e-05, "small_model_fraction": 0.6966190735250373}}}}