{"trial": 3, "trial_seed": 44, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5760233918128655, "large_model_accuracy": 0.8362573099415205, "small_model_cost": 3.208538011695908e-05, "large_model_cost": 0.0006144093567251462, "hybrid_results": {"0.05": {"accuracy": 0.8345070422535211, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0006475010563380279, "unrestricted": {"accuracy": 0.8345070422535211, "large_model_usage": 1.0, "avg_cost": 0.0006475010563380279}, "random_baseline": {"accuracy": 0.8345070422535212, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006154014084507043, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8345070422535212, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006154014084507043, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6408450704225352, "small_model_usage": 0.35915492957746475, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00042429683098591507, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.6408450704225352, "avg_cost": 0.0004299588028169011}, "random_baseline": {"accuracy": 0.7443661971830986, "accuracy_std": 0.022336269756554512, "avg_cost": 0.0004229963380281688, "cost_std": 1.8374047993291708e-05, "small_model_fraction": 0.32647209006670713}, "unrestricted_random_baseline": {"accuracy": 0.7609154929577464, "accuracy_std": 0.020193592899915, "avg_cost": 0.000439301936619718, "cost_std": 1.3050169521885118e-05, "small_model_fraction": 0.3167490285778692}}, "0.15": {"accuracy": 0.7007042253521126, "large_model_usage": 0.43661971830985913, "small_model_usage": 0.5633802816901409, "avg_lambda": 0.5300000000000009, "avg_cost": 0.00029137429577464794, "unrestricted": {"accuracy": 0.7359154929577465, "large_model_usage": 0.43661971830985913, "avg_cost": 0.00030360669014084507}, "random_baseline": {"accuracy": 0.6989436619718309, "accuracy_std": 0.01512445886965814, "avg_cost": 0.00030322408450704215, "cost_std": 2.015524778368104e-05, "small_model_fraction": 0.5547342612132392}, "unrestricted_random_baseline": {"accuracy": 0.703169014084507, "accuracy_std": 0.009584970132969182, "avg_cost": 0.00030279926056338025, "cost_std": 1.2103369515203333e-05, "small_model_fraction": 0.5337280947877279}}, "0.2": {"accuracy": 0.6338028169014085, "large_model_usage": 0.2711267605633803, "small_model_usage": 0.7288732394366197, "avg_lambda": 0.35999999999999954, "avg_cost": 0.0001887052816901408, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2711267605633803, "avg_cost": 0.00020029683098591542}, "random_baseline": {"accuracy": 0.6443661971830986, "accuracy_std": 0.012695603082619682, "avg_cost": 0.0001859207042253521, "cost_std": 6.0320252486603995e-06, "small_model_fraction": 0.7310433575388183}, "unrestricted_random_baseline": {"accuracy": 0.6514084507042254, "accuracy_std": 0.013637265303547225, "avg_cost": 0.00020299038732394365, "cost_std": 1.4451401074241194e-05, "small_model_fraction": 0.7111376868788346}}, "0.25": {"accuracy": 0.5950704225352113, "large_model_usage": 0.22535211267605634, "small_model_usage": 0.7746478873239436, "avg_lambda": 0.20000000000000093, "avg_cost": 0.0001566348591549295, "unrestricted": {"accuracy": 0.6619718309859155, "large_model_usage": 0.22535211267605634, "avg_cost": 0.00017267711267605627}, "random_baseline": {"accuracy": 0.6299295774647886, "accuracy_std": 0.006944747508209895, "avg_cost": 0.00014908999999999997, "cost_std": 1.2383688979160946e-05, "small_model_fraction": 0.7861165192554441}, "unrestricted_random_baseline": {"accuracy": 0.6323943661971831, "accuracy_std": 0.01319365774309522, "avg_cost": 0.0001851157394366197, "cost_std": 1.2581646728830659e-05, "small_model_fraction": 0.7585678450370704}}}}