{"trial": 14, "trial_seed": 55, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8304093567251462, "small_model_cost": 3.202353801169593e-05, "large_model_cost": 0.0006144824561403515, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0006501566901408454, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 1.0, "avg_cost": 0.0006501566901408454}, "random_baseline": {"accuracy": 0.8169014084507042, "accuracy_std": 0.0, "avg_cost": 0.0006180915492957747, "cost_std": 0.0, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8169014084507042, "accuracy_std": 0.0, "avg_cost": 0.0006180915492957747, "cost_std": 0.0, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.8028169014084507, "large_model_usage": 0.7112676056338029, "small_model_usage": 0.28873239436619713, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004697200704225351, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.7112676056338029, "avg_cost": 0.000474536971830986}, "random_baseline": {"accuracy": 0.7549295774647887, "accuracy_std": 0.01271511977835758, "avg_cost": 0.000466986514084507, "cost_std": 1.1518307988421662e-05, "small_model_fraction": 0.24853664561084243}, "unrestricted_random_baseline": {"accuracy": 0.7538732394366197, "accuracy_std": 0.017707452057295972, "avg_cost": 0.00047301739436619714, "cost_std": 1.6877036823890875e-05, "small_model_fraction": 0.24026670371704026}}, "0.15": {"accuracy": 0.778169014084507, "large_model_usage": 0.6302816901408451, "small_model_usage": 0.3697183098591549, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00040431866197183117, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.6302816901408451, "avg_cost": 0.0004239665492957747}, "random_baseline": {"accuracy": 0.7225352112676057, "accuracy_std": 0.011223505246837502, "avg_cost": 0.00039917454225352113, "cost_std": 1.585237354548653e-05, "small_model_fraction": 0.36082166076835415}, "unrestricted_random_baseline": {"accuracy": 0.7341549295774649, "accuracy_std": 0.016534305467428, "avg_cost": 0.00042653781690140844, "cost_std": 2.211462348011672e-05, "small_model_fraction": 0.3270890030436361}}, "0.2": {"accuracy": 0.7007042253521126, "large_model_usage": 0.4014084507042254, "small_model_usage": 0.5985915492957746, "avg_lambda": 0.5, "avg_cost": 0.0002716426056338029, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.4014084507042254, "avg_cost": 0.0002824947183098593}, "random_baseline": {"accuracy": 0.6676056338028169, "accuracy_std": 0.017407333935645437, "avg_cost": 0.00026745214788732386, "cost_std": 1.526844014682609e-05, "small_model_fraction": 0.5886077795976349}, "unrestricted_random_baseline": {"accuracy": 0.6764084507042254, "accuracy_std": 0.015624442065023775, "avg_cost": 0.00028244154929577464, "cost_std": 1.1327030290650657e-05, "small_model_fraction": 0.569976229219932}}, "0.25": {"accuracy": 0.6338028169014085, "large_model_usage": 0.2640845070422535, "small_model_usage": 0.7359154929577465, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00018132570422535197, "unrestricted": {"accuracy": 0.6866197183098591, "large_model_usage": 0.2640845070422535, "avg_cost": 0.0001955792253521126}, "random_baseline": {"accuracy": 0.6235915492957746, "accuracy_std": 0.017210351281478797, "avg_cost": 0.00018049348591549281, "cost_std": 1.4264447537675507e-05, "small_model_fraction": 0.7436691901064211}, "unrestricted_random_baseline": {"accuracy": 0.6408450704225352, "accuracy_std": 0.010445349981824864, "avg_cost": 0.00020395584507042238, "cost_std": 1.578707748186999e-05, "small_model_fraction": 0.7191979000580948}}}}