{"trial": 5, "trial_seed": 46, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5730994152046783, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.223815789473688e-05, "large_model_cost": 0.0006194181286549714, "hybrid_results": {"0.05": {"accuracy": 0.8274647887323944, "large_model_usage": 0.6795774647887324, "small_model_usage": 0.3204225352112676, "avg_lambda": 0.8000000000000037, "avg_cost": 0.00045489401408450685, "unrestricted": {"accuracy": 0.823943661971831, "large_model_usage": 0.6795774647887324, "avg_cost": 0.00045819683098591535}, "random_baseline": {"accuracy": 0.7806338028169014, "accuracy_std": 0.014776151590392481, "avg_cost": 0.0004513943309859154, "cost_std": 1.5272400396873303e-05, "small_model_fraction": 0.28019367615256296}, "unrestricted_random_baseline": {"accuracy": 0.781338028169014, "accuracy_std": 0.01763729547375804, "avg_cost": 0.00045483996478873237, "cost_std": 1.4719229174108148e-05, "small_model_fraction": 0.27456879610576523}}, "0.1": {"accuracy": 0.8063380281690141, "large_model_usage": 0.6267605633802817, "small_model_usage": 0.37323943661971826, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00040721091549295773, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.6267605633802817, "avg_cost": 0.00042503485915492946}, "random_baseline": {"accuracy": 0.7623239436619718, "accuracy_std": 0.017123685092658908, "avg_cost": 0.00042181996478873245, "cost_std": 9.420449239157187e-06, "small_model_fraction": 0.3614006330755193}, "unrestricted_random_baseline": {"accuracy": 0.7672535211267606, "accuracy_std": 0.01096083268751645, "avg_cost": 0.00043352901408450704, "cost_std": 9.805438965790524e-06, "small_model_fraction": 0.33104547017904873}}, "0.15": {"accuracy": 0.75, "large_model_usage": 0.4225352112676056, "small_model_usage": 0.5774647887323944, "avg_lambda": 0.5, "avg_cost": 0.00028540105633802817, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.4225352112676056, "avg_cost": 0.0002986545774647887}, "random_baseline": {"accuracy": 0.7052816901408451, "accuracy_std": 0.012894248889859577, "avg_cost": 0.00028027767605633803, "cost_std": 1.8089703713997775e-05, "small_model_fraction": 0.5688495673387567}, "unrestricted_random_baseline": {"accuracy": 0.7109154929577465, "accuracy_std": 0.013495611194885102, "avg_cost": 0.0003001230633802817, "cost_std": 1.3965806739731885e-05, "small_model_fraction": 0.5462780870656798}}, "0.2": {"accuracy": 0.6971830985915493, "large_model_usage": 0.323943661971831, "small_model_usage": 0.676056338028169, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00022223204225352107, "unrestricted": {"accuracy": 0.7535211267605634, "large_model_usage": 0.323943661971831, "avg_cost": 0.0002354644366197183}, "random_baseline": {"accuracy": 0.6838028169014084, "accuracy_std": 0.012074949436255087, "avg_cost": 0.0002303851056338028, "cost_std": 1.5487079438649265e-05, "small_model_fraction": 0.6764298957391292}, "unrestricted_random_baseline": {"accuracy": 0.677112676056338, "accuracy_std": 0.02071488180293341, "avg_cost": 0.00023452848591549292, "cost_std": 1.0993920148804855e-05, "small_model_fraction": 0.6538943955089953}}, "0.25": {"accuracy": 0.6514084507042254, "large_model_usage": 0.2676056338028169, "small_model_usage": 0.7323943661971831, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00018283767605633807, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.2676056338028169, "avg_cost": 0.0001999151408450705}, "random_baseline": {"accuracy": 0.6658450704225352, "accuracy_std": 0.01254331910602595, "avg_cost": 0.00018004077464788725, "cost_std": 1.4422334068765449e-05, "small_model_fraction": 0.7435206824806766}, "unrestricted_random_baseline": {"accuracy": 0.6647887323943662, "accuracy_std": 0.007998462458873614, "avg_cost": 0.00019779299295774635, "cost_std": 1.2569989265869246e-05, "small_model_fraction": 0.7144368144348681}}}}