{"trial": 14, "trial_seed": 55, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5614035087719298, "large_model_accuracy": 0.8216374269005848, "small_model_cost": 3.2227046783625756e-05, "large_model_cost": 0.0006201900584795331, "hybrid_results": {"0.05": {"accuracy": 0.8098591549295775, "large_model_usage": 0.8274647887323944, "small_model_usage": 0.17253521126760563, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005501746478873239, "unrestricted": {"accuracy": 0.8098591549295775, "large_model_usage": 0.8274647887323944, "avg_cost": 0.0005506464788732395}, "random_baseline": {"accuracy": 0.7915492957746479, "accuracy_std": 0.008596165926573064, "avg_cost": 0.0005519104225352113, "cost_std": 6.89102634615144e-06, "small_model_fraction": 0.11908131838133543}, "unrestricted_random_baseline": {"accuracy": 0.801056338028169, "accuracy_std": 0.008942552886760788, "avg_cost": 0.0005586648239436622, "cost_std": 1.1351178047590172e-05, "small_model_fraction": 0.11827883425133107}}, "0.1": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6232394366197183, "small_model_usage": 0.37676056338028174, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004060478873239438, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.6232394366197183, "avg_cost": 0.000423280281690141}, "random_baseline": {"accuracy": 0.726056338028169, "accuracy_std": 0.014755159746307731, "avg_cost": 0.0004018060211267606, "cost_std": 1.7451818747482216e-05, "small_model_fraction": 0.3642102766599593}, "unrestricted_random_baseline": {"accuracy": 0.7345070422535211, "accuracy_std": 0.014620098233821616, "avg_cost": 0.0004218559154929577, "cost_std": 1.727347711879279e-05, "small_model_fraction": 0.3349016398521906}}, "0.15": {"accuracy": 0.721830985915493, "large_model_usage": 0.41901408450704225, "small_model_usage": 0.5809859154929577, "avg_lambda": 0.5300000000000009, "avg_cost": 0.0002857802816901407, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.41901408450704225, "avg_cost": 0.0002952661971830985}, "random_baseline": {"accuracy": 0.6774647887323944, "accuracy_std": 0.012419149358132656, "avg_cost": 0.00028003496478873227, "cost_std": 1.7252427422973656e-05, "small_model_fraction": 0.5687598881855311}, "unrestricted_random_baseline": {"accuracy": 0.6813380281690141, "accuracy_std": 0.02415246568963735, "avg_cost": 0.0002929820422535211, "cost_std": 2.225301375335313e-05, "small_model_fraction": 0.5526263639599224}}, "0.2": {"accuracy": 0.6514084507042254, "large_model_usage": 0.2640845070422535, "small_model_usage": 0.7359154929577465, "avg_lambda": 0.3900000000000005, "avg_cost": 0.00018859014084507042, "unrestricted": {"accuracy": 0.6901408450704225, "large_model_usage": 0.2640845070422535, "avg_cost": 0.0001971605633802816}, "random_baseline": {"accuracy": 0.6387323943661971, "accuracy_std": 0.011157027829404842, "avg_cost": 0.0001843962676056337, "cost_std": 1.2253302607732669e-05, "small_model_fraction": 0.7340596415913402}, "unrestricted_random_baseline": {"accuracy": 0.6429577464788734, "accuracy_std": 0.010932517391732396, "avg_cost": 0.00019524496478873232, "cost_std": 1.0005965521324594e-05, "small_model_fraction": 0.7194831761254414}}, "0.25": {"accuracy": 0.6091549295774648, "large_model_usage": 0.2323943661971831, "small_model_usage": 0.7676056338028169, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00016118873239436612, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2323943661971831, "avg_cost": 0.00017797746478873236}, "random_baseline": {"accuracy": 0.6158450704225352, "accuracy_std": 0.011619718309859174, "avg_cost": 0.00015932718309859145, "cost_std": 1.6461424247804852e-05, "small_model_fraction": 0.7806636080069626}, "unrestricted_random_baseline": {"accuracy": 0.632394366197183, "accuracy_std": 0.014620098233821646, "avg_cost": 0.00018592510563380274, "cost_std": 1.4770433554089384e-05, "small_model_fraction": 0.7521095458288993}}}}