{"trial": 28, "trial_seed": 69, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5716374269005848, "large_model_accuracy": 0.8494152046783626, "small_model_cost": 3.22343567251462e-05, "large_model_cost": 0.0006201140350877194, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.8450704225352113, "small_model_usage": 0.15492957746478875, "avg_lambda": 0.879999999999997, "avg_cost": 0.0005501190140845075, "unrestricted": {"accuracy": 0.8450704225352113, "large_model_usage": 0.8450704225352113, "avg_cost": 0.0005580274647887329}, "random_baseline": {"accuracy": 0.8151408450704226, "accuracy_std": 0.008802816901408439, "avg_cost": 0.0005541610563380281, "cost_std": 1.0672127904272714e-05, "small_model_fraction": 0.1190635151706037}, "unrestricted_random_baseline": {"accuracy": 0.8214788732394366, "accuracy_std": 0.011360750566832152, "avg_cost": 0.0005563522183098591, "cost_std": 1.5219401968108443e-05, "small_model_fraction": 0.10561101630850173}}, "0.1": {"accuracy": 0.7887323943661971, "large_model_usage": 0.6830985915492958, "small_model_usage": 0.31690140845070425, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004445485915492958, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.6830985915492958, "avg_cost": 0.00045756267605633804}, "random_baseline": {"accuracy": 0.7584507042253521, "accuracy_std": 0.009984117520251976, "avg_cost": 0.00043684073943661965, "cost_std": 1.524339107320557e-05, "small_model_fraction": 0.29864179695312426}, "unrestricted_random_baseline": {"accuracy": 0.7538732394366197, "accuracy_std": 0.009764383537931007, "avg_cost": 0.00045163602112676044, "cost_std": 7.031191676651994e-06, "small_model_fraction": 0.27650447024149083}}, "0.15": {"accuracy": 0.721830985915493, "large_model_usage": 0.5070422535211268, "small_model_usage": 0.49295774647887325, "avg_lambda": 0.5, "avg_cost": 0.0003325626760563382, "unrestricted": {"accuracy": 0.7640845070422535, "large_model_usage": 0.5070422535211268, "avg_cost": 0.00034814014084507056}, "random_baseline": {"accuracy": 0.7007042253521127, "accuracy_std": 0.023462438378813642, "avg_cost": 0.00033066827464788744, "cost_std": 1.7578707883410506e-05, "small_model_fraction": 0.4891330141438137}, "unrestricted_random_baseline": {"accuracy": 0.7112676056338029, "accuracy_std": 0.014172261829224049, "avg_cost": 0.0003499071830985917, "cost_std": 2.2359226231005297e-05, "small_model_fraction": 0.46263530489807075}}, "0.2": {"accuracy": 0.6654929577464789, "large_model_usage": 0.39436619718309857, "small_model_usage": 0.6056338028169015, "avg_lambda": 0.34999999999999865, "avg_cost": 0.0002582387323943661, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.39436619718309857, "avg_cost": 0.00027721056338028173}, "random_baseline": {"accuracy": 0.653169014084507, "accuracy_std": 0.016459149144394707, "avg_cost": 0.00025757077464788733, "cost_std": 2.1013148844125363e-05, "small_model_fraction": 0.6155601494872011}, "unrestricted_random_baseline": {"accuracy": 0.6672535211267606, "accuracy_std": 0.016905075893179983, "avg_cost": 0.0002708735563380282, "cost_std": 1.9291773755342075e-05, "small_model_fraction": 0.5832885271056314}}, "0.25": {"accuracy": 0.5880281690140845, "large_model_usage": 0.3274647887323944, "small_model_usage": 0.6725352112676056, "avg_lambda": 0.25, "avg_cost": 0.00021040774647887317, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.3274647887323944, "avg_cost": 0.000235280985915493}, "random_baseline": {"accuracy": 0.6327464788732394, "accuracy_std": 0.01659418544781435, "avg_cost": 0.00020507415492957746, "cost_std": 1.831255731216822e-05, "small_model_fraction": 0.6969220125961914}, "unrestricted_random_baseline": {"accuracy": 0.6545774647887325, "accuracy_std": 0.014641283722740702, "avg_cost": 0.0002391483450704225, "cost_std": 1.0924291280372676e-05, "small_model_fraction": 0.6546119271278529}}}}