{"trial": 25, "trial_seed": 66, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5555555555555556, "large_model_accuracy": 0.8304093567251462, "small_model_cost": 3.220716374269006e-05, "large_model_cost": 0.0006192894736842116, "hybrid_results": {"0.05": {"accuracy": 0.8133802816901409, "large_model_usage": 0.823943661971831, "small_model_usage": 0.176056338028169, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005460373239436619, "unrestricted": {"accuracy": 0.8133802816901409, "large_model_usage": 0.823943661971831, "avg_cost": 0.0005465091549295773}, "random_baseline": {"accuracy": 0.7940140845070423, "accuracy_std": 0.010938186314795144, "avg_cost": 0.0005465783802816904, "cost_std": 8.749695498994992e-06, "small_model_fraction": 0.12477321918939485}, "unrestricted_random_baseline": {"accuracy": 0.7957746478873239, "accuracy_std": 0.007216162511239183, "avg_cost": 0.0005504030281690144, "cost_std": 1.0035764177009304e-05, "small_model_fraction": 0.12396953122618161}}, "0.1": {"accuracy": 0.7676056338028169, "large_model_usage": 0.6302816901408451, "small_model_usage": 0.3697183098591549, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00042138239436619707, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.6302816901408451, "avg_cost": 0.00042629788732394345}, "random_baseline": {"accuracy": 0.7320422535211268, "accuracy_std": 0.011404320239930722, "avg_cost": 0.0004214792605633803, "cost_std": 1.3854581831362901e-05, "small_model_fraction": 0.3371027809332695}, "unrestricted_random_baseline": {"accuracy": 0.7362676056338028, "accuracy_std": 0.014725720891779195, "avg_cost": 0.00042873764084507026, "cost_std": 1.3928577397246986e-05, "small_model_fraction": 0.3287300317045693}}, "0.15": {"accuracy": 0.7007042253521126, "large_model_usage": 0.44014084507042256, "small_model_usage": 0.5598591549295775, "avg_lambda": 0.5800000000000017, "avg_cost": 0.00029620633802816896, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.44014084507042256, "avg_cost": 0.0003071288732394365}, "random_baseline": {"accuracy": 0.6869718309859155, "accuracy_std": 0.01872829034784817, "avg_cost": 0.0002994217957746479, "cost_std": 1.6656689166831585e-05, "small_model_fraction": 0.550319998039499}, "unrestricted_random_baseline": {"accuracy": 0.6841549295774648, "accuracy_std": 0.011997693688310421, "avg_cost": 0.0003065336971830986, "cost_std": 1.8740866912754836e-05, "small_model_fraction": 0.5317152214582467}}, "0.2": {"accuracy": 0.6619718309859155, "large_model_usage": 0.30985915492957744, "small_model_usage": 0.6901408450704225, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00021438943661971838, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.30985915492957744, "avg_cost": 0.0002259880281690141}, "random_baseline": {"accuracy": 0.648943661971831, "accuracy_std": 0.017467770077200646, "avg_cost": 0.00021399394366197177, "cost_std": 1.9763357422583237e-05, "small_model_fraction": 0.6896818899292414}, "unrestricted_random_baseline": {"accuracy": 0.6443661971830986, "accuracy_std": 0.01746422079948907, "avg_cost": 0.00021703045774647882, "cost_std": 1.2779664208403784e-05, "small_model_fraction": 0.6699255604454745}}, "0.25": {"accuracy": 0.5950704225352113, "large_model_usage": 0.23943661971830985, "small_model_usage": 0.7605633802816901, "avg_lambda": 0.28000000000000086, "avg_cost": 0.0001653683098591549, "unrestricted": {"accuracy": 0.6584507042253521, "large_model_usage": 0.23943661971830985, "avg_cost": 0.00018192464788732398}, "random_baseline": {"accuracy": 0.6186619718309859, "accuracy_std": 0.009052788825480624, "avg_cost": 0.00016704598591549285, "cost_std": 1.3966950940086588e-05, "small_model_fraction": 0.7731814707042889}, "unrestricted_random_baseline": {"accuracy": 0.6161971830985914, "accuracy_std": 0.011888692264883222, "avg_cost": 0.00017340253521126755, "cost_std": 1.2229496292470368e-05, "small_model_fraction": 0.7449804199354142}}}}