{"trial": 9, "trial_seed": 50, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.2244736842105294e-05, "large_model_cost": 0.0006201461988304101, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.823943661971831, "small_model_usage": 0.176056338028169, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005461609154929574, "unrestricted": {"accuracy": 0.8204225352112676, "large_model_usage": 0.823943661971831, "avg_cost": 0.0005474355633802813}, "random_baseline": {"accuracy": 0.8116197183098592, "accuracy_std": 0.009738955412633346, "avg_cost": 0.0005512129225352117, "cost_std": 1.1004241626045005e-05, "small_model_fraction": 0.125846401346293}, "unrestricted_random_baseline": {"accuracy": 0.8123239436619718, "accuracy_std": 0.011469364418945287, "avg_cost": 0.0005471868661971834, "cost_std": 1.1572445285426857e-05, "small_model_fraction": 0.12367826949131688}}, "0.1": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6267605633802817, "small_model_usage": 0.37323943661971826, "avg_lambda": 0.7100000000000012, "avg_cost": 0.0004117383802816901, "unrestricted": {"accuracy": 0.7887323943661971, "large_model_usage": 0.6267605633802817, "avg_cost": 0.0004237595070422534}, "random_baseline": {"accuracy": 0.7475352112676057, "accuracy_std": 0.015025766403768901, "avg_cost": 0.0004197266549295773, "cost_std": 1.7442925487140622e-05, "small_model_fraction": 0.35449447232853765}, "unrestricted_random_baseline": {"accuracy": 0.7556338028169014, "accuracy_std": 0.013925154882595183, "avg_cost": 0.0004283933450704223, "cost_std": 1.088885332562275e-05, "small_model_fraction": 0.3340469525691763}}, "0.15": {"accuracy": 0.7077464788732394, "large_model_usage": 0.47183098591549294, "small_model_usage": 0.528169014084507, "avg_lambda": 0.5100000000000009, "avg_cost": 0.0003132383802816902, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.47183098591549294, "avg_cost": 0.0003277031690140846}, "random_baseline": {"accuracy": 0.670774647887324, "accuracy_std": 0.018109001226230697, "avg_cost": 0.0003164210211267604, "cost_std": 1.8025290129712546e-05, "small_model_fraction": 0.5220395566133584}, "unrestricted_random_baseline": {"accuracy": 0.6809859154929578, "accuracy_std": 0.014363435249557122, "avg_cost": 0.0003290284154929576, "cost_std": 1.776147899696816e-05, "small_model_fraction": 0.4974354525795397}}, "0.2": {"accuracy": 0.6795774647887324, "large_model_usage": 0.3626760563380282, "small_model_usage": 0.6373239436619718, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002462947183098593, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.3626760563380282, "avg_cost": 0.00025976654929577486}, "random_baseline": {"accuracy": 0.6429577464788732, "accuracy_std": 0.011593012417714323, "avg_cost": 0.00024156366197183093, "cost_std": 1.7797844287371507e-05, "small_model_fraction": 0.6359084042012263}, "unrestricted_random_baseline": {"accuracy": 0.6595070422535212, "accuracy_std": 0.024650402286179054, "avg_cost": 0.000273256514084507, "cost_std": 9.805088105602566e-06, "small_model_fraction": 0.6129932868610629}}, "0.25": {"accuracy": 0.5985915492957746, "large_model_usage": 0.31690140845070425, "small_model_usage": 0.6830985915492958, "avg_lambda": 0.29999999999999843, "avg_cost": 0.00021002711267605637, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.31690140845070425, "avg_cost": 0.00023104823943661978}, "random_baseline": {"accuracy": 0.6274647887323943, "accuracy_std": 0.01730733201048525, "avg_cost": 0.00021121802816901407, "cost_std": 1.2616376165527454e-05, "small_model_fraction": 0.6975983437212685}, "unrestricted_random_baseline": {"accuracy": 0.6355633802816902, "accuracy_std": 0.01769344299493116, "avg_cost": 0.0002334723591549296, "cost_std": 1.7389050599275037e-05, "small_model_fraction": 0.66184213605771}}}}