{"trial": 16, "trial_seed": 57, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5862573099415205, "large_model_accuracy": 0.8596491228070176, "small_model_cost": 3.221096491228071e-05, "large_model_cost": 0.0006191198830409364, "hybrid_results": {"0.05": {"accuracy": 0.795774647887324, "large_model_usage": 0.7887323943661971, "small_model_usage": 0.21126760563380287, "avg_lambda": 0.8600000000000062, "avg_cost": 0.0005168246478873237, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.7887323943661971, "avg_cost": 0.0005248035211267605}, "random_baseline": {"accuracy": 0.8014084507042254, "accuracy_std": 0.008795771828730143, "avg_cost": 0.0005181552464788733, "cost_std": 7.464289900702647e-06, "small_model_fraction": 0.17429490674597087}, "unrestricted_random_baseline": {"accuracy": 0.8031690140845071, "accuracy_std": 0.015938690560220122, "avg_cost": 0.0005282458802816903, "cost_std": 1.5218770947914858e-05, "small_model_fraction": 0.1607001682899985}}, "0.1": {"accuracy": 0.7711267605633803, "large_model_usage": 0.6338028169014085, "small_model_usage": 0.3661971830985915, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004142190140845069, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6338028169014085, "avg_cost": 0.00042806408450704185}, "random_baseline": {"accuracy": 0.7419014084507042, "accuracy_std": 0.01510805489642599, "avg_cost": 0.0004090305281690138, "cost_std": 1.364456107089373e-05, "small_model_fraction": 0.3491186837128847}, "unrestricted_random_baseline": {"accuracy": 0.7538732394366198, "accuracy_std": 0.017566858708825923, "avg_cost": 0.0004315042605633801, "cost_std": 1.6486099215822618e-05, "small_model_fraction": 0.3255288727645699}}, "0.15": {"accuracy": 0.6971830985915493, "large_model_usage": 0.41901408450704225, "small_model_usage": 0.5809859154929577, "avg_lambda": 0.5, "avg_cost": 0.0002813246478873239, "unrestricted": {"accuracy": 0.7323943661971831, "large_model_usage": 0.41901408450704225, "avg_cost": 0.00029437394366197176}, "random_baseline": {"accuracy": 0.6940140845070422, "accuracy_std": 0.017353833141340314, "avg_cost": 0.00028711630281690124, "cost_std": 1.0399158352808483e-05, "small_model_fraction": 0.5755496717116927}, "unrestricted_random_baseline": {"accuracy": 0.6996478873239437, "accuracy_std": 0.009454733508519391, "avg_cost": 0.0002887480985915491, "cost_std": 1.719028057539095e-05, "small_model_fraction": 0.553315734943011}}, "0.2": {"accuracy": 0.6514084507042254, "large_model_usage": 0.2746478873239437, "small_model_usage": 0.7253521126760563, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0001885499999999999, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.2746478873239437, "avg_cost": 0.00020351478873239425}, "random_baseline": {"accuracy": 0.6556338028169014, "accuracy_std": 0.011000351656206539, "avg_cost": 0.00018845897887323943, "cost_std": 1.667458899872445e-05, "small_model_fraction": 0.7336230030611866}, "unrestricted_random_baseline": {"accuracy": 0.6531690140845072, "accuracy_std": 0.008369622763383591, "avg_cost": 0.00020475697183098578, "cost_std": 1.6400545546184052e-05, "small_model_fraction": 0.708125369152148}}, "0.25": {"accuracy": 0.6161971830985915, "large_model_usage": 0.20422535211267606, "small_model_usage": 0.795774647887324, "avg_lambda": 0.10999999999999963, "avg_cost": 0.00014199366197183106, "unrestricted": {"accuracy": 0.6725352112676056, "large_model_usage": 0.20422535211267606, "avg_cost": 0.00016005}, "random_baseline": {"accuracy": 0.6338028169014084, "accuracy_std": 0.012399166803985243, "avg_cost": 0.00014802239436619724, "cost_std": 1.3471896460859987e-05, "small_model_fraction": 0.8129476420130235}, "unrestricted_random_baseline": {"accuracy": 0.6411971830985916, "accuracy_std": 0.015222511493814983, "avg_cost": 0.00015450411971830986, "cost_std": 1.6846776463411113e-05, "small_model_fraction": 0.7821824969105413}}}}