{"trial": 10, "trial_seed": 51, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5789473684210527, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.2183771929824545e-05, "large_model_cost": 0.0006180497076023403, "hybrid_results": {"0.05": {"accuracy": 0.8309859154929577, "large_model_usage": 0.795774647887324, "small_model_usage": 0.204225352112676, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005266760563380282, "unrestricted": {"accuracy": 0.8309859154929577, "large_model_usage": 0.795774647887324, "avg_cost": 0.0005266760563380282}, "random_baseline": {"accuracy": 0.8028169014084507, "accuracy_std": 0.014938875658870743, "avg_cost": 0.000526892218309859, "cost_std": 1.302150190222221e-05, "small_model_fraction": 0.15596341364244742}, "unrestricted_random_baseline": {"accuracy": 0.8031690140845071, "accuracy_std": 0.011935529053998806, "avg_cost": 0.0005270144014084507, "cost_std": 1.2001748964833724e-05, "small_model_fraction": 0.15596341364244742}}, "0.1": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6126760563380281, "small_model_usage": 0.38732394366197187, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00039558450704225333, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.6126760563380281, "avg_cost": 0.0004136971830985915}, "random_baseline": {"accuracy": 0.7397887323943662, "accuracy_std": 0.013587169987024444, "avg_cost": 0.00039618728873239416, "cost_std": 1.4979867792665785e-05, "small_model_fraction": 0.379720319981941}, "unrestricted_random_baseline": {"accuracy": 0.7426056338028169, "accuracy_std": 0.014298549650878116, "avg_cost": 0.0004078804929577462, "cost_std": 1.4880419553529691e-05, "small_model_fraction": 0.3488042435325625}}, "0.15": {"accuracy": 0.6936619718309859, "large_model_usage": 0.352112676056338, "small_model_usage": 0.647887323943662, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002375915492957747, "unrestricted": {"accuracy": 0.7394366197183099, "large_model_usage": 0.352112676056338, "avg_cost": 0.0002525140845070423}, "random_baseline": {"accuracy": 0.6591549295774649, "accuracy_std": 0.01580981994411525, "avg_cost": 0.0002388635563380282, "cost_std": 1.246047798385932e-05, "small_model_fraction": 0.6493945715922833}, "unrestricted_random_baseline": {"accuracy": 0.6633802816901408, "accuracy_std": 0.015919231768249716, "avg_cost": 0.0002516532042253522, "cost_std": 1.3032250085440947e-05, "small_model_fraction": 0.6239236672391603}}, "0.2": {"accuracy": 0.6549295774647887, "large_model_usage": 0.2992957746478873, "small_model_usage": 0.7007042253521127, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00020076760563380276, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.2992957746478873, "avg_cost": 0.0002187887323943662}, "random_baseline": {"accuracy": 0.6348591549295775, "accuracy_std": 0.01527130171986018, "avg_cost": 0.00019820249999999998, "cost_std": 1.9348603748250008e-05, "small_model_fraction": 0.7122484455245536}, "unrestricted_random_baseline": {"accuracy": 0.65, "accuracy_std": 0.01660538890500225, "avg_cost": 0.0002171272887323944, "cost_std": 1.677348468242188e-05, "small_model_fraction": 0.6814886322920656}}, "0.25": {"accuracy": 0.6161971830985915, "large_model_usage": 0.2711267605633803, "small_model_usage": 0.7288732394366197, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00018062676056338023, "unrestricted": {"accuracy": 0.7077464788732394, "large_model_usage": 0.2711267605633803, "avg_cost": 0.00020113380281690135}, "random_baseline": {"accuracy": 0.6253521126760563, "accuracy_std": 0.01568384327557755, "avg_cost": 0.00018351362676056337, "cost_std": 1.74198085869325e-05, "small_model_fraction": 0.7466263532404254}, "unrestricted_random_baseline": {"accuracy": 0.6535211267605633, "accuracy_std": 0.014276855586417793, "avg_cost": 0.000203363309859155, "cost_std": 1.3985181410780538e-05, "small_model_fraction": 0.7116233926569925}}}}