{"trial": 30, "trial_seed": 71, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5730994152046783, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.2219444444444495e-05, "large_model_cost": 0.0006187397660818721, "hybrid_results": {"0.05": {"accuracy": 0.8028169014084507, "large_model_usage": 0.5985915492957746, "small_model_usage": 0.4014084507042254, "avg_lambda": 0.7899999999999981, "avg_cost": 0.0004043232394366197, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.5985915492957746, "avg_cost": 0.00040552042253521134}, "random_baseline": {"accuracy": 0.7471830985915494, "accuracy_std": 0.016650127348643483, "avg_cost": 0.00040870591549295774, "cost_std": 1.4855489320577053e-05, "small_model_fraction": 0.365573908925528}, "unrestricted_random_baseline": {"accuracy": 0.7450704225352113, "accuracy_std": 0.011485567908662021, "avg_cost": 0.00040469679577464796, "cost_std": 2.0510381866119294e-05, "small_model_fraction": 0.36353274674507813}}, "0.1": {"accuracy": 0.7746478873239436, "large_model_usage": 0.4859154929577465, "small_model_usage": 0.5140845070422535, "avg_lambda": 0.6699999999999969, "avg_cost": 0.0003296471830985917, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.4859154929577465, "avg_cost": 0.00033426690140845087}, "random_baseline": {"accuracy": 0.7183098591549295, "accuracy_std": 0.013727879358885885, "avg_cost": 0.00032612989436619735, "cost_std": 1.04921169521466e-05, "small_model_fraction": 0.49289440164017073}, "unrestricted_random_baseline": {"accuracy": 0.7179577464788731, "accuracy_std": 0.014384998743532069, "avg_cost": 0.00033364961267605647, "cost_std": 1.5742391215616823e-05, "small_model_fraction": 0.48501791699090574}}, "0.15": {"accuracy": 0.7077464788732394, "large_model_usage": 0.36971830985915494, "small_model_usage": 0.630281690140845, "avg_lambda": 0.4800000000000006, "avg_cost": 0.000250337323943662, "unrestricted": {"accuracy": 0.7464788732394366, "large_model_usage": 0.36971830985915494, "avg_cost": 0.0002622598591549297}, "random_baseline": {"accuracy": 0.6795774647887323, "accuracy_std": 0.015185815952709715, "avg_cost": 0.0002560749647887325, "cost_std": 1.6103936335848992e-05, "small_model_fraction": 0.6281153926767902}, "unrestricted_random_baseline": {"accuracy": 0.6908450704225352, "accuracy_std": 0.020153645100358004, "avg_cost": 0.00026378728873239447, "cost_std": 1.2574998978998467e-05, "small_model_fraction": 0.6077878187267814}}, "0.2": {"accuracy": 0.6619718309859155, "large_model_usage": 0.2992957746478873, "small_model_usage": 0.7007042253521127, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00020306267605633795, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.2992957746478873, "avg_cost": 0.0002181260563380281}, "random_baseline": {"accuracy": 0.6630281690140845, "accuracy_std": 0.007394366197183089, "avg_cost": 0.0002035836267605634, "cost_std": 1.5972494634076342e-05, "small_model_fraction": 0.7087172851318448}, "unrestricted_random_baseline": {"accuracy": 0.6683098591549296, "accuracy_std": 0.017378820675004295, "avg_cost": 0.00022612084507042256, "cost_std": 1.501247789857586e-05, "small_model_fraction": 0.6830346621672445}}, "0.25": {"accuracy": 0.6126760563380281, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.22999999999999873, "avg_cost": 0.00017225281690140838, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.25704225352112675, "avg_cost": 0.00019241478873239438}, "random_baseline": {"accuracy": 0.652112676056338, "accuracy_std": 0.012870187945420452, "avg_cost": 0.00016596573943661975, "cost_std": 1.7458261540142446e-05, "small_model_fraction": 0.7612471941875373}, "unrestricted_random_baseline": {"accuracy": 0.6609154929577465, "accuracy_std": 0.01189390546060177, "avg_cost": 0.00019496947183098587, "cost_std": 1.4309967713394396e-05, "small_model_fraction": 0.726871621701492}}}}