{"trial": 18, "trial_seed": 59, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5570175438596491, "large_model_accuracy": 0.8421052631578947, "small_model_cost": 3.2185818713450304e-05, "large_model_cost": 0.0006178684210526318, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.795774647887324, "small_model_usage": 0.204225352112676, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005255750000000001, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.795774647887324, "avg_cost": 0.0005255750000000001}, "random_baseline": {"accuracy": 0.8116197183098592, "accuracy_std": 0.007912748258536691, "avg_cost": 0.0005312355633802816, "cost_std": 1.2648068175445522e-05, "small_model_fraction": 0.15758265771258573}, "unrestricted_random_baseline": {"accuracy": 0.8049295774647888, "accuracy_std": 0.008935617986232065, "avg_cost": 0.0005271319718309858, "cost_std": 1.0725664158805673e-05, "small_model_fraction": 0.15758265771258573}}, "0.1": {"accuracy": 0.7605633802816901, "large_model_usage": 0.5985915492957746, "small_model_usage": 0.4014084507042254, "avg_lambda": 0.7100000000000012, "avg_cost": 0.0003911806338028167, "unrestricted": {"accuracy": 0.7852112676056338, "large_model_usage": 0.5985915492957746, "avg_cost": 0.0004051876760563379}, "random_baseline": {"accuracy": 0.7461267605633803, "accuracy_std": 0.008545535985571536, "avg_cost": 0.00038936919014084483, "cost_std": 1.672173194049332e-05, "small_model_fraction": 0.3870488663047828}, "unrestricted_random_baseline": {"accuracy": 0.7471830985915492, "accuracy_std": 0.013249920931145602, "avg_cost": 0.0004022389788732393, "cost_std": 1.0194322922513927e-05, "small_model_fraction": 0.3631331102321627}}, "0.15": {"accuracy": 0.6971830985915493, "large_model_usage": 0.44014084507042256, "small_model_usage": 0.5598591549295775, "avg_lambda": 0.5500000000000005, "avg_cost": 0.0002951383802816901, "unrestricted": {"accuracy": 0.7359154929577465, "large_model_usage": 0.44014084507042256, "avg_cost": 0.00030528626760563377}, "random_baseline": {"accuracy": 0.6880281690140846, "accuracy_std": 0.012318912453898507, "avg_cost": 0.00029609795774647874, "cost_std": 1.6210736995727645e-05, "small_model_fraction": 0.5510323159369548}, "unrestricted_random_baseline": {"accuracy": 0.7073943661971831, "accuracy_std": 0.018991248676284246, "avg_cost": 0.00029487306338028157, "cost_std": 1.5587268613361505e-05, "small_model_fraction": 0.5337057173946494}}, "0.2": {"accuracy": 0.6654929577464789, "large_model_usage": 0.3345070422535211, "small_model_usage": 0.6654929577464789, "avg_lambda": 0.4399999999999985, "avg_cost": 0.00022686373239436618, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.3345070422535211, "avg_cost": 0.00023927922535211266}, "random_baseline": {"accuracy": 0.6559859154929577, "accuracy_std": 0.01781912806884504, "avg_cost": 0.0002192362323943661, "cost_std": 1.7661213288775743e-05, "small_model_fraction": 0.6676050937771008}, "unrestricted_random_baseline": {"accuracy": 0.6725352112676056, "accuracy_std": 0.011245577058219219, "avg_cost": 0.00024131943661971828, "cost_std": 1.486967875428732e-05, "small_model_fraction": 0.646406763985231}}, "0.25": {"accuracy": 0.6161971830985915, "large_model_usage": 0.25, "small_model_usage": 0.75, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0001707228873239436, "unrestricted": {"accuracy": 0.6690140845070423, "large_model_usage": 0.25, "avg_cost": 0.00018739190140845064}, "random_baseline": {"accuracy": 0.628169014084507, "accuracy_std": 0.009604353307736546, "avg_cost": 0.0001719092957746478, "cost_std": 1.0927081450959608e-05, "small_model_fraction": 0.7634605022290496}, "unrestricted_random_baseline": {"accuracy": 0.6419014084507043, "accuracy_std": 0.011360750566832178, "avg_cost": 0.00018220697183098583, "cost_std": 1.0575473768465163e-05, "small_model_fraction": 0.7349996703417235}}}}