{"trial": 21, "trial_seed": 62, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5570175438596491, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.1931871345029295e-05, "large_model_cost": 0.0006127192982456147, "hybrid_results": {"0.05": {"accuracy": 0.8380281690140845, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0006451672535211265, "unrestricted": {"accuracy": 0.8380281690140845, "large_model_usage": 1.0, "avg_cost": 0.0006451672535211265}, "random_baseline": {"accuracy": 0.8380281690140844, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006132323943661974, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8380281690140844, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006132323943661974, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.795774647887324, "large_model_usage": 0.7112676056338029, "small_model_usage": 0.28873239436619713, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004690968309859149, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.7112676056338029, "avg_cost": 0.0004732658450704218}, "random_baseline": {"accuracy": 0.7704225352112676, "accuracy_std": 0.01077257643716784, "avg_cost": 0.000480754084507042, "cost_std": 1.3873758252614022e-05, "small_model_fraction": 0.24728921565356812}, "unrestricted_random_baseline": {"accuracy": 0.7640845070422535, "accuracy_std": 0.012985273883511093, "avg_cost": 0.0004685005281690139, "cost_std": 9.272090568767288e-06, "small_model_fraction": 0.24011100570719388}}, "0.15": {"accuracy": 0.7112676056338029, "large_model_usage": 0.4753521126760563, "small_model_usage": 0.5246478873239437, "avg_lambda": 0.5900000000000027, "avg_cost": 0.0003177728873239436, "unrestricted": {"accuracy": 0.7394366197183099, "large_model_usage": 0.4753521126760563, "avg_cost": 0.00032810387323943643}, "random_baseline": {"accuracy": 0.6859154929577465, "accuracy_std": 0.012074949436255087, "avg_cost": 0.0003169811267605633, "cost_std": 1.4727487765527219e-05, "small_model_fraction": 0.5078388361395394}, "unrestricted_random_baseline": {"accuracy": 0.7, "accuracy_std": 0.016043360211119063, "avg_cost": 0.0003332114084507041, "cost_std": 1.629009646194693e-05, "small_model_fraction": 0.49005094088391227}}, "0.2": {"accuracy": 0.6584507042253521, "large_model_usage": 0.34507042253521125, "small_model_usage": 0.6549295774647887, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00023214612676056352, "unrestricted": {"accuracy": 0.6971830985915493, "large_model_usage": 0.34507042253521125, "avg_cost": 0.00024645598591549293}, "random_baseline": {"accuracy": 0.6471830985915494, "accuracy_std": 0.014154754395944899, "avg_cost": 0.00023646788732394368, "cost_std": 1.5003926946460277e-05, "small_model_fraction": 0.6552710231969169}, "unrestricted_random_baseline": {"accuracy": 0.6566901408450704, "accuracy_std": 0.017902427913924147, "avg_cost": 0.0002489941549295774, "cost_std": 1.5319234297976267e-05, "small_model_fraction": 0.6306323025701723}}, "0.25": {"accuracy": 0.5985915492957746, "large_model_usage": 0.2852112676056338, "small_model_usage": 0.7147887323943662, "avg_lambda": 0.25, "avg_cost": 0.0001856109154929579, "unrestricted": {"accuracy": 0.6795774647887324, "large_model_usage": 0.2852112676056338, "avg_cost": 0.00020842077464788754}, "random_baseline": {"accuracy": 0.6246478873239436, "accuracy_std": 0.011485567908662063, "avg_cost": 0.00018273214788732398, "cost_std": 1.42954533579816e-05, "small_model_fraction": 0.7353953666523946}, "unrestricted_random_baseline": {"accuracy": 0.6316901408450705, "accuracy_std": 0.011804968038197361, "avg_cost": 0.0002051116901408451, "cost_std": 1.3839209537968039e-05, "small_model_fraction": 0.6961213429762001}}}}