{"trial": 13, "trial_seed": 54, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.564327485380117, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.1960380116959094e-05, "large_model_cost": 0.0006127748538011701, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.7816901408450704, "small_model_usage": 0.21830985915492962, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005140105633802814, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.7816901408450704, "avg_cost": 0.0005144753521126756}, "random_baseline": {"accuracy": 0.7922535211267606, "accuracy_std": 0.013361736592260785, "avg_cost": 0.0005206124999999997, "cost_std": 1.68840111585568e-05, "small_model_fraction": 0.1700444718507254}, "unrestricted_random_baseline": {"accuracy": 0.7996478873239437, "accuracy_std": 0.010847128028696746, "avg_cost": 0.0005203936267605631, "cost_std": 9.066287943761795e-06, "small_model_fraction": 0.16924423571086827}}, "0.1": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6091549295774648, "small_model_usage": 0.39084507042253525, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00039239788732394344, "unrestricted": {"accuracy": 0.7887323943661971, "large_model_usage": 0.6091549295774648, "avg_cost": 0.0004097359154929575}, "random_baseline": {"accuracy": 0.7327464788732395, "accuracy_std": 0.012444082427515683, "avg_cost": 0.00038754035211267595, "cost_std": 1.9860579364807285e-05, "small_model_fraction": 0.3794274703234164}, "unrestricted_random_baseline": {"accuracy": 0.7411971830985916, "accuracy_std": 0.010592682363726269, "avg_cost": 0.00041082098591549275, "cost_std": 1.1454089277777443e-05, "small_model_fraction": 0.3495762373487355}}, "0.15": {"accuracy": 0.6901408450704225, "large_model_usage": 0.40492957746478875, "small_model_usage": 0.5950704225352113, "avg_lambda": 0.5, "avg_cost": 0.0002718204225352111, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.40492957746478875, "avg_cost": 0.0002845809859154928}, "random_baseline": {"accuracy": 0.6651408450704226, "accuracy_std": 0.02150195019827045, "avg_cost": 0.0002623192957746478, "cost_std": 1.8818324812803216e-05, "small_model_fraction": 0.587028124666425}, "unrestricted_random_baseline": {"accuracy": 0.677112676056338, "accuracy_std": 0.020291590893273113, "avg_cost": 0.00028418609154929567, "cost_std": 1.914374163175038e-05, "small_model_fraction": 0.5650580051903397}}, "0.2": {"accuracy": 0.6126760563380281, "large_model_usage": 0.2605633802816901, "small_model_usage": 0.7394366197183099, "avg_lambda": 0.3199999999999981, "avg_cost": 0.00018159507042253513, "unrestricted": {"accuracy": 0.6654929577464789, "large_model_usage": 0.2605633802816901, "avg_cost": 0.00019431338028169014}, "random_baseline": {"accuracy": 0.6165492957746478, "accuracy_std": 0.010847128028696756, "avg_cost": 0.00017219355633802806, "cost_std": 1.4874148671500299e-05, "small_model_fraction": 0.7423709341187451}, "unrestricted_random_baseline": {"accuracy": 0.6253521126760563, "accuracy_std": 0.015604591412788997, "avg_cost": 0.00019535443661971818, "cost_std": 1.717305878999779e-05, "small_model_fraction": 0.7204735633826467}}, "0.25": {"accuracy": 0.545774647887324, "large_model_usage": 0.0880281690140845, "small_model_usage": 0.9119718309859155, "avg_lambda": 0.039999999999999765, "avg_cost": 7.92711267605634e-05, "unrestricted": {"accuracy": 0.5704225352112676, "large_model_usage": 0.0880281690140845, "avg_cost": 8.711619718309861e-05}, "random_baseline": {"accuracy": 0.5563380281690142, "accuracy_std": 0.007216162511239144, "avg_cost": 7.749464788732396e-05, "cost_std": 7.2145248059700605e-06, "small_model_fraction": 0.9185441327873534}, "unrestricted_random_baseline": {"accuracy": 0.5661971830985916, "accuracy_std": 0.008879943811914419, "avg_cost": 8.890426056338031e-05, "cost_std": 8.132018171158642e-06, "small_model_fraction": 0.9050371167297603}}}}