{"trial": 18, "trial_seed": 59, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5701754385964912, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.198230994152046e-05, "large_model_cost": 0.0006123947368421063, "hybrid_results": {"0.05": {"accuracy": 0.8380281690140845, "large_model_usage": 0.7676056338028169, "small_model_usage": 0.23239436619718312, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005007003521126758, "unrestricted": {"accuracy": 0.8380281690140845, "large_model_usage": 0.7676056338028169, "avg_cost": 0.0005007003521126758}, "random_baseline": {"accuracy": 0.8225352112676056, "accuracy_std": 0.009984117520251976, "avg_cost": 0.0005036044718309859, "cost_std": 1.5068167979187462e-05, "small_model_fraction": 0.19243968521811422}, "unrestricted_random_baseline": {"accuracy": 0.8172535211267606, "accuracy_std": 0.008250263742154906, "avg_cost": 0.0004991881690140843, "cost_std": 1.4560979846168986e-05, "small_model_fraction": 0.19243968521811422}}, "0.1": {"accuracy": 0.795774647887324, "large_model_usage": 0.5669014084507042, "small_model_usage": 0.43309859154929575, "avg_lambda": 0.6999999999999973, "avg_cost": 0.000362925704225352, "unrestricted": {"accuracy": 0.8274647887323944, "large_model_usage": 0.5669014084507042, "avg_cost": 0.00037762992957746463}, "random_baseline": {"accuracy": 0.7524647887323944, "accuracy_std": 0.020291590893273086, "avg_cost": 0.0003593271126760563, "cost_std": 2.038514056943218e-05, "small_model_fraction": 0.42981338967693017}, "unrestricted_random_baseline": {"accuracy": 0.7573943661971831, "accuracy_std": 0.012038958448787843, "avg_cost": 0.0003723845070422534, "cost_std": 8.546066060770675e-06, "small_model_fraction": 0.4044792915932046}}, "0.15": {"accuracy": 0.7394366197183099, "large_model_usage": 0.36971830985915494, "small_model_usage": 0.630281690140845, "avg_lambda": 0.5, "avg_cost": 0.00024985528169014094, "unrestricted": {"accuracy": 0.7746478873239436, "large_model_usage": 0.36971830985915494, "avg_cost": 0.0002575947183098592}, "random_baseline": {"accuracy": 0.6915492957746477, "accuracy_std": 0.017120064480483404, "avg_cost": 0.0002500614436619717, "cost_std": 1.4648838128717764e-05, "small_model_fraction": 0.624623868044889}, "unrestricted_random_baseline": {"accuracy": 0.6975352112676055, "accuracy_std": 0.011295080938271447, "avg_cost": 0.00024542799295774635, "cost_std": 1.3636928689182355e-05, "small_model_fraction": 0.6112894936224683}}, "0.2": {"accuracy": 0.6936619718309859, "large_model_usage": 0.28169014084507044, "small_model_usage": 0.7183098591549295, "avg_lambda": 0.3700000000000011, "avg_cost": 0.00019393978873239438, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.28169014084507044, "avg_cost": 0.0002051862676056338}, "random_baseline": {"accuracy": 0.654225352112676, "accuracy_std": 0.02009203185589732, "avg_cost": 0.00018496845070422528, "cost_std": 1.7238840597004824e-05, "small_model_fraction": 0.7209613866199761}, "unrestricted_random_baseline": {"accuracy": 0.676056338028169, "accuracy_std": 0.021126760563380278, "avg_cost": 0.00020790327464788723, "cost_std": 1.783342285614196e-05, "small_model_fraction": 0.7015846842063219}}, "0.25": {"accuracy": 0.6197183098591549, "large_model_usage": 0.23943661971830985, "small_model_usage": 0.7605633802816901, "avg_lambda": 0.10999999999999963, "avg_cost": 0.000156531338028169, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.23943661971830985, "avg_cost": 0.00017887640845070423}, "random_baseline": {"accuracy": 0.644718309859155, "accuracy_std": 0.01118477477061167, "avg_cost": 0.00015756950704225346, "cost_std": 9.670494346237704e-06, "small_model_fraction": 0.7854128851931326}, "unrestricted_random_baseline": {"accuracy": 0.6492957746478873, "accuracy_std": 0.011045343057294463, "avg_cost": 0.00017194774647887317, "cost_std": 1.1014150270417119e-05, "small_model_fraction": 0.7469142773293099}}}}