{"trial": 29, "trial_seed": 70, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5760233918128655, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.206681286549711e-05, "large_model_cost": 0.0006161666666666675, "hybrid_results": {"0.05": {"accuracy": 0.7852112676056338, "large_model_usage": 0.8028169014084507, "small_model_usage": 0.19718309859154926, "avg_lambda": 0.8600000000000062, "avg_cost": 0.0005194929577464786, "unrestricted": {"accuracy": 0.8098591549295775, "large_model_usage": 0.8028169014084507, "avg_cost": 0.0005303521126760563}, "random_baseline": {"accuracy": 0.7834507042253522, "accuracy_std": 0.011914735433583571, "avg_cost": 0.0005197039788732395, "cost_std": 1.2062491886013773e-05, "small_model_fraction": 0.16550887368154868}, "unrestricted_random_baseline": {"accuracy": 0.7883802816901408, "accuracy_std": 0.010015114544597139, "avg_cost": 0.0005312186971830987, "cost_std": 1.0981663288043399e-05, "small_model_fraction": 0.14691760909055593}}, "0.1": {"accuracy": 0.7464788732394366, "large_model_usage": 0.6267605633802817, "small_model_usage": 0.37323943661971826, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004069436619718313, "unrestricted": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6267605633802817, "avg_cost": 0.0004231760563380282}, "random_baseline": {"accuracy": 0.7309859154929578, "accuracy_std": 0.01536438326005028, "avg_cost": 0.00039792489436619717, "cost_std": 9.66492578192993e-06, "small_model_fraction": 0.3581973241959695}, "unrestricted_random_baseline": {"accuracy": 0.7355633802816902, "accuracy_std": 0.020678939239473264, "avg_cost": 0.0004169198239436618, "cost_std": 1.794556486240422e-05, "small_model_fraction": 0.3304068800440652}}, "0.15": {"accuracy": 0.6795774647887324, "large_model_usage": 0.4119718309859155, "small_model_usage": 0.5880281690140845, "avg_lambda": 0.5, "avg_cost": 0.0002792323943661971, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.4119718309859155, "avg_cost": 0.0002898450704225352}, "random_baseline": {"accuracy": 0.679225352112676, "accuracy_std": 0.02091738977504104, "avg_cost": 0.00027722267605633785, "cost_std": 1.6441652406988974e-05, "small_model_fraction": 0.5768436169052081}, "unrestricted_random_baseline": {"accuracy": 0.6852112676056338, "accuracy_std": 0.00850918730534828, "avg_cost": 0.0002857069718309858, "cost_std": 1.4431368058705471e-05, "small_model_fraction": 0.558674333028019}}, "0.2": {"accuracy": 0.6267605633802817, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0001766971830985914, "unrestricted": {"accuracy": 0.6690140845070423, "large_model_usage": 0.25704225352112675, "avg_cost": 0.00019245774647887314}, "random_baseline": {"accuracy": 0.631338028169014, "accuracy_std": 0.011997693688310464, "avg_cost": 0.0001812086971830985, "cost_std": 1.618123984015376e-05, "small_model_fraction": 0.7523875938473922}, "unrestricted_random_baseline": {"accuracy": 0.6383802816901408, "accuracy_std": 0.016444076660331864, "avg_cost": 0.00019135514084507032, "cost_std": 1.911863972212557e-05, "small_model_fraction": 0.7254049413476251}}, "0.25": {"accuracy": 0.5845070422535211, "large_model_usage": 0.16901408450704225, "small_model_usage": 0.8309859154929577, "avg_lambda": 0.07000000000000021, "avg_cost": 0.00012155633802816905, "unrestricted": {"accuracy": 0.6338028169014085, "large_model_usage": 0.16901408450704225, "avg_cost": 0.00013871830985915496}, "random_baseline": {"accuracy": 0.6095070422535211, "accuracy_std": 0.015782348472545775, "avg_cost": 0.000116160809859155, "cost_std": 1.5051042838419968e-05, "small_model_fraction": 0.8467907078211074}, "unrestricted_random_baseline": {"accuracy": 0.6190140845070423, "accuracy_std": 0.010772576437167891, "avg_cost": 0.00013750552816901412, "cost_std": 9.896354472988429e-06, "small_model_fraction": 0.817408793548573}}}}