{"trial": 21, "trial_seed": 62, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8435672514619883, "small_model_cost": 3.212368421052637e-05, "large_model_cost": 0.0006187660818713462, "hybrid_results": {"0.05": {"accuracy": 0.7992957746478874, "large_model_usage": 0.7922535211267606, "small_model_usage": 0.20774647887323938, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005238588028169012, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.7922535211267606, "avg_cost": 0.0005246616197183096}, "random_baseline": {"accuracy": 0.7785211267605634, "accuracy_std": 0.010732218770512801, "avg_cost": 0.0005240385211267601, "cost_std": 1.2091114020892258e-05, "small_model_fraction": 0.16178046358885526}, "unrestricted_random_baseline": {"accuracy": 0.7785211267605634, "accuracy_std": 0.013310604346081572, "avg_cost": 0.0005261066901408447, "cost_std": 1.645084799251043e-05, "small_model_fraction": 0.16041196907736138}}, "0.1": {"accuracy": 0.7640845070422535, "large_model_usage": 0.6514084507042254, "small_model_usage": 0.3485915492957746, "avg_lambda": 0.75, "avg_cost": 0.00042653485915492955, "unrestricted": {"accuracy": 0.7711267605633803, "large_model_usage": 0.6514084507042254, "avg_cost": 0.00043750669014084513}, "random_baseline": {"accuracy": 0.7415492957746479, "accuracy_std": 0.014363435249557122, "avg_cost": 0.0004358546478873236, "cost_std": 1.6361026277013783e-05, "small_model_fraction": 0.3276804122629393}, "unrestricted_random_baseline": {"accuracy": 0.7338028169014085, "accuracy_std": 0.01761971268055885, "avg_cost": 0.0004300978169014081, "cost_std": 1.2780775706241057e-05, "small_model_fraction": 0.30897765393918925}}, "0.15": {"accuracy": 0.7394366197183099, "large_model_usage": 0.5316901408450704, "small_model_usage": 0.4683098591549296, "avg_lambda": 0.6100000000000032, "avg_cost": 0.00035183767605633795, "unrestricted": {"accuracy": 0.7535211267605634, "large_model_usage": 0.5316901408450704, "avg_cost": 0.0003632038732394364}, "random_baseline": {"accuracy": 0.697887323943662, "accuracy_std": 0.01848507711043197, "avg_cost": 0.00035192411971830963, "cost_std": 1.643455561496884e-05, "small_model_fraction": 0.4550104235209722}, "unrestricted_random_baseline": {"accuracy": 0.7045774647887325, "accuracy_std": 0.01832677983052552, "avg_cost": 0.00037035147887323907, "cost_std": 1.6342961427980536e-05, "small_model_fraction": 0.4356354222792957}}, "0.2": {"accuracy": 0.6795774647887324, "large_model_usage": 0.36971830985915494, "small_model_usage": 0.630281690140845, "avg_lambda": 0.450000000000002, "avg_cost": 0.00024961232394366196, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.36971830985915494, "avg_cost": 0.0002617742957746479}, "random_baseline": {"accuracy": 0.6577464788732394, "accuracy_std": 0.015250991427963158, "avg_cost": 0.0002504575352112675, "cost_std": 1.392002601331863e-05, "small_model_fraction": 0.6292653913178613}, "unrestricted_random_baseline": {"accuracy": 0.6588028169014084, "accuracy_std": 0.011184774770611668, "avg_cost": 0.0002618730985915492, "cost_std": 1.5160731177693658e-05, "small_model_fraction": 0.6085338999025108}}, "0.25": {"accuracy": 0.6443661971830986, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.3199999999999981, "avg_cost": 0.00019854894366197186, "unrestricted": {"accuracy": 0.6830985915492958, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00021254894366197176}, "random_baseline": {"accuracy": 0.6235915492957746, "accuracy_std": 0.013587169987024493, "avg_cost": 0.0001940893309859155, "cost_std": 1.7991284538151305e-05, "small_model_fraction": 0.7163088448515651}, "unrestricted_random_baseline": {"accuracy": 0.6306338028169014, "accuracy_std": 0.00897023887525144, "avg_cost": 0.00020767496478873233, "cost_std": 1.4018818797825391e-05, "small_model_fraction": 0.6924442212651629}}}}