{"trial": 12, "trial_seed": 53, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5657894736842105, "large_model_accuracy": 0.8435672514619883, "small_model_cost": 3.229239766081873e-05, "large_model_cost": 0.0006212514619883049, "hybrid_results": {"0.05": {"accuracy": 0.823943661971831, "large_model_usage": 0.8485915492957746, "small_model_usage": 0.15140845070422537, "avg_lambda": 0.8600000000000062, "avg_cost": 0.0005508665492957748, "unrestricted": {"accuracy": 0.8345070422535211, "large_model_usage": 0.8485915492957746, "avg_cost": 0.0005615285211267608}, "random_baseline": {"accuracy": 0.8119718309859156, "accuracy_std": 0.01104534305729446, "avg_cost": 0.000554122147887324, "cost_std": 9.942011085239027e-06, "small_model_fraction": 0.1195073086665207}, "unrestricted_random_baseline": {"accuracy": 0.8161971830985916, "accuracy_std": 0.011223505246837469, "avg_cost": 0.0005617598239436621, "cost_std": 6.139771615316515e-06, "small_model_fraction": 0.10140423075029809}}, "0.1": {"accuracy": 0.7852112676056338, "large_model_usage": 0.6795774647887324, "small_model_usage": 0.3204225352112676, "avg_lambda": 0.7299999999999969, "avg_cost": 0.00043921866197183093, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.6795774647887324, "avg_cost": 0.00045592992957746485}, "random_baseline": {"accuracy": 0.752112676056338, "accuracy_std": 0.012518583686359967, "avg_cost": 0.0004358764788732394, "cost_std": 1.7097989805843136e-05, "small_model_fraction": 0.3090754706769502}, "unrestricted_random_baseline": {"accuracy": 0.7676056338028169, "accuracy_std": 0.02759089862557708, "avg_cost": 0.00046149369718309836, "cost_std": 2.2873231770172542e-05, "small_model_fraction": 0.2807012276814442}}, "0.15": {"accuracy": 0.7535211267605634, "large_model_usage": 0.5140845070422535, "small_model_usage": 0.4859154929577465, "avg_lambda": 0.5900000000000027, "avg_cost": 0.0003430566901408449, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.5140845070422535, "avg_cost": 0.0003533735915492956}, "random_baseline": {"accuracy": 0.7024647887323944, "accuracy_std": 0.017051126778596142, "avg_cost": 0.0003404160915492956, "cost_std": 1.4670779513193848e-05, "small_model_fraction": 0.47234992836916745}, "unrestricted_random_baseline": {"accuracy": 0.7070422535211267, "accuracy_std": 0.012577866971473095, "avg_cost": 0.00035221510563380266, "cost_std": 1.2232475135399018e-05, "small_model_fraction": 0.45483274927586115}}, "0.2": {"accuracy": 0.6830985915492958, "large_model_usage": 0.3908450704225352, "small_model_usage": 0.6091549295774648, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002603172535211267, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.3908450704225352, "avg_cost": 0.0002760707746478872}, "random_baseline": {"accuracy": 0.6619718309859155, "accuracy_std": 0.013268622310856865, "avg_cost": 0.00025353073943661965, "cost_std": 1.3280074842451608e-05, "small_model_fraction": 0.6128341175618336}, "unrestricted_random_baseline": {"accuracy": 0.6605633802816901, "accuracy_std": 0.0169014084507042, "avg_cost": 0.0002632419718309859, "cost_std": 1.3746051899142709e-05, "small_model_fraction": 0.5860860427278909}}, "0.25": {"accuracy": 0.6302816901408451, "large_model_usage": 0.31338028169014087, "small_model_usage": 0.6866197183098591, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00021148626760563377, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.31338028169014087, "avg_cost": 0.00022783133802816908}, "random_baseline": {"accuracy": 0.6359154929577465, "accuracy_std": 0.024607612971346705, "avg_cost": 0.00020907073943661966, "cost_std": 1.6474847737561937e-05, "small_model_fraction": 0.6957447795638719}, "unrestricted_random_baseline": {"accuracy": 0.6492957746478873, "accuracy_std": 0.011157027829404816, "avg_cost": 0.00023434264084507044, "cost_std": 1.0955148623097117e-05, "small_model_fraction": 0.6679923067477871}}}}