{"trial": 5, "trial_seed": 46, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5760233918128655, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.2044005847953286e-05, "large_model_cost": 0.0006139795321637432, "hybrid_results": {"0.05": {"accuracy": 0.828125, "large_model_usage": 0.8151041666666666, "small_model_usage": 0.18489583333333337, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005350078124999994, "unrestricted": {"accuracy": 0.828125, "large_model_usage": 0.8151041666666666, "avg_cost": 0.0005350078124999994}, "random_baseline": {"accuracy": 0.7963541666666667, "accuracy_std": 0.009081039465709734, "avg_cost": 0.0005293816406249999, "cost_std": 5.490875131217643e-06, "small_model_fraction": 0.13570527333794266}, "unrestricted_random_baseline": {"accuracy": 0.7955729166666666, "accuracy_std": 0.007920263711325785, "avg_cost": 0.0005266174218749999, "cost_std": 8.078704583806372e-06, "small_model_fraction": 0.13570527333794266}}, "0.1": {"accuracy": 0.78125, "large_model_usage": 0.6328125, "small_model_usage": 0.3671875, "avg_lambda": 0.699999999999995, "avg_cost": 0.00040510156249999986, "unrestricted": {"accuracy": 0.8072916666666666, "large_model_usage": 0.6328125, "avg_cost": 0.00042244010416666607}, "random_baseline": {"accuracy": 0.7385416666666667, "accuracy_std": 0.014414950524060464, "avg_cost": 0.00041109411458333314, "cost_std": 1.0052703033170399e-05, "small_model_fraction": 0.3589366179207879}, "unrestricted_random_baseline": {"accuracy": 0.74140625, "accuracy_std": 0.01597057094423122, "avg_cost": 0.0004231751302083332, "cost_std": 1.4425909237560085e-05, "small_model_fraction": 0.329142008582472}}, "0.15": {"accuracy": 0.7161458333333334, "large_model_usage": 0.4322916666666667, "small_model_usage": 0.5677083333333333, "avg_lambda": 0.5, "avg_cost": 0.0002855182291666667, "unrestricted": {"accuracy": 0.7447916666666666, "large_model_usage": 0.4322916666666667, "avg_cost": 0.00029849739583333345}, "random_baseline": {"accuracy": 0.6804687500000001, "accuracy_std": 0.015052443767310872, "avg_cost": 0.00028086296875, "cost_std": 1.525385232767179e-05, "small_model_fraction": 0.5644290271751435}, "unrestricted_random_baseline": {"accuracy": 0.6919270833333332, "accuracy_std": 0.014360745498918996, "avg_cost": 0.0002943583072916666, "cost_std": 8.276547281113772e-06, "small_model_fraction": 0.5421255827560731}}, "0.2": {"accuracy": 0.6822916666666666, "large_model_usage": 0.3411458333333333, "small_model_usage": 0.6588541666666667, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00022985156250000002, "unrestricted": {"accuracy": 0.7213541666666666, "large_model_usage": 0.3411458333333333, "avg_cost": 0.00024242968750000005}, "random_baseline": {"accuracy": 0.6710937499999999, "accuracy_std": 0.011590735981389511, "avg_cost": 0.00022935859374999994, "cost_std": 1.0079368715651559e-05, "small_model_fraction": 0.6600868176851853}, "unrestricted_random_baseline": {"accuracy": 0.6723958333333333, "accuracy_std": 0.013420936173671991, "avg_cost": 0.0002473309375, "cost_std": 1.3760603464481652e-05, "small_model_fraction": 0.6384725246386143}}, "0.25": {"accuracy": 0.65625, "large_model_usage": 0.2838541666666667, "small_model_usage": 0.7161458333333333, "avg_lambda": 0.3100000000000015, "avg_cost": 0.0001909244791666668, "unrestricted": {"accuracy": 0.7005208333333334, "large_model_usage": 0.2838541666666667, "avg_cost": 0.00020708593750000005}, "random_baseline": {"accuracy": 0.6471354166666666, "accuracy_std": 0.016397980720240807, "avg_cost": 0.00018963463541666665, "cost_std": 1.321364003595047e-05, "small_model_fraction": 0.7269792509067468}, "unrestricted_random_baseline": {"accuracy": 0.6598958333333332, "accuracy_std": 0.014320549046737012, "avg_cost": 0.0002098677864583333, "cost_std": 1.3600773773147102e-05, "small_model_fraction": 0.699207345596805}}}}