{"trial": 21, "trial_seed": 62, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5614035087719298, "large_model_accuracy": 0.8421052631578947, "small_model_cost": 3.1927192982456195e-05, "large_model_cost": 0.0006130935672514634, "hybrid_results": {"0.05": {"accuracy": 0.7916666666666666, "large_model_usage": 0.7708333333333334, "small_model_usage": 0.22916666666666663, "avg_lambda": 0.8399999999999972, "avg_cost": 0.00050420234375, "unrestricted": {"accuracy": 0.796875, "large_model_usage": 0.7708333333333334, "avg_cost": 0.0005086658854166668}, "random_baseline": {"accuracy": 0.7919270833333333, "accuracy_std": 0.009421902351627884, "avg_cost": 0.0005090430468750002, "cost_std": 1.618837138955373e-05, "small_model_fraction": 0.18736669622090077}, "unrestricted_random_baseline": {"accuracy": 0.7958333333333333, "accuracy_std": 0.007198059875565242, "avg_cost": 0.0005100494270833336, "cost_std": 1.171374092818601e-05, "small_model_fraction": 0.17968637976714671}}, "0.1": {"accuracy": 0.7552083333333334, "large_model_usage": 0.6171875, "small_model_usage": 0.3828125, "avg_lambda": 0.699999999999995, "avg_cost": 0.0003994783854166667, "unrestricted": {"accuracy": 0.7864583333333334, "large_model_usage": 0.6171875, "avg_cost": 0.0004137596354166667}, "random_baseline": {"accuracy": 0.7361979166666666, "accuracy_std": 0.009173914038441939, "avg_cost": 0.00039598994791666673, "cost_std": 4.854331500632631e-06, "small_model_fraction": 0.36756287234181856}, "unrestricted_random_baseline": {"accuracy": 0.7434895833333334, "accuracy_std": 0.014132253699182437, "avg_cost": 0.00041332812499999996, "cost_std": 1.3657497144219429e-05, "small_model_fraction": 0.3429894444349425}}, "0.15": {"accuracy": 0.7083333333333334, "large_model_usage": 0.4765625, "small_model_usage": 0.5234375, "avg_lambda": 0.5800000000000045, "avg_cost": 0.00031784296874999964, "unrestricted": {"accuracy": 0.734375, "large_model_usage": 0.4765625, "avg_cost": 0.0003279523437499997}, "random_baseline": {"accuracy": 0.6903645833333332, "accuracy_std": 0.018504248878635953, "avg_cost": 0.00032082424479166637, "cost_std": 1.3993700872510297e-05, "small_model_fraction": 0.5080311104936702}, "unrestricted_random_baseline": {"accuracy": 0.68984375, "accuracy_std": 0.011318392610469125, "avg_cost": 0.00032447416666666647, "cost_std": 1.1182111685886633e-05, "small_model_fraction": 0.490636134721516}}, "0.2": {"accuracy": 0.625, "large_model_usage": 0.296875, "small_model_usage": 0.703125, "avg_lambda": 0.3499999999999975, "avg_cost": 0.00020372838541666652, "unrestricted": {"accuracy": 0.6901041666666666, "large_model_usage": 0.296875, "avg_cost": 0.0002158429687499999}, "random_baseline": {"accuracy": 0.63984375, "accuracy_std": 0.0079884696094677, "avg_cost": 0.00020590427083333327, "cost_std": 1.0060274402608133e-05, "small_model_fraction": 0.7043855253148422}, "unrestricted_random_baseline": {"accuracy": 0.6354166666666667, "accuracy_std": 0.011587810136086965, "avg_cost": 0.00021431434895833319, "cost_std": 1.6715014472853927e-05, "small_model_fraction": 0.6835402323493448}}, "0.25": {"accuracy": 0.5833333333333334, "large_model_usage": 0.2760416666666667, "small_model_usage": 0.7239583333333333, "avg_lambda": 0.26999999999999935, "avg_cost": 0.00018199401041666676, "unrestricted": {"accuracy": 0.6770833333333334, "large_model_usage": 0.2760416666666667, "avg_cost": 0.0002030773437499999}, "random_baseline": {"accuracy": 0.6223958333333334, "accuracy_std": 0.013431038498153771, "avg_cost": 0.00018089052083333337, "cost_std": 1.1334387480056747e-05, "small_model_fraction": 0.741783378945547}, "unrestricted_random_baseline": {"accuracy": 0.63203125, "accuracy_std": 0.0138315330467599, "avg_cost": 0.0001985913541666666, "cost_std": 1.536036745647361e-05, "small_model_fraction": 0.7055057581698237}}}}