{"trial": 7, "trial_seed": 48, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5453216374269005, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.201330409356728e-05, "large_model_cost": 0.0006156111111111118, "hybrid_results": {"0.05": {"accuracy": 0.8151041666666666, "large_model_usage": 0.8020833333333334, "small_model_usage": 0.19791666666666663, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005285929687499996, "unrestricted": {"accuracy": 0.8151041666666666, "large_model_usage": 0.8020833333333334, "avg_cost": 0.0005285929687499996}, "random_baseline": {"accuracy": 0.7869791666666667, "accuracy_std": 0.00854230180565455, "avg_cost": 0.0005292125000000001, "cost_std": 4.150774116678241e-06, "small_model_fraction": 0.1491063559779555}, "unrestricted_random_baseline": {"accuracy": 0.7864583333333333, "accuracy_std": 0.008869472065586652, "avg_cost": 0.0005350349218749999, "cost_std": 1.2026095707486768e-05, "small_model_fraction": 0.1491063559779555}}, "0.1": {"accuracy": 0.7473958333333334, "large_model_usage": 0.6171875, "small_model_usage": 0.3828125, "avg_lambda": 0.699999999999995, "avg_cost": 0.0003982648437499999, "unrestricted": {"accuracy": 0.7682291666666666, "large_model_usage": 0.6171875, "avg_cost": 0.0004144992187499999}, "random_baseline": {"accuracy": 0.7247395833333334, "accuracy_std": 0.015842667889849105, "avg_cost": 0.0003980176041666663, "cost_std": 1.3858142753179185e-05, "small_model_fraction": 0.3724247499692505}, "unrestricted_random_baseline": {"accuracy": 0.7283854166666667, "accuracy_std": 0.012161765030037096, "avg_cost": 0.00041041656249999963, "cost_std": 8.243693828091611e-06, "small_model_fraction": 0.34460700493185015}}, "0.15": {"accuracy": 0.6822916666666666, "large_model_usage": 0.4192708333333333, "small_model_usage": 0.5807291666666667, "avg_lambda": 0.5, "avg_cost": 0.00028273880208333317, "unrestricted": {"accuracy": 0.7057291666666666, "large_model_usage": 0.4192708333333333, "avg_cost": 0.00029330651041666663}, "random_baseline": {"accuracy": 0.6617187499999999, "accuracy_std": 0.016406249999999997, "avg_cost": 0.0002845196093749998, "cost_std": 1.5065113079373858e-05, "small_model_fraction": 0.5703796433521753}, "unrestricted_random_baseline": {"accuracy": 0.6651041666666667, "accuracy_std": 0.02037916411095194, "avg_cost": 0.0002952119531249999, "cost_std": 9.877867490509963e-06, "small_model_fraction": 0.5522717817285352}}, "0.2": {"accuracy": 0.6432291666666666, "large_model_usage": 0.3385416666666667, "small_model_usage": 0.6614583333333333, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00023209817708333324, "unrestricted": {"accuracy": 0.6875, "large_model_usage": 0.3385416666666667, "avg_cost": 0.00024282734374999997}, "random_baseline": {"accuracy": 0.6390625, "accuracy_std": 0.013740527039867625, "avg_cost": 0.00022608466145833321, "cost_std": 1.6837010814850363e-05, "small_model_fraction": 0.6571528018374633}, "unrestricted_random_baseline": {"accuracy": 0.6442708333333333, "accuracy_std": 0.01260804003844812, "avg_cost": 0.00024320989583333311, "cost_std": 1.1437328210278189e-05, "small_model_fraction": 0.6387682799327328}}, "0.25": {"accuracy": 0.5963541666666666, "large_model_usage": 0.2786458333333333, "small_model_usage": 0.7213541666666667, "avg_lambda": 0.26999999999999935, "avg_cost": 0.00018674921875000002, "unrestricted": {"accuracy": 0.6666666666666666, "large_model_usage": 0.2786458333333333, "avg_cost": 0.0002054315104166667}, "random_baseline": {"accuracy": 0.6151041666666666, "accuracy_std": 0.01151736686848749, "avg_cost": 0.00018537148437499995, "cost_std": 1.3394532228809196e-05, "small_model_fraction": 0.7348586427231366}, "unrestricted_random_baseline": {"accuracy": 0.6236979166666667, "accuracy_std": 0.017671914143045364, "avg_cost": 0.00021237492187499993, "cost_std": 1.5547278351852563e-05, "small_model_fraction": 0.702846370843395}}}}