{"trial": 20, "trial_seed": 61, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5482456140350878, "large_model_accuracy": 0.8318713450292398, "small_model_cost": 3.20786549707603e-05, "large_model_cost": 0.0006157017543859657, "hybrid_results": {"0.05": {"accuracy": 0.8028169014084507, "large_model_usage": 0.7852112676056338, "small_model_usage": 0.21478873239436624, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005198658450704222, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.7852112676056338, "avg_cost": 0.0005198658450704222}, "random_baseline": {"accuracy": 0.778169014084507, "accuracy_std": 0.0109098122428378, "avg_cost": 0.0005218386971830985, "cost_std": 1.569244596474136e-05, "small_model_fraction": 0.1642085609900838}, "unrestricted_random_baseline": {"accuracy": 0.7753521126760562, "accuracy_std": 0.016043360211119045, "avg_cost": 0.000525872147887324, "cost_std": 1.4963727227016298e-05, "small_model_fraction": 0.1642085609900838}}, "0.1": {"accuracy": 0.7359154929577465, "large_model_usage": 0.5809859154929577, "small_model_usage": 0.41901408450704225, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0003775771126760563, "unrestricted": {"accuracy": 0.7676056338028169, "large_model_usage": 0.5809859154929577, "avg_cost": 0.0003936264084507041}, "random_baseline": {"accuracy": 0.698943661971831, "accuracy_std": 0.01584507042253523, "avg_cost": 0.00037961274647887327, "cost_std": 1.5294661077600725e-05, "small_model_fraction": 0.408010995364152}, "unrestricted_random_baseline": {"accuracy": 0.7179577464788732, "accuracy_std": 0.015464923419716756, "avg_cost": 0.00039056320422535215, "cost_std": 1.6335578373316537e-05, "small_model_fraction": 0.38051157700540417}}, "0.15": {"accuracy": 0.6901408450704225, "large_model_usage": 0.44366197183098594, "small_model_usage": 0.556338028169014, "avg_lambda": 0.5999999999999969, "avg_cost": 0.00029795035211267604, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.44366197183098594, "avg_cost": 0.00030759119718309864}, "random_baseline": {"accuracy": 0.6609154929577464, "accuracy_std": 0.015592668891186268, "avg_cost": 0.00030327042253521137, "cost_std": 1.6003750651447484e-05, "small_model_fraction": 0.5444462403761586}, "unrestricted_random_baseline": {"accuracy": 0.6633802816901408, "accuracy_std": 0.020663944719889, "avg_cost": 0.0003030636267605635, "cost_std": 1.793718439339786e-05, "small_model_fraction": 0.5279272830557873}}, "0.2": {"accuracy": 0.6338028169014085, "large_model_usage": 0.30985915492957744, "small_model_usage": 0.6901408450704225, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00021324612676056333, "unrestricted": {"accuracy": 0.6725352112676056, "large_model_usage": 0.30985915492957744, "avg_cost": 0.00022488697183098593}, "random_baseline": {"accuracy": 0.6176056338028169, "accuracy_std": 0.014276855586417829, "avg_cost": 0.00021271133802816896, "cost_std": 2.118372277277499e-05, "small_model_fraction": 0.6895813891339562}, "unrestricted_random_baseline": {"accuracy": 0.6126760563380282, "accuracy_std": 0.013996202052360423, "avg_cost": 0.00021478651408450697, "cost_std": 1.3554683495201171e-05, "small_model_fraction": 0.6696355626543553}}, "0.25": {"accuracy": 0.5352112676056338, "large_model_usage": 0.19718309859154928, "small_model_usage": 0.8028169014084507, "avg_lambda": 0.15999999999999906, "avg_cost": 0.0001378024647887324, "unrestricted": {"accuracy": 0.6126760563380281, "large_model_usage": 0.19718309859154928, "avg_cost": 0.00015567570422535203}, "random_baseline": {"accuracy": 0.5732394366197184, "accuracy_std": 0.013061434500698157, "avg_cost": 0.00014123429577464784, "cost_std": 1.830606232924984e-05, "small_model_fraction": 0.8188491683692641}, "unrestricted_random_baseline": {"accuracy": 0.5887323943661971, "accuracy_std": 0.015332071166955782, "avg_cost": 0.00016112911971830985, "cost_std": 1.3777475064725941e-05, "small_model_fraction": 0.7882245418688244}}}}