{"trial": 23, "trial_seed": 64, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5453216374269005, "large_model_accuracy": 0.8245614035087719, "small_model_cost": 3.2261842105263195e-05, "large_model_cost": 0.0006199502923976609, "hybrid_results": {"0.05": {"accuracy": 0.8133802816901409, "large_model_usage": 0.8274647887323944, "small_model_usage": 0.17253521126760563, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005465225352112675, "unrestricted": {"accuracy": 0.8133802816901409, "large_model_usage": 0.8274647887323944, "avg_cost": 0.0005474661971830984}, "random_baseline": {"accuracy": 0.796830985915493, "accuracy_std": 0.012202622148706689, "avg_cost": 0.0005512504577464789, "cost_std": 1.2254482076463118e-05, "small_model_fraction": 0.12494333885557943}, "unrestricted_random_baseline": {"accuracy": 0.7919014084507043, "accuracy_std": 0.012543319106025998, "avg_cost": 0.0005514221126760564, "cost_std": 8.764182567256737e-06, "small_model_fraction": 0.12333762077253481}}, "0.1": {"accuracy": 0.7640845070422535, "large_model_usage": 0.6619718309859155, "small_model_usage": 0.3380281690140845, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00043103661971830986, "unrestricted": {"accuracy": 0.7852112676056338, "large_model_usage": 0.6619718309859155, "avg_cost": 0.0004459873239436618}, "random_baseline": {"accuracy": 0.731338028169014, "accuracy_std": 0.02409078659591173, "avg_cost": 0.00042592038732394385, "cost_std": 1.6500496416890536e-05, "small_model_fraction": 0.3214520764962442}, "unrestricted_random_baseline": {"accuracy": 0.7401408450704225, "accuracy_std": 0.01814661792496494, "avg_cost": 0.00044447147887323967, "cost_std": 2.2858327667313815e-05, "small_model_fraction": 0.296012229553679}}, "0.15": {"accuracy": 0.7429577464788732, "large_model_usage": 0.5316901408450704, "small_model_usage": 0.4683098591549296, "avg_lambda": 0.5999999999999969, "avg_cost": 0.00035401549295774635, "unrestricted": {"accuracy": 0.7605633802816901, "large_model_usage": 0.5316901408450704, "avg_cost": 0.00036512816901408446}, "random_baseline": {"accuracy": 0.6985915492957747, "accuracy_std": 0.022502176491085168, "avg_cost": 0.00035632771126760577, "cost_std": 1.7635635735333662e-05, "small_model_fraction": 0.4525098277966867}, "unrestricted_random_baseline": {"accuracy": 0.707394366197183, "accuracy_std": 0.01578234847254578, "avg_cost": 0.0003724378873239438, "cost_std": 1.4747522677352349e-05, "small_model_fraction": 0.43360069992322053}}, "0.2": {"accuracy": 0.7077464788732394, "large_model_usage": 0.4612676056338028, "small_model_usage": 0.5387323943661972, "avg_lambda": 0.5, "avg_cost": 0.0003067056338028168, "unrestricted": {"accuracy": 0.7394366197183099, "large_model_usage": 0.4612676056338028, "avg_cost": 0.00032090281690140847}, "random_baseline": {"accuracy": 0.6697183098591548, "accuracy_std": 0.01642521660519801, "avg_cost": 0.0003082880281690142, "cost_std": 1.550649688823667e-05, "small_model_fraction": 0.5330114254227606}, "unrestricted_random_baseline": {"accuracy": 0.6788732394366197, "accuracy_std": 0.01927314392028731, "avg_cost": 0.0003107884507042255, "cost_std": 2.5015839506501415e-05, "small_model_fraction": 0.5088537563524769}}, "0.25": {"accuracy": 0.6267605633802817, "large_model_usage": 0.3274647887323944, "small_model_usage": 0.6725352112676056, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00022101549295774642, "unrestricted": {"accuracy": 0.6795774647887324, "large_model_usage": 0.3274647887323944, "avg_cost": 0.00023733239436619713}, "random_baseline": {"accuracy": 0.6401408450704225, "accuracy_std": 0.018417882860031022, "avg_cost": 0.0002253747887323944, "cost_std": 1.538522733178113e-05, "small_model_fraction": 0.6788202137398294}, "unrestricted_random_baseline": {"accuracy": 0.6352112676056338, "accuracy_std": 0.012812257323000188, "avg_cost": 0.00023217433098591552, "cost_std": 1.698048242213064e-05, "small_model_fraction": 0.6510556704680798}}}}