{"trial": 8, "trial_seed": 49, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5745614035087719, "large_model_accuracy": 0.8464912280701754, "small_model_cost": 3.2026900584795304e-05, "large_model_cost": 0.0006131929824561406, "hybrid_results": {"0.05": {"accuracy": 0.823943661971831, "large_model_usage": 0.8133802816901409, "small_model_usage": 0.18661971830985913, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005384538732394364, "unrestricted": {"accuracy": 0.823943661971831, "large_model_usage": 0.8133802816901409, "avg_cost": 0.0005384538732394364}, "random_baseline": {"accuracy": 0.8007042253521128, "accuracy_std": 0.010349956659647233, "avg_cost": 0.0005448654929577467, "cost_std": 6.966215331543165e-06, "small_model_fraction": 0.1286019806524934}, "unrestricted_random_baseline": {"accuracy": 0.8024647887323944, "accuracy_std": 0.010960832687516485, "avg_cost": 0.0005368070070422539, "cost_std": 1.4796841492775382e-05, "small_model_fraction": 0.1286019806524934}}, "0.1": {"accuracy": 0.778169014084507, "large_model_usage": 0.6338028169014085, "small_model_usage": 0.3661971830985915, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00041555246478873205, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.6338028169014085, "avg_cost": 0.0004273482394366193}, "random_baseline": {"accuracy": 0.7573943661971831, "accuracy_std": 0.01025971991854468, "avg_cost": 0.00043011623239436626, "cost_std": 1.5723480525935904e-05, "small_model_fraction": 0.3400757955987543}, "unrestricted_random_baseline": {"accuracy": 0.7598591549295775, "accuracy_std": 0.0101808678132401, "avg_cost": 0.0004316689788732395, "cost_std": 2.1034031114455338e-05, "small_model_fraction": 0.31977905940605517}}, "0.15": {"accuracy": 0.704225352112676, "large_model_usage": 0.47183098591549294, "small_model_usage": 0.528169014084507, "avg_lambda": 0.5, "avg_cost": 0.00031417922535211245, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.47183098591549294, "avg_cost": 0.0003268482394366195}, "random_baseline": {"accuracy": 0.701056338028169, "accuracy_std": 0.020254897208830527, "avg_cost": 0.0003111304929577465, "cost_std": 1.3513178064669323e-05, "small_model_fraction": 0.5145065523115333}, "unrestricted_random_baseline": {"accuracy": 0.720774647887324, "accuracy_std": 0.012797733693933973, "avg_cost": 0.0003302054577464788, "cost_std": 1.5890830493785933e-05, "small_model_fraction": 0.4927072517678522}}, "0.2": {"accuracy": 0.6514084507042254, "large_model_usage": 0.30985915492957744, "small_model_usage": 0.6901408450704225, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00021347500000000005, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.30985915492957744, "avg_cost": 0.0002266933098591549}, "random_baseline": {"accuracy": 0.6563380281690142, "accuracy_std": 0.018645355344892547, "avg_cost": 0.00021857802816901403, "cost_std": 1.7054818702030604e-05, "small_model_fraction": 0.6877861508521886}, "unrestricted_random_baseline": {"accuracy": 0.6626760563380281, "accuracy_std": 0.017163461418244964, "avg_cost": 0.0002206117253521126, "cost_std": 8.381279257778922e-06, "small_model_fraction": 0.6650416888619224}}, "0.25": {"accuracy": 0.5985915492957746, "large_model_usage": 0.2535211267605634, "small_model_usage": 0.7464788732394366, "avg_lambda": 0.10999999999999963, "avg_cost": 0.00016967922535211268, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2535211267605634, "avg_cost": 0.0001917707746478874}, "random_baseline": {"accuracy": 0.629225352112676, "accuracy_std": 0.010915492957746508, "avg_cost": 0.00017366524647887327, "cost_std": 1.3974098358811078e-05, "small_model_fraction": 0.763144600035709}, "unrestricted_random_baseline": {"accuracy": 0.6397887323943663, "accuracy_std": 0.019030379037847416, "avg_cost": 0.0001883608450704225, "cost_std": 1.6923928464212228e-05, "small_model_fraction": 0.7251321454467552}}}}