{"trial": 13, "trial_seed": 54, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5687134502923976, "large_model_accuracy": 0.8318713450292398, "small_model_cost": 3.215862573099415e-05, "large_model_cost": 0.0006180614035087729, "hybrid_results": {"0.05": {"accuracy": 0.7852112676056338, "large_model_usage": 0.8415492957746479, "small_model_usage": 0.15845070422535212, "avg_lambda": 0.8899999999999935, "avg_cost": 0.0005491848591549294, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.8415492957746479, "avg_cost": 0.0005562482394366195}, "random_baseline": {"accuracy": 0.7785211267605634, "accuracy_std": 0.0063867454743721935, "avg_cost": 0.0005563715492957746, "cost_std": 1.193851897807276e-05, "small_model_fraction": 0.1175562686612949}, "unrestricted_random_baseline": {"accuracy": 0.7792253521126761, "accuracy_std": 0.009713460721220951, "avg_cost": 0.0005626067253521126, "cost_std": 8.651316013037482e-06, "small_model_fraction": 0.10550071857757587}}, "0.1": {"accuracy": 0.75, "large_model_usage": 0.602112676056338, "small_model_usage": 0.397887323943662, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00039489612676056336, "unrestricted": {"accuracy": 0.7676056338028169, "large_model_usage": 0.602112676056338, "avg_cost": 0.00040843838028169004}, "random_baseline": {"accuracy": 0.7204225352112676, "accuracy_std": 0.01576269667999959, "avg_cost": 0.0003909738028169011, "cost_std": 2.0253716992284833e-05, "small_model_fraction": 0.3808913103205182}, "unrestricted_random_baseline": {"accuracy": 0.7193661971830987, "accuracy_std": 0.01582941323449933, "avg_cost": 0.000412821478873239, "cost_std": 1.3195388695706735e-05, "small_model_fraction": 0.3577778279566182}}, "0.15": {"accuracy": 0.7147887323943662, "large_model_usage": 0.4225352112676056, "small_model_usage": 0.5774647887323944, "avg_lambda": 0.5, "avg_cost": 0.00028725528169014087, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.4225352112676056, "avg_cost": 0.0002969454225352113}, "random_baseline": {"accuracy": 0.6742957746478873, "accuracy_std": 0.01831324454076877, "avg_cost": 0.00027713640845070423, "cost_std": 1.976208405525844e-05, "small_model_fraction": 0.5646092395624385}, "unrestricted_random_baseline": {"accuracy": 0.6785211267605633, "accuracy_std": 0.012202622148706644, "avg_cost": 0.00030016890845070416, "cost_std": 1.9283448060438532e-05, "small_model_fraction": 0.5480704191086024}}, "0.2": {"accuracy": 0.7007042253521126, "large_model_usage": 0.34507042253521125, "small_model_usage": 0.6549295774647887, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00023852992957746483, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.34507042253521125, "avg_cost": 0.00024940316901408456}, "random_baseline": {"accuracy": 0.6570422535211268, "accuracy_std": 0.020663944719889017, "avg_cost": 0.000234980985915493, "cost_std": 1.9001966150005573e-05, "small_model_fraction": 0.647772102004365}, "unrestricted_random_baseline": {"accuracy": 0.6605633802816901, "accuracy_std": 0.010229464117136572, "avg_cost": 0.00024513330985915496, "cost_std": 2.1634662265494103e-05, "small_model_fraction": 0.6292140069602351}}, "0.25": {"accuracy": 0.6302816901408451, "large_model_usage": 0.3028169014084507, "small_model_usage": 0.6971830985915493, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0002034876760563381, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.3028169014084507, "avg_cost": 0.00022206514084507047}, "random_baseline": {"accuracy": 0.6450704225352112, "accuracy_std": 0.019208706615473022, "avg_cost": 0.00020616073943661978, "cost_std": 1.4235726613039492e-05, "small_model_fraction": 0.7075810922502135}, "unrestricted_random_baseline": {"accuracy": 0.6461267605633803, "accuracy_std": 0.01568779535911652, "avg_cost": 0.00022441947183098596, "cost_std": 1.2666441379920228e-05, "small_model_fraction": 0.6758736733859554}}}}