{"trial": 23, "trial_seed": 64, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5511695906432749, "large_model_accuracy": 0.8230994152046783, "small_model_cost": 3.2060087719298305e-05, "large_model_cost": 0.0006151900584795328, "hybrid_results": {"0.05": {"accuracy": 0.7711267605633803, "large_model_usage": 0.795774647887324, "small_model_usage": 0.204225352112676, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005265359154929576, "unrestricted": {"accuracy": 0.7711267605633803, "large_model_usage": 0.795774647887324, "avg_cost": 0.0005265359154929576}, "random_baseline": {"accuracy": 0.7464788732394367, "accuracy_std": 0.008907824394840494, "avg_cost": 0.0005337093661971833, "cost_std": 1.4141661110263868e-05, "small_model_fraction": 0.15203153230315983}, "unrestricted_random_baseline": {"accuracy": 0.7383802816901409, "accuracy_std": 0.014088907763198965, "avg_cost": 0.0005312251056338029, "cost_std": 8.110459488568071e-06, "small_model_fraction": 0.15203153230315983}}, "0.1": {"accuracy": 0.704225352112676, "large_model_usage": 0.5985915492957746, "small_model_usage": 0.4014084507042254, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0003920992957746477, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.5985915492957746, "avg_cost": 0.0004046485915492956}, "random_baseline": {"accuracy": 0.6732394366197183, "accuracy_std": 0.021464437541025564, "avg_cost": 0.0003912922535211265, "cost_std": 1.5121449441527021e-05, "small_model_fraction": 0.382574681273951}, "unrestricted_random_baseline": {"accuracy": 0.6852112676056338, "accuracy_std": 0.014955465198306572, "avg_cost": 0.0004102107394366195, "cost_std": 2.252137971777556e-05, "small_model_fraction": 0.3610541002647341}}, "0.15": {"accuracy": 0.6619718309859155, "large_model_usage": 0.4471830985915493, "small_model_usage": 0.5528169014084507, "avg_lambda": 0.5300000000000009, "avg_cost": 0.000297669718309859, "unrestricted": {"accuracy": 0.6901408450704225, "large_model_usage": 0.4471830985915493, "avg_cost": 0.0003105499999999998}, "random_baseline": {"accuracy": 0.6373239436619719, "accuracy_std": 0.014259476571349709, "avg_cost": 0.0002991493309859153, "cost_std": 1.6959841174591754e-05, "small_model_fraction": 0.5445104112136754}, "unrestricted_random_baseline": {"accuracy": 0.6531690140845071, "accuracy_std": 0.018515234964978734, "avg_cost": 0.00031710507042253513, "cost_std": 1.435369933644091e-05, "small_model_fraction": 0.5224222278995017}}, "0.2": {"accuracy": 0.573943661971831, "large_model_usage": 0.29225352112676056, "small_model_usage": 0.7077464788732395, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0001928528169014084, "unrestricted": {"accuracy": 0.6373239436619719, "large_model_usage": 0.29225352112676056, "avg_cost": 0.00021321901408450703}, "random_baseline": {"accuracy": 0.5827464788732394, "accuracy_std": 0.01470887223535591, "avg_cost": 0.0001974795422535212, "cost_std": 1.4922605396389597e-05, "small_model_fraction": 0.7242591922132172}, "unrestricted_random_baseline": {"accuracy": 0.5904929577464789, "accuracy_std": 0.015271301719860148, "avg_cost": 0.00020618876760563387, "cost_std": 1.7412857890875972e-05, "small_model_fraction": 0.6893335354911884}}, "0.25": {"accuracy": 0.5211267605633803, "large_model_usage": 0.1936619718309859, "small_model_usage": 0.8063380281690141, "avg_lambda": 0.060000000000000074, "avg_cost": 0.00013455704225352115, "unrestricted": {"accuracy": 0.573943661971831, "large_model_usage": 0.1936619718309859, "avg_cost": 0.0001538669014084507}, "random_baseline": {"accuracy": 0.563380281690141, "accuracy_std": 0.015185815952709732, "avg_cost": 0.00014003732394366204, "cost_std": 1.3003684428595913e-05, "small_model_fraction": 0.824229657754349}, "unrestricted_random_baseline": {"accuracy": 0.5651408450704226, "accuracy_std": 0.011490963994423025, "avg_cost": 0.0001546774295774648, "cost_std": 1.0100274117852056e-05, "small_model_fraction": 0.7911154977502679}}}}