{"trial": 16, "trial_seed": 57, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5804093567251462, "large_model_accuracy": 0.8538011695906432, "small_model_cost": 3.2017397660818725e-05, "large_model_cost": 0.00061403216374269, "hybrid_results": {"0.05": {"accuracy": 0.8098591549295775, "large_model_usage": 0.7992957746478874, "small_model_usage": 0.20070422535211263, "avg_lambda": 0.879999999999997, "avg_cost": 0.00051776338028169, "unrestricted": {"accuracy": 0.823943661971831, "large_model_usage": 0.7992957746478874, "avg_cost": 0.0005261295774647888}, "random_baseline": {"accuracy": 0.8024647887323944, "accuracy_std": 0.0059859154929577385, "avg_cost": 0.0005174229577464789, "cost_std": 8.888629843677661e-06, "small_model_fraction": 0.16540608429762416}, "unrestricted_random_baseline": {"accuracy": 0.8119718309859154, "accuracy_std": 0.012715119778357606, "avg_cost": 0.000526841056338028, "cost_std": 1.5436914101618806e-05, "small_model_fraction": 0.15103153974883177}}, "0.1": {"accuracy": 0.7605633802816901, "large_model_usage": 0.6232394366197183, "small_model_usage": 0.37676056338028174, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004030169014084505, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6232394366197183, "avg_cost": 0.00041861549295774635}, "random_baseline": {"accuracy": 0.7640845070422535, "accuracy_std": 0.008182359182128335, "avg_cost": 0.0003964515492957745, "cost_std": 1.4734135298262424e-05, "small_model_fraction": 0.3625599806595908}, "unrestricted_random_baseline": {"accuracy": 0.7725352112676056, "accuracy_std": 0.017689938995887992, "avg_cost": 0.0004211449295774645, "cost_std": 1.6708944433225154e-05, "small_model_fraction": 0.33575895694277735}}, "0.15": {"accuracy": 0.7323943661971831, "large_model_usage": 0.45774647887323944, "small_model_usage": 0.5422535211267605, "avg_lambda": 0.5900000000000027, "avg_cost": 0.0003065380281690138, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.45774647887323944, "avg_cost": 0.00031730563380281664}, "random_baseline": {"accuracy": 0.7355633802816902, "accuracy_std": 0.012141506795469468, "avg_cost": 0.0003084077112676055, "cost_std": 1.2633937612579183e-05, "small_model_fraction": 0.5283270347997002}, "unrestricted_random_baseline": {"accuracy": 0.7320422535211268, "accuracy_std": 0.009375723208235444, "avg_cost": 0.00031018489436619707, "cost_std": 1.6960109522878143e-05, "small_model_fraction": 0.5098264635748662}}, "0.2": {"accuracy": 0.6866197183098591, "large_model_usage": 0.27816901408450706, "small_model_usage": 0.721830985915493, "avg_lambda": 0.3900000000000005, "avg_cost": 0.0001937774647887324, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.27816901408450706, "avg_cost": 0.00020578450704225354}, "random_baseline": {"accuracy": 0.694718309859155, "accuracy_std": 0.011360750566832176, "avg_cost": 0.00019102267605633806, "cost_std": 1.5808853872804055e-05, "small_model_fraction": 0.7220687917990743}, "unrestricted_random_baseline": {"accuracy": 0.6926056338028168, "accuracy_std": 0.014000630529627502, "avg_cost": 0.00020599422535211263, "cost_std": 1.5280561857322535e-05, "small_model_fraction": 0.7014386584188635}}, "0.25": {"accuracy": 0.6619718309859155, "large_model_usage": 0.2323943661971831, "small_model_usage": 0.7676056338028169, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00015893239436619725, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.2323943661971831, "avg_cost": 0.00017783380281690146}, "random_baseline": {"accuracy": 0.6778169014084507, "accuracy_std": 0.018715045444955343, "avg_cost": 0.0001618465140845071, "cost_std": 1.4043126783251494e-05, "small_model_fraction": 0.7819385278490932}, "unrestricted_random_baseline": {"accuracy": 0.6830985915492958, "accuracy_std": 0.01178394403569119, "avg_cost": 0.0001713370070422536, "cost_std": 1.3917744911592968e-05, "small_model_fraction": 0.7494627049796002}}}}