{"trial": 27, "trial_seed": 68, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8216374269005848, "small_model_cost": 3.2207602339181305e-05, "large_model_cost": 0.0006192397660818729, "hybrid_results": {"0.05": {"accuracy": 0.7676056338028169, "large_model_usage": 0.7394366197183099, "small_model_usage": 0.2605633802816901, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004868654929577463, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.7394366197183099, "avg_cost": 0.0004939218309859154}, "random_baseline": {"accuracy": 0.7411971830985916, "accuracy_std": 0.011704767703235662, "avg_cost": 0.0004946989788732392, "cost_std": 1.2676541797677974e-05, "small_model_fraction": 0.22549747918437565}, "unrestricted_random_baseline": {"accuracy": 0.7380281690140844, "accuracy_std": 0.008212608302599003, "avg_cost": 0.0004976868661971827, "cost_std": 1.1201381509879298e-05, "small_model_fraction": 0.2134771190337825}}, "0.1": {"accuracy": 0.7288732394366197, "large_model_usage": 0.6267605633802817, "small_model_usage": 0.37323943661971826, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004112105633802815, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.6267605633802817, "avg_cost": 0.00042423873239436603}, "random_baseline": {"accuracy": 0.7091549295774647, "accuracy_std": 0.011485567908662064, "avg_cost": 0.00042298373239436584, "cost_std": 1.0805250680497824e-05, "small_model_fraction": 0.3543744543319008}, "unrestricted_random_baseline": {"accuracy": 0.7084507042253521, "accuracy_std": 0.015809819944115268, "avg_cost": 0.00041612172535211245, "cost_std": 1.724417025928512e-05, "small_model_fraction": 0.3321811746127421}}, "0.15": {"accuracy": 0.6971830985915493, "large_model_usage": 0.46830985915492956, "small_model_usage": 0.5316901408450705, "avg_lambda": 0.5, "avg_cost": 0.00031524577464788725, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.46830985915492956, "avg_cost": 0.0003261964788732393}, "random_baseline": {"accuracy": 0.655281690140845, "accuracy_std": 0.019442869989124676, "avg_cost": 0.00030796820422535195, "cost_std": 1.5408307959571597e-05, "small_model_fraction": 0.5178489531064818}, "unrestricted_random_baseline": {"accuracy": 0.6693661971830986, "accuracy_std": 0.015938690560220084, "avg_cost": 0.0003274038732394364, "cost_std": 1.8580615679704886e-05, "small_model_fraction": 0.4991946017749728}}, "0.2": {"accuracy": 0.676056338028169, "large_model_usage": 0.38380281690140844, "small_model_usage": 0.6161971830985915, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00026030915492957727, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.38380281690140844, "avg_cost": 0.0002739499999999998}, "random_baseline": {"accuracy": 0.6355633802816901, "accuracy_std": 0.022115927813298894, "avg_cost": 0.00025983753521126746, "cost_std": 1.357280659291837e-05, "small_model_fraction": 0.611432615316837}, "unrestricted_random_baseline": {"accuracy": 0.6471830985915494, "accuracy_std": 0.0147551597463077, "avg_cost": 0.00027529042253521106, "cost_std": 1.063730029555723e-05, "small_model_fraction": 0.5881956516325071}}, "0.25": {"accuracy": 0.6267605633802817, "large_model_usage": 0.323943661971831, "small_model_usage": 0.676056338028169, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00021480211267605633, "unrestricted": {"accuracy": 0.6830985915492958, "large_model_usage": 0.323943661971831, "avg_cost": 0.00023535845070422507}, "random_baseline": {"accuracy": 0.6235915492957746, "accuracy_std": 0.018728290347848193, "avg_cost": 0.00020583285211267605, "cost_std": 1.6060450260289433e-05, "small_model_fraction": 0.6889531415574871}, "unrestricted_random_baseline": {"accuracy": 0.6242957746478873, "accuracy_std": 0.01696364714147312, "avg_cost": 0.00023123070422535196, "cost_std": 1.116958382846956e-05, "small_model_fraction": 0.6539357450708798}}}}