{"trial": 17, "trial_seed": 58, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5497076023391813, "large_model_accuracy": 0.8187134502923976, "small_model_cost": 3.1980116959064336e-05, "large_model_cost": 0.0006142923976608198, "hybrid_results": {"0.05": {"accuracy": 0.7992957746478874, "large_model_usage": 0.7464788732394366, "small_model_usage": 0.2535211267605634, "avg_lambda": 0.8300000000000048, "avg_cost": 0.0004914271126760561, "unrestricted": {"accuracy": 0.8098591549295775, "large_model_usage": 0.7464788732394366, "avg_cost": 0.0004961947183098588}, "random_baseline": {"accuracy": 0.7852112676056339, "accuracy_std": 0.01590364759244538, "avg_cost": 0.0004915065492957745, "cost_std": 1.5935351010505356e-05, "small_model_fraction": 0.21099552432020907}, "unrestricted_random_baseline": {"accuracy": 0.7887323943661972, "accuracy_std": 0.008479996182248135, "avg_cost": 0.0004962438732394367, "cost_std": 1.4620839088698296e-05, "small_model_fraction": 0.20280815511676867}}, "0.1": {"accuracy": 0.778169014084507, "large_model_usage": 0.6161971830985915, "small_model_usage": 0.3838028169014085, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0003979623239436615, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.6161971830985915, "avg_cost": 0.00041608204225352063}, "random_baseline": {"accuracy": 0.7390845070422536, "accuracy_std": 0.008970238875251394, "avg_cost": 0.00040260225352112673, "cost_std": 1.662597798028233e-05, "small_model_fraction": 0.3715018227959314}, "unrestricted_random_baseline": {"accuracy": 0.745774647887324, "accuracy_std": 0.011442307612163308, "avg_cost": 0.0004106684859154929, "cost_std": 8.513551389502729e-06, "small_model_fraction": 0.3403849823816735}}, "0.15": {"accuracy": 0.721830985915493, "large_model_usage": 0.4119718309859155, "small_model_usage": 0.5880281690140845, "avg_lambda": 0.5, "avg_cost": 0.00027679330985915483, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.4119718309859155, "avg_cost": 0.0002883003521126759}, "random_baseline": {"accuracy": 0.6665492957746479, "accuracy_std": 0.023884039270579763, "avg_cost": 0.00026861278169014083, "cost_std": 1.5801433124347453e-05, "small_model_fraction": 0.5795843553135072}, "unrestricted_random_baseline": {"accuracy": 0.6827464788732394, "accuracy_std": 0.019056421306668736, "avg_cost": 0.00028799609154929565, "cost_std": 1.5850779257533845e-05, "small_model_fraction": 0.5598234080780243}}, "0.2": {"accuracy": 0.6725352112676056, "large_model_usage": 0.2852112676056338, "small_model_usage": 0.7147887323943662, "avg_lambda": 0.3400000000000022, "avg_cost": 0.00019563133802816908, "unrestricted": {"accuracy": 0.7077464788732394, "large_model_usage": 0.2852112676056338, "avg_cost": 0.00020977218309859168}, "random_baseline": {"accuracy": 0.6440140845070422, "accuracy_std": 0.018259003045724174, "avg_cost": 0.00019613316901408452, "cost_std": 1.0850792026696748e-05, "small_model_fraction": 0.7189631294193459}, "unrestricted_random_baseline": {"accuracy": 0.6528169014084507, "accuracy_std": 0.016827891754011996, "avg_cost": 0.0002154498591549296, "cost_std": 1.7558377427550534e-05, "small_model_fraction": 0.6946791746770877}}, "0.25": {"accuracy": 0.5809859154929577, "large_model_usage": 0.1619718309859155, "small_model_usage": 0.8380281690140845, "avg_lambda": 0.10000000000000046, "avg_cost": 0.00011997640845070423, "unrestricted": {"accuracy": 0.6408450704225352, "large_model_usage": 0.1619718309859155, "avg_cost": 0.00013373697183098596}, "random_baseline": {"accuracy": 0.6017605633802817, "accuracy_std": 0.008830940988721434, "avg_cost": 0.00012028728873239437, "cost_std": 9.68804443402352e-06, "small_model_fraction": 0.8488847060110188}, "unrestricted_random_baseline": {"accuracy": 0.6091549295774648, "accuracy_std": 0.010082972579772085, "avg_cost": 0.00013002151408450705, "cost_std": 1.1270441932248936e-05, "small_model_fraction": 0.8252538058285623}}}}