{"trial": 4, "trial_seed": 45, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.533625730994152, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.2054824561403526e-05, "large_model_cost": 0.0006148274853801175, "hybrid_results": {"0.05": {"accuracy": 0.8450704225352113, "large_model_usage": 0.8204225352112676, "small_model_usage": 0.17957746478873238, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005396126760563378, "unrestricted": {"accuracy": 0.8450704225352113, "large_model_usage": 0.8204225352112676, "avg_cost": 0.0005400845070422534}, "random_baseline": {"accuracy": 0.8130281690140844, "accuracy_std": 0.008399197494279257, "avg_cost": 0.0005424396126760564, "cost_std": 1.2188660742084318e-05, "small_model_fraction": 0.1290637230959211}, "unrestricted_random_baseline": {"accuracy": 0.8137323943661972, "accuracy_std": 0.013403426979348067, "avg_cost": 0.0005406595070422536, "cost_std": 7.058996405082935e-06, "small_model_fraction": 0.128254091797753}}, "0.1": {"accuracy": 0.8098591549295775, "large_model_usage": 0.6302816901408451, "small_model_usage": 0.3697183098591549, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0004196267605633798, "unrestricted": {"accuracy": 0.8098591549295775, "large_model_usage": 0.6302816901408451, "avg_cost": 0.0004218098591549291}, "random_baseline": {"accuracy": 0.7397887323943662, "accuracy_std": 0.02546191012646607, "avg_cost": 0.000417442957746479, "cost_std": 2.193598724361229e-05, "small_model_fraction": 0.33495175381512926}, "unrestricted_random_baseline": {"accuracy": 0.7556338028169015, "accuracy_std": 0.009208941430015495, "avg_cost": 0.0004321809859154931, "cost_std": 1.775892079362978e-05, "small_model_fraction": 0.33120569855494875}}, "0.15": {"accuracy": 0.721830985915493, "large_model_usage": 0.4788732394366197, "small_model_usage": 0.5211267605633803, "avg_lambda": 0.5100000000000009, "avg_cost": 0.0003129929577464787, "unrestricted": {"accuracy": 0.7711267605633803, "large_model_usage": 0.4788732394366197, "avg_cost": 0.00032897183098591534}, "random_baseline": {"accuracy": 0.6943661971830986, "accuracy_std": 0.022535211267605645, "avg_cost": 0.00031910292253521134, "cost_std": 2.1495540429533664e-05, "small_model_fraction": 0.5179284272011037}, "unrestricted_random_baseline": {"accuracy": 0.7024647887323944, "accuracy_std": 0.022059795926305765, "avg_cost": 0.0003300197887323944, "cost_std": 1.794443994764718e-05, "small_model_fraction": 0.4905097195064281}}, "0.2": {"accuracy": 0.6830985915492958, "large_model_usage": 0.36619718309859156, "small_model_usage": 0.6338028169014085, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00024295070422535207, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.36619718309859156, "avg_cost": 0.0002590070422535209}, "random_baseline": {"accuracy": 0.6524647887323944, "accuracy_std": 0.010915492957746464, "avg_cost": 0.0002469686971830986, "cost_std": 1.974290257294534e-05, "small_model_fraction": 0.6381163808067637}, "unrestricted_random_baseline": {"accuracy": 0.6647887323943661, "accuracy_std": 0.015332071166955782, "avg_cost": 0.00026350471830985915, "cost_std": 1.2255758753519867e-05, "small_model_fraction": 0.6105647485705982}}, "0.25": {"accuracy": 0.5915492957746479, "large_model_usage": 0.27816901408450706, "small_model_usage": 0.721830985915493, "avg_lambda": 0.2700000000000005, "avg_cost": 0.00018101408450704222, "unrestricted": {"accuracy": 0.6795774647887324, "large_model_usage": 0.27816901408450706, "avg_cost": 0.0002033802816901408}, "random_baseline": {"accuracy": 0.6214788732394367, "accuracy_std": 0.01824541747610089, "avg_cost": 0.00018680341549295773, "cost_std": 1.4612540508492329e-05, "small_model_fraction": 0.7443955937528507}, "unrestricted_random_baseline": {"accuracy": 0.6330985915492958, "accuracy_std": 0.012478905032865768, "avg_cost": 0.00020595845070422533, "cost_std": 1.711956048414804e-05, "small_model_fraction": 0.7060166534098408}}}}