{"trial": 10, "trial_seed": 51, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5716374269005848, "large_model_accuracy": 0.8362573099415205, "small_model_cost": 3.198552631578946e-05, "large_model_cost": 0.0006132543859649138, "hybrid_results": {"0.05": {"accuracy": 0.8133802816901409, "large_model_usage": 0.7852112676056338, "small_model_usage": 0.21478873239436624, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005168672535211268, "unrestricted": {"accuracy": 0.8133802816901409, "large_model_usage": 0.7852112676056338, "avg_cost": 0.0005172757042253522}, "random_baseline": {"accuracy": 0.805281690140845, "accuracy_std": 0.012100591744675765, "avg_cost": 0.0005178505985915489, "cost_std": 1.1493800880909418e-05, "small_model_fraction": 0.16582194425823862}, "unrestricted_random_baseline": {"accuracy": 0.7943661971830985, "accuracy_std": 0.011377108747467298, "avg_cost": 0.0005188478873239434, "cost_std": 1.2526826338876906e-05, "small_model_fraction": 0.1651192561691641}}, "0.1": {"accuracy": 0.7605633802816901, "large_model_usage": 0.6232394366197183, "small_model_usage": 0.37676056338028174, "avg_lambda": 0.7299999999999969, "avg_cost": 0.0004023390845070423, "unrestricted": {"accuracy": 0.7852112676056338, "large_model_usage": 0.6232394366197183, "avg_cost": 0.00041790246478873235}, "random_baseline": {"accuracy": 0.7475352112676057, "accuracy_std": 0.01725352112676054, "avg_cost": 0.00040291940140845043, "cost_std": 1.3410925264365155e-05, "small_model_fraction": 0.36285326137234997}, "unrestricted_random_baseline": {"accuracy": 0.7496478873239437, "accuracy_std": 0.013122989544777603, "avg_cost": 0.00041285133802816876, "cost_std": 1.4951978716749241e-05, "small_model_fraction": 0.33607842211623584}}, "0.15": {"accuracy": 0.7112676056338029, "large_model_usage": 0.4154929577464789, "small_model_usage": 0.5845070422535211, "avg_lambda": 0.5, "avg_cost": 0.000278430633802817, "unrestricted": {"accuracy": 0.7323943661971831, "large_model_usage": 0.4154929577464789, "avg_cost": 0.00028919119718309863}, "random_baseline": {"accuracy": 0.7035211267605634, "accuracy_std": 0.011442307612163338, "avg_cost": 0.0002798588028169014, "cost_std": 1.3421727369169252e-05, "small_model_fraction": 0.5760221739114133}, "unrestricted_random_baseline": {"accuracy": 0.6985915492957746, "accuracy_std": 0.01251858368636, "avg_cost": 0.000288926338028169, "cost_std": 1.0822211597283436e-05, "small_model_fraction": 0.5575099773578648}}, "0.2": {"accuracy": 0.6584507042253521, "large_model_usage": 0.2992957746478873, "small_model_usage": 0.7007042253521127, "avg_lambda": 0.3700000000000011, "avg_cost": 0.0002050080985915492, "unrestricted": {"accuracy": 0.6971830985915493, "large_model_usage": 0.2992957746478873, "avg_cost": 0.00021748697183098585}, "random_baseline": {"accuracy": 0.6566901408450704, "accuracy_std": 0.018380821318504468, "avg_cost": 0.0002023198943661971, "cost_std": 2.0930747212876952e-05, "small_model_fraction": 0.7023364155784938}, "unrestricted_random_baseline": {"accuracy": 0.6693661971830985, "accuracy_std": 0.015058735760804976, "avg_cost": 0.00021664214788732388, "cost_std": 1.7684677747125852e-05, "small_model_fraction": 0.68086808292608}}, "0.25": {"accuracy": 0.6126760563380281, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.2400000000000003, "avg_cost": 0.0001725080985915493, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.25704225352112675, "avg_cost": 0.00019050105633802803}, "random_baseline": {"accuracy": 0.6496478873239436, "accuracy_std": 0.014624337786827576, "avg_cost": 0.00017599676056338025, "cost_std": 1.7253255603590643e-05, "small_model_fraction": 0.7582485799074381}, "unrestricted_random_baseline": {"accuracy": 0.6573943661971832, "accuracy_std": 0.017749412902997946, "avg_cost": 0.00019415080985915487, "cost_std": 1.30612466947621e-05, "small_model_fraction": 0.727293958052519}}}}