{"trial": 8, "trial_seed": 49, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5628654970760234, "large_model_accuracy": 0.8508771929824561, "small_model_cost": 3.223859649122807e-05, "large_model_cost": 0.0006186198830409359, "hybrid_results": {"0.05": {"accuracy": 0.8133802816901409, "large_model_usage": 0.8133802816901409, "small_model_usage": 0.18661971830985913, "avg_lambda": 0.879999999999997, "avg_cost": 0.0005299915492957748, "unrestricted": {"accuracy": 0.8380281690140845, "large_model_usage": 0.8133802816901409, "avg_cost": 0.000540991549295775}, "random_baseline": {"accuracy": 0.8200704225352112, "accuracy_std": 0.01161971830985914, "avg_cost": 0.0005349486971830987, "cost_std": 7.800035155634275e-06, "small_model_fraction": 0.15114454669359234}, "unrestricted_random_baseline": {"accuracy": 0.8313380281690141, "accuracy_std": 0.014298549650878116, "avg_cost": 0.0005365517605633804, "cost_std": 1.409905654493417e-05, "small_model_fraction": 0.13238542144127635}}, "0.1": {"accuracy": 0.7887323943661971, "large_model_usage": 0.6126760563380281, "small_model_usage": 0.38732394366197187, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004031323943661972, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.6126760563380281, "avg_cost": 0.0004173788732394366}, "random_baseline": {"accuracy": 0.7693661971830986, "accuracy_std": 0.018040406278097876, "avg_cost": 0.0004160537323943662, "cost_std": 1.639688466299193e-05, "small_model_fraction": 0.36748698094148974}, "unrestricted_random_baseline": {"accuracy": 0.7732394366197184, "accuracy_std": 0.012318912453898523, "avg_cost": 0.0004213135915492958, "cost_std": 2.135047519421172e-05, "small_model_fraction": 0.34319139170625634}}, "0.15": {"accuracy": 0.7640845070422535, "large_model_usage": 0.5035211267605634, "small_model_usage": 0.4964788732394366, "avg_lambda": 0.6100000000000032, "avg_cost": 0.00033839295774647893, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.5035211267605634, "avg_cost": 0.0003488225352112677}, "random_baseline": {"accuracy": 0.7362676056338029, "accuracy_std": 0.019442869989124655, "avg_cost": 0.00033291221830985916, "cost_std": 1.706919392720086e-05, "small_model_fraction": 0.4778919991518215}, "unrestricted_random_baseline": {"accuracy": 0.7334507042253522, "accuracy_std": 0.022272344508265443, "avg_cost": 0.00034959573943661975, "cost_std": 1.3388363989141646e-05, "small_model_fraction": 0.46010565824357585}}, "0.2": {"accuracy": 0.7359154929577465, "large_model_usage": 0.43309859154929575, "small_model_usage": 0.5669014084507042, "avg_lambda": 0.5, "avg_cost": 0.0002924633802816902, "unrestricted": {"accuracy": 0.7640845070422535, "large_model_usage": 0.43309859154929575, "avg_cost": 0.0003043436619718309}, "random_baseline": {"accuracy": 0.7186619718309859, "accuracy_std": 0.01073221877051282, "avg_cost": 0.000297555, "cost_std": 1.6119857240554643e-05, "small_model_fraction": 0.5562191533743587}, "unrestricted_random_baseline": {"accuracy": 0.7137323943661971, "accuracy_std": 0.013911793144463693, "avg_cost": 0.0003001983450704225, "cost_std": 1.6397405029131886e-05, "small_model_fraction": 0.5359588177145276}}, "0.25": {"accuracy": 0.647887323943662, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.29999999999999843, "avg_cost": 0.0001947591549295774, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00021398450704225342}, "random_baseline": {"accuracy": 0.679225352112676, "accuracy_std": 0.01254331910602595, "avg_cost": 0.00019818246478873233, "cost_std": 1.1226182776050133e-05, "small_model_fraction": 0.7228414989253373}, "unrestricted_random_baseline": {"accuracy": 0.6785211267605634, "accuracy_std": 0.019224836189744618, "avg_cost": 0.000208619190140845, "cost_std": 1.6709108471067122e-05, "small_model_fraction": 0.6900550636251953}}}}