{"trial": 15, "trial_seed": 56, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5657894736842105, "large_model_accuracy": 0.8201754385964912, "small_model_cost": 3.2195029239766125e-05, "large_model_cost": 0.0006182514619883049, "hybrid_results": {"0.05": {"accuracy": 0.795774647887324, "large_model_usage": 0.8345070422535211, "small_model_usage": 0.16549295774647887, "avg_lambda": 0.879999999999997, "avg_cost": 0.0005446577464788732, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.8345070422535211, "avg_cost": 0.0005516788732394365}, "random_baseline": {"accuracy": 0.7971830985915493, "accuracy_std": 0.01300435585395733, "avg_cost": 0.0005449426760563383, "cost_std": 1.1651708644760782e-05, "small_model_fraction": 0.12557445221492655}, "unrestricted_random_baseline": {"accuracy": 0.8049295774647888, "accuracy_std": 0.009342605043254635, "avg_cost": 0.0005552381338028171, "cost_std": 1.3158882552971497e-05, "small_model_fraction": 0.11359416095247088}}, "0.1": {"accuracy": 0.7570422535211268, "large_model_usage": 0.676056338028169, "small_model_usage": 0.323943661971831, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004354464788732394, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.676056338028169, "avg_cost": 0.0004547704225352112}, "random_baseline": {"accuracy": 0.7369718309859155, "accuracy_std": 0.011997693688310421, "avg_cost": 0.000436560422535211, "cost_std": 1.9859027458885637e-05, "small_model_fraction": 0.3119238573284329}, "unrestricted_random_baseline": {"accuracy": 0.7454225352112676, "accuracy_std": 0.014859822426767018, "avg_cost": 0.00045591411971830957, "cost_std": 9.35753573151214e-06, "small_model_fraction": 0.2789510195910418}}, "0.15": {"accuracy": 0.6971830985915493, "large_model_usage": 0.43661971830985913, "small_model_usage": 0.5633802816901409, "avg_lambda": 0.5, "avg_cost": 0.00029426338028169026, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.43661971830985913, "avg_cost": 0.0003037845070422536}, "random_baseline": {"accuracy": 0.6595070422535212, "accuracy_std": 0.016816836529943036, "avg_cost": 0.00029197823943661954, "cost_std": 1.3135783823148193e-05, "small_model_fraction": 0.5528274473281471}, "unrestricted_random_baseline": {"accuracy": 0.6725352112676057, "accuracy_std": 0.02022733326245782, "avg_cost": 0.00030201922535211256, "cost_std": 2.0701676177982733e-05, "small_model_fraction": 0.5365813552651179}}, "0.2": {"accuracy": 0.6408450704225352, "large_model_usage": 0.3204225352112676, "small_model_usage": 0.6795774647887324, "avg_lambda": 0.35999999999999954, "avg_cost": 0.00021898873239436617, "unrestricted": {"accuracy": 0.6901408450704225, "large_model_usage": 0.3204225352112676, "avg_cost": 0.00023160845070422533}, "random_baseline": {"accuracy": 0.6221830985915492, "accuracy_std": 0.020893667181896915, "avg_cost": 0.00021338429577464794, "cost_std": 1.9988723002418276e-05, "small_model_fraction": 0.6812701086164031}, "unrestricted_random_baseline": {"accuracy": 0.6352112676056338, "accuracy_std": 0.004776288720510746, "avg_cost": 0.00022665250000000005, "cost_std": 1.4449472662605553e-05, "small_model_fraction": 0.659736826828719}}, "0.25": {"accuracy": 0.5774647887323944, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.28000000000000086, "avg_cost": 0.00019149577464788734, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00021193239436619719}, "random_baseline": {"accuracy": 0.6116197183098591, "accuracy_std": 0.018501837542844592, "avg_cost": 0.00019539341549295778, "cost_std": 1.733653579709647e-05, "small_model_fraction": 0.7281819010824289}, "unrestricted_random_baseline": {"accuracy": 0.6260563380281691, "accuracy_std": 0.01752092296426021, "avg_cost": 0.00021165228873239436, "cost_std": 1.0514485205342031e-05, "small_model_fraction": 0.6933104815802754}}}}