{"trial": 17, "trial_seed": 58, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8157894736842105, "small_model_cost": 3.218011695906433e-05, "large_model_cost": 0.0006190526315789484, "hybrid_results": {"0.05": {"accuracy": 0.7816901408450704, "large_model_usage": 0.778169014084507, "small_model_usage": 0.221830985915493, "avg_lambda": 0.8499999999999961, "avg_cost": 0.0005093940140845071, "unrestricted": {"accuracy": 0.7922535211267606, "large_model_usage": 0.778169014084507, "avg_cost": 0.0005172531690140846}, "random_baseline": {"accuracy": 0.7679577464788732, "accuracy_std": 0.010015114544597093, "avg_cost": 0.0005058264788732395, "cost_std": 1.5682951582474168e-05, "small_model_fraction": 0.186852535708657}, "unrestricted_random_baseline": {"accuracy": 0.772887323943662, "accuracy_std": 0.010113666631228936, "avg_cost": 0.0005155611267605634, "cost_std": 1.3643655132948395e-05, "small_model_fraction": 0.17346094769968745}}, "0.1": {"accuracy": 0.7605633802816901, "large_model_usage": 0.5176056338028169, "small_model_usage": 0.4823943661971831, "avg_lambda": 0.6399999999999962, "avg_cost": 0.0003495630281690137, "unrestricted": {"accuracy": 0.7676056338028169, "large_model_usage": 0.5176056338028169, "avg_cost": 0.00035589401408450666}, "random_baseline": {"accuracy": 0.7007042253521127, "accuracy_std": 0.010680106259227539, "avg_cost": 0.00035396176056338004, "cost_std": 1.833733975971981e-05, "small_model_fraction": 0.4591961570810356}, "unrestricted_random_baseline": {"accuracy": 0.7, "accuracy_std": 0.010058349899356125, "avg_cost": 0.0003543797183098589, "cost_std": 1.3519695419397798e-05, "small_model_fraction": 0.448408488962699}}, "0.15": {"accuracy": 0.7288732394366197, "large_model_usage": 0.45422535211267606, "small_model_usage": 0.545774647887324, "avg_lambda": 0.5100000000000009, "avg_cost": 0.0003068658450704223, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.45422535211267606, "avg_cost": 0.00031689401408450685}, "random_baseline": {"accuracy": 0.6788732394366197, "accuracy_std": 0.014154754395944899, "avg_cost": 0.00030185524647887297, "cost_std": 1.527718637668043e-05, "small_model_fraction": 0.5319499188179373}, "unrestricted_random_baseline": {"accuracy": 0.6827464788732395, "accuracy_std": 0.013678115915179605, "avg_cost": 0.0003127150704225349, "cost_std": 1.5141239279014875e-05, "small_model_fraction": 0.5148624445125853}}, "0.2": {"accuracy": 0.6866197183098591, "large_model_usage": 0.33098591549295775, "small_model_usage": 0.6690140845070423, "avg_lambda": 0.3900000000000005, "avg_cost": 0.00022775316901408447, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.33098591549295775, "avg_cost": 0.0002392954225352113}, "random_baseline": {"accuracy": 0.645774647887324, "accuracy_std": 0.017264296721311637, "avg_cost": 0.0002280701408450704, "cost_std": 1.2807496995503223e-05, "small_model_fraction": 0.6667537715892312}, "unrestricted_random_baseline": {"accuracy": 0.6542253521126761, "accuracy_std": 0.016500527484309812, "avg_cost": 0.00023908499999999983, "cost_std": 1.860066753910231e-05, "small_model_fraction": 0.6470863766549111}}, "0.25": {"accuracy": 0.6584507042253521, "large_model_usage": 0.31338028169014087, "small_model_usage": 0.6866197183098591, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0002112531690140844, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.31338028169014087, "avg_cost": 0.00022802781690140847}, "random_baseline": {"accuracy": 0.6323943661971831, "accuracy_std": 0.014620098233821635, "avg_cost": 0.0002097802464788732, "cost_std": 1.620099790000639e-05, "small_model_fraction": 0.6948689066295679}, "unrestricted_random_baseline": {"accuracy": 0.6443661971830986, "accuracy_std": 0.007551975559692673, "avg_cost": 0.00022069063380281682, "cost_std": 1.801414053402841e-05, "small_model_fraction": 0.666285785986767}}}}