{"trial": 28, "trial_seed": 69, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5497076023391813, "large_model_accuracy": 0.8508771929824561, "small_model_cost": 3.2026169590643274e-05, "large_model_cost": 0.0006146286549707607, "hybrid_results": {"0.05": {"accuracy": 0.8485915492957746, "large_model_usage": 0.8380281690140845, "small_model_usage": 0.1619718309859155, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005497482394366199, "unrestricted": {"accuracy": 0.8485915492957746, "large_model_usage": 0.8380281690140845, "avg_cost": 0.0005501566901408453}, "random_baseline": {"accuracy": 0.8274647887323944, "accuracy_std": 0.011463958166267383, "avg_cost": 0.0005548038028169017, "cost_std": 9.265202334849103e-06, "small_model_fraction": 0.11136309432633076}, "unrestricted_random_baseline": {"accuracy": 0.8295774647887324, "accuracy_std": 0.009073308962482471, "avg_cost": 0.0005498225000000003, "cost_std": 1.363073711899362e-05, "small_model_fraction": 0.11066201474896012}}, "0.1": {"accuracy": 0.7992957746478874, "large_model_usage": 0.6795774647887324, "small_model_usage": 0.3204225352112676, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0004477200704225348, "unrestricted": {"accuracy": 0.8133802816901409, "large_model_usage": 0.6795774647887324, "avg_cost": 0.0004524595070422533}, "random_baseline": {"accuracy": 0.7672535211267605, "accuracy_std": 0.01706566312120087, "avg_cost": 0.00044145887323943653, "cost_std": 1.5940467171544688e-05, "small_model_fraction": 0.28648793772193887}, "unrestricted_random_baseline": {"accuracy": 0.7679577464788732, "accuracy_std": 0.012038958448787869, "avg_cost": 0.00044894672535211266, "cost_std": 7.31238944553317e-06, "small_model_fraction": 0.2783529971086556}}, "0.15": {"accuracy": 0.7183098591549296, "large_model_usage": 0.4823943661971831, "small_model_usage": 0.5176056338028169, "avg_lambda": 0.5, "avg_cost": 0.0003154806338028165, "unrestricted": {"accuracy": 0.7640845070422535, "large_model_usage": 0.4823943661971831, "avg_cost": 0.00032993838028168976}, "random_baseline": {"accuracy": 0.6827464788732395, "accuracy_std": 0.017353833141340286, "avg_cost": 0.0003163292253521126, "cost_std": 1.9353835509884425e-05, "small_model_fraction": 0.5134684946851297}, "unrestricted_random_baseline": {"accuracy": 0.7, "accuracy_std": 0.014922267676350081, "avg_cost": 0.0003334463732394365, "cost_std": 1.7297199766700466e-05, "small_model_fraction": 0.48865269516199455}}, "0.2": {"accuracy": 0.6408450704225352, "large_model_usage": 0.3380281690140845, "small_model_usage": 0.6619718309859155, "avg_lambda": 0.3400000000000022, "avg_cost": 0.00022115669014084502, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.3380281690140845, "avg_cost": 0.00023972007042253506}, "random_baseline": {"accuracy": 0.629225352112676, "accuracy_std": 0.019289219612397674, "avg_cost": 0.00022115073943661966, "cost_std": 1.6119199325607925e-05, "small_model_fraction": 0.6753695267420563}, "unrestricted_random_baseline": {"accuracy": 0.6450704225352113, "accuracy_std": 0.016349558825695637, "avg_cost": 0.0002351248591549295, "cost_std": 1.6969214528753838e-05, "small_model_fraction": 0.6435066687084549}}, "0.25": {"accuracy": 0.5669014084507042, "large_model_usage": 0.2535211267605634, "small_model_usage": 0.7464788732394366, "avg_lambda": 0.10999999999999963, "avg_cost": 0.0001653961267605635, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2535211267605634, "avg_cost": 0.00018903697183098608}, "random_baseline": {"accuracy": 0.6014084507042254, "accuracy_std": 0.0153320711669558, "avg_cost": 0.00016640380281690147, "cost_std": 1.504151185546303e-05, "small_model_fraction": 0.7710789766320627}, "unrestricted_random_baseline": {"accuracy": 0.6179577464788732, "accuracy_std": 0.019557315601406798, "avg_cost": 0.0001945413732394366, "cost_std": 1.0739322944734292e-05, "small_model_fraction": 0.7305009741970092}}}}