{"trial": 25, "trial_seed": 66, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5599415204678363, "large_model_accuracy": 0.827485380116959, "small_model_cost": 3.2012426900584826e-05, "large_model_cost": 0.000614564327485381, "hybrid_results": {"0.05": {"accuracy": 0.7922535211267606, "large_model_usage": 0.8309859154929577, "small_model_usage": 0.16901408450704225, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005415602112676056, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.8309859154929577, "avg_cost": 0.0005428419014084507}, "random_baseline": {"accuracy": 0.782394366197183, "accuracy_std": 0.010656863345367305, "avg_cost": 0.0005388514436619719, "cost_std": 8.819826326382058e-06, "small_model_fraction": 0.12531778910083385}, "unrestricted_random_baseline": {"accuracy": 0.7887323943661971, "accuracy_std": 0.006492636941755588, "avg_cost": 0.0005436829929577466, "cost_std": 1.0150716683950183e-05, "small_model_fraction": 0.12311765870977608}}, "0.1": {"accuracy": 0.75, "large_model_usage": 0.6232394366197183, "small_model_usage": 0.37676056338028174, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0004131588028169014, "unrestricted": {"accuracy": 0.7640845070422535, "large_model_usage": 0.6232394366197183, "avg_cost": 0.00041746866197183097}, "random_baseline": {"accuracy": 0.731338028169014, "accuracy_std": 0.014263823310231736, "avg_cost": 0.00041032362676056335, "cost_std": 1.7083269188149573e-05, "small_model_fraction": 0.3457297529478458}, "unrestricted_random_baseline": {"accuracy": 0.7267605633802817, "accuracy_std": 0.01462009823382161, "avg_cost": 0.0004201754929577463, "cost_std": 1.4915842884482609e-05, "small_model_fraction": 0.33833151229220104}}, "0.15": {"accuracy": 0.6690140845070423, "large_model_usage": 0.4119718309859155, "small_model_usage": 0.5880281690140845, "avg_lambda": 0.5, "avg_cost": 0.000275912323943662, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.4119718309859155, "avg_cost": 0.0002883559859154929}, "random_baseline": {"accuracy": 0.6588028169014085, "accuracy_std": 0.01331060434608153, "avg_cost": 0.000277285809859155, "cost_std": 1.4293156226632175e-05, "small_model_fraction": 0.5813250342188607}, "unrestricted_random_baseline": {"accuracy": 0.6746478873239437, "accuracy_std": 0.020843167023871463, "avg_cost": 0.00028816112676056336, "cost_std": 1.8022661946482194e-05, "small_model_fraction": 0.5599644276199649}}, "0.2": {"accuracy": 0.6161971830985915, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00019583485915492942, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00021038415492957732}, "random_baseline": {"accuracy": 0.622887323943662, "accuracy_std": 0.01593869056022013, "avg_cost": 0.0001947480633802816, "cost_std": 2.0208585454883636e-05, "small_model_fraction": 0.718784829145882}, "unrestricted_random_baseline": {"accuracy": 0.6302816901408452, "accuracy_std": 0.012695603082619702, "avg_cost": 0.00019989204225352104, "cost_std": 1.568251978573578e-05, "small_model_fraction": 0.6938097226188198}}, "0.25": {"accuracy": 0.5633802816901409, "large_model_usage": 0.19718309859154928, "small_model_usage": 0.8028169014084507, "avg_lambda": 0.10999999999999963, "avg_cost": 0.00013623626760563367, "unrestricted": {"accuracy": 0.6302816901408451, "large_model_usage": 0.19718309859154928, "avg_cost": 0.0001538911971830984}, "random_baseline": {"accuracy": 0.605281690140845, "accuracy_std": 0.011404320239930722, "avg_cost": 0.00013493556338028156, "cost_std": 1.4413766220712191e-05, "small_model_fraction": 0.8210908923300679}, "unrestricted_random_baseline": {"accuracy": 0.6059859154929577, "accuracy_std": 0.015544887362425435, "avg_cost": 0.00014598144366197174, "cost_std": 9.098109121768004e-06, "small_model_fraction": 0.7907847006246735}}}}