{"trial": 11, "trial_seed": 52, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5570175438596491, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.225409356725149e-05, "large_model_cost": 0.0006200058479532172, "hybrid_results": {"0.05": {"accuracy": 0.8204225352112676, "large_model_usage": 0.795774647887324, "small_model_usage": 0.204225352112676, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005289771126760562, "unrestricted": {"accuracy": 0.8204225352112676, "large_model_usage": 0.795774647887324, "avg_cost": 0.0005294489436619717}, "random_baseline": {"accuracy": 0.7982394366197184, "accuracy_std": 0.013550620707671752, "avg_cost": 0.0005352524295774647, "cost_std": 1.77151773184849e-05, "small_model_fraction": 0.15487616089255268}, "unrestricted_random_baseline": {"accuracy": 0.7908450704225353, "accuracy_std": 0.01923450744189813, "avg_cost": 0.0005236415845070424, "cost_std": 1.105785096342094e-05, "small_model_fraction": 0.15407338832336087}}, "0.1": {"accuracy": 0.7429577464788732, "large_model_usage": 0.6126760563380281, "small_model_usage": 0.38732394366197187, "avg_lambda": 0.7100000000000012, "avg_cost": 0.0003988080985915492, "unrestricted": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6126760563380281, "avg_cost": 0.0004156249999999998}, "random_baseline": {"accuracy": 0.7169014084507043, "accuracy_std": 0.017829561832636866, "avg_cost": 0.000403003028169014, "cost_std": 1.9076467229129587e-05, "small_model_fraction": 0.3763455365484311}, "unrestricted_random_baseline": {"accuracy": 0.731338028169014, "accuracy_std": 0.017467770077200646, "avg_cost": 0.00041640890845070404, "cost_std": 1.2804072377248817e-05, "small_model_fraction": 0.347733284380814}}, "0.15": {"accuracy": 0.6690140845070423, "large_model_usage": 0.44366197183098594, "small_model_usage": 0.556338028169014, "avg_lambda": 0.5500000000000005, "avg_cost": 0.000300357394366197, "unrestricted": {"accuracy": 0.7077464788732394, "large_model_usage": 0.44366197183098594, "avg_cost": 0.00031026584507042233}, "random_baseline": {"accuracy": 0.6609154929577465, "accuracy_std": 0.01774941290299795, "avg_cost": 0.0003028956690140844, "cost_std": 2.3297274531506043e-05, "small_model_fraction": 0.5438494248663918}, "unrestricted_random_baseline": {"accuracy": 0.665492957746479, "accuracy_std": 0.016515548450082476, "avg_cost": 0.0003106125352112675, "cost_std": 1.4709434896022497e-05, "small_model_fraction": 0.526991200913361}}, "0.2": {"accuracy": 0.6302816901408451, "large_model_usage": 0.3380281690140845, "small_model_usage": 0.6619718309859155, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00022983626760563364, "unrestricted": {"accuracy": 0.6830985915492958, "large_model_usage": 0.3380281690140845, "avg_cost": 0.0002441742957746478}, "random_baseline": {"accuracy": 0.6239436619718309, "accuracy_std": 0.017018374610696597, "avg_cost": 0.0002346591901408449, "cost_std": 2.5645416239914714e-05, "small_model_fraction": 0.663833969760244}, "unrestricted_random_baseline": {"accuracy": 0.6309859154929578, "accuracy_std": 0.016273549307452617, "avg_cost": 0.0002471783098591548, "cost_std": 2.0223050821938725e-05, "small_model_fraction": 0.6394392690009186}}, "0.25": {"accuracy": 0.573943661971831, "large_model_usage": 0.2640845070422535, "small_model_usage": 0.7359154929577465, "avg_lambda": 0.3100000000000012, "avg_cost": 0.000179526408450704, "unrestricted": {"accuracy": 0.6443661971830986, "large_model_usage": 0.2640845070422535, "avg_cost": 0.00019768133802816886}, "random_baseline": {"accuracy": 0.5968309859154929, "accuracy_std": 0.012424139974913612, "avg_cost": 0.00017354517605633796, "cost_std": 1.3060039139420638e-05, "small_model_fraction": 0.7494310926603521}, "unrestricted_random_baseline": {"accuracy": 0.6003521126760564, "accuracy_std": 0.019108399367908632, "avg_cost": 0.00019752661971830977, "cost_std": 1.4301356775630863e-05, "small_model_fraction": 0.7185423212666681}}}}