{"trial": 9, "trial_seed": 50, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5614035087719298, "large_model_accuracy": 0.8333333333333334, "small_model_cost": 3.204590643274857e-05, "large_model_cost": 0.000615818713450294, "hybrid_results": {"0.05": {"accuracy": 0.7760416666666666, "large_model_usage": 0.6822916666666666, "small_model_usage": 0.31770833333333337, "avg_lambda": 0.8000000000000057, "avg_cost": 0.0004478492187500001, "unrestricted": {"accuracy": 0.7838541666666666, "large_model_usage": 0.6822916666666666, "avg_cost": 0.00045373984375000003}, "random_baseline": {"accuracy": 0.7499999999999999, "accuracy_std": 0.013380450602776185, "avg_cost": 0.0004485393229166668, "cost_std": 1.3601901791810486e-05, "small_model_fraction": 0.2877309334746823}, "unrestricted_random_baseline": {"accuracy": 0.76328125, "accuracy_std": 0.014934844748895476, "avg_cost": 0.0004535207031250001, "cost_std": 1.569626431295989e-05, "small_model_fraction": 0.2776403212892763}}, "0.1": {"accuracy": 0.7473958333333334, "large_model_usage": 0.59375, "small_model_usage": 0.40625, "avg_lambda": 0.699999999999995, "avg_cost": 0.00038308880208333325, "unrestricted": {"accuracy": 0.7682291666666666, "large_model_usage": 0.59375, "avg_cost": 0.0003985263020833332}, "random_baseline": {"accuracy": 0.7210937500000001, "accuracy_std": 0.01657076903118024, "avg_cost": 0.00038622169270833347, "cost_std": 1.086110185238787e-05, "small_model_fraction": 0.39866521456516907}, "unrestricted_random_baseline": {"accuracy": 0.73125, "accuracy_std": 0.012958182608984091, "avg_cost": 0.00040157690104166663, "cost_std": 1.5657914667947586e-05, "small_model_fraction": 0.3722208515965186}}, "0.15": {"accuracy": 0.6979166666666666, "large_model_usage": 0.4192708333333333, "small_model_usage": 0.5807291666666667, "avg_lambda": 0.5399999999999987, "avg_cost": 0.00028141171874999975, "unrestricted": {"accuracy": 0.7213541666666666, "large_model_usage": 0.4192708333333333, "avg_cost": 0.00029191692708333315}, "random_baseline": {"accuracy": 0.67578125, "accuracy_std": 0.013291458494973792, "avg_cost": 0.00027733552083333314, "cost_std": 1.3698107206275361e-05, "small_model_fraction": 0.5728375674241428}, "unrestricted_random_baseline": {"accuracy": 0.68828125, "accuracy_std": 0.015231592516551842, "avg_cost": 0.00030131304687499975, "cost_std": 1.4819949769349512e-05, "small_model_fraction": 0.5548421962676756}}, "0.2": {"accuracy": 0.6875, "large_model_usage": 0.390625, "small_model_usage": 0.609375, "avg_lambda": 0.5, "avg_cost": 0.0002633752604166665, "unrestricted": {"accuracy": 0.7161458333333334, "large_model_usage": 0.390625, "avg_cost": 0.00027389609374999977}, "random_baseline": {"accuracy": 0.6645833333333333, "accuracy_std": 0.014859731893424086, "avg_cost": 0.00026036729166666647, "cost_std": 1.6039944782621796e-05, "small_model_fraction": 0.603733933470174}, "unrestricted_random_baseline": {"accuracy": 0.6729166666666667, "accuracy_std": 0.012608040038448128, "avg_cost": 0.00027728432291666653, "cost_std": 1.0910501257498993e-05, "small_model_fraction": 0.5857117967641438}}, "0.25": {"accuracy": 0.6197916666666666, "large_model_usage": 0.2682291666666667, "small_model_usage": 0.7317708333333333, "avg_lambda": 0.31999999999999684, "avg_cost": 0.00018258880208333318, "unrestricted": {"accuracy": 0.6796875, "large_model_usage": 0.2682291666666667, "avg_cost": 0.0001976252604166665}, "random_baseline": {"accuracy": 0.6257812500000001, "accuracy_std": 0.011413857812922465, "avg_cost": 0.00018390124999999985, "cost_std": 1.3576519956719346e-05, "small_model_fraction": 0.7421207465628663}, "unrestricted_random_baseline": {"accuracy": 0.6307291666666667, "accuracy_std": 0.015572829570547965, "avg_cost": 0.0001989905989583332, "cost_std": 1.2218739342143532e-05, "small_model_fraction": 0.7163633660330099}}}}