{"trial": 26, "trial_seed": 67, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5423976608187134, "large_model_accuracy": 0.8245614035087719, "small_model_cost": 3.226798245614036e-05, "large_model_cost": 0.0006214649122807027, "hybrid_results": {"0.05": {"accuracy": 0.795774647887324, "large_model_usage": 0.7394366197183099, "small_model_usage": 0.2605633802816901, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004907676056338028, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.7394366197183099, "avg_cost": 0.0004951690140845071}, "random_baseline": {"accuracy": 0.773943661971831, "accuracy_std": 0.017018374610696576, "avg_cost": 0.00047983746478873223, "cost_std": 1.6293265400959474e-05, "small_model_fraction": 0.22182278968394478}, "unrestricted_random_baseline": {"accuracy": 0.7908450704225352, "accuracy_std": 0.013472624274442978, "avg_cost": 0.000497008802816901, "cost_std": 6.22306059451965e-06, "small_model_fraction": 0.2143526074275389}}, "0.1": {"accuracy": 0.7464788732394366, "large_model_usage": 0.5352112676056338, "small_model_usage": 0.46478873239436624, "avg_lambda": 0.6100000000000032, "avg_cost": 0.00035573239436619735, "unrestricted": {"accuracy": 0.7605633802816901, "large_model_usage": 0.5352112676056338, "avg_cost": 0.00036749295774647905}, "random_baseline": {"accuracy": 0.7066901408450704, "accuracy_std": 0.017036578032961915, "avg_cost": 0.00035425838028169, "cost_std": 1.633971961011308e-05, "small_model_fraction": 0.4510079813104747}, "unrestricted_random_baseline": {"accuracy": 0.7183098591549295, "accuracy_std": 0.014172261829224049, "avg_cost": 0.00036865679577464776, "cost_std": 1.9181103314498983e-05, "small_model_fraction": 0.4310476543213584}}, "0.15": {"accuracy": 0.6725352112676056, "large_model_usage": 0.3767605633802817, "small_model_usage": 0.6232394366197183, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00025282394366197197, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.3767605633802817, "avg_cost": 0.00026776760563380295}, "random_baseline": {"accuracy": 0.6514084507042254, "accuracy_std": 0.017885105773521537, "avg_cost": 0.0002474517253521128, "cost_std": 1.2880102665350024e-05, "small_model_fraction": 0.6256668186110478}, "unrestricted_random_baseline": {"accuracy": 0.6545774647887324, "accuracy_std": 0.013768461123955748, "avg_cost": 0.00026740728873239454, "cost_std": 2.1367050907007627e-05, "small_model_fraction": 0.6003040558140987}}, "0.2": {"accuracy": 0.602112676056338, "large_model_usage": 0.2992957746478873, "small_model_usage": 0.7007042253521127, "avg_lambda": 0.29999999999999843, "avg_cost": 0.00020002816901408454, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.2992957746478873, "avg_cost": 0.00021902816901408465}, "random_baseline": {"accuracy": 0.6190140845070423, "accuracy_std": 0.01541272438608403, "avg_cost": 0.00020724000000000003, "cost_std": 1.468598013674503e-05, "small_model_fraction": 0.715273148813087}, "unrestricted_random_baseline": {"accuracy": 0.646830985915493, "accuracy_std": 0.0205345398758075, "avg_cost": 0.0002247302464788733, "cost_std": 2.196100965149287e-05, "small_model_fraction": 0.6830258660486341}}, "0.25": {"accuracy": 0.5669014084507042, "large_model_usage": 0.19014084507042253, "small_model_usage": 0.8098591549295775, "avg_lambda": 0.060000000000000074, "avg_cost": 0.00013210563380281688, "unrestricted": {"accuracy": 0.6549295774647887, "large_model_usage": 0.19014084507042253, "avg_cost": 0.00015075352112676053}, "random_baseline": {"accuracy": 0.5887323943661972, "accuracy_std": 0.01257786697147305, "avg_cost": 0.000134053838028169, "cost_std": 1.4922066095134422e-05, "small_model_fraction": 0.8305530013939415}, "unrestricted_random_baseline": {"accuracy": 0.5943661971830986, "accuracy_std": 0.014670892011267343, "avg_cost": 0.00015068394366197184, "cost_std": 1.0151891453044095e-05, "small_model_fraction": 0.7989033332100013}}}}