{"trial": 7, "trial_seed": 48, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5350877192982456, "large_model_accuracy": 0.8260233918128655, "small_model_cost": 3.2208040935672557e-05, "large_model_cost": 0.000620862573099416, "hybrid_results": {"0.05": {"accuracy": 0.8204225352112676, "large_model_usage": 0.8098591549295775, "small_model_usage": 0.1901408450704225, "avg_lambda": 0.879999999999997, "avg_cost": 0.0005330313380281685, "unrestricted": {"accuracy": 0.8274647887323944, "large_model_usage": 0.8098591549295775, "avg_cost": 0.0005402214788732389}, "random_baseline": {"accuracy": 0.7926056338028169, "accuracy_std": 0.012641776810574842, "avg_cost": 0.0005408788732394361, "cost_std": 7.742439958604492e-06, "small_model_fraction": 0.14920675926574858}, "unrestricted_random_baseline": {"accuracy": 0.7926056338028169, "accuracy_std": 0.010138154964005643, "avg_cost": 0.000545339154929577, "cost_std": 1.1594404984048959e-05, "small_model_fraction": 0.13699222518471244}}, "0.1": {"accuracy": 0.7746478873239436, "large_model_usage": 0.647887323943662, "small_model_usage": 0.352112676056338, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004208693661971829, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.647887323943662, "avg_cost": 0.0004386933098591548}, "random_baseline": {"accuracy": 0.7359154929577466, "accuracy_std": 0.013454206460945615, "avg_cost": 0.00042454591549295734, "cost_std": 1.9463818489505667e-05, "small_model_fraction": 0.339746312947102}, "unrestricted_random_baseline": {"accuracy": 0.7475352112676056, "accuracy_std": 0.01823182178309803, "avg_cost": 0.0004381151760563377, "cost_std": 1.6969285234710373e-05, "small_model_fraction": 0.3094671887951896}}, "0.15": {"accuracy": 0.704225352112676, "large_model_usage": 0.45774647887323944, "small_model_usage": 0.5422535211267605, "avg_lambda": 0.5500000000000005, "avg_cost": 0.0003088341549295774, "unrestricted": {"accuracy": 0.7394366197183099, "large_model_usage": 0.45774647887323944, "avg_cost": 0.0003196651408450703}, "random_baseline": {"accuracy": 0.6943661971830987, "accuracy_std": 0.017871235972464115, "avg_cost": 0.00031316584507042235, "cost_std": 1.8989880964655142e-05, "small_model_fraction": 0.5300705271441672}, "unrestricted_random_baseline": {"accuracy": 0.7014084507042253, "accuracy_std": 0.013709804460515356, "avg_cost": 0.00032255859154929563, "cost_std": 1.5334382976164063e-05, "small_model_fraction": 0.5116709645421755}}, "0.2": {"accuracy": 0.676056338028169, "large_model_usage": 0.352112676056338, "small_model_usage": 0.647887323943662, "avg_lambda": 0.450000000000002, "avg_cost": 0.00024216514084507038, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.352112676056338, "avg_cost": 0.000253975}, "random_baseline": {"accuracy": 0.6633802816901408, "accuracy_std": 0.01916994026593828, "avg_cost": 0.0002400174647887323, "cost_std": 1.8968116381305494e-05, "small_model_fraction": 0.6433271325752827}, "unrestricted_random_baseline": {"accuracy": 0.6732394366197183, "accuracy_std": 0.015332071166955783, "avg_cost": 0.000262375704225352, "cost_std": 1.1596243050288907e-05, "small_model_fraction": 0.6232646706223958}}, "0.25": {"accuracy": 0.6514084507042254, "large_model_usage": 0.2852112676056338, "small_model_usage": 0.7147887323943662, "avg_lambda": 0.3199999999999981, "avg_cost": 0.00019612288732394356, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.2852112676056338, "avg_cost": 0.00021141161971830976}, "random_baseline": {"accuracy": 0.642605633802817, "accuracy_std": 0.013384913949822686, "avg_cost": 0.00019158218309859153, "cost_std": 1.0106049894756217e-05, "small_model_fraction": 0.7215432185907718}, "unrestricted_random_baseline": {"accuracy": 0.655281690140845, "accuracy_std": 0.0126417768105748, "avg_cost": 0.00021127489436619716, "cost_std": 1.5191501640884214e-05, "small_model_fraction": 0.6955708841246313}}}}