{"trial": 22, "trial_seed": 63, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5555555555555556, "large_model_accuracy": 0.8084795321637427, "small_model_cost": 3.2250292397660844e-05, "large_model_cost": 0.000619637426900586, "hybrid_results": {"0.05": {"accuracy": 0.778169014084507, "large_model_usage": 0.8309859154929577, "small_model_usage": 0.16901408450704225, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005492876760563372, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.8309859154929577, "avg_cost": 0.0005497031690140836}, "random_baseline": {"accuracy": 0.7661971830985915, "accuracy_std": 0.010229464117136572, "avg_cost": 0.0005487778521126757, "cost_std": 1.1057172820429631e-05, "small_model_fraction": 0.1197672654233091}, "unrestricted_random_baseline": {"accuracy": 0.7693661971830985, "accuracy_std": 0.010474982239889788, "avg_cost": 0.0005517999295774643, "cost_std": 7.111706709152186e-06, "small_model_fraction": 0.11905990747598533}}, "0.1": {"accuracy": 0.7394366197183099, "large_model_usage": 0.6056338028169014, "small_model_usage": 0.3943661971830986, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0003944214788732394, "unrestricted": {"accuracy": 0.7640845070422535, "large_model_usage": 0.6056338028169014, "avg_cost": 0.0004111257042253521}, "random_baseline": {"accuracy": 0.6982394366197183, "accuracy_std": 0.014436619718309887, "avg_cost": 0.0004058049999999998, "cost_std": 1.65361397436195e-05, "small_model_fraction": 0.38341995389111644}, "unrestricted_random_baseline": {"accuracy": 0.7098591549295775, "accuracy_std": 0.012908663929453097, "avg_cost": 0.0004162911619718308, "cost_std": 1.2500625334933494e-05, "small_model_fraction": 0.35498176658514385}}, "0.15": {"accuracy": 0.6690140845070423, "large_model_usage": 0.4154929577464789, "small_model_usage": 0.5845070422535211, "avg_lambda": 0.5300000000000009, "avg_cost": 0.00028336514084507025, "unrestricted": {"accuracy": 0.6866197183098591, "large_model_usage": 0.4154929577464789, "avg_cost": 0.00029330176056338017}, "random_baseline": {"accuracy": 0.6359154929577465, "accuracy_std": 0.01374593049010081, "avg_cost": 0.00027324401408450685, "cost_std": 2.2177832654650593e-05, "small_model_fraction": 0.5724883408283794}, "unrestricted_random_baseline": {"accuracy": 0.647887323943662, "accuracy_std": 0.017105574368297367, "avg_cost": 0.0002903545774647886, "cost_std": 1.8321699396238247e-05, "small_model_fraction": 0.555571695681361}}, "0.2": {"accuracy": 0.6443661971830986, "large_model_usage": 0.3380281690140845, "small_model_usage": 0.6619718309859155, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002292172535211266, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.3380281690140845, "avg_cost": 0.00024417499999999996}, "random_baseline": {"accuracy": 0.6144366197183098, "accuracy_std": 0.016078098671592794, "avg_cost": 0.0002260082042253521, "cost_std": 1.427441488832158e-05, "small_model_fraction": 0.6646726672177651}, "unrestricted_random_baseline": {"accuracy": 0.626056338028169, "accuracy_std": 0.011867816581938563, "avg_cost": 0.00024023341549295765, "cost_std": 1.582594561122447e-05, "small_model_fraction": 0.6392077811141031}}, "0.25": {"accuracy": 0.5774647887323944, "large_model_usage": 0.2711267605633803, "small_model_usage": 0.7288732394366197, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00018358345070422532, "unrestricted": {"accuracy": 0.6408450704225352, "large_model_usage": 0.2711267605633803, "avg_cost": 0.00020180880281690143}, "random_baseline": {"accuracy": 0.5929577464788732, "accuracy_std": 0.01281225732300016, "avg_cost": 0.00018263204225352117, "cost_std": 1.2564502536443518e-05, "small_model_fraction": 0.7423621502458855}, "unrestricted_random_baseline": {"accuracy": 0.6158450704225351, "accuracy_std": 0.00937572320823547, "avg_cost": 0.0002013673943661972, "cost_std": 1.1517937627028689e-05, "small_model_fraction": 0.7113343135056421}}}}