{"trial": 1, "trial_seed": 42, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5321637426900585, "large_model_accuracy": 0.8245614035087719, "small_model_cost": 3.2059795321637445e-05, "large_model_cost": 0.0006154649122807024, "hybrid_results": {"0.05": {"accuracy": 0.8125, "large_model_usage": 0.8203125, "small_model_usage": 0.1796875, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005386778645833334, "unrestricted": {"accuracy": 0.8125, "large_model_usage": 0.8203125, "avg_cost": 0.0005386778645833334}, "random_baseline": {"accuracy": 0.7966145833333333, "accuracy_std": 0.009493607361228222, "avg_cost": 0.0005424924739583333, "cost_std": 9.267536493510888e-06, "small_model_fraction": 0.13161874221743722}, "unrestricted_random_baseline": {"accuracy": 0.7942708333333334, "accuracy_std": 0.0078125, "avg_cost": 0.0005368509114583335, "cost_std": 9.178873088319103e-06, "small_model_fraction": 0.13161874221743722}}, "0.1": {"accuracy": 0.7864583333333334, "large_model_usage": 0.671875, "small_model_usage": 0.328125, "avg_lambda": 0.7599999999999959, "avg_cost": 0.0004436674479166664, "unrestricted": {"accuracy": 0.7994791666666666, "large_model_usage": 0.671875, "avg_cost": 0.00044780286458333306}, "random_baseline": {"accuracy": 0.7455729166666666, "accuracy_std": 0.01035463299587033, "avg_cost": 0.00044118132812499995, "cost_std": 9.529625299858931e-06, "small_model_fraction": 0.29447370166979575}, "unrestricted_random_baseline": {"accuracy": 0.7536458333333333, "accuracy_std": 0.016395912748642903, "avg_cost": 0.0004421926041666666, "cost_std": 1.4180262941338643e-05, "small_model_fraction": 0.2873852882389673}}, "0.15": {"accuracy": 0.7682291666666666, "large_model_usage": 0.6588541666666666, "small_model_usage": 0.34114583333333337, "avg_lambda": 0.699999999999995, "avg_cost": 0.00042205807291666625, "unrestricted": {"accuracy": 0.7994791666666666, "large_model_usage": 0.6588541666666666, "avg_cost": 0.0004395007812499996}, "random_baseline": {"accuracy": 0.7380208333333333, "accuracy_std": 0.020937240877335183, "avg_cost": 0.00041214388020833337, "cost_std": 1.553374795395819e-05, "small_model_fraction": 0.3315137864613667}, "unrestricted_random_baseline": {"accuracy": 0.7411458333333333, "accuracy_std": 0.01142276677055277, "avg_cost": 0.0004379324218749999, "cost_std": 1.1320776573266252e-05, "small_model_fraction": 0.3016156799376332}}, "0.2": {"accuracy": 0.7135416666666666, "large_model_usage": 0.4739583333333333, "small_model_usage": 0.5260416666666667, "avg_lambda": 0.5399999999999987, "avg_cost": 0.0003131570312499998, "unrestricted": {"accuracy": 0.734375, "large_model_usage": 0.4739583333333333, "avg_cost": 0.0003264539062499997}, "random_baseline": {"accuracy": 0.6859375000000001, "accuracy_std": 0.013288907106430279, "avg_cost": 0.00031255260416666663, "cost_std": 1.213562751403926e-05, "small_model_fraction": 0.5181783159641266}, "unrestricted_random_baseline": {"accuracy": 0.69453125, "accuracy_std": 0.01774085555234488, "avg_cost": 0.00033816940104166665, "cost_std": 1.2983097299985691e-05, "small_model_fraction": 0.4953864778168912}}, "0.25": {"accuracy": 0.6901041666666666, "large_model_usage": 0.3776041666666667, "small_model_usage": 0.6223958333333333, "avg_lambda": 0.44999999999999857, "avg_cost": 0.00025318307291666644, "unrestricted": {"accuracy": 0.7161458333333334, "large_model_usage": 0.3776041666666667, "avg_cost": 0.0002661153645833331}, "random_baseline": {"accuracy": 0.6591145833333333, "accuracy_std": 0.01657076903118025, "avg_cost": 0.0002569141666666665, "cost_std": 8.9616132116518e-06, "small_model_fraction": 0.6209781656568084}, "unrestricted_random_baseline": {"accuracy": 0.6643229166666667, "accuracy_std": 0.014237434305038155, "avg_cost": 0.00026661932291666645, "cost_std": 9.823165034058509e-06, "small_model_fraction": 0.5988112506080087}}}}