{"trial": 4, "trial_seed": 45, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5453216374269005, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.205482456140352e-05, "large_model_cost": 0.0006153888888888898, "hybrid_results": {"0.05": {"accuracy": 0.7786458333333334, "large_model_usage": 0.7239583333333334, "small_model_usage": 0.27604166666666663, "avg_lambda": 0.8000000000000057, "avg_cost": 0.00047669374999999986, "unrestricted": {"accuracy": 0.7786458333333334, "large_model_usage": 0.7239583333333334, "avg_cost": 0.00048207395833333317}, "random_baseline": {"accuracy": 0.7497395833333333, "accuracy_std": 0.012939852342505056, "avg_cost": 0.0004808900260416668, "cost_std": 1.2359898199288383e-05, "small_model_fraction": 0.23776279728972918}, "unrestricted_random_baseline": {"accuracy": 0.7526041666666666, "accuracy_std": 0.010925092168439095, "avg_cost": 0.0004859759635416668, "cost_std": 1.4100862111362681e-05, "small_model_fraction": 0.22853959456191308}}, "0.1": {"accuracy": 0.7526041666666666, "large_model_usage": 0.640625, "small_model_usage": 0.359375, "avg_lambda": 0.699999999999995, "avg_cost": 0.0004160791666666664, "unrestricted": {"accuracy": 0.7682291666666666, "large_model_usage": 0.640625, "avg_cost": 0.00043034999999999975}, "random_baseline": {"accuracy": 0.7205729166666667, "accuracy_std": 0.017664237377199594, "avg_cost": 0.00042485093750000016, "cost_std": 1.4447340679608661e-05, "small_model_fraction": 0.341673381361679}, "unrestricted_random_baseline": {"accuracy": 0.7291666666666667, "accuracy_std": 0.016053161466065034, "avg_cost": 0.00043728218750000027, "cost_std": 1.4927313120287307e-05, "small_model_fraction": 0.31720912630435455}}, "0.15": {"accuracy": 0.6927083333333334, "large_model_usage": 0.4635416666666667, "small_model_usage": 0.5364583333333333, "avg_lambda": 0.5, "avg_cost": 0.0003071156250000001, "unrestricted": {"accuracy": 0.7239583333333334, "large_model_usage": 0.4635416666666667, "avg_cost": 0.00032034479166666665}, "random_baseline": {"accuracy": 0.6661458333333334, "accuracy_std": 0.013420936173672016, "avg_cost": 0.00031080216145833335, "cost_std": 1.2708510084073291e-05, "small_model_fraction": 0.5284677901406144}, "unrestricted_random_baseline": {"accuracy": 0.6760416666666667, "accuracy_std": 0.012336165919090647, "avg_cost": 0.00032306390625, "cost_std": 9.977880139810213e-06, "small_model_fraction": 0.50578924713127}}, "0.2": {"accuracy": 0.6536458333333334, "large_model_usage": 0.3723958333333333, "small_model_usage": 0.6276041666666667, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00025007916666666685, "unrestricted": {"accuracy": 0.6979166666666666, "large_model_usage": 0.3723958333333333, "avg_cost": 0.0002628760416666668}, "random_baseline": {"accuracy": 0.64375, "accuracy_std": 0.01231415668493426, "avg_cost": 0.0002606841145833333, "cost_std": 8.054509123132651e-06, "small_model_fraction": 0.626244453327753}, "unrestricted_random_baseline": {"accuracy": 0.63984375, "accuracy_std": 0.00894939597783311, "avg_cost": 0.00025950265624999987, "cost_std": 1.185687065365862e-05, "small_model_fraction": 0.6043069808183202}}, "0.25": {"accuracy": 0.6015625, "large_model_usage": 0.3151041666666667, "small_model_usage": 0.6848958333333333, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00020769895833333327, "unrestricted": {"accuracy": 0.6796875, "large_model_usage": 0.3151041666666667, "avg_cost": 0.00022677187500000004}, "random_baseline": {"accuracy": 0.6145833333333334, "accuracy_std": 0.007547592055307003, "avg_cost": 0.00020182111979166655, "cost_std": 1.0351749365091394e-05, "small_model_fraction": 0.6988961479998153}, "unrestricted_random_baseline": {"accuracy": 0.6309895833333334, "accuracy_std": 0.015928050711846907, "avg_cost": 0.00023143513020833323, "cost_std": 1.9891873300221465e-05, "small_model_fraction": 0.6661997604013032}}}}