{"trial": 12, "trial_seed": 53, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5526315789473685, "large_model_accuracy": 0.8435672514619883, "small_model_cost": 3.209298245614036e-05, "large_model_cost": 0.0006161286549707615, "hybrid_results": {"0.05": {"accuracy": 0.8204225352112676, "large_model_usage": 0.8485915492957746, "small_model_usage": 0.15140845070422537, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005596228873239435, "unrestricted": {"accuracy": 0.8204225352112676, "large_model_usage": 0.8485915492957746, "avg_cost": 0.0005600242957746476}, "random_baseline": {"accuracy": 0.8028169014084507, "accuracy_std": 0.005222674990912431, "avg_cost": 0.0005649189084507041, "cost_std": 8.38066326958537e-06, "small_model_fraction": 0.09675054162963558}, "unrestricted_random_baseline": {"accuracy": 0.7954225352112677, "accuracy_std": 0.011073369854589035, "avg_cost": 0.0005625440845070421, "cost_std": 6.004863830685424e-06, "small_model_fraction": 0.09606324037460116}}, "0.1": {"accuracy": 0.8063380281690141, "large_model_usage": 0.6971830985915493, "small_model_usage": 0.30281690140845074, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00046359471830985903, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.6971830985915493, "avg_cost": 0.0004672214788732393}, "random_baseline": {"accuracy": 0.7514084507042254, "accuracy_std": 0.013472624274442933, "avg_cost": 0.0004620068661971829, "cost_std": 1.4372996712528842e-05, "small_model_fraction": 0.2611722945006991}, "unrestricted_random_baseline": {"accuracy": 0.7588028169014085, "accuracy_std": 0.012121066533844002, "avg_cost": 0.00047542123239436597, "cost_std": 1.96153344912642e-05, "small_model_fraction": 0.25496246737187844}}, "0.15": {"accuracy": 0.7429577464788732, "large_model_usage": 0.5422535211267606, "small_model_usage": 0.4577464788732394, "avg_lambda": 0.6200000000000024, "avg_cost": 0.0003612707746478873, "unrestricted": {"accuracy": 0.7746478873239436, "large_model_usage": 0.5422535211267606, "avg_cost": 0.0003715242957746479}, "random_baseline": {"accuracy": 0.6971830985915494, "accuracy_std": 0.018565389192202925, "avg_cost": 0.0003626220070422533, "cost_std": 1.462729422071118e-05, "small_model_fraction": 0.43637382495072496}, "unrestricted_random_baseline": {"accuracy": 0.7158450704225352, "accuracy_std": 0.016594185447814356, "avg_cost": 0.0003716591197183096, "cost_std": 1.3642252709051875e-05, "small_model_fraction": 0.41881749815545727}}, "0.2": {"accuracy": 0.647887323943662, "large_model_usage": 0.3767605633802817, "small_model_usage": 0.6232394366197183, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002519961267605633, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.3767605633802817, "avg_cost": 0.0002677214788732394}, "random_baseline": {"accuracy": 0.6397887323943663, "accuracy_std": 0.011893905460601787, "avg_cost": 0.0002466182042253519, "cost_std": 1.5261344152740694e-05, "small_model_fraction": 0.6234765192379277}, "unrestricted_random_baseline": {"accuracy": 0.6362676056338029, "accuracy_std": 0.01168356339650498, "avg_cost": 0.0002562916901408448, "cost_std": 1.431841427701583e-05, "small_model_fraction": 0.5965511911240317}}, "0.25": {"accuracy": 0.5598591549295775, "large_model_usage": 0.28169014084507044, "small_model_usage": 0.7183098591549295, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00018605246478873246, "unrestricted": {"accuracy": 0.6584507042253521, "large_model_usage": 0.28169014084507044, "avg_cost": 0.00020861584507042257}, "random_baseline": {"accuracy": 0.6095070422535211, "accuracy_std": 0.014036008047883637, "avg_cost": 0.00018616176056338024, "cost_std": 1.5566128190270844e-05, "small_model_fraction": 0.7363868517316675}, "unrestricted_random_baseline": {"accuracy": 0.6200704225352112, "accuracy_std": 0.0181226890597997, "avg_cost": 0.0002122899295774647, "cost_std": 1.3170561241872198e-05, "small_model_fraction": 0.6977532864486748}}}}