{"trial": 6, "trial_seed": 47, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.827485380116959, "small_model_cost": 3.213333333333337e-05, "large_model_cost": 0.0006172397660818714, "hybrid_results": {"0.05": {"accuracy": 0.8151041666666666, "large_model_usage": 0.7890625, "small_model_usage": 0.2109375, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005190002604166664, "unrestricted": {"accuracy": 0.8151041666666666, "large_model_usage": 0.7890625, "avg_cost": 0.0005193492187499997}, "random_baseline": {"accuracy": 0.7903645833333333, "accuracy_std": 0.011891093809198817, "avg_cost": 0.0005172108854166666, "cost_std": 1.403003192250554e-05, "small_model_fraction": 0.16790023176420213}, "unrestricted_random_baseline": {"accuracy": 0.7859375, "accuracy_std": 0.011036260468967244, "avg_cost": 0.0005174908072916666, "cost_std": 1.0827444461294634e-05, "small_model_fraction": 0.16730383029978121}}, "0.1": {"accuracy": 0.7916666666666666, "large_model_usage": 0.6380208333333334, "small_model_usage": 0.36197916666666663, "avg_lambda": 0.7599999999999959, "avg_cost": 0.0004227763020833331, "unrestricted": {"accuracy": 0.796875, "large_model_usage": 0.6380208333333334, "avg_cost": 0.00042602109374999974}, "random_baseline": {"accuracy": 0.7380208333333333, "accuracy_std": 0.018929981088944014, "avg_cost": 0.00042263455729166656, "cost_std": 1.017426515953828e-05, "small_model_fraction": 0.3323557101996743}, "unrestricted_random_baseline": {"accuracy": 0.7361979166666666, "accuracy_std": 0.01022280543041173, "avg_cost": 0.0004218805208333333, "cost_std": 1.1344119872800925e-05, "small_model_fraction": 0.32681006673199914}}, "0.15": {"accuracy": 0.7526041666666666, "large_model_usage": 0.4947916666666667, "small_model_usage": 0.5052083333333333, "avg_lambda": 0.610000000000006, "avg_cost": 0.00032964609374999963, "unrestricted": {"accuracy": 0.7630208333333334, "large_model_usage": 0.4947916666666667, "avg_cost": 0.00033870859374999975}, "random_baseline": {"accuracy": 0.6906249999999999, "accuracy_std": 0.012532509807829129, "avg_cost": 0.0003305981770833332, "cost_std": 1.2408665410968134e-05, "small_model_fraction": 0.4915236890849075}, "unrestricted_random_baseline": {"accuracy": 0.7067708333333333, "accuracy_std": 0.015459189666308163, "avg_cost": 0.00033737622395833323, "cost_std": 1.0185800068294913e-05, "small_model_fraction": 0.4760350540387519}}, "0.2": {"accuracy": 0.6848958333333334, "large_model_usage": 0.3489583333333333, "small_model_usage": 0.6510416666666667, "avg_lambda": 0.40000000000000285, "avg_cost": 0.00023642734375000003, "unrestricted": {"accuracy": 0.7213541666666666, "large_model_usage": 0.3489583333333333, "avg_cost": 0.0002484690104166665}, "random_baseline": {"accuracy": 0.6455729166666666, "accuracy_std": 0.012072215610936722, "avg_cost": 0.00022998242187499996, "cost_std": 1.0067762500831024e-05, "small_model_fraction": 0.6508429937148437}, "unrestricted_random_baseline": {"accuracy": 0.6625000000000001, "accuracy_std": 0.008652733190539654, "avg_cost": 0.00024836942708333324, "cost_std": 1.3815831699249673e-05, "small_model_fraction": 0.6302626924351248}}, "0.25": {"accuracy": 0.6484375, "large_model_usage": 0.2890625, "small_model_usage": 0.7109375, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00019506796874999986, "unrestricted": {"accuracy": 0.7057291666666666, "large_model_usage": 0.2890625, "avg_cost": 0.00021131276041666666}, "random_baseline": {"accuracy": 0.6330729166666667, "accuracy_std": 0.012294866285534247, "avg_cost": 0.00019495682291666657, "cost_std": 1.0753319138834652e-05, "small_model_fraction": 0.7215299195203154}, "unrestricted_random_baseline": {"accuracy": 0.6388020833333334, "accuracy_std": 0.012813452708481813, "avg_cost": 0.0002087271614583333, "cost_std": 9.257077797008657e-06, "small_model_fraction": 0.6937660961243619}}}}