{"trial": 8, "trial_seed": 49, "subject": "TruthfulQA", "method": "standard", "calibration_size": 300, "test_size": 384, "small_model_accuracy": 0.5628654970760234, "large_model_accuracy": 0.8523391812865497, "small_model_cost": 3.202690058479531e-05, "large_model_cost": 0.0006141052631578953, "hybrid_results": {"0.05": {"accuracy": 0.8489583333333334, "large_model_usage": 0.7864583333333334, "small_model_usage": 0.21354166666666663, "avg_lambda": 0.8999999999999971, "avg_cost": 0.0005188328125000001, "unrestricted": {"accuracy": 0.8489583333333334, "large_model_usage": 0.7864583333333334, "avg_cost": 0.0005188328125000001}, "random_baseline": {"accuracy": 0.8182291666666666, "accuracy_std": 0.010206207261596567, "avg_cost": 0.0005245832031250002, "cost_std": 7.640574618218018e-06, "small_model_fraction": 0.16367633085816768}, "unrestricted_random_baseline": {"accuracy": 0.8151041666666666, "accuracy_std": 0.01440553821452014, "avg_cost": 0.0005205245833333333, "cost_std": 1.1327211012098589e-05, "small_model_fraction": 0.16367633085816768}}, "0.1": {"accuracy": 0.7942708333333334, "large_model_usage": 0.6067708333333334, "small_model_usage": 0.39322916666666663, "avg_lambda": 0.699999999999995, "avg_cost": 0.00039555677083333323, "unrestricted": {"accuracy": 0.8177083333333334, "large_model_usage": 0.6067708333333334, "avg_cost": 0.0004088536458333333}, "random_baseline": {"accuracy": 0.7583333333333333, "accuracy_std": 0.01326847833630936, "avg_cost": 0.0004032686718750001, "cost_std": 1.69592764553542e-05, "small_model_fraction": 0.3754623198128513}, "unrestricted_random_baseline": {"accuracy": 0.7630208333333334, "accuracy_std": 0.011529137302697223, "avg_cost": 0.00040358669270833334, "cost_std": 1.4541299736842778e-05, "small_model_fraction": 0.3526185313215205}}, "0.15": {"accuracy": 0.7421875, "large_model_usage": 0.4609375, "small_model_usage": 0.5390625, "avg_lambda": 0.5399999999999987, "avg_cost": 0.00030720260416666626, "unrestricted": {"accuracy": 0.7760416666666666, "large_model_usage": 0.4609375, "avg_cost": 0.00031821822916666627}, "random_baseline": {"accuracy": 0.7075520833333333, "accuracy_std": 0.011051612069751535, "avg_cost": 0.0003084176041666665, "cost_std": 1.3737289232875066e-05, "small_model_fraction": 0.5272531650799627}, "unrestricted_random_baseline": {"accuracy": 0.7236979166666668, "accuracy_std": 0.01125831579230065, "avg_cost": 0.00031667216145833313, "cost_std": 1.5936859502316583e-05, "small_model_fraction": 0.5083285224402586}}, "0.2": {"accuracy": 0.6979166666666666, "large_model_usage": 0.3567708333333333, "small_model_usage": 0.6432291666666667, "avg_lambda": 0.40000000000000285, "avg_cost": 0.0002394578124999997, "unrestricted": {"accuracy": 0.7421875, "large_model_usage": 0.3567708333333333, "avg_cost": 0.00025316614583333297}, "random_baseline": {"accuracy": 0.6822916666666667, "accuracy_std": 0.01067390704787458, "avg_cost": 0.00023657874999999983, "cost_std": 1.7853441441735437e-05, "small_model_fraction": 0.6436374803587477}, "unrestricted_random_baseline": {"accuracy": 0.6919270833333334, "accuracy_std": 0.01343356289221763, "avg_cost": 0.00025453874999999983, "cost_std": 1.1026453244512938e-05, "small_model_fraction": 0.6200868139626716}}, "0.25": {"accuracy": 0.6510416666666666, "large_model_usage": 0.2786458333333333, "small_model_usage": 0.7213541666666667, "avg_lambda": 0.3100000000000015, "avg_cost": 0.00018702552083333312, "unrestricted": {"accuracy": 0.7265625, "large_model_usage": 0.2786458333333333, "avg_cost": 0.00020453072916666639}, "random_baseline": {"accuracy": 0.6611979166666666, "accuracy_std": 0.008022355104556967, "avg_cost": 0.00018477700520833318, "cost_std": 1.0235326421564296e-05, "small_model_fraction": 0.733715200195107}, "unrestricted_random_baseline": {"accuracy": 0.6713541666666666, "accuracy_std": 0.014014191715701793, "avg_cost": 0.00020594026041666653, "cost_std": 1.2016452305899774e-05, "small_model_fraction": 0.7036415718678991}}}}