{"trial": 30, "trial_seed": 71, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.564327485380117, "large_model_accuracy": 0.8260233918128655, "small_model_cost": 3.202997076023393e-05, "large_model_cost": 0.0006142368421052644, "hybrid_results": {"0.05": {"accuracy": 0.8204225352112676, "large_model_usage": 0.7887323943661971, "small_model_usage": 0.21126760563380287, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005206080985915493, "unrestricted": {"accuracy": 0.8204225352112676, "large_model_usage": 0.7887323943661971, "avg_cost": 0.000521481338028169}, "random_baseline": {"accuracy": 0.8161971830985916, "accuracy_std": 0.009154929577464777, "avg_cost": 0.0005311606690140844, "cost_std": 1.2370807907822956e-05, "small_model_fraction": 0.16081696751089763}, "unrestricted_random_baseline": {"accuracy": 0.8123239436619718, "accuracy_std": 0.00705105084313407, "avg_cost": 0.0005239615140845069, "cost_std": 1.4944197560998903e-05, "small_model_fraction": 0.15931708923807283}}, "0.1": {"accuracy": 0.778169014084507, "large_model_usage": 0.573943661971831, "small_model_usage": 0.426056338028169, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0003730588028169015, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.573943661971831, "avg_cost": 0.0003877841549295775}, "random_baseline": {"accuracy": 0.7404929577464789, "accuracy_std": 0.010331972359944501, "avg_cost": 0.00036721169014084507, "cost_std": 1.23824498141571e-05, "small_model_fraction": 0.41424801244819853}, "unrestricted_random_baseline": {"accuracy": 0.7598591549295775, "accuracy_std": 0.013527727262182057, "avg_cost": 0.00038856387323943657, "cost_std": 1.4186043047753182e-05, "small_model_fraction": 0.38895571028306425}}, "0.15": {"accuracy": 0.7183098591549296, "large_model_usage": 0.39436619718309857, "small_model_usage": 0.6056338028169015, "avg_lambda": 0.5, "avg_cost": 0.00026423485915493, "unrestricted": {"accuracy": 0.7535211267605634, "large_model_usage": 0.39436619718309857, "avg_cost": 0.00027645316901408486}, "random_baseline": {"accuracy": 0.705281690140845, "accuracy_std": 0.014776151590392472, "avg_cost": 0.0002695883450704226, "cost_std": 1.8861431665690668e-05, "small_model_fraction": 0.6011642943027281}, "unrestricted_random_baseline": {"accuracy": 0.7066901408450705, "accuracy_std": 0.012700485068219353, "avg_cost": 0.0002786080281690141, "cost_std": 1.2599246857243901e-05, "small_model_fraction": 0.5801780942757037}}, "0.2": {"accuracy": 0.6584507042253521, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.3400000000000022, "avg_cost": 0.00019344612676056345, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00021108697183098608}, "random_baseline": {"accuracy": 0.6711267605633802, "accuracy_std": 0.016455382317673108, "avg_cost": 0.00019505686619718325, "cost_std": 1.740786320118454e-05, "small_model_fraction": 0.7227512007417203}, "unrestricted_random_baseline": {"accuracy": 0.677112676056338, "accuracy_std": 0.018899629437069892, "avg_cost": 0.00022119552816901415, "cost_std": 1.5043998651744592e-05, "small_model_fraction": 0.6924512404721543}}, "0.25": {"accuracy": 0.6373239436619719, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00016985457746478887, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.25704225352112675, "avg_cost": 0.0001916151408450706}, "random_baseline": {"accuracy": 0.6549295774647887, "accuracy_std": 0.01863205148637035, "avg_cost": 0.00016508158450704232, "cost_std": 1.6649419260896277e-05, "small_model_fraction": 0.7632721056930354}, "unrestricted_random_baseline": {"accuracy": 0.6778169014084507, "accuracy_std": 0.0163836480621575, "avg_cost": 0.0001947485915492959, "cost_std": 1.4491509314726511e-05, "small_model_fraction": 0.725896106797643}}}}