{"trial": 26, "trial_seed": 67, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5482456140350878, "large_model_accuracy": 0.8245614035087719, "small_model_cost": 3.2065058479532176e-05, "large_model_cost": 0.0006158976608187144, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.8309859154929577, "small_model_usage": 0.16901408450704225, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005452792253521129, "unrestricted": {"accuracy": 0.823943661971831, "large_model_usage": 0.8309859154929577, "avg_cost": 0.0005464764084507044}, "random_baseline": {"accuracy": 0.8084507042253521, "accuracy_std": 0.011909531355836458, "avg_cost": 0.0005367464084507043, "cost_std": 1.1068391284547652e-05, "small_model_fraction": 0.12095664953218074}, "unrestricted_random_baseline": {"accuracy": 0.818661971830986, "accuracy_std": 0.0069089496022354075, "avg_cost": 0.0005499009859154929, "cost_std": 6.01748806493081e-06, "small_model_fraction": 0.11890609070111365}}, "0.1": {"accuracy": 0.778169014084507, "large_model_usage": 0.6654929577464789, "small_model_usage": 0.3345070422535211, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00042982147887323934, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.6654929577464789, "avg_cost": 0.00044564542253521113}, "random_baseline": {"accuracy": 0.7454225352112677, "accuracy_std": 0.00945473350851939, "avg_cost": 0.00043097721830985894, "cost_std": 1.141171735644811e-05, "small_model_fraction": 0.31871495562245533}, "unrestricted_random_baseline": {"accuracy": 0.7556338028169014, "accuracy_std": 0.019930946053640704, "avg_cost": 0.0004458694014084504, "cost_std": 2.1042550378683864e-05, "small_model_fraction": 0.29161139272005554}}, "0.15": {"accuracy": 0.7077464788732394, "large_model_usage": 0.4507042253521127, "small_model_usage": 0.5492957746478873, "avg_lambda": 0.5, "avg_cost": 0.0003007299295774646, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.4507042253521127, "avg_cost": 0.0003127369718309857}, "random_baseline": {"accuracy": 0.6732394366197182, "accuracy_std": 0.014066890391153627, "avg_cost": 0.0002972371126760561, "cost_std": 1.3363309913894517e-05, "small_model_fraction": 0.5398255081653535}, "unrestricted_random_baseline": {"accuracy": 0.6852112676056338, "accuracy_std": 0.017047490756211557, "avg_cost": 0.0003099821478873237, "cost_std": 1.9944363034279583e-05, "small_model_fraction": 0.5192596093008268}}, "0.2": {"accuracy": 0.6091549295774648, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00019469471830985908, "unrestricted": {"accuracy": 0.6830985915492958, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00021174401408450704}, "random_baseline": {"accuracy": 0.6225352112676057, "accuracy_std": 0.01914405242122282, "avg_cost": 0.0002027940492957745, "cost_std": 1.4671960554535154e-05, "small_model_fraction": 0.7214447100440514}, "unrestricted_random_baseline": {"accuracy": 0.6278169014084507, "accuracy_std": 0.0180953030387741, "avg_cost": 0.0002170829577464787, "cost_std": 2.1041602404985376e-05, "small_model_fraction": 0.6922423398675004}}, "0.25": {"accuracy": 0.5704225352112676, "large_model_usage": 0.1936619718309859, "small_model_usage": 0.8063380281690141, "avg_lambda": 0.07000000000000021, "avg_cost": 0.00013364542253521125, "unrestricted": {"accuracy": 0.6373239436619719, "large_model_usage": 0.1936619718309859, "avg_cost": 0.0001528566901408451}, "random_baseline": {"accuracy": 0.5919014084507043, "accuracy_std": 0.014124064169106726, "avg_cost": 0.0001350813732394366, "cost_std": 1.5384512068191283e-05, "small_model_fraction": 0.8260111483177071}, "unrestricted_random_baseline": {"accuracy": 0.5992957746478873, "accuracy_std": 0.011442307612163308, "avg_cost": 0.00015176538732394358, "cost_std": 9.705278961500351e-06, "small_model_fraction": 0.7931057101344641}}}}