{"trial": 5, "trial_seed": 46, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5716374269005848, "large_model_accuracy": 0.8362573099415205, "small_model_cost": 3.204400584795328e-05, "large_model_cost": 0.0006141900584795328, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.7640845070422535, "small_model_usage": 0.2359154929577465, "avg_lambda": 0.9499999999999951, "avg_cost": 0.0005032669014084505, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.7640845070422535, "avg_cost": 0.0005032669014084505}, "random_baseline": {"accuracy": 0.7827464788732394, "accuracy_std": 0.009840273670762094, "avg_cost": 0.0004969656690140842, "cost_std": 1.2003822573619786e-05, "small_model_fraction": 0.1905418005836445}, "unrestricted_random_baseline": {"accuracy": 0.7855633802816901, "accuracy_std": 0.011404320239930769, "avg_cost": 0.0004978220070422533, "cost_std": 1.3623616046591062e-05, "small_model_fraction": 0.1905418005836445}}, "0.1": {"accuracy": 0.7922535211267606, "large_model_usage": 0.6126760563380281, "small_model_usage": 0.38732394366197187, "avg_lambda": 0.780000000000001, "avg_cost": 0.00040823169014084503, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.6126760563380281, "avg_cost": 0.0004102528169014085}, "random_baseline": {"accuracy": 0.7535211267605634, "accuracy_std": 0.010680106259227523, "avg_cost": 0.00042046394366197174, "cost_std": 8.70719681007761e-06, "small_model_fraction": 0.3537915741378939}, "unrestricted_random_baseline": {"accuracy": 0.7496478873239436, "accuracy_std": 0.01254331910602595, "avg_cost": 0.00041845352112676045, "cost_std": 1.083387174003767e-05, "small_model_fraction": 0.35031971900561737}}, "0.15": {"accuracy": 0.7535211267605634, "large_model_usage": 0.47183098591549294, "small_model_usage": 0.528169014084507, "avg_lambda": 0.6100000000000032, "avg_cost": 0.0003123161971830986, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.47183098591549294, "avg_cost": 0.00032302042253521134}, "random_baseline": {"accuracy": 0.7095070422535212, "accuracy_std": 0.013384913949822707, "avg_cost": 0.00030838524647887317, "cost_std": 1.6464117597264212e-05, "small_model_fraction": 0.5185534797183962}, "unrestricted_random_baseline": {"accuracy": 0.7123239436619718, "accuracy_std": 0.012503718946876583, "avg_cost": 0.00032441109154929563, "cost_std": 1.155157014518509e-05, "small_model_fraction": 0.5001659542791623}}, "0.2": {"accuracy": 0.7007042253521126, "large_model_usage": 0.31338028169014087, "small_model_usage": 0.6866197183098591, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00021264014084507039, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.31338028169014087, "avg_cost": 0.0002245908450704226}, "random_baseline": {"accuracy": 0.6683098591549296, "accuracy_std": 0.014670892011267346, "avg_cost": 0.00021898971830985908, "cost_std": 1.6061030176618783e-05, "small_model_fraction": 0.6897752133150505}, "unrestricted_random_baseline": {"accuracy": 0.6654929577464788, "accuracy_std": 0.010082972579772066, "avg_cost": 0.00022247200704225345, "cost_std": 1.1682124071380763e-05, "small_model_fraction": 0.6692465089266427}}, "0.25": {"accuracy": 0.6514084507042254, "large_model_usage": 0.2535211267605634, "small_model_usage": 0.7464788732394366, "avg_lambda": 0.29999999999999843, "avg_cost": 0.00016955563380281681, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.2535211267605634, "avg_cost": 0.00018709788732394368}, "random_baseline": {"accuracy": 0.6471830985915493, "accuracy_std": 0.011550154554124439, "avg_cost": 0.0001647227464788732, "cost_std": 1.3122001529947817e-05, "small_model_fraction": 0.7637850032079664}, "unrestricted_random_baseline": {"accuracy": 0.6492957746478873, "accuracy_std": 0.010349956659647217, "avg_cost": 0.00018554799295774637, "cost_std": 1.0549644919909538e-05, "small_model_fraction": 0.7336512361888009}}}}