{"trial": 2, "trial_seed": 43, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5687134502923976, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.22454678362573e-05, "large_model_cost": 0.000621143274853802, "hybrid_results": {"0.05": {"accuracy": 0.795774647887324, "large_model_usage": 0.7112676056338029, "small_model_usage": 0.28873239436619713, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004714376760563377, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.7112676056338029, "avg_cost": 0.00047642359154929535}, "random_baseline": {"accuracy": 0.7735915492957746, "accuracy_std": 0.013911793144463665, "avg_cost": 0.00048102267605633814, "cost_std": 1.619393366762457e-05, "small_model_fraction": 0.25421320475897824}, "unrestricted_random_baseline": {"accuracy": 0.7661971830985915, "accuracy_std": 0.013472624274442933, "avg_cost": 0.0004779786971830986, "cost_std": 1.5087024963156054e-05, "small_model_fraction": 0.24574668402559552}}, "0.1": {"accuracy": 0.7535211267605634, "large_model_usage": 0.6161971830985915, "small_model_usage": 0.3838028169014085, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004035503521126758, "unrestricted": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6161971830985915, "avg_cost": 0.0004186911971830983}, "random_baseline": {"accuracy": 0.7355633802816901, "accuracy_std": 0.01522251149381497, "avg_cost": 0.00040199380281690146, "cost_std": 1.4154145955465642e-05, "small_model_fraction": 0.3694918203943042}, "unrestricted_random_baseline": {"accuracy": 0.7394366197183099, "accuracy_std": 0.015428804436765224, "avg_cost": 0.00041703334507042257, "cost_std": 1.9815446230651225e-05, "small_model_fraction": 0.34378134076609357}}, "0.15": {"accuracy": 0.6971830985915493, "large_model_usage": 0.426056338028169, "small_model_usage": 0.573943661971831, "avg_lambda": 0.5100000000000009, "avg_cost": 0.00028815598591549275, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.426056338028169, "avg_cost": 0.0003002123239436618}, "random_baseline": {"accuracy": 0.6890845070422535, "accuracy_std": 0.01021126760563379, "avg_cost": 0.0002899734859154929, "cost_std": 1.658211626711447e-05, "small_model_fraction": 0.5654415502491229}, "unrestricted_random_baseline": {"accuracy": 0.6862676056338028, "accuracy_std": 0.0156244420650238, "avg_cost": 0.0003133661267605633, "cost_std": 2.1444369730797858e-05, "small_model_fraction": 0.5449688334474965}}, "0.2": {"accuracy": 0.6549295774647887, "large_model_usage": 0.29225352112676056, "small_model_usage": 0.7077464788732395, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00020390246478873232, "unrestricted": {"accuracy": 0.6830985915492958, "large_model_usage": 0.29225352112676056, "avg_cost": 0.0002158390845070422}, "random_baseline": {"accuracy": 0.651056338028169, "accuracy_std": 0.011295080938271449, "avg_cost": 0.00020326105633802815, "cost_std": 1.025649246700107e-05, "small_model_fraction": 0.7085114006081519}, "unrestricted_random_baseline": {"accuracy": 0.6566901408450705, "accuracy_std": 0.009610805678842368, "avg_cost": 0.00022069482394366193, "cost_std": 1.7569125976076112e-05, "small_model_fraction": 0.6882419759710275}}, "0.25": {"accuracy": 0.6232394366197183, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00017446584507042244, "unrestricted": {"accuracy": 0.6690140845070423, "large_model_usage": 0.25704225352112675, "avg_cost": 0.00019397992957746472}, "random_baseline": {"accuracy": 0.6415492957746479, "accuracy_std": 0.00993432111244077, "avg_cost": 0.00018012140845070419, "cost_std": 1.1838749183956343e-05, "small_model_fraction": 0.7584973563504406}, "unrestricted_random_baseline": {"accuracy": 0.6359154929577465, "accuracy_std": 0.01137710874746727, "avg_cost": 0.00019050922535211262, "cost_std": 1.2365391923515368e-05, "small_model_fraction": 0.7253607335365931}}}}