{"trial": 20, "trial_seed": 61, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.564327485380117, "large_model_accuracy": 0.8260233918128655, "small_model_cost": 3.2273391812865526e-05, "large_model_cost": 0.0006215847953216381, "hybrid_results": {"0.05": {"accuracy": 0.823943661971831, "large_model_usage": 0.7112676056338029, "small_model_usage": 0.28873239436619713, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004735584507042253, "unrestricted": {"accuracy": 0.823943661971831, "large_model_usage": 0.7112676056338029, "avg_cost": 0.0004780654929577465}, "random_baseline": {"accuracy": 0.7936619718309859, "accuracy_std": 0.014449496146960018, "avg_cost": 0.0004686121830985913, "cost_std": 1.004510396032294e-05, "small_model_fraction": 0.2511852710401679}, "unrestricted_random_baseline": {"accuracy": 0.7971830985915493, "accuracy_std": 0.01936299589417963, "avg_cost": 0.00047895714788732393, "cost_std": 1.905620102061468e-05, "small_model_fraction": 0.2435372903177753}}, "0.1": {"accuracy": 0.778169014084507, "large_model_usage": 0.6338028169014085, "small_model_usage": 0.3661971830985915, "avg_lambda": 0.6999999999999973, "avg_cost": 0.000411206338028169, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.6338028169014085, "avg_cost": 0.000430431690140845}, "random_baseline": {"accuracy": 0.7661971830985915, "accuracy_std": 0.013380281690140848, "avg_cost": 0.0004110518661971831, "cost_std": 1.794180945677627e-05, "small_model_fraction": 0.3569903043465158}, "unrestricted_random_baseline": {"accuracy": 0.779225352112676, "accuracy_std": 0.012602486033924868, "avg_cost": 0.00042954017605633803, "cost_std": 1.3586184886462519e-05, "small_model_fraction": 0.3243668865775606}}, "0.15": {"accuracy": 0.7183098591549296, "large_model_usage": 0.4084507042253521, "small_model_usage": 0.5915492957746479, "avg_lambda": 0.5, "avg_cost": 0.0002789105633802816, "unrestricted": {"accuracy": 0.7359154929577465, "large_model_usage": 0.4084507042253521, "avg_cost": 0.00028967816901408433}, "random_baseline": {"accuracy": 0.7070422535211268, "accuracy_std": 0.016945365373966178, "avg_cost": 0.0002847544718309859, "cost_std": 1.4252886268645944e-05, "small_model_fraction": 0.5814824384884916}, "unrestricted_random_baseline": {"accuracy": 0.7073943661971831, "accuracy_std": 0.021958396940378704, "avg_cost": 0.0002853380633802816, "cost_std": 1.9997855522211764e-05, "small_model_fraction": 0.563210934543901}}, "0.2": {"accuracy": 0.6866197183098591, "large_model_usage": 0.323943661971831, "small_model_usage": 0.676056338028169, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00022514295774647873, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.323943661971831, "avg_cost": 0.0002365373239436618}, "random_baseline": {"accuracy": 0.6746478873239437, "accuracy_std": 0.010107535277751623, "avg_cost": 0.00022369531690140835, "cost_std": 2.112248365456122e-05, "small_model_fraction": 0.6727204585126578}, "unrestricted_random_baseline": {"accuracy": 0.6866197183098592, "accuracy_std": 0.013080405366906116, "avg_cost": 0.00022711028169014076, "cost_std": 1.5044521291114423e-05, "small_model_fraction": 0.6533854072488594}}, "0.25": {"accuracy": 0.6443661971830986, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.25, "avg_cost": 0.00017540352112676045, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.25704225352112675, "avg_cost": 0.00019455140845070403}, "random_baseline": {"accuracy": 0.6630281690140846, "accuracy_std": 0.021070935634334056, "avg_cost": 0.00017544450704225345, "cost_std": 2.033542307503205e-05, "small_model_fraction": 0.7571230957661856}, "unrestricted_random_baseline": {"accuracy": 0.6697183098591549, "accuracy_std": 0.014154754395944895, "avg_cost": 0.000201517218309859, "cost_std": 1.5475908570048095e-05, "small_model_fraction": 0.7246311276658965}}}}