{"trial": 19, "trial_seed": 60, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5555555555555556, "large_model_accuracy": 0.8421052631578947, "small_model_cost": 3.226973684210533e-05, "large_model_cost": 0.0006206695906432756, "hybrid_results": {"0.05": {"accuracy": 0.8028169014084507, "large_model_usage": 0.8204225352112676, "small_model_usage": 0.17957746478873238, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005488778169014082, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.8204225352112676, "avg_cost": 0.0005495045774647884}, "random_baseline": {"accuracy": 0.8073943661971832, "accuracy_std": 0.009188724190633712, "avg_cost": 0.0005547145070422532, "cost_std": 1.1530422269619268e-05, "small_model_fraction": 0.12201188235870469}, "unrestricted_random_baseline": {"accuracy": 0.818661971830986, "accuracy_std": 0.009610805678842368, "avg_cost": 0.0005537058450704222, "cost_std": 8.838529134278428e-06, "small_model_fraction": 0.12094668739080798}}, "0.1": {"accuracy": 0.7570422535211268, "large_model_usage": 0.6232394366197183, "small_model_usage": 0.37676056338028174, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00040673697183098574, "unrestricted": {"accuracy": 0.7852112676056338, "large_model_usage": 0.6232394366197183, "avg_cost": 0.00042641302816901367}, "random_baseline": {"accuracy": 0.7503521126760564, "accuracy_std": 0.019121371806176683, "avg_cost": 0.00041155510563380273, "cost_std": 1.824636579746138e-05, "small_model_fraction": 0.3635837388983804}, "unrestricted_random_baseline": {"accuracy": 0.7538732394366197, "accuracy_std": 0.013768461123955722, "avg_cost": 0.0004244266549295773, "cost_std": 8.509137102524986e-06, "small_model_fraction": 0.33014379799608917}}, "0.15": {"accuracy": 0.7253521126760564, "large_model_usage": 0.4295774647887324, "small_model_usage": 0.5704225352112676, "avg_lambda": 0.540000000000001, "avg_cost": 0.0002951102112676054, "unrestricted": {"accuracy": 0.7464788732394366, "large_model_usage": 0.4295774647887324, "avg_cost": 0.0003041806338028167}, "random_baseline": {"accuracy": 0.6971830985915493, "accuracy_std": 0.014938875658870738, "avg_cost": 0.00029400292253521117, "cost_std": 1.4407623537559899e-05, "small_model_fraction": 0.5532961595290979}, "unrestricted_random_baseline": {"accuracy": 0.6936619718309859, "accuracy_std": 0.01613582991181636, "avg_cost": 0.00030890764084507034, "cost_std": 2.222300391813523e-05, "small_model_fraction": 0.5378807537015562}}, "0.2": {"accuracy": 0.6901408450704225, "large_model_usage": 0.34507042253521125, "small_model_usage": 0.6549295774647887, "avg_lambda": 0.4300000000000031, "avg_cost": 0.00023877218309859152, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.34507042253521125, "avg_cost": 0.000250525704225352}, "random_baseline": {"accuracy": 0.6640845070422535, "accuracy_std": 0.01281225732300016, "avg_cost": 0.00023003686619718308, "cost_std": 1.665499939467396e-05, "small_model_fraction": 0.6490440218119656}, "unrestricted_random_baseline": {"accuracy": 0.6862676056338028, "accuracy_std": 0.011935529053998781, "avg_cost": 0.0002635555985915493, "cost_std": 2.132250243541692e-05, "small_model_fraction": 0.6290686240432024}}, "0.25": {"accuracy": 0.6443661971830986, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00018077218309859162, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.25704225352112675, "avg_cost": 0.0001944693661971832}, "random_baseline": {"accuracy": 0.6309859154929578, "accuracy_std": 0.010539879962743522, "avg_cost": 0.0001730540492957747, "cost_std": 1.6392201698180644e-05, "small_model_fraction": 0.7476164460321779}, "unrestricted_random_baseline": {"accuracy": 0.6496478873239436, "accuracy_std": 0.015368417494926193, "avg_cost": 0.00019497876760563382, "cost_std": 1.3571379211045253e-05, "small_model_fraction": 0.7243377470146556}}}}