{"trial": 24, "trial_seed": 65, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5657894736842105, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.2264181286549725e-05, "large_model_cost": 0.0006217748538011703, "hybrid_results": {"0.05": {"accuracy": 0.778169014084507, "large_model_usage": 0.8133802816901409, "small_model_usage": 0.18661971830985913, "avg_lambda": 0.879999999999997, "avg_cost": 0.0005276676056338029, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.8133802816901409, "avg_cost": 0.0005395408450704227}, "random_baseline": {"accuracy": 0.804225352112676, "accuracy_std": 0.011157027829404816, "avg_cost": 0.0005301936971830983, "cost_std": 8.43493514132595e-06, "small_model_fraction": 0.15963620771434547}, "unrestricted_random_baseline": {"accuracy": 0.8140845070422535, "accuracy_std": 0.009289370393149931, "avg_cost": 0.0005449153169014082, "cost_std": 7.0774902183166975e-06, "small_model_fraction": 0.13949536889632494}}, "0.1": {"accuracy": 0.7323943661971831, "large_model_usage": 0.5880281690140845, "small_model_usage": 0.4119718309859155, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00038328028169014083, "unrestricted": {"accuracy": 0.7711267605633803, "large_model_usage": 0.5880281690140845, "avg_cost": 0.00039926619718309847}, "random_baseline": {"accuracy": 0.7323943661971831, "accuracy_std": 0.016590449280830623, "avg_cost": 0.0003878171126760561, "cost_std": 1.7733640532233686e-05, "small_model_fraction": 0.4045636206952887}, "unrestricted_random_baseline": {"accuracy": 0.7464788732394366, "accuracy_std": 0.015428804436765224, "avg_cost": 0.00040291697183098576, "cost_std": 1.121201176377879e-05, "small_model_fraction": 0.3774463584669935}}, "0.15": {"accuracy": 0.7007042253521126, "large_model_usage": 0.3908450704225352, "small_model_usage": 0.6091549295774648, "avg_lambda": 0.5, "avg_cost": 0.0002665971830985914, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.3908450704225352, "avg_cost": 0.0002766676056338027}, "random_baseline": {"accuracy": 0.6704225352112677, "accuracy_std": 0.017689938995887992, "avg_cost": 0.00025884158450704216, "cost_std": 1.459458369229511e-05, "small_model_fraction": 0.6024957431008513}, "unrestricted_random_baseline": {"accuracy": 0.680281690140845, "accuracy_std": 0.012676056338028152, "avg_cost": 0.0002717266197183097, "cost_std": 1.579013778218294e-05, "small_model_fraction": 0.5854130624900749}}, "0.2": {"accuracy": 0.6549295774647887, "large_model_usage": 0.24647887323943662, "small_model_usage": 0.7535211267605634, "avg_lambda": 0.3900000000000005, "avg_cost": 0.00017811126760563386, "unrestricted": {"accuracy": 0.6690140845070423, "large_model_usage": 0.24647887323943662, "avg_cost": 0.00018637183098591545}, "random_baseline": {"accuracy": 0.6348591549295775, "accuracy_std": 0.013458813277194661, "avg_cost": 0.00017852221830985915, "cost_std": 1.0911102588725678e-05, "small_model_fraction": 0.7525963597962395}, "unrestricted_random_baseline": {"accuracy": 0.6338028169014085, "accuracy_std": 0.01613582991181636, "avg_cost": 0.00019046144366197183, "cost_std": 1.7016686018762543e-05, "small_model_fraction": 0.7385837833232042}}, "0.25": {"accuracy": 0.6091549295774648, "large_model_usage": 0.2147887323943662, "small_model_usage": 0.7852112676056338, "avg_lambda": 0.25, "avg_cost": 0.00015135774647887334, "unrestricted": {"accuracy": 0.6584507042253521, "large_model_usage": 0.2147887323943662, "avg_cost": 0.00016673802816901417}, "random_baseline": {"accuracy": 0.6309859154929577, "accuracy_std": 0.014755159746307738, "avg_cost": 0.00014976355633802819, "cost_std": 1.294563250936379e-05, "small_model_fraction": 0.7979789497545188}, "unrestricted_random_baseline": {"accuracy": 0.6246478873239436, "accuracy_std": 0.01356433822835089, "avg_cost": 0.00016217964788732396, "cost_std": 1.108321838684746e-05, "small_model_fraction": 0.7718890375489694}}}}