{"trial": 1, "trial_seed": 42, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5380116959064327, "large_model_accuracy": 0.8157894736842105, "small_model_cost": 3.2259795321637416e-05, "large_model_cost": 0.0006210087719298245, "hybrid_results": {"0.05": {"accuracy": 0.7852112676056338, "large_model_usage": 0.8133802816901409, "small_model_usage": 0.18661971830985913, "avg_lambda": 0.8600000000000062, "avg_cost": 0.0005305577464788734, "unrestricted": {"accuracy": 0.8098591549295775, "large_model_usage": 0.8133802816901409, "avg_cost": 0.0005396633802816903}, "random_baseline": {"accuracy": 0.7693661971830986, "accuracy_std": 0.012523534678717405, "avg_cost": 0.0005297964436619714, "cost_std": 7.981958493243993e-06, "small_model_fraction": 0.15363258204208538}, "unrestricted_random_baseline": {"accuracy": 0.7806338028169015, "accuracy_std": 0.010569247197044834, "avg_cost": 0.0005430809154929573, "cost_std": 7.125946229849069e-06, "small_model_fraction": 0.1381665104825645}}, "0.1": {"accuracy": 0.7640845070422535, "large_model_usage": 0.6584507042253521, "small_model_usage": 0.3415492957746479, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0004393042253521124, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6584507042253521, "avg_cost": 0.0004437478873239434}, "random_baseline": {"accuracy": 0.7292253521126761, "accuracy_std": 0.01367811591517958, "avg_cost": 0.0004350170774647888, "cost_std": 1.916986647203602e-05, "small_model_fraction": 0.308628216433634}, "unrestricted_random_baseline": {"accuracy": 0.7433098591549296, "accuracy_std": 0.015303742121197677, "avg_cost": 0.00044375637323943666, "cost_std": 9.062408789121625e-06, "small_model_fraction": 0.30108058213041844}}, "0.15": {"accuracy": 0.7535211267605634, "large_model_usage": 0.647887323943662, "small_model_usage": 0.352112676056338, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004182408450704223, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.647887323943662, "avg_cost": 0.00043723380281690115}, "random_baseline": {"accuracy": 0.7274647887323944, "accuracy_std": 0.011045343057294437, "avg_cost": 0.0004111095422535212, "cost_std": 9.248633912981306e-06, "small_model_fraction": 0.3444047207140105}, "unrestricted_random_baseline": {"accuracy": 0.7316901408450704, "accuracy_std": 0.015492957746478887, "avg_cost": 0.00043899091549295786, "cost_std": 1.8745470254133388e-05, "small_model_fraction": 0.3121448637951955}}, "0.2": {"accuracy": 0.6901408450704225, "large_model_usage": 0.47183098591549294, "small_model_usage": 0.528169014084507, "avg_lambda": 0.5800000000000017, "avg_cost": 0.000318501408450704, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.47183098591549294, "avg_cost": 0.00032755774647887304}, "random_baseline": {"accuracy": 0.681338028169014, "accuracy_std": 0.016155027812611897, "avg_cost": 0.00030770605633802836, "cost_std": 1.5092381333165832e-05, "small_model_fraction": 0.5138138247337275}, "unrestricted_random_baseline": {"accuracy": 0.6922535211267605, "accuracy_std": 0.01630399563773271, "avg_cost": 0.0003230475704225353, "cost_std": 1.353362298452764e-05, "small_model_fraction": 0.4984314828732914}}, "0.25": {"accuracy": 0.6549295774647887, "large_model_usage": 0.36619718309859156, "small_model_usage": 0.6338028169014085, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00024829014084507044, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.36619718309859156, "avg_cost": 0.0002615225352112676}, "random_baseline": {"accuracy": 0.6683098591549296, "accuracy_std": 0.01237915199383586, "avg_cost": 0.00024946338028169015, "cost_std": 1.469407337931493e-05, "small_model_fraction": 0.6330688390016491}, "unrestricted_random_baseline": {"accuracy": 0.6732394366197183, "accuracy_std": 0.008879943811914419, "avg_cost": 0.0002683820422535212, "cost_std": 2.5443747255951056e-05, "small_model_fraction": 0.6105933954901721}}}}