{"trial": 3, "trial_seed": 44, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5716374269005848, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.227251461988303e-05, "large_model_cost": 0.0006202105263157896, "hybrid_results": {"0.05": {"accuracy": 0.8204225352112676, "large_model_usage": 0.7922535211267606, "small_model_usage": 0.20774647887323938, "avg_lambda": 0.900000000000004, "avg_cost": 0.000527711267605634, "unrestricted": {"accuracy": 0.8204225352112676, "large_model_usage": 0.7922535211267606, "avg_cost": 0.0005281830985915495}, "random_baseline": {"accuracy": 0.7862676056338028, "accuracy_std": 0.014776151590392477, "avg_cost": 0.0005276022887323942, "cost_std": 1.0995298360019024e-05, "small_model_fraction": 0.15732825037684095}, "unrestricted_random_baseline": {"accuracy": 0.7904929577464788, "accuracy_std": 0.010709088961792674, "avg_cost": 0.0005279092957746477, "cost_std": 1.2602227004624905e-05, "small_model_fraction": 0.15652573212401605}}, "0.1": {"accuracy": 0.7640845070422535, "large_model_usage": 0.6302816901408451, "small_model_usage": 0.3697183098591549, "avg_lambda": 0.7299999999999969, "avg_cost": 0.00040919014084507023, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6302816901408451, "avg_cost": 0.00042655633802816907}, "random_baseline": {"accuracy": 0.7154929577464788, "accuracy_std": 0.016500527484309854, "avg_cost": 0.000407182288732394, "cost_std": 2.0435311568781165e-05, "small_model_fraction": 0.3589160443326862}, "unrestricted_random_baseline": {"accuracy": 0.7246478873239437, "accuracy_std": 0.014838948947679936, "avg_cost": 0.0004359585915492955, "cost_std": 1.2889834277170512e-05, "small_model_fraction": 0.3293785814749845}}, "0.15": {"accuracy": 0.7147887323943662, "large_model_usage": 0.4507042253521127, "small_model_usage": 0.5492957746478873, "avg_lambda": 0.5800000000000017, "avg_cost": 0.0003069788732394366, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.4507042253521127, "avg_cost": 0.00031552816901408444}, "random_baseline": {"accuracy": 0.6802816901408452, "accuracy_std": 0.016273549307452593, "avg_cost": 0.00032023116197183084, "cost_std": 2.178707534091208e-05, "small_model_fraction": 0.5327630580864071}, "unrestricted_random_baseline": {"accuracy": 0.6785211267605633, "accuracy_std": 0.013273293538711787, "avg_cost": 0.0003173991901408449, "cost_std": 1.4168019025039416e-05, "small_model_fraction": 0.5182219064605964}}, "0.2": {"accuracy": 0.676056338028169, "large_model_usage": 0.36971830985915494, "small_model_usage": 0.630281690140845, "avg_lambda": 0.45999999999999747, "avg_cost": 0.0002524225352112676, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.36971830985915494, "avg_cost": 0.000264556338028169}, "random_baseline": {"accuracy": 0.6404929577464789, "accuracy_std": 0.024670512754370932, "avg_cost": 0.0002475817253521127, "cost_std": 8.412493692432899e-06, "small_model_fraction": 0.6255557283048224}, "unrestricted_random_baseline": {"accuracy": 0.6679577464788733, "accuracy_std": 0.015352274194782826, "avg_cost": 0.00027735982394366186, "cost_std": 1.4563703420191477e-05, "small_model_fraction": 0.6049178335344172}}, "0.25": {"accuracy": 0.6232394366197183, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00019813380281690144, "unrestricted": {"accuracy": 0.6795774647887324, "large_model_usage": 0.2887323943661972, "avg_cost": 0.0002137253521126761}, "random_baseline": {"accuracy": 0.6059859154929577, "accuracy_std": 0.014470931400024305, "avg_cost": 0.00019087947183098588, "cost_std": 1.1760263134028971e-05, "small_model_fraction": 0.7178932389171577}, "unrestricted_random_baseline": {"accuracy": 0.6320422535211268, "accuracy_std": 0.01638364806215755, "avg_cost": 0.0002303455985915493, "cost_std": 1.616352294272832e-05, "small_model_fraction": 0.6913742029208274}}}}