{"trial": 15, "trial_seed": 56, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5453216374269005, "large_model_accuracy": 0.8245614035087719, "small_model_cost": 3.199970760233923e-05, "large_model_cost": 0.0006134912280701765, "hybrid_results": {"0.05": {"accuracy": 0.8028169014084507, "large_model_usage": 0.8063380281690141, "small_model_usage": 0.19366197183098588, "avg_lambda": 0.900000000000004, "avg_cost": 0.000526042957746479, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.8063380281690141, "avg_cost": 0.0005265007042253524}, "random_baseline": {"accuracy": 0.7845070422535211, "accuracy_std": 0.013527727262182095, "avg_cost": 0.0005250993309859157, "cost_std": 1.2849445359379893e-05, "small_model_fraction": 0.15038614880117473}, "unrestricted_random_baseline": {"accuracy": 0.7869718309859155, "accuracy_std": 0.01070908896179267, "avg_cost": 0.0005280809507042255, "cost_std": 1.3539344532261196e-05, "small_model_fraction": 0.14959895507132454}}, "0.1": {"accuracy": 0.7711267605633803, "large_model_usage": 0.6161971830985915, "small_model_usage": 0.3838028169014085, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0004087119718309858, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.6161971830985915, "avg_cost": 0.00041124718309859136}, "random_baseline": {"accuracy": 0.7218309859154929, "accuracy_std": 0.015267241823013263, "avg_cost": 0.0004085908450704225, "cost_std": 1.9662007848725e-05, "small_model_fraction": 0.3521620677708871}, "unrestricted_random_baseline": {"accuracy": 0.7345070422535211, "accuracy_std": 0.011267605633802802, "avg_cost": 0.0004116589436619719, "cost_std": 1.1448090002676545e-05, "small_model_fraction": 0.34780222557479484}}, "0.15": {"accuracy": 0.7288732394366197, "large_model_usage": 0.4612676056338028, "small_model_usage": 0.5387323943661972, "avg_lambda": 0.5500000000000005, "avg_cost": 0.00030758521126760555, "unrestricted": {"accuracy": 0.7464788732394366, "large_model_usage": 0.4612676056338028, "avg_cost": 0.00031662746478873234}, "random_baseline": {"accuracy": 0.6852112676056338, "accuracy_std": 0.012318912453898533, "avg_cost": 0.00030477908450704213, "cost_std": 1.3106558314977896e-05, "small_model_fraction": 0.5260713287039082}, "unrestricted_random_baseline": {"accuracy": 0.679225352112676, "accuracy_std": 0.017777331776569465, "avg_cost": 0.00031304246478873227, "cost_std": 2.040385578005579e-05, "small_model_fraction": 0.5105212248711783}}, "0.2": {"accuracy": 0.6901408450704225, "large_model_usage": 0.3732394366197183, "small_model_usage": 0.6267605633802817, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00024752887323943655, "unrestricted": {"accuracy": 0.7323943661971831, "large_model_usage": 0.3732394366197183, "avg_cost": 0.00026254999999999985}, "random_baseline": {"accuracy": 0.648943661971831, "accuracy_std": 0.01753860479635531, "avg_cost": 0.0002385772887323942, "cost_std": 2.3740421411886784e-05, "small_model_fraction": 0.6293511460602316}, "unrestricted_random_baseline": {"accuracy": 0.6528169014084507, "accuracy_std": 0.014704657054803573, "avg_cost": 0.0002604114084507041, "cost_std": 1.859347996950842e-05, "small_model_fraction": 0.6035190810483839}}, "0.25": {"accuracy": 0.6091549295774648, "large_model_usage": 0.29577464788732394, "small_model_usage": 0.704225352112676, "avg_lambda": 0.29000000000000087, "avg_cost": 0.00019237394366197167, "unrestricted": {"accuracy": 0.6971830985915493, "large_model_usage": 0.29577464788732394, "avg_cost": 0.00021469788732394362}, "random_baseline": {"accuracy": 0.6334507042253522, "accuracy_std": 0.0138582173620806, "avg_cost": 0.00019581869718309858, "cost_std": 1.654438855657772e-05, "small_model_fraction": 0.7242019351707764}, "unrestricted_random_baseline": {"accuracy": 0.6426056338028169, "accuracy_std": 0.012424139974913628, "avg_cost": 0.00021293320422535207, "cost_std": 1.084614887637877e-05, "small_model_fraction": 0.6858111024996287}}}}