{"trial": 1, "trial_seed": 42, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5350877192982456, "large_model_accuracy": 0.8128654970760234, "small_model_cost": 3.2059795321637445e-05, "large_model_cost": 0.000615394736842106, "hybrid_results": {"0.05": {"accuracy": 0.7887323943661971, "large_model_usage": 0.8204225352112676, "small_model_usage": 0.17957746478873238, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005383644366197183, "unrestricted": {"accuracy": 0.7922535211267606, "large_model_usage": 0.8204225352112676, "avg_cost": 0.0005392447183098591}, "random_baseline": {"accuracy": 0.7732394366197183, "accuracy_std": 0.009474383131742039, "avg_cost": 0.0005395949647887321, "cost_std": 8.159293933351457e-06, "small_model_fraction": 0.13205157918640606}, "unrestricted_random_baseline": {"accuracy": 0.7742957746478873, "accuracy_std": 0.008250263742154906, "avg_cost": 0.0005435581690140842, "cost_std": 7.895630644988923e-06, "small_model_fraction": 0.13054252902073907}}, "0.1": {"accuracy": 0.7464788732394366, "large_model_usage": 0.6302816901408451, "small_model_usage": 0.3697183098591549, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00041673767605633757, "unrestricted": {"accuracy": 0.7711267605633803, "large_model_usage": 0.6302816901408451, "avg_cost": 0.0004210193661971826}, "random_baseline": {"accuracy": 0.7084507042253521, "accuracy_std": 0.008151997818866387, "avg_cost": 0.00041510183098591536, "cost_std": 2.0298138478792247e-05, "small_model_fraction": 0.3405540224762925}, "unrestricted_random_baseline": {"accuracy": 0.7179577464788733, "accuracy_std": 0.012739473602201106, "avg_cost": 0.00042247088028169, "cost_std": 8.454570812978613e-06, "small_model_fraction": 0.333214002470488}}, "0.15": {"accuracy": 0.6971830985915493, "large_model_usage": 0.4471830985915493, "small_model_usage": 0.5528169014084507, "avg_lambda": 0.6100000000000032, "avg_cost": 0.00029853345070422505, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.4471830985915493, "avg_cost": 0.000308082746478873}, "random_baseline": {"accuracy": 0.6443661971830987, "accuracy_std": 0.019414153171958062, "avg_cost": 0.00029302746478873225, "cost_std": 1.1568431464605702e-05, "small_model_fraction": 0.5431892787220644}, "unrestricted_random_baseline": {"accuracy": 0.6591549295774647, "accuracy_std": 0.019208706615473012, "avg_cost": 0.0003141594718309858, "cost_std": 1.9466427174327456e-05, "small_model_fraction": 0.526819102524908}}, "0.2": {"accuracy": 0.6830985915492958, "large_model_usage": 0.38380281690140844, "small_model_usage": 0.6161971830985915, "avg_lambda": 0.5, "avg_cost": 0.0002591531690140845, "unrestricted": {"accuracy": 0.6971830985915493, "large_model_usage": 0.38380281690140844, "avg_cost": 0.00026866725352112667}, "random_baseline": {"accuracy": 0.6161971830985916, "accuracy_std": 0.00862496388303938, "avg_cost": 0.0002472634507042253, "cost_std": 9.999497523579044e-06, "small_model_fraction": 0.6106981469333452}, "unrestricted_random_baseline": {"accuracy": 0.6330985915492958, "accuracy_std": 0.015731202749076547, "avg_cost": 0.0002637840492957746, "cost_std": 1.2987634205648089e-05, "small_model_fraction": 0.5943883327428159}}, "0.25": {"accuracy": 0.6091549295774648, "large_model_usage": 0.2605633802816901, "small_model_usage": 0.7394366197183099, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0001788573943661971, "unrestricted": {"accuracy": 0.6690140845070423, "large_model_usage": 0.2605633802816901, "avg_cost": 0.00019193485915492951}, "random_baseline": {"accuracy": 0.5904929577464789, "accuracy_std": 0.012797733693933961, "avg_cost": 0.0001748778873239435, "cost_std": 9.58694650760163e-06, "small_model_fraction": 0.7483476668448316}, "unrestricted_random_baseline": {"accuracy": 0.5975352112676057, "accuracy_std": 0.015671981045259072, "avg_cost": 0.00019467637323943645, "cost_std": 2.376117267478243e-05, "small_model_fraction": 0.7259292175836817}}}}