{"trial": 4, "trial_seed": 45, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.533625730994152, "large_model_accuracy": 0.8362573099415205, "small_model_cost": 3.225190058479537e-05, "large_model_cost": 0.0006206286549707611, "hybrid_results": {"0.05": {"accuracy": 0.8028169014084507, "large_model_usage": 0.7323943661971831, "small_model_usage": 0.2676056338028169, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004876271126760561, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.7323943661971831, "avg_cost": 0.0004915848591549295}, "random_baseline": {"accuracy": 0.7676056338028169, "accuracy_std": 0.014259476571349692, "avg_cost": 0.0004898345774647888, "cost_std": 1.8359511514011837e-05, "small_model_fraction": 0.22604826125992417}, "unrestricted_random_baseline": {"accuracy": 0.7760563380281691, "accuracy_std": 0.014363435249557157, "avg_cost": 0.0004966263380281692, "cost_std": 9.70912462286245e-06, "small_model_fraction": 0.21932170986344063}}, "0.1": {"accuracy": 0.7852112676056338, "large_model_usage": 0.6514084507042254, "small_model_usage": 0.3485915492957746, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00042386654929577446, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.6514084507042254, "avg_cost": 0.00044017640845070405}, "random_baseline": {"accuracy": 0.7369718309859155, "accuracy_std": 0.014522246920634686, "avg_cost": 0.0004229308450704224, "cost_std": 2.1133945554168816e-05, "small_model_fraction": 0.33441515866874966}, "unrestricted_random_baseline": {"accuracy": 0.7461267605633802, "accuracy_std": 0.01302816901408449, "avg_cost": 0.0004515406338028168, "cost_std": 1.8287807898422887e-05, "small_model_fraction": 0.3066950643017472}}, "0.15": {"accuracy": 0.721830985915493, "large_model_usage": 0.4471830985915493, "small_model_usage": 0.5528169014084507, "avg_lambda": 0.5, "avg_cost": 0.00029992992957746464, "unrestricted": {"accuracy": 0.7605633802816901, "large_model_usage": 0.4471830985915493, "avg_cost": 0.0003126341549295774}, "random_baseline": {"accuracy": 0.6830985915492958, "accuracy_std": 0.01991850087849429, "avg_cost": 0.0003074378169014082, "cost_std": 1.9443342237373928e-05, "small_model_fraction": 0.5450567565810448}, "unrestricted_random_baseline": {"accuracy": 0.682394366197183, "accuracy_std": 0.01701837461069658, "avg_cost": 0.00031005225352112646, "cost_std": 1.6198809478358878e-05, "small_model_fraction": 0.5234647659773864}}, "0.2": {"accuracy": 0.6725352112676056, "large_model_usage": 0.3626760563380282, "small_model_usage": 0.6373239436619718, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00024397922535211256, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.3626760563380282, "avg_cost": 0.00025916232394366173}, "random_baseline": {"accuracy": 0.647887323943662, "accuracy_std": 0.018896349105632017, "avg_cost": 0.0002485592957746477, "cost_std": 2.076983699380555e-05, "small_model_fraction": 0.6401500854868453}, "unrestricted_random_baseline": {"accuracy": 0.651056338028169, "accuracy_std": 0.01699285705917416, "avg_cost": 0.0002636858450704222, "cost_std": 1.3459969909597391e-05, "small_model_fraction": 0.6143450235458882}}, "0.25": {"accuracy": 0.6232394366197183, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00019450739436619718, "unrestricted": {"accuracy": 0.6901408450704225, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00021258485915492966}, "random_baseline": {"accuracy": 0.6250000000000001, "accuracy_std": 0.01790242791392419, "avg_cost": 0.00020031309859154926, "cost_std": 1.5308183984243713e-05, "small_model_fraction": 0.7242319779428866}, "unrestricted_random_baseline": {"accuracy": 0.6369718309859154, "accuracy_std": 0.0192506153983615, "avg_cost": 0.00021748524647887318, "cost_std": 1.8793272930235423e-05, "small_model_fraction": 0.6935076764575259}}}}