{"trial": 7, "trial_seed": 48, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.201330409356728e-05, "large_model_cost": 0.0006151432748538018, "hybrid_results": {"0.05": {"accuracy": 0.8063380281690141, "large_model_usage": 0.7852112676056338, "small_model_usage": 0.21478873239436624, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005194820422535211, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.7852112676056338, "avg_cost": 0.0005194820422535211}, "random_baseline": {"accuracy": 0.7848591549295774, "accuracy_std": 0.006763863631091046, "avg_cost": 0.0005258106690140842, "cost_std": 7.633074383236463e-06, "small_model_fraction": 0.16404787508274662}, "unrestricted_random_baseline": {"accuracy": 0.7859154929577465, "accuracy_std": 0.011971830985915515, "avg_cost": 0.0005234647183098589, "cost_std": 9.782891288210974e-06, "small_model_fraction": 0.16404787508274662}}, "0.1": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6830985915492958, "small_model_usage": 0.31690140845070425, "avg_lambda": 0.8000000000000037, "avg_cost": 0.00044970739436619696, "unrestricted": {"accuracy": 0.7887323943661971, "large_model_usage": 0.6830985915492958, "avg_cost": 0.0004560172535211266}, "random_baseline": {"accuracy": 0.7528169014084507, "accuracy_std": 0.016425216605198006, "avg_cost": 0.00045390552816901373, "cost_std": 1.728071883358232e-05, "small_model_fraction": 0.2837032716255757}, "unrestricted_random_baseline": {"accuracy": 0.7595070422535211, "accuracy_std": 0.012990047004011756, "avg_cost": 0.00045072010563380253, "cost_std": 1.6346584965156005e-05, "small_model_fraction": 0.2728825978970356}}, "0.15": {"accuracy": 0.7676056338028169, "large_model_usage": 0.5845070422535211, "small_model_usage": 0.4154929577464789, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00037933415492957736, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.5845070422535211, "avg_cost": 0.0003950454225352112}, "random_baseline": {"accuracy": 0.7330985915492958, "accuracy_std": 0.02021507055550327, "avg_cost": 0.0003789624647887321, "cost_std": 1.797095351350298e-05, "small_model_fraction": 0.4043851829752411}, "unrestricted_random_baseline": {"accuracy": 0.7352112676056338, "accuracy_std": 0.008302694452501112, "avg_cost": 0.0003995684859154925, "cost_std": 1.5055135300624324e-05, "small_model_fraction": 0.37744218845696786}}, "0.2": {"accuracy": 0.7147887323943662, "large_model_usage": 0.4119718309859155, "small_model_usage": 0.5880281690140845, "avg_lambda": 0.5, "avg_cost": 0.0002736228873239434, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.4119718309859155, "avg_cost": 0.00028815809859154904}, "random_baseline": {"accuracy": 0.6982394366197183, "accuracy_std": 0.011683563396504974, "avg_cost": 0.0002719235563380279, "cost_std": 2.0809792516426593e-05, "small_model_fraction": 0.5856676978626456}, "unrestricted_random_baseline": {"accuracy": 0.7109154929577464, "accuracy_std": 0.015624442065023824, "avg_cost": 0.0002965021126760561, "cost_std": 9.746905188663428e-06, "small_model_fraction": 0.5607415030236873}}, "0.25": {"accuracy": 0.6690140845070423, "large_model_usage": 0.28169014084507044, "small_model_usage": 0.7183098591549295, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00019435528169014078, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.28169014084507044, "avg_cost": 0.00020743978873239427}, "random_baseline": {"accuracy": 0.6704225352112676, "accuracy_std": 0.012318912453898507, "avg_cost": 0.0001895919014084506, "cost_std": 1.033575028632915e-05, "small_model_fraction": 0.7216024115774292}, "unrestricted_random_baseline": {"accuracy": 0.6764084507042254, "accuracy_std": 0.013495611194885052, "avg_cost": 0.00020754383802816886, "cost_std": 1.5249398452896805e-05, "small_model_fraction": 0.6991640055644522}}}}