{"trial": 6, "trial_seed": 47, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5526315789473685, "large_model_accuracy": 0.8406432748538012, "small_model_cost": 3.234736842105265e-05, "large_model_cost": 0.0006228888888888894, "hybrid_results": {"0.05": {"accuracy": 0.8309859154929577, "large_model_usage": 0.8169014084507042, "small_model_usage": 0.18309859154929575, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005453827464788727, "unrestricted": {"accuracy": 0.8274647887323944, "large_model_usage": 0.8169014084507042, "avg_cost": 0.0005458545774647881}, "random_baseline": {"accuracy": 0.797887323943662, "accuracy_std": 0.010586828435473894, "avg_cost": 0.0005466984154929579, "cost_std": 1.4855300631761026e-05, "small_model_fraction": 0.1312458814896115}, "unrestricted_random_baseline": {"accuracy": 0.8014084507042254, "accuracy_std": 0.0102294641171366, "avg_cost": 0.0005503690140845071, "cost_std": 1.4458575881103222e-05, "small_model_fraction": 0.13044690128320427}}, "0.1": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6584507042253521, "small_model_usage": 0.3415492957746479, "avg_lambda": 0.75, "avg_cost": 0.00042905176056337965, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.6584507042253521, "avg_cost": 0.0004455728873239431}, "random_baseline": {"accuracy": 0.7524647887323944, "accuracy_std": 0.010569247197044798, "avg_cost": 0.0004327895422535211, "cost_std": 1.5080470881730778e-05, "small_model_fraction": 0.32823624014099595}, "unrestricted_random_baseline": {"accuracy": 0.7549295774647888, "accuracy_std": 0.020663944719889003, "avg_cost": 0.00044585753521126753, "cost_std": 1.7523034940502878e-05, "small_model_fraction": 0.3002600078390316}}, "0.15": {"accuracy": 0.7183098591549296, "large_model_usage": 0.45774647887323944, "small_model_usage": 0.5422535211267605, "avg_lambda": 0.540000000000001, "avg_cost": 0.0003107207746478872, "unrestricted": {"accuracy": 0.7464788732394366, "large_model_usage": 0.45774647887323944, "avg_cost": 0.0003214531690140843}, "random_baseline": {"accuracy": 0.6855633802816901, "accuracy_std": 0.017679422831877737, "avg_cost": 0.0003105545422535211, "cost_std": 1.3003417440891647e-05, "small_model_fraction": 0.52861332086133}, "unrestricted_random_baseline": {"accuracy": 0.7119718309859155, "accuracy_std": 0.022754217494934545, "avg_cost": 0.00033131816901408445, "cost_std": 1.985045233240363e-05, "small_model_fraction": 0.5104395024349901}}, "0.2": {"accuracy": 0.6901408450704225, "large_model_usage": 0.3732394366197183, "small_model_usage": 0.6267605633802817, "avg_lambda": 0.4200000000000014, "avg_cost": 0.0002537911971830987, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.3732394366197183, "avg_cost": 0.000268150352112676}, "random_baseline": {"accuracy": 0.6697183098591549, "accuracy_std": 0.012870187945420452, "avg_cost": 0.00026016584507042263, "cost_std": 1.3653738890768403e-05, "small_model_fraction": 0.6250156490493427}, "unrestricted_random_baseline": {"accuracy": 0.6640845070422535, "accuracy_std": 0.01704749075621156, "avg_cost": 0.00026993880281690134, "cost_std": 1.4717833361112202e-05, "small_model_fraction": 0.6007004156036033}}, "0.25": {"accuracy": 0.6408450704225352, "large_model_usage": 0.31338028169014087, "small_model_usage": 0.6866197183098591, "avg_lambda": 0.3199999999999981, "avg_cost": 0.00021055880281690146, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.31338028169014087, "avg_cost": 0.0002301010563380282}, "random_baseline": {"accuracy": 0.6415492957746479, "accuracy_std": 0.01800945331344284, "avg_cost": 0.0002074379577464789, "cost_std": 1.8403377820668604e-05, "small_model_fraction": 0.6982237010961282}, "unrestricted_random_baseline": {"accuracy": 0.6426056338028171, "accuracy_std": 0.016905075893180018, "avg_cost": 0.00022441848591549292, "cost_std": 1.683057861140886e-05, "small_model_fraction": 0.6651316104576155}}}}