{"subject": "TruthfulQA", "method": "standard", "iterations": 30, "random_seed": 42, "calibration_size": 400, "total_examples": 684, "small_model": {"avg_accuracy": 0.55906432748538, "std_accuracy": 0.014554434400145482, "avg_cost": 3.2234697855750505e-05, "std_cost": 4.3365578686349286e-08, "x_position": 1.0, "x_std": 0.0}, "large_model": {"avg_accuracy": 0.8328460038986353, "std_accuracy": 0.011257204458464646, "avg_cost": 0.0006199863547758292, "std_cost": 1.2602785420911144e-06, "x_position": 0.0, "x_std": 0.0}, "hybrid_models": {"0.05": {"avg_accuracy": 0.8058685446009388, "std_accuracy": 0.01683116598144917, "avg_large_model_usage": 0.7892018779342721, "std_large_model_usage": 0.0558127902647999, "avg_lambda": 0.8670000000000023, "std_lambda": 0.04034435111222715, "avg_cost": 0.0005211390258215963, "std_cost": 3.459403054059382e-05}, "0.1": {"avg_accuracy": 0.7661971830985915, "std_accuracy": 0.017359785816417186, "avg_large_model_usage": 0.621244131455399, "std_large_model_usage": 0.043484450312065, "avg_lambda": 0.7063333333333313, "std_lambda": 0.03124988888869116, "avg_cost": 0.00040702940140845064, "std_cost": 2.5840267994118223e-05}, "0.15": {"avg_accuracy": 0.7138497652582159, "std_accuracy": 0.023880289881757787, "avg_large_model_usage": 0.4484741784037558, "std_large_model_usage": 0.05743851331687026, "avg_lambda": 0.529333333333334, "std_lambda": 0.05847696602556884, "avg_cost": 0.00030176461267605626, "std_cost": 3.503246430825848e-05}, "0.2": {"avg_accuracy": 0.6715962441314555, "std_accuracy": 0.025128337740068743, "avg_large_model_usage": 0.34295774647887334, "std_large_model_usage": 0.05286070171331956, "avg_lambda": 0.40300000000000075, "std_lambda": 0.05797413216254328, "avg_cost": 0.00023388691314553989, "std_cost": 3.3088615540738334e-05}, "0.25": {"avg_accuracy": 0.6205399061032864, "std_accuracy": 0.02535347109906104, "avg_large_model_usage": 0.27687793427230045, "std_large_model_usage": 0.03933045877620249, "avg_lambda": 0.276666666666667, "std_lambda": 0.067396010432534, "avg_cost": 0.00018783808685446, "std_cost": 2.4479106559077338e-05}}, "all_trials": [{"trial": 1, "trial_seed": 42, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5380116959064327, "large_model_accuracy": 0.8157894736842105, "small_model_cost": 3.2259795321637416e-05, "large_model_cost": 0.0006210087719298245, "hybrid_results": {"0.05": {"accuracy": 0.7852112676056338, "large_model_usage": 0.8133802816901409, "small_model_usage": 0.18661971830985913, "avg_lambda": 0.8600000000000062, "avg_cost": 0.0005305577464788734, "unrestricted": {"accuracy": 0.8098591549295775, "large_model_usage": 0.8133802816901409, "avg_cost": 0.0005396633802816903}, "random_baseline": {"accuracy": 0.7693661971830986, "accuracy_std": 0.012523534678717405, "avg_cost": 0.0005297964436619714, "cost_std": 7.981958493243993e-06, "small_model_fraction": 0.15363258204208538}, "unrestricted_random_baseline": {"accuracy": 0.7806338028169015, "accuracy_std": 0.010569247197044834, "avg_cost": 0.0005430809154929573, "cost_std": 7.125946229849069e-06, "small_model_fraction": 0.1381665104825645}}, "0.1": {"accuracy": 0.7640845070422535, "large_model_usage": 0.6584507042253521, "small_model_usage": 0.3415492957746479, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0004393042253521124, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6584507042253521, "avg_cost": 0.0004437478873239434}, "random_baseline": {"accuracy": 0.7292253521126761, "accuracy_std": 0.01367811591517958, "avg_cost": 0.0004350170774647888, "cost_std": 1.916986647203602e-05, "small_model_fraction": 0.308628216433634}, "unrestricted_random_baseline": {"accuracy": 0.7433098591549296, "accuracy_std": 0.015303742121197677, "avg_cost": 0.00044375637323943666, "cost_std": 9.062408789121625e-06, "small_model_fraction": 0.30108058213041844}}, "0.15": {"accuracy": 0.7535211267605634, "large_model_usage": 0.647887323943662, "small_model_usage": 0.352112676056338, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004182408450704223, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.647887323943662, "avg_cost": 0.00043723380281690115}, "random_baseline": {"accuracy": 0.7274647887323944, "accuracy_std": 0.011045343057294437, "avg_cost": 0.0004111095422535212, "cost_std": 9.248633912981306e-06, "small_model_fraction": 0.3444047207140105}, "unrestricted_random_baseline": {"accuracy": 0.7316901408450704, "accuracy_std": 0.015492957746478887, "avg_cost": 0.00043899091549295786, "cost_std": 1.8745470254133388e-05, "small_model_fraction": 0.3121448637951955}}, "0.2": {"accuracy": 0.6901408450704225, "large_model_usage": 0.47183098591549294, "small_model_usage": 0.528169014084507, "avg_lambda": 0.5800000000000017, "avg_cost": 0.000318501408450704, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.47183098591549294, "avg_cost": 0.00032755774647887304}, "random_baseline": {"accuracy": 0.681338028169014, "accuracy_std": 0.016155027812611897, "avg_cost": 0.00030770605633802836, "cost_std": 1.5092381333165832e-05, "small_model_fraction": 0.5138138247337275}, "unrestricted_random_baseline": {"accuracy": 0.6922535211267605, "accuracy_std": 0.01630399563773271, "avg_cost": 0.0003230475704225353, "cost_std": 1.353362298452764e-05, "small_model_fraction": 0.4984314828732914}}, "0.25": {"accuracy": 0.6549295774647887, "large_model_usage": 0.36619718309859156, "small_model_usage": 0.6338028169014085, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00024829014084507044, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.36619718309859156, "avg_cost": 0.0002615225352112676}, "random_baseline": {"accuracy": 0.6683098591549296, "accuracy_std": 0.01237915199383586, "avg_cost": 0.00024946338028169015, "cost_std": 1.469407337931493e-05, "small_model_fraction": 0.6330688390016491}, "unrestricted_random_baseline": {"accuracy": 0.6732394366197183, "accuracy_std": 0.008879943811914419, "avg_cost": 0.0002683820422535212, "cost_std": 2.5443747255951056e-05, "small_model_fraction": 0.6105933954901721}}}}, {"trial": 2, "trial_seed": 43, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5687134502923976, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.22454678362573e-05, "large_model_cost": 0.000621143274853802, "hybrid_results": {"0.05": {"accuracy": 0.795774647887324, "large_model_usage": 0.7112676056338029, "small_model_usage": 0.28873239436619713, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004714376760563377, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.7112676056338029, "avg_cost": 0.00047642359154929535}, "random_baseline": {"accuracy": 0.7735915492957746, "accuracy_std": 0.013911793144463665, "avg_cost": 0.00048102267605633814, "cost_std": 1.619393366762457e-05, "small_model_fraction": 0.25421320475897824}, "unrestricted_random_baseline": {"accuracy": 0.7661971830985915, "accuracy_std": 0.013472624274442933, "avg_cost": 0.0004779786971830986, "cost_std": 1.5087024963156054e-05, "small_model_fraction": 0.24574668402559552}}, "0.1": {"accuracy": 0.7535211267605634, "large_model_usage": 0.6161971830985915, "small_model_usage": 0.3838028169014085, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004035503521126758, "unrestricted": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6161971830985915, "avg_cost": 0.0004186911971830983}, "random_baseline": {"accuracy": 0.7355633802816901, "accuracy_std": 0.01522251149381497, "avg_cost": 0.00040199380281690146, "cost_std": 1.4154145955465642e-05, "small_model_fraction": 0.3694918203943042}, "unrestricted_random_baseline": {"accuracy": 0.7394366197183099, "accuracy_std": 0.015428804436765224, "avg_cost": 0.00041703334507042257, "cost_std": 1.9815446230651225e-05, "small_model_fraction": 0.34378134076609357}}, "0.15": {"accuracy": 0.6971830985915493, "large_model_usage": 0.426056338028169, "small_model_usage": 0.573943661971831, "avg_lambda": 0.5100000000000009, "avg_cost": 0.00028815598591549275, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.426056338028169, "avg_cost": 0.0003002123239436618}, "random_baseline": {"accuracy": 0.6890845070422535, "accuracy_std": 0.01021126760563379, "avg_cost": 0.0002899734859154929, "cost_std": 1.658211626711447e-05, "small_model_fraction": 0.5654415502491229}, "unrestricted_random_baseline": {"accuracy": 0.6862676056338028, "accuracy_std": 0.0156244420650238, "avg_cost": 0.0003133661267605633, "cost_std": 2.1444369730797858e-05, "small_model_fraction": 0.5449688334474965}}, "0.2": {"accuracy": 0.6549295774647887, "large_model_usage": 0.29225352112676056, "small_model_usage": 0.7077464788732395, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00020390246478873232, "unrestricted": {"accuracy": 0.6830985915492958, "large_model_usage": 0.29225352112676056, "avg_cost": 0.0002158390845070422}, "random_baseline": {"accuracy": 0.651056338028169, "accuracy_std": 0.011295080938271449, "avg_cost": 0.00020326105633802815, "cost_std": 1.025649246700107e-05, "small_model_fraction": 0.7085114006081519}, "unrestricted_random_baseline": {"accuracy": 0.6566901408450705, "accuracy_std": 0.009610805678842368, "avg_cost": 0.00022069482394366193, "cost_std": 1.7569125976076112e-05, "small_model_fraction": 0.6882419759710275}}, "0.25": {"accuracy": 0.6232394366197183, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00017446584507042244, "unrestricted": {"accuracy": 0.6690140845070423, "large_model_usage": 0.25704225352112675, "avg_cost": 0.00019397992957746472}, "random_baseline": {"accuracy": 0.6415492957746479, "accuracy_std": 0.00993432111244077, "avg_cost": 0.00018012140845070419, "cost_std": 1.1838749183956343e-05, "small_model_fraction": 0.7584973563504406}, "unrestricted_random_baseline": {"accuracy": 0.6359154929577465, "accuracy_std": 0.01137710874746727, "avg_cost": 0.00019050922535211262, "cost_std": 1.2365391923515368e-05, "small_model_fraction": 0.7253607335365931}}}}, {"trial": 3, "trial_seed": 44, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5716374269005848, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.227251461988303e-05, "large_model_cost": 0.0006202105263157896, "hybrid_results": {"0.05": {"accuracy": 0.8204225352112676, "large_model_usage": 0.7922535211267606, "small_model_usage": 0.20774647887323938, "avg_lambda": 0.900000000000004, "avg_cost": 0.000527711267605634, "unrestricted": {"accuracy": 0.8204225352112676, "large_model_usage": 0.7922535211267606, "avg_cost": 0.0005281830985915495}, "random_baseline": {"accuracy": 0.7862676056338028, "accuracy_std": 0.014776151590392477, "avg_cost": 0.0005276022887323942, "cost_std": 1.0995298360019024e-05, "small_model_fraction": 0.15732825037684095}, "unrestricted_random_baseline": {"accuracy": 0.7904929577464788, "accuracy_std": 0.010709088961792674, "avg_cost": 0.0005279092957746477, "cost_std": 1.2602227004624905e-05, "small_model_fraction": 0.15652573212401605}}, "0.1": {"accuracy": 0.7640845070422535, "large_model_usage": 0.6302816901408451, "small_model_usage": 0.3697183098591549, "avg_lambda": 0.7299999999999969, "avg_cost": 0.00040919014084507023, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6302816901408451, "avg_cost": 0.00042655633802816907}, "random_baseline": {"accuracy": 0.7154929577464788, "accuracy_std": 0.016500527484309854, "avg_cost": 0.000407182288732394, "cost_std": 2.0435311568781165e-05, "small_model_fraction": 0.3589160443326862}, "unrestricted_random_baseline": {"accuracy": 0.7246478873239437, "accuracy_std": 0.014838948947679936, "avg_cost": 0.0004359585915492955, "cost_std": 1.2889834277170512e-05, "small_model_fraction": 0.3293785814749845}}, "0.15": {"accuracy": 0.7147887323943662, "large_model_usage": 0.4507042253521127, "small_model_usage": 0.5492957746478873, "avg_lambda": 0.5800000000000017, "avg_cost": 0.0003069788732394366, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.4507042253521127, "avg_cost": 0.00031552816901408444}, "random_baseline": {"accuracy": 0.6802816901408452, "accuracy_std": 0.016273549307452593, "avg_cost": 0.00032023116197183084, "cost_std": 2.178707534091208e-05, "small_model_fraction": 0.5327630580864071}, "unrestricted_random_baseline": {"accuracy": 0.6785211267605633, "accuracy_std": 0.013273293538711787, "avg_cost": 0.0003173991901408449, "cost_std": 1.4168019025039416e-05, "small_model_fraction": 0.5182219064605964}}, "0.2": {"accuracy": 0.676056338028169, "large_model_usage": 0.36971830985915494, "small_model_usage": 0.630281690140845, "avg_lambda": 0.45999999999999747, "avg_cost": 0.0002524225352112676, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.36971830985915494, "avg_cost": 0.000264556338028169}, "random_baseline": {"accuracy": 0.6404929577464789, "accuracy_std": 0.024670512754370932, "avg_cost": 0.0002475817253521127, "cost_std": 8.412493692432899e-06, "small_model_fraction": 0.6255557283048224}, "unrestricted_random_baseline": {"accuracy": 0.6679577464788733, "accuracy_std": 0.015352274194782826, "avg_cost": 0.00027735982394366186, "cost_std": 1.4563703420191477e-05, "small_model_fraction": 0.6049178335344172}}, "0.25": {"accuracy": 0.6232394366197183, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00019813380281690144, "unrestricted": {"accuracy": 0.6795774647887324, "large_model_usage": 0.2887323943661972, "avg_cost": 0.0002137253521126761}, "random_baseline": {"accuracy": 0.6059859154929577, "accuracy_std": 0.014470931400024305, "avg_cost": 0.00019087947183098588, "cost_std": 1.1760263134028971e-05, "small_model_fraction": 0.7178932389171577}, "unrestricted_random_baseline": {"accuracy": 0.6320422535211268, "accuracy_std": 0.01638364806215755, "avg_cost": 0.0002303455985915493, "cost_std": 1.616352294272832e-05, "small_model_fraction": 0.6913742029208274}}}}, {"trial": 4, "trial_seed": 45, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.533625730994152, "large_model_accuracy": 0.8362573099415205, "small_model_cost": 3.225190058479537e-05, "large_model_cost": 0.0006206286549707611, "hybrid_results": {"0.05": {"accuracy": 0.8028169014084507, "large_model_usage": 0.7323943661971831, "small_model_usage": 0.2676056338028169, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004876271126760561, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.7323943661971831, "avg_cost": 0.0004915848591549295}, "random_baseline": {"accuracy": 0.7676056338028169, "accuracy_std": 0.014259476571349692, "avg_cost": 0.0004898345774647888, "cost_std": 1.8359511514011837e-05, "small_model_fraction": 0.22604826125992417}, "unrestricted_random_baseline": {"accuracy": 0.7760563380281691, "accuracy_std": 0.014363435249557157, "avg_cost": 0.0004966263380281692, "cost_std": 9.70912462286245e-06, "small_model_fraction": 0.21932170986344063}}, "0.1": {"accuracy": 0.7852112676056338, "large_model_usage": 0.6514084507042254, "small_model_usage": 0.3485915492957746, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00042386654929577446, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.6514084507042254, "avg_cost": 0.00044017640845070405}, "random_baseline": {"accuracy": 0.7369718309859155, "accuracy_std": 0.014522246920634686, "avg_cost": 0.0004229308450704224, "cost_std": 2.1133945554168816e-05, "small_model_fraction": 0.33441515866874966}, "unrestricted_random_baseline": {"accuracy": 0.7461267605633802, "accuracy_std": 0.01302816901408449, "avg_cost": 0.0004515406338028168, "cost_std": 1.8287807898422887e-05, "small_model_fraction": 0.3066950643017472}}, "0.15": {"accuracy": 0.721830985915493, "large_model_usage": 0.4471830985915493, "small_model_usage": 0.5528169014084507, "avg_lambda": 0.5, "avg_cost": 0.00029992992957746464, "unrestricted": {"accuracy": 0.7605633802816901, "large_model_usage": 0.4471830985915493, "avg_cost": 0.0003126341549295774}, "random_baseline": {"accuracy": 0.6830985915492958, "accuracy_std": 0.01991850087849429, "avg_cost": 0.0003074378169014082, "cost_std": 1.9443342237373928e-05, "small_model_fraction": 0.5450567565810448}, "unrestricted_random_baseline": {"accuracy": 0.682394366197183, "accuracy_std": 0.01701837461069658, "avg_cost": 0.00031005225352112646, "cost_std": 1.6198809478358878e-05, "small_model_fraction": 0.5234647659773864}}, "0.2": {"accuracy": 0.6725352112676056, "large_model_usage": 0.3626760563380282, "small_model_usage": 0.6373239436619718, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00024397922535211256, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.3626760563380282, "avg_cost": 0.00025916232394366173}, "random_baseline": {"accuracy": 0.647887323943662, "accuracy_std": 0.018896349105632017, "avg_cost": 0.0002485592957746477, "cost_std": 2.076983699380555e-05, "small_model_fraction": 0.6401500854868453}, "unrestricted_random_baseline": {"accuracy": 0.651056338028169, "accuracy_std": 0.01699285705917416, "avg_cost": 0.0002636858450704222, "cost_std": 1.3459969909597391e-05, "small_model_fraction": 0.6143450235458882}}, "0.25": {"accuracy": 0.6232394366197183, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00019450739436619718, "unrestricted": {"accuracy": 0.6901408450704225, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00021258485915492966}, "random_baseline": {"accuracy": 0.6250000000000001, "accuracy_std": 0.01790242791392419, "avg_cost": 0.00020031309859154926, "cost_std": 1.5308183984243713e-05, "small_model_fraction": 0.7242319779428866}, "unrestricted_random_baseline": {"accuracy": 0.6369718309859154, "accuracy_std": 0.0192506153983615, "avg_cost": 0.00021748524647887318, "cost_std": 1.8793272930235423e-05, "small_model_fraction": 0.6935076764575259}}}}, {"trial": 5, "trial_seed": 46, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5730994152046783, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.223815789473688e-05, "large_model_cost": 0.0006194181286549714, "hybrid_results": {"0.05": {"accuracy": 0.8274647887323944, "large_model_usage": 0.6795774647887324, "small_model_usage": 0.3204225352112676, "avg_lambda": 0.8000000000000037, "avg_cost": 0.00045489401408450685, "unrestricted": {"accuracy": 0.823943661971831, "large_model_usage": 0.6795774647887324, "avg_cost": 0.00045819683098591535}, "random_baseline": {"accuracy": 0.7806338028169014, "accuracy_std": 0.014776151590392481, "avg_cost": 0.0004513943309859154, "cost_std": 1.5272400396873303e-05, "small_model_fraction": 0.28019367615256296}, "unrestricted_random_baseline": {"accuracy": 0.781338028169014, "accuracy_std": 0.01763729547375804, "avg_cost": 0.00045483996478873237, "cost_std": 1.4719229174108148e-05, "small_model_fraction": 0.27456879610576523}}, "0.1": {"accuracy": 0.8063380281690141, "large_model_usage": 0.6267605633802817, "small_model_usage": 0.37323943661971826, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00040721091549295773, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.6267605633802817, "avg_cost": 0.00042503485915492946}, "random_baseline": {"accuracy": 0.7623239436619718, "accuracy_std": 0.017123685092658908, "avg_cost": 0.00042181996478873245, "cost_std": 9.420449239157187e-06, "small_model_fraction": 0.3614006330755193}, "unrestricted_random_baseline": {"accuracy": 0.7672535211267606, "accuracy_std": 0.01096083268751645, "avg_cost": 0.00043352901408450704, "cost_std": 9.805438965790524e-06, "small_model_fraction": 0.33104547017904873}}, "0.15": {"accuracy": 0.75, "large_model_usage": 0.4225352112676056, "small_model_usage": 0.5774647887323944, "avg_lambda": 0.5, "avg_cost": 0.00028540105633802817, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.4225352112676056, "avg_cost": 0.0002986545774647887}, "random_baseline": {"accuracy": 0.7052816901408451, "accuracy_std": 0.012894248889859577, "avg_cost": 0.00028027767605633803, "cost_std": 1.8089703713997775e-05, "small_model_fraction": 0.5688495673387567}, "unrestricted_random_baseline": {"accuracy": 0.7109154929577465, "accuracy_std": 0.013495611194885102, "avg_cost": 0.0003001230633802817, "cost_std": 1.3965806739731885e-05, "small_model_fraction": 0.5462780870656798}}, "0.2": {"accuracy": 0.6971830985915493, "large_model_usage": 0.323943661971831, "small_model_usage": 0.676056338028169, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00022223204225352107, "unrestricted": {"accuracy": 0.7535211267605634, "large_model_usage": 0.323943661971831, "avg_cost": 0.0002354644366197183}, "random_baseline": {"accuracy": 0.6838028169014084, "accuracy_std": 0.012074949436255087, "avg_cost": 0.0002303851056338028, "cost_std": 1.5487079438649265e-05, "small_model_fraction": 0.6764298957391292}, "unrestricted_random_baseline": {"accuracy": 0.677112676056338, "accuracy_std": 0.02071488180293341, "avg_cost": 0.00023452848591549292, "cost_std": 1.0993920148804855e-05, "small_model_fraction": 0.6538943955089953}}, "0.25": {"accuracy": 0.6514084507042254, "large_model_usage": 0.2676056338028169, "small_model_usage": 0.7323943661971831, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00018283767605633807, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.2676056338028169, "avg_cost": 0.0001999151408450705}, "random_baseline": {"accuracy": 0.6658450704225352, "accuracy_std": 0.01254331910602595, "avg_cost": 0.00018004077464788725, "cost_std": 1.4422334068765449e-05, "small_model_fraction": 0.7435206824806766}, "unrestricted_random_baseline": {"accuracy": 0.6647887323943662, "accuracy_std": 0.007998462458873614, "avg_cost": 0.00019779299295774635, "cost_std": 1.2569989265869246e-05, "small_model_fraction": 0.7144368144348681}}}}, {"trial": 6, "trial_seed": 47, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5526315789473685, "large_model_accuracy": 0.8406432748538012, "small_model_cost": 3.234736842105265e-05, "large_model_cost": 0.0006228888888888894, "hybrid_results": {"0.05": {"accuracy": 0.8309859154929577, "large_model_usage": 0.8169014084507042, "small_model_usage": 0.18309859154929575, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005453827464788727, "unrestricted": {"accuracy": 0.8274647887323944, "large_model_usage": 0.8169014084507042, "avg_cost": 0.0005458545774647881}, "random_baseline": {"accuracy": 0.797887323943662, "accuracy_std": 0.010586828435473894, "avg_cost": 0.0005466984154929579, "cost_std": 1.4855300631761026e-05, "small_model_fraction": 0.1312458814896115}, "unrestricted_random_baseline": {"accuracy": 0.8014084507042254, "accuracy_std": 0.0102294641171366, "avg_cost": 0.0005503690140845071, "cost_std": 1.4458575881103222e-05, "small_model_fraction": 0.13044690128320427}}, "0.1": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6584507042253521, "small_model_usage": 0.3415492957746479, "avg_lambda": 0.75, "avg_cost": 0.00042905176056337965, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.6584507042253521, "avg_cost": 0.0004455728873239431}, "random_baseline": {"accuracy": 0.7524647887323944, "accuracy_std": 0.010569247197044798, "avg_cost": 0.0004327895422535211, "cost_std": 1.5080470881730778e-05, "small_model_fraction": 0.32823624014099595}, "unrestricted_random_baseline": {"accuracy": 0.7549295774647888, "accuracy_std": 0.020663944719889003, "avg_cost": 0.00044585753521126753, "cost_std": 1.7523034940502878e-05, "small_model_fraction": 0.3002600078390316}}, "0.15": {"accuracy": 0.7183098591549296, "large_model_usage": 0.45774647887323944, "small_model_usage": 0.5422535211267605, "avg_lambda": 0.540000000000001, "avg_cost": 0.0003107207746478872, "unrestricted": {"accuracy": 0.7464788732394366, "large_model_usage": 0.45774647887323944, "avg_cost": 0.0003214531690140843}, "random_baseline": {"accuracy": 0.6855633802816901, "accuracy_std": 0.017679422831877737, "avg_cost": 0.0003105545422535211, "cost_std": 1.3003417440891647e-05, "small_model_fraction": 0.52861332086133}, "unrestricted_random_baseline": {"accuracy": 0.7119718309859155, "accuracy_std": 0.022754217494934545, "avg_cost": 0.00033131816901408445, "cost_std": 1.985045233240363e-05, "small_model_fraction": 0.5104395024349901}}, "0.2": {"accuracy": 0.6901408450704225, "large_model_usage": 0.3732394366197183, "small_model_usage": 0.6267605633802817, "avg_lambda": 0.4200000000000014, "avg_cost": 0.0002537911971830987, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.3732394366197183, "avg_cost": 0.000268150352112676}, "random_baseline": {"accuracy": 0.6697183098591549, "accuracy_std": 0.012870187945420452, "avg_cost": 0.00026016584507042263, "cost_std": 1.3653738890768403e-05, "small_model_fraction": 0.6250156490493427}, "unrestricted_random_baseline": {"accuracy": 0.6640845070422535, "accuracy_std": 0.01704749075621156, "avg_cost": 0.00026993880281690134, "cost_std": 1.4717833361112202e-05, "small_model_fraction": 0.6007004156036033}}, "0.25": {"accuracy": 0.6408450704225352, "large_model_usage": 0.31338028169014087, "small_model_usage": 0.6866197183098591, "avg_lambda": 0.3199999999999981, "avg_cost": 0.00021055880281690146, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.31338028169014087, "avg_cost": 0.0002301010563380282}, "random_baseline": {"accuracy": 0.6415492957746479, "accuracy_std": 0.01800945331344284, "avg_cost": 0.0002074379577464789, "cost_std": 1.8403377820668604e-05, "small_model_fraction": 0.6982237010961282}, "unrestricted_random_baseline": {"accuracy": 0.6426056338028171, "accuracy_std": 0.016905075893180018, "avg_cost": 0.00022441848591549292, "cost_std": 1.683057861140886e-05, "small_model_fraction": 0.6651316104576155}}}}, {"trial": 7, "trial_seed": 48, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5350877192982456, "large_model_accuracy": 0.8260233918128655, "small_model_cost": 3.2208040935672557e-05, "large_model_cost": 0.000620862573099416, "hybrid_results": {"0.05": {"accuracy": 0.8204225352112676, "large_model_usage": 0.8098591549295775, "small_model_usage": 0.1901408450704225, "avg_lambda": 0.879999999999997, "avg_cost": 0.0005330313380281685, "unrestricted": {"accuracy": 0.8274647887323944, "large_model_usage": 0.8098591549295775, "avg_cost": 0.0005402214788732389}, "random_baseline": {"accuracy": 0.7926056338028169, "accuracy_std": 0.012641776810574842, "avg_cost": 0.0005408788732394361, "cost_std": 7.742439958604492e-06, "small_model_fraction": 0.14920675926574858}, "unrestricted_random_baseline": {"accuracy": 0.7926056338028169, "accuracy_std": 0.010138154964005643, "avg_cost": 0.000545339154929577, "cost_std": 1.1594404984048959e-05, "small_model_fraction": 0.13699222518471244}}, "0.1": {"accuracy": 0.7746478873239436, "large_model_usage": 0.647887323943662, "small_model_usage": 0.352112676056338, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004208693661971829, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.647887323943662, "avg_cost": 0.0004386933098591548}, "random_baseline": {"accuracy": 0.7359154929577466, "accuracy_std": 0.013454206460945615, "avg_cost": 0.00042454591549295734, "cost_std": 1.9463818489505667e-05, "small_model_fraction": 0.339746312947102}, "unrestricted_random_baseline": {"accuracy": 0.7475352112676056, "accuracy_std": 0.01823182178309803, "avg_cost": 0.0004381151760563377, "cost_std": 1.6969285234710373e-05, "small_model_fraction": 0.3094671887951896}}, "0.15": {"accuracy": 0.704225352112676, "large_model_usage": 0.45774647887323944, "small_model_usage": 0.5422535211267605, "avg_lambda": 0.5500000000000005, "avg_cost": 0.0003088341549295774, "unrestricted": {"accuracy": 0.7394366197183099, "large_model_usage": 0.45774647887323944, "avg_cost": 0.0003196651408450703}, "random_baseline": {"accuracy": 0.6943661971830987, "accuracy_std": 0.017871235972464115, "avg_cost": 0.00031316584507042235, "cost_std": 1.8989880964655142e-05, "small_model_fraction": 0.5300705271441672}, "unrestricted_random_baseline": {"accuracy": 0.7014084507042253, "accuracy_std": 0.013709804460515356, "avg_cost": 0.00032255859154929563, "cost_std": 1.5334382976164063e-05, "small_model_fraction": 0.5116709645421755}}, "0.2": {"accuracy": 0.676056338028169, "large_model_usage": 0.352112676056338, "small_model_usage": 0.647887323943662, "avg_lambda": 0.450000000000002, "avg_cost": 0.00024216514084507038, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.352112676056338, "avg_cost": 0.000253975}, "random_baseline": {"accuracy": 0.6633802816901408, "accuracy_std": 0.01916994026593828, "avg_cost": 0.0002400174647887323, "cost_std": 1.8968116381305494e-05, "small_model_fraction": 0.6433271325752827}, "unrestricted_random_baseline": {"accuracy": 0.6732394366197183, "accuracy_std": 0.015332071166955783, "avg_cost": 0.000262375704225352, "cost_std": 1.1596243050288907e-05, "small_model_fraction": 0.6232646706223958}}, "0.25": {"accuracy": 0.6514084507042254, "large_model_usage": 0.2852112676056338, "small_model_usage": 0.7147887323943662, "avg_lambda": 0.3199999999999981, "avg_cost": 0.00019612288732394356, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.2852112676056338, "avg_cost": 0.00021141161971830976}, "random_baseline": {"accuracy": 0.642605633802817, "accuracy_std": 0.013384913949822686, "avg_cost": 0.00019158218309859153, "cost_std": 1.0106049894756217e-05, "small_model_fraction": 0.7215432185907718}, "unrestricted_random_baseline": {"accuracy": 0.655281690140845, "accuracy_std": 0.0126417768105748, "avg_cost": 0.00021127489436619716, "cost_std": 1.5191501640884214e-05, "small_model_fraction": 0.6955708841246313}}}}, {"trial": 8, "trial_seed": 49, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5628654970760234, "large_model_accuracy": 0.8508771929824561, "small_model_cost": 3.223859649122807e-05, "large_model_cost": 0.0006186198830409359, "hybrid_results": {"0.05": {"accuracy": 0.8133802816901409, "large_model_usage": 0.8133802816901409, "small_model_usage": 0.18661971830985913, "avg_lambda": 0.879999999999997, "avg_cost": 0.0005299915492957748, "unrestricted": {"accuracy": 0.8380281690140845, "large_model_usage": 0.8133802816901409, "avg_cost": 0.000540991549295775}, "random_baseline": {"accuracy": 0.8200704225352112, "accuracy_std": 0.01161971830985914, "avg_cost": 0.0005349486971830987, "cost_std": 7.800035155634275e-06, "small_model_fraction": 0.15114454669359234}, "unrestricted_random_baseline": {"accuracy": 0.8313380281690141, "accuracy_std": 0.014298549650878116, "avg_cost": 0.0005365517605633804, "cost_std": 1.409905654493417e-05, "small_model_fraction": 0.13238542144127635}}, "0.1": {"accuracy": 0.7887323943661971, "large_model_usage": 0.6126760563380281, "small_model_usage": 0.38732394366197187, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004031323943661972, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.6126760563380281, "avg_cost": 0.0004173788732394366}, "random_baseline": {"accuracy": 0.7693661971830986, "accuracy_std": 0.018040406278097876, "avg_cost": 0.0004160537323943662, "cost_std": 1.639688466299193e-05, "small_model_fraction": 0.36748698094148974}, "unrestricted_random_baseline": {"accuracy": 0.7732394366197184, "accuracy_std": 0.012318912453898523, "avg_cost": 0.0004213135915492958, "cost_std": 2.135047519421172e-05, "small_model_fraction": 0.34319139170625634}}, "0.15": {"accuracy": 0.7640845070422535, "large_model_usage": 0.5035211267605634, "small_model_usage": 0.4964788732394366, "avg_lambda": 0.6100000000000032, "avg_cost": 0.00033839295774647893, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.5035211267605634, "avg_cost": 0.0003488225352112677}, "random_baseline": {"accuracy": 0.7362676056338029, "accuracy_std": 0.019442869989124655, "avg_cost": 0.00033291221830985916, "cost_std": 1.706919392720086e-05, "small_model_fraction": 0.4778919991518215}, "unrestricted_random_baseline": {"accuracy": 0.7334507042253522, "accuracy_std": 0.022272344508265443, "avg_cost": 0.00034959573943661975, "cost_std": 1.3388363989141646e-05, "small_model_fraction": 0.46010565824357585}}, "0.2": {"accuracy": 0.7359154929577465, "large_model_usage": 0.43309859154929575, "small_model_usage": 0.5669014084507042, "avg_lambda": 0.5, "avg_cost": 0.0002924633802816902, "unrestricted": {"accuracy": 0.7640845070422535, "large_model_usage": 0.43309859154929575, "avg_cost": 0.0003043436619718309}, "random_baseline": {"accuracy": 0.7186619718309859, "accuracy_std": 0.01073221877051282, "avg_cost": 0.000297555, "cost_std": 1.6119857240554643e-05, "small_model_fraction": 0.5562191533743587}, "unrestricted_random_baseline": {"accuracy": 0.7137323943661971, "accuracy_std": 0.013911793144463693, "avg_cost": 0.0003001983450704225, "cost_std": 1.6397405029131886e-05, "small_model_fraction": 0.5359588177145276}}, "0.25": {"accuracy": 0.647887323943662, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.29999999999999843, "avg_cost": 0.0001947591549295774, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00021398450704225342}, "random_baseline": {"accuracy": 0.679225352112676, "accuracy_std": 0.01254331910602595, "avg_cost": 0.00019818246478873233, "cost_std": 1.1226182776050133e-05, "small_model_fraction": 0.7228414989253373}, "unrestricted_random_baseline": {"accuracy": 0.6785211267605634, "accuracy_std": 0.019224836189744618, "avg_cost": 0.000208619190140845, "cost_std": 1.6709108471067122e-05, "small_model_fraction": 0.6900550636251953}}}}, {"trial": 9, "trial_seed": 50, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.2244736842105294e-05, "large_model_cost": 0.0006201461988304101, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.823943661971831, "small_model_usage": 0.176056338028169, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005461609154929574, "unrestricted": {"accuracy": 0.8204225352112676, "large_model_usage": 0.823943661971831, "avg_cost": 0.0005474355633802813}, "random_baseline": {"accuracy": 0.8116197183098592, "accuracy_std": 0.009738955412633346, "avg_cost": 0.0005512129225352117, "cost_std": 1.1004241626045005e-05, "small_model_fraction": 0.125846401346293}, "unrestricted_random_baseline": {"accuracy": 0.8123239436619718, "accuracy_std": 0.011469364418945287, "avg_cost": 0.0005471868661971834, "cost_std": 1.1572445285426857e-05, "small_model_fraction": 0.12367826949131688}}, "0.1": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6267605633802817, "small_model_usage": 0.37323943661971826, "avg_lambda": 0.7100000000000012, "avg_cost": 0.0004117383802816901, "unrestricted": {"accuracy": 0.7887323943661971, "large_model_usage": 0.6267605633802817, "avg_cost": 0.0004237595070422534}, "random_baseline": {"accuracy": 0.7475352112676057, "accuracy_std": 0.015025766403768901, "avg_cost": 0.0004197266549295773, "cost_std": 1.7442925487140622e-05, "small_model_fraction": 0.35449447232853765}, "unrestricted_random_baseline": {"accuracy": 0.7556338028169014, "accuracy_std": 0.013925154882595183, "avg_cost": 0.0004283933450704223, "cost_std": 1.088885332562275e-05, "small_model_fraction": 0.3340469525691763}}, "0.15": {"accuracy": 0.7077464788732394, "large_model_usage": 0.47183098591549294, "small_model_usage": 0.528169014084507, "avg_lambda": 0.5100000000000009, "avg_cost": 0.0003132383802816902, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.47183098591549294, "avg_cost": 0.0003277031690140846}, "random_baseline": {"accuracy": 0.670774647887324, "accuracy_std": 0.018109001226230697, "avg_cost": 0.0003164210211267604, "cost_std": 1.8025290129712546e-05, "small_model_fraction": 0.5220395566133584}, "unrestricted_random_baseline": {"accuracy": 0.6809859154929578, "accuracy_std": 0.014363435249557122, "avg_cost": 0.0003290284154929576, "cost_std": 1.776147899696816e-05, "small_model_fraction": 0.4974354525795397}}, "0.2": {"accuracy": 0.6795774647887324, "large_model_usage": 0.3626760563380282, "small_model_usage": 0.6373239436619718, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002462947183098593, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.3626760563380282, "avg_cost": 0.00025976654929577486}, "random_baseline": {"accuracy": 0.6429577464788732, "accuracy_std": 0.011593012417714323, "avg_cost": 0.00024156366197183093, "cost_std": 1.7797844287371507e-05, "small_model_fraction": 0.6359084042012263}, "unrestricted_random_baseline": {"accuracy": 0.6595070422535212, "accuracy_std": 0.024650402286179054, "avg_cost": 0.000273256514084507, "cost_std": 9.805088105602566e-06, "small_model_fraction": 0.6129932868610629}}, "0.25": {"accuracy": 0.5985915492957746, "large_model_usage": 0.31690140845070425, "small_model_usage": 0.6830985915492958, "avg_lambda": 0.29999999999999843, "avg_cost": 0.00021002711267605637, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.31690140845070425, "avg_cost": 0.00023104823943661978}, "random_baseline": {"accuracy": 0.6274647887323943, "accuracy_std": 0.01730733201048525, "avg_cost": 0.00021121802816901407, "cost_std": 1.2616376165527454e-05, "small_model_fraction": 0.6975983437212685}, "unrestricted_random_baseline": {"accuracy": 0.6355633802816902, "accuracy_std": 0.01769344299493116, "avg_cost": 0.0002334723591549296, "cost_std": 1.7389050599275037e-05, "small_model_fraction": 0.66184213605771}}}}, {"trial": 10, "trial_seed": 51, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5789473684210527, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.2183771929824545e-05, "large_model_cost": 0.0006180497076023403, "hybrid_results": {"0.05": {"accuracy": 0.8309859154929577, "large_model_usage": 0.795774647887324, "small_model_usage": 0.204225352112676, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005266760563380282, "unrestricted": {"accuracy": 0.8309859154929577, "large_model_usage": 0.795774647887324, "avg_cost": 0.0005266760563380282}, "random_baseline": {"accuracy": 0.8028169014084507, "accuracy_std": 0.014938875658870743, "avg_cost": 0.000526892218309859, "cost_std": 1.302150190222221e-05, "small_model_fraction": 0.15596341364244742}, "unrestricted_random_baseline": {"accuracy": 0.8031690140845071, "accuracy_std": 0.011935529053998806, "avg_cost": 0.0005270144014084507, "cost_std": 1.2001748964833724e-05, "small_model_fraction": 0.15596341364244742}}, "0.1": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6126760563380281, "small_model_usage": 0.38732394366197187, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00039558450704225333, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.6126760563380281, "avg_cost": 0.0004136971830985915}, "random_baseline": {"accuracy": 0.7397887323943662, "accuracy_std": 0.013587169987024444, "avg_cost": 0.00039618728873239416, "cost_std": 1.4979867792665785e-05, "small_model_fraction": 0.379720319981941}, "unrestricted_random_baseline": {"accuracy": 0.7426056338028169, "accuracy_std": 0.014298549650878116, "avg_cost": 0.0004078804929577462, "cost_std": 1.4880419553529691e-05, "small_model_fraction": 0.3488042435325625}}, "0.15": {"accuracy": 0.6936619718309859, "large_model_usage": 0.352112676056338, "small_model_usage": 0.647887323943662, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002375915492957747, "unrestricted": {"accuracy": 0.7394366197183099, "large_model_usage": 0.352112676056338, "avg_cost": 0.0002525140845070423}, "random_baseline": {"accuracy": 0.6591549295774649, "accuracy_std": 0.01580981994411525, "avg_cost": 0.0002388635563380282, "cost_std": 1.246047798385932e-05, "small_model_fraction": 0.6493945715922833}, "unrestricted_random_baseline": {"accuracy": 0.6633802816901408, "accuracy_std": 0.015919231768249716, "avg_cost": 0.0002516532042253522, "cost_std": 1.3032250085440947e-05, "small_model_fraction": 0.6239236672391603}}, "0.2": {"accuracy": 0.6549295774647887, "large_model_usage": 0.2992957746478873, "small_model_usage": 0.7007042253521127, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00020076760563380276, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.2992957746478873, "avg_cost": 0.0002187887323943662}, "random_baseline": {"accuracy": 0.6348591549295775, "accuracy_std": 0.01527130171986018, "avg_cost": 0.00019820249999999998, "cost_std": 1.9348603748250008e-05, "small_model_fraction": 0.7122484455245536}, "unrestricted_random_baseline": {"accuracy": 0.65, "accuracy_std": 0.01660538890500225, "avg_cost": 0.0002171272887323944, "cost_std": 1.677348468242188e-05, "small_model_fraction": 0.6814886322920656}}, "0.25": {"accuracy": 0.6161971830985915, "large_model_usage": 0.2711267605633803, "small_model_usage": 0.7288732394366197, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00018062676056338023, "unrestricted": {"accuracy": 0.7077464788732394, "large_model_usage": 0.2711267605633803, "avg_cost": 0.00020113380281690135}, "random_baseline": {"accuracy": 0.6253521126760563, "accuracy_std": 0.01568384327557755, "avg_cost": 0.00018351362676056337, "cost_std": 1.74198085869325e-05, "small_model_fraction": 0.7466263532404254}, "unrestricted_random_baseline": {"accuracy": 0.6535211267605633, "accuracy_std": 0.014276855586417793, "avg_cost": 0.000203363309859155, "cost_std": 1.3985181410780538e-05, "small_model_fraction": 0.7116233926569925}}}}, {"trial": 11, "trial_seed": 52, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5570175438596491, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.225409356725149e-05, "large_model_cost": 0.0006200058479532172, "hybrid_results": {"0.05": {"accuracy": 0.8204225352112676, "large_model_usage": 0.795774647887324, "small_model_usage": 0.204225352112676, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005289771126760562, "unrestricted": {"accuracy": 0.8204225352112676, "large_model_usage": 0.795774647887324, "avg_cost": 0.0005294489436619717}, "random_baseline": {"accuracy": 0.7982394366197184, "accuracy_std": 0.013550620707671752, "avg_cost": 0.0005352524295774647, "cost_std": 1.77151773184849e-05, "small_model_fraction": 0.15487616089255268}, "unrestricted_random_baseline": {"accuracy": 0.7908450704225353, "accuracy_std": 0.01923450744189813, "avg_cost": 0.0005236415845070424, "cost_std": 1.105785096342094e-05, "small_model_fraction": 0.15407338832336087}}, "0.1": {"accuracy": 0.7429577464788732, "large_model_usage": 0.6126760563380281, "small_model_usage": 0.38732394366197187, "avg_lambda": 0.7100000000000012, "avg_cost": 0.0003988080985915492, "unrestricted": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6126760563380281, "avg_cost": 0.0004156249999999998}, "random_baseline": {"accuracy": 0.7169014084507043, "accuracy_std": 0.017829561832636866, "avg_cost": 0.000403003028169014, "cost_std": 1.9076467229129587e-05, "small_model_fraction": 0.3763455365484311}, "unrestricted_random_baseline": {"accuracy": 0.731338028169014, "accuracy_std": 0.017467770077200646, "avg_cost": 0.00041640890845070404, "cost_std": 1.2804072377248817e-05, "small_model_fraction": 0.347733284380814}}, "0.15": {"accuracy": 0.6690140845070423, "large_model_usage": 0.44366197183098594, "small_model_usage": 0.556338028169014, "avg_lambda": 0.5500000000000005, "avg_cost": 0.000300357394366197, "unrestricted": {"accuracy": 0.7077464788732394, "large_model_usage": 0.44366197183098594, "avg_cost": 0.00031026584507042233}, "random_baseline": {"accuracy": 0.6609154929577465, "accuracy_std": 0.01774941290299795, "avg_cost": 0.0003028956690140844, "cost_std": 2.3297274531506043e-05, "small_model_fraction": 0.5438494248663918}, "unrestricted_random_baseline": {"accuracy": 0.665492957746479, "accuracy_std": 0.016515548450082476, "avg_cost": 0.0003106125352112675, "cost_std": 1.4709434896022497e-05, "small_model_fraction": 0.526991200913361}}, "0.2": {"accuracy": 0.6302816901408451, "large_model_usage": 0.3380281690140845, "small_model_usage": 0.6619718309859155, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00022983626760563364, "unrestricted": {"accuracy": 0.6830985915492958, "large_model_usage": 0.3380281690140845, "avg_cost": 0.0002441742957746478}, "random_baseline": {"accuracy": 0.6239436619718309, "accuracy_std": 0.017018374610696597, "avg_cost": 0.0002346591901408449, "cost_std": 2.5645416239914714e-05, "small_model_fraction": 0.663833969760244}, "unrestricted_random_baseline": {"accuracy": 0.6309859154929578, "accuracy_std": 0.016273549307452617, "avg_cost": 0.0002471783098591548, "cost_std": 2.0223050821938725e-05, "small_model_fraction": 0.6394392690009186}}, "0.25": {"accuracy": 0.573943661971831, "large_model_usage": 0.2640845070422535, "small_model_usage": 0.7359154929577465, "avg_lambda": 0.3100000000000012, "avg_cost": 0.000179526408450704, "unrestricted": {"accuracy": 0.6443661971830986, "large_model_usage": 0.2640845070422535, "avg_cost": 0.00019768133802816886}, "random_baseline": {"accuracy": 0.5968309859154929, "accuracy_std": 0.012424139974913612, "avg_cost": 0.00017354517605633796, "cost_std": 1.3060039139420638e-05, "small_model_fraction": 0.7494310926603521}, "unrestricted_random_baseline": {"accuracy": 0.6003521126760564, "accuracy_std": 0.019108399367908632, "avg_cost": 0.00019752661971830977, "cost_std": 1.4301356775630863e-05, "small_model_fraction": 0.7185423212666681}}}}, {"trial": 12, "trial_seed": 53, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5657894736842105, "large_model_accuracy": 0.8435672514619883, "small_model_cost": 3.229239766081873e-05, "large_model_cost": 0.0006212514619883049, "hybrid_results": {"0.05": {"accuracy": 0.823943661971831, "large_model_usage": 0.8485915492957746, "small_model_usage": 0.15140845070422537, "avg_lambda": 0.8600000000000062, "avg_cost": 0.0005508665492957748, "unrestricted": {"accuracy": 0.8345070422535211, "large_model_usage": 0.8485915492957746, "avg_cost": 0.0005615285211267608}, "random_baseline": {"accuracy": 0.8119718309859156, "accuracy_std": 0.01104534305729446, "avg_cost": 0.000554122147887324, "cost_std": 9.942011085239027e-06, "small_model_fraction": 0.1195073086665207}, "unrestricted_random_baseline": {"accuracy": 0.8161971830985916, "accuracy_std": 0.011223505246837469, "avg_cost": 0.0005617598239436621, "cost_std": 6.139771615316515e-06, "small_model_fraction": 0.10140423075029809}}, "0.1": {"accuracy": 0.7852112676056338, "large_model_usage": 0.6795774647887324, "small_model_usage": 0.3204225352112676, "avg_lambda": 0.7299999999999969, "avg_cost": 0.00043921866197183093, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.6795774647887324, "avg_cost": 0.00045592992957746485}, "random_baseline": {"accuracy": 0.752112676056338, "accuracy_std": 0.012518583686359967, "avg_cost": 0.0004358764788732394, "cost_std": 1.7097989805843136e-05, "small_model_fraction": 0.3090754706769502}, "unrestricted_random_baseline": {"accuracy": 0.7676056338028169, "accuracy_std": 0.02759089862557708, "avg_cost": 0.00046149369718309836, "cost_std": 2.2873231770172542e-05, "small_model_fraction": 0.2807012276814442}}, "0.15": {"accuracy": 0.7535211267605634, "large_model_usage": 0.5140845070422535, "small_model_usage": 0.4859154929577465, "avg_lambda": 0.5900000000000027, "avg_cost": 0.0003430566901408449, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.5140845070422535, "avg_cost": 0.0003533735915492956}, "random_baseline": {"accuracy": 0.7024647887323944, "accuracy_std": 0.017051126778596142, "avg_cost": 0.0003404160915492956, "cost_std": 1.4670779513193848e-05, "small_model_fraction": 0.47234992836916745}, "unrestricted_random_baseline": {"accuracy": 0.7070422535211267, "accuracy_std": 0.012577866971473095, "avg_cost": 0.00035221510563380266, "cost_std": 1.2232475135399018e-05, "small_model_fraction": 0.45483274927586115}}, "0.2": {"accuracy": 0.6830985915492958, "large_model_usage": 0.3908450704225352, "small_model_usage": 0.6091549295774648, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002603172535211267, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.3908450704225352, "avg_cost": 0.0002760707746478872}, "random_baseline": {"accuracy": 0.6619718309859155, "accuracy_std": 0.013268622310856865, "avg_cost": 0.00025353073943661965, "cost_std": 1.3280074842451608e-05, "small_model_fraction": 0.6128341175618336}, "unrestricted_random_baseline": {"accuracy": 0.6605633802816901, "accuracy_std": 0.0169014084507042, "avg_cost": 0.0002632419718309859, "cost_std": 1.3746051899142709e-05, "small_model_fraction": 0.5860860427278909}}, "0.25": {"accuracy": 0.6302816901408451, "large_model_usage": 0.31338028169014087, "small_model_usage": 0.6866197183098591, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00021148626760563377, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.31338028169014087, "avg_cost": 0.00022783133802816908}, "random_baseline": {"accuracy": 0.6359154929577465, "accuracy_std": 0.024607612971346705, "avg_cost": 0.00020907073943661966, "cost_std": 1.6474847737561937e-05, "small_model_fraction": 0.6957447795638719}, "unrestricted_random_baseline": {"accuracy": 0.6492957746478873, "accuracy_std": 0.011157027829404816, "avg_cost": 0.00023434264084507044, "cost_std": 1.0955148623097117e-05, "small_model_fraction": 0.6679923067477871}}}}, {"trial": 13, "trial_seed": 54, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5687134502923976, "large_model_accuracy": 0.8318713450292398, "small_model_cost": 3.215862573099415e-05, "large_model_cost": 0.0006180614035087729, "hybrid_results": {"0.05": {"accuracy": 0.7852112676056338, "large_model_usage": 0.8415492957746479, "small_model_usage": 0.15845070422535212, "avg_lambda": 0.8899999999999935, "avg_cost": 0.0005491848591549294, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.8415492957746479, "avg_cost": 0.0005562482394366195}, "random_baseline": {"accuracy": 0.7785211267605634, "accuracy_std": 0.0063867454743721935, "avg_cost": 0.0005563715492957746, "cost_std": 1.193851897807276e-05, "small_model_fraction": 0.1175562686612949}, "unrestricted_random_baseline": {"accuracy": 0.7792253521126761, "accuracy_std": 0.009713460721220951, "avg_cost": 0.0005626067253521126, "cost_std": 8.651316013037482e-06, "small_model_fraction": 0.10550071857757587}}, "0.1": {"accuracy": 0.75, "large_model_usage": 0.602112676056338, "small_model_usage": 0.397887323943662, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00039489612676056336, "unrestricted": {"accuracy": 0.7676056338028169, "large_model_usage": 0.602112676056338, "avg_cost": 0.00040843838028169004}, "random_baseline": {"accuracy": 0.7204225352112676, "accuracy_std": 0.01576269667999959, "avg_cost": 0.0003909738028169011, "cost_std": 2.0253716992284833e-05, "small_model_fraction": 0.3808913103205182}, "unrestricted_random_baseline": {"accuracy": 0.7193661971830987, "accuracy_std": 0.01582941323449933, "avg_cost": 0.000412821478873239, "cost_std": 1.3195388695706735e-05, "small_model_fraction": 0.3577778279566182}}, "0.15": {"accuracy": 0.7147887323943662, "large_model_usage": 0.4225352112676056, "small_model_usage": 0.5774647887323944, "avg_lambda": 0.5, "avg_cost": 0.00028725528169014087, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.4225352112676056, "avg_cost": 0.0002969454225352113}, "random_baseline": {"accuracy": 0.6742957746478873, "accuracy_std": 0.01831324454076877, "avg_cost": 0.00027713640845070423, "cost_std": 1.976208405525844e-05, "small_model_fraction": 0.5646092395624385}, "unrestricted_random_baseline": {"accuracy": 0.6785211267605633, "accuracy_std": 0.012202622148706644, "avg_cost": 0.00030016890845070416, "cost_std": 1.9283448060438532e-05, "small_model_fraction": 0.5480704191086024}}, "0.2": {"accuracy": 0.7007042253521126, "large_model_usage": 0.34507042253521125, "small_model_usage": 0.6549295774647887, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00023852992957746483, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.34507042253521125, "avg_cost": 0.00024940316901408456}, "random_baseline": {"accuracy": 0.6570422535211268, "accuracy_std": 0.020663944719889017, "avg_cost": 0.000234980985915493, "cost_std": 1.9001966150005573e-05, "small_model_fraction": 0.647772102004365}, "unrestricted_random_baseline": {"accuracy": 0.6605633802816901, "accuracy_std": 0.010229464117136572, "avg_cost": 0.00024513330985915496, "cost_std": 2.1634662265494103e-05, "small_model_fraction": 0.6292140069602351}}, "0.25": {"accuracy": 0.6302816901408451, "large_model_usage": 0.3028169014084507, "small_model_usage": 0.6971830985915493, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0002034876760563381, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.3028169014084507, "avg_cost": 0.00022206514084507047}, "random_baseline": {"accuracy": 0.6450704225352112, "accuracy_std": 0.019208706615473022, "avg_cost": 0.00020616073943661978, "cost_std": 1.4235726613039492e-05, "small_model_fraction": 0.7075810922502135}, "unrestricted_random_baseline": {"accuracy": 0.6461267605633803, "accuracy_std": 0.01568779535911652, "avg_cost": 0.00022441947183098596, "cost_std": 1.2666441379920228e-05, "small_model_fraction": 0.6758736733859554}}}}, {"trial": 14, "trial_seed": 55, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5614035087719298, "large_model_accuracy": 0.8216374269005848, "small_model_cost": 3.2227046783625756e-05, "large_model_cost": 0.0006201900584795331, "hybrid_results": {"0.05": {"accuracy": 0.8098591549295775, "large_model_usage": 0.8274647887323944, "small_model_usage": 0.17253521126760563, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005501746478873239, "unrestricted": {"accuracy": 0.8098591549295775, "large_model_usage": 0.8274647887323944, "avg_cost": 0.0005506464788732395}, "random_baseline": {"accuracy": 0.7915492957746479, "accuracy_std": 0.008596165926573064, "avg_cost": 0.0005519104225352113, "cost_std": 6.89102634615144e-06, "small_model_fraction": 0.11908131838133543}, "unrestricted_random_baseline": {"accuracy": 0.801056338028169, "accuracy_std": 0.008942552886760788, "avg_cost": 0.0005586648239436622, "cost_std": 1.1351178047590172e-05, "small_model_fraction": 0.11827883425133107}}, "0.1": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6232394366197183, "small_model_usage": 0.37676056338028174, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004060478873239438, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.6232394366197183, "avg_cost": 0.000423280281690141}, "random_baseline": {"accuracy": 0.726056338028169, "accuracy_std": 0.014755159746307731, "avg_cost": 0.0004018060211267606, "cost_std": 1.7451818747482216e-05, "small_model_fraction": 0.3642102766599593}, "unrestricted_random_baseline": {"accuracy": 0.7345070422535211, "accuracy_std": 0.014620098233821616, "avg_cost": 0.0004218559154929577, "cost_std": 1.727347711879279e-05, "small_model_fraction": 0.3349016398521906}}, "0.15": {"accuracy": 0.721830985915493, "large_model_usage": 0.41901408450704225, "small_model_usage": 0.5809859154929577, "avg_lambda": 0.5300000000000009, "avg_cost": 0.0002857802816901407, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.41901408450704225, "avg_cost": 0.0002952661971830985}, "random_baseline": {"accuracy": 0.6774647887323944, "accuracy_std": 0.012419149358132656, "avg_cost": 0.00028003496478873227, "cost_std": 1.7252427422973656e-05, "small_model_fraction": 0.5687598881855311}, "unrestricted_random_baseline": {"accuracy": 0.6813380281690141, "accuracy_std": 0.02415246568963735, "avg_cost": 0.0002929820422535211, "cost_std": 2.225301375335313e-05, "small_model_fraction": 0.5526263639599224}}, "0.2": {"accuracy": 0.6514084507042254, "large_model_usage": 0.2640845070422535, "small_model_usage": 0.7359154929577465, "avg_lambda": 0.3900000000000005, "avg_cost": 0.00018859014084507042, "unrestricted": {"accuracy": 0.6901408450704225, "large_model_usage": 0.2640845070422535, "avg_cost": 0.0001971605633802816}, "random_baseline": {"accuracy": 0.6387323943661971, "accuracy_std": 0.011157027829404842, "avg_cost": 0.0001843962676056337, "cost_std": 1.2253302607732669e-05, "small_model_fraction": 0.7340596415913402}, "unrestricted_random_baseline": {"accuracy": 0.6429577464788734, "accuracy_std": 0.010932517391732396, "avg_cost": 0.00019524496478873232, "cost_std": 1.0005965521324594e-05, "small_model_fraction": 0.7194831761254414}}, "0.25": {"accuracy": 0.6091549295774648, "large_model_usage": 0.2323943661971831, "small_model_usage": 0.7676056338028169, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00016118873239436612, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2323943661971831, "avg_cost": 0.00017797746478873236}, "random_baseline": {"accuracy": 0.6158450704225352, "accuracy_std": 0.011619718309859174, "avg_cost": 0.00015932718309859145, "cost_std": 1.6461424247804852e-05, "small_model_fraction": 0.7806636080069626}, "unrestricted_random_baseline": {"accuracy": 0.632394366197183, "accuracy_std": 0.014620098233821646, "avg_cost": 0.00018592510563380274, "cost_std": 1.4770433554089384e-05, "small_model_fraction": 0.7521095458288993}}}}, {"trial": 15, "trial_seed": 56, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5657894736842105, "large_model_accuracy": 0.8201754385964912, "small_model_cost": 3.2195029239766125e-05, "large_model_cost": 0.0006182514619883049, "hybrid_results": {"0.05": {"accuracy": 0.795774647887324, "large_model_usage": 0.8345070422535211, "small_model_usage": 0.16549295774647887, "avg_lambda": 0.879999999999997, "avg_cost": 0.0005446577464788732, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.8345070422535211, "avg_cost": 0.0005516788732394365}, "random_baseline": {"accuracy": 0.7971830985915493, "accuracy_std": 0.01300435585395733, "avg_cost": 0.0005449426760563383, "cost_std": 1.1651708644760782e-05, "small_model_fraction": 0.12557445221492655}, "unrestricted_random_baseline": {"accuracy": 0.8049295774647888, "accuracy_std": 0.009342605043254635, "avg_cost": 0.0005552381338028171, "cost_std": 1.3158882552971497e-05, "small_model_fraction": 0.11359416095247088}}, "0.1": {"accuracy": 0.7570422535211268, "large_model_usage": 0.676056338028169, "small_model_usage": 0.323943661971831, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004354464788732394, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.676056338028169, "avg_cost": 0.0004547704225352112}, "random_baseline": {"accuracy": 0.7369718309859155, "accuracy_std": 0.011997693688310421, "avg_cost": 0.000436560422535211, "cost_std": 1.9859027458885637e-05, "small_model_fraction": 0.3119238573284329}, "unrestricted_random_baseline": {"accuracy": 0.7454225352112676, "accuracy_std": 0.014859822426767018, "avg_cost": 0.00045591411971830957, "cost_std": 9.35753573151214e-06, "small_model_fraction": 0.2789510195910418}}, "0.15": {"accuracy": 0.6971830985915493, "large_model_usage": 0.43661971830985913, "small_model_usage": 0.5633802816901409, "avg_lambda": 0.5, "avg_cost": 0.00029426338028169026, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.43661971830985913, "avg_cost": 0.0003037845070422536}, "random_baseline": {"accuracy": 0.6595070422535212, "accuracy_std": 0.016816836529943036, "avg_cost": 0.00029197823943661954, "cost_std": 1.3135783823148193e-05, "small_model_fraction": 0.5528274473281471}, "unrestricted_random_baseline": {"accuracy": 0.6725352112676057, "accuracy_std": 0.02022733326245782, "avg_cost": 0.00030201922535211256, "cost_std": 2.0701676177982733e-05, "small_model_fraction": 0.5365813552651179}}, "0.2": {"accuracy": 0.6408450704225352, "large_model_usage": 0.3204225352112676, "small_model_usage": 0.6795774647887324, "avg_lambda": 0.35999999999999954, "avg_cost": 0.00021898873239436617, "unrestricted": {"accuracy": 0.6901408450704225, "large_model_usage": 0.3204225352112676, "avg_cost": 0.00023160845070422533}, "random_baseline": {"accuracy": 0.6221830985915492, "accuracy_std": 0.020893667181896915, "avg_cost": 0.00021338429577464794, "cost_std": 1.9988723002418276e-05, "small_model_fraction": 0.6812701086164031}, "unrestricted_random_baseline": {"accuracy": 0.6352112676056338, "accuracy_std": 0.004776288720510746, "avg_cost": 0.00022665250000000005, "cost_std": 1.4449472662605553e-05, "small_model_fraction": 0.659736826828719}}, "0.25": {"accuracy": 0.5774647887323944, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.28000000000000086, "avg_cost": 0.00019149577464788734, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00021193239436619719}, "random_baseline": {"accuracy": 0.6116197183098591, "accuracy_std": 0.018501837542844592, "avg_cost": 0.00019539341549295778, "cost_std": 1.733653579709647e-05, "small_model_fraction": 0.7281819010824289}, "unrestricted_random_baseline": {"accuracy": 0.6260563380281691, "accuracy_std": 0.01752092296426021, "avg_cost": 0.00021165228873239436, "cost_std": 1.0514485205342031e-05, "small_model_fraction": 0.6933104815802754}}}}, {"trial": 16, "trial_seed": 57, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5862573099415205, "large_model_accuracy": 0.8596491228070176, "small_model_cost": 3.221096491228071e-05, "large_model_cost": 0.0006191198830409364, "hybrid_results": {"0.05": {"accuracy": 0.795774647887324, "large_model_usage": 0.7887323943661971, "small_model_usage": 0.21126760563380287, "avg_lambda": 0.8600000000000062, "avg_cost": 0.0005168246478873237, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.7887323943661971, "avg_cost": 0.0005248035211267605}, "random_baseline": {"accuracy": 0.8014084507042254, "accuracy_std": 0.008795771828730143, "avg_cost": 0.0005181552464788733, "cost_std": 7.464289900702647e-06, "small_model_fraction": 0.17429490674597087}, "unrestricted_random_baseline": {"accuracy": 0.8031690140845071, "accuracy_std": 0.015938690560220122, "avg_cost": 0.0005282458802816903, "cost_std": 1.5218770947914858e-05, "small_model_fraction": 0.1607001682899985}}, "0.1": {"accuracy": 0.7711267605633803, "large_model_usage": 0.6338028169014085, "small_model_usage": 0.3661971830985915, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004142190140845069, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6338028169014085, "avg_cost": 0.00042806408450704185}, "random_baseline": {"accuracy": 0.7419014084507042, "accuracy_std": 0.01510805489642599, "avg_cost": 0.0004090305281690138, "cost_std": 1.364456107089373e-05, "small_model_fraction": 0.3491186837128847}, "unrestricted_random_baseline": {"accuracy": 0.7538732394366198, "accuracy_std": 0.017566858708825923, "avg_cost": 0.0004315042605633801, "cost_std": 1.6486099215822618e-05, "small_model_fraction": 0.3255288727645699}}, "0.15": {"accuracy": 0.6971830985915493, "large_model_usage": 0.41901408450704225, "small_model_usage": 0.5809859154929577, "avg_lambda": 0.5, "avg_cost": 0.0002813246478873239, "unrestricted": {"accuracy": 0.7323943661971831, "large_model_usage": 0.41901408450704225, "avg_cost": 0.00029437394366197176}, "random_baseline": {"accuracy": 0.6940140845070422, "accuracy_std": 0.017353833141340314, "avg_cost": 0.00028711630281690124, "cost_std": 1.0399158352808483e-05, "small_model_fraction": 0.5755496717116927}, "unrestricted_random_baseline": {"accuracy": 0.6996478873239437, "accuracy_std": 0.009454733508519391, "avg_cost": 0.0002887480985915491, "cost_std": 1.719028057539095e-05, "small_model_fraction": 0.553315734943011}}, "0.2": {"accuracy": 0.6514084507042254, "large_model_usage": 0.2746478873239437, "small_model_usage": 0.7253521126760563, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0001885499999999999, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.2746478873239437, "avg_cost": 0.00020351478873239425}, "random_baseline": {"accuracy": 0.6556338028169014, "accuracy_std": 0.011000351656206539, "avg_cost": 0.00018845897887323943, "cost_std": 1.667458899872445e-05, "small_model_fraction": 0.7336230030611866}, "unrestricted_random_baseline": {"accuracy": 0.6531690140845072, "accuracy_std": 0.008369622763383591, "avg_cost": 0.00020475697183098578, "cost_std": 1.6400545546184052e-05, "small_model_fraction": 0.708125369152148}}, "0.25": {"accuracy": 0.6161971830985915, "large_model_usage": 0.20422535211267606, "small_model_usage": 0.795774647887324, "avg_lambda": 0.10999999999999963, "avg_cost": 0.00014199366197183106, "unrestricted": {"accuracy": 0.6725352112676056, "large_model_usage": 0.20422535211267606, "avg_cost": 0.00016005}, "random_baseline": {"accuracy": 0.6338028169014084, "accuracy_std": 0.012399166803985243, "avg_cost": 0.00014802239436619724, "cost_std": 1.3471896460859987e-05, "small_model_fraction": 0.8129476420130235}, "unrestricted_random_baseline": {"accuracy": 0.6411971830985916, "accuracy_std": 0.015222511493814983, "avg_cost": 0.00015450411971830986, "cost_std": 1.6846776463411113e-05, "small_model_fraction": 0.7821824969105413}}}}, {"trial": 17, "trial_seed": 58, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8157894736842105, "small_model_cost": 3.218011695906433e-05, "large_model_cost": 0.0006190526315789484, "hybrid_results": {"0.05": {"accuracy": 0.7816901408450704, "large_model_usage": 0.778169014084507, "small_model_usage": 0.221830985915493, "avg_lambda": 0.8499999999999961, "avg_cost": 0.0005093940140845071, "unrestricted": {"accuracy": 0.7922535211267606, "large_model_usage": 0.778169014084507, "avg_cost": 0.0005172531690140846}, "random_baseline": {"accuracy": 0.7679577464788732, "accuracy_std": 0.010015114544597093, "avg_cost": 0.0005058264788732395, "cost_std": 1.5682951582474168e-05, "small_model_fraction": 0.186852535708657}, "unrestricted_random_baseline": {"accuracy": 0.772887323943662, "accuracy_std": 0.010113666631228936, "avg_cost": 0.0005155611267605634, "cost_std": 1.3643655132948395e-05, "small_model_fraction": 0.17346094769968745}}, "0.1": {"accuracy": 0.7605633802816901, "large_model_usage": 0.5176056338028169, "small_model_usage": 0.4823943661971831, "avg_lambda": 0.6399999999999962, "avg_cost": 0.0003495630281690137, "unrestricted": {"accuracy": 0.7676056338028169, "large_model_usage": 0.5176056338028169, "avg_cost": 0.00035589401408450666}, "random_baseline": {"accuracy": 0.7007042253521127, "accuracy_std": 0.010680106259227539, "avg_cost": 0.00035396176056338004, "cost_std": 1.833733975971981e-05, "small_model_fraction": 0.4591961570810356}, "unrestricted_random_baseline": {"accuracy": 0.7, "accuracy_std": 0.010058349899356125, "avg_cost": 0.0003543797183098589, "cost_std": 1.3519695419397798e-05, "small_model_fraction": 0.448408488962699}}, "0.15": {"accuracy": 0.7288732394366197, "large_model_usage": 0.45422535211267606, "small_model_usage": 0.545774647887324, "avg_lambda": 0.5100000000000009, "avg_cost": 0.0003068658450704223, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.45422535211267606, "avg_cost": 0.00031689401408450685}, "random_baseline": {"accuracy": 0.6788732394366197, "accuracy_std": 0.014154754395944899, "avg_cost": 0.00030185524647887297, "cost_std": 1.527718637668043e-05, "small_model_fraction": 0.5319499188179373}, "unrestricted_random_baseline": {"accuracy": 0.6827464788732395, "accuracy_std": 0.013678115915179605, "avg_cost": 0.0003127150704225349, "cost_std": 1.5141239279014875e-05, "small_model_fraction": 0.5148624445125853}}, "0.2": {"accuracy": 0.6866197183098591, "large_model_usage": 0.33098591549295775, "small_model_usage": 0.6690140845070423, "avg_lambda": 0.3900000000000005, "avg_cost": 0.00022775316901408447, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.33098591549295775, "avg_cost": 0.0002392954225352113}, "random_baseline": {"accuracy": 0.645774647887324, "accuracy_std": 0.017264296721311637, "avg_cost": 0.0002280701408450704, "cost_std": 1.2807496995503223e-05, "small_model_fraction": 0.6667537715892312}, "unrestricted_random_baseline": {"accuracy": 0.6542253521126761, "accuracy_std": 0.016500527484309812, "avg_cost": 0.00023908499999999983, "cost_std": 1.860066753910231e-05, "small_model_fraction": 0.6470863766549111}}, "0.25": {"accuracy": 0.6584507042253521, "large_model_usage": 0.31338028169014087, "small_model_usage": 0.6866197183098591, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0002112531690140844, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.31338028169014087, "avg_cost": 0.00022802781690140847}, "random_baseline": {"accuracy": 0.6323943661971831, "accuracy_std": 0.014620098233821635, "avg_cost": 0.0002097802464788732, "cost_std": 1.620099790000639e-05, "small_model_fraction": 0.6948689066295679}, "unrestricted_random_baseline": {"accuracy": 0.6443661971830986, "accuracy_std": 0.007551975559692673, "avg_cost": 0.00022069063380281682, "cost_std": 1.801414053402841e-05, "small_model_fraction": 0.666285785986767}}}}, {"trial": 18, "trial_seed": 59, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5570175438596491, "large_model_accuracy": 0.8421052631578947, "small_model_cost": 3.2185818713450304e-05, "large_model_cost": 0.0006178684210526318, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.795774647887324, "small_model_usage": 0.204225352112676, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005255750000000001, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.795774647887324, "avg_cost": 0.0005255750000000001}, "random_baseline": {"accuracy": 0.8116197183098592, "accuracy_std": 0.007912748258536691, "avg_cost": 0.0005312355633802816, "cost_std": 1.2648068175445522e-05, "small_model_fraction": 0.15758265771258573}, "unrestricted_random_baseline": {"accuracy": 0.8049295774647888, "accuracy_std": 0.008935617986232065, "avg_cost": 0.0005271319718309858, "cost_std": 1.0725664158805673e-05, "small_model_fraction": 0.15758265771258573}}, "0.1": {"accuracy": 0.7605633802816901, "large_model_usage": 0.5985915492957746, "small_model_usage": 0.4014084507042254, "avg_lambda": 0.7100000000000012, "avg_cost": 0.0003911806338028167, "unrestricted": {"accuracy": 0.7852112676056338, "large_model_usage": 0.5985915492957746, "avg_cost": 0.0004051876760563379}, "random_baseline": {"accuracy": 0.7461267605633803, "accuracy_std": 0.008545535985571536, "avg_cost": 0.00038936919014084483, "cost_std": 1.672173194049332e-05, "small_model_fraction": 0.3870488663047828}, "unrestricted_random_baseline": {"accuracy": 0.7471830985915492, "accuracy_std": 0.013249920931145602, "avg_cost": 0.0004022389788732393, "cost_std": 1.0194322922513927e-05, "small_model_fraction": 0.3631331102321627}}, "0.15": {"accuracy": 0.6971830985915493, "large_model_usage": 0.44014084507042256, "small_model_usage": 0.5598591549295775, "avg_lambda": 0.5500000000000005, "avg_cost": 0.0002951383802816901, "unrestricted": {"accuracy": 0.7359154929577465, "large_model_usage": 0.44014084507042256, "avg_cost": 0.00030528626760563377}, "random_baseline": {"accuracy": 0.6880281690140846, "accuracy_std": 0.012318912453898507, "avg_cost": 0.00029609795774647874, "cost_std": 1.6210736995727645e-05, "small_model_fraction": 0.5510323159369548}, "unrestricted_random_baseline": {"accuracy": 0.7073943661971831, "accuracy_std": 0.018991248676284246, "avg_cost": 0.00029487306338028157, "cost_std": 1.5587268613361505e-05, "small_model_fraction": 0.5337057173946494}}, "0.2": {"accuracy": 0.6654929577464789, "large_model_usage": 0.3345070422535211, "small_model_usage": 0.6654929577464789, "avg_lambda": 0.4399999999999985, "avg_cost": 0.00022686373239436618, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.3345070422535211, "avg_cost": 0.00023927922535211266}, "random_baseline": {"accuracy": 0.6559859154929577, "accuracy_std": 0.01781912806884504, "avg_cost": 0.0002192362323943661, "cost_std": 1.7661213288775743e-05, "small_model_fraction": 0.6676050937771008}, "unrestricted_random_baseline": {"accuracy": 0.6725352112676056, "accuracy_std": 0.011245577058219219, "avg_cost": 0.00024131943661971828, "cost_std": 1.486967875428732e-05, "small_model_fraction": 0.646406763985231}}, "0.25": {"accuracy": 0.6161971830985915, "large_model_usage": 0.25, "small_model_usage": 0.75, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0001707228873239436, "unrestricted": {"accuracy": 0.6690140845070423, "large_model_usage": 0.25, "avg_cost": 0.00018739190140845064}, "random_baseline": {"accuracy": 0.628169014084507, "accuracy_std": 0.009604353307736546, "avg_cost": 0.0001719092957746478, "cost_std": 1.0927081450959608e-05, "small_model_fraction": 0.7634605022290496}, "unrestricted_random_baseline": {"accuracy": 0.6419014084507043, "accuracy_std": 0.011360750566832178, "avg_cost": 0.00018220697183098583, "cost_std": 1.0575473768465163e-05, "small_model_fraction": 0.7349996703417235}}}}, {"trial": 19, "trial_seed": 60, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5555555555555556, "large_model_accuracy": 0.8421052631578947, "small_model_cost": 3.226973684210533e-05, "large_model_cost": 0.0006206695906432756, "hybrid_results": {"0.05": {"accuracy": 0.8028169014084507, "large_model_usage": 0.8204225352112676, "small_model_usage": 0.17957746478873238, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005488778169014082, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.8204225352112676, "avg_cost": 0.0005495045774647884}, "random_baseline": {"accuracy": 0.8073943661971832, "accuracy_std": 0.009188724190633712, "avg_cost": 0.0005547145070422532, "cost_std": 1.1530422269619268e-05, "small_model_fraction": 0.12201188235870469}, "unrestricted_random_baseline": {"accuracy": 0.818661971830986, "accuracy_std": 0.009610805678842368, "avg_cost": 0.0005537058450704222, "cost_std": 8.838529134278428e-06, "small_model_fraction": 0.12094668739080798}}, "0.1": {"accuracy": 0.7570422535211268, "large_model_usage": 0.6232394366197183, "small_model_usage": 0.37676056338028174, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00040673697183098574, "unrestricted": {"accuracy": 0.7852112676056338, "large_model_usage": 0.6232394366197183, "avg_cost": 0.00042641302816901367}, "random_baseline": {"accuracy": 0.7503521126760564, "accuracy_std": 0.019121371806176683, "avg_cost": 0.00041155510563380273, "cost_std": 1.824636579746138e-05, "small_model_fraction": 0.3635837388983804}, "unrestricted_random_baseline": {"accuracy": 0.7538732394366197, "accuracy_std": 0.013768461123955722, "avg_cost": 0.0004244266549295773, "cost_std": 8.509137102524986e-06, "small_model_fraction": 0.33014379799608917}}, "0.15": {"accuracy": 0.7253521126760564, "large_model_usage": 0.4295774647887324, "small_model_usage": 0.5704225352112676, "avg_lambda": 0.540000000000001, "avg_cost": 0.0002951102112676054, "unrestricted": {"accuracy": 0.7464788732394366, "large_model_usage": 0.4295774647887324, "avg_cost": 0.0003041806338028167}, "random_baseline": {"accuracy": 0.6971830985915493, "accuracy_std": 0.014938875658870738, "avg_cost": 0.00029400292253521117, "cost_std": 1.4407623537559899e-05, "small_model_fraction": 0.5532961595290979}, "unrestricted_random_baseline": {"accuracy": 0.6936619718309859, "accuracy_std": 0.01613582991181636, "avg_cost": 0.00030890764084507034, "cost_std": 2.222300391813523e-05, "small_model_fraction": 0.5378807537015562}}, "0.2": {"accuracy": 0.6901408450704225, "large_model_usage": 0.34507042253521125, "small_model_usage": 0.6549295774647887, "avg_lambda": 0.4300000000000031, "avg_cost": 0.00023877218309859152, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.34507042253521125, "avg_cost": 0.000250525704225352}, "random_baseline": {"accuracy": 0.6640845070422535, "accuracy_std": 0.01281225732300016, "avg_cost": 0.00023003686619718308, "cost_std": 1.665499939467396e-05, "small_model_fraction": 0.6490440218119656}, "unrestricted_random_baseline": {"accuracy": 0.6862676056338028, "accuracy_std": 0.011935529053998781, "avg_cost": 0.0002635555985915493, "cost_std": 2.132250243541692e-05, "small_model_fraction": 0.6290686240432024}}, "0.25": {"accuracy": 0.6443661971830986, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00018077218309859162, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.25704225352112675, "avg_cost": 0.0001944693661971832}, "random_baseline": {"accuracy": 0.6309859154929578, "accuracy_std": 0.010539879962743522, "avg_cost": 0.0001730540492957747, "cost_std": 1.6392201698180644e-05, "small_model_fraction": 0.7476164460321779}, "unrestricted_random_baseline": {"accuracy": 0.6496478873239436, "accuracy_std": 0.015368417494926193, "avg_cost": 0.00019497876760563382, "cost_std": 1.3571379211045253e-05, "small_model_fraction": 0.7243377470146556}}}}, {"trial": 20, "trial_seed": 61, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.564327485380117, "large_model_accuracy": 0.8260233918128655, "small_model_cost": 3.2273391812865526e-05, "large_model_cost": 0.0006215847953216381, "hybrid_results": {"0.05": {"accuracy": 0.823943661971831, "large_model_usage": 0.7112676056338029, "small_model_usage": 0.28873239436619713, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004735584507042253, "unrestricted": {"accuracy": 0.823943661971831, "large_model_usage": 0.7112676056338029, "avg_cost": 0.0004780654929577465}, "random_baseline": {"accuracy": 0.7936619718309859, "accuracy_std": 0.014449496146960018, "avg_cost": 0.0004686121830985913, "cost_std": 1.004510396032294e-05, "small_model_fraction": 0.2511852710401679}, "unrestricted_random_baseline": {"accuracy": 0.7971830985915493, "accuracy_std": 0.01936299589417963, "avg_cost": 0.00047895714788732393, "cost_std": 1.905620102061468e-05, "small_model_fraction": 0.2435372903177753}}, "0.1": {"accuracy": 0.778169014084507, "large_model_usage": 0.6338028169014085, "small_model_usage": 0.3661971830985915, "avg_lambda": 0.6999999999999973, "avg_cost": 0.000411206338028169, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.6338028169014085, "avg_cost": 0.000430431690140845}, "random_baseline": {"accuracy": 0.7661971830985915, "accuracy_std": 0.013380281690140848, "avg_cost": 0.0004110518661971831, "cost_std": 1.794180945677627e-05, "small_model_fraction": 0.3569903043465158}, "unrestricted_random_baseline": {"accuracy": 0.779225352112676, "accuracy_std": 0.012602486033924868, "avg_cost": 0.00042954017605633803, "cost_std": 1.3586184886462519e-05, "small_model_fraction": 0.3243668865775606}}, "0.15": {"accuracy": 0.7183098591549296, "large_model_usage": 0.4084507042253521, "small_model_usage": 0.5915492957746479, "avg_lambda": 0.5, "avg_cost": 0.0002789105633802816, "unrestricted": {"accuracy": 0.7359154929577465, "large_model_usage": 0.4084507042253521, "avg_cost": 0.00028967816901408433}, "random_baseline": {"accuracy": 0.7070422535211268, "accuracy_std": 0.016945365373966178, "avg_cost": 0.0002847544718309859, "cost_std": 1.4252886268645944e-05, "small_model_fraction": 0.5814824384884916}, "unrestricted_random_baseline": {"accuracy": 0.7073943661971831, "accuracy_std": 0.021958396940378704, "avg_cost": 0.0002853380633802816, "cost_std": 1.9997855522211764e-05, "small_model_fraction": 0.563210934543901}}, "0.2": {"accuracy": 0.6866197183098591, "large_model_usage": 0.323943661971831, "small_model_usage": 0.676056338028169, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00022514295774647873, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.323943661971831, "avg_cost": 0.0002365373239436618}, "random_baseline": {"accuracy": 0.6746478873239437, "accuracy_std": 0.010107535277751623, "avg_cost": 0.00022369531690140835, "cost_std": 2.112248365456122e-05, "small_model_fraction": 0.6727204585126578}, "unrestricted_random_baseline": {"accuracy": 0.6866197183098592, "accuracy_std": 0.013080405366906116, "avg_cost": 0.00022711028169014076, "cost_std": 1.5044521291114423e-05, "small_model_fraction": 0.6533854072488594}}, "0.25": {"accuracy": 0.6443661971830986, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.25, "avg_cost": 0.00017540352112676045, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.25704225352112675, "avg_cost": 0.00019455140845070403}, "random_baseline": {"accuracy": 0.6630281690140846, "accuracy_std": 0.021070935634334056, "avg_cost": 0.00017544450704225345, "cost_std": 2.033542307503205e-05, "small_model_fraction": 0.7571230957661856}, "unrestricted_random_baseline": {"accuracy": 0.6697183098591549, "accuracy_std": 0.014154754395944895, "avg_cost": 0.000201517218309859, "cost_std": 1.5475908570048095e-05, "small_model_fraction": 0.7246311276658965}}}}, {"trial": 21, "trial_seed": 62, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8435672514619883, "small_model_cost": 3.212368421052637e-05, "large_model_cost": 0.0006187660818713462, "hybrid_results": {"0.05": {"accuracy": 0.7992957746478874, "large_model_usage": 0.7922535211267606, "small_model_usage": 0.20774647887323938, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005238588028169012, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.7922535211267606, "avg_cost": 0.0005246616197183096}, "random_baseline": {"accuracy": 0.7785211267605634, "accuracy_std": 0.010732218770512801, "avg_cost": 0.0005240385211267601, "cost_std": 1.2091114020892258e-05, "small_model_fraction": 0.16178046358885526}, "unrestricted_random_baseline": {"accuracy": 0.7785211267605634, "accuracy_std": 0.013310604346081572, "avg_cost": 0.0005261066901408447, "cost_std": 1.645084799251043e-05, "small_model_fraction": 0.16041196907736138}}, "0.1": {"accuracy": 0.7640845070422535, "large_model_usage": 0.6514084507042254, "small_model_usage": 0.3485915492957746, "avg_lambda": 0.75, "avg_cost": 0.00042653485915492955, "unrestricted": {"accuracy": 0.7711267605633803, "large_model_usage": 0.6514084507042254, "avg_cost": 0.00043750669014084513}, "random_baseline": {"accuracy": 0.7415492957746479, "accuracy_std": 0.014363435249557122, "avg_cost": 0.0004358546478873236, "cost_std": 1.6361026277013783e-05, "small_model_fraction": 0.3276804122629393}, "unrestricted_random_baseline": {"accuracy": 0.7338028169014085, "accuracy_std": 0.01761971268055885, "avg_cost": 0.0004300978169014081, "cost_std": 1.2780775706241057e-05, "small_model_fraction": 0.30897765393918925}}, "0.15": {"accuracy": 0.7394366197183099, "large_model_usage": 0.5316901408450704, "small_model_usage": 0.4683098591549296, "avg_lambda": 0.6100000000000032, "avg_cost": 0.00035183767605633795, "unrestricted": {"accuracy": 0.7535211267605634, "large_model_usage": 0.5316901408450704, "avg_cost": 0.0003632038732394364}, "random_baseline": {"accuracy": 0.697887323943662, "accuracy_std": 0.01848507711043197, "avg_cost": 0.00035192411971830963, "cost_std": 1.643455561496884e-05, "small_model_fraction": 0.4550104235209722}, "unrestricted_random_baseline": {"accuracy": 0.7045774647887325, "accuracy_std": 0.01832677983052552, "avg_cost": 0.00037035147887323907, "cost_std": 1.6342961427980536e-05, "small_model_fraction": 0.4356354222792957}}, "0.2": {"accuracy": 0.6795774647887324, "large_model_usage": 0.36971830985915494, "small_model_usage": 0.630281690140845, "avg_lambda": 0.450000000000002, "avg_cost": 0.00024961232394366196, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.36971830985915494, "avg_cost": 0.0002617742957746479}, "random_baseline": {"accuracy": 0.6577464788732394, "accuracy_std": 0.015250991427963158, "avg_cost": 0.0002504575352112675, "cost_std": 1.392002601331863e-05, "small_model_fraction": 0.6292653913178613}, "unrestricted_random_baseline": {"accuracy": 0.6588028169014084, "accuracy_std": 0.011184774770611668, "avg_cost": 0.0002618730985915492, "cost_std": 1.5160731177693658e-05, "small_model_fraction": 0.6085338999025108}}, "0.25": {"accuracy": 0.6443661971830986, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.3199999999999981, "avg_cost": 0.00019854894366197186, "unrestricted": {"accuracy": 0.6830985915492958, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00021254894366197176}, "random_baseline": {"accuracy": 0.6235915492957746, "accuracy_std": 0.013587169987024493, "avg_cost": 0.0001940893309859155, "cost_std": 1.7991284538151305e-05, "small_model_fraction": 0.7163088448515651}, "unrestricted_random_baseline": {"accuracy": 0.6306338028169014, "accuracy_std": 0.00897023887525144, "avg_cost": 0.00020767496478873233, "cost_std": 1.4018818797825391e-05, "small_model_fraction": 0.6924442212651629}}}}, {"trial": 22, "trial_seed": 63, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5555555555555556, "large_model_accuracy": 0.8084795321637427, "small_model_cost": 3.2250292397660844e-05, "large_model_cost": 0.000619637426900586, "hybrid_results": {"0.05": {"accuracy": 0.778169014084507, "large_model_usage": 0.8309859154929577, "small_model_usage": 0.16901408450704225, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005492876760563372, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.8309859154929577, "avg_cost": 0.0005497031690140836}, "random_baseline": {"accuracy": 0.7661971830985915, "accuracy_std": 0.010229464117136572, "avg_cost": 0.0005487778521126757, "cost_std": 1.1057172820429631e-05, "small_model_fraction": 0.1197672654233091}, "unrestricted_random_baseline": {"accuracy": 0.7693661971830985, "accuracy_std": 0.010474982239889788, "avg_cost": 0.0005517999295774643, "cost_std": 7.111706709152186e-06, "small_model_fraction": 0.11905990747598533}}, "0.1": {"accuracy": 0.7394366197183099, "large_model_usage": 0.6056338028169014, "small_model_usage": 0.3943661971830986, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0003944214788732394, "unrestricted": {"accuracy": 0.7640845070422535, "large_model_usage": 0.6056338028169014, "avg_cost": 0.0004111257042253521}, "random_baseline": {"accuracy": 0.6982394366197183, "accuracy_std": 0.014436619718309887, "avg_cost": 0.0004058049999999998, "cost_std": 1.65361397436195e-05, "small_model_fraction": 0.38341995389111644}, "unrestricted_random_baseline": {"accuracy": 0.7098591549295775, "accuracy_std": 0.012908663929453097, "avg_cost": 0.0004162911619718308, "cost_std": 1.2500625334933494e-05, "small_model_fraction": 0.35498176658514385}}, "0.15": {"accuracy": 0.6690140845070423, "large_model_usage": 0.4154929577464789, "small_model_usage": 0.5845070422535211, "avg_lambda": 0.5300000000000009, "avg_cost": 0.00028336514084507025, "unrestricted": {"accuracy": 0.6866197183098591, "large_model_usage": 0.4154929577464789, "avg_cost": 0.00029330176056338017}, "random_baseline": {"accuracy": 0.6359154929577465, "accuracy_std": 0.01374593049010081, "avg_cost": 0.00027324401408450685, "cost_std": 2.2177832654650593e-05, "small_model_fraction": 0.5724883408283794}, "unrestricted_random_baseline": {"accuracy": 0.647887323943662, "accuracy_std": 0.017105574368297367, "avg_cost": 0.0002903545774647886, "cost_std": 1.8321699396238247e-05, "small_model_fraction": 0.555571695681361}}, "0.2": {"accuracy": 0.6443661971830986, "large_model_usage": 0.3380281690140845, "small_model_usage": 0.6619718309859155, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002292172535211266, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.3380281690140845, "avg_cost": 0.00024417499999999996}, "random_baseline": {"accuracy": 0.6144366197183098, "accuracy_std": 0.016078098671592794, "avg_cost": 0.0002260082042253521, "cost_std": 1.427441488832158e-05, "small_model_fraction": 0.6646726672177651}, "unrestricted_random_baseline": {"accuracy": 0.626056338028169, "accuracy_std": 0.011867816581938563, "avg_cost": 0.00024023341549295765, "cost_std": 1.582594561122447e-05, "small_model_fraction": 0.6392077811141031}}, "0.25": {"accuracy": 0.5774647887323944, "large_model_usage": 0.2711267605633803, "small_model_usage": 0.7288732394366197, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00018358345070422532, "unrestricted": {"accuracy": 0.6408450704225352, "large_model_usage": 0.2711267605633803, "avg_cost": 0.00020180880281690143}, "random_baseline": {"accuracy": 0.5929577464788732, "accuracy_std": 0.01281225732300016, "avg_cost": 0.00018263204225352117, "cost_std": 1.2564502536443518e-05, "small_model_fraction": 0.7423621502458855}, "unrestricted_random_baseline": {"accuracy": 0.6158450704225351, "accuracy_std": 0.00937572320823547, "avg_cost": 0.0002013673943661972, "cost_std": 1.1517937627028689e-05, "small_model_fraction": 0.7113343135056421}}}}, {"trial": 23, "trial_seed": 64, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5453216374269005, "large_model_accuracy": 0.8245614035087719, "small_model_cost": 3.2261842105263195e-05, "large_model_cost": 0.0006199502923976609, "hybrid_results": {"0.05": {"accuracy": 0.8133802816901409, "large_model_usage": 0.8274647887323944, "small_model_usage": 0.17253521126760563, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005465225352112675, "unrestricted": {"accuracy": 0.8133802816901409, "large_model_usage": 0.8274647887323944, "avg_cost": 0.0005474661971830984}, "random_baseline": {"accuracy": 0.796830985915493, "accuracy_std": 0.012202622148706689, "avg_cost": 0.0005512504577464789, "cost_std": 1.2254482076463118e-05, "small_model_fraction": 0.12494333885557943}, "unrestricted_random_baseline": {"accuracy": 0.7919014084507043, "accuracy_std": 0.012543319106025998, "avg_cost": 0.0005514221126760564, "cost_std": 8.764182567256737e-06, "small_model_fraction": 0.12333762077253481}}, "0.1": {"accuracy": 0.7640845070422535, "large_model_usage": 0.6619718309859155, "small_model_usage": 0.3380281690140845, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00043103661971830986, "unrestricted": {"accuracy": 0.7852112676056338, "large_model_usage": 0.6619718309859155, "avg_cost": 0.0004459873239436618}, "random_baseline": {"accuracy": 0.731338028169014, "accuracy_std": 0.02409078659591173, "avg_cost": 0.00042592038732394385, "cost_std": 1.6500496416890536e-05, "small_model_fraction": 0.3214520764962442}, "unrestricted_random_baseline": {"accuracy": 0.7401408450704225, "accuracy_std": 0.01814661792496494, "avg_cost": 0.00044447147887323967, "cost_std": 2.2858327667313815e-05, "small_model_fraction": 0.296012229553679}}, "0.15": {"accuracy": 0.7429577464788732, "large_model_usage": 0.5316901408450704, "small_model_usage": 0.4683098591549296, "avg_lambda": 0.5999999999999969, "avg_cost": 0.00035401549295774635, "unrestricted": {"accuracy": 0.7605633802816901, "large_model_usage": 0.5316901408450704, "avg_cost": 0.00036512816901408446}, "random_baseline": {"accuracy": 0.6985915492957747, "accuracy_std": 0.022502176491085168, "avg_cost": 0.00035632771126760577, "cost_std": 1.7635635735333662e-05, "small_model_fraction": 0.4525098277966867}, "unrestricted_random_baseline": {"accuracy": 0.707394366197183, "accuracy_std": 0.01578234847254578, "avg_cost": 0.0003724378873239438, "cost_std": 1.4747522677352349e-05, "small_model_fraction": 0.43360069992322053}}, "0.2": {"accuracy": 0.7077464788732394, "large_model_usage": 0.4612676056338028, "small_model_usage": 0.5387323943661972, "avg_lambda": 0.5, "avg_cost": 0.0003067056338028168, "unrestricted": {"accuracy": 0.7394366197183099, "large_model_usage": 0.4612676056338028, "avg_cost": 0.00032090281690140847}, "random_baseline": {"accuracy": 0.6697183098591548, "accuracy_std": 0.01642521660519801, "avg_cost": 0.0003082880281690142, "cost_std": 1.550649688823667e-05, "small_model_fraction": 0.5330114254227606}, "unrestricted_random_baseline": {"accuracy": 0.6788732394366197, "accuracy_std": 0.01927314392028731, "avg_cost": 0.0003107884507042255, "cost_std": 2.5015839506501415e-05, "small_model_fraction": 0.5088537563524769}}, "0.25": {"accuracy": 0.6267605633802817, "large_model_usage": 0.3274647887323944, "small_model_usage": 0.6725352112676056, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00022101549295774642, "unrestricted": {"accuracy": 0.6795774647887324, "large_model_usage": 0.3274647887323944, "avg_cost": 0.00023733239436619713}, "random_baseline": {"accuracy": 0.6401408450704225, "accuracy_std": 0.018417882860031022, "avg_cost": 0.0002253747887323944, "cost_std": 1.538522733178113e-05, "small_model_fraction": 0.6788202137398294}, "unrestricted_random_baseline": {"accuracy": 0.6352112676056338, "accuracy_std": 0.012812257323000188, "avg_cost": 0.00023217433098591552, "cost_std": 1.698048242213064e-05, "small_model_fraction": 0.6510556704680798}}}}, {"trial": 24, "trial_seed": 65, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5657894736842105, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.2264181286549725e-05, "large_model_cost": 0.0006217748538011703, "hybrid_results": {"0.05": {"accuracy": 0.778169014084507, "large_model_usage": 0.8133802816901409, "small_model_usage": 0.18661971830985913, "avg_lambda": 0.879999999999997, "avg_cost": 0.0005276676056338029, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.8133802816901409, "avg_cost": 0.0005395408450704227}, "random_baseline": {"accuracy": 0.804225352112676, "accuracy_std": 0.011157027829404816, "avg_cost": 0.0005301936971830983, "cost_std": 8.43493514132595e-06, "small_model_fraction": 0.15963620771434547}, "unrestricted_random_baseline": {"accuracy": 0.8140845070422535, "accuracy_std": 0.009289370393149931, "avg_cost": 0.0005449153169014082, "cost_std": 7.0774902183166975e-06, "small_model_fraction": 0.13949536889632494}}, "0.1": {"accuracy": 0.7323943661971831, "large_model_usage": 0.5880281690140845, "small_model_usage": 0.4119718309859155, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00038328028169014083, "unrestricted": {"accuracy": 0.7711267605633803, "large_model_usage": 0.5880281690140845, "avg_cost": 0.00039926619718309847}, "random_baseline": {"accuracy": 0.7323943661971831, "accuracy_std": 0.016590449280830623, "avg_cost": 0.0003878171126760561, "cost_std": 1.7733640532233686e-05, "small_model_fraction": 0.4045636206952887}, "unrestricted_random_baseline": {"accuracy": 0.7464788732394366, "accuracy_std": 0.015428804436765224, "avg_cost": 0.00040291697183098576, "cost_std": 1.121201176377879e-05, "small_model_fraction": 0.3774463584669935}}, "0.15": {"accuracy": 0.7007042253521126, "large_model_usage": 0.3908450704225352, "small_model_usage": 0.6091549295774648, "avg_lambda": 0.5, "avg_cost": 0.0002665971830985914, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.3908450704225352, "avg_cost": 0.0002766676056338027}, "random_baseline": {"accuracy": 0.6704225352112677, "accuracy_std": 0.017689938995887992, "avg_cost": 0.00025884158450704216, "cost_std": 1.459458369229511e-05, "small_model_fraction": 0.6024957431008513}, "unrestricted_random_baseline": {"accuracy": 0.680281690140845, "accuracy_std": 0.012676056338028152, "avg_cost": 0.0002717266197183097, "cost_std": 1.579013778218294e-05, "small_model_fraction": 0.5854130624900749}}, "0.2": {"accuracy": 0.6549295774647887, "large_model_usage": 0.24647887323943662, "small_model_usage": 0.7535211267605634, "avg_lambda": 0.3900000000000005, "avg_cost": 0.00017811126760563386, "unrestricted": {"accuracy": 0.6690140845070423, "large_model_usage": 0.24647887323943662, "avg_cost": 0.00018637183098591545}, "random_baseline": {"accuracy": 0.6348591549295775, "accuracy_std": 0.013458813277194661, "avg_cost": 0.00017852221830985915, "cost_std": 1.0911102588725678e-05, "small_model_fraction": 0.7525963597962395}, "unrestricted_random_baseline": {"accuracy": 0.6338028169014085, "accuracy_std": 0.01613582991181636, "avg_cost": 0.00019046144366197183, "cost_std": 1.7016686018762543e-05, "small_model_fraction": 0.7385837833232042}}, "0.25": {"accuracy": 0.6091549295774648, "large_model_usage": 0.2147887323943662, "small_model_usage": 0.7852112676056338, "avg_lambda": 0.25, "avg_cost": 0.00015135774647887334, "unrestricted": {"accuracy": 0.6584507042253521, "large_model_usage": 0.2147887323943662, "avg_cost": 0.00016673802816901417}, "random_baseline": {"accuracy": 0.6309859154929577, "accuracy_std": 0.014755159746307738, "avg_cost": 0.00014976355633802819, "cost_std": 1.294563250936379e-05, "small_model_fraction": 0.7979789497545188}, "unrestricted_random_baseline": {"accuracy": 0.6246478873239436, "accuracy_std": 0.01356433822835089, "avg_cost": 0.00016217964788732396, "cost_std": 1.108321838684746e-05, "small_model_fraction": 0.7718890375489694}}}}, {"trial": 25, "trial_seed": 66, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5555555555555556, "large_model_accuracy": 0.8304093567251462, "small_model_cost": 3.220716374269006e-05, "large_model_cost": 0.0006192894736842116, "hybrid_results": {"0.05": {"accuracy": 0.8133802816901409, "large_model_usage": 0.823943661971831, "small_model_usage": 0.176056338028169, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005460373239436619, "unrestricted": {"accuracy": 0.8133802816901409, "large_model_usage": 0.823943661971831, "avg_cost": 0.0005465091549295773}, "random_baseline": {"accuracy": 0.7940140845070423, "accuracy_std": 0.010938186314795144, "avg_cost": 0.0005465783802816904, "cost_std": 8.749695498994992e-06, "small_model_fraction": 0.12477321918939485}, "unrestricted_random_baseline": {"accuracy": 0.7957746478873239, "accuracy_std": 0.007216162511239183, "avg_cost": 0.0005504030281690144, "cost_std": 1.0035764177009304e-05, "small_model_fraction": 0.12396953122618161}}, "0.1": {"accuracy": 0.7676056338028169, "large_model_usage": 0.6302816901408451, "small_model_usage": 0.3697183098591549, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00042138239436619707, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.6302816901408451, "avg_cost": 0.00042629788732394345}, "random_baseline": {"accuracy": 0.7320422535211268, "accuracy_std": 0.011404320239930722, "avg_cost": 0.0004214792605633803, "cost_std": 1.3854581831362901e-05, "small_model_fraction": 0.3371027809332695}, "unrestricted_random_baseline": {"accuracy": 0.7362676056338028, "accuracy_std": 0.014725720891779195, "avg_cost": 0.00042873764084507026, "cost_std": 1.3928577397246986e-05, "small_model_fraction": 0.3287300317045693}}, "0.15": {"accuracy": 0.7007042253521126, "large_model_usage": 0.44014084507042256, "small_model_usage": 0.5598591549295775, "avg_lambda": 0.5800000000000017, "avg_cost": 0.00029620633802816896, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.44014084507042256, "avg_cost": 0.0003071288732394365}, "random_baseline": {"accuracy": 0.6869718309859155, "accuracy_std": 0.01872829034784817, "avg_cost": 0.0002994217957746479, "cost_std": 1.6656689166831585e-05, "small_model_fraction": 0.550319998039499}, "unrestricted_random_baseline": {"accuracy": 0.6841549295774648, "accuracy_std": 0.011997693688310421, "avg_cost": 0.0003065336971830986, "cost_std": 1.8740866912754836e-05, "small_model_fraction": 0.5317152214582467}}, "0.2": {"accuracy": 0.6619718309859155, "large_model_usage": 0.30985915492957744, "small_model_usage": 0.6901408450704225, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00021438943661971838, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.30985915492957744, "avg_cost": 0.0002259880281690141}, "random_baseline": {"accuracy": 0.648943661971831, "accuracy_std": 0.017467770077200646, "avg_cost": 0.00021399394366197177, "cost_std": 1.9763357422583237e-05, "small_model_fraction": 0.6896818899292414}, "unrestricted_random_baseline": {"accuracy": 0.6443661971830986, "accuracy_std": 0.01746422079948907, "avg_cost": 0.00021703045774647882, "cost_std": 1.2779664208403784e-05, "small_model_fraction": 0.6699255604454745}}, "0.25": {"accuracy": 0.5950704225352113, "large_model_usage": 0.23943661971830985, "small_model_usage": 0.7605633802816901, "avg_lambda": 0.28000000000000086, "avg_cost": 0.0001653683098591549, "unrestricted": {"accuracy": 0.6584507042253521, "large_model_usage": 0.23943661971830985, "avg_cost": 0.00018192464788732398}, "random_baseline": {"accuracy": 0.6186619718309859, "accuracy_std": 0.009052788825480624, "avg_cost": 0.00016704598591549285, "cost_std": 1.3966950940086588e-05, "small_model_fraction": 0.7731814707042889}, "unrestricted_random_baseline": {"accuracy": 0.6161971830985914, "accuracy_std": 0.011888692264883222, "avg_cost": 0.00017340253521126755, "cost_std": 1.2229496292470368e-05, "small_model_fraction": 0.7449804199354142}}}}, {"trial": 26, "trial_seed": 67, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5423976608187134, "large_model_accuracy": 0.8245614035087719, "small_model_cost": 3.226798245614036e-05, "large_model_cost": 0.0006214649122807027, "hybrid_results": {"0.05": {"accuracy": 0.795774647887324, "large_model_usage": 0.7394366197183099, "small_model_usage": 0.2605633802816901, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004907676056338028, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.7394366197183099, "avg_cost": 0.0004951690140845071}, "random_baseline": {"accuracy": 0.773943661971831, "accuracy_std": 0.017018374610696576, "avg_cost": 0.00047983746478873223, "cost_std": 1.6293265400959474e-05, "small_model_fraction": 0.22182278968394478}, "unrestricted_random_baseline": {"accuracy": 0.7908450704225352, "accuracy_std": 0.013472624274442978, "avg_cost": 0.000497008802816901, "cost_std": 6.22306059451965e-06, "small_model_fraction": 0.2143526074275389}}, "0.1": {"accuracy": 0.7464788732394366, "large_model_usage": 0.5352112676056338, "small_model_usage": 0.46478873239436624, "avg_lambda": 0.6100000000000032, "avg_cost": 0.00035573239436619735, "unrestricted": {"accuracy": 0.7605633802816901, "large_model_usage": 0.5352112676056338, "avg_cost": 0.00036749295774647905}, "random_baseline": {"accuracy": 0.7066901408450704, "accuracy_std": 0.017036578032961915, "avg_cost": 0.00035425838028169, "cost_std": 1.633971961011308e-05, "small_model_fraction": 0.4510079813104747}, "unrestricted_random_baseline": {"accuracy": 0.7183098591549295, "accuracy_std": 0.014172261829224049, "avg_cost": 0.00036865679577464776, "cost_std": 1.9181103314498983e-05, "small_model_fraction": 0.4310476543213584}}, "0.15": {"accuracy": 0.6725352112676056, "large_model_usage": 0.3767605633802817, "small_model_usage": 0.6232394366197183, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00025282394366197197, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.3767605633802817, "avg_cost": 0.00026776760563380295}, "random_baseline": {"accuracy": 0.6514084507042254, "accuracy_std": 0.017885105773521537, "avg_cost": 0.0002474517253521128, "cost_std": 1.2880102665350024e-05, "small_model_fraction": 0.6256668186110478}, "unrestricted_random_baseline": {"accuracy": 0.6545774647887324, "accuracy_std": 0.013768461123955748, "avg_cost": 0.00026740728873239454, "cost_std": 2.1367050907007627e-05, "small_model_fraction": 0.6003040558140987}}, "0.2": {"accuracy": 0.602112676056338, "large_model_usage": 0.2992957746478873, "small_model_usage": 0.7007042253521127, "avg_lambda": 0.29999999999999843, "avg_cost": 0.00020002816901408454, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.2992957746478873, "avg_cost": 0.00021902816901408465}, "random_baseline": {"accuracy": 0.6190140845070423, "accuracy_std": 0.01541272438608403, "avg_cost": 0.00020724000000000003, "cost_std": 1.468598013674503e-05, "small_model_fraction": 0.715273148813087}, "unrestricted_random_baseline": {"accuracy": 0.646830985915493, "accuracy_std": 0.0205345398758075, "avg_cost": 0.0002247302464788733, "cost_std": 2.196100965149287e-05, "small_model_fraction": 0.6830258660486341}}, "0.25": {"accuracy": 0.5669014084507042, "large_model_usage": 0.19014084507042253, "small_model_usage": 0.8098591549295775, "avg_lambda": 0.060000000000000074, "avg_cost": 0.00013210563380281688, "unrestricted": {"accuracy": 0.6549295774647887, "large_model_usage": 0.19014084507042253, "avg_cost": 0.00015075352112676053}, "random_baseline": {"accuracy": 0.5887323943661972, "accuracy_std": 0.01257786697147305, "avg_cost": 0.000134053838028169, "cost_std": 1.4922066095134422e-05, "small_model_fraction": 0.8305530013939415}, "unrestricted_random_baseline": {"accuracy": 0.5943661971830986, "accuracy_std": 0.014670892011267343, "avg_cost": 0.00015068394366197184, "cost_std": 1.0151891453044095e-05, "small_model_fraction": 0.7989033332100013}}}}, {"trial": 27, "trial_seed": 68, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8216374269005848, "small_model_cost": 3.2207602339181305e-05, "large_model_cost": 0.0006192397660818729, "hybrid_results": {"0.05": {"accuracy": 0.7676056338028169, "large_model_usage": 0.7394366197183099, "small_model_usage": 0.2605633802816901, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004868654929577463, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.7394366197183099, "avg_cost": 0.0004939218309859154}, "random_baseline": {"accuracy": 0.7411971830985916, "accuracy_std": 0.011704767703235662, "avg_cost": 0.0004946989788732392, "cost_std": 1.2676541797677974e-05, "small_model_fraction": 0.22549747918437565}, "unrestricted_random_baseline": {"accuracy": 0.7380281690140844, "accuracy_std": 0.008212608302599003, "avg_cost": 0.0004976868661971827, "cost_std": 1.1201381509879298e-05, "small_model_fraction": 0.2134771190337825}}, "0.1": {"accuracy": 0.7288732394366197, "large_model_usage": 0.6267605633802817, "small_model_usage": 0.37323943661971826, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004112105633802815, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.6267605633802817, "avg_cost": 0.00042423873239436603}, "random_baseline": {"accuracy": 0.7091549295774647, "accuracy_std": 0.011485567908662064, "avg_cost": 0.00042298373239436584, "cost_std": 1.0805250680497824e-05, "small_model_fraction": 0.3543744543319008}, "unrestricted_random_baseline": {"accuracy": 0.7084507042253521, "accuracy_std": 0.015809819944115268, "avg_cost": 0.00041612172535211245, "cost_std": 1.724417025928512e-05, "small_model_fraction": 0.3321811746127421}}, "0.15": {"accuracy": 0.6971830985915493, "large_model_usage": 0.46830985915492956, "small_model_usage": 0.5316901408450705, "avg_lambda": 0.5, "avg_cost": 0.00031524577464788725, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.46830985915492956, "avg_cost": 0.0003261964788732393}, "random_baseline": {"accuracy": 0.655281690140845, "accuracy_std": 0.019442869989124676, "avg_cost": 0.00030796820422535195, "cost_std": 1.5408307959571597e-05, "small_model_fraction": 0.5178489531064818}, "unrestricted_random_baseline": {"accuracy": 0.6693661971830986, "accuracy_std": 0.015938690560220084, "avg_cost": 0.0003274038732394364, "cost_std": 1.8580615679704886e-05, "small_model_fraction": 0.4991946017749728}}, "0.2": {"accuracy": 0.676056338028169, "large_model_usage": 0.38380281690140844, "small_model_usage": 0.6161971830985915, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00026030915492957727, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.38380281690140844, "avg_cost": 0.0002739499999999998}, "random_baseline": {"accuracy": 0.6355633802816901, "accuracy_std": 0.022115927813298894, "avg_cost": 0.00025983753521126746, "cost_std": 1.357280659291837e-05, "small_model_fraction": 0.611432615316837}, "unrestricted_random_baseline": {"accuracy": 0.6471830985915494, "accuracy_std": 0.0147551597463077, "avg_cost": 0.00027529042253521106, "cost_std": 1.063730029555723e-05, "small_model_fraction": 0.5881956516325071}}, "0.25": {"accuracy": 0.6267605633802817, "large_model_usage": 0.323943661971831, "small_model_usage": 0.676056338028169, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00021480211267605633, "unrestricted": {"accuracy": 0.6830985915492958, "large_model_usage": 0.323943661971831, "avg_cost": 0.00023535845070422507}, "random_baseline": {"accuracy": 0.6235915492957746, "accuracy_std": 0.018728290347848193, "avg_cost": 0.00020583285211267605, "cost_std": 1.6060450260289433e-05, "small_model_fraction": 0.6889531415574871}, "unrestricted_random_baseline": {"accuracy": 0.6242957746478873, "accuracy_std": 0.01696364714147312, "avg_cost": 0.00023123070422535196, "cost_std": 1.116958382846956e-05, "small_model_fraction": 0.6539357450708798}}}}, {"trial": 28, "trial_seed": 69, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5716374269005848, "large_model_accuracy": 0.8494152046783626, "small_model_cost": 3.22343567251462e-05, "large_model_cost": 0.0006201140350877194, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.8450704225352113, "small_model_usage": 0.15492957746478875, "avg_lambda": 0.879999999999997, "avg_cost": 0.0005501190140845075, "unrestricted": {"accuracy": 0.8450704225352113, "large_model_usage": 0.8450704225352113, "avg_cost": 0.0005580274647887329}, "random_baseline": {"accuracy": 0.8151408450704226, "accuracy_std": 0.008802816901408439, "avg_cost": 0.0005541610563380281, "cost_std": 1.0672127904272714e-05, "small_model_fraction": 0.1190635151706037}, "unrestricted_random_baseline": {"accuracy": 0.8214788732394366, "accuracy_std": 0.011360750566832152, "avg_cost": 0.0005563522183098591, "cost_std": 1.5219401968108443e-05, "small_model_fraction": 0.10561101630850173}}, "0.1": {"accuracy": 0.7887323943661971, "large_model_usage": 0.6830985915492958, "small_model_usage": 0.31690140845070425, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004445485915492958, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.6830985915492958, "avg_cost": 0.00045756267605633804}, "random_baseline": {"accuracy": 0.7584507042253521, "accuracy_std": 0.009984117520251976, "avg_cost": 0.00043684073943661965, "cost_std": 1.524339107320557e-05, "small_model_fraction": 0.29864179695312426}, "unrestricted_random_baseline": {"accuracy": 0.7538732394366197, "accuracy_std": 0.009764383537931007, "avg_cost": 0.00045163602112676044, "cost_std": 7.031191676651994e-06, "small_model_fraction": 0.27650447024149083}}, "0.15": {"accuracy": 0.721830985915493, "large_model_usage": 0.5070422535211268, "small_model_usage": 0.49295774647887325, "avg_lambda": 0.5, "avg_cost": 0.0003325626760563382, "unrestricted": {"accuracy": 0.7640845070422535, "large_model_usage": 0.5070422535211268, "avg_cost": 0.00034814014084507056}, "random_baseline": {"accuracy": 0.7007042253521127, "accuracy_std": 0.023462438378813642, "avg_cost": 0.00033066827464788744, "cost_std": 1.7578707883410506e-05, "small_model_fraction": 0.4891330141438137}, "unrestricted_random_baseline": {"accuracy": 0.7112676056338029, "accuracy_std": 0.014172261829224049, "avg_cost": 0.0003499071830985917, "cost_std": 2.2359226231005297e-05, "small_model_fraction": 0.46263530489807075}}, "0.2": {"accuracy": 0.6654929577464789, "large_model_usage": 0.39436619718309857, "small_model_usage": 0.6056338028169015, "avg_lambda": 0.34999999999999865, "avg_cost": 0.0002582387323943661, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.39436619718309857, "avg_cost": 0.00027721056338028173}, "random_baseline": {"accuracy": 0.653169014084507, "accuracy_std": 0.016459149144394707, "avg_cost": 0.00025757077464788733, "cost_std": 2.1013148844125363e-05, "small_model_fraction": 0.6155601494872011}, "unrestricted_random_baseline": {"accuracy": 0.6672535211267606, "accuracy_std": 0.016905075893179983, "avg_cost": 0.0002708735563380282, "cost_std": 1.9291773755342075e-05, "small_model_fraction": 0.5832885271056314}}, "0.25": {"accuracy": 0.5880281690140845, "large_model_usage": 0.3274647887323944, "small_model_usage": 0.6725352112676056, "avg_lambda": 0.25, "avg_cost": 0.00021040774647887317, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.3274647887323944, "avg_cost": 0.000235280985915493}, "random_baseline": {"accuracy": 0.6327464788732394, "accuracy_std": 0.01659418544781435, "avg_cost": 0.00020507415492957746, "cost_std": 1.831255731216822e-05, "small_model_fraction": 0.6969220125961914}, "unrestricted_random_baseline": {"accuracy": 0.6545774647887325, "accuracy_std": 0.014641283722740702, "avg_cost": 0.0002391483450704225, "cost_std": 1.0924291280372676e-05, "small_model_fraction": 0.6546119271278529}}}}, {"trial": 29, "trial_seed": 70, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5906432748538012, "large_model_accuracy": 0.8406432748538012, "small_model_cost": 3.226681286549711e-05, "large_model_cost": 0.00062158187134503, "hybrid_results": {"0.05": {"accuracy": 0.8098591549295775, "large_model_usage": 0.8345070422535211, "small_model_usage": 0.16549295774647887, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005571602112676059, "unrestricted": {"accuracy": 0.8098591549295775, "large_model_usage": 0.8345070422535211, "avg_cost": 0.0005581038732394369}, "random_baseline": {"accuracy": 0.8109154929577466, "accuracy_std": 0.0066901408450704145, "avg_cost": 0.0005610594718309859, "cost_std": 1.053557050402685e-05, "small_model_fraction": 0.10931616144959169}, "unrestricted_random_baseline": {"accuracy": 0.8123239436619718, "accuracy_std": 0.01527130171986017, "avg_cost": 0.0005620054225352114, "cost_std": 1.1894192940640717e-05, "small_model_fraction": 0.10771487541718354}}, "0.1": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6267605633802817, "small_model_usage": 0.37323943661971826, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00042226584507042267, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.6267605633802817, "avg_cost": 0.00042764612676056353}, "random_baseline": {"accuracy": 0.7503521126760563, "accuracy_std": 0.016624044555088535, "avg_cost": 0.0004159550352112676, "cost_std": 9.555850254101042e-06, "small_model_fraction": 0.33821641481358766}, "unrestricted_random_baseline": {"accuracy": 0.751056338028169, "accuracy_std": 0.013732394366197165, "avg_cost": 0.00042504278169014077, "cost_std": 1.7962189598261877e-05, "small_model_fraction": 0.3290866944497091}}, "0.15": {"accuracy": 0.7147887323943662, "large_model_usage": 0.397887323943662, "small_model_usage": 0.602112676056338, "avg_lambda": 0.5100000000000009, "avg_cost": 0.00027439964788732375, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.397887323943662, "avg_cost": 0.0002842095070422534}, "random_baseline": {"accuracy": 0.6869718309859154, "accuracy_std": 0.016624044555088553, "avg_cost": 0.00027430045774647874, "cost_std": 1.8194449632983734e-05, "small_model_fraction": 0.5891283761753121}, "unrestricted_random_baseline": {"accuracy": 0.6890845070422535, "accuracy_std": 0.016594185447814318, "avg_cost": 0.00028249257042253507, "cost_std": 1.4207259389568296e-05, "small_model_fraction": 0.5724821713757272}}, "0.2": {"accuracy": 0.6795774647887324, "large_model_usage": 0.27816901408450706, "small_model_usage": 0.721830985915493, "avg_lambda": 0.35999999999999954, "avg_cost": 0.00019706866197183084, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.27816901408450706, "avg_cost": 0.00020877288732394351}, "random_baseline": {"accuracy": 0.6640845070422535, "accuracy_std": 0.011157027829404816, "avg_cost": 0.00020362623239436606, "cost_std": 1.6802239511739166e-05, "small_model_fraction": 0.7203501815624191}, "unrestricted_random_baseline": {"accuracy": 0.6630281690140845, "accuracy_std": 0.012503718946876534, "avg_cost": 0.0002078869366197182, "cost_std": 2.024609798505775e-05, "small_model_fraction": 0.7004894548022541}}, "0.25": {"accuracy": 0.6373239436619719, "large_model_usage": 0.23943661971830985, "small_model_usage": 0.7605633802816901, "avg_lambda": 0.2700000000000005, "avg_cost": 0.00016804049295774634, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.23943661971830985, "avg_cost": 0.0001845264084507041}, "random_baseline": {"accuracy": 0.6338028169014084, "accuracy_std": 0.01659044928083064, "avg_cost": 0.0001628509507042253, "cost_std": 1.6559783729459892e-05, "small_model_fraction": 0.7696076519026118}, "unrestricted_random_baseline": {"accuracy": 0.6461267605633803, "accuracy_std": 0.014959609708625352, "avg_cost": 0.0001805014436619717, "cost_std": 1.1139011820831484e-05, "small_model_fraction": 0.7416329459185286}}}}, {"trial": 30, "trial_seed": 71, "subject": "TruthfulQA", "method": "standard", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5730994152046783, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.2219444444444495e-05, "large_model_cost": 0.0006187397660818721, "hybrid_results": {"0.05": {"accuracy": 0.8028169014084507, "large_model_usage": 0.5985915492957746, "small_model_usage": 0.4014084507042254, "avg_lambda": 0.7899999999999981, "avg_cost": 0.0004043232394366197, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.5985915492957746, "avg_cost": 0.00040552042253521134}, "random_baseline": {"accuracy": 0.7471830985915494, "accuracy_std": 0.016650127348643483, "avg_cost": 0.00040870591549295774, "cost_std": 1.4855489320577053e-05, "small_model_fraction": 0.365573908925528}, "unrestricted_random_baseline": {"accuracy": 0.7450704225352113, "accuracy_std": 0.011485567908662021, "avg_cost": 0.00040469679577464796, "cost_std": 2.0510381866119294e-05, "small_model_fraction": 0.36353274674507813}}, "0.1": {"accuracy": 0.7746478873239436, "large_model_usage": 0.4859154929577465, "small_model_usage": 0.5140845070422535, "avg_lambda": 0.6699999999999969, "avg_cost": 0.0003296471830985917, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.4859154929577465, "avg_cost": 0.00033426690140845087}, "random_baseline": {"accuracy": 0.7183098591549295, "accuracy_std": 0.013727879358885885, "avg_cost": 0.00032612989436619735, "cost_std": 1.04921169521466e-05, "small_model_fraction": 0.49289440164017073}, "unrestricted_random_baseline": {"accuracy": 0.7179577464788731, "accuracy_std": 0.014384998743532069, "avg_cost": 0.00033364961267605647, "cost_std": 1.5742391215616823e-05, "small_model_fraction": 0.48501791699090574}}, "0.15": {"accuracy": 0.7077464788732394, "large_model_usage": 0.36971830985915494, "small_model_usage": 0.630281690140845, "avg_lambda": 0.4800000000000006, "avg_cost": 0.000250337323943662, "unrestricted": {"accuracy": 0.7464788732394366, "large_model_usage": 0.36971830985915494, "avg_cost": 0.0002622598591549297}, "random_baseline": {"accuracy": 0.6795774647887323, "accuracy_std": 0.015185815952709715, "avg_cost": 0.0002560749647887325, "cost_std": 1.6103936335848992e-05, "small_model_fraction": 0.6281153926767902}, "unrestricted_random_baseline": {"accuracy": 0.6908450704225352, "accuracy_std": 0.020153645100358004, "avg_cost": 0.00026378728873239447, "cost_std": 1.2574998978998467e-05, "small_model_fraction": 0.6077878187267814}}, "0.2": {"accuracy": 0.6619718309859155, "large_model_usage": 0.2992957746478873, "small_model_usage": 0.7007042253521127, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00020306267605633795, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.2992957746478873, "avg_cost": 0.0002181260563380281}, "random_baseline": {"accuracy": 0.6630281690140845, "accuracy_std": 0.007394366197183089, "avg_cost": 0.0002035836267605634, "cost_std": 1.5972494634076342e-05, "small_model_fraction": 0.7087172851318448}, "unrestricted_random_baseline": {"accuracy": 0.6683098591549296, "accuracy_std": 0.017378820675004295, "avg_cost": 0.00022612084507042256, "cost_std": 1.501247789857586e-05, "small_model_fraction": 0.6830346621672445}}, "0.25": {"accuracy": 0.6126760563380281, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.22999999999999873, "avg_cost": 0.00017225281690140838, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.25704225352112675, "avg_cost": 0.00019241478873239438}, "random_baseline": {"accuracy": 0.652112676056338, "accuracy_std": 0.012870187945420452, "avg_cost": 0.00016596573943661975, "cost_std": 1.7458261540142446e-05, "small_model_fraction": 0.7612471941875373}, "unrestricted_random_baseline": {"accuracy": 0.6609154929577465, "accuracy_std": 0.01189390546060177, "avg_cost": 0.00019496947183098587, "cost_std": 1.4309967713394396e-05, "small_model_fraction": 0.726871621701492}}}}], "unrestricted_hybrid": {"0.05": {"avg_accuracy": 0.8122065727699532, "std_accuracy": 0.01546447802065784, "avg_cost": 0.0005252869131455398, "std_cost": 3.4766280573092e-05}, "0.1": {"avg_accuracy": 0.7847417840375589, "std_accuracy": 0.015948626996678924, "avg_cost": 0.00042095780516431913, "std_cost": 2.733121658916219e-05}, "0.15": {"avg_accuracy": 0.7406103286384976, "std_accuracy": 0.02238678624880467, "avg_cost": 0.0003132824530516431, "std_cost": 3.571087930203795e-05}, "0.2": {"avg_accuracy": 0.7125586854460093, "std_accuracy": 0.022389555196258605, "avg_cost": 0.0002470491197183098, "std_cost": 3.317930988843697e-05}, "0.25": {"avg_accuracy": 0.6866197183098591, "std_accuracy": 0.02137955178227929, "avg_cost": 0.00020566907276995305, "std_cost": 2.4563329702974556e-05}}, "random_baseline": {"0.05": {"avg_accuracy": 0.7896713615023476, "std_accuracy": 0.019614147527283063, "avg_cost": 0.0005233575481220656, "std_cost": 3.559298724881553e-05, "avg_small_model_fraction": 0.16818400295321098, "std_small_model_fraction": 0.05878007146071353}, "0.1": {"avg_accuracy": 0.7353638497652581, "std_accuracy": 0.018491265341083255, "avg_cost": 0.00040848265023474164, "std_cost": 2.5846610287508148e-05, "avg_small_model_fraction": 0.36234247648169887, "std_small_model_fraction": 0.04368753729626794}, "0.15": {"avg_accuracy": 0.6844953051643191, "std_accuracy": 0.0212703421686662, "avg_cost": 0.0003011152664319248, "std_cost": 3.481014353965516e-05, "avg_small_model_fraction": 0.5414316316375996, "std_small_model_fraction": 0.05949951637260429}, "0.2": {"avg_accuracy": 0.6531572769953052, "std_accuracy": 0.021320799714689222, "avg_cost": 0.00023315249413145538, "std_cost": 3.250641990532546e-05, "avg_small_model_fraction": 0.6569085707293006, "std_small_model_fraction": 0.056259453113138104}, "0.25": {"avg_accuracy": 0.6317957746478874, "std_accuracy": 0.020857269856778084, "avg_cost": 0.00018690477934272298, "std_cost": 2.425851029115317e-05, "avg_small_model_fraction": 0.7352532969144809, "std_small_model_fraction": 0.04162754454917326}}, "unrestricted_random_baseline": {"0.05": {"avg_accuracy": 0.7927347417840375, "std_accuracy": 0.021064949354219188, "avg_cost": 0.0005271602218309859, "std_cost": 3.5822886522544885e-05, "avg_small_model_fraction": 0.16112791700969012, "std_small_model_fraction": 0.05906032004172614}, "0.1": {"avg_accuracy": 0.7414436619718309, "std_accuracy": 0.01894487272951582, "avg_cost": 0.00042171946713615005, "std_cost": 2.786470227563806e-05, "avg_small_model_fraction": 0.3386460976718494, "std_small_model_fraction": 0.04621592820357756}, "0.15": {"avg_accuracy": 0.6905399061032864, "std_accuracy": 0.020313927930538548, "avg_cost": 0.0003138355962441313, "std_cost": 3.734637452025399e-05, "avg_small_model_fraction": 0.5218357143275404, "std_small_model_fraction": 0.060641125289020006}, "0.2": {"avg_accuracy": 0.660774647887324, "std_accuracy": 0.019097062896708216, "avg_cost": 0.0002473593474178403, "std_cost": 3.223517574564706e-05, "avg_small_model_fraction": 0.6345135780049624, "std_small_model_fraction": 0.05641382427044541}, "0.25": {"avg_accuracy": 0.6404107981220657, "std_accuracy": 0.019314631163101503, "avg_cost": 0.0002055586654929577, "std_cost": 2.6164326841940596e-05, "avg_small_model_fraction": 0.7049140100747773, "std_small_model_fraction": 0.04178495858909004}}}