{"subject": "TruthfulQA", "method": "standard_basescores", "iterations": 30, "random_seed": 42, "calibration_size": 400, "total_examples": 684, "small_model": {"avg_accuracy": 0.556335282651072, "std_accuracy": 0.012042461547327405, "avg_cost": 3.203621832358677e-05, "std_cost": 4.1788899969096624e-08, "x_position": 1.0, "x_std": 0.0}, "large_model": {"avg_accuracy": 0.8313840155945417, "std_accuracy": 0.009469752671727963, "avg_cost": 0.0006146627680311897, "std_cost": 1.1901210604054217e-06, "x_position": 0.0, "x_std": 0.0}, "hybrid_models": {"0.05": {"avg_accuracy": 0.813262910798122, "std_accuracy": 0.020959496977898402, "avg_large_model_usage": 0.8478873239436618, "std_large_model_usage": 0.09638887547026236, "avg_lambda": 0.9173333333333361, "std_lambda": 0.056269194256022816, "avg_cost": 0.0005541338145539905, "std_cost": 5.939112698712658e-05}, "0.1": {"avg_accuracy": 0.7710093896713615, "std_accuracy": 0.025104478723754618, "avg_large_model_usage": 0.6311032863849765, "std_large_model_usage": 0.0426592737780322, "avg_lambda": 0.7409999999999987, "std_lambda": 0.03599537007264653, "avg_cost": 0.0004140784154929576, "std_cost": 2.9726840583037227e-05}, "0.15": {"avg_accuracy": 0.7144366197183097, "std_accuracy": 0.027775270446031584, "avg_large_model_usage": 0.45575117370892027, "std_large_model_usage": 0.05261057770803232, "avg_lambda": 0.5580000000000006, "std_lambda": 0.0593520569708132, "avg_cost": 0.00030386010563380274, "std_cost": 3.159152929040863e-05}, "0.2": {"avg_accuracy": 0.6610328638497652, "std_accuracy": 0.03509291533791988, "avg_large_model_usage": 0.328169014084507, "std_large_model_usage": 0.04431027103306866, "avg_lambda": 0.39433333333333415, "std_lambda": 0.058747103947532554, "avg_cost": 0.00022206550469483567, "std_cost": 2.8176740114108406e-05}, "0.25": {"avg_accuracy": 0.5978873239436618, "std_accuracy": 0.03656324645330948, "avg_large_model_usage": 0.2401408450704225, "std_large_model_usage": 0.044514990078973825, "avg_lambda": 0.20333333333333356, "std_lambda": 0.08908173525227027, "avg_cost": 0.0001630117488262911, "std_cost": 2.550645265167248e-05}}, "all_trials": [{"trial": 1, "trial_seed": 42, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5350877192982456, "large_model_accuracy": 0.8128654970760234, "small_model_cost": 3.2059795321637445e-05, "large_model_cost": 0.000615394736842106, "hybrid_results": {"0.05": {"accuracy": 0.7887323943661971, "large_model_usage": 0.8204225352112676, "small_model_usage": 0.17957746478873238, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005383644366197183, "unrestricted": {"accuracy": 0.7922535211267606, "large_model_usage": 0.8204225352112676, "avg_cost": 0.0005392447183098591}, "random_baseline": {"accuracy": 0.7732394366197183, "accuracy_std": 0.009474383131742039, "avg_cost": 0.0005395949647887321, "cost_std": 8.159293933351457e-06, "small_model_fraction": 0.13205157918640606}, "unrestricted_random_baseline": {"accuracy": 0.7742957746478873, "accuracy_std": 0.008250263742154906, "avg_cost": 0.0005435581690140842, "cost_std": 7.895630644988923e-06, "small_model_fraction": 0.13054252902073907}}, "0.1": {"accuracy": 0.7464788732394366, "large_model_usage": 0.6302816901408451, "small_model_usage": 0.3697183098591549, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00041673767605633757, "unrestricted": {"accuracy": 0.7711267605633803, "large_model_usage": 0.6302816901408451, "avg_cost": 0.0004210193661971826}, "random_baseline": {"accuracy": 0.7084507042253521, "accuracy_std": 0.008151997818866387, "avg_cost": 0.00041510183098591536, "cost_std": 2.0298138478792247e-05, "small_model_fraction": 0.3405540224762925}, "unrestricted_random_baseline": {"accuracy": 0.7179577464788733, "accuracy_std": 0.012739473602201106, "avg_cost": 0.00042247088028169, "cost_std": 8.454570812978613e-06, "small_model_fraction": 0.333214002470488}}, "0.15": {"accuracy": 0.6971830985915493, "large_model_usage": 0.4471830985915493, "small_model_usage": 0.5528169014084507, "avg_lambda": 0.6100000000000032, "avg_cost": 0.00029853345070422505, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.4471830985915493, "avg_cost": 0.000308082746478873}, "random_baseline": {"accuracy": 0.6443661971830987, "accuracy_std": 0.019414153171958062, "avg_cost": 0.00029302746478873225, "cost_std": 1.1568431464605702e-05, "small_model_fraction": 0.5431892787220644}, "unrestricted_random_baseline": {"accuracy": 0.6591549295774647, "accuracy_std": 0.019208706615473012, "avg_cost": 0.0003141594718309858, "cost_std": 1.9466427174327456e-05, "small_model_fraction": 0.526819102524908}}, "0.2": {"accuracy": 0.6830985915492958, "large_model_usage": 0.38380281690140844, "small_model_usage": 0.6161971830985915, "avg_lambda": 0.5, "avg_cost": 0.0002591531690140845, "unrestricted": {"accuracy": 0.6971830985915493, "large_model_usage": 0.38380281690140844, "avg_cost": 0.00026866725352112667}, "random_baseline": {"accuracy": 0.6161971830985916, "accuracy_std": 0.00862496388303938, "avg_cost": 0.0002472634507042253, "cost_std": 9.999497523579044e-06, "small_model_fraction": 0.6106981469333452}, "unrestricted_random_baseline": {"accuracy": 0.6330985915492958, "accuracy_std": 0.015731202749076547, "avg_cost": 0.0002637840492957746, "cost_std": 1.2987634205648089e-05, "small_model_fraction": 0.5943883327428159}}, "0.25": {"accuracy": 0.6091549295774648, "large_model_usage": 0.2605633802816901, "small_model_usage": 0.7394366197183099, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0001788573943661971, "unrestricted": {"accuracy": 0.6690140845070423, "large_model_usage": 0.2605633802816901, "avg_cost": 0.00019193485915492951}, "random_baseline": {"accuracy": 0.5904929577464789, "accuracy_std": 0.012797733693933961, "avg_cost": 0.0001748778873239435, "cost_std": 9.58694650760163e-06, "small_model_fraction": 0.7483476668448316}, "unrestricted_random_baseline": {"accuracy": 0.5975352112676057, "accuracy_std": 0.015671981045259072, "avg_cost": 0.00019467637323943645, "cost_std": 2.376117267478243e-05, "small_model_fraction": 0.7259292175836817}}}}, {"trial": 2, "trial_seed": 43, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5555555555555556, "large_model_accuracy": 0.8260233918128655, "small_model_cost": 3.205482456140351e-05, "large_model_cost": 0.0006160906432748544, "hybrid_results": {"0.05": {"accuracy": 0.823943661971831, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.000646777464788732, "unrestricted": {"accuracy": 0.823943661971831, "large_model_usage": 1.0, "avg_cost": 0.000646777464788732}, "random_baseline": {"accuracy": 0.823943661971831, "accuracy_std": 0.0, "avg_cost": 0.0006147605633802814, "cost_std": 0.0, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.823943661971831, "accuracy_std": 0.0, "avg_cost": 0.0006147605633802814, "cost_std": 0.0, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.7359154929577465, "large_model_usage": 0.6161971830985915, "small_model_usage": 0.3838028169014085, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00041003802816901366, "unrestricted": {"accuracy": 0.7464788732394366, "large_model_usage": 0.6161971830985915, "avg_cost": 0.000413502816901408}, "random_baseline": {"accuracy": 0.7359154929577465, "accuracy_std": 0.018431341307468162, "avg_cost": 0.0004089239436619715, "cost_std": 1.498189773344807e-05, "small_model_fraction": 0.35280818145665416}, "unrestricted_random_baseline": {"accuracy": 0.7352112676056338, "accuracy_std": 0.01841788286003099, "avg_cost": 0.00041190845070422497, "cost_std": 2.060170805002476e-05, "small_model_fraction": 0.34687568789825085}}, "0.15": {"accuracy": 0.6936619718309859, "large_model_usage": 0.4788732394366197, "small_model_usage": 0.5211267605633803, "avg_lambda": 0.6299999999999976, "avg_cost": 0.0003190309859154926, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.4788732394366197, "avg_cost": 0.0003283056338028166}, "random_baseline": {"accuracy": 0.6887323943661972, "accuracy_std": 0.01261723441349924, "avg_cost": 0.0003208741549295772, "cost_std": 1.5402737472362816e-05, "small_model_fraction": 0.5086326006746343}, "unrestricted_random_baseline": {"accuracy": 0.702112676056338, "accuracy_std": 0.016901408450704238, "avg_cost": 0.00033673035211267575, "cost_std": 2.022886131160849e-05, "small_model_fraction": 0.49275232828354243}}, "0.2": {"accuracy": 0.647887323943662, "large_model_usage": 0.30985915492957744, "small_model_usage": 0.6901408450704225, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00021392535211267594, "unrestricted": {"accuracy": 0.6654929577464789, "large_model_usage": 0.30985915492957744, "avg_cost": 0.00022449577464788724}, "random_baseline": {"accuracy": 0.6485915492957748, "accuracy_std": 0.013978474113576915, "avg_cost": 0.0002124919718309859, "cost_std": 1.1507763324719107e-05, "small_model_fraction": 0.6885969631932718}, "unrestricted_random_baseline": {"accuracy": 0.6457746478873239, "accuracy_std": 0.01104534305729444, "avg_cost": 0.00022764580985915484, "cost_std": 2.0666605204572412e-05, "small_model_fraction": 0.6704980346746469}}, "0.25": {"accuracy": 0.602112676056338, "large_model_usage": 0.2605633802816901, "small_model_usage": 0.7394366197183099, "avg_lambda": 0.25, "avg_cost": 0.00017558732394366207, "unrestricted": {"accuracy": 0.6514084507042254, "large_model_usage": 0.2605633802816901, "avg_cost": 0.00019381267605633802}, "random_baseline": {"accuracy": 0.6306338028169014, "accuracy_std": 0.013768461123955768, "avg_cost": 0.00018161830985915492, "cost_std": 1.119192526368517e-05, "small_model_fraction": 0.754240245575279}, "unrestricted_random_baseline": {"accuracy": 0.6309859154929578, "accuracy_std": 0.013156015276246087, "avg_cost": 0.00019130348591549296, "cost_std": 1.1649266834672859e-05, "small_model_fraction": 0.7230343648249787}}}}, {"trial": 3, "trial_seed": 44, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5760233918128655, "large_model_accuracy": 0.8362573099415205, "small_model_cost": 3.208538011695908e-05, "large_model_cost": 0.0006144093567251462, "hybrid_results": {"0.05": {"accuracy": 0.8345070422535211, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0006475010563380279, "unrestricted": {"accuracy": 0.8345070422535211, "large_model_usage": 1.0, "avg_cost": 0.0006475010563380279}, "random_baseline": {"accuracy": 0.8345070422535212, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006154014084507043, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8345070422535212, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006154014084507043, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6408450704225352, "small_model_usage": 0.35915492957746475, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00042429683098591507, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.6408450704225352, "avg_cost": 0.0004299588028169011}, "random_baseline": {"accuracy": 0.7443661971830986, "accuracy_std": 0.022336269756554512, "avg_cost": 0.0004229963380281688, "cost_std": 1.8374047993291708e-05, "small_model_fraction": 0.32647209006670713}, "unrestricted_random_baseline": {"accuracy": 0.7609154929577464, "accuracy_std": 0.020193592899915, "avg_cost": 0.000439301936619718, "cost_std": 1.3050169521885118e-05, "small_model_fraction": 0.3167490285778692}}, "0.15": {"accuracy": 0.7007042253521126, "large_model_usage": 0.43661971830985913, "small_model_usage": 0.5633802816901409, "avg_lambda": 0.5300000000000009, "avg_cost": 0.00029137429577464794, "unrestricted": {"accuracy": 0.7359154929577465, "large_model_usage": 0.43661971830985913, "avg_cost": 0.00030360669014084507}, "random_baseline": {"accuracy": 0.6989436619718309, "accuracy_std": 0.01512445886965814, "avg_cost": 0.00030322408450704215, "cost_std": 2.015524778368104e-05, "small_model_fraction": 0.5547342612132392}, "unrestricted_random_baseline": {"accuracy": 0.703169014084507, "accuracy_std": 0.009584970132969182, "avg_cost": 0.00030279926056338025, "cost_std": 1.2103369515203333e-05, "small_model_fraction": 0.5337280947877279}}, "0.2": {"accuracy": 0.6338028169014085, "large_model_usage": 0.2711267605633803, "small_model_usage": 0.7288732394366197, "avg_lambda": 0.35999999999999954, "avg_cost": 0.0001887052816901408, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2711267605633803, "avg_cost": 0.00020029683098591542}, "random_baseline": {"accuracy": 0.6443661971830986, "accuracy_std": 0.012695603082619682, "avg_cost": 0.0001859207042253521, "cost_std": 6.0320252486603995e-06, "small_model_fraction": 0.7310433575388183}, "unrestricted_random_baseline": {"accuracy": 0.6514084507042254, "accuracy_std": 0.013637265303547225, "avg_cost": 0.00020299038732394365, "cost_std": 1.4451401074241194e-05, "small_model_fraction": 0.7111376868788346}}, "0.25": {"accuracy": 0.5950704225352113, "large_model_usage": 0.22535211267605634, "small_model_usage": 0.7746478873239436, "avg_lambda": 0.20000000000000093, "avg_cost": 0.0001566348591549295, "unrestricted": {"accuracy": 0.6619718309859155, "large_model_usage": 0.22535211267605634, "avg_cost": 0.00017267711267605627}, "random_baseline": {"accuracy": 0.6299295774647886, "accuracy_std": 0.006944747508209895, "avg_cost": 0.00014908999999999997, "cost_std": 1.2383688979160946e-05, "small_model_fraction": 0.7861165192554441}, "unrestricted_random_baseline": {"accuracy": 0.6323943661971831, "accuracy_std": 0.01319365774309522, "avg_cost": 0.0001851157394366197, "cost_std": 1.2581646728830659e-05, "small_model_fraction": 0.7585678450370704}}}}, {"trial": 4, "trial_seed": 45, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.533625730994152, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.2054824561403526e-05, "large_model_cost": 0.0006148274853801175, "hybrid_results": {"0.05": {"accuracy": 0.8450704225352113, "large_model_usage": 0.8204225352112676, "small_model_usage": 0.17957746478873238, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005396126760563378, "unrestricted": {"accuracy": 0.8450704225352113, "large_model_usage": 0.8204225352112676, "avg_cost": 0.0005400845070422534}, "random_baseline": {"accuracy": 0.8130281690140844, "accuracy_std": 0.008399197494279257, "avg_cost": 0.0005424396126760564, "cost_std": 1.2188660742084318e-05, "small_model_fraction": 0.1290637230959211}, "unrestricted_random_baseline": {"accuracy": 0.8137323943661972, "accuracy_std": 0.013403426979348067, "avg_cost": 0.0005406595070422536, "cost_std": 7.058996405082935e-06, "small_model_fraction": 0.128254091797753}}, "0.1": {"accuracy": 0.8098591549295775, "large_model_usage": 0.6302816901408451, "small_model_usage": 0.3697183098591549, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0004196267605633798, "unrestricted": {"accuracy": 0.8098591549295775, "large_model_usage": 0.6302816901408451, "avg_cost": 0.0004218098591549291}, "random_baseline": {"accuracy": 0.7397887323943662, "accuracy_std": 0.02546191012646607, "avg_cost": 0.000417442957746479, "cost_std": 2.193598724361229e-05, "small_model_fraction": 0.33495175381512926}, "unrestricted_random_baseline": {"accuracy": 0.7556338028169015, "accuracy_std": 0.009208941430015495, "avg_cost": 0.0004321809859154931, "cost_std": 1.775892079362978e-05, "small_model_fraction": 0.33120569855494875}}, "0.15": {"accuracy": 0.721830985915493, "large_model_usage": 0.4788732394366197, "small_model_usage": 0.5211267605633803, "avg_lambda": 0.5100000000000009, "avg_cost": 0.0003129929577464787, "unrestricted": {"accuracy": 0.7711267605633803, "large_model_usage": 0.4788732394366197, "avg_cost": 0.00032897183098591534}, "random_baseline": {"accuracy": 0.6943661971830986, "accuracy_std": 0.022535211267605645, "avg_cost": 0.00031910292253521134, "cost_std": 2.1495540429533664e-05, "small_model_fraction": 0.5179284272011037}, "unrestricted_random_baseline": {"accuracy": 0.7024647887323944, "accuracy_std": 0.022059795926305765, "avg_cost": 0.0003300197887323944, "cost_std": 1.794443994764718e-05, "small_model_fraction": 0.4905097195064281}}, "0.2": {"accuracy": 0.6830985915492958, "large_model_usage": 0.36619718309859156, "small_model_usage": 0.6338028169014085, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00024295070422535207, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.36619718309859156, "avg_cost": 0.0002590070422535209}, "random_baseline": {"accuracy": 0.6524647887323944, "accuracy_std": 0.010915492957746464, "avg_cost": 0.0002469686971830986, "cost_std": 1.974290257294534e-05, "small_model_fraction": 0.6381163808067637}, "unrestricted_random_baseline": {"accuracy": 0.6647887323943661, "accuracy_std": 0.015332071166955782, "avg_cost": 0.00026350471830985915, "cost_std": 1.2255758753519867e-05, "small_model_fraction": 0.6105647485705982}}, "0.25": {"accuracy": 0.5915492957746479, "large_model_usage": 0.27816901408450706, "small_model_usage": 0.721830985915493, "avg_lambda": 0.2700000000000005, "avg_cost": 0.00018101408450704222, "unrestricted": {"accuracy": 0.6795774647887324, "large_model_usage": 0.27816901408450706, "avg_cost": 0.0002033802816901408}, "random_baseline": {"accuracy": 0.6214788732394367, "accuracy_std": 0.01824541747610089, "avg_cost": 0.00018680341549295773, "cost_std": 1.4612540508492329e-05, "small_model_fraction": 0.7443955937528507}, "unrestricted_random_baseline": {"accuracy": 0.6330985915492958, "accuracy_std": 0.012478905032865768, "avg_cost": 0.00020595845070422533, "cost_std": 1.711956048414804e-05, "small_model_fraction": 0.7060166534098408}}}}, {"trial": 5, "trial_seed": 46, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5716374269005848, "large_model_accuracy": 0.8362573099415205, "small_model_cost": 3.204400584795328e-05, "large_model_cost": 0.0006141900584795328, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.7640845070422535, "small_model_usage": 0.2359154929577465, "avg_lambda": 0.9499999999999951, "avg_cost": 0.0005032669014084505, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.7640845070422535, "avg_cost": 0.0005032669014084505}, "random_baseline": {"accuracy": 0.7827464788732394, "accuracy_std": 0.009840273670762094, "avg_cost": 0.0004969656690140842, "cost_std": 1.2003822573619786e-05, "small_model_fraction": 0.1905418005836445}, "unrestricted_random_baseline": {"accuracy": 0.7855633802816901, "accuracy_std": 0.011404320239930769, "avg_cost": 0.0004978220070422533, "cost_std": 1.3623616046591062e-05, "small_model_fraction": 0.1905418005836445}}, "0.1": {"accuracy": 0.7922535211267606, "large_model_usage": 0.6126760563380281, "small_model_usage": 0.38732394366197187, "avg_lambda": 0.780000000000001, "avg_cost": 0.00040823169014084503, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.6126760563380281, "avg_cost": 0.0004102528169014085}, "random_baseline": {"accuracy": 0.7535211267605634, "accuracy_std": 0.010680106259227523, "avg_cost": 0.00042046394366197174, "cost_std": 8.70719681007761e-06, "small_model_fraction": 0.3537915741378939}, "unrestricted_random_baseline": {"accuracy": 0.7496478873239436, "accuracy_std": 0.01254331910602595, "avg_cost": 0.00041845352112676045, "cost_std": 1.083387174003767e-05, "small_model_fraction": 0.35031971900561737}}, "0.15": {"accuracy": 0.7535211267605634, "large_model_usage": 0.47183098591549294, "small_model_usage": 0.528169014084507, "avg_lambda": 0.6100000000000032, "avg_cost": 0.0003123161971830986, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.47183098591549294, "avg_cost": 0.00032302042253521134}, "random_baseline": {"accuracy": 0.7095070422535212, "accuracy_std": 0.013384913949822707, "avg_cost": 0.00030838524647887317, "cost_std": 1.6464117597264212e-05, "small_model_fraction": 0.5185534797183962}, "unrestricted_random_baseline": {"accuracy": 0.7123239436619718, "accuracy_std": 0.012503718946876583, "avg_cost": 0.00032441109154929563, "cost_std": 1.155157014518509e-05, "small_model_fraction": 0.5001659542791623}}, "0.2": {"accuracy": 0.7007042253521126, "large_model_usage": 0.31338028169014087, "small_model_usage": 0.6866197183098591, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00021264014084507039, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.31338028169014087, "avg_cost": 0.0002245908450704226}, "random_baseline": {"accuracy": 0.6683098591549296, "accuracy_std": 0.014670892011267346, "avg_cost": 0.00021898971830985908, "cost_std": 1.6061030176618783e-05, "small_model_fraction": 0.6897752133150505}, "unrestricted_random_baseline": {"accuracy": 0.6654929577464788, "accuracy_std": 0.010082972579772066, "avg_cost": 0.00022247200704225345, "cost_std": 1.1682124071380763e-05, "small_model_fraction": 0.6692465089266427}}, "0.25": {"accuracy": 0.6514084507042254, "large_model_usage": 0.2535211267605634, "small_model_usage": 0.7464788732394366, "avg_lambda": 0.29999999999999843, "avg_cost": 0.00016955563380281681, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.2535211267605634, "avg_cost": 0.00018709788732394368}, "random_baseline": {"accuracy": 0.6471830985915493, "accuracy_std": 0.011550154554124439, "avg_cost": 0.0001647227464788732, "cost_std": 1.3122001529947817e-05, "small_model_fraction": 0.7637850032079664}, "unrestricted_random_baseline": {"accuracy": 0.6492957746478873, "accuracy_std": 0.010349956659647217, "avg_cost": 0.00018554799295774637, "cost_std": 1.0549644919909538e-05, "small_model_fraction": 0.7336512361888009}}}}, {"trial": 6, "trial_seed": 47, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5482456140350878, "large_model_accuracy": 0.8333333333333334, "small_model_cost": 3.213333333333336e-05, "large_model_cost": 0.0006180818713450298, "hybrid_results": {"0.05": {"accuracy": 0.8380281690140845, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0006479834507042255, "unrestricted": {"accuracy": 0.8380281690140845, "large_model_usage": 1.0, "avg_cost": 0.0006479834507042255}, "random_baseline": {"accuracy": 0.8380281690140844, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006159788732394368, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8380281690140844, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006159788732394368, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.7852112676056338, "large_model_usage": 0.6302816901408451, "small_model_usage": 0.3697183098591549, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00041759612676056315, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.6302816901408451, "avg_cost": 0.00042308908450704207}, "random_baseline": {"accuracy": 0.748943661971831, "accuracy_std": 0.015108054896425992, "avg_cost": 0.00041861732394366175, "cost_std": 1.5334333422474074e-05, "small_model_fraction": 0.34215589182076706}, "unrestricted_random_baseline": {"accuracy": 0.7517605633802817, "accuracy_std": 0.015448880963718502, "avg_cost": 0.00042243042253521104, "cost_std": 1.8706591940605724e-05, "small_model_fraction": 0.33278142053167026}}, "0.15": {"accuracy": 0.721830985915493, "large_model_usage": 0.43309859154929575, "small_model_usage": 0.5669014084507042, "avg_lambda": 0.5500000000000005, "avg_cost": 0.0002926172535211266, "unrestricted": {"accuracy": 0.7359154929577465, "large_model_usage": 0.43309859154929575, "avg_cost": 0.0003000538732394364}, "random_baseline": {"accuracy": 0.7017605633802818, "accuracy_std": 0.012100591744675789, "avg_cost": 0.00029343380281690124, "cost_std": 1.270751538392503e-05, "small_model_fraction": 0.5554491507535878}, "unrestricted_random_baseline": {"accuracy": 0.6936619718309859, "accuracy_std": 0.012889440294171217, "avg_cost": 0.0003086661267605633, "cost_std": 2.0422400433503835e-05, "small_model_fraction": 0.5427575588545032}}, "0.2": {"accuracy": 0.6795774647887324, "large_model_usage": 0.35563380281690143, "small_model_usage": 0.6443661971830985, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00023547640845070423, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.35563380281690143, "avg_cost": 0.00025159612676056334}, "random_baseline": {"accuracy": 0.6725352112676057, "accuracy_std": 0.014084507042253502, "avg_cost": 0.00024185841549295765, "cost_std": 1.2549321530468264e-05, "small_model_fraction": 0.6529676892660634}, "unrestricted_random_baseline": {"accuracy": 0.6757042253521127, "accuracy_std": 0.011404320239930722, "avg_cost": 0.0002511526408450702, "cost_std": 1.5736304477124072e-05, "small_model_fraction": 0.6254572215984449}}, "0.25": {"accuracy": 0.6197183098591549, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.29999999999999843, "avg_cost": 0.00018955387323943654, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00020989894366197175}, "random_baseline": {"accuracy": 0.643661971830986, "accuracy_std": 0.017450016469526562, "avg_cost": 0.00018768144366197186, "cost_std": 1.5415419445846414e-05, "small_model_fraction": 0.7313406729534994}, "unrestricted_random_baseline": {"accuracy": 0.6598591549295774, "accuracy_std": 0.019105154882043004, "avg_cost": 0.00020406257042253516, "cost_std": 1.563258231662902e-05, "small_model_fraction": 0.6966190735250373}}}}, {"trial": 7, "trial_seed": 48, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8289473684210527, "small_model_cost": 3.201330409356728e-05, "large_model_cost": 0.0006151432748538018, "hybrid_results": {"0.05": {"accuracy": 0.8063380281690141, "large_model_usage": 0.7852112676056338, "small_model_usage": 0.21478873239436624, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005194820422535211, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.7852112676056338, "avg_cost": 0.0005194820422535211}, "random_baseline": {"accuracy": 0.7848591549295774, "accuracy_std": 0.006763863631091046, "avg_cost": 0.0005258106690140842, "cost_std": 7.633074383236463e-06, "small_model_fraction": 0.16404787508274662}, "unrestricted_random_baseline": {"accuracy": 0.7859154929577465, "accuracy_std": 0.011971830985915515, "avg_cost": 0.0005234647183098589, "cost_std": 9.782891288210974e-06, "small_model_fraction": 0.16404787508274662}}, "0.1": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6830985915492958, "small_model_usage": 0.31690140845070425, "avg_lambda": 0.8000000000000037, "avg_cost": 0.00044970739436619696, "unrestricted": {"accuracy": 0.7887323943661971, "large_model_usage": 0.6830985915492958, "avg_cost": 0.0004560172535211266}, "random_baseline": {"accuracy": 0.7528169014084507, "accuracy_std": 0.016425216605198006, "avg_cost": 0.00045390552816901373, "cost_std": 1.728071883358232e-05, "small_model_fraction": 0.2837032716255757}, "unrestricted_random_baseline": {"accuracy": 0.7595070422535211, "accuracy_std": 0.012990047004011756, "avg_cost": 0.00045072010563380253, "cost_std": 1.6346584965156005e-05, "small_model_fraction": 0.2728825978970356}}, "0.15": {"accuracy": 0.7676056338028169, "large_model_usage": 0.5845070422535211, "small_model_usage": 0.4154929577464789, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00037933415492957736, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.5845070422535211, "avg_cost": 0.0003950454225352112}, "random_baseline": {"accuracy": 0.7330985915492958, "accuracy_std": 0.02021507055550327, "avg_cost": 0.0003789624647887321, "cost_std": 1.797095351350298e-05, "small_model_fraction": 0.4043851829752411}, "unrestricted_random_baseline": {"accuracy": 0.7352112676056338, "accuracy_std": 0.008302694452501112, "avg_cost": 0.0003995684859154925, "cost_std": 1.5055135300624324e-05, "small_model_fraction": 0.37744218845696786}}, "0.2": {"accuracy": 0.7147887323943662, "large_model_usage": 0.4119718309859155, "small_model_usage": 0.5880281690140845, "avg_lambda": 0.5, "avg_cost": 0.0002736228873239434, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.4119718309859155, "avg_cost": 0.00028815809859154904}, "random_baseline": {"accuracy": 0.6982394366197183, "accuracy_std": 0.011683563396504974, "avg_cost": 0.0002719235563380279, "cost_std": 2.0809792516426593e-05, "small_model_fraction": 0.5856676978626456}, "unrestricted_random_baseline": {"accuracy": 0.7109154929577464, "accuracy_std": 0.015624442065023824, "avg_cost": 0.0002965021126760561, "cost_std": 9.746905188663428e-06, "small_model_fraction": 0.5607415030236873}}, "0.25": {"accuracy": 0.6690140845070423, "large_model_usage": 0.28169014084507044, "small_model_usage": 0.7183098591549295, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00019435528169014078, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.28169014084507044, "avg_cost": 0.00020743978873239427}, "random_baseline": {"accuracy": 0.6704225352112676, "accuracy_std": 0.012318912453898507, "avg_cost": 0.0001895919014084506, "cost_std": 1.033575028632915e-05, "small_model_fraction": 0.7216024115774292}, "unrestricted_random_baseline": {"accuracy": 0.6764084507042254, "accuracy_std": 0.013495611194885052, "avg_cost": 0.00020754383802816886, "cost_std": 1.5249398452896805e-05, "small_model_fraction": 0.6991640055644522}}}}, {"trial": 8, "trial_seed": 49, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5745614035087719, "large_model_accuracy": 0.8464912280701754, "small_model_cost": 3.2026900584795304e-05, "large_model_cost": 0.0006131929824561406, "hybrid_results": {"0.05": {"accuracy": 0.823943661971831, "large_model_usage": 0.8133802816901409, "small_model_usage": 0.18661971830985913, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005384538732394364, "unrestricted": {"accuracy": 0.823943661971831, "large_model_usage": 0.8133802816901409, "avg_cost": 0.0005384538732394364}, "random_baseline": {"accuracy": 0.8007042253521128, "accuracy_std": 0.010349956659647233, "avg_cost": 0.0005448654929577467, "cost_std": 6.966215331543165e-06, "small_model_fraction": 0.1286019806524934}, "unrestricted_random_baseline": {"accuracy": 0.8024647887323944, "accuracy_std": 0.010960832687516485, "avg_cost": 0.0005368070070422539, "cost_std": 1.4796841492775382e-05, "small_model_fraction": 0.1286019806524934}}, "0.1": {"accuracy": 0.778169014084507, "large_model_usage": 0.6338028169014085, "small_model_usage": 0.3661971830985915, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00041555246478873205, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.6338028169014085, "avg_cost": 0.0004273482394366193}, "random_baseline": {"accuracy": 0.7573943661971831, "accuracy_std": 0.01025971991854468, "avg_cost": 0.00043011623239436626, "cost_std": 1.5723480525935904e-05, "small_model_fraction": 0.3400757955987543}, "unrestricted_random_baseline": {"accuracy": 0.7598591549295775, "accuracy_std": 0.0101808678132401, "avg_cost": 0.0004316689788732395, "cost_std": 2.1034031114455338e-05, "small_model_fraction": 0.31977905940605517}}, "0.15": {"accuracy": 0.704225352112676, "large_model_usage": 0.47183098591549294, "small_model_usage": 0.528169014084507, "avg_lambda": 0.5, "avg_cost": 0.00031417922535211245, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.47183098591549294, "avg_cost": 0.0003268482394366195}, "random_baseline": {"accuracy": 0.701056338028169, "accuracy_std": 0.020254897208830527, "avg_cost": 0.0003111304929577465, "cost_std": 1.3513178064669323e-05, "small_model_fraction": 0.5145065523115333}, "unrestricted_random_baseline": {"accuracy": 0.720774647887324, "accuracy_std": 0.012797733693933973, "avg_cost": 0.0003302054577464788, "cost_std": 1.5890830493785933e-05, "small_model_fraction": 0.4927072517678522}}, "0.2": {"accuracy": 0.6514084507042254, "large_model_usage": 0.30985915492957744, "small_model_usage": 0.6901408450704225, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00021347500000000005, "unrestricted": {"accuracy": 0.6936619718309859, "large_model_usage": 0.30985915492957744, "avg_cost": 0.0002266933098591549}, "random_baseline": {"accuracy": 0.6563380281690142, "accuracy_std": 0.018645355344892547, "avg_cost": 0.00021857802816901403, "cost_std": 1.7054818702030604e-05, "small_model_fraction": 0.6877861508521886}, "unrestricted_random_baseline": {"accuracy": 0.6626760563380281, "accuracy_std": 0.017163461418244964, "avg_cost": 0.0002206117253521126, "cost_std": 8.381279257778922e-06, "small_model_fraction": 0.6650416888619224}}, "0.25": {"accuracy": 0.5985915492957746, "large_model_usage": 0.2535211267605634, "small_model_usage": 0.7464788732394366, "avg_lambda": 0.10999999999999963, "avg_cost": 0.00016967922535211268, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2535211267605634, "avg_cost": 0.0001917707746478874}, "random_baseline": {"accuracy": 0.629225352112676, "accuracy_std": 0.010915492957746508, "avg_cost": 0.00017366524647887327, "cost_std": 1.3974098358811078e-05, "small_model_fraction": 0.763144600035709}, "unrestricted_random_baseline": {"accuracy": 0.6397887323943663, "accuracy_std": 0.019030379037847416, "avg_cost": 0.0001883608450704225, "cost_std": 1.6923928464212228e-05, "small_model_fraction": 0.7251321454467552}}}}, {"trial": 9, "trial_seed": 50, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5526315789473685, "large_model_accuracy": 0.8230994152046783, "small_model_cost": 3.204590643274858e-05, "large_model_cost": 0.0006138538011695924, "hybrid_results": {"0.05": {"accuracy": 0.8063380281690141, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0006454306338028169, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 1.0, "avg_cost": 0.0006454306338028169}, "random_baseline": {"accuracy": 0.8063380281690142, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006133732394366196, "cost_std": 0.0, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8063380281690142, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006133732394366196, "cost_std": 0.0, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.7394366197183099, "large_model_usage": 0.5845070422535211, "small_model_usage": 0.4154929577464789, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0003900433098591546, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.5845070422535211, "avg_cost": 0.0003929869718309856}, "random_baseline": {"accuracy": 0.7126760563380281, "accuracy_std": 0.015038138383142718, "avg_cost": 0.00039800559859154904, "cost_std": 1.4436405359495851e-05, "small_model_fraction": 0.3846810834556809}, "unrestricted_random_baseline": {"accuracy": 0.7098591549295774, "accuracy_std": 0.014449496146960023, "avg_cost": 0.000395096056338028, "cost_std": 1.3162436312726024e-05, "small_model_fraction": 0.3796215749848265}}, "0.15": {"accuracy": 0.7183098591549296, "large_model_usage": 0.46830985915492956, "small_model_usage": 0.5316901408450705, "avg_lambda": 0.6100000000000032, "avg_cost": 0.0003133672535211269, "unrestricted": {"accuracy": 0.7323943661971831, "large_model_usage": 0.46830985915492956, "avg_cost": 0.0003209658450704225}, "random_baseline": {"accuracy": 0.6679577464788732, "accuracy_std": 0.01746777007720063, "avg_cost": 0.0003160462323943661, "cost_std": 1.7682289626809905e-05, "small_model_fraction": 0.5164703854429097}, "unrestricted_random_baseline": {"accuracy": 0.6816901408450704, "accuracy_std": 0.011157027829404816, "avg_cost": 0.0003216263380281689, "cost_std": 1.6372956354360323e-05, "small_model_fraction": 0.5034100752992453}}, "0.2": {"accuracy": 0.676056338028169, "large_model_usage": 0.3767605633802817, "small_model_usage": 0.6232394366197183, "avg_lambda": 0.4800000000000006, "avg_cost": 0.0002508109154929578, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.3767605633802817, "avg_cost": 0.0002651348591549296}, "random_baseline": {"accuracy": 0.6422535211267606, "accuracy_std": 0.01877787552953998, "avg_cost": 0.0002480050352112677, "cost_std": 1.794913827996497e-05, "small_model_fraction": 0.6239909924921897}, "unrestricted_random_baseline": {"accuracy": 0.6612676056338028, "accuracy_std": 0.012177194694218702, "avg_cost": 0.00027564665492957747, "cost_std": 1.0385423174442583e-05, "small_model_fraction": 0.5993712790239656}}, "0.25": {"accuracy": 0.6514084507042254, "large_model_usage": 0.2640845070422535, "small_model_usage": 0.7359154929577465, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0001831841549295775, "unrestricted": {"accuracy": 0.6901408450704225, "large_model_usage": 0.2640845070422535, "avg_cost": 0.00019527570422535218}, "random_baseline": {"accuracy": 0.6154929577464789, "accuracy_std": 0.011442307612163345, "avg_cost": 0.00018206066901408456, "cost_std": 1.0997677669871092e-05, "small_model_fraction": 0.7402265423620801}, "unrestricted_random_baseline": {"accuracy": 0.6137323943661973, "accuracy_std": 0.014859822426767063, "avg_cost": 0.00019344823943661976, "cost_std": 1.9556322152027125e-05, "small_model_fraction": 0.7194438245523745}}}}, {"trial": 10, "trial_seed": 51, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5716374269005848, "large_model_accuracy": 0.8362573099415205, "small_model_cost": 3.198552631578946e-05, "large_model_cost": 0.0006132543859649138, "hybrid_results": {"0.05": {"accuracy": 0.8133802816901409, "large_model_usage": 0.7852112676056338, "small_model_usage": 0.21478873239436624, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005168672535211268, "unrestricted": {"accuracy": 0.8133802816901409, "large_model_usage": 0.7852112676056338, "avg_cost": 0.0005172757042253522}, "random_baseline": {"accuracy": 0.805281690140845, "accuracy_std": 0.012100591744675765, "avg_cost": 0.0005178505985915489, "cost_std": 1.1493800880909418e-05, "small_model_fraction": 0.16582194425823862}, "unrestricted_random_baseline": {"accuracy": 0.7943661971830985, "accuracy_std": 0.011377108747467298, "avg_cost": 0.0005188478873239434, "cost_std": 1.2526826338876906e-05, "small_model_fraction": 0.1651192561691641}}, "0.1": {"accuracy": 0.7605633802816901, "large_model_usage": 0.6232394366197183, "small_model_usage": 0.37676056338028174, "avg_lambda": 0.7299999999999969, "avg_cost": 0.0004023390845070423, "unrestricted": {"accuracy": 0.7852112676056338, "large_model_usage": 0.6232394366197183, "avg_cost": 0.00041790246478873235}, "random_baseline": {"accuracy": 0.7475352112676057, "accuracy_std": 0.01725352112676054, "avg_cost": 0.00040291940140845043, "cost_std": 1.3410925264365155e-05, "small_model_fraction": 0.36285326137234997}, "unrestricted_random_baseline": {"accuracy": 0.7496478873239437, "accuracy_std": 0.013122989544777603, "avg_cost": 0.00041285133802816876, "cost_std": 1.4951978716749241e-05, "small_model_fraction": 0.33607842211623584}}, "0.15": {"accuracy": 0.7112676056338029, "large_model_usage": 0.4154929577464789, "small_model_usage": 0.5845070422535211, "avg_lambda": 0.5, "avg_cost": 0.000278430633802817, "unrestricted": {"accuracy": 0.7323943661971831, "large_model_usage": 0.4154929577464789, "avg_cost": 0.00028919119718309863}, "random_baseline": {"accuracy": 0.7035211267605634, "accuracy_std": 0.011442307612163338, "avg_cost": 0.0002798588028169014, "cost_std": 1.3421727369169252e-05, "small_model_fraction": 0.5760221739114133}, "unrestricted_random_baseline": {"accuracy": 0.6985915492957746, "accuracy_std": 0.01251858368636, "avg_cost": 0.000288926338028169, "cost_std": 1.0822211597283436e-05, "small_model_fraction": 0.5575099773578648}}, "0.2": {"accuracy": 0.6584507042253521, "large_model_usage": 0.2992957746478873, "small_model_usage": 0.7007042253521127, "avg_lambda": 0.3700000000000011, "avg_cost": 0.0002050080985915492, "unrestricted": {"accuracy": 0.6971830985915493, "large_model_usage": 0.2992957746478873, "avg_cost": 0.00021748697183098585}, "random_baseline": {"accuracy": 0.6566901408450704, "accuracy_std": 0.018380821318504468, "avg_cost": 0.0002023198943661971, "cost_std": 2.0930747212876952e-05, "small_model_fraction": 0.7023364155784938}, "unrestricted_random_baseline": {"accuracy": 0.6693661971830985, "accuracy_std": 0.015058735760804976, "avg_cost": 0.00021664214788732388, "cost_std": 1.7684677747125852e-05, "small_model_fraction": 0.68086808292608}}, "0.25": {"accuracy": 0.6126760563380281, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.2400000000000003, "avg_cost": 0.0001725080985915493, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.25704225352112675, "avg_cost": 0.00019050105633802803}, "random_baseline": {"accuracy": 0.6496478873239436, "accuracy_std": 0.014624337786827576, "avg_cost": 0.00017599676056338025, "cost_std": 1.7253255603590643e-05, "small_model_fraction": 0.7582485799074381}, "unrestricted_random_baseline": {"accuracy": 0.6573943661971832, "accuracy_std": 0.017749412902997946, "avg_cost": 0.00019415080985915487, "cost_std": 1.30612466947621e-05, "small_model_fraction": 0.727293958052519}}}}, {"trial": 11, "trial_seed": 52, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5511695906432749, "large_model_accuracy": 0.8362573099415205, "small_model_cost": 3.205467836257314e-05, "large_model_cost": 0.0006146374269005851, "hybrid_results": {"0.05": {"accuracy": 0.8415492957746479, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.000646532746478873, "unrestricted": {"accuracy": 0.8415492957746479, "large_model_usage": 1.0, "avg_cost": 0.000646532746478873}, "random_baseline": {"accuracy": 0.8415492957746478, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006144014084507037, "cost_std": 0.0, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8415492957746478, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006144014084507037, "cost_std": 0.0, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6232394366197183, "small_model_usage": 0.37676056338028174, "avg_lambda": 0.7700000000000036, "avg_cost": 0.00041471584507042235, "unrestricted": {"accuracy": 0.7922535211267606, "large_model_usage": 0.6232394366197183, "avg_cost": 0.00041779330985915484}, "random_baseline": {"accuracy": 0.745774647887324, "accuracy_std": 0.015809819944115226, "avg_cost": 0.00042232714788732376, "cost_std": 1.9562684449702827e-05, "small_model_fraction": 0.34316426693352114}, "unrestricted_random_baseline": {"accuracy": 0.7489436619718309, "accuracy_std": 0.011997693688310421, "avg_cost": 0.0004175191549295772, "cost_std": 1.3606479219862924e-05, "small_model_fraction": 0.33788181599165}}, "0.15": {"accuracy": 0.6971830985915493, "large_model_usage": 0.44366197183098594, "small_model_usage": 0.556338028169014, "avg_lambda": 0.5800000000000017, "avg_cost": 0.0002966383802816901, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.44366197183098594, "avg_cost": 0.00030789894366197183}, "random_baseline": {"accuracy": 0.6795774647887324, "accuracy_std": 0.018764665626020013, "avg_cost": 0.00029898728873239424, "cost_std": 2.265719067895788e-05, "small_model_fraction": 0.5458435688610962}, "unrestricted_random_baseline": {"accuracy": 0.6982394366197182, "accuracy_std": 0.015108054896426025, "avg_cost": 0.000306912042253521, "cost_std": 1.4240908225596592e-05, "small_model_fraction": 0.526514875368987}}, "0.2": {"accuracy": 0.6302816901408451, "large_model_usage": 0.352112676056338, "small_model_usage": 0.647887323943662, "avg_lambda": 0.4899999999999987, "avg_cost": 0.00023849049295774635, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.352112676056338, "avg_cost": 0.0002516735915492957}, "random_baseline": {"accuracy": 0.6612676056338028, "accuracy_std": 0.016945365373966122, "avg_cost": 0.0002422526056338027, "cost_std": 2.5484938846197872e-05, "small_model_fraction": 0.6456540892890792}, "unrestricted_random_baseline": {"accuracy": 0.6647887323943661, "accuracy_std": 0.01237915199383586, "avg_cost": 0.0002513676056338027, "cost_std": 2.0926674294056408e-05, "small_model_fraction": 0.6230253749568538}}, "0.25": {"accuracy": 0.5985915492957746, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00017535669014084496, "unrestricted": {"accuracy": 0.6690140845070423, "large_model_usage": 0.25704225352112675, "avg_cost": 0.0001917228873239435}, "random_baseline": {"accuracy": 0.6225352112676056, "accuracy_std": 0.012379151993835897, "avg_cost": 0.0001694100352112675, "cost_std": 1.2969944026755922e-05, "small_model_fraction": 0.7540229055221985}, "unrestricted_random_baseline": {"accuracy": 0.6383802816901408, "accuracy_std": 0.018367325853161948, "avg_cost": 0.0001904326056338027, "cost_std": 1.2194466847797954e-05, "small_model_fraction": 0.7259304204217224}}}}, {"trial": 12, "trial_seed": 53, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5526315789473685, "large_model_accuracy": 0.8435672514619883, "small_model_cost": 3.209298245614036e-05, "large_model_cost": 0.0006161286549707615, "hybrid_results": {"0.05": {"accuracy": 0.8204225352112676, "large_model_usage": 0.8485915492957746, "small_model_usage": 0.15140845070422537, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005596228873239435, "unrestricted": {"accuracy": 0.8204225352112676, "large_model_usage": 0.8485915492957746, "avg_cost": 0.0005600242957746476}, "random_baseline": {"accuracy": 0.8028169014084507, "accuracy_std": 0.005222674990912431, "avg_cost": 0.0005649189084507041, "cost_std": 8.38066326958537e-06, "small_model_fraction": 0.09675054162963558}, "unrestricted_random_baseline": {"accuracy": 0.7954225352112677, "accuracy_std": 0.011073369854589035, "avg_cost": 0.0005625440845070421, "cost_std": 6.004863830685424e-06, "small_model_fraction": 0.09606324037460116}}, "0.1": {"accuracy": 0.8063380281690141, "large_model_usage": 0.6971830985915493, "small_model_usage": 0.30281690140845074, "avg_lambda": 0.7599999999999976, "avg_cost": 0.00046359471830985903, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.6971830985915493, "avg_cost": 0.0004672214788732393}, "random_baseline": {"accuracy": 0.7514084507042254, "accuracy_std": 0.013472624274442933, "avg_cost": 0.0004620068661971829, "cost_std": 1.4372996712528842e-05, "small_model_fraction": 0.2611722945006991}, "unrestricted_random_baseline": {"accuracy": 0.7588028169014085, "accuracy_std": 0.012121066533844002, "avg_cost": 0.00047542123239436597, "cost_std": 1.96153344912642e-05, "small_model_fraction": 0.25496246737187844}}, "0.15": {"accuracy": 0.7429577464788732, "large_model_usage": 0.5422535211267606, "small_model_usage": 0.4577464788732394, "avg_lambda": 0.6200000000000024, "avg_cost": 0.0003612707746478873, "unrestricted": {"accuracy": 0.7746478873239436, "large_model_usage": 0.5422535211267606, "avg_cost": 0.0003715242957746479}, "random_baseline": {"accuracy": 0.6971830985915494, "accuracy_std": 0.018565389192202925, "avg_cost": 0.0003626220070422533, "cost_std": 1.462729422071118e-05, "small_model_fraction": 0.43637382495072496}, "unrestricted_random_baseline": {"accuracy": 0.7158450704225352, "accuracy_std": 0.016594185447814356, "avg_cost": 0.0003716591197183096, "cost_std": 1.3642252709051875e-05, "small_model_fraction": 0.41881749815545727}}, "0.2": {"accuracy": 0.647887323943662, "large_model_usage": 0.3767605633802817, "small_model_usage": 0.6232394366197183, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002519961267605633, "unrestricted": {"accuracy": 0.7183098591549296, "large_model_usage": 0.3767605633802817, "avg_cost": 0.0002677214788732394}, "random_baseline": {"accuracy": 0.6397887323943663, "accuracy_std": 0.011893905460601787, "avg_cost": 0.0002466182042253519, "cost_std": 1.5261344152740694e-05, "small_model_fraction": 0.6234765192379277}, "unrestricted_random_baseline": {"accuracy": 0.6362676056338029, "accuracy_std": 0.01168356339650498, "avg_cost": 0.0002562916901408448, "cost_std": 1.431841427701583e-05, "small_model_fraction": 0.5965511911240317}}, "0.25": {"accuracy": 0.5598591549295775, "large_model_usage": 0.28169014084507044, "small_model_usage": 0.7183098591549295, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00018605246478873246, "unrestricted": {"accuracy": 0.6584507042253521, "large_model_usage": 0.28169014084507044, "avg_cost": 0.00020861584507042257}, "random_baseline": {"accuracy": 0.6095070422535211, "accuracy_std": 0.014036008047883637, "avg_cost": 0.00018616176056338024, "cost_std": 1.5566128190270844e-05, "small_model_fraction": 0.7363868517316675}, "unrestricted_random_baseline": {"accuracy": 0.6200704225352112, "accuracy_std": 0.0181226890597997, "avg_cost": 0.0002122899295774647, "cost_std": 1.3170561241872198e-05, "small_model_fraction": 0.6977532864486748}}}}, {"trial": 13, "trial_seed": 54, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.564327485380117, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.1960380116959094e-05, "large_model_cost": 0.0006127748538011701, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.7816901408450704, "small_model_usage": 0.21830985915492962, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005140105633802814, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 0.7816901408450704, "avg_cost": 0.0005144753521126756}, "random_baseline": {"accuracy": 0.7922535211267606, "accuracy_std": 0.013361736592260785, "avg_cost": 0.0005206124999999997, "cost_std": 1.68840111585568e-05, "small_model_fraction": 0.1700444718507254}, "unrestricted_random_baseline": {"accuracy": 0.7996478873239437, "accuracy_std": 0.010847128028696746, "avg_cost": 0.0005203936267605631, "cost_std": 9.066287943761795e-06, "small_model_fraction": 0.16924423571086827}}, "0.1": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6091549295774648, "small_model_usage": 0.39084507042253525, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00039239788732394344, "unrestricted": {"accuracy": 0.7887323943661971, "large_model_usage": 0.6091549295774648, "avg_cost": 0.0004097359154929575}, "random_baseline": {"accuracy": 0.7327464788732395, "accuracy_std": 0.012444082427515683, "avg_cost": 0.00038754035211267595, "cost_std": 1.9860579364807285e-05, "small_model_fraction": 0.3794274703234164}, "unrestricted_random_baseline": {"accuracy": 0.7411971830985916, "accuracy_std": 0.010592682363726269, "avg_cost": 0.00041082098591549275, "cost_std": 1.1454089277777443e-05, "small_model_fraction": 0.3495762373487355}}, "0.15": {"accuracy": 0.6901408450704225, "large_model_usage": 0.40492957746478875, "small_model_usage": 0.5950704225352113, "avg_lambda": 0.5, "avg_cost": 0.0002718204225352111, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.40492957746478875, "avg_cost": 0.0002845809859154928}, "random_baseline": {"accuracy": 0.6651408450704226, "accuracy_std": 0.02150195019827045, "avg_cost": 0.0002623192957746478, "cost_std": 1.8818324812803216e-05, "small_model_fraction": 0.587028124666425}, "unrestricted_random_baseline": {"accuracy": 0.677112676056338, "accuracy_std": 0.020291590893273113, "avg_cost": 0.00028418609154929567, "cost_std": 1.914374163175038e-05, "small_model_fraction": 0.5650580051903397}}, "0.2": {"accuracy": 0.6126760563380281, "large_model_usage": 0.2605633802816901, "small_model_usage": 0.7394366197183099, "avg_lambda": 0.3199999999999981, "avg_cost": 0.00018159507042253513, "unrestricted": {"accuracy": 0.6654929577464789, "large_model_usage": 0.2605633802816901, "avg_cost": 0.00019431338028169014}, "random_baseline": {"accuracy": 0.6165492957746478, "accuracy_std": 0.010847128028696756, "avg_cost": 0.00017219355633802806, "cost_std": 1.4874148671500299e-05, "small_model_fraction": 0.7423709341187451}, "unrestricted_random_baseline": {"accuracy": 0.6253521126760563, "accuracy_std": 0.015604591412788997, "avg_cost": 0.00019535443661971818, "cost_std": 1.717305878999779e-05, "small_model_fraction": 0.7204735633826467}}, "0.25": {"accuracy": 0.545774647887324, "large_model_usage": 0.0880281690140845, "small_model_usage": 0.9119718309859155, "avg_lambda": 0.039999999999999765, "avg_cost": 7.92711267605634e-05, "unrestricted": {"accuracy": 0.5704225352112676, "large_model_usage": 0.0880281690140845, "avg_cost": 8.711619718309861e-05}, "random_baseline": {"accuracy": 0.5563380281690142, "accuracy_std": 0.007216162511239144, "avg_cost": 7.749464788732396e-05, "cost_std": 7.2145248059700605e-06, "small_model_fraction": 0.9185441327873534}, "unrestricted_random_baseline": {"accuracy": 0.5661971830985916, "accuracy_std": 0.008879943811914419, "avg_cost": 8.890426056338031e-05, "cost_std": 8.132018171158642e-06, "small_model_fraction": 0.9050371167297603}}}}, {"trial": 14, "trial_seed": 55, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.543859649122807, "large_model_accuracy": 0.8304093567251462, "small_model_cost": 3.202353801169593e-05, "large_model_cost": 0.0006144824561403515, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0006501566901408454, "unrestricted": {"accuracy": 0.8169014084507042, "large_model_usage": 1.0, "avg_cost": 0.0006501566901408454}, "random_baseline": {"accuracy": 0.8169014084507042, "accuracy_std": 0.0, "avg_cost": 0.0006180915492957747, "cost_std": 0.0, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8169014084507042, "accuracy_std": 0.0, "avg_cost": 0.0006180915492957747, "cost_std": 0.0, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.8028169014084507, "large_model_usage": 0.7112676056338029, "small_model_usage": 0.28873239436619713, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004697200704225351, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.7112676056338029, "avg_cost": 0.000474536971830986}, "random_baseline": {"accuracy": 0.7549295774647887, "accuracy_std": 0.01271511977835758, "avg_cost": 0.000466986514084507, "cost_std": 1.1518307988421662e-05, "small_model_fraction": 0.24853664561084243}, "unrestricted_random_baseline": {"accuracy": 0.7538732394366197, "accuracy_std": 0.017707452057295972, "avg_cost": 0.00047301739436619714, "cost_std": 1.6877036823890875e-05, "small_model_fraction": 0.24026670371704026}}, "0.15": {"accuracy": 0.778169014084507, "large_model_usage": 0.6302816901408451, "small_model_usage": 0.3697183098591549, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00040431866197183117, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.6302816901408451, "avg_cost": 0.0004239665492957747}, "random_baseline": {"accuracy": 0.7225352112676057, "accuracy_std": 0.011223505246837502, "avg_cost": 0.00039917454225352113, "cost_std": 1.585237354548653e-05, "small_model_fraction": 0.36082166076835415}, "unrestricted_random_baseline": {"accuracy": 0.7341549295774649, "accuracy_std": 0.016534305467428, "avg_cost": 0.00042653781690140844, "cost_std": 2.211462348011672e-05, "small_model_fraction": 0.3270890030436361}}, "0.2": {"accuracy": 0.7007042253521126, "large_model_usage": 0.4014084507042254, "small_model_usage": 0.5985915492957746, "avg_lambda": 0.5, "avg_cost": 0.0002716426056338029, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.4014084507042254, "avg_cost": 0.0002824947183098593}, "random_baseline": {"accuracy": 0.6676056338028169, "accuracy_std": 0.017407333935645437, "avg_cost": 0.00026745214788732386, "cost_std": 1.526844014682609e-05, "small_model_fraction": 0.5886077795976349}, "unrestricted_random_baseline": {"accuracy": 0.6764084507042254, "accuracy_std": 0.015624442065023775, "avg_cost": 0.00028244154929577464, "cost_std": 1.1327030290650657e-05, "small_model_fraction": 0.569976229219932}}, "0.25": {"accuracy": 0.6338028169014085, "large_model_usage": 0.2640845070422535, "small_model_usage": 0.7359154929577465, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00018132570422535197, "unrestricted": {"accuracy": 0.6866197183098591, "large_model_usage": 0.2640845070422535, "avg_cost": 0.0001955792253521126}, "random_baseline": {"accuracy": 0.6235915492957746, "accuracy_std": 0.017210351281478797, "avg_cost": 0.00018049348591549281, "cost_std": 1.4264447537675507e-05, "small_model_fraction": 0.7436691901064211}, "unrestricted_random_baseline": {"accuracy": 0.6408450704225352, "accuracy_std": 0.010445349981824864, "avg_cost": 0.00020395584507042238, "cost_std": 1.578707748186999e-05, "small_model_fraction": 0.7191979000580948}}}}, {"trial": 15, "trial_seed": 56, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5453216374269005, "large_model_accuracy": 0.8245614035087719, "small_model_cost": 3.199970760233923e-05, "large_model_cost": 0.0006134912280701765, "hybrid_results": {"0.05": {"accuracy": 0.8028169014084507, "large_model_usage": 0.8063380281690141, "small_model_usage": 0.19366197183098588, "avg_lambda": 0.900000000000004, "avg_cost": 0.000526042957746479, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.8063380281690141, "avg_cost": 0.0005265007042253524}, "random_baseline": {"accuracy": 0.7845070422535211, "accuracy_std": 0.013527727262182095, "avg_cost": 0.0005250993309859157, "cost_std": 1.2849445359379893e-05, "small_model_fraction": 0.15038614880117473}, "unrestricted_random_baseline": {"accuracy": 0.7869718309859155, "accuracy_std": 0.01070908896179267, "avg_cost": 0.0005280809507042255, "cost_std": 1.3539344532261196e-05, "small_model_fraction": 0.14959895507132454}}, "0.1": {"accuracy": 0.7711267605633803, "large_model_usage": 0.6161971830985915, "small_model_usage": 0.3838028169014085, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0004087119718309858, "unrestricted": {"accuracy": 0.778169014084507, "large_model_usage": 0.6161971830985915, "avg_cost": 0.00041124718309859136}, "random_baseline": {"accuracy": 0.7218309859154929, "accuracy_std": 0.015267241823013263, "avg_cost": 0.0004085908450704225, "cost_std": 1.9662007848725e-05, "small_model_fraction": 0.3521620677708871}, "unrestricted_random_baseline": {"accuracy": 0.7345070422535211, "accuracy_std": 0.011267605633802802, "avg_cost": 0.0004116589436619719, "cost_std": 1.1448090002676545e-05, "small_model_fraction": 0.34780222557479484}}, "0.15": {"accuracy": 0.7288732394366197, "large_model_usage": 0.4612676056338028, "small_model_usage": 0.5387323943661972, "avg_lambda": 0.5500000000000005, "avg_cost": 0.00030758521126760555, "unrestricted": {"accuracy": 0.7464788732394366, "large_model_usage": 0.4612676056338028, "avg_cost": 0.00031662746478873234}, "random_baseline": {"accuracy": 0.6852112676056338, "accuracy_std": 0.012318912453898533, "avg_cost": 0.00030477908450704213, "cost_std": 1.3106558314977896e-05, "small_model_fraction": 0.5260713287039082}, "unrestricted_random_baseline": {"accuracy": 0.679225352112676, "accuracy_std": 0.017777331776569465, "avg_cost": 0.00031304246478873227, "cost_std": 2.040385578005579e-05, "small_model_fraction": 0.5105212248711783}}, "0.2": {"accuracy": 0.6901408450704225, "large_model_usage": 0.3732394366197183, "small_model_usage": 0.6267605633802817, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00024752887323943655, "unrestricted": {"accuracy": 0.7323943661971831, "large_model_usage": 0.3732394366197183, "avg_cost": 0.00026254999999999985}, "random_baseline": {"accuracy": 0.648943661971831, "accuracy_std": 0.01753860479635531, "avg_cost": 0.0002385772887323942, "cost_std": 2.3740421411886784e-05, "small_model_fraction": 0.6293511460602316}, "unrestricted_random_baseline": {"accuracy": 0.6528169014084507, "accuracy_std": 0.014704657054803573, "avg_cost": 0.0002604114084507041, "cost_std": 1.859347996950842e-05, "small_model_fraction": 0.6035190810483839}}, "0.25": {"accuracy": 0.6091549295774648, "large_model_usage": 0.29577464788732394, "small_model_usage": 0.704225352112676, "avg_lambda": 0.29000000000000087, "avg_cost": 0.00019237394366197167, "unrestricted": {"accuracy": 0.6971830985915493, "large_model_usage": 0.29577464788732394, "avg_cost": 0.00021469788732394362}, "random_baseline": {"accuracy": 0.6334507042253522, "accuracy_std": 0.0138582173620806, "avg_cost": 0.00019581869718309858, "cost_std": 1.654438855657772e-05, "small_model_fraction": 0.7242019351707764}, "unrestricted_random_baseline": {"accuracy": 0.6426056338028169, "accuracy_std": 0.012424139974913628, "avg_cost": 0.00021293320422535207, "cost_std": 1.084614887637877e-05, "small_model_fraction": 0.6858111024996287}}}}, {"trial": 16, "trial_seed": 57, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5804093567251462, "large_model_accuracy": 0.8538011695906432, "small_model_cost": 3.2017397660818725e-05, "large_model_cost": 0.00061403216374269, "hybrid_results": {"0.05": {"accuracy": 0.8098591549295775, "large_model_usage": 0.7992957746478874, "small_model_usage": 0.20070422535211263, "avg_lambda": 0.879999999999997, "avg_cost": 0.00051776338028169, "unrestricted": {"accuracy": 0.823943661971831, "large_model_usage": 0.7992957746478874, "avg_cost": 0.0005261295774647888}, "random_baseline": {"accuracy": 0.8024647887323944, "accuracy_std": 0.0059859154929577385, "avg_cost": 0.0005174229577464789, "cost_std": 8.888629843677661e-06, "small_model_fraction": 0.16540608429762416}, "unrestricted_random_baseline": {"accuracy": 0.8119718309859154, "accuracy_std": 0.012715119778357606, "avg_cost": 0.000526841056338028, "cost_std": 1.5436914101618806e-05, "small_model_fraction": 0.15103153974883177}}, "0.1": {"accuracy": 0.7605633802816901, "large_model_usage": 0.6232394366197183, "small_model_usage": 0.37676056338028174, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004030169014084505, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.6232394366197183, "avg_cost": 0.00041861549295774635}, "random_baseline": {"accuracy": 0.7640845070422535, "accuracy_std": 0.008182359182128335, "avg_cost": 0.0003964515492957745, "cost_std": 1.4734135298262424e-05, "small_model_fraction": 0.3625599806595908}, "unrestricted_random_baseline": {"accuracy": 0.7725352112676056, "accuracy_std": 0.017689938995887992, "avg_cost": 0.0004211449295774645, "cost_std": 1.6708944433225154e-05, "small_model_fraction": 0.33575895694277735}}, "0.15": {"accuracy": 0.7323943661971831, "large_model_usage": 0.45774647887323944, "small_model_usage": 0.5422535211267605, "avg_lambda": 0.5900000000000027, "avg_cost": 0.0003065380281690138, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.45774647887323944, "avg_cost": 0.00031730563380281664}, "random_baseline": {"accuracy": 0.7355633802816902, "accuracy_std": 0.012141506795469468, "avg_cost": 0.0003084077112676055, "cost_std": 1.2633937612579183e-05, "small_model_fraction": 0.5283270347997002}, "unrestricted_random_baseline": {"accuracy": 0.7320422535211268, "accuracy_std": 0.009375723208235444, "avg_cost": 0.00031018489436619707, "cost_std": 1.6960109522878143e-05, "small_model_fraction": 0.5098264635748662}}, "0.2": {"accuracy": 0.6866197183098591, "large_model_usage": 0.27816901408450706, "small_model_usage": 0.721830985915493, "avg_lambda": 0.3900000000000005, "avg_cost": 0.0001937774647887324, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.27816901408450706, "avg_cost": 0.00020578450704225354}, "random_baseline": {"accuracy": 0.694718309859155, "accuracy_std": 0.011360750566832176, "avg_cost": 0.00019102267605633806, "cost_std": 1.5808853872804055e-05, "small_model_fraction": 0.7220687917990743}, "unrestricted_random_baseline": {"accuracy": 0.6926056338028168, "accuracy_std": 0.014000630529627502, "avg_cost": 0.00020599422535211263, "cost_std": 1.5280561857322535e-05, "small_model_fraction": 0.7014386584188635}}, "0.25": {"accuracy": 0.6619718309859155, "large_model_usage": 0.2323943661971831, "small_model_usage": 0.7676056338028169, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00015893239436619725, "unrestricted": {"accuracy": 0.721830985915493, "large_model_usage": 0.2323943661971831, "avg_cost": 0.00017783380281690146}, "random_baseline": {"accuracy": 0.6778169014084507, "accuracy_std": 0.018715045444955343, "avg_cost": 0.0001618465140845071, "cost_std": 1.4043126783251494e-05, "small_model_fraction": 0.7819385278490932}, "unrestricted_random_baseline": {"accuracy": 0.6830985915492958, "accuracy_std": 0.01178394403569119, "avg_cost": 0.0001713370070422536, "cost_std": 1.3917744911592968e-05, "small_model_fraction": 0.7494627049796002}}}}, {"trial": 17, "trial_seed": 58, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5497076023391813, "large_model_accuracy": 0.8187134502923976, "small_model_cost": 3.1980116959064336e-05, "large_model_cost": 0.0006142923976608198, "hybrid_results": {"0.05": {"accuracy": 0.7992957746478874, "large_model_usage": 0.7464788732394366, "small_model_usage": 0.2535211267605634, "avg_lambda": 0.8300000000000048, "avg_cost": 0.0004914271126760561, "unrestricted": {"accuracy": 0.8098591549295775, "large_model_usage": 0.7464788732394366, "avg_cost": 0.0004961947183098588}, "random_baseline": {"accuracy": 0.7852112676056339, "accuracy_std": 0.01590364759244538, "avg_cost": 0.0004915065492957745, "cost_std": 1.5935351010505356e-05, "small_model_fraction": 0.21099552432020907}, "unrestricted_random_baseline": {"accuracy": 0.7887323943661972, "accuracy_std": 0.008479996182248135, "avg_cost": 0.0004962438732394367, "cost_std": 1.4620839088698296e-05, "small_model_fraction": 0.20280815511676867}}, "0.1": {"accuracy": 0.778169014084507, "large_model_usage": 0.6161971830985915, "small_model_usage": 0.3838028169014085, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0003979623239436615, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.6161971830985915, "avg_cost": 0.00041608204225352063}, "random_baseline": {"accuracy": 0.7390845070422536, "accuracy_std": 0.008970238875251394, "avg_cost": 0.00040260225352112673, "cost_std": 1.662597798028233e-05, "small_model_fraction": 0.3715018227959314}, "unrestricted_random_baseline": {"accuracy": 0.745774647887324, "accuracy_std": 0.011442307612163308, "avg_cost": 0.0004106684859154929, "cost_std": 8.513551389502729e-06, "small_model_fraction": 0.3403849823816735}}, "0.15": {"accuracy": 0.721830985915493, "large_model_usage": 0.4119718309859155, "small_model_usage": 0.5880281690140845, "avg_lambda": 0.5, "avg_cost": 0.00027679330985915483, "unrestricted": {"accuracy": 0.75, "large_model_usage": 0.4119718309859155, "avg_cost": 0.0002883003521126759}, "random_baseline": {"accuracy": 0.6665492957746479, "accuracy_std": 0.023884039270579763, "avg_cost": 0.00026861278169014083, "cost_std": 1.5801433124347453e-05, "small_model_fraction": 0.5795843553135072}, "unrestricted_random_baseline": {"accuracy": 0.6827464788732394, "accuracy_std": 0.019056421306668736, "avg_cost": 0.00028799609154929565, "cost_std": 1.5850779257533845e-05, "small_model_fraction": 0.5598234080780243}}, "0.2": {"accuracy": 0.6725352112676056, "large_model_usage": 0.2852112676056338, "small_model_usage": 0.7147887323943662, "avg_lambda": 0.3400000000000022, "avg_cost": 0.00019563133802816908, "unrestricted": {"accuracy": 0.7077464788732394, "large_model_usage": 0.2852112676056338, "avg_cost": 0.00020977218309859168}, "random_baseline": {"accuracy": 0.6440140845070422, "accuracy_std": 0.018259003045724174, "avg_cost": 0.00019613316901408452, "cost_std": 1.0850792026696748e-05, "small_model_fraction": 0.7189631294193459}, "unrestricted_random_baseline": {"accuracy": 0.6528169014084507, "accuracy_std": 0.016827891754011996, "avg_cost": 0.0002154498591549296, "cost_std": 1.7558377427550534e-05, "small_model_fraction": 0.6946791746770877}}, "0.25": {"accuracy": 0.5809859154929577, "large_model_usage": 0.1619718309859155, "small_model_usage": 0.8380281690140845, "avg_lambda": 0.10000000000000046, "avg_cost": 0.00011997640845070423, "unrestricted": {"accuracy": 0.6408450704225352, "large_model_usage": 0.1619718309859155, "avg_cost": 0.00013373697183098596}, "random_baseline": {"accuracy": 0.6017605633802817, "accuracy_std": 0.008830940988721434, "avg_cost": 0.00012028728873239437, "cost_std": 9.68804443402352e-06, "small_model_fraction": 0.8488847060110188}, "unrestricted_random_baseline": {"accuracy": 0.6091549295774648, "accuracy_std": 0.010082972579772085, "avg_cost": 0.00013002151408450705, "cost_std": 1.1270441932248936e-05, "small_model_fraction": 0.8252538058285623}}}}, {"trial": 18, "trial_seed": 59, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5701754385964912, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.198230994152046e-05, "large_model_cost": 0.0006123947368421063, "hybrid_results": {"0.05": {"accuracy": 0.8380281690140845, "large_model_usage": 0.7676056338028169, "small_model_usage": 0.23239436619718312, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005007003521126758, "unrestricted": {"accuracy": 0.8380281690140845, "large_model_usage": 0.7676056338028169, "avg_cost": 0.0005007003521126758}, "random_baseline": {"accuracy": 0.8225352112676056, "accuracy_std": 0.009984117520251976, "avg_cost": 0.0005036044718309859, "cost_std": 1.5068167979187462e-05, "small_model_fraction": 0.19243968521811422}, "unrestricted_random_baseline": {"accuracy": 0.8172535211267606, "accuracy_std": 0.008250263742154906, "avg_cost": 0.0004991881690140843, "cost_std": 1.4560979846168986e-05, "small_model_fraction": 0.19243968521811422}}, "0.1": {"accuracy": 0.795774647887324, "large_model_usage": 0.5669014084507042, "small_model_usage": 0.43309859154929575, "avg_lambda": 0.6999999999999973, "avg_cost": 0.000362925704225352, "unrestricted": {"accuracy": 0.8274647887323944, "large_model_usage": 0.5669014084507042, "avg_cost": 0.00037762992957746463}, "random_baseline": {"accuracy": 0.7524647887323944, "accuracy_std": 0.020291590893273086, "avg_cost": 0.0003593271126760563, "cost_std": 2.038514056943218e-05, "small_model_fraction": 0.42981338967693017}, "unrestricted_random_baseline": {"accuracy": 0.7573943661971831, "accuracy_std": 0.012038958448787843, "avg_cost": 0.0003723845070422534, "cost_std": 8.546066060770675e-06, "small_model_fraction": 0.4044792915932046}}, "0.15": {"accuracy": 0.7394366197183099, "large_model_usage": 0.36971830985915494, "small_model_usage": 0.630281690140845, "avg_lambda": 0.5, "avg_cost": 0.00024985528169014094, "unrestricted": {"accuracy": 0.7746478873239436, "large_model_usage": 0.36971830985915494, "avg_cost": 0.0002575947183098592}, "random_baseline": {"accuracy": 0.6915492957746477, "accuracy_std": 0.017120064480483404, "avg_cost": 0.0002500614436619717, "cost_std": 1.4648838128717764e-05, "small_model_fraction": 0.624623868044889}, "unrestricted_random_baseline": {"accuracy": 0.6975352112676055, "accuracy_std": 0.011295080938271447, "avg_cost": 0.00024542799295774635, "cost_std": 1.3636928689182355e-05, "small_model_fraction": 0.6112894936224683}}, "0.2": {"accuracy": 0.6936619718309859, "large_model_usage": 0.28169014084507044, "small_model_usage": 0.7183098591549295, "avg_lambda": 0.3700000000000011, "avg_cost": 0.00019393978873239438, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.28169014084507044, "avg_cost": 0.0002051862676056338}, "random_baseline": {"accuracy": 0.654225352112676, "accuracy_std": 0.02009203185589732, "avg_cost": 0.00018496845070422528, "cost_std": 1.7238840597004824e-05, "small_model_fraction": 0.7209613866199761}, "unrestricted_random_baseline": {"accuracy": 0.676056338028169, "accuracy_std": 0.021126760563380278, "avg_cost": 0.00020790327464788723, "cost_std": 1.783342285614196e-05, "small_model_fraction": 0.7015846842063219}}, "0.25": {"accuracy": 0.6197183098591549, "large_model_usage": 0.23943661971830985, "small_model_usage": 0.7605633802816901, "avg_lambda": 0.10999999999999963, "avg_cost": 0.000156531338028169, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.23943661971830985, "avg_cost": 0.00017887640845070423}, "random_baseline": {"accuracy": 0.644718309859155, "accuracy_std": 0.01118477477061167, "avg_cost": 0.00015756950704225346, "cost_std": 9.670494346237704e-06, "small_model_fraction": 0.7854128851931326}, "unrestricted_random_baseline": {"accuracy": 0.6492957746478873, "accuracy_std": 0.011045343057294463, "avg_cost": 0.00017194774647887317, "cost_std": 1.1014150270417119e-05, "small_model_fraction": 0.7469142773293099}}}}, {"trial": 19, "trial_seed": 60, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5526315789473685, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.2065058479532224e-05, "large_model_cost": 0.0006145643274853809, "hybrid_results": {"0.05": {"accuracy": 0.7746478873239436, "large_model_usage": 0.778169014084507, "small_model_usage": 0.221830985915493, "avg_lambda": 0.8600000000000062, "avg_cost": 0.0005025095070422535, "unrestricted": {"accuracy": 0.7852112676056338, "large_model_usage": 0.778169014084507, "avg_cost": 0.0005131714788732398}, "random_baseline": {"accuracy": 0.7577464788732394, "accuracy_std": 0.01914405242122277, "avg_cost": 0.0005004165140845068, "cost_std": 1.6187441796238684e-05, "small_model_fraction": 0.19236903186912374}, "unrestricted_random_baseline": {"accuracy": 0.7619718309859154, "accuracy_std": 0.010818515137843095, "avg_cost": 0.0005098509859154928, "cost_std": 1.655567446130417e-05, "small_model_fraction": 0.17406519459704778}}, "0.1": {"accuracy": 0.7464788732394366, "large_model_usage": 0.5985915492957746, "small_model_usage": 0.4014084507042254, "avg_lambda": 0.7100000000000012, "avg_cost": 0.00038769260563380263, "unrestricted": {"accuracy": 0.7640845070422535, "large_model_usage": 0.5985915492957746, "avg_cost": 0.00040320669014084495}, "random_baseline": {"accuracy": 0.7028169014084507, "accuracy_std": 0.012115951080341747, "avg_cost": 0.0003886156338028167, "cost_std": 1.6714838143958635e-05, "small_model_fraction": 0.3894798395863744}, "unrestricted_random_baseline": {"accuracy": 0.7095070422535212, "accuracy_std": 0.01824541747610089, "avg_cost": 0.0003962413028169012, "cost_std": 1.0794645509625841e-05, "small_model_fraction": 0.36284618469180224}}, "0.15": {"accuracy": 0.6901408450704225, "large_model_usage": 0.4119718309859155, "small_model_usage": 0.5880281690140845, "avg_lambda": 0.5300000000000009, "avg_cost": 0.00027752359154929583, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.4119718309859155, "avg_cost": 0.0002878334507042254}, "random_baseline": {"accuracy": 0.6524647887323944, "accuracy_std": 0.018163690671167503, "avg_cost": 0.00027632214788732393, "cost_std": 1.520683457137826e-05, "small_model_fraction": 0.5786114315839613}, "unrestricted_random_baseline": {"accuracy": 0.6517605633802817, "accuracy_std": 0.010732218770512782, "avg_cost": 0.000289556514084507, "cost_std": 2.173635094653623e-05, "small_model_fraction": 0.5609120803512544}}, "0.2": {"accuracy": 0.6584507042253521, "large_model_usage": 0.3345070422535211, "small_model_usage": 0.6654929577464789, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00022824190140845063, "unrestricted": {"accuracy": 0.6971830985915493, "large_model_usage": 0.3345070422535211, "avg_cost": 0.00023981936619718307}, "random_baseline": {"accuracy": 0.6158450704225352, "accuracy_std": 0.012641776810574839, "avg_cost": 0.0002198406690140845, "cost_std": 1.6696189350354824e-05, "small_model_fraction": 0.6632152976539639}, "unrestricted_random_baseline": {"accuracy": 0.6313380281690141, "accuracy_std": 0.022493910216456973, "avg_cost": 0.0002503084507042253, "cost_std": 2.1333762380394e-05, "small_model_fraction": 0.6433397966795305}}, "0.25": {"accuracy": 0.5563380281690141, "large_model_usage": 0.2640845070422535, "small_model_usage": 0.7359154929577465, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00017464330985915495, "unrestricted": {"accuracy": 0.6619718309859155, "large_model_usage": 0.2640845070422535, "avg_cost": 0.00019633345070422536}, "random_baseline": {"accuracy": 0.5919014084507043, "accuracy_std": 0.011512523045261826, "avg_cost": 0.00016497767605633804, "cost_std": 1.8383046865282722e-05, "small_model_fraction": 0.7552301625666227}, "unrestricted_random_baseline": {"accuracy": 0.6049295774647886, "accuracy_std": 0.01133343446438807, "avg_cost": 0.00019442306338028167, "cost_std": 1.2574170176796639e-05, "small_model_fraction": 0.7179938225415288}}}}, {"trial": 20, "trial_seed": 61, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5482456140350878, "large_model_accuracy": 0.8318713450292398, "small_model_cost": 3.20786549707603e-05, "large_model_cost": 0.0006157017543859657, "hybrid_results": {"0.05": {"accuracy": 0.8028169014084507, "large_model_usage": 0.7852112676056338, "small_model_usage": 0.21478873239436624, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005198658450704222, "unrestricted": {"accuracy": 0.8028169014084507, "large_model_usage": 0.7852112676056338, "avg_cost": 0.0005198658450704222}, "random_baseline": {"accuracy": 0.778169014084507, "accuracy_std": 0.0109098122428378, "avg_cost": 0.0005218386971830985, "cost_std": 1.569244596474136e-05, "small_model_fraction": 0.1642085609900838}, "unrestricted_random_baseline": {"accuracy": 0.7753521126760562, "accuracy_std": 0.016043360211119045, "avg_cost": 0.000525872147887324, "cost_std": 1.4963727227016298e-05, "small_model_fraction": 0.1642085609900838}}, "0.1": {"accuracy": 0.7359154929577465, "large_model_usage": 0.5809859154929577, "small_model_usage": 0.41901408450704225, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0003775771126760563, "unrestricted": {"accuracy": 0.7676056338028169, "large_model_usage": 0.5809859154929577, "avg_cost": 0.0003936264084507041}, "random_baseline": {"accuracy": 0.698943661971831, "accuracy_std": 0.01584507042253523, "avg_cost": 0.00037961274647887327, "cost_std": 1.5294661077600725e-05, "small_model_fraction": 0.408010995364152}, "unrestricted_random_baseline": {"accuracy": 0.7179577464788732, "accuracy_std": 0.015464923419716756, "avg_cost": 0.00039056320422535215, "cost_std": 1.6335578373316537e-05, "small_model_fraction": 0.38051157700540417}}, "0.15": {"accuracy": 0.6901408450704225, "large_model_usage": 0.44366197183098594, "small_model_usage": 0.556338028169014, "avg_lambda": 0.5999999999999969, "avg_cost": 0.00029795035211267604, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.44366197183098594, "avg_cost": 0.00030759119718309864}, "random_baseline": {"accuracy": 0.6609154929577464, "accuracy_std": 0.015592668891186268, "avg_cost": 0.00030327042253521137, "cost_std": 1.6003750651447484e-05, "small_model_fraction": 0.5444462403761586}, "unrestricted_random_baseline": {"accuracy": 0.6633802816901408, "accuracy_std": 0.020663944719889, "avg_cost": 0.0003030636267605635, "cost_std": 1.793718439339786e-05, "small_model_fraction": 0.5279272830557873}}, "0.2": {"accuracy": 0.6338028169014085, "large_model_usage": 0.30985915492957744, "small_model_usage": 0.6901408450704225, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00021324612676056333, "unrestricted": {"accuracy": 0.6725352112676056, "large_model_usage": 0.30985915492957744, "avg_cost": 0.00022488697183098593}, "random_baseline": {"accuracy": 0.6176056338028169, "accuracy_std": 0.014276855586417829, "avg_cost": 0.00021271133802816896, "cost_std": 2.118372277277499e-05, "small_model_fraction": 0.6895813891339562}, "unrestricted_random_baseline": {"accuracy": 0.6126760563380282, "accuracy_std": 0.013996202052360423, "avg_cost": 0.00021478651408450697, "cost_std": 1.3554683495201171e-05, "small_model_fraction": 0.6696355626543553}}, "0.25": {"accuracy": 0.5352112676056338, "large_model_usage": 0.19718309859154928, "small_model_usage": 0.8028169014084507, "avg_lambda": 0.15999999999999906, "avg_cost": 0.0001378024647887324, "unrestricted": {"accuracy": 0.6126760563380281, "large_model_usage": 0.19718309859154928, "avg_cost": 0.00015567570422535203}, "random_baseline": {"accuracy": 0.5732394366197184, "accuracy_std": 0.013061434500698157, "avg_cost": 0.00014123429577464784, "cost_std": 1.830606232924984e-05, "small_model_fraction": 0.8188491683692641}, "unrestricted_random_baseline": {"accuracy": 0.5887323943661971, "accuracy_std": 0.015332071166955782, "avg_cost": 0.00016112911971830985, "cost_std": 1.3777475064725941e-05, "small_model_fraction": 0.7882245418688244}}}}, {"trial": 21, "trial_seed": 62, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5570175438596491, "large_model_accuracy": 0.8377192982456141, "small_model_cost": 3.1931871345029295e-05, "large_model_cost": 0.0006127192982456147, "hybrid_results": {"0.05": {"accuracy": 0.8380281690140845, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0006451672535211265, "unrestricted": {"accuracy": 0.8380281690140845, "large_model_usage": 1.0, "avg_cost": 0.0006451672535211265}, "random_baseline": {"accuracy": 0.8380281690140844, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006132323943661974, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8380281690140844, "accuracy_std": 1.1102230246251565e-16, "avg_cost": 0.0006132323943661974, "cost_std": 1.0842021724855044e-19, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.795774647887324, "large_model_usage": 0.7112676056338029, "small_model_usage": 0.28873239436619713, "avg_lambda": 0.8000000000000037, "avg_cost": 0.0004690968309859149, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.7112676056338029, "avg_cost": 0.0004732658450704218}, "random_baseline": {"accuracy": 0.7704225352112676, "accuracy_std": 0.01077257643716784, "avg_cost": 0.000480754084507042, "cost_std": 1.3873758252614022e-05, "small_model_fraction": 0.24728921565356812}, "unrestricted_random_baseline": {"accuracy": 0.7640845070422535, "accuracy_std": 0.012985273883511093, "avg_cost": 0.0004685005281690139, "cost_std": 9.272090568767288e-06, "small_model_fraction": 0.24011100570719388}}, "0.15": {"accuracy": 0.7112676056338029, "large_model_usage": 0.4753521126760563, "small_model_usage": 0.5246478873239437, "avg_lambda": 0.5900000000000027, "avg_cost": 0.0003177728873239436, "unrestricted": {"accuracy": 0.7394366197183099, "large_model_usage": 0.4753521126760563, "avg_cost": 0.00032810387323943643}, "random_baseline": {"accuracy": 0.6859154929577465, "accuracy_std": 0.012074949436255087, "avg_cost": 0.0003169811267605633, "cost_std": 1.4727487765527219e-05, "small_model_fraction": 0.5078388361395394}, "unrestricted_random_baseline": {"accuracy": 0.7, "accuracy_std": 0.016043360211119063, "avg_cost": 0.0003332114084507041, "cost_std": 1.629009646194693e-05, "small_model_fraction": 0.49005094088391227}}, "0.2": {"accuracy": 0.6584507042253521, "large_model_usage": 0.34507042253521125, "small_model_usage": 0.6549295774647887, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00023214612676056352, "unrestricted": {"accuracy": 0.6971830985915493, "large_model_usage": 0.34507042253521125, "avg_cost": 0.00024645598591549293}, "random_baseline": {"accuracy": 0.6471830985915494, "accuracy_std": 0.014154754395944899, "avg_cost": 0.00023646788732394368, "cost_std": 1.5003926946460277e-05, "small_model_fraction": 0.6552710231969169}, "unrestricted_random_baseline": {"accuracy": 0.6566901408450704, "accuracy_std": 0.017902427913924147, "avg_cost": 0.0002489941549295774, "cost_std": 1.5319234297976267e-05, "small_model_fraction": 0.6306323025701723}}, "0.25": {"accuracy": 0.5985915492957746, "large_model_usage": 0.2852112676056338, "small_model_usage": 0.7147887323943662, "avg_lambda": 0.25, "avg_cost": 0.0001856109154929579, "unrestricted": {"accuracy": 0.6795774647887324, "large_model_usage": 0.2852112676056338, "avg_cost": 0.00020842077464788754}, "random_baseline": {"accuracy": 0.6246478873239436, "accuracy_std": 0.011485567908662063, "avg_cost": 0.00018273214788732398, "cost_std": 1.42954533579816e-05, "small_model_fraction": 0.7353953666523946}, "unrestricted_random_baseline": {"accuracy": 0.6316901408450705, "accuracy_std": 0.011804968038197361, "avg_cost": 0.0002051116901408451, "cost_std": 1.3839209537968039e-05, "small_model_fraction": 0.6961213429762001}}}}, {"trial": 22, "trial_seed": 63, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.554093567251462, "large_model_accuracy": 0.8157894736842105, "small_model_cost": 3.205672514619885e-05, "large_model_cost": 0.000614912280701756, "hybrid_results": {"0.05": {"accuracy": 0.7711267605633803, "large_model_usage": 0.7570422535211268, "small_model_usage": 0.24295774647887325, "avg_lambda": 0.8300000000000048, "avg_cost": 0.0004972105633802814, "unrestricted": {"accuracy": 0.7816901408450704, "large_model_usage": 0.7570422535211268, "avg_cost": 0.0005026964788732394}, "random_baseline": {"accuracy": 0.7619718309859155, "accuracy_std": 0.016074242550018752, "avg_cost": 0.0005001486267605632, "cost_std": 1.3752069510676227e-05, "small_model_fraction": 0.2019397708395959}, "unrestricted_random_baseline": {"accuracy": 0.7665492957746479, "accuracy_std": 0.009188724190633712, "avg_cost": 0.0005054066549295774, "cost_std": 1.0632746047958653e-05, "small_model_fraction": 0.19252763529302983}}, "0.1": {"accuracy": 0.7394366197183099, "large_model_usage": 0.6091549295774648, "small_model_usage": 0.39084507042253525, "avg_lambda": 0.7100000000000012, "avg_cost": 0.0003944147887323942, "unrestricted": {"accuracy": 0.7676056338028169, "large_model_usage": 0.6091549295774648, "avg_cost": 0.00040997816901408433}, "random_baseline": {"accuracy": 0.7204225352112675, "accuracy_std": 0.013193657743095224, "avg_cost": 0.00040589369718309836, "cost_std": 1.7145322354548774e-05, "small_model_fraction": 0.37830555077954336}, "unrestricted_random_baseline": {"accuracy": 0.7257042253521128, "accuracy_std": 0.010138154964005597, "avg_cost": 0.0004170373239436618, "cost_std": 1.220733509932192e-05, "small_model_fraction": 0.35160360012754066}}, "0.15": {"accuracy": 0.676056338028169, "large_model_usage": 0.4225352112676056, "small_model_usage": 0.5774647887323944, "avg_lambda": 0.5100000000000009, "avg_cost": 0.00028423169014084505, "unrestricted": {"accuracy": 0.7007042253521126, "large_model_usage": 0.4225352112676056, "avg_cost": 0.0002947176056338027}, "random_baseline": {"accuracy": 0.6566901408450705, "accuracy_std": 0.00894255288676076, "avg_cost": 0.00027458204225352104, "cost_std": 2.2775873678436778e-05, "small_model_fraction": 0.5673456955312333}, "unrestricted_random_baseline": {"accuracy": 0.6644366197183098, "accuracy_std": 0.02047407290155861, "avg_cost": 0.00029443267605633794, "cost_std": 1.8376240207017378e-05, "small_model_fraction": 0.549355105250314}}, "0.2": {"accuracy": 0.6514084507042254, "large_model_usage": 0.34507042253521125, "small_model_usage": 0.6549295774647887, "avg_lambda": 0.40000000000000185, "avg_cost": 0.00023098521126760574, "unrestricted": {"accuracy": 0.6866197183098591, "large_model_usage": 0.34507042253521125, "avg_cost": 0.0002455274647887325}, "random_baseline": {"accuracy": 0.6355633802816902, "accuracy_std": 0.019043404623929533, "avg_cost": 0.0002293516549295774, "cost_std": 1.4532815895444086e-05, "small_model_fraction": 0.6587001972867955}, "unrestricted_random_baseline": {"accuracy": 0.6355633802816902, "accuracy_std": 0.013477224795073699, "avg_cost": 0.0002428576408450704, "cost_std": 1.4837874219648636e-05, "small_model_fraction": 0.6337501845734987}}, "0.25": {"accuracy": 0.5633802816901409, "large_model_usage": 0.25, "small_model_usage": 0.75, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00016710492957746492, "unrestricted": {"accuracy": 0.6443661971830986, "large_model_usage": 0.25, "avg_cost": 0.00018685140845070425}, "random_baseline": {"accuracy": 0.6014084507042254, "accuracy_std": 0.015412724386084004, "avg_cost": 0.00016365542253521124, "cost_std": 1.0608594491397986e-05, "small_model_fraction": 0.7682990182661245}, "unrestricted_random_baseline": {"accuracy": 0.6165492957746479, "accuracy_std": 0.015140845070422558, "avg_cost": 0.00018722031690140842, "cost_std": 1.3619363619128842e-05, "small_model_fraction": 0.7344201632307328}}}}, {"trial": 23, "trial_seed": 64, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5511695906432749, "large_model_accuracy": 0.8230994152046783, "small_model_cost": 3.2060087719298305e-05, "large_model_cost": 0.0006151900584795328, "hybrid_results": {"0.05": {"accuracy": 0.7711267605633803, "large_model_usage": 0.795774647887324, "small_model_usage": 0.204225352112676, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005265359154929576, "unrestricted": {"accuracy": 0.7711267605633803, "large_model_usage": 0.795774647887324, "avg_cost": 0.0005265359154929576}, "random_baseline": {"accuracy": 0.7464788732394367, "accuracy_std": 0.008907824394840494, "avg_cost": 0.0005337093661971833, "cost_std": 1.4141661110263868e-05, "small_model_fraction": 0.15203153230315983}, "unrestricted_random_baseline": {"accuracy": 0.7383802816901409, "accuracy_std": 0.014088907763198965, "avg_cost": 0.0005312251056338029, "cost_std": 8.110459488568071e-06, "small_model_fraction": 0.15203153230315983}}, "0.1": {"accuracy": 0.704225352112676, "large_model_usage": 0.5985915492957746, "small_model_usage": 0.4014084507042254, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0003920992957746477, "unrestricted": {"accuracy": 0.7288732394366197, "large_model_usage": 0.5985915492957746, "avg_cost": 0.0004046485915492956}, "random_baseline": {"accuracy": 0.6732394366197183, "accuracy_std": 0.021464437541025564, "avg_cost": 0.0003912922535211265, "cost_std": 1.5121449441527021e-05, "small_model_fraction": 0.382574681273951}, "unrestricted_random_baseline": {"accuracy": 0.6852112676056338, "accuracy_std": 0.014955465198306572, "avg_cost": 0.0004102107394366195, "cost_std": 2.252137971777556e-05, "small_model_fraction": 0.3610541002647341}}, "0.15": {"accuracy": 0.6619718309859155, "large_model_usage": 0.4471830985915493, "small_model_usage": 0.5528169014084507, "avg_lambda": 0.5300000000000009, "avg_cost": 0.000297669718309859, "unrestricted": {"accuracy": 0.6901408450704225, "large_model_usage": 0.4471830985915493, "avg_cost": 0.0003105499999999998}, "random_baseline": {"accuracy": 0.6373239436619719, "accuracy_std": 0.014259476571349709, "avg_cost": 0.0002991493309859153, "cost_std": 1.6959841174591754e-05, "small_model_fraction": 0.5445104112136754}, "unrestricted_random_baseline": {"accuracy": 0.6531690140845071, "accuracy_std": 0.018515234964978734, "avg_cost": 0.00031710507042253513, "cost_std": 1.435369933644091e-05, "small_model_fraction": 0.5224222278995017}}, "0.2": {"accuracy": 0.573943661971831, "large_model_usage": 0.29225352112676056, "small_model_usage": 0.7077464788732395, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0001928528169014084, "unrestricted": {"accuracy": 0.6373239436619719, "large_model_usage": 0.29225352112676056, "avg_cost": 0.00021321901408450703}, "random_baseline": {"accuracy": 0.5827464788732394, "accuracy_std": 0.01470887223535591, "avg_cost": 0.0001974795422535212, "cost_std": 1.4922605396389597e-05, "small_model_fraction": 0.7242591922132172}, "unrestricted_random_baseline": {"accuracy": 0.5904929577464789, "accuracy_std": 0.015271301719860148, "avg_cost": 0.00020618876760563387, "cost_std": 1.7412857890875972e-05, "small_model_fraction": 0.6893335354911884}}, "0.25": {"accuracy": 0.5211267605633803, "large_model_usage": 0.1936619718309859, "small_model_usage": 0.8063380281690141, "avg_lambda": 0.060000000000000074, "avg_cost": 0.00013455704225352115, "unrestricted": {"accuracy": 0.573943661971831, "large_model_usage": 0.1936619718309859, "avg_cost": 0.0001538669014084507}, "random_baseline": {"accuracy": 0.563380281690141, "accuracy_std": 0.015185815952709732, "avg_cost": 0.00014003732394366204, "cost_std": 1.3003684428595913e-05, "small_model_fraction": 0.824229657754349}, "unrestricted_random_baseline": {"accuracy": 0.5651408450704226, "accuracy_std": 0.011490963994423025, "avg_cost": 0.0001546774295774648, "cost_std": 1.0100274117852056e-05, "small_model_fraction": 0.7911154977502679}}}}, {"trial": 24, "trial_seed": 65, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5599415204678363, "large_model_accuracy": 0.8201754385964912, "small_model_cost": 3.206885964912288e-05, "large_model_cost": 0.0006151549707602342, "hybrid_results": {"0.05": {"accuracy": 0.8204225352112676, "large_model_usage": 1.0, "small_model_usage": 0.0, "avg_lambda": 1.0, "avg_cost": 0.0006491605633802814, "unrestricted": {"accuracy": 0.8204225352112676, "large_model_usage": 1.0, "avg_cost": 0.0006491605633802814}, "random_baseline": {"accuracy": 0.8204225352112676, "accuracy_std": 0.0, "avg_cost": 0.000617070422535211, "cost_std": 0.0, "small_model_fraction": 0.0}, "unrestricted_random_baseline": {"accuracy": 0.8204225352112676, "accuracy_std": 0.0, "avg_cost": 0.000617070422535211, "cost_std": 0.0, "small_model_fraction": 0.0}}, "0.1": {"accuracy": 0.7887323943661971, "large_model_usage": 0.7394366197183099, "small_model_usage": 0.2605633802816901, "avg_lambda": 0.8000000000000037, "avg_cost": 0.00048472394366197164, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.7394366197183099, "avg_cost": 0.0004915126760563378}, "random_baseline": {"accuracy": 0.768661971830986, "accuracy_std": 0.014350481600221723, "avg_cost": 0.00048665499999999963, "cost_std": 1.6789661299067184e-05, "small_model_fraction": 0.22369084876626041}, "unrestricted_random_baseline": {"accuracy": 0.7647887323943661, "accuracy_std": 0.009154929577464777, "avg_cost": 0.0004932394718309854, "cost_std": 1.0401463733022203e-05, "small_model_fraction": 0.21204808749137813}}, "0.15": {"accuracy": 0.75, "large_model_usage": 0.4647887323943662, "small_model_usage": 0.5352112676056338, "avg_lambda": 0.6100000000000032, "avg_cost": 0.00031480140845070406, "unrestricted": {"accuracy": 0.7640845070422535, "large_model_usage": 0.4647887323943662, "avg_cost": 0.00032317464788732383}, "random_baseline": {"accuracy": 0.6767605633802817, "accuracy_std": 0.019843665920570918, "avg_cost": 0.0003054798239436618, "cost_std": 1.2825777281926714e-05, "small_model_fraction": 0.5151101296808209}, "unrestricted_random_baseline": {"accuracy": 0.6820422535211268, "accuracy_std": 0.008487303375488685, "avg_cost": 0.000318156056338028, "cost_std": 1.320791437744598e-05, "small_model_fraction": 0.5007499189382533}}, "0.2": {"accuracy": 0.7359154929577465, "large_model_usage": 0.4014084507042254, "small_model_usage": 0.5985915492957746, "avg_lambda": 0.5, "avg_cost": 0.00027512535211267607, "unrestricted": {"accuracy": 0.7535211267605634, "large_model_usage": 0.4014084507042254, "avg_cost": 0.00028441408450704224}, "random_baseline": {"accuracy": 0.6767605633802817, "accuracy_std": 0.022699664029980554, "avg_cost": 0.0002851984859154929, "cost_std": 1.8297747463577918e-05, "small_model_fraction": 0.5831550643516579}, "unrestricted_random_baseline": {"accuracy": 0.673943661971831, "accuracy_std": 0.014189747661697083, "avg_cost": 0.00029004753521126743, "cost_std": 1.59649878876038e-05, "small_model_fraction": 0.5672247716944279}}, "0.25": {"accuracy": 0.6091549295774648, "large_model_usage": 0.23943661971830985, "small_model_usage": 0.7605633802816901, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00016601267605633808, "unrestricted": {"accuracy": 0.6901408450704225, "large_model_usage": 0.23943661971830985, "avg_cost": 0.00018312535211267614}, "random_baseline": {"accuracy": 0.620774647887324, "accuracy_std": 0.008189931936347221, "avg_cost": 0.00016420038732394368, "cost_std": 1.4099845579623284e-05, "small_model_fraction": 0.7702846734730554}, "unrestricted_random_baseline": {"accuracy": 0.6253521126760564, "accuracy_std": 0.018173926620512613, "avg_cost": 0.00018142929577464787, "cost_std": 1.2854154620539157e-05, "small_model_fraction": 0.7409362192220551}}}}, {"trial": 25, "trial_seed": 66, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5599415204678363, "large_model_accuracy": 0.827485380116959, "small_model_cost": 3.2012426900584826e-05, "large_model_cost": 0.000614564327485381, "hybrid_results": {"0.05": {"accuracy": 0.7922535211267606, "large_model_usage": 0.8309859154929577, "small_model_usage": 0.16901408450704225, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005415602112676056, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.8309859154929577, "avg_cost": 0.0005428419014084507}, "random_baseline": {"accuracy": 0.782394366197183, "accuracy_std": 0.010656863345367305, "avg_cost": 0.0005388514436619719, "cost_std": 8.819826326382058e-06, "small_model_fraction": 0.12531778910083385}, "unrestricted_random_baseline": {"accuracy": 0.7887323943661971, "accuracy_std": 0.006492636941755588, "avg_cost": 0.0005436829929577466, "cost_std": 1.0150716683950183e-05, "small_model_fraction": 0.12311765870977608}}, "0.1": {"accuracy": 0.75, "large_model_usage": 0.6232394366197183, "small_model_usage": 0.37676056338028174, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0004131588028169014, "unrestricted": {"accuracy": 0.7640845070422535, "large_model_usage": 0.6232394366197183, "avg_cost": 0.00041746866197183097}, "random_baseline": {"accuracy": 0.731338028169014, "accuracy_std": 0.014263823310231736, "avg_cost": 0.00041032362676056335, "cost_std": 1.7083269188149573e-05, "small_model_fraction": 0.3457297529478458}, "unrestricted_random_baseline": {"accuracy": 0.7267605633802817, "accuracy_std": 0.01462009823382161, "avg_cost": 0.0004201754929577463, "cost_std": 1.4915842884482609e-05, "small_model_fraction": 0.33833151229220104}}, "0.15": {"accuracy": 0.6690140845070423, "large_model_usage": 0.4119718309859155, "small_model_usage": 0.5880281690140845, "avg_lambda": 0.5, "avg_cost": 0.000275912323943662, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.4119718309859155, "avg_cost": 0.0002883559859154929}, "random_baseline": {"accuracy": 0.6588028169014085, "accuracy_std": 0.01331060434608153, "avg_cost": 0.000277285809859155, "cost_std": 1.4293156226632175e-05, "small_model_fraction": 0.5813250342188607}, "unrestricted_random_baseline": {"accuracy": 0.6746478873239437, "accuracy_std": 0.020843167023871463, "avg_cost": 0.00028816112676056336, "cost_std": 1.8022661946482194e-05, "small_model_fraction": 0.5599644276199649}}, "0.2": {"accuracy": 0.6161971830985915, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.34999999999999865, "avg_cost": 0.00019583485915492942, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00021038415492957732}, "random_baseline": {"accuracy": 0.622887323943662, "accuracy_std": 0.01593869056022013, "avg_cost": 0.0001947480633802816, "cost_std": 2.0208585454883636e-05, "small_model_fraction": 0.718784829145882}, "unrestricted_random_baseline": {"accuracy": 0.6302816901408452, "accuracy_std": 0.012695603082619702, "avg_cost": 0.00019989204225352104, "cost_std": 1.568251978573578e-05, "small_model_fraction": 0.6938097226188198}}, "0.25": {"accuracy": 0.5633802816901409, "large_model_usage": 0.19718309859154928, "small_model_usage": 0.8028169014084507, "avg_lambda": 0.10999999999999963, "avg_cost": 0.00013623626760563367, "unrestricted": {"accuracy": 0.6302816901408451, "large_model_usage": 0.19718309859154928, "avg_cost": 0.0001538911971830984}, "random_baseline": {"accuracy": 0.605281690140845, "accuracy_std": 0.011404320239930722, "avg_cost": 0.00013493556338028156, "cost_std": 1.4413766220712191e-05, "small_model_fraction": 0.8210908923300679}, "unrestricted_random_baseline": {"accuracy": 0.6059859154929577, "accuracy_std": 0.015544887362425435, "avg_cost": 0.00014598144366197174, "cost_std": 9.098109121768004e-06, "small_model_fraction": 0.7907847006246735}}}}, {"trial": 26, "trial_seed": 67, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5482456140350878, "large_model_accuracy": 0.8245614035087719, "small_model_cost": 3.2065058479532176e-05, "large_model_cost": 0.0006158976608187144, "hybrid_results": {"0.05": {"accuracy": 0.8169014084507042, "large_model_usage": 0.8309859154929577, "small_model_usage": 0.16901408450704225, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005452792253521129, "unrestricted": {"accuracy": 0.823943661971831, "large_model_usage": 0.8309859154929577, "avg_cost": 0.0005464764084507044}, "random_baseline": {"accuracy": 0.8084507042253521, "accuracy_std": 0.011909531355836458, "avg_cost": 0.0005367464084507043, "cost_std": 1.1068391284547652e-05, "small_model_fraction": 0.12095664953218074}, "unrestricted_random_baseline": {"accuracy": 0.818661971830986, "accuracy_std": 0.0069089496022354075, "avg_cost": 0.0005499009859154929, "cost_std": 6.01748806493081e-06, "small_model_fraction": 0.11890609070111365}}, "0.1": {"accuracy": 0.778169014084507, "large_model_usage": 0.6654929577464789, "small_model_usage": 0.3345070422535211, "avg_lambda": 0.6999999999999973, "avg_cost": 0.00042982147887323934, "unrestricted": {"accuracy": 0.795774647887324, "large_model_usage": 0.6654929577464789, "avg_cost": 0.00044564542253521113}, "random_baseline": {"accuracy": 0.7454225352112677, "accuracy_std": 0.00945473350851939, "avg_cost": 0.00043097721830985894, "cost_std": 1.141171735644811e-05, "small_model_fraction": 0.31871495562245533}, "unrestricted_random_baseline": {"accuracy": 0.7556338028169014, "accuracy_std": 0.019930946053640704, "avg_cost": 0.0004458694014084504, "cost_std": 2.1042550378683864e-05, "small_model_fraction": 0.29161139272005554}}, "0.15": {"accuracy": 0.7077464788732394, "large_model_usage": 0.4507042253521127, "small_model_usage": 0.5492957746478873, "avg_lambda": 0.5, "avg_cost": 0.0003007299295774646, "unrestricted": {"accuracy": 0.7253521126760564, "large_model_usage": 0.4507042253521127, "avg_cost": 0.0003127369718309857}, "random_baseline": {"accuracy": 0.6732394366197182, "accuracy_std": 0.014066890391153627, "avg_cost": 0.0002972371126760561, "cost_std": 1.3363309913894517e-05, "small_model_fraction": 0.5398255081653535}, "unrestricted_random_baseline": {"accuracy": 0.6852112676056338, "accuracy_std": 0.017047490756211557, "avg_cost": 0.0003099821478873237, "cost_std": 1.9944363034279583e-05, "small_model_fraction": 0.5192596093008268}}, "0.2": {"accuracy": 0.6091549295774648, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.3100000000000012, "avg_cost": 0.00019469471830985908, "unrestricted": {"accuracy": 0.6830985915492958, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00021174401408450704}, "random_baseline": {"accuracy": 0.6225352112676057, "accuracy_std": 0.01914405242122282, "avg_cost": 0.0002027940492957745, "cost_std": 1.4671960554535154e-05, "small_model_fraction": 0.7214447100440514}, "unrestricted_random_baseline": {"accuracy": 0.6278169014084507, "accuracy_std": 0.0180953030387741, "avg_cost": 0.0002170829577464787, "cost_std": 2.1041602404985376e-05, "small_model_fraction": 0.6922423398675004}}, "0.25": {"accuracy": 0.5704225352112676, "large_model_usage": 0.1936619718309859, "small_model_usage": 0.8063380281690141, "avg_lambda": 0.07000000000000021, "avg_cost": 0.00013364542253521125, "unrestricted": {"accuracy": 0.6373239436619719, "large_model_usage": 0.1936619718309859, "avg_cost": 0.0001528566901408451}, "random_baseline": {"accuracy": 0.5919014084507043, "accuracy_std": 0.014124064169106726, "avg_cost": 0.0001350813732394366, "cost_std": 1.5384512068191283e-05, "small_model_fraction": 0.8260111483177071}, "unrestricted_random_baseline": {"accuracy": 0.5992957746478873, "accuracy_std": 0.011442307612163308, "avg_cost": 0.00015176538732394358, "cost_std": 9.705278961500351e-06, "small_model_fraction": 0.7931057101344641}}}}, {"trial": 27, "trial_seed": 68, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5482456140350878, "large_model_accuracy": 0.8260233918128655, "small_model_cost": 3.200994152046785e-05, "large_model_cost": 0.0006154736842105275, "hybrid_results": {"0.05": {"accuracy": 0.8133802816901409, "large_model_usage": 0.6901408450704225, "small_model_usage": 0.3098591549295775, "avg_lambda": 0.8100000000000018, "avg_cost": 0.00045687957746478854, "unrestricted": {"accuracy": 0.8309859154929577, "large_model_usage": 0.6901408450704225, "avg_cost": 0.0004623654929577463}, "random_baseline": {"accuracy": 0.7806338028169014, "accuracy_std": 0.011360750566832181, "avg_cost": 0.0004687687323943661, "cost_std": 1.356332287202842e-05, "small_model_fraction": 0.2718148449371349}, "unrestricted_random_baseline": {"accuracy": 0.7855633802816901, "accuracy_std": 0.014211574697613097, "avg_cost": 0.0004700713732394366, "cost_std": 1.1176160104342362e-05, "small_model_fraction": 0.26241252035109475}}, "0.1": {"accuracy": 0.7852112676056338, "large_model_usage": 0.5774647887323944, "small_model_usage": 0.4225352112676056, "avg_lambda": 0.7299999999999969, "avg_cost": 0.00037883028169014075, "unrestricted": {"accuracy": 0.8063380281690141, "large_model_usage": 0.5774647887323944, "avg_cost": 0.0003928161971830985}, "random_baseline": {"accuracy": 0.7559859154929578, "accuracy_std": 0.011789203551255765, "avg_cost": 0.0003901331690140843, "cost_std": 1.452482059120546e-05, "small_model_fraction": 0.4055837324686917}, "unrestricted_random_baseline": {"accuracy": 0.7426056338028169, "accuracy_std": 0.017566858708825923, "avg_cost": 0.00038765387323943645, "cost_std": 1.3604445599378773e-05, "small_model_fraction": 0.3816132361556292}}, "0.15": {"accuracy": 0.7394366197183099, "large_model_usage": 0.4471830985915493, "small_model_usage": 0.5528169014084507, "avg_lambda": 0.5800000000000017, "avg_cost": 0.00030326690140845055, "unrestricted": {"accuracy": 0.7676056338028169, "large_model_usage": 0.4471830985915493, "avg_cost": 0.0003121894366197182}, "random_baseline": {"accuracy": 0.7007042253521127, "accuracy_std": 0.023356512608136634, "avg_cost": 0.00029786450704225346, "cost_std": 1.7942949714316848e-05, "small_model_fraction": 0.5350920030825695}, "unrestricted_random_baseline": {"accuracy": 0.7098591549295775, "accuracy_std": 0.011267605633802849, "avg_cost": 0.00031486021126760555, "cost_std": 1.5805466889639297e-05, "small_model_fraction": 0.5197996471769047}}, "0.2": {"accuracy": 0.704225352112676, "large_model_usage": 0.3485915492957746, "small_model_usage": 0.6514084507042254, "avg_lambda": 0.40000000000000185, "avg_cost": 0.0002371683098591549, "unrestricted": {"accuracy": 0.7429577464788732, "large_model_usage": 0.3485915492957746, "avg_cost": 0.0002509147887323943}, "random_baseline": {"accuracy": 0.6707746478873239, "accuracy_std": 0.010235522432217333, "avg_cost": 0.00023856091549295764, "cost_std": 1.0869747670792602e-05, "small_model_fraction": 0.6483785480948578}, "unrestricted_random_baseline": {"accuracy": 0.6894366197183099, "accuracy_std": 0.01596589302641498, "avg_cost": 0.00025638612676056336, "cost_std": 1.11219119931626e-05, "small_model_fraction": 0.6248184228163594}}, "0.25": {"accuracy": 0.6197183098591549, "large_model_usage": 0.2605633802816901, "small_model_usage": 0.7394366197183099, "avg_lambda": 0.2700000000000005, "avg_cost": 0.00017718239436619718, "unrestricted": {"accuracy": 0.7077464788732394, "large_model_usage": 0.2605633802816901, "avg_cost": 0.0001963302816901408}, "random_baseline": {"accuracy": 0.6369718309859155, "accuracy_std": 0.012444082427515721, "avg_cost": 0.0001681030633802817, "cost_std": 1.3867718935454864e-05, "small_model_fraction": 0.7511885619894534}, "unrestricted_random_baseline": {"accuracy": 0.657394366197183, "accuracy_std": 0.01753860479635531, "avg_cost": 0.00019168126760563386, "cost_std": 1.735813759396455e-05, "small_model_fraction": 0.7183709489606432}}}}, {"trial": 28, "trial_seed": 69, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5497076023391813, "large_model_accuracy": 0.8508771929824561, "small_model_cost": 3.2026169590643274e-05, "large_model_cost": 0.0006146286549707607, "hybrid_results": {"0.05": {"accuracy": 0.8485915492957746, "large_model_usage": 0.8380281690140845, "small_model_usage": 0.1619718309859155, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005497482394366199, "unrestricted": {"accuracy": 0.8485915492957746, "large_model_usage": 0.8380281690140845, "avg_cost": 0.0005501566901408453}, "random_baseline": {"accuracy": 0.8274647887323944, "accuracy_std": 0.011463958166267383, "avg_cost": 0.0005548038028169017, "cost_std": 9.265202334849103e-06, "small_model_fraction": 0.11136309432633076}, "unrestricted_random_baseline": {"accuracy": 0.8295774647887324, "accuracy_std": 0.009073308962482471, "avg_cost": 0.0005498225000000003, "cost_std": 1.363073711899362e-05, "small_model_fraction": 0.11066201474896012}}, "0.1": {"accuracy": 0.7992957746478874, "large_model_usage": 0.6795774647887324, "small_model_usage": 0.3204225352112676, "avg_lambda": 0.7599999999999976, "avg_cost": 0.0004477200704225348, "unrestricted": {"accuracy": 0.8133802816901409, "large_model_usage": 0.6795774647887324, "avg_cost": 0.0004524595070422533}, "random_baseline": {"accuracy": 0.7672535211267605, "accuracy_std": 0.01706566312120087, "avg_cost": 0.00044145887323943653, "cost_std": 1.5940467171544688e-05, "small_model_fraction": 0.28648793772193887}, "unrestricted_random_baseline": {"accuracy": 0.7679577464788732, "accuracy_std": 0.012038958448787869, "avg_cost": 0.00044894672535211266, "cost_std": 7.31238944553317e-06, "small_model_fraction": 0.2783529971086556}}, "0.15": {"accuracy": 0.7183098591549296, "large_model_usage": 0.4823943661971831, "small_model_usage": 0.5176056338028169, "avg_lambda": 0.5, "avg_cost": 0.0003154806338028165, "unrestricted": {"accuracy": 0.7640845070422535, "large_model_usage": 0.4823943661971831, "avg_cost": 0.00032993838028168976}, "random_baseline": {"accuracy": 0.6827464788732395, "accuracy_std": 0.017353833141340286, "avg_cost": 0.0003163292253521126, "cost_std": 1.9353835509884425e-05, "small_model_fraction": 0.5134684946851297}, "unrestricted_random_baseline": {"accuracy": 0.7, "accuracy_std": 0.014922267676350081, "avg_cost": 0.0003334463732394365, "cost_std": 1.7297199766700466e-05, "small_model_fraction": 0.48865269516199455}}, "0.2": {"accuracy": 0.6408450704225352, "large_model_usage": 0.3380281690140845, "small_model_usage": 0.6619718309859155, "avg_lambda": 0.3400000000000022, "avg_cost": 0.00022115669014084502, "unrestricted": {"accuracy": 0.7112676056338029, "large_model_usage": 0.3380281690140845, "avg_cost": 0.00023972007042253506}, "random_baseline": {"accuracy": 0.629225352112676, "accuracy_std": 0.019289219612397674, "avg_cost": 0.00022115073943661966, "cost_std": 1.6119199325607925e-05, "small_model_fraction": 0.6753695267420563}, "unrestricted_random_baseline": {"accuracy": 0.6450704225352113, "accuracy_std": 0.016349558825695637, "avg_cost": 0.0002351248591549295, "cost_std": 1.6969214528753838e-05, "small_model_fraction": 0.6435066687084549}}, "0.25": {"accuracy": 0.5669014084507042, "large_model_usage": 0.2535211267605634, "small_model_usage": 0.7464788732394366, "avg_lambda": 0.10999999999999963, "avg_cost": 0.0001653961267605635, "unrestricted": {"accuracy": 0.676056338028169, "large_model_usage": 0.2535211267605634, "avg_cost": 0.00018903697183098608}, "random_baseline": {"accuracy": 0.6014084507042254, "accuracy_std": 0.0153320711669558, "avg_cost": 0.00016640380281690147, "cost_std": 1.504151185546303e-05, "small_model_fraction": 0.7710789766320627}, "unrestricted_random_baseline": {"accuracy": 0.6179577464788732, "accuracy_std": 0.019557315601406798, "avg_cost": 0.0001945413732394366, "cost_std": 1.0739322944734292e-05, "small_model_fraction": 0.7305009741970092}}}}, {"trial": 29, "trial_seed": 70, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.5760233918128655, "large_model_accuracy": 0.8347953216374269, "small_model_cost": 3.206681286549711e-05, "large_model_cost": 0.0006161666666666675, "hybrid_results": {"0.05": {"accuracy": 0.7852112676056338, "large_model_usage": 0.8028169014084507, "small_model_usage": 0.19718309859154926, "avg_lambda": 0.8600000000000062, "avg_cost": 0.0005194929577464786, "unrestricted": {"accuracy": 0.8098591549295775, "large_model_usage": 0.8028169014084507, "avg_cost": 0.0005303521126760563}, "random_baseline": {"accuracy": 0.7834507042253522, "accuracy_std": 0.011914735433583571, "avg_cost": 0.0005197039788732395, "cost_std": 1.2062491886013773e-05, "small_model_fraction": 0.16550887368154868}, "unrestricted_random_baseline": {"accuracy": 0.7883802816901408, "accuracy_std": 0.010015114544597139, "avg_cost": 0.0005312186971830987, "cost_std": 1.0981663288043399e-05, "small_model_fraction": 0.14691760909055593}}, "0.1": {"accuracy": 0.7464788732394366, "large_model_usage": 0.6267605633802817, "small_model_usage": 0.37323943661971826, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0004069436619718313, "unrestricted": {"accuracy": 0.7746478873239436, "large_model_usage": 0.6267605633802817, "avg_cost": 0.0004231760563380282}, "random_baseline": {"accuracy": 0.7309859154929578, "accuracy_std": 0.01536438326005028, "avg_cost": 0.00039792489436619717, "cost_std": 9.66492578192993e-06, "small_model_fraction": 0.3581973241959695}, "unrestricted_random_baseline": {"accuracy": 0.7355633802816902, "accuracy_std": 0.020678939239473264, "avg_cost": 0.0004169198239436618, "cost_std": 1.794556486240422e-05, "small_model_fraction": 0.3304068800440652}}, "0.15": {"accuracy": 0.6795774647887324, "large_model_usage": 0.4119718309859155, "small_model_usage": 0.5880281690140845, "avg_lambda": 0.5, "avg_cost": 0.0002792323943661971, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.4119718309859155, "avg_cost": 0.0002898450704225352}, "random_baseline": {"accuracy": 0.679225352112676, "accuracy_std": 0.02091738977504104, "avg_cost": 0.00027722267605633785, "cost_std": 1.6441652406988974e-05, "small_model_fraction": 0.5768436169052081}, "unrestricted_random_baseline": {"accuracy": 0.6852112676056338, "accuracy_std": 0.00850918730534828, "avg_cost": 0.0002857069718309858, "cost_std": 1.4431368058705471e-05, "small_model_fraction": 0.558674333028019}}, "0.2": {"accuracy": 0.6267605633802817, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.3100000000000012, "avg_cost": 0.0001766971830985914, "unrestricted": {"accuracy": 0.6690140845070423, "large_model_usage": 0.25704225352112675, "avg_cost": 0.00019245774647887314}, "random_baseline": {"accuracy": 0.631338028169014, "accuracy_std": 0.011997693688310464, "avg_cost": 0.0001812086971830985, "cost_std": 1.618123984015376e-05, "small_model_fraction": 0.7523875938473922}, "unrestricted_random_baseline": {"accuracy": 0.6383802816901408, "accuracy_std": 0.016444076660331864, "avg_cost": 0.00019135514084507032, "cost_std": 1.911863972212557e-05, "small_model_fraction": 0.7254049413476251}}, "0.25": {"accuracy": 0.5845070422535211, "large_model_usage": 0.16901408450704225, "small_model_usage": 0.8309859154929577, "avg_lambda": 0.07000000000000021, "avg_cost": 0.00012155633802816905, "unrestricted": {"accuracy": 0.6338028169014085, "large_model_usage": 0.16901408450704225, "avg_cost": 0.00013871830985915496}, "random_baseline": {"accuracy": 0.6095070422535211, "accuracy_std": 0.015782348472545775, "avg_cost": 0.000116160809859155, "cost_std": 1.5051042838419968e-05, "small_model_fraction": 0.8467907078211074}, "unrestricted_random_baseline": {"accuracy": 0.6190140845070423, "accuracy_std": 0.010772576437167891, "avg_cost": 0.00013750552816901412, "cost_std": 9.896354472988429e-06, "small_model_fraction": 0.817408793548573}}}}, {"trial": 30, "trial_seed": 71, "subject": "TruthfulQA", "method": "standard_basescores", "calibration_size": 400, "test_size": 284, "small_model_accuracy": 0.564327485380117, "large_model_accuracy": 0.8260233918128655, "small_model_cost": 3.202997076023393e-05, "large_model_cost": 0.0006142368421052644, "hybrid_results": {"0.05": {"accuracy": 0.8204225352112676, "large_model_usage": 0.7887323943661971, "small_model_usage": 0.21126760563380287, "avg_lambda": 0.900000000000004, "avg_cost": 0.0005206080985915493, "unrestricted": {"accuracy": 0.8204225352112676, "large_model_usage": 0.7887323943661971, "avg_cost": 0.000521481338028169}, "random_baseline": {"accuracy": 0.8161971830985916, "accuracy_std": 0.009154929577464777, "avg_cost": 0.0005311606690140844, "cost_std": 1.2370807907822956e-05, "small_model_fraction": 0.16081696751089763}, "unrestricted_random_baseline": {"accuracy": 0.8123239436619718, "accuracy_std": 0.00705105084313407, "avg_cost": 0.0005239615140845069, "cost_std": 1.4944197560998903e-05, "small_model_fraction": 0.15931708923807283}}, "0.1": {"accuracy": 0.778169014084507, "large_model_usage": 0.573943661971831, "small_model_usage": 0.426056338028169, "avg_lambda": 0.6999999999999973, "avg_cost": 0.0003730588028169015, "unrestricted": {"accuracy": 0.7992957746478874, "large_model_usage": 0.573943661971831, "avg_cost": 0.0003877841549295775}, "random_baseline": {"accuracy": 0.7404929577464789, "accuracy_std": 0.010331972359944501, "avg_cost": 0.00036721169014084507, "cost_std": 1.23824498141571e-05, "small_model_fraction": 0.41424801244819853}, "unrestricted_random_baseline": {"accuracy": 0.7598591549295775, "accuracy_std": 0.013527727262182057, "avg_cost": 0.00038856387323943657, "cost_std": 1.4186043047753182e-05, "small_model_fraction": 0.38895571028306425}}, "0.15": {"accuracy": 0.7183098591549296, "large_model_usage": 0.39436619718309857, "small_model_usage": 0.6056338028169015, "avg_lambda": 0.5, "avg_cost": 0.00026423485915493, "unrestricted": {"accuracy": 0.7535211267605634, "large_model_usage": 0.39436619718309857, "avg_cost": 0.00027645316901408486}, "random_baseline": {"accuracy": 0.705281690140845, "accuracy_std": 0.014776151590392472, "avg_cost": 0.0002695883450704226, "cost_std": 1.8861431665690668e-05, "small_model_fraction": 0.6011642943027281}, "unrestricted_random_baseline": {"accuracy": 0.7066901408450705, "accuracy_std": 0.012700485068219353, "avg_cost": 0.0002786080281690141, "cost_std": 1.2599246857243901e-05, "small_model_fraction": 0.5801780942757037}}, "0.2": {"accuracy": 0.6584507042253521, "large_model_usage": 0.2887323943661972, "small_model_usage": 0.7112676056338028, "avg_lambda": 0.3400000000000022, "avg_cost": 0.00019344612676056345, "unrestricted": {"accuracy": 0.7147887323943662, "large_model_usage": 0.2887323943661972, "avg_cost": 0.00021108697183098608}, "random_baseline": {"accuracy": 0.6711267605633802, "accuracy_std": 0.016455382317673108, "avg_cost": 0.00019505686619718325, "cost_std": 1.740786320118454e-05, "small_model_fraction": 0.7227512007417203}, "unrestricted_random_baseline": {"accuracy": 0.677112676056338, "accuracy_std": 0.018899629437069892, "avg_cost": 0.00022119552816901415, "cost_std": 1.5043998651744592e-05, "small_model_fraction": 0.6924512404721543}}, "0.25": {"accuracy": 0.6373239436619719, "large_model_usage": 0.25704225352112675, "small_model_usage": 0.7429577464788732, "avg_lambda": 0.20000000000000093, "avg_cost": 0.00016985457746478887, "unrestricted": {"accuracy": 0.704225352112676, "large_model_usage": 0.25704225352112675, "avg_cost": 0.0001916151408450706}, "random_baseline": {"accuracy": 0.6549295774647887, "accuracy_std": 0.01863205148637035, "avg_cost": 0.00016508158450704232, "cost_std": 1.6649419260896277e-05, "small_model_fraction": 0.7632721056930354}, "unrestricted_random_baseline": {"accuracy": 0.6778169014084507, "accuracy_std": 0.0163836480621575, "avg_cost": 0.0001947485915492959, "cost_std": 1.4491509314726511e-05, "small_model_fraction": 0.725896106797643}}}}], "unrestricted_hybrid": {"0.05": {"avg_accuracy": 0.8166666666666665, "std_accuracy": 0.01880719765160027, "avg_cost": 0.0005558828755868543, "std_cost": 5.8057534066489576e-05}, "0.1": {"avg_accuracy": 0.7873239436619719, "std_accuracy": 0.022075714451659637, "avg_cost": 0.00042341127934272284, "std_cost": 2.7055243114227217e-05}, "0.15": {"avg_accuracy": 0.7417840375586855, "std_accuracy": 0.025880215153268445, "avg_cost": 0.0003151126877934272, "std_cost": 3.3002302621535985e-05}, "0.2": {"avg_accuracy": 0.7042253521126761, "std_accuracy": 0.02765074880138439, "avg_cost": 0.00023587512910798117, "std_cost": 2.7676901310955677e-05}, "0.25": {"avg_accuracy": 0.6679577464788732, "std_accuracy": 0.037475884374808366, "avg_cost": 0.00018128968309859159, "std_cost": 2.7234359964221032e-05}}, "random_baseline": {"0.05": {"avg_accuracy": 0.8004107981220657, "std_accuracy": 0.024645386079213627, "avg_cost": 0.0005473049941314552, "std_cost": 4.511638582546973e-05, "avg_small_model_fraction": 0.11874928246892744, "std_small_model_fraction": 0.07870802952242395}, "0.1": {"avg_accuracy": 0.7389906103286386, "std_accuracy": 0.022270105347494414, "avg_cost": 0.0004155059542253519, "std_cost": 3.0495636238525205e-05, "avg_small_model_fraction": 0.3442899236975524, "std_small_model_fraction": 0.05083781445988762}, "0.15": {"avg_accuracy": 0.6852230046948355, "std_accuracy": 0.023983239062158126, "avg_cost": 0.00030301074647887317, "std_cost": 3.1753603570213705e-05, "avg_small_model_fraction": 0.5334708984972655, "std_small_model_fraction": 0.05404472715504649}, "0.2": {"avg_accuracy": 0.6468896713615024, "std_accuracy": 0.02499373116757136, "avg_cost": 0.00022160354929577457, "std_cost": 2.83923504439403e-05, "avg_small_model_fraction": 0.6738577118811103, "std_small_model_fraction": 0.048222280152428666}, "0.25": {"avg_accuracy": 0.6191079812206574, "std_accuracy": 0.02831708929076901, "avg_cost": 0.0001619264589201878, "std_cost": 2.5498099956159227e-05, "avg_small_model_fraction": 0.7752076469903144, "std_small_model_fraction": 0.043741836678294246}}, "unrestricted_random_baseline": {"0.05": {"avg_accuracy": 0.8013849765258215, "std_accuracy": 0.02445108103529701, "avg_cost": 0.0005485924624413145, "std_cost": 4.407515525984178e-05, "avg_small_model_fraction": 0.11574864168566479, "std_small_model_fraction": 0.07629294916674927}, "0.1": {"avg_accuracy": 0.7439553990610329, "std_accuracy": 0.020182350641987517, "avg_cost": 0.00042378800234741765, "std_cost": 2.7762444484321497e-05, "avg_small_model_fraction": 0.3282688725418826, "std_small_model_fraction": 0.04626250135492599}, "0.15": {"avg_accuracy": 0.6934154929577464, "std_accuracy": 0.022335372365198204, "avg_cost": 0.0003156449812206572, "std_cost": 3.471784205242771e-05, "avg_small_model_fraction": 0.5141562861988532, "std_small_model_fraction": 0.056477498439011335}, "0.2": {"avg_accuracy": 0.6542136150234742, "std_accuracy": 0.025117998873286904, "avg_cost": 0.0002363462007042253, "std_cost": 2.8911580349045335e-05, "avg_small_model_fraction": 0.6501570844585284, "std_small_model_fraction": 0.04734326668636031}, "0.25": {"avg_accuracy": 0.6283333333333335, "std_accuracy": 0.028996321499373885, "avg_cost": 0.00018107349882629103, "std_cost": 2.756204290636688e-05, "avg_small_model_fraction": 0.7438363920111161, "std_small_model_fraction": 0.046712554414747635}}}