{
  "rte": {
    "median": 0.8,
    "interquartile_range": 0.01,
    "average_scores_for_each_prompt": [0.812, 0.796, 0.82, 0.792, 0.8, 0.8, 0.8, 0.784, 0.804, 0.808],
    "0": {"accuracy": 0.812, "average": 0.812},
    "1": {"accuracy": 0.796, "average": 0.796},
    "2": {"accuracy": 0.82, "average": 0.82},
    "3": {"accuracy": 0.792, "average": 0.792},
    "4": {"accuracy": 0.8, "average": 0.8},
    "5": {"accuracy": 0.8, "average": 0.8},
    "6": {"accuracy": 0.8, "average": 0.8},
    "7": {"accuracy": 0.784, "average": 0.784},
    "8": {"accuracy": 0.804, "average": 0.804},
    "9": {"accuracy": 0.808, "average": 0.808}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.833, 0.792, 0.833, 0.833, 0.917, 0.875, 0.792, 0.833, 0.792, 0.875, 0.792, 0.833, 0.833, 0.792],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.833, "average": 0.833},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.917, "average": 0.917},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.792, "average": 0.792},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.792, "average": 0.792},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.833, "average": 0.833},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.792, "average": 0.792}
  },
  "winogrande": {
    "median": 0.676,
    "interquartile_range": 0.007,
    "average_scores_for_each_prompt": [0.681, 0.669, 0.688, 0.674, 0.676],
    "0": {"accuracy": 0.681, "average": 0.681},
    "1": {"accuracy": 0.669, "average": 0.669},
    "2": {"accuracy": 0.688, "average": 0.688},
    "3": {"accuracy": 0.674, "average": 0.674},
    "4": {"accuracy": 0.676, "average": 0.676}
  },
  "wic": {
    "median": 0.645,
    "interquartile_range": 0.089,
    "average_scores_for_each_prompt": [0.645, 0.658, 0.645, 0.538, 0.545, 0.655, 0.665, 0.523, 0.624, 0.652],
    "0": {"accuracy": 0.645, "average": 0.645},
    "1": {"accuracy": 0.658, "average": 0.658},
    "2": {"accuracy": 0.645, "average": 0.645},
    "3": {"accuracy": 0.538, "average": 0.538},
    "4": {"accuracy": 0.545, "average": 0.545},
    "5": {"accuracy": 0.655, "average": 0.655},
    "6": {"accuracy": 0.665, "average": 0.665},
    "7": {"accuracy": 0.523, "average": 0.523},
    "8": {"accuracy": 0.624, "average": 0.624},
    "9": {"accuracy": 0.652, "average": 0.652}
  },
  "wsc": {
    "median": 0.618,
    "interquartile_range": 0.074,
    "average_scores_for_each_prompt": [0.597, 0.569, 0.708, 0.681, 0.569, 0.639, 0.625, 0.528, 0.611, 0.653],
    "0": {"accuracy": 0.597, "average": 0.597},
    "1": {"accuracy": 0.569, "average": 0.569},
    "2": {"accuracy": 0.708, "average": 0.708},
    "3": {"accuracy": 0.681, "average": 0.681},
    "4": {"accuracy": 0.569, "average": 0.569},
    "5": {"accuracy": 0.639, "average": 0.639},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.528, "average": 0.528},
    "8": {"accuracy": 0.611, "average": 0.611},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.883,
    "interquartile_range": 0.052,
    "average_scores_for_each_prompt": [0.912, 0.838, 0.926, 0.838, 0.897, 0.868, 0.853, 0.897],
    "0": {"accuracy": 0.912, "average": 0.912},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.926, "average": 0.926},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.897, "average": 0.897},
    "5": {"accuracy": 0.868, "average": 0.868},
    "6": {"accuracy": 0.853, "average": 0.853},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.427,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.427],
    "h-swag": {"accuracy": 0.427, "average": 0.427}
  },
  "story_cloze": {
    "median": 0.929,
    "interquartile_range": 0.006,
    "average_scores_for_each_prompt": [0.929, 0.929, 0.929, 0.922, 0.923],
    "0": {"accuracy": 0.929, "average": 0.929},
    "1": {"accuracy": 0.929, "average": 0.929},
    "2": {"accuracy": 0.929, "average": 0.929},
    "3": {"accuracy": 0.922, "average": 0.922},
    "4": {"accuracy": 0.923, "average": 0.923}
  },
  "anli-r1": {
    "median": 0.671,
    "interquartile_range": 0.022,
    "average_scores_for_each_prompt": [0.626, 0.681, 0.685, 0.67, 0.688, 0.681, 0.659, 0.673, 0.671, 0.622, 0.658, 0.68, 0.633, 0.668, 0.676],
    "0": {"accuracy": 0.626, "average": 0.626},
    "1": {"accuracy": 0.681, "average": 0.681},
    "2": {"accuracy": 0.685, "average": 0.685},
    "3": {"accuracy": 0.67, "average": 0.67},
    "4": {"accuracy": 0.688, "average": 0.688},
    "5": {"accuracy": 0.681, "average": 0.681},
    "6": {"accuracy": 0.659, "average": 0.659},
    "7": {"accuracy": 0.673, "average": 0.673},
    "8": {"accuracy": 0.671, "average": 0.671},
    "9": {"accuracy": 0.622, "average": 0.622},
    "10": {"accuracy": 0.658, "average": 0.658},
    "11": {"accuracy": 0.68, "average": 0.68},
    "12": {"accuracy": 0.633, "average": 0.633},
    "13": {"accuracy": 0.668, "average": 0.668},
    "14": {"accuracy": 0.676, "average": 0.676}
  },
  "anli-r2": {
    "median": 0.513,
    "interquartile_range": 0.018,
    "average_scores_for_each_prompt": [0.498, 0.524, 0.532, 0.49, 0.513, 0.52, 0.508, 0.513, 0.514, 0.494, 0.499, 0.514, 0.498, 0.505, 0.518],
    "0": {"accuracy": 0.498, "average": 0.498},
    "1": {"accuracy": 0.524, "average": 0.524},
    "2": {"accuracy": 0.532, "average": 0.532},
    "3": {"accuracy": 0.49, "average": 0.49},
    "4": {"accuracy": 0.513, "average": 0.513},
    "5": {"accuracy": 0.52, "average": 0.52},
    "6": {"accuracy": 0.508, "average": 0.508},
    "7": {"accuracy": 0.513, "average": 0.513},
    "8": {"accuracy": 0.514, "average": 0.514},
    "9": {"accuracy": 0.494, "average": 0.494},
    "10": {"accuracy": 0.499, "average": 0.499},
    "11": {"accuracy": 0.514, "average": 0.514},
    "12": {"accuracy": 0.498, "average": 0.498},
    "13": {"accuracy": 0.505, "average": 0.505},
    "14": {"accuracy": 0.518, "average": 0.518}
  },
  "anli-r3": {
    "median": 0.497,
    "interquartile_range": 0.014,
    "average_scores_for_each_prompt": [0.499, 0.511, 0.486, 0.512, 0.496, 0.503, 0.497, 0.488, 0.483, 0.487, 0.502, 0.493, 0.513, 0.506, 0.495],
    "0": {"accuracy": 0.499, "average": 0.499},
    "1": {"accuracy": 0.511, "average": 0.511},
    "2": {"accuracy": 0.486, "average": 0.486},
    "3": {"accuracy": 0.512, "average": 0.512},
    "4": {"accuracy": 0.496, "average": 0.496},
    "5": {"accuracy": 0.503, "average": 0.503},
    "6": {"accuracy": 0.497, "average": 0.497},
    "7": {"accuracy": 0.488, "average": 0.488},
    "8": {"accuracy": 0.483, "average": 0.483},
    "9": {"accuracy": 0.487, "average": 0.487},
    "10": {"accuracy": 0.502, "average": 0.502},
    "11": {"accuracy": 0.493, "average": 0.493},
    "12": {"accuracy": 0.513, "average": 0.513},
    "13": {"accuracy": 0.506, "average": 0.506},
    "14": {"accuracy": 0.495, "average": 0.495}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.681
}
{
  "rte": {
    "median": 0.802,
    "interquartile_range": 0.01,
    "average_scores_for_each_prompt": [0.808, 0.8, 0.82, 0.792, 0.8, 0.804, 0.796, 0.788, 0.804, 0.808],
    "0": {"accuracy": 0.808, "average": 0.808},
    "1": {"accuracy": 0.8, "average": 0.8},
    "2": {"accuracy": 0.82, "average": 0.82},
    "3": {"accuracy": 0.792, "average": 0.792},
    "4": {"accuracy": 0.8, "average": 0.8},
    "5": {"accuracy": 0.804, "average": 0.804},
    "6": {"accuracy": 0.796, "average": 0.796},
    "7": {"accuracy": 0.788, "average": 0.788},
    "8": {"accuracy": 0.804, "average": 0.804},
    "9": {"accuracy": 0.808, "average": 0.808}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.833, 0.792, 0.833, 0.792, 0.917, 0.875, 0.792, 0.833, 0.792, 0.875, 0.792, 0.833, 0.833, 0.833],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.833, "average": 0.833},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.792, "average": 0.792},
    "5": {"accuracy": 0.917, "average": 0.917},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.792, "average": 0.792},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.792, "average": 0.792},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.833, "average": 0.833},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.833, "average": 0.833}
  },
  "winogrande": {
    "median": 0.679,
    "interquartile_range": 0.008,
    "average_scores_for_each_prompt": [0.681, 0.67, 0.683, 0.679, 0.673],
    "0": {"accuracy": 0.681, "average": 0.681},
    "1": {"accuracy": 0.67, "average": 0.67},
    "2": {"accuracy": 0.683, "average": 0.683},
    "3": {"accuracy": 0.679, "average": 0.679},
    "4": {"accuracy": 0.673, "average": 0.673}
  },
  "wic": {
    "median": 0.645,
    "interquartile_range": 0.094,
    "average_scores_for_each_prompt": [0.658, 0.652, 0.652, 0.531, 0.528, 0.645, 0.665, 0.515, 0.639, 0.645],
    "0": {"accuracy": 0.658, "average": 0.658},
    "1": {"accuracy": 0.652, "average": 0.652},
    "2": {"accuracy": 0.652, "average": 0.652},
    "3": {"accuracy": 0.531, "average": 0.531},
    "4": {"accuracy": 0.528, "average": 0.528},
    "5": {"accuracy": 0.645, "average": 0.645},
    "6": {"accuracy": 0.665, "average": 0.665},
    "7": {"accuracy": 0.515, "average": 0.515},
    "8": {"accuracy": 0.639, "average": 0.639},
    "9": {"accuracy": 0.645, "average": 0.645}
  },
  "wsc": {
    "median": 0.611,
    "interquartile_range": 0.08,
    "average_scores_for_each_prompt": [0.597, 0.542, 0.708, 0.694, 0.556, 0.625, 0.625, 0.528, 0.597, 0.653],
    "0": {"accuracy": 0.597, "average": 0.597},
    "1": {"accuracy": 0.542, "average": 0.542},
    "2": {"accuracy": 0.708, "average": 0.708},
    "3": {"accuracy": 0.694, "average": 0.694},
    "4": {"accuracy": 0.556, "average": 0.556},
    "5": {"accuracy": 0.625, "average": 0.625},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.528, "average": 0.528},
    "8": {"accuracy": 0.597, "average": 0.597},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.89,
    "interquartile_range": 0.063,
    "average_scores_for_each_prompt": [0.912, 0.838, 0.926, 0.838, 0.912, 0.868, 0.853, 0.912],
    "0": {"accuracy": 0.912, "average": 0.912},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.926, "average": 0.926},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.912, "average": 0.912},
    "5": {"accuracy": 0.868, "average": 0.868},
    "6": {"accuracy": 0.853, "average": 0.853},
    "7": {"accuracy": 0.912, "average": 0.912}
  },
  "h-swag": {
    "median": 0.426,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.426],
    "h-swag": {"accuracy": 0.426, "average": 0.426}
  },
  "story_cloze": {
    "median": 0.923,
    "interquartile_range": 0.005,
    "average_scores_for_each_prompt": [0.922, 0.927, 0.93, 0.923, 0.921],
    "0": {"accuracy": 0.922, "average": 0.922},
    "1": {"accuracy": 0.927, "average": 0.927},
    "2": {"accuracy": 0.93, "average": 0.93},
    "3": {"accuracy": 0.923, "average": 0.923},
    "4": {"accuracy": 0.921, "average": 0.921}
  },
  "anli-r1": {
    "median": 0.665,
    "interquartile_range": 0.02,
    "average_scores_for_each_prompt": [0.626, 0.674, 0.683, 0.664, 0.682, 0.671, 0.653, 0.665, 0.671, 0.623, 0.652, 0.675, 0.629, 0.662, 0.671],
    "0": {"accuracy": 0.626, "average": 0.626},
    "1": {"accuracy": 0.674, "average": 0.674},
    "2": {"accuracy": 0.683, "average": 0.683},
    "3": {"accuracy": 0.664, "average": 0.664},
    "4": {"accuracy": 0.682, "average": 0.682},
    "5": {"accuracy": 0.671, "average": 0.671},
    "6": {"accuracy": 0.653, "average": 0.653},
    "7": {"accuracy": 0.665, "average": 0.665},
    "8": {"accuracy": 0.671, "average": 0.671},
    "9": {"accuracy": 0.623, "average": 0.623},
    "10": {"accuracy": 0.652, "average": 0.652},
    "11": {"accuracy": 0.675, "average": 0.675},
    "12": {"accuracy": 0.629, "average": 0.629},
    "13": {"accuracy": 0.662, "average": 0.662},
    "14": {"accuracy": 0.671, "average": 0.671}
  },
  "anli-r2": {
    "median": 0.51,
    "interquartile_range": 0.017,
    "average_scores_for_each_prompt": [0.5, 0.521, 0.522, 0.495, 0.512, 0.515, 0.503, 0.512, 0.514, 0.493, 0.493, 0.518, 0.495, 0.504, 0.51],
    "0": {"accuracy": 0.5, "average": 0.5},
    "1": {"accuracy": 0.521, "average": 0.521},
    "2": {"accuracy": 0.522, "average": 0.522},
    "3": {"accuracy": 0.495, "average": 0.495},
    "4": {"accuracy": 0.512, "average": 0.512},
    "5": {"accuracy": 0.515, "average": 0.515},
    "6": {"accuracy": 0.503, "average": 0.503},
    "7": {"accuracy": 0.512, "average": 0.512},
    "8": {"accuracy": 0.514, "average": 0.514},
    "9": {"accuracy": 0.493, "average": 0.493},
    "10": {"accuracy": 0.493, "average": 0.493},
    "11": {"accuracy": 0.518, "average": 0.518},
    "12": {"accuracy": 0.495, "average": 0.495},
    "13": {"accuracy": 0.504, "average": 0.504},
    "14": {"accuracy": 0.51, "average": 0.51}
  },
  "anli-r3": {
    "median": 0.497,
    "interquartile_range": 0.01,
    "average_scores_for_each_prompt": [0.498, 0.503, 0.487, 0.508, 0.498, 0.503, 0.495, 0.489, 0.492, 0.489, 0.497, 0.491, 0.512, 0.5, 0.497],
    "0": {"accuracy": 0.498, "average": 0.498},
    "1": {"accuracy": 0.503, "average": 0.503},
    "2": {"accuracy": 0.487, "average": 0.487},
    "3": {"accuracy": 0.508, "average": 0.508},
    "4": {"accuracy": 0.498, "average": 0.498},
    "5": {"accuracy": 0.503, "average": 0.503},
    "6": {"accuracy": 0.495, "average": 0.495},
    "7": {"accuracy": 0.489, "average": 0.489},
    "8": {"accuracy": 0.492, "average": 0.492},
    "9": {"accuracy": 0.489, "average": 0.489},
    "10": {"accuracy": 0.497, "average": 0.497},
    "11": {"accuracy": 0.491, "average": 0.491},
    "12": {"accuracy": 0.512, "average": 0.512},
    "13": {"accuracy": 0.5, "average": 0.5},
    "14": {"accuracy": 0.497, "average": 0.497}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.68
}
{
  "rte": {
    "median": 0.8,
    "interquartile_range": 0.01,
    "average_scores_for_each_prompt": [0.816, 0.788, 0.816, 0.792, 0.804, 0.8, 0.796, 0.78, 0.8, 0.8],
    "0": {"accuracy": 0.816, "average": 0.816},
    "1": {"accuracy": 0.788, "average": 0.788},
    "2": {"accuracy": 0.816, "average": 0.816},
    "3": {"accuracy": 0.792, "average": 0.792},
    "4": {"accuracy": 0.804, "average": 0.804},
    "5": {"accuracy": 0.8, "average": 0.8},
    "6": {"accuracy": 0.796, "average": 0.796},
    "7": {"accuracy": 0.78, "average": 0.78},
    "8": {"accuracy": 0.8, "average": 0.8},
    "9": {"accuracy": 0.8, "average": 0.8}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.833, 0.792, 0.833, 0.833, 0.875, 0.875, 0.792, 0.833, 0.792, 0.833, 0.792, 0.833, 0.833, 0.792],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.833, "average": 0.833},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.875, "average": 0.875},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.792, "average": 0.792},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.792, "average": 0.792},
    "10": {"accuracy": 0.833, "average": 0.833},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.833, "average": 0.833},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.792, "average": 0.792}
  },
  "winogrande": {
    "median": 0.675,
    "interquartile_range": 0.005,
    "average_scores_for_each_prompt": [0.678, 0.669, 0.679, 0.673, 0.675],
    "0": {"accuracy": 0.678, "average": 0.678},
    "1": {"accuracy": 0.669, "average": 0.669},
    "2": {"accuracy": 0.679, "average": 0.679},
    "3": {"accuracy": 0.673, "average": 0.673},
    "4": {"accuracy": 0.675, "average": 0.675}
  },
  "wic": {
    "median": 0.638,
    "interquartile_range": 0.092,
    "average_scores_for_each_prompt": [0.663, 0.652, 0.642, 0.526, 0.536, 0.629, 0.657, 0.517, 0.634, 0.65],
    "0": {"accuracy": 0.663, "average": 0.663},
    "1": {"accuracy": 0.652, "average": 0.652},
    "2": {"accuracy": 0.642, "average": 0.642},
    "3": {"accuracy": 0.526, "average": 0.526},
    "4": {"accuracy": 0.536, "average": 0.536},
    "5": {"accuracy": 0.629, "average": 0.629},
    "6": {"accuracy": 0.657, "average": 0.657},
    "7": {"accuracy": 0.517, "average": 0.517},
    "8": {"accuracy": 0.634, "average": 0.634},
    "9": {"accuracy": 0.65, "average": 0.65}
  },
  "wsc": {
    "median": 0.618,
    "interquartile_range": 0.07,
    "average_scores_for_each_prompt": [0.625, 0.528, 0.708, 0.681, 0.583, 0.653, 0.611, 0.528, 0.583, 0.653],
    "0": {"accuracy": 0.625, "average": 0.625},
    "1": {"accuracy": 0.528, "average": 0.528},
    "2": {"accuracy": 0.708, "average": 0.708},
    "3": {"accuracy": 0.681, "average": 0.681},
    "4": {"accuracy": 0.583, "average": 0.583},
    "5": {"accuracy": 0.653, "average": 0.653},
    "6": {"accuracy": 0.611, "average": 0.611},
    "7": {"accuracy": 0.528, "average": 0.528},
    "8": {"accuracy": 0.583, "average": 0.583},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.883,
    "interquartile_range": 0.063,
    "average_scores_for_each_prompt": [0.897, 0.824, 0.926, 0.838, 0.912, 0.868, 0.838, 0.897],
    "0": {"accuracy": 0.897, "average": 0.897},
    "1": {"accuracy": 0.824, "average": 0.824},
    "2": {"accuracy": 0.926, "average": 0.926},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.912, "average": 0.912},
    "5": {"accuracy": 0.868, "average": 0.868},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.427,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.427],
    "h-swag": {"accuracy": 0.427, "average": 0.427}
  },
  "story_cloze": {
    "median": 0.925,
    "interquartile_range": 0.001,
    "average_scores_for_each_prompt": [0.924, 0.925, 0.93, 0.925, 0.922],
    "0": {"accuracy": 0.924, "average": 0.924},
    "1": {"accuracy": 0.925, "average": 0.925},
    "2": {"accuracy": 0.93, "average": 0.93},
    "3": {"accuracy": 0.925, "average": 0.925},
    "4": {"accuracy": 0.922, "average": 0.922}
  },
  "anli-r1": {
    "median": 0.669,
    "interquartile_range": 0.016,
    "average_scores_for_each_prompt": [0.625, 0.678, 0.685, 0.665, 0.683, 0.671, 0.658, 0.669, 0.669, 0.615, 0.655, 0.675, 0.628, 0.665, 0.671],
    "0": {"accuracy": 0.625, "average": 0.625},
    "1": {"accuracy": 0.678, "average": 0.678},
    "2": {"accuracy": 0.685, "average": 0.685},
    "3": {"accuracy": 0.665, "average": 0.665},
    "4": {"accuracy": 0.683, "average": 0.683},
    "5": {"accuracy": 0.671, "average": 0.671},
    "6": {"accuracy": 0.658, "average": 0.658},
    "7": {"accuracy": 0.669, "average": 0.669},
    "8": {"accuracy": 0.669, "average": 0.669},
    "9": {"accuracy": 0.615, "average": 0.615},
    "10": {"accuracy": 0.655, "average": 0.655},
    "11": {"accuracy": 0.675, "average": 0.675},
    "12": {"accuracy": 0.628, "average": 0.628},
    "13": {"accuracy": 0.665, "average": 0.665},
    "14": {"accuracy": 0.671, "average": 0.671}
  },
  "anli-r2": {
    "median": 0.509,
    "interquartile_range": 0.019,
    "average_scores_for_each_prompt": [0.496, 0.518, 0.527, 0.496, 0.509, 0.512, 0.507, 0.516, 0.515, 0.492, 0.497, 0.517, 0.494, 0.498, 0.511],
    "0": {"accuracy": 0.496, "average": 0.496},
    "1": {"accuracy": 0.518, "average": 0.518},
    "2": {"accuracy": 0.527, "average": 0.527},
    "3": {"accuracy": 0.496, "average": 0.496},
    "4": {"accuracy": 0.509, "average": 0.509},
    "5": {"accuracy": 0.512, "average": 0.512},
    "6": {"accuracy": 0.507, "average": 0.507},
    "7": {"accuracy": 0.516, "average": 0.516},
    "8": {"accuracy": 0.515, "average": 0.515},
    "9": {"accuracy": 0.492, "average": 0.492},
    "10": {"accuracy": 0.497, "average": 0.497},
    "11": {"accuracy": 0.517, "average": 0.517},
    "12": {"accuracy": 0.494, "average": 0.494},
    "13": {"accuracy": 0.498, "average": 0.498},
    "14": {"accuracy": 0.511, "average": 0.511}
  },
  "anli-r3": {
    "median": 0.497,
    "interquartile_range": 0.013,
    "average_scores_for_each_prompt": [0.5, 0.503, 0.488, 0.511, 0.502, 0.497, 0.49, 0.486, 0.492, 0.482, 0.501, 0.486, 0.508, 0.5, 0.493],
    "0": {"accuracy": 0.5, "average": 0.5},
    "1": {"accuracy": 0.503, "average": 0.503},
    "2": {"accuracy": 0.488, "average": 0.488},
    "3": {"accuracy": 0.511, "average": 0.511},
    "4": {"accuracy": 0.502, "average": 0.502},
    "5": {"accuracy": 0.497, "average": 0.497},
    "6": {"accuracy": 0.49, "average": 0.49},
    "7": {"accuracy": 0.486, "average": 0.486},
    "8": {"accuracy": 0.492, "average": 0.492},
    "9": {"accuracy": 0.482, "average": 0.482},
    "10": {"accuracy": 0.501, "average": 0.501},
    "11": {"accuracy": 0.486, "average": 0.486},
    "12": {"accuracy": 0.508, "average": 0.508},
    "13": {"accuracy": 0.5, "average": 0.5},
    "14": {"accuracy": 0.493, "average": 0.493}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.679
}
{
  "rte": {
    "median": 0.796,
    "interquartile_range": 0.011,
    "average_scores_for_each_prompt": [0.804, 0.792, 0.812, 0.788, 0.8, 0.796, 0.78, 0.792, 0.796, 0.808],
    "0": {"accuracy": 0.804, "average": 0.804},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.812, "average": 0.812},
    "3": {"accuracy": 0.788, "average": 0.788},
    "4": {"accuracy": 0.8, "average": 0.8},
    "5": {"accuracy": 0.796, "average": 0.796},
    "6": {"accuracy": 0.78, "average": 0.78},
    "7": {"accuracy": 0.792, "average": 0.792},
    "8": {"accuracy": 0.796, "average": 0.796},
    "9": {"accuracy": 0.808, "average": 0.808}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.792, 0.792, 0.833, 0.833, 0.917, 0.875, 0.75, 0.833, 0.792, 0.875, 0.75, 0.792, 0.833, 0.792],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.917, "average": 0.917},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.792, "average": 0.792},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.75, "average": 0.75},
    "12": {"accuracy": 0.792, "average": 0.792},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.792, "average": 0.792}
  },
  "winogrande": {
    "median": 0.698,
    "interquartile_range": 0.003,
    "average_scores_for_each_prompt": [0.715, 0.698, 0.697, 0.696, 0.7],
    "0": {"accuracy": 0.715, "average": 0.715},
    "1": {"accuracy": 0.698, "average": 0.698},
    "2": {"accuracy": 0.697, "average": 0.697},
    "3": {"accuracy": 0.696, "average": 0.696},
    "4": {"accuracy": 0.7, "average": 0.7}
  },
  "wic": {
    "median": 0.641,
    "interquartile_range": 0.078,
    "average_scores_for_each_prompt": [0.645, 0.653, 0.637, 0.536, 0.553, 0.66, 0.665, 0.523, 0.635, 0.647],
    "0": {"accuracy": 0.645, "average": 0.645},
    "1": {"accuracy": 0.653, "average": 0.653},
    "2": {"accuracy": 0.637, "average": 0.637},
    "3": {"accuracy": 0.536, "average": 0.536},
    "4": {"accuracy": 0.553, "average": 0.553},
    "5": {"accuracy": 0.66, "average": 0.66},
    "6": {"accuracy": 0.665, "average": 0.665},
    "7": {"accuracy": 0.523, "average": 0.523},
    "8": {"accuracy": 0.635, "average": 0.635},
    "9": {"accuracy": 0.647, "average": 0.647}
  },
  "wsc": {
    "median": 0.632,
    "interquartile_range": 0.087,
    "average_scores_for_each_prompt": [0.625, 0.542, 0.722, 0.667, 0.556, 0.639, 0.639, 0.514, 0.667, 0.625],
    "0": {"accuracy": 0.625, "average": 0.625},
    "1": {"accuracy": 0.542, "average": 0.542},
    "2": {"accuracy": 0.722, "average": 0.722},
    "3": {"accuracy": 0.667, "average": 0.667},
    "4": {"accuracy": 0.556, "average": 0.556},
    "5": {"accuracy": 0.639, "average": 0.639},
    "6": {"accuracy": 0.639, "average": 0.639},
    "7": {"accuracy": 0.514, "average": 0.514},
    "8": {"accuracy": 0.667, "average": 0.667},
    "9": {"accuracy": 0.625, "average": 0.625}
  },
  "copa": {
    "median": 0.867,
    "interquartile_range": 0.051,
    "average_scores_for_each_prompt": [0.882, 0.824, 0.926, 0.838, 0.912, 0.853, 0.838, 0.882],
    "0": {"accuracy": 0.882, "average": 0.882},
    "1": {"accuracy": 0.824, "average": 0.824},
    "2": {"accuracy": 0.926, "average": 0.926},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.912, "average": 0.912},
    "5": {"accuracy": 0.853, "average": 0.853},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.882, "average": 0.882}
  },
  "h-swag": {
    "median": 0.426,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.426],
    "h-swag": {"accuracy": 0.426, "average": 0.426}
  },
  "story_cloze": {
    "median": 0.921,
    "interquartile_range": 0.003,
    "average_scores_for_each_prompt": [0.919, 0.922, 0.925, 0.921, 0.918],
    "0": {"accuracy": 0.919, "average": 0.919},
    "1": {"accuracy": 0.922, "average": 0.922},
    "2": {"accuracy": 0.925, "average": 0.925},
    "3": {"accuracy": 0.921, "average": 0.921},
    "4": {"accuracy": 0.918, "average": 0.918}
  },
  "anli-r1": {
    "median": 0.662,
    "interquartile_range": 0.025,
    "average_scores_for_each_prompt": [0.628, 0.671, 0.673, 0.662, 0.679, 0.674, 0.646, 0.661, 0.674, 0.622, 0.649, 0.67, 0.624, 0.654, 0.667],
    "0": {"accuracy": 0.628, "average": 0.628},
    "1": {"accuracy": 0.671, "average": 0.671},
    "2": {"accuracy": 0.673, "average": 0.673},
    "3": {"accuracy": 0.662, "average": 0.662},
    "4": {"accuracy": 0.679, "average": 0.679},
    "5": {"accuracy": 0.674, "average": 0.674},
    "6": {"accuracy": 0.646, "average": 0.646},
    "7": {"accuracy": 0.661, "average": 0.661},
    "8": {"accuracy": 0.674, "average": 0.674},
    "9": {"accuracy": 0.622, "average": 0.622},
    "10": {"accuracy": 0.649, "average": 0.649},
    "11": {"accuracy": 0.67, "average": 0.67},
    "12": {"accuracy": 0.624, "average": 0.624},
    "13": {"accuracy": 0.654, "average": 0.654},
    "14": {"accuracy": 0.667, "average": 0.667}
  },
  "anli-r2": {
    "median": 0.51,
    "interquartile_range": 0.016,
    "average_scores_for_each_prompt": [0.505, 0.524, 0.52, 0.5, 0.51, 0.525, 0.51, 0.514, 0.517, 0.495, 0.504, 0.523, 0.501, 0.499, 0.509],
    "0": {"accuracy": 0.505, "average": 0.505},
    "1": {"accuracy": 0.524, "average": 0.524},
    "2": {"accuracy": 0.52, "average": 0.52},
    "3": {"accuracy": 0.5, "average": 0.5},
    "4": {"accuracy": 0.51, "average": 0.51},
    "5": {"accuracy": 0.525, "average": 0.525},
    "6": {"accuracy": 0.51, "average": 0.51},
    "7": {"accuracy": 0.514, "average": 0.514},
    "8": {"accuracy": 0.517, "average": 0.517},
    "9": {"accuracy": 0.495, "average": 0.495},
    "10": {"accuracy": 0.504, "average": 0.504},
    "11": {"accuracy": 0.523, "average": 0.523},
    "12": {"accuracy": 0.501, "average": 0.501},
    "13": {"accuracy": 0.499, "average": 0.499},
    "14": {"accuracy": 0.509, "average": 0.509}
  },
  "anli-r3": {
    "median": 0.49,
    "interquartile_range": 0.012,
    "average_scores_for_each_prompt": [0.497, 0.5, 0.483, 0.507, 0.489, 0.498, 0.489, 0.479, 0.485, 0.49, 0.505, 0.487, 0.513, 0.493, 0.487],
    "0": {"accuracy": 0.497, "average": 0.497},
    "1": {"accuracy": 0.5, "average": 0.5},
    "2": {"accuracy": 0.483, "average": 0.483},
    "3": {"accuracy": 0.507, "average": 0.507},
    "4": {"accuracy": 0.489, "average": 0.489},
    "5": {"accuracy": 0.498, "average": 0.498},
    "6": {"accuracy": 0.489, "average": 0.489},
    "7": {"accuracy": 0.479, "average": 0.479},
    "8": {"accuracy": 0.485, "average": 0.485},
    "9": {"accuracy": 0.49, "average": 0.49},
    "10": {"accuracy": 0.505, "average": 0.505},
    "11": {"accuracy": 0.487, "average": 0.487},
    "12": {"accuracy": 0.513, "average": 0.513},
    "13": {"accuracy": 0.493, "average": 0.493},
    "14": {"accuracy": 0.487, "average": 0.487}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.68
}
{
  "rte": {
    "median": 0.79,
    "interquartile_range": 0.028,
    "average_scores_for_each_prompt": [0.808, 0.776, 0.812, 0.788, 0.804, 0.792, 0.771, 0.776, 0.808, 0.788],
    "0": {"accuracy": 0.808, "average": 0.808},
    "1": {"accuracy": 0.776, "average": 0.776},
    "2": {"accuracy": 0.812, "average": 0.812},
    "3": {"accuracy": 0.788, "average": 0.788},
    "4": {"accuracy": 0.804, "average": 0.804},
    "5": {"accuracy": 0.792, "average": 0.792},
    "6": {"accuracy": 0.771, "average": 0.771},
    "7": {"accuracy": 0.776, "average": 0.776},
    "8": {"accuracy": 0.808, "average": 0.808},
    "9": {"accuracy": 0.788, "average": 0.788}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.833, 0.792, 0.833, 0.833, 0.875, 0.833, 0.75, 0.833, 0.792, 0.875, 0.792, 0.833, 0.75, 0.792],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.833, "average": 0.833},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.875, "average": 0.875},
    "6": {"accuracy": 0.833, "average": 0.833},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.792, "average": 0.792},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.833, "average": 0.833},
    "13": {"accuracy": 0.75, "average": 0.75},
    "14": {"accuracy": 0.792, "average": 0.792}
  },
  "winogrande": {
    "median": 0.671,
    "interquartile_range": 0.008,
    "average_scores_for_each_prompt": [0.677, 0.668, 0.671, 0.667, 0.676],
    "0": {"accuracy": 0.677, "average": 0.677},
    "1": {"accuracy": 0.668, "average": 0.668},
    "2": {"accuracy": 0.671, "average": 0.671},
    "3": {"accuracy": 0.667, "average": 0.667},
    "4": {"accuracy": 0.676, "average": 0.676}
  },
  "wic": {
    "median": 0.673,
    "interquartile_range": 0.07,
    "average_scores_for_each_prompt": [0.686, 0.683, 0.682, 0.599, 0.594, 0.7, 0.686, 0.566, 0.662, 0.665],
    "0": {"accuracy": 0.686, "average": 0.686},
    "1": {"accuracy": 0.683, "average": 0.683},
    "2": {"accuracy": 0.682, "average": 0.682},
    "3": {"accuracy": 0.599, "average": 0.599},
    "4": {"accuracy": 0.594, "average": 0.594},
    "5": {"accuracy": 0.7, "average": 0.7},
    "6": {"accuracy": 0.686, "average": 0.686},
    "7": {"accuracy": 0.566, "average": 0.566},
    "8": {"accuracy": 0.662, "average": 0.662},
    "9": {"accuracy": 0.665, "average": 0.665}
  },
  "wsc": {
    "median": 0.625,
    "interquartile_range": 0.083,
    "average_scores_for_each_prompt": [0.597, 0.542, 0.694, 0.667, 0.556, 0.625, 0.639, 0.514, 0.625, 0.653],
    "0": {"accuracy": 0.597, "average": 0.597},
    "1": {"accuracy": 0.542, "average": 0.542},
    "2": {"accuracy": 0.694, "average": 0.694},
    "3": {"accuracy": 0.667, "average": 0.667},
    "4": {"accuracy": 0.556, "average": 0.556},
    "5": {"accuracy": 0.625, "average": 0.625},
    "6": {"accuracy": 0.639, "average": 0.639},
    "7": {"accuracy": 0.514, "average": 0.514},
    "8": {"accuracy": 0.625, "average": 0.625},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.889,
    "interquartile_range": 0.066,
    "average_scores_for_each_prompt": [0.912, 0.824, 0.926, 0.824, 0.897, 0.882, 0.838, 0.897],
    "0": {"accuracy": 0.912, "average": 0.912},
    "1": {"accuracy": 0.824, "average": 0.824},
    "2": {"accuracy": 0.926, "average": 0.926},
    "3": {"accuracy": 0.824, "average": 0.824},
    "4": {"accuracy": 0.897, "average": 0.897},
    "5": {"accuracy": 0.882, "average": 0.882},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.424,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.424],
    "h-swag": {"accuracy": 0.424, "average": 0.424}
  },
  "story_cloze": {
    "median": 0.917,
    "interquartile_range": 0.003,
    "average_scores_for_each_prompt": [0.918, 0.917, 0.923, 0.914, 0.915],
    "0": {"accuracy": 0.918, "average": 0.918},
    "1": {"accuracy": 0.917, "average": 0.917},
    "2": {"accuracy": 0.923, "average": 0.923},
    "3": {"accuracy": 0.914, "average": 0.914},
    "4": {"accuracy": 0.915, "average": 0.915}
  },
  "anli-r1": {
    "median": 0.662,
    "interquartile_range": 0.018,
    "average_scores_for_each_prompt": [0.613, 0.671, 0.682, 0.66, 0.679, 0.665, 0.651, 0.663, 0.662, 0.624, 0.654, 0.677, 0.622, 0.65, 0.666],
    "0": {"accuracy": 0.613, "average": 0.613},
    "1": {"accuracy": 0.671, "average": 0.671},
    "2": {"accuracy": 0.682, "average": 0.682},
    "3": {"accuracy": 0.66, "average": 0.66},
    "4": {"accuracy": 0.679, "average": 0.679},
    "5": {"accuracy": 0.665, "average": 0.665},
    "6": {"accuracy": 0.651, "average": 0.651},
    "7": {"accuracy": 0.663, "average": 0.663},
    "8": {"accuracy": 0.662, "average": 0.662},
    "9": {"accuracy": 0.624, "average": 0.624},
    "10": {"accuracy": 0.654, "average": 0.654},
    "11": {"accuracy": 0.677, "average": 0.677},
    "12": {"accuracy": 0.622, "average": 0.622},
    "13": {"accuracy": 0.65, "average": 0.65},
    "14": {"accuracy": 0.666, "average": 0.666}
  },
  "anli-r2": {
    "median": 0.504,
    "interquartile_range": 0.016,
    "average_scores_for_each_prompt": [0.494, 0.517, 0.521, 0.498, 0.52, 0.514, 0.501, 0.504, 0.513, 0.494, 0.503, 0.518, 0.494, 0.501, 0.513],
    "0": {"accuracy": 0.494, "average": 0.494},
    "1": {"accuracy": 0.517, "average": 0.517},
    "2": {"accuracy": 0.521, "average": 0.521},
    "3": {"accuracy": 0.498, "average": 0.498},
    "4": {"accuracy": 0.52, "average": 0.52},
    "5": {"accuracy": 0.514, "average": 0.514},
    "6": {"accuracy": 0.501, "average": 0.501},
    "7": {"accuracy": 0.504, "average": 0.504},
    "8": {"accuracy": 0.513, "average": 0.513},
    "9": {"accuracy": 0.494, "average": 0.494},
    "10": {"accuracy": 0.503, "average": 0.503},
    "11": {"accuracy": 0.518, "average": 0.518},
    "12": {"accuracy": 0.494, "average": 0.494},
    "13": {"accuracy": 0.501, "average": 0.501},
    "14": {"accuracy": 0.513, "average": 0.513}
  },
  "anli-r3": {
    "median": 0.491,
    "interquartile_range": 0.009,
    "average_scores_for_each_prompt": [0.491, 0.501, 0.485, 0.495, 0.497, 0.494, 0.486, 0.495, 0.487, 0.482, 0.491, 0.483, 0.512, 0.489, 0.492],
    "0": {"accuracy": 0.491, "average": 0.491},
    "1": {"accuracy": 0.501, "average": 0.501},
    "2": {"accuracy": 0.485, "average": 0.485},
    "3": {"accuracy": 0.495, "average": 0.495},
    "4": {"accuracy": 0.497, "average": 0.497},
    "5": {"accuracy": 0.494, "average": 0.494},
    "6": {"accuracy": 0.486, "average": 0.486},
    "7": {"accuracy": 0.495, "average": 0.495},
    "8": {"accuracy": 0.487, "average": 0.487},
    "9": {"accuracy": 0.482, "average": 0.482},
    "10": {"accuracy": 0.491, "average": 0.491},
    "11": {"accuracy": 0.483, "average": 0.483},
    "12": {"accuracy": 0.512, "average": 0.512},
    "13": {"accuracy": 0.489, "average": 0.489},
    "14": {"accuracy": 0.492, "average": 0.492}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.68
}
{
  "rte": {
    "median": 0.802,
    "interquartile_range": 0.014,
    "average_scores_for_each_prompt": [0.808, 0.788, 0.82, 0.792, 0.8, 0.804, 0.8, 0.788, 0.808, 0.808],
    "0": {"accuracy": 0.808, "average": 0.808},
    "1": {"accuracy": 0.788, "average": 0.788},
    "2": {"accuracy": 0.82, "average": 0.82},
    "3": {"accuracy": 0.792, "average": 0.792},
    "4": {"accuracy": 0.8, "average": 0.8},
    "5": {"accuracy": 0.804, "average": 0.804},
    "6": {"accuracy": 0.8, "average": 0.8},
    "7": {"accuracy": 0.788, "average": 0.788},
    "8": {"accuracy": 0.808, "average": 0.808},
    "9": {"accuracy": 0.808, "average": 0.808}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.833, 0.792, 0.833, 0.833, 0.917, 0.875, 0.792, 0.833, 0.792, 0.875, 0.792, 0.833, 0.833, 0.792],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.833, "average": 0.833},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.917, "average": 0.917},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.792, "average": 0.792},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.792, "average": 0.792},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.833, "average": 0.833},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.792, "average": 0.792}
  },
  "winogrande": {
    "median": 0.677,
    "interquartile_range": 0.002,
    "average_scores_for_each_prompt": [0.679, 0.668, 0.683, 0.677, 0.677],
    "0": {"accuracy": 0.679, "average": 0.679},
    "1": {"accuracy": 0.668, "average": 0.668},
    "2": {"accuracy": 0.683, "average": 0.683},
    "3": {"accuracy": 0.677, "average": 0.677},
    "4": {"accuracy": 0.677, "average": 0.677}
  },
  "wic": {
    "median": 0.647,
    "interquartile_range": 0.09,
    "average_scores_for_each_prompt": [0.649, 0.658, 0.645, 0.538, 0.541, 0.653, 0.665, 0.523, 0.625, 0.649],
    "0": {"accuracy": 0.649, "average": 0.649},
    "1": {"accuracy": 0.658, "average": 0.658},
    "2": {"accuracy": 0.645, "average": 0.645},
    "3": {"accuracy": 0.538, "average": 0.538},
    "4": {"accuracy": 0.541, "average": 0.541},
    "5": {"accuracy": 0.653, "average": 0.653},
    "6": {"accuracy": 0.665, "average": 0.665},
    "7": {"accuracy": 0.523, "average": 0.523},
    "8": {"accuracy": 0.625, "average": 0.625},
    "9": {"accuracy": 0.649, "average": 0.649}
  },
  "wsc": {
    "median": 0.618,
    "interquartile_range": 0.077,
    "average_scores_for_each_prompt": [0.597, 0.569, 0.708, 0.681, 0.569, 0.653, 0.625, 0.528, 0.611, 0.653],
    "0": {"accuracy": 0.597, "average": 0.597},
    "1": {"accuracy": 0.569, "average": 0.569},
    "2": {"accuracy": 0.708, "average": 0.708},
    "3": {"accuracy": 0.681, "average": 0.681},
    "4": {"accuracy": 0.569, "average": 0.569},
    "5": {"accuracy": 0.653, "average": 0.653},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.528, "average": 0.528},
    "8": {"accuracy": 0.611, "average": 0.611},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.883,
    "interquartile_range": 0.052,
    "average_scores_for_each_prompt": [0.897, 0.838, 0.926, 0.838, 0.897, 0.868, 0.853, 0.912],
    "0": {"accuracy": 0.897, "average": 0.897},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.926, "average": 0.926},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.897, "average": 0.897},
    "5": {"accuracy": 0.868, "average": 0.868},
    "6": {"accuracy": 0.853, "average": 0.853},
    "7": {"accuracy": 0.912, "average": 0.912}
  },
  "h-swag": {
    "median": 0.426,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.426],
    "h-swag": {"accuracy": 0.426, "average": 0.426}
  },
  "story_cloze": {
    "median": 0.927,
    "interquartile_range": 0.006,
    "average_scores_for_each_prompt": [0.927, 0.928, 0.929, 0.922, 0.922],
    "0": {"accuracy": 0.927, "average": 0.927},
    "1": {"accuracy": 0.928, "average": 0.928},
    "2": {"accuracy": 0.929, "average": 0.929},
    "3": {"accuracy": 0.922, "average": 0.922},
    "4": {"accuracy": 0.922, "average": 0.922}
  },
  "anli-r1": {
    "median": 0.669,
    "interquartile_range": 0.02,
    "average_scores_for_each_prompt": [0.626, 0.681, 0.688, 0.669, 0.691, 0.677, 0.658, 0.669, 0.672, 0.623, 0.659, 0.683, 0.633, 0.667, 0.676],
    "0": {"accuracy": 0.626, "average": 0.626},
    "1": {"accuracy": 0.681, "average": 0.681},
    "2": {"accuracy": 0.688, "average": 0.688},
    "3": {"accuracy": 0.669, "average": 0.669},
    "4": {"accuracy": 0.691, "average": 0.691},
    "5": {"accuracy": 0.677, "average": 0.677},
    "6": {"accuracy": 0.658, "average": 0.658},
    "7": {"accuracy": 0.669, "average": 0.669},
    "8": {"accuracy": 0.672, "average": 0.672},
    "9": {"accuracy": 0.623, "average": 0.623},
    "10": {"accuracy": 0.659, "average": 0.659},
    "11": {"accuracy": 0.683, "average": 0.683},
    "12": {"accuracy": 0.633, "average": 0.633},
    "13": {"accuracy": 0.667, "average": 0.667},
    "14": {"accuracy": 0.676, "average": 0.676}
  },
  "anli-r2": {
    "median": 0.512,
    "interquartile_range": 0.018,
    "average_scores_for_each_prompt": [0.497, 0.525, 0.533, 0.492, 0.514, 0.518, 0.511, 0.514, 0.512, 0.496, 0.5, 0.515, 0.494, 0.504, 0.517],
    "0": {"accuracy": 0.497, "average": 0.497},
    "1": {"accuracy": 0.525, "average": 0.525},
    "2": {"accuracy": 0.533, "average": 0.533},
    "3": {"accuracy": 0.492, "average": 0.492},
    "4": {"accuracy": 0.514, "average": 0.514},
    "5": {"accuracy": 0.518, "average": 0.518},
    "6": {"accuracy": 0.511, "average": 0.511},
    "7": {"accuracy": 0.514, "average": 0.514},
    "8": {"accuracy": 0.512, "average": 0.512},
    "9": {"accuracy": 0.496, "average": 0.496},
    "10": {"accuracy": 0.5, "average": 0.5},
    "11": {"accuracy": 0.515, "average": 0.515},
    "12": {"accuracy": 0.494, "average": 0.494},
    "13": {"accuracy": 0.504, "average": 0.504},
    "14": {"accuracy": 0.517, "average": 0.517}
  },
  "anli-r3": {
    "median": 0.497,
    "interquartile_range": 0.013,
    "average_scores_for_each_prompt": [0.501, 0.511, 0.487, 0.51, 0.496, 0.506, 0.496, 0.489, 0.487, 0.487, 0.502, 0.497, 0.512, 0.505, 0.497],
    "0": {"accuracy": 0.501, "average": 0.501},
    "1": {"accuracy": 0.511, "average": 0.511},
    "2": {"accuracy": 0.487, "average": 0.487},
    "3": {"accuracy": 0.51, "average": 0.51},
    "4": {"accuracy": 0.496, "average": 0.496},
    "5": {"accuracy": 0.506, "average": 0.506},
    "6": {"accuracy": 0.496, "average": 0.496},
    "7": {"accuracy": 0.489, "average": 0.489},
    "8": {"accuracy": 0.487, "average": 0.487},
    "9": {"accuracy": 0.487, "average": 0.487},
    "10": {"accuracy": 0.502, "average": 0.502},
    "11": {"accuracy": 0.497, "average": 0.497},
    "12": {"accuracy": 0.512, "average": 0.512},
    "13": {"accuracy": 0.505, "average": 0.505},
    "14": {"accuracy": 0.497, "average": 0.497}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.681
}
{
  "rte": {
    "median": 0.808,
    "interquartile_range": 0.016,
    "average_scores_for_each_prompt": [0.816, 0.792, 0.824, 0.792, 0.812, 0.804, 0.808, 0.784, 0.808, 0.808],
    "0": {"accuracy": 0.816, "average": 0.816},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.824, "average": 0.824},
    "3": {"accuracy": 0.792, "average": 0.792},
    "4": {"accuracy": 0.812, "average": 0.812},
    "5": {"accuracy": 0.804, "average": 0.804},
    "6": {"accuracy": 0.808, "average": 0.808},
    "7": {"accuracy": 0.784, "average": 0.784},
    "8": {"accuracy": 0.808, "average": 0.808},
    "9": {"accuracy": 0.808, "average": 0.808}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.833, 0.792, 0.833, 0.833, 0.917, 0.875, 0.792, 0.833, 0.792, 0.875, 0.792, 0.833, 0.833, 0.792],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.833, "average": 0.833},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.917, "average": 0.917},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.792, "average": 0.792},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.792, "average": 0.792},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.833, "average": 0.833},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.792, "average": 0.792}
  },
  "winogrande": {
    "median": 0.672,
    "interquartile_range": 0.011,
    "average_scores_for_each_prompt": [0.683, 0.67, 0.684, 0.672, 0.672],
    "0": {"accuracy": 0.683, "average": 0.683},
    "1": {"accuracy": 0.67, "average": 0.67},
    "2": {"accuracy": 0.684, "average": 0.684},
    "3": {"accuracy": 0.672, "average": 0.672},
    "4": {"accuracy": 0.672, "average": 0.672}
  },
  "wic": {
    "median": 0.645,
    "interquartile_range": 0.087,
    "average_scores_for_each_prompt": [0.649, 0.653, 0.645, 0.533, 0.538, 0.645, 0.658, 0.518, 0.632, 0.645],
    "0": {"accuracy": 0.649, "average": 0.649},
    "1": {"accuracy": 0.653, "average": 0.653},
    "2": {"accuracy": 0.645, "average": 0.645},
    "3": {"accuracy": 0.533, "average": 0.533},
    "4": {"accuracy": 0.538, "average": 0.538},
    "5": {"accuracy": 0.645, "average": 0.645},
    "6": {"accuracy": 0.658, "average": 0.658},
    "7": {"accuracy": 0.518, "average": 0.518},
    "8": {"accuracy": 0.632, "average": 0.632},
    "9": {"accuracy": 0.645, "average": 0.645}
  },
  "wsc": {
    "median": 0.611,
    "interquartile_range": 0.077,
    "average_scores_for_each_prompt": [0.597, 0.569, 0.708, 0.694, 0.569, 0.653, 0.625, 0.528, 0.597, 0.653],
    "0": {"accuracy": 0.597, "average": 0.597},
    "1": {"accuracy": 0.569, "average": 0.569},
    "2": {"accuracy": 0.708, "average": 0.708},
    "3": {"accuracy": 0.694, "average": 0.694},
    "4": {"accuracy": 0.569, "average": 0.569},
    "5": {"accuracy": 0.653, "average": 0.653},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.528, "average": 0.528},
    "8": {"accuracy": 0.597, "average": 0.597},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.883,
    "interquartile_range": 0.063,
    "average_scores_for_each_prompt": [0.912, 0.838, 0.926, 0.838, 0.897, 0.868, 0.838, 0.897],
    "0": {"accuracy": 0.912, "average": 0.912},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.926, "average": 0.926},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.897, "average": 0.897},
    "5": {"accuracy": 0.868, "average": 0.868},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.427,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.427],
    "h-swag": {"accuracy": 0.427, "average": 0.427}
  },
  "story_cloze": {
    "median": 0.928,
    "interquartile_range": 0.004,
    "average_scores_for_each_prompt": [0.928, 0.93, 0.93, 0.926, 0.926],
    "0": {"accuracy": 0.928, "average": 0.928},
    "1": {"accuracy": 0.93, "average": 0.93},
    "2": {"accuracy": 0.93, "average": 0.93},
    "3": {"accuracy": 0.926, "average": 0.926},
    "4": {"accuracy": 0.926, "average": 0.926}
  },
  "anli-r1": {
    "median": 0.67,
    "interquartile_range": 0.019,
    "average_scores_for_each_prompt": [0.625, 0.676, 0.685, 0.666, 0.688, 0.673, 0.657, 0.67, 0.671, 0.62, 0.655, 0.68, 0.629, 0.665, 0.675],
    "0": {"accuracy": 0.625, "average": 0.625},
    "1": {"accuracy": 0.676, "average": 0.676},
    "2": {"accuracy": 0.685, "average": 0.685},
    "3": {"accuracy": 0.666, "average": 0.666},
    "4": {"accuracy": 0.688, "average": 0.688},
    "5": {"accuracy": 0.673, "average": 0.673},
    "6": {"accuracy": 0.657, "average": 0.657},
    "7": {"accuracy": 0.67, "average": 0.67},
    "8": {"accuracy": 0.671, "average": 0.671},
    "9": {"accuracy": 0.62, "average": 0.62},
    "10": {"accuracy": 0.655, "average": 0.655},
    "11": {"accuracy": 0.68, "average": 0.68},
    "12": {"accuracy": 0.629, "average": 0.629},
    "13": {"accuracy": 0.665, "average": 0.665},
    "14": {"accuracy": 0.675, "average": 0.675}
  },
  "anli-r2": {
    "median": 0.513,
    "interquartile_range": 0.017,
    "average_scores_for_each_prompt": [0.496, 0.524, 0.528, 0.492, 0.514, 0.515, 0.506, 0.515, 0.513, 0.493, 0.501, 0.517, 0.496, 0.503, 0.515],
    "0": {"accuracy": 0.496, "average": 0.496},
    "1": {"accuracy": 0.524, "average": 0.524},
    "2": {"accuracy": 0.528, "average": 0.528},
    "3": {"accuracy": 0.492, "average": 0.492},
    "4": {"accuracy": 0.514, "average": 0.514},
    "5": {"accuracy": 0.515, "average": 0.515},
    "6": {"accuracy": 0.506, "average": 0.506},
    "7": {"accuracy": 0.515, "average": 0.515},
    "8": {"accuracy": 0.513, "average": 0.513},
    "9": {"accuracy": 0.493, "average": 0.493},
    "10": {"accuracy": 0.501, "average": 0.501},
    "11": {"accuracy": 0.517, "average": 0.517},
    "12": {"accuracy": 0.496, "average": 0.496},
    "13": {"accuracy": 0.503, "average": 0.503},
    "14": {"accuracy": 0.515, "average": 0.515}
  },
  "anli-r3": {
    "median": 0.496,
    "interquartile_range": 0.012,
    "average_scores_for_each_prompt": [0.5, 0.504, 0.484, 0.507, 0.496, 0.502, 0.493, 0.488, 0.489, 0.489, 0.504, 0.493, 0.513, 0.502, 0.494],
    "0": {"accuracy": 0.5, "average": 0.5},
    "1": {"accuracy": 0.504, "average": 0.504},
    "2": {"accuracy": 0.484, "average": 0.484},
    "3": {"accuracy": 0.507, "average": 0.507},
    "4": {"accuracy": 0.496, "average": 0.496},
    "5": {"accuracy": 0.502, "average": 0.502},
    "6": {"accuracy": 0.493, "average": 0.493},
    "7": {"accuracy": 0.488, "average": 0.488},
    "8": {"accuracy": 0.489, "average": 0.489},
    "9": {"accuracy": 0.489, "average": 0.489},
    "10": {"accuracy": 0.504, "average": 0.504},
    "11": {"accuracy": 0.493, "average": 0.493},
    "12": {"accuracy": 0.513, "average": 0.513},
    "13": {"accuracy": 0.502, "average": 0.502},
    "14": {"accuracy": 0.494, "average": 0.494}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.681
}
{
  "rte": {
    "median": 0.798,
    "interquartile_range": 0.02,
    "average_scores_for_each_prompt": [0.808, 0.78, 0.816, 0.792, 0.8, 0.796, 0.763, 0.784, 0.8, 0.808],
    "0": {"accuracy": 0.808, "average": 0.808},
    "1": {"accuracy": 0.78, "average": 0.78},
    "2": {"accuracy": 0.816, "average": 0.816},
    "3": {"accuracy": 0.792, "average": 0.792},
    "4": {"accuracy": 0.8, "average": 0.8},
    "5": {"accuracy": 0.796, "average": 0.796},
    "6": {"accuracy": 0.763, "average": 0.763},
    "7": {"accuracy": 0.784, "average": 0.784},
    "8": {"accuracy": 0.8, "average": 0.8},
    "9": {"accuracy": 0.808, "average": 0.808}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.792, 0.792, 0.833, 0.792, 0.875, 0.875, 0.75, 0.833, 0.792, 0.875, 0.708, 0.833, 0.833, 0.792],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.792, "average": 0.792},
    "5": {"accuracy": 0.875, "average": 0.875},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.792, "average": 0.792},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.708, "average": 0.708},
    "12": {"accuracy": 0.833, "average": 0.833},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.792, "average": 0.792}
  },
  "winogrande": {
    "median": 0.67,
    "interquartile_range": 0.006,
    "average_scores_for_each_prompt": [0.679, 0.666, 0.675, 0.669, 0.67],
    "0": {"accuracy": 0.679, "average": 0.679},
    "1": {"accuracy": 0.666, "average": 0.666},
    "2": {"accuracy": 0.675, "average": 0.675},
    "3": {"accuracy": 0.669, "average": 0.669},
    "4": {"accuracy": 0.67, "average": 0.67}
  },
  "wic": {
    "median": 0.631,
    "interquartile_range": 0.056,
    "average_scores_for_each_prompt": [0.629, 0.64, 0.635, 0.538, 0.569, 0.653, 0.647, 0.52, 0.625, 0.632],
    "0": {"accuracy": 0.629, "average": 0.629},
    "1": {"accuracy": 0.64, "average": 0.64},
    "2": {"accuracy": 0.635, "average": 0.635},
    "3": {"accuracy": 0.538, "average": 0.538},
    "4": {"accuracy": 0.569, "average": 0.569},
    "5": {"accuracy": 0.653, "average": 0.653},
    "6": {"accuracy": 0.647, "average": 0.647},
    "7": {"accuracy": 0.52, "average": 0.52},
    "8": {"accuracy": 0.625, "average": 0.625},
    "9": {"accuracy": 0.632, "average": 0.632}
  },
  "wsc": {
    "median": 0.625,
    "interquartile_range": 0.066,
    "average_scores_for_each_prompt": [0.583, 0.528, 0.708, 0.681, 0.569, 0.625, 0.639, 0.542, 0.639, 0.625],
    "0": {"accuracy": 0.583, "average": 0.583},
    "1": {"accuracy": 0.528, "average": 0.528},
    "2": {"accuracy": 0.708, "average": 0.708},
    "3": {"accuracy": 0.681, "average": 0.681},
    "4": {"accuracy": 0.569, "average": 0.569},
    "5": {"accuracy": 0.625, "average": 0.625},
    "6": {"accuracy": 0.639, "average": 0.639},
    "7": {"accuracy": 0.542, "average": 0.542},
    "8": {"accuracy": 0.639, "average": 0.639},
    "9": {"accuracy": 0.625, "average": 0.625}
  },
  "copa": {
    "median": 0.882,
    "interquartile_range": 0.048,
    "average_scores_for_each_prompt": [0.897, 0.838, 0.926, 0.824, 0.882, 0.882, 0.853, 0.897],
    "0": {"accuracy": 0.897, "average": 0.897},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.926, "average": 0.926},
    "3": {"accuracy": 0.824, "average": 0.824},
    "4": {"accuracy": 0.882, "average": 0.882},
    "5": {"accuracy": 0.882, "average": 0.882},
    "6": {"accuracy": 0.853, "average": 0.853},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.428,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.428],
    "h-swag": {"accuracy": 0.428, "average": 0.428}
  },
  "story_cloze": {
    "median": 0.926,
    "interquartile_range": 0.003,
    "average_scores_for_each_prompt": [0.927, 0.928, 0.926, 0.924, 0.92],
    "0": {"accuracy": 0.927, "average": 0.927},
    "1": {"accuracy": 0.928, "average": 0.928},
    "2": {"accuracy": 0.926, "average": 0.926},
    "3": {"accuracy": 0.924, "average": 0.924},
    "4": {"accuracy": 0.92, "average": 0.92}
  },
  "anli-r1": {
    "median": 0.674,
    "interquartile_range": 0.024,
    "average_scores_for_each_prompt": [0.624, 0.675, 0.681, 0.658, 0.676, 0.688, 0.657, 0.674, 0.675, 0.625, 0.647, 0.676, 0.634, 0.663, 0.676],
    "0": {"accuracy": 0.624, "average": 0.624},
    "1": {"accuracy": 0.675, "average": 0.675},
    "2": {"accuracy": 0.681, "average": 0.681},
    "3": {"accuracy": 0.658, "average": 0.658},
    "4": {"accuracy": 0.676, "average": 0.676},
    "5": {"accuracy": 0.688, "average": 0.688},
    "6": {"accuracy": 0.657, "average": 0.657},
    "7": {"accuracy": 0.674, "average": 0.674},
    "8": {"accuracy": 0.675, "average": 0.675},
    "9": {"accuracy": 0.625, "average": 0.625},
    "10": {"accuracy": 0.647, "average": 0.647},
    "11": {"accuracy": 0.676, "average": 0.676},
    "12": {"accuracy": 0.634, "average": 0.634},
    "13": {"accuracy": 0.663, "average": 0.663},
    "14": {"accuracy": 0.676, "average": 0.676}
  },
  "anli-r2": {
    "median": 0.512,
    "interquartile_range": 0.017,
    "average_scores_for_each_prompt": [0.496, 0.528, 0.522, 0.501, 0.512, 0.52, 0.513, 0.512, 0.52, 0.495, 0.491, 0.518, 0.502, 0.503, 0.517],
    "0": {"accuracy": 0.496, "average": 0.496},
    "1": {"accuracy": 0.528, "average": 0.528},
    "2": {"accuracy": 0.522, "average": 0.522},
    "3": {"accuracy": 0.501, "average": 0.501},
    "4": {"accuracy": 0.512, "average": 0.512},
    "5": {"accuracy": 0.52, "average": 0.52},
    "6": {"accuracy": 0.513, "average": 0.513},
    "7": {"accuracy": 0.512, "average": 0.512},
    "8": {"accuracy": 0.52, "average": 0.52},
    "9": {"accuracy": 0.495, "average": 0.495},
    "10": {"accuracy": 0.491, "average": 0.491},
    "11": {"accuracy": 0.518, "average": 0.518},
    "12": {"accuracy": 0.502, "average": 0.502},
    "13": {"accuracy": 0.503, "average": 0.503},
    "14": {"accuracy": 0.517, "average": 0.517}
  },
  "anli-r3": {
    "median": 0.497,
    "interquartile_range": 0.016,
    "average_scores_for_each_prompt": [0.488, 0.505, 0.485, 0.502, 0.497, 0.504, 0.497, 0.484, 0.487, 0.486, 0.498, 0.488, 0.512, 0.502, 0.504],
    "0": {"accuracy": 0.488, "average": 0.488},
    "1": {"accuracy": 0.505, "average": 0.505},
    "2": {"accuracy": 0.485, "average": 0.485},
    "3": {"accuracy": 0.502, "average": 0.502},
    "4": {"accuracy": 0.497, "average": 0.497},
    "5": {"accuracy": 0.504, "average": 0.504},
    "6": {"accuracy": 0.497, "average": 0.497},
    "7": {"accuracy": 0.484, "average": 0.484},
    "8": {"accuracy": 0.487, "average": 0.487},
    "9": {"accuracy": 0.486, "average": 0.486},
    "10": {"accuracy": 0.498, "average": 0.498},
    "11": {"accuracy": 0.488, "average": 0.488},
    "12": {"accuracy": 0.512, "average": 0.512},
    "13": {"accuracy": 0.502, "average": 0.502},
    "14": {"accuracy": 0.504, "average": 0.504}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.68
}
{
  "rte": {
    "median": 0.804,
    "interquartile_range": 0.018,
    "average_scores_for_each_prompt": [0.812, 0.792, 0.816, 0.792, 0.808, 0.8, 0.796, 0.788, 0.808, 0.812],
    "0": {"accuracy": 0.812, "average": 0.812},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.816, "average": 0.816},
    "3": {"accuracy": 0.792, "average": 0.792},
    "4": {"accuracy": 0.808, "average": 0.808},
    "5": {"accuracy": 0.8, "average": 0.8},
    "6": {"accuracy": 0.796, "average": 0.796},
    "7": {"accuracy": 0.788, "average": 0.788},
    "8": {"accuracy": 0.808, "average": 0.808},
    "9": {"accuracy": 0.812, "average": 0.812}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.02,
    "average_scores_for_each_prompt": [0.833, 0.833, 0.792, 0.833, 0.833, 0.917, 0.875, 0.75, 0.833, 0.833, 0.875, 0.792, 0.833, 0.833, 0.792],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.833, "average": 0.833},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.917, "average": 0.917},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.833, "average": 0.833},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.833, "average": 0.833},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.792, "average": 0.792}
  },
  "winogrande": {
    "median": 0.674,
    "interquartile_range": 0.006,
    "average_scores_for_each_prompt": [0.679, 0.672, 0.684, 0.674, 0.673],
    "0": {"accuracy": 0.679, "average": 0.679},
    "1": {"accuracy": 0.672, "average": 0.672},
    "2": {"accuracy": 0.684, "average": 0.684},
    "3": {"accuracy": 0.674, "average": 0.674},
    "4": {"accuracy": 0.673, "average": 0.673}
  },
  "wic": {
    "median": 0.647,
    "interquartile_range": 0.095,
    "average_scores_for_each_prompt": [0.655, 0.653, 0.65, 0.533, 0.528, 0.644, 0.658, 0.517, 0.629, 0.649],
    "0": {"accuracy": 0.655, "average": 0.655},
    "1": {"accuracy": 0.653, "average": 0.653},
    "2": {"accuracy": 0.65, "average": 0.65},
    "3": {"accuracy": 0.533, "average": 0.533},
    "4": {"accuracy": 0.528, "average": 0.528},
    "5": {"accuracy": 0.644, "average": 0.644},
    "6": {"accuracy": 0.658, "average": 0.658},
    "7": {"accuracy": 0.517, "average": 0.517},
    "8": {"accuracy": 0.629, "average": 0.629},
    "9": {"accuracy": 0.649, "average": 0.649}
  },
  "wsc": {
    "median": 0.618,
    "interquartile_range": 0.074,
    "average_scores_for_each_prompt": [0.611, 0.556, 0.708, 0.681, 0.569, 0.639, 0.625, 0.528, 0.597, 0.653],
    "0": {"accuracy": 0.611, "average": 0.611},
    "1": {"accuracy": 0.556, "average": 0.556},
    "2": {"accuracy": 0.708, "average": 0.708},
    "3": {"accuracy": 0.681, "average": 0.681},
    "4": {"accuracy": 0.569, "average": 0.569},
    "5": {"accuracy": 0.639, "average": 0.639},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.528, "average": 0.528},
    "8": {"accuracy": 0.597, "average": 0.597},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.875,
    "interquartile_range": 0.063,
    "average_scores_for_each_prompt": [0.912, 0.838, 0.926, 0.838, 0.897, 0.853, 0.838, 0.897],
    "0": {"accuracy": 0.912, "average": 0.912},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.926, "average": 0.926},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.897, "average": 0.897},
    "5": {"accuracy": 0.853, "average": 0.853},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.427,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.427],
    "h-swag": {"accuracy": 0.427, "average": 0.427}
  },
  "story_cloze": {
    "median": 0.931,
    "interquartile_range": 0.003,
    "average_scores_for_each_prompt": [0.933, 0.931, 0.933, 0.93, 0.929],
    "0": {"accuracy": 0.933, "average": 0.933},
    "1": {"accuracy": 0.931, "average": 0.931},
    "2": {"accuracy": 0.933, "average": 0.933},
    "3": {"accuracy": 0.93, "average": 0.93},
    "4": {"accuracy": 0.929, "average": 0.929}
  },
  "anli-r1": {
    "median": 0.67,
    "interquartile_range": 0.02,
    "average_scores_for_each_prompt": [0.627, 0.677, 0.687, 0.668, 0.683, 0.68, 0.659, 0.673, 0.675, 0.62, 0.659, 0.68, 0.631, 0.665, 0.67],
    "0": {"accuracy": 0.627, "average": 0.627},
    "1": {"accuracy": 0.677, "average": 0.677},
    "2": {"accuracy": 0.687, "average": 0.687},
    "3": {"accuracy": 0.668, "average": 0.668},
    "4": {"accuracy": 0.683, "average": 0.683},
    "5": {"accuracy": 0.68, "average": 0.68},
    "6": {"accuracy": 0.659, "average": 0.659},
    "7": {"accuracy": 0.673, "average": 0.673},
    "8": {"accuracy": 0.675, "average": 0.675},
    "9": {"accuracy": 0.62, "average": 0.62},
    "10": {"accuracy": 0.659, "average": 0.659},
    "11": {"accuracy": 0.68, "average": 0.68},
    "12": {"accuracy": 0.631, "average": 0.631},
    "13": {"accuracy": 0.665, "average": 0.665},
    "14": {"accuracy": 0.67, "average": 0.67}
  },
  "anli-r2": {
    "median": 0.513,
    "interquartile_range": 0.017,
    "average_scores_for_each_prompt": [0.5, 0.521, 0.53, 0.497, 0.516, 0.52, 0.507, 0.518, 0.515, 0.49, 0.5, 0.516, 0.499, 0.509, 0.513],
    "0": {"accuracy": 0.5, "average": 0.5},
    "1": {"accuracy": 0.521, "average": 0.521},
    "2": {"accuracy": 0.53, "average": 0.53},
    "3": {"accuracy": 0.497, "average": 0.497},
    "4": {"accuracy": 0.516, "average": 0.516},
    "5": {"accuracy": 0.52, "average": 0.52},
    "6": {"accuracy": 0.507, "average": 0.507},
    "7": {"accuracy": 0.518, "average": 0.518},
    "8": {"accuracy": 0.515, "average": 0.515},
    "9": {"accuracy": 0.49, "average": 0.49},
    "10": {"accuracy": 0.5, "average": 0.5},
    "11": {"accuracy": 0.516, "average": 0.516},
    "12": {"accuracy": 0.499, "average": 0.499},
    "13": {"accuracy": 0.509, "average": 0.509},
    "14": {"accuracy": 0.513, "average": 0.513}
  },
  "anli-r3": {
    "median": 0.497,
    "interquartile_range": 0.013,
    "average_scores_for_each_prompt": [0.499, 0.507, 0.486, 0.51, 0.497, 0.501, 0.491, 0.488, 0.492, 0.487, 0.501, 0.497, 0.516, 0.507, 0.497],
    "0": {"accuracy": 0.499, "average": 0.499},
    "1": {"accuracy": 0.507, "average": 0.507},
    "2": {"accuracy": 0.486, "average": 0.486},
    "3": {"accuracy": 0.51, "average": 0.51},
    "4": {"accuracy": 0.497, "average": 0.497},
    "5": {"accuracy": 0.501, "average": 0.501},
    "6": {"accuracy": 0.491, "average": 0.491},
    "7": {"accuracy": 0.488, "average": 0.488},
    "8": {"accuracy": 0.492, "average": 0.492},
    "9": {"accuracy": 0.487, "average": 0.487},
    "10": {"accuracy": 0.501, "average": 0.501},
    "11": {"accuracy": 0.497, "average": 0.497},
    "12": {"accuracy": 0.516, "average": 0.516},
    "13": {"accuracy": 0.507, "average": 0.507},
    "14": {"accuracy": 0.497, "average": 0.497}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.681
}
{
  "rte": {
    "median": 0.796,
    "interquartile_range": 0.018,
    "average_scores_for_each_prompt": [0.808, 0.796, 0.812, 0.784, 0.808, 0.796, 0.776, 0.776, 0.796, 0.796],
    "0": {"accuracy": 0.808, "average": 0.808},
    "1": {"accuracy": 0.796, "average": 0.796},
    "2": {"accuracy": 0.812, "average": 0.812},
    "3": {"accuracy": 0.784, "average": 0.784},
    "4": {"accuracy": 0.808, "average": 0.808},
    "5": {"accuracy": 0.796, "average": 0.796},
    "6": {"accuracy": 0.776, "average": 0.776},
    "7": {"accuracy": 0.776, "average": 0.776},
    "8": {"accuracy": 0.796, "average": 0.796},
    "9": {"accuracy": 0.796, "average": 0.796}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.792, 0.792, 0.833, 0.833, 0.875, 0.875, 0.75, 0.833, 0.792, 0.875, 0.792, 0.792, 0.833, 0.833],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.875, "average": 0.875},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.792, "average": 0.792},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.792, "average": 0.792},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.833, "average": 0.833}
  },
  "winogrande": {
    "median": 0.674,
    "interquartile_range": 0.008,
    "average_scores_for_each_prompt": [0.681, 0.67, 0.678, 0.674, 0.67],
    "0": {"accuracy": 0.681, "average": 0.681},
    "1": {"accuracy": 0.67, "average": 0.67},
    "2": {"accuracy": 0.678, "average": 0.678},
    "3": {"accuracy": 0.674, "average": 0.674},
    "4": {"accuracy": 0.67, "average": 0.67}
  },
  "wic": {
    "median": 0.645,
    "interquartile_range": 0.086,
    "average_scores_for_each_prompt": [0.647, 0.65, 0.644, 0.538, 0.541, 0.649, 0.665, 0.518, 0.627, 0.645],
    "0": {"accuracy": 0.647, "average": 0.647},
    "1": {"accuracy": 0.65, "average": 0.65},
    "2": {"accuracy": 0.644, "average": 0.644},
    "3": {"accuracy": 0.538, "average": 0.538},
    "4": {"accuracy": 0.541, "average": 0.541},
    "5": {"accuracy": 0.649, "average": 0.649},
    "6": {"accuracy": 0.665, "average": 0.665},
    "7": {"accuracy": 0.518, "average": 0.518},
    "8": {"accuracy": 0.627, "average": 0.627},
    "9": {"accuracy": 0.645, "average": 0.645}
  },
  "wsc": {
    "median": 0.639,
    "interquartile_range": 0.076,
    "average_scores_for_each_prompt": [0.639, 0.556, 0.681, 0.639, 0.556, 0.653, 0.625, 0.514, 0.639, 0.653],
    "0": {"accuracy": 0.639, "average": 0.639},
    "1": {"accuracy": 0.556, "average": 0.556},
    "2": {"accuracy": 0.681, "average": 0.681},
    "3": {"accuracy": 0.639, "average": 0.639},
    "4": {"accuracy": 0.556, "average": 0.556},
    "5": {"accuracy": 0.653, "average": 0.653},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.514, "average": 0.514},
    "8": {"accuracy": 0.639, "average": 0.639},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.883,
    "interquartile_range": 0.059,
    "average_scores_for_each_prompt": [0.897, 0.838, 0.912, 0.838, 0.897, 0.868, 0.838, 0.897],
    "0": {"accuracy": 0.897, "average": 0.897},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.912, "average": 0.912},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.897, "average": 0.897},
    "5": {"accuracy": 0.868, "average": 0.868},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.426,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.426],
    "h-swag": {"accuracy": 0.426, "average": 0.426}
  },
  "story_cloze": {
    "median": 0.921,
    "interquartile_range": 0.004,
    "average_scores_for_each_prompt": [0.921, 0.921, 0.927, 0.917, 0.916],
    "0": {"accuracy": 0.921, "average": 0.921},
    "1": {"accuracy": 0.921, "average": 0.921},
    "2": {"accuracy": 0.927, "average": 0.927},
    "3": {"accuracy": 0.917, "average": 0.917},
    "4": {"accuracy": 0.916, "average": 0.916}
  },
  "anli-r1": {
    "median": 0.669,
    "interquartile_range": 0.013,
    "average_scores_for_each_prompt": [0.64, 0.673, 0.68, 0.667, 0.676, 0.674, 0.667, 0.673, 0.677, 0.638, 0.661, 0.669, 0.638, 0.659, 0.67],
    "0": {"accuracy": 0.64, "average": 0.64},
    "1": {"accuracy": 0.673, "average": 0.673},
    "2": {"accuracy": 0.68, "average": 0.68},
    "3": {"accuracy": 0.667, "average": 0.667},
    "4": {"accuracy": 0.676, "average": 0.676},
    "5": {"accuracy": 0.674, "average": 0.674},
    "6": {"accuracy": 0.667, "average": 0.667},
    "7": {"accuracy": 0.673, "average": 0.673},
    "8": {"accuracy": 0.677, "average": 0.677},
    "9": {"accuracy": 0.638, "average": 0.638},
    "10": {"accuracy": 0.661, "average": 0.661},
    "11": {"accuracy": 0.669, "average": 0.669},
    "12": {"accuracy": 0.638, "average": 0.638},
    "13": {"accuracy": 0.659, "average": 0.659},
    "14": {"accuracy": 0.67, "average": 0.67}
  },
  "anli-r2": {
    "median": 0.515,
    "interquartile_range": 0.016,
    "average_scores_for_each_prompt": [0.499, 0.515, 0.518, 0.502, 0.513, 0.515, 0.518, 0.52, 0.52, 0.505, 0.5, 0.522, 0.503, 0.502, 0.518],
    "0": {"accuracy": 0.499, "average": 0.499},
    "1": {"accuracy": 0.515, "average": 0.515},
    "2": {"accuracy": 0.518, "average": 0.518},
    "3": {"accuracy": 0.502, "average": 0.502},
    "4": {"accuracy": 0.513, "average": 0.513},
    "5": {"accuracy": 0.515, "average": 0.515},
    "6": {"accuracy": 0.518, "average": 0.518},
    "7": {"accuracy": 0.52, "average": 0.52},
    "8": {"accuracy": 0.52, "average": 0.52},
    "9": {"accuracy": 0.505, "average": 0.505},
    "10": {"accuracy": 0.5, "average": 0.5},
    "11": {"accuracy": 0.522, "average": 0.522},
    "12": {"accuracy": 0.503, "average": 0.503},
    "13": {"accuracy": 0.502, "average": 0.502},
    "14": {"accuracy": 0.518, "average": 0.518}
  },
  "anli-r3": {
    "median": 0.499,
    "interquartile_range": 0.013,
    "average_scores_for_each_prompt": [0.501, 0.504, 0.485, 0.51, 0.497, 0.506, 0.494, 0.485, 0.485, 0.491, 0.499, 0.497, 0.511, 0.511, 0.499],
    "0": {"accuracy": 0.501, "average": 0.501},
    "1": {"accuracy": 0.504, "average": 0.504},
    "2": {"accuracy": 0.485, "average": 0.485},
    "3": {"accuracy": 0.51, "average": 0.51},
    "4": {"accuracy": 0.497, "average": 0.497},
    "5": {"accuracy": 0.506, "average": 0.506},
    "6": {"accuracy": 0.494, "average": 0.494},
    "7": {"accuracy": 0.485, "average": 0.485},
    "8": {"accuracy": 0.485, "average": 0.485},
    "9": {"accuracy": 0.491, "average": 0.491},
    "10": {"accuracy": 0.499, "average": 0.499},
    "11": {"accuracy": 0.497, "average": 0.497},
    "12": {"accuracy": 0.511, "average": 0.511},
    "13": {"accuracy": 0.511, "average": 0.511},
    "14": {"accuracy": 0.499, "average": 0.499}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.682
}
{
  "rte": {
    "median": 0.784,
    "interquartile_range": 0.014,
    "average_scores_for_each_prompt": [0.788, 0.784, 0.8, 0.776, 0.804, 0.784, 0.767, 0.776, 0.792, 0.78],
    "0": {"accuracy": 0.788, "average": 0.788},
    "1": {"accuracy": 0.784, "average": 0.784},
    "2": {"accuracy": 0.8, "average": 0.8},
    "3": {"accuracy": 0.776, "average": 0.776},
    "4": {"accuracy": 0.804, "average": 0.804},
    "5": {"accuracy": 0.784, "average": 0.784},
    "6": {"accuracy": 0.767, "average": 0.767},
    "7": {"accuracy": 0.776, "average": 0.776},
    "8": {"accuracy": 0.792, "average": 0.792},
    "9": {"accuracy": 0.78, "average": 0.78}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.792, 0.792, 0.833, 0.833, 0.833, 0.875, 0.75, 0.833, 0.792, 0.833, 0.792, 0.792, 0.833, 0.833],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.833, "average": 0.833},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.792, "average": 0.792},
    "10": {"accuracy": 0.833, "average": 0.833},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.792, "average": 0.792},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.833, "average": 0.833}
  },
  "winogrande": {
    "median": 0.67,
    "interquartile_range": 0.003,
    "average_scores_for_each_prompt": [0.677, 0.669, 0.673, 0.67, 0.67],
    "0": {"accuracy": 0.677, "average": 0.677},
    "1": {"accuracy": 0.669, "average": 0.669},
    "2": {"accuracy": 0.673, "average": 0.673},
    "3": {"accuracy": 0.67, "average": 0.67},
    "4": {"accuracy": 0.67, "average": 0.67}
  },
  "wic": {
    "median": 0.639,
    "interquartile_range": 0.094,
    "average_scores_for_each_prompt": [0.652, 0.647, 0.65, 0.523, 0.531, 0.632, 0.655, 0.507, 0.632, 0.649],
    "0": {"accuracy": 0.652, "average": 0.652},
    "1": {"accuracy": 0.647, "average": 0.647},
    "2": {"accuracy": 0.65, "average": 0.65},
    "3": {"accuracy": 0.523, "average": 0.523},
    "4": {"accuracy": 0.531, "average": 0.531},
    "5": {"accuracy": 0.632, "average": 0.632},
    "6": {"accuracy": 0.655, "average": 0.655},
    "7": {"accuracy": 0.507, "average": 0.507},
    "8": {"accuracy": 0.632, "average": 0.632},
    "9": {"accuracy": 0.649, "average": 0.649}
  },
  "wsc": {
    "median": 0.639,
    "interquartile_range": 0.07,
    "average_scores_for_each_prompt": [0.625, 0.556, 0.681, 0.653, 0.569, 0.653, 0.625, 0.514, 0.653, 0.653],
    "0": {"accuracy": 0.625, "average": 0.625},
    "1": {"accuracy": 0.556, "average": 0.556},
    "2": {"accuracy": 0.681, "average": 0.681},
    "3": {"accuracy": 0.653, "average": 0.653},
    "4": {"accuracy": 0.569, "average": 0.569},
    "5": {"accuracy": 0.653, "average": 0.653},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.514, "average": 0.514},
    "8": {"accuracy": 0.653, "average": 0.653},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.875,
    "interquartile_range": 0.052,
    "average_scores_for_each_prompt": [0.897, 0.824, 0.912, 0.853, 0.912, 0.853, 0.838, 0.897],
    "0": {"accuracy": 0.897, "average": 0.897},
    "1": {"accuracy": 0.824, "average": 0.824},
    "2": {"accuracy": 0.912, "average": 0.912},
    "3": {"accuracy": 0.853, "average": 0.853},
    "4": {"accuracy": 0.912, "average": 0.912},
    "5": {"accuracy": 0.853, "average": 0.853},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.425,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.425],
    "h-swag": {"accuracy": 0.425, "average": 0.425}
  },
  "story_cloze": {
    "median": 0.92,
    "interquartile_range": 0.008,
    "average_scores_for_each_prompt": [0.922, 0.92, 0.924, 0.914, 0.912],
    "0": {"accuracy": 0.922, "average": 0.922},
    "1": {"accuracy": 0.92, "average": 0.92},
    "2": {"accuracy": 0.924, "average": 0.924},
    "3": {"accuracy": 0.914, "average": 0.914},
    "4": {"accuracy": 0.912, "average": 0.912}
  },
  "anli-r1": {
    "median": 0.665,
    "interquartile_range": 0.012,
    "average_scores_for_each_prompt": [0.634, 0.665, 0.679, 0.667, 0.672, 0.666, 0.667, 0.671, 0.672, 0.64, 0.656, 0.661, 0.637, 0.66, 0.658],
    "0": {"accuracy": 0.634, "average": 0.634},
    "1": {"accuracy": 0.665, "average": 0.665},
    "2": {"accuracy": 0.679, "average": 0.679},
    "3": {"accuracy": 0.667, "average": 0.667},
    "4": {"accuracy": 0.672, "average": 0.672},
    "5": {"accuracy": 0.666, "average": 0.666},
    "6": {"accuracy": 0.667, "average": 0.667},
    "7": {"accuracy": 0.671, "average": 0.671},
    "8": {"accuracy": 0.672, "average": 0.672},
    "9": {"accuracy": 0.64, "average": 0.64},
    "10": {"accuracy": 0.656, "average": 0.656},
    "11": {"accuracy": 0.661, "average": 0.661},
    "12": {"accuracy": 0.637, "average": 0.637},
    "13": {"accuracy": 0.66, "average": 0.66},
    "14": {"accuracy": 0.658, "average": 0.658}
  },
  "anli-r2": {
    "median": 0.51,
    "interquartile_range": 0.016,
    "average_scores_for_each_prompt": [0.497, 0.512, 0.523, 0.499, 0.504, 0.51, 0.51, 0.515, 0.516, 0.504, 0.497, 0.515, 0.494, 0.491, 0.513],
    "0": {"accuracy": 0.497, "average": 0.497},
    "1": {"accuracy": 0.512, "average": 0.512},
    "2": {"accuracy": 0.523, "average": 0.523},
    "3": {"accuracy": 0.499, "average": 0.499},
    "4": {"accuracy": 0.504, "average": 0.504},
    "5": {"accuracy": 0.51, "average": 0.51},
    "6": {"accuracy": 0.51, "average": 0.51},
    "7": {"accuracy": 0.515, "average": 0.515},
    "8": {"accuracy": 0.516, "average": 0.516},
    "9": {"accuracy": 0.504, "average": 0.504},
    "10": {"accuracy": 0.497, "average": 0.497},
    "11": {"accuracy": 0.515, "average": 0.515},
    "12": {"accuracy": 0.494, "average": 0.494},
    "13": {"accuracy": 0.491, "average": 0.491},
    "14": {"accuracy": 0.513, "average": 0.513}
  },
  "anli-r3": {
    "median": 0.499,
    "interquartile_range": 0.013,
    "average_scores_for_each_prompt": [0.503, 0.502, 0.488, 0.51, 0.495, 0.507, 0.497, 0.484, 0.483, 0.49, 0.504, 0.495, 0.512, 0.507, 0.499],
    "0": {"accuracy": 0.503, "average": 0.503},
    "1": {"accuracy": 0.502, "average": 0.502},
    "2": {"accuracy": 0.488, "average": 0.488},
    "3": {"accuracy": 0.51, "average": 0.51},
    "4": {"accuracy": 0.495, "average": 0.495},
    "5": {"accuracy": 0.507, "average": 0.507},
    "6": {"accuracy": 0.497, "average": 0.497},
    "7": {"accuracy": 0.484, "average": 0.484},
    "8": {"accuracy": 0.483, "average": 0.483},
    "9": {"accuracy": 0.49, "average": 0.49},
    "10": {"accuracy": 0.504, "average": 0.504},
    "11": {"accuracy": 0.495, "average": 0.495},
    "12": {"accuracy": 0.512, "average": 0.512},
    "13": {"accuracy": 0.507, "average": 0.507},
    "14": {"accuracy": 0.499, "average": 0.499}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.678
}
{
  "rte": {
    "median": 0.788,
    "interquartile_range": 0.023,
    "average_scores_for_each_prompt": [0.808, 0.784, 0.808, 0.784, 0.808, 0.78, 0.78, 0.78, 0.792, 0.792],
    "0": {"accuracy": 0.808, "average": 0.808},
    "1": {"accuracy": 0.784, "average": 0.784},
    "2": {"accuracy": 0.808, "average": 0.808},
    "3": {"accuracy": 0.784, "average": 0.784},
    "4": {"accuracy": 0.808, "average": 0.808},
    "5": {"accuracy": 0.78, "average": 0.78},
    "6": {"accuracy": 0.78, "average": 0.78},
    "7": {"accuracy": 0.78, "average": 0.78},
    "8": {"accuracy": 0.792, "average": 0.792},
    "9": {"accuracy": 0.792, "average": 0.792}
  },
  "cb": {
    "median": 0.792,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.792, 0.792, 0.792, 0.833, 0.792, 0.833, 0.875, 0.75, 0.833, 0.792, 0.792, 0.75, 0.75, 0.833, 0.833],
    "0": {"accuracy": 0.792, "average": 0.792},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.792, "average": 0.792},
    "5": {"accuracy": 0.833, "average": 0.833},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.792, "average": 0.792},
    "10": {"accuracy": 0.792, "average": 0.792},
    "11": {"accuracy": 0.75, "average": 0.75},
    "12": {"accuracy": 0.75, "average": 0.75},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.833, "average": 0.833}
  },
  "winogrande": {
    "median": 0.669,
    "interquartile_range": 0.005,
    "average_scores_for_each_prompt": [0.681, 0.666, 0.671, 0.661, 0.669],
    "0": {"accuracy": 0.681, "average": 0.681},
    "1": {"accuracy": 0.666, "average": 0.666},
    "2": {"accuracy": 0.671, "average": 0.671},
    "3": {"accuracy": 0.661, "average": 0.661},
    "4": {"accuracy": 0.669, "average": 0.669}
  },
  "wic": {
    "median": 0.635,
    "interquartile_range": 0.099,
    "average_scores_for_each_prompt": [0.649, 0.65, 0.642, 0.525, 0.523, 0.62, 0.655, 0.508, 0.627, 0.645],
    "0": {"accuracy": 0.649, "average": 0.649},
    "1": {"accuracy": 0.65, "average": 0.65},
    "2": {"accuracy": 0.642, "average": 0.642},
    "3": {"accuracy": 0.525, "average": 0.525},
    "4": {"accuracy": 0.523, "average": 0.523},
    "5": {"accuracy": 0.62, "average": 0.62},
    "6": {"accuracy": 0.655, "average": 0.655},
    "7": {"accuracy": 0.508, "average": 0.508},
    "8": {"accuracy": 0.627, "average": 0.627},
    "9": {"accuracy": 0.645, "average": 0.645}
  },
  "wsc": {
    "median": 0.611,
    "interquartile_range": 0.069,
    "average_scores_for_each_prompt": [0.597, 0.556, 0.694, 0.625, 0.556, 0.597, 0.625, 0.514, 0.653, 0.639],
    "0": {"accuracy": 0.597, "average": 0.597},
    "1": {"accuracy": 0.556, "average": 0.556},
    "2": {"accuracy": 0.694, "average": 0.694},
    "3": {"accuracy": 0.625, "average": 0.625},
    "4": {"accuracy": 0.556, "average": 0.556},
    "5": {"accuracy": 0.597, "average": 0.597},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.514, "average": 0.514},
    "8": {"accuracy": 0.653, "average": 0.653},
    "9": {"accuracy": 0.639, "average": 0.639}
  },
  "copa": {
    "median": 0.861,
    "interquartile_range": 0.073,
    "average_scores_for_each_prompt": [0.868, 0.809, 0.912, 0.824, 0.897, 0.853, 0.824, 0.897],
    "0": {"accuracy": 0.868, "average": 0.868},
    "1": {"accuracy": 0.809, "average": 0.809},
    "2": {"accuracy": 0.912, "average": 0.912},
    "3": {"accuracy": 0.824, "average": 0.824},
    "4": {"accuracy": 0.897, "average": 0.897},
    "5": {"accuracy": 0.853, "average": 0.853},
    "6": {"accuracy": 0.824, "average": 0.824},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.423,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.423],
    "h-swag": {"accuracy": 0.423, "average": 0.423}
  },
  "story_cloze": {
    "median": 0.911,
    "interquartile_range": 0.003,
    "average_scores_for_each_prompt": [0.911, 0.913, 0.921, 0.91, 0.909],
    "0": {"accuracy": 0.911, "average": 0.911},
    "1": {"accuracy": 0.913, "average": 0.913},
    "2": {"accuracy": 0.921, "average": 0.921},
    "3": {"accuracy": 0.91, "average": 0.91},
    "4": {"accuracy": 0.909, "average": 0.909}
  },
  "anli-r1": {
    "median": 0.671,
    "interquartile_range": 0.007,
    "average_scores_for_each_prompt": [0.639, 0.671, 0.677, 0.672, 0.677, 0.669, 0.669, 0.678, 0.674, 0.654, 0.664, 0.673, 0.651, 0.668, 0.673],
    "0": {"accuracy": 0.639, "average": 0.639},
    "1": {"accuracy": 0.671, "average": 0.671},
    "2": {"accuracy": 0.677, "average": 0.677},
    "3": {"accuracy": 0.672, "average": 0.672},
    "4": {"accuracy": 0.677, "average": 0.677},
    "5": {"accuracy": 0.669, "average": 0.669},
    "6": {"accuracy": 0.669, "average": 0.669},
    "7": {"accuracy": 0.678, "average": 0.678},
    "8": {"accuracy": 0.674, "average": 0.674},
    "9": {"accuracy": 0.654, "average": 0.654},
    "10": {"accuracy": 0.664, "average": 0.664},
    "11": {"accuracy": 0.673, "average": 0.673},
    "12": {"accuracy": 0.651, "average": 0.651},
    "13": {"accuracy": 0.668, "average": 0.668},
    "14": {"accuracy": 0.673, "average": 0.673}
  },
  "anli-r2": {
    "median": 0.515,
    "interquartile_range": 0.005,
    "average_scores_for_each_prompt": [0.499, 0.515, 0.521, 0.505, 0.516, 0.512, 0.519, 0.521, 0.517, 0.512, 0.512, 0.515, 0.511, 0.504, 0.515],
    "0": {"accuracy": 0.499, "average": 0.499},
    "1": {"accuracy": 0.515, "average": 0.515},
    "2": {"accuracy": 0.521, "average": 0.521},
    "3": {"accuracy": 0.505, "average": 0.505},
    "4": {"accuracy": 0.516, "average": 0.516},
    "5": {"accuracy": 0.512, "average": 0.512},
    "6": {"accuracy": 0.519, "average": 0.519},
    "7": {"accuracy": 0.521, "average": 0.521},
    "8": {"accuracy": 0.517, "average": 0.517},
    "9": {"accuracy": 0.512, "average": 0.512},
    "10": {"accuracy": 0.512, "average": 0.512},
    "11": {"accuracy": 0.515, "average": 0.515},
    "12": {"accuracy": 0.511, "average": 0.511},
    "13": {"accuracy": 0.504, "average": 0.504},
    "14": {"accuracy": 0.515, "average": 0.515}
  },
  "anli-r3": {
    "median": 0.503,
    "interquartile_range": 0.009,
    "average_scores_for_each_prompt": [0.501, 0.504, 0.491, 0.509, 0.501, 0.512, 0.497, 0.485, 0.5, 0.499, 0.503, 0.505, 0.521, 0.51, 0.507],
    "0": {"accuracy": 0.501, "average": 0.501},
    "1": {"accuracy": 0.504, "average": 0.504},
    "2": {"accuracy": 0.491, "average": 0.491},
    "3": {"accuracy": 0.509, "average": 0.509},
    "4": {"accuracy": 0.501, "average": 0.501},
    "5": {"accuracy": 0.512, "average": 0.512},
    "6": {"accuracy": 0.497, "average": 0.497},
    "7": {"accuracy": 0.485, "average": 0.485},
    "8": {"accuracy": 0.5, "average": 0.5},
    "9": {"accuracy": 0.499, "average": 0.499},
    "10": {"accuracy": 0.503, "average": 0.503},
    "11": {"accuracy": 0.505, "average": 0.505},
    "12": {"accuracy": 0.521, "average": 0.521},
    "13": {"accuracy": 0.51, "average": 0.51},
    "14": {"accuracy": 0.507, "average": 0.507}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.671
}
{
  "rte": {
    "median": 0.808,
    "interquartile_range": 0.019,
    "average_scores_for_each_prompt": [0.804, 0.796, 0.824, 0.8, 0.82, 0.812, 0.792, 0.784, 0.816, 0.816],
    "0": {"accuracy": 0.804, "average": 0.804},
    "1": {"accuracy": 0.796, "average": 0.796},
    "2": {"accuracy": 0.824, "average": 0.824},
    "3": {"accuracy": 0.8, "average": 0.8},
    "4": {"accuracy": 0.82, "average": 0.82},
    "5": {"accuracy": 0.812, "average": 0.812},
    "6": {"accuracy": 0.792, "average": 0.792},
    "7": {"accuracy": 0.784, "average": 0.784},
    "8": {"accuracy": 0.816, "average": 0.816},
    "9": {"accuracy": 0.816, "average": 0.816}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.042,
    "average_scores_for_each_prompt": [0.833, 0.833, 0.833, 0.833, 0.792, 0.917, 0.875, 0.792, 0.833, 0.833, 0.875, 0.792, 0.875, 0.833, 0.875],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.833, "average": 0.833},
    "2": {"accuracy": 0.833, "average": 0.833},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.792, "average": 0.792},
    "5": {"accuracy": 0.917, "average": 0.917},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.792, "average": 0.792},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.833, "average": 0.833},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.875, "average": 0.875},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.875, "average": 0.875}
  },
  "winogrande": {
    "median": 0.677,
    "interquartile_range": 0.005,
    "average_scores_for_each_prompt": [0.679, 0.67, 0.687, 0.677, 0.674],
    "0": {"accuracy": 0.679, "average": 0.679},
    "1": {"accuracy": 0.67, "average": 0.67},
    "2": {"accuracy": 0.687, "average": 0.687},
    "3": {"accuracy": 0.677, "average": 0.677},
    "4": {"accuracy": 0.674, "average": 0.674}
  },
  "wic": {
    "median": 0.639,
    "interquartile_range": 0.077,
    "average_scores_for_each_prompt": [0.637, 0.655, 0.642, 0.566, 0.564, 0.668, 0.67, 0.546, 0.622, 0.658],
    "0": {"accuracy": 0.637, "average": 0.637},
    "1": {"accuracy": 0.655, "average": 0.655},
    "2": {"accuracy": 0.642, "average": 0.642},
    "3": {"accuracy": 0.566, "average": 0.566},
    "4": {"accuracy": 0.564, "average": 0.564},
    "5": {"accuracy": 0.668, "average": 0.668},
    "6": {"accuracy": 0.67, "average": 0.67},
    "7": {"accuracy": 0.546, "average": 0.546},
    "8": {"accuracy": 0.622, "average": 0.622},
    "9": {"accuracy": 0.658, "average": 0.658}
  },
  "wsc": {
    "median": 0.625,
    "interquartile_range": 0.087,
    "average_scores_for_each_prompt": [0.583, 0.542, 0.694, 0.681, 0.556, 0.639, 0.639, 0.514, 0.611, 0.653],
    "0": {"accuracy": 0.583, "average": 0.583},
    "1": {"accuracy": 0.542, "average": 0.542},
    "2": {"accuracy": 0.694, "average": 0.694},
    "3": {"accuracy": 0.681, "average": 0.681},
    "4": {"accuracy": 0.556, "average": 0.556},
    "5": {"accuracy": 0.639, "average": 0.639},
    "6": {"accuracy": 0.639, "average": 0.639},
    "7": {"accuracy": 0.514, "average": 0.514},
    "8": {"accuracy": 0.611, "average": 0.611},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.883,
    "interquartile_range": 0.063,
    "average_scores_for_each_prompt": [0.897, 0.838, 0.912, 0.838, 0.912, 0.868, 0.838, 0.897],
    "0": {"accuracy": 0.897, "average": 0.897},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.912, "average": 0.912},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.912, "average": 0.912},
    "5": {"accuracy": 0.868, "average": 0.868},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.427,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.427],
    "h-swag": {"accuracy": 0.427, "average": 0.427}
  },
  "story_cloze": {
    "median": 0.928,
    "interquartile_range": 0.005,
    "average_scores_for_each_prompt": [0.928, 0.93, 0.934, 0.924, 0.925],
    "0": {"accuracy": 0.928, "average": 0.928},
    "1": {"accuracy": 0.93, "average": 0.93},
    "2": {"accuracy": 0.934, "average": 0.934},
    "3": {"accuracy": 0.924, "average": 0.924},
    "4": {"accuracy": 0.925, "average": 0.925}
  },
  "anli-r1": {
    "median": 0.67,
    "interquartile_range": 0.019,
    "average_scores_for_each_prompt": [0.633, 0.679, 0.68, 0.67, 0.683, 0.678, 0.665, 0.667, 0.676, 0.62, 0.659, 0.679, 0.638, 0.661, 0.684],
    "0": {"accuracy": 0.633, "average": 0.633},
    "1": {"accuracy": 0.679, "average": 0.679},
    "2": {"accuracy": 0.68, "average": 0.68},
    "3": {"accuracy": 0.67, "average": 0.67},
    "4": {"accuracy": 0.683, "average": 0.683},
    "5": {"accuracy": 0.678, "average": 0.678},
    "6": {"accuracy": 0.665, "average": 0.665},
    "7": {"accuracy": 0.667, "average": 0.667},
    "8": {"accuracy": 0.676, "average": 0.676},
    "9": {"accuracy": 0.62, "average": 0.62},
    "10": {"accuracy": 0.659, "average": 0.659},
    "11": {"accuracy": 0.679, "average": 0.679},
    "12": {"accuracy": 0.638, "average": 0.638},
    "13": {"accuracy": 0.661, "average": 0.661},
    "14": {"accuracy": 0.684, "average": 0.684}
  },
  "anli-r2": {
    "median": 0.507,
    "interquartile_range": 0.016,
    "average_scores_for_each_prompt": [0.499, 0.519, 0.518, 0.493, 0.514, 0.519, 0.505, 0.507, 0.508, 0.501, 0.492, 0.517, 0.494, 0.506, 0.516],
    "0": {"accuracy": 0.499, "average": 0.499},
    "1": {"accuracy": 0.519, "average": 0.519},
    "2": {"accuracy": 0.518, "average": 0.518},
    "3": {"accuracy": 0.493, "average": 0.493},
    "4": {"accuracy": 0.514, "average": 0.514},
    "5": {"accuracy": 0.519, "average": 0.519},
    "6": {"accuracy": 0.505, "average": 0.505},
    "7": {"accuracy": 0.507, "average": 0.507},
    "8": {"accuracy": 0.508, "average": 0.508},
    "9": {"accuracy": 0.501, "average": 0.501},
    "10": {"accuracy": 0.492, "average": 0.492},
    "11": {"accuracy": 0.517, "average": 0.517},
    "12": {"accuracy": 0.494, "average": 0.494},
    "13": {"accuracy": 0.506, "average": 0.506},
    "14": {"accuracy": 0.516, "average": 0.516}
  },
  "anli-r3": {
    "median": 0.488,
    "interquartile_range": 0.015,
    "average_scores_for_each_prompt": [0.488, 0.506, 0.474, 0.502, 0.492, 0.5, 0.486, 0.484, 0.482, 0.481, 0.5, 0.487, 0.516, 0.499, 0.487],
    "0": {"accuracy": 0.488, "average": 0.488},
    "1": {"accuracy": 0.506, "average": 0.506},
    "2": {"accuracy": 0.474, "average": 0.474},
    "3": {"accuracy": 0.502, "average": 0.502},
    "4": {"accuracy": 0.492, "average": 0.492},
    "5": {"accuracy": 0.5, "average": 0.5},
    "6": {"accuracy": 0.486, "average": 0.486},
    "7": {"accuracy": 0.484, "average": 0.484},
    "8": {"accuracy": 0.482, "average": 0.482},
    "9": {"accuracy": 0.481, "average": 0.481},
    "10": {"accuracy": 0.5, "average": 0.5},
    "11": {"accuracy": 0.487, "average": 0.487},
    "12": {"accuracy": 0.516, "average": 0.516},
    "13": {"accuracy": 0.499, "average": 0.499},
    "14": {"accuracy": 0.487, "average": 0.487}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.68
}
{
  "rte": {
    "median": 0.8,
    "interquartile_range": 0.021,
    "average_scores_for_each_prompt": [0.812, 0.796, 0.812, 0.788, 0.808, 0.8, 0.78, 0.776, 0.8, 0.812],
    "0": {"accuracy": 0.812, "average": 0.812},
    "1": {"accuracy": 0.796, "average": 0.796},
    "2": {"accuracy": 0.812, "average": 0.812},
    "3": {"accuracy": 0.788, "average": 0.788},
    "4": {"accuracy": 0.808, "average": 0.808},
    "5": {"accuracy": 0.8, "average": 0.8},
    "6": {"accuracy": 0.78, "average": 0.78},
    "7": {"accuracy": 0.776, "average": 0.776},
    "8": {"accuracy": 0.8, "average": 0.8},
    "9": {"accuracy": 0.812, "average": 0.812}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.02,
    "average_scores_for_each_prompt": [0.833, 0.833, 0.792, 0.833, 0.833, 0.917, 0.875, 0.75, 0.833, 0.833, 0.875, 0.792, 0.792, 0.833, 0.833],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.833, "average": 0.833},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.917, "average": 0.917},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.833, "average": 0.833},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.792, "average": 0.792},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.833, "average": 0.833}
  },
  "winogrande": {
    "median": 0.674,
    "interquartile_range": 0.005,
    "average_scores_for_each_prompt": [0.679, 0.67, 0.681, 0.674, 0.674],
    "0": {"accuracy": 0.679, "average": 0.679},
    "1": {"accuracy": 0.67, "average": 0.67},
    "2": {"accuracy": 0.681, "average": 0.681},
    "3": {"accuracy": 0.674, "average": 0.674},
    "4": {"accuracy": 0.674, "average": 0.674}
  },
  "wic": {
    "median": 0.639,
    "interquartile_range": 0.081,
    "average_scores_for_each_prompt": [0.642, 0.657, 0.635, 0.546, 0.558, 0.655, 0.672, 0.523, 0.614, 0.649],
    "0": {"accuracy": 0.642, "average": 0.642},
    "1": {"accuracy": 0.657, "average": 0.657},
    "2": {"accuracy": 0.635, "average": 0.635},
    "3": {"accuracy": 0.546, "average": 0.546},
    "4": {"accuracy": 0.558, "average": 0.558},
    "5": {"accuracy": 0.655, "average": 0.655},
    "6": {"accuracy": 0.672, "average": 0.672},
    "7": {"accuracy": 0.523, "average": 0.523},
    "8": {"accuracy": 0.614, "average": 0.614},
    "9": {"accuracy": 0.649, "average": 0.649}
  },
  "wsc": {
    "median": 0.618,
    "interquartile_range": 0.074,
    "average_scores_for_each_prompt": [0.611, 0.569, 0.667, 0.681, 0.569, 0.653, 0.625, 0.514, 0.611, 0.653],
    "0": {"accuracy": 0.611, "average": 0.611},
    "1": {"accuracy": 0.569, "average": 0.569},
    "2": {"accuracy": 0.667, "average": 0.667},
    "3": {"accuracy": 0.681, "average": 0.681},
    "4": {"accuracy": 0.569, "average": 0.569},
    "5": {"accuracy": 0.653, "average": 0.653},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.514, "average": 0.514},
    "8": {"accuracy": 0.611, "average": 0.611},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.883,
    "interquartile_range": 0.052,
    "average_scores_for_each_prompt": [0.897, 0.838, 0.912, 0.838, 0.912, 0.868, 0.853, 0.897],
    "0": {"accuracy": 0.897, "average": 0.897},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.912, "average": 0.912},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.912, "average": 0.912},
    "5": {"accuracy": 0.868, "average": 0.868},
    "6": {"accuracy": 0.853, "average": 0.853},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.426,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.426],
    "h-swag": {"accuracy": 0.426, "average": 0.426}
  },
  "story_cloze": {
    "median": 0.924,
    "interquartile_range": 0.005,
    "average_scores_for_each_prompt": [0.924, 0.929, 0.93, 0.921, 0.924],
    "0": {"accuracy": 0.924, "average": 0.924},
    "1": {"accuracy": 0.929, "average": 0.929},
    "2": {"accuracy": 0.93, "average": 0.93},
    "3": {"accuracy": 0.921, "average": 0.921},
    "4": {"accuracy": 0.924, "average": 0.924}
  },
  "anli-r1": {
    "median": 0.674,
    "interquartile_range": 0.016,
    "average_scores_for_each_prompt": [0.638, 0.674, 0.685, 0.669, 0.677, 0.679, 0.665, 0.674, 0.677, 0.632, 0.661, 0.676, 0.637, 0.661, 0.674],
    "0": {"accuracy": 0.638, "average": 0.638},
    "1": {"accuracy": 0.674, "average": 0.674},
    "2": {"accuracy": 0.685, "average": 0.685},
    "3": {"accuracy": 0.669, "average": 0.669},
    "4": {"accuracy": 0.677, "average": 0.677},
    "5": {"accuracy": 0.679, "average": 0.679},
    "6": {"accuracy": 0.665, "average": 0.665},
    "7": {"accuracy": 0.674, "average": 0.674},
    "8": {"accuracy": 0.677, "average": 0.677},
    "9": {"accuracy": 0.632, "average": 0.632},
    "10": {"accuracy": 0.661, "average": 0.661},
    "11": {"accuracy": 0.676, "average": 0.676},
    "12": {"accuracy": 0.637, "average": 0.637},
    "13": {"accuracy": 0.661, "average": 0.661},
    "14": {"accuracy": 0.674, "average": 0.674}
  },
  "anli-r2": {
    "median": 0.511,
    "interquartile_range": 0.013,
    "average_scores_for_each_prompt": [0.497, 0.512, 0.518, 0.498, 0.516, 0.51, 0.513, 0.516, 0.511, 0.502, 0.5, 0.519, 0.499, 0.504, 0.513],
    "0": {"accuracy": 0.497, "average": 0.497},
    "1": {"accuracy": 0.512, "average": 0.512},
    "2": {"accuracy": 0.518, "average": 0.518},
    "3": {"accuracy": 0.498, "average": 0.498},
    "4": {"accuracy": 0.516, "average": 0.516},
    "5": {"accuracy": 0.51, "average": 0.51},
    "6": {"accuracy": 0.513, "average": 0.513},
    "7": {"accuracy": 0.516, "average": 0.516},
    "8": {"accuracy": 0.511, "average": 0.511},
    "9": {"accuracy": 0.502, "average": 0.502},
    "10": {"accuracy": 0.5, "average": 0.5},
    "11": {"accuracy": 0.519, "average": 0.519},
    "12": {"accuracy": 0.499, "average": 0.499},
    "13": {"accuracy": 0.504, "average": 0.504},
    "14": {"accuracy": 0.513, "average": 0.513}
  },
  "anli-r3": {
    "median": 0.494,
    "interquartile_range": 0.018,
    "average_scores_for_each_prompt": [0.49, 0.506, 0.482, 0.51, 0.495, 0.504, 0.487, 0.488, 0.477, 0.484, 0.498, 0.494, 0.514, 0.506, 0.491],
    "0": {"accuracy": 0.49, "average": 0.49},
    "1": {"accuracy": 0.506, "average": 0.506},
    "2": {"accuracy": 0.482, "average": 0.482},
    "3": {"accuracy": 0.51, "average": 0.51},
    "4": {"accuracy": 0.495, "average": 0.495},
    "5": {"accuracy": 0.504, "average": 0.504},
    "6": {"accuracy": 0.487, "average": 0.487},
    "7": {"accuracy": 0.488, "average": 0.488},
    "8": {"accuracy": 0.477, "average": 0.477},
    "9": {"accuracy": 0.484, "average": 0.484},
    "10": {"accuracy": 0.498, "average": 0.498},
    "11": {"accuracy": 0.494, "average": 0.494},
    "12": {"accuracy": 0.514, "average": 0.514},
    "13": {"accuracy": 0.506, "average": 0.506},
    "14": {"accuracy": 0.491, "average": 0.491}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.68
}
{
  "rte": {
    "median": 0.792,
    "interquartile_range": 0.021,
    "average_scores_for_each_prompt": [0.808, 0.796, 0.812, 0.784, 0.808, 0.788, 0.776, 0.78, 0.788, 0.8],
    "0": {"accuracy": 0.808, "average": 0.808},
    "1": {"accuracy": 0.796, "average": 0.796},
    "2": {"accuracy": 0.812, "average": 0.812},
    "3": {"accuracy": 0.784, "average": 0.784},
    "4": {"accuracy": 0.808, "average": 0.808},
    "5": {"accuracy": 0.788, "average": 0.788},
    "6": {"accuracy": 0.776, "average": 0.776},
    "7": {"accuracy": 0.78, "average": 0.78},
    "8": {"accuracy": 0.788, "average": 0.788},
    "9": {"accuracy": 0.8, "average": 0.8}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.02,
    "average_scores_for_each_prompt": [0.833, 0.833, 0.792, 0.833, 0.833, 0.875, 0.875, 0.75, 0.833, 0.833, 0.875, 0.792, 0.792, 0.833, 0.833],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.833, "average": 0.833},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.875, "average": 0.875},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.833, "average": 0.833},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.792, "average": 0.792},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.833, "average": 0.833}
  },
  "winogrande": {
    "median": 0.674,
    "interquartile_range": 0.006,
    "average_scores_for_each_prompt": [0.68, 0.674, 0.679, 0.673, 0.668],
    "0": {"accuracy": 0.68, "average": 0.68},
    "1": {"accuracy": 0.674, "average": 0.674},
    "2": {"accuracy": 0.679, "average": 0.679},
    "3": {"accuracy": 0.673, "average": 0.673},
    "4": {"accuracy": 0.668, "average": 0.668}
  },
  "wic": {
    "median": 0.641,
    "interquartile_range": 0.085,
    "average_scores_for_each_prompt": [0.644, 0.649, 0.637, 0.538, 0.541, 0.647, 0.66, 0.518, 0.625, 0.645],
    "0": {"accuracy": 0.644, "average": 0.644},
    "1": {"accuracy": 0.649, "average": 0.649},
    "2": {"accuracy": 0.637, "average": 0.637},
    "3": {"accuracy": 0.538, "average": 0.538},
    "4": {"accuracy": 0.541, "average": 0.541},
    "5": {"accuracy": 0.647, "average": 0.647},
    "6": {"accuracy": 0.66, "average": 0.66},
    "7": {"accuracy": 0.518, "average": 0.518},
    "8": {"accuracy": 0.625, "average": 0.625},
    "9": {"accuracy": 0.645, "average": 0.645}
  },
  "wsc": {
    "median": 0.632,
    "interquartile_range": 0.076,
    "average_scores_for_each_prompt": [0.625, 0.556, 0.681, 0.639, 0.556, 0.653, 0.625, 0.514, 0.639, 0.653],
    "0": {"accuracy": 0.625, "average": 0.625},
    "1": {"accuracy": 0.556, "average": 0.556},
    "2": {"accuracy": 0.681, "average": 0.681},
    "3": {"accuracy": 0.639, "average": 0.639},
    "4": {"accuracy": 0.556, "average": 0.556},
    "5": {"accuracy": 0.653, "average": 0.653},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.514, "average": 0.514},
    "8": {"accuracy": 0.639, "average": 0.639},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.875,
    "interquartile_range": 0.063,
    "average_scores_for_each_prompt": [0.912, 0.838, 0.912, 0.838, 0.897, 0.853, 0.838, 0.897],
    "0": {"accuracy": 0.912, "average": 0.912},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.912, "average": 0.912},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.897, "average": 0.897},
    "5": {"accuracy": 0.853, "average": 0.853},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.425,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.425],
    "h-swag": {"accuracy": 0.425, "average": 0.425}
  },
  "story_cloze": {
    "median": 0.921,
    "interquartile_range": 0.004,
    "average_scores_for_each_prompt": [0.922, 0.921, 0.927, 0.916, 0.918],
    "0": {"accuracy": 0.922, "average": 0.922},
    "1": {"accuracy": 0.921, "average": 0.921},
    "2": {"accuracy": 0.927, "average": 0.927},
    "3": {"accuracy": 0.916, "average": 0.916},
    "4": {"accuracy": 0.918, "average": 0.918}
  },
  "anli-r1": {
    "median": 0.67,
    "interquartile_range": 0.013,
    "average_scores_for_each_prompt": [0.641, 0.673, 0.683, 0.667, 0.677, 0.674, 0.668, 0.675, 0.677, 0.637, 0.661, 0.67, 0.639, 0.663, 0.67],
    "0": {"accuracy": 0.641, "average": 0.641},
    "1": {"accuracy": 0.673, "average": 0.673},
    "2": {"accuracy": 0.683, "average": 0.683},
    "3": {"accuracy": 0.667, "average": 0.667},
    "4": {"accuracy": 0.677, "average": 0.677},
    "5": {"accuracy": 0.674, "average": 0.674},
    "6": {"accuracy": 0.668, "average": 0.668},
    "7": {"accuracy": 0.675, "average": 0.675},
    "8": {"accuracy": 0.677, "average": 0.677},
    "9": {"accuracy": 0.637, "average": 0.637},
    "10": {"accuracy": 0.661, "average": 0.661},
    "11": {"accuracy": 0.67, "average": 0.67},
    "12": {"accuracy": 0.639, "average": 0.639},
    "13": {"accuracy": 0.663, "average": 0.663},
    "14": {"accuracy": 0.67, "average": 0.67}
  },
  "anli-r2": {
    "median": 0.517,
    "interquartile_range": 0.014,
    "average_scores_for_each_prompt": [0.502, 0.517, 0.52, 0.503, 0.511, 0.517, 0.518, 0.517, 0.52, 0.508, 0.496, 0.524, 0.506, 0.502, 0.518],
    "0": {"accuracy": 0.502, "average": 0.502},
    "1": {"accuracy": 0.517, "average": 0.517},
    "2": {"accuracy": 0.52, "average": 0.52},
    "3": {"accuracy": 0.503, "average": 0.503},
    "4": {"accuracy": 0.511, "average": 0.511},
    "5": {"accuracy": 0.517, "average": 0.517},
    "6": {"accuracy": 0.518, "average": 0.518},
    "7": {"accuracy": 0.517, "average": 0.517},
    "8": {"accuracy": 0.52, "average": 0.52},
    "9": {"accuracy": 0.508, "average": 0.508},
    "10": {"accuracy": 0.496, "average": 0.496},
    "11": {"accuracy": 0.524, "average": 0.524},
    "12": {"accuracy": 0.506, "average": 0.506},
    "13": {"accuracy": 0.502, "average": 0.502},
    "14": {"accuracy": 0.518, "average": 0.518}
  },
  "anli-r3": {
    "median": 0.5,
    "interquartile_range": 0.012,
    "average_scores_for_each_prompt": [0.502, 0.5, 0.484, 0.508, 0.497, 0.508, 0.495, 0.482, 0.484, 0.492, 0.502, 0.497, 0.512, 0.51, 0.5],
    "0": {"accuracy": 0.502, "average": 0.502},
    "1": {"accuracy": 0.5, "average": 0.5},
    "2": {"accuracy": 0.484, "average": 0.484},
    "3": {"accuracy": 0.508, "average": 0.508},
    "4": {"accuracy": 0.497, "average": 0.497},
    "5": {"accuracy": 0.508, "average": 0.508},
    "6": {"accuracy": 0.495, "average": 0.495},
    "7": {"accuracy": 0.482, "average": 0.482},
    "8": {"accuracy": 0.484, "average": 0.484},
    "9": {"accuracy": 0.492, "average": 0.492},
    "10": {"accuracy": 0.502, "average": 0.502},
    "11": {"accuracy": 0.497, "average": 0.497},
    "12": {"accuracy": 0.512, "average": 0.512},
    "13": {"accuracy": 0.51, "average": 0.51},
    "14": {"accuracy": 0.5, "average": 0.5}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.68
}
{
  "rte": {
    "median": 0.794,
    "interquartile_range": 0.014,
    "average_scores_for_each_prompt": [0.8, 0.792, 0.804, 0.784, 0.812, 0.788, 0.763, 0.776, 0.796, 0.796],
    "0": {"accuracy": 0.8, "average": 0.8},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.804, "average": 0.804},
    "3": {"accuracy": 0.784, "average": 0.784},
    "4": {"accuracy": 0.812, "average": 0.812},
    "5": {"accuracy": 0.788, "average": 0.788},
    "6": {"accuracy": 0.763, "average": 0.763},
    "7": {"accuracy": 0.776, "average": 0.776},
    "8": {"accuracy": 0.796, "average": 0.796},
    "9": {"accuracy": 0.796, "average": 0.796}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.792, 0.792, 0.833, 0.833, 0.875, 0.875, 0.75, 0.833, 0.833, 0.875, 0.792, 0.792, 0.833, 0.833],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.875, "average": 0.875},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.833, "average": 0.833},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.792, "average": 0.792},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.833, "average": 0.833}
  },
  "winogrande": {
    "median": 0.671,
    "interquartile_range": 0.005,
    "average_scores_for_each_prompt": [0.682, 0.669, 0.676, 0.671, 0.671],
    "0": {"accuracy": 0.682, "average": 0.682},
    "1": {"accuracy": 0.669, "average": 0.669},
    "2": {"accuracy": 0.676, "average": 0.676},
    "3": {"accuracy": 0.671, "average": 0.671},
    "4": {"accuracy": 0.671, "average": 0.671}
  },
  "wic": {
    "median": 0.637,
    "interquartile_range": 0.088,
    "average_scores_for_each_prompt": [0.647, 0.645, 0.637, 0.53, 0.536, 0.637, 0.662, 0.513, 0.627, 0.647],
    "0": {"accuracy": 0.647, "average": 0.647},
    "1": {"accuracy": 0.645, "average": 0.645},
    "2": {"accuracy": 0.637, "average": 0.637},
    "3": {"accuracy": 0.53, "average": 0.53},
    "4": {"accuracy": 0.536, "average": 0.536},
    "5": {"accuracy": 0.637, "average": 0.637},
    "6": {"accuracy": 0.662, "average": 0.662},
    "7": {"accuracy": 0.513, "average": 0.513},
    "8": {"accuracy": 0.627, "average": 0.627},
    "9": {"accuracy": 0.647, "average": 0.647}
  },
  "wsc": {
    "median": 0.632,
    "interquartile_range": 0.076,
    "average_scores_for_each_prompt": [0.625, 0.556, 0.681, 0.639, 0.556, 0.653, 0.625, 0.5, 0.639, 0.653],
    "0": {"accuracy": 0.625, "average": 0.625},
    "1": {"accuracy": 0.556, "average": 0.556},
    "2": {"accuracy": 0.681, "average": 0.681},
    "3": {"accuracy": 0.639, "average": 0.639},
    "4": {"accuracy": 0.556, "average": 0.556},
    "5": {"accuracy": 0.653, "average": 0.653},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.5, "average": 0.5},
    "8": {"accuracy": 0.639, "average": 0.639},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.867,
    "interquartile_range": 0.052,
    "average_scores_for_each_prompt": [0.882, 0.838, 0.912, 0.853, 0.912, 0.853, 0.838, 0.897],
    "0": {"accuracy": 0.882, "average": 0.882},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.912, "average": 0.912},
    "3": {"accuracy": 0.853, "average": 0.853},
    "4": {"accuracy": 0.912, "average": 0.912},
    "5": {"accuracy": 0.853, "average": 0.853},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.425,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.425],
    "h-swag": {"accuracy": 0.425, "average": 0.425}
  },
  "story_cloze": {
    "median": 0.922,
    "interquartile_range": 0.006,
    "average_scores_for_each_prompt": [0.922, 0.922, 0.928, 0.915, 0.916],
    "0": {"accuracy": 0.922, "average": 0.922},
    "1": {"accuracy": 0.922, "average": 0.922},
    "2": {"accuracy": 0.928, "average": 0.928},
    "3": {"accuracy": 0.915, "average": 0.915},
    "4": {"accuracy": 0.916, "average": 0.916}
  },
  "anli-r1": {
    "median": 0.667,
    "interquartile_range": 0.014,
    "average_scores_for_each_prompt": [0.637, 0.667, 0.675, 0.669, 0.675, 0.67, 0.665, 0.672, 0.673, 0.634, 0.654, 0.663, 0.635, 0.661, 0.667],
    "0": {"accuracy": 0.637, "average": 0.637},
    "1": {"accuracy": 0.667, "average": 0.667},
    "2": {"accuracy": 0.675, "average": 0.675},
    "3": {"accuracy": 0.669, "average": 0.669},
    "4": {"accuracy": 0.675, "average": 0.675},
    "5": {"accuracy": 0.67, "average": 0.67},
    "6": {"accuracy": 0.665, "average": 0.665},
    "7": {"accuracy": 0.672, "average": 0.672},
    "8": {"accuracy": 0.673, "average": 0.673},
    "9": {"accuracy": 0.634, "average": 0.634},
    "10": {"accuracy": 0.654, "average": 0.654},
    "11": {"accuracy": 0.663, "average": 0.663},
    "12": {"accuracy": 0.635, "average": 0.635},
    "13": {"accuracy": 0.661, "average": 0.661},
    "14": {"accuracy": 0.667, "average": 0.667}
  },
  "anli-r2": {
    "median": 0.512,
    "interquartile_range": 0.016,
    "average_scores_for_each_prompt": [0.502, 0.515, 0.523, 0.498, 0.512, 0.513, 0.518, 0.521, 0.519, 0.503, 0.495, 0.517, 0.501, 0.495, 0.512],
    "0": {"accuracy": 0.502, "average": 0.502},
    "1": {"accuracy": 0.515, "average": 0.515},
    "2": {"accuracy": 0.523, "average": 0.523},
    "3": {"accuracy": 0.498, "average": 0.498},
    "4": {"accuracy": 0.512, "average": 0.512},
    "5": {"accuracy": 0.513, "average": 0.513},
    "6": {"accuracy": 0.518, "average": 0.518},
    "7": {"accuracy": 0.521, "average": 0.521},
    "8": {"accuracy": 0.519, "average": 0.519},
    "9": {"accuracy": 0.503, "average": 0.503},
    "10": {"accuracy": 0.495, "average": 0.495},
    "11": {"accuracy": 0.517, "average": 0.517},
    "12": {"accuracy": 0.501, "average": 0.501},
    "13": {"accuracy": 0.495, "average": 0.495},
    "14": {"accuracy": 0.512, "average": 0.512}
  },
  "anli-r3": {
    "median": 0.502,
    "interquartile_range": 0.008,
    "average_scores_for_each_prompt": [0.504, 0.502, 0.484, 0.512, 0.496, 0.502, 0.497, 0.482, 0.485, 0.495, 0.502, 0.5, 0.512, 0.507, 0.502],
    "0": {"accuracy": 0.504, "average": 0.504},
    "1": {"accuracy": 0.502, "average": 0.502},
    "2": {"accuracy": 0.484, "average": 0.484},
    "3": {"accuracy": 0.512, "average": 0.512},
    "4": {"accuracy": 0.496, "average": 0.496},
    "5": {"accuracy": 0.502, "average": 0.502},
    "6": {"accuracy": 0.497, "average": 0.497},
    "7": {"accuracy": 0.482, "average": 0.482},
    "8": {"accuracy": 0.485, "average": 0.485},
    "9": {"accuracy": 0.495, "average": 0.495},
    "10": {"accuracy": 0.502, "average": 0.502},
    "11": {"accuracy": 0.5, "average": 0.5},
    "12": {"accuracy": 0.512, "average": 0.512},
    "13": {"accuracy": 0.507, "average": 0.507},
    "14": {"accuracy": 0.502, "average": 0.502}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.678
}
{
  "rte": {
    "median": 0.796,
    "interquartile_range": 0.025,
    "average_scores_for_each_prompt": [0.808, 0.796, 0.812, 0.78, 0.816, 0.796, 0.78, 0.776, 0.788, 0.804],
    "0": {"accuracy": 0.808, "average": 0.808},
    "1": {"accuracy": 0.796, "average": 0.796},
    "2": {"accuracy": 0.812, "average": 0.812},
    "3": {"accuracy": 0.78, "average": 0.78},
    "4": {"accuracy": 0.816, "average": 0.816},
    "5": {"accuracy": 0.796, "average": 0.796},
    "6": {"accuracy": 0.78, "average": 0.78},
    "7": {"accuracy": 0.776, "average": 0.776},
    "8": {"accuracy": 0.788, "average": 0.788},
    "9": {"accuracy": 0.804, "average": 0.804}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.02,
    "average_scores_for_each_prompt": [0.833, 0.833, 0.792, 0.833, 0.833, 0.875, 0.875, 0.75, 0.833, 0.833, 0.875, 0.792, 0.792, 0.833, 0.833],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.833, "average": 0.833},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.875, "average": 0.875},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.833, "average": 0.833},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.792, "average": 0.792},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.833, "average": 0.833}
  },
  "winogrande": {
    "median": 0.67,
    "interquartile_range": 0.004,
    "average_scores_for_each_prompt": [0.676, 0.67, 0.672, 0.664, 0.668],
    "0": {"accuracy": 0.676, "average": 0.676},
    "1": {"accuracy": 0.67, "average": 0.67},
    "2": {"accuracy": 0.672, "average": 0.672},
    "3": {"accuracy": 0.664, "average": 0.664},
    "4": {"accuracy": 0.668, "average": 0.668}
  },
  "wic": {
    "median": 0.645,
    "interquartile_range": 0.089,
    "average_scores_for_each_prompt": [0.649, 0.65, 0.644, 0.533, 0.538, 0.645, 0.66, 0.515, 0.625, 0.647],
    "0": {"accuracy": 0.649, "average": 0.649},
    "1": {"accuracy": 0.65, "average": 0.65},
    "2": {"accuracy": 0.644, "average": 0.644},
    "3": {"accuracy": 0.533, "average": 0.533},
    "4": {"accuracy": 0.538, "average": 0.538},
    "5": {"accuracy": 0.645, "average": 0.645},
    "6": {"accuracy": 0.66, "average": 0.66},
    "7": {"accuracy": 0.515, "average": 0.515},
    "8": {"accuracy": 0.625, "average": 0.625},
    "9": {"accuracy": 0.647, "average": 0.647}
  },
  "wsc": {
    "median": 0.625,
    "interquartile_range": 0.069,
    "average_scores_for_each_prompt": [0.625, 0.556, 0.681, 0.639, 0.556, 0.639, 0.625, 0.514, 0.611, 0.653],
    "0": {"accuracy": 0.625, "average": 0.625},
    "1": {"accuracy": 0.556, "average": 0.556},
    "2": {"accuracy": 0.681, "average": 0.681},
    "3": {"accuracy": 0.639, "average": 0.639},
    "4": {"accuracy": 0.556, "average": 0.556},
    "5": {"accuracy": 0.639, "average": 0.639},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.514, "average": 0.514},
    "8": {"accuracy": 0.611, "average": 0.611},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.867,
    "interquartile_range": 0.048,
    "average_scores_for_each_prompt": [0.882, 0.838, 0.912, 0.838, 0.882, 0.853, 0.838, 0.897],
    "0": {"accuracy": 0.882, "average": 0.882},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.912, "average": 0.912},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.882, "average": 0.882},
    "5": {"accuracy": 0.853, "average": 0.853},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.426,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.426],
    "h-swag": {"accuracy": 0.426, "average": 0.426}
  },
  "story_cloze": {
    "median": 0.921,
    "interquartile_range": 0.004,
    "average_scores_for_each_prompt": [0.922, 0.921, 0.929, 0.916, 0.918],
    "0": {"accuracy": 0.922, "average": 0.922},
    "1": {"accuracy": 0.921, "average": 0.921},
    "2": {"accuracy": 0.929, "average": 0.929},
    "3": {"accuracy": 0.916, "average": 0.916},
    "4": {"accuracy": 0.918, "average": 0.918}
  },
  "anli-r1": {
    "median": 0.672,
    "interquartile_range": 0.013,
    "average_scores_for_each_prompt": [0.642, 0.675, 0.685, 0.667, 0.676, 0.675, 0.666, 0.676, 0.677, 0.635, 0.662, 0.672, 0.641, 0.663, 0.673],
    "0": {"accuracy": 0.642, "average": 0.642},
    "1": {"accuracy": 0.675, "average": 0.675},
    "2": {"accuracy": 0.685, "average": 0.685},
    "3": {"accuracy": 0.667, "average": 0.667},
    "4": {"accuracy": 0.676, "average": 0.676},
    "5": {"accuracy": 0.675, "average": 0.675},
    "6": {"accuracy": 0.666, "average": 0.666},
    "7": {"accuracy": 0.676, "average": 0.676},
    "8": {"accuracy": 0.677, "average": 0.677},
    "9": {"accuracy": 0.635, "average": 0.635},
    "10": {"accuracy": 0.662, "average": 0.662},
    "11": {"accuracy": 0.672, "average": 0.672},
    "12": {"accuracy": 0.641, "average": 0.641},
    "13": {"accuracy": 0.663, "average": 0.663},
    "14": {"accuracy": 0.673, "average": 0.673}
  },
  "anli-r2": {
    "median": 0.514,
    "interquartile_range": 0.017,
    "average_scores_for_each_prompt": [0.502, 0.514, 0.519, 0.502, 0.513, 0.517, 0.517, 0.52, 0.52, 0.504, 0.5, 0.524, 0.505, 0.501, 0.52],
    "0": {"accuracy": 0.502, "average": 0.502},
    "1": {"accuracy": 0.514, "average": 0.514},
    "2": {"accuracy": 0.519, "average": 0.519},
    "3": {"accuracy": 0.502, "average": 0.502},
    "4": {"accuracy": 0.513, "average": 0.513},
    "5": {"accuracy": 0.517, "average": 0.517},
    "6": {"accuracy": 0.517, "average": 0.517},
    "7": {"accuracy": 0.52, "average": 0.52},
    "8": {"accuracy": 0.52, "average": 0.52},
    "9": {"accuracy": 0.504, "average": 0.504},
    "10": {"accuracy": 0.5, "average": 0.5},
    "11": {"accuracy": 0.524, "average": 0.524},
    "12": {"accuracy": 0.505, "average": 0.505},
    "13": {"accuracy": 0.501, "average": 0.501},
    "14": {"accuracy": 0.52, "average": 0.52}
  },
  "anli-r3": {
    "median": 0.5,
    "interquartile_range": 0.015,
    "average_scores_for_each_prompt": [0.503, 0.502, 0.484, 0.508, 0.501, 0.506, 0.49, 0.488, 0.482, 0.487, 0.5, 0.497, 0.512, 0.512, 0.496],
    "0": {"accuracy": 0.503, "average": 0.503},
    "1": {"accuracy": 0.502, "average": 0.502},
    "2": {"accuracy": 0.484, "average": 0.484},
    "3": {"accuracy": 0.508, "average": 0.508},
    "4": {"accuracy": 0.501, "average": 0.501},
    "5": {"accuracy": 0.506, "average": 0.506},
    "6": {"accuracy": 0.49, "average": 0.49},
    "7": {"accuracy": 0.488, "average": 0.488},
    "8": {"accuracy": 0.482, "average": 0.482},
    "9": {"accuracy": 0.487, "average": 0.487},
    "10": {"accuracy": 0.5, "average": 0.5},
    "11": {"accuracy": 0.497, "average": 0.497},
    "12": {"accuracy": 0.512, "average": 0.512},
    "13": {"accuracy": 0.512, "average": 0.512},
    "14": {"accuracy": 0.496, "average": 0.496}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.679
}
{
  "rte": {
    "median": 0.788,
    "interquartile_range": 0.018,
    "average_scores_for_each_prompt": [0.812, 0.796, 0.812, 0.784, 0.804, 0.792, 0.771, 0.776, 0.784, 0.784],
    "0": {"accuracy": 0.812, "average": 0.812},
    "1": {"accuracy": 0.796, "average": 0.796},
    "2": {"accuracy": 0.812, "average": 0.812},
    "3": {"accuracy": 0.784, "average": 0.784},
    "4": {"accuracy": 0.804, "average": 0.804},
    "5": {"accuracy": 0.792, "average": 0.792},
    "6": {"accuracy": 0.771, "average": 0.771},
    "7": {"accuracy": 0.776, "average": 0.776},
    "8": {"accuracy": 0.784, "average": 0.784},
    "9": {"accuracy": 0.784, "average": 0.784}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.792, 0.792, 0.833, 0.833, 0.875, 0.875, 0.75, 0.833, 0.792, 0.833, 0.792, 0.792, 0.833, 0.792],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.875, "average": 0.875},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.792, "average": 0.792},
    "10": {"accuracy": 0.833, "average": 0.833},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.792, "average": 0.792},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.792, "average": 0.792}
  },
  "winogrande": {
    "median": 0.676,
    "interquartile_range": 0.009,
    "average_scores_for_each_prompt": [0.691, 0.671, 0.683, 0.676, 0.674],
    "0": {"accuracy": 0.691, "average": 0.691},
    "1": {"accuracy": 0.671, "average": 0.671},
    "2": {"accuracy": 0.683, "average": 0.683},
    "3": {"accuracy": 0.676, "average": 0.676},
    "4": {"accuracy": 0.674, "average": 0.674}
  },
  "wic": {
    "median": 0.641,
    "interquartile_range": 0.083,
    "average_scores_for_each_prompt": [0.644, 0.645, 0.644, 0.531, 0.54, 0.639, 0.657, 0.52, 0.629, 0.649],
    "0": {"accuracy": 0.644, "average": 0.644},
    "1": {"accuracy": 0.645, "average": 0.645},
    "2": {"accuracy": 0.644, "average": 0.644},
    "3": {"accuracy": 0.531, "average": 0.531},
    "4": {"accuracy": 0.54, "average": 0.54},
    "5": {"accuracy": 0.639, "average": 0.639},
    "6": {"accuracy": 0.657, "average": 0.657},
    "7": {"accuracy": 0.52, "average": 0.52},
    "8": {"accuracy": 0.629, "average": 0.629},
    "9": {"accuracy": 0.649, "average": 0.649}
  },
  "wsc": {
    "median": 0.632,
    "interquartile_range": 0.08,
    "average_scores_for_each_prompt": [0.639, 0.542, 0.681, 0.639, 0.556, 0.611, 0.625, 0.528, 0.653, 0.653],
    "0": {"accuracy": 0.639, "average": 0.639},
    "1": {"accuracy": 0.542, "average": 0.542},
    "2": {"accuracy": 0.681, "average": 0.681},
    "3": {"accuracy": 0.639, "average": 0.639},
    "4": {"accuracy": 0.556, "average": 0.556},
    "5": {"accuracy": 0.611, "average": 0.611},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.528, "average": 0.528},
    "8": {"accuracy": 0.653, "average": 0.653},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.861,
    "interquartile_range": 0.052,
    "average_scores_for_each_prompt": [0.868, 0.838, 0.912, 0.853, 0.912, 0.853, 0.824, 0.897],
    "0": {"accuracy": 0.868, "average": 0.868},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.912, "average": 0.912},
    "3": {"accuracy": 0.853, "average": 0.853},
    "4": {"accuracy": 0.912, "average": 0.912},
    "5": {"accuracy": 0.853, "average": 0.853},
    "6": {"accuracy": 0.824, "average": 0.824},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.425,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.425],
    "h-swag": {"accuracy": 0.425, "average": 0.425}
  },
  "story_cloze": {
    "median": 0.921,
    "interquartile_range": 0.003,
    "average_scores_for_each_prompt": [0.921, 0.921, 0.927, 0.917, 0.918],
    "0": {"accuracy": 0.921, "average": 0.921},
    "1": {"accuracy": 0.921, "average": 0.921},
    "2": {"accuracy": 0.927, "average": 0.927},
    "3": {"accuracy": 0.917, "average": 0.917},
    "4": {"accuracy": 0.918, "average": 0.918}
  },
  "anli-r1": {
    "median": 0.664,
    "interquartile_range": 0.015,
    "average_scores_for_each_prompt": [0.642, 0.664, 0.678, 0.666, 0.675, 0.669, 0.663, 0.675, 0.674, 0.631, 0.655, 0.664, 0.632, 0.657, 0.669],
    "0": {"accuracy": 0.642, "average": 0.642},
    "1": {"accuracy": 0.664, "average": 0.664},
    "2": {"accuracy": 0.678, "average": 0.678},
    "3": {"accuracy": 0.666, "average": 0.666},
    "4": {"accuracy": 0.675, "average": 0.675},
    "5": {"accuracy": 0.669, "average": 0.669},
    "6": {"accuracy": 0.663, "average": 0.663},
    "7": {"accuracy": 0.675, "average": 0.675},
    "8": {"accuracy": 0.674, "average": 0.674},
    "9": {"accuracy": 0.631, "average": 0.631},
    "10": {"accuracy": 0.655, "average": 0.655},
    "11": {"accuracy": 0.664, "average": 0.664},
    "12": {"accuracy": 0.632, "average": 0.632},
    "13": {"accuracy": 0.657, "average": 0.657},
    "14": {"accuracy": 0.669, "average": 0.669}
  },
  "anli-r2": {
    "median": 0.514,
    "interquartile_range": 0.016,
    "average_scores_for_each_prompt": [0.497, 0.52, 0.518, 0.504, 0.513, 0.521, 0.517, 0.522, 0.517, 0.506, 0.5, 0.517, 0.5, 0.498, 0.514],
    "0": {"accuracy": 0.497, "average": 0.497},
    "1": {"accuracy": 0.52, "average": 0.52},
    "2": {"accuracy": 0.518, "average": 0.518},
    "3": {"accuracy": 0.504, "average": 0.504},
    "4": {"accuracy": 0.513, "average": 0.513},
    "5": {"accuracy": 0.521, "average": 0.521},
    "6": {"accuracy": 0.517, "average": 0.517},
    "7": {"accuracy": 0.522, "average": 0.522},
    "8": {"accuracy": 0.517, "average": 0.517},
    "9": {"accuracy": 0.506, "average": 0.506},
    "10": {"accuracy": 0.5, "average": 0.5},
    "11": {"accuracy": 0.517, "average": 0.517},
    "12": {"accuracy": 0.5, "average": 0.5},
    "13": {"accuracy": 0.498, "average": 0.498},
    "14": {"accuracy": 0.514, "average": 0.514}
  },
  "anli-r3": {
    "median": 0.497,
    "interquartile_range": 0.008,
    "average_scores_for_each_prompt": [0.5, 0.502, 0.486, 0.508, 0.497, 0.502, 0.495, 0.477, 0.484, 0.494, 0.498, 0.494, 0.514, 0.505, 0.497],
    "0": {"accuracy": 0.5, "average": 0.5},
    "1": {"accuracy": 0.502, "average": 0.502},
    "2": {"accuracy": 0.486, "average": 0.486},
    "3": {"accuracy": 0.508, "average": 0.508},
    "4": {"accuracy": 0.497, "average": 0.497},
    "5": {"accuracy": 0.502, "average": 0.502},
    "6": {"accuracy": 0.495, "average": 0.495},
    "7": {"accuracy": 0.477, "average": 0.477},
    "8": {"accuracy": 0.484, "average": 0.484},
    "9": {"accuracy": 0.494, "average": 0.494},
    "10": {"accuracy": 0.498, "average": 0.498},
    "11": {"accuracy": 0.494, "average": 0.494},
    "12": {"accuracy": 0.514, "average": 0.514},
    "13": {"accuracy": 0.505, "average": 0.505},
    "14": {"accuracy": 0.497, "average": 0.497}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.677
}
{
  "rte": {
    "median": 0.794,
    "interquartile_range": 0.02,
    "average_scores_for_each_prompt": [0.808, 0.796, 0.808, 0.784, 0.808, 0.792, 0.78, 0.776, 0.792, 0.8],
    "0": {"accuracy": 0.808, "average": 0.808},
    "1": {"accuracy": 0.796, "average": 0.796},
    "2": {"accuracy": 0.808, "average": 0.808},
    "3": {"accuracy": 0.784, "average": 0.784},
    "4": {"accuracy": 0.808, "average": 0.808},
    "5": {"accuracy": 0.792, "average": 0.792},
    "6": {"accuracy": 0.78, "average": 0.78},
    "7": {"accuracy": 0.776, "average": 0.776},
    "8": {"accuracy": 0.792, "average": 0.792},
    "9": {"accuracy": 0.8, "average": 0.8}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.02,
    "average_scores_for_each_prompt": [0.833, 0.833, 0.792, 0.833, 0.833, 0.875, 0.875, 0.75, 0.833, 0.833, 0.875, 0.792, 0.792, 0.833, 0.833],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.833, "average": 0.833},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.875, "average": 0.875},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.833, "average": 0.833},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.792, "average": 0.792},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.833, "average": 0.833}
  },
  "winogrande": {
    "median": 0.672,
    "interquartile_range": 0.007,
    "average_scores_for_each_prompt": [0.681, 0.672, 0.679, 0.672, 0.67],
    "0": {"accuracy": 0.681, "average": 0.681},
    "1": {"accuracy": 0.672, "average": 0.672},
    "2": {"accuracy": 0.679, "average": 0.679},
    "3": {"accuracy": 0.672, "average": 0.672},
    "4": {"accuracy": 0.67, "average": 0.67}
  },
  "wic": {
    "median": 0.643,
    "interquartile_range": 0.085,
    "average_scores_for_each_prompt": [0.645, 0.653, 0.642, 0.535, 0.54, 0.645, 0.66, 0.518, 0.625, 0.647],
    "0": {"accuracy": 0.645, "average": 0.645},
    "1": {"accuracy": 0.653, "average": 0.653},
    "2": {"accuracy": 0.642, "average": 0.642},
    "3": {"accuracy": 0.535, "average": 0.535},
    "4": {"accuracy": 0.54, "average": 0.54},
    "5": {"accuracy": 0.645, "average": 0.645},
    "6": {"accuracy": 0.66, "average": 0.66},
    "7": {"accuracy": 0.518, "average": 0.518},
    "8": {"accuracy": 0.625, "average": 0.625},
    "9": {"accuracy": 0.647, "average": 0.647}
  },
  "wsc": {
    "median": 0.639,
    "interquartile_range": 0.076,
    "average_scores_for_each_prompt": [0.639, 0.556, 0.681, 0.639, 0.556, 0.653, 0.625, 0.514, 0.639, 0.653],
    "0": {"accuracy": 0.639, "average": 0.639},
    "1": {"accuracy": 0.556, "average": 0.556},
    "2": {"accuracy": 0.681, "average": 0.681},
    "3": {"accuracy": 0.639, "average": 0.639},
    "4": {"accuracy": 0.556, "average": 0.556},
    "5": {"accuracy": 0.653, "average": 0.653},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.514, "average": 0.514},
    "8": {"accuracy": 0.639, "average": 0.639},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.875,
    "interquartile_range": 0.059,
    "average_scores_for_each_prompt": [0.897, 0.838, 0.912, 0.838, 0.897, 0.853, 0.838, 0.897],
    "0": {"accuracy": 0.897, "average": 0.897},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.912, "average": 0.912},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.897, "average": 0.897},
    "5": {"accuracy": 0.853, "average": 0.853},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.426,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.426],
    "h-swag": {"accuracy": 0.426, "average": 0.426}
  },
  "story_cloze": {
    "median": 0.921,
    "interquartile_range": 0.004,
    "average_scores_for_each_prompt": [0.921, 0.922, 0.926, 0.915, 0.918],
    "0": {"accuracy": 0.921, "average": 0.921},
    "1": {"accuracy": 0.922, "average": 0.922},
    "2": {"accuracy": 0.926, "average": 0.926},
    "3": {"accuracy": 0.915, "average": 0.915},
    "4": {"accuracy": 0.918, "average": 0.918}
  },
  "anli-r1": {
    "median": 0.668,
    "interquartile_range": 0.013,
    "average_scores_for_each_prompt": [0.642, 0.673, 0.678, 0.668, 0.678, 0.674, 0.666, 0.674, 0.675, 0.636, 0.661, 0.668, 0.637, 0.661, 0.671],
    "0": {"accuracy": 0.642, "average": 0.642},
    "1": {"accuracy": 0.673, "average": 0.673},
    "2": {"accuracy": 0.678, "average": 0.678},
    "3": {"accuracy": 0.668, "average": 0.668},
    "4": {"accuracy": 0.678, "average": 0.678},
    "5": {"accuracy": 0.674, "average": 0.674},
    "6": {"accuracy": 0.666, "average": 0.666},
    "7": {"accuracy": 0.674, "average": 0.674},
    "8": {"accuracy": 0.675, "average": 0.675},
    "9": {"accuracy": 0.636, "average": 0.636},
    "10": {"accuracy": 0.661, "average": 0.661},
    "11": {"accuracy": 0.668, "average": 0.668},
    "12": {"accuracy": 0.637, "average": 0.637},
    "13": {"accuracy": 0.661, "average": 0.661},
    "14": {"accuracy": 0.671, "average": 0.671}
  },
  "anli-r2": {
    "median": 0.515,
    "interquartile_range": 0.017,
    "average_scores_for_each_prompt": [0.5, 0.515, 0.519, 0.501, 0.515, 0.519, 0.521, 0.519, 0.52, 0.505, 0.499, 0.524, 0.503, 0.501, 0.516],
    "0": {"accuracy": 0.5, "average": 0.5},
    "1": {"accuracy": 0.515, "average": 0.515},
    "2": {"accuracy": 0.519, "average": 0.519},
    "3": {"accuracy": 0.501, "average": 0.501},
    "4": {"accuracy": 0.515, "average": 0.515},
    "5": {"accuracy": 0.519, "average": 0.519},
    "6": {"accuracy": 0.521, "average": 0.521},
    "7": {"accuracy": 0.519, "average": 0.519},
    "8": {"accuracy": 0.52, "average": 0.52},
    "9": {"accuracy": 0.505, "average": 0.505},
    "10": {"accuracy": 0.499, "average": 0.499},
    "11": {"accuracy": 0.524, "average": 0.524},
    "12": {"accuracy": 0.503, "average": 0.503},
    "13": {"accuracy": 0.501, "average": 0.501},
    "14": {"accuracy": 0.516, "average": 0.516}
  },
  "anli-r3": {
    "median": 0.499,
    "interquartile_range": 0.011,
    "average_scores_for_each_prompt": [0.502, 0.504, 0.484, 0.507, 0.497, 0.505, 0.495, 0.484, 0.484, 0.491, 0.499, 0.498, 0.51, 0.511, 0.502],
    "0": {"accuracy": 0.502, "average": 0.502},
    "1": {"accuracy": 0.504, "average": 0.504},
    "2": {"accuracy": 0.484, "average": 0.484},
    "3": {"accuracy": 0.507, "average": 0.507},
    "4": {"accuracy": 0.497, "average": 0.497},
    "5": {"accuracy": 0.505, "average": 0.505},
    "6": {"accuracy": 0.495, "average": 0.495},
    "7": {"accuracy": 0.484, "average": 0.484},
    "8": {"accuracy": 0.484, "average": 0.484},
    "9": {"accuracy": 0.491, "average": 0.491},
    "10": {"accuracy": 0.499, "average": 0.499},
    "11": {"accuracy": 0.498, "average": 0.498},
    "12": {"accuracy": 0.51, "average": 0.51},
    "13": {"accuracy": 0.511, "average": 0.511},
    "14": {"accuracy": 0.502, "average": 0.502}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.68
}
{
  "rte": {
    "median": 0.786,
    "interquartile_range": 0.02,
    "average_scores_for_each_prompt": [0.808, 0.792, 0.804, 0.78, 0.804, 0.784, 0.776, 0.776, 0.784, 0.788],
    "0": {"accuracy": 0.808, "average": 0.808},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.804, "average": 0.804},
    "3": {"accuracy": 0.78, "average": 0.78},
    "4": {"accuracy": 0.804, "average": 0.804},
    "5": {"accuracy": 0.784, "average": 0.784},
    "6": {"accuracy": 0.776, "average": 0.776},
    "7": {"accuracy": 0.776, "average": 0.776},
    "8": {"accuracy": 0.784, "average": 0.784},
    "9": {"accuracy": 0.788, "average": 0.788}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.792, 0.792, 0.833, 0.833, 0.833, 0.833, 0.75, 0.833, 0.833, 0.875, 0.75, 0.792, 0.833, 0.792],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.833, "average": 0.833},
    "6": {"accuracy": 0.833, "average": 0.833},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.833, "average": 0.833},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.75, "average": 0.75},
    "12": {"accuracy": 0.792, "average": 0.792},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.792, "average": 0.792}
  },
  "winogrande": {
    "median": 0.669,
    "interquartile_range": 0.009,
    "average_scores_for_each_prompt": [0.68, 0.665, 0.674, 0.665, 0.669],
    "0": {"accuracy": 0.68, "average": 0.68},
    "1": {"accuracy": 0.665, "average": 0.665},
    "2": {"accuracy": 0.674, "average": 0.674},
    "3": {"accuracy": 0.665, "average": 0.665},
    "4": {"accuracy": 0.669, "average": 0.669}
  },
  "wic": {
    "median": 0.655,
    "interquartile_range": 0.084,
    "average_scores_for_each_prompt": [0.668, 0.667, 0.652, 0.558, 0.563, 0.673, 0.673, 0.538, 0.645, 0.657],
    "0": {"accuracy": 0.668, "average": 0.668},
    "1": {"accuracy": 0.667, "average": 0.667},
    "2": {"accuracy": 0.652, "average": 0.652},
    "3": {"accuracy": 0.558, "average": 0.558},
    "4": {"accuracy": 0.563, "average": 0.563},
    "5": {"accuracy": 0.673, "average": 0.673},
    "6": {"accuracy": 0.673, "average": 0.673},
    "7": {"accuracy": 0.538, "average": 0.538},
    "8": {"accuracy": 0.645, "average": 0.645},
    "9": {"accuracy": 0.657, "average": 0.657}
  },
  "wsc": {
    "median": 0.632,
    "interquartile_range": 0.083,
    "average_scores_for_each_prompt": [0.611, 0.556, 0.667, 0.653, 0.542, 0.639, 0.625, 0.5, 0.653, 0.653],
    "0": {"accuracy": 0.611, "average": 0.611},
    "1": {"accuracy": 0.556, "average": 0.556},
    "2": {"accuracy": 0.667, "average": 0.667},
    "3": {"accuracy": 0.653, "average": 0.653},
    "4": {"accuracy": 0.542, "average": 0.542},
    "5": {"accuracy": 0.639, "average": 0.639},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.5, "average": 0.5},
    "8": {"accuracy": 0.653, "average": 0.653},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.883,
    "interquartile_range": 0.059,
    "average_scores_for_each_prompt": [0.897, 0.838, 0.912, 0.838, 0.897, 0.868, 0.838, 0.897],
    "0": {"accuracy": 0.897, "average": 0.897},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.912, "average": 0.912},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.897, "average": 0.897},
    "5": {"accuracy": 0.868, "average": 0.868},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.424,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.424],
    "h-swag": {"accuracy": 0.424, "average": 0.424}
  },
  "story_cloze": {
    "median": 0.916,
    "interquartile_range": 0.002,
    "average_scores_for_each_prompt": [0.916, 0.916, 0.923, 0.912, 0.914],
    "0": {"accuracy": 0.916, "average": 0.916},
    "1": {"accuracy": 0.916, "average": 0.916},
    "2": {"accuracy": 0.923, "average": 0.923},
    "3": {"accuracy": 0.912, "average": 0.912},
    "4": {"accuracy": 0.914, "average": 0.914}
  },
  "anli-r1": {
    "median": 0.667,
    "interquartile_range": 0.017,
    "average_scores_for_each_prompt": [0.635, 0.67, 0.672, 0.664, 0.677, 0.668, 0.659, 0.671, 0.673, 0.637, 0.654, 0.667, 0.629, 0.654, 0.674],
    "0": {"accuracy": 0.635, "average": 0.635},
    "1": {"accuracy": 0.67, "average": 0.67},
    "2": {"accuracy": 0.672, "average": 0.672},
    "3": {"accuracy": 0.664, "average": 0.664},
    "4": {"accuracy": 0.677, "average": 0.677},
    "5": {"accuracy": 0.668, "average": 0.668},
    "6": {"accuracy": 0.659, "average": 0.659},
    "7": {"accuracy": 0.671, "average": 0.671},
    "8": {"accuracy": 0.673, "average": 0.673},
    "9": {"accuracy": 0.637, "average": 0.637},
    "10": {"accuracy": 0.654, "average": 0.654},
    "11": {"accuracy": 0.667, "average": 0.667},
    "12": {"accuracy": 0.629, "average": 0.629},
    "13": {"accuracy": 0.654, "average": 0.654},
    "14": {"accuracy": 0.674, "average": 0.674}
  },
  "anli-r2": {
    "median": 0.515,
    "interquartile_range": 0.016,
    "average_scores_for_each_prompt": [0.498, 0.52, 0.516, 0.501, 0.513, 0.523, 0.515, 0.52, 0.515, 0.504, 0.495, 0.524, 0.498, 0.503, 0.517],
    "0": {"accuracy": 0.498, "average": 0.498},
    "1": {"accuracy": 0.52, "average": 0.52},
    "2": {"accuracy": 0.516, "average": 0.516},
    "3": {"accuracy": 0.501, "average": 0.501},
    "4": {"accuracy": 0.513, "average": 0.513},
    "5": {"accuracy": 0.523, "average": 0.523},
    "6": {"accuracy": 0.515, "average": 0.515},
    "7": {"accuracy": 0.52, "average": 0.52},
    "8": {"accuracy": 0.515, "average": 0.515},
    "9": {"accuracy": 0.504, "average": 0.504},
    "10": {"accuracy": 0.495, "average": 0.495},
    "11": {"accuracy": 0.524, "average": 0.524},
    "12": {"accuracy": 0.498, "average": 0.498},
    "13": {"accuracy": 0.503, "average": 0.503},
    "14": {"accuracy": 0.517, "average": 0.517}
  },
  "anli-r3": {
    "median": 0.495,
    "interquartile_range": 0.01,
    "average_scores_for_each_prompt": [0.49, 0.495, 0.482, 0.5, 0.496, 0.507, 0.492, 0.472, 0.477, 0.494, 0.496, 0.489, 0.512, 0.504, 0.499],
    "0": {"accuracy": 0.49, "average": 0.49},
    "1": {"accuracy": 0.495, "average": 0.495},
    "2": {"accuracy": 0.482, "average": 0.482},
    "3": {"accuracy": 0.5, "average": 0.5},
    "4": {"accuracy": 0.496, "average": 0.496},
    "5": {"accuracy": 0.507, "average": 0.507},
    "6": {"accuracy": 0.492, "average": 0.492},
    "7": {"accuracy": 0.472, "average": 0.472},
    "8": {"accuracy": 0.477, "average": 0.477},
    "9": {"accuracy": 0.494, "average": 0.494},
    "10": {"accuracy": 0.496, "average": 0.496},
    "11": {"accuracy": 0.489, "average": 0.489},
    "12": {"accuracy": 0.512, "average": 0.512},
    "13": {"accuracy": 0.504, "average": 0.504},
    "14": {"accuracy": 0.499, "average": 0.499}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.68
}
{
  "rte": {
    "median": 0.796,
    "interquartile_range": 0.022,
    "average_scores_for_each_prompt": [0.808, 0.796, 0.812, 0.784, 0.808, 0.796, 0.776, 0.776, 0.788, 0.804],
    "0": {"accuracy": 0.808, "average": 0.808},
    "1": {"accuracy": 0.796, "average": 0.796},
    "2": {"accuracy": 0.812, "average": 0.812},
    "3": {"accuracy": 0.784, "average": 0.784},
    "4": {"accuracy": 0.808, "average": 0.808},
    "5": {"accuracy": 0.796, "average": 0.796},
    "6": {"accuracy": 0.776, "average": 0.776},
    "7": {"accuracy": 0.776, "average": 0.776},
    "8": {"accuracy": 0.788, "average": 0.788},
    "9": {"accuracy": 0.804, "average": 0.804}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.792, 0.792, 0.833, 0.833, 0.875, 0.875, 0.75, 0.833, 0.792, 0.875, 0.792, 0.792, 0.833, 0.833],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.875, "average": 0.875},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.792, "average": 0.792},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.792, "average": 0.792},
    "12": {"accuracy": 0.792, "average": 0.792},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.833, "average": 0.833}
  },
  "winogrande": {
    "median": 0.672,
    "interquartile_range": 0.007,
    "average_scores_for_each_prompt": [0.679, 0.672, 0.679, 0.672, 0.672],
    "0": {"accuracy": 0.679, "average": 0.679},
    "1": {"accuracy": 0.672, "average": 0.672},
    "2": {"accuracy": 0.679, "average": 0.679},
    "3": {"accuracy": 0.672, "average": 0.672},
    "4": {"accuracy": 0.672, "average": 0.672}
  },
  "wic": {
    "median": 0.642,
    "interquartile_range": 0.083,
    "average_scores_for_each_prompt": [0.645, 0.644, 0.642, 0.533, 0.541, 0.647, 0.658, 0.518, 0.625, 0.642],
    "0": {"accuracy": 0.645, "average": 0.645},
    "1": {"accuracy": 0.644, "average": 0.644},
    "2": {"accuracy": 0.642, "average": 0.642},
    "3": {"accuracy": 0.533, "average": 0.533},
    "4": {"accuracy": 0.541, "average": 0.541},
    "5": {"accuracy": 0.647, "average": 0.647},
    "6": {"accuracy": 0.658, "average": 0.658},
    "7": {"accuracy": 0.518, "average": 0.518},
    "8": {"accuracy": 0.625, "average": 0.625},
    "9": {"accuracy": 0.642, "average": 0.642}
  },
  "wsc": {
    "median": 0.639,
    "interquartile_range": 0.076,
    "average_scores_for_each_prompt": [0.639, 0.556, 0.667, 0.639, 0.556, 0.653, 0.625, 0.514, 0.639, 0.653],
    "0": {"accuracy": 0.639, "average": 0.639},
    "1": {"accuracy": 0.556, "average": 0.556},
    "2": {"accuracy": 0.667, "average": 0.667},
    "3": {"accuracy": 0.639, "average": 0.639},
    "4": {"accuracy": 0.556, "average": 0.556},
    "5": {"accuracy": 0.653, "average": 0.653},
    "6": {"accuracy": 0.625, "average": 0.625},
    "7": {"accuracy": 0.514, "average": 0.514},
    "8": {"accuracy": 0.639, "average": 0.639},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.867,
    "interquartile_range": 0.063,
    "average_scores_for_each_prompt": [0.897, 0.838, 0.912, 0.838, 0.882, 0.853, 0.838, 0.912],
    "0": {"accuracy": 0.897, "average": 0.897},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.912, "average": 0.912},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.882, "average": 0.882},
    "5": {"accuracy": 0.853, "average": 0.853},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.912, "average": 0.912}
  },
  "h-swag": {
    "median": 0.425,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.425],
    "h-swag": {"accuracy": 0.425, "average": 0.425}
  },
  "story_cloze": {
    "median": 0.921,
    "interquartile_range": 0.004,
    "average_scores_for_each_prompt": [0.921, 0.922, 0.926, 0.916, 0.918],
    "0": {"accuracy": 0.921, "average": 0.921},
    "1": {"accuracy": 0.922, "average": 0.922},
    "2": {"accuracy": 0.926, "average": 0.926},
    "3": {"accuracy": 0.916, "average": 0.916},
    "4": {"accuracy": 0.918, "average": 0.918}
  },
  "anli-r1": {
    "median": 0.669,
    "interquartile_range": 0.014,
    "average_scores_for_each_prompt": [0.643, 0.669, 0.679, 0.666, 0.678, 0.673, 0.666, 0.673, 0.677, 0.637, 0.658, 0.67, 0.637, 0.661, 0.669],
    "0": {"accuracy": 0.643, "average": 0.643},
    "1": {"accuracy": 0.669, "average": 0.669},
    "2": {"accuracy": 0.679, "average": 0.679},
    "3": {"accuracy": 0.666, "average": 0.666},
    "4": {"accuracy": 0.678, "average": 0.678},
    "5": {"accuracy": 0.673, "average": 0.673},
    "6": {"accuracy": 0.666, "average": 0.666},
    "7": {"accuracy": 0.673, "average": 0.673},
    "8": {"accuracy": 0.677, "average": 0.677},
    "9": {"accuracy": 0.637, "average": 0.637},
    "10": {"accuracy": 0.658, "average": 0.658},
    "11": {"accuracy": 0.67, "average": 0.67},
    "12": {"accuracy": 0.637, "average": 0.637},
    "13": {"accuracy": 0.661, "average": 0.661},
    "14": {"accuracy": 0.669, "average": 0.669}
  },
  "anli-r2": {
    "median": 0.516,
    "interquartile_range": 0.014,
    "average_scores_for_each_prompt": [0.503, 0.518, 0.519, 0.503, 0.513, 0.516, 0.517, 0.522, 0.52, 0.507, 0.498, 0.522, 0.506, 0.502, 0.516],
    "0": {"accuracy": 0.503, "average": 0.503},
    "1": {"accuracy": 0.518, "average": 0.518},
    "2": {"accuracy": 0.519, "average": 0.519},
    "3": {"accuracy": 0.503, "average": 0.503},
    "4": {"accuracy": 0.513, "average": 0.513},
    "5": {"accuracy": 0.516, "average": 0.516},
    "6": {"accuracy": 0.517, "average": 0.517},
    "7": {"accuracy": 0.522, "average": 0.522},
    "8": {"accuracy": 0.52, "average": 0.52},
    "9": {"accuracy": 0.507, "average": 0.507},
    "10": {"accuracy": 0.498, "average": 0.498},
    "11": {"accuracy": 0.522, "average": 0.522},
    "12": {"accuracy": 0.506, "average": 0.506},
    "13": {"accuracy": 0.502, "average": 0.502},
    "14": {"accuracy": 0.516, "average": 0.516}
  },
  "anli-r3": {
    "median": 0.501,
    "interquartile_range": 0.012,
    "average_scores_for_each_prompt": [0.502, 0.504, 0.484, 0.509, 0.502, 0.505, 0.494, 0.482, 0.484, 0.491, 0.5, 0.497, 0.513, 0.512, 0.501],
    "0": {"accuracy": 0.502, "average": 0.502},
    "1": {"accuracy": 0.504, "average": 0.504},
    "2": {"accuracy": 0.484, "average": 0.484},
    "3": {"accuracy": 0.509, "average": 0.509},
    "4": {"accuracy": 0.502, "average": 0.502},
    "5": {"accuracy": 0.505, "average": 0.505},
    "6": {"accuracy": 0.494, "average": 0.494},
    "7": {"accuracy": 0.482, "average": 0.482},
    "8": {"accuracy": 0.484, "average": 0.484},
    "9": {"accuracy": 0.491, "average": 0.491},
    "10": {"accuracy": 0.5, "average": 0.5},
    "11": {"accuracy": 0.497, "average": 0.497},
    "12": {"accuracy": 0.513, "average": 0.513},
    "13": {"accuracy": 0.512, "average": 0.512},
    "14": {"accuracy": 0.501, "average": 0.501}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.68
}
{
  "rte": {
    "median": 0.794,
    "interquartile_range": 0.024,
    "average_scores_for_each_prompt": [0.808, 0.792, 0.812, 0.776, 0.808, 0.788, 0.767, 0.78, 0.796, 0.8],
    "0": {"accuracy": 0.808, "average": 0.808},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.812, "average": 0.812},
    "3": {"accuracy": 0.776, "average": 0.776},
    "4": {"accuracy": 0.808, "average": 0.808},
    "5": {"accuracy": 0.788, "average": 0.788},
    "6": {"accuracy": 0.767, "average": 0.767},
    "7": {"accuracy": 0.78, "average": 0.78},
    "8": {"accuracy": 0.796, "average": 0.796},
    "9": {"accuracy": 0.8, "average": 0.8}
  },
  "cb": {
    "median": 0.833,
    "interquartile_range": 0.041,
    "average_scores_for_each_prompt": [0.833, 0.792, 0.792, 0.833, 0.833, 0.875, 0.875, 0.75, 0.833, 0.792, 0.875, 0.75, 0.792, 0.833, 0.833],
    "0": {"accuracy": 0.833, "average": 0.833},
    "1": {"accuracy": 0.792, "average": 0.792},
    "2": {"accuracy": 0.792, "average": 0.792},
    "3": {"accuracy": 0.833, "average": 0.833},
    "4": {"accuracy": 0.833, "average": 0.833},
    "5": {"accuracy": 0.875, "average": 0.875},
    "6": {"accuracy": 0.875, "average": 0.875},
    "7": {"accuracy": 0.75, "average": 0.75},
    "8": {"accuracy": 0.833, "average": 0.833},
    "9": {"accuracy": 0.792, "average": 0.792},
    "10": {"accuracy": 0.875, "average": 0.875},
    "11": {"accuracy": 0.75, "average": 0.75},
    "12": {"accuracy": 0.792, "average": 0.792},
    "13": {"accuracy": 0.833, "average": 0.833},
    "14": {"accuracy": 0.833, "average": 0.833}
  },
  "winogrande": {
    "median": 0.672,
    "interquartile_range": 0.008,
    "average_scores_for_each_prompt": [0.682, 0.669, 0.677, 0.672, 0.666],
    "0": {"accuracy": 0.682, "average": 0.682},
    "1": {"accuracy": 0.669, "average": 0.669},
    "2": {"accuracy": 0.677, "average": 0.677},
    "3": {"accuracy": 0.672, "average": 0.672},
    "4": {"accuracy": 0.666, "average": 0.666}
  },
  "wic": {
    "median": 0.635,
    "interquartile_range": 0.073,
    "average_scores_for_each_prompt": [0.635, 0.644, 0.635, 0.54, 0.551, 0.649, 0.663, 0.523, 0.625, 0.637],
    "0": {"accuracy": 0.635, "average": 0.635},
    "1": {"accuracy": 0.644, "average": 0.644},
    "2": {"accuracy": 0.635, "average": 0.635},
    "3": {"accuracy": 0.54, "average": 0.54},
    "4": {"accuracy": 0.551, "average": 0.551},
    "5": {"accuracy": 0.649, "average": 0.649},
    "6": {"accuracy": 0.663, "average": 0.663},
    "7": {"accuracy": 0.523, "average": 0.523},
    "8": {"accuracy": 0.625, "average": 0.625},
    "9": {"accuracy": 0.637, "average": 0.637}
  },
  "wsc": {
    "median": 0.625,
    "interquartile_range": 0.066,
    "average_scores_for_each_prompt": [0.583, 0.542, 0.681, 0.639, 0.569, 0.611, 0.639, 0.514, 0.639, 0.653],
    "0": {"accuracy": 0.583, "average": 0.583},
    "1": {"accuracy": 0.542, "average": 0.542},
    "2": {"accuracy": 0.681, "average": 0.681},
    "3": {"accuracy": 0.639, "average": 0.639},
    "4": {"accuracy": 0.569, "average": 0.569},
    "5": {"accuracy": 0.611, "average": 0.611},
    "6": {"accuracy": 0.639, "average": 0.639},
    "7": {"accuracy": 0.514, "average": 0.514},
    "8": {"accuracy": 0.639, "average": 0.639},
    "9": {"accuracy": 0.653, "average": 0.653}
  },
  "copa": {
    "median": 0.875,
    "interquartile_range": 0.063,
    "average_scores_for_each_prompt": [0.912, 0.838, 0.926, 0.838, 0.897, 0.853, 0.838, 0.897],
    "0": {"accuracy": 0.912, "average": 0.912},
    "1": {"accuracy": 0.838, "average": 0.838},
    "2": {"accuracy": 0.926, "average": 0.926},
    "3": {"accuracy": 0.838, "average": 0.838},
    "4": {"accuracy": 0.897, "average": 0.897},
    "5": {"accuracy": 0.853, "average": 0.853},
    "6": {"accuracy": 0.838, "average": 0.838},
    "7": {"accuracy": 0.897, "average": 0.897}
  },
  "h-swag": {
    "median": 0.426,
    "interquartile_range": 0.0,
    "average_scores_for_each_prompt": [0.426],
    "h-swag": {"accuracy": 0.426, "average": 0.426}
  },
  "story_cloze": {
    "median": 0.921,
    "interquartile_range": 0.003,
    "average_scores_for_each_prompt": [0.924, 0.921, 0.926, 0.921, 0.92],
    "0": {"accuracy": 0.924, "average": 0.924},
    "1": {"accuracy": 0.921, "average": 0.921},
    "2": {"accuracy": 0.926, "average": 0.926},
    "3": {"accuracy": 0.921, "average": 0.921},
    "4": {"accuracy": 0.92, "average": 0.92}
  },
  "anli-r1": {
    "median": 0.667,
    "interquartile_range": 0.016,
    "average_scores_for_each_prompt": [0.644, 0.674, 0.68, 0.663, 0.678, 0.674, 0.665, 0.679, 0.676, 0.635, 0.655, 0.667, 0.635, 0.662, 0.669],
    "0": {"accuracy": 0.644, "average": 0.644},
    "1": {"accuracy": 0.674, "average": 0.674},
    "2": {"accuracy": 0.68, "average": 0.68},
    "3": {"accuracy": 0.663, "average": 0.663},
    "4": {"accuracy": 0.678, "average": 0.678},
    "5": {"accuracy": 0.674, "average": 0.674},
    "6": {"accuracy": 0.665, "average": 0.665},
    "7": {"accuracy": 0.679, "average": 0.679},
    "8": {"accuracy": 0.676, "average": 0.676},
    "9": {"accuracy": 0.635, "average": 0.635},
    "10": {"accuracy": 0.655, "average": 0.655},
    "11": {"accuracy": 0.667, "average": 0.667},
    "12": {"accuracy": 0.635, "average": 0.635},
    "13": {"accuracy": 0.662, "average": 0.662},
    "14": {"accuracy": 0.669, "average": 0.669}
  },
  "anli-r2": {
    "median": 0.516,
    "interquartile_range": 0.017,
    "average_scores_for_each_prompt": [0.499, 0.517, 0.513, 0.503, 0.518, 0.517, 0.516, 0.521, 0.517, 0.501, 0.498, 0.52, 0.501, 0.498, 0.52],
    "0": {"accuracy": 0.499, "average": 0.499},
    "1": {"accuracy": 0.517, "average": 0.517},
    "2": {"accuracy": 0.513, "average": 0.513},
    "3": {"accuracy": 0.503, "average": 0.503},
    "4": {"accuracy": 0.518, "average": 0.518},
    "5": {"accuracy": 0.517, "average": 0.517},
    "6": {"accuracy": 0.516, "average": 0.516},
    "7": {"accuracy": 0.521, "average": 0.521},
    "8": {"accuracy": 0.517, "average": 0.517},
    "9": {"accuracy": 0.501, "average": 0.501},
    "10": {"accuracy": 0.498, "average": 0.498},
    "11": {"accuracy": 0.52, "average": 0.52},
    "12": {"accuracy": 0.501, "average": 0.501},
    "13": {"accuracy": 0.498, "average": 0.498},
    "14": {"accuracy": 0.52, "average": 0.52}
  },
  "anli-r3": {
    "median": 0.497,
    "interquartile_range": 0.01,
    "average_scores_for_each_prompt": [0.496, 0.498, 0.482, 0.511, 0.497, 0.509, 0.492, 0.485, 0.487, 0.493, 0.5, 0.497, 0.511, 0.505, 0.499],
    "0": {"accuracy": 0.496, "average": 0.496},
    "1": {"accuracy": 0.498, "average": 0.498},
    "2": {"accuracy": 0.482, "average": 0.482},
    "3": {"accuracy": 0.511, "average": 0.511},
    "4": {"accuracy": 0.497, "average": 0.497},
    "5": {"accuracy": 0.509, "average": 0.509},
    "6": {"accuracy": 0.492, "average": 0.492},
    "7": {"accuracy": 0.485, "average": 0.485},
    "8": {"accuracy": 0.487, "average": 0.487},
    "9": {"accuracy": 0.493, "average": 0.493},
    "10": {"accuracy": 0.5, "average": 0.5},
    "11": {"accuracy": 0.497, "average": 0.497},
    "12": {"accuracy": 0.511, "average": 0.511},
    "13": {"accuracy": 0.505, "average": 0.505},
    "14": {"accuracy": 0.499, "average": 0.499}
  },
  "config": {
    "inference_dataset": null,
    "split": "test",
    "few_shot_random_seed": null,
    "num_val_samples": 32,
    "max_datapoints_per_dataset_without_templates": null,
    "should_save_to_gcp": false,
    "prediction_dir": "exp_out/merging/ia3-bigscience-T0_3B/tpa-es10_ia3_base_test/rte,cb,winogrande,wic,wsc,copa,h-swag,story_cloze,anli-r1,anli-r2,anli-r3/lambda_searched/predictions/multiple_prompts",
    "max_gen_len": 64,
    "eval_batch_size": 16,
    "eval_template_idx": null,
    "length_normalization": false,
    "use_bfloat16_during_eval": true,
    "world_size": null,
    "did_run_finish": false,
    "fields_toIterateOver": ["inference_dataset", "eval_template_idx"],
    "values_toIterateOver": {
      "rte": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "cb": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "winogrande": [0, 1, 2, 3, 4],
      "wic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "wsc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
      "copa": [0, 1, 2, 3, 4, 5, 6, 7],
      "h-swag": [0],
      "story_cloze": [0, 1, 2, 3, 4],
      "anli-r1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
      "anli-r3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    }
  },
  "model_config": {"pretrained_model": "bigscience/T0_3B", "max_seq_len": 512, "peft_method": "ia3", "peft_params": null, "checkpoint_to_directly_load_model": null, "function_to_merge_checkpoints": null, "model_checkpoints": null, "trainable_param_regex": null, "model_lambda": null, "search_lambda": "none", "temperature": 1, "learn_pretraining_lambda": false, "pretraining_init": 1, "lambda_init": 0.2, "lambda_type": "vector-shared", "pretrained_lambda_type": "scalar-shared"},
  "average": 0.678
}
