"""Generate a LaTeX table of benchmark scores (with standard errors) for models.

Scores are read from the evaluation database, scaled to 0-100, aggregated into
science / code / math domain averages and written to ``tables/<name>_table.tex``.
"""

import argparse
import logging
import multiprocessing
import os
import re
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import pandas as pd
from sqlalchemy import and_, not_, or_
from tqdm import tqdm

from database.models import EvalRelt, EvalSetting, Model
from database.utils import session_scope

# NOTE(review): several identifiers and keys in this module ("EvalRelt",
# "veCodeBench_accuracy_avg", "bstrings", "normaze") look like truncated
# spellings (e.g. of "EvalResult" / "LiveCodeBench" / "substrings" /
# "normalize"). They are kept verbatim because they have to match the database
# models, the stored setting names and existing callers -- confirm against the
# project before renaming any of them.

# Benchmarks fetched for every model (order defines the score dict order).
BENCHMARKS = [
    "MATH500_accuracy",
    "CodeElo_accuracy_avg",
    "CodeForces_accuracy_avg",
    "GPQADiamond_accuracy_avg",
    "veCodeBench_accuracy_avg",
    "AIME24_accuracy_avg",
    "AMC23_accuracy_avg",
    "JEEBench_accuracy_avg",
]

# Benchmark categories used for the domain averages.
SCIENCE_BENCHMARKS = ["GPQADiamond_accuracy_avg", "JEEBench_accuracy_avg"]
CODE_BENCHMARKS = ["CodeElo_accuracy_avg", "veCodeBench_accuracy_avg", "CodeForces_accuracy_avg"]
MATH_BENCHMARKS = ["AIME24_accuracy_avg", "AMC23_accuracy_avg", "MATH500_accuracy"]
ALL_BENCHMARKS = SCIENCE_BENCHMARKS + CODE_BENCHMARKS + MATH_BENCHMARKS

# Left-to-right order of the per-benchmark columns in the table.
BENCHMARK_ORDER = [
    "CodeElo_accuracy_avg",
    "CodeForces_accuracy_avg",
    "veCodeBench_accuracy_avg",
    "AIME24_accuracy_avg",
    "AMC23_accuracy_avg",
    "MATH500_accuracy",
    "GPQADiamond_accuracy_avg",
    "JEEBench_accuracy_avg",
]

# Substring -> display name used when renaming DataFrame columns.
BENCHMARK_SHORT_NAMES = {
    "MATH500": "MATH500",
    "CodeElo": "CodeElo",
    "CodeForces": "CodeForces",
    "GPQADiamond": "GPQAD",
    "veCodeBench": "LCB",
    "AIME24": "AIME24",
    "AMC23": "AMC23",
    "JEEBench": "JEE",
    "science_avg": "Science Avg",
    "code_avg": "Code Avg",
    "math_avg": "Math Avg",
}

# Aggregate columns (after renaming) in the order they appear in the table.
SUMMARY_COLUMNS = ["average", "Code Avg", "Math Avg", "Science Avg"]


def get_scores_all_benchmarks(model_id, one_score_per_setting=True):
    """Return ``{setting name: [score, ...]}`` for every result of ``model_id``.

    Args:
        model_id: Value matched against ``EvalRelt.model_id``.
        one_score_per_setting: When true, only the first score seen for a
            setting name is kept and later ones are reported with a warning;
            when false, every score is appended to the setting's list.

    Returns:
        Dict mapping each evaluation-setting name to a list of scores.
    """
    all_scores = {}
    setting_names = {}  # eval_setting_id -> name, so each setting is queried once
    with session_scope() as session:
        model_scores = session.query(EvalRelt).filter_by(model_id=model_id).all()
        for score in tqdm(model_scores, leave=False):
            setting_id = score.eval_setting_id
            if setting_id not in setting_names:
                setting = session.query(EvalSetting).filter_by(id=setting_id).first()
                setting_names[setting_id] = None if setting is None else setting.name
            name = setting_names[setting_id]
            if name is None:
                # A result pointing at a missing setting cannot be attributed to a benchmark.
                logging.warning(f"No eval setting with id {setting_id} for model {model_id}; skipping score")
                continue
            if name not in all_scores:
                all_scores[name] = [score.to_dict()["score"]]
            elif one_score_per_setting:
                logging.warning(f"Dupcate setting name: {name}")
            else:
                all_scores[name].append(score.to_dict()["score"])
    return all_scores


def get_clean_scores_with_errors(model_id, normaze=True):
    """Collect the tracked benchmark scores and their standard errors for a model.

    For every benchmark in ``BENCHMARKS`` the first stored score is used. For
    benchmarks named ``*_avg`` the matching ``*_std_err`` setting is stored
    under ``"<benchmark>_error"``. Benchmarks without the ``_avg`` suffix
    (``MATH500_accuracy``) have no separate error setting, so no error entry
    is produced for them.

    Args:
        model_id: Identifier of the model whose scores are fetched.
        normaze: When true, every value is multiplied by 100 and an
            ``"average"`` entry (mean of the available benchmark scores, also
            multiplied by 100) is added. No average is added otherwise.

    Returns:
        Dict of benchmark scores, ``*_error`` entries, optionally
        ``"average"``, and ``"model_id"``.
    """
    all_scores = get_scores_all_benchmarks(model_id)

    clean_scores = {}
    missing_benchmarks = []
    missing_errors = []

    # Collect benchmark scores (first stored value only).
    for benchmark in BENCHMARKS:
        if benchmark in all_scores:
            clean_scores[benchmark] = all_scores[benchmark][0]
        else:
            missing_benchmarks.append(benchmark)

    # Collect standard errors for benchmarks that have a dedicated error setting.
    for benchmark in BENCHMARKS:
        if not benchmark.endswith("_avg"):
            # Without this guard the derived error name equals the benchmark
            # name and the accuracy itself would be stored as its "error".
            continue
        error_name = re.sub("_avg$", "_std_err", benchmark)
        if error_name not in all_scores:
            missing_errors.append(error_name)
        elif benchmark in clean_scores:  # only keep errors that have a matching score
            clean_scores[f"{benchmark}_error"] = all_scores[error_name][0]

    if missing_benchmarks:
        logging.warning(f"Missing benchmarks: {missing_benchmarks} for model {model_id}")
    if missing_errors:
        logging.warning(f"Missing error metrics: {missing_errors} for model {model_id}")

    if normaze and clean_scores:
        # Take the unscaled benchmark scores first; the mean is scaled afterwards.
        avg_values = [clean_scores[b] for b in BENCHMARKS if b in clean_scores]
        for key in list(clean_scores):
            clean_scores[key] = clean_scores[key] * 100
        if avg_values:
            clean_scores["average"] = sum(avg_values) / len(avg_values) * 100

    clean_scores["model_id"] = model_id
    return clean_scores


def sanitize_model_name(name):
    """Turn a raw model directory name into a display name.

    Steps: strip one leading underscore, cut everything from the first
    ``"qwen"`` (then ``"Qwen"``) onwards, replace underscores with spaces and
    capitalize each remaining word.

    NOTE(review): the prefix pattern only matches a single leading underscore;
    it may originally have targeted an organization prefix -- confirm.
    """
    name = re.sub(r"^_", "", name)

    # Drop the base-model suffix (and anything following it).
    for marker in ("qwen", "Qwen"):
        cut = name.find(marker)
        if cut != -1:
            name = name[:cut]

    words = name.replace("_", " ").strip().split()
    return " ".join(word.capitalize() for word in words)


def process_model(model_id_and_name):
    """Worker entry point: return ``(weights_location, scores)`` for one model.

    Args:
        model_id_and_name: ``(model_id, weights_location)`` tuple.
    """
    model_id, weights_location = model_id_and_name
    return weights_location, get_clean_scores_with_errors(model_id)


def _find_models(bstrings, exclude_bstrings, use_or):
    """Return ``(id, weights_location)`` for models matching the substring filters."""
    include_filters = [Model.weights_location.contains(s) for s in bstrings or []]
    conditions = []
    if include_filters:
        conditions.append(or_(*include_filters) if use_or else and_(*include_filters))
    conditions.extend(not_(Model.weights_location.contains(s)) for s in exclude_bstrings or [])

    with session_scope() as session:
        models = session.query(Model).filter(*conditions).all()
        # Copy out plain values so nothing depends on the session afterwards.
        return [(model.id, model.weights_location) for model in models]


def _score_models(model_ids, num_workers=None):
    """Fetch scores for all models in parallel, keyed by sanitized model name."""
    if not num_workers or num_workers < 1:
        num_workers = max(1, multiprocessing.cpu_count() - 1)  # leave one core free

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        results = list(
            tqdm(executor.map(process_model, model_ids), total=len(model_ids), desc="Processing models")
        )

    out = {}
    for weights_location, scores in results:
        raw_model_name = weights_location.rstrip("/").split("/")[-1]
        clean_model_name = sanitize_model_name(raw_model_name)
        if clean_model_name in out:
            logging.warning(
                f"Model name '{clean_model_name}' (from {weights_location}) is not unique; "
                "overwriting the earlier entry"
            )
        out[clean_model_name] = scores
    return out


def _add_average_error(df, target_col, benchmarks):
    """Add the standard error of the mean over ``benchmarks`` as ``target_col``.

    Uses ``sqrt(sum(se_i ** 2)) / n`` per row, where ``n`` is the number of
    benchmarks that have an error column (``MATH500_accuracy_error`` is a
    column of zeros, so it adds nothing to the sum but counts towards ``n``,
    matching the mean it belongs to). Rows whose squared sum is not positive
    get NaN; the column is not created when no row has a value.
    """
    error_cols = [f"{b}_error" for b in benchmarks if f"{b}_error" in df.columns]
    if not error_cols:
        return
    squared_sum = (df[error_cols] ** 2).sum(axis=1, skipna=False)
    std_err = (np.sqrt(squared_sum) / len(error_cols)).where(squared_sum > 0)
    if std_err.notna().any():
        df[target_col] = std_err


def _shorten(name):
    """Apply the first matching ``BENCHMARK_SHORT_NAMES`` replacement to ``name``."""
    for benchmark, short_name in BENCHMARK_SHORT_NAMES.items():
        if benchmark in name:
            return name.replace(benchmark, short_name)
    return name


def _format_cell(value, max_val, max_err, error_value=None):
    """Format one table cell.

    The value is bolded when it lies within ``2 * max_err`` of ``max_val``;
    ``error_value`` (if given) is appended as a subscript.
    """
    is_within_error = np.round(abs(value - max_val), decimals=3) <= 2 * max_err
    cell = f"{value:.1f}"
    if is_within_error:
        cell = f"\\textbf{{{cell}}}"
    if error_value is not None:
        cell += f"$_{{\\text{{{error_value:.1f}}}}}$"
    return cell


def generate_latex_table(bstrings, exclude_bstrings=None, use_or=False, filename="", num_workers=None):
    """Build the LaTeX results table, write it to ``tables/`` and return it.

    Args:
        bstrings: Substrings that ``Model.weights_location`` must contain.
        exclude_bstrings: Substrings that must not occur in the location.
        use_or: Combine ``bstrings`` with OR instead of AND.
        filename: Output name stem; defaults to the joined filter substrings.
        num_workers: Number of worker processes; defaults to CPU count - 1.

    Returns:
        The LaTeX source of the table as a string.

    Raises:
        ValueError: If no model with a complete set of benchmark scores is found.
    """
    model_ids = _find_models(bstrings, exclude_bstrings, use_or)
    print(f"Found {len(model_ids)} models matching the criteria")
    if not model_ids:
        raise ValueError("No models match the given criteria")

    # The DB session is already closed here, so workers never inherit it.
    scores_by_model = _score_models(model_ids, num_workers)

    df = pd.DataFrame.from_dict(scores_by_model, orient="index")
    df = df.dropna(how="any")  # keep only models with every metric present
    missing = [b for b in ALL_BENCHMARKS if b not in df.columns]
    if df.empty or missing:
        raise ValueError(f"No model has a complete set of benchmark scores (missing columns: {missing})")

    # MATH500 has no reported standard error; treat it as exactly 0.
    df["MATH500_accuracy_error"] = 0

    # Domain averages and their propagated standard errors.
    domains = [
        ("science_avg", SCIENCE_BENCHMARKS),
        ("code_avg", CODE_BENCHMARKS),
        ("math_avg", MATH_BENCHMARKS),
    ]
    for avg_col, members in domains:
        df[avg_col] = df[members].mean(axis=1)
    for avg_col, members in domains:
        _add_average_error(df, f"{avg_col}_error", members)
    if "average" in df.columns:
        _add_average_error(df, "average_error", ALL_BENCHMARKS)

    # Everything shown in the table is rounded to one decimal place.
    metrics_columns = [col for col in BENCHMARK_ORDER if col in df.columns]
    shown = ["average", "science_avg", "code_avg", "math_avg"] + metrics_columns
    shown += [col for col in df.columns if col.endswith("_error")]
    for col in shown:
        if col in df.columns:
            df[col] = df[col].round(1)

    if "average" in df.columns:
        df = df.sort_values("average", ascending=False)

    # Switch to display names.
    df = df.rename(columns={col: _shorten(col) for col in df.columns})
    metrics_columns = [_shorten(col) for col in metrics_columns]

    # Per column: the best value, the standard error of the best row, and the
    # error column to print as a subscript (None when there is none).
    column_specs = {}
    for col in SUMMARY_COLUMNS + metrics_columns:
        if col not in df.columns:
            continue
        error_col = f"{col}_error" if f"{col}_error" in df.columns else None
        max_err = df.loc[df[col].idxmax(), error_col] if error_col else 0
        if "MATH500" in col:
            # MATH500 is shown without a subscript and only an exact tie with
            # the best score is bolded.
            error_col, max_err = None, 0
        column_specs[col] = (df[col].max(), max_err, error_col)

    latex_table = [
        "\\begin{table}",
        "\\centering",
        "\\begin{sc}",
        "\\caption{Model Performance with Standard Errors}",
        # resizebox makes the table fit the page width
        "\\resizebox{\\textwidth}{!}{%",
    ]

    # Layout: model | overall average | three domain averages | benchmarks
    column_spec = ("l | c | c c c | " + "c " * len(metrics_columns)).strip()
    latex_table.append(f"\\begin{{tabular}}{{{column_spec}}}")
    latex_table.append("\\toprule")

    header_row = ["Model", "Average", "Code Avg", "Math Avg", "Science Avg"]
    header_row += [col.replace("_accuracy_avg", "").replace("_accuracy", "") for col in metrics_columns]
    latex_table.append(" & ".join(header_row) + " \\\\")
    latex_table.append("\\midrule")

    for model_name, row in df.iterrows():
        table_row = [model_name.replace("_", "\\_")]  # escape underscores for LaTeX
        for col in SUMMARY_COLUMNS + metrics_columns:
            if col not in column_specs:
                table_row.append("N/A")
                continue
            max_val, max_err, error_col = column_specs[col]
            error_value = row[error_col] if error_col else None
            table_row.append(_format_cell(row[col], max_val, max_err, error_value))
        latex_table.append(" & ".join(table_row) + " \\\\")

    latex_table += [
        "\\bottomrule",
        "\\end{tabular}",
        "}",  # closes resizebox
        "\\end{sc}",
        "\\label{tab:model_performance}",
        "\\end{table}",
    ]
    latex_output = "\n".join(latex_table)

    if filename:
        out_str = filename
    else:
        out_str = "_".join(bstrings or [])
        if exclude_bstrings:
            out_str += "_exclude_" + "_".join(exclude_bstrings)

    out_path = f"tables/{out_str}_table.tex"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w") as f:
        f.write(latex_output)

    print(f"LaTeX table saved to {out_path}")
    return latex_output


def main():
    """Command-line entry point.

    Generates a LaTeX table with benchmark results and standard errors.
    Scores are shown on a 0-100 scale with one decimal place and standard
    errors as subscripts. In each column the best value is bolded, together
    with every value within two standard errors (of the best entry) of it.
    The table is wrapped in a resizebox to fit the page width and includes
    domain averages for:
      - Science (GPQADiamond, JEEBench)
      - Code (CodeElo, veCodeBench, CodeForces)
      - Math (AIME24, AMC23, MATH500)

    Run as follows:
      python generate_latex_table.py --bstrings hp_ablations mistral lr --exclude_bstrings debug test
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--bstrings", nargs="+", type=str, help="bstrings to include in model weights location")
    parser.add_argument(
        "--exclude_bstrings", nargs="+", type=str, help="bstrings to exclude from model weights location"
    )
    parser.add_argument("--use_or", action="store_true", help="Use OR logic for bstrings instead of AND")
    parser.add_argument("--file_name", default="", type=str, help="Custom filename for the output LaTeX table")
    parser.add_argument(
        "--num_workers", type=int, default=None, help="Number of worker processes to use (default: CPU count - 1)"
    )
    args = parser.parse_args()

    if args.num_workers:
        num_cores = args.num_workers
    else:
        num_cores = max(1, multiprocessing.cpu_count() - 1)
    print(f"Using {num_cores} worker processes")

    latex_table = generate_latex_table(
        args.bstrings, args.exclude_bstrings, args.use_or, args.file_name, num_workers=num_cores
    )
    print("\nGenerated LaTeX Table Preview:")
    print("-" * 50)
    print(latex_table)


if __name__ == "__main__":
    main()