import io

import pandas as pd

# Evaluation results, one row per fine-tuned model. Only the two 100k rows
# ("openthoughts3_100k" and "openthoughts3_100k_llama3") are read below.
# NOTE(review): the column name "veCodeBenchv5" looks truncated; it is kept
# as-is because the baseline dictionaries use the same key -- confirm before
# renaming it anywhere.
_FIG1_CSV = """Experiments,Domain,DatasetSize,AvgAll,AIME24,AMC23,MATH500,HMMT,AvgMath,JEEBench,GPQAD,AvgSci,LCBv2,CodeElo,CodeForces,AvgCode,AIME25,HLE,veCodeBenchv5
openthoughts3,Other,1200000.0,55.3,69.0,93.5,90.0,42.7,69.7,72.4,53.7,63.0,64.5,31.0,32.2,44.8,53.3,10.2,51.7
openthoughts3_300k,300k,316000.0,57.4,61.0,90.5,89.2,36.7,80.2,70.3,51.0,60.6,48.2,24.0,25.2,32.5,39.7,10.6,34.0
openthoughts3_100k,100k,100000.0,54.1,54.3,86.8,89.0,29.3,76.7,61.8,51.0,56.4,43.7,21.5,24.4,29.9,41.0,9.8,31.5
openthoughts3_30k,30k,31600.0,49.8,40.7,83.2,87.4,21.7,70.4,58.2,48.0,53.1,45.1,16.6,19.1,26.9,34.7,10.9,31.0
openthoughts3_10k,10k,10000.0,43.8,32.0,75.2,82.6,20.7,63.3,45.7,48.7,47.2,40.0,11.1,15.2,22.1,28.3,1.4,26.8
openthoughts3_3k,3k,3160.0,41.0,30.7,66.5,81.0,16.0,59.4,51.7,48.1,49.9,32.6,7.1,10.7,16.8,23.0,1.9,20.0
openthoughts3_1k,1k,1000.0,34.3,18.0,58.5,78.6,9.3,51.7,46.0,39.1,42.6,22.4,5.1,6.4,11.3,17.7,0.3,18.2
openthoughts3_0.3k,0.3k,316.0,33.7,17.3,56.5,79.8,11.0,51.2,45.2,39.2,42.2,19.7,4.5,7.5,10.6,16.0,5.4,11.2
openthoughts3_100k_llama3,100k,100000.0,37.9,37.0,75.2,83.8,19.3,49.1,45.2,45.1,45.2,44.4,13.8,18.3,27.1,30.3,9.7,31.9
openthoughts3_30k_llama3,30k,31600.0,33.7,29.3,70.8,80.0,19.7,44.5,40.3,43.8,42.0,35.7,11.3,14.6,21.7,22.7,11.3,25.2
openthoughts3_10k_llama3,10k,10000.0,27.4,14.0,56.0,73.4,12.0,34.8,32.4,46.1,39.2,27.2,7.3,10.7,16.3,18.7,11.4,19.9
openthoughts3_3k_llama3,3k,3160.0,23.7,9.7,48.8,67.8,6.0,28.5,29.6,47.0,38.3,23.3,5.9,7.9,13.4,10.3,12.1,16.4
openthoughts3_1k_llama3,1k,1000.0,17.8,4.7,36.8,58.8,0.3,20.9,27.6,31.5,29.6,17.0,3.8,5.7,9.6,4.0,11.6,11.9"""

# Baseline (un-fine-tuned) scores per benchmark (from database queries).
_QWEN_BASELINE = {
    'AIME24': 15.0, 'AMC23': 53.5, 'MATH500': 70.6, 'HMMT': 0.3, 'AIME25': 8.0,
    'LCBv2': 33.0, 'veCodeBenchv5': 17.2, 'CodeElo': 5.1, 'CodeForces': 9.9,
    'GPQAD': 23.7, 'JEEBench': 33.6, 'HLE': 11.8,
}
_LLAMA_BASELINE = {
    'AIME24': 4.7, 'AMC23': 15.8, 'MATH500': 43.2, 'HMMT': 0.3, 'AIME25': 0.3,
    'LCBv2': 13.1, 'veCodeBenchv5': 8.9, 'CodeElo': 2.1, 'CodeForces': 6.7,
    'GPQAD': 25.8, 'JEEBench': 14.1, 'HLE': 17.0,
}

# Benchmarks shown in the table, in column order. This is a reduced set; the
# CSV and the baseline dictionaries carry more benchmarks than are listed here.
_BENCHMARKS = ['AIME24', 'AIME25', 'AMC23', 'MATH500', 'GPQAD', 'LCBv2']

# Column headers that differ from the benchmark key; any benchmark not listed
# here is printed under its own key.
_HEADER_LABELS = {
    'GPQAD': "GPQA-D",
    'LCBv2': "LCB \\lcbvtwodate",
    'veCodeBenchv5': "LCB \\lcbvsixdate",
}

_CAPTION = (
    "\\caption{\\textbf{Performance comparison at 100k training samples: "
    "OpenThoughts3 fine-tuned models vs base models.} The table shows the "
    "absolute performance scores achieved by fine-tuned models, with "
    "improvements over the respective base models shown in parentheses "
    "(\\textcolor{green}{green} = improvement, \\textcolor{red}{red} = decline). "
    "Both fine-tuned models demonstrate substantial improvements across all "
    "benchmarks, with Qwen-2.5-7B demonstrating overall the best performance, "
    "while Llama-3.1-8b experiences larger lifts on AMC23, MATH500, and "
    "LCB \\lcbvtwodate}"
)


def _format_score(score, bold=False):
    """Render a score with one decimal, wrapped in ``\\textbf`` when *bold*."""
    text = f"{score:.1f}"
    return f"\\textbf{{{text}}}" if bold else text


def _format_improvement(improvement, is_max=False):
    """Render a score delta as a coloured, parenthesised LaTeX fragment.

    Non-negative deltas are green with an explicit ``+`` sign; negative deltas
    are red. When *is_max* is true the parenthesised value is also bolded.
    """
    if improvement >= 0:
        colour, text = "green", f"(+{improvement:.1f})"
    else:
        colour, text = "red", f"({improvement:.1f})"
    if is_max:
        text = f"\\textbf{{{text}}}"
    return f"\\textcolor{{{colour}}}{{{text}}}"


def generate_100k_comparison_table():
    """Generate LaTeX table comparing 100k fine-tuned models vs their base models.

    For each benchmark in the reduced benchmark set the table has, per model,
    one row of absolute scores and one row of deltas against that model's
    baseline. The higher of the two scores is bolded, and so is the delta with
    the larger absolute value; ties leave both entries unbolded.

    Returns:
        The complete ``table`` environment as a single newline-joined string.
    """
    df = pd.read_csv(io.StringIO(_FIG1_CSV))

    # Pick the 100k row of each model family.
    qwen_100k = df[df['Experiments'] == 'openthoughts3_100k'].iloc[0]
    llama_100k = df[df['Experiments'] == 'openthoughts3_100k_llama3'].iloc[0]

    qwen_values, qwen_improvements = [], []
    llama_values, llama_improvements = [], []
    for benchmark in _BENCHMARKS:
        qwen_score = qwen_100k[benchmark]
        llama_score = llama_100k[benchmark]
        qwen_imp = qwen_score - _QWEN_BASELINE[benchmark]
        llama_imp = llama_score - _LLAMA_BASELINE[benchmark]

        # Strict comparisons: on a tie neither side is bolded.
        qwen_values.append(_format_score(qwen_score, bold=qwen_score > llama_score))
        llama_values.append(_format_score(llama_score, bold=llama_score > qwen_score))

        # Bolding is decided on magnitude, so a larger decline would be bolded
        # just like a larger gain.
        qwen_improvements.append(
            _format_improvement(qwen_imp, is_max=abs(qwen_imp) > abs(llama_imp))
        )
        llama_improvements.append(
            _format_improvement(llama_imp, is_max=abs(llama_imp) > abs(qwen_imp))
        )

    header_cells = [_HEADER_LABELS.get(name, name) for name in _BENCHMARKS]

    latex_lines = [
        "\\begin{table}[t]",
        "\\centering",
        "\\resizebox{\\textwidth}{!}{",
        "\\begin{tabular}{l|" + "c" * len(_BENCHMARKS) + "}",
        "\\toprule",
        "Base Model & " + " & ".join(header_cells) + " \\\\",
        "\\midrule",
        # Qwen: scores, then improvements (first cell left empty).
        "Qwen-2.5-7B-Instruct & " + " & ".join(qwen_values) + " \\\\",
        " & " + " & ".join(qwen_improvements) + " \\\\",
        "\\midrule",
        # Llama: scores, then improvements (first cell left empty).
        "Llama-3.1-8B-Instruct & " + " & ".join(llama_values) + " \\\\",
        " & " + " & ".join(llama_improvements) + " \\\\",
        "\\bottomrule",
        # The second brace closes the \resizebox group opened above.
        "\\end{tabular}}",
        _CAPTION,
        "\\label{tab:llama_vs_qwen_100k}",
        "\\end{table}",
    ]
    return "\n".join(latex_lines)


if __name__ == "__main__":
    table_latex = generate_100k_comparison_table()

    # Save to file
    # NOTE(review): this absolute path has an empty component after "/Users/"
    # and looks altered; it is left untouched -- confirm the intended location.
    with open("/Users//dcft_private/eval/relts/llama_vs_qwen_100k_comparison.tex", "w") as f:
        f.write(table_latex)

    print("LaTeX table generated and saved to: eval/relts/llama_vs_qwen_100k_comparison.tex")
    print()
    print("Generated table:")
    print(table_latex)