from datasets import Dataset, load_dataset


def format_prompt(row, max_tokens=8192, token_multiper=5):
    """Build the prompt text for one row.

    The prompt consists of the instruction, the opinion text, the question
    and the numbered list of answer choices (``row["ces"]``). If the whole
    prompt would exceed ``max_tokens * token_multiper`` characters, the
    opinion is truncated so the remaining parts still fit.

    Args:
        row: Mapping with the keys ``"instruction"``, ``"opinion"``,
            ``"question"`` and ``"ces"`` (an iterable of choices).
        max_tokens: Token budget for the prompt.
        token_multiper: Multiplier used to turn the token budget into a
            character budget.

    Returns:
        The formatted prompt string.
    """
    instruction = row["instruction"]
    opinion = row["opinion"]
    question = row["question"]
    ces = row["ces"]
    ces_str = "\n".join(f"{i + 1}: {ce}" for i, ce in enumerate(ces))

    # Shorten the opinion if the prompt would be too long.
    len_rest = (
        len(instruction)
        + len(question)
        + len(ces_str)
        + len("\n\nQuestion: \n\n\n ")
    )
    char_budget = max_tokens * token_multiper
    if len(opinion) + len_rest > char_budget:
        # Clamp at 0: a negative slice bound would trim characters off the
        # end of the opinion instead of dropping it entirely when the other
        # parts alone already exceed the budget.
        opinion = opinion[: max(0, int(char_budget - len_rest))]

    return f"{instruction}\n\n{opinion}\n\nQuestion: {question}\n{ces_str}"


def verify(row, solution_key="deepseek_solution_formatted", verbose=False):
    """Check whether a generated solution contains the correct choice.

    Each choice in ``row["ces"]`` is rendered as ``"<n>: <choice>"`` (1-based)
    and looked up as a plain substring of the attempt ``row[solution_key]``.

    Args:
        row: Mapping with the keys ``"ces"``, ``"answer"`` (a sequence whose
            first element is the 0-based index of the correct choice) and
            ``solution_key``.
        solution_key: Key under which the attempt string is stored.
        verbose: When true, print the correct answer and the attempt.

    Returns:
        A dict with ``"verify"`` (the correct formatted choice occurs in the
        attempt) and ``"format_correct"`` (any formatted choice occurs in
        the attempt).
    """
    ces = row["ces"]
    formatted_ces = [f"{i + 1}: {ce}" for i, ce in enumerate(ces)]
    answer_index = row["answer"][0]
    correct_answer = formatted_ces[answer_index]
    attempt = row[solution_key]

    if verbose:
        print("-----------------------------------------------------------------")
        print("CORRECT: ", correct_answer)
        print("ATTEMPT: ", attempt)
        print("CORRECT ANSWER: ", correct_answer, "\n")

    # Check if any of the formatted ces is part of the attempt string.
    # NOTE: this is substring matching, so "1: x" also matches inside "11: x".
    contains_ces = any(ce in attempt for ce in formatted_ces)

    return {"verify": correct_answer in attempt, "format_correct": contains_ces}


def dupcate_rows(ds, Q):
    """Duplicate each row ``Q`` times and return a new dataset.

    Copies of a row are adjacent in the result and the original row order
    is preserved.

    Args:
        ds: Dataset to duplicate; must expose ``column_names`` and yield
            one mapping per row when iterated.
        Q: Number of copies to emit per row.

    Returns:
        A new ``Dataset`` built from the duplicated column values.
    """
    # Initialize the column-oriented structure of the new dataset.
    new_data = {key: [] for key in ds.column_names}
    for row in ds:
        for _ in range(Q):  # Repeat each row Q times
            for key, value in row.items():
                new_data[key].append(value)  # Append duplicated values
    return Dataset.from_dict(new_data)