from pydantic import BaseModel

from engine.maps.base_map import CompletionsMap

# NOTE: The final line of the prompt (forcing a trailing "yes"/"no") was added
# so that model outputs are consistent enough for `URLClassifyMap.parse` to
# read the decision from the last word of the response.
URL_CLASSIFY_SYSTEM_PROMPT = """
You are tasked with filtering a list of domains to identify those most likely to contain educational content, specifically focusing on instruction materials such as exam problems, tutorials, or learning resources across various disciplines like math, science, and engineering.

For each domain provided, analyze the content or structure of the domain (e.g., keywords in the domain name, common subpages, and general website purpose) and classify it as either educational or non-educational. Prioritize domains that are likely to offer instructional data, exam problems, study guides, or teaching materials for educational purposes.

If a domain appears highly likely to belong to an academic institution, online learning platform, or a repository of educational resources, classify it as educational. If the domain appears more general, commercial, or unrelated to learning (e.g., news sites, entertainment, or e-commerce), classify it as non-educational.

The last word you send must be "yes" (educational) or "no" (non-educational)
"""


class URLClassifyMapConfig(BaseModel):
    """Configuration for :class:`URLClassifyMap`."""

    # Dataset column holding the URL/domain text sent as the user message.
    input_url_column: str
    # System message sent ahead of the URL; defaults to the module prompt.
    input_classify_system_message: str | None = URL_CLASSIFY_SYSTEM_PROMPT
    # Column that receives the boolean decision (True only for "yes").
    output_classify_decision_column: str = "url_classification"
    # Column that receives the raw, unparsed model response.
    output_classify_reasoning_column: str = "url_classification_full"
    # When True, rows whose decision is False are dropped by `parse`.
    filter_out_negative_classifications: bool = False


class URLClassifyMap(CompletionsMap):
    """
    Classifies whether a URL is educational or not.

    NOTE: This does NOT use structured output currently; the decision is read
    from the last word of the free-text model response.
    """

    def __init__(self, config: dict):
        """
        Validate the raw config mapping and store it on the instance.

        Args:
            config (dict): Keyword values for `URLClassifyMapConfig`.
        """
        # NOTE(review): the base-class initialiser is not invoked here;
        # confirm that CompletionsMap requires no initialisation of its own.
        self.config = URLClassifyMapConfig(**config)

    @property
    def response_format(self):
        """
        Returns:
            None, because this map does not request structured output.
        """
        return None

    def prompt(self, dataset_row: dict) -> list[dict]:
        """
        Builds the completion request for a single dataset row.

        The request holds two messages: the configured system message, followed
        by a user message whose content is the value of the configured URL
        column. Only one request is created per row.

        Args:
            dataset_row (dict): A dictionary representing a single row of the dataset.

        Returns:
            list[dict]: A list containing a single request body dictionary.

        Raises:
            KeyError: If the configured URL column is missing from the row.
        """
        messages = [
            {
                "role": "system",
                "content": self.config.input_classify_system_message,
            },
            {
                "role": "user",
                "content": dataset_row[self.config.input_url_column],
            },
        ]

        # Only a single request is created per row
        return [{"messages": messages}]

    def parse(self, response: str, dataset_row: dict) -> list[dict]:
        """
        Parses the text of a completions response for a given dataset row.

        The last whitespace-separated word of the response is lower-cased and
        stripped of non-alphabetic characters (so `Yes.` and `"no"` are
        accepted). The decision is True only when that word is "yes"; anything
        else, including an empty response, is treated as False and a warning
        is printed when the word is neither "yes" nor "no".

        The row is updated in place with the raw response and the decision.

        Args:
            response (str): Content of the assistant message.
            dataset_row (dict): The dataset row the response belongs to.

        Returns:
            list[dict]: A list containing the updated row, or an empty list
            when the decision is negative and
            `filter_out_negative_classifications` is enabled.
        """
        # Guard against empty / missing responses, which would otherwise make
        # the `[-1]` lookup fail.
        words = (response or "").strip().lower().split()
        last_word = words[-1] if words else ""

        # Drop punctuation and quotes surrounding the final word.
        decision_word = "".join(char for char in last_word if char.isalpha())
        decision = decision_word == "yes"

        # Update the dataset row with the raw response and the decision
        dataset_row[self.config.output_classify_reasoning_column] = response
        dataset_row[self.config.output_classify_decision_column] = decision

        # Print a warning if the decision word is not "yes" or "no"
        if decision_word not in ("yes", "no"):
            print(f"WARNING: Defaulting to False for classification '{decision_word}'")

        # Keep the row if the decision is positive or if negatives are not filtered
        if decision or not self.config.filter_out_negative_classifications:
            return [dataset_row]

        # Filtered out: return an empty list to honour the declared return type.
        return []