"""Heuristically detect PDFs that look like English question/exam papers.

Each PDF is scored from regex-based "hard" heuristics (enumeration markers),
"soft" heuristics (question marks, blanks, key phrases) and a bonus for a run
of consecutive enumeration numbers.  PDFs with too many pages that are not
detected as English are rejected outright.
"""

import os
import re
from urllib.parse import urlparse

import fitz  # PyMuPDF
import langdetect
import requests
from tqdm import tqdm

# ----------------------------------------------------------------
# Hard heuristics: strong indicators of structured questions.
# Each match contributes `weight` points to the hard score.
# NOTE: `min_count` is stored here but is not consulted by analyze_pdf.
# ----------------------------------------------------------------
HARD_HEURISTICS = {
    "enum_num_dot_paren": {"pattern": r"\b\d+\.\)", "min_count": 10, "weight": 3},
    "enum_num_dot_space": {"pattern": r"\b\d+\.\s+", "min_count": 10, "weight": 3},
    "enum_num_paren": {"pattern": r"\b\d+\)\s+", "min_count": 10, "weight": 3},
    "enum_letter_dot_space": {"pattern": r"\b[A-Z]\.\s+", "min_count": 10, "weight": 2},
}

# Length of the run of consecutive enumeration numbers that earns the bonus.
SEQUENTIAL_ENUM_REQUIRED = 10

# ----------------------------------------------------------------
# Soft heuristics: flexible indicators (question marks, key phrases).
# These patterns are applied to lower-cased page text.
# ----------------------------------------------------------------
SOFT_HEURISTICS = {
    "fill_in_blanks": {"pattern": r"_{3,}", "weight": 2},
    "fill_in_the_blank_phrase": {"pattern": r"fill in the blank", "weight": 3},
    "question_marks": {"pattern": r"\?", "weight": 1},
    "parenthesized_question": {"pattern": r"\(.*\?\)", "weight": 2},
    "longform_words": {
        "pattern": r"\b(explain|discuss|compare|analyze|justify|solution)\b",
        "weight": 2,
    },
    "question_word": {"pattern": r"\bquestion\b", "weight": 1},
}

SOFT_THRESHOLD = 10  # Required combined score
TEMP_DOWNLOAD_FOLDER = "./temp_pdfs"  # Folder that downloaded PDFs are written to

# Matches a number immediately followed by "." or ")" (candidate enumerator).
_ENUM_NUMBER_RE = re.compile(r"\b(\d+)[\.\)]")


def is_text_engsh(text, min_confidence=0.9):
    """Return True if langdetect reports English for *text*.

    Args:
        text: Text to classify.
        min_confidence: Minimum probability langdetect must assign to "en".

    Returns:
        True when one of the detected languages is "en" with probability of at
        least ``min_confidence``; False otherwise, including when detection
        raises ``LangDetectException`` (treated as non-English).
    """
    try:
        candidates = langdetect.detect_langs(text)
    except langdetect.lang_detect_exception.LangDetectException:
        return False  # Assume non-English if detection fails
    return any(c.lang == "en" and c.prob >= min_confidence for c in candidates)


def download_pdf(url, save_folder, timeout=30):
    """Download *url* into *save_folder* and return the local file path.

    Args:
        url: URL of the PDF.
        save_folder: Destination directory; created if it does not exist.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the whole batch.

    Returns:
        Path of the written file, or None if the request failed.
    """
    # Use only the path component so a query string or fragment never ends up
    # in the file name; fall back to a fixed name for URLs ending in "/".
    name = os.path.basename(urlparse(url).path) or "download.pdf"
    filename = os.path.join(save_folder, name)
    os.makedirs(save_folder, exist_ok=True)

    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(filename, "wb") as pdf_file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        pdf_file.write(chunk)
    except requests.RequestException as e:
        print(f"❌ Failed to download {url}: {e}")
        return None
    return filename


def has_sequential_enumeration(numbers, required_length=SEQUENTIAL_ENUM_REQUIRED):
    """Check whether *numbers* contains a run of consecutive integers.

    A run is a stretch of adjacent elements where each one equals the previous
    one plus 1 (e.g. 1, 2, 3, ..., 10).

    Args:
        numbers: List of integers in document order.
        required_length: Minimum run length that counts as a hit.

    Returns:
        True if some run is at least ``required_length`` long, else False.
        An empty list always yields False.
    """
    if not numbers:
        return False

    current_run = 1
    # A single element is already a run of length 1.
    if current_run >= required_length:
        return True

    for previous, current in zip(numbers, numbers[1:]):
        if current == previous + 1:
            current_run += 1
            if current_run >= required_length:
                return True
        else:
            current_run = 1
    return False


def analyze_pdf(pdf_path, chunk_size=10, soft_threshold=SOFT_THRESHOLD):
    """Score a PDF and report whether it passes the threshold.

    The combined score is the sum of:
      - hard heuristics (match count * weight for each enumeration pattern),
      - soft heuristics (match count * weight for each soft pattern),
      - a bonus of 5 if the enumeration numbers contain a long enough
        consecutive run.

    The PDF is rejected before scoring if more than 5% of its pages are not
    detected as English (per-page confidence 0.8).

    Args:
        pdf_path: Path of the PDF file.
        chunk_size: Accepted for backward compatibility only; pages are
            processed one at a time regardless of its value.
        soft_threshold: Minimum combined score required to pass.

    Returns:
        True if the PDF passes the language check and its combined score is at
        least ``soft_threshold``; False otherwise (including when the file
        cannot be opened).
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"❌ Error opening PDF '{pdf_path}': {e}")
        return False

    # Feature tracking
    hard_counts = dict.fromkeys(HARD_HEURISTICS, 0)
    enumeration_numbers = []
    soft_score = 0
    non_english_pages = 0

    # try/finally so the document is closed even if text extraction raises.
    try:
        n_pages = doc.page_count
        for page_index in range(n_pages):
            text = doc[page_index].get_text()
            text_lower = text.lower()

            if not is_text_engsh(text, 0.8):
                non_english_pages += 1

            # Count occurrences of hard heuristics
            for key, settings in HARD_HEURISTICS.items():
                matches = re.findall(settings["pattern"], text, flags=re.IGNORECASE)
                hard_counts[key] += len(matches)

            # Extract numbers for the sequential check
            for match in _ENUM_NUMBER_RE.finditer(text):
                try:
                    enumeration_numbers.append(int(match.group(1)))
                except ValueError:
                    continue

            # Accumulate soft heuristic score
            for settings in SOFT_HEURISTICS.values():
                matches = re.findall(
                    settings["pattern"], text_lower, flags=re.IGNORECASE
                )
                soft_score += len(matches) * settings["weight"]
    finally:
        doc.close()

    # Reject PDFs with too much non-English content
    if non_english_pages / max(1, n_pages) > 0.05:
        print(f"❌ PDF rejected (not enough English content): {pdf_path}")
        return False

    # Calculate final heuristic score
    total_hard_score = sum(
        hard_counts[key] * HARD_HEURISTICS[key]["weight"] for key in HARD_HEURISTICS
    )
    sequential_bonus = 5 if has_sequential_enumeration(enumeration_numbers) else 0
    total_score = total_hard_score + soft_score + sequential_bonus

    print(f"📊 PDF Analysis Summary for {pdf_path}:")
    print(f" 🔹 Hard Score: {total_hard_score}")
    print(f" 🔹 Soft Score: {soft_score:.2f}")
    print(f" 🔹 Sequential Enumeration Bonus: {sequential_bonus}")
    print(f" 🔹 **Final Score**: {total_score}\n")

    return total_score >= soft_threshold


def process_pdfs(urls=None, folder_path=None):
    """Analyze PDFs from a list of URLs and/or a local folder.

    Prints the URL or file name of every PDF that passes ``analyze_pdf``.

    Args:
        urls: Optional iterable of PDF URLs; each is downloaded into
            ``TEMP_DOWNLOAD_FOLDER`` before analysis.
        folder_path: Optional directory whose ``*.pdf`` files are analyzed.
    """
    if not urls and not folder_path:
        print("⚠️ No PDFs provided. Provide URLs or a folder path.")
        return

    # Process PDFs from URLs
    if urls:
        print("\n🔄 Processing PDFs from URLs...")
        for url in tqdm(urls, desc="Downloading & Analyzing"):
            pdf_path = download_pdf(url, TEMP_DOWNLOAD_FOLDER)
            if pdf_path and analyze_pdf(pdf_path):
                print(f"✅ **Good PDF Found (URL):** {url}")

    # Process PDFs from local folder
    if folder_path:
        print("\n🔄 Processing PDFs from Local Folder...")
        # isdir (not exists): a path that points at a file cannot be listed.
        if not os.path.isdir(folder_path):
            print(f"⚠️ Folder not found: {folder_path}")
            return

        for filename in tqdm(os.listdir(folder_path), desc="Analyzing Local PDFs"):
            if filename.lower().endswith(".pdf"):
                pdf_path = os.path.join(folder_path, filename)
                if analyze_pdf(pdf_path):
                    print(f"✅ **Good PDF Found (Local):** {filename}")


# Main execution
if __name__ == "__main__":
    urls = [
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2010.pdf",
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2009.pdf",
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2008.pdf",
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2007.pdf",
        "https://www.mit.edu/~anugrah/files/MockIChOSolutions.pdf",
        "https://www.mit.edu/~anugrah/files/2012CChOLocalSoln.pdf",
        "https://jeeadv.ac.in/past_qps/2024_1_English.pdf",
        "https://jeeadv.ac.in/past_qps/2024_2_English.pdf",
    ]
    folder_path = os.path.expanduser("~/Downloads/0000")
    process_pdfs(urls=urls, folder_path=folder_path)