"""Download PDFs, extract their text page by page, and publish it as a dataset.

The module can be used in two ways:

* ``main()`` with no arguments runs the built-in script, which processes
  ``BAD_PDF_URLS`` sequentially and pushes the result to the Hugging Face Hub.
* ``main(pdf_sources)`` / ``process_pdf_sources(pdf_sources)`` processes an
  arbitrary mix of URLs, local PDF files and local folders concurrently.
"""

import os
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed

import PyPDF2
import requests
from datasets import Dataset
from tqdm import tqdm

# PDF URLs that are known to be usable. The list is not consumed by main()
# at the moment; it is kept as reference data.
# NOTE(review): several entries look like they lost characters at some point
# (e.g. "Engsh", "pubc", "Appcations") — verify against the real URLs before
# relying on them. They are reproduced here exactly as found.
GOOD_PDF_URLS = [
    "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2010.pdf",
    "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2009.pdf",
    "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2008.pdf",
    "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2007.pdf",
    "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/chem-13-news-solution-2011.pdf",
    "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/chem-13-news-exam-solution-2010.pdf",
    "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/chem-13-news-exam-solution-2009.pdf",
    "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/chem-13-news-exam-solution-2008.pdf",
    "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/chem-13-news-exam-solution-2007.pdf",
    "https://www.press.muni.cz/media/3019066/answers_to_all_questions.pdf",
    "https://pubc.w.edu/~thorglab/biol301/exams/finalkey.pdf",
    "https://people.bu.edu/msoren/BI515_2014/Exam1key.pdf",
    "https://facultystaff.richmond.edu/~lrunyenj/bio201/04bio201%20exam%201%20key.pdf",
    "https://www.usabo-trc.org/sites/default/files/images/pdf/exams/semifinal-answers/2011-semifinal-answers.pdf",
    "https://sites.science.oregonstate.edu/chemistry/courses/ch411/restrict2/ch411%20F11%20final%20key%20.pdf",
    "https://www.mit.edu/~anugrah/files/MockIChOSolutions.pdf",
    "https://www.mit.edu/~anugrah/files/2012CChOLocalSoln.pdf",
    "https://www.cheminst.ca/wp-content/uploads/2022/11/CCC-PtA-2022-ENG-final-ANSWERS.pdf",
    "https://www.cheminst.ca/wp-content/uploads/2022/12/Canadian-Chemistry-Olympiad-2022-EN_key.pdf",
    "https://www.cheminst.ca/wp-content/uploads/2022/01/CCC-PtA-2021-ENG-ANSWERS.pdf",
    "https://www.cheminst.ca/wp-content/uploads/2021/01/CCC-PtA-2020-ENG-ANSWERS-revised-COVID19.pdf",
    "https://www.cheminst.ca/wp-content/uploads/2019/11/CCC-2019-PtA-answers.pdf",
    "https://www.cheminst.ca/wp-content/uploads/2019/11/CCC-2017-PtA-answers-EN.pdf",
    "https://www.cheminst.ca/wp-content/uploads/2019/11/CCC-2018-PtA-answers-EN.pdf",
    "https://www.cheminst.ca/wp-content/uploads/2019/11/CCC-2016-PtA-answers-EN.pdf",
    "https://www.cheminst.ca/wp-content/uploads/2019/11/CCC-2015-PtA-answers-EN.pdf",
    "https://www.cheminst.ca/wp-content/uploads/2019/11/CCC-2014-PtA-answers-EN.pdf",
    "https://www.ttcho.com/_files/ugd/988b76_e3d21fc42bb24de4959d7250f0cfcb3a.pdf",
    "https://www.ttcho.com/_files/ugd/988b76_01ceeff230b24cbbb0125b2bfa3f3475.pdf",
    "https://www.ttcho.com/_files/ugd/988b76_48944f6ace684143bfdc9080fca59862.pdf",
    "https://www.ttcho.com/_files/ugd/988b76_ba0cb3177d05436da273a400a037ed01.pdf",
    "http://chemistryrace.com/wp-content/uploads/2025/02/ChR_2025_answer_booklet.pdf",
    "http://chemistryrace.com/wp-content/uploads/2024/02/ChR_2024_answer_booklet.pdf",
    "http://chemistryrace.com/wp-content/uploads/2023/02/chemistry_race_2023_answers-book.pdf",
    "http://chemistryrace.com/wp-content/uploads/2022/02/cambridge_chemistry_race_2022_answers.pdf",
    "http://chemistryrace.com/wp-content/uploads/2021/02/Chemistry_Race_2021_Questions_and_Solutions.pdf",
    "https://chemistryrace.soc.srcf.net/wp-content/uploads/2020/02/Chemistry_Race_2020_Questions_and_Solutions.pdf",
    "https://biolympiads.com/wp-content/uploads/2015/01/2003_OpenExam_AnswerKey2.pdf",
    "http://biolympiads.com/wp-content/uploads/2015/01/2004_OpenExam_AnswerKey4.pdf",
    "http://biolympiads.com/wp-content/uploads/2015/01/2005_OpenExam_AnswerKey1.pdf",
    "http://biolympiads.com/wp-content/uploads/2015/01/2006_OpenExam_AnsKey2.pdf",
    "http://biolympiads.com/wp-content/uploads/2015/01/2007_OpenExam_AnsKey1.pdf",
    "http://biolympiads.com/wp-content/uploads/2015/01/2008_OpenExam_AnsKey1.pdf",
    "http://biolympiads.com/wp-content/uploads/2015/01/2009_OpenExam_AnsKey1.pdf",
    "http://biolympiads.com/wp-content/uploads/2015/01/2010_OpenExam_AnsKey3.pdf",
    "http://biolympiads.com/wp-content/uploads/2015/01/2011_OpenExam_AnsKey.pdf",
    "http://biolympiads.com/wp-content/uploads/2015/01/2012_OpenExam_AnsKey2.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2024/02/ASOE-Chemistry-2023-Exam-Paper-with-Answers.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2023/05/ASOE-Chemistry-2022-ASDAN-answers.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2023/05/ASOE_Chemistry_2021_answers_reduced-FS.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2020-asoe-chemistry-exam-answers.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2019-asoe-chemistry-exam-answers.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2018-asoe-chemistry-exam-answers.pdf",
    "https://www.andrews.edu/~rwright/physics/OpenStax%20Physics-Student%20Solution%20Manual.pdf",
    "https://reanphysics.wordpress.com/wp-content/uploads/2018/11/raymond_a-_serway__w-_jewett_student_solutibookzz-org.pdf",
    "https://carrollscaveofknowledge.weebly.com/uploads/2/0/2/8/20287891/physics_11_regular_year_solutions_manual.pdf",
    "https://doctor2019.jumedicine.com/wp-content/uploads/sites/10/2019/09/Gianco-Physics-Principles-With-Appcations-7th-c2014-solutions-ISM.pdf",
    "https://ia801305.us.archive.org/8/items/ProblemsInCalculusOfOneVariableI.A.Maron/Problems%20in%20Calculus%20of%20One%20Variable%20-%20I.%20A.%20Maron.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2017-asoe-chemistry-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2016-asoe-chemistry-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2015-asoe-chemistry-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2014-asoe-chemistry-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2013-asoe-chemistry-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2012-asoe-chemistry-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2011-asoe-chemistry-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2010-asoe-chemistry-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2009-asoe-chemistry-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2008-asoe-chemistry-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2007-asoe-chemistry-exam.pdf",
    "https://www.arml.com/ARML/arml_2019/pubc_contest_files/2023_contest_file/ARML_2023Contest.pdf",
    "https://jeeadv.ac.in/past_qps/2007_1.pdf",
    "https://jeeadv.ac.in/past_qps/2007_2.pdf",
    "https://jeeadv.ac.in/past_qps/2008_1.pdf",
    "https://jeeadv.ac.in/past_qps/2008_2.pdf",
    "https://jeeadv.ac.in/past_qps/2009_1.pdf",
    "https://jeeadv.ac.in/past_qps/2009_2.pdf",
    "https://jeeadv.ac.in/past_qps/2010_1.pdf",
    "https://jeeadv.ac.in/past_qps/2010_2.pdf",
    "https://jeeadv.ac.in/past_qps/2011_1.pdf",
    "https://jeeadv.ac.in/past_qps/2011_2.pdf",
    "https://jeeadv.ac.in/past_qps/2012_1.pdf",
    "https://jeeadv.ac.in/past_qps/2012_2.pdf",
    "https://jeeadv.ac.in/past_qps/2013_1.pdf",
    "https://jeeadv.ac.in/past_qps/2013_2.pdf",
    "https://jeeadv.ac.in/past_qps/2014_1.pdf",
    "https://jeeadv.ac.in/past_qps/2014_2.pdf",
    "https://jeeadv.ac.in/past_qps/2015_1.pdf",
    "https://jeeadv.ac.in/past_qps/2015_2.pdf",
    "https://jeeadv.ac.in/past_qps/2017_1.pdf",
    "https://jeeadv.ac.in/past_qps/2017_2.pdf",
    "https://jeeadv.ac.in/past_qps/2018_1.pdf",
    "https://jeeadv.ac.in/past_qps/2018_2.pdf",
    "https://jeeadv.ac.in/past_qps/2019_1_Engsh.pdf",
    "https://jeeadv.ac.in/past_qps/2019_2_Engsh.pdf",
    "https://jeeadv.ac.in/past_qps/2020_1_Engsh.pdf",
    "https://jeeadv.ac.in/past_qps/2020_2_Engsh.pdf",
    "https://jeeadv.ac.in/past_qps/2021_1_Engsh.pdf",
    "https://jeeadv.ac.in/past_qps/2021_2_Engsh.pdf",
    "https://jeeadv.ac.in/past_qps/2022_1_Engsh.pdf",
    "https://jeeadv.ac.in/past_qps/2022_2_Engsh.pdf",
    "https://jeeadv.ac.in/past_qps/2023_1_Engsh.pdf",
    "https://jeeadv.ac.in/past_qps/2023_2_Engsh.pdf",
    "https://jeeadv.ac.in/past_qps/2024_1_Engsh.pdf",
    "https://jeeadv.ac.in/past_qps/2024_2_Engsh.pdf",
    "https://ia600608.us.archive.org/19/items/IrodovProblemsInGeneralPhysics/Irodov-Problems_in_General_Physics.pdf",
    "https://ia601305.us.archive.org/8/items/ProblemsInCalculusOfOneVariableI.A.Maron/Problems%20in%20Calculus%20of%20One%20Variable%20-%20I.%20A.%20Maron.pdf",
    "https://blogmedia.testbook.com/kmat-kerala/wp-content/uploads/2023/06/physical-chemistry-by-p-bahadur-5113ed32.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2023-ASOE-Physics-Past-Paper-ASI.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2022-ASOE-Physics.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2021-ASOE-Physics.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2020-asoe-physics-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2019-asoe-physics-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2017-asoe-physics-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2016-asoe-physics-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2015-asoe-physics-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2014-asoe-physics-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2013-asoe-physics-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2012-asoe-physics-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2011-asoe-physics-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2010-Physics-NQE-paper-FINAL.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2010-Physics-NQE-paper-FINAL.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2008-asoe-physics-exam.pdf",
    "https://www.asi.edu.au/wp-content/uploads/2022/12/2007-asoe-physics-exam.pdf",
    "http://algorithmics.lsi.upc.edu/docs/Dasgupta-Papadimitriou-Vazirani.pdf",
    "https://biocomp.utoronto.ca/files/2024/05/Exam2024.pdf",
    "https://biocomp.utoronto.ca/files/2023/10/biocomp-exam-2019.pdf",
    "https://biocomp.utoronto.ca/files/2023/10/biocomp-exam-2018_0.pdf",
    "https://biocomp.utoronto.ca/files/2023/10/biocomp-exam-2017.pdf",
    "https://biocomp.utoronto.ca/files/2023/10/biocomp-exam-2016.pdf",
    "https://biocomp.utoronto.ca/files/2023/10/biocomp-exam-2015.pdf",
    "https://www.andrews.edu/~rwright/physics/OpenStax%20Physics-Student%20Solution%20Manual.pdf",
    "https://reanphysics.wordpress.com/wp-content/uploads/2018/11/raymond_a-_serway__w-_jewett_student_solutibookzz-org.pdf",
    "https://carrollscaveofknowledge.weebly.com/uploads/2/0/2/8/20287891/physics_11_regular_year_solutions_manual.pdf",
    "https://doctor2019.jumedicine.com/wp-content/uploads/sites/10/2019/09/Gianco-Physics-Principles-With-Appcations-7th-c2014-solutions-ISM.pdf",
    "http://b.y.am/discipnes_bk/ea4b336028cd91ba7265865d8fde153c.pdf",
    "https://www.sfcollege.edu/_media/Assets/sf/placement/files/Chemistry%20Practice%20Placement%20Exam.pdf",
    "https://www.pvamu.edu/chemistry/wp-content/uploads/sites/26/_01_pract_Test.pdf",
    "https://sccollege.edu/students/studentservices/counseng/Shared%20Documents/Chem%20Placement%20Practice%20Exam%202023.pdf",
    "https://www.csn.edu/sites/default/files/documents/department-documents/department-of-physical-sciences/chemplacementtestpractice.pdf",
    "https://www.mc3.edu/admissions/applying-to-mccc/testing-and-assessment/assets/biology-placement-test.pdf",
]

# Sources processed by the no-argument main().
# NOTE(review): this entry is a local path, not a URL, yet main() hands it to
# download_pdf_to_temp(); the download is expected to fail for it — confirm
# whether a real URL (or the local-file code path) was intended.
BAD_PDF_URLS = ["~/Downloads/0000"]


def download_pdf_to_temp(pdf_url, timeout=60):
    """
    Downloads a PDF from a URL to a temporary file and returns its path.

    Args:
        pdf_url: URL of the PDF to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``; pass
            ``None`` to wait indefinitely.

    Returns:
        Path of the temporary ``.pdf`` file. The caller owns the file and is
        responsible for deleting it.

    Raises:
        requests.RequestException: If the request fails or the server answers
            with an error status. The partially written temp file is removed
            before the exception propagates.
    """
    tmp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        # Closing the file (leaving the ``with``) flushes it, and lets the
        # cleanup below delete it on platforms that lock open files.
        with tmp_file:
            with requests.get(pdf_url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                for chunk in response.iter_content(chunk_size=8192):
                    tmp_file.write(chunk)
    except BaseException:
        # delete=False means nobody else will clean up a failed download.
        try:
            os.remove(tmp_file.name)
        except OSError:
            pass
        raise
    return tmp_file.name


def extract_text_per_page(pdf_filename):
    """
    Extracts text from each page of a PDF file.

    Extraction is best-effort: if reading fails part-way through, the error
    is printed and the pages extracted so far are returned.

    Args:
        pdf_filename: Path of a local PDF file.

    Returns:
        A list of tuples ``(page_number, text)`` with 1-based page numbers.
        Pages for which no text is extracted yield an empty string.
    """
    texts = []
    try:
        with open(pdf_filename, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            for page_num, page in enumerate(reader.pages, start=1):
                text = page.extract_text()
                texts.append((page_num, text or ""))
    except Exception as e:
        print(f"Error extracting text from {pdf_filename}: {e}")
    return texts


def process_pdf(pdf_path, source, rows):
    """
    Processes a PDF file (URL or local), extracts text, and stores it in rows.

    Failures are printed rather than raised, so one bad PDF does not abort a
    batch.

    Args:
        pdf_path: URL or local filesystem path of the PDF.
        source: ``"url"`` to download ``pdf_path`` first; any other value
            treats ``pdf_path`` as a local file.
        rows: List that receives one dict per page with the keys
            ``pdf_path``, ``source``, ``page`` and ``text``.
    """
    temp_pdf = None
    try:
        # Keep pdf_path untouched so the rows and the error message refer to
        # the original URL, not to a temp file that is deleted below.
        local_path = pdf_path
        if source == "url":
            temp_pdf = download_pdf_to_temp(pdf_path)
            local_path = temp_pdf

        # Extract text per page
        page_texts = extract_text_per_page(local_path)

        # Append results to the shared rows list
        for page_number, text in page_texts:
            rows.append(
                {
                    "pdf_path": pdf_path,
                    "source": source,
                    "page": page_number,
                    "text": text,
                }
            )
    except Exception as e:
        print(f"Failed to process {pdf_path}: {e}")
    finally:
        # Cleanup temp file if used
        if temp_pdf and os.path.exists(temp_pdf):
            os.remove(temp_pdf)


def process_pdf_sources(pdf_sources):
    """
    Processes a list of PDF sources (URLs or local paths) concurrently.

    Each source is classified as a URL (starts with ``"http"``), a folder
    (every ``*.pdf`` directly inside it is used) or a single ``.pdf`` file;
    anything else is ignored. The extracted pages are turned into a Hugging
    Face dataset that is then pushed to the Hub.

    Args:
        pdf_sources: Iterable of URL / folder / file path strings.
    """
    # Collect all PDF paths
    pdf_list = []
    for source in pdf_sources:
        if source.startswith("http"):  # URL
            pdf_list.append((source, "url"))
        elif os.path.isdir(source):  # Local folder
            for file in os.listdir(source):
                if file.endswith(".pdf"):
                    pdf_list.append((os.path.join(source, file), "local"))
        elif os.path.isfile(source) and source.endswith(".pdf"):  # Single PDF file
            pdf_list.append((source, "local"))

    print(f"Found {len(pdf_list)} PDFs to process.")

    # Shared dataset rows; every worker appends to this list, so the order of
    # rows across different PDFs depends on scheduling.
    rows = []

    # Process PDFs concurrently
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        future_to_pdf = {
            executor.submit(process_pdf, pdf, src, rows): pdf
            for pdf, src in pdf_list
        }
        for future in tqdm(
            as_completed(future_to_pdf),
            total=len(future_to_pdf),
            desc="Processing PDFs",
        ):
            # process_pdf handles ordinary errors itself; result() re-raises
            # anything that still escaped the worker.
            future.result()

    # Create Hugging Face dataset
    hf_dataset = Dataset.from_list(rows)
    print(f"Dataset created with {len(rows)} rows.")

    # Push to Hugging Face Hub
    try:
        hf_dataset.push_to_hub("your-username/your-dataset-name")
        print("Dataset successfully pushed to Hugging Face Hub!")
    except Exception as e:
        print("Failed to push dataset:", e)


def main(pdf_sources=None):
    """
    Entry point of the script.

    Args:
        pdf_sources: Optional iterable of URLs / folders / PDF files. When
            given, the work is delegated to ``process_pdf_sources``. When
            omitted, ``BAD_PDF_URLS`` is processed sequentially and the
            result is pushed to the ``"s/bad_pdf_text"`` Hub repository.
    """
    if pdf_sources is not None:
        process_pdf_sources(pdf_sources)
        return

    # This will hold our dataset rows
    rows = {"pdf_url": [], "page": [], "text": []}

    for pdf_url in BAD_PDF_URLS:
        print(f"Processing {pdf_url}")
        # Reset per iteration so the cleanup never sees an unbound name or a
        # stale path from the previous URL when the download fails.
        temp_pdf = None
        try:
            # Download PDF to a temporary file
            temp_pdf = download_pdf_to_temp(pdf_url)

            # Extract text from each page
            page_texts = extract_text_per_page(temp_pdf)

            # Create a row in the dataset for each page
            for page_number, text in page_texts:
                rows["pdf_url"].append(pdf_url)
                rows["page"].append(page_number)
                rows["text"].append(text)
        except Exception as e:
            print(f"Error processing {pdf_url}: {e}")
        finally:
            # Always remove the temporary file if it exists
            if temp_pdf and os.path.exists(temp_pdf):
                os.remove(temp_pdf)

    # Create a Hugging Face dataset from the collected rows
    hf_dataset = Dataset.from_dict(rows)
    print("Dataset created with", hf_dataset.num_rows, "rows.")

    # Push the dataset to the Hugging Face Hub.
    # Replace the repo id below with your desired "username/dataset-name".
    try:
        hf_dataset.push_to_hub("s/bad_pdf_text")
        print("Dataset successfully pushed to the Hugging Face Hub!")
    except Exception as e:
        print("Failed to push dataset to the hub:", e)


if __name__ == "__main__":
    main()