{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "7d301cb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "import requests, pathlib, re, urllib.parse\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "ffa5232d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search API query (English publications, 2020-2025, newest first, 10 per page).\n",
    "# `{page_idx}` is substituted with the zero-based page number when crawling.\n",
    "html_url_template = (\n",
    "    'https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search'\n",
    "    '?siteName=oecd'\n",
    "    '&interfaceLanguage=en'\n",
    "    '&orderBy=mostRecent'\n",
    "    '&page={page_idx}'\n",
    "    '&pageSize=10'\n",
    "    '&hiddenFacets=oecd-search-config-pillars%3Apublications'\n",
    "    '&facets=oecd-serials%3Ag1ghghge'\n",
    "    '&facets=oecd-languages%3Aen'\n",
    "    '&minPublicationYear=2020'\n",
    "    '&maxPublicationYear=2025'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "93bd6013",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Zero-based result pages to request (12 pages of up to 10 hits each).\n",
    "page_idxes = list(range(12))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "f6405b99",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=0&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-languages%3Aen&minPublicationYear=2020&maxPublicationYear=2025\n",
      "Found 10 urls in page 0\n",
      "Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=1&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-languages%3Aen&minPublicationYear=2020&maxPublicationYear=2025\n",
      "Found 10 urls in page 1\n",
      "Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=2&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-languages%3Aen&minPublicationYear=2020&maxPublicationYear=2025\n",
      "Found 10 urls in page 2\n",
      "Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=3&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-languages%3Aen&minPublicationYear=2020&maxPublicationYear=2025\n",
      "Found 10 urls in page 3\n",
      "Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=4&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-languages%3Aen&minPublicationYear=2020&maxPublicationYear=2025\n",
      "Found 10 urls in page 4\n",
      "Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=5&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-languages%3Aen&minPublicationYear=2020&maxPublicationYear=2025\n",
      "Found 10 urls in page 5\n",
      "Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=6&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-languages%3Aen&minPublicationYear=2020&maxPublicationYear=2025\n",
      "Found 10 urls in page 6\n",
      "Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=7&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-languages%3Aen&minPublicationYear=2020&maxPublicationYear=2025\n",
      "Found 10 urls in page 7\n",
      "Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=8&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-languages%3Aen&minPublicationYear=2020&maxPublicationYear=2025\n",
      "Found 10 urls in page 8\n",
      "Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=9&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-languages%3Aen&minPublicationYear=2020&maxPublicationYear=2025\n",
      "Found 10 urls in page 9\n",
      "Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=10&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-languages%3Aen&minPublicationYear=2020&maxPublicationYear=2025\n",
      "Found 10 urls in page 10\n",
      "Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=11&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-languages%3Aen&minPublicationYear=2020&maxPublicationYear=2025\n",
      "Found 4 urls in page 11\n",
      "Found 114 urls in total\n"
     ]
    }
   ],
   "source": [
    "# Walk every search-result page and collect, in matching order, the\n",
    "# publication page URL and the title of each hit.\n",
    "pdf_html_list = []  # publication page URLs (HTML pages, not the PDFs themselves)\n",
    "report_list = []    # report titles, index-aligned with pdf_html_list\n",
    "\n",
    "for idx in page_idxes:\n",
    "    html_url = html_url_template.format(page_idx=idx)\n",
    "    print('Crawling html page from', html_url)\n",
    "    # A timeout keeps a stalled connection from hanging the notebook, and\n",
    "    # raise_for_status surfaces HTTP errors before the body is decoded as JSON.\n",
    "    response = requests.get(html_url, timeout=30)\n",
    "    response.raise_for_status()\n",
    "    web_json = response.json()\n",
    "\n",
    "    results = web_json['results']\n",
    "    url_list = [hit['url'] for hit in results]\n",
    "    report_name_list = [hit['title'] for hit in results]\n",
    "    pdf_html_list.extend(url_list)\n",
    "    report_list.extend(report_name_list)\n",
    "    print('Found', len(url_list), 'urls in page', idx)\n",
    "\n",
    "print('Found', len(pdf_html_list), 'urls in total')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "b574d42d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.oecd.org/en/publications/oecd-economic-surveys-luxembourg-2025_803b3ea1-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-turkiye-2025_d01c660f-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-israel-2025_d6dd02bc-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-lithuania-2025_4abf1ea5-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-costa-rica-2025_048cf07b-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-czechia-2025_7a70af5c-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-ireland-2025_9a368560-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-poland-2025_483d3bb9-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-chile-2025_efad96ce-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-greece-2024_a35a56b6-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-indonesia-2024_de87555a-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-morocco-2024_80777ea7-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-belgium-2024_c671124e-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-colombia-2024_a1a22cd6-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-united-kingdom-2024_709e70b8-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-malaysia-2024_e45ca31a-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-korea-2024_c243e16a-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-france-2024_bd96e2ed-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-austria-2024_60ea1561-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-slovenia-2024_bc4a107b-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-united-states-2024_cdfff156-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-norway-2024_cb13475f-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-estonia-2024_33e6beee-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-new-zealand-2024_603809f2-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-latvia-2024_dfeae75b-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-slovak-republic-2024_397ca086-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-switzerland-2024_070d119b-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-romania-2024_106b32c4-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-hungary-2024_795451e5-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-mexico-2024_b8d974db-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-egypt-2024_af900de2-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-italy-2024_78add673-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-japan-2024_41e807f9-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-denmark-2024_d5c6f307-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-brazil-2023_a2d6acac-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-thailand-2023_4815cb4b-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-australia-2023_1794a7c9-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-spain-2023_5b50cc51-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-peru-2023_081e0906-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-croatia-2023_4f945053-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-european-union-and-euro-area-2023_7ebe8cc3-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-netherlands-2023_dbda2baf-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-iceland-2023_b3880f1a-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-portugal-2023_2b8ee40a-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-sweden-2023_ceed5fd4-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-germany-2023_9642a3f5-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-viet-nam-2023_8f2a6ecb-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-bulgaria-2023_5ca812a4-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-israel-2023_901365a6-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-czech-republic-2023_e392e937-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-canada-2023_7eb16f83-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-poland-2023_6fc99a4b-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-turkiye-2023_864ab2ba-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-costa-rica-2023_8e8171b0-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-greece-2023_c5f11cd5-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-finland-2022_516252a7-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-ireland-2022_46a6ea85-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-luxembourg-2022_9409d9b6-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-lithuania-2022_0829329f-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-united-states-2022_eeb7cbe9-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-chile-2022_311ec37e-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-korea-2022_20bf3d6e-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-south-africa-2022_d6a7301d-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-united-kingdom-2022_7c0f1268-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-slovenia-2022_d63f5a2f-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-estonia-2022_21ef46e4-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-belgium-2022_01c0a8f0-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-tunisia-2022_7f9459cf-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-china-2022_b0e499cf-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-latvia-2022_c0113448-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-norway-2022_df7b87ab-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-mexico-2022_2e1de26c-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-colombia-2022_04bf9377-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-new-zealand-2022_a4fd214c-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-romania-2022_e2174606-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-switzerland-2022_1fde6924-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-slovak-republic-2022_78ef10f8-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-austria-2021_eaf9ec79-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-denmark-2021_86f7b2d9-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-portugal-2021_13b842d6-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-japan-2021_6b749602-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-france-2021_289a0a17-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-australia-2021_ce96b16a-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-european-union-2021_a77ab220-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-euro-area-2021_214e9f0a-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-italy-2021_07d8b9cd-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-survey-malaysia-2021_cc9499dd-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-hungary-2021_1d39d866-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-sweden-2021_f61d0a54-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-iceland-2021_c4edf686-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-netherlands-2021_dd476bd3-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-spain-2021_79e92d88-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-indonesia-2021_fd7e6249-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-canada-2021_16e4abc0-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-chile-2021_79b39420-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-bulgaria-2021_1fe2940d-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-turkey-2021_2cd09ab1-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-brazil-2020_250240ad-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-finland-2020_673aeb7f-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-germany-2020_91973c69-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-czech-republic-2020_1b180a5a-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-poland-2020_0e32d909-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-lithuania-2020_62663b1d-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-united-kingdom-2020_2f684241-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-thailand-2020_ad2e50fa-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-israel-2020_d6a7d907-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-korea-2020_2dde9480-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-south-africa-2019_530e7ce0-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-greece-2020_b04b25de-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-slovenia-2020_a4209041-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-costa-rica-2020_2e0fea6c-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-united-states-2020_12323be9-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-ireland-2020_dec600f3-en.html',\n",
       " 'https://www.oecd.org/en/publications/oecd-economic-surveys-belgium-2020_1327040c-en.html']"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the collected publication page URLs.\n",
    "pdf_html_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "809b5eb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_oecd_pdf(page_url: str, report_name: str, dest_dir: str) -> pathlib.Path:\n",
    "    \"\"\"\n",
    "    Given an OECD publication page (HTML), fetch and save its companion PDF.\n",
    "\n",
    "    NOTE: a later cell defines `download_oecd_pdf` again (using a session with\n",
    "    browser-like headers); once that cell runs it replaces this definition.\n",
    "\n",
    "    Args:\n",
    "        page_url: URL of the publication's HTML page.\n",
    "        report_name: File name (without extension) to save the PDF under.\n",
    "        dest_dir: Directory the PDF is written into.\n",
    "\n",
    "    Returns:\n",
    "        pathlib.Path of the downloaded file.\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If the page or PDF request returns an error status.\n",
    "        RuntimeError: If no link ending in `.pdf` is found on the page.\n",
    "    \"\"\"\n",
    "    # 1 ── grab the HTML page\n",
    "    page_resp = requests.get(page_url, timeout=30)\n",
    "    page_resp.raise_for_status()\n",
    "\n",
    "    # 2 ── look for the first <a> tag whose href ends in .pdf\n",
    "    soup = BeautifulSoup(page_resp.text, \"html.parser\")\n",
    "    pdf_url = None\n",
    "    for a in soup.find_all(\"a\", href=True):\n",
    "        href = a[\"href\"]\n",
    "        if href.lower().endswith(\".pdf\"):\n",
    "            # href may be relative; resolve it against the page URL\n",
    "            pdf_url = urllib.parse.urljoin(page_url, href)\n",
    "            break\n",
    "\n",
    "    # fallback: regex search over the raw HTML (catches edge cases if the DOM structure changes)\n",
    "    if pdf_url is None:\n",
    "        m = re.search(r'href=[\"\\']([^\"\\']+\\.pdf)[\"\\']', page_resp.text, re.I)\n",
    "        if m:\n",
    "            pdf_url = urllib.parse.urljoin(page_url, m.group(1))\n",
    "\n",
    "    if pdf_url is None:\n",
    "        raise RuntimeError(\"No PDF link found on the page!\")\n",
    "\n",
    "    # 3 ── stream-download the PDF in 8 KiB chunks\n",
    "    file_path = os.path.join(dest_dir, report_name + \".pdf\")\n",
    "\n",
    "    with requests.get(pdf_url, stream=True, timeout=60) as r:\n",
    "        r.raise_for_status()\n",
    "        with open(file_path, \"wb\") as f:\n",
    "            for chunk in r.iter_content(chunk_size=8192):\n",
    "                f.write(chunk)\n",
    "\n",
    "    print(f\"Saved → {file_path}\")\n",
    "    # Return a Path, as the annotation and docstring promise (os.path.join gives a str).\n",
    "    return pathlib.Path(file_path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "a6dbeb4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Browser-like request headers applied to the download session.\n",
    "HEADERS = {\n",
    "    # Firefox 125 on 64-bit Windows 10; any recent browser UA string will do\n",
    "    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0\",\n",
    "    \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\",\n",
    "    \"Accept-Language\": \"en-US,en;q=0.5\",\n",
    "    \"Referer\": \"https://www.oecd.org/\",\n",
    "}\n",
    "\n",
    "def download_oecd_pdf(page_url: str, report_name: str, dest_dir: str) -> pathlib.Path:\n",
    "    \"\"\"Download the first PDF linked from an OECD publication page.\n",
    "\n",
    "    Both requests go through one session carrying the module-level HEADERS.\n",
    "\n",
    "    Args:\n",
    "        page_url: URL of the publication's HTML page.\n",
    "        report_name: File name (without extension) to save the PDF under.\n",
    "        dest_dir: Directory the PDF is written into.\n",
    "\n",
    "    Returns:\n",
    "        pathlib.Path of the downloaded file.\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If the page or PDF request returns an error status.\n",
    "        RuntimeError: If no link ending in `.pdf` is found on the page.\n",
    "    \"\"\"\n",
    "    with requests.Session() as s:\n",
    "        s.headers.update(HEADERS)\n",
    "\n",
    "        # 1 ── fetch the HTML\n",
    "        resp = s.get(page_url, timeout=30)\n",
    "        resp.raise_for_status()\n",
    "\n",
    "        # 2 ── parse out the PDF link (first <a> whose href ends in .pdf)\n",
    "        soup = BeautifulSoup(resp.text, \"html.parser\")\n",
    "        pdf_url = None\n",
    "        for a in soup.find_all(\"a\", href=True):\n",
    "            href = a[\"href\"]\n",
    "            if href.lower().endswith(\".pdf\"):\n",
    "                # href may be relative; resolve it against the page URL\n",
    "                pdf_url = urllib.parse.urljoin(page_url, href)\n",
    "                break\n",
    "        if not pdf_url:\n",
    "            # fallback regex over the raw HTML, in case the DOM structure changes\n",
    "            m = re.search(r'href=[\"\\']([^\"\\']+\\.pdf)[\"\\']', resp.text, re.I)\n",
    "            if m:\n",
    "                pdf_url = urllib.parse.urljoin(page_url, m.group(1))\n",
    "        if not pdf_url:\n",
    "            raise RuntimeError(\"No PDF link found on the page!\")\n",
    "\n",
    "        # 3 ── stream-download the PDF (pass Referer again)\n",
    "        out_path = os.path.join(dest_dir, report_name + \".pdf\")\n",
    "        # Write to a temporary name first so that a failed or interrupted\n",
    "        # download never leaves a truncated file under the final .pdf name.\n",
    "        tmp_path = out_path + \".part\"\n",
    "\n",
    "        try:\n",
    "            with s.get(pdf_url, stream=True, timeout=60, headers={\"Referer\": page_url}) as r:\n",
    "                r.raise_for_status()\n",
    "                with open(tmp_path, \"wb\") as f:\n",
    "                    for chunk in r.iter_content(chunk_size=8192):\n",
    "                        f.write(chunk)\n",
    "            os.replace(tmp_path, out_path)\n",
    "        finally:\n",
    "            # After a successful replace the temp file is gone; this only\n",
    "            # cleans up when something above raised.\n",
    "            if os.path.exists(tmp_path):\n",
    "                os.remove(tmp_path)\n",
    "\n",
    "    print(f\"✓  Saved → {out_path}\")\n",
    "    # Return a Path, as the annotation promises (os.path.join gives a str).\n",
    "    return pathlib.Path(out_path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "d0000e97",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed to download OECD_Economic_Surveys_Luxembourg_2025: 403 Client Error: Forbidden for url: https://www.oecd.org/en/publications/oecd-economic-surveys-luxembourg-2025_803b3ea1-en.html\n",
      "Failed to download OECD_Economic_Surveys_Türkiye_2025: 403 Client Error: Forbidden for url: https://www.oecd.org/en/publications/oecd-economic-surveys-turkiye-2025_d01c660f-en.html\n",
      "Failed to download OECD_Economic_Surveys_Israel_2025: 403 Client Error: Forbidden for url: https://www.oecd.org/en/publications/oecd-economic-surveys-israel-2025_d6dd02bc-en.html\n"
     ]
    }
   ],
   "source": [
    "save_dir = '/data/group_data/rag-robust-eval/data/economic/pdf_files'\n",
    "os.makedirs(save_dir, exist_ok=True)\n",
    "\n",
    "# Number of reports to attempt; slicing with None instead would take the whole list.\n",
    "max_reports = 3\n",
    "\n",
    "for pdf_html, report_name in zip(pdf_html_list[:max_reports], report_list[:max_reports]):\n",
    "    # Clean up the report name to make it a valid filename\n",
    "    report_name = re.sub(r'[<>:\"/\\\\|?*]', '_', report_name)\n",
    "    report_name = re.sub(r'\\s+', '_', report_name)\n",
    "    report_name = re.sub(r'_{2,}', '_', report_name)  # Remove duplicate underscores\n",
    "    report_name = report_name.strip('_')  # Remove leading/trailing underscores\n",
    "\n",
    "    try:\n",
    "        download_oecd_pdf(pdf_html, report_name, save_dir)\n",
    "    except Exception as e:\n",
    "        # Best effort: report the failure and carry on with the next report.\n",
    "        print(f\"Failed to download {report_name}: {e}\")\n",
    "    finally:\n",
    "        # Pause after every attempt, including failed ones, so repeated errors\n",
    "        # do not turn into a burst of back-to-back requests.\n",
    "        time.sleep(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ecf5cc4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9b811a2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the extracted content list for publication 803b3ea1-en.\n",
    "content_list_path = '/data/group_data/rag-robust-eval/data/economic/json_files/803b3ea1-en/auto/803b3ea1-en_content_list.json'\n",
    "content = pd.read_json(content_list_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fd2f8a92",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "type                                                          table\n",
       "text                                                            NaN\n",
       "text_level                                                      NaN\n",
       "page_idx                                                          8\n",
       "img_path          images/7bfd4f4b10a6850eec4f70ff0d6512725699f1d...\n",
       "img_caption                                                     NaN\n",
       "img_footnote                                                    NaN\n",
       "table_caption     [(Numbers in parentheses refer to the OECD ave...\n",
       "table_footnote                                                   []\n",
       "table_body        \\n\\n<html><body><table><tr><td colspan=\"6\">N D...\n",
       "Name: 46, dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect row 46, which is a `table` record in this file.\n",
    "content.iloc[46]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "bfab9654",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "46"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Index label of the first record whose `type` is 'table'.\n",
    "is_table = content['type'].eq('table')\n",
    "table_idx = int(content.index[is_table][0])\n",
    "table_idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae178dbc",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rag-graph",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
