"""
Just run `python get_wikipedia.py`; all parameters are hardcoded.

Finds low-resource Wikipedia articles in languages with at least ten articles (after cutoff date)
that have English versions.

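To be nice to the Wikipedia servers, fill in the DEFAULT_USER_AGENT variable below
(and the `user_agent` argument passed to find_fresh_articles in the `__main__` block).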

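Stores files in <project root>/data/wiki_source/<lang>.<count>.json
(one JSON file per language for which the target number of articles was found).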
"""

import json
import logging
import re
import time
from datetime import datetime

import mwparserfromhell
import requests
import utils
from joblib import Memory
from tqdm import tqdm

############################################################################
# GLOBAL CONSTANTS / CONFIG
############################################################################

# Seconds to sleep before each real (uncached) HTTP request; cached responses skip the sleep.
REQUEST_DELAY = 0.05

# Configure logging: INFO and above on the root logger; `logger` comes from the project helper.
logging.basicConfig(level=logging.INFO)
logger = utils.get_logger(__name__)

# Joblib cache configuration: fetched API responses are memoized under
# <project root>/.cache/joblib/wiki (the directory is created at import time).
_JOBLIB_CACHE_DIR = utils.get_project_root() / ".cache" / "joblib" / "wiki"
_JOBLIB_CACHE_DIR.mkdir(parents=True, exist_ok=True)
memory = Memory(location=str(_JOBLIB_CACHE_DIR), verbose=0)

# Fallback User-Agent header, used when no session supplies one.
# Replace the placeholder with a real identifying string before running.
DEFAULT_USER_AGENT = "<PUT YOUR USER AGENT HERE>"


@memory.cache(ignore=["request_delay", "timeout"])  # timing knobs must not affect the cache key
def _cached_fetch_json(
    url: str,
    params: dict | None,
    user_agent: str = DEFAULT_USER_AGENT,
    method: str = "GET",
    timeout: int = 30,
    request_delay: float | None = None,
) -> dict:
    """Perform one HTTP request and return its decoded JSON body, memoized via joblib.

    The cache key is built from `url`, `params`, `user_agent` and `method`;
    `timeout` and `request_delay` are excluded from it.

    Args:
        url: Endpoint to call.
        params: Query parameters passed to `requests.request`.
        user_agent: Value sent in the User-Agent header.
        method: HTTP method name.
        timeout: Timeout (seconds) handed to `requests.request`.
        request_delay: Optional pause (seconds) before the request is sent.

    Returns:
        The parsed JSON response.

    Raises:
        Whatever `requests` raises, including the error from
        `raise_for_status()` on an unsuccessful HTTP status.
    """
    # This body only executes on a cache miss, so cached answers never pay the throttle sleep.
    should_pause = request_delay is not None and request_delay > 0
    if should_pause:
        time.sleep(request_delay)

    response = requests.request(
        method,
        url,
        params=params,
        headers={"User-Agent": user_agent},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()


############################################################################
# HELPER FUNCTIONS
############################################################################


def make_jsonifiable(art):
    """Return a shallow copy of `art` with top-level datetime values turned into date strings.

    Each datetime value is replaced by the first whitespace-separated piece of
    its `str()` form, i.e. the "YYYY-MM-DD" date part; the time of day is
    dropped. All other values (including nested containers) are passed through
    untouched.
    """
    jsonable = {}
    for key, value in art.items():
        if isinstance(value, datetime):
            # str(datetime) looks like "2024-06-01 12:30:45"; keep only what precedes the space.
            value = str(value).split()[0]
        jsonable[key] = value
    return jsonable


# def remove_latin_chars(text):
#     # first remove entire parenthesized things with latin letters:
#     text = re.sub(r"\([^\)\(]*[a-zA-Z0-9][^\)\(]*\) ?", "", text)
#     # then remove other standalone Latin sequences:
#     text = re.sub(r"[a-zA-Z0-9][a-zA-Z0-9 ]*", "", text)
#     return text


def request_with_throttle(url, params=None, session=None, request_delay=REQUEST_DELAY):
    """
    Fetch `url` as JSON through the cached fetcher, throttling only real requests.

    `session` is not used to send the request; it is only consulted for its
    "User-Agent" header. When there is no session, or reading the header fails,
    DEFAULT_USER_AGENT is used instead.

    The throttle sleep lives inside the cached function, so it is applied on a
    cache miss and skipped entirely on a cache hit.
    """
    if session is None:
        agent = DEFAULT_USER_AGENT
    else:
        try:
            agent = session.headers.get("User-Agent", DEFAULT_USER_AGENT)
        except Exception:
            # Best effort: any problem reading the header falls back to the default agent.
            agent = DEFAULT_USER_AGENT

    return _cached_fetch_json(
        url=url,
        params=params,
        user_agent=agent,
        request_delay=request_delay,
    )


def get_created_pages_via_logevents(
    lang_code="ug",
    batch_size=50,
    start_ts="2024-06-01T00:00:00Z",
    lecontinue_token=None,
    session=None,
    request_delay=REQUEST_DELAY,
):
    """
    Fetch one batch of page-creation log entries (list=logevents) from a wiki.

    The creation log is queried instead of recentchanges so that page creations
    older than the short 'recentchanges' window are still visible.

    Up to `batch_size` entries of type 'create' in the main namespace
    (lenamespace=0) are requested, ordered from newer to older (ledir=older).

    NOTE: `start_ts` is only echoed in the log message. The 'lestart' parameter
    is not sent, so a call without `lecontinue_token` starts at the newest
    log entry.

    Returns a tuple: (list_of_created_pages, next_continue_token).
    next_continue_token is None when the response has no 'lecontinue' value.

    Each item in list_of_created_pages looks like:
      {
        'ns': 0,
        'title': 'Page_Title',
        'timestamp': '2024-07-01T13:12:00Z',
        'logid': 12345,
        ...
      }
    """
    logger.info(
        f"Fetching up to {batch_size} creation log events from {lang_code} wiki "
        f"starting {start_ts} | lecontinue={lecontinue_token}"
    )

    query = {
        "action": "query",
        "list": "logevents",
        "letype": "create",
        "leprop": "title|timestamp",
        "lenamespace": 0,  # main namespace only
        "lelimit": batch_size,
        "ledir": "older",  # newest entries first
        "format": "json",
    }
    # Resume from where the previous batch stopped, if the caller has a token.
    if lecontinue_token:
        query["lecontinue"] = lecontinue_token

    response = request_with_throttle(
        f"https://{lang_code}.wikipedia.org/w/api.php",
        params=query,
        session=session,
        request_delay=request_delay,
    )

    log_entries = response.get("query", {}).get("logevents", [])
    # A missing 'continue' section (or missing 'lecontinue' key) means there is nothing left.
    continuation = response.get("continue", {}).get("lecontinue")

    logger.info(f"Received {len(log_entries)} logevents. next_lecontinue={continuation}")
    return log_entries, continuation


def fetch_page_stripped_text_and_revision(
    lang_code,
    page_title,
    session=None,
    request_delay=REQUEST_DELAY,
    prefer_latest: bool = False,
):
    """
    Fetch one revision of `page_title` and reduce its wikitext to plain text.

    Steps:
      1) Request a single revision (ids, timestamp, content) of the page:
         the latest one when `prefer_latest` is True, otherwise the earliest.
      2) Give up if the wikitext contains "#redirect" (case-insensitive).
      3) Parse with mwparserfromhell and remove <ref> tags and {{...}} templates.
         A node whose removal raises ValueError is simply left in place.
      4) Strip the remaining markup.

    Returns (stripped_text, revision_id, revision_timestamp). All three are None
    when the page or its revisions are missing, the page is a redirect, or the
    wikitext cannot be parsed.
    """
    nothing = (None, None, None)

    # rvdir="older" enumerates newest-first, so with rvlimit=1 it yields the current revision.
    direction = "older" if prefer_latest else "newer"
    query = {
        "action": "query",
        "redirects": 1,
        "prop": "revisions",
        "rvslots": "main",
        "rvprop": "ids|timestamp|content",  # revision id + timestamp + wikitext
        "rvlimit": 1,
        "rvdir": direction,
        "titles": page_title,
        "format": "json",
        "formatversion": 2,
    }
    api_url = f"https://{lang_code}.wikipedia.org/w/api.php"
    response = request_with_throttle(
        api_url, params=query, session=session, request_delay=request_delay
    )

    page_list = response.get("query", {}).get("pages", [])
    if not page_list:
        logger.info(f"No pages found for title={page_title}")
        return nothing

    first_page = page_list[0]
    if "revisions" not in first_page:
        logger.info(f"No revisions found for page={page_title}")
        return nothing

    revision = first_page["revisions"][0]
    revision_id = revision.get("revid")
    revision_timestamp = revision.get("timestamp")  # ISO8601, e.g. "2024-08-05T12:34:56Z"
    main_slot = revision.get("slots", {}).get("main", {})
    wiki_text = main_slot.get("content", "")

    # Redirect pages carry no article text of their own.
    if "#redirect" in wiki_text.lower():
        logger.info(f"Page '{page_title}' is a redirect. Skipping.")
        return nothing

    try:
        parsed = mwparserfromhell.parse(wiki_text)
    except (ValueError, mwparserfromhell.parser.ParserError) as e:
        logger.warning(f"mwparserfromhell parse error on page '{page_title}': {e}")
        return nothing

    # First pass: drop <ref>...</ref> tags.
    for tag in parsed.ifilter_tags(recursive=True):
        if not (tag.tag and tag.tag.lower() == "ref"):
            continue
        try:
            parsed.remove(tag)
        except ValueError:
            # Could not remove this tag (presumably it is no longer in the tree); leave it.
            pass

    # Second pass: drop template blocks {{...}}.
    for template in parsed.ifilter_templates(recursive=True):
        try:
            parsed.remove(template)
        except ValueError:
            # Same best-effort policy as for <ref> tags.
            pass

    # Whatever markup is left gets stripped down to plain text.
    plain_text = parsed.strip_code().strip()
    return plain_text, revision_id, revision_timestamp


def get_page_creation_date(lang_code, page_title, session=None, request_delay=REQUEST_DELAY):
    """
    Return the timestamp of the earliest revision of `page_title` as a naive datetime.

    Returns None when the response lists no pages or the page has no revisions.
    """
    query = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "timestamp",
        "rvlimit": 1,
        "rvdir": "newer",  # oldest revision first
        "titles": page_title,
        "format": "json",
        "redirects": 1,
    }
    response = request_with_throttle(
        f"https://{lang_code}.wikipedia.org/w/api.php",
        params=query,
        session=session,
        request_delay=request_delay,
    )

    page_map = response.get("query", {}).get("pages", {})
    if page_map:
        # Only one title was requested, so the first entry is the page of interest.
        first_page = next(iter(page_map.values()))
        revisions = first_page.get("revisions")
        if revisions:
            # Timestamp format example: "2024-08-05T12:34:56Z"
            return datetime.strptime(revisions[0]["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
    return None


def get_wikidata_item_for_page(lang_code, page_id, session=None, request_delay=REQUEST_DELAY):
    """
    Look up the Wikidata QID (like 'Q12345') linked to `page_id` on the `lang_code` wiki.

    Returns None when the response lists no pages or the page has no
    'wikibase_item' page property.
    """
    response = request_with_throttle(
        f"https://{lang_code}.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "prop": "pageprops",
            "ppprop": "wikibase_item",
            "pageids": page_id,
            "format": "json",
        },
        session=session,
        request_delay=request_delay,
    )

    page_map = response.get("query", {}).get("pages", {})
    if page_map:
        # A single page id was requested; take the first (only) entry.
        first_page = next(iter(page_map.values()))
        return first_page.get("pageprops", {}).get("wikibase_item")
    return None


def get_sitelinks_for_wikidata_item(qid, session=None, request_delay=REQUEST_DELAY):
    """
    Map site prefixes to page titles for a Wikidata item,
    e.g. { 'en': 'English_Title', 'de': 'German_Title', ... }.

    Only sitelinks whose site code ends in "wiki" are kept; the key is the site
    code with that suffix removed. A falsy `qid` yields an empty dict without
    any request being made.
    """
    if not qid:
        return {}

    response = request_with_throttle(
        "https://www.wikidata.org/w/api.php",
        params={"action": "wbgetentities", "ids": qid, "props": "sitelinks", "format": "json"},
        session=session,
        request_delay=request_delay,
    )
    sitelinks = response.get("entities", {}).get(qid, {}).get("sitelinks", {})

    suffix = "wiki"
    return {
        sitecode[: -len(suffix)]: info["title"]  # 'enwiki' -> 'en', 'frwiki' -> 'fr', ...
        for sitecode, info in sitelinks.items()
        if sitecode.endswith(suffix)
    }


## Removed: get_local_langlinks - no longer needed when only retrieving English counterpart

############################################################################
# MAIN LOGIC
############################################################################


def find_fresh_articles(
    min_chars: int,
    target_count: int,
    lang_code: str,
    cutoff_dt: datetime,
    batch_size=30,
    request_delay: float = REQUEST_DELAY,
    user_agent: str = "OpenAI-ExampleScript/1.0 (https://openai.com/; info@openai.com)",
):
    """
    Collect up to `target_count` recently created `lang_code` articles that have English versions.

    Newly created pages are read from the 'create' log (logevents), which
    typically has a longer history than 'recentchanges'. This helps for
    low-activity wikis like Uyghur. The log is walked batch by batch, newest
    entries first. For each page created at or after `cutoff_dt`:

      1. fetch the latest revision and strip it to plain text;
      2. skip it if the revision was already seen, the text is empty, or the
         text is shorter than `min_chars` characters;
      3. resolve the page to its Wikidata item and look up the English sitelink;
         pages without an English counterpart are skipped;
      4. fetch the latest English revision and record both versions.

    The search stops when `target_count` articles were found, an event older
    than the cutoff was seen, or the log is exhausted.

    Args:
        min_chars: Minimum length of the stripped article text.
        target_count: Number of articles to collect.
        lang_code: Wikipedia language subdomain, e.g. "ug".
        cutoff_dt: Pages created before this (naive) datetime are ignored;
            None falls back to 2024-06-01.
        batch_size: Number of log events requested per API call.
        request_delay: Pause (seconds) applied before each uncached request.
        user_agent: User-Agent header value used for the requests.

    Returns:
        A list of dicts with keys 'title', 'creation_dt', 'rev_id',
        'plain_text', 'stable_url', 'local_url', 'wikidata_item' and 'en_ver'
        (the latter holding the same kind of details for the English page).
    """
    if cutoff_dt is None:
        cutoff_dt = datetime(2024, 6, 1)

    # ISO8601 form of the cutoff, handed to the log fetcher.
    start_ts = cutoff_dt.strftime("%Y-%m-%dT%H:%M:%SZ")

    logger.info(
        f"Looking for {target_count} articles in {lang_code} that also have English versions, "
        f"batch_size={batch_size}, cutoff={cutoff_dt}, min_chars={min_chars}."
    )

    # The session only carries the User-Agent header; request_with_throttle reads it from there.
    session = requests.Session()
    session.headers.update({"User-Agent": user_agent})

    found_articles = []
    num_found = 0
    lecontinue_token = None

    seen_rev_ids = set()
    done = False
    while not done and (num_found < target_count):
        # 1) Fetch a batch of creation log events
        created_pages, next_token = get_created_pages_via_logevents(
            lang_code=lang_code,
            batch_size=batch_size,
            start_ts=start_ts,
            lecontinue_token=lecontinue_token,
            session=session,
            request_delay=request_delay,
        )

        if not created_pages:
            logger.info("No more creation log events found. Stopping.")
            break

        for event in created_pages:
            # event might look like:
            #   { 'ns': 0, 'title': 'PageTitle', 'timestamp': '2025-01-01T12:34:56Z', 'logid': 12345, ... }
            title = event["title"]
            created_ts_str = event["timestamp"]  # e.g. "2025-01-01T12:34:56Z"
            created_dt = datetime.strptime(created_ts_str, "%Y-%m-%dT%H:%M:%SZ")

            # Events are requested newest-first, so once one falls before the cutoff
            # there is no point in fetching further batches: flag the outer loop to stop.
            if created_dt < cutoff_dt:
                done = True
                print(f"STRANGE: {created_dt} < {cutoff_dt}   {title}")
                continue

            # 2) Retrieve the latest revision text (prefer_latest=True)
            stripped_text, rev_id, _ = fetch_page_stripped_text_and_revision(
                lang_code,
                title,
                session=session,
                request_delay=request_delay,
                prefer_latest=True,
            )
            # The same revision can come back for several log entries; handle it only once.
            if rev_id in seen_rev_ids:
                continue
            seen_rev_ids.add(rev_id)
            if not stripped_text:
                continue

            # remove 200px|thumbnail|right| stuff from front
            stripped_text = re.sub(r"^[a-zA-Z0-9|]+", "", stripped_text, count=1).lstrip()

            # 3) Check char count
            if len(stripped_text) < min_chars:
                logger.info(f"Skipping, only {len(stripped_text)} chars (< {min_chars}): {title}")
                continue

            # 4) Find English counterpart via Wikidata
            #    The log event doesn't necessarily give pageid. Let's do one more query:
            pageid = get_pageid_from_title(
                lang_code, title, session=session, request_delay=request_delay
            )
            if pageid is None:
                logger.info(f"Could not find pageid for '{title}' - skipping.")
                continue

            qid = get_wikidata_item_for_page(
                lang_code, pageid, session=session, request_delay=request_delay
            )
            en_title = None
            if qid:
                sitelinks = get_sitelinks_for_wikidata_item(
                    qid, session=session, request_delay=request_delay
                )
                en_title = sitelinks.get("en")

            # Only keep pages that HAVE an English version. (The previous check was
            # inverted: it skipped exactly those pages and then failed on the rest.)
            if en_title is None:
                continue

            title_slug = title.replace(" ", "_")
            stable_url = (
                f"https://{lang_code}.wikipedia.org/w/index.php?title="
                f"{title_slug}&oldid={rev_id}"
            )
            local_url = f"https://{lang_code}.wikipedia.org/wiki/{title_slug}"

            # 5) Fetch the English version details (stable URL + content).
            #    A failure here only costs this one article, not the whole run.
            try:
                en_plain_text, en_rev_id, en_rev_ts = fetch_page_stripped_text_and_revision(
                    "en",
                    en_title,
                    session=session,
                    request_delay=request_delay,
                    prefer_latest=True,
                )
            except Exception as e:
                logger.info(f"Could not fetch English version for '{title}': {e}")
                continue

            en_title_slug = en_title.replace(" ", "_")
            en_local_url = f"https://en.wikipedia.org/wiki/{en_title_slug}"
            # Without a revision id there is no permanent link to point at.
            en_stable_url = (
                f"https://en.wikipedia.org/w/index.php?title={en_title_slug}&oldid={en_rev_id}"
                if en_rev_id
                else None
            )
            en_ver: dict[str, object] = {
                "title": en_title,
                "rev_id": en_rev_id,
                "rev_timestamp": en_rev_ts,
                "stable_url": en_stable_url,
                "local_url": en_local_url,
                "plain_text": en_plain_text,
            }
            found_articles.append(
                {
                    "title": title,
                    "creation_dt": created_dt,
                    "rev_id": rev_id,
                    "plain_text": stripped_text,
                    "stable_url": stable_url,
                    "local_url": local_url,
                    "wikidata_item": qid,
                    "en_ver": en_ver,
                }
            )
            num_found += 1
            logger.info(
                f"Found valid article '{title}' ({len(stripped_text)} chars). "
                f"{num_found}/{target_count}"
            )

            if num_found >= target_count:
                break

        if next_token:
            lecontinue_token = next_token
        else:
            logger.info("No lecontinue token found - no more logs to process.")
            break

    logger.info(f"Finished. Found {len(found_articles)} valid articles total: with_en={num_found}")
    return found_articles


def get_pageid_from_title(lang_code, title, session=None, request_delay=REQUEST_DELAY):
    """
    Resolve a wiki title to its numeric pageid.

    Returns None when the response lists no pages or the first listed page has
    no 'pageid' entry. Needed because the creation log doesn't always give a
    pageid.
    """
    response = request_with_throttle(
        f"https://{lang_code}.wikipedia.org/w/api.php",
        params={"action": "query", "titles": title, "format": "json"},
        session=session,
        request_delay=request_delay,
    )

    page_map = response.get("query", {}).get("pages", {})
    if not page_map:
        return None

    # Typically a single entry; only the first one is inspected.
    first_page = next(iter(page_map.values()))
    return first_page.get("pageid")


if __name__ == "__main__":
    # Wikipedia language editions (subdomain codes) to scan, one output file per language.
    languages = [
        "ti",
        "dz",
        "iu",
        "nqo",
        "awa",
        "tcy",
        "bxr",
        "dv",
        "tyv",
        "av",
        "lo",
        "kv",
        "udm",
        "ab",
        "bo",
        "bh",
        "ug",
        "mni",
        "km",
        "mhr",
        "zgh",
        "sa",
        "sat",
        "shn",
        "mai",
        "am",
        "yi",
        "as",
        "sah",
        "sd",
        "or",
        "os",
        "ps",
        "si",
        "mn",
        "ne",
        "gu",
        "kn",
        "pa",
        "cv",
        "ba",
        "new",
        "pnb",
        "ky",
        "ckb",
        "ml",
        "mr",
        "te",
        "my",
        "tg",
        "zh-yue",
        "mk",
        "hi",
        "bn",
        "th",
        "ta",
        "ka",
        "ur",
        "kk",
        "el",
        "be",
        "bg",
        "hy",
        "he",
        "ce",
        "ko",
        "fa",
        "ar",
        "uk",
        "ja",
    ]

    # Number of articles required per language before anything is written.
    target_count = 10

    for lang in tqdm(languages):
        print(f"*** lang: {lang}")
        articles = find_fresh_articles(
            min_chars=3_000,
            target_count=target_count,
            lang_code=lang,
            cutoff_dt=datetime(2024, 6, 1),
            batch_size=30,
            request_delay=REQUEST_DELAY,  # 0.05 by default
            user_agent="YOUR USER AGENT HERE",
        )
        n_found = len(articles)

        # Languages that fall short of the target are reported and produce no file.
        if n_found < target_count:
            print(f"FAILED to find {n_found} < {target_count} articles for {lang}")
            continue

        assert n_found == target_count
        out_dir = utils.get_project_root() / "data" / "wiki_source"
        out_dir.mkdir(parents=True, exist_ok=True)
        # File name carries the language code and the zero-padded article count.
        out_path = out_dir / f"{lang}.{n_found:03}.json"
        with open(out_path, "w", encoding="utf-8") as out_file:
            json.dump(
                [make_jsonifiable(art) for art in articles],
                out_file,
                ensure_ascii=False,
                indent=4,
            )
        logger.info(f"Saved {n_found} articles to {out_path}")

    # Done
