#!/usr/bin/env python3
# crawl_django_repos_all.py
"""
Clone every GitHub repository that
  • matches the keyword "django"
  • has at least 100 stars

Each successful clone is recorded to downloaded.jsonl:
  {url, stars, updated_at, path}

The process is fully resumable.
"""

from __future__ import annotations
import json
import os
import subprocess
import sys
import time
from pathlib import Path
from typing import List, Set

import requests
from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv()

# ────────────────── CONFIGURATION ──────────────────
QUERY = "django"               # keyword to search
STAR_FLOOR = 100               # minimum stars required
PER_PAGE = 100                 # GitHub maximum
DEST = Path("data_train/websites/django_github-repos")
LOG_FILE = DEST / "downloaded.jsonl"

# The token is a secret: it is read from the environment and must never be
# echoed to stdout/stderr, where it would end up in terminal scrollback or logs.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
HEADERS = {"Accept": "application/vnd.github+json"}
if GITHUB_TOKEN:
    # Requests are sent unauthenticated when no token is configured.
    HEADERS["Authorization"] = f"token {GITHUB_TOKEN}"

# ───────────── LOW-LEVEL SEARCH HELPERS ─────────────
def _rate_limit_wait(response) -> int:
    """Return the seconds to pause for a rate-limited response, or 0 if it is not one.

    Only 403/429 responses are considered.  The delay is taken, in order of
    preference, from the ``Retry-After`` header, from ``X-RateLimit-Reset``
    when ``X-RateLimit-Remaining`` is ``0``, or falls back to one minute when
    the body mentions a rate limit without giving a usable header.
    """
    if response.status_code not in (403, 429):
        return 0
    headers = response.headers

    retry_after = headers.get("Retry-After")
    if retry_after is not None:
        try:
            return max(int(retry_after), 1)
        except ValueError:
            # Header present but not a plain number of seconds.
            return 60

    if headers.get("X-RateLimit-Remaining") == "0" and "X-RateLimit-Reset" in headers:
        # Reset is an epoch timestamp; add a small margin and never sleep < 5s.
        return max(int(headers["X-RateLimit-Reset"]) - int(time.time()) + 5, 5)

    if "rate limit" in response.text.lower():
        return 60

    # A 403/429 for some other reason: let the caller's retry budget handle it
    # instead of sleeping and re-requesting forever.
    return 0


def github_search(params: dict, retry: int = 3) -> dict:
    """Call GET /search/repositories and return the decoded JSON body.

    Args:
        params: Query-string parameters forwarded to the search endpoint.
        retry: How many times a failed request (HTTP error status or
            network-level error) is retried before the error is re-raised.
            Waiting out a rate limit does not consume this budget.

    Returns:
        The parsed JSON response.

    Raises:
        requests.RequestException: When the request still fails after the
            retry budget is exhausted (``requests.HTTPError`` for error
            status codes).
    """
    url = "https://api.github.com/search/repositories"
    while True:
        try:
            r = requests.get(url, headers=HEADERS, params=params, timeout=30)
            wait = _rate_limit_wait(r)
            if wait:
                print(f"Rate limited → sleeping {wait}s")
                time.sleep(wait)
                continue
            r.raise_for_status()
        except requests.RequestException:
            # Covers HTTPError as well as connection errors and timeouts.
            if retry > 0:
                retry -= 1
                time.sleep(2)
                continue
            raise
        return r.json()

def count_repos(star_min: int, star_max: int) -> int:
    """Return how many QUERY matches have a star count in [star_min, star_max].

    Only the ``total_count`` field of the search response is used, so a single
    result per page is requested.
    """
    search_params = {
        "q": f"{QUERY} stars:{star_min}..{star_max}",
        "per_page": 1,
    }
    response = github_search(search_params)
    return response["total_count"]

def fetch_page(star_min: int, star_max: int, page: int) -> List[dict]:
    """Return one page of search results for the given star range.

    Results are requested PER_PAGE at a time, ordered by stars descending.
    An empty list is returned when the response carries no ``items`` key.
    """
    star_range = f"stars:{star_min}..{star_max}"
    payload = github_search(
        {
            "q": f"{QUERY} {star_range}",
            "per_page": PER_PAGE,
            "page": page,
            "sort": "stars",
            "order": "desc",
        }
    )
    return payload.get("items", [])

# ───────────────── LOGGING & CLONING ─────────────────
def read_log() -> Set[str]:
    """Return the set of clone URLs already recorded in LOG_FILE.

    The log is read as JSON Lines.  A missing log file yields an empty set.
    Blank lines and malformed records (invalid JSON, a non-object value, or an
    object without a usable ``url``) are skipped so that one damaged line does
    not prevent resuming.
    """
    done: Set[str] = set()
    if not LOG_FILE.exists():
        return done

    with LOG_FILE.open() as fp:
        for line in fp:
            if not line.strip():
                continue
            try:
                done.add(json.loads(line)["url"])
            except (ValueError, KeyError, TypeError):
                # ValueError: not valid JSON (json.JSONDecodeError subclasses it)
                # KeyError:   object without a "url" key
                # TypeError:  JSON value is not an object, or url is unhashable
                continue
    return done

def append_log(url: str, stars: int, updated_at: str, path: Path) -> None:
    """Append one clone record to LOG_FILE as a single JSON line.

    The record has the keys ``url``, ``stars``, ``updated_at`` and ``path``
    (the path is stored as a string).
    """
    entry = dict(url=url, stars=stars, updated_at=updated_at, path=str(path))
    with LOG_FILE.open("a") as log:
        serialized = json.dumps(entry, ensure_ascii=False)
        log.write(f"{serialized}\n")

def git_clone(url: str, dst: Path) -> None:
    """Shallow-clone *url* into *dst* unless *dst* already exists.

    Git's output is discarded.  A non-zero exit status from git raises
    ``subprocess.CalledProcessError``.
    """
    if not dst.exists():
        command = ["git", "clone", "--depth=1", url, str(dst)]
        subprocess.run(
            command,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
            check=True,
        )

# ────────────── RECURSIVE STAR-BUCKETING ─────────────
def split_and_process(star_min: int, star_max: int, downloaded: Set[str]) -> None:
    """Clone every search hit whose star count lies in [star_min, star_max].

    The search API only exposes the first 1000 results of a query, so a range
    holding more than that is halved recursively until each bucket fits.  A
    bucket that cannot be split further (a single star value) is paged only up
    to that limit, and a warning is printed about the unreachable remainder.

    Args:
        star_min: Lower bound of the star range (inclusive).
        star_max: Upper bound of the star range (inclusive).
        downloaded: Clone URLs already processed; updated in place as new
            repositories are cloned and logged.
    """
    total = count_repos(star_min, star_max)
    if total == 0:
        return

    search_cap = 1000  # maximum number of results the search API will serve

    # If bucket exceeds the 1000-item API cap, split in half and recurse.
    if total > search_cap and star_max > star_min:
        mid = (star_min + star_max) // 2
        split_and_process(star_min, mid, downloaded)
        split_and_process(mid + 1, star_max, downloaded)
        return

    if total > search_cap:
        # Unsplittable bucket: asking for pages past the cap only produces
        # HTTP errors, so page through what is reachable and say so.
        print(
            f"⚠️ stars:{star_min}..{star_max} has {total} repos; "
            f"only the first {search_cap} are reachable via the search API"
        )

    # Otherwise, page through this bucket.
    reachable = min(total, search_cap)
    pages = (reachable + PER_PAGE - 1) // PER_PAGE
    for page in range(1, pages + 1):
        repos = fetch_page(star_min, star_max, page)
        if not repos:
            # The result set shrank since it was counted; nothing left to fetch.
            break
        for repo in tqdm(
            repos,
            desc=f"⭐ {star_min}..{star_max} p{page}",
            unit="repo",
        ):
            url = repo["clone_url"]
            if url in downloaded:
                continue
            local_dir = DEST / repo["full_name"]
            try:
                git_clone(url, local_dir)
            except subprocess.CalledProcessError as e:
                # A failed clone is reported and skipped, not logged as done.
                print(f"❌ Clone failed {url}: {e}")
                continue
            append_log(
                url,
                repo["stargazers_count"],
                repo["pushed_at"],   # last code push (ISO-8601 UTC)
                local_dir,
            )
            downloaded.add(url)

# ───────────────────────── MAIN ─────────────────────────
def star_ceiling() -> int:
    """Return the star count of the most-starred QUERY match, never below STAR_FLOOR.

    STAR_FLOOR is returned when the search yields no items at all.
    """
    top_hit = github_search(
        {"q": QUERY, "per_page": 1, "sort": "stars", "order": "desc"}
    )
    if not top_hit["items"]:
        return STAR_FLOOR
    most_stars = top_hit["items"][0]["stargazers_count"]
    return max(most_stars, STAR_FLOOR)

def main() -> None:
    """Run one full crawl: clone every matching repository not yet logged.

    Creates the destination directory, loads the set of already-downloaded
    URLs from the log, determines the highest star count among matches and
    processes the whole star range from STAR_FLOOR up to it.
    """
    # DEST is a nested path; create missing parent directories as well so a
    # first run on a fresh checkout does not fail.
    DEST.mkdir(parents=True, exist_ok=True)
    downloaded = read_log()
    highest = star_ceiling()
    print(f"Highest star count found: {highest}")
    split_and_process(STAR_FLOOR, highest, downloaded)
    print("✅ All matching repositories processed.")

if __name__ == "__main__":
    # Crawl repeatedly, pausing five minutes between passes, until the user
    # interrupts with Ctrl-C.  Any other exception propagates and ends the run.
    try:
        while True:
            main()
            time.sleep(300)
    except KeyboardInterrupt:
        print("\nInterrupted. Progress saved in downloaded.jsonl")
        sys.exit(1)