import datetime
import gzip
import json
import zlib
from urllib.parse import quote_plus

import requests

# --- CONFIGURATION ---
# URLs looked up in the Common Crawl index; main() queries every URL in this
# list against every selected crawl index.
TARGET_URLS = [
    "https://osf.io/zp6q8",
    "https://osf.io/agqwv/",
    "https://archive.org/details/osf-registrations-agqwv-v1",
    "https://archive.org/details/osf-registrations-zp6q8-v1",
]

# Cutoff date: only indexes and captures dated on or after it are kept.
SINCE_DATE = "2022-01-01"  # Format: YYYY-MM-DD
# Early-stop threshold. NOTE: despite the name, main() compares this value
# against the running total of captures collected across all indexes and URLs,
# not against a per-index count, and it is not sent to the index server.
LIMIT_PER_INDEX = 100


def parse_crawl_id(crawl_id):
    """
    Translate a Common Crawl ID such as 'CC-MAIN-2023-50' into a datetime.

    The ID is read as CC-MAIN-<year>-<week>. The result is midnight on the
    Monday of that week under strptime's %W numbering (weeks start on Monday;
    days before the year's first Monday fall in week 0).
    Returns None for any ID that does not fit this pattern.
    """
    fields = crawl_id.split("-")

    # Guard clause: need at least four dash-separated fields led by CC, MAIN.
    if len(fields) < 4 or fields[:2] != ["CC", "MAIN"]:
        return None

    try:
        # The trailing "1" is the %w weekday digit for Monday.
        monday_spec = f"{int(fields[2])}-W{int(fields[3])}-1"
        return datetime.datetime.strptime(monday_spec, "%Y-W%W-%w")
    except ValueError:
        # Non-numeric year/week, or values strptime rejects as out of range.
        return None


def get_indexes_since(date_cutoff):
    """
    Fetch the list of Common Crawl indexes and keep those dated on or after a cutoff.

    Args:
        date_cutoff: naive datetime. An index is kept when the date that
            parse_crawl_id() derives from its ID is >= this value.

    Returns:
        List of index ID strings ordered newest first. An empty list is
        returned when the index list cannot be fetched or decoded.
    """
    collinfo_url = "https://index.commoncrawl.org/collinfo.json"

    print(f"[*] Fetching list of indexes since {date_cutoff.date()}...")

    try:
        # Without a timeout requests may wait forever on a stalled server;
        # 30 s matches the other HTTP calls in this file.
        response = requests.get(collinfo_url, timeout=30)
        response.raise_for_status()
        all_indexes = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"[!] Error fetching index list: {e}")
        return []

    if not isinstance(all_indexes, list):
        print(
            "[!] Error fetching index list: "
            f"unexpected payload type {type(all_indexes).__name__}"
        )
        return []

    dated_indexes = []
    for entry in all_indexes:
        # Skip entries that do not carry a string "id" instead of failing the
        # whole listing.
        crawl_id = entry.get("id") if isinstance(entry, dict) else None
        if not isinstance(crawl_id, str):
            continue

        idx_date = parse_crawl_id(crawl_id)
        if idx_date and idx_date >= date_cutoff:
            dated_indexes.append((idx_date, crawl_id))

    # Sort on the parsed date (ID as tie-breaker) so "most recent first" does
    # not depend on the week number being zero-padded in the ID string.
    dated_indexes.sort(reverse=True)
    valid_indexes = [crawl_id for _, crawl_id in dated_indexes]

    print(f"[*] Found {len(valid_indexes)} relevant indexes.")
    return valid_indexes


def search_index(target_url, index_id, date_cutoff):
    """
    Query one Common Crawl index for a URL and keep captures at or after a cutoff.

    Args:
        target_url: URL to look up; passed unencoded because requests encodes
            query parameters itself.
        index_id: crawl index ID, used to build the "<id>-index" endpoint.
        date_cutoff: naive datetime; captures with an earlier timestamp are
            dropped.

    Returns:
        List of record dicts as returned by the index, each with an added
        "date_obj" key holding the parsed capture datetime. Empty on any
        request failure or non-200 response.
    """
    index_url = f"https://index.commoncrawl.org/{index_id}-index"

    # Ask only for the fields this script uses.
    params = {
        "url": target_url,
        "output": "json",
        "fl": "timestamp,url,filename,offset,length,status",
    }

    try:
        response = requests.get(index_url, params=params, timeout=30)
    except requests.RequestException as e:
        print(f"[!] Error searching {index_id}: {e}")
        return []

    if response.status_code != 200:
        # 404 is silently treated as "nothing found"; other codes are reported.
        if response.status_code != 404:
            print(f"[!] Unexpected status {response.status_code} for {index_id}")
        return []

    found_records = []
    # The response body is parsed as one JSON document per line.
    for line in response.text.strip().split("\n"):
        if not line.strip():
            continue
        try:
            record = json.loads(line)
            # CDX timestamp format: YYYYMMDDHHMMSS
            record_date = datetime.datetime.strptime(
                record["timestamp"], "%Y%m%d%H%M%S"
            )
        except (ValueError, KeyError, TypeError):
            # Malformed line: bad JSON, not an object, or missing/invalid
            # timestamp. Skip it and keep processing the remaining lines.
            continue

        # Fine-grained filter on the exact capture time.
        if record_date >= date_cutoff:
            record["date_obj"] = record_date
            found_records.append(record)

    return found_records


def fetch_content(record):
    """
    Download and decompress the stored record that an index entry points to.

    Args:
        record: mapping with "filename", "offset" and "length" keys, where
            offset and length are byte counts (int or numeric string).

    Returns:
        The decompressed bytes of the requested byte range, or None when the
        record is invalid, the download fails, the server does not answer
        with HTTP 206, or the payload cannot be decompressed.
    """
    # Validate first so a bad index entry yields None instead of an exception
    # that would abort the caller's whole run.
    try:
        offset = int(record["offset"])
        length = int(record["length"])
        filename = record["filename"]
    except (KeyError, TypeError, ValueError) as e:
        print(f"    [!] Invalid capture record: {e!r}")
        return None

    if offset < 0 or length <= 0:
        print(f"    [!] Invalid capture record: offset={offset}, length={length}")
        return None

    url = f"https://data.commoncrawl.org/{filename}"
    # HTTP byte ranges are inclusive at both ends, hence the -1.
    headers = {"Range": f"bytes={offset}-{offset + length - 1}"}

    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f"    [!] Error downloading content: {e}")
        return None

    # Only a partial-content reply is accepted as the requested slice.
    if response.status_code != 206:
        print(f"    [!] Failed to download: HTTP {response.status_code}")
        return None

    try:
        return gzip.decompress(response.content)
    except (OSError, EOFError, zlib.error) as e:
        # gzip.BadGzipFile is an OSError; truncated data raises EOFError.
        print(f"    [!] Error decompressing content: {e}")
        return None


def main():
    """
    Find captures of TARGET_URLS since SINCE_DATE, print them and save them.

    Steps: parse SINCE_DATE, select the crawl indexes dated on or after it,
    query every target URL in each index, then list the captures newest first,
    download each one and write it to capture_<YYYYMMDD_HHMMSS>.html in the
    current directory.
    """
    # 1. Parse the cutoff date
    try:
        cutoff_dt = datetime.datetime.strptime(SINCE_DATE, "%Y-%m-%d")
    except ValueError:
        print("Error: SINCE_DATE must be in YYYY-MM-DD format.")
        return

    # 2. Get relevant indexes (Coarse Filter)
    indexes = get_indexes_since(cutoff_dt)

    if not indexes:
        print("No indexes found for that date range.")
        return

    all_captures = []
    limit_reached = False

    # 3. Iterate and Search (Fine Filter)
    for index in indexes:
        print(f"    -> Searching index {index}...")
        matches_in_index = 0

        for url in TARGET_URLS:
            captures = search_index(url, index, cutoff_dt)
            if captures:
                print(f"       Found {len(captures)} matching capture(s) for URL {url}.")
                all_captures.extend(captures)
                matches_in_index += len(captures)

            # Stop early once enough captures have been collected overall.
            if len(all_captures) >= LIMIT_PER_INDEX:
                limit_reached = True
                break

        # Report "no matches" only when this index really produced none.
        if not matches_in_index:
            print("       No matches in this index.")

        # Leave the outer loop too, so the remaining indexes are not queried.
        if limit_reached:
            print("    -> Reached capture limit, stopping search.")
            break

    # 4. Results
    print(f"\n--- RESULTS: {len(all_captures)} captures since {SINCE_DATE} ---")

    # Sort final results by date (newest first)
    all_captures.sort(key=lambda x: x["date_obj"], reverse=True)

    # Number of files written per base name, to keep captures that share a
    # timestamp from overwriting each other.
    names_used = {}

    for cap in all_captures:
        print(f"[{cap['date_obj']}] Status: {cap.get('status', 'N/A')} - {cap['filename']}")

        content = fetch_content(cap)
        if not content:
            continue

        # Decoding is only for the on-screen preview; undecodable bytes are
        # replaced rather than raising.
        text = content.decode("utf-8", errors="replace")
        print(f"    Snippet:\n    {text[:500]}...\n")

        base_name = f"capture_{cap['date_obj'].strftime('%Y%m%d_%H%M%S')}"
        names_used[base_name] = names_used.get(base_name, 0) + 1
        suffix = "" if names_used[base_name] == 1 else f"_{names_used[base_name]}"

        # Save the original bytes so content that is not valid UTF-8 is
        # stored unaltered.
        try:
            with open(f"{base_name}{suffix}.html", "wb") as f:
                f.write(content)
        except OSError as e:
            print(f"    [!] Could not save content: {e}")


# Run the search only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
