import argparse
import configparser
import json
import os
import time

import requests
from tqdm import tqdm

# Move the working directory three levels up from wherever the script was
# launched. This is relative to the current working directory, not to the
# location of this file.
os.chdir("../../../")
# NOTE(review): this import is deliberately placed after the chdir above;
# presumably setup_utils is resolved from the new working directory — confirm
# before moving it to the top-of-file import block.
from setup_utils import add_api_env

# add_api_env is defined in setup_utils and is not visible here; presumably it
# makes API credentials available to this script — TODO confirm.
add_api_env()

# Request template for the NYT Archive API; {year}, {month} and {api_key} are
# filled in with str.format inside main().
url = "https://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={api_key}"
# Directory that main() writes the per-year JSON files under.
# NOTE(review): BASE_DIR is neither defined nor imported in this file as shown,
# so this line raises NameError unless the name is provided some other way —
# confirm where it is supposed to come from.
data_dir = os.path.join(BASE_DIR, "datasets/nyt/data")


def main(years, months, news_desks, output_dir):
    """Download NYT Archive API headlines and save them per year and news desk.

    For every year in ``years`` the archive endpoint is requested once per
    month in ``months``. Articles whose ``news_desk`` is one of ``news_desks``
    are collected and, once all months of the year have been processed,
    written as a JSON list to ``<root>/<year>/<desk>_headlines.json``.

    Args:
        years: Iterable of years to pull.
        months: Re-iterable collection of month numbers (it is iterated once
            per year, e.g. a ``range``).
        news_desks: Re-iterable collection of news desk names to keep.
        output_dir: Root directory for the output files. When falsy (for
            example ``None``), the module-level ``data_dir`` is used.
    """
    # Honour the caller-supplied directory; fall back to the module default so
    # callers that never passed a usable value keep writing to data_dir.
    output_root = output_dir if output_dir else data_dir

    for year in years:
        print(f"Pulling headlines from archive for {year}")
        headlines = {desk: [] for desk in news_desks}
        pbar = tqdm(months)
        for month in pbar:
            # NOTE(review): nyt_api_key is not defined in the visible part of
            # this module — confirm where it is expected to come from.
            response = requests.get(
                url.format(year=year, month=month, api_key=nyt_api_key),
                # Without a timeout a stalled connection would block forever.
                timeout=60,
            )

            if response.status_code == 200:
                for doc in response.json()["response"]["docs"]:
                    # Records without a news_desk field are skipped instead of
                    # aborting the whole pull with a KeyError.
                    desk = doc.get("news_desk")
                    if desk not in headlines:
                        continue
                    headlines[desk].append(
                        {
                            "headline": doc["headline"]["main"],
                            "pub_date": doc["pub_date"],
                            "news_desk": desk,
                            "web_url": doc["web_url"],
                            "snippet": doc.get("snippet", ""),
                            "abstract": doc.get("abstract", ""),
                            "print_section": doc.get("print_section", ""),
                            "print_page": doc.get("print_page", ""),
                            "section": doc.get("section_name", ""),
                            "word_count": doc["word_count"],
                            "year": year,
                        }
                    )
            else:
                # Surface failed months rather than dropping them silently;
                # tqdm.write keeps the progress bar intact.
                tqdm.write(
                    f"Request for {year}-{month} failed with status "
                    f"{response.status_code}; skipping this month"
                )

            # Build the running totals from the requested desks so that any
            # desk selection works, not just a fixed set of names.
            counts = ", ".join(
                f"{desk}: {len(items)}" for desk, items in headlines.items()
            )
            pbar.set_description(f"Month: {month}, {counts}")
            # Pause between requests; presumably to stay under the API's
            # per-minute rate limit — confirm against the API terms.
            time.sleep(12)

        # Make sure the per-year directory exists before writing into it.
        year_dir = os.path.join(output_root, str(year))
        os.makedirs(year_dir, exist_ok=True)
        for desk, items in headlines.items():
            print(f"{len(items)} headlines for {desk} in {year}")
            with open(
                os.path.join(year_dir, f"{desk}_headlines.json"),
                "w",
                encoding="utf-8",
            ) as f:
                json.dump(items, f)


if __name__ == "__main__":
    # Command-line entry point: parse the year/month range and desk selection,
    # then hand inclusive ranges to main().
    parser = argparse.ArgumentParser(description="Process some parameters.")
    # The range bounds and desk list are mandatory: without them the script
    # previously failed later with an opaque TypeError on None.
    parser.add_argument(
        "--start_year",
        type=int,
        required=True,
        help="Start year for range of NYT headlines to pull",
    )
    parser.add_argument(
        "--end_year",
        type=int,
        required=True,
        help="End year for range of NYT headlines to pull",
    )
    parser.add_argument(
        "--start_month",
        type=int,
        required=True,
        help="Start month for range of NYT headlines to pull",
    )
    parser.add_argument(
        "--end_month",
        type=int,
        required=True,
        help="End month for range of NYT headlines to pull",
    )
    parser.add_argument(
        "--news_desks",
        nargs="+",
        type=str,
        required=True,
        help="List of news desks to pull headlines from",
    )
    # Left optional so existing invocations that omit it keep working.
    parser.add_argument(
        "--output_dir",
        type=str,
        help="Output directory to save headlines. Headlines will be saved in subdirectories by year and in separate json files by news desks.",
    )

    args = parser.parse_args()

    # An inverted range would otherwise yield an empty range and silently do
    # nothing useful.
    if args.start_year > args.end_year:
        parser.error("--start_year must not be greater than --end_year")
    if args.start_month > args.end_month:
        parser.error("--start_month must not be greater than --end_month")

    # Both bounds are inclusive on the command line, hence the + 1.
    years = range(args.start_year, args.end_year + 1)
    months = range(args.start_month, args.end_month + 1)

    main(years, months, args.news_desks, args.output_dir)
