import json
import random
from datetime import datetime

import numpy as np


def convert_date_to_timestamp(date_str):
    """Return the POSIX timestamp, in whole seconds, encoded by *date_str*.

    date_str: date/time text matching ``%Y-%m-%dT%H:%M:%S%z``, for example
        ``"2020-01-01T00:00:00+0000"``; the UTC offset is part of the format.

    ``datetime.strptime`` raises ValueError when the text does not match.
    Any fractional part of the timestamp is dropped by ``int``.
    """
    fmt = "%Y-%m-%dT%H:%M:%S%z"
    return int(datetime.strptime(date_str, fmt).timestamp())


def clean_data(paths, save_to="cleaned_data.json", save_stats="stats.json"):
    """
    Merge headline files, normalize their dates, and save the result as JSON.

    paths: list of paths to json files; each file holds a list of objects
        with a "headline" and either an integer "date" (a POSIX timestamp)
        or a "pub_date" string understood by convert_date_to_timestamp
    save_to: path of the output file; receives the shuffled list of
        {"headline", "date", "year"} records, where "date" is the z-score
        (timestamp - mean) / std over all loaded items
    save_stats: path of the statistics file; receives the mean and standard
        deviation used for the normalization, plus readable renderings

    Raises ValueError when the input files contain no items at all.
    """
    data = []
    for path in paths:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(path, "r", encoding="utf-8") as f:
            data += json.load(f)

    # Fail before touching any output file: mean/std of nothing are NaN and
    # would otherwise surface as an obscure error half-way through writing.
    if not data:
        raise ValueError("no headlines found in the given paths; nothing to clean")

    cleaned = []
    for item in data:
        if "date" in item and isinstance(item["date"], int):
            date = item["date"]
        else:
            date = convert_date_to_timestamp(item["pub_date"])
        # NOTE(review): fromtimestamp() without tz uses the machine's local
        # timezone, so "year" can differ between hosts for dates close to
        # New Year - confirm whether UTC is wanted here.
        year = datetime.fromtimestamp(date).year
        cleaned.append({"headline": item["headline"], "date": date, "year": year})

    dates = [item["date"] for item in cleaned]
    # Plain floats keep the JSON encoder independent of numpy scalar types.
    std_date = float(np.std(dates))
    mean_date = float(np.mean(dates))
    # With a single item (or identical dates) the std is 0; dividing by it
    # would emit NaN, which is not valid JSON. Use a unit scale instead so
    # every normalized date becomes 0.0.
    scale = std_date if std_date > 0 else 1.0

    with open(save_stats, "w", encoding="utf-8") as f:
        json.dump(
            {
                "std": std_date,
                "mean": mean_date,
                "mean_as_date": datetime.fromtimestamp(mean_date).isoformat(),
                "std_as_years": std_date / (60 * 60 * 24 * 365),
            },
            f,
        )

    random.shuffle(cleaned)
    with open(save_to, "w", encoding="utf-8") as f:
        json.dump(
            [
                {
                    "headline": item["headline"],
                    "date": (item["date"] - mean_date) / scale,
                    "year": item["year"],
                }
                for item in cleaned
            ],
            f,
        )


if __name__ == "__main__":
    # Combine both headline dumps into one normalized dataset under data/.
    clean_data(
        ["./good_future_headlines.json", "./good_past_headlines.json"],
        save_to="data/cleaned_data.json",
        save_stats="data/stats.json",
    )
    # Alternative run over the "easy" headline file (disabled):
    # clean_data(['./easy_headlines.json'], save_to='cleaned_easy_headlines.json', save_stats='easy_stats.json')
