import glob
import json
import os
import random
import re
import subprocess
import time
from collections import defaultdict
from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
import yaml

# Step between consecutive shard start indices; also used to derive the
# last index that appears in each shard file name.
interval = 1000

# Accumulators filled in by the rest of the script.
result_list = []
data, popular_data = [], []  # every loaded record / records kept as popular

# Integer tallies that default to 0 for unseen keys.
cnt_dict = defaultdict(int)  # per-site counts (plus a missing-URI tally)
url_dict = defaultdict(int)  # per-host counts

# Load every JSONL shard for both file prefixes (en0042-18, then en0042-34)
# into `data`, one parsed JSON object per non-blank line. Shard files are named
# data/en0042-<suffix>.<start>.<start + interval - 1>.jsonl.
start_indices = range(1, 20000, interval)
for shard_suffix in ("18", "34"):
    for i in start_indices:
        # Build the path once so the opened file and the progress message
        # can never drift apart.
        shard_path = f"data/en0042-{shard_suffix}.{i}.{i + interval - 1}.jsonl"
        # Explicit encoding: the platform default is not guaranteed to be
        # UTF-8, which would corrupt or reject non-ASCII page content.
        with open(shard_path, "r", encoding="utf-8") as f:
            print(shard_path)
            for line in f:
                # A blank line (e.g. a trailing newline at end of file) is
                # not a record; json.loads would raise on it.
                if line.strip():
                    data.append(json.loads(line))

# Site keywords, checked in this order. The first keyword found anywhere in
# the URI wins, so each record is counted for at most one site.
# NOTE(review): this is a substring test on the whole URI, not on the host,
# so e.g. "target" also matches URIs that merely contain that word in the
# path or query string -- confirm this is intended.
popular_sites = (
    "amazon",
    "walmart",
    "redfin",
    "github",
    "tripadvisor",
    "twitter",
    "reddit",
    "quora",
    "ebay",
    "etsy",
    "homedepot",
    "target",
    "mapquest",
    "bestbuy",
)

# Tally each record by matched site (cnt_dict) and by host (url_dict), and
# keep a copy of every record that matched a site in popular_data.
for element in data:
    uri = element["metadata"].get("WARC-Target-URI")
    if uri is None:
        # No URI: nothing to classify and no host to count.
        cnt_dict["missing WARC-Target-URI"] += 1
        continue

    matched_site = next(
        (site for site in popular_sites if site in uri), None
    )
    if matched_site is not None:
        cnt_dict[matched_site] += 1
        popular_data.append(deepcopy(element))

    # For "scheme://host/..." the third "/"-separated piece is the host.
    # A URI with fewer than two slashes has no such piece; count it under
    # the raw URI instead of raising IndexError.
    uri_parts = uri.split("/")
    url_dict[uri_parts[2] if len(uri_parts) > 2 else uri] += 1


# Report the per-site tallies, then two coverage figures, each printed as
# "count total ratio" relative to the number of records that had a URI.
print(cnt_dict)

# Every record with a URI contributes exactly one count to url_dict, so this
# sum is the denominator for both ratios. Computed once instead of per print.
total_cnt = sum(url_dict.values())

# Coverage by hosts that occur more than 10 times, listed most frequent first.
coverage_cnt = 0
sorted_dd = sorted(url_dict.items(), key=lambda item: item[1], reverse=True)
for k, v in sorted_dd:
    if v > 10:
        coverage_cnt += v
        print(k, v)
# Guard the division: total_cnt is 0 when no record carried a URI.
print(coverage_cnt, total_cnt, coverage_cnt / total_cnt if total_cnt else 0.0)

# Coverage by the matched sites. The "missing WARC-Target-URI" tally lives in
# the same dict but is not a site and is not part of total_cnt, so it must
# not be counted in the numerator.
popular_cnt = sum(
    count
    for site, count in cnt_dict.items()
    if site != "missing WARC-Target-URI"
)
print(popular_cnt, total_cnt, popular_cnt / total_cnt if total_cnt else 0.0)

# pie chart
# Histogram of host counts: how many hosts were seen exactly N times.
value_frequencies = defaultdict(int)
for host_count in url_dict.values():
    value_frequencies[host_count] += 1

# Half-open [low, high) ranges and the label each one is reported under.
_BUCKET_RANGES = (
    (6, 12, "6-12"),
    (12, 25, "12-25"),
    (25, 50, "25-50"),
    (50, 100, "50-100"),
)


def _bucket_label(host_count):
    """Return the pie slice label for a host seen `host_count` times."""
    if host_count >= 100:
        return ">100"
    for low, high, label in _BUCKET_RANGES:
        if low <= host_count < high:
            return label
    # Counts below the first range get a slice of their own.
    return host_count


def _bucket_order(bucket):
    """Sort key for a (label, weight) pair: the label's leading number."""
    leading = str(bucket[0]).replace(">", "").split("-")[0]
    return int(leading)


# Weight every slice by the number of records it covers
# (hosts with that count * that count).
categorized_frequencies = defaultdict(int)
for host_count, host_total in value_frequencies.items():
    categorized_frequencies[_bucket_label(host_count)] += (
        host_total * host_count
    )

categorized_frequencies = sorted(
    categorized_frequencies.items(), key=_bucket_order
)
values = [label for label, _ in categorized_frequencies]
frequencies = [weight for _, weight in categorized_frequencies]

plt.figure(figsize=(10, 7))
plt.pie(frequencies, labels=values, autopct="%1.1f%%", startangle=140)
plt.savefig("pie_chart")
plt.show()


# Report how many records were kept, then write them out as one JSON array.
print("len(popular_data)", len(popular_data))

output_path = "../visualization/data/agent_data/popular_42-18_42-34.json"
# Create the destination directory if needed so that a missing folder does
# not discard the results after all shards have been loaded and processed.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as o_f:
    json.dump(popular_data, o_f)
