import hashlib
import json
import multiprocessing
import os
import random

import requests
import tqdm


def func(url, timeout=30):
    """Download *url* and cache its HTML on disk.

    The page is stored as ``outputs/c4/crawl/<md5 of url>.html``. If that
    file already exists, no request is made.

    Args:
        url: Address of the page to fetch.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds, or
            a ``(connect, read)`` tuple). Without one, a stalled server can
            block a worker indefinitely.

    Returns:
        True if the page is already cached or was fetched with status 200
        and saved; False if the request failed or returned another status.
    """
    fname = 'outputs/c4/crawl/' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
    if os.path.exists(fname):
        return True

    headers = {"User-Agent": "Mozilla/5.0"}  # Helps avoid being blocked
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except Exception:
        # Best-effort crawl: any request failure counts as a miss. Unlike a
        # bare ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        return False

    if response.status_code != 200:
        return False

    os.makedirs(os.path.dirname(fname), exist_ok=True)
    # Write to a per-process temporary file and rename it into place, so an
    # interrupted write never leaves a truncated file that the existence
    # check above would later treat as a complete page.
    tmp_name = f"{fname}.{os.getpid()}.tmp"
    with open(tmp_name, "w", encoding="utf-8") as file:
        file.write(response.text)
    os.replace(tmp_name, fname)
    return True


if __name__ == '__main__':
    # Load the list of URLs to crawl.
    with open('outputs/c4/url.json', encoding='utf-8') as f:
        urls = json.load(f)

    # Randomize the order in which URLs are fetched.
    random.shuffle(urls)

    # Create the output directory once, before any worker tries to write.
    os.makedirs('outputs/c4/crawl', exist_ok=True)

    with multiprocessing.Pool(128) as p:
        success = total = 0
        # Results are only counted, so consume them in completion order;
        # ordered imap would stall the progress bar behind the slowest URL.
        pbar = tqdm.tqdm(p.imap_unordered(func, urls), total=len(urls))
        for x in pbar:
            success += int(x)
            total += 1
            # total >= 1 here, so the division is always defined.
            pbar.set_postfix(success=success, success_rate=success / total)
