import os
import json
import time
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from concurrent.futures import ProcessPoolExecutor, as_completed
import re

# Root directory that holds the `encyclopedic_kb_wiki` folder. With the empty
# string, the joined data paths are relative to the current working directory.
base_path = ''

# Retry policy applied by fetch_url when the server answers HTTP 429.
MAX_RETRIES = 5  # upper bound on GET attempts per URL
WAIT_PERIOD = 5  # seconds; scaled by the attempt number between retries

def fetch_url(wiki_url, timeout=30):
    """Download ``wiki_url`` with a GET request, backing off on HTTP 429.

    The request is attempted up to ``MAX_RETRIES`` times. Whenever the server
    answers 429, the function sleeps ``WAIT_PERIOD * attempt`` seconds
    (attempts counted from 1) and tries again. Any other response is returned
    as-is, whatever its status code, so the caller decides how to treat it.

    Args:
        wiki_url: URL to download.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` (seconds,
            or a ``(connect, read)`` tuple). Without a timeout a stalled
            connection would block the calling worker indefinitely.

    Returns:
        The ``requests`` response of the first attempt that was not a 429.

    Raises:
        Exception: If every attempt was answered with 429.
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (for example when the timeout expires).
    """
    # The headers never change between attempts, so build them once.
    headers = {'User-Agent': 'CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)'}

    for attempt in range(1, MAX_RETRIES + 1):
        response = requests.get(wiki_url, headers=headers, timeout=timeout)
        if response.status_code != 429:
            return response

        # Linear back-off. No point in sleeping after the final attempt,
        # because nothing is retried afterwards.
        if attempt < MAX_RETRIES:
            time.sleep(WAIT_PERIOD * attempt)

    raise Exception("Maximum retries occurred")


def clean_table_html(content):
    """Strip presentation-only markup from a parsed table and serialize it.

    The element is modified in place, in this order:
      1. every ``<a>`` is unwrapped, keeping its children;
      2. every ``<img>`` is removed (images are handled by other functions);
      3. the attributes ``style``, ``id``, ``class``, ``data-mw-deduplicate``
         and ``typeof`` are dropped from every tag found by ``find_all(True)``;
      4. every ``<script>`` and ``<style>`` element is removed.

    Note: ``find_all`` only visits descendants, so attributes on ``content``
    itself are left untouched.

    Args:
        content: A BeautifulSoup tag (the callers in this file pass a
            ``<table>`` element).

    Returns:
        ``str(content)`` after the clean-up, i.e. the simplified HTML.
    """
    # style/class carry colors, fonts, borders, padding, margins, etc.;
    # id is a unique identifier; data-mw-deduplicate is MediaWiki-specific;
    # typeof is redundant resource-type metadata.
    dropped_attrs = ('style', 'id', 'class', 'data-mw-deduplicate', 'typeof')

    # Hyperlinks: keep the link text, drop the anchor itself.
    for link in content.find_all('a'):
        link.unwrap()

    # Images are taken care of in other functions.
    for image in content.find_all('img'):
        image.decompose()

    # Rebuild each tag's attribute dict without the dropped keys.
    for node in content.find_all(True):
        kept = {}
        for attr_name, attr_value in node.attrs.items():
            if attr_name not in dropped_attrs:
                kept[attr_name] = attr_value
        node.attrs = kept

    # Embedded scripts and stylesheets carry no table content.
    for node in content.find_all(['script', 'style']):
        node.decompose()

    return str(content)


def extract_tables_with_sections(soup):
    """Collect the cleaned tables of a page together with their section index.

    Headings (``h<digits>``), paragraphs and tables are visited in document
    order. Everything before the first ``<p>`` is skipped; that paragraph
    marks the start of the summary, which is section 0. Each heading met
    afterwards increments the section index and becomes the current section
    name.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        A tuple ``(tables, table_section_indices, info_box_table_html)``:
          * ``tables``: cleaned HTML string of every non-infobox table that
            is not inside an ignored section;
          * ``table_section_indices``: for each entry of ``tables``, the
            section index that was current when the table was met;
          * ``info_box_table_html``: cleaned HTML of the last table whose
            class list contains ``infobox``, or ``None`` if there is none.
    """
    # Only in the image & table extracting, we ignore the content of these sections.
    ignored_sections = {'References', 'External links', 'See also', 'Further reading', 'Footnotes', 'Notes'}

    # h2, h3, ... etc.
    heading_re = re.compile(r'^h\d+$')
    # One anchored pattern selects headings, paragraphs and tables in a single
    # traversal, instead of first scanning the whole tree just to discover
    # which heading names occur.
    wanted_re = re.compile(r'^(?:h\d+|p|table)$')

    tables = []
    table_section_indices = []
    info_box_table_html = None

    summary_started = False
    cur_section_name = 'Summary'
    cur_section_id = 0

    for element in soup.find_all(wanted_re):
        if not summary_started:
            # Nothing counts until the first paragraph opens the summary.
            if element.name != 'p':
                continue
            summary_started = True

        if heading_re.match(element.name):
            # Update section title
            cur_section_name = element.text.strip()
            cur_section_id += 1
            continue

        if element.name != 'table':
            continue

        # We will not use these sections
        if cur_section_name in ignored_sections:
            continue

        # NOTE(review): clean_table_html edits the tree in place and strips
        # `class` from descendants, so a table nested inside an already
        # cleaned table can no longer be recognized as an infobox here.
        # Confirm whether that matters for the pages being processed.
        if 'infobox' in element.get('class', []):
            # Collect infobox content
            info_box_table_html = clean_table_html(element)
            continue

        tables.append(clean_table_html(element))
        table_section_indices.append(cur_section_id)

    return tables, table_section_indices, info_box_table_html


def construct_interleaved_document(chunk):
    """Fetch every URL of ``chunk`` and extract its tables and infobox.

    Args:
        chunk: Sequence of Wikipedia URLs handled by one worker.

    Returns:
        A list with one ``(wiki_url, extraction)`` pair per URL, in input
        order, where ``extraction`` is the tuple returned by
        ``extract_tables_with_sections``. If anything fails for a URL, the
        error is printed and the placeholder ``([], [], None)`` is stored
        instead, so the result always has one entry per input URL.
    """
    results = []
    progress = tqdm(chunk, total=len(chunk))

    for wiki_url in progress:
        try:
            page = fetch_url(wiki_url)

            # Let requests turn an error status into an exception.
            if page.status_code != 200:
                page.raise_for_status()

            parsed = BeautifulSoup(page.text, 'lxml')
            extraction = extract_tables_with_sections(soup=parsed)
        except Exception as e:
            print(f"Error occured: {e} in {wiki_url}")
            extraction = ([], [], None)

        results.append((wiki_url, extraction))

    return results


# The encyclopedic-vqa KB lacks (1) the infobox and (2) the tabular modality of the Wikipedia documents,
# so this script adds those missing parts to the KB.
if __name__ == '__main__':
    # Script entry point:
    #   1. load the cleaned KB (a JSON object keyed by Wikipedia URL),
    #   2. scrape tables and the infobox of every URL with a process pool,
    #   3. merge the results into the KB and write it to a new JSON file.

    file_path = os.path.join(base_path, 'encyclopedic_kb_wiki', 'encyclopedic_kb_wiki_cleaned.json')
    # Use a context manager so the input file is closed deterministically.
    with open(file_path, 'r', encoding='utf-8') as f:
        kb_wiki = json.load(f)
    num_kb = len(kb_wiki)

    wiki_meta_list = list(kb_wiki)

    # Parallel computing
    # The pool is capped because too many workers send too many requests at
    # once and the server answers with client errors (originally noted as 426).
    # os.cpu_count() may return None, hence the fallback to a single worker.
    num_workers = min(os.cpu_count() or 1, 32)
    print(f"num_workers = {num_workers}")

    # Strided slices spread the URLs evenly over the workers, even when there
    # are fewer URLs than workers. Result order is irrelevant because the
    # merge below is keyed by URL.
    chunks = [wiki_meta_list[i::num_workers] for i in range(num_workers)]

    global_kbs = []
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(construct_interleaved_document, chunk) for chunk in chunks]

        for idx, future in enumerate(as_completed(futures)):
            chunk_kbs = future.result()
            global_kbs.extend(chunk_kbs)
            print(f'cpu {idx} is done!')

    # Sanity check: every KB entry must have produced exactly one result.
    assert len(global_kbs) == num_kb

    for doc_info in tqdm(global_kbs, total=len(global_kbs)):

        if doc_info is None:  # Case that the wikipedia url is not linked.
            continue

        wiki_url, (tables, table_section_indices, info_box_html) = doc_info
        entry = kb_wiki[wiki_url]

        # Update table info
        entry['tables'] = tables
        entry['table_section_indices'] = table_section_indices

        # Add infobox information to the summary part.
        if info_box_html is None:
            continue

        try:
            entry['section_texts'][0] = info_box_html + '\n' + entry['section_texts'][0]
        except (KeyError, IndexError, TypeError):
            # Missing, empty or malformed section_texts: drop the entry.
            # Only these errors are caught so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print(f'{wiki_url} was incomplete originally from the encyclopedic-vqa, and thus empty section_texts in the KB')
            del kb_wiki[wiki_url]

    with open(os.path.join(base_path, 'encyclopedic_kb_wiki', 'encyclopedic_kb_wiki_cleaned_table.json'), 'w', encoding='utf-8') as f:
        json.dump(kb_wiki, f)

    print('Done!')
