import os
import json
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

import re

# Root directory holding the 'openwikitable_wikipedia' folder: the script reads
# 'splitted_tables.json' from it and writes 'openwikitable_kb_wiki.json' into it.
# An empty string makes these paths relative to the current working directory.
base_path = ''
# Root directory holding the 'page/<page_id>-page/<table_id>.html' pages and the
# matching '<table_id>.json' metadata files read by the script.
refer_base_path = ''

def clean_table_html(content):
    """Simplify a table element in place and return it serialized as HTML.

    The element is modified directly (no copy is made), so the caller's parse
    tree reflects every change below after this function returns.

    Args:
        content: BeautifulSoup element (typically a ``<table>``) to simplify.

    Returns:
        The simplified element rendered with ``str()``.
    """
    # Presentation / MediaWiki bookkeeping attributes that carry no table content.
    dropped_attributes = ('style', 'id', 'class', 'data-mw-deduplicate', 'typeof')

    # Keep the link text but drop the surrounding <a> tags.
    for link in content.find_all('a'):
        link.unwrap()

    # Images are collected by a dedicated extractor, so they are removed here.
    for image in content.find_all('img'):
        image.decompose()

    # Only descendants are visited; the attributes of `content` itself stay
    # untouched (e.g. the root table keeps its own class list).
    for node in content.find_all(True):
        kept = {}
        for attr_name, attr_value in node.attrs.items():
            if attr_name not in dropped_attributes:
                kept[attr_name] = attr_value
        node.attrs = kept

    # Embedded scripts and style sheets are not part of the table data.
    for non_content in content.find_all(['script', 'style']):
        non_content.decompose()

    return str(content)

def _image_source_url(element, supported_extensions):
    """Return the URL for an ``<img>`` element, or ``None`` if it must be skipped.

    An image is skipped when it has no ``src`` or when the text after the last
    '.' of ``src`` (compared case-insensitively) is not a supported extension.
    """
    src = element.get('src')
    if not src:
        return None

    extension = src.rsplit('.', 1)[-1].lower()
    if extension not in supported_extensions:
        return None

    # Already absolute URLs are returned unchanged; everything else gets the
    # scheme prepended (the pages use protocol-relative '//host/...' sources).
    if src.startswith(('http://', 'https://')):
        return src
    return 'https:' + src


def _image_description(element):
    """Return the ``<figcaption>`` text near the image, falling back to ``alt``."""
    description = element.get('alt', '')

    container = element.find_parent(['figure', 'div'])
    if container is not None:
        caption = container.find('figcaption')
        if caption is not None:
            description = caption.get_text(strip=True)

    return description


def _image_is_tall_enough(element, min_height):
    """Tell whether the ``height`` attribute is a number >= ``min_height``.

    Images without a usable ``height`` (missing or not an integer such as
    '40px') are treated as too small instead of aborting the extraction.
    """
    try:
        return int(element.get('height', min_height - 1)) >= min_height
    except (TypeError, ValueError):
        return False


def extract_images_with_sections(soup, title):
    """Collect image URLs, descriptions and section indices from a parsed page.

    Images of the first infobox are collected first and assigned to section 0.
    The document is then walked in order; everything before the first ``<p>``
    is skipped, each heading starts a new section, and every remaining image
    is attributed to the section it appears in.

    Section indices skip 'Contents' headings so that they line up with the
    sections kept by ``extract_text_with_sections``, which drops the
    'Contents' section.

    Args:
        soup: Parsed BeautifulSoup document.
        title: Page title; used as the name of the leading (summary) section.

    Returns:
        A tuple ``(image_urls, image_reference_descriptions,
        image_section_indices)`` of three parallel lists.
    """
    # Images located in these sections are not collected.
    ignored_sections = {'References', 'External links', 'See also', 'Further reading', 'Footnotes', 'Notes'}

    # Lower-case on purpose: extensions are compared case-insensitively.
    # Other formats (e.g. GIF) are deliberately left out.
    supported_img_extensions = {'jpg', 'jpeg', 'png', 'svg'}

    # Images shorter than this are mostly icons and are skipped.
    min_height = 40
    edit_suffix = '[edit]'

    image_urls = []
    image_reference_descriptions = []
    image_section_indices = []

    # id() of every <img> already collected, so the same element is never
    # recorded twice (the infobox is also reached by the document walk below
    # whenever a <p> precedes it).
    recorded_elements = set()

    summary_started = False
    cur_section_name = title  # The page title names the summary part.
    cur_section_id = 0

    # The infobox usually sits before the first paragraph, i.e. outside of the
    # part covered by the walk below, so its images are loaded explicitly.
    infobox = soup.find('table', {'class': 'infobox'})
    if infobox is not None:
        for element in infobox.find_all('img'):
            img_url = _image_source_url(element, supported_img_extensions)
            if img_url is None:
                continue

            image_urls.append(img_url)
            image_reference_descriptions.append(_image_description(element))
            image_section_indices.append(cur_section_id)
            recorded_elements.add(id(element))

    # One pass over paragraphs, images and headings (h1, h2, ...) in document order.
    wanted_tags = re.compile(r'^(?:p|img|h\d+)$')

    for element in soup.find_all(wanted_tags):
        if not summary_started:
            # The summary starts at the first paragraph.
            if element.name != 'p':
                continue
            summary_started = True

        if element.name == 'p':
            continue

        if element.name != 'img':
            # A heading: a new section begins.
            heading = element.get_text(strip=True)
            if heading != 'Contents':
                cur_section_id += 1

            # NOTE(review): MediaWiki-rendered headings may end with the text of
            # the "[edit]" link, which would keep the ignore list from ever
            # matching -- confirm against the stored HTML.
            if heading.endswith(edit_suffix):
                heading = heading[:-len(edit_suffix)].rstrip()
            cur_section_name = heading
            continue

        if id(element) in recorded_elements:
            continue

        if cur_section_name in ignored_sections:
            continue

        if not _image_is_tall_enough(element, min_height):
            continue

        img_url = _image_source_url(element, supported_img_extensions)
        if img_url is None:
            continue

        image_urls.append(img_url)
        image_reference_descriptions.append(_image_description(element))
        image_section_indices.append(cur_section_id)
        recorded_elements.add(id(element))

    return image_urls, image_reference_descriptions, image_section_indices


def extract_tables_with_sections(soup, title):
    """Collect the cleaned HTML of every table together with its section index.

    The document is walked in order; everything before the first ``<p>`` is
    skipped and each heading starts a new section. Tables carrying the
    'infobox' class and tables inside the ignored sections are left out.

    Section indices skip 'Contents' headings so that they line up with the
    sections kept by ``extract_text_with_sections``, which drops the
    'Contents' section.

    Note that ``clean_table_html`` modifies the table elements of ``soup`` in
    place.

    Args:
        soup: Parsed BeautifulSoup document.
        title: Page title; used as the name of the leading (summary) section.

    Returns:
        A tuple ``(tables, table_section_indices)`` of two parallel lists.
    """
    # Tables located in these sections are not collected.
    ignored_sections = {'References', 'External links', 'See also', 'Further reading', 'Footnotes', 'Notes'}
    edit_suffix = '[edit]'

    tables = []
    table_section_indices = []

    summary_started = False
    cur_section_name = title  # The page title names the summary part.
    cur_section_id = 0

    # One pass over paragraphs, tables and headings (h1, h2, ...) in document order.
    wanted_tags = re.compile(r'^(?:p|table|h\d+)$')

    for element in soup.find_all(wanted_tags):
        if not summary_started:
            # The summary starts at the first paragraph.
            if element.name != 'p':
                continue
            summary_started = True

        if element.name == 'p':
            continue

        if element.name != 'table':
            # A heading: a new section begins.
            heading = element.get_text(strip=True)
            if heading != 'Contents':
                cur_section_id += 1

            # NOTE(review): MediaWiki-rendered headings may end with the text of
            # the "[edit]" link, which would keep the ignore list from ever
            # matching -- confirm against the stored HTML.
            if heading.endswith(edit_suffix):
                heading = heading[:-len(edit_suffix)].rstrip()
            cur_section_name = heading
            continue

        if cur_section_name in ignored_sections:
            continue

        # The infobox is not treated as a table here.
        if 'infobox' in element.get('class', []):
            continue

        tables.append(clean_table_html(element))
        table_section_indices.append(cur_section_id)

    return tables, table_section_indices


def extract_text_with_sections(soup, title):
    """Split the page text into sections and return the texts with their titles.

    The cleaned HTML of the first infobox (if any) opens the first section.
    The document is then walked in order; everything before the first ``<p>``
    is skipped, each heading closes the running section and opens a new one,
    and paragraphs and list items are appended to the running section, one
    entry per line. The section titled 'Contents' is dropped when it is closed
    by a following heading.

    Each ``<li>`` is emitted at most once: items of a nested list are already
    produced while their enclosing list is processed, so they are not repeated
    when the nested list itself is visited.

    Note that ``clean_table_html`` modifies the infobox of ``soup`` in place.

    Args:
        soup: Parsed BeautifulSoup document.
        title: Page title; used as the title of the leading (summary) section.

    Returns:
        A tuple ``(section_texts, section_titles)`` of two parallel lists.
    """
    section_texts = []
    section_titles = []

    cur_sec_title = title
    summary_started = False
    temp_section_text = []

    # id() of every <li> already written to a section.
    emitted_items = set()

    # The infobox is added explicitly, as cleaned HTML.
    infobox = soup.find('table', {'class': 'infobox'})
    if infobox is not None:
        temp_section_text.append(clean_table_html(infobox))

    heading_pattern = re.compile(r'^h\d+$')  # h1, h2, h3, ...
    # One pass over paragraphs, lists and headings in document order.
    wanted_tags = re.compile(r'^(?:p|ul|ol|h\d+)$')

    for element in soup.find_all(wanted_tags):
        if not summary_started:
            # The summary starts at the first paragraph.
            if element.name != 'p':
                continue
            summary_started = True

        if heading_pattern.match(element.name):
            # The section changes: store what has been collected so far.
            # The 'Contents' section only lists the section titles, so it is dropped.
            if cur_sec_title != 'Contents':
                section_texts.append('\n'.join(temp_section_text))
                section_titles.append(cur_sec_title)

            temp_section_text = []
            # NOTE(review): the heading text is stored verbatim; if the pages
            # carry "[edit]" links their text becomes part of the title -- confirm.
            cur_sec_title = element.get_text(strip=True)

        elif element.name == 'p':
            paragraph_text = element.get_text(strip=True)
            # Skip the warning banner shown on old page revisions.
            if paragraph_text.startswith('This is anold revisionof this page'):
                continue

            temp_section_text.append(paragraph_text)

        else:
            # A list: <ul> items become "- text", <ol> items become "n. text".
            is_ordered = element.name == 'ol'

            # Unordered lists inside a table (e.g. the infobox card) are excluded.
            if not is_ordered and element.find_parent('table') is not None:
                continue

            for position, li in enumerate(element.find_all('li'), start=1):
                if id(li) in emitted_items:
                    continue
                emitted_items.add(id(li))

                list_item_text = li.get_text(strip=True)
                if is_ordered:
                    temp_section_text.append(f"{position}. {list_item_text}")
                else:
                    temp_section_text.append(f"- {list_item_text}")

    # Add the last section.
    section_texts.append('\n'.join(temp_section_text))
    section_titles.append(cur_sec_title)

    return section_texts, section_titles

def construct_interleaved_document(html_path, doc_title):
    """Parse a stored HTML page and run the image, text and table extractors.

    Args:
        html_path: Path of the UTF-8 encoded HTML file.
        doc_title: Page title handed to every extractor.

    Returns:
        A tuple ``(texts_with_sections, images_with_sections,
        tables_with_sections)`` holding the return values of the three
        extractors.
    """
    with open(html_path, 'r', encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file.read(), 'lxml')

    # The call order matters: the text and table extractors clean tables in
    # place (which removes their <img> tags), so the images are collected first.
    image_part = extract_images_with_sections(soup=soup, title=doc_title)
    text_part = extract_text_with_sections(soup=soup, title=doc_title)
    table_part = extract_tables_with_sections(soup=soup, title=doc_title)

    return text_part, image_part, table_part


# Here, we extract the content of the HTML files from WikiTableQuestions (page/xxx-page/yyy.html).
if __name__ == '__main__':
    # Build one knowledge-base entry per distinct Wikipedia URL referenced by
    # the WikiTQ tables and dump all entries into a single JSON file.

    kb_dir = os.path.join(base_path, 'openwikitable_wikipedia')

    table_df = pd.read_json(os.path.join(kb_dir, 'splitted_tables.json'))
    table_df = table_df[table_df['dataset'] == 'WikiTQ']

    kb_wiki = {}

    for table in tqdm(table_df.itertuples(index=False), total=len(table_df)):
        # The id is expected to look like '<page_id>-<table_id>'.
        page_id, table_id = table.original_table_id.split('-')

        page_dir = os.path.join(refer_base_path, 'page', f'{page_id}-page')
        html_path = os.path.join(page_dir, f'{table_id}.html')
        wiki_url_path = os.path.join(page_dir, f'{table_id}.json')

        # Close the metadata file deterministically and do not depend on the
        # platform's default encoding.
        with open(wiki_url_path, 'r', encoding='utf-8') as meta_file:
            wiki_meta = json.load(meta_file)
        wiki_url = wiki_meta['url']
        title = wiki_meta['title']

        # Several tables can come from the same page; process each page once.
        if wiki_url in kb_wiki:
            continue

        text_info, image_info, table_info = construct_interleaved_document(html_path, title)
        section_texts, section_titles = text_info
        image_urls, image_reference_descriptions, image_section_indices = image_info
        tables, table_section_indices = table_info

        kb_wiki[wiki_url] = {
            'url': wiki_url,
            'title': title,
            # Text info
            'section_texts': section_texts,
            'section_titles': section_titles,
            # Image info
            'image_urls': image_urls,
            'image_reference_descriptions': image_reference_descriptions,
            'image_section_indices': image_section_indices,
            # Table info
            'tables': tables,
            'table_section_indices': table_section_indices,
        }

    with open(os.path.join(kb_dir, 'openwikitable_kb_wiki.json'), 'w', encoding='utf-8') as f:
        json.dump(kb_wiki, f)

    print('Done!')
