import csv
import json
import os

from tqdm import tqdm

from pipelines.prompta.rag.collect_wiki import get_final_url, get_textarea_content
from pipelines.prompta.rag.chunk import count_tokens, split_into_chunks_by_tokens

def read_csv_file(filename):
    """Read the object metadata CSV and return its three columns as lists.

    The first row is treated as a header: it is printed and left out of the
    returned data. Blank lines are skipped. Every other row must contain
    exactly three fields, read in order as (object id, name, identical).

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A tuple ``(object_ids, names, identicals)`` of three parallel lists of
        strings. All three lists are empty when the file has no data rows
        (including a completely empty file).

    Raises:
        ValueError: If a non-blank data row does not have exactly three fields.
    """
    object_ids, names, identicals = [], [], []

    # newline='' lets the csv module handle line endings itself (required for
    # quoted fields containing newlines); utf-8 matches every other file this
    # script reads or writes.
    with open(filename, mode='r', newline='', encoding='utf-8') as file:
        csv_reader = csv.reader(file)
        header = next(csv_reader, None)  # None instead of StopIteration on an empty file
        if header is None:
            return object_ids, names, identicals
        print(f'Header: {header}')

        for row in csv_reader:
            if not row:
                # csv.reader yields [] for a blank line; unpacking it would fail.
                continue
            object_id, name, identical = row
            object_ids.append(object_id)
            names.append(name)
            identicals.append(identical)

    return object_ids, names, identicals


def get_item_category(name, seed=0):
    """Ask the chat model which category a Minecraft object belongs to.

    Reads the saved description from ``./meta_data/item-desc/{name}.txt`` and
    sends it, together with the object id, to the ``gpt-4o-mini`` chat
    completions endpoint through the module-level ``global_openai_client``.
    That client is not imported at the top of this file; it must already be
    bound as a module global before this function is called (the script entry
    point imports it).

    Args:
        name: Object id; also the stem of the description file to read.
        seed: Seed forwarded to the completion request.

    Returns:
        The message content of the first completion choice. This is free-form
        text; it is not normalized to one of the category labels here.

    Raises:
        OSError: If the description file cannot be opened or read.
    """
    # Context manager so the description file is closed even if reading fails.
    with open(f'./meta_data/item-desc/{name}.txt', 'r', encoding='utf-8') as desc_file:
        description = desc_file.read()

    queries = [
        {
            "role": "system",
            "content": (
                "You are an assistant that determines the category of a Minecraft object based on its description. "
                "The valid categories are [underground block, overground block, item]. "
                "Underground blocks are typically found below the surface, such as cobblestone and diamond ore. "
                "overground blocks are typically found above the surface, such as wood logs, water blocks, and leaves. "
                "Items are objects usually found in inventory or storage containers."
            )
        },
        {
            "role": "user",
            "content": (
                # Use the function's own argument rather than a global that
                # only exists while the script's main loop is running.
                f"The item id is {name}. The item description is:\n\n{description}\n\n"
                "What is the category of the item?"
            )
        }
    ]

    raw_response = global_openai_client.chat.completions.create(
        model="gpt-4o-mini",
        response_format=None,  # plain text; JSON conversion is a separate step
        messages=queries,
        max_tokens=512,  # upper bound on the length of the reply
        n=1,  # a single completion is requested
        stop=None,  # no custom stop sequence
        temperature=None,  # no explicit temperature is passed
        seed=seed
    ).choices[0].message.content

    return raw_response

def get_item_category_json(raw_response, seed=0):
    """Normalize a free-form category answer to one of three fixed labels.

    Asks the ``gpt-4o-mini`` chat completions endpoint (through the
    module-level ``global_openai_client``, which must already be bound as a
    module global) to restate ``raw_response`` as a JSON object with a
    ``category`` key, then maps that value onto a canonical label by substring
    match.

    The request is re-issued (up to 10 attempts in total) whenever the reply
    cannot be parsed or contains no recognizable category. Each retry offsets
    the seed by the attempt number so that it is not a replay of the request
    that just failed; the first attempt uses ``seed`` unchanged.

    Args:
        raw_response: Free-form model answer describing the category.
        seed: Base seed forwarded to the completion request; may be ``None``.

    Returns:
        ``'underground block'``, ``'overground block'`` or ``'item'``. Falls
        back to ``"item"`` when no attempt yields a recognizable category.
    """
    queries = [
        {
            "role": "system",
            "content": (
                "You are an assistant that summarizes and translates the response to JSON."
            )
        },
        {
            "role": "user",
            "content": (
                f"The response is {raw_response}. "
                "You should respond in JSON format as follows: "
                "{'category': '[underground block|overground block|item]'}"
            )
        }
    ]

    max_attempts = 10
    for attempt in range(max_attempts):
        # Errors raised by the request itself are deliberately not caught:
        # only unusable *replies* are retried.
        json_response = global_openai_client.chat.completions.create(
            model="gpt-4o-mini",
            response_format={"type": "json_object"},
            messages=queries,
            max_tokens=64,  # the expected reply is a tiny JSON object
            n=1,  # a single completion is requested
            stop=None,  # no custom stop sequence
            temperature=None,  # no explicit temperature is passed
            seed=seed if seed is None else seed + attempt
        ).choices[0].message.content

        try:
            category = json.loads(json_response)['category'].lower()
        except (TypeError, ValueError, KeyError, AttributeError):
            # Not JSON (JSONDecodeError is a ValueError), empty content,
            # missing key, or a non-string category value: ask again.
            continue

        # Order matters: check the block categories before the generic 'item'.
        if 'underground' in category:
            return 'underground block'
        if 'overground' in category:
            return 'overground block'
        if 'item' in category:
            return 'item'
        # Unrecognized label: fall through and retry.

    return "item"


if __name__ == '__main__':
    # Script flow:
    #   1. read object ids from the metadata CSV,
    #   2. resolve each id to a wiki URL (cached in id2url.json),
    #   3. download/save one description text file per id (records.json maps
    #      each fetched source URL to the id whose file holds its text),
    #   4. optionally classify every saved description with the chat model.

    # Base URL that object ids are appended to.
    url = 'https://minecraft.fandom.com/wiki'  # Replace with the actual URL

    csv_filename = './meta_data/MC-meta_data.csv'
    id2url_json_path = './meta_data/id2url.json'
    desc_save_folder = './meta_data/item-desc'
    records_path = './meta_data/records.json'
    object_category_path = './meta_data/object-category.json'
    error_log_path = './meta_data/error.log'
    os.makedirs(desc_save_folder, exist_ok=True)

    # Read the CSV file contents (the header is printed by read_csv_file).
    object_ids, names, identicals = read_csv_file(csv_filename)

    # --- Step 2: resolve ids to URLs, reusing previously cached entries. ---
    if os.path.exists(id2url_json_path):
        with open(id2url_json_path, 'r', encoding='utf-8') as f:
            id2url_json = json.load(f)
    else:
        id2url_json = {}
    for obj_id in tqdm(object_ids):
        if obj_id in id2url_json:
            continue
        id2url_json[obj_id] = {}
        id2url_json[obj_id]['url'] = get_final_url(url + '/' + obj_id)
        id2url_json[obj_id]['src_url'] = f"{id2url_json[obj_id]['url']}?action=edit"
    with open(id2url_json_path, 'w', encoding='utf-8') as f:
        json.dump(id2url_json, f)

    # --- Step 3: fetch and store descriptions. ---
    if os.path.exists(records_path):
        with open(records_path, 'r', encoding='utf-8') as f:
            records = json.load(f)
    else:
        records = {}

    errors = []
    for obj_id, val in tqdm(id2url_json.items()):
        desc_path = f'{desc_save_folder}/{obj_id}.txt'

        if val['src_url'] in records:
            # This source URL was already fetched for some id; reuse its text.
            if os.path.exists(desc_path):
                continue
            source_path = f'{desc_save_folder}/{records[val["src_url"]]}.txt'
            with open(source_path, 'r', encoding='utf-8') as f:
                content = f.read()
            with open(desc_path, 'w', encoding='utf-8') as f:
                f.write(content)
            continue

        content, _ = get_textarea_content(val['src_url'])
        if content:
            first_chunk = split_into_chunks_by_tokens(content, 8000)[0]
            with open(desc_path, 'w', encoding='utf-8') as f:
                f.write(first_chunk)
            records[val['src_url']] = obj_id
            continue

        # No content for the full id: retry with leading '_'-separated pieces
        # stripped one at a time (a_b_c -> b_c -> c).
        try:
            id_pieces = obj_id.split('_')
            for i in range(1, len(id_pieces)):
                reduced_object_id = '_'.join(id_pieces[i:])
                # NOTE: `val` is the same dict as id2url_json[obj_id], so
                # these assignments also change val['url'] / val['src_url'].
                id2url_json[obj_id]['url'] = get_final_url(url + '/' + reduced_object_id)
                id2url_json[obj_id]['src_url'] = f"{id2url_json[obj_id]['url']}?action=edit"
                content, _ = get_textarea_content(id2url_json[obj_id]['src_url'])
                if content:
                    break
            else:
                # Every candidate came back empty. This also covers ids with
                # no '_' (nothing to strip), which would otherwise pass empty
                # content to the chunker without being logged.
                errors.append((obj_id, val))
                raise LookupError(f'no wiki content found for {obj_id}')
            first_chunk = split_into_chunks_by_tokens(content, 8000)[0]
            with open(desc_path, 'w', encoding='utf-8') as f:
                f.write(first_chunk)
            records[id2url_json[obj_id]['src_url']] = obj_id
        except Exception:
            # Best-effort fallback: use the id itself, with underscores turned
            # into spaces, as the description. KeyboardInterrupt/SystemExit
            # are intentionally not caught here.
            content = ' '.join(obj_id.split('_'))
            with open(desc_path, 'w', encoding='utf-8') as f:
                f.write(content)
            records[val['src_url']] = obj_id

    with open(id2url_json_path, 'w', encoding='utf-8') as f:
        json.dump(id2url_json, f)

    with open(records_path, 'w', encoding='utf-8') as f:
        json.dump(records, f)

    with open(error_log_path, 'w', encoding='utf-8') as f:
        for e in errors:
            f.write(str(e) + '\n')

    # --- Step 4 (optional): classify every saved description. ---
    need_item_category = input("Do you want to get the item category? (y/n): ").lower() in ['y', 'yes']
    if need_item_category:
        try:
            # Binds the module-level name that get_item_category() and
            # get_item_category_json() read.
            from prompta.utils.set_api import global_openai_client
        except Exception:
            print("Please set your OpenAI API key by os.environ['OPENAI_API_KEY'] = 'sk-xxxx'")
            # Without the client the classification calls below cannot work;
            # stop here instead of failing later with a NameError.
            raise SystemExit(1)
        from pathlib import Path
        undergroundBlocks, overgroundBlocks, items = [], [], []

        for path in tqdm(os.listdir(desc_save_folder)):
            objname = Path(path).stem
            raw_response = get_item_category(objname)

            category = get_item_category_json(raw_response)

            if category == 'underground block':
                undergroundBlocks.append(objname)
            elif category == 'overground block':
                overgroundBlocks.append(objname)
            elif category == 'item':
                items.append(objname)
            else:
                print(objname, 'category:', category)

        object_category = {'undergroundBlocks': undergroundBlocks, 'overgroundBlocks': overgroundBlocks, 'items': items}
        with open(object_category_path, 'w', encoding='utf-8') as f:
            json.dump(object_category, f)
            
        
