import json
import re
import pandas as pd
from typing import List, Dict
from tqdm import tqdm
from .utils.openai import openai_call
from .utils.logger import get_logger
from .utils.constants import OUTPUT_PATH, INPUT_PATH
from .utils.prompts import *

# Named loggers shared by the helpers and classes in this module.
format_logger = get_logger("formatter")  # JSON extraction / repair diagnostics
main_logger = get_logger("main")  # failures of the LLM-backed pipeline steps
match_citation = get_logger("match citations")  # citations with no matching reference entry

def llm_json_formatter(content: str) -> str:
    '''Ask the LLM to repair, or extract, the JSON contained in ``content``.

    Args:
        content: Text that is expected to hold (possibly malformed) JSON.

    Returns:
        The value returned by ``openai_call`` for the repair prompt.
    '''
    # The f-string below already interpolates ``content``. The finished prompt
    # must not be passed through ``str.format`` again: JSON-like text is full
    # of ``{`` / ``}``, which ``format`` would parse as replacement fields and
    # fail on with KeyError / IndexError / ValueError.
    prompt = f'''Please check if the text conforms to JSON format. If it does not, output the correct JSON format result or extract the part in JSON format; if it does, return the original text.
        Please return a valid JSON result, without any extra explanations or symbols.
        TEXT: {content}'''
    res = openai_call(
        messages=[{"role": "user", "content": prompt}],
        llm_model_name='gpt-4o-mini'
    )
    return res

def extract_json_res(content: str) -> Dict:
    '''Parse the JSON payload carried by ``content``.

    The first fenced block opened with three backticks followed by ``json``
    is preferred. When no such block exists, the whole text is parsed as JSON,
    so bare JSON replies are accepted as well.

    Args:
        content: Text holding either a fenced JSON block or bare JSON.

    Returns:
        The decoded JSON value (whatever ``json.loads`` yields).

    Raises:
        json.JSONDecodeError: If the selected text is not valid JSON.
    '''
    json_marker_pattern = r"```json([\s\S]*?)```"
    match = re.search(json_marker_pattern, content, re.DOTALL)
    # No fence found: fall back to the raw text instead of failing on
    # ``None.group``.
    json_content = match.group(1) if match else content
    return json.loads(json_content)

def format_json_res(content: str) -> Dict:
    '''Decode JSON from ``content``, asking the LLM to repair it if needed.

    First ``content`` is parsed directly with ``extract_json_res``. If that
    fails, ``llm_json_formatter`` is asked for a corrected version, which is
    parsed the same way.

    Args:
        content: Raw text expected to contain JSON.

    Returns:
        The decoded JSON value, or ``None`` when both attempts fail. Failures
        are logged at debug level and never raised, so callers must check for
        ``None``.
    '''
    try:
        return extract_json_res(content=content)
    except Exception as e:
        format_logger.debug(f"failed to directly load the content: {content}. ERROR: {e}. try to use LLM to format it.")

    # Bound before the try so the log line below stays valid even when the
    # LLM call itself is what raised.
    new_content = None
    try:
        new_content = llm_json_formatter(content=content)
        return extract_json_res(new_content)
    except Exception as e:
        format_logger.debug(f"the content: {content}, improved by the LLM: {new_content}, still not conform to the JSON format. ERROR: {e}")
    return None


class preprocess:
    '''Prepare the parsed sections of one paper for LLM-based extraction.

    ``raw`` is iterated as a sequence of dicts. For each of the names
    ``'paper title'``, ``'reference'``, ``'abstract'`` and ``'related work'``,
    the ``content`` of the first entry whose ``name`` matches is stored;
    a missing section is stored as ``None``.
    '''

    def __init__(self, raw: List[Dict]) -> None:
        self.title: str = self._section(raw, 'paper title')
        self.reference: List = self._section(raw, 'reference')
        self.abstract: str = self._section(raw, 'abstract')
        self.related_work: List[Dict] = self._section(raw, 'related work')
        # Original reference entries matched so far by ``process_rel``.
        self.mentioned_ref: List = []

    @staticmethod
    def _section(raw, name):
        '''Return the ``content`` of the first entry named ``name``, else None.'''
        return next((item['content'] for item in raw if item.get('name') == name), None)

    def parse_reference(self):
        '''extract the first author, date, and title information of every reference entry.

        Each entry is sent to the LLM up to three times until a JSON object
        comes back. The parsed rows are written to
        ``OUTPUT_PATH / "parsed_reference.csv"`` and kept in ``self.ref_df``.

        Returns:
            The DataFrame with columns ``author``, ``date``, ``title`` and
            ``origin`` (the untouched reference entry).
        '''
        ref_df = pd.DataFrame(columns=['author', 'date', 'title', 'origin'])
        # ``or []``: a paper without a reference section yields an empty table.
        for r in tqdm(self.reference or [], desc='parse the reference'):
            issuccess = False
            for _ in range(3):
                try:
                    res = openai_call(
                        messages=[
                            {"role": "system", "content": PARSE_REF_SYS},
                            {"role": "user", "content": PARSE_REF_USER.format(ref=r)}
                        ],
                        llm_model_name='gpt-4o-mini',
                    )
                    parsed_res = format_json_res(res)
                    # ``format_json_res`` signals failure with None; only a
                    # JSON object can be turned into a table row.
                    if not isinstance(parsed_res, dict):
                        continue
                    parsed_res['origin'] = r
                    ref_df.loc[len(ref_df)] = parsed_res
                    issuccess = True
                    break
                except Exception as e:
                    main_logger.debug(f"attempt to parse the reference failed: {e}")
            if not issuccess:
                main_logger.info(f"failed to parse the reference: {r}")
        ref_df.to_csv(OUTPUT_PATH / "parsed_reference.csv", index=False)
        self.ref_df = ref_df
        return ref_df

    def process_abs(self):
        '''pack abstract extraction prompt'''
        packed_abs = f"TEXT:\n- paper title: {self.title}\n- abstract: {self.abstract}"
        return packed_abs

    def _extract_citations(self, para) -> List:
        '''Ask the LLM for the citations in one paragraph; ``[]`` on failure.'''
        messages = [
            {"role": "system",
             "content": EXTRACT_REF_SYS},
            {"role": "user",
             "content": EXTRACT_REF_USER.format(content=para)}
        ]
        for _ in range(3):
            try:
                res = format_json_res(openai_call(messages=messages))
            except Exception as e:
                main_logger.debug(f"attempt to extract paragraph citations failed: {e}")
                continue
            # The matching step iterates over citation objects, so anything
            # but a list (including the None failure value) is retried.
            if isinstance(res, list):
                return res
        main_logger.info(f"failed to get JSON paragraph citations.")
        return []

    def _match_citations(self, citations) -> List[str]:
        '''Return the original reference entries matching ``citations``.'''
        if not citations:
            return []
        ref_df = getattr(self, 'ref_df', None)
        if ref_df is None:
            raise AttributeError("ref_df is not set: call parse_reference() before process_rel()")
        matched_ref = []
        dates = ref_df['date']
        for r in citations:
            authors = r.get('authors') if isinstance(r, dict) else None
            name_parts = authors.split() if isinstance(authors, str) else []
            if not name_parts:
                # Without a leading author token there is nothing to look up
                # (an empty needle would match every row).
                match_citation.info(f"failed to match the citation: {r}")
                continue
            author = name_parts[0]
            year = r.get('year')
            # Literal substring match: author names are not regex patterns.
            same_author = ref_df['author'].str.contains(author, na=False, regex=False)
            # Also compare as text so 2020 and "2020" are treated as equal.
            same_year = (dates == year) | (dates.astype(str) == str(year))
            target_entry = ref_df[same_author & same_year]
            if target_entry.empty:
                match_citation.info(f"failed to match the citation: {r}")
            else:
                # Collect plain strings so the result can be joined later.
                matched_ref.extend(str(origin) for origin in target_entry['origin'])
        return matched_ref

    def process_rel(self):
        '''extract citations of related work and match them

        For every paragraph of every related-work unit the cited works are
        extracted with the LLM and matched against ``self.ref_df`` (built by
        ``parse_reference``). Matched entries are also accumulated in
        ``self.mentioned_ref``.

        Returns:
            A list with one dict per paragraph holding ``theme`` (the unit's
            subtitle), ``para_content`` and ``reference`` (matched entries
            joined by newlines).
        '''
        processed_rw = []
        for unit in self.related_work or []:
            para_title: str = unit['subtitle']
            para_content: List = unit['paragraphs']
            for para in para_content:
                matched_ref = self._match_citations(self._extract_citations(para))
                self.mentioned_ref += matched_ref
                processed_rw.append(
                    {
                        "theme": para_title,
                        "para_content": para,
                        "reference": '\n'.join(matched_ref)
                    }
                )
        return processed_rw

class GraphBuilder:
    def __init__(self, abs_content: str, rel_content) -> None:
        self.abs_content = abs_content
        self.rel_content = rel_content
    
    def builder(self, content):
        '''build graph: extract entity and relation

        Runs three LLM extractions: entities of the abstract, relations of
        the abstract (only when entities were found), and entities of the
        related work.

        Args:
            content: Currently unused; the inputs are read from
                ``self.abs_content`` and ``self.rel_content``.

        Returns:
            A dict with the keys ``abstract_entity``, ``abstract_relation``
            and ``relwork_entity``; a value is ``None`` when its extraction
            failed or was skipped.
        '''
        def extract(prompt, content) -> Dict:
            '''Call the LLM up to three times until its reply decodes as JSON.'''
            messages = [
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": content}
                ]
            _entity = None
            for _ in range(3):
                _entity = openai_call(messages=messages, json_format=True)
                try:
                    entity = format_json_res(_entity)
                except Exception as e:
                    main_logger.debug(f"attempt to decode the extraction result failed: {e}")
                    continue
                # ``format_json_res`` reports failure by returning None, so
                # that case must trigger a retry as well.
                if entity is not None:
                    return entity
            main_logger.error(f"failed to extract JSON format entities/relations from abstract. result: {_entity}")
            # TODO: build graph failed?
            return None

        abstract_entity = extract(EXTRACT_ABSTRACT_ENTITY, self.abs_content)
        abstract_relation = None
        if abstract_entity:
            abstract_relation = extract(EXTRACT_ABSTRACT_RELATION, self.abs_content + f"\nENTITIES:\n{abstract_entity}")
        relwork_entity = extract(EXTRACT_RELATED_WORK_ENTITY, self.rel_content)
        return {
            "abstract_entity": abstract_entity,
            "abstract_relation": abstract_relation,
            "relwork_entity": relwork_entity,
        }

    def checker(self):
        '''rule-based, check entity and relation

        NOTE: not implemented yet. The body holds only this docstring and the
        planned checks listed in the comments below, so a call returns None.
        '''
        # Planned: check if all the entities mentioned in the relations already exist in the entity list


        # Planned: check if there is any discrete entity node

    def reviewer(self):
