import json
import re
import os
import traceback
import pandas as pd
import networkx as nx
from typing import Dict, Tuple, List
from tqdm import tqdm
from transformers import BertTokenizer
from pathlib import Path
# from ..utils.openai import get_json_chat_completion, get_chat_completion
# from ..utils.logger import get_logger
# from ..utils.websearch import title_search
# from ..utils.prompts import *

from ..utils.openai import get_json_chat_completion, get_chat_completion
from ..utils.logger import get_logger
from ..utils.websearch import title_search
from ..utils.prompts import *


class Paper2Graph:
    def __init__(self, basic_info: Dict, temp_data_path: Path, debug_mode=False) -> None:
        '''Set up the extraction state for a single paper.

        Reads the paper fields from ``basic_info`` (absent keys become ``None``),
        creates the working directory ``temp_data_path / title`` and seeds the
        entity and relation tables with the paper, its topic and the edge
        connecting the two.
        '''
        self.title = basic_info.get('title')
        self.abstract = basic_info.get('abstract')
        self.related_work = basic_info.get('related work')
        self.reference = basic_info.get('reference')
        self.timepoint = basic_info.get('date')
        self.isAPA = basic_info.get('isAPA')
        self.topic = basic_info.get('topic')

        self.path = temp_data_path / self.title
        os.makedirs(self.path, exist_ok=True)
        self.logger = get_logger(f"GraphExtraction - {self.title}")
        self.debug = debug_mode

        # the paper itself and its topic are always the first two entities
        paper_row = {
            "entity name": self.title,
            "entity type": "paper",
            "timestamp": self.timepoint,
            "description": self.abstract
        }
        topic_row = {
            "entity name": self.topic,
            "entity type": "topic",
            "timestamp": None,
            "description": None
        }
        self.entity = pd.DataFrame([paper_row, topic_row])

        topic_edge = {
            "entity1": self.title,
            "relation": "relate to",
            "entity2": self.topic
        }
        self.relation = pd.DataFrame([topic_edge])
        
        
    def _extract_theme(self):
        '''Extract the key problem, proposed method and application domain from the abstract.

        For every theme the model returns with a non-empty ``name``, one entity
        row and one relation row (paper -> theme) are appended to
        ``self.entity`` and ``self.relation``. A missing response, or a theme
        that is not a JSON object, is skipped instead of raising. Returns None.
        '''
        res: Dict[str, Dict] = get_json_chat_completion(
            [{"role": "user", "content": EXTRACT_THEME.format(title=self.title, abstract=self.abstract)}],
            self.logger
        )
        if not res or not isinstance(res, dict):
            self.logger.warning(f"Failed to get theme information of the paper *{self.title}*")
            return

        # response key (also used as the entity type) -> label of the edge from the paper
        theme_relations = (
            ('problem', 'addressed the problem'),
            ('method', 'proposed'),
            ('domain', 'applies in'),
        )
        for theme_type, rel in theme_relations:
            unit = res.get(theme_type)
            if not isinstance(unit, dict):
                # theme absent, or the model answered with a bare string/list
                continue
            temp_name = unit.get('name')
            if not temp_name:
                continue
            # column order: entity name, entity type, timestamp, description
            self.entity.loc[len(self.entity)] = [temp_name, theme_type, None, unit.get('description')]
            self.relation.loc[len(self.relation)] = [self.title, rel, temp_name]
        
    def _extract_related_papers(self, temp_save=False, isretry=False):
        '''Extract cited papers and the methods, problems and domains tied to them from the related work.

        Level 1 parses the reference list, locates each in-text citation, asks
        the model which method/problem/domain entities the surrounding text
        mentions and links them to the cited papers. Level 2 merges synonymous
        entities and asks the model for relations among the non-paper
        entities. Results are appended to ``self.entity`` and ``self.relation``.

        Model responses are treated as untrusted: missing or malformed answers
        are logged and skipped instead of raising.

        Args:
            temp_save: when true, the parsed references and citation contexts
                are written as CSV to ``<self.path>/PreprocessFiles``.
            isretry: when true, those CSV files are loaded instead of parsing
                the references and matching the citations again.
        '''
        save = temp_save
        citation_columns = ['list', 'context']
        triple_columns = ['entity1', 'relation', 'entity2']

        def as_records(payload) -> List[Dict]:
            '''Coerce a model response into a list of dicts, dropping anything else.'''
            if isinstance(payload, dict):
                return [payload]
            if isinstance(payload, list):
                return [item for item in payload if isinstance(item, dict)]
            return []

        def strip_wordpiece(token):
            '''Remove the WordPiece continuation prefix from a token.'''
            # Only the leading "##" marker is removed; str.strip('##') would
            # also eat genuine '#' characters at either end of the token.
            if token.startswith('##') and len(token) > 2:
                return token[2:]
            return token

        def collect_contexts(targets, hspan=100) -> pd.DataFrame:
            '''Return the token window around each located citation.

            ``targets`` is a list of ``((start, end), reference_numbers)`` pairs
            whose span is a character range in ``self.related_work``. The text
            is tokenized once and walked left to right, so targets are handled
            in order of their start offset. The first token lying inside a
            span anchors a window of ``hspan`` tokens on either side.
            '''
            citations = pd.DataFrame(columns=citation_columns)
            targets = sorted(targets, key=lambda target: target[0][0])
            if not targets:
                return citations

            tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
            tokens = tokenizer.tokenize(self.related_work)

            matched = 0
            ptr = 0
            for i, token in enumerate(tokens):
                (target_start, target_end), ref_numbers = targets[matched]
                piece = strip_wordpiece(token)
                try:
                    token_start = self.related_work.index(piece, ptr)
                except ValueError:
                    # e.g. an unknown-token placeholder that never occurs in the text
                    self.logger.warning("Uncommon error: Failed to locate target token in related work. The token will be skipped which may lead to unexpected error...")
                    continue
                token_end = token_start + len(piece)
                ptr = token_end

                if token_start >= target_start and token_end <= target_end:
                    window = tokens[max(0, i - hspan): min(len(tokens), i + hspan + 1)]
                    context = tokenizer.convert_tokens_to_string(window)
                    try:
                        citations.loc[len(citations)] = [ref_numbers, context]
                    except Exception as e:
                        self.logger.warning(f"Uncommon error: Failed to add the context of the citation to result dataframe: {e}. This citation list will be omitted.")
                        traceback.print_exc()
                    matched += 1
                    if matched == len(targets):
                        break

            if matched < len(targets):
                # usually will not happen...
                self.logger.warning(f"Only {matched} citations out of {len(targets)} have been successfully matched.")
                if self.debug:
                    check = input("Would you like to stop and check whether this error occurred? Reply with 'y' or 'n'.") == 'y'
                    if check:
                        print()
            return citations

        def parse_references(system_prompt, columns, positional) -> pd.DataFrame:
            '''Parse every raw reference string with the model.

            With ``positional`` each reference yields exactly one row, so row
            ``k - 1`` belongs to the numeric citation ``[k]``. Otherwise one raw
            string may expand to several rows, which covers reference entries
            that were not split correctly. Unparsable references are kept as a
            row holding only the ``raw`` text.
            '''
            rows = []
            for r in tqdm(self.reference or [], desc='Parse references'):
                m = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": PARSE_REF_USER.format(ref=r)}
                ]
                records = as_records(get_json_chat_completion(messages=m))
                if not records or (positional and len(records) != 1):
                    self.logger.warning(f"failed to parse the reference: {r}.")
                    records = [{"raw": r}]
                rows.extend(records)

            # dtype=object keeps the parsed values (e.g. years) exactly as returned
            if positional:
                return pd.DataFrame(rows, columns=columns, dtype=object)
            frame = pd.DataFrame(rows, dtype=object)
            extras = [col for col in frame.columns if col not in columns]
            return frame.reindex(columns=list(columns) + extras)

        def save_preprocessed(ref_df, citations):
            '''Write the parsed references and citation contexts as CSV files.'''
            output_dir = self.path / "PreprocessFiles"
            print(f"Parsed references and citations will be save at {str(output_dir)}")
            os.makedirs(output_dir, exist_ok=True)
            ref_df.to_csv(output_dir / "ref_df.csv", index=False)
            citations.to_csv(output_dir / "citations.csv", index=False)

        def preprocess(isretry=isretry):
            '''Return ``(ref_df, citations)``, from disk on retry or freshly computed.'''
            if isretry:
                # directly load tempfiles from directory
                citations = pd.read_csv(self.path / "PreprocessFiles/citations.csv")
                ref_df = pd.read_csv(self.path / "PreprocessFiles/ref_df.csv")
                return ref_df, citations
            if self.isAPA:
                return preprocess_APA(save)
            return preprocess_IEEE(save)

        def preprocess_IEEE(save) -> Tuple[pd.DataFrame, pd.DataFrame]:
            '''Parse references and collect the context of numeric citations such as ``[3, 7]``.'''
            ref_df = parse_references(PARSE_IEEE_REF_SYS, ['title', 'date', 'raw'], positional=True)
            targets = [
                (hit.span(), [int(num) for num in hit.group(1).split(',')])
                for hit in re.finditer(r"\[(\d+(?:,\s*\d+)*)\]", self.related_work)
            ]
            citations = collect_contexts(targets)
            if save:
                save_preprocessed(ref_df, citations)
            return ref_df, citations

        def preprocess_APA(save) -> Tuple[pd.DataFrame, pd.DataFrame]:
            '''Parse references and collect the context of author-year citations.'''

            def match_citation(ref_df, hspan=100) -> pd.DataFrame:
                '''Locate each citation in the text and map it to 1-based reference rows.'''
                m = [
                    {
                        "role": "user",
                        "content": EXTRACT_APA_REF+self.related_work
                    }
                ]
                cits: List[Dict] = as_records(get_json_chat_completion(m))
                if not cits:
                    self.logger.warning("No citation could be extracted from the related work.")
                    return pd.DataFrame(columns=citation_columns)

                def skip(c, reason):
                    self.logger.warning(f"Cannot match the citation *{c}* extracted from related work: {reason} This citation will be skipped.")

                def year_text(value):
                    '''Normalise a year so that "2020", 2020 and 2020.0 compare equal.'''
                    text = str(value).strip()
                    if text in ('', 'None', 'nan', 'NaN', '<NA>', 'NaT'):
                        return None
                    return text[:-2] if text.endswith('.0') else text

                # plain-text views of the columns used for matching
                author_col = ref_df['author'].map(lambda a: a if isinstance(a, str) else '')
                year_col = ref_df['date'].map(year_text)

                def locate(c, nth):
                    '''Return the character span of the ``nth`` occurrence of the citation, or None.'''
                    raw = c['raw reference']
                    hits = [hit.span() for hit in re.finditer(re.escape(raw), self.related_work)]
                    if not hits:
                        # drop the symbols then try again with the leading author part only
                        short = raw.split('.')[0].strip().strip('(').strip(')')
                        cuts = [pos for pos in (short.find('('), short.find(',')) if pos != -1]
                        if cuts:
                            short = short[:min(cuts)].strip()
                        if short:
                            hits = [hit.span() for hit in re.finditer(re.escape(short), self.related_work)]
                    if not hits:
                        skip(c, "Cannot identify it in the content using a regular function.")
                        return None
                    if nth >= len(hits):
                        skip(c, "LLMs identified more occurrences of this citation compared to using a regular function for matching.")
                        return None
                    return hits[nth]

                def find_reference_index(cunit):
                    '''Return the 1-based row number of the reference cited by ``cunit``, or None.'''
                    cited_authors = cunit.get('authors') if isinstance(cunit, dict) else None
                    target_year = year_text(cunit.get('year')) if isinstance(cunit, dict) else None
                    if not isinstance(cited_authors, str) or not cited_authors.strip() or target_year is None:
                        self.logger.warning(f"failed to match the citation in the parsed reference list: {json.dumps(cunit, default=str)}. This citation will be skipped.")
                        return None
                    cited_authors = cited_authors.strip()
                    same_year = year_col == target_year

                    if 'et al' in cited_authors or len(cited_authors.split(' ')) == 1:
                        author = cited_authors.split(' ')[0]
                        has_author = author_col.str.contains(author, regex=False)
                        # prefer single-author entries, then fall back to any entry naming the author
                        hits = ref_df[has_author & same_year & ~author_col.str.contains('and', regex=False)]
                        if hits.empty:
                            hits = ref_df[has_author & same_year]
                        elif len(hits) > 1:
                            for label in hits.index:
                                if author in author_col[label].split(' ')[-1]:
                                    return int(label) + 1
                    else:
                        author = cited_authors.replace('&', "and")
                        hits = ref_df[(author_col == author) & same_year]
                        # in case something went wrong in the reference parsing phase
                        if hits.empty:
                            hits = ref_df[author_col.str.contains(author.split(' ')[0], regex=False) & same_year]

                    if hits.empty:
                        self.logger.warning(f"failed to match the citation in the parsed reference list: {json.dumps(cunit, default=str)}. This citation will be skipped.")
                        return None
                    # several rows are acceptable only when they describe the same reference
                    if len(hits) == 1 or len(set(author_col[hits.index])) == 1:
                        return int(hits.index[0]) + 1
                    self.logger.warning(f"More than one reference entries are matched with this citaion. To avoid mismatch, this citation will be skipped.")
                    return None

                # in case one paper is cited more than once: remember which occurrence each one is
                occurrences = {}
                candidates = []
                for c in cits:
                    raw = c.get('raw reference') or c.get('raw_reference')
                    if not raw or not isinstance(raw, str):
                        skip(c, "Wrong format of the result returned by LLMs.")
                        continue
                    c['raw reference'] = raw
                    nth = occurrences.get(raw, 0)
                    occurrences[raw] = nth + 1
                    candidates.append((c, nth))

                targets = []
                for c, nth in candidates:
                    ref_list = c.get('parsed result') or c.get('parsed_result')
                    if isinstance(ref_list, dict):
                        ref_list = [ref_list]
                    if not ref_list:
                        skip(c, "Wrong format of the result returned by LLMs.")
                        continue
                    span = locate(c, nth)
                    if span is None:
                        continue
                    # connect reference entry with the index; one failure drops the whole citation
                    index_list = []
                    for cunit in ref_list:
                        ref_index = find_reference_index(cunit)
                        if ref_index is None:
                            break
                        index_list.append(ref_index)
                    else:
                        c['span'] = span
                        c['list'] = index_list
                        targets.append((span, index_list))

                return collect_contexts(targets, hspan)

            ref_df = parse_references(PARSE_APA_REF_SYS, ['author', 'title', 'date', 'raw'], positional=False)
            citations = match_citation(ref_df=ref_df)
            if save:
                save_preprocessed(ref_df, citations)
            return ref_df, citations

        def smooth():
            '''Merge entities the model identifies as synonyms (at most three attempts).'''
            is_theme = (self.entity['entity type'] != "paper") & (self.entity['entity type'] != "topic")
            str_entity_name = ', '.join(map(str, self.entity[is_theme]['entity name'].values))

            for trycnt in range(1, 4):
                similar_entities = get_json_chat_completion(
                    [{"role": "user", "content": FIND_SIMILAR_ENTITY.format(content=self.related_work, names=str_entity_name)}],
                    self.logger
                )
                if not similar_entities:
                    self.logger.info("No duplicate found.")
                    return

                groups = as_records(similar_entities)
                str_similar_entities = [json.dumps(g, default=str) for g in groups]
                self.logger.info(f"Find {len(groups)} group(s) of entities that can be merged:" + "\n" + '\n'.join(str_similar_entities))

                failed = similar_entities if not groups else None
                for group in groups:
                    new_name = group.get('new name')
                    dup = group.get('duplicates')
                    known = isinstance(new_name, str) and not self.entity[self.entity['entity name'] == new_name].empty
                    if not known or not isinstance(dup, list):
                        # the surviving name must be an existing entity; otherwise ask again
                        failed = group
                        break
                    to_remove = [d for d in dup if d != new_name]
                    if to_remove:
                        self.entity = self.entity[~self.entity['entity name'].isin(to_remove)]
                        self.relation['entity1'] = self.relation['entity1'].replace(to_remove, new_name)
                        self.relation['entity2'] = self.relation['entity2'].replace(to_remove, new_name)
                if failed is None:
                    return
                self.logger.info(f"{trycnt}/3 - Error occurred while extracting synonymous names. Detailed information: {str(failed)}")

        def level1():
            '''extract relations between each paper entity and entities of method, problem, and domain type in its context.'''
            ref_df, citations = preprocess()
            for _, row in citations.iterrows():
                # extract entities and relations (with the citations) from excerpts
                extraction = as_records(get_json_chat_completion(
                    [{"role": "user", "content": LEVEL1.format(citations=str(row['list']), title=self.title)+row['context']}],
                    self.logger
                ))

                # entity without type specified or wrong specification will be dropped
                kept = []
                for unit in extraction:
                    if 'entity_name' in unit:
                        unit['entity name'] = unit['entity_name']
                    if 'entity_type' in unit:
                        unit['entity type'] = unit['entity_type']
                    entity_name = unit.get('entity name')
                    entity_type = unit.get('entity type')
                    if not (entity_name and entity_type in ('method', 'problem', 'domain')):
                        continue
                    self.entity.loc[len(self.entity)] = {
                        "entity name": entity_name,
                        "entity type": entity_type,
                        "description": unit.get('description')
                    }
                    kept.append(unit)

                ref_numbers = row['list']
                if isinstance(ref_numbers, str):
                    # the list is stored as text when loaded back from CSV
                    try:
                        ref_numbers = json.loads(ref_numbers)
                    except json.JSONDecodeError:
                        self.logger.warning(f"Cannot read the citation list *{ref_numbers}*. This citation will be skipped.")
                        continue

                for ind in ref_numbers:
                    try:
                        ref_row = ref_df.loc[int(ind) - 1]
                    except (KeyError, TypeError, ValueError):
                        self.logger.warning(f"Citation index {ind} does not exist in the parsed reference list. It will be skipped.")
                        continue
                    title = ref_row['title']
                    if not isinstance(title, str) or not title.strip():
                        self.logger.warning(f"Reference {ind} has no parsed title. It will be skipped.")
                        continue
                    # create paper entity based on the citation list
                    self.entity.loc[len(self.entity)] = {
                        "entity name": title,
                        "entity type": "paper",
                        "timestamp": ref_row['date']
                    }
                    # add citation relation
                    self.relation.loc[len(self.relation)] = {
                        "entity1": self.title,
                        "relation": "cites",
                        "entity2": title
                    }
                    # update relation list
                    for unit in kept:
                        rel = unit.get('relation')
                        if not rel:
                            continue
                        self.relation.loc[len(self.relation)] = {
                            "entity1": unit['entity name'],
                            "relation": rel,
                            "entity2": title
                        }

        def level2():
            '''extract relations between entities that are not paper type'''
            self.entity.drop_duplicates(subset='entity name', inplace=True, keep='first')
            smooth()
            other_entity = self.entity[(self.entity['entity type'] != "paper") & (self.entity['entity type'] != "topic")]['entity name'].values
            relation2 = as_records(get_json_chat_completion(
                [{"role": "user", "content": LEVEL2.format(entities=json.dumps(list(other_entity)), content=self.related_work)}],
                self.logger
            ))
            rel2_df = pd.DataFrame(relation2)
            if rel2_df.empty or any(col not in rel2_df.columns for col in triple_columns):
                self.logger.warning("No usable level2 relation was returned. No relation will be added.")
                return
            # keep exactly the three relation columns so every row stays a triple
            rel2_df = rel2_df[triple_columns]

            known = self.entity['entity name']
            valid = rel2_df['entity1'].isin(known) & rel2_df['entity2'].isin(known)
            unexpected_entity_index = rel2_df.index[~valid]
            if not unexpected_entity_index.empty:
                self.logger.info(f"The indexes of unexpected entities in the level2 relation extraction: {list(unexpected_entity_index)}. These entities will not be added.")
                rel2_df = rel2_df[valid]
            else:
                self.logger.info("The entities of level2 relation list match the entity list.")
            self.relation = pd.concat([self.relation, rel2_df], ignore_index=True)

        level1()
        level2()
        
    def _complete_paper_info(self) -> int:
        '''Fill in title, publication date and abstract of the cited papers via title search.

        Every paper entity other than the paper being processed is looked up
        by its title. On a hit the entity row is overwritten with the searched
        title, date and abstract, keeping the current value wherever the
        search returns nothing. When the title changes, relations that used
        the old title are renamed too, so edges keep pointing at an existing
        entity.

        Returns:
            The number of paper entities that were updated.
        '''
        paper = self.entity[(self.entity['entity type'] == 'paper') & (self.entity['entity name'] != self.title)]
        success_cnt = 0
        for label, p in tqdm(paper.iterrows(), total=len(paper), desc='complete paper entity information'):
            p_title = p['entity name']
            search_res = title_search(p_title).run_pipeline()
            if not search_res:
                self.logger.info(f"failed to update paper entity: {p_title}")
                continue

            new_title = search_res['title'] if search_res['title'] else p_title
            self.entity.loc[label] = [
                new_title,
                "paper",
                search_res['date'] if search_res['date'] else p['timestamp'],
                search_res['abstract'] if search_res['abstract'] else p['description']
            ]
            if new_title != p_title:
                # keep the edges attached to the renamed entity
                for column in ('entity1', 'entity2'):
                    self.relation.loc[self.relation[column] == p_title, column] = new_title
            success_cnt += 1

        self.logger.info(f"successfully complete {success_cnt} papers out of {len(paper)}")
        return success_cnt
        
    def _print_graph_info(self, success_update_paper_num):
        '''Log summary statistics of the extracted graph and store them in ``self.info``.

        The statistics cover the number of entities per type, the number of
        nodes, edges and connected components of the undirected relation
        graph, and ``success_update_paper_num`` as passed by the caller.
        '''
        graph = nx.Graph()
        for head, label, tail in self.relation.values.tolist():
            graph.add_edge(head, tail, relation=label)

        # get basic info
        component_count = sum(1 for _ in nx.connected_components(graph))
        node_count = graph.number_of_nodes()
        edge_count = graph.number_of_edges()

        if len(self.entity) != node_count:
            self.logger.warning("There is a mismatch between the entities in the relationship edges and the list of entities.")

        # number of entities of each type
        types = self.entity['entity type']
        nums = {
            kind: int((types == kind).sum())
            for kind in ('method', 'problem', 'paper', 'domain')
        }
        nums["connected components"] = component_count
        nums["node"] = node_count
        nums["edge"] = edge_count
        nums["successfully updated papers"] = success_update_paper_num

        self.info = nums
        self.logger.info(
            f'''Basic information of the graph extracted from paper titled {self.title}:
            {json.dumps(nums, indent=4)}'''
        )
        
    def _visualize(self, path: Path):
        '''Deprecated: write an interactive rendering of the graph to ``path / "graph.html"``.

        This method performs poorly when there are a large number of graph nodes.
        '''
        from pyvis.network import Network

        # node colour per entity type; anything else is drawn in gray
        palette = {
            "problem": "#264653",
            "paper": "#2A9D8F",
            "method": "#E9C46A",
            "domain": "#F4A261",
            "topic": "#780000"
        }

        graph = nx.Graph()
        for head, label, tail in self.relation.values.tolist():
            graph.add_edge(head, tail, relation=label)

        # attach the entity attributes; entities without edges become isolated nodes
        for _, record in self.entity.iterrows():
            graph.add_node(
                record["entity name"],
                entity_type=record.get("entity type"),
                timestamp=record.get("timestamp"),
                description=record.get("description")
            )

        net = Network(height="750px", width="100%", directed=True)

        for name, attrs in graph.nodes(data=True):
            colour = palette.get(attrs.get("entity_type"), "gray")
            net.add_node(name, label=name, title=str(attrs), color=colour)

        for source, target, attrs in graph.edges(data=True):
            net.add_edge(source, target, title=attrs['relation'], arrowStrikethrough=False)

        net.toggle_physics(True)
        net.show_buttons(filter_=['physics'])

        net.write_html(str(path / "graph.html"))
        
    def extract(self, temp_save, isretry):
        '''Run the full pipeline: themes, related papers, paper completion, then the summary.

        ``temp_save`` and ``isretry`` are forwarded unchanged to
        ``_extract_related_papers``.
        '''
        self._extract_theme()
        self._extract_related_papers(temp_save=temp_save, isretry=isretry)
        # the number of completed papers is part of the logged summary
        self._print_graph_info(self._complete_paper_info())
        
    def download(self, vis=False):
        dir_path = self.path / "final_result"
        os.makedirs(dir_path, exist_ok=True)
        resume_info = {
            "title": self.title,
            "year": self.timepoint,
            "topic": self.topic,
            "entity": str(dir_path / "entity.csv"),
            "relation": str(dir_path / "relation.csv")
        }
        #todo
        with open(dir_path / "resume.json", 'w') as f:
            json.dump(resume_info, f, indent=4)
        self.entity.to_csv(dir_path / "entity.csv", index=False)
        self.relation.to_csv(dir_path / "relation.csv", index=False)
        if vis:
            self._visualize(self.path/ "final result")
 
class Graph:
    def __init__(self, entity: pd.DataFrame, relation: pd.DataFrame, title: str, year: int, topic: str):
        self.entity = entity
        self.relation = relation
        self.title = title
        self.timepoint = year
        self.topic = topic
        self.problem = list(self.entity[self.entity['entity type'] == "problem"]['entity name'].values)
        
    # FIXME: this should be an API that retrieves information (inputs: problem and topic), not a detailed method
    # FIXME: decouple get_citation_pdf from extend
    #region
    # def extend(self, pdf_path, human_intervention=False, isretry=False, temp_save=False) -> Tuple[List[str], int]:
    #     '''Deprecated. Going to be replaced...add more paper entities to the graph by web-search. It may fail due to connection.'''
    #     import requests
    #     import time
    #     from bs4 import BeautifulSoup
    #     os.makedirs(pdf_path, exist_ok=True)
    #     new_papers = 0
    #     total_papers = []
    #     start_extend = time.time()
        
    #     def download_paper(url, title):
    #         for _ in range(10):
    #             res = requests.get(url=url)
    #             if res.status_code == 200:
    #                 break
    #             elif res.status_code == 404:
    #                 print("Uncommon error: cannot find the page. Stop searching")
    #                 return
    #             elif res.status_code == 429:
    #                 print("Request rate exceeded, try again later.")
    #                 time.sleep(3)
    #         if not res.status_code == 200:
    #             print("Failed to get response from arxiv.")
    #             return
    #         else:
    #             with open(Path(pdf_path) / f"{title}.pdf", 'wb') as f:
    #                 if f.write(res.content):
    #                     return True
            
    #     # TODO: move out later
    #     def search_for_related_papers(start_year: str, topic: str, _problem: str) -> List[Dict]:
    #         PREPS = [
    #             "in", 
    #             "on", 
    #             "at", 
    #             "by", 
    #             "with", 
    #             "from", 
    #             "to", 
    #             "of", 
    #             "about", 
    #             "after", 
    #             "before", 
    #             "during", 
    #             "through", 
    #             "under"
    #         ]
    #         # drop prepositions to get more related papers
    #         problem_list = _problem.lower().split(' ')
    #         for prep in PREPS:
    #             if prep in problem_list:
    #                 pop_ind = problem_list.index(prep)
    #                 problem_list.pop(pop_ind)
    #         problem = ' '.join(problem_list)

    #         url = "https://arxiv.org/search/advanced"
    #         params = {
    #             "advanced": "",
    #             "terms-0-operator": "AND",
    #             "terms-0-term": topic,
    #             "terms-0-field": "all",
    #             "terms-1-operator": "AND",
    #             "terms-1-term": problem,
    #             "terms-1-field": "all",
    #             "classification-computer_science": "y",
    #             "classification-physics_archives": "all",
    #             "classification-include_cross_list": "include",
    #             "date-filter_by": "specific_year",
    #             "date-year": start_year,
    #             "date-date_type": "submitted_date_first",
    #             "abstracts": "show",
    #             "size": "100",
    #             "order": "-announced_date_first",
    #             "format": "rss"
    #         }

    #         print(f"Try to search for more papers with the keywords: topic={topic}, problem={problem}, year={start_year}")
    #         for _ in range(10):
    #             res = requests.get(url=url, params=params)
    #             if res.status_code == 200:
    #                 break
    #             elif res.status_code == 400:
    #                 print("Wrong params. Stop searching...")
    #                 return
    #             elif res.status_code == 404:
    #                 print("Uncommon error: cannot find the page. Stop searching")
    #                 return
    #             elif res.status_code == 429:
    #                 print("Request rate exceeded, try again later.")
    #                 time.sleep(3)
    #         if not res.status_code == 200:
    #             print("Failed to get response from arxiv.")
    #             return
            
    #         soup = BeautifulSoup(res.text, 'html.parser')
    #         blocks = soup.find_all('li', class_='arxiv-result')
    #         result_data = []

    #         # process data
    #         for block in blocks:
    #             try:
    #                 pdf_link = soup.find('a', href=True, string='pdf')['href']

    #                 title = block.find('p', class_='title is-5 mathjax').text.strip()

    #                 _abstract = block.find('p', class_='abstract mathjax').text.strip()
    #                 abstract = re.search(r"▽ More(.*)△ Less", _abstract, re.DOTALL).group(1).strip()

    #                 submitted_date = block.find('p', class_='is-size-7').text.strip()
    #                 year = re.search(r"originally announced [a-zA-Z]+ (\d{4})", submitted_date).group(1)

    #                 _data = {
    #                     "title": title,
    #                     "abstract": abstract,
    #                     "year": year, 
    #                     "pdf_link": pdf_link
    #                 }
    #                 result_data.append(_data)

    #             except Exception as e:
    #                 print(f"Failed to load an data entry: {e} Skip it.")
    #                 traceback.print_exc()
    #         if not len(result_data):
    #             print("No related paper found")
    #         print(f"Got {len(result_data)} papers in total for {start_year}.")

    #         # filter
    #         if human_intervention:
    #             getdrop = False
    #             for _ in range(3):
    #                 drop = input("List the index of the paper you do not want to add to the graph. Please give a valid list.")
    #                 try:
    #                     drop = json.loads(drop)
    #                     for d in drop:
    #                         assert isinstance(d, int) and d >=0 and d<len(result_data)
    #                     getdrop = True
    #                     break
    #                 except:
    #                     pass
    #             if getdrop:
    #                 for d in drop:
    #                     result_data.pop(d)
    #         else:
    #             print("check if these papers actually related to our query using LLMs...")
    #             get_drop = []
    #             for ind, paper in enumerate(result_data):
    #                 title = paper.get('title')
    #                 abstract = paper.get('abstract')
    #                 if title and abstract:
    #                     isrelated = 'yes' in get_chat_completion(
    #                         [
    #                             {
    #                                 "role": "user",
    #                                 "content": FILTER_IRRELEVANT.format(title=title, abstract=abstract)
    #                             }
    #                         ]
    #                     ).lower()
    #                     print(f"{title}, related={isrelated}")
    #                     if not isrelated:
    #                         get_drop.append(ind)
    #                 else:
    #                     print("Cannot get basic information of the paper from response... Skip it.")
    #             for di in get_drop:
    #                 result_data.pop(di)

    #         # add into graph and download PDFs    
    #         if result_data:
    #             print(f"Filtered by LLMs, the following papers are related: {', '.join([p['title'] for p in result_data])}")
    #             for _data in result_data:
    #                 title = _data['title']
    #                 abstract = _data['abstract']
    #                 year = _data['year']
    #                 link = _data['pdf_link']
    #                 # create node and relation
    #                 self.entity.loc[len(self.entity)] = {
    #                                     "entity name": title,
    #                                     "entity type": "paper",
    #                                     "timestamp": year,
    #                                     "description": abstract
    #                                 }
    #                 rel_2 = pd.DataFrame(
    #                                     {
    #                                         'entity1': [title, title],
    #                                         'relation': ['relate to', 'provide solution to'],
    #                                         'entity2': [topic, _problem]
    #                                     }
    #                                 )
    #                 self.relation = pd.concat([self.relation, rel_2], ignore_index=True)

    #                 if link:
    #                     if download_paper(link, title):
    #                         print(f"Successfully download related paper titled *{title}*")
    #                     else:
    #                         print(f"Error occurred when downloading *{title}*")
    #                 else:
    #                     print(f"Cannot find pdf_link for the paper titled *{title}*")

    #         return result_data

    #     def get_citation_pdf():
    #         def get_arxiv_link(title):
    #             def simple_string(string):
    #                 return string.lower().replace(" ", "").strip()
    #             url = "https://arxiv.org/search/"
    #             params = {
    #                 "query": title,
    #                 "searchtype": "title",
    #                 "abstracts": "show",
    #                 "order": "-announced_date_first",
    #             }
    #             for _ in range(10):
    #                 res = requests.get(url=url, params=params)
    #                 if res.status_code == 200:
    #                     break
    #                 elif res.status_code == 400:
    #                     print("Wrong params. Stop searching...")
    #                     return
    #                 elif res.status_code == 404:
    #                     print("Uncommon error: cannot find the page. Stop searching")
    #                     return
    #                 elif res.status_code == 429:
    #                     print("Request rate exceeded, try again later.")
    #                     time.sleep(3)
    #             if not res.status_code == 200:
    #                 print("Failed to get response from arxiv.")
    #                 return

    #             soup = BeautifulSoup(res.text, 'html.parser')
    #             blocks = soup.find_all('li', class_='arxiv-result')
    #             if not blocks:
    #                 print(f"Cannot find the paper titled *{title}* on arxiv.")
    #                 return
    #             else:
    #                 for block in blocks: # usually, len(blocks) is either 0 or 1
    #                     # TODO: abstract similarity can be introduced for comparison
    #                     pdf_link = soup.find('a', href=True, string='pdf')['href']
    #                     res_title = block.find('p', class_='title is-5 mathjax').text.strip()
    #                     if simple_string(title) == simple_string(res_title):
    #                         return pdf_link
    #                     else:
    #                         print(f"Only found paper titled *{res_title}* when searching for the one titled *{title}*")

    #         papers = self.entity[self.entity['entity type'] == "paper"].to_dict(orient='records')
    #         print(f"Downloading papers mentioned in related work: {json.dumps([p.get('entity name') for p in papers], indent=4)}")
    #         for p in papers:
    #             p_title = p.get('entity name')
    #             p_abstract = p.get('description')
    #             p_year = p.get('timestamp')
    #             if p_title != self.title:
    #                 pdf_link = get_arxiv_link(p_title)
    #                 if pdf_link:
    #                     if download_paper(pdf_link, p_title):
    #                         print(f"successfully download the paper in related work: {p_title}")
    #                         _pdata = {
    #                             "title": p_title,
    #                             "abstract": p_abstract,
    #                             "year": p_year
    #                         }
    #                         total_papers.append(_pdata)
    #                     else:
    #                         print(f"Error occurred when downloading *{p_title}*")
                    
    #     problems = self.entity[self.entity['entity type'] == "problem"]['entity name'].values
    #     for p in list(problems):
    #         for y in range(self.timepoint, 2024):
    #             temp_res = search_for_related_papers(
    #                 start_year=y,
    #                 topic=self.topic,
    #                 _problem=p
    #             )
    #             new_papers += len(temp_res)
    #             total_papers += temp_res
    #             check_point = time.time()
    #             if check_point - start_extend > 1000:
    #                 print("Too much time spent getting more papers by querying semantic scholar. Quittng...")
    #     if not isretry:
    #         get_citation_pdf()

    #     if temp_save: # FIXME: path
    #         self.entity.to_csv('/root/mypaperTKG/PaperTKG/test1/source_paper/output/Chain-of-Thought Prompting Elicits Reasoning in Large Language Models/final_result/entity.csv', index=False)
    #         self.relation.to_csv('/root/mypaperTKG/PaperTKG/test1/source_paper/output/Chain-of-Thought Prompting Elicits Reasoning in Large Language Models/final_result/relation.csv', index=False)

        
    #     return new_papers, total_papers
    #endregion