import requests
import arxiv
from typing import Dict
from bs4 import BeautifulSoup
from urllib.parse import quote
from .logger import get_logger
from .extract_web_abs import extract_abstract

logger = get_logger(__name__)

class title_search:
    def __init__(self, title) -> None:
        self.title = title
    
    def match_title(self, searched_title: str) -> bool:
        '''
        check if the paper obtained from websearch is exact the paper we want
        '''
        return self.title.strip().lower() == searched_title.strip().lower()
    
    def s2(self) -> bool | Dict:
        fields = "title,url,authors,abstract,publicationDate,year"
        url = f"https://api.semanticscholar.org/graph/v1/paper/search/match?query={self.title}&fields={fields}"

        for i in range(3):
            try:
                res_ = requests.get(url).json()
                break
            except Exception as e:
                logger.info(f"web request error: {e}")
                if i == 2:
                    logger.error("Failed to fetch data after 3 attempts.")
                    return False
                
        if "data" in res_:
            # parse the returned result
            res_data = res_['data'][0]
            title_ = res_data['title']
            if not self.match_title(title_):
                logger.info(f"[s2] search for paper: {self.title}, but get the paper: {title_}")
                return False
            authors_ = ','.join([unit['name'] for unit in res_data['authors']])
            link_ = res_data['url']
            abstract_ = res_data['abstract']
            if not abstract_:
                if link_:
                    logger.info(f'''failed to scrape the abstract from senmantic scholar: {self.title},
                                        try to scrape it from link: {link_}''')
                    web_abs = extract_abstract(url=link_, paper_title=self.title)
                    if web_abs:
                        abstract_ = web_abs
                    else:
                        logger.info(f"failed to get the abstract of paper: {self.title} using senmantic scholar.")
                        abstract_ = ''
                else:
                    logger.info(f"failed to get the abstract and link of paper: {self.title} using senmantic scholar.")
                    abstract_ = ''
                    link_ = ''
                    
            date_ = res_data['year'] if res_data['publicationDate'] is None else res_data['publicationDate']
            
            paper_metadata = {
                'title': self.title,
                'authors': authors_,
                'link': link_,
                'abstract': abstract_,
                'date': date_
            }
            return paper_metadata
        else:
            logger.info(f"cannot find the paper \{self.title}\ on semantic scholar.")
            return False
        
    def arxiv(self) -> bool | Dict:
        client = arxiv.Client()
        search = arxiv.Search(
            query=f"ti:{self.title}",
            max_results=1,
        )
        res = client.results(search)
        paper_info = next(res, None)
        if paper_info is not None:
            _title = paper_info.title
            if not self.match_title(_title):
                logger.info(f"[arxiv] search for paper: {self.title}, but get the paper: {_title}")
                return False
            paper_metadata = {
                'title': _title,
                'authors': ','.join([str(aut) for aut in paper_info.authors]),
                'link': paper_info.links[0].href,
                'abstract': paper_info.summary,
                'date': str(paper_info.updated.date())
            }
            return paper_metadata
        else:
            logger.info(f"cannot find the paper \{self.title}\ on arxiv.")
            return False
        
    def google_scholar(self) -> bool | Dict:
        
        '''only scrape the abstract. information of authors and date will not be updated if using google schoalr search.'''
        
        # request url
        encoded_title = quote(self.title)
        url = f"http://www.google.com/scholar?as_q=&as_epq={encoded_title}&hl=en"
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        
        for i in range(3):
            try:
                response = requests.get(url, headers=headers)
                break
            except Exception as e:
                logger.info(f"web request error: {e}")
                if i == 2:
                    logger.error("Failed to fetch data after 3 attempts.")
                    return False
                
        html_content = response.text
        
        # parse the content
        soup = BeautifulSoup(html_content, 'html.parser')

        try:
            target_div = soup.find('div', class_='gs_r gs_or gs_scl gs_fmar')
            if target_div:
                a_tag = target_div.find('h3', class_='gs_rt').find('a')
                paper_title = a_tag.get_text()
                if not self.match_title(self.title, paper_title):
                    logger.info(f"[google scholar] search for paper: {self.title}, but get the paper: {paper_title}")
                    return False
                paper_link = a_tag.get('href')
                abstract_element = target_div.find('div', class_='gsh_csp').get_text()
                
                if not abstract_element:
                    if paper_link:
                        logger.info(f'''failed to scrape the abstract from google scholar: {self.title},
                                    try to scrape it from link: {paper_link}''')
                        web_abs = extract_abstract(url=paper_link, paper_title=self.title)
                        if not web_abs:
                            logger.info(f"failed to get the abstract of paper: {self.title} using google scholar.")
                            abstract_ = ''
                        else:
                            abstract_ = web_abs
                    else:
                        logger.info(f"failed to get the information of paper: {self.title} using google scholar.")
                        return False
                else:
                    abstract_ = abstract_element
                    
                paper_metadata = {
                                'title': self.title,
                                'authors': '',
                                'link': paper_link,
                                'abstract': abstract_,
                                'date': ''
                            }
                return paper_metadata
            
            else:
                logger.info(f"failed to get the information of paper: {self.title} using google scholar.")
                return False
            
        except Exception as e:
            print(f"try to find the paper information on google scholar, but failed to parse the page: {e}")
            return False
            
    def run_pipeline(self) -> bool | Dict:
        
        '''Sequentially search in arXiv, Semantic Scholar, and Google Scholar 
        until successfully obtaining the article information.'''
        
        arxiv_res = self.arxiv()
        if not arxiv_res:
            s2_res = self.s2()
            if not s2_res:
                gs_res = self.google_scholar()
                if not gs_res:
                    logger.info(f"websearch result: Failed to get the information of {self.title}.")
                    return False
                else:
                    return gs_res
            else:
                return s2_res
        else:
            return arxiv_res
