import requests
import arxiv
import time
from typing import Dict
from bs4 import BeautifulSoup
from urllib.parse import quote
from .extract_web_abs import extract_abstract
import urllib.request
import urllib.parse
import time
import xml.etree.ElementTree as ET
import os
import requests

# Proxy configuration (applied as a side effect of importing this module).
#
# Both the upper- and lower-case spellings of the proxy environment variables
# are set to the same proxy URL. To clear the proxy instead, assign '' to
# these four variables.
# NOTE(review): the proxy address is hard-coded and overwrites whatever the
# process environment already contained -- confirm this is intended outside
# the original deployment network.
os.environ.update(dict.fromkeys(
    ('HTTP_PROXY', 'HTTPS_PROXY', 'http_proxy', 'https_proxy'),
    'http://10.16.4.198:7890',
))

# Scheme -> proxy URL mapping (the shape accepted by the ``proxies=`` argument
# of ``requests``). Not referenced by the code visible in this module.
proxies = dict.fromkeys(('http', 'https'), 'http://10.16.4.198:7890')
class title_search:
    def __init__(self, title) -> None:
        self.title = title
    
    def match_title(self, searched_title: str) -> bool:
        '''
        check if the paper obtained from websearch is exact the paper we want
        '''
        return self.title.strip().lower().replace(" ","") == searched_title.strip().lower().replace(" ","")
    
    def s2(self) -> bool | Dict:
        fields = "title,url,authors,abstract,publicationDate,year"
        url = f"https://api.semanticscholar.org/graph/v1/paper/search/match?query={self.title}&fields={fields}"

        for i in range(3):
            try:
                res_ = requests.get(url).json()
                break
            except Exception as e:
                print(f"web request error: {e}")
                time.sleep(5)
                if i == 2:
                    print("Failed to fetch data after 3 attempts.")
                    return False
                
        if "data" in res_:
            # parse the returned result
            res_data = res_['data'][0]
            title_ = res_data['title']
            if not self.match_title(title_):
                print(f"[s2] search for paper: {self.title}, but get the paper: {title_}")
                return False
            authors_ = ','.join([unit['name'] for unit in res_data['authors']])
            link_ = res_data['url']
            abstract_ = res_data['abstract']
            if not abstract_:
                if link_:
                    print(f'''failed to scrape the abstract from senmantic scholar: {self.title},
                                        try to scrape it from link: {link_}''')
                    web_abs = extract_abstract(url=link_, paper_title=self.title)
                    if web_abs:
                        abstract_ = web_abs
                    else:
                        print(f"failed to get the abstract of paper: {self.title} using senmantic scholar.")
                        abstract_ = ''
                else:
                    print(f"failed to get the abstract and link of paper: {self.title} using senmantic scholar.")
                    abstract_ = ''
                    link_ = ''
                    
            date_ = res_data['year'] if res_data['publicationDate'] is None else res_data['publicationDate']
            
            paper_metadata = {
                'title': title_,
                'authors': authors_,
                'link': link_,
                'abstract': abstract_,
                'date': date_
            }
            return paper_metadata
        else:
            print(f"cannot find the paper \{self.title}\ on semantic scholar.")
            return False
        

    def arxiv(self) -> bool | Dict:
            client = arxiv.Client()
            search = arxiv.Search(
                query=f"ti:{self.title}",
                max_results=1,
            )


            for _ in range(3):
                try:
    
                    print(11111111111111111)
                    res = client.results(search)
                    print(333333333333333)
                    paper_info = next(res, None)
                    print(333334444444333333)
                    if paper_info is not None:
                        _title = paper_info.title
                        print(333334444444555555555555)
                        if not self.match_title(_title):
                            print(f"[arxiv] search for paper: {self.title}, but get the paper: {_title}")
                            return False
                        print(11111114444444555555555555)
                        paper_metadata = {
                            'title': _title,
                            'link': paper_info.links[0].href,
                            'abstract': paper_info.summary,
                            'date': str(paper_info.updated.date())
                        }
                        time.sleep(1)
                        return paper_metadata
                except:
                    print(2222222222222222)
                    print(f"cannot find the paper \{self.title}\ on arxiv.")
                    time.sleep(3)
            return False       
 

        
    def google_scholar(self) -> bool | Dict:
        
        '''only scrape the abstract. information of authors and date will not be updated if using google schoalr search.'''
        
        # request url
        encoded_title = quote(self.title)
        url = f"http://www.google.com/scholar?as_q=&as_epq={encoded_title}&hl=en"
        # headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        for i in range(3):
            try:
                time.sleep(1)
                response = requests.get(url)
                break
            except requests.ConnectionError as e:
                print("Connection error occurred when connecting to google scholar:", e)
                return False
            except Exception as e:
                print(f"web request error: {e}")
                if i == 2:
                    print("Failed to fetch data after 3 attempts.")
                    return False
                
        html_content = response.text
        
        # parse the content
        soup = BeautifulSoup(html_content, 'html.parser')

        try:
            target_div = soup.find('div', class_='gs_r gs_or gs_scl gs_fmar')
            if target_div:
                a_tag = target_div.find('h3', class_='gs_rt').find('a')
                paper_title = a_tag.get_text()
                if not self.match_title(self.title, paper_title):
                    print(f"[google scholar] search for paper: {self.title}, but get the paper: {paper_title}")
                    return False
                paper_link = a_tag.get('href')
                abstract_element = target_div.find('div', class_='gsh_csp').get_text()
                
                if not abstract_element:
                    if paper_link:
                        print(f'''failed to scrape the abstract from google scholar: {self.title},
                                    try to scrape it from link: {paper_link}''')
                        web_abs = extract_abstract(url=paper_link, paper_title=self.title)
                        if not web_abs:
                            print(f"failed to get the abstract of paper: {self.title} using google scholar.")
                            abstract_ = ''
                        else:
                            abstract_ = web_abs
                    else:
                        print(f"failed to get the information of paper: {self.title} using google scholar.")
                        return False
                else:
                    abstract_ = abstract_element
                    
                paper_metadata = {
                                'title': paper_title,
                                'authors': '',
                                'link': paper_link,
                                'abstract': abstract_,
                                'date': ''
                            }
                return paper_metadata
            
            else:
                print(f"failed to get the information of paper: {self.title} using google scholar.")
                return False
            
        except Exception as e:
            print(f"try to find the paper information on google scholar, but failed to parse the page: {e}")
            return False
            
    def run_pipeline(self) -> bool | Dict:
        
        '''Sequentially search in arXiv, Semantic Scholar, and Google Scholar 
        until successfully obtaining the article information.'''
        
        
        arxiv_res = self.arxiv()
        
        if not arxiv_res:
            
            s2_res = self.s2()
            if not s2_res:
                gs_res = self.google_scholar()
                if not gs_res:
                    print(f"websearch result: Failed to get the information of {self.title}.")
                    return False
                else:
                    return gs_res
            else:
                return s2_res
        else:
            return arxiv_res
