import json
import re
import time

import requests
from bs4 import BeautifulSoup

class LeetCodeScraper:
    """Scrape problem statements and code templates from leetcode.cn.

    The scraper downloads problem pages with ``requests``, parses them with
    BeautifulSoup, and turns each one into a JSON-serialisable dict holding
    the problem text, Kotlin/PHP/Scala declaration stubs and example text.
    """

    # Seconds to wait for any single HTTP response before giving up.
    REQUEST_TIMEOUT = 10
    # Pause (seconds) after each listing request / each problem request,
    # to limit the load placed on the server.
    LIST_DELAY = 1
    PROBLEM_DELAY = 2
    # Number of leading text fragments of the statement kept as the question.
    QUESTION_FRAGMENTS = 5
    # NOTE(review): this CSS class looks build-generated and may change
    # between site releases -- verify against the live page.
    QUESTION_CLASS = 'content__j51F'

    # Output stub per supported language; ``{name}`` and ``{params}`` are
    # filled in by ``str.format``. The return type is fixed to an integer
    # type because the page parsing below does not extract it.
    _TEMPLATES = {
        'kotlin': "fun {name}({params}): Int {{\n    \n}}",
        'php': "function {name}({params}): int {{\n    \n}}",
        'scala': "def {name}({params}): Int = {{\n    \n}}",
    }

    # One pattern per language: declaration keyword, function name (group 1)
    # and the raw parameter list (group 2). The parameter group is non-greedy,
    # so it stops at the first closing parenthesis.
    _DECL_PATTERNS = {
        'kotlin': re.compile(r"fun\s+(\w+)\s*\((.*?)\)", re.DOTALL),
        'php': re.compile(r"function\s+(\w+)\s*\((.*?)\)", re.DOTALL),
        'scala': re.compile(r"def\s+(\w+)\s*\((.*?)\)", re.DOTALL),
    }

    def __init__(self):
        """Set the request headers and base URL used for every request."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Referer': 'https://leetcode.cn/'
        }
        self.base_url = "https://leetcode.cn"

    def get_all_slugs(self):
        """Return the slugs of all problems listed by the problems API.

        Pages are requested one after another until a page has no
        ``stat_status_pairs`` entries or contributes no slug that has not
        been seen before. Each slug appears once, in first-seen order.

        Raises:
            requests.HTTPError: if the listing endpoint answers with an
                error status.
        """
        all_slugs = []
        seen = set()
        page = 1

        while True:
            url = f"{self.base_url}/api/problems/algorithms/?page={page}"
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            pairs = response.json().get('stat_status_pairs')
            if not pairs:
                break

            added = 0
            for problem in pairs:
                # The Chinese site also reports the English slug here.
                slug = problem['stat']['question__title_slug']
                if slug not in seen:
                    seen.add(slug)
                    all_slugs.append(slug)
                    added += 1

            # A page without any unseen slug means there is nothing left to
            # collect. This also ends the loop if the server ignores the
            # ``page`` parameter and keeps returning the same list, which
            # would otherwise never terminate.
            if not added:
                break

            page += 1
            time.sleep(self.LIST_DELAY)

        return all_slugs

    def get_problem_data(self, slug):
        """Fetch one problem page and convert it into a result dict.

        Args:
            slug: URL slug of the problem, e.g. ``"roman-to-integer"``.

        Returns:
            A dict with the keys ``prompt``, ``slug``, ``question``,
            ``declaration`` and ``test_case``, or ``None`` when the page
            could not be fetched or has no problem description element.
        """
        url = f"{self.base_url}/problems/{slug}"
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            return self._parse_problem(soup, slug)
        except Exception as e:
            # Per-problem boundary: one failing page is reported and skipped
            # so that a bulk scrape can continue with the next slug.
            print(f"Error fetching problem {slug}: {e}")
            return None

    def _parse_problem(self, soup, slug):
        """Build the result dict for ``slug`` from an already parsed page."""
        question_div = soup.find('div', class_=self.QUESTION_CLASS)
        if not question_div:
            return None

        # Keep only the leading text fragments as the core description.
        fragments = [text.strip() for text in question_div.stripped_strings]
        question = " ".join(fragments[:self.QUESTION_FRAGMENTS])

        kotlin_decl = self.get_code_template(soup, 'kotlin')
        php_decl = self.get_code_template(soup, 'php')
        scala_decl = self.get_code_template(soup, 'scala')

        # Example items are concatenated, one per line.
        test_case = "\n".join(
            example.get_text(strip=True)
            for example in soup.find_all('li', class_='example-item')
        )

        return {
            "prompt": f"{kotlin_decl}\n    \"{question}\"",
            "slug": slug,
            "question": question,
            "declaration": {
                "kotlin": f"class Solution {{\n    {kotlin_decl}\n}}",
                "php": f"class Solution {{\n    {php_decl}\n}}",
                "scala": f"object Solution {{\n    {scala_decl}\n}}"
            },
            "test_case": test_case
        }

    def get_code_template(self, soup, language):
        """Extract the function declaration stub for ``language``.

        The first ``div.code-block`` of the page is searched for ``li``
        entries whose text mentions the language name (case-insensitively).
        The first such entry containing a declaration of the form
        ``<keyword> name(params)`` supplies the function name and parameter
        list of the returned stub.

        Args:
            soup: Parsed page; only ``find`` on the page and ``find_all`` /
                ``.text`` on the returned elements are used.
            language: ``'kotlin'``, ``'php'`` or ``'scala'``.

        Returns:
            The stub as a string. Falls back to the default stub when nothing
            can be parsed, and to ``None`` for an unsupported language.
        """
        pattern = self._DECL_PATTERNS.get(language)
        if pattern is None:
            return self.generate_default_template(language)

        code_block = soup.find('div', class_='code-block')
        if not code_block:
            return self.generate_default_template(language)

        for line in code_block.find_all('li'):
            text = line.text
            if language not in text.lower():
                continue
            match = pattern.search(text)
            if match:
                name, params = match.groups()
                return self._TEMPLATES[language].format(name=name, params=params)

        return self.generate_default_template(language)

    def generate_default_template(self, language):
        """Return the placeholder stub used when no declaration was parsed.

        Returns ``None`` for a language other than kotlin, php or scala.
        """
        template = self._TEMPLATES.get(language)
        if template is None:
            return None
        return template.format(name="function", params="...")

    def scrape_problems(self, output_file, max_count=500):
        """Scrape up to ``max_count`` problems and write them to a JSON file.

        Args:
            output_file: Path of the JSON file to (over)write.
            max_count: Maximum number of slugs, taken from the start of the
                slug list, that are scraped.
        """
        slugs = self.get_all_slugs()[:max_count]
        results = {}

        for slug in slugs:
            print(f"Scraping problem {slug}...")
            data = self.get_problem_data(slug)
            if data:
                results[slug] = data
            time.sleep(self.PROBLEM_DELAY)

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"Scraped {len(results)} problems successfully")

# Script entry point: smoke-test the scraper on the single problem
# "roman-to-integer" instead of running a full scrape.
if __name__ == "__main__":
    test_slug = "roman-to-integer"
    problem_data = LeetCodeScraper().get_problem_data(test_slug)

    if not problem_data:
        # Nothing usable came back for the test slug.
        print("测试失败，未能获取题目数据。")
    else:
        # Show the generated record as pretty-printed JSON.
        print("测试成功！以下是生成的JSON数据：")
        print(json.dumps(problem_data, ensure_ascii=False, indent=2))

