import aiohttp
import tenacity
import asyncio
from termcolor import colored
from typing import List, Dict, Tuple
from enum import Enum
from unidiff import PatchSet

from swegraft.utils.common import wait_coros_with_progress
from swegraft.utils.github import get_token, get_headers
from swegraft.utils.llm import make_llm_from_provider_config

# Verbs in the style of GitHub's issue-closing keywords (e.g. "fixes #123").
# Not referenced in the visible part of this module — presumably used to spot
# pull requests that reference an issue; confirm against callers.
PR_KEYWORDS = {
    "close",
    "closes",
    "closed",
    "fix",
    "fixes",
    "fixed",
    "resolve",
    "resolves",
    "resolved",
}

# Markers presumably used to skip trivial or automated pull requests; not
# referenced in the visible part of this module — confirm against callers.
# NOTE(review): "mispelling" is itself spelled with a single "s", so it is not
# a substring of the correct spelling "misspelling" — confirm this is intended.
SKIP_PR_KEYWORDS = {
    "mispelling",
    "typo",
    "dependabot",
}


class RelevanceLevel(Enum):
    """Ordered relevance grade; a larger ``value`` means more relevant.

    Members compare with each other by their integer ``value``. Comparing a
    member with anything that is not a ``RelevanceLevel`` returns
    ``NotImplemented`` so Python falls back to its default handling
    (``==`` is ``False``, ordering raises ``TypeError``).
    """

    HIGH = 3
    MID = 2
    LOW = 1
    NO = 0

    def __lt__(self, other: "RelevanceLevel"):
        if not isinstance(other, RelevanceLevel):
            return NotImplemented
        return self.value < other.value

    def __gt__(self, other: "RelevanceLevel"):
        if not isinstance(other, RelevanceLevel):
            return NotImplemented
        return self.value > other.value

    def __eq__(self, other: "RelevanceLevel"):
        if not isinstance(other, RelevanceLevel):
            return NotImplemented
        return self.value == other.value

    def __ge__(self, other: "RelevanceLevel"):
        if not isinstance(other, RelevanceLevel):
            return NotImplemented
        return self.value >= other.value

    def __le__(self, other: "RelevanceLevel"):
        if not isinstance(other, RelevanceLevel):
            return NotImplemented
        return self.value <= other.value

    def __ne__(self, other: "RelevanceLevel"):
        if not isinstance(other, RelevanceLevel):
            return NotImplemented
        return self.value != other.value

    def __hash__(self):
        # Overriding __eq__ implicitly sets __hash__ to None; restore a hash
        # consistent with value-based equality so members can be used in sets
        # and as dict keys.
        return hash(self.value)


# Repositories excluded from search results, taken from SWE-Bench-Verified and
# Multi-SWE-Bench (presumably to avoid overlapping with those benchmarks —
# confirm), grouped by primary language.
# Entries are "owner/name" strings that keep mixed casing (several contain
# upper-case letters), so a case-insensitive lookup must normalise both the
# candidate name and these entries.
SKIP_REPOS = [
    # C
    "facebook/zstd",
    "jqlang/jq",
    "ponylang/ponyc",
    # C++
    "catchorg/Catch2",
    "fmtlib/fmt",
    "nlohmann/json",
    "simdjson/simdjson",
    "yhirose/cpp-httplib",
    # Go
    "cli/cli",
    "grpc/grpc-go",
    "zeromicro/go-zero",
    # Java
    "alibaba/fastjson2",
    "elastic/logstash",
    "mockito/mockito",
    # JavaScript
    "anuraghazra/github-readme-stats",
    "axios/axios",
    "expressjs/express",
    "iamkun/dayjs",
    "Kong/insomnia",
    "sveltejs/svelte",
    # Rust
    "BurntSushi/ripgrep",
    "clap-rs/clap",
    "nushell/nushell",
    "serde-rs/serde",
    "sharkdp/bat",
    "sharkdp/fd",
    "rayon-rs/rayon",
    "tokio-rs/bytes",
    "tokio-rs/tokio",
    "tokio-rs/tracing",
    # TypeScript
    "darkreader/darkreader",
    "mui/material-ui",
    "vuejs/core",
    # Python from SWE-Bench-Verified
    "matplotlib/matplotlib",
    "psf/requests",
    "scikit-learn/scikit-learn",
    "sphinx-doc/sphinx",
    "pytest-dev/pytest",
    "pylint-dev/pylint",
    "django/django",
    "astropy/astropy",
    "pydata/xarray",
    "pallets/flask",
    "mwaskom/seaborn",
    "sympy/sympy",
]


@tenacity.retry(
    stop=tenacity.stop_after_attempt(3),
    wait=tenacity.wait_exponential(multiplier=1, min=2, max=10),
    retry=tenacity.retry_if_exception_type(aiohttp.ClientError),
)
async def get_repo_topics(repo: str, omit_topics: List[str] = None) -> List[str]:
    """Get the topic labels of a repository.

    Args:
        repo: Repository in ``owner/name`` form.
        omit_topics: Topics to drop from the result; matched
            case-insensitively. ``None`` (the default) omits nothing.

    Returns:
        The topic names reported by the GitHub topics endpoint, minus the
        omitted ones.

    Raises:
        aiohttp.ClientError: re-raised via tenacity after 3 failed attempts
            (non-2xx responses raise through ``raise_for_status``).
    """
    # Normalise once so that both sides of the comparison are lower-case.
    omitted = {topic.lower() for topic in omit_topics or ()}
    url = f"https://api.github.com/repos/{repo}/topics"
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=get_headers(get_token())) as response:
            response.raise_for_status()
            payload = await response.json()
    return [topic for topic in payload.get("names", []) if topic.lower() not in omitted]


@tenacity.retry(
    stop=tenacity.stop_after_attempt(3),
    wait=tenacity.wait_exponential(multiplier=1, min=2, max=10),
    retry=tenacity.retry_if_exception_type(aiohttp.ClientError),
)
async def get_repos_by_topic(topic: str, min_stars: int = 100) -> List[Dict]:
    """Get repositories with topic, and stars >= min_stars.

    Pages through the GitHub repository search (sorted by stars, descending)
    and drops every repository listed in ``SKIP_REPOS``.

    Args:
        topic: Topic label to search for.
        min_stars: Minimum (inclusive) star count.

    Returns:
        The raw search result items for all matching repositories that are
        not in ``SKIP_REPOS`` (compared case-insensitively).

    Raises:
        aiohttp.ClientError: re-raised via tenacity after 3 failed attempts;
            each retry restarts the pagination from page 1.
    """
    per_page = 100  # maximum page size the search API honours
    max_results = 1000  # the search API serves at most 1000 results per query
    url = "https://api.github.com/search/repositories"
    params = {
        "q": f"topic:{topic} stars:>={min_stars}",
        "sort": "stars",
        "order": "desc",
        "per_page": per_page,
    }
    all_repos = []
    async with aiohttp.ClientSession() as session:
        page = 1
        while len(all_repos) < max_results:
            params["page"] = page
            async with session.get(
                url, headers=get_headers(get_token()), params=params
            ) as response:
                response.raise_for_status()
                payload = await response.json()
            items = payload.get("items", [])
            all_repos.extend(items)
            # A short (or empty) page is the last one. `incomplete_results`
            # only signals a server-side timeout, not "more pages available".
            if len(items) < per_page:
                break
            page += 1
    # SKIP_REPOS keeps mixed casing, so normalise both sides.
    skipped = {name.lower() for name in SKIP_REPOS}
    return [repo for repo in all_repos if repo["full_name"].lower() not in skipped]


def extract_patches(
    patch: str, ignore_files: List[str] = [".gitignore"]
) -> tuple[str, str]:
    """Split a unified diff into a fix patch and a test patch by file path.

    Every entry yielded by ``PatchSet(patch)`` is routed by its ``path``:
    entries whose path contains any of ``ignore_files`` are dropped, entries
    whose path contains a test marker go to the test patch, and the rest go
    to the fix patch.

    Returns:
        ``(fix_patch, test_patch)`` — the concatenated text of each group.
    """
    test_markers = ("test", "tests", "e2e", "testing")
    fix_parts = []
    test_parts = []
    for patched_file in PatchSet(patch):
        file_path = patched_file.path
        # Drop ignored files (by default the .gitignore part of the diff).
        if any(ignored in file_path for ignored in ignore_files):
            continue
        looks_like_test = any(marker in file_path for marker in test_markers)
        bucket = test_parts if looks_like_test else fix_parts
        bucket.append(str(patched_file))
    return "".join(fix_parts), "".join(test_parts)


def extract_patches_by_content(patch: str):
    """Split a unified diff into a fix patch and a test patch by diff text.

    Unlike a path-based split, an entry counts as a test change when a test
    marker appears anywhere in its rendered text (``str(entry)``). Entries
    whose path contains ".gitignore" are dropped.

    Returns:
        ``(fix_patch, test_patch)`` — the concatenated text of each group.
    """
    test_markers = ("test", "tests", "e2e", "testing")
    fix_parts = []
    test_parts = []
    for entry in PatchSet(patch):
        # Remove the .gitignore part of the diff.
        if ".gitignore" in entry.path:
            continue
        rendered = str(entry)
        if any(marker in rendered for marker in test_markers):
            test_parts.append(rendered)
        else:
            fix_parts.append(rendered)
    return "".join(fix_parts), "".join(test_parts)


@tenacity.retry(
    stop=tenacity.stop_after_attempt(3),
    wait=tenacity.wait_exponential(multiplier=1, min=2, max=10),
    retry=tenacity.retry_if_exception_type(aiohttp.ClientError),
)
async def get_repo_metadata(repo: str) -> Dict:
    """Fetch the GitHub REST metadata document for ``repo`` (``owner/name``).

    Returns the decoded JSON body. Non-2xx responses raise through
    ``raise_for_status``; ``aiohttp.ClientError`` is retried up to 3 attempts.
    """
    endpoint = f"https://api.github.com/repos/{repo}"
    async with aiohttp.ClientSession() as http:
        auth_headers = get_headers(get_token())
        async with http.get(endpoint, headers=auth_headers) as reply:
            reply.raise_for_status()
            metadata = await reply.json()
            return metadata


async def search_repos_by_keywords(
    keywords: List[str] = None,
    min_stars: int = 100,
) -> List[Dict]:
    """Search repositories for every keyword and merge the results.

    Args:
        keywords: Search keywords; one GitHub search is run per keyword.
            ``None`` or an empty list yields an empty result.
        min_stars: Minimum star count passed to each search.

    Returns:
        The search result items of all keywords, de-duplicated by
        ``full_name`` (first occurrence wins, original order preserved).
    """
    # The default is None, which is not iterable; nothing to search for.
    if not keywords:
        return []
    coros = [_search_repos_single_keyward(keyword, min_stars) for keyword in keywords]
    per_keyword = await wait_coros_with_progress(
        coros, "Searching for repositories with keywords"
    )
    # dedup: dicts keep insertion order, so setdefault keeps the first hit
    unique = {}
    for batch in per_keyword:
        for repo in batch:
            unique.setdefault(repo["full_name"], repo)
    return list(unique.values())


@tenacity.retry(
    stop=tenacity.stop_after_attempt(3),
    wait=tenacity.wait_exponential(multiplier=1, min=2, max=10),
    retry=tenacity.retry_if_exception_type(aiohttp.ClientError),
)
async def _search_repos_single_keyward(
    keyword: str,
    min_stars: int = 100,
) -> List[Dict]:
    """Execute a single GitHub search query and return repositories.

    Pages through the repository search (sorted by stars, descending) until a
    short page is returned or the API's 1000-result cap is reached.

    The search is best-effort: any error is printed and whatever was
    collected so far is returned. Because nothing propagates out of the
    ``try`` block, the retry decorator does not fire for request failures.

    Args:
        keyword: Free-text search keyword; may contain spaces.
        min_stars: Minimum (inclusive) star count.

    Returns:
        The raw search result items collected before the search ended.
    """
    per_page = 100
    max_results = 1000  # the search API serves at most 1000 results per query
    # The keyword is passed as-is: `params=` URL-encodes the query, so
    # pre-replacing spaces with "+" would send a literal plus sign.
    params = {
        "q": f"{keyword} stars:>={min_stars}",
        "per_page": per_page,
        "sort": "stars",
        "order": "desc",
    }
    url = "https://api.github.com/search/repositories"
    all_repos = []
    try:
        async with aiohttp.ClientSession() as session:
            page = 1
            while len(all_repos) < max_results:
                params["page"] = page
                async with session.get(
                    url, headers=get_headers(get_token()), params=params
                ) as response:
                    response.raise_for_status()
                    response_data = await response.json()
                repos = response_data.get("items", [])
                all_repos.extend(repos)
                # An empty or short page means there is nothing further.
                if len(repos) < per_page:
                    break
                page += 1
    except Exception as e:
        print(f"Error in single query search: {e}")
    return all_repos


@tenacity.retry(
    stop=tenacity.stop_after_attempt(3),
    wait=tenacity.wait_exponential(multiplier=1, min=2, max=10),
    retry=tenacity.retry_if_exception_type(aiohttp.ClientError),
)
async def get_repo_readme(repo: str) -> str:
    """Get the README content of a repository.

    Tries a fixed list of common README filenames through the GitHub contents
    endpoint and returns the first one that decodes to text.

    Lookup is best-effort: an error on one filename is printed and the next
    filename is tried. Because those errors are handled here, the retry
    decorator does not fire for per-file request failures.

    Args:
        repo: Repository in ``owner/name`` form.

    Returns:
        The UTF-8 decoded README text, or ``""`` if none could be fetched.
    """
    import base64

    # Try common README filenames
    readme_files = [
        "README.md",
        "README.rst",
        "README.txt",
        "README",
        "readme.md",
        "readme.txt",
    ]

    # One session is shared by all attempts instead of opening one per file.
    async with aiohttp.ClientSession() as session:
        for readme_file in readme_files:
            url = f"https://api.github.com/repos/{repo}/contents/{readme_file}"
            try:
                async with session.get(
                    url, headers=get_headers(get_token())
                ) as response:
                    if response.status == 404:
                        continue  # Try next filename
                    if response.status != 200:
                        response.raise_for_status()
                        continue
                    response_data = await response.json()
                encoded = response_data.get("content")
                if encoded:
                    return base64.b64decode(encoded).decode("utf-8")
            except Exception as e:
                print(f"Error fetching {readme_file} for {repo}: {e}")
                continue

    return ""  # Return empty string if no README found


@tenacity.retry(
    stop=tenacity.stop_after_attempt(3),
    wait=tenacity.wait_exponential(multiplier=1, min=5, max=60),
)
async def extract_keywords_from_readme_llm(
    readme_content: str, repo_name: str, provider_config: dict, model: str
) -> List[str]:
    """Use LLM to extract meaningful keywords from README content for repository search.

    The README is truncated to 8000 characters, sent to the LLM with a prompt
    asking for a Python list literal inside a ```python fence, and the reply
    is parsed and filtered.

    Failures of the LLM call or of parsing the reply are printed and yield
    ``[]``; they do not propagate, so the retry decorator only sees errors
    raised while constructing the LLM client.

    Args:
        readme_content: Raw README text; blank content short-circuits to ``[]``.
        repo_name: Repository name interpolated into the prompt.
        provider_config: Passed to ``make_llm_from_provider_config``.
        model: Model identifier passed to the chat completion call.

    Returns:
        Up to 15 lower-cased keywords, longer than 2 characters, with
        generic terms removed.
    """
    import ast

    if not readme_content.strip():
        return []

    max_readme_length = 8000  # Reasonable limit for most LLMs
    if len(readme_content) > max_readme_length:
        readme_content = readme_content[:max_readme_length] + "..."

    prompt = f"""You are an expert at analyzing software repositories and extracting meaningful search keywords that reflect functionality and tool categories.

Given the README content of the repository "{repo_name}", extract keywords that capture BOTH the specific functionality AND broader tool categories. Focus on:

1. **Core Functions**: What does this software actually DO? (e.g., "text processing", "image recognition", "data visualization", "API testing")
2. **Primary Capabilities**: Main features and operations (e.g., "parsing", "encryption", "monitoring", "deployment", "compression")
3. **Tool Categories**: What TYPE of tool is this? (e.g., "CLI tool", "web framework", "database", "testing framework", "code analysis", "build tool")
4. **Problem Domain**: What domain does it operate in? (e.g., "developer tools", "system administration", "data science", "web development")
5. **Use Cases**: How is it used? (e.g., "CI/CD", "data analysis", "web scraping", "file conversion", "automation")
6. **Related Ecosystems**: What broader ecosystems does it belong to? (e.g., "command line utilities", "development environment", "system tools", "productivity tools")
7. **Problem Solving**: What problems does it solve? (e.g., "performance optimization", "code analysis", "database migration")
8. **Features**: What features does it have? (e.g., "GPU-accelerated", "multi-threading", "parallel processing", "distributed computing", "highlighting", "syntax checking", "code completion", "refactoring", "debugging", "profiling", "testing", "linting", "formatting", "version control", "code generation", "code review", "code analysis", "code refactoring", "code optimization", "code security", "code performance", "code readability", "code maintainability", "code portability", "code compatibility", "code security", "code performance", "code readability", "code maintainability", "code portability", "code compatibility")

README Content:
{readme_content}

Extract 5-10 keywords that include BOTH specific functionality AND broader categories to ensure comprehensive search coverage.

Examples of GOOD keywords (mix of specific and broad):
**For a tool like ripgrep:**
- Specific: "text search", "pattern matching", "grep replacement", "regex search"  

**For a web framework like Flask:**: "web framework", "HTTP routing", "template rendering", "REST API"

**For a testing tool:**: "unit testing", "test runner", "assertion library", "mock testing"

Include keywords that would help find similar tools in the same category, even if they have different specific implementations.

AVOID only these overly generic terms:
- "software", "project", "application", "code", "programming", "development", "open source", "github", "repository"

But DO include useful categorical terms like:
- "CLI tool", "web framework", "testing framework", "database", "library", "developer tools", etc.

Respond with a Python string list mixing BOTH specific functionality AND broader tool categories, sorted by relevance:
```python
["specific_function1", "tool_category1", "domain_area1", "specific_function2", "tool_category2", ...]
```"""

    llm_client = make_llm_from_provider_config(provider_config)
    try:
        response = await llm_client.chat_completion_async(
            messages=[{"role": "user", "content": prompt}],
            model=model,
            max_tokens=512,
            temperature=0.0,  # Lower temperature for more consistent keyword extraction
        )
        content = response.choices[0].message.content.strip()
        print(f"LLM response: {content}")
        keywords = content.split("```python")[1].split("```")[0]
        # SECURITY: the reply is shaped by an arbitrary repository's README, so
        # it is untrusted. Parse it as a literal only; never eval() it.
        keywords = ast.literal_eval(keywords.strip())

        # Filter and improve keywords to ensure they're functional
        filtered_keywords = []
        generic_terms = {
            "software",
            "project",
            "application",
            "app",
            "code",
            "development",
            "programming",
            "open source",
            "github",
            "repository",
            "python",
            "javascript",
            "java",
            "go",
            "rust",
            "typescript",
            "node.js",
        }

        for keyword in keywords:
            if isinstance(keyword, str) and len(keyword.strip()) > 2:
                clean_keyword = keyword.strip().lower()
                # Skip generic terms
                if clean_keyword in generic_terms:
                    continue
                filtered_keywords.append(clean_keyword)

        print(f"Filtered functional keywords: {filtered_keywords}")
        # Hard cap of 15; the prompt asks for 5-10, so this only trims replies
        # that ignore the requested range.
        return filtered_keywords[:15]

    except Exception as e:
        print(f"Error calling LLM for keyword extraction: {e}")

    return []


# GraphQL query returning the total, closed and open issue counts of a single
# repository under the aliases `all`, `closed` and `open`.
# NOTE(review): the `$owner` / `$name` variables default to an unrelated
# repository (pranjay-poddar/Dev-Geeks). repo_issue_count always sends both
# variables, so these defaults are not exercised by the visible code — they
# look like leftovers from an example query; confirm before relying on them.
ISSUE_COUNT_QUERY = """
query ($owner: String = "pranjay-poddar", $name: String = "Dev-Geeks") {
  repository(owner: $owner, name: $name) {
    all:issues {
      totalCount
    }
    closed:issues(states:CLOSED) {
      totalCount
    }
    open:issues(states:OPEN) {
      totalCount
    }
  }
}
"""

async def repo_issue_count(
    repo: dict
) -> dict:
    """Get the number of issues for a repository.

    Queries the GitHub GraphQL API with ``ISSUE_COUNT_QUERY`` and stores the
    total issue count on the given dict under ``"issue_count"``.

    Args:
        repo: Repository dict providing ``repo["owner"]["login"]`` and
            ``repo["name"]``. It is mutated in place.

    Returns:
        The same ``repo`` dict, now carrying ``"issue_count"``.

    Raises:
        aiohttp.ClientResponseError: on a non-2xx HTTP status.
        RuntimeError: when the GraphQL reply carries no repository data
            (GraphQL reports such errors in the body of a 200 response).
    """
    owner = repo["owner"]["login"]
    name = repo["name"]
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "https://api.github.com/graphql",
            headers=get_headers(get_token()),
            json={
                "query": ISSUE_COUNT_QUERY,
                "variables": {
                    "owner": owner,
                    "name": name,
                },
            },
        ) as resp:
            resp.raise_for_status()
            data = await resp.json()
    repository = (data.get("data") or {}).get("repository")
    if repository is None:
        raise RuntimeError(
            f"GitHub GraphQL returned no repository data for {owner}/{name}: "
            f"{data.get('errors')}"
        )
    repo["issue_count"] = repository["all"]["totalCount"]
    return repo
        
async def find_similar_repos(
    seed_repo: dict,
    min_stars: int = 100,
    provider_config: dict = None,
    model: str = None,
    min_relevance_level: int = 1,
    concurrency: int = 100,
) -> List[Dict]:
    """Find similar repositories using multiple search strategies.

    Pipeline: fetch the seed README, let the LLM derive search keywords,
    search GitHub per keyword, drop skipped repositories and the seed itself,
    keep repositories with at least 100 issues, fetch their READMEs, and let
    the LLM grade each candidate against the seed.

    Args:
        seed_repo: Repository dict with at least ``"full_name"``; its
            ``"readme"`` key is set in place.
        min_stars: Minimum star count for the keyword search.
        provider_config: LLM provider configuration.
        model: LLM model identifier.
        min_relevance_level: Minimum ``RelevanceLevel`` value (as an int) a
            candidate needs to be returned.
        concurrency: Maximum number of concurrent operations per fan-out
            stage (issue counts, READMEs, relevance grading).

    Returns:
        Candidate dicts, each extended with ``"issue_count"``, ``"readme"``,
        ``"relevance_level"`` (int) and ``"relevance_reason"``, sorted by
        relevance level and then star count, both descending.
    """
    min_issue_count = 100

    readme_content = await get_repo_readme(seed_repo["full_name"])
    seed_repo["readme"] = readme_content
    llm_keywords = await extract_keywords_from_readme_llm(
        readme_content, seed_repo["full_name"], provider_config, model
    )
    print(f"LLM extracted keywords: {llm_keywords}")

    raw_repos = await search_repos_by_keywords(
        keywords=llm_keywords,
        min_stars=min_stars,
    )
    print(colored(f"LLM keyword search added {len(raw_repos)} repositories", "light_cyan"))

    # Filter before any per-repository request so no API call is spent on a
    # repository that would be discarded anyway. SKIP_REPOS keeps mixed
    # casing, so both sides are lower-cased for the comparison.
    skipped = {name.lower() for name in SKIP_REPOS}
    candidates = [
        repo
        for repo in raw_repos
        if repo["full_name"].lower() not in skipped
        and repo["full_name"] != seed_repo["full_name"]
    ]

    # One semaphore bounds every fan-out stage below.
    semaphore = asyncio.Semaphore(concurrency)

    async def _with_issue_count(repo: dict) -> dict:
        async with semaphore:
            return await repo_issue_count(repo)

    coros = [_with_issue_count(repo) for repo in candidates]
    counted = await wait_coros_with_progress(coros, "Getting issue count for repositories")
    repos = [repo for repo in counted if repo["issue_count"] >= min_issue_count]
    print(
        colored(
            f"Total {len(repos)} repositories with at least {min_issue_count} issues",
            "light_cyan",
        )
    )

    async def _with_readme(repo: dict) -> dict:
        async with semaphore:
            repo["readme"] = await get_repo_readme(repo["full_name"])
        return repo

    coros = [_with_readme(repo) for repo in repos]
    repos = await wait_coros_with_progress(coros, "Getting README for repositories")

    async def _get_repo_relevance(repo: dict) -> dict:
        async with semaphore:
            try:
                relevance_level, relevance_reason = await repo_relevance_level(
                    seed_repo, repo, provider_config, model
                )
                repo["relevance_level"] = relevance_level.value
                repo["relevance_reason"] = relevance_reason
            except Exception as e:
                # A failed grading counts as "not relevant" instead of
                # aborting the whole batch.
                print(
                    colored(
                        f"Error calculating relevance for {repo['full_name']}: {e}",
                        "light_cyan",
                    )
                )
                repo["relevance_level"] = RelevanceLevel.NO.value
                repo["relevance_reason"] = str(e)
        return repo

    coros = [_get_repo_relevance(repo) for repo in repos]
    repos = await wait_coros_with_progress(coros, "Calculating relevance")
    repos = [repo for repo in repos if repo["relevance_level"] >= min_relevance_level]
    repos.sort(
        key=lambda x: (x["relevance_level"], x["stargazers_count"]), reverse=True
    )
    return repos


async def repo_relevance_level(
    seed_repo: dict,
    candidate_repo: dict,
    provider_config: dict,
    model: str,
) -> Tuple[RelevanceLevel, str]:
    """Use LLM to determine relevance between two repositories based on README content.

    Args:
        seed_repo: Reference repository dict; needs ``"full_name"`` and may
            carry ``"readme"``.
        candidate_repo: Repository dict to grade; same keys as ``seed_repo``.
        provider_config: Passed to ``make_llm_from_provider_config``.
        model: Model identifier passed to the chat completion call.

    Returns:
        ``(level, reason)`` where ``level`` is the parsed ``RelevanceLevel``
        (anything other than high/mid/low maps to ``RelevanceLevel.NO``) and
        ``reason`` is the LLM's explanation.

    Raises:
        IndexError, ValueError, SyntaxError: when the reply does not contain
            a parsable ```python list literal with two elements.
        Exception: whatever the LLM call raises after 10 attempts (wrapped by
            tenacity).
    """
    import ast

    # Truncate READMEs if too long
    max_readme_length = 4000
    seed_readme = seed_repo.get("readme") or ""
    candidate_readme = candidate_repo.get("readme") or ""
    if len(seed_readme) > max_readme_length:
        seed_readme = seed_readme[:max_readme_length] + "..."
    if len(candidate_readme) > max_readme_length:
        candidate_readme = candidate_readme[:max_readme_length] + "..."

    # The repo dicts are interpolated into the prompt as metadata. Leave the
    # "readme" key out: it holds the full README, which would otherwise be
    # sent untruncated next to the truncated copy.
    seed_meta = {k: v for k, v in seed_repo.items() if k != "readme"}
    candidate_meta = {k: v for k, v in candidate_repo.items() if k != "readme"}

    prompt = f"""You are an expert at analyzing software repositories and determining their functional relevance.

Compare these two repositories and determine how FUNCTIONALLY RELEVANT they are to each other.

SEED REPOSITORY: {seed_meta}
README Content:
{seed_readme}

CANDIDATE REPOSITORY: {candidate_meta}  
README Content:
{candidate_readme}

Analyze their functional relevance based on:
1. **Core Functionality**: Do they solve similar problems or serve similar purposes?
2. **Problem Domain**: Do they operate in the same or related domains?
3. **Technologies**: Do they use similar technologies?
4. **Components**: Do they share similar components?

Classify the relevance into exactly ONE of these 4 levels based on the 4 dimensions above:

**High**: Strong alignment across multiple dimensions
- Core Functionality: Solve the same or very similar problems with similar approaches
- Problem Domain: Operate in the same specific domain or closely related subdomains
- Technologies: Use similar tech stacks, frameworks, or programming paradigms
- Components: Share many similar architectural components or design patterns
- Examples: Two web scraping libraries, two React UI frameworks, two database ORMs, two CLI testing tools, two different types of CLI tools(may have similar components on parse args)

**Mid**: Moderate alignment with some overlap
- Core Functionality: Related functionality but different approaches or scope
- Problem Domain: Adjacent domains that users might work across
- Technologies: Some technology overlap but different primary focuses
- Components: Share some common components or architectural patterns
- Examples: Web framework vs API client, data processing vs data visualization, static site generator vs CMS, compiler vs code formatter

**Low**: Limited alignment but some shared elements
- Core Functionality: Different primary purposes but may serve complementary roles
- Problem Domain: Different domains but could be used in same broader context
- Technologies: Share some underlying technologies, libraries, or platforms
- Components: Few shared components but similar technical foundation
- Examples: Text editor vs syntax highlighter, different front-end projects use same framework like vue, database vs caching layer, linter vs static analyzer

**No** 
- Core Functionality: Completely different purposes and use cases
- Problem Domain: Unrelated application domains
- Technologies: Different technology stacks with minimal overlap
- Components: No significant shared components or patterns
- Examples: Game engine vs database tool, image editor vs web scraper, cryptocurrency vs weather app, video player vs file compressor

Respond with python list with two elements, "level", "reason", in the following format:
```python
["High", "The two repositories solve similar problems and have similar functionality."]
``` 
"""

    llm_client = make_llm_from_provider_config(provider_config)

    @tenacity.retry(
        stop=tenacity.stop_after_attempt(10),
        wait=tenacity.wait_exponential(multiplier=1, min=5, max=60),
    )
    async def _query_llm():
        return await llm_client.chat_completion_async(
            messages=[{"role": "user", "content": prompt}],
            model=model,
            max_tokens=1024,
            temperature=0.0,
        )

    completion = await _query_llm()
    reply_text = completion.choices[0].message.content
    literal = reply_text.split("```python")[1].split("```")[0].strip()
    # SECURITY: the reply is shaped by arbitrary README text, so it is
    # untrusted. Parse it as a literal only; never eval() it.
    response = ast.literal_eval(literal)
    print(
        colored(
            f"Relevance of {seed_repo['full_name']} and {candidate_repo['full_name']}: {response}",
            "light_cyan",
        )
    )
    levels = {
        "high": RelevanceLevel.HIGH,
        "mid": RelevanceLevel.MID,
        "low": RelevanceLevel.LOW,
    }
    level = levels.get(response[0].strip().lower(), RelevanceLevel.NO)
    return level, response[1]
