#!/usr/bin/env python3
"""
CVE Reproducibility Filter Script

Scans CVE JSON files and scores them based on reproducibility potential.
Filters out CVEs that are difficult to reproduce in Docker environments.
"""

import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Optional
from dataclasses import dataclass, field
import re


# Firmware/IoT vendors (cannot reproduce from source, requires hardware/mock)
FIRMWARE_VENDORS = {
    # Router vendors
    "tenda", "totolink", "tp-link", "netgear", "linksys", "zyxel",
    "mikrotik", "ubiquiti", "wavlink", "comfast", "draytek",
    "d-link", "dlink", "h3c", "ruijie", "reyee", "mercury",
    "fast", "lb-link", "trendnet", "edimax", "buffalo",
    # Camera/Security
    "hikvision", "dahua", "avtech", "reolink", "foscam", "swann",
    # Industrial control/Embedded
    "siemens", "schneider", "rockwell", "abb", "omron", "delta",
    "mitsubishi", "automationdirect",
}

# Firmware product keywords
FIRMWARE_PRODUCTS = {"firmware", "nvr", "dvr", "bios", "bootloader"}

# Matches any known firmware vendor as a standalone token of a lower-cased
# vendor string. A plain substring test is too loose for short names such as
# "fast", "abb" or "delta": it would flag "FasterXML" or "rabbitmq" as
# firmware vendors and penalise software that is easy to reproduce.
_FIRMWARE_VENDOR_RE = re.compile(
    r"(?<![a-z0-9])(?:"
    + "|".join(
        re.escape(name)
        for name in sorted(FIRMWARE_VENDORS, key=len, reverse=True)
    )
    + r")(?![a-z0-9])"
)


def is_firmware_iot(vendor: str, product: str) -> bool:
    """Check if the product is a firmware/IoT device.

    Args:
        vendor: Vendor name from the CVE record; ``None`` is treated as empty.
        product: Product name from the CVE record; ``None`` is treated as empty.

    Returns:
        True when the vendor contains one of FIRMWARE_VENDORS as a whole
        token (case-insensitive), or when the product contains one of the
        FIRMWARE_PRODUCTS keywords anywhere in its name.
    """
    vendor_lower = (vendor or "").lower()
    product_lower = (product or "").lower()

    # 1. Vendor match (token-based, e.g. "TP-Link Systems" but not "FasterXML")
    if _FIRMWARE_VENDOR_RE.search(vendor_lower):
        return True

    # 2. Product keyword match. Kept as a substring test on purpose so that
    #    glued names such as "ac6_firmware" or "nvr301" are still recognised.
    return any(keyword in product_lower for keyword in FIRMWARE_PRODUCTS)


@dataclass
class CVEReproducibility:
    """Reproducibility score and supporting metadata for a single CVE.

    Instances are created and filled in by
    ``CVEReproducibilityFilter.analyze_cve``; every field except ``cve_id``
    has a default so a record can be built before any analysis has run.
    """
    # CVE identifier; analyze_cve reads it from cveMetadata.cveId and falls
    # back to 'Unknown' when it is missing.
    cve_id: str
    # Heuristic score accumulated by analyze_cve; penalties can make it negative.
    score: int = 0
    # Human-readable explanation for each score adjustment, in the order applied.
    reasons: List[str] = field(default_factory=list)
    # Product, vendor and version taken from the first "affected" entry.
    product: str = ""
    vendor: str = ""
    version: str = ""
    # Text of the first CNA description entry.
    description: str = ""
    # First reference URL that looked like a PoC/exploit, if any.
    poc_url: Optional[str] = None
    # True once a PoC/exploit reference has been found.
    exploit_available: bool = False
    # CVSS v3.1 base score (CNA first, then ADP); 0.0 when none was found.
    cvss_score: float = 0.0
    # First CWE identifier found (CNA first, then ADP); empty when none.
    cwe_id: str = ""


class CVEReproducibilityFilter:
    """CVE Reproducibility Filter"""

    def __init__(self, cves_dir: str = "cves"):
        self.cves_dir = Path(cves_dir)
        self.results: List[CVEReproducibility] = []

    def analyze_cve(self, cve_path: Path) -> Optional[CVEReproducibility]:
        """Score how reproducible a single CVE record looks.

        The CVE JSON record at ``cve_path`` is read and a heuristic score is
        accumulated from its CNA and ADP containers: firmware/IoT and
        closed-platform penalties, PoC / patch / GitHub references, CVSS v3.1
        severity, CISA SSVC exploitation status, a concrete affected version,
        attack details in the description, and the technology stack that the
        record mentions. Each adjustment adds an explanation to ``reasons``.

        Closed-platform and technology keywords are matched as standalone
        tokens, not raw substrings. Substring matching made 'c' match almost
        every description, 'go' match "google", 'rust' match "trust" and
        'ios' match "axios" or "Nagios", which distorted the score.

        Args:
            cve_path: Path to a CVE JSON record.

        Returns:
            The populated CVEReproducibility, or None when the file cannot be
            read, is not valid JSON, or its top-level value is not an object.
        """
        try:
            with open(cve_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            print(f"Error reading {cve_path}: {e}")
            return None

        # Valid JSON that is not an object (e.g. a list) cannot be a CVE
        # record; report it like any other unreadable file instead of crashing.
        if not isinstance(data, dict):
            print(f"Error reading {cve_path}: top-level JSON value is not an object")
            return None

        def mentions(keyword: str, text: str) -> bool:
            """Return True when `keyword` occurs in lower-cased `text` as a token.

            The keyword must not be glued to a preceding letter/digit nor to
            a following letter; a trailing digit is allowed so that
            "python3" or "php7" still count.
            """
            pattern = rf"(?<![a-z0-9]){re.escape(keyword)}(?![a-z])"
            return re.search(pattern, text) is not None

        cve_id = data.get('cveMetadata', {}).get('cveId', 'Unknown')
        result = CVEReproducibility(cve_id=cve_id)

        # Get basic information
        containers = data.get('containers', {})
        cna = containers.get('cna', {})
        adp_containers = containers.get('adp', [])

        # 1. Analyze affected products (only the first entry is considered)
        affected = cna.get('affected', [])
        if affected:
            first_affected = affected[0]
            result.vendor = first_affected.get('vendor', 'Unknown')
            result.product = first_affected.get('product', 'Unknown')

            versions = first_affected.get('versions', [])
            if versions:
                result.version = versions[0].get('version', 'Unknown')

        # Firmware/IoT penalty (cannot reproduce from source)
        if is_firmware_iot(result.vendor, result.product):
            result.score -= 50
            result.reasons.append(f"Firmware/IoT device: {result.vendor} (hard to reproduce)")

        # 2. Get description
        descriptions = cna.get('descriptions', [])
        if descriptions:
            result.description = descriptions[0].get('value', '')

        # 3. Analyze references to find POC/Exploit and Commit URLs
        references = cna.get('references', [])
        poc_keywords = ['poc', 'exploit', 'proof-of-concept', 'demo']
        commit_keywords = ['commit', 'commits', '/commit/', 'patch']

        has_commit_url = False
        has_github_repo = False
        for ref in references:
            url = ref.get('url', '').lower()
            tags = ref.get('tags', [])

            # Check for POC/Exploit (only the first hit is scored)
            if any(keyword in url for keyword in poc_keywords) or 'exploit' in tags:
                if not result.poc_url:
                    result.poc_url = ref.get('url')
                    result.score += 30
                    result.reasons.append(f"Found POC/Exploit URL: {url}")
                    result.exploit_available = True

            # Check for Commit URL (patch available makes reproduction easier)
            if any(keyword in url for keyword in commit_keywords) or 'patch' in tags:
                if not has_commit_url:
                    has_commit_url = True
                    result.score += 15
                    result.reasons.append(f"Found commit/patch URL: {url}")

            # GitHub repository link (source code may be available). Scored
            # once, like the other signals, so a record that merely lists
            # many GitHub links does not outrank one with a real PoC.
            if ('github.com' in url and '/commit/' not in url
                    and not result.poc_url and not has_github_repo):
                has_github_repo = True
                result.score += 5
                result.reasons.append(f"Found GitHub repo: {url}")

        # 4. Analyze CVE Program Container and ADP containers for additional POCs
        for adp in adp_containers:
            for ref in adp.get('references', []):
                url = ref.get('url', '').lower()
                tags = ref.get('tags', [])

                if any(keyword in url for keyword in poc_keywords) or 'exploit' in tags:
                    if not result.poc_url:
                        result.poc_url = ref.get('url')
                        result.score += 25
                        result.reasons.append(f"Found POC in ADP: {url}")
                        result.exploit_available = True

                if any(keyword in url for keyword in commit_keywords) or 'patch' in tags:
                    if not has_commit_url:
                        has_commit_url = True
                        result.score += 15
                        result.reasons.append(f"Found commit/patch in ADP: {url}")

        # 5. Get CVSS score (first CVSS v3.1 metric of the CNA container)
        for metric in cna.get('metrics', []):
            if 'cvssV3_1' in metric:
                result.cvss_score = metric['cvssV3_1'].get('baseScore', 0.0)
                if result.cvss_score >= 7.0:
                    result.score += 10
                    result.reasons.append(f"High CVSS score: {result.cvss_score}")
                break

        # Also check CVSS score in ADP containers; the `== 0.0` test makes
        # this a fallback that is used only while no score has been found.
        for adp in adp_containers:
            for metric in adp.get('metrics', []):
                if 'cvssV3_1' in metric and result.cvss_score == 0.0:
                    result.cvss_score = metric['cvssV3_1'].get('baseScore', 0.0)
                    if result.cvss_score >= 7.0:
                        result.score += 10
                        result.reasons.append(f"High CVSS score (ADP): {result.cvss_score}")
                    break

        # 6. Record CWE type without extra score (to avoid bias)
        for pt in cna.get('problemTypes', []):
            for desc in pt.get('descriptions', []):
                cwe_id = desc.get('cweId', '')
                if cwe_id:
                    result.cwe_id = cwe_id
                    break
            if result.cwe_id:
                break

        # Also check CWE in ADP containers
        if not result.cwe_id:
            for adp in adp_containers:
                for pt in adp.get('problemTypes', []):
                    for desc in pt.get('descriptions', []):
                        cwe_id = desc.get('cweId', '')
                        if cwe_id:
                            result.cwe_id = cwe_id
                            break
                    if result.cwe_id:
                        break
                if result.cwe_id:
                    break

        # 7. Early filter for hard-to-reproduce major vendor products
        excluded_vendors = ['apple', 'microsoft', 'google android', 'ios']
        excluded_products = ['windows', 'macos', 'ios', 'android', 'chrome os']
        # Open-source/web products from those vendors that can still be run
        allowed_products = ['asp.net', 'core', 'gvisor', 'chrome', 'chromium', 'edge']

        vendor_lower = result.vendor.lower()
        product_lower = result.product.lower()

        # Check if it's a hard-to-reproduce system-level product
        is_hard_to_reproduce = False
        if any(mentions(name, vendor_lower) for name in excluded_vendors):
            if not any(mentions(name, product_lower) for name in allowed_products):
                is_hard_to_reproduce = True

        if any(mentions(name, product_lower) for name in excluded_products):
            is_hard_to_reproduce = True

        if is_hard_to_reproduce:
            result.score -= 30
            result.reasons.append(f"Difficult to dockerize: {result.vendor} {result.product}")

        # 8. Check for specific version number (not "n/a" or "unspecified")
        if result.version not in ['n/a', 'Unknown', 'unspecified', '']:
            result.score += 10
            result.reasons.append(f"Specific version available: {result.version}")

        # 9. Check CISA SSVC assessment
        for adp in adp_containers:
            if adp.get('providerMetadata', {}).get('shortName') == 'CISA-ADP':
                for metric in adp.get('metrics', []):
                    if 'other' in metric and metric['other'].get('type') == 'ssvc':
                        content = metric['other'].get('content', {})
                        for option in content.get('options', []):
                            if option.get('Exploitation') == 'poc':
                                result.score += 20
                                result.reasons.append("CISA confirmed POC available")
                            elif option.get('Exploitation') == 'active':
                                result.score += 25
                                result.reasons.append("CISA confirmed active exploitation")

        # 10. Check if description mentions specific attack methods.
        # Substring matching is kept here so plural forms ("parameters",
        # "requests") are still detected.
        attack_keywords = [
            'payload', 'request', 'parameter', 'endpoint', 'uri', 'url',
            'input', 'form', 'field', 'cookie', 'header', 'body'
        ]

        desc_lower = result.description.lower()
        if any(keyword in desc_lower for keyword in attack_keywords):
            result.score += 5
            result.reasons.append("Description contains attack details")

        # 11. Score based on programming language/tech stack (easier to reproduce = higher score)
        # High score: scripting languages, web frameworks, easy to dockerize
        high_score_langs = {
            'python': 20, 'django': 20, 'flask': 20, 'fastapi': 20,
            'php': 18, 'laravel': 18, 'wordpress': 18, 'drupal': 15,
            'node': 15, 'nodejs': 15, 'express': 15, 'npm': 15,
            'ruby': 15, 'rails': 15,
            'java': 10, 'spring': 12, 'tomcat': 12, 'struts': 12,
            'go': 10, 'golang': 10,
        }
        # Medium score: requires more configuration but still reproducible
        medium_score_langs = {
            'mysql': 8, 'postgresql': 8, 'mongodb': 8, 'redis': 8,
            'nginx': 8, 'apache': 8,
            'docker': 10, 'kubernetes': 5,
            'linux': 5, 'ubuntu': 5, 'debian': 5, 'centos': 5,
        }
        # Low score: system-level, compiled languages
        low_score_langs = {
            'c++': 3, 'cpp': 3,
            'c': 2,
            'rust': 5,
            '.net': 5, 'asp.net': 8,
        }

        text_to_check = f"{result.product} {result.vendor} {result.description}".lower()

        # Tiers are tried from easiest to hardest; only the first keyword of
        # the first matching tier contributes to the score.
        tiers = (
            (high_score_langs, "Easy to reproduce language/framework"),
            (medium_score_langs, "Reproducible technology"),
            (low_score_langs, "Harder to reproduce language"),
        )
        for table, label in tiers:
            hit = next(
                ((lang, points) for lang, points in table.items()
                 if mentions(lang, text_to_check)),
                None,
            )
            if hit is not None:
                lang, points = hit
                result.score += points
                result.reasons.append(f"{label}: {lang} (+{points})")
                break

        return result

    def scan_all_cves(self, year: Optional[str] = None, limit: Optional[int] = None):
        """Scan all CVE files"""
        if year:
            search_path = self.cves_dir / year
        else:
            search_path = self.cves_dir

        cve_files = list(search_path.glob('**/*.json'))

        # Exclude delta files
        cve_files = [f for f in cve_files if 'delta' not in f.name.lower()]

        if limit:
            cve_files = cve_files[:limit]

        print(f"Scanning {len(cve_files)} CVE files...")

        for i, cve_file in enumerate(cve_files, 1):
            if i % 1000 == 0:
                print(f"Processed {i}/{len(cve_files)} files...")

            result = self.analyze_cve(cve_file)
            if result and result.score > 0:
                self.results.append(result)

        # Sort by score
        self.results.sort(key=lambda x: x.score, reverse=True)

    def scan_latest_cves(self, latest_count: int = 1000):
        """Scan the latest N CVE files (sorted by datePublished)"""
        from datetime import datetime, timezone

        print(f"Finding latest {latest_count} CVE files...")

        # Only scan recent years
        cve_files = []
        for year in ['2025']:
            year_path = self.cves_dir / year
            if year_path.exists():
                year_files = list(year_path.glob('**/*.json'))
                cve_files.extend([f for f in year_files if 'delta' not in f.name.lower()])

        print(f"Reading {len(cve_files)} CVE files to get publish dates...")

        # Read publish dates
        cve_with_dates = []
        for i, cve_file in enumerate(cve_files):
            if i % 5000 == 0:
                print(f"Processed {i}/{len(cve_files)} files...")

            try:
                with open(cve_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                date_published = data.get('cveMetadata', {}).get('datePublished')
                if date_published:
                    # Parse date
                    try:
                        dt = datetime.fromisoformat(date_published.replace('Z', '+00:00'))
                        if dt.tzinfo is None:
                            dt = dt.replace(tzinfo=timezone.utc)
                        cve_with_dates.append((dt, cve_file))
                    except:
                        continue
            except:
                continue

        print(f"Found {len(cve_with_dates)} CVEs with valid publish dates")

        # Sort by publish date, newest first
        cve_with_dates.sort(key=lambda x: x[0], reverse=True)

        # Take latest N
        latest_files = [cve_file for _, cve_file in cve_with_dates[:latest_count]]

        print(f"Analyzing {len(latest_files)} latest CVE files...")

        for i, cve_file in enumerate(latest_files, 1):
            if i % 100 == 0:
                print(f"Processed {i}/{len(latest_files)} files...")

            result = self.analyze_cve(cve_file)
            if result and result.score > 0:
                self.results.append(result)

        # Sort by score
        self.results.sort(key=lambda x: x.score, reverse=True)

        print(f"Found {len(self.results)} reproducible CVEs from latest {latest_count}")

    def extract_full_cve_info(self, cve_path: Path) -> Dict:
        """Extract full CVE information for reproduction"""
        try:
            with open(cve_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            return {}

        containers = data.get('containers', {})
        cna = containers.get('cna', {})
        metadata = data.get('cveMetadata', {})
        adp_containers = containers.get('adp', [])

        # Extract all useful information
        full_info = {
            'cve_id': metadata.get('cveId', ''),
            'date_published': metadata.get('datePublished', ''),
            'date_updated': metadata.get('dateUpdated', ''),

            # Title
            'title': cna.get('title', ''),

            # Product info
            'affected': cna.get('affected', []),

            # Description
            'descriptions': cna.get('descriptions', []),

            # Problem types
            'problem_types': cna.get('problemTypes', []),

            # Impacts (CAPEC)
            'impacts': cna.get('impacts', []),

            # Metrics
            'metrics': cna.get('metrics', []),

            # References
            'references': cna.get('references', []),

            # Configurations (trigger conditions)
            'configurations': cna.get('configurations', []),

            # Workarounds (hints for trigger conditions)
            'workarounds': cna.get('workarounds', []),

            # Solutions
            'solutions': cna.get('solutions', []),

            # Exploits
            'exploits': cna.get('exploits', []),

            # ADP info (includes CISA assessment)
            'adp': adp_containers
        }

        # Extract KEV info and ADP references from ADP containers
        full_info['kev'] = None
        full_info['adp_references'] = []

        for adp in adp_containers:
            # Get KEV info
            metrics = adp.get('metrics', [])
            for metric in metrics:
                if 'other' in metric and metric['other'].get('type') == 'kev':
                    full_info['kev'] = metric['other'].get('content', {})

            # Get ADP references (supplemental PoCs)
            adp_refs = adp.get('references', [])
            if adp_refs:
                full_info['adp_references'].extend(adp_refs)

        return full_info

    def generate_reproduce_files(self, min_score: int = 30, output_dir: str = None):
        """Generate MD documents and summary JSON for reproducible CVEs"""
        import os
        from datetime import datetime

        if output_dir is None:
            output_dir = "output"

        os.makedirs(output_dir, exist_ok=True)

        filtered_results = [r for r in self.results if r.score >= min_score]

        # Summary information
        summary = {
            'generated_at': datetime.now().isoformat(),
            'filter_score': min_score,
            'total_analyzed': len(self.results),
            'total_reproducible': len(filtered_results),
            'cves': []
        }

        print(f"\nGenerating reproduction files for {len(filtered_results)} CVEs...")

        for i, result in enumerate(filtered_results, 1):
            if i % 10 == 0:
                print(f"Processing {i}/{len(filtered_results)}...")

            # Get full information
            cve_file = None
            for year in ['2025', '2024', '2023', '2022', '2021']:
                potential_path = self.cves_dir / year
                if potential_path.exists():
                    found_files = list(potential_path.glob(f"**/{result.cve_id}.json"))
                    if found_files:
                        cve_file = found_files[0]
                        break

            if not cve_file:
                print(f"Warning: Cannot find JSON file for {result.cve_id}")
                continue

            full_info = self.extract_full_cve_info(cve_file)

            # Generate MD document
            md_content = self.generate_cve_md(result, full_info)
            md_filename = os.path.join(output_dir, f"{result.cve_id}.md")
            with open(md_filename, 'w', encoding='utf-8') as f:
                f.write(md_content)

            # Add to summary
            summary['cves'].append({
                'cve_id': result.cve_id,
                'score': result.score,
                'vendor': result.vendor,
                'product': result.product,
                'version': result.version,
                'cvss_score': result.cvss_score,
                'cwe_id': result.cwe_id,
                'poc_url': result.poc_url,
                'exploit_available': result.exploit_available,
                'reasons': result.reasons,
                'date_published': full_info.get('date_published', ''),
                'file_path': f"{result.cve_id}.md"
            })

        # Create CVE list sorted by publish date
        cves_by_date = sorted(
            summary['cves'],
            key=lambda x: x.get('date_published', ''),
            reverse=True
        )
        summary['cves_by_date'] = cves_by_date

        # Save summary JSON
        summary_file = os.path.join(output_dir, 'summary.json')
        with open(summary_file, 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)

        print(f"\nGenerated {len(filtered_results)} reproduction files in {output_dir}/")
        print(f"   - Individual CVE markdown files")
        print(f"   - summary.json with metadata")

        return summary

    def generate_cve_md(self, result: CVEReproducibility, full_info: Dict) -> str:
        """Generate Markdown document for a single CVE"""
        md = f"# {result.cve_id}\n\n"

        # Title (if available)
        title = full_info.get('title', '')
        if title:
            md += f"**{title}**\n\n"

        # Basic information
        md += "## Basic Information\n\n"
        md += f"- **Score**: {result.score}\n"
        md += f"- **Vendor**: {result.vendor}\n"
        md += f"- **Product**: {result.product}\n"
        md += f"- **Version**: {result.version}\n"
        md += f"- **CVSS Score**: {result.cvss_score}\n"
        md += f"- **CWE**: {result.cwe_id}\n"
        md += f"- **Date Published**: {full_info.get('date_published', 'N/A')}\n"
        md += f"- **Exploit Available**: {result.exploit_available}\n\n"

        # KEV Status (if in CISA KEV)
        kev = full_info.get('kev')
        if kev:
            md += "## CISA KEV Status\n\n"
            md += f"- **Date Added**: {kev.get('dateAdded', 'N/A')}\n"
            md += f"- **Reference**: {kev.get('reference', 'N/A')}\n\n"

        # Description
        md += "## Description\n\n"
        for desc in full_info.get('descriptions', []):
            if desc.get('lang', '').startswith('en'):
                md += f"{desc.get('value', '')}\n\n"
                break

        # Affected products details
        md += "## Affected Products\n\n"
        for affected in full_info.get('affected', []):
            md += f"### {affected.get('vendor', 'Unknown')} - {affected.get('product', 'Unknown')}\n\n"

            # Default status
            default_status = affected.get('defaultStatus')
            if default_status:
                md += f"**Default Status**: {default_status}\n\n"

            # Package info (for installation)
            package_name = affected.get('packageName')
            collection_url = affected.get('collectionURL')
            if package_name or collection_url:
                md += "**Package Info**:\n"
                if package_name:
                    md += f"- Package Name: `{package_name}`\n"
                if collection_url:
                    md += f"- Collection URL: {collection_url}\n"
                md += "\n"

            # Version info with changes (fix versions)
            versions = affected.get('versions', [])
            if versions:
                md += "**Versions:**\n\n"
                md += "| Version Range | Status | Version Type | Fixed At |\n"
                md += "|--------------|--------|--------------|----------|\n"
                for v in versions:
                    status = v.get('status', 'affected')
                    version = v.get('version', 'unknown')
                    version_type = v.get('versionType', '')
                    less_than = v.get('lessThan', '')
                    less_than_or_equal = v.get('lessThanOrEqual', '')

                    # Build version range string
                    if less_than:
                        version_range = f"{version} - < {less_than}"
                    elif less_than_or_equal:
                        version_range = f"{version} - <= {less_than_or_equal}"
                    else:
                        version_range = version

                    # Get fixed versions from changes
                    changes = v.get('changes', [])
                    fixed_versions = [c.get('at', '') for c in changes if c.get('status') == 'unaffected']
                    fixed_at = ', '.join(fixed_versions) if fixed_versions else '-'

                    md += f"| {version_range} | {status} | {version_type} | {fixed_at} |\n"
                md += "\n"

            # Platform info
            platforms = affected.get('platforms', [])
            if platforms:
                md += f"**Platforms**: {', '.join(platforms)}\n\n"

            # Repository info
            repo = affected.get('repo')
            if repo:
                md += f"**Repository**: {repo}\n\n"

            # CPEs
            cpes = affected.get('cpes', [])
            if cpes:
                md += "**CPEs**:\n"
                for cpe in cpes[:5]:  # Limit to 5
                    md += f"- `{cpe}`\n"
                if len(cpes) > 5:
                    md += f"- ... and {len(cpes) - 5} more\n"
                md += "\n"

            # Program files (vulnerable files)
            program_files = affected.get('programFiles', [])
            if program_files:
                md += "**Vulnerable Files**:\n"
                for pf in program_files:
                    md += f"- `{pf}`\n"
                md += "\n"

            # Program routines (vulnerable functions)
            program_routines = affected.get('programRoutines', [])
            if program_routines:
                md += "**Vulnerable Functions**:\n"
                for routine in program_routines:
                    md += f"- `{routine.get('name', 'Unknown')}`\n"
                md += "\n"

            # Module info
            modules = affected.get('modules', [])
            if modules:
                md += f"**Modules**: {', '.join(modules)}\n\n"

        # Vulnerability types
        md += "## Vulnerability Types\n\n"
        for pt in full_info.get('problem_types', []):
            for desc in pt.get('descriptions', []):
                cwe = desc.get('cweId', '')
                description = desc.get('description', '')
                if cwe:
                    md += f"- {cwe}: {description}\n"
        md += "\n"

        # Attack Patterns (CAPEC)
        impacts = full_info.get('impacts', [])
        if impacts:
            md += "## Attack Patterns (CAPEC)\n\n"
            for impact in impacts:
                capec_id = impact.get('capecId', '')
                for desc in impact.get('descriptions', []):
                    capec_desc = desc.get('value', '')
                    if capec_id:
                        md += f"- {capec_id}: {capec_desc}\n"
                    elif capec_desc:
                        md += f"- {capec_desc}\n"
            md += "\n"

        # CVSS details
        md += "## CVSS Metrics\n\n"
        for metric in full_info.get('metrics', []):
            if 'cvssV3_1' in metric:
                cvss = metric['cvssV3_1']
                md += f"**CVSS v3.1**:\n"
                md += f"- Base Score: {cvss.get('baseScore', 'N/A')}\n"
                md += f"- Base Severity: {cvss.get('baseSeverity', 'N/A')}\n"
                md += f"- Vector: {cvss.get('vectorString', 'N/A')}\n\n"

                # Parse attack vector details
                vector_string = cvss.get('vectorString', '')
                if vector_string:
                    md += "**Attack Vector Details**:\n\n"
                    md += "| Metric | Value | Description |\n"
                    md += "|--------|-------|-------------|\n"

                    # Parse CVSS vector in order
                    vector_parts = vector_string.split('/')
                    vector_map = {
                        'AV:N': ('Attack Vector', 'Network', 'Can be exploited remotely over network'),
                        'AV:A': ('Attack Vector', 'Adjacent', 'Requires adjacent network access'),
                        'AV:L': ('Attack Vector', 'Local', 'Requires local access'),
                        'AV:P': ('Attack Vector', 'Physical', 'Requires physical access'),
                        'AC:L': ('Attack Complexity', 'Low', 'Low attack complexity'),
                        'AC:H': ('Attack Complexity', 'High', 'High attack complexity'),
                        'PR:N': ('Privileges Required', 'None', 'No privileges required'),
                        'PR:L': ('Privileges Required', 'Low', 'Low privileges required'),
                        'PR:H': ('Privileges Required', 'High', 'High privileges required'),
                        'UI:N': ('User Interaction', 'None', 'No user interaction required'),
                        'UI:R': ('User Interaction', 'Required', 'User interaction required'),
                        'S:U': ('Scope', 'Unchanged', 'Scope unchanged'),
                        'S:C': ('Scope', 'Changed', 'Scope changed'),
                        'C:N': ('Confidentiality', 'None', 'No confidentiality impact'),
                        'C:L': ('Confidentiality', 'Low', 'Low confidentiality impact'),
                        'C:H': ('Confidentiality', 'High', 'High confidentiality impact'),
                        'I:N': ('Integrity', 'None', 'No integrity impact'),
                        'I:L': ('Integrity', 'Low', 'Low integrity impact'),
                        'I:H': ('Integrity', 'High', 'High integrity impact'),
                        'A:N': ('Availability', 'None', 'No availability impact'),
                        'A:L': ('Availability', 'Low', 'Low availability impact'),
                        'A:H': ('Availability', 'High', 'High availability impact'),
                    }

                    for part in vector_parts:
                        if part in vector_map:
                            metric_name, value, desc = vector_map[part]
                            md += f"| {metric_name} | {value} | {desc} |\n"
                    md += "\n"
                break

        # POC and reference links
        md += "## References and POCs\n\n"

        # Categorize links
        poc_refs = []
        patch_refs = []
        other_refs = []

        commit_keywords = ['commit', 'commits', '/commit/', 'patch', 'fix', 'pull']

        for ref in full_info.get('references', []):
            url = ref.get('url', '').lower()
            tags = ref.get('tags', [])

            if any(tag in ['exploit', 'poc', 'proof-of-concept'] for tag in tags):
                poc_refs.append(ref)
            elif any(tag in ['patch', 'fix'] for tag in tags) or any(kw in url for kw in commit_keywords):
                patch_refs.append(ref)
            else:
                other_refs.append(ref)

        # Commit/Patch URLs
        if patch_refs:
            md += "### Commit/Patch URLs\n\n"
            for ref in patch_refs:
                md += f"- [{ref.get('name', ref.get('url', ''))}]({ref.get('url', '')})"
                tags = ref.get('tags', [])
                if tags:
                    md += f" (tags: {', '.join(tags)})"
                md += "\n"
            md += "\n"

        # POC/Exploits
        if poc_refs:
            md += "### POC/Exploits\n\n"
            for ref in poc_refs:
                md += f"- [{ref.get('name', ref.get('url', ''))}]({ref.get('url', '')})"
                tags = ref.get('tags', [])
                if tags:
                    md += f" (tags: {', '.join(tags)})"
                md += "\n"
            md += "\n"

        # Other References
        if other_refs:
            md += "### Other References\n\n"
            for ref in other_refs:
                md += f"- [{ref.get('name', ref.get('url', ''))}]({ref.get('url', '')})"
                tags = ref.get('tags', [])
                if tags:
                    md += f" (tags: {', '.join(tags)})"
                md += "\n"
            md += "\n"

        # ADP References (supplemental PoCs from CVE Program Container, etc.)
        adp_refs = full_info.get('adp_references', [])
        if adp_refs:
            md += "### ADP Supplemental References\n\n"
            for ref in adp_refs:
                url = ref.get('url', '')
                tags = ref.get('tags', [])
                md += f"- [{url}]({url})"
                if tags:
                    md += f" (tags: {', '.join(tags)})"
                md += "\n"
            md += "\n"

        # Configurations (trigger conditions)
        configurations = full_info.get('configurations', [])
        if configurations:
            md += "## Trigger Conditions (Configurations)\n\n"
            for config in configurations:
                config_value = config.get('value', '')
                if config_value:
                    md += f"{config_value}\n\n"

        # Workarounds (hints for trigger conditions)
        workarounds = full_info.get('workarounds', [])
        if workarounds:
            md += "## Workarounds\n\n"
            for workaround in workarounds:
                workaround_value = workaround.get('value', '')
                if workaround_value:
                    md += f"{workaround_value}\n\n"

        # CISA assessment
        md += "## CISA Assessment\n\n"
        for adp in full_info.get('adp', []):
            if adp.get('providerMetadata', {}).get('shortName') == 'CISA-ADP':
                metrics = adp.get('metrics', [])
                for metric in metrics:
                    if 'other' in metric and metric['other'].get('type') == 'ssvc':
                        content = metric['other'].get('content', {})
                        options = content.get('options', [])
                        md += "**SSVC Decision Points**:\n"
                        for option in options:
                            for key, value in option.items():
                                md += f"- {key}: {value}\n"
                        md += "\n"

        # Solutions
        solutions = full_info.get('solutions', [])
        if solutions:
            md += "## Solutions\n\n"
            for solution in solutions:
                md += f"{solution.get('value', '')}\n\n"

        return md
    


def main():
    """Command-line entry point: scan CVE records and write reports.

    Parses the command-line options, runs either the "latest N" scan or the
    full/yearly scan, writes the reproduction files for every result that
    reaches ``--min-score`` and prints a short statistic.
    """
    import argparse

    parser = argparse.ArgumentParser(description='CVE Reproducibility Filter Tool')
    parser.add_argument('--cves-dir', type=str, default='cves',
                        help='Directory containing CVE JSON files (default: cves)')
    parser.add_argument('--year', type=str, help='Specify year, e.g., 2024')
    parser.add_argument('--limit', type=int, help='Limit scan count (for testing)')
    parser.add_argument('--latest', type=int, help='Analyze latest N CVEs (e.g., 1000)')
    parser.add_argument('--min-score', type=int, default=50, help='Minimum score threshold (default: 50)')
    parser.add_argument('--top', type=int, help='Show only top N results')
    parser.add_argument('--output-dir', type=str, default='output',
                        help='Output directory (default: output)')

    args = parser.parse_args()

    filter_tool = CVEReproducibilityFilter(cves_dir=args.cves_dir)

    print("=== CVE Reproducibility Filter Tool ===\n")

    if args.latest:
        print(f"Analyzing latest {args.latest} CVEs...")
        filter_tool.scan_latest_cves(latest_count=args.latest)
    else:
        filter_tool.scan_all_cves(year=args.year, limit=args.limit)

    # Statistics are taken before --top trims the list so that they always
    # describe the complete scan.
    total_scored = len(filter_tool.results)
    filtered_count = sum(1 for r in filter_tool.results if r.score >= args.min_score)

    # --top was accepted but had no effect. The results are already sorted
    # by score (highest first), so keeping the first N limits the generated
    # reports to the N best candidates.
    if args.top is not None:
        filter_tool.results = filter_tool.results[:max(args.top, 0)]

    print("\nGenerating reproduction files...")

    # Generate reproduction files
    filter_tool.generate_reproduce_files(min_score=args.min_score, output_dir=args.output_dir)

    print("\n=== Filtering Complete ===")
    print(f"Total scanned: {total_scored}")
    print(f"Reproducible: {filtered_count}")
    # Keep the label in both cases; with no results there is no rate to compute.
    if total_scored:
        print(f"Success rate: {filtered_count / total_scored * 100:.2f}%")
    else:
        print("Success rate: N/A")


if __name__ == "__main__":
    main()