import requests
from datetime import datetime
import os
from typing import List, Dict
import time
import json
from pathlib import Path
from dotenv import load_dotenv

class GitHubAPIClient:
    """Small GitHub REST API client for listing the commits of a repository.

    Every request carries the personal access token read from the
    ``GITHUB_TOKEN`` environment variable (optionally loaded from a .env file).
    """

    # Seconds to wait on a single HTTP request; without a timeout
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30
    # Page size requested from the commits endpoint.
    PER_PAGE = 100
    # Fallback wait (seconds) when a rate-limit response carries no usable
    # hint about when to retry.
    DEFAULT_RATE_LIMIT_WAIT = 60

    def __init__(self):
        """Initialize GitHub API client using token from .env file.

        Raises:
            ValueError: If ``GITHUB_TOKEN`` is not set after loading .env.
        """
        load_dotenv()  # Load environment variables from .env file
        self.token = os.getenv('GITHUB_TOKEN')
        if not self.token:
            raise ValueError("GITHUB_TOKEN not found in .env file")

        self.headers = {
            'Accept': 'application/vnd.github.v3+json',
            'Authorization': f'token {self.token}'
        }

    @staticmethod
    def _repo_path_from_url(repo_url: str) -> str:
        """Return ``owner/repo`` for a repository URL, or '' if it has none.

        Accepts ``https://github.com/owner/repo`` (with or without a trailing
        slash, a ``.git`` suffix, or extra path segments such as
        ``/issues/1``) as well as a bare ``owner/repo`` string.
        """
        if not repo_url or not isinstance(repo_url, str):
            return ''
        tail = repo_url.split('github.com/')[-1]
        # A leftover ':' means a scheme (or ssh prefix) survived the split,
        # i.e. this is not a github.com/<owner>/<repo> style URL.
        if ':' in tail:
            return ''
        segments = [segment for segment in tail.split('/') if segment]
        if len(segments) < 2:
            return ''
        owner, repo = segments[0], segments[1]
        if repo.endswith('.git'):
            repo = repo[:-len('.git')]
        return f"{owner}/{repo}" if repo else ''

    @staticmethod
    def _is_rate_limited(response) -> bool:
        """Tell whether *response* is a GitHub rate-limit rejection (403 or 429)."""
        return (
            response.status_code in (403, 429)
            and 'rate limit' in response.text.lower()
        )

    @classmethod
    def _rate_limit_wait(cls, response) -> float:
        """Return how many seconds to wait before retrying a rate-limited call.

        Prefers ``Retry-After``, then ``X-RateLimit-Reset`` (epoch seconds),
        and finally a fixed default so a response without either header does
        not turn into a tight retry loop.
        """
        retry_after = response.headers.get('Retry-After')
        if retry_after is not None:
            try:
                return max(float(retry_after), 0)
            except ValueError:
                pass  # Not a number of seconds; fall through to the reset header.

        reset_time = response.headers.get('X-RateLimit-Reset')
        if reset_time is not None:
            try:
                return max(int(reset_time) - time.time(), 0)
            except ValueError:
                pass  # Malformed header; use the default wait below.

        return cls.DEFAULT_RATE_LIMIT_WAIT

    @staticmethod
    def _commit_summary(commit: Dict) -> Dict:
        """Reduce one API commit object to the sha/date/message/author fields."""
        details = commit['commit']
        # The git author object may be null in the API response; report None
        # for its fields instead of failing the whole repository.
        author = details.get('author') or {}
        return {
            'sha': commit['sha'],
            'date': author.get('date'),
            'message': details['message'],
            'author': author.get('name')
        }

    def get_commits_before_date(self, repo_url: str, before_date: str) -> List[Dict]:
        """Get all commits before a specific date.

        Args:
            repo_url: Repository URL such as ``https://github.com/owner/repo``.
            before_date: Timestamp passed as the ``until`` query parameter.

        Returns:
            A list of dicts with ``sha``, ``date``, ``message`` and ``author``
            keys, one per commit, in the order the API returns them. An empty
            list is returned when the URL cannot be parsed, the repository
            is not found (404), or any other error occurs.
        """
        repo_path = self._repo_path_from_url(repo_url)
        if not repo_path:
            print(f"Could not determine owner/repo from URL: {repo_url}")
            return []

        api_url = f"https://api.github.com/repos/{repo_path}/commits"
        params = {
            'until': before_date,
            'per_page': self.PER_PAGE
        }

        try:
            all_commits = []
            page = 1

            while True:
                params['page'] = page
                response = requests.get(
                    api_url,
                    headers=self.headers,
                    params=params,
                    timeout=self.REQUEST_TIMEOUT,
                )

                if self._is_rate_limited(response):
                    wait_time = self._rate_limit_wait(response)
                    print(f"Rate limit exceeded. Waiting {wait_time:.0f} seconds...")
                    time.sleep(wait_time + 1)
                    continue  # Retry the same page.

                if response.status_code == 404:
                    print(f"Repository not found: {repo_url}")
                    return []

                response.raise_for_status()
                commits = response.json()

                if not commits:
                    break

                all_commits.extend(self._commit_summary(commit) for commit in commits)

                # A short page is the last page.
                if len(commits) < self.PER_PAGE:
                    break

                page += 1
                time.sleep(1)  # Be nice to GitHub API

            return all_commits

        except Exception as e:
            # Best-effort boundary: callers expect an empty list on failure.
            print(f"Error fetching commits for {repo_url}: {str(e)}")
            return []

def _repo_url_from_issue_url(issue_url: str) -> str:
    """Reduce an issue URL to the ``https://github.com/owner/repo`` repository URL."""
    parts = issue_url.split('/')
    # API form: https://api.github.com/repos/owner/repo/issues/123
    if len(parts) >= 6 and parts[2] == 'api.github.com' and parts[3] == 'repos':
        return f"https://github.com/{parts[4]}/{parts[5]}"
    # Web form: https://github.com/owner/repo/issues/123
    return '/'.join(parts[:5])


def process_repository_file(file_path: Path, output_dir: Path, client: "GitHubAPIClient") -> None:
    """Process a single repository JSON file.

    Reads the issue(s) stored in *file_path*, finds the most recent
    ``created_at`` timestamp, asks *client* for all commits up to that moment
    and writes the result to ``commits_<file stem>.json`` in *output_dir*.

    Args:
        file_path: JSON file holding one issue object or a list of them.
        output_dir: Existing directory that receives the output file.
        client: Object providing ``get_commits_before_date(repo_url, date)``.

    Nothing is written when the file has no issues, no issue has a creation
    date, or no issue carries a URL. Any error is reported on stdout and
    swallowed so that one bad file does not stop a batch run.
    """
    try:
        # Generate the expected output file path
        output_file = output_dir / f"commits_{file_path.stem}.json"

        with open(file_path, 'r', encoding='utf-8') as f:
            issues = json.load(f)

        # Ensure issues is a list
        if not isinstance(issues, list):
            issues = [issues]

        if not issues:
            print(f"No issues found in {file_path}")
            return

        # Only issues that actually carry a timestamp take part in the
        # comparison; a missing or null value must not become the cut-off.
        issue_dates = [
            datetime.strptime(issue['created_at'], "%Y-%m-%dT%H:%M:%SZ")
            for issue in issues
            if issue.get('created_at')
        ]
        if not issue_dates:
            print(f"No issue creation dates found in {file_path}")
            return
        latest_date = max(issue_dates)

        # Derive the repository URL from the first issue that has a URL.
        issue_url = next((issue['url'] for issue in issues if issue.get('url')), None)
        repo_url = _repo_url_from_issue_url(issue_url) if issue_url else None

        if not repo_url:
            print(f"Could not determine repository URL from {file_path}")
            return

        # Get commits before the latest issue date
        commits = client.get_commits_before_date(
            repo_url,
            latest_date.strftime("%Y-%m-%dT%H:%M:%SZ")
        )

        # Save results
        result = {
            'repository': repo_url,
            'latest_issue_date': latest_date.isoformat(),
            'total_commits': len(commits),
            'commits': commits
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2)

        print(f"Processed {repo_url}: {len(commits)} commits found")

    except Exception as e:
        # Best-effort boundary: report the failure and let the batch continue.
        print(f"Error processing file {file_path}: {str(e)}")

def main():
    """Process all JSON files in the input directory.

    Each ``*.json`` file in the input directory is handed to
    ``process_repository_file``; results land in ``github_commits``.
    Returns early, before creating the output directory or the API client,
    when the input directory does not exist.
    """
    # Setup directories
    input_dir = Path("CHANGE_IT_TO_YOUR_PATH")  # The path you generated data from scon_filter.py
    output_dir = Path("github_commits")

    # Fail loudly on an unconfigured or mistyped path instead of reporting
    # "Found 0 total files" as if the run had succeeded.
    if not input_dir.is_dir():
        print(f"Input directory not found: {input_dir}")
        return

    output_dir.mkdir(exist_ok=True)

    # Initialize GitHub API client
    client = GitHubAPIClient()

    # Sorted so the processing order and progress log are reproducible.
    all_files = sorted(input_dir.glob("*.json"))
    total_files = len(all_files)

    # Count already processed files. This is informational only: existing
    # outputs are not skipped, every file is fetched again and overwritten.
    already_processed = sum(
        1 for file in all_files
        if (output_dir / f"commits_{file.stem}.json").exists()
    )

    print(f"Found {total_files} total files, {already_processed} already processed")

    # Process all JSON files
    for i, file_path in enumerate(all_files, start=1):
        print(f"\n[{i}/{total_files}] Processing file: {file_path}")
        process_repository_file(file_path, output_dir, client)

    print(f"\nProcessing complete. Results saved in {output_dir}")

# Run the batch job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()