import csv
from pathlib import Path
import tiktoken
import subprocess
import tempfile
import shutil
import uuid
import os
import re
import pandas as pd

# --- Configuration ---
# Directory (relative to the current working directory) that receives the report.
OUTPUT_DIR = Path('case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/output')
# CSV file the per-commit growth table is written to.
OUTPUT_CSV = OUTPUT_DIR / 'table_5_growth_analysis.csv'
# Sub-directory of the repository root whose token count is tracked separately.
TOOLKIT_DIR_NAME = 'chac_toolkit'
# Glob patterns (searched recursively) selecting the files that are tokenized.
FILE_PATTERNS = ['*.md', '*.sh']

# --- Helper Functions ---
def run_command(command, cwd):
    """Run a command in ``cwd`` and return its stripped standard output.

    Args:
        command: Either a shell command line (``str``), which is executed
            through the shell, or a sequence of arguments, which is executed
            directly without a shell (no quoting needed).
        cwd: Working directory for the child process.

    Returns:
        The command's stdout with leading/trailing whitespace removed, or an
        empty string when git reports that the current branch does not have
        any commits yet.

    Raises:
        RuntimeError: If the command exits with a non-zero status for any
            other reason. The message contains the command and its stderr.
    """
    # Only string commands need the shell; argument lists are run directly.
    use_shell = isinstance(command, str)
    result = subprocess.run(command, shell=use_shell, capture_output=True, text=True, cwd=cwd)
    if result.returncode != 0:
        # Allow git to fail on a brand new repo with no commits. The branch
        # name is deliberately not part of the match so that repositories
        # whose default branch is not called 'main' are tolerated as well.
        if "does not have any commits yet" in result.stderr:
            return ""
        raise RuntimeError(f"Command failed: {command}\nStderr: {result.stderr}")
    return result.stdout.strip()

def calculate_token_count(directory, encoding):
    """Calculates the total token count for specified file types in a directory.

    Args:
        directory: Root directory that is searched recursively for files
            matching any pattern in ``FILE_PATTERNS``.
        encoding: Tokenizer object exposing ``encode(text)`` that returns a
            sized sequence of tokens.

    Returns:
        The sum of ``len(encoding.encode(text))`` over every matching regular
        file that is not located under an ``_archive`` path component and
        that could be read as UTF-8. Returns 0 when nothing matches.
    """
    # A set guarantees a file matched by more than one pattern is counted once.
    candidates = {
        path
        for pattern in FILE_PATTERNS
        for path in Path(directory).rglob(pattern)
    }

    total_tokens = 0
    for file_path in candidates:
        # Exclude any files within an '_archive' directory
        if '_archive' in file_path.parts:
            continue
        # Globs can also match directories (e.g. a folder named 'notes.md').
        if not file_path.is_file():
            continue
        try:
            content = file_path.read_text(encoding='utf-8')
            total_tokens += len(encoding.encode(content))
        except (OSError, ValueError):
            # Best effort: skip files that cannot be read, are not valid UTF-8
            # (UnicodeDecodeError is a ValueError), or whose text the encoder
            # rejects with a ValueError. Anything else is a real bug and is
            # allowed to propagate instead of being silently hidden.
            continue
    return total_tokens

# --- Main Analysis Logic ---
def main():
    """
    Performs a historical analysis of the repository's growth in token count.

    The repository in the current working directory is cloned into a
    throw-away directory. Every commit reachable from HEAD is checked out in
    that clone (oldest first) and the token totals of the whole project and of
    the ``TOOLKIT_DIR_NAME`` sub-directory are measured. Each commit is tagged
    with the highest ``M<number>`` case mentioned in its message; commits
    without one inherit the value of the closest earlier commit, and commits
    preceding the first such mention are dropped. The table is written to
    ``OUTPUT_CSV``.

    The original repository is only read (as the clone source); all checkouts
    happen inside the temporary clone, which is removed on exit.

    Raises:
        RuntimeError: Propagated from ``run_command`` when a git command fails.
    """
    print("--- Step 8: Repository Growth Analysis ---")
    # The output path is nested, so missing parent directories must be created too.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    repo_path = Path.cwd()
    temp_dir = Path(tempfile.gettempdir()) / f"chac_workbench_clone_{uuid.uuid4().hex}"

    try:
        # 1. Create a temporary, safe clone of the repository
        print(f"Cloning repository to temporary directory: {temp_dir}")
        # Paths are quoted so that directories containing spaces survive the shell.
        run_command(
            f"git clone {_quote_shell_arg(repo_path)} {_quote_shell_arg(temp_dir)}",
            repo_path,
        )

        # 2. Get the list of all commit hashes
        print("Getting commit history...")
        commit_hashes = run_command("git rev-list --reverse HEAD", temp_dir).splitlines()
        if not commit_hashes:
            print("[WARNING] No commits found. Skipping analysis.")
            return

        # 3. Initialize tokenizer
        encoding = tiktoken.get_encoding("cl100k_base")

        # 4. Iterate through each commit and analyze
        case_pattern = re.compile(r'\bM(\d+)\b')  # compiled once, reused per commit
        results = []
        total_commits = len(commit_hashes)
        for i, commit_hash in enumerate(commit_hashes):
            print(f"Analyzing commit {i+1}/{total_commits}: {commit_hash[:7]}...")

            # Checkout the commit in the temporary repo
            run_command(f"git checkout {commit_hash} --quiet", temp_dir)

            # Calculate token counts
            project_tokens = calculate_token_count(temp_dir, encoding)
            toolkit_tokens = calculate_token_count(temp_dir / TOOLKIT_DIR_NAME, encoding)

            # Get timestamp and full commit message, separated by '__'
            commit_info = run_command(f"git show -s --format=%ci__%B {commit_hash}", temp_dir)
            timestamp, _, commit_message = commit_info.partition('__')

            # Find the max M-series case number mentioned in the commit message
            case_numbers = [int(n) for n in case_pattern.findall(commit_message)]
            max_case_num = max(case_numbers) if case_numbers else None

            results.append({
                'commit_hash': commit_hash,
                'timestamp': timestamp,
                'max_case_num': max_case_num,
                'total_project_tokens': project_tokens,
                'total_toolkit_tokens': toolkit_tokens
            })

        # --- Post-processing: Fill forward the max_case_num ---
        if results:
            df = pd.DataFrame(results)
            # Series.ffill() replaces the deprecated fillna(method='ffill').
            df['max_case_num'] = df['max_case_num'].ffill()
            df = df.dropna(subset=['max_case_num']) # Drop early commits before the first M-case
            df['max_case_num'] = df['max_case_num'].astype(int)
            results = df.to_dict('records')

        # 5. Write results to CSV
        if results:
            with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=results[0].keys())
                writer.writeheader()
                writer.writerows(results)
            print(f"[✔] Growth analysis complete. Results saved to: {OUTPUT_CSV}")
        else:
            print("[INFO] No results to write.")

    finally:
        # 6. ALWAYS clean up the temporary repository
        print(f"Cleaning up temporary directory: {temp_dir}")
        if temp_dir.exists():
            shutil.rmtree(temp_dir, ignore_errors=True)
        # NOTE: no checkout is performed in the original repository. Every
        # checkout above runs inside the temporary clone, so the user's
        # working tree and current branch are never changed and there is
        # nothing to restore. Forcing 'git checkout main' here would switch
        # the user's branch, or raise and mask the real error when no 'main'
        # branch exists.


def _quote_shell_arg(value):
    """Return ``value`` as text quoted for safe use as one shell argument."""
    text = str(value)
    if os.name == "nt":
        # cmd.exe does not understand POSIX single quotes; use double quotes.
        return f'"{text}"'
    import shlex
    return shlex.quote(text)


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
