import csv
from pathlib import Path
import tiktoken

# --- Configuration ---
# Root directory that is scanned recursively for input files.
TOOLKIT_DIR = Path("chac_toolkit")

# Destination directory and CSV file for the per-file token counts.
OUTPUT_DIR = Path("case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/output")
OUTPUT_CSV = OUTPUT_DIR.joinpath("table_4_static_token_counts.csv")

# Glob patterns (matched recursively) selecting the files to analyse.
FILE_PATTERNS = [
    "*.md",
    "*.sh",
]

# --- Main Analysis Logic ---
def main():
    """
    Run a static token-count analysis over the CHAC toolkit files.

    Recursively collects every file under ``TOOLKIT_DIR`` that matches one of
    ``FILE_PATTERNS``, then skips:

    * anything located inside an ``_archive`` directory,
    * files whose name starts with ``CHAC_Human``,
    * ``CHANGELOG.md`` and ``README.md``.

    Each remaining file is read as UTF-8 and tokenized with the
    ``cl100k_base`` encoding. The results are written to ``OUTPUT_CSV`` as
    ``File_Path`` / ``Token_Count`` rows ordered by POSIX path, followed by a
    final ``--- TOTAL ---`` row holding the sum of all counted files.

    Tokenizer-initialisation, per-file and CSV-writing failures are reported
    on stdout instead of being raised; a file that cannot be processed is
    left out of both the table and the total. A failure to create
    ``OUTPUT_DIR`` is not caught and propagates to the caller.

    Returns:
        None
    """
    print("--- Step 7: Static Token Analysis ---")
    # OUTPUT_DIR is a nested path; parents=True creates the missing
    # intermediate directories instead of raising FileNotFoundError.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Initialize the tokenizer
    try:
        encoding = tiktoken.get_encoding("cl100k_base")
    except Exception as e:
        print(f"[ERROR] Could not initialize tokenizer: {e}")
        return

    all_files_data = []
    total_token_count = 0

    # Find all relevant files. A set is used so that a file matched by more
    # than one pattern is counted only once.
    files_to_process = set()
    for pattern in FILE_PATTERNS:
        files_to_process.update(TOOLKIT_DIR.rglob(pattern))

    # Sort files for consistent output
    sorted_files = sorted(files_to_process, key=lambda p: p.as_posix())

    # --- UX Improvement: Apply user-defined exclusion rules ---
    excluded_prefixes = ('CHAC_Human',)
    excluded_files = ('CHANGELOG.md', 'README.md')

    for file_path in sorted_files:
        # Exclude files in '_archive' directory
        if '_archive' in file_path.parts:
            continue
        # Exclude files with specified prefixes or names
        if file_path.name.startswith(excluded_prefixes) or file_path.name in excluded_files:
            continue

        # Best effort: one unreadable or un-encodable file must not abort the
        # whole run, so it is reported and skipped.
        # NOTE(review): tiktoken's encode() appears to reject text containing
        # special-token markers by default -- confirm whether such files
        # should be counted via disallowed_special=() instead of skipped.
        try:
            content = file_path.read_text(encoding='utf-8')
            token_count = len(encoding.encode(content))
        except Exception as e:
            print(f"[WARNING] Could not process file {file_path}: {e}")
            continue

        all_files_data.append({
            'File_Path': file_path.as_posix(),
            'Token_Count': token_count
        })
        total_token_count += token_count

    # Add a total count at the end
    all_files_data.append({
        'File_Path': '--- TOTAL ---',
        'Token_Count': total_token_count
    })

    # Write to CSV
    try:
        with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['File_Path', 'Token_Count'])
            writer.writeheader()
            writer.writerows(all_files_data)
        print(f"[✔] Static token analysis complete. Results saved to: {OUTPUT_CSV}")
    except Exception as e:
        print(f"[ERROR] Could not write to CSV file: {e}")


# Run the analysis only when this file is executed as a script, so that
# importing the module does not trigger any file-system work.
if __name__ == "__main__":
    main()
