import csv
import re
from pathlib import Path

# --- Configuration ---
# All paths are relative to the project root, which is expected to be the
# current working directory when this script is run.
# Input map read by main(); each data row is unpacked there as
# (log file path, commit hash, ';'-separated case IDs, status).
INPUT_CSV = Path('case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/output/log_to_case_map_v2.csv')
# Directory in which main() looks up each log file by its base name.
LOG_SOURCE_DIR = Path('.ai-cli-log')
# Long-format result written by main(); the file is opened in 'w' mode, so any
# previous content is replaced on every run.
OUTPUT_CSV = Path('case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/output/final_log_analysis_data.csv')
# Minimum number of whole-word occurrences of a case's short ID (e.g. "M72")
# in a log's text for that (log, case) pair to be written to OUTPUT_CSV.
MENTION_THRESHOLD = 2 # A case must be mentioned at least this many times to be included

def get_simple_case_id(full_case_id):
    """Return the short case ID that prefixes a full case-study directory name.

    The short ID is an 'A' or 'M' followed by one or more digits, anchored at
    the very start of the string (e.g. 'M72_Some_Title' -> 'M72').

    Args:
        full_case_id: Full case-study name to inspect.

    Returns:
        The short ID as a string, or None when the name does not begin with
        such a prefix.
    """
    prefix = re.match(r'([AM]\d+)', full_case_id)
    if prefix is None:
        return None
    return prefix.group(1)

def main():
    """
    Reads the ambiguous map, analyzes log content, and creates a final,
    one-to-many (long format) data map.

    Each data row of INPUT_CSV is expected to hold four columns: a log file
    path, a commit hash, a ';'-separated list of candidate case-study IDs and
    a status value (the status is not used here). For every row, the log file
    is looked up by its base name inside LOG_SOURCE_DIR and its text is
    scanned for whole-word occurrences of each candidate's short ID (see
    get_simple_case_id). Every (log, case) pair mentioned at least
    MENTION_THRESHOLD times is written as one row to OUTPUT_CSV with the
    columns log_file, case_study_id, commit_hash, mention_count.

    Problems with individual rows or log files (blank/malformed rows, missing
    or unreadable logs) are reported on stdout and skipped; they never abort
    the run. If INPUT_CSV itself is missing, an error is printed and nothing
    is written.
    """
    print("--- Starting Final One-to-Many Mapping (v2 - Corrected Paths) ---")

    if not INPUT_CSV.is_file():
        print(f"Error: Input map not found at {INPUT_CSV}")
        return

    # Open both files exactly once: a single writer is used for the header and
    # all data rows instead of re-opening the output in append mode per match.
    with open(INPUT_CSV, 'r', newline='', encoding='utf-8') as f_in, \
            open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f_out:
        reader = csv.reader(f_in)
        writer = csv.writer(f_out)
        writer.writerow(['log_file', 'case_study_id', 'commit_hash', 'mention_count'])

        # Skip the header; the default keeps an empty input file from raising
        # StopIteration.
        next(reader, None)

        for row in reader:
            if not row:
                # csv.reader yields an empty list for blank lines.
                continue
            if len(row) != 4:
                print(f"Warning: Skipping malformed row at line {reader.line_num}: {row}")
                continue

            log_file_rel_path, commit_hash, case_ids_str, _status = row

            # Use the name from the CSV to build the correct path: only the
            # base name is kept, the directory part recorded in the map is
            # replaced by LOG_SOURCE_DIR.
            log_file_name = Path(log_file_rel_path).name
            log_file_path = LOG_SOURCE_DIR / log_file_name

            if not log_file_path.is_file():
                print(f"Warning: Log file not found: {log_file_path}")
                continue

            try:
                log_content = log_file_path.read_text(encoding='utf-8')
            except (OSError, UnicodeDecodeError) as e:
                # Unreadable or non-UTF-8 logs are skipped, not fatal.
                print(f"Warning: Could not read {log_file_path}: {e}")
                continue

            for raw_id in case_ids_str.split(';'):
                # Tolerate "id1; id2" style lists: surrounding whitespace would
                # otherwise make the anchored prefix match fail silently.
                full_id = raw_id.strip()
                if not full_id:
                    continue

                simple_id = get_simple_case_id(full_id)
                if not simple_id:
                    continue

                # Count whole-word mentions of the simple ID (e.g., "M72"), so
                # that "M72" does not also match inside "M720".
                mention_pattern = r'\b' + re.escape(simple_id) + r'\b'
                mention_count = len(re.findall(mention_pattern, log_content))

                if mention_count >= MENTION_THRESHOLD:
                    # Write a new row for this valid mapping
                    writer.writerow([log_file_name, full_id, commit_hash, mention_count])
                    print(f"Mapped: {log_file_name} -> {full_id} (Mentions: {mention_count})")

    print("\n--- Final Mapping Complete ---")
    print(f"Final analysis-ready data saved to: {OUTPUT_CSV}")

if __name__ == "__main__":
    main()