
import json
from pathlib import Path
import subprocess

# --- Configuration ---
# Directory searched for the `*_metrics.json` files to verify. The path is
# relative, so it is resolved against the current working directory.
INTERMEDIATE_DATA_DIR: Path = Path('case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/intermediate_data')
# Directory expected to hold the raw `<name>.txt` log matching each
# `<name>_metrics.json` file. Also relative to the current working directory.
RAW_LOGS_DIR: Path = Path('.ai-cli-log')

def _count_metadata_blocks(raw_log_path):
    """Return the number of metadata blocks in *raw_log_path*, or -1 on error.

    Each block is assumed to contribute two lines containing ``---``, so the
    block count is the number of such lines integer-divided by two.
    """
    try:
        # Counting in-process replaces the former `grep -c "---" <path>` shell
        # call, which was fragile in three ways: a pattern starting with '-'
        # is parsed by grep as an option, grep exits with status 1 when the
        # count is 0, and the unquoted path was interpolated into a shell
        # string. Reading bytes keeps grep's "one line per '\n'" semantics and
        # cannot fail on undecodable characters.
        with open(raw_log_path, 'rb') as f:
            delimiter_lines = sum(1 for line in f if b'---' in line)
    except OSError:
        return -1  # Indicate an error
    # Each block has two '---', so we divide by 2
    return delimiter_lines // 2


def main():
    """
    Verifies the metadata log count in each intermediate file against the raw log.

    For every ``*_metrics.json`` file in ``INTERMEDIATE_DATA_DIR`` the stored
    ``metadata_log_count`` (0 when the key is absent) is compared with a count
    derived independently from the matching ``.txt`` file in ``RAW_LOGS_DIR``.
    A per-file result and a final summary are printed; nothing is returned.

    Files whose raw log is missing are reported and skipped. The final
    "all files passed" message is only printed when every metrics file was
    actually verified.
    """
    print("--- Step 1a: Verifying Extraction Accuracy ---")

    mismatches = []
    skipped = []

    json_files = sorted(INTERMEDIATE_DATA_DIR.glob('*_metrics.json'))
    total = len(json_files)

    for position, json_file in enumerate(json_files, start=1):
        # 1. Get count from our script's output
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        script_count = data.get('metadata_log_count', 0)

        # 2. Get ground truth count from the raw file
        raw_log_path = RAW_LOGS_DIR / json_file.name.replace('_metrics.json', '.txt')
        if not raw_log_path.is_file():
            print(f"  [!] WARNING: Raw log not found for {json_file.name}")
            skipped.append(json_file.name)
            continue

        # The variable name and the "Grep:" label below are kept so the
        # report format stays the same as before.
        grep_count = _count_metadata_blocks(raw_log_path)

        # 3. Compare and report
        if script_count == grep_count:
            print(f"  ({position}/{total}) [✔] OK: {raw_log_path.name} (Script: {script_count}, Grep: {grep_count})")
        else:
            print(f"  ({position}/{total}) [✖] MISMATCH: {raw_log_path.name} (Script: {script_count}, Grep: {grep_count})")
            mismatches.append(raw_log_path.name)

    print("\n--- Verification Complete ---")
    if mismatches:
        print(f"  [✖] Found {len(mismatches)} files with count mismatches:")
        for fname in mismatches:
            print(f"    - {fname}")
    elif not json_files:
        # An empty run must not be reported as a successful verification.
        print(f"  [!] No '*_metrics.json' files found in {INTERMEDIATE_DATA_DIR}; nothing was verified.")
    elif skipped:
        # Skipped files were never compared, so full accuracy cannot be claimed.
        print(f"  [!] No mismatches found, but {len(skipped)} files could not be verified (raw log missing).")
    else:
        print("  [✔] All files passed verification. Data extraction is 100% accurate.")

# Run the verification only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
