
import json
from pathlib import Path
import subprocess

# --- Configuration ---
# Both paths are relative, so they are resolved against the current working
# directory of the process that runs this script.
# Directory that main() scans for '*_metrics.json' files.
INTERMEDIATE_DATA_DIR = Path('case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/intermediate_data')
# Directory in which main() looks up the raw '.txt' log matching each metrics file.
RAW_LOGS_DIR = Path('.ai-cli-log')

def _count_metadata_headers(raw_log_path):
    """
    Counts the METADATA LOG header lines in a raw log file using grep.

    A header line is one that starts with optional spaces/tabs, contains
    "METADATA LOG" immediately followed by any run of non-whitespace
    characters, and then ends.

    Returns the number of matching lines, or None when grep could not be
    started, exited with an error status (anything other than 0 or 1), or
    printed something that is not an integer.
    """
    # "\t" is deliberately a real tab character: bracket expressions do not
    # interpret backslash escapes, so a literal backslash-t would match the
    # characters '\' and 't' instead. "\\S" reaches grep as the two
    # characters \S (non-whitespace).
    pattern = "^[ \t]*METADATA LOG\\S*$"
    try:
        # An argument list (no shell) keeps file names with spaces or shell
        # metacharacters intact; "--" stops a path from being read as an option.
        result = subprocess.run(
            ['grep', '-E', '-c', '-e', pattern, '--', str(raw_log_path)],
            capture_output=True, text=True, errors='replace'
        )
    except OSError:
        # grep is not installed or could not be executed.
        return None

    # grep exits with 1 when no line matched, which is a valid count of zero;
    # any other non-zero status is a genuine failure.
    if result.returncode == 1:
        return 0
    if result.returncode != 0:
        return None
    try:
        return int(result.stdout.strip())
    except ValueError:
        return None


def main(intermediate_dir=None, raw_logs_dir=None):
    """
    Verifies the metadata log count of every metrics file against grep.

    For each '*_metrics.json' file in the intermediate directory, the
    'metadata_log_count' value (0 when the key is absent) is compared with
    the number of METADATA LOG header lines grep finds in the raw log of the
    same base name ('<name>.txt'). One status line is printed per file:

      OK        - both counts are equal.
      WARNING   - counts differ by at most max(1, 5% of the grep count).
      MISMATCH  - counts differ by more than that margin.
      ERROR     - grep failed, so the file could not be verified.

    Metrics files without a raw log are reported with a warning and skipped.
    A summary listing mismatched and unverifiable files is printed at the end.

    Args:
        intermediate_dir: Directory holding the '*_metrics.json' files.
            Defaults to INTERMEDIATE_DATA_DIR.
        raw_logs_dir: Directory holding the raw '.txt' logs.
            Defaults to RAW_LOGS_DIR.
    """
    if intermediate_dir is None:
        intermediate_dir = INTERMEDIATE_DATA_DIR
    if raw_logs_dir is None:
        raw_logs_dir = RAW_LOGS_DIR
    intermediate_dir = Path(intermediate_dir)
    raw_logs_dir = Path(raw_logs_dir)

    print("--- Step 1b: Verifying Extraction Accuracy (v2) ---")

    mismatches = []
    unverified = []

    json_files = sorted(intermediate_dir.glob('*_metrics.json'))
    total = len(json_files)

    for i, json_file in enumerate(json_files, start=1):
        # 1. Get count from our script's output
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        script_count = data.get('metadata_log_count', 0)

        # 2. Get ground truth count from the raw file using grep
        raw_log_path = raw_logs_dir / json_file.name.replace('_metrics.json', '.txt')
        if not raw_log_path.is_file():
            print(f"  [!] WARNING: Raw log not found for {json_file.name}")
            continue

        grep_count = _count_metadata_headers(raw_log_path)
        if grep_count is None:
            # Report the failure explicitly instead of letting a sentinel
            # value take part in the tolerance arithmetic below.
            print(f"  ({i}/{total}) [✖] ERROR: {raw_log_path.name} (grep failed; count could not be verified)")
            unverified.append(raw_log_path.name)
            continue

        # --- Comparison Logic with Three States (OK, WARNING, MISMATCH) ---
        if script_count == grep_count:
            print(f"  ({i}/{total}) [✔] OK: {raw_log_path.name} (Script: {script_count}, Grep: {grep_count})")
            continue

        difference = abs(script_count - grep_count)
        # Use max(1, ...) to ensure margin is at least 1 for small numbers
        margin = max(1, grep_count * 0.05)

        if grep_count > 0:
            percentage_diff = (difference / grep_count) * 100
        else:
            percentage_diff = float('inf')  # Avoid division by zero

        if difference <= margin:
            print(f"  ({i}/{total}) [!] WARNING: {raw_log_path.name} (Script: {script_count}, Grep: {grep_count}, Diff: {percentage_diff:.1f}%)")
        else:
            print(f"  ({i}/{total}) [✖] MISMATCH: {raw_log_path.name} (Script: {script_count}, Grep: {grep_count}, Diff: {percentage_diff:.1f}%)")
            mismatches.append(raw_log_path.name)

    print("\n--- Verification Complete ---")
    if not mismatches and not unverified:
        print("  [✔] All files passed verification. Data extraction is 100% accurate.")
    if mismatches:
        print(f"  [✖] Found {len(mismatches)} files with count mismatches:")
        for fname in mismatches:
            print(f"    - {fname}")
    if unverified:
        print(f"  [✖] Could not verify {len(unverified)} files because grep failed:")
        for fname in unverified:
            print(f"    - {fname}")

# Run the verification only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
