import json
import logging
import os
from pathlib import Path
from typing import Callable, Dict, List, Optional, Tuple

import pandas as pd

logger = logging.getLogger(__name__)

# Discover and match LLM reports with ground truth using CSV mapping
def discover_and_match_reports(
    llm_path: str = "../real_analysis_results/",
    dataset_csv_path: str = "../../dataset_splits/val.csv",
    use_all_splits: bool = True
) -> Tuple[List[Dict], List[str], List[str]]:
    """Run the three discovery stages and return the matching outcome.

    The stages are: collect LLM report files, load the dataset CSV, then
    match the two by ID.

    Args:
        llm_path: Directory handed to ``_load_llm_reports``.
        dataset_csv_path: Accepted for interface compatibility only; this
            function never reads it (the dataset location is decided inside
            ``_load_dataset_files``).
        use_all_splits: Forwarded unchanged to ``_load_dataset_files``.

    Returns:
        ``(matched_pairs, unmatched_llm, unmatched_gt)`` exactly as produced
        by ``_match_reports_csv_based``.
    """
    logger.info("Starting CSV-based data discovery...")

    # Stage 1: LLM-generated reports, keyed by the ID taken from the file name.
    reports_by_id = _load_llm_reports(llm_path)
    logger.info(f"Found {len(reports_by_id)} LLM reports")

    # Stage 2: the dataset table used to resolve IDs to ground-truth files.
    dataset = _load_dataset_files(use_all_splits)
    logger.info(f"Loaded {len(dataset)} entries from dataset CSV(s)")

    # Stage 3: pair the two sources and report the tally.
    matched, llm_only, gt_only = _match_reports_csv_based(reports_by_id, dataset)
    logger.info(f"Discovery complete: {len(matched)} matched, {len(llm_only)} unmatched LLM, {len(gt_only)} unmatched GT")

    return matched, llm_only, gt_only

# Load all LLM report files and extract study IDs
def _load_llm_reports(llm_path: str) -> Dict[str, str]:
    """Map study IDs to the LLM report files found directly in ``llm_path``.

    Only files named ``analysis_<study_id>.json`` are considered; the
    directory is not searched recursively.

    Args:
        llm_path: Directory containing the LLM analysis JSON files.

    Returns:
        ``{study_id: file path as str}``, inserted in sorted path order.
        Empty when ``llm_path`` is not an existing directory.
    """
    prefix = "analysis_"
    llm_reports: Dict[str, str] = {}
    llm_dir = Path(llm_path)

    if not llm_dir.is_dir():
        logger.warning(f"LLM directory not found: {llm_path}")
        return llm_reports

    # glob() yields entries in filesystem order; sorting keeps the resulting
    # dict (and everything derived from it) reproducible between runs.
    for report_file in sorted(llm_dir.glob(f"{prefix}*.json")):
        # Drop only the leading prefix. str.replace() would also delete any
        # later "analysis_" inside the ID and silently change it.
        study_id = report_file.stem[len(prefix):]
        llm_reports[study_id] = str(report_file)
        logger.debug(f"Found LLM report: {study_id}")

    return llm_reports

# Load the main dataset CSV file for dicom_id mapping
def _load_dataset_files(
    use_all_splits: bool = True,
    dataset_csv_path: str = "../../final_dataset_fixed.csv",
) -> pd.DataFrame:
    """Read the dataset CSV and check it has the columns the matcher needs.

    Args:
        use_all_splits: Currently unused; kept so existing callers keep
            working.
        dataset_csv_path: Location of the CSV. Defaults to the path that was
            previously hard-coded, so calls without this argument behave as
            before.

    Returns:
        The loaded frame, or an empty ``DataFrame`` when the file is missing,
        cannot be parsed, or lacks any of ``dicom_id``, ``patient_folder``,
        ``study_id``. Every failure is logged at error level.
    """
    if not os.path.exists(dataset_csv_path):
        logger.error(f"Main dataset CSV not found: {dataset_csv_path}")
        return pd.DataFrame()

    # Best-effort boundary: any read/parse failure is logged and degrades to
    # an empty frame instead of propagating to the caller.
    try:
        df = pd.read_csv(dataset_csv_path)
    except Exception as e:
        logger.error(f"Error loading {dataset_csv_path}: {e}")
        return pd.DataFrame()

    logger.info(f"Loaded {len(df)} entries from {dataset_csv_path}")

    required_columns = ['dicom_id', 'patient_folder', 'study_id']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        logger.error(f"Missing required columns in dataset: {missing_columns}")
        return pd.DataFrame()

    logger.info(f"Dataset loaded successfully with columns: {list(df.columns)}")
    return df

# Match reports using CSV-based precise mapping
def _match_reports_csv_based(
    llm_reports: Dict[str, str],
    dataset_df: pd.DataFrame
) -> Tuple[List[Dict], List[str], List[str]]:
    """Pair LLM reports with dataset rows whose ``dicom_id`` equals their ID.

    Args:
        llm_reports: ``{study_id: llm report file path}``.
        dataset_df: Dataset table. Needs a ``dicom_id`` column to match
            anything; ``report_path`` and the other columns read below are
            optional.

    Returns:
        A 3-tuple:
        * matched_pairs - one dict per LLM report whose first matching row
          has a ``report_path`` accepted by ``_validate_report_path``;
        * unmatched_llm - LLM report IDs with no row or no usable report path;
        * unmatched_gt - ``dicom_id`` of rows with a usable report path and
          no LLM report (one entry per row, so duplicate rows repeat).
    """
    matched_pairs: List[Dict] = []
    unmatched_llm: List[str] = []
    unmatched_gt: List[str] = []

    # A frame without 'dicom_id' (e.g. the empty frame produced when loading
    # failed) cannot match anything; indexing it would raise KeyError.
    if 'dicom_id' not in dataset_df.columns:
        logger.warning("Dataset has no 'dicom_id' column; no LLM report can be matched")
        return matched_pairs, list(llm_reports), unmatched_gt

    dicom_ids = dataset_df['dicom_id'].tolist()
    if 'report_path' in dataset_df.columns:
        report_paths = dataset_df['report_path'].tolist()
    else:
        report_paths = [''] * len(dicom_ids)

    def _usable(path) -> bool:
        # Blank CSV cells are float NaN: truthy, but not a path. Reject any
        # non-string before it reaches the filesystem check.
        return isinstance(path, str) and bool(path) and _validate_report_path(path)

    # First row position per dicom_id, built once so each LLM report is a
    # dict lookup instead of a scan over the whole column.
    first_position = {}
    for position, dicom_id in enumerate(dicom_ids):
        if not pd.isna(dicom_id):
            first_position.setdefault(dicom_id, position)

    for study_id, llm_file in llm_reports.items():
        position = first_position.get(study_id)
        if position is None:
            logger.debug(f"No CSV entry found for LLM report: {study_id}")
            unmatched_llm.append(study_id)
            continue

        report_path = report_paths[position]
        if not _usable(report_path):
            logger.warning(f"Invalid report path for {study_id}: {report_path}")
            unmatched_llm.append(study_id)
            continue

        row = dataset_df.iloc[position]
        matched_pairs.append({
            'study_id': study_id,
            'llm_report': llm_file,
            'ground_truth': report_path,
            'patient_folder': row.get('patient_folder', ''),
            'has_reflacx': row.get('ifReflacx', False),
            'has_eyegaze': row.get('ifEyegaze', False),
            'condition': row.get('condition', ''),
            'report_path': report_path,
            'image_path': row.get('image_path', ''),
        })
        logger.debug(f"Matched: {study_id}")

    # Ground-truth rows that have a readable report but no LLM counterpart.
    # Every matched ID is a key of llm_reports, so one membership test covers
    # both "already matched" and "has an LLM report".
    for dicom_id, report_path in zip(dicom_ids, report_paths):
        if pd.isna(dicom_id) or not dicom_id or dicom_id in llm_reports:
            continue
        if _usable(report_path):
            unmatched_gt.append(dicom_id)

    return matched_pairs, unmatched_llm, unmatched_gt

# Validate that a report path exists and is accessible
def _validate_report_path(report_path: str) -> bool:
    """Return True when ``report_path`` exists as given or two directories up.

    Args:
        report_path: Path taken from the dataset CSV. Anything that is not a
            non-empty ``str`` / ``os.PathLike`` (``None``, NaN from a blank
            CSV cell, numbers) is treated as invalid.

    Returns:
        ``True`` if the path exists either verbatim or under ``../../``,
        otherwise ``False``.
    """
    # Blank CSV cells arrive as float NaN, which is truthy. os.path.exists()
    # raises TypeError on floats and interprets ints as file descriptors, so
    # only real path values may pass this point.
    if not isinstance(report_path, (str, os.PathLike)) or not report_path:
        return False

    candidates = (report_path, os.path.join("..", "..", report_path))
    return any(os.path.exists(candidate) for candidate in candidates)

# Load ground truth radiology report using dicom_id mapping
def load_ground_truth_report(
    dicom_id: str,
    dataset_csv_path: str = "../../final_dataset_fixed.csv",
    reports_root: str = "../../cleaned_reports",
) -> Optional[Dict]:
    """Load and clean the ground-truth report belonging to ``dicom_id``.

    The dataset CSV is searched for the first row whose ``dicom_id`` equals
    the argument; that row's ``patient_folder`` and ``study_id`` locate the
    report at ``<reports_root>/<patient_folder>/CXR-DICOM/s<study_id>.txt``.

    Args:
        dicom_id: Identifier to look up in the CSV.
        dataset_csv_path: CSV location; defaults to the previously hard-coded
            path.
        reports_root: Root directory of the report tree; defaults to the
            previously hard-coded location.

    Returns:
        A dict with ``report_text`` (cleaned), ``raw_text``, ``source_path``,
        ``file_type``, ``patient_folder``, ``study_id`` and ``dicom_id``, or
        ``None`` when the CSV, the row, or the report file is missing, or
        when any error occurs (the error is logged, never raised).
    """
    try:
        if not os.path.exists(dataset_csv_path):
            logger.error(f"Main dataset CSV not found: {dataset_csv_path}")
            return None

        df = pd.read_csv(dataset_csv_path)
        matching_rows = df[df['dicom_id'] == dicom_id]

        if matching_rows.empty:
            logger.warning(f"No matching entry found for dicom_id: {dicom_id}")
            return None

        row = matching_rows.iloc[0]
        patient_folder = row['patient_folder']
        study_id = row['study_id']

        # A study_id column with any blank cell is parsed as float, which
        # would format as "s123.0.txt"; restore the integer form first.
        if isinstance(study_id, float) and study_id.is_integer():
            study_id = int(study_id)

        ground_truth_path = f"{reports_root}/{patient_folder}/CXR-DICOM/s{study_id}.txt"

        if not os.path.exists(ground_truth_path):
            logger.warning(f"Ground truth report not found: {ground_truth_path}")
            return None

        with open(ground_truth_path, 'r', encoding='utf-8') as f:
            content = f.read()

        cleaned_text = _clean_radiology_report(content)

        logger.info(f"Successfully loaded ground truth for {dicom_id} from {ground_truth_path}")

        return {
            'report_text': cleaned_text.strip(),
            'source_path': ground_truth_path,
            'file_type': 'radiology_report',
            'raw_text': content.strip(),
            'patient_folder': patient_folder,
            'study_id': study_id,
            'dicom_id': dicom_id
        }

    except Exception as e:
        # Best-effort loader: callers test for None rather than catching.
        logger.error(f"Error loading ground truth report for {dicom_id}: {e}")
        return None

# Clean radiology report text by extracting the final report section
def _clean_radiology_report(raw_text: str) -> str:
    """Trim a raw report to its final-report section and tidy the text.

    The kept section starts at the first line containing "FINAL REPORT"
    (any letter case; that line is kept). Without such a line it starts on
    the line after the first one containing ``___``. With neither anchor the
    whole text is kept. Four substitutions are then applied in order and the
    result is stripped.

    Args:
        raw_text: Full text of the report file.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    import re

    all_lines = raw_text.split('\n')

    # Preferred anchor: the header line itself stays in the output.
    start = next(
        (idx for idx, text in enumerate(all_lines) if 'FINAL REPORT' in text.upper()),
        None,
    )
    if start is None:
        # Fallback anchor: skip past the first line containing '___'.
        start = next(
            (idx + 1 for idx, text in enumerate(all_lines) if '___' in text),
            None,
        )

    kept_lines = all_lines if start is None else all_lines[start:]
    body = '\n'.join(kept_lines)

    # Applied strictly in this order; each entry is (pattern, replacement, flags).
    substitutions = (
        # "WET READ:" through the end of that line.
        (r'WET READ:.*?\n', '', re.IGNORECASE),
        # "WET READ VERSION" through the end of that line.
        (r'WET READ VERSION.*?\n', '', re.IGNORECASE),
        # Line-start runs made only of whitespace/underscores, newline included.
        (r'^[\s_]*\n', '', re.MULTILINE),
        # Any remaining run of three newlines (whitespace between) becomes two.
        (r'\n\s*\n\s*\n', '\n\n', 0),
    )
    for pattern, replacement, flags in substitutions:
        body = re.sub(pattern, replacement, body, flags=flags)

    return body.strip()

# Entry point for report discovery
def create_data_discovery() -> Callable[..., Tuple[List[Dict], List[str], List[str]]]:
    """Return the report-discovery function itself, without calling it.

    Returns:
        ``discover_and_match_reports``; invoke the returned object to run
        discovery.
    """
    # The annotation uses typing.Callable: the builtin ``callable`` is a
    # predicate function, not a type.
    return discover_and_match_reports


