import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re # For extracting submission number from PDF link
import os

# --- Configuration ---
# Public FDA page holding the table of AI/ML-enabled medical devices.
FDA_AI_ML_LIST_URL = (
    "https://www.fda.gov/medical-devices/software-medical-device-samd/"
    "artificial-intelligence-and-machine-learning-aiml-enabled-medical-devices"
)
# Per-device detail page; "{}" is replaced with the submission number.
FDA_DEVICE_PAGE_URL_TEMPLATE = (
    "https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfPMN/pmn.cfm?ID={}"
)
# Base URL for FDA PDF documents.
FDA_PDF_BASE_URL = "https://www.accessdata.fda.gov/cdrh_docs"

# Taxonomy spreadsheet (path is relative to the working directory).
VERCEL_EXCEL_FILE_PATH = "data/AI_devices_taxonomy.xlsx"
# NOTE: absolute, machine-specific path; this CSV is both read at start-up and rewritten at the end.
OUTPUT_CSV_FILE = "/Users/arun/Documents/fda-search/py_src/fda_ai_records.csv"

# Final (lowercase) names of the taxonomy columns carried into the output.
TARGET_TAXONOMY_COLUMNS_FINAL_NAMES = [
    'lead_panel',
    'data_type',
    'clinical_function',
    'ai_function',
    'ai_function_subclass',
]

# Candidate headers for the submission-number column of the spreadsheet, in
# priority order. Whichever is found first is renamed to "submission_number",
# which is the merge key.
POSSIBLE_SUBMISSION_COL_NAMES_EXCEL = [
    'Submission Number',
    'Submission No.',
    'Record Key',
    'PMA/510(k)',
    'submission_number',
]

# Spreadsheet header (key) -> final column name (value) for the taxonomy
# fields. Only the two AI_* headers actually change (they are lowercased);
# the submission-number column is handled separately via the list above.
EXCEL_COLUMN_MAP = {
    'data_type': 'data_type',
    'clinical_function': 'clinical_function',
    'AI_function': 'ai_function',
    'AI_function_subclass': 'ai_function_subclass',
    'lead_panel': 'lead_panel',
}

# --- Helper Functions ---
def get_soup(url):
    """Download *url* and parse the response body as HTML.

    Returns a BeautifulSoup tree, or None when the request fails or the
    server answers with an HTTP error status (the error is printed).
    """
    # Browser-like User-Agent sent with every request.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        reply = requests.get(url, headers=browser_headers, timeout=30)
        reply.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None
    else:
        return BeautifulSoup(reply.content, "html.parser")

# --- Phase 1: FDA Data Extraction ---
def scrape_fda_main_list(existing_submission_numbers=None):
    """Read the device table on the FDA AI/ML page into a list of dicts.

    Args:
        existing_submission_numbers: optional collection of upper-cased
            submission numbers; rows whose number is in it are skipped.

    Returns:
        One dict per remaining table row, with "summary_pdf_link" left empty.
        An empty list is returned when the page or its table is unavailable.
    """
    known_ids = set() if existing_submission_numbers is None else existing_submission_numbers

    print("Scraping FDA main AI/ML device list...")
    page = get_soup(FDA_AI_ML_LIST_URL)
    if not page:
        return []

    table = page.find("table")
    if not table:
        print("Could not find the main table on the FDA AI/ML page.")
        return []

    # Fall back to the table itself when the markup has no <tbody>.
    rows_root = table.find("tbody") or table

    devices = []
    for row in rows_root.find_all("tr"):
        cells = row.find_all("td")
        # Header rows (no <td>) and short rows carry no device record.
        if len(cells) < 5:
            continue
        try:
            texts = [cell.text.strip() for cell in cells]
            decision_date, raw_id, model, firm, panel_name = texts[:5]
            # The product code sits in the eighth cell when the row has one.
            product_code = texts[7] if len(texts) > 7 else ""

            # Keep only the leading alphanumeric run of the submission cell.
            id_match = re.match(r"([A-Za-z0-9]+)", raw_id)
            if id_match:
                submission_number = id_match.group(1).upper()
            else:
                submission_number = raw_id.upper()

            if not submission_number or submission_number in known_ids:
                continue

            devices.append({
                "date_of_final_decision": decision_date,
                "submission_number": submission_number,
                "device_model": model,
                "company": firm,
                "panel_lead": panel_name,
                "primary_product_code": product_code,
                "summary_pdf_link": "",
            })
        except IndexError:
            print(f"Skipping a row due to IndexError (unexpected row structure): {row}")
        except Exception as e:
            print(f"Error processing a row: {e} - Row: {row}")

    print(f"Found {len(devices)} initial records from FDA main list.")

    return devices

def scrape_fda_pdf_links(devices_list):
    """Look up the summary PDF link for every device in *devices_list*.

    For each device the FDA detail page for its "submission_number" is
    fetched. The first anchor whose href ends in ".pdf" and contains
    "summary" is preferred; otherwise the first ".pdf" anchor is used.

    Args:
        devices_list: list of dicts, each with a "submission_number" key.
            The dicts are updated in place.

    Returns:
        A new list holding the same dicts, each with "summary_pdf_link" set
        to an absolute URL, or to "Not Found" when the page could not be
        fetched or has no PDF anchor.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    def _is_pdf(href):
        return bool(href) and href.lower().endswith(".pdf")

    def _is_summary_pdf(href):
        return _is_pdf(href) and "summary" in href.lower()

    print("Scraping FDA PDF links...")
    total = len(devices_list)
    updated_devices = []
    for position, device in enumerate(devices_list, start=1):
        submission_id = device["submission_number"]
        print(f"Fetching PDF link for {submission_id} ({position}/{total})...")
        page_url = FDA_DEVICE_PAGE_URL_TEMPLATE.format(submission_id)
        soup = get_soup(page_url)

        link_tag = None
        if soup:
            link_tag = soup.find("a", href=_is_summary_pdf)
            if link_tag is None:
                link_tag = soup.find("a", href=_is_pdf)

        if link_tag is not None and link_tag.get("href"):
            # Resolve the href the way a browser would: absolute URLs pass
            # through unchanged, while root-relative, protocol-relative and
            # relative hrefs are resolved against the page they came from.
            device["summary_pdf_link"] = urljoin(page_url, link_tag["href"])
        else:
            device["summary_pdf_link"] = "Not Found"
        updated_devices.append(device)

        # Pause after every request, including failed ones, so that a run of
        # errors does not turn into a burst of back-to-back requests.
        time.sleep(0.2)
    print("Finished scraping PDF links.")
    return updated_devices

# --- Phase 2: Taxonomy Data from Excel ---
def load_taxonomy_from_excel(file_path):
    """Load the taxonomy spreadsheet as a lookup table keyed by submission number.

    The submission-number column is located via
    POSSIBLE_SUBMISSION_COL_NAMES_EXCEL and renamed to "submission_number".
    Each name in TARGET_TAXONOMY_COLUMNS_FINAL_NAMES is taken either from a
    spreadsheet column of the same name or from a differently named column
    listed in EXCEL_COLUMN_MAP; targets that cannot be found are added as
    all-NA columns.

    Args:
        file_path: path of the Excel file to read.

    Returns:
        A DataFrame with the columns
        ["submission_number"] + TARGET_TAXONOMY_COLUMNS_FINAL_NAMES, holding
        one row per distinct, non-blank, upper-cased submission number.
        An empty DataFrame is returned when the file is missing, unreadable,
        or has no recognisable submission-number column.
    """
    print(f"Loading taxonomy data from Excel file: {file_path}...")
    try:
        df_excel = pd.read_excel(file_path)
        print(f"Successfully loaded Excel file. Columns found: {df_excel.columns.tolist()}")

        submission_col = next(
            (name for name in POSSIBLE_SUBMISSION_COL_NAMES_EXCEL if name in df_excel.columns),
            None,
        )
        if submission_col is None:
            print(f"Error: Could not find a submission number column in {file_path} using possible names: {POSSIBLE_SUBMISSION_COL_NAMES_EXCEL}")
            return pd.DataFrame()

        print(f"Identified '{submission_col}' as the submission number column in Excel.")

        # Spreadsheet header -> final column name, for every column we keep.
        rename_map = {submission_col: "submission_number"}
        for target in TARGET_TAXONOMY_COLUMNS_FINAL_NAMES:
            if target in df_excel.columns:
                rename_map.setdefault(target, target)
                continue
            # Otherwise look for a differently named source column.
            variant = next(
                (
                    source
                    for source, final in EXCEL_COLUMN_MAP.items()
                    if final == target and source in df_excel.columns and source not in rename_map
                ),
                None,
            )
            if variant is None:
                print(f"Warning: Target taxonomy column '{target}' not found in Excel directly or via map.")
            else:
                rename_map[variant] = target

        subset = df_excel[list(rename_map)].copy().rename(columns=rename_map)

        for column in TARGET_TAXONOMY_COLUMNS_FINAL_NAMES:
            if column not in subset.columns:
                print(f"Adding missing required column '{column}' as empty after selection/rename in taxonomy_df.")
                subset[column] = pd.NA

        # Record which rows really have a key *before* astype(str), which
        # would otherwise turn blank cells into the literal key "NAN".
        has_key = subset["submission_number"].notna()
        subset["submission_number"] = subset["submission_number"].astype(str).str.upper().str.strip()
        has_key = has_key & (subset["submission_number"] != "")

        # Keep one row per key: a repeated key would multiply the matching
        # rows in a left merge on "submission_number".
        subset = subset[has_key].drop_duplicates(subset="submission_number", keep="first")

        final_columns = ["submission_number"] + TARGET_TAXONOMY_COLUMNS_FINAL_NAMES
        df_taxonomy = subset[final_columns].reset_index(drop=True)

        print(f"Loaded and processed {len(df_taxonomy)} records from Excel. Columns for merge: {df_taxonomy.columns.tolist()}")
        return df_taxonomy

    except FileNotFoundError:
        print(f"Error: Excel file not found at {file_path}")
        return pd.DataFrame()
    except Exception as e:
        # Best effort: any other read/processing problem yields an empty frame.
        print(f"Error reading or processing Excel file {file_path}: {e}")
        return pd.DataFrame()

# --- Main Execution ---
# Scraped FDA fields, in the order they appear in the output CSV.
# 'panel_lead' is the panel scraped from the FDA table; the taxonomy
# spreadsheet contributes a separate 'lead_panel' column. Both are kept.
_BASE_FDA_COLUMNS = ["date_of_final_decision", "submission_number", "device_model", "company",
                     "panel_lead", "primary_product_code", "summary_pdf_link"]
# Summary columns that this script only guarantees to exist in the CSV.
_SUMMARY_COLUMNS = ["summary_keywords", "summary", "generated_questions", "concepts", "thesis",
                    "search_boost_text", "gemini_cost_summary"]
# Spreadsheet-cased taxonomy names that are dropped from the records before re-merging.
_LEGACY_TAXONOMY_COLUMNS = ['AI_function', 'AI_function_subclass']


def _normalize_submission_numbers(series):
    """Return *series* as upper-cased, stripped strings (the merge-key form)."""
    return series.astype(str).str.upper().str.strip()


def _load_existing_records(csv_path):
    """Return the previously saved records, or None when there is nothing usable."""
    if not os.path.exists(csv_path):
        return None
    try:
        existing = pd.read_csv(csv_path, dtype={'submission_number': str})
    except pd.errors.EmptyDataError:
        print(f"{csv_path} is empty. Will scrape FDA data.")
        return None
    except Exception as e:
        print(f"Error loading {csv_path}: {e}. Will scrape FDA data.")
        return None

    if 'submission_number' in existing.columns and 'summary_pdf_link' in existing.columns:
        print(f"Loaded existing FDA data from {csv_path}. Records: {len(existing)}")
        return existing
    print(f"{csv_path} exists but seems incomplete (missing submission_number or summary_pdf_link). Will re-scrape FDA data.")
    return None


def _collect_fda_records(df_existing):
    """Scrape devices not yet in *df_existing* and return existing + new rows.

    Returns None when there are neither existing nor newly scraped records.
    """
    known_ids = set()
    if df_existing is not None:
        known_ids = set(_normalize_submission_numbers(df_existing["submission_number"]))
        print(f"Found {len(known_ids)} existing submission numbers in {OUTPUT_CSV_FILE}.")

    print("Proceeding to scrape FDA data from web...")
    new_devices = scrape_fda_main_list(known_ids)
    if not new_devices:
        print("No new data scraped from FDA main list.")
        return df_existing

    print(f"Found {len(new_devices)} new records from FDA main list.")
    df_new = pd.DataFrame(scrape_fda_pdf_links(new_devices))
    if df_new.empty:
        print("No new FDA data to add from web scraping.")
        return df_existing

    df_new["submission_number"] = _normalize_submission_numbers(df_new["submission_number"])
    if df_existing is None:
        print(f"FDA Data scraped from web: {len(df_new)} records.")
        return df_new

    df_existing = df_existing.copy()
    df_existing["submission_number"] = _normalize_submission_numbers(df_existing["submission_number"])
    combined = pd.concat([df_existing, df_new], ignore_index=True)
    print(f"Combined existing and new FDA data. Total records: {len(combined)}")
    return combined


def _merge_taxonomy(df_records, df_taxonomy):
    """Left-join the taxonomy columns onto *df_records* by submission number."""
    if df_taxonomy.empty:
        print("Taxonomy DataFrame from Excel is empty. Proceeding with FDA data only.")
        return df_records

    print("Merging FDA data with Taxonomy data from Excel...")
    df_records = df_records.copy()
    df_records["submission_number"] = _normalize_submission_numbers(df_records["submission_number"])

    # Taxonomy columns loaded from a previous CSV are replaced, not suffixed,
    # so they are removed before merging the fresh values in.
    stale_columns = [
        column
        for column in TARGET_TAXONOMY_COLUMNS_FINAL_NAMES + _LEGACY_TAXONOMY_COLUMNS
        if column in df_records.columns
    ]
    if stale_columns:
        print(f"Dropping existing/old taxonomy columns from merged data before re-merge: {stale_columns}")
        df_records = df_records.drop(columns=stale_columns)

    merged = pd.merge(df_records, df_taxonomy, on="submission_number", how="left")
    print(f"Merge complete. Total records after merge: {len(merged)}")
    return merged


def _order_columns(df_records):
    """Return *df_records* with the expected columns first and any extras last."""
    df_records = df_records.copy()

    # Make sure the summary columns exist even if nothing has filled them yet.
    for column in _SUMMARY_COLUMNS:
        if column not in df_records.columns:
            df_records[column] = pd.NA
    # An existing CSV may lack this scraped column; add it empty in that case.
    if "primary_product_code" not in df_records.columns:
        print("Warning: Column 'primary_product_code' expected but not found in merged data. Adding as empty (NA) column.")
        df_records["primary_product_code"] = pd.NA

    # dict.fromkeys de-duplicates while preserving order.
    preferred = list(dict.fromkeys(
        _BASE_FDA_COLUMNS + TARGET_TAXONOMY_COLUMNS_FINAL_NAMES + _SUMMARY_COLUMNS
    ))
    ordered = [column for column in preferred if column in df_records.columns]

    # Keep every remaining column at the end so no data is lost.
    extras = [column for column in df_records.columns if column not in ordered]
    for column in extras:
        print(f"Info: Adding column '{column}' to final_df_cols as it was in df_merged but not in explicit order.")

    return df_records[ordered + extras]


def main():
    """Update OUTPUT_CSV_FILE with newly listed FDA devices and taxonomy data."""
    print("Starting FDA AI/ML Data Processor...")

    df_existing = _load_existing_records(OUTPUT_CSV_FILE)
    df_merged = _collect_fda_records(df_existing)
    if df_merged is None:
        print("No data (existing or new) to process. Exiting.")
        return

    df_taxonomy = load_taxonomy_from_excel(VERCEL_EXCEL_FILE_PATH)
    df_merged = _merge_taxonomy(df_merged, df_taxonomy)
    df_final = _order_columns(df_merged)

    try:
        df_final.to_csv(OUTPUT_CSV_FILE, index=False, encoding='utf-8')
    except Exception as e:
        # Best effort: report the failure and still finish normally.
        print(f"Error saving data to CSV: {e}")
    else:
        print(f"Successfully saved combined data to {OUTPUT_CSV_FILE}")
        print(f"Total records in final CSV: {len(df_final)}")
        print("Final DataFrame head:\n", df_final.head(3))

    print("Processing finished.")


if __name__ == "__main__":
    main()