import pandas as pd
import google.generativeai as genai
import os
import requests # For downloading PDFs from URLs
import io # For handling byte streams (downloaded PDF content)
import PyPDF2 # For reading PDF content
from pydantic import BaseModel, Field, ValidationError as PydanticValidationError
from typing import List, Optional
import argparse # For command-line arguments
import json # For manually parsing JSON if needed
import time # For retry delay
import dirtyjson # For parsing potentially malformed JSON

# Define the Pydantic model for structured output
class DocumentAnalysis(BaseModel):
    # Structured result requested from Gemini: this class is passed as
    # `response_schema` in the generation config and is also used to validate
    # the JSON text that comes back.
    #
    # The item counts mentioned in the descriptions (10 keywords, 5 questions,
    # 5 concepts) are guidance for the model only; nothing in this model
    # enforces list lengths, so callers check the counts themselves.
    #
    # NOTE(review): documented with comments instead of a class docstring
    # because a docstring may be copied into the generated JSON schema's
    # description and thereby change what is sent to the API -- confirm before
    # converting these comments to a docstring.

    # Two-paragraph prose summary of the document.
    summary: str = Field(description="A thorough two-paragraph summary of the document's content.")
    # Salient keywords extracted from the content.
    keywords: List[str] = Field(description="A list of 10 salient keywords from the content.")
    # Removed min_items and max_items from generated_questions to simplify schema for Gemini API
    # Questions a clinician or scientist reader might ask about the document.
    generated_questions: List[str] = Field(description="A list of 5 questions a clinician or scientist might ask about this document.")
    # Higher-level concepts derived from the keywords and questions.
    concepts: List[str] = Field(description="Based on the keywords and generated questions, a list of 5 concepts pertaining to the document.")
    # Short thesis statement about the device described by the document.
    thesis: str = Field(description="A 2 sentence thesis, a clear statement of purpose, methology, science of the device.")
def extract_text_from_pdf(pdf_url, chunk_size_pages=200):
    """
    Download a PDF from a URL and return its text split into page-range chunks.

    Args:
        pdf_url: URL of the PDF to download (fetched with a 30 second timeout
            and a browser-like User-Agent header).
        chunk_size_pages: Maximum number of consecutive pages whose text is
            combined into one chunk. A PDF with no more pages than this
            yields a single chunk.

    Returns:
        A non-empty list of text chunks, where each page's text is followed by
        a newline. Returns None when the download fails, the PDF cannot be
        decrypted with an empty password, the PDF cannot be read, it has no
        pages, or no text could be extracted. Errors are reported on stdout
        rather than raised.
    """
    print(f"Attempting to download and extract text from PDF URL: {pdf_url} (chunk size: {chunk_size_pages} pages)")
    try:
        headers = { # Some websites might block requests without a common User-Agent
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(pdf_url, headers=headers, timeout=30)
        response.raise_for_status()  # Raises an HTTPError for bad responses (4XX or 5XX)

        # The declared content type is only advisory; servers often mislabel
        # PDFs, so a mismatch is reported but parsing is still attempted.
        content_type = response.headers.get('content-type', '').lower()
        if 'application/pdf' not in content_type:
            print(f"Warning: Content-Type for {pdf_url} is '{content_type}', not 'application/pdf'. Attempting to parse anyway.")

        reader = PyPDF2.PdfReader(io.BytesIO(response.content))

        if reader.is_encrypted:
            # Many "encrypted" PDFs only restrict permissions and open with an
            # empty user password, so that is the only password tried.
            try:
                decrypt_result = reader.decrypt('')
            except Exception as decrypt_error:
                print(f"Could not decrypt PDF {pdf_url}: {decrypt_error}.")
                return None
            # decrypt() signals a rejected password through a falsy return
            # value (0 / PasswordType.NOT_DECRYPTED) instead of raising.
            if not decrypt_result:
                print(f"Could not decrypt PDF {pdf_url}: the empty password was not accepted.")
                return None

        if not reader.pages:
            print(f"Warning: No pages found in PDF from URL {pdf_url}.")
            return None

        total_pages = len(reader.pages)
        fits_in_one_chunk = total_pages <= chunk_size_pages
        text_chunks = []
        chunk_parts = []
        pages_in_chunk = 0

        for page_num in range(total_pages):
            page_text = reader.pages[page_num].extract_text()
            if page_text:
                # A real newline separates pages (the previous "\\n" literal
                # inserted a backslash followed by the letter n).
                chunk_parts.append(page_text + "\n")
            pages_in_chunk += 1

            is_last_page = page_num == total_pages - 1
            if pages_in_chunk < chunk_size_pages and not is_last_page:
                continue

            # Chunk boundary reached: flush what was collected. Chunks that
            # contain only whitespace (e.g. scanned pages) are dropped.
            chunk_text = "".join(chunk_parts)
            if chunk_text.strip():
                text_chunks.append(chunk_text)
                if fits_in_one_chunk:
                    print(f"Extracted full text from {pdf_url} (total pages: {total_pages}, length: {len(chunk_text)} chars).")
                else:
                    first_page = page_num - pages_in_chunk + 2  # 1-based first page of this chunk
                    print(f"Extracted chunk {len(text_chunks)} from {pdf_url} (pages {first_page}-{page_num + 1}, length: {len(chunk_text)} chars).")
            chunk_parts = []
            pages_in_chunk = 0

        if not text_chunks:
            print(f"Warning: No text extracted from PDF at {pdf_url} after processing. The PDF might be image-based, empty, or unreadable.")
            return None

        print(f"Successfully extracted {len(text_chunks)} text chunk(s) from {pdf_url}.")
        return text_chunks

    # The specific request errors are listed before RequestException (their
    # base class) so that each gets its own message.
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred while fetching {pdf_url}: {http_err}")
        return None
    except requests.exceptions.ConnectionError as conn_err:
        print(f"Connection error occurred while fetching {pdf_url}: {conn_err}")
        return None
    except requests.exceptions.Timeout as timeout_err:
        print(f"Timeout occurred while fetching {pdf_url}: {timeout_err}")
        return None
    except requests.exceptions.RequestException as req_err: # Catch other request-related errors
        print(f"An error occurred during the request to {pdf_url}: {req_err}")
        return None
    except PyPDF2.errors.PdfReadError as pdf_error:
        print(f"Error reading PDF content from {pdf_url} with PyPDF2 (PdfReadError): {pdf_error}. The file might be corrupted or not a valid PDF.")
        return None
    except Exception as e:
        # Deliberate catch-all: one bad PDF must not abort a batch run.
        print(f"An unexpected error occurred while processing PDF from URL {pdf_url}: {e}")
        return None


# Two-tier per-token prices used to estimate the cost of each Gemini call; the
# tier is selected by the prompt size of that call.
# NOTE(review): presumably the list prices of the model in use -- verify
# against current Gemini pricing before relying on the reported cost.
_GEMINI_PROMPT_TOKEN_THRESHOLD = 200000
_GEMINI_INPUT_PRICE_TIER1 = 1.25 / 1_000_000
_GEMINI_INPUT_PRICE_TIER2 = 2.50 / 1_000_000
_GEMINI_OUTPUT_PRICE_TIER1 = 10.00 / 1_000_000
_GEMINI_OUTPUT_PRICE_TIER2 = 15.00 / 1_000_000

# Each Gemini call is attempted once and retried this many times on failure.
_GEMINI_MAX_RETRIES = 1
_GEMINI_RETRY_DELAY_SECONDS = 2

# The five deliverables requested by both structured-analysis prompts.
_ANALYSIS_REQUEST_ITEMS = (
    "1. A thorough two-paragraph summary that distills all relevant content about the device's purpose, methology, science, and results from the entire document.\n"
    "2. Exactly 10 salient keywords from the entire document.\n"
    "3. Exactly 5 insightful questions that a clinician or scientist might ask about the entire document to yield good results for further investigation or understanding.\n"
    "4. Based on the keywords and generated questions, a list of 5 concepts pertaining to the entire document.\n"
    "5. Based on all of the above, a 2 sentence thesis, a clear statement of purpose, methology, science of the device for the entire document.\n"
)


def _estimate_gemini_call_cost(response):
    """Return (prompt_tokens, candidates_tokens, cost) or None if usage metadata is missing."""
    usage = getattr(response, 'usage_metadata', None)
    if not usage:
        return None
    prompt_tokens = usage.prompt_token_count
    candidates_tokens = usage.candidates_token_count
    if prompt_tokens <= _GEMINI_PROMPT_TOKEN_THRESHOLD:
        input_price, output_price = _GEMINI_INPUT_PRICE_TIER1, _GEMINI_OUTPUT_PRICE_TIER1
    else:
        input_price, output_price = _GEMINI_INPUT_PRICE_TIER2, _GEMINI_OUTPUT_PRICE_TIER2
    cost = (prompt_tokens * input_price) + (candidates_tokens * output_price)
    return prompt_tokens, candidates_tokens, cost


def _print_prompt_feedback(response, label):
    """Print the prompt feedback of the response from the current attempt, if there is one."""
    if response is not None and hasattr(response, 'prompt_feedback'):
        print(f"Gemini prompt feedback for {label}: {response.prompt_feedback}")


def _summarize_pdf_chunk(model, pdf_text_chunk, chunk_number, total_cost):
    """Summarize one text chunk; return (summary or None, updated running cost)."""
    summary_prompt = (
        "Summarize the following chunk of a document in two paragraphs. Do NOT include anything about the document type (e.g., FDA 510(k) letter).\n"
        "Chunk content:\n"
        "---\n"
        f"{pdf_text_chunk}\n"
        "---\n"
    )
    for attempt in range(_GEMINI_MAX_RETRIES + 1):
        is_last_attempt = attempt == _GEMINI_MAX_RETRIES
        # Reset per attempt so the error handler never reports feedback that
        # belongs to an earlier attempt or an earlier chunk.
        response = None
        try:
            if attempt > 0:
                print(f"Retrying Gemini API call for chunk {chunk_number} summary (Attempt {attempt + 1}/{_GEMINI_MAX_RETRIES + 1})...")
                time.sleep(_GEMINI_RETRY_DELAY_SECONDS)

            response = model.generate_content(summary_prompt)

            # Cost is recorded before the text is read: every call that
            # returned is billed, even if its text turns out to be unusable.
            usage = _estimate_gemini_call_cost(response)
            if usage is None:
                print(f"Warning: Usage metadata not found for chunk {chunk_number} summary. Chunk cost will be 0.0.")
            else:
                prompt_tokens, candidates_tokens, call_cost = usage
                total_cost += call_cost
                print(f"Gemini API Usage for chunk {chunk_number} summary: Prompt Tokens: {prompt_tokens}, Candidates Tokens: {candidates_tokens}, Chunk Cost: ${call_cost:.6f}, Total Cost: ${total_cost:.6f}")

            summary_text = response.text
            if summary_text and summary_text.strip():
                print(f"Successfully summarized chunk {chunk_number}.")
                return summary_text.strip(), total_cost

            print(f"Warning: Empty summary received for chunk {chunk_number}.")
            if is_last_attempt:
                print(f"Failed to get a valid summary for chunk {chunk_number} after all retries. Skipping this chunk.")
        except Exception as summary_error:
            print(f"Error during API call for chunk {chunk_number} summary (Attempt {attempt + 1}): {summary_error}")
            _print_prompt_feedback(response, f"chunk {chunk_number} summary")
            if is_last_attempt:
                print(f"Max retries reached for chunk {chunk_number} summary. Skipping this chunk.")
    return None, total_cost


def _validate_with_dirtyjson(text, label):
    """Leniently parse `text` with dirtyjson and validate it as a DocumentAnalysis; raise on failure."""
    data = dirtyjson.loads(text)
    if not isinstance(data, dict):
        print(f"dirtyjson did not parse to a dictionary for {label}.")
        raise ValueError(f"dirtyjson did not produce a dictionary for {label}.")
    print(f"dirtyjson parsed to dict for {label}, now validating with Pydantic...")
    return DocumentAnalysis.model_validate(data)


def _parse_document_analysis(raw_text, label, attempt_number):
    """Turn a raw model reply into a DocumentAnalysis; raise if it cannot be parsed or validated."""
    start = raw_text.find('{')
    end = raw_text.rfind('}')
    if start == -1 or end <= start:
        print(f"Could not find a valid JSON block in the response for Pydantic for {label}.")
        print(f"Attempting dirtyjson parsing on raw response for {label} (Attempt {attempt_number})...")
        return _validate_with_dirtyjson(raw_text, label)

    # Keep only the outermost {...} span so any text around the JSON object
    # does not break strict parsing.
    json_str = raw_text[start:end + 1]
    print(f"Attempting Pydantic validation on extracted JSON for {label} (Attempt {attempt_number})...")
    try:
        return DocumentAnalysis.model_validate_json(json_str)
    except PydanticValidationError as validation_error:
        print(f"Pydantic validation failed for {label} (Attempt {attempt_number}): {validation_error}")
        # Strict JSON parsing failed; give the lenient parser one chance on
        # the same extracted string.
        try:
            return _validate_with_dirtyjson(json_str, label)
        except Exception as fallback_error:
            print(f"Fallback dirtyjson parsing or Pydantic validation failed for {label}: {fallback_error}")
            raise


def _run_structured_analysis(model, prompt, total_cost, label, output_label,
                             cost_label, request_options=None):
    """Request a schema-constrained analysis with retries; return (DocumentAnalysis or None, updated running cost)."""
    for attempt in range(_GEMINI_MAX_RETRIES + 1):
        # Reset per attempt so the error handler never reports stale feedback.
        response = None
        try:
            if attempt > 0:
                print(f"Retrying Gemini API call for {label} (Attempt {attempt + 1}/{_GEMINI_MAX_RETRIES + 1})...")
                time.sleep(_GEMINI_RETRY_DELAY_SECONDS)

            print(f"Sending request to Gemini API for {output_label} structured output (Attempt {attempt + 1})...")
            call_kwargs = {
                "generation_config": genai.types.GenerationConfig(
                    response_mime_type="application/json",
                    response_schema=DocumentAnalysis,
                ),
            }
            if request_options is not None:
                call_kwargs["request_options"] = request_options
            response = model.generate_content(prompt, **call_kwargs)

            usage = _estimate_gemini_call_cost(response)
            if usage is None:
                print(f"Warning: Usage metadata not found for {label}. {cost_label} cost will be 0.0.")
            else:
                prompt_tokens, candidates_tokens, call_cost = usage
                total_cost += call_cost
                print(f"Gemini API Usage for {label}: Prompt Tokens: {prompt_tokens}, Candidates Tokens: {candidates_tokens}, {cost_label} Cost: ${call_cost:.6f}, Total Cost: ${total_cost:.6f}")

            return _parse_document_analysis(response.text, label, attempt + 1), total_cost
        except Exception as analysis_error:
            # Covers both API failures and unparseable/invalid replies; either
            # way the whole request is retried.
            print(f"Error during API call for {label} (Attempt {attempt + 1}): {analysis_error}")
            _print_prompt_feedback(response, label)
            if attempt == _GEMINI_MAX_RETRIES:
                print(f"Max retries reached for {label}. Aborting processing for this PDF.")
    return None, total_cost


def get_gemini_summary_and_keywords_with_api(model, pdf_text_chunks: List[str]):
    """
    Summarize a document given as text chunks and derive a structured analysis.

    Each chunk is summarized with its own Gemini call (chunks whose summary
    fails after the retry are skipped). The surviving summaries are then sent
    in one schema-constrained call that produces the final summary, keywords,
    questions, concepts and thesis. With a single summary that summary is sent
    as is; with several they are joined with a visible chunk-break marker.

    Args:
        model: Object exposing `generate_content(prompt, ...)` as used with
            `google.generativeai`.
        pdf_text_chunks: Text chunks of the document, in order.

    Returns:
        A 6-tuple `(summary, keywords, questions, concepts, thesis, cost)`.
        The first five are None when no chunk could be summarized or the
        final analysis failed; `cost` is always the estimated total cost of
        all calls that returned usage metadata. List lengths are not
        enforced -- unexpected counts only produce a printed warning.
    """
    total_calculated_cost = 0.0

    # --- Step 1: Get summary for each chunk ---
    all_summaries = []
    chunk_count = len(pdf_text_chunks)
    for chunk_number, pdf_text_chunk in enumerate(pdf_text_chunks, start=1):
        print(f"Summarizing chunk {chunk_number}/{chunk_count}...")
        chunk_summary, total_calculated_cost = _summarize_pdf_chunk(
            model, pdf_text_chunk, chunk_number, total_calculated_cost
        )
        if chunk_summary is not None:
            all_summaries.append(chunk_summary)

    if not all_summaries:
        print("No summaries could be generated from any chunk.")
        return None, None, None, None, None, total_calculated_cost

    # --- Step 2: Get structured output from the summary/summaries ---
    if len(all_summaries) == 1:
        print("Only one chunk summary generated. Using it directly for final structured output.")
        analysis_prompt = (
            "Analyze the following summary of a document and provide: \n"
            f"{_ANALYSIS_REQUEST_ITEMS}"
            "Summary content:\n"
            "---\n"
            f"{all_summaries[0]}\n"
            "---\n"
        )
        banner = "Final Single Chunk"
        analysis, total_calculated_cost = _run_structured_analysis(
            model, analysis_prompt, total_calculated_cost,
            label="single chunk analysis",
            output_label="single chunk",
            cost_label="Chunk",
        )
    else:
        # Real blank lines around the marker (the previous "\\n" literals put
        # backslash characters into the prompt instead of line breaks).
        concatenated_summary = "\n\n--- (Chunk Break) ---\n\n".join(all_summaries)
        print(f"\nConcatenated summary created (length: {len(concatenated_summary)} chars).")
        print("Generating final structured output from concatenated summary...")
        analysis_prompt = (
            "This is a concatenated summary of an FDA 510(k) clearance letter, potentially from multiple chunks. "
            "Do NOT include anything in your output about how the FDA cleared it or that it is a 510(k) letter-- we know that already. "
            "Analyze this concatenated summary and provide: \n"
            f"{_ANALYSIS_REQUEST_ITEMS}"
            "Concatenated Summary content:\n"
            "---\n"
            f"{concatenated_summary}\n"
            "---\n"
        )
        banner = "Final Aggregated"
        analysis, total_calculated_cost = _run_structured_analysis(
            model, analysis_prompt, total_calculated_cost,
            label="final analysis",
            output_label="final",
            cost_label="Final",
            request_options={"timeout": 120},
        )

    if analysis is None:
        return None, None, None, None, None, total_calculated_cost

    final_summary = analysis.summary
    final_keywords = analysis.keywords
    final_questions = analysis.generated_questions
    final_concepts = analysis.concepts
    final_thesis = analysis.thesis

    print(f"********** {banner} Gemini Parsed Output **********")
    print(f"Final Summary: {final_summary[:500]}...")
    print(f"Final Keywords: {final_keywords}")
    print(f"Final Generated Questions: {final_questions}")
    print(f"Final Concepts: {final_concepts}")
    print(f"Final Thesis: {final_thesis}")
    print(f"Total Cost: ${total_calculated_cost:.6f}")
    print("**********************************************************")

    # Basic validation for final results: the schema does not constrain list
    # lengths, so deviations are only reported, not rejected.
    if not final_summary:
        print("Warning: Final summary is empty.")
    if not final_keywords or len(final_keywords) != 10:
        print(f"Warning: Final keywords count is {len(final_keywords) if final_keywords else 0}, expected 10.")
    if not final_questions or len(final_questions) != 5:
        print(f"Warning: Final questions count is {len(final_questions) if final_questions else 0}, expected 5.")
    if not final_concepts or len(final_concepts) != 5:
        print(f"Warning: Final concepts count is {len(final_concepts) if final_concepts else 0}, expected 5.")
    if not final_thesis:
        print("Warning: Final thesis is empty.")

    return final_summary, final_keywords, final_questions, final_concepts, final_thesis, total_calculated_cost


def _needs_processing(thesis_value):
    """Return True when a 'thesis' cell is empty or holds an error marker.

    Recognised markers: NaN/NA, "", "none", "error", and any value that starts
    with "error:" (for example "Error: Failed to generate thesis", which this
    script itself writes on partial failures).
    """
    if pd.isna(thesis_value):
        return True
    marker = str(thesis_value).strip().lower()
    return marker in ("", "none", "error") or marker.startswith("error:")


def _load_manual_updates(manual_updates_file):
    """Load the manual device updates JSON into a single lookup dict.

    The file is expected to hold a list of dicts; they are merged into one
    dict. Any failure is reported and whatever was merged so far (possibly
    nothing) is returned, so processing can proceed without manual updates.
    """
    manual_updates_data = {}
    try:
        with open(manual_updates_file, 'r', encoding='utf-8') as f:
            manual_updates_list = json.load(f)
        # Convert list of single-key dicts to a single dict for easier lookup
        for item in manual_updates_list:
            manual_updates_data.update(item)
        print(f"Successfully loaded manual device updates from {manual_updates_file}.")
    except FileNotFoundError:
        print(f"Warning: Manual updates file not found at {manual_updates_file}. Proceeding without manual updates.")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from {manual_updates_file}: {e}. Proceeding without manual updates.")
    except Exception as e:
        print(f"An unexpected error occurred while loading manual updates: {e}. Proceeding without manual updates.")
    return manual_updates_data


def _mark_record_failed(df, idx, summary_message):
    """Write the standard error markers for one record.

    The thesis is set to "Error" so the record is picked up again on a later
    run; every other generated field is cleared and the cost is zeroed.
    """
    df.loc[idx, 'summary'] = summary_message
    df.loc[idx, 'summary_keywords'] = ""
    df.loc[idx, 'generated_questions'] = ""
    df.loc[idx, 'concepts'] = ""
    df.loc[idx, 'thesis'] = "Error"
    df.loc[idx, 'search_boost_text'] = ""
    df.loc[idx, 'gemini_cost_summary'] = 0.0


def _build_search_boost_text(row, keywords_str):
    """Combine company, device model and keywords into the search boost string."""
    company_name = str(row.get('company', ''))
    device_model_name = str(row.get('device_model', ''))
    return f"{company_name} {device_model_name} {keywords_str}".strip()


def main(num_records_to_process,
         csv_file='/Users/arun/Documents/fda-search/py_src/fda_ai_records.csv',
         manual_updates_file='manual_device_updates.json'):
    """Analyse unprocessed records of the CSV with Gemini and write results back.

    A record is eligible when its 'thesis' cell is empty or carries an error
    marker. For each eligible record the text comes from the PDF linked in
    'summary_pdf_link' or, failing that, from the manual updates file keyed by
    'submission_number'. Records with no usable text are marked as errors and
    do not count towards the limit.

    Args:
        num_records_to_process: Maximum number of records to send to Gemini.
        csv_file: CSV to read and overwrite in place. Defaults to the path the
            script has always used.
        manual_updates_file: JSON file with fallback text per submission.

    Returns:
        None. Results are persisted to ``csv_file`` whenever the data changed.
    """
    print(f"Attempting to process up to {num_records_to_process} record(s).")
    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: {csv_file} not found.")
        return
    except Exception as e:
        print(f"Error reading {csv_file}: {e}")
        return

    if df.empty:
        print("The CSV file is empty. No records to process.")
        return

    manual_updates_data = _load_manual_updates(manual_updates_file)

    # Ensure the target columns exist, if not, add them
    target_columns = ['summary_keywords', 'summary', 'gemini_cost_summary', 'generated_questions', 'concepts', 'thesis', 'search_boost_text']
    for col in target_columns:
        if col not in df.columns:
            df[col] = pd.NA  # Use pandas NA for missing values

    pdf_column_name = 'summary_pdf_link'

    if pdf_column_name not in df.columns:
        print(f"Error: '{pdf_column_name}' column not found in the CSV. This column is required to find PDFs.")
        return

    # Columns that are entirely empty are read as float64; cast every column
    # that receives text to object so string assignment cannot fail on dtype.
    text_columns = [col for col in target_columns if col != 'gemini_cost_summary']
    text_columns.append(pdf_column_name)
    for col in text_columns:
        df[col] = df[col].astype(object)

    api_key = os.getenv("GEMINI_API_KEY")
    model = None  # Created on first use and then shared by all records.

    records_processed_count = 0
    df_modified = False  # True as soon as any cell was written, so error marks get saved too.
    running_cost = 0
    running_failed_count = 0

    for idx, row in df.iterrows():
        if records_processed_count >= num_records_to_process:
            print(f"Target number of records ({num_records_to_process}) to process reached.")
            break

        # Only process records whose thesis is empty or an error marker.
        if not _needs_processing(row.get('thesis')):
            continue

        current_thesis = str(row.get('thesis', '')).strip()
        submission_number = str(row.get('submission_number', '')).strip()
        manual_summary_content = manual_updates_data.get(submission_number)

        raw_pdf_link = row.get(pdf_column_name)
        pdf_url_is_valid = isinstance(raw_pdf_link, str) and \
                           raw_pdf_link.strip().lower().startswith('http')

        pdf_text_chunks = None

        if pdf_url_is_valid:
            pdf_url_to_process = raw_pdf_link.strip()
            print(f"\nProcessing record at index {idx} (Thesis: '{current_thesis}') with PDF URL: {pdf_url_to_process}")

            try:
                pdf_text_chunks = extract_text_from_pdf(pdf_url_to_process)
            except Exception as pdf_extraction_error:
                print(f"Error during PDF extraction for {pdf_url_to_process}: {pdf_extraction_error}")
                pdf_text_chunks = None

        if not pdf_text_chunks:
            if manual_summary_content:
                print(f"PDF extraction failed or URL invalid for {submission_number}. Using manual summary.")
                pdf_text_chunks = [manual_summary_content]  # Treat manual summary as a single chunk
                df.loc[idx, pdf_column_name] = "error"  # Mark PDF link as "error"
                df_modified = True
            else:
                print(f"Could not extract text from PDF for idx {idx}: {row.get(pdf_column_name)} and no manual summary found for {submission_number}. Skipping Gemini call for this record.")
                _mark_record_failed(df, idx, "Error: Could not extract text from PDF URL or PDF empty, and no manual summary available")
                df.loc[idx, pdf_column_name] = "error"  # Mark PDF link as "error"
                df_modified = True
                # Not counted: the limit applies to records actually sent to Gemini.
                continue

        halt_processing = False
        if not api_key:
            print("Error: GEMINI_API_KEY environment variable not set. Halting further processing.")
            _mark_record_failed(df, idx, "Error: GEMINI_API_KEY not set")
            # Every remaining record would fail the same way, so stop after
            # marking this one instead of downloading further PDFs.
            halt_processing = True
        else:
            try:
                if model is None:
                    genai.configure(api_key=api_key)
                    # Using the user-specified model
                    model = genai.GenerativeModel('gemini-2.5-flash')

                summary, keywords_list, questions_list, concepts_list, thesis_str, cost = get_gemini_summary_and_keywords_with_api(model, pdf_text_chunks)
                if cost is None:
                    cost = 0.0
                running_cost += cost

                if summary and keywords_list and questions_list and concepts_list and thesis_str:
                    keywords_joined = ", ".join(keywords_list)
                    df.loc[idx, 'summary'] = summary
                    df.loc[idx, 'summary_keywords'] = keywords_joined
                    df.loc[idx, 'generated_questions'] = "; ".join(questions_list)
                    df.loc[idx, 'concepts'] = ", ".join(concepts_list)  # Store as comma-separated string
                    df.loc[idx, 'thesis'] = thesis_str
                    df.loc[idx, 'gemini_cost_summary'] = cost
                    df.loc[idx, 'search_boost_text'] = _build_search_boost_text(row, keywords_joined)

                    print(f"Successfully processed record {idx}. Summary, keywords, questions, concepts, thesis, and search_boost_text updated.")
                else:
                    print(f"Failed to get all valid fields (summary/keywords/questions/concepts/thesis) from Gemini for record {idx}.")
                    running_failed_count += 1

                    df.loc[idx, 'summary'] = summary if summary else "Error: Failed to generate summary"
                    # Previously stored questions are kept when no new ones arrived.
                    if questions_list:
                        df.loc[idx, 'generated_questions'] = "; ".join(questions_list)
                    df.loc[idx, 'concepts'] = ", ".join(concepts_list) if concepts_list else "Error: Failed to generate concepts"
                    df.loc[idx, 'gemini_cost_summary'] = cost

                    if thesis_str:
                        df.loc[idx, 'thesis'] = thesis_str
                    elif summary or keywords_list or questions_list or concepts_list:
                        # Other parts were OK, so name the specific missing field.
                        df.loc[idx, 'thesis'] = "Error: Failed to generate thesis"
                    else:
                        df.loc[idx, 'thesis'] = "Error"

                    # Populate search_boost_text even on partial failure, using
                    # new keywords if present, otherwise the stored ones.
                    available_keywords = ", ".join(keywords_list) if keywords_list else df.loc[idx, 'summary_keywords']
                    if pd.notna(available_keywords) and "Error" not in str(available_keywords):
                        keywords_str = available_keywords
                    else:
                        keywords_str = ''
                    df.loc[idx, 'search_boost_text'] = _build_search_boost_text(row, keywords_str)

                    if keywords_list:
                        df.loc[idx, 'summary_keywords'] = ", ".join(keywords_list)
                    else:
                        df.loc[idx, 'summary_keywords'] = "Error: Failed to generate keywords"

            except Exception as e:
                print(f"An error occurred during Gemini API configuration or call for record {idx}: {e}")
                running_failed_count += 1
                _mark_record_failed(df, idx, f"Error: Gemini API call failed ({e})")

        records_processed_count += 1
        df_modified = True
        print("Running Cost:", running_cost)
        print("Running Failed:", running_failed_count)

        if halt_processing:
            break

    if records_processed_count == 0 and num_records_to_process > 0:
        print(f"Could not find any records with valid PDF URLs within the first {len(df)} rows to process the requested {num_records_to_process} records.")
    elif records_processed_count < num_records_to_process and num_records_to_process > 0:
        print(f"Processed {records_processed_count} record(s). Fewer than requested ({num_records_to_process}) were found with valid PDF URLs or the end of the file was reached.")

    # --- Save Updated CSV if any cell was changed (including error marks) ---
    if df_modified:
        try:
            df.to_csv(csv_file, index=False)
            print(f"Successfully updated {csv_file} with results for {records_processed_count} record(s).")
        except Exception as e:
            print(f"Error writing updated data to {csv_file}: {e}")
    elif num_records_to_process > 0:  # If user wanted to process but nothing was suitable or processed
        print("No changes made to the CSV file as no records were processed or no valid URLs found for the requested count.")

# python process_pdf_summary.py -n 5
if __name__ == '__main__':
    # Command-line entry point: read the record limit and hand it to main().
    cli = argparse.ArgumentParser(
        description="Process PDF summaries using Gemini API and update a CSV file."
    )
    cli.add_argument(
        '-n', '--num_records',
        type=int,
        default=1,
        help="Number of records to process (default: 1).",
    )
    main(cli.parse_args().num_records)
