from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel,Field
from typing import Optional, List, Union, Dict, Any
import json
import os
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_core.output_parsers import PydanticOutputParser
import re


class InputFile(BaseModel):
    # Schema for one input file mentioned in the user's question.
    # The two Optional fields default to None so that a model reply which
    # omits them (instead of sending an explicit null) still validates.
    file_name: Optional[str] = Field(default=None, description="the provided file name, if available")
    file_format: str = Field(description="e.g., FASTQ, BAM, VCF, mzML")
    data_type: str = Field(description="e.g., raw sequencing reads, aligned reads, variant calls, proteomics spectra")
    paired_end: Optional[bool] = Field(default=None, description="true/false/null")


class OutputFile(BaseModel):
    # Schema for one expected output file of the analysis.

    # Container/file format of the result.
    file_format: str = Field(
        description="e.g., VCF, TSV, abundance table, PDF report",
    )
    # Kind of biological data the result holds.
    data_type: str = Field(
        description="e.g., variants, gene expression matrix, species abundance",
    )


class AnalysisTask(BaseModel):
    # Structured decomposition of a user's analysis request: the input files,
    # the expected output files and a free-text description of the goal.
    input_files: List[InputFile] = Field(description="list of input files")
    output_files: List[OutputFile] = Field(description="list of output files")
    analysis_goal: str = Field(description="a detailed description of the intended analysis, including start point, key processing steps, and desired outcome")

    def to_dict(self) -> Dict[str, Any]:
        """Return the task as a plain dict with nested files converted to dicts.

        Uses ``model_dump()`` (the Pydantic v2 replacement for the deprecated
        ``dict()``) for the nested file models.
        """
        return {
            "input_files": [file.model_dump() for file in self.input_files],
            "output_files": [file.model_dump() for file in self.output_files],
            "analysis_goal": self.analysis_goal,
        }


# System prompt for the analysis agent.
# ``{format_instructions}`` is the only template placeholder and must stay the
# only brace pair in this string. The wording is kept consistent with the
# output schema: the reply must be JSON, and an unknown paired-end status is
# expressed as null because that field is an optional boolean.
prompt = '''
You are an assistant for bioinformatics workflow design.

Your task is to carefully read the user's natural-language question about their analysis task,
and decompose it into a structured response with the following sections:

1. **Input files**  
   - List each input file mentioned by the user.  
   - For each file, describe:  
     - file name (if provided, otherwise say "not specified")  
     - file format (e.g., FASTQ, BAM, VCF, mzML)  
     - data type (e.g., raw sequencing reads, aligned reads, variant calls, proteomics spectra)  
     - whether it is paired-end (true/false, or null if unknown)

2. **Output files**  
   - Describe the expected output files.  
   - Include file format (e.g., VCF, TSV, abundance table, PDF report) and data type (e.g., variants, gene expression matrix, species abundance).  
   - If the user did not specify, infer the most common output for the analysis goal.

3. **Analysis goal**  
   - Provide a detailed sentence describing the intended analysis, including:  
     - starting point (input files)  
     - main processing steps (e.g., quality control, alignment, variant calling)  
     - desired outcome (the type of result the user wants)

Rules:  
- Always extract actual file names if provided.  
- If information is missing, clearly state it as "not specified" or "unknown" in text fields, and use null for the paired-end flag.  
- The output must cover all three sections above.  
- The output must be in JSON format as follows: {format_instructions}
'''

# Human-message template paired with the system prompt; ``{user_question}``
# is filled in when the chain is invoked.
human_input = '''
now the user's question is: {user_question}
'''

class AnalysisAgent:
    """Agent that asks a chat model to decompose a question into an ``AnalysisTask``.

    It combines the module-level ``prompt`` / ``human_input`` templates, a
    ``ChatOpenAI`` client and a ``PydanticOutputParser`` bound to
    ``AnalysisTask``.
    """

    def __init__(
        self,
        temperature: float = 0.5,
        *,
        base_url: str = "",
        api_key: str = "",
        model: str = "",
    ):
        """Build the parser, the prompt template and the chat model client.

        Args:
            temperature: Sampling temperature passed to ``ChatOpenAI``.
            base_url: Endpoint URL passed to ``ChatOpenAI``. Defaults to the
                empty string that was previously hard-coded.
            api_key: API key passed to ``ChatOpenAI``. Defaults to the empty
                string that was previously hard-coded.
            model: Model name passed to ``ChatOpenAI``. Defaults to the empty
                string that was previously hard-coded.
        """
        self.parser = PydanticOutputParser(pydantic_object=AnalysisTask)
        # Existing behaviour: the format instructions are echoed to stdout
        # every time an agent is constructed.
        print(self.parser.get_format_instructions())
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", prompt),
            ("human", human_input),
        ])

        # ChatOpenAI is already imported at module level; no local re-import.
        self.llm = ChatOpenAI(
            base_url=base_url,
            api_key=api_key,
            model=model,
            temperature=temperature,
        )

    def analyze(self, user_question: str) -> Optional[AnalysisTask]:
        """Run prompt -> model -> parser on ``user_question``.

        Args:
            user_question: Text substituted into the human-message template.

        Returns:
            The parser's result, or ``None`` if invoking the chain raised any
            ``Exception`` (the exception is printed, not re-raised).
        """
        chain = self.prompt | self.llm | self.parser
        try:
            return chain.invoke({
                "user_question": user_question,
                "format_instructions": self.parser.get_format_instructions(),
            })
        except Exception as e:
            # Deliberate best-effort: failures are reported on stdout and
            # signalled to the caller as None.
            print(e)
            return None

def split(user_question: str, filename_list: list, file_description: str):
    """Decompose a user's request into a plain-dict analysis task.

    Builds a fresh ``AnalysisAgent``, sends it one combined message containing
    the file names, the question and the file description, and returns the
    result's ``model_dump()``; returns ``None`` when the agent yields a falsy
    result.
    """
    agent = AnalysisAgent()
    message = f"the files user input: {', '.join(filename_list)} ,user's question:{user_question}, the files description: {file_description}"
    task = agent.analyze(message)
    return task.model_dump() if task else None
