from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel,Field
from typing import Optional, List, Union, Dict, Any
import json
import os
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_core.output_parsers import PydanticOutputParser
import re
import tqdm

# Structured verdict the LLM returns for one workflow step: three sub-scores
# (each validated to the range 0..2), an executability flag and a free-text
# failure reason.
# NOTE: the module-level function `step_score` defined further down rebinds
# this name, so after import `step_score` refers to the function; this class
# stays reachable through the annotation on `step_score_result`.
class step_score(BaseModel):
    # Score for the installation / environment command.
    step_score_command_installation: float = Field(
        ge=0,
        le=2,
        description="Score for the environment/installation command,(0=incorrect/unusable, 2=perfectly correct)",
    )
    # Score for the mkdir / path-creation command.
    step_score_command_mkdir: float = Field(
        ge=0,
        le=2,
        description="Score for the mkdir/path creation command,(0=incorrect/unusable, 2=perfectly correct)",
    )
    # Score for the execution command.
    step_score_command_executable: float = Field(
        ge=0,
        le=2,
        description="Score for the execution command,(0=incorrect/unusable, 2=perfectly correct)",
    )
    # Overall verdict on whether the step can be executed.
    step_command_success: bool = Field(
        description="Whether the command is executable,(True=executable, False=unexecutable)",
    )
    # Explanation for a failing step; the description asks for '' when the
    # step is executable.
    step_wrong_reason: str = Field(
        description="The reason why the command is wrong,if it is executable '' ",
    )

# Top-level schema handed to the step-scoring output parser: a list of
# per-step `step_score` records.
class step_score_result(BaseModel):
    step_score_list: List[step_score] = Field(
        description="List of steps scores,each step has three scores: installation, mkdir, executable",
    )
    
def _aggregate_step_scores(scored_steps):
    """Collect the scores of steps judged executable and average them.

    Args:
        scored_steps: Iterable of dicts carrying the three
            ``step_score_command_*`` values and ``step_command_success``
            (the dumped entries of ``step_score_result.step_score_list``).

    Returns:
        A ``(records, average)`` tuple. ``records`` holds one dict per step
        whose ``step_command_success`` is truthy. ``average`` is the mean of
        (three sub-scores + 4) over those steps, or ``0.0`` when no step
        qualified.
    """
    records = []
    total = 0
    for item in scored_steps:
        # Steps judged non-executable contribute neither a record nor to the
        # average.
        if not item['step_command_success']:
            continue
        total += (
            item['step_score_command_installation']
            + item['step_score_command_mkdir']
            + item['step_score_command_executable']
        )
        # Flat bonus for every executable step.
        # NOTE(review): presumably lifts the per-step maximum from 6 to 10 -
        # confirm the intended scale.
        total += 4
        records.append(
            {
                "step_score_command_installation": item['step_score_command_installation'],
                "step_score_command_mkdir": item['step_score_command_mkdir'],
                "step_score_command_executable": item['step_score_command_executable'],
                "step_command_success": item['step_command_success'],
                "step_debug_time": 0,
            }
        )
    if not records:
        # Avoid dividing by zero when nothing was judged executable.
        return records, 0.0
    return records, total / len(records)


def step_score(steps):
    """Have the LLM grade each workflow step and summarise the grades.

    The steps are sent to a chat model together with a scoring rubric; the
    reply is parsed into ``step_score_result`` and reduced to a list of
    per-step records plus an average score.

    Args:
        steps: The workflow steps to evaluate. The value is interpolated into
            the human message as-is and returned unchanged.

    Returns:
        A 3-tuple ``(Step_score, average, steps)``:
        ``Step_score`` is a list with one dict per step the model judged
        executable, ``average`` is the mean of (three sub-scores + 4) over
        those steps (``0.0`` if there are none), and ``steps`` is the input.

    Exceptions raised while invoking the chain are not caught here and
    propagate to the caller.
    """
    parser = PydanticOutputParser(pydantic_object=step_score_result)
    steps_prompt = '''
            You are an expert bioinformatician. You are evaluating the steps of the bioinformatics workflow for correctness and executability. 
            Each step is a dictionary, and the keys are: tool_name, command.(command is the installation command, mkdir command, or execution command)
            For each step below, you must judge three aspects separately: 
            **Environment / Installation Command**
            Evaluate whether the installation command correctly and completely installs the required software and all its dependencies.
            Scoring (0–2): [0 = Completely incorrect or unusable; software cannot be installed(do not have Installation Command)
            ,0.5 = Mostly incorrect; major dependencies missing or software unusable(Example:pip install fastqc (FastQC is not a Python package, installation fails).)
            ,1 = Partially correct; software installs but manual modifications or additional dependencies required(Example: conda install fastqc (fails unless correct channels are added).)
            ,1.5 = Mostly correct; minor issues only (e.g., warnings, optional dependencies missing)(Example:mamba install -c bioconda fastqc do not have its own environment)
            ,2 = Perfectly correct and complete; software and all dependencies installed and functional](Example:mamba create -n fastqc python=3.11 -y && conda activate fastqc && mamba install -c bioconda fastqc)

            **Mkdir / Path Creation Command**
            Evaluate whether the command correctly creates all required directories and handles paths properly, including input/output paths, existing folders, and permissions.
            Scoring (0–2):
            [0 = Completely incorrect or fails to create directories / incorrect paths(do not have mkdir / Path Creation Command)
            ,0.5 = Mostly incorrect; some directories not created or paths incorrect(Example:tool:star mkdir fastqc)
            ,1 = Partially correct; may require path or command modifications(Example:tool:star mkdir star)
            ,1.5 = Mostly correct; only minor issues (e.g., warnings, redundant paths)(Example:tool:star mkdir ./star)
            ,2 = Perfectly correct; all directories and paths handled correctly](Example:tool:star mkdir ./output/star)

            **Execution Command**

            Evaluate whether the execution command is likely to run successfully given that the previous steps are correctly completed, and whether it produces the expected output.
            Scoring (0–2):[0 = Completely fails; output unusable(Example: fastqc)
            ,0.5 = Mostly fails; output likely incorrect(Example:fastqc sample.fastq)
            ,1 = Partially executable; may require parameter or path adjustments(Example:fastqc ./input/sample.fastq -o output/)
            ,1.5 = Mostly executable; minor issues only (e.g., warnings)(Example:fastqc ./input/sample.fastq.gz -o ./output/fastqc/)
            ,2 = Fully executable; output meets expectations](Example:fastqc ./input/sample.fastq -o ./output/fastqc/)

            Finally, decide whether the step as a whole is executable (True/False).
            Your output should be in the following format:
            {format_instructions}

    '''
    # Endpoint, key and model name are left blank in this file and must be
    # filled in before the chain can be used.
    llm = ChatOpenAI(
        base_url="",
        api_key="",
        model="",
        temperature=0.3,
    )
    prompt = ChatPromptTemplate.from_messages([
        ("system", steps_prompt),
        ("human", "The workflow steps are:\n{steps}"),
    ])
    chain = prompt | llm | parser
    result = chain.invoke({
        "steps": steps,
        "format_instructions": parser.get_format_instructions(),
    })
    if not result:
        # Nothing parsed: report an empty score list instead of failing on an
        # undefined step counter.
        return [], 0.0, steps

    scored_steps = result.model_dump()['step_score_list']
    Step_score, average = _aggregate_step_scores(scored_steps)
    return Step_score, average, steps

# Workflow-level verdict returned by the LLM. Both scores are documented as
# 0..3, so the bounds are validated here in the same way the per-step model
# validates its 0..2 scores; an out-of-range reply now fails parsing instead
# of being passed on silently.
class workflow_score_result(BaseModel):
    # Completeness of the workflow (3 = all core steps present, 0 = none).
    Completion_level: float = Field(
        ge=0,
        le=3,
        description="how complete the workflow is (all required core steps present=3, some steps missing=2,most of the steps missing=1, none of the core steps are present=0)",
    )
    # Redundancy of the workflow (0 = none, 3 = very redundant).
    Redundancy: float = Field(
        ge=0,
        le=3,
        description="how redundant the workflow is (0=no redundancy, 1=some redundancy,2= most redundant,3=very redundant)",
    )

def workflow_score(user_question,Step_score,file_input):
    """Have the LLM rate a whole workflow for completeness and redundancy.

    Args:
        user_question: The user's request; interpolated into both the system
            prompt and the human message.
        Step_score: Description of the workflow steps; inserted into the
            human message as ``{steps_summary}``.
        file_input: Description of the input file; inserted into the human
            message.

    Returns:
        A dict dumped from ``workflow_score_result`` (keys
        ``Completion_level`` and ``Redundancy``), or ``None`` if the chain
        produced a falsy result.

    Exceptions raised while invoking the chain are not caught here and
    propagate to the caller.
    """
    workflow_format = PydanticOutputParser(pydantic_object=workflow_score_result)
    # The rubric defines exactly two dimensions, matching the two fields of
    # ``workflow_score_result``; the prompt previously announced three.
    system_prompt = '''
    You are an expert bioinformatics workflow evaluator.
    Your task is to evaluate a given bioinformatics workflow based on step-level scores and success indicators.
    Your evaluation must be precise, consistent, and avoid subjective judgment beyond the scoring criteria.
    Rate the workflow on two dimensions:
    **Completion_level (0–3)** (Measures whether the workflow achieves {user_question} intended goals / core functionality)
    3 = Fully complete → Workflow meets all core requirements and produces all required final outputs.(Example: requirements and produces all necessary final outputs
An RNA-seq workflow starts from raw FASTQ files, performs quality control (FastQC), trims low-quality reads (Trimmomatic), aligns reads to the reference genome (STAR), quantifies gene expression (featureCounts), and produces differential expression tables and visualization plots. All steps complete and successful.)
    2 = Partially complete → Workflow meets some core requirements, but some steps or functions are missing; partially usable.(Same RNA-seq workflow, but only performs QC and alignment; quantification and differential analysis are missing. Outputs exist but do not cover core results (e.g., differential expression table).)
    1 = Barely complete → Most core requirements are not met; only a few outputs or functions are present.(Only performs FASTQ QC, or only produces alignment files without further analysis. No usable final results.)
    0 = Not complete → Core functionality is not met; workflow is unusable or fails to produce required outputs.(Attempted RNA-seq workflow fails due to missing tools or incorrect inputs, producing no valid outputs.)


    **Redundancy (0–3)**
    0 = No redundancy → All steps unique, no duplicates.(Example:A ChIP-seq workflow runs QC → alignment → duplicate removal → peak calling. Each step appears once, no repetition.)
    1 = Some redundancy → Minor duplication, does not break workflow.(FastQC is run twice during QC, but other steps are unique. Workflow still functions correctly.)
    2 = Mostly redundant → Many repeated steps without necessity.(Multiple alignments or repeated QC steps on the same RNA-seq data. Increases runtime but does not fully break results.)
    3 = Very redundant → Workflow bloated with repetitive or overlapping steps.).(Same FASTQ files are repeatedly aligned and quantified, steps are duplicated multiple times. Workflow becomes complex and wasteful.)

Important principles:

    Be objective: base scores only on explicit evidence from the workflow, not assumptions.
    Be consistent: apply the same standards to all workflows being evaluated.
    Provide the output in strict JSON format matching the Pydantic model:
{format_instructions}
    '''
    # Endpoint, key and model name are left blank in this file and must be
    # filled in before the chain can be used.
    llm = ChatOpenAI(
        base_url="",
        api_key="",
        model="",
        temperature=0.3,
    )
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", "The user question is: {user_question}\nThe tool steps  are:\n{steps_summary},The input file is:{file_input}"),
    ])
    workflow_chain = prompt | llm | workflow_format
    result = workflow_chain.invoke({
        "user_question": user_question,
        "steps_summary": Step_score,
        "file_input": file_input,
        "format_instructions": workflow_format.get_format_instructions(),
    })
    if not result:
        print("workflow_score failed")
        return None
    return result.model_dump()


def LLM_score(user_question,steps,file_input):
    """Run step-level and workflow-level LLM scoring for one workflow.

    Args:
        user_question: The user's request, forwarded to ``workflow_score``.
        steps: The workflow steps, forwarded to both scorers.
        file_input: Description of the input file, forwarded to
            ``workflow_score``.

    Returns:
        A 4-tuple ``(Workflow_score, Step_score, score, step_content)``.
        ``Workflow_score`` is the result of ``workflow_score`` or ``None``
        when ``step_content`` is falsy and workflow scoring is skipped; the
        other three values are what ``step_score`` returned.
    """
    Step_score, score, step_content = step_score(steps)
    # Default to None so the return below never reads an unbound local when
    # there are no steps to evaluate.
    Workflow_score = None
    if step_content:
        Workflow_score = workflow_score(user_question, steps, file_input)
    return Workflow_score, Step_score, score, step_content


            

