import asyncio
import json
import math
import os
import re
from datetime import datetime
from typing import Optional, Dict, List, Tuple

import torch
from openai import OpenAI, AsyncOpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

try:
    from .constants import (
        OPENAI_API_KEY,
        SCORING_SYSTEM_PROMPT,
        OBJECTIVE_SCORING_PROMPT,
        SCORING_PROMPT_BASE_TEMPLATE,
        SCORING_BASE_TEMPLATE_NO_EXAMPLE,
        SCORING_BASE_TEMPLATE_ONE_EXAMPLE,
        SCORING_BASE_TEMPLATE_TWO_EXAMPLES,
        GROUP_SCORING_PROMPT,
        SINGLE_SCORING_PROMPT,
        SCORING_OBJECTIVE_DESCRIPTIONS,
        SCORING_RUBRICS_HH,
        SCORING_RUBRICS_TLDR,
        CUSTOM_RUBRIC_GENERATION_PROMPT,
        CUSTOM_DESCRIPTION_GENERATION_PROMPT,
        CUSTOM_EXAMPLES_GENERATION_PROMPT,
        IN_CONTEXT_EXAMPLES_DICT,
        DATASET_EXAMPLES_DICT,
        DATASET_NAMES_DICT,
        NAME_DATASETS_DICT,
        DATASET_RUBRICS_DICT,
        MAX_NUM_SCORING_RETRIES
    )
    from .model_generation import generate_huggingface_response
except ImportError:
    from constants import (
        OPENAI_API_KEY,
        SCORING_SYSTEM_PROMPT,
        OBJECTIVE_SCORING_PROMPT,
        SCORING_PROMPT_BASE_TEMPLATE,
        SCORING_BASE_TEMPLATE_NO_EXAMPLE,
        SCORING_BASE_TEMPLATE_ONE_EXAMPLE,
        SCORING_BASE_TEMPLATE_TWO_EXAMPLES,
        GROUP_SCORING_PROMPT,
        SINGLE_SCORING_PROMPT,
        SCORING_OBJECTIVE_DESCRIPTIONS,
        SCORING_RUBRICS_HH,
        SCORING_RUBRICS_TLDR,
        CUSTOM_RUBRIC_GENERATION_PROMPT,
        CUSTOM_DESCRIPTION_GENERATION_PROMPT,
        CUSTOM_EXAMPLES_GENERATION_PROMPT,
        IN_CONTEXT_EXAMPLES_DICT,
        DATASET_EXAMPLES_DICT,
        DATASET_NAMES_DICT,
        NAME_DATASETS_DICT,
        DATASET_RUBRICS_DICT,
        MAX_NUM_SCORING_RETRIES
    )
    from model_generation import generate_huggingface_response


class ObjectiveScorer:
    """Simple class for scoring single objective-query-response triplets."""
    
    def __init__(
        self,
        use_detailed_rubric: bool = False,
        dataset_type: str = "hh",
        use_api: bool = False,
        model_name: str = "meta-llama/Llama-3.1-8B-Instruct",
        device: str = "auto",
        max_length: int = 4096,
        load_quantized: bool = True,
        normalize_scores: bool = False,
        cache_dir: Optional[str] = None,
        save_dir: Optional[str] = None,
        use_async: bool = True
    ):
        """
        Initialize the objective scorer.

        Depending on ``use_api`` this either creates OpenAI client(s) or loads
        a local Hugging Face tokenizer and causal-LM model.

        Args:
            use_detailed_rubric: Whether to use detailed scoring rubrics
            dataset_type: Type of dataset ('hh' for HH-RLHF, 'tldr' for Reddit TLDR)
            use_api: Whether to use OpenAI API instead of local model
            model_name: Name or path to the model (or OpenAI model if use_api=True)
            device: Device map passed to the local model loader (for local models)
            max_length: Maximum length for tokenized input (for local models)
            load_quantized: Load the local model with 4-bit quantization (for local models)
            normalize_scores: Whether to normalize scores to [0, 1] range
            cache_dir: Optional directory the custom caches are read from;
                takes precedence over save_dir when loading
            save_dir: Optional directory for failure logs and saved caches
            use_async: When use_api is True, also create an AsyncOpenAI client

        Raises:
            ValueError: If use_api is True and no OpenAI API key is configured.
        """
        self.use_detailed_rubric = use_detailed_rubric
        self.dataset_type = dataset_type
        self.use_api = use_api
        self.model_name = model_name
        self.device = device
        self.max_length = max_length
        self.cache_dir = cache_dir
        self.normalize_scores = normalize_scores
        self.save_dir = save_dir

        # _load_cache reads cache_dir / save_dir, so both must be assigned above.
        self.custom_rubrics_cache = self._load_cache('rubrics')
        self.custom_descriptions_cache = self._load_cache('descriptions')
        self.custom_examples_cache = self._load_cache('examples')

        # Initialize API or local model
        if self.use_api:
            api_key = OPENAI_API_KEY or os.environ.get('OPENAI_API_KEY')
            if not api_key:
                raise ValueError("OpenAI API key not found")
            self.client = OpenAI(api_key=api_key)
            # NOTE: async_client is only defined when use_async is True.
            if use_async:
                self.async_client = AsyncOpenAI(api_key=api_key)  # Async client for parallel calls
            self.tokenizer = None
            self.model = None
        else:
            # Local model path. NOTE: self.client / self.async_client are not
            # defined in this branch.
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
            if self.tokenizer.pad_token is None:
                # Reuse the end-of-sequence token for padding when none is set.
                self.tokenizer.pad_token = self.tokenizer.eos_token

            if load_quantized:
                # 4-bit NF4 weights with double quantization, bfloat16 compute.
                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type='nf4',
                    bnb_4bit_compute_dtype=torch.bfloat16
                )
                self.model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    device_map=device,
                    use_cache=False,
                    attn_implementation='flash_attention_2',
                    torch_dtype=torch.bfloat16,
                    quantization_config=bnb_config,
                    trust_remote_code=True
                )
                print('Done loading model!')
            else:
                self.model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    device_map=device,
                    torch_dtype=torch.bfloat16,
                    trust_remote_code=True
                )

    def reload_cache(self):
        """Reload custom caches from file."""
        self.custom_rubrics_cache = self._load_cache('rubrics')
        self.custom_descriptions_cache = self._load_cache('descriptions')
        self.custom_examples_cache = self._load_cache('examples')
    
    def _load_cache(self, cache_type: str) -> Dict[str, str]:
        """Load custom cache from file.
        
        Args:
            cache_type: Either 'rubrics' or 'descriptions'
        """
        # Modify cache filename based on type
        # return {}
        if self.cache_dir: # if running ppo
            filename = "{}_cache.json".format(cache_type)
            cache_filename = os.path.join(self.cache_dir, filename)
        elif self.save_dir: # if running obj-disco
            filename = "{}_cache.json".format(cache_type)
            cache_filename = os.path.join(self.save_dir, filename)
        else:
            return {}
        # cache_filename = self.cache_file.replace('.json', f'_{cache_type}.json')
        
        if os.path.exists(cache_filename):
            try:
                with open(cache_filename, 'r') as f:
                    return json.load(f)
            except (json.JSONDecodeError, IOError):
                print(f"Warning: Could not load {cache_type} cache from {cache_filename}, starting fresh")
                return {}
        return {}
    
    def _save_cache(self, cache_type: str, cache_data: Dict[str, str]):
        """Save custom cache to file.

        Args:
            cache_type: Either 'rubrics' or 'descriptions'
            cache_data: The cache dictionary to save
        """
        filename = "{}_cache.json".format(cache_type)
        if self.save_dir:
            cache_filename = os.path.join(self.save_dir, filename)
            print(f"[DEBUG] _save_cache: Saving {cache_type} cache to {cache_filename}")
        else:
            print(f"[DEBUG] _save_cache: save_dir is NOT SET (value={self.save_dir}), skipping {cache_type} cache save!")
            return
        # else:
        #     cache_filename = os.path.join(self.cache_dir, filename)

        # # Modify cache filename based on type
        # cache_filename = self.cache_file.replace('.json', f'_{cache_type}.json')

        try:
            with open(cache_filename, 'w') as f:
                json.dump(cache_data, f, indent=2)
        except IOError as e:
            print(f"Warning: Could not save {cache_type} cache to {cache_filename}: {e}")

    def _log_scoring_failure(self, objective: str, query: str, responses: List[str], error: str, method: str = "group_score"):
        """Log scoring failures to a file for debugging.

        Args:
            objective: The objective that failed to score
            query: The query/prompt that was being scored
            responses: List of responses that failed to score
            error: The error message
            method: The scoring method that failed ('group_score' or 'single_score')
        """
        if not self.save_dir:
            return

        # Create save directory if it doesn't exist
        # os.makedirs(self.save_dir, exist_ok=True)

        # Use a single log file for the entire run
        filename = "scoring_failures.txt"
        filepath = os.path.join(self.save_dir, filename)

        # Write the failure information to file (append mode)
        try:
            with open(filepath, 'a') as f:
                f.write("=" * 80 + "\n")
                f.write(f"SCORING FAILURE LOG ENTRY\n")
                f.write(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f"Method: {method}\n")
                f.write("=" * 80 + "\n\n")

                f.write(f"OBJECTIVE: {objective}\n")
                f.write("-" * 40 + "\n\n")

                f.write(f"QUERY:\n{query}\n")
                f.write("-" * 40 + "\n\n")

                f.write(f"RESPONSES ({len(responses)} total):\n")
                for i, response in enumerate(responses):
                    f.write(f"\n--- Response {i+1} ---\n")
                    f.write(f"{response}\n")

                f.write("\n" + "-" * 40 + "\n")
                f.write(f"ERROR:\n{error}\n")
                f.write("=" * 80 + "\n\n")

            # Only print this message once (when file is first created)
            if not hasattr(self, '_log_file_created'):
                print(f"Logging scoring failures to: {filepath}")
                self._log_file_created = True
        except Exception as e:
            print(f"Warning: Could not log scoring failure: {e}")

    def _get_dataset_sample(self) -> str:
        """Return the stored example text for this scorer's dataset.

        ``self.dataset_type`` is translated to a dataset name through
        ``NAME_DATASETS_DICT`` (a KeyError propagates for an unknown type);
        that name is then looked up in ``DATASET_EXAMPLES_DICT``, with an
        empty string returned when it has no entry.
        """
        full_name = NAME_DATASETS_DICT[self.dataset_type]
        sample = DATASET_EXAMPLES_DICT.get(full_name, "")
        return sample

    def _generate_custom_rubric(self, objective: str) -> str:
        """Return a scoring rubric for ``objective``, generating one on a cache miss.

        Lookup order:
          1. ``self.custom_rubrics_cache``, keyed by
             ``"<dataset_type>_<objective lowercased>"``;
          2. a rubric requested from the OpenAI chat API (``gpt-4.1-mini``),
             attempted up to ``MAX_NUM_SCORING_RETRIES`` times;
          3. the generic rubric from ``_get_fallback_rubric``.

        Only API-generated rubrics are stored in the cache and passed to
        ``_save_cache``; the generic fallback is never cached.

        Args:
            objective: Name of the objective the rubric should describe.

        Returns:
            The rubric text. Always a string, also when every attempt fails or
            no attempt is configured.
        """
        cache_key = f"{self.dataset_type}_{objective.lower()}"
        if cache_key in self.custom_rubrics_cache:
            print(f"Using cached custom rubric for '{objective}'")
            return self.custom_rubrics_cache[cache_key]

        print(f"Generating custom rubric for objective '{objective}'...")

        # A missing key cannot be fixed by retrying, so fall back immediately.
        api_key = OPENAI_API_KEY or os.environ.get('OPENAI_API_KEY')
        if not api_key:
            print(f"Error generating custom rubric for '{objective}': OpenAI API key required for custom rubric generation")
            return self._get_fallback_rubric(objective)

        client = None
        for attempt in range(MAX_NUM_SCORING_RETRIES):
            try:
                # Built lazily inside the try so a construction error is
                # handled like any other failed attempt.
                if client is None:
                    client = OpenAI(api_key=api_key)

                in_context_examples = self._generate_custom_examples(objective)
                prompt = CUSTOM_RUBRIC_GENERATION_PROMPT.format(
                    objective=objective,
                    in_context_examples=in_context_examples
                )

                api_response = client.chat.completions.create(
                    model="gpt-4.1-mini",
                    messages=[
                        {"role": "system", "content": "You are an expert at creating scoring rubrics."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.3,
                    max_tokens=1000
                )

                rubric = api_response.choices[0].message.content.strip()
                if not rubric:
                    # Never persist an empty rubric; treat it as a failed attempt.
                    raise ValueError("model returned an empty rubric")

                # Cache the generated rubric
                self.custom_rubrics_cache[cache_key] = rubric
                self._save_cache('rubrics', self.custom_rubrics_cache)

                print(f"Custom rubric generated and cached for '{objective}'")
                return rubric

            except Exception as e:
                # Best-effort generation: any failure just triggers the next attempt.
                print(f"Rubric generation attempt {attempt + 1}/{MAX_NUM_SCORING_RETRIES} failed for '{objective}': {e}")

        # Reached when every attempt failed or MAX_NUM_SCORING_RETRIES is 0.
        print(f"All {MAX_NUM_SCORING_RETRIES} rubric generation attempts failed for '{objective}', using generic fallback rubric")
        return self._get_fallback_rubric(objective)
    
    def _get_fallback_rubric(self, objective: str) -> str:
        """Build the generic five-band (1-10) rubric text for ``objective``.

        Each band is a heading line followed by three bullet lines; bands are
        separated by one blank line and the text has no trailing newline.
        """
        bands = (
            ("Score 1-2 (Very Poor):", (
                f"Response completely fails to meet the {objective} criterion",
                "Shows no understanding or effort toward this objective",
                "May actively work against this objective",
            )),
            ("Score 3-4 (Poor):", (
                f"Response shows minimal adherence to {objective}",
                "Significant shortcomings in meeting this criterion",
                "Below acceptable standards for this objective",
            )),
            ("Score 5-6 (Average):", (
                f"Response partially meets the {objective} criterion",
                "Some positive aspects but also notable gaps",
                "Acceptable but not impressive performance",
            )),
            ("Score 7-8 (Good):", (
                f"Response largely satisfies the {objective} criterion",
                "Minor issues or room for improvement",
                "Above average performance on this objective",
            )),
            ("Score 9-10 (Excellent):", (
                f"Response excellently fulfills the {objective} criterion",
                "Demonstrates mastery of this objective",
                "Exemplary performance with minimal or no flaws",
            )),
        )

        sections = []
        for heading, bullets in bands:
            section_lines = [heading]
            section_lines.extend(f"• {bullet}" for bullet in bullets)
            sections.append("\n".join(section_lines))
        return "\n\n".join(sections)
    
    def _generate_custom_description(self, objective: str) -> str:
        """Return a description of ``objective``, generating it on a cache miss.

        The cache key is ``"<dataset_type>_<objective lowercased>"``. On a
        miss, one request is sent to the OpenAI chat API (``gpt-4o-mini``);
        the answer is stored in ``self.custom_descriptions_cache`` and handed
        to ``_save_cache``. Any exception during generation is printed and a
        generic one-sentence description is returned instead (not cached).
        """
        key = f"{self.dataset_type}_{objective.lower()}"
        if key in self.custom_descriptions_cache:
            print(f"Using cached custom description for '{objective}'")
            return self.custom_descriptions_cache[key]

        print(f"Generating custom description for objective '{objective}'...")

        try:
            # Description generation always goes through the OpenAI API,
            # independent of self.use_api.
            api_key = OPENAI_API_KEY or os.environ.get('OPENAI_API_KEY')
            if not api_key:
                raise ValueError("OpenAI API key required for custom description generation")

            openai_client = OpenAI(api_key=api_key)
            sample = self._get_dataset_sample()
            user_prompt = CUSTOM_DESCRIPTION_GENERATION_PROMPT.format(
                objective=objective,
                dataset_sample=sample
            )
            chat = [
                {"role": "system", "content": "You are an expert at creating objective descriptions."},
                {"role": "user", "content": user_prompt}
            ]

            completion = openai_client.chat.completions.create(
                model="gpt-4o-mini",
                messages=chat,
                temperature=0.3,
                max_tokens=100
            )
            description = completion.choices[0].message.content.strip()

            # Remember the result in memory and on disk.
            self.custom_descriptions_cache[key] = description
            self._save_cache('descriptions', self.custom_descriptions_cache)

            print(f"Custom description generated and cached for '{objective}'")
            return description

        except Exception as exc:
            print(f"Error generating custom description for '{objective}': {exc}")
            # Generic fallback description
            return f"{objective}: Evaluate the response based on the {objective} criterion."
    
    def _generate_custom_examples(self, objective: str) -> str:
        """Return in-context examples for ``objective``, generating them on a cache miss.

        The cache key is ``"<dataset_type>_<objective lowercased>"``. On a
        miss, one request is sent to the OpenAI chat API (``gpt-4o-mini``);
        the answer is stored in ``self.custom_examples_cache`` and handed to
        ``_save_cache``. Any exception during generation is printed and the
        generic text from ``_get_fallback_examples`` is returned instead
        (not cached).
        """
        key = f"{self.dataset_type}_{objective.lower()}"
        if key in self.custom_examples_cache:
            print(f"Using cached custom examples for '{objective}'")
            return self.custom_examples_cache[key]

        print(f"Generating custom in-context examples for objective '{objective}'...")

        try:
            # Example generation always goes through the OpenAI API,
            # independent of self.use_api.
            api_key = OPENAI_API_KEY or os.environ.get('OPENAI_API_KEY')
            if not api_key:
                raise ValueError("OpenAI API key required for custom examples generation")

            openai_client = OpenAI(api_key=api_key)
            sample = self._get_dataset_sample()
            user_prompt = CUSTOM_EXAMPLES_GENERATION_PROMPT.format(
                objective=objective,
                dataset_sample=sample
            )
            chat = [
                {"role": "system", "content": "You are an expert at creating evaluation examples."},
                {"role": "user", "content": user_prompt}
            ]

            completion = openai_client.chat.completions.create(
                model="gpt-4o-mini",
                messages=chat,
                temperature=0.5,  # Slightly higher for variety
                max_tokens=800
            )
            examples = completion.choices[0].message.content.strip()

            # Remember the result in memory and on disk.
            self.custom_examples_cache[key] = examples
            self._save_cache('examples', self.custom_examples_cache)

            print(f"Custom examples generated and cached for '{objective}'")
            return examples

        except Exception as exc:
            print(f"Error generating custom examples for '{objective}': {exc}")
            # Generic fallback examples
            return self._get_fallback_examples(objective)
    
    def _get_fallback_examples(self, objective: str) -> str:
        """Build the generic low/medium/high example text for ``objective``.

        The text is a fixed query line followed by three two-line example
        stanzas, each separated by a blank line, with no trailing newline.
        """
        lines = [
            "Query: Please explain this concept briefly.",
            "",
            "Low Score Response (Score: 2-3):",
            f"The response poorly addresses the {objective} criterion with minimal effort.",
            "",
            "Medium Score Response (Score: 5-6):",
            f"The response adequately addresses the {objective} criterion with average quality.",
            "",
            "High Score Response (Score: 8-9):",
            f"The response excellently fulfills the {objective} criterion with high quality.",
        ]
        return "\n".join(lines)

    def _parse_group_scores_json(self, response_text: str, num_responses: int) -> List[float]:
        """
        Parse scores from GROUP_SCORING_PROMPT JSON response.

        The text is scanned for the first JSON object that has a "scores"
        key, so extra prose around the object (including stray braces) is
        tolerated. Each score is clamped to [1, 10] and, when
        ``self.normalize_scores`` is set, mapped linearly onto [0, 1].

        Args:
            response_text: The raw response text containing JSON
            num_responses: The expected number of responses to score

        Returns:
            List of scores between 1 and 10 (or 0 and 1 if normalized)

        Raises:
            ValueError: If no usable JSON object is found, "scores" is missing
                or not a list, the number of scores differs from
                num_responses, or a score is not a finite number (booleans
                are rejected).
        """
        try:
            cleaned_text = response_text.strip()

            position = cleaned_text.find('{')
            if position == -1:
                raise ValueError("No JSON object found in response")

            # Try to decode an object at each '{' until one carries "scores".
            decoder = json.JSONDecoder()
            parsed_response = None
            decode_error = None
            saw_object = False
            while position != -1:
                try:
                    candidate, end = decoder.raw_decode(cleaned_text, position)
                except json.JSONDecodeError as e:
                    decode_error = e
                    position = cleaned_text.find('{', position + 1)
                    continue
                saw_object = True
                if "scores" in candidate:
                    parsed_response = candidate
                    break
                # Skip past this object so nested objects are not considered.
                position = cleaned_text.find('{', end)

            if parsed_response is None:
                if saw_object:
                    raise ValueError("Missing 'scores' key in JSON response")
                raise ValueError(f"JSON parsing error: {decode_error}")

            scores_list = parsed_response["scores"]
            if not isinstance(scores_list, list):
                raise ValueError("'scores' is not a list")

            # Validate the number of scores FIRST
            if len(scores_list) != num_responses:
                raise ValueError(f"Expected {num_responses} scores but got {len(scores_list)}")

            processed_scores = []
            for i, score in enumerate(scores_list):
                # bool is a subclass of int; true/false are not scores.
                if isinstance(score, bool) or not isinstance(score, (int, float)):
                    raise ValueError(f"Score at index {i} ('{score}') is not a number")

                score = float(score)
                # json accepts NaN/Infinity; clamping would turn NaN into 10.0.
                if not math.isfinite(score):
                    raise ValueError(f"Score at index {i} ('{score}') is not finite")

                # Clamp to valid range [1, 10]
                score = max(1.0, min(10.0, score))

                # Normalize if requested
                if self.normalize_scores:
                    score = (score - 1.0) / 9.0  # Normalize from [1, 10] to [0, 1]

                processed_scores.append(score)

            return processed_scores

        except ValueError:
            raise
        except Exception as e:
            # Anything else (e.g. non-string input) is reported as a ValueError too.
            raise ValueError(f"Error parsing scores: {e}") from e

    def _parse_single_score_json(self, response_text: str) -> float:
        """
        Parse score from response text. Handles both JSON format and plain integer.

        The text is first scanned for a JSON object whose "score" is a finite
        number (booleans are rejected); stray braces before or after the
        object are tolerated. If none is found, the first number appearing
        anywhere in the text is used. The score is clamped to [1, 10] and,
        when ``self.normalize_scores`` is set, mapped linearly onto [0, 1].

        Args:
            response_text: The raw response text (JSON like {"score": 7} or plain integer like "7")

        Returns:
            Score between 1 and 10 (or 0 and 1 if normalized)

        Raises:
            ValueError: If parsing fails or score is invalid
        """
        if not isinstance(response_text, str):
            raise ValueError(f"No score found in response: '{response_text}'")

        def to_score(value):
            """Clamp/normalize ``value``; return None if it is not a finite number."""
            # bool is a subclass of int; true/false are not scores.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                return None
            try:
                value = float(value)
            except OverflowError:
                return None
            # json accepts NaN/Infinity; clamping would turn NaN into 10.0.
            if not math.isfinite(value):
                return None
            value = max(1.0, min(10.0, value))
            if self.normalize_scores:
                value = (value - 1.0) / 9.0
            return value

        cleaned_text = response_text.strip()

        # Try JSON first: decode an object at each '{' until one has a usable score.
        decoder = json.JSONDecoder()
        position = cleaned_text.find('{')
        while position != -1:
            try:
                candidate, end = decoder.raw_decode(cleaned_text, position)
            except json.JSONDecodeError:
                position = cleaned_text.find('{', position + 1)
                continue
            score = to_score(candidate.get("score"))
            if score is not None:
                return score
            # Skip past this object so nested objects are not considered.
            position = cleaned_text.find('{', end)

        # Fallback: extract first number using regex (handles plain integer output)
        numbers = re.findall(r'\b\d+(?:\.\d+)?\b', cleaned_text)
        if numbers:
            score = to_score(float(numbers[0]))
            if score is not None:
                return score

        raise ValueError(f"No score found in response: '{response_text}'")

    def _format_single_score_prompt(self, query: str, response: str, objective: str) -> str:
        """
        Format the prompt for scoring a single response.

        The rubric comes from the predefined rubrics for ``self.dataset_type``
        when the lowercased objective has one, otherwise from
        ``_generate_custom_rubric``. In-context examples come from
        ``IN_CONTEXT_EXAMPLES_DICT`` (key ``"<dataset_type>_<objective
        lowercased>"``) when present, otherwise from
        ``_generate_custom_examples``. Both generators may call the OpenAI API.

        Args:
            query: The input query/prompt
            response: The response to evaluate
            objective: The objective to score against

        Returns:
            Formatted prompt string

        Raises:
            ValueError: If ``self.dataset_type`` has no entry in
                DATASET_RUBRICS_DICT.
        """
        try:
            scoring_rubrics = DATASET_RUBRICS_DICT[self.dataset_type]
        except (KeyError, TypeError) as e:
            # Only a failed lookup means "unknown dataset"; other errors
            # (and KeyboardInterrupt/SystemExit) must not be masked.
            print(f"Warning: Unknown dataset_type '{self.dataset_type}', add to constants.py DATASET_RUBRICS_DICT")
            raise ValueError(f"Unknown dataset_type '{self.dataset_type}'") from e

        objective_key = objective.lower()

        # Check if rubric exists in predefined rubrics
        if objective_key in scoring_rubrics:
            rubric = scoring_rubrics[objective_key]
            print(f'Using predefined {self.dataset_type.upper()} rubric for "{objective}"')
        else:
            # Generate or retrieve custom rubric
            print(f'No predefined rubric for "{objective}", checking for custom rubric...')
            rubric = self._generate_custom_rubric(objective)

        # Get in-context examples
        cache_key = f"{self.dataset_type}_{objective_key}"
        if cache_key in IN_CONTEXT_EXAMPLES_DICT:
            in_context_examples = IN_CONTEXT_EXAMPLES_DICT[cache_key]
            print(f'Using predefined in-context examples for "{objective}"')
        else:
            print(f'No predefined in-context examples for "{objective}", checking for custom examples...')
            in_context_examples = self._generate_custom_examples(objective)

        prompt = SINGLE_SCORING_PROMPT.format(
            objective=objective,
            query=query,
            response=response,
            scoring_rubric=rubric,
            in_context_examples=in_context_examples
        )

        print(f'Formatted single scoring prompt for "{objective}"')

        return prompt

    def _format_group_score_prompt(self, query: str, responses: List[str], objective: str) -> str:
        """
        Build the prompt that asks for scores of several responses at once.

        In-context examples are taken from ``IN_CONTEXT_EXAMPLES_DICT`` (key
        ``"<dataset_type>_<objective lowercased>"``) when present, otherwise
        from ``_generate_custom_examples``. The responses are rendered as
        "Response <n>:" sections numbered from 1 and separated by blank lines.

        Args:
            query: The input query/prompt
            responses: List of responses to evaluate
            objective: The objective to score against

        Returns:
            Formatted prompt string
        """
        examples_key = f"{self.dataset_type}_{objective.lower()}"
        if examples_key not in IN_CONTEXT_EXAMPLES_DICT:
            print(f'No predefined in-context examples for "{objective}", checking for custom examples...')
            in_context_examples = self._generate_custom_examples(objective)
        else:
            in_context_examples = IN_CONTEXT_EXAMPLES_DICT[examples_key]
            print(f'Using predefined in-context examples for "{objective}"')

        # One numbered section per response.
        numbered_sections = []
        for number, text in enumerate(responses, start=1):
            numbered_sections.append(f"Response {number}:\n{text}")
        responses_list = "\n\n".join(numbered_sections)

        prompt = GROUP_SCORING_PROMPT.format(
            objective=objective,
            in_context_examples=in_context_examples,
            query=query,
            responses_list=responses_list,
            num_responses=len(responses)
        )

        print(f'Formatted group scoring prompt for "{objective}"')
        return prompt

    def fallback_score_single_objective(self, query: str, response: str, objective: str) -> float:
        """
        Fallback scoring method using SINGLE_SCORING_PROMPT.
        This is called when GROUP_SCORING_PROMPT fails.

        The prompt is built by ``_format_single_score_prompt``; the model
        (OpenAI API when ``self.use_api`` is set, otherwise the local model)
        is then queried up to ``MAX_NUM_SCORING_RETRIES`` times until its
        output can be parsed by ``_parse_single_score_json``.

        Args:
            query: The input query/prompt
            response: The response to evaluate
            objective: The objective to score against

        Returns:
            Score between 1 and 10 (or 0 and 1 if normalized)

        Raises:
            ValueError: If ``self.dataset_type`` has no predefined rubrics.
            RuntimeError: If no attempt produced a parseable score (also when
                zero attempts are configured).
        """
        # Shared prompt construction (rubric + in-context examples lookup).
        prompt = self._format_single_score_prompt(query, response, objective)

        print(f'Using fallback single scoring prompt for "{objective}"')

        last_error = None
        for attempt in range(MAX_NUM_SCORING_RETRIES):
            try:
                if self.use_api:
                    # Non-"gpt" model names are replaced by a default OpenAI model.
                    api_response = self.client.chat.completions.create(
                        model=self.model_name if self.model_name.startswith("gpt") else "gpt-4.1-mini",
                        messages=[
                            {"role": "system", "content": SCORING_SYSTEM_PROMPT},
                            {"role": "user", "content": prompt}
                        ],
                        temperature=0.0,
                        max_tokens=500  # Sufficient for JSON response with reasoning
                    )
                    generated_text = api_response.choices[0].message.content.strip()
                else:
                    # NOTE(review): temperature is presumably ignored with
                    # do_sample=False - confirm in generate_huggingface_response.
                    generated_text = generate_huggingface_response(
                        model=self.model,
                        tokenizer=self.tokenizer,
                        prompt=prompt,
                        system_prompt=SCORING_SYSTEM_PROMPT,
                        max_new_tokens=500,
                        do_sample=False,
                        temperature=0.1
                    )

                return self._parse_single_score_json(generated_text)

            except Exception as e:
                # API, generation and parsing errors all trigger the next attempt.
                last_error = e
                print(f"Fallback attempt {attempt + 1}/{MAX_NUM_SCORING_RETRIES} failed for {objective}: {e}")

        # Reached when every attempt failed or MAX_NUM_SCORING_RETRIES is 0.
        print(f"All {MAX_NUM_SCORING_RETRIES} fallback attempts failed for {objective}")
        raise RuntimeError(
            f"Fallback scoring failed after {MAX_NUM_SCORING_RETRIES} attempts: {last_error}"
        ) from last_error

    def score_single_objective(self, query: str, response: str, objective: str, example_responses: Optional[List[str]] = None, example_scores: Optional[List[int]] = None) -> float:
        """
        Score a single response for one objective.

        The prompt is built by `_format_single_score_prompt` and sent either to
        the OpenAI chat completions API (when `self.use_api` is truthy) or to the
        local Hugging Face model. The generated text is parsed with
        `_parse_single_score_json`. Generation and parsing are retried up to
        MAX_NUM_SCORING_RETRIES times; errors inside that loop are printed and
        never propagate. Errors raised while formatting the prompt do propagate.

        Args:
            query: The input query/prompt
            response: The response to evaluate
            objective: The objective to score against
            example_responses: Currently unused; kept so existing callers that
                pass it keep working
            example_scores: Currently unused; kept so existing callers that
                pass it keep working

        Returns:
            Score between 1 and 10 (or 0 and 1 if normalized). If every attempt
            fails, a neutral default is returned instead: 0.5 when
            `self.normalize_scores` is truthy, otherwise 5.0.
        """
        print(f'Formatting single scoring prompt for "{objective}"')
        prompt = self._format_single_score_prompt(query, response, objective)

        # Retry loop for API call and parsing
        for attempt in range(MAX_NUM_SCORING_RETRIES):
            try:
                if self.use_api:
                    # Use OpenAI API. Model names that do not start with "gpt"
                    # are replaced by a default OpenAI model.
                    # NOTE(review): the group scoring methods default to
                    # "gpt-4.1-mini" while this one defaults to "gpt-4o-mini";
                    # confirm whether the difference is intentional.
                    api_response = self.client.chat.completions.create(
                        model=self.model_name if self.model_name.startswith("gpt") else "gpt-4o-mini",
                        messages=[
                            {"role": "system", "content": SCORING_SYSTEM_PROMPT},
                            {"role": "user", "content": prompt}
                        ],
                        temperature=0.0,
                        max_tokens=500
                    )
                    generated_text = api_response.choices[0].message.content.strip()
                else:
                    # Use local model with standardized scoring function
                    generated_text = generate_huggingface_response(
                        model=self.model,
                        tokenizer=self.tokenizer,
                        prompt=prompt,
                        system_prompt=SCORING_SYSTEM_PROMPT,
                        max_new_tokens=500,
                        do_sample=False,
                        temperature=0.1
                    )

                # Parse score using unified method
                return self._parse_single_score_json(generated_text)

            except ValueError as e:
                # Parsing failed; fall through to the next attempt
                print(f"Attempt {attempt + 1}/{MAX_NUM_SCORING_RETRIES} parsing failed for {objective}: {e}")

            except Exception as e:
                # API/model call failed; fall through to the next attempt
                print(f"Attempt {attempt + 1}/{MAX_NUM_SCORING_RETRIES} failed for {objective}: {e}")

        # Every attempt failed (or no attempts are configured): return the
        # neutral middle score so callers always receive a float.
        return 0.5 if self.normalize_scores else 5.0

    def group_score_single_objective(self, query: str, responses: List[str], objective: str) -> List[float]:
        """
        Score a trajectory of responses for one objective.

        All responses are scored together with one prompt built by
        `_format_group_score_prompt`, sent to the OpenAI chat completions API
        (when `self.use_api` is truthy) or to the local Hugging Face model, and
        parsed with `_parse_group_scores_json`. This is retried up to
        MAX_NUM_SCORING_RETRIES times; the first failure is recorded through
        `_log_scoring_failure`. If every group attempt fails, each response is
        scored on its own with `score_single_objective`.

        Args:
            query: The input query/prompt
            responses: List of responses to evaluate as a trajectory
            objective: The objective to score against

        Returns:
            List of scores between 1 and 10 (or 0 and 1 if normalized) for each
            response. An empty `responses` list yields an empty list. In the
            individual fallback, a response whose scoring raises gets the
            neutral default (0.5 when `self.normalize_scores` is truthy,
            otherwise 5.0).
        """
        if not responses:
            # Nothing to score; do not spend API/model calls on an empty batch
            return []

        prompt = self._format_group_score_prompt(query, responses, objective)

        # Retry loop for API call and parsing
        for attempt in range(MAX_NUM_SCORING_RETRIES):
            try:
                # Get scores from API or local model
                if self.use_api:
                    # Use OpenAI API. Model names that do not start with "gpt"
                    # are replaced by a default OpenAI model.
                    api_response = self.client.chat.completions.create(
                        model=self.model_name if self.model_name.startswith("gpt") else "gpt-4.1-mini",
                        messages=[
                            {"role": "system", "content": SCORING_SYSTEM_PROMPT},
                            {"role": "user", "content": prompt}
                        ],
                        temperature=0.0,
                        max_tokens=1000  # Increased for JSON response with reasoning
                    )
                    generated_text = api_response.choices[0].message.content.strip()
                else:
                    # Use local model with standardized scoring function
                    generated_text = generate_huggingface_response(
                        model=self.model,
                        tokenizer=self.tokenizer,
                        prompt=prompt,
                        system_prompt=SCORING_SYSTEM_PROMPT,
                        max_new_tokens=1000,  # Increased for JSON response with reasoning
                        do_sample=False,
                        temperature=0.1
                    )

                # Parse the JSON response; success ends the retry loop
                return self._parse_group_scores_json(generated_text, len(responses))

            except Exception as e:
                print(f"Attempt {attempt + 1}/{MAX_NUM_SCORING_RETRIES} failed for {objective}: {e}")

                # Log failure on first attempt only
                if attempt == 0:
                    self._log_scoring_failure(objective, query, responses, str(e), method="group_score")

        # Reaching this point means no group attempt succeeded
        print(f"All {MAX_NUM_SCORING_RETRIES} group scoring attempts failed for {objective}")
        print("Falling back to individual scoring for each response...")

        # Fallback: score each response individually
        individual_scores = []
        for i, response in enumerate(responses):
            try:
                print(f"Scoring response {i+1}/{len(responses)} individually...")
                score = self.score_single_objective(query, response, objective)
                # score_single_objective is annotated to return a float; the
                # list check is purely defensive.
                if isinstance(score, list):
                    individual_scores.append(score[0])
                else:
                    individual_scores.append(score)
            except Exception as ind_e:
                print(f"Failed to score response {i+1} individually: {ind_e}")
                # Use a default middle score if individual scoring also fails
                default_score = 0.5 if self.normalize_scores else 5.0
                individual_scores.append(default_score)

        print(f"Individual scoring completed for {objective}")
        return individual_scores

    async def async_score_single_objective(self, query: str, response: str, objective: str) -> float:
        """
        Async version of score_single_objective for parallel API calls.

        When `self.use_api` is falsy this delegates to the synchronous
        `score_single_objective` (local model). Otherwise the prompt from
        `_format_single_score_prompt` is sent through `self.async_client` and
        the reply is parsed with `_parse_single_score_json`, retrying up to
        MAX_NUM_SCORING_RETRIES times. Between attempts the coroutine sleeps
        5 seconds after a parsing error (ValueError) and 10 seconds after any
        other error.

        Args:
            query: The input query/prompt
            response: The response to evaluate
            objective: The objective to score against

        Returns:
            Score between 1 and 10 (or 0 and 1 if normalized). If every attempt
            fails, a neutral default is returned instead: 0.5 when
            `self.normalize_scores` is truthy, otherwise 5.0.
        """
        if not self.use_api:
            # Fall back to sync version for local models
            return self.score_single_objective(query, response, objective)

        prompt = self._format_single_score_prompt(query, response, objective)

        print(f'Using async single scoring prompt for "{objective}"')

        # Retry loop for API call and parsing
        for attempt in range(MAX_NUM_SCORING_RETRIES):
            try:
                # Use async client for API call. Model names that do not start
                # with "gpt" are replaced by a default OpenAI model.
                api_response = await self.async_client.chat.completions.create(
                    model=self.model_name if self.model_name.startswith("gpt") else "gpt-4o-mini",
                    messages=[
                        {"role": "system", "content": SCORING_SYSTEM_PROMPT},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.0,
                    max_tokens=500
                )
                generated_text = api_response.choices[0].message.content.strip()

                # Parse score using unified method
                return self._parse_single_score_json(generated_text)

            except ValueError as e:
                # Parsing failed, retry after a short pause
                print(f"Async attempt {attempt + 1}/{MAX_NUM_SCORING_RETRIES} parsing failed for {objective}: {e}")
                retry_delay = 5

            except Exception as e:
                # API call failed, retry after a longer pause
                print(f"Async attempt {attempt + 1}/{MAX_NUM_SCORING_RETRIES} failed for {objective}: {e}")
                retry_delay = 10

            # No point in waiting once the final attempt has failed
            if attempt == MAX_NUM_SCORING_RETRIES - 1:
                break
            await asyncio.sleep(retry_delay)

        # Every attempt failed (or no attempts are configured): return the
        # neutral middle score so callers always receive a float.
        return 0.5 if self.normalize_scores else 5.0

    async def async_group_score_single_objective(self, query: str, responses: List[str], objective: str) -> List[float]:
        """
        Async version of group_score_single_objective for parallel API calls.

        When `self.use_api` is falsy this delegates to the synchronous
        `group_score_single_objective` (local model). Otherwise the prompt from
        `_format_group_score_prompt` is sent through `self.async_client` and the
        reply is parsed with `_parse_group_scores_json`, retrying up to
        MAX_NUM_SCORING_RETRIES times; the first failure is recorded through
        `_log_scoring_failure`. If every group attempt fails, each response is
        scored on its own by awaiting `async_score_single_objective`, so the
        fallback does not block the event loop while other scoring tasks run.

        Args:
            query: The input query/prompt
            responses: List of responses to evaluate as a trajectory
            objective: The objective to score against

        Returns:
            List of scores between 1 and 10 (or 0 and 1 if normalized) for each
            response. An empty `responses` list yields an empty list. In the
            individual fallback, a response whose scoring raises gets the
            neutral default (0.5 when `self.normalize_scores` is truthy,
            otherwise 5.0).
        """
        if not responses:
            # Nothing to score; do not spend API/model calls on an empty batch
            return []

        if not self.use_api:
            # Fall back to sync version for local models
            return self.group_score_single_objective(query, responses, objective)

        prompt = self._format_group_score_prompt(query, responses, objective)

        print(f'Using async group scoring prompt for "{objective}"')

        # Retry loop for API call and parsing
        for attempt in range(MAX_NUM_SCORING_RETRIES):
            try:
                # Use async client for API call. Model names that do not start
                # with "gpt" are replaced by a default OpenAI model.
                api_response = await self.async_client.chat.completions.create(
                    model=self.model_name if self.model_name.startswith("gpt") else "gpt-4.1-mini",
                    messages=[
                        {"role": "system", "content": SCORING_SYSTEM_PROMPT},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.0,
                    max_tokens=1000
                )
                generated_text = api_response.choices[0].message.content.strip()

                # Parse the JSON response; success ends the retry loop
                return self._parse_group_scores_json(generated_text, len(responses))

            except Exception as e:
                print(f"Async attempt {attempt + 1}/{MAX_NUM_SCORING_RETRIES} failed for {objective}: {e}")

                # Log failure on first attempt only
                if attempt == 0:
                    self._log_scoring_failure(objective, query, responses, str(e), method="async_group_score")

        # Reaching this point means no group attempt succeeded
        print(f"All {MAX_NUM_SCORING_RETRIES} async group scoring attempts failed for {objective}")
        print("Falling back to async individual scoring...")

        # Fallback: score each response individually. The async single scorer
        # is awaited (one response at a time) instead of calling the blocking
        # sync scorer, which would stall every other coroutine on the loop.
        individual_scores = []
        for i, response in enumerate(responses):
            try:
                print(f"Scoring response {i+1}/{len(responses)} individually...")
                score = await self.async_score_single_objective(query, response, objective)
                # The single scorer is annotated to return a float; the list
                # check is purely defensive.
                if isinstance(score, list):
                    individual_scores.append(score[0])
                else:
                    individual_scores.append(score)
            except Exception as ind_e:
                print(f"Failed to score response {i+1} individually: {ind_e}")
                default_score = 0.5 if self.normalize_scores else 5.0
                individual_scores.append(default_score)

        print(f"Individual scoring completed for {objective}")
        return individual_scores
