"""
Filter Reranker Results and Prepare SFT Data

This script filters reranker results, builds SFT data, and converts to LLaMA-Factory format.

Filtering logic:
    1. For each data point, check ALL 8 candidate samples against thresholds
    2. Filter by: total >= min_total_score AND each dimension >= min (if min >= 0),
       plus format checks (no template placeholders, all 3 section headers exactly once)
    3. Select best among qualified samples by: (total, mechanism, methodology, motivation) descending

All outputs are saved to --output_dir:
    - sft_filtered_{filter_str}.jsonl      : Intermediate format with metadata
    - sft_llama_factory_{filter_str}.jsonl : LLaMA-Factory conversations format

Usage:
    python hypothesis_composition_filter_and_prepare_sft.py \
        --input_path /path/to/reranked.jsonl \
        --output_dir /path/to/output/ \
        --min_total_score 8 \
        --min_motivation 2 \
        --min_mechanism 3 \
        --min_methodology 2 \
        --normalize 1
"""

import os
import sys
import json
import argparse
import re
from typing import Dict, Optional, Tuple
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial

# Add paths for imports
# current_dir is the directory containing this file; parent_dir is the directory
# one level above it. Each entry is inserted at position 0, so the entry added
# last ('Legacy') is searched first.
# NOTE(review): presumably these directories hold prompt_store, common_utils and
# hypothesis_composition_reasoning_trace imported below - confirm against the repo layout.
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)
sys.path.insert(0, os.path.join(parent_dir, 'Preprocessing', 'sft_data_preparation'))
sys.path.insert(0, os.path.join(current_dir, 'Legacy'))  # For hypothesis_composition_reasoning_trace

from prompt_store import instruction_prompts
from hypothesis_composition_reasoning_trace import sample_one_MDP_for_one_paper_from_hypothesis_components
from common_utils import clean_eos_tokens


def normalize_format(hypothesis: str, unify_linebreak: bool = True) -> str:
    """
    Rewrite the three section headers of a hypothesis into their canonical form.

    Two passes are applied:
    1. Every accepted header variant (optional bold markers, flexible spacing,
       any letter case) is replaced by the canonical header, which always ends
       with a single trailing space, e.g. "- Motivation (WHY): ".
    2. When unify_linebreak is True, a newline directly after a canonical header
       (plus any spaces/tabs that start the next line) is removed so the content
       continues on the header line.

    Only the text matched by the header patterns and the line break right after
    a header is touched; the result is stripped of leading/trailing whitespace.

    Args:
        hypothesis: The hypothesis text to normalize. Falsy values (None, "")
                    are returned unchanged.
        unify_linebreak: If True, pull content that starts on the line after a
                         header up onto the header line.

    Returns:
        The normalized, stripped hypothesis text.
    """
    if not hypothesis:
        return hypothesis

    # (canonical header including trailing space, regex accepting its variants)
    header_specs = (
        ('- Motivation (WHY): ', r'- *\*{0,2} *Motivation *\(WHY\) *\*{0,2} *: *\*{0,2} *'),
        ('- Mechanism (HOW IT WORKS): ', r'- *\*{0,2} *Mechanism *\(HOW IT WORKS\) *\*{0,2} *: *\*{0,2} *'),
        ("- Methodology (HOW IT'S INTEGRATED): ", r"- *\*{0,2} *Methodology *\(HOW IT'S INTEGRATED\) *\*{0,2} *: *\*{0,2} *"),
    )

    text = hypothesis

    # Pass 1: collapse every header variant onto its canonical spelling
    for canonical, variant_regex in header_specs:
        text = re.compile(variant_regex, re.IGNORECASE).sub(canonical, text)

    # Pass 2: join "header\n   content" into "header content"
    if unify_linebreak:
        for canonical, _ in header_specs:
            break_after_header = re.compile(re.escape(canonical) + r'\n[ \t]*')
            text = break_after_header.sub(canonical, text)

    return text.strip()


def has_required_sections(text: str) -> bool:
    """
    Tell whether the text has each of the three canonical section headers exactly once.

    The headers are matched as literal, case-sensitive substrings:
    - '- Motivation (WHY):'
    - '- Mechanism (HOW IT WORKS):'
    - "- Methodology (HOW IT'S INTEGRATED):"

    A header that is missing, or that occurs more than once, makes the check fail.
    Callers are expected to run normalize_format() first so header variants have
    already been rewritten to these exact strings.

    Args:
        text: The generated hypothesis text (should be normalized first)

    Returns:
        True if every header occurs exactly once; False otherwise, including
        for empty or None input.
    """
    if not text:
        return False

    expected_headers = (
        '- Motivation (WHY):',
        '- Mechanism (HOW IT WORKS):',
        "- Methodology (HOW IT'S INTEGRATED):",
    )

    return all(text.count(header) == 1 for header in expected_headers)


def has_template_placeholders(text: str) -> bool:
    """
    Detect unfilled prompt-template placeholders left in a generated hypothesis.

    The check looks for a fixed list of literal placeholder fragments (taken from
    the output-format example of the prompt template) rather than any generic
    "[...]" pattern, because scientific text legitimately uses square brackets
    (isotope labels such as [11C], citations, mathematical notation).

    Args:
        text: The generated hypothesis text to check

    Returns:
        True if at least one known placeholder fragment occurs in the text;
        False otherwise, including for empty or None input.
    """
    if not text:
        return False

    known_fragments = (
        # Output-format example in the prompt
        "[Key concept derived from or inspired by the inspiration paper]",
        "[Why this addresses a gap",
        "[How the concept works",
        "[How to integrate it",
        "[specific implementation steps]",
        # Reasoning section of the prompt
        "[Your reasoning process here",
        # Tail fragments, to catch slightly altered copies
        "what specific limitation does it solve?]",
        "in this context]",
    )

    for fragment in known_fragments:
        if fragment in text:
            return True
    return False


def check_sample_qualifies(
    sample_info: Optional[Dict],
    threshold: float,
    min_motivation: int = -1,
    min_mechanism: int = -1,
    min_methodology: int = -1,
    check_format: bool = True,
    normalize: bool = True
) -> bool:
    """
    Decide whether one candidate sample passes the score and format filters.

    Score filter: the sample's 'total' must be at least `threshold`, and each
    per-dimension score must be at least its minimum whenever that minimum is
    >= 0 (a negative minimum disables the constraint). Missing or None scores
    count as 0.

    Format filter (only when check_format is True): the sample must carry a
    non-empty 'generated_hypothesis' that contains no template placeholders and,
    after optional normalization, holds all three section headers exactly once.

    Running the format filter here, before the best sample is chosen, lets a
    lower-scoring but well-formed candidate win when the top-scoring one is
    malformed.

    Args:
        sample_info: Sample dict with 'total', 'scores', and 'generated_hypothesis' fields
        threshold: Minimum total score (0-12)
        min_motivation: Minimum motivation score (0-4), -1 means no constraint
        min_mechanism: Minimum mechanism score (0-4), -1 means no constraint
        min_methodology: Minimum methodology score (0-4), -1 means no constraint
        check_format: If True, also check template placeholders and required sections
        normalize: If True, normalize format before the section check
                   (only used if check_format=True)

    Returns:
        True if the sample passes every enabled check, False otherwise
        (also False when sample_info or its 'scores' entry is None).
    """
    if sample_info is None:
        return False
    scores = sample_info.get('scores')
    if scores is None:
        return False

    # Read every score up front; None / missing values fall back to 0
    total = sample_info.get('total', 0) or 0
    dimension_names = ('motivation', 'mechanism', 'methodology')
    dimension_scores = {name: scores.get(name, 0) or 0 for name in dimension_names}

    if total < threshold:
        return False

    # A negative floor means "no constraint" for that dimension
    floors = zip(dimension_names, (min_motivation, min_mechanism, min_methodology))
    for name, floor in floors:
        if floor >= 0 and dimension_scores[name] < floor:
            return False

    if not check_format:
        return True

    hyp = sample_info.get('generated_hypothesis', '')
    if not hyp:
        return False
    if has_template_placeholders(hyp):
        return False

    to_check = normalize_format(hyp, unify_linebreak=True) if normalize else hyp
    return has_required_sections(to_check)


def get_sample_sort_key(sample_info: Optional[Dict]) -> Tuple[float, int, int, int]:
    """
    Get sort key for ranking samples.

    Ranking priority (descending): total > mechanism > methodology > motivation

    Missing or None values are treated as 0, and a sample whose 'scores' entry
    is absent or None is treated as having no per-dimension scores, so the key
    can always be computed and compared.

    Args:
        sample_info: Sample dict with 'total' and 'scores' fields, or None

    Returns:
        Tuple of (total, mechanism, methodology, motivation) for sorting;
        (0, 0, 0, 0) when sample_info is None.
    """
    if sample_info is None:
        return (0, 0, 0, 0)
    # `.get('scores', {})` would still yield None when the key is present with a
    # None value, so fall back explicitly.
    scores = sample_info.get('scores') or {}
    return (
        sample_info.get('total', 0) or 0,
        scores.get('mechanism', 0) or 0,
        scores.get('methodology', 0) or 0,
        scores.get('motivation', 0) or 0
    )


def build_sft_sample(
    sample: Dict,
    sft_qa_data_dir: str,
    prompts: list,
    normalize: bool = True
) -> Optional[Dict]:
    """
    Build a complete SFT training sample (intermediate format with metadata).

    The input prompt is assembled by interleaving the six prompt segments with
    the research question, background survey, previous hypothesis, inspiration
    title and inspiration abstract. The output is the reasoning trace wrapped in
    <think> tags followed by the (optionally normalized) hypothesis.

    Args:
        sample: Reranker-selected sample with 'file_name', 'step_idx',
                'generated_hypothesis', 'reasoning_trace', etc.
        sft_qa_data_dir: Directory containing SFT QA data (for research question, background)
        prompts: Prompt templates; indices 0-5 are used
        normalize: Whether to normalize hypothesis format

    Returns:
        SFT sample dict with 'input', 'output' and metadata fields, or None when
        the sample cannot be built: QA file missing, MDP road cannot be built,
        step_idx out of range, reasoning trace or hypothesis empty, template
        placeholders present, required sections missing, or the final output
        lacks the <think> structure.

    Raises:
        KeyError: if 'file_name' or 'step_idx' is missing from sample.
        Errors from reading/parsing the QA file or indexing the inspirations are
        not caught here and propagate to the caller.

    Note:
        This returns intermediate format. Use convert_and_save_llama_factory_format()
        to convert to LLaMA-Factory conversations format.

    Supports two modes:
        1. Normal mode: Uses GT inspirations from sft_qa_data
        2. Bounded mode: Uses bounded inspirations from sample (when 'tier' field exists)
    """
    file_name = sample['file_name']
    step_idx = sample['step_idx']

    # Detect bounded mode
    is_bounded_mode = sample.get('tier') is not None

    # Load SFT QA data
    sft_qa_path = os.path.join(sft_qa_data_dir, file_name)
    if not os.path.exists(sft_qa_path):
        return None

    # JSON is UTF-8; do not depend on the platform's default encoding
    with open(sft_qa_path, 'r', encoding='utf-8') as f:
        sft_qa_data = json.load(f)

    research_question = sft_qa_data.get("research_question", "")
    background_survey = sft_qa_data.get("background_survey", "")
    inspirations = sft_qa_data.get("inspiration", [])
    hypothesis_components = sft_qa_data.get("hypothesis_components", {})

    try:
        mdp_road = sample_one_MDP_for_one_paper_from_hypothesis_components(
            inspirations, hypothesis_components, file_name
        )
    except Exception as e:
        print(f"Warning: Could not build MDP road for {file_name}: {e}")
        return None

    # Reject negative indices too: they would silently index from the end
    if step_idx < 0 or step_idx >= len(mdp_road):
        print(f"Warning: step_idx {step_idx} out of range for {file_name}")
        return None

    # Get inspiration title and abstract
    if is_bounded_mode:
        # Bounded mode: use bounded inspiration from sample
        title = sample.get('inspiration_title', '')
        abstract = sample.get('inspiration_abstract', '')
    else:
        # Normal mode: use GT inspiration from sft_qa_data
        insp_id, _ = mdp_road[step_idx]
        cur_insp = inspirations[insp_id]
        title = cur_insp.get("found_title", "")
        abstract = cur_insp.get("found_abstract", "")

    # Build prev_hyp from the second element of every earlier MDP step
    if step_idx > 0:
        prev_hyp = "\n\n".join(step[1] for step in mdp_road[:step_idx])
    else:
        prev_hyp = "No previous hypothesis."

    # Build input prompt
    input_prompt = (
        prompts[0] + research_question +
        prompts[1] + background_survey +
        prompts[2] + prev_hyp +
        prompts[3] + title +
        prompts[4] + abstract +
        prompts[5]
    )

    # Build output (reasoning trace + hypothesis)
    reasoning_trace = sample.get('reasoning_trace', '')
    generated_hypothesis = sample.get('generated_hypothesis', '')

    # Must have reasoning trace (matches hypothesis_composition_prepare_sft_data_to_go.py)
    if not reasoning_trace or not generated_hypothesis:
        return None  # Filter out samples without reasoning trace

    # Note: Template placeholder check and required sections check are already done
    # in check_sample_qualifies() BEFORE selecting best sample. The checks below
    # are safety nets and should rarely trigger.

    # Safety check: template placeholders (should already be filtered)
    if has_template_placeholders(generated_hypothesis):
        return None

    # Apply normalization (unify header format and line breaks)
    if normalize:
        generated_hypothesis = normalize_format(generated_hypothesis)

    # Safety check: required sections (should already be filtered)
    if not has_required_sections(generated_hypothesis):
        return None

    # Format output: <think>\n + reasoning + \n</think>\n\n + hypothesis
    # (matches hypothesis_composition_prepare_sft_data_to_go.py format)
    output = f"<think>\n{reasoning_trace}\n</think>\n\n{generated_hypothesis}"

    # Clean any existing EOS tokens - let the training framework handle them
    output = clean_eos_tokens(output)

    # Validate format: must have proper <think> structure
    if not output.startswith('<think>\n') or '\n</think>\n\n' not in output:
        return None  # Will be counted as failed

    # Get score from reranker_scores (tolerate a missing or None entry)
    reranker_score = (sample.get('reranker_scores') or {}).get('selected_score', 0) or 0

    # Return intermediate format with all metadata (NOT LLaMA-Factory format yet)
    result = {
        'file_name': file_name,
        'step_idx': step_idx,
        'input': input_prompt,
        'output': output,
        'reranker_score': reranker_score,
        'sample_idx': sample.get('sample_idx', 0)
    }

    # Add bounded mode metadata if present
    if is_bounded_mode:
        result['tier'] = sample.get('tier')
        result['bounded_similarity'] = sample.get('bounded_similarity')
        result['gt_inspiration_title'] = sample.get('gt_inspiration_title')
        result['inspiration_title'] = title  # The bounded inspiration used

    return result


def convert_and_save_llama_factory_format(input_path: str, output_path: str):
    """
    Convert intermediate SFT data to LLaMA-Factory conversations format and save.

    The whole input is read and converted before the output file is opened.
    Blank lines in the input are skipped. Both files are handled as UTF-8,
    because records are written with ensure_ascii=False and may contain
    non-ASCII text.

    Args:
        input_path: Path to intermediate SFT data (JSONL with 'input' and 'output' fields)
        output_path: Path to save LLaMA-Factory format data; its parent
                     directory is created if needed

    Output format (LLaMA-Factory conversations), one JSON object per line:
        {
            "conversations": [
                {"role": "user", "content": "..."},
                {"role": "assistant", "content": "..."}
            ]
        }

    Raises:
        KeyError: if a record lacks 'input' or 'output'.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    print("="*60)
    print("Converting and Saving to LLaMA-Factory Format")
    print("="*60)
    print(f"Input: {input_path}")
    print(f"Output: {output_path}")

    results = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Converting"):
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline at EOF
            sample = json.loads(line)
            # Convert to LLaMA-Factory conversations format
            results.append({
                "conversations": [
                    {"role": "user", "content": sample['input']},
                    {"role": "assistant", "content": sample['output']}
                ]
            })

    # Save
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        for result in results:
            f.write(json.dumps(result, ensure_ascii=False) + '\n')

    print(f"\n✅ Converted {len(results)} samples")
    print(f"   Saved to: {output_path}")
    print(f"\n📋 Use with LLaMA-Factory:")
    print(f"   - Template: deepseekr1")
    print(f"   - Dataset format: sharegpt")


def main():
    """
    Command-line entry point: filter reranker output and write SFT training data.

    Steps:
    1. Parse arguments and derive output file names from the active thresholds.
    2. Load the reranked JSONL (one data point per line; blank lines skipped).
    3. For each data point, keep the best candidate that passes the score and
       format filters. In bounded mode (first data point has a 'tier' field),
       data points are grouped by (file_name, step_idx) and the highest-priority
       tier (hard > medium > easy) with a qualified candidate wins.
    4. Build SFT samples in a thread pool, sort them deterministically, and
       write the intermediate JSONL.
    5. Convert the intermediate file to LLaMA-Factory conversations format.

    Raises:
        ValueError: in bounded mode, if a data point has a missing or unknown tier.
    """
    parser = argparse.ArgumentParser(description='Filter and prepare SFT data')

    # NOTE: argparse applies %-formatting to help strings, so a literal percent
    # sign must be written as '%%'.
    parser.add_argument("--input_path", type=str, required=True,
                       help="Path to reranked.jsonl from hypothesis_composition_llm_reranker.py")
    parser.add_argument("--output_dir", type=str, required=True,
                       help="Directory to save all outputs")
    parser.add_argument("--sft_qa_data_dir", type=str,
                       default="<YOUR_SFT_QA_DATA_DIR>",
                       help="Path to SFT QA data directory")
    parser.add_argument("--min_total_score", type=float, default=8.0,
                       help="Minimum total score (0-12 scale, default: 8.0)")
    parser.add_argument("--min_motivation", type=int, default=-1,
                       help="Minimum motivation score (0-4). -1 means no constraint (default: -1)")
    parser.add_argument("--min_mechanism", type=int, default=3,
                       help="Minimum mechanism score (0-4). -1 means no constraint (default: 3, ensuring core method ~75%%+ match)")
    parser.add_argument("--min_methodology", type=int, default=-1,
                       help="Minimum methodology score (0-4). -1 means no constraint (default: -1)")
    parser.add_argument("--normalize", type=int, default=1,
                       help="Normalize hypothesis section headers (1=yes, 0=no, default: 1). "
                            "When enabled, standardizes bold/spacing/case variants such as "
                            "'- **Motivation (WHY):**' to the canonical format: "
                            "'- Motivation (WHY):', '- Mechanism (HOW IT WORKS):', "
                            "'- Methodology (HOW IT'S INTEGRATED):'. "
                            "Note: 93%%+ of samples already use this format, so normalization "
                            "mainly fixes edge cases. Only replaces section headers, never modifies content.")
    parser.add_argument("--num_workers", type=int, default=32,
                       help="Number of workers for parallel processing (default: 32)")

    args = parser.parse_args()

    # Convert normalize to bool
    args.normalize = bool(args.normalize)

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    # Define output paths based on thresholds
    # Format: t8_mech3 means total>=8, mechanism>=3
    # Only include dimension constraints that are active (>= 0)
    if args.min_total_score == int(args.min_total_score):
        filter_parts = [f"t{int(args.min_total_score)}"]
    else:
        filter_parts = [f"t{args.min_total_score}".replace(".", "p")]

    if args.min_motivation >= 0:
        filter_parts.append(f"mot{args.min_motivation}")
    if args.min_mechanism >= 0:
        filter_parts.append(f"mech{args.min_mechanism}")
    if args.min_methodology >= 0:
        filter_parts.append(f"meth{args.min_methodology}")

    filter_str = "_".join(filter_parts)
    intermediate_path = os.path.join(args.output_dir, f"sft_filtered_{filter_str}.jsonl")
    llama_factory_path = os.path.join(args.output_dir, f"sft_llama_factory_{filter_str}.jsonl")

    print("="*60)
    print("Filter and Prepare SFT Data")
    print("="*60)
    print(f"Input: {args.input_path}")
    print(f"Output dir: {args.output_dir}")
    print(f"  - Intermediate: {intermediate_path}")
    print(f"  - LLaMA-Factory: {llama_factory_path}")
    print(f"Min total score: {args.min_total_score}")
    print(f"Min motivation: {args.min_motivation if args.min_motivation >= 0 else 'None'}")
    print(f"Min mechanism: {args.min_mechanism if args.min_mechanism >= 0 else 'None'}")
    print(f"Min methodology: {args.min_methodology if args.min_methodology >= 0 else 'None'}")
    print(f"Normalize: {args.normalize}")

    # Load data points from reranker output
    # Each data point represents one (file_name, step_idx) combination
    # and contains 8 candidate samples (from num_samples=8 generation)
    data_points = []
    with open(args.input_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Loading"):
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline at EOF
            data_points.append(json.loads(line))

    print(f"Loaded {len(data_points)} data points")

    # Detect bounded mode (from the first data point only) and show statistics
    is_bounded_mode = data_points[0].get('tier') is not None if data_points else False
    if is_bounded_mode:
        print(f"Mode: BOUNDED (detected 'tier' field)")
        tier_counts = {}
        for dp in data_points:
            t = dp.get('tier', 'unknown')
            tier_counts[t] = tier_counts.get(t, 0) + 1
        print(f"Tier distribution: {tier_counts}")
    else:
        print(f"Mode: NORMAL")

    # Filter and select best sample from all 8 candidate samples per data point
    # Logic:
    #   1. Check ALL 8 candidate samples against:
    #      - Score thresholds (total, motivation, mechanism, methodology)
    #      - Format requirements (no template placeholders, has all 3 sections)
    #   2. Select the best among qualified samples
    #   3. Rank by: (total, mechanism, methodology, motivation) - descending
    #
    # BOUNDED MODE SPECIAL LOGIC:
    #   For bounded composition, each (file_name, step_idx) may have multiple tiers.
    #   Priority: hard > medium > easy
    #   - If hard tier has qualified sample → select best from hard, ignore medium/easy
    #   - Else if medium tier has qualified sample → select best from medium, ignore easy
    #   - Else if easy tier has qualified sample → select best from easy
    #   - Else discard this (file_name, step_idx)
    #
    # Format checking is done BEFORE selecting best, so if the highest-scoring
    # sample has format issues, another valid sample can be selected instead.

    def find_best_qualified_sample(data_point, args):
        """Return (best qualified sample or None, number of qualified samples)."""
        reranker_scores = data_point.get('reranker_scores')
        if not reranker_scores:
            return None, 0

        # `or []` also covers an 'all_samples' key that is present but None
        candidate_samples = reranker_scores.get('all_samples') or []
        qualified = [s for s in candidate_samples
                    if check_sample_qualifies(s, args.min_total_score,
                                              args.min_motivation,
                                              args.min_mechanism,
                                              args.min_methodology,
                                              check_format=True,
                                              normalize=args.normalize)]

        if not qualified:
            return None, 0

        best_sample = max(qualified, key=get_sample_sort_key)
        return best_sample, len(qualified)

    def create_filtered_data_point(data_point, best_sample):
        """Create a shallow copy of data point with the selected best sample filled in."""
        reranker_scores = data_point.get('reranker_scores', {})
        best_idx = best_sample.get('idx', 0)
        data_point_copy = data_point.copy()
        # Copy the nested dict too so the loaded data point is not mutated
        data_point_copy['reranker_scores'] = reranker_scores.copy()
        data_point_copy['reranker_scores']['selected_idx'] = best_idx
        data_point_copy['reranker_scores']['selected_score'] = best_sample.get('total', 0)
        data_point_copy['generated_hypothesis'] = best_sample.get('generated_hypothesis', '')
        data_point_copy['reasoning_trace'] = best_sample.get('reasoning_trace', '')
        data_point_copy['sample_idx'] = best_sample.get('sample_idx', best_idx)
        return data_point_copy

    # Process each data point: filter all candidate samples, then select best
    filtered_data_points = []
    total_qualified_samples = 0

    if is_bounded_mode:
        # BOUNDED MODE: Group by (file_name, step_idx), then apply tier priority
        # Priority: hard > medium > easy
        # Each unique (file_name, step_idx) produces at most 1 output sample

        from collections import defaultdict

        # Valid tiers for bounded mode
        valid_tiers = {'hard', 'medium', 'easy'}

        # Group data points by (file_name, step_idx)
        # groups[key] = {tier: data_point, ...}
        # A later data point with the same key and tier replaces the earlier one.
        groups = defaultdict(dict)

        for data_point in data_points:
            key = (data_point['file_name'], data_point['step_idx'])
            tier = data_point.get('tier')

            # Validate tier field exists and is valid
            if tier is None:
                raise ValueError(f"Missing 'tier' field in bounded mode for {key}")
            if tier not in valid_tiers:
                raise ValueError(f"Invalid tier '{tier}' in bounded mode for {key}. Expected one of: {valid_tiers}")

            groups[key][tier] = data_point

        print(f"Unique (file_name, step_idx) groups: {len(groups)}")

        # Tier priority order
        tier_priority = ['hard', 'medium', 'easy']
        tier_selection_stats = {'hard': 0, 'medium': 0, 'easy': 0, 'none': 0}

        for key, tier_data_points in groups.items():
            selected = False
            for tier in tier_priority:
                if tier in tier_data_points:
                    data_point = tier_data_points[tier]
                    best_sample, num_qualified = find_best_qualified_sample(data_point, args)

                    if best_sample is not None:
                        total_qualified_samples += num_qualified
                        filtered_data_points.append(create_filtered_data_point(data_point, best_sample))
                        tier_selection_stats[tier] += 1
                        selected = True
                        break  # Found qualified sample in this tier, skip lower priority tiers

            if not selected:
                tier_selection_stats['none'] += 1

        print(f"Tier selection distribution:")
        print(f"  - Selected from hard:   {tier_selection_stats['hard']}")
        print(f"  - Selected from medium: {tier_selection_stats['medium']}")
        print(f"  - Selected from easy:   {tier_selection_stats['easy']}")
        print(f"  - No qualified sample:  {tier_selection_stats['none']}")

    else:
        # NORMAL MODE: Each data point is processed independently
        for data_point in data_points:
            best_sample, num_qualified = find_best_qualified_sample(data_point, args)

            if best_sample is not None:
                total_qualified_samples += num_qualified
                filtered_data_points.append(create_filtered_data_point(data_point, best_sample))

    # Build filter description for logging
    filter_desc_parts = [f"total >= {args.min_total_score}"]
    if args.min_motivation >= 0:
        filter_desc_parts.append(f"motivation >= {args.min_motivation}")
    if args.min_mechanism >= 0:
        filter_desc_parts.append(f"mechanism >= {args.min_mechanism}")
    if args.min_methodology >= 0:
        filter_desc_parts.append(f"methodology >= {args.min_methodology}")
    filter_desc = " AND ".join(filter_desc_parts)

    # Guard the ratio: an empty input file must not raise ZeroDivisionError
    kept_pct = len(filtered_data_points) / len(data_points) * 100 if data_points else 0.0
    print(f"After filtering ({filter_desc}):")
    print(f"  - Data points with qualified samples: {len(filtered_data_points)} ({kept_pct:.1f}%)")
    print(f"  - Total qualified samples across all data: {total_qualified_samples}")
    if filtered_data_points:
        print(f"  - Avg qualified per data point: {total_qualified_samples/len(filtered_data_points):.2f}")

    if not filtered_data_points:
        print("No data points passed the threshold!")
        return

    # Build complete SFT samples with input/output
    prompts = instruction_prompts("prepare_HC_sft_data_to_go_comprehensive_v2_delta")

    num_workers = args.num_workers
    print(f"Using {num_workers} workers for parallel processing")

    # Create partial function with fixed arguments
    build_func = partial(
        build_sft_sample,
        sft_qa_data_dir=args.sft_qa_data_dir,
        prompts=prompts,
        normalize=args.normalize
    )

    results = []
    failed = 0
    # Keep only the first few exception messages: enough to diagnose a systematic
    # failure without flooding the log when many samples fail the same way.
    max_error_examples = 5
    error_examples = []

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Submit all tasks
        future_to_data = {executor.submit(build_func, dp): dp for dp in filtered_data_points}

        # Collect results with progress bar
        for future in tqdm(as_completed(future_to_data), total=len(filtered_data_points), desc="Building SFT samples"):
            try:
                sft_sample = future.result()
            except Exception as e:
                failed += 1
                if len(error_examples) < max_error_examples:
                    dp = future_to_data[future]
                    error_examples.append(
                        f"{dp.get('file_name')} (step {dp.get('step_idx')}): {type(e).__name__}: {e}"
                    )
                continue

            if sft_sample:
                results.append(sft_sample)
            else:
                failed += 1

    print(f"Built {len(results)} SFT samples, {failed} failed")
    if error_examples:
        print(f"Example errors (showing up to {max_error_examples}):")
        for message in error_examples:
            print(f"  - {message}")

    # Sort results for deterministic output order
    # Normal mode: sort by (file_name, step_idx)
    # Bounded mode: sort by (file_name, step_idx, tier)
    tier_order = {'hard': 0, 'medium': 1, 'easy': 2}  # hard first
    if is_bounded_mode:
        results.sort(key=lambda r: (r.get('file_name', ''), r.get('step_idx', 0),
                                    tier_order.get(r.get('tier', ''), 99)))
    else:
        results.sort(key=lambda r: (r.get('file_name', ''), r.get('step_idx', 0)))

    # Save intermediate results (with metadata); UTF-8 because ensure_ascii=False
    with open(intermediate_path, 'w', encoding='utf-8') as f:
        for result in results:
            f.write(json.dumps(result, ensure_ascii=False) + '\n')

    print(f"\n✅ Saved {len(results)} samples to {intermediate_path}")

    # Statistics
    if results:
        scores = [r.get('reranker_score', 0) for r in results]
        print(f"\nFiltered score statistics (0-12 scale):")
        print(f"  Min:  {min(scores)}")
        print(f"  Max:  {max(scores)}")
        print(f"  Mean: {sum(scores)/len(scores):.2f}")

        # Bounded mode statistics
        if is_bounded_mode:
            result_tier_counts = {}
            for r in results:
                t = r.get('tier', 'unknown')
                result_tier_counts[t] = result_tier_counts.get(t, 0) + 1
            print(f"\nBounded mode - samples by tier:")
            for tier, count in sorted(result_tier_counts.items()):
                print(f"  {tier}: {count}")

    # Convert and save to LLaMA-Factory format
    print(f"\n" + "="*60)
    convert_and_save_llama_factory_format(intermediate_path, llama_factory_path)

    # Summary
    print(f"\n" + "="*60)
    print("📋 Output Summary")
    print("="*60)
    print(f"Intermediate (with metadata): {intermediate_path}")
    print(f"LLaMA-Factory format:         {llama_factory_path}")
    print(f"\n🚀 Ready for training with LLaMA-Factory:")
    print(f"   - Template: deepseekr1")
    print(f"   - Dataset format: sharegpt")

# Run the command-line pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

