import os, sys, json, random, math

# Add Legacy folder to path for hypothesis_composition_reasoning_trace
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'Legacy'))
from hypothesis_composition_reasoning_trace import sample_one_MDP_for_one_paper_from_hypothesis_components
from negative_inspiration_collection_random_sampling_utils import load_title_abstract_and_year_from_all_sources, random_sample_k_papers_from_paper_all_before_year


# ============================================================================
# Helper function to load embedding-based similar papers
# ============================================================================
# Input:
#   embedding_neg_insp_dir: directory containing embedding-based negative inspiration files
#   file_name: current file being processed
#   gdth_insp_title: ground truth inspiration title
# Output:
#   embedding_similar_papers: [[title, abstract, year], ...] or [] if not found
def load_embedding_similar_papers(embedding_neg_insp_dir, file_name, gdth_insp_title):
    """
    Load embedding-based similar papers from bounded_inspiration_recommendations.
    These are obtained via Semantic Scholar's Recommendations API.

    File format expected:
    {
        "inspirations": [
            {
                "original_title": "...",
                "recommendations": [
                    {"title": "...", "abstract": "...", "year": ..., "doi": ...},
                    ...
                ]
            },
            ...
        ]
    }

    Args:
        embedding_neg_insp_dir: directory holding the recommendation files, or
            None to disable this source.
        file_name: name of the file inside that directory.
        gdth_insp_title: title matched (exact string equality) against each
            entry's "original_title"; the first matching entry is used.

    Returns:
        [[title, abstract, year], ...] for the matching entry, keeping only
        recommendations with a non-empty title and abstract. A missing or
        null year is reported as 0. Returns [] when the source is disabled,
        the file does not exist, no entry matches, or the file cannot be
        read/parsed (a warning is printed in that last case).
    """
    if embedding_neg_insp_dir is None:
        return []

    embedding_file = os.path.join(embedding_neg_insp_dir, file_name)
    if not os.path.exists(embedding_file):
        return []

    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(embedding_file, 'r', encoding='utf-8') as f:
            embedding_data = json.load(f)

        # Find the inspiration matching gdth_insp_title
        for insp in embedding_data.get("inspirations", []):
            if insp.get("original_title") != gdth_insp_title:
                continue
            # `or 0` also covers an explicit JSON null, which a plain
            # .get("year", 0) would pass through as None.
            return [
                [rec["title"], rec["abstract"], rec.get("year") or 0]
                for rec in insp.get("recommendations", [])
                if rec.get("title") and rec.get("abstract")
            ]

        return []
    except (OSError, ValueError, AttributeError, TypeError) as e:
        # Best effort: unreadable files, invalid JSON/encoding (ValueError) and
        # unexpected JSON shapes (AttributeError/TypeError) disable this
        # source for the current file instead of aborting the run.
        print(f"Warning: Failed to load embedding similar papers from {embedding_file}: {e}")
        return []


# ============================================================================
# Helper function to dynamically allocate hard negative quotas
# ============================================================================
def allocate_hard_negative_quotas(
    num_negative_inspirations,
    main_paper_similar_available,
    insp_similar_available,
    embedding_similar_available,
    target_hard_ratio=0.35  # Target ~35% hard negatives
):
    """
    Dynamically allocate quotas for each hard negative source with PRIORITY weighting.

    Priority (from highest to lowest):
    1. Embedding similar (50% weight)
    2. Inspiration keyword similar (30% weight)
    3. Main paper keyword similar (20% weight)

    Strategy:
    - The hard-negative target is ceil(num_negative_inspirations * target_hard_ratio),
      clamped to [0, num_negative_inspirations] so the quotas can never add up
      to more than the requested total and can never be negative.
    - Weights are renormalized over the sources that actually have papers.
    - Each available source gets int(target * normalized_weight) slots, but at
      least 1 while quota remains, capped by its number of available papers.
    - Quota left over after that pass is handed out in priority order to
      sources that still have spare papers.

    Args:
        num_negative_inspirations: total number of negatives wanted.
        main_paper_similar_available: number of main-paper keyword-similar papers.
        insp_similar_available: number of inspiration keyword-similar papers.
        embedding_similar_available: number of embedding-similar papers.
        target_hard_ratio: target fraction of hard negatives.

    Returns:
        (num_main, num_insp, num_embedding, num_random)
    """
    # e.g., 5 for 14 negatives with 0.35 ratio. Clamp so that an out-of-range
    # ratio (< 0 or > 1) cannot over-allocate or go negative.
    target_hard = math.ceil(num_negative_inspirations * target_hard_ratio)
    target_hard = max(0, min(target_hard, num_negative_inspirations))

    # Priority weights (must sum to 1.0)
    # Higher weight = higher priority = more quota
    weights = {
        'embedding': 0.50,  # Highest priority
        'insp': 0.30,       # Medium priority - keyword similar to inspiration
        'main': 0.20        # Lower priority - keyword similar to main paper
    }
    available = {
        'main': main_paper_similar_available,
        'insp': insp_similar_available,
        'embedding': embedding_similar_available
    }

    # No hard-negative source has data: everything is random.
    if all(v == 0 for v in available.values()):
        return 0, 0, 0, num_negative_inspirations

    # Allocate by priority order: embedding -> insp -> main
    priority_order = ['embedding', 'insp', 'main']
    # Loop-invariant: combined weight of the sources that have papers, used to
    # redistribute the weight of empty sources.
    total_active_weight = sum(weights[s] for s in priority_order if available[s] > 0)

    allocations = {'main': 0, 'insp': 0, 'embedding': 0}
    remaining_quota = target_hard

    for source in priority_order:
        if available[source] == 0:
            continue

        if total_active_weight > 0:
            normalized_weight = weights[source] / total_active_weight
        else:
            normalized_weight = 0

        # At least 1 if available and there's remaining quota
        quota = max(1, int(target_hard * normalized_weight)) if remaining_quota > 0 else 0
        quota = min(quota, remaining_quota)  # Don't exceed remaining

        # Cap by available papers
        actual = min(quota, available[source])
        allocations[source] = actual
        remaining_quota -= actual

    # If there's still remaining quota, fill from sources with extra capacity
    # (in priority order)
    for source in priority_order:
        if remaining_quota <= 0:
            break
        extra_capacity = available[source] - allocations[source]
        if extra_capacity > 0:
            additional = min(remaining_quota, extra_capacity)
            allocations[source] += additional
            remaining_quota -= additional

    total_hard = allocations['main'] + allocations['insp'] + allocations['embedding']
    num_random = num_negative_inspirations - total_hard

    return allocations['main'], allocations['insp'], allocations['embedding'], max(0, num_random)


# ============================================================================
# Main negative sampling function with multiple hard negative sources
# ============================================================================
# Input:
#   paper_all: {year: [[title, abstract, year], ...]}
#   cur_paper_year: int
#   num_negative_inspirations: int
#   ncbi_neg_insp_dir, ss_neg_insp_dir: directories for negative inspiration files (keyword-based)
#   file_name: current file being processed
#   neg_insp_source: 'ncbi' or 'ss'
#   gdth_insp_title: ground truth inspiration title
#   embedding_neg_insp_dir: directory for embedding-based negative inspirations (optional, can be None)
#   ban_list: list of [title, abstract] pairs to exclude from sampling (optional)
#   target_hard_ratio: target ratio of hard negatives (default 0.35 = 35%)
# Output:
#   negative_inspirations: [[title, abstract, year], ...]
def keyword_overlap_with_random_sample_k_papers_from_paper_all_before_year(
    paper_all, cur_paper_year, num_negative_inspirations, 
    ncbi_neg_insp_dir, ss_neg_insp_dir, file_name, neg_insp_source, gdth_insp_title, 
    embedding_neg_insp_dir=None, ban_list=None, target_hard_ratio=0.35
):
    """
    Assemble the negatives for one ground truth inspiration from up to three
    hard-negative sources plus random papers.

    The keyword file <ncbi_neg_insp_dir or ss_neg_insp_dir>/<file_name> must
    exist and contain "main_paper" -> "similar_papers" and an "inspirations"
    list of {"original_title", "similar_papers"} objects. Quotas per source
    come from allocate_hard_negative_quotas; each hard source contributes the
    first `quota` papers of its list, and the remainder is filled through
    random_sample_k_papers_from_paper_all_before_year.

    Returns:
        [[title, abstract, year], ...] ordered as main-paper similar,
        inspiration similar, embedding similar, then random papers.

    Raises:
        ValueError: neg_insp_source is neither 'ncbi' nor 'ss'.
        AssertionError: the assembled list does not have
            num_negative_inspirations entries.
    """
    # ---- Step 1: keyword-based candidates (NCBI or Semantic Scholar) -------
    if neg_insp_source not in ('ncbi', 'ss'):
        raise ValueError(f"Invalid negative inspiration source: {neg_insp_source}")
    keyword_dir = ncbi_neg_insp_dir if neg_insp_source == 'ncbi' else ss_neg_insp_dir

    with open(os.path.join(keyword_dir, file_name), 'r') as fh:
        keyword_data = json.load(fh)

    # Entries are indexed positionally below as title, abstract, year.
    main_candidates = keyword_data["main_paper"]['similar_papers']

    # Similar-paper lists of every inspiration whose title equals gdth_insp_title.
    title_matches = [
        entry["similar_papers"]
        for entry in keyword_data["inspirations"]
        if entry['original_title'] == gdth_insp_title
    ]
    if len(title_matches) > 1:
        print(f"Warning: Found {len(title_matches)} matching inspirations for '{gdth_insp_title}', using first one.")
    # No match means this source simply has no candidates.
    insp_candidates = title_matches[0] if title_matches else []

    # ---- Step 2: embedding-based candidates --------------------------------
    embedding_candidates = load_embedding_similar_papers(
        embedding_neg_insp_dir, file_name, gdth_insp_title
    )

    # ---- Step 3: split the total between the sources -----------------------
    num_main, num_insp, num_embedding, num_random = allocate_hard_negative_quotas(
        num_negative_inspirations=num_negative_inspirations,
        main_paper_similar_available=len(main_candidates),
        insp_similar_available=len(insp_candidates),
        embedding_similar_available=len(embedding_candidates),
        target_hard_ratio=target_hard_ratio
    )

    # ---- Step 4: take the allotted number of papers from each source -------
    negative_inspirations = []
    # Keyword entries are cut down to their first three fields.
    for paper in main_candidates[:num_main]:
        negative_inspirations.append([paper[0], paper[1], paper[2]])
    for paper in insp_candidates[:num_insp]:
        negative_inspirations.append([paper[0], paper[1], paper[2]])
    # Embedding entries already have the [title, abstract, year] shape.
    negative_inspirations.extend(embedding_candidates[:num_embedding])
    # ban_list is only forwarded to the random sampler; the hard-negative
    # lists above are used as loaded.
    if num_random > 0:
        negative_inspirations = negative_inspirations + random_sample_k_papers_from_paper_all_before_year(
            paper_all, cur_paper_year, num_random, ban_list
        )

    # ---- Step 5: sanity check on the final size -----------------------------
    assert len(negative_inspirations) == num_negative_inspirations, \
        f"Expected {num_negative_inspirations} negatives, got {len(negative_inspirations)}"

    return negative_inspirations




# Input:
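#   sft_qa_data_dir: the directory of the sft qa data; each file to process is loaded from here
#       (random negative inspirations are sampled from paper_all, not from this directory)
#   ncbi_neg_insp_dir, ss_neg_insp_dir: directories for negative inspiration files (keyword-based)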
#   paper_all: {year: [[title, abstract, year], ...]}
#   num_negative_inspirations: int
#   strategy: str, one of ['pure_random', 'keyword_overlap_with_random']
#       'pure_random': all negative inspirations are by random sampling from all papers published before the current paper's year
#       'keyword_overlap_with_random': first collect the negative inspirations by keyword overlap + embedding, 
#           then add random samples to make up the total number of negative inspirations
#   embedding_neg_insp_dir: directory for embedding-based negative inspirations (optional, can be None)
#       If provided, will include Semantic Scholar Recommendations as additional hard negatives
#   target_hard_ratio: target ratio of hard negatives (default 0.35 = 35%)
# Output:
#   collected_data: [[bkg, negative_insp, gdth_insp, year_pmid], ...]
#   where:
#       bkg: [research_question, background_survey, pre_step_hyp]
#       negative_insp: [[title, abstract, year], ...]
#       gdth_insp: [found_title, found_abstract, insp, relation]
#       year_pmid: str, format "year_pmid" for tracking (e.g., "2023_12345678")
def collect_negative_inspiration(
    sft_qa_data_dir, ncbi_neg_insp_dir, ss_neg_insp_dir, paper_all, 
    num_negative_inspirations, strategy, 
    embedding_neg_insp_dir=None, target_hard_ratio=0.35
):
    """
    Produce one retrieval QA entry per MDP step of every file to process.

    Files come from get_files_to_process. Each is read from sft_qa_data_dir;
    its name is expected to start with "<year>_<pmid>", where a zero year
    stands for 2020. Files lacking a required field are skipped with a
    warning.

    Returns:
        [[bkg, negative_insp, gdth_insp, year_pmid], ...] where
            bkg: [research_question, background_survey, pre_step_hyp]
                (pre_step_hyp is None on the first step, afterwards the
                previous delta hypotheses joined by blank lines)
            negative_insp: [[title, abstract, year], ...]
            gdth_insp: [found_title, found_abstract, insp, relation]
            year_pmid: "<year>_<pmid>"

    Raises:
        AssertionError: strategy is not one of the two supported values.
        ValueError: a file has an empty "hypothesis_components".
    """
    assert strategy in ['pure_random', 'keyword_overlap_with_random']

    # files_to_process: [(file_name, source), ...]
    files_to_process = get_files_to_process(ncbi_neg_insp_dir, ss_neg_insp_dir, sft_qa_data_dir)
    # Fields every input file must provide (v2 format with hypothesis_components).
    required_fields = ["inspiration", "hypothesis_components", "research_question", "background_survey", "title", "abstract"]
    collected_data = []

    for file_idx, (file_name, neg_source) in enumerate(files_to_process):
        if file_idx % 1000 == 0:
            print(f"Processing file {file_idx} / {len(files_to_process)}")

        # Every listed file also exists in sft_qa_data_dir under the same name.
        with open(os.path.join(sft_qa_data_dir, file_name), 'r') as f:
            paper = json.load(f)

        name_parts = file_name.split('_')
        # Special case: a zero year (e.g. "0000") means 2020.
        paper_year = int(name_parts[0]) or 2020
        paper_pmid = name_parts[1].split('.')[0] if len(name_parts) > 1 else ''
        year_pmid_str = f"{paper_year}_{paper_pmid}"

        missing_fields = [name for name in required_fields if name not in paper]
        if missing_fields:
            print(f"  Warning: Missing fields {missing_fields} in {file_name}, skipping")
            continue

        if not paper["hypothesis_components"]:
            raise ValueError(f"Empty hypothesis_components in {file_name}")

        inspirations = paper["inspiration"]
        # mdp_road: [[insp_id, delta_hyp], ...]
        mdp_road = sample_one_MDP_for_one_paper_from_hypothesis_components(
            inspirations, paper["hypothesis_components"], file_name
        )

        # Hypothesis text accumulated over the steps already emitted.
        accumulated_hyp = None
        for step in mdp_road:
            background = [paper["research_question"], paper["background_survey"], accumulated_hyp]
            gdth = inspirations[step[0]]
            gdth_entry = [gdth["found_title"], gdth["found_abstract"], gdth['insp'], gdth['relation']]

            # Ban list is disabled for performance reasons, so the random
            # sampler may occasionally return the ground truth inspiration or
            # the current paper as a negative.
            ban_list = None

            if strategy == 'keyword_overlap_with_random':
                # keyword-based (main paper + inspiration) + embedding-based + random
                negatives = keyword_overlap_with_random_sample_k_papers_from_paper_all_before_year(
                    paper_all, paper_year, num_negative_inspirations, 
                    ncbi_neg_insp_dir, ss_neg_insp_dir, file_name, neg_source, 
                    gdth["found_title"],
                    embedding_neg_insp_dir=embedding_neg_insp_dir,
                    ban_list=ban_list,
                    target_hard_ratio=target_hard_ratio
                )
            elif strategy == 'pure_random':
                negatives = random_sample_k_papers_from_paper_all_before_year(
                    paper_all, paper_year, num_negative_inspirations, ban_list
                )
            else:
                raise ValueError(f"Invalid strategy: {strategy}")

            collected_data.append([background, negatives, gdth_entry, year_pmid_str])

            # Fold this step's delta hypothesis into the running context.
            delta_hyp = step[1]
            accumulated_hyp = delta_hyp if accumulated_hyp is None else accumulated_hyp + "\n\n" + delta_hyp

    return collected_data


# Function: get all available files from ncbi and semantic scholar directories
# Input:
#   ncbi_neg_insp_dir: the directory of the ncbi negative inspirations
#   ss_neg_insp_dir: the directory of the semantic scholar negative inspirations
#   sft_qa_data_dir: the directory containing the main SFT QA data (for filtering)
# Output:
#   files_to_process: a list of files to process, each element is a tuple of (file_name, source)
def get_files_to_process(ncbi_neg_insp_dir, ss_neg_insp_dir, sft_qa_data_dir):
    """
    List the files to process together with their keyword negative source.

    A file qualifies when its name ends with '.json', it exists in
    sft_qa_data_dir, and it exists in at least one of the two negative
    inspiration directories. Files present in both are tagged 'ncbi'.

    File names are expected to look like "<year>_<pmid>...json"; a name whose
    part before the first '_' is not an integer raises ValueError while
    sorting.

    Returns:
        [(file_name, source), ...] with source in {'ncbi', 'ss'}, sorted by
        year (a zero year such as "0000" counts as 2020), then by PMID
        (compared as a string), then by file name. The file name tie-break
        keeps the order reproducible: without it, entries with equal
        (year, pmid) would keep set iteration order, which is arbitrary.
    """
    def list_json_files(directory):
        # Names only (no paths) of the .json entries in `directory`.
        return {f for f in os.listdir(directory) if f.endswith('.json')}

    ncbi_neg_insp_files = list_json_files(ncbi_neg_insp_dir)
    ss_neg_insp_files = list_json_files(ss_neg_insp_dir)
    # Only files that exist here are processed.
    sft_qa_files = list_json_files(sft_qa_data_dir)

    # The three-way split is used for the summary printed below.
    ncbi_only = ncbi_neg_insp_files - ss_neg_insp_files
    ss_only = ss_neg_insp_files - ncbi_neg_insp_files
    both = ncbi_neg_insp_files & ss_neg_insp_files
    valid_files = (ncbi_neg_insp_files | ss_neg_insp_files) & sft_qa_files

    # Prefer ncbi whenever the file exists there; otherwise it is ss-only.
    files_to_process = [
        (f, 'ncbi' if f in ncbi_neg_insp_files else 'ss')
        for f in valid_files
    ]

    def get_sort_key(file_tuple):
        filename = file_tuple[0]
        # Any zero year means 2020, matching collect_negative_inspiration.
        year = int(filename.split('_')[0]) or 2020
        pmid = filename.split('_')[1].split('.')[0] if '_' in filename else ''
        return (year, pmid, filename)

    files_to_process.sort(key=get_sort_key)

    print(f"Found {len(ncbi_neg_insp_files)} ncbi negative inspiration files")
    print(f"Found {len(ss_neg_insp_files)} semantic scholar negative inspiration files")
    print(f"  - ncbi only: {len(ncbi_only)}, ss only: {len(ss_only)}, both: {len(both)}")
    print(f"Found {len(sft_qa_files)} sft_qa_data files")
    print(f"Found {len(files_to_process)} files to process (after filtering)")
    return files_to_process



if __name__ == "__main__":
    # Script entry point: pick the train/test directory layout, load (or build
    # and cache) the paper pool, collect the QA data and write it as JSON.

    # ========================================================================
    # Mode Selection: "train" or "test"
    # ========================================================================
    MODE = "train"  # Set to "test" to process the test data instead
    # Fail fast on a typo instead of silently running the test configuration.
    if MODE not in ("train", "test"):
        raise ValueError(f"Invalid MODE: {MODE!r} (expected 'train' or 'test')")
    
    # ========================================================================
    # Parameters
    # ========================================================================
    num_negative_inspirations = 14
    strategy = 'keyword_overlap_with_random'
    # Target hard negative ratio: ~30-35% (4-5 out of 14)
    # Paper D.1: "approx. 30% hard, 70% random"
    target_hard_ratio = 0.35

    # ========================================================================
    # Base directories and suffixes - MODIFY THESE
    # ========================================================================
    BASE_DIR = "<YOUR_DATA_ROOT>/sft_qa_data"
    wos_raw_data_dir = "<YOUR_DATA_ROOT>/web_of_science_title_abstracts/2015_2019"  # Optional: for Web of Science data
    
    # Suffixes for train vs test datasets
    SUFFIX_TRAIN = "train"
    SUFFIX_TEST = "test"
    
    # ========================================================================
    # Directory configuration based on MODE
    # ========================================================================
    sft_qa_data_dir_train = f"{BASE_DIR}/pubmed_sft_qa_data_v2_{SUFFIX_TRAIN}"
    sft_qa_data_dir_test = f"{BASE_DIR}/pubmed_sft_qa_data_v2_{SUFFIX_TEST}"
    
    if MODE == "train":
        # Process training data: use only train data as negative pool
        sft_qa_data_dir_to_process = sft_qa_data_dir_train
        negative_inspiration_sft_qa_data_source = [sft_qa_data_dir_train]
        # Keyword-based (no suffix for train)
        ncbi_neg_insp_dir = f"{BASE_DIR}/negative_inspiration_collection_keyword_overlap_ncbi"
        ss_neg_insp_dir = f"{BASE_DIR}/negative_inspiration_collection_keyword_overlap_semantic_scholar"
        # Embedding-based
        embedding_neg_insp_dir = f"{BASE_DIR}/bounded_inspiration_recommendations_v2_{SUFFIX_TRAIN}"
        # Output
        output_data_dir = f"{BASE_DIR}/inspiration_retrieval_QA_data_with_embedding_neg_insp_{SUFFIX_TRAIN}"
    else:  # MODE == "test" (validated above)
        # Process test data: use train+test as negative pool
        sft_qa_data_dir_to_process = sft_qa_data_dir_test
        negative_inspiration_sft_qa_data_source = [sft_qa_data_dir_train, sft_qa_data_dir_test]
        # Keyword-based
        ncbi_neg_insp_dir = f"{BASE_DIR}/negative_inspiration_collection_keyword_overlap_ncbi_{SUFFIX_TEST}"
        ss_neg_insp_dir = f"{BASE_DIR}/negative_inspiration_collection_keyword_overlap_semantic_scholar_{SUFFIX_TEST}"
        # Embedding-based
        embedding_neg_insp_dir = f"{BASE_DIR}/bounded_inspiration_recommendations_v2_{SUFFIX_TEST}"
        # Output
        output_data_dir = f"{BASE_DIR}/inspiration_retrieval_QA_data_with_embedding_neg_insp_{SUFFIX_TEST}"
    
    # Set to None to disable embedding-based hard negatives
    # embedding_neg_insp_dir = None
    # Create output directory if it doesn't exist
    os.makedirs(output_data_dir, exist_ok=True)
    
    # Print configuration
    print("=" * 60)
    print("Negative Inspiration Collection Configuration")
    print("=" * 60)
    print(f"Mode: {MODE}")
    print(f"Strategy: {strategy}")
    print(f"Num negatives: {num_negative_inspirations}")
    print(f"Target hard ratio: {target_hard_ratio:.0%}")
    print(f"Embedding source: {'Enabled' if embedding_neg_insp_dir else 'Disabled'}")
    print(f"Processing: {sft_qa_data_dir_to_process}")
    print(f"Output: {output_data_dir}")
    print("=" * 60)

    # get [title, abstract, year] from all sources (downloaded from Web of Science and sft_qa_data_dir)
    # paper_all is only used to sample negative inspirations
    paper_all_path = os.path.join(output_data_dir, "paper_all.json")
    if not os.path.exists(paper_all_path):
        print("Not found paper_all.json, loading from all sources...")        
        paper_all = load_title_abstract_and_year_from_all_sources(wos_raw_data_dir, negative_inspiration_sft_qa_data_source)
        # save paper_all to a json file so later runs can skip the loading step
        with open(paper_all_path, 'w') as f:
            json.dump(paper_all, f, indent=4)
    else:
        # NOTE(review): JSON object keys are always strings, so a cached
        # paper_all has str year keys even if the freshly built one uses ints;
        # confirm the sampler treats both the same.
        with open(paper_all_path, 'r') as f:
            paper_all = json.load(f)
        print(f"Loaded paper_all from {paper_all_path} with length {len(paper_all)}")

    # collect negative inspirations: the internal data should be ordered by year (ascending)
    collected_data = collect_negative_inspiration(
        sft_qa_data_dir_to_process, ncbi_neg_insp_dir, ss_neg_insp_dir, paper_all, 
        num_negative_inspirations, strategy,
        embedding_neg_insp_dir=embedding_neg_insp_dir,
        target_hard_ratio=target_hard_ratio
    )

    # save collected data; the file name records the settings and the entry count
    output_file_path = os.path.join(
        output_data_dir,
        f"collected_inspiration_retrieval_QA_data_{num_negative_inspirations}_{strategy}_{len(collected_data)}.json"
    )
    with open(output_file_path, 'w') as f:
        json.dump(collected_data, f, indent=4)

    print(f"Collected {len(collected_data)} negative inspirations")
    print(f"Saved collected data to {output_file_path}")