import faiss
import json
import numpy as np


def parse_kaggle(description_path: str,
                 competitions_path: str,
                 vector_index_path: str,
                 metadata_path: str,
                 retrieve_model,
                 top_n_competitions=3, threshold=0):
    """
    Collect solution texts from the stored Kaggle competitions that are most
    similar to a given description.

    The description file is read, the closest competitions are looked up in
    the FAISS index, those scoring below ``threshold`` are dropped, and the
    solution texts of the remaining competitions are returned.

    Parameters:
        :param description_path: Path to a UTF-8 text file with the description to match.
        :param competitions_path: Path to the stored competition data in json format.
        :param vector_index_path: Path to the FAISS index file containing competition embeddings.
        :param metadata_path: Path to a JSON file mapping FAISS index positions to competition links.
        :param retrieve_model: Embedding model used to encode the description.
        :param top_n_competitions: Number of top similar competitions to consider. Default is 3.
        :param threshold: Minimum similarity score (inclusive) a competition must reach
            to be kept. Default is 0.

    Returns:
        list: Solution texts of the selected competitions, in retrieval order.
    """

    # Read the file verbatim. Joining readlines() with '\n' would double every
    # line break, because each line already carries its own trailing newline.
    with open(description_path, 'r', encoding='utf-8') as f:
        description = f.read()

    links_scores = get_similar_competitions(description, retrieve_model,
                                            vector_index_path, metadata_path,
                                            top_n=top_n_competitions)

    # Keep only the competitions whose score reaches the threshold.
    final_links = [link for link, score in links_scores if score >= threshold]

    return get_list_of_ideas(final_links, competitions_path)


def get_list_of_ideas(links_list, competitions_path):
    """
    Gather the solution texts of the requested competitions.

    Parameters:
        :param links_list: Competition links; each must be a key of the JSON object
            stored at ``competitions_path``.
        :param competitions_path: Path to a JSON file mapping a competition link to
            a record that has a ``'solution_texts'`` list.

    Returns:
        list: All solution texts, flattened, following the order of ``links_list``
        and, within a competition, the order stored in the file.
    """
    with open(competitions_path, 'r', encoding='utf-8') as source:
        competitions = json.load(source)

    # Flatten the per-competition lists into a single list of texts.
    return [
        text
        for link in links_list
        for text in competitions[link]['solution_texts']
    ]


def get_similar_competitions(description: str,
                             retrieve_model,
                             vector_index_path: str,
                             metadata_path: str,
                             top_n: int = 3):
    """
    Retrieves the Kaggle competitions whose embeddings are closest to the given description.

    The description is encoded with ``retrieve_model`` (requesting normalized
    embeddings), searched against the FAISS index, and each hit is mapped to a
    competition link through the metadata file.

    Parameters:
        description (str): Text description of the target competition or idea.
        retrieve_model: Model with an ``encode(texts, normalize_embeddings=True)`` method.
        vector_index_path (str): Path to the FAISS index of competition embeddings.
        metadata_path (str): Path to a JSON file holding a sequence indexed by FAISS
            position; each entry is either a dict with a ``"link"`` key or a
            sequence whose first element is the link.
        top_n (int): Number of similar competitions to retrieve.

    Returns:
        list of (link, score) tuples in the order returned by FAISS. The score is
        whatever metric the index was built with (presumably inner product over
        normalized vectors, i.e. cosine similarity -- confirm against the code
        that builds the index). Fewer than ``top_n`` tuples are returned when the
        index holds fewer vectors or a hit has no metadata entry.
    """
    query_embedding = retrieve_model.encode([description], normalize_embeddings=True)
    query_embedding = np.array(query_embedding).astype('float32')

    index = faiss.read_index(vector_index_path)

    scores, indices = index.search(query_embedding, top_n)

    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    results = []
    for idx, score in zip(indices[0], scores[0]):
        idx = int(idx)
        # FAISS pads the result with label -1 when it finds fewer than top_n
        # neighbours. A plain `idx < len(metadata)` check would accept -1 and
        # silently return the last metadata entry, so check both bounds.
        if not 0 <= idx < len(metadata):
            continue
        entry = metadata[idx]
        link = entry["link"] if isinstance(entry, dict) else entry[0]
        results.append((link, float(score)))

    return results
