import nltk
import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.dimensionality import BaseDimensionalityReduction
from transformers import pipeline
import hdbscan

# Import-time setup (both statements run whenever this module is imported).
# Fetch the NLTK "stopwords" corpus that get_custom_bertopic_config() reads
# through stopwords.words('french').
nltk.download('stopwords')
# Shared spaCy pipeline; in the code visible here it is used only for its
# default stop-word list (nlp.Defaults.stop_words).
nlp = spacy.load("fr_core_news_sm")

def get_custom_bertopic_config(
    *,
    extra_stop_words=None,
    min_cluster_size=5,
    min_samples=2,
):
    """Build the vectorizer, dimensionality-reduction and clustering models.

    The stop-word list is the union of a few project-specific tokens, NLTK's
    French stop words and the default stop words of the module-level spaCy
    pipeline ``nlp``. It is deduplicated and sorted so that the vectorizer's
    ``stop_words`` parameter is identical from one run to the next (the
    spaCy defaults are a set, whose iteration order is not stable).

    Args:
        extra_stop_words: Optional iterable of additional stop words (a
            single string is treated as one word). ``None`` adds nothing.
        min_cluster_size: Forwarded to ``hdbscan.HDBSCAN``.
        min_samples: Forwarded to ``hdbscan.HDBSCAN``.

    Returns:
        A 3-tuple ``(vectorizer, dimensionality_model, hdbscan_model)``:
        a ``CountVectorizer`` using the combined stop words, a
        ``BaseDimensionalityReduction`` instance (BERTopic's placeholder
        for skipping dimensionality reduction), and a configured
        ``hdbscan.HDBSCAN`` clusterer.
    """
    # Project-specific tokens to ignore; presumably placeholders inserted by
    # upstream preprocessing (names / URLs) -- confirm with the data pipeline.
    custom_stops = ['tagnom', 'tagurl', 'tag_nom', 'tag_url']

    stop_word_set = set(custom_stops)
    stop_word_set.update(stopwords.words('french'))
    stop_word_set.update(nlp.Defaults.stop_words)
    if extra_stop_words is not None:
        # A bare string would otherwise be added character by character.
        if isinstance(extra_stop_words, str):
            extra_stop_words = [extra_stop_words]
        stop_word_set.update(extra_stop_words)

    # CountVectorizer expects a list; sorting makes its contents deterministic.
    all_stops = sorted(stop_word_set)

    vec = CountVectorizer(stop_words=all_stops)
    empty_dim_model = BaseDimensionalityReduction()

    hdbscan_model = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        cluster_selection_method='eom',
        # Keep the extra data HDBSCAN needs to assign new points later.
        prediction_data=True,
    )

    return vec, empty_dim_model, hdbscan_model