from tqdm import tqdm
from gensim.models import Word2Vec
import random

def _pick_replacement(word_vectors, word, label):
    """Return a replacement for *word* drawn from its Word2Vec neighbours.

    For ``label == 1`` the replacement is picked uniformly from the (up to)
    5 nearest neighbours.  For any other label it is picked uniformly from
    the 5 lowest-ranked entries of the (up to) 400 nearest neighbours.
    If the model returns no neighbours, *word* is returned unchanged.
    """
    if label == 1:
        candidates = word_vectors.most_similar(word, topn=5)
    else:
        # NOTE: these are the least similar of the top-400 neighbours, not
        # the globally most dissimilar words in the vocabulary.
        candidates = word_vectors.most_similar(word, topn=400)[::-1][:5]

    if not candidates:
        # Nothing to choose from (e.g. single-word vocabulary): keep the word.
        return word
    return random.choice(candidates)[0]


def perturb(X_train_text, y_train, model_path, percent_to_change=5):
    """Perturb a fraction of the words in each document using Word2Vec.

    Each document is split on whitespace and roughly ``percent_to_change``
    percent of its token positions are sampled at random.  A sampled token
    that is present in the Word2Vec vocabulary is swapped for a neighbouring
    word (see ``_pick_replacement``); sampled tokens missing from the
    vocabulary, and all unsampled tokens, are kept as they are.  Every
    output document therefore has the same number of tokens as its input.

    Args:
        X_train_text: Iterable of documents (strings).
        y_train: Labels, looked up as ``y_train[r]`` for the r-th document.
            A label equal to 1 selects near-neighbour replacements; any
            other label selects far-neighbour replacements.
        model_path: Path passed to ``Word2Vec.load``.
        percent_to_change: Percentage of token positions to sample per
            document.  Values outside 0-100 are clamped.

    Returns:
        A list of perturbed documents, each re-joined with single spaces.
    """
    word_vectors = Word2Vec.load(model_path).wv
    X_train_perturbed = []

    for r, doc in enumerate(tqdm(X_train_text)):
        label = y_train[r]
        tokens = doc.split()

        # Multiply before dividing so float rounding cannot push an exact
        # count just below the integer, then clamp so random.sample never
        # receives a negative or too-large sample size.
        num_to_change = int(len(tokens) * percent_to_change / 100)
        num_to_change = max(0, min(num_to_change, len(tokens)))
        indices_to_change = set(random.sample(range(len(tokens)), num_to_change))

        # Walk the whole document: stopping early once the quota is met
        # would drop every token after the last replaced position.
        new_tokens = []
        for i, word in enumerate(tokens):
            if i in indices_to_change and word in word_vectors:
                new_tokens.append(_pick_replacement(word_vectors, word, label))
            else:
                new_tokens.append(word)

        X_train_perturbed.append(' '.join(new_tokens))

    return X_train_perturbed