{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "114306db-240d-4551-bbc0-312ba048834d",
   "metadata": {},
   "source": [
    "# Word2Vec with OneBillion"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7dbc0f4-65ff-4019-86a8-542e95779cbe",
   "metadata": {},
   "source": [
    "## Prepare"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e28d767a-8e22-437c-84fc-dc0fa545156d",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true,
     "source_hidden": true
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Vectorizer\n",
      "Loading Data\n"
     ]
    }
   ],
   "source": [
    "from gensim.models import Word2Vec\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from scipy.stats import spearmanr\n",
    "import numpy as np\n",
    "import os\n",
    "import random\n",
    "random.seed(42)\n",
    "from time import time\n",
    "import pickle\n",
    "from collections import defaultdict\n",
    "from Tools import Tools\n",
    "from scipy.stats import spearmanr\n",
    "from contextlib import redirect_stdout\n",
    "from DirectoriesUtil import Dicrectories\n",
    "\n",
    "# Module-level containers; target_similarity is replaced by a plain dict in the Train section.\n",
    "target_word_weight = defaultdict(list)\n",
    "target_similarity = defaultdict(list)\n",
    "\n",
    "def preprocess_text(text):\n",
    "    \"\"\"Return `text` unchanged; no preprocessing is applied.\"\"\"\n",
    "    return text\n",
    "\n",
    "dataset_name = \"mturk-771\"\n",
    "dataset_dir = os.path.join(\"datasets\", dataset_name)\n",
    "# Common path prefix passed to the Tools dataset helpers later in the notebook.\n",
    "files_start_name = os.path.join(dataset_dir, dataset_name)\n",
    "\n",
    "print(\"Loading Vectorizer\")\n",
    "vectorizer_X = Tools.read_pickle_data(\"vectorizer_X.pickle\")\n",
    "# Fetch the vocabulary once and reuse it rather than rebuilding the array just to read its length.\n",
    "feature_names = vectorizer_X.get_feature_names_out()\n",
    "number_of_features = feature_names.shape[0]\n",
    "print(\"Loading Data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "827fcd4f-933e-49c8-90fb-e274133c61fb",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true,
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['us', 'centers', 'disease', 'control', 'prevention', 'initially', 'advised', 'school', 'systems', 'close', 'outbreaks', 'occurred', 'reversed', 'saying', 'apparent', 'virus', 'meant', 'schools', 'day', 'care', 'stay', 'open', 'even', 'confirmed', 'cases', 'swine', 'flu']\n"
     ]
    }
   ],
   "source": [
    "# either this or the next cell to build sentences\n",
    "X_train = Tools.read_pickle_data(\"X.pickle\")\n",
    "# Read each row's stored column indices straight from the CSR buffers.\n",
    "# Slicing X_train[i] would allocate a new sparse matrix for every document.\n",
    "# tocsr() is expected to be a no-op when the matrix is already CSR, which the\n",
    "# original per-row `.indices` access relied on.\n",
    "X_csr = X_train.tocsr()\n",
    "row_bounds = X_csr.indptr\n",
    "column_ids = X_csr.indices\n",
    "# Indices are taken in their stored order (no sorting), so the word order\n",
    "# within each sentence is the same as with per-row slicing.\n",
    "sentences = [\n",
    "    [feature_names[idx] for idx in column_ids[row_bounds[i]:row_bounds[i + 1]]]\n",
    "    for i in range(X_csr.shape[0])\n",
    "]\n",
    "print(sentences[0])\n",
    "with open('big_sentences.pickle', 'wb') as file:\n",
    "    pickle.dump(sentences, file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b99a7c66-cf00-47f5-a0de-19d2f89358c2",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true,
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['us', 'centers', 'disease', 'control', 'prevention', 'initially', 'advised', 'school', 'systems', 'close', 'outbreaks', 'occurred', 'reversed', 'saying', 'apparent', 'virus', 'meant', 'schools', 'day', 'care', 'stay', 'open', 'even', 'confirmed', 'cases', 'swine', 'flu']\n"
     ]
    }
   ],
   "source": [
    "# run it if you saved the file in previous step\n",
    "# Restores the sentence lists from the pickle cache instead of rebuilding them.\n",
    "with open('big_sentences.pickle', 'rb') as cache_handle:\n",
    "    sentences = pickle.load(cache_handle)\n",
    "print(sentences[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e19c423b-1262-4383-b796-8cb05f512188",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "56261b46-6da6-49e4-996c-e35c6e7c6f74",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true,
     "source_hidden": true
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "771\n",
      "Dataset words count:  1113\n",
      "[36479 34923 38298 ...  8222 39410 17410]\n",
      "['toy', 'surname', 'view', 'sail', 'search', 'record', 'option', 'adhesive', 'covering', 'creation', 'conditions', 'ring', 'rubbish', 'man', 'head', 'skin', 'society', 'money', 'shadow', 'shirt', 'base', 'blue', 'club', 'mortal', 'swimming', 'helmet', 'eye', 'rhythm', 'mission', 'debt', 'supporter', 'neon', 'build', 'mother', 'instrument', 'news', 'line', 'sight', 'timer', 'toe', 'interest', 'stem', 'straight', 'license', 'relation', 'decrease', 'gathering', 'violet', 'snowboarding', 'overhead', 'missile', 'drama', 'meal', 'vessel', 'procedure', 'check', 'oak', 'chick', 'shark', 'print', 'swing', 'meter', 'side', 'budget', 'taxpayer', 'listing', 'basin', 'throne', 'rest', 'pie', 'collision', 'database', 'basketball', 'worm', 'transport', 'rash', 'layer', 'fall', 'horse', 'approving', 'novice', 'appearance', 'color', 'brand', 'disease', 'field', 'snap', 'script', 'smash', 'keyboard', 'playground', 'gas', 'reaction', 'housing', 'cache', 'law', 'bee', 'volcano', 'motorcycle', 'push', 'tie', 'opening', 'poem', 'quantity', 'algorithm', 'copy', 'wife', 'brake', 'executive', 'increase', 'agency', 'store', 'sense', 'buffer', 'danger', 'aircraft', 'diversion', 'papers', 'arc', 'congress', 'cart', 'intensity', 'probability', 'contract', 'loan', 'beat', 'office', 'terms', 'window', 'floor', 'drawer', 'hearing', 'latex', 'bin', 'mere', 'apparel', 'crown', 'lesson', 'participant', 'house', 'school', 'parcel', 'entrance', 'tennis', 'flavor', 'separation', 'flow', 'pond', 'distributor', 'bar', 'decade', 'instrumentation', 'organization', 'sharp', 'hair', 'railway', 'vine', 'barrel', 'happening', 'lace', 'drum', 'mayor', 'trap', 'flat', 'domain', 'point', 'agent', 'cloth', 'squash', 'finger', 'battle', 'height', 'body', 'appliance', 'fruit', 'dinner', 'counter', 'literature', 'publication', 'fishing', 'broadcast', 'formation', 'animal', 'draw', 'pump', 'bend', 'worker', 'stand', 'join', 'movie', 'handbag', 'connection', 'lawyer', 'stream', 'lamb', 'compound', 'step', 
'commitment', 'icon', 'justice', 'plane', 'stock', 'element', 'segment', 'exploitation', 'ego', 'satin', 'piano', 'texture', 'workplace', 'coffee', 'investment', 'toilet', 'beach', 'position', 'bond', 'iron', 'hold', 'sentiment', 'discovery', 'scandal', 'turn', 'deep', 'cave', 'taxi', 'weapon', 'tract', 'armor', 'boot', 'argument', 'query', 'flute', 'union', 'flower', 'instruction', 'message', 'pumpkin', 'desert', 'press', 'chicken', 'contact', 'needle', 'yacht', 'funds', 'drink', 'phantom', 'verse', 'tower', 'lounge', 'brandy', 'birth', 'tin', 'stitch', 'rub', 'course', 'wind', 'jaw', 'sweater', 'trail', 'jay', 'snow', 'bench', 'rice', 'operative', 'ink', 'individual', 'chandler', 'dice', 'thief', 'crack', 'diving', 'cab', 'figure', 'light', 'organ', 'romance', 'wear', 'chief', 'sweet', 'trip', 'dialogue', 'measure', 'soccer', 'opportunity', 'garlic', 'burst', 'coat', 'video', 'bed', 'army', 'tune', 'request', 'party', 'silence', 'hall', 'category', 'recycling', 'stadium', 'coloring', 'site', 'property', 'heritage', 'toast', 'bone', 'governor', 'minute', 'age', 'butterfly', 'hose', 'creativity', 'manager', 'politician', 'trunk', 'grill', 'football', 'serving', 'establishment', 'explanation', 'patio', 'case', 'moment', 'blanket', 'support', 'grip', 'gum', 'column', 'attraction', 'indication', 'taste', 'relief', 'center', 'change', 'corridor', 'distance', 'flight', 'good', 'vault', 'substance', 'official', 'soldier', 'creature', 'frost', 'tree', 'refrigerator', 'temperature', 'location', 'bakery', 'examination', 'reason', 'acoustic', 'equipment', 'wine', 'amount', 'motion', 'weight', 'hit', 'transformation', 'heat', 'melody', 'rock', 'fault', 'bulb', 'product', 'proceedings', 'bird', 'drop', 'stake', 'plantation', 'rainbow', 'driving', 'trouble', 'sex', 'court', 'guarantee', 'temple', 'kitchen', 'member', 'commerce', 'fantasy', 'guess', 'status', 'signal', 'lightning', 'captain', 'tribunal', 'condition', 'life', 'system', 'behavior', 'guru', 'breakfast', 'scale', 
'low', 'revolution', 'black', 'engineering', 'ridge', 'recorder', 'current', 'reward', 'load', 'discharge', 'packaging', 'finish', 'code', 'aim', 'sausage', 'prison', 'anatomy', 'ear', 'wave', 'ferry', 'note', 'attitude', 'herb', 'pipe', 'athletics', 'cube', 'stone', 'province', 'gauge', 'brochure', 'quartz', 'employee', 'fragrance', 'mail', 'glass', 'mask', 'berry', 'ankle', 'approval', 'protection', 'feeling', 'slash', 'kick', 'tourist', 'speculation', 'chamber', 'sheet', 'aroma', 'flour', 'express', 'beef', 'arrangement', 'bit', 'binary', 'cooking', 'holiday', 'murphy', 'objective', 'encouragement', 'sprint', 'chair', 'golf', 'cuisine', 'extract', 'communication', 'event', 'signature', 'parent', 'government', 'legion', 'curve', 'patch', 'stick', 'fabric', 'stranger', 'interaction', 'graphic', 'day', 'brace', 'pan', 'help', 'musician', 'cast', 'diamond', 'luggage', 'server', 'burning', 'ceiling', 'yarn', 'universe', 'butter', 'gem', 'hope', 'security', 'adult', 'projector', 'water', 'country', 'cloud', 'assembly', 'door', 'sun', 'garden', 'commission', 'chess', 'zinc', 'visitor', 'credit', 'fund', 'polyester', 'victory', 'starter', 'endorsement', 'gateway', 'attorney', 'quotation', 'seed', 'relative', 'aspen', 'mount', 'territory', 'knowledge', 'radio', 'woman', 'quiet', 'size', 'cutter', 'leader', 'ticker', 'gamble', 'sleeve', 'shelter', 'foot', 'onion', 'land', 'nut', 'conservation', 'storm', 'outlet', 'table', 'average', 'region', 'language', 'street', 'jury', 'sewing', 'story', 'stance', 'call', 'answer', 'crow', 'coin', 'determination', 'bail', 'museum', 'bottom', 'soup', 'ticket', 'remedy', 'poultry', 'twist', 'chuck', 'cheese', 'obligation', 'treatment', 'stroke', 'priest', 'period', 'chin', 'delivery', 'vacation', 'capital', 'rod', 'framework', 'editor', 'opinion', 'action', 'balance', 'middle', 'textile', 'stage', 'gun', 'deal', 'occurrence', 'sand', 'foundation', 'trade', 'doubt', 'room', 'glove', 'artillery', 'gray', 'lineup', 'child', 'flora', 'plot', 
'skiing', 'childhood', 'gallon', 'structure', 'cartoon', 'shock', 'recreation', 'trainer', 'rate', 'goal', 'couch', 'plastic', 'glue', 'technology', 'washer', 'hen', 'rugby', 'hawk', 'approach', 'company', 'ocean', 'plea', 'document', 'juvenile', 'pick', 'mixture', 'elevator', 'book', 'link', 'encyclopedia', 'receiver', 'cord', 'statement', 'crop', 'farmer', 'cook', 'find', 'seat', 'notice', 'circle', 'mouth', 'retailer', 'tool', 'collection', 'form', 'blow', 'problem', 'cake', 'seminar', 'anniversary', 'nick', 'building', 'rover', 'comparison', 'descent', 'payment', 'crowd', 'stamp', 'game', 'matter', 'implementation', 'count', 'comfort', 'joint', 'mark', 'shoes', 'letter', 'report', 'clock', 'piece', 'urge', 'teaching', 'bay', 'sphere', 'brass', 'painting', 'journey', 'belief', 'fauna', 'lyric', 'son', 'degree', 'person', 'rule', 'billboard', 'driver', 'touch', 'perfume', 'dashboard', 'tongue', 'crush', 'installation', 'scene', 'strategy', 'amusement', 'business', 'phenomenon', 'desk', 'representation', 'campaign', 'arrow', 'technician', 'feedback', 'unit', 'ground', 'attribute', 'beam', 'female', 'place', 'ease', 'trust', 'cat', 'surprise', 'wipe', 'relaxation', 'bathroom', 'inventory', 'roll', 'noise', 'invoice', 'minister', 'desire', 'division', 'cousin', 'star', 'quiz', 'liquid', 'magnolia', 'flyer', 'conclusion', 'horn', 'type', 'lift', 'bunny', 'processing', 'second', 'prose', 'square', 'ball', 'tolerance', 'eight', 'cup', 'tiger', 'commander', 'slice', 'climb', 'nutrition', 'calendar', 'cattle', 'blade', 'influence', 'punishment', 'spark', 'root', 'department', 'citrus', 'work', 'softball', 'health', 'wit', 'blizzard', 'baby', 'male', 'zoo', 'shop', 'hole', 'information', 'mode', 'alien', 'pod', 'skill', 'travel', 'highway', 'baseball', 'jail', 'intervention', 'bank', 'institution', 'insert', 'currency', 'purple', 'garment', 'deficit', 'walk', 'tooth', 'continent', 'lens', 'science', 'spy', 'tax', 'bedroom', 'passage', 'boat', 'solid', 'microwave', 'kiss', 
'religion', 'time', 'cap', 'oval', 'board', 'selection', 'representative', 'success', 'summer', 'scrutiny', 'infinite', 'charge', 'daisy', 'tea', 'bite', 'sole', 'kitty', 'community', 'bug', 'drug', 'slope', 'century', 'gulf', 'alphabet', 'sum', 'pen', 'occupation', 'young', 'workshop', 'radar', 'protocol', 'swim', 'coach', 'recognition', 'picture', 'booklet', 'word', 'reference', 'front', 'fox', 'winter', 'disposition', 'development', 'impulse', 'rubber', 'draft', 'result', 'dock', 'mistake', 'damage', 'rain', 'potato', 'burn', 'gender', 'tank', 'dividend', 'donkey', 'watch', 'space', 'list', 'traveling', 'friend', 'liquor', 'gasoline', 'yard', 'map', 'cut', 'move', 'season', 'tent', 'fiction', 'vocal', 'collapse', 'lien', 'mate', 'satan', 'performer', 'dryer', 'kid', 'bread', 'forest', 'forecast', 'park', 'tail', 'style', 'roof', 'movement', 'question', 'fuel', 'judgment', 'process', 'insect', 'orange', 'income', 'agreement', 'possession', 'choice', 'channel', 'afternoon', 'dress', 'implement', 'flag', 'music', 'song', 'packet', 'iris', 'architecture', 'cell', 'punch', 'circulation', 'yellow', 'acrylic', 'queue', 'spaghetti', 'decision', 'article', 'loss', 'plant', 'apple', 'arm', 'emission', 'porch', 'tomato', 'handle', 'association', 'motive', 'meeting', 'dirt', 'prayer', 'quality', 'energy', 'number', 'police', 'feather', 'steel', 'joke', 'theory', 'cost', 'modification', 'oxygen', 'meat', 'fight', 'study', 'beginner', 'hydrogen', 'shoulder', 'share', 'direction', 'construction', 'heart', 'racing', 'operator', 'student', 'duty', 'mechanism', 'brush', 'genre', 'aluminum', 'cafe', 'printer', 'hurt', 'tub', 'pyramid', 'environment', 'cover', 'branch', 'furniture', 'production', 'activity', 'access', 'doctor', 'card', 'slave', 'writing', 'occasion', 'lion', 'strip', 'dad', 'athlete', 'hack', 'customers', 'pink', 'bishop', 'brother', 'flame', 'sauce', 'chance', 'software', 'secretary', 'distribution', 'purpose', 'padding', 'notebook', 'newspaper', 'control', 'ray', 
'topic', 'garbage', 'faith', 'magic', 'radiation', 'pinnacle', 'simulation', 'dash', 'flesh', 'profit', 'hood', 'situation', 'daughter', 'certificate', 'stuff', 'red', 'hand', 'speech', 'net', 'device', 'brain', 'flash', 'parade', 'warranty', 'postage', 'textbook', 'soil', 'band', 'rise', 'surface', 'crew', 'bulletin', 'evidence', 'salt', 'image', 'spot', 'examiner', 'cement', 'birthday', 'memory', 'assets', 'gear', 'metal', 'format', 'bait', 'novel', 'stretch', 'rocker', 'sound', 'dressing', 'editing', 'level', 'account', 'cylinder', 'vision', 'grass', 'bill', 'candy', 'eagle', 'racer', 'break', 'corruption', 'craft', 'score', 'welfare', 'personnel', 'nickel', 'piazza', 'character', 'display', 'officer', 'week', 'variety', 'freeze', 'permission', 'dog', 'drill', 'food', 'carriage', 'dentist', 'lake', 'machine', 'hockey', 'alloy', 'bun', 'operation', 'poker', 'origin', 'wagon', 'chemical', 'devil', 'growth', 'skull', 'weather', 'package', 'egg', 'box', 'power', 'pressure', 'church', 'play', 'knife', 'pupil', 'tsunami', 'chain', 'jumper', 'voice', 'feature', 'digit', 'area', 'addition', 'smell', 'hamburger', 'adjustment', 'click', 'boy', 'target', 'text', 'spending', 'ass', 'knight', 'maple', 'prince', 'balloon', 'computer', 'alarm', 'brick', 'girl', 'film', 'creek', 'wolf', 'illness']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running Words: 100%|██████████| 1112/1112 [00:00<00:00, 330526.97it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "import logging\n",
    "from gensim.models.callbacks import CallbackAny2Vec\n",
    "\n",
    "pair_list = Tools.get_dataset_pairs(files_start_name)\n",
    "print(len(pair_list))\n",
    "output_active, target_words = Tools.get_dataset_targets(files_start_name, vectorizer_X, pair_list)\n",
    "print(output_active)\n",
    "print(target_words)\n",
    "\n",
    "# Initialize parameters\n",
    "vector_size = 100  # Size of the word vectors\n",
    "window = 5         # Context window size\n",
    "min_count = 1      # Minimum word count to include in the model\n",
    "epochs = 25\n",
    "\n",
    "\n",
    "class EpochLogger(CallbackAny2Vec):\n",
    "    \"\"\"Gensim callback that prints one line after every finished training epoch.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        self.epoch = 0\n",
    "\n",
    "    def on_epoch_end(self, model):\n",
    "        self.epoch += 1\n",
    "        print(f'Epoch {self.epoch} finished')\n",
    "\n",
    "\n",
    "result_filepath = Dicrectories.test(dataset_name, \"word2vec_progress\")\n",
    "# Everything printed inside this block goes to the progress file, not the notebook output.\n",
    "with open(result_filepath, 'w') as file, redirect_stdout(file):\n",
    "    # Turn off Gensim's default INFO logs\n",
    "    logging.getLogger('gensim').setLevel(logging.WARNING)\n",
    "\n",
    "    # Train Word2Vec model\n",
    "    print(\"\\nTraining Word2Vec model...\")\n",
    "    # The timer brackets the training call itself, so epoch_time reports the\n",
    "    # training cost. The callback must be registered here, otherwise the\n",
    "    # per-epoch lines are never written.\n",
    "    start_training = time()\n",
    "    model = Word2Vec(\n",
    "        sentences=sentences,\n",
    "        vector_size=vector_size,\n",
    "        window=window,\n",
    "        min_count=min_count,\n",
    "        sg=1,\n",
    "        epochs=epochs,\n",
    "        callbacks=[EpochLogger()],\n",
    "    )\n",
    "    stop_training = time()\n",
    "    epoch_time = stop_training - start_training\n",
    "\n",
    "    # Collect the vectors of the target words and report the ones the model never saw.\n",
    "    profile = []\n",
    "    for word in tqdm(target_words, desc=\"Running Words\"):\n",
    "        if word in model.wv:\n",
    "            profile.append(model.wv[word])\n",
    "        else:\n",
    "            print(f\"Word '{word}' not found in the model's vocabulary.\")\n",
    "    profile = np.array(profile)\n",
    "\n",
    "    # NOTE: missing words are skipped above, so these rows do not line up with\n",
    "    # target_words when any word is absent; the Score section rebuilds an aligned matrix.\n",
    "    similarity = cosine_similarity(profile)\n",
    "    target_similarity = {}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e7e6b85e-3c66-463d-a9e1-4e1e7ac00b54",
   "metadata": {},
   "source": [
    "## Score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d95a1738-1258-4704-9eae-5a99af22cbd0",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true,
     "source_hidden": true
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-08-25 16:57:03,748 : INFO : Note: detected 96 virtual cores but NumExpr set to maximum of 64, check \"NUMEXPR_MAX_THREADS\" environment variable.\n",
      "2024-08-25 16:57:03,749 : INFO : Note: NumExpr detected 96 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n",
      "2024-08-25 16:57:03,749 : INFO : NumExpr defaulting to 8 threads.\n"
     ]
    }
   ],
   "source": [
    "# Save the results\n",
    "from scipy.stats import kendalltau\n",
    "import pandas as pd\n",
    "\n",
    "result_filepath = Dicrectories.test(dataset_name, \"word2vec\")\n",
    "# All prints inside this block are written to the result file.\n",
    "with open(result_filepath, 'w') as file, redirect_stdout(file):\n",
    "    print(\"\\nEvaluating Word2Vec Model Over %d Epochs:\" % epochs)\n",
    "    print(\"No of features: %d\" % number_of_features)\n",
    "    Tools.print_training_time(epoch_time)\n",
    "\n",
    "    # Extract word vectors for target words; words missing from the model keep an all-zero row\n",
    "    profile = np.zeros((len(target_words), vector_size))\n",
    "    for i, word in enumerate(target_words):\n",
    "        if word in model.wv:\n",
    "            profile[i, :] = model.wv[word]\n",
    "\n",
    "    # Calculate cosine similarity\n",
    "    similarity = cosine_similarity(profile)\n",
    "\n",
    "    # Start from an empty mapping so re-running this cell never mixes in stale pairs.\n",
    "    target_similarity = {}\n",
    "    # Store every ordered pair of distinct rows directly. Dropping the first argsort\n",
    "    # hit only removes the word itself when its self-similarity is the strict row\n",
    "    # maximum, which does not hold for all-zero rows.\n",
    "    for i, first_word in enumerate(target_words):\n",
    "        for j, second_word in enumerate(target_words):\n",
    "            if i != j:\n",
    "                target_similarity[(first_word, second_word)] = similarity[i, j]\n",
    "\n",
    "    calculated_score = []\n",
    "    extracted_list = []\n",
    "    original_score = []\n",
    "    word_pairs = []\n",
    "    # x is looked up as a target_similarity key and y is kept as the reference score.\n",
    "    for (x, y) in pair_list:\n",
    "        if x in target_similarity:\n",
    "            # Scaled by 10, presumably to match the dataset's rating range - TODO confirm.\n",
    "            word1_prof = target_similarity[x] * 10\n",
    "            extracted_list.append((x, word1_prof))\n",
    "            calculated_score.append(word1_prof)\n",
    "            original_score.append(y)\n",
    "            word_pairs.append(x)\n",
    "\n",
    "    if len(calculated_score) < 2:\n",
    "        # The correlation measures below need at least two matched pairs.\n",
    "        print(\"Not enough dataset pairs matched the model vocabulary to compute correlations.\")\n",
    "    else:\n",
    "        spearman_TM = spearmanr(original_score, calculated_score)\n",
    "        spearman_TM = round(spearman_TM[0], 3)\n",
    "        print(f'Spearman Word2Vec: {spearman_TM}')\n",
    "\n",
    "        total_list = [original_score, calculated_score]\n",
    "\n",
    "        # Kept under its own name so `similarity` still holds the word-by-word matrix.\n",
    "        score_vector_cosine = cosine_similarity(total_list)\n",
    "        print(f'Cosine Word2Vec \\n{score_vector_cosine}')\n",
    "\n",
    "        Word2Vec_corr = np.corrcoef(original_score, calculated_score)\n",
    "        print(f'Pearson Word2Vec \\n{Word2Vec_corr}')\n",
    "\n",
    "        kendal_Word2Vec, _ = kendalltau(original_score, calculated_score)\n",
    "        print(f'Kendal Word2Vec: {kendal_Word2Vec}')\n",
    "\n",
    "        data = pd.DataFrame([original_score, calculated_score])\n",
    "        data = data.transpose()\n",
    "        data.columns = ['Original', 'Word2Vec']\n",
    "        correlation = data.corr()\n",
    "        print(\"Pearson Corr \\n\", correlation)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
