{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6a60e61d-4946-4f95-845c-d4d3c329fb23",
   "metadata": {},
   "source": [
    "# Phase 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73d0d45c-b002-4f32-ae57-90eced7d651f",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true,
     "source_hidden": true
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "from contextlib import redirect_stdout\n",
    "from tqdm import tqdm\n",
    "from time import time\n",
    "from collections import defaultdict\n",
    "from tmu.models.autoencoder.autoencoder import TMAutoEncoder\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from Evaluation import Evaluation\n",
    "from Tools import Tools\n",
    "from DirectoriesUtil import Dicrectories\n",
    "\n",
    "target_similarity=defaultdict(list)\n",
    "clause_weight_threshold = 0\n",
    "clause_drop_p = 0.0\n",
    "factor = 200\n",
    "clauses = 160\n",
    "T = factor*40\n",
    "s = 5.0\n",
    "epochs = 25\n",
    "number_of_examples = 500\n",
    "accumulation = 10\n",
    "sub_accumulation = 10\n",
    "top_max_clauses1 = 0\n",
    "top_max_clauses2 = 0\n",
    "with_clause_update = False\n",
    "max_spearman = 0.9\n",
    "true_weight = 0.7\n",
    "false_weight = 1 - true_weight\n",
    "neg_length = 10\n",
    "\n",
    "eval = Evaluation()\n",
    "def preprocess_text(text):\n",
    "    return text\n",
    "vectorizer_X = Tools.read_pickle_data(\"vectorizer_X.pickle\")\n",
    "feature_names = vectorizer_X.get_feature_names_out()\n",
    "number_of_features = vectorizer_X.get_feature_names_out().shape[0]\n",
    "\n",
    "for dataset_name in os.listdir(Dicrectories.datasets):\n",
    "    if dataset_name == 'mturk-287':\n",
    "        current_folder_path = os.path.join(Dicrectories.datasets, dataset_name)\n",
    "        if os.path.isdir(current_folder_path):\n",
    "            files_start_name = os.path.join(current_folder_path, dataset_name)\n",
    "\n",
    "            pair_list = Tools.get_dataset_pairs(files_start_name)\n",
    "            output_active, target_words = Tools.get_dataset_targets(files_start_name, vectorizer_X, pair_list)\n",
    "            \n",
    "            result_filepath = Dicrectories.test(dataset_name,\"all_phase2\")\n",
    "            with open(result_filepath, 'w') as file, redirect_stdout(file):\n",
    "                tm = TMAutoEncoder(clauses, T, s, output_active, max_included_literals=3, accumulation=accumulation, feature_negation=False, platform='CPU', output_balancing=0.5)\n",
    "                total_training = 0\n",
    "                print(\"Epochs: %d\" % epochs)\n",
    "                print(\"Target words: %d\" % len(target_words))\n",
    "                print(\"No of features: %d\" % number_of_features)\n",
    "                print(\"Clauses: %d\" % clauses)\n",
    "                print(\"with_clause_update: %s\" % with_clause_update)\n",
    "                print(\"Examples: %d\" % number_of_examples)\n",
    "                print(\"Accumulation: %d\" % accumulation)\n",
    "                print(\"Sub Accumulation: %d\" % sub_accumulation)\n",
    "                print(\"true_weight: %f\" % true_weight)\n",
    "                print(\"false_weight: %f\" % false_weight)\n",
    "                print(\"top_max_clauses1: %d\" % top_max_clauses1)\n",
    "                print(\"top_max_clauses2: %d\\n\" % top_max_clauses2)\n",
    "                \n",
    "                epochs_progress_bar = tqdm(total=epochs, desc=\"Running Epochs\")\n",
    "                for e in range(epochs):\n",
    "                    print(\"\\nEpoch #: %d\" % e)\n",
    "                    start_training = time()\n",
    "                    tm.knowledge_fit(\n",
    "                        number_of_examples = number_of_examples,\n",
    "                        number_of_features = number_of_features,\n",
    "                        sub_accumulation = sub_accumulation,\n",
    "                        top_max_clauses1 = top_max_clauses1,\n",
    "                        top_max_clauses2 = top_max_clauses2,\n",
    "                        neg_length = neg_length,\n",
    "                        with_clause_update = with_clause_update,\n",
    "                        true_weight = true_weight,\n",
    "                        false_weight = false_weight,\n",
    "                        print_c = False\n",
    "                        )\n",
    "                    stop_training = time()\n",
    "                    epoch_time = stop_training - start_training\n",
    "                    Tools.print_training_time(epoch_time)\n",
    "                    total_training = total_training + epoch_time\n",
    "\n",
    "                    profile = np.empty((len(target_words), clauses))\n",
    "                    for i in range(len(target_words)):\n",
    "                        weights = tm.get_weights(i)\n",
    "                        profile[i,:] = np.where(weights >= clause_weight_threshold, weights, 0)\n",
    "                    similarity = cosine_similarity(profile)\n",
    "                    for i in range(len(target_words)):\n",
    "                        sorted_index = np.argsort(-1*similarity[i,:])\n",
    "                        for j in range(1, len(target_words)):\n",
    "                            target_similarity[(target_words[i], target_words[sorted_index[j]])]  = similarity[i,sorted_index[j]]\n",
    "                    spearman = eval.calculate(target_similarity,pair_list)\n",
    "                    if spearman > max_spearman:\n",
    "                        break\n",
    "                    epochs_progress_bar.update(1)\n",
    "                epochs_progress_bar.close()\n",
    "\n",
    "                print(\"\\n=====================================\\nClauses\\n=====================================\")\n",
    "                for j in range(clauses):\n",
    "                    print(\"Clause #%-2d \" % (j), end=' ')\n",
    "                    for tw in range(len(target_words)):\n",
    "                        print(\"%s:W%-5d \" % (target_words[tw], tm.get_weight(tw, j)), end='| ')\n",
    "                    l = [] \n",
    "                    number_of_literals = 0 \n",
    "                    for k in range(tm.clause_bank.number_of_literals):\n",
    "                        if tm.get_ta_action(j, k) == 1:\n",
    "                            number_of_literals = number_of_literals + 1\n",
    "                            if k < tm.clause_bank.number_of_features:\n",
    "                                l.append(\"%s(%d)\" % (feature_names[k], tm.clause_bank.get_ta_state(j, k)))\n",
    "                            else:\n",
    "                                l.append(\"¬%s(%d)\" % (feature_names[k-tm.clause_bank.number_of_features], tm.clause_bank.get_ta_state(j, k)))\n",
    "                    print(\": No of features:%-6d\" % (number_of_literals), end=\" ==> \")\n",
    "                    try:\n",
    "                        print(\" - \".join(l))\n",
    "                    except UnicodeEncodeError:\n",
    "                        print(\" exception \")\n",
    "                \n",
    "                print(\"\\n=====================================\\nWord Similarity\\n=====================================\")\n",
    "                max_word_length = len(max(target_words, key=len))\n",
    "                list_of_words = []\n",
    "                target_words_with_min_max = []\n",
    "                for i in range(len(target_words)):\n",
    "                    row_of_similarity = []\n",
    "                    sorted_index = np.argsort(-1*similarity[i,:])\n",
    "                    min_similarity = 1.0\n",
    "                    max_similarity = 0.0\n",
    "                    word_similarity = []\n",
    "                    for j in range(1, len(target_words)):\n",
    "                        target_similarity[(target_words[i], target_words[sorted_index[j]])]  = similarity[i,sorted_index[j]]\n",
    "                        row_of_similarity.append(target_words[sorted_index[j]])\n",
    "                        word_similarity.append(\"{:<{}}({:.2f})  \".format(target_words[sorted_index[j]], max_word_length, similarity[i, sorted_index[j]]))\n",
    "                        if(min_similarity > similarity[i,sorted_index[j]]):\n",
    "                            min_similarity = similarity[i,sorted_index[j]]\n",
    "                        if(max_similarity < similarity[i,sorted_index[j]]):\n",
    "                            max_similarity = similarity[i,sorted_index[j]]\n",
    "                \n",
    "                    output_line = f\"{target_words[i]:<{max_word_length}}: Min:{min_similarity:.2f}, Max:{max_similarity:.2f}\"\n",
    "                    print(output_line, end='     ==> ')\n",
    "                    print(word_similarity)\n",
    "                    list_of_words.append(row_of_similarity)\n",
    "                    target_words_with_min_max.append(output_line)\n",
    "\n",
    "                Tools.print_training_time(total_training)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "efacc3c8-446c-4b46-a838-f754f0041209",
   "metadata": {},
   "source": [
    "# Batched Phase 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3bc4d5c7-4d3a-4dce-87a1-265569d90bfb",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true,
     "source_hidden": true
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "from contextlib import redirect_stdout\n",
    "from tqdm import tqdm\n",
    "from time import time\n",
    "from collections import defaultdict\n",
    "from tmu.models.autoencoder.autoencoder import TMAutoEncoder\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from Evaluation import Evaluation\n",
    "from Tools import Tools\n",
    "from DirectoriesUtil import Dicrectories\n",
    "\n",
    "target_similarity=defaultdict(list)\n",
    "clause_weight_threshold = 0\n",
    "clause_drop_p = 0.0\n",
    "factor = 200\n",
    "clauses = 160\n",
    "T = factor*40\n",
    "s = 5.0\n",
    "epochs = 25\n",
    "number_of_examples = 50\n",
    "accumulation = 14\n",
    "sub_accumulation = 10\n",
    "top_max_clauses1 = 0\n",
    "top_max_clauses2 = 0\n",
    "with_clause_update = False\n",
    "max_spearman = 0.9\n",
    "true_weight = 0.7\n",
    "false_weight = 1 - true_weight\n",
    "neg_length = 10\n",
    "batch_size = 100\n",
    "\n",
    "eval = Evaluation()\n",
    "tools = Tools(Dicrectories.knowledge)\n",
    "def preprocess_text(text):\n",
    "    return text\n",
    "vectorizer_X = Tools.read_pickle_data(\"vectorizer_X.pickle\")\n",
    "feature_names = vectorizer_X.get_feature_names_out()\n",
    "number_of_features = vectorizer_X.get_feature_names_out().shape[0]\n",
    "\n",
    "for dataset_name in os.listdir(Dicrectories.datasets):\n",
    "    if dataset_name == 'mturk-771':\n",
    "        current_folder_path = os.path.join(Dicrectories.datasets, dataset_name)\n",
    "        if os.path.isdir(current_folder_path):\n",
    "            files_start_name = os.path.join(current_folder_path, dataset_name)\n",
    "    \n",
    "            pair_list = Tools.get_dataset_pairs(files_start_name)\n",
    "            # print(pair_list)\n",
    "            output_active, target_words = Tools.get_dataset_targets(files_start_name, vectorizer_X, pair_list)\n",
    "            # print(output_active)\n",
    "            # print(target_words)\n",
    "            num_batches = len(target_words) // batch_size + (len(target_words) % batch_size != 0)\n",
    "            total_training = 0\n",
    "    \n",
    "            result_filepath = Dicrectories.test(dataset_name,\"batched_phase2\")\n",
    "            with open(result_filepath, 'w') as file, redirect_stdout(file):\n",
    "                print(\"Epochs: %d\" % epochs)\n",
    "                print(\"Target words: %d\" % len(target_words))\n",
    "                print(\"No of features: %d\" % number_of_features)\n",
    "                print(\"Clauses: %d\" % clauses)\n",
    "                print(\"with_clause_update: %s\" % with_clause_update)\n",
    "                print(\"Examples: %d\" % number_of_examples)\n",
    "                print(\"Accumulation: %d\" % accumulation)\n",
    "                print(\"Sub Accumulation: %d\" % sub_accumulation)\n",
    "                print(\"true_weight: %f\" % true_weight)\n",
    "                print(\"false_weight: %f\" % false_weight)\n",
    "                print(\"neg length: %d\" % neg_length)\n",
    "                print(\"s: %f\" % s)\n",
    "                print(\"top_max_clauses1: %d\" % top_max_clauses1)\n",
    "                print(\"top_max_clauses2: %d\\n\" % top_max_clauses2)\n",
    "                \n",
    "                for batch_idx in range(num_batches):\n",
    "                    print(f\"\\nTraining Batch {batch_idx+1}/{num_batches}\")\n",
    "                    start_batch = batch_idx * batch_size\n",
    "                    end_batch = start_batch + batch_size\n",
    "                    if end_batch > len(target_words):\n",
    "                        end_batch = len(target_words)\n",
    "                    current_output_active = output_active[start_batch:end_batch]\n",
    "                    print(current_output_active)\n",
    "                    tm = TMAutoEncoder(clauses, T, s, current_output_active, max_included_literals=3, accumulation=accumulation, feature_negation=False, platform='CPU', output_balancing=0.5)\n",
    "                    epochs_progress_bar = tqdm(total=epochs, desc=\"Running Epochs\")\n",
    "                    for e in range(epochs):\n",
    "                        start_training = time()\n",
    "                        print(f\"\\nEpoch {e+1}/{epochs}\")\n",
    "                        tm.knowledge_fit(\n",
    "                            number_of_examples = number_of_examples,\n",
    "                            number_of_features = number_of_features,\n",
    "                            sub_accumulation = sub_accumulation,\n",
    "                            top_max_clauses1 = top_max_clauses1,\n",
    "                            top_max_clauses2 = top_max_clauses2,\n",
    "                            neg_length = neg_length,\n",
    "                            tools = tools,\n",
    "                            with_clause_update = with_clause_update,\n",
    "                            true_weight = true_weight,\n",
    "                            false_weight = false_weight,\n",
    "                            print_c = False\n",
    "                        )\n",
    "                        stop_training = time()\n",
    "                        epoch_time = stop_training - start_training\n",
    "                        Tools.print_training_time(epoch_time)\n",
    "                        total_training = total_training + epoch_time\n",
    "    \n",
    "                        profile = np.empty((len(current_output_active), clauses))\n",
    "                        for i in range(len(current_output_active)):\n",
    "                            weights = tm.get_weights(i)\n",
    "                            profile[i,:] = np.where(weights >= clause_weight_threshold, weights, 0)\n",
    "                        similarity = cosine_similarity(profile)\n",
    "                        for i in range(len(current_output_active)):\n",
    "                            sorted_index = np.argsort(-1*similarity[i,:])\n",
    "                            for j in range(1, len(current_output_active)):\n",
    "                                target_similarity[(target_words[start_batch+i], target_words[start_batch+sorted_index[j]])]  = similarity[i,sorted_index[j]]\n",
    "                        # print(target_similarity)\n",
    "                        spearman = eval.calculate(target_similarity, pair_list)\n",
    "                        epochs_progress_bar.update(1)\n",
    "                    epochs_progress_bar.close()\n",
    "    \n",
    "                print(f\"\\nTotal training time: {total_training:.2f}s\")\n",
    "                print(f\"Total evaluation on {len(target_similarity)} word pairs:\")\n",
    "                spearman = eval.calculate(target_similarity, pair_list)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
