{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import csv\n",
    "import numpy as np\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "# nlp\n",
    "import spacy\n",
    "import benepar\n",
    "import nltk\n",
    "# benepar_en didn't work\n",
    "fresh = False\n",
    "if fresh:\n",
    "    benepar.download('benepar_en3')\n",
    "from benepar import BeneparComponent\n",
    "#from benepar.spacy_plugin import BeneparComponent\n",
    "from spacy.matcher import Matcher\n",
    "\n",
    "## machine learning\n",
    "from sklearn.metrics import pairwise_distances\n",
    "from sklearn.metrics.pairwise import pairwise_kernels\n",
    "from sklearn.decomposition import PCA\n",
    "from sentence_transformers import SentenceTransformer\n",
    "# clustering and dendrogram\n",
    "from scipy.cluster.hierarchy import dendrogram\n",
    "from scipy.cluster.hierarchy import linkage\n",
    "from scipy.cluster.hierarchy import cophenet\n",
    "from scipy.spatial.distance import pdist\n",
    "from sklearn.cluster import AgglomerativeClustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "## homegrown library functions\n",
    "from imp import reload\n",
    "import concept_processing\n",
    "\n",
    "import concept_processing.io\n",
    "reload(concept_processing.io)\n",
    "# for loading original data\n",
    "from concept_processing.io import get_datapoint_iterator\n",
    "from concept_processing.io import load_concept_examples\n",
    "from concept_processing.io import capture_all_concepts\n",
    "from concept_processing.io import get_datapoint_iterator\n",
    "\n",
    "# nlp functionality for extracting concepts from text\n",
    "import concept_processing.extraction\n",
    "reload(concept_processing.extraction)\n",
    "from concept_processing.extraction import add_inc_and_exc_matchers\n",
    "from concept_processing.extraction import is_concept\n",
    "from concept_processing.extraction import iterate_concepts\n",
    "from concept_processing.extraction import iterate_concepts_in_span\n",
    "from concept_processing.extraction import extract_concepts\n",
    "\n",
    "# dealing with the concepts as strings\n",
    "import concept_processing.concepts\n",
    "reload(concept_processing.concepts)\n",
    "from concept_processing.concepts import concept_dict_to_list\n",
    "from concept_processing.concepts import display_most_frequent_concepts\n",
    "from concept_processing.concepts import build_embedding_matrix\n",
    "\n",
    "# manipulating data as a presence/absence matrix\n",
    "import concept_processing.pam\n",
    "reload(concept_processing.pam)\n",
    "from concept_processing.pam import convert_raw_bof_to_pam\n",
    "from concept_processing.pam import prune_and_reindex_concepts\n",
    "from concept_processing.pam import count_features_in_each_datapoint\n",
    "from concept_processing.pam import count_datapoints_in_each_feature\n",
    "\n",
    "import concept_processing.io\n",
    "reload(concept_processing.io)\n",
    "from concept_processing.io import store_concept_objects\n",
    "from concept_processing.io import load_concept_objects\n",
    "from concept_processing.io import form_data_dirname\n",
    "from concept_processing.io import form_processed_fname\n",
    "from concept_processing.io import get_file_info\n",
    "\n",
    "\n",
    "import concept_processing.labels\n",
    "reload(concept_processing.labels)\n",
    "from concept_processing.labels import create_labels_as_indices\n",
    "from concept_processing.labels import label_indices_to_one_hot\n",
    "\n",
    "\n",
    "# Dendrogram/hierarchical clustering\n",
    "import concept_processing.plot_support\n",
    "reload(concept_processing.plot_support)\n",
    "from concept_processing.plot_support import plot_rank_versus_freq\n",
    "from concept_processing.plot_support import plot_dendrogram\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Survey Data\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/luke/.local/lib/python3.8/site-packages/torch/cuda/__init__.py:52: UserWarning: CUDA initialization: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx (Triggered internally at  /pytorch/c10/cuda/CUDAFunctions.cpp:100.)\n",
      "  return torch._C._cuda_getDeviceCount() > 0\n"
     ]
    }
   ],
   "source": [
    "# load nlp model (for constituency parser)\n",
    "# and create inclusion and exclusion matchers adding\n",
    "# these to the nlp pipeline.\n",
    "datastem = '20210504'\n",
    "modelstem = 'lg'\n",
    "if modelstem == 'sm':\n",
    "    nlp = spacy.load(\"en_core_web_sm\")\n",
    "elif modelstem == 'md':\n",
    "    nlp = spacy.load(\"en_core_web_md\")\n",
    "elif modelstem == 'lg':\n",
    "    nlp = spacy.load(\"en_core_web_lg\")\n",
    "else:\n",
    "    raise ValueError(f\"Unrecognised modelstem: {modelstem}\")\n",
    "incmatcher, excmatcher = add_inc_and_exc_matchers(nlp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# path location of files\n",
    "\n",
    "datapath = form_data_dirname(datastem)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "188_5sjzeb4w03.csv, (id_, label, text) = ('6WBL9G9QUQT7', 'ball', 'the umpire did not call it a strike')\n",
      "(id_, label, text) = ('QY1T0U8H5SR6', 'foul', 'ball was hit out of bounds')\n",
      "(id_, label, text) = ('7F0T5YP684G5', 'play', 'ball was hit into the outfield')\n",
      "(id_, label, text) = ('IJJHE4LLFDNK', 'strike', 'the batter swung')\n",
      "(id_, label, text) = ('YO21SMDKHEZ0', 'foul', 'ball went outside the 3rd base line')\n",
      "(id_, label, text) = ('O7B3HXXJWT3F', 'strike', 'the umpire called the ball a strike')\n",
      "(id_, label, text) = ('6X6FAHGVQURW', 'foul', 'ball was hit out of bounds')\n",
      "(id_, label, text) = ('VL0L2H9HFYRU', 'out', 'the ball was caught by the out fielder')\n",
      "(id_, label, text) = ('YF11BLVZJV4P', 'strike', 'batter swung and missed')\n",
      "(id_, label, text) = ('9FGH7JM6K6R5', 'out', 'short stop caught the ball in the air')\n",
      "129_h8c8ny4x7u.csv, (id_, label, text) = ('AQJIV3RI7AN5', 'foul', \"Yasiel Puig was at bat with a fresh count and struck the ball but it went foul. Puig didn't put the ball in play. He stayed alive to face another pitch.\")\n",
      "(id_, label, text) = ('TSR0FTD92YV2', 'strike', 'the pitch clipped the corner of the strike zone on the side of the plate opposite the batter. Also, the catcher framed the pitch well, as it may have been just off of the plate. The umpire rightfully called it a strike.')\n",
      "Fails with line: TSR0FTD92YV2,strike,the pitch clipped the corner of the strike zone on the side of the plate opposite the batter. Also, the catcher framed the pitch well, as it may have been just off of the plate. The umpire rightfully called it a strike.\n"
     ]
    }
   ],
   "source": [
    "# show some of the data\n",
    "show_example_data = True\n",
    "if show_example_data:\n",
    "    for i, (id_, label, text) in enumerate(get_datapoint_iterator(datapath)):\n",
    "            print(f\"(id_, label, text) = {(id_, label, text)}\")\n",
    "            if i > 10:\n",
    "                break\n",
    "else:\n",
    "    print(\"No output requested.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "188_5sjzeb4w03.csv, 129_h8c8ny4x7u.csv, 197_ni25hikvxf.csv, 4_xu9afrhg0o.csv, 142_d8jaik03id.csv, 38_38doqz3unu.csv, 18_omkydub7qy.csv, 8_hfxm8dkunn.csv, 195_nwk7f5mq3h.csv, 119_hdkgwx4tgr.csv, 100_5397borrxc.csv, 31_jhmyyhmjh5.csv, 166_5lv4sjg1qv.csv, 10_ten3l2nuwn.csv, 184_hzlbex83q3.csv, 55_o97cc78tyd.csv, 179_enmmbxkd9b.csv, 135_7g73f18y8h.csv, 49_8fcz4c6oqy.csv, 164_qcoq7yni4y.csv, 118_66k0pevslq.csv, 54_07djfakfxh.csv, 141_tvldgnl6pt.csv, 138_yx53ai19ar.csv, 107_jfii3uf345.csv, 24_1y633kulxy.csv, 27_adgb2e3ry7.csv, 155_97dannvd9q.csv, 16_omnxwedc88.csv, 83_slgfcmc4xj.csv, 25_ceghr7ntgk.csv, 145_1p84lqxs5t.csv, 90_60znzzhvt3.csv, 187_kp3lfo9rti.csv, 109_ehl2s3ah2w.csv, 1_z949c1i541.csv, 151_8k3mbw3phz.csv, 3_5uxjfwf4c9.csv, 26_2k5thw2ykv.csv, 46_eg7yytwm57.csv, 102_r9vbr9i6qn.csv, 159_wmf9y5lb3d.csv, 122_hmocfdllhz.csv, 110_zq7vq985bq.csv, 53_4t2mju8m3y.csv, 26_kcs08kmlzv.csv, 150_hvfxdsp6uq.csv, 1_ofoaso8yxd.csv, 189_4mh7pjb6hj.csv, 21_x90g7ie9t7.csv, 20_z3zjd5sgm4.csv, 39_3064xz3lgl.csv, 51_km3hwahw1m.csv, 32_jitoqenvci.csv, 23_lce4ue1yit.csv, 2_wrrttduqx0.csv, 29_juqgc9mrcx.csv, 64_gjf3ihxyfj.csv, 73_4s2wt6mgps.csv, 146_umxfdzx3sg.csv, 79_wkvkhcnsn0.csv, 88_p169fqr8hl.csv, 35_g0ldvn0rjm.csv, 163_g4ownm07q9.csv, 58_lvcsk0yy8g.csv, 158_lhrhfgkazf.csv, 111_4dftcjzv4j.csv, 97_ksnb8zg7e2.csv, 131_bivn5or23h.csv, 178_7rflrddsnn.csv, 27_r6anjd4ie1.csv, 171_evt2lbu5wz.csv, 7_viucp080gt.csv, 59_tvvx1filo9.csv, 92_u8wrf1zhh5.csv, 191_yxm7fc16uc.csv, 113_s8v38lnghb.csv, 68_4fxf0cl1sa.csv, 47_qrdef2nnjn.csv, 71_jjxizibivj.csv, 28_7svph4yobz.csv, 182_icy94tmqbq.csv, 37_7swqi7vgnl.csv, 7_swfaqmk6y6.csv, 194_tnwwh3zbnj.csv, 20_1zahygarmq.csv, 161_qu7m9mil3a.csv, 125_onkofvwzgk.csv, 67_ip2ys2xhib.csv, 56_lxflttkcgn.csv, 148_nekma0liim.csv, 85_i7f88e7b7g.csv, 123_g66af1qsqo.csv, 140_rpfb54del9.csv, 114_fkhid5yyst.csv, 19_sevtqojtws.csv, 160_9xq61i8z5q.csv, 76_f4jqlrtl0d.csv, 57_l8ux51os5l.csv, 147_qz39am2soc.csv, 4_vet6mgh5md.csv, 106_oa27xk4eqt.csv, 
33_zybtjc12ef.csv, 21_x8u5ofrlzo.csv, 66_8gquz8hbpz.csv, 63_j79b3nmr83.csv, 84_q4rh6yklie.csv, 86_kvt6l8gw8d.csv, 16_j1uppkkwpc.csv, 192_isa0drpmu4.csv, 5_lcampqa5d8.csv, 169_2y8d92hhek.csv, 74_08dh1ae49m.csv, 41_rrwls6y9fl.csv, 36_4bg4d4bf4q.csv, 117_mgq0ymf5hc.csv, 65_5y9bhpar44.csv, 174_vw5a8s9vzj.csv, 165_vcamdtt7pl.csv, 75_gginvgfabf.csv, 14_apv4u3qebs.csv, 69_4tlegp20n1.csv, 103_v129kmq3c5.csv, 17_qvtzqld86i.csv, 15_jzm9dvtw73.csv, 183_ex67vzlp8e.csv, 14_mx46s0fx52.csv, 6_qyv4vhzoe.csv, 78_tshg4zfas1.csv, 82_3qr5zxdcxe.csv, 48_u6mx1k6p64.csv, 193_kjtylxs5x6.csv, 91_6nvvo85k2r.csv, 185_9bjzxvbwca.csv, 156_yk739g3286.csv, 70_0b98yn7zuy.csv, 154_i7n85wzpa3.csv, 132_s5i7ngx7bu.csv, 180_jm6cn0n08c.csv, 104_jos9y36cxr.csv, 170_szpuak9x3b.csv, 61_l3jkp84irv.csv, 168_39vz7oesxk.csv, 101_9l21auewdi.csv, 15_j7tl3bmirm.csv, 134_behtf6grhp.csv, 124_4p9vzltfyn.csv, 80_zbkdixjx1u.csv, 157_7onih683qj.csv, 22_udsmb087i5.csv, 13_y1i6zh8y9w.csv, 96_fe55r9bnfd.csv, 120_tfdaxlk9n2.csv, 130_7a6m1my07s.csv, 121_aobrexg5sv.csv, 115_iocn2dpj2o.csv, 11_0na6xsa4cd.csv, 77_gup8yvrjg8.csv, 181_wk573tzjde.csv, 42_kr1ailiclk.csv, 43_1nemhlm74b.csv, 40_zh8fjx6tiy.csv, 11_7vk8oa1o24.csv, 198_shm86f7xzz.csv, 167_bk26q01ex6.csv, 99_7fcky2iv54.csv, 0_imfn3fsnqb.csv, 199_hzu91su6jo.csv, 44_zsqcy1dp0n.csv, 24_pa58yc2bsn.csv, 105_3acmrslh5t.csv, 18_o65lxoj85y.csv, 52_wfyl6gdoxn.csv, 62_6v7alh2xza.csv, 6_blz9uaf562.csv, 176_cl4orgir7l.csv, 25_zdmhvwm1pk.csv, 95_6ikda3av09.csv, 139_1qp7vpj70o.csv, 89_28gvnlaz8j.csv, 175_954mvwcwet.csv, 98_8et5hi6nqf.csv, 81_dbvr7ry56o.csv, 143_w11kkydg7n.csv, 2_9actgs9mi7.csv, 93_7s7jo61qgr.csv, 126_s6vu9wsf2o.csv, 50_tmt7xio73x.csv, 60_zry6xyk2v2.csv, 127_q2ifamcjop.csv, 173_op7k51t6eh.csv, 153_mwu7z89cr2.csv, 149_tvntt1n3zf.csv, 136_dd735psc8g.csv, 87_zzzdc841sd.csv, 186_77g5px55z1.csv, 133_c0kk72b3fm.csv, 12_i0qb55ygnr.csv, 128_y3nqk9k0x4.csv, 144_6g4qagf0aw.csv, 116_wf72qb1ctj.csv, 172_gg34wiyfxn.csv, 34_egy98kczu6.csv, 152_uhh1hem3ei.csv, 162_gzcq0eli5q.csv, 
3_j0hn3a0xzt.csv, 9_m11ut0fzvo.csv, 13_xb7ph2k0m6.csv, 5_wa4f0flvdd.csv, 94_gcso78g1mh.csv, 112_7g35kmtkhr.csv, 72_37k8qe6uf3.csv, 45_4g2awfbbtn.csv, 196_h8ebmn0b4i.csv, 30_rb578trmsj.csv, 8_v4bqc7mvzy.csv, 108_qkxdz8djgt.csv, 137_eusnnkienm.csv, "
     ]
    }
   ],
   "source": [
    "# discover all concepts in the directory chosen.\n",
    "concept_dict, ids, label_strs, raw_bofs = capture_all_concepts(nlp, datapath, incmatcher, excmatcher)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data.shape = (2181, 1963)\n",
      "We have 2181 datapoints with 1963 independent concepts.\n"
     ]
    }
   ],
   "source": [
    "# now we convert the raw bag of concepts list-of-lists\n",
    "# to a presence absence matrix (pam)\n",
    "pam =  convert_raw_bof_to_pam(raw_bofs, C=len(concept_dict))\n",
    "print(f\"data.shape = {pam.shape}\")\n",
    "print(f\"We have {pam.shape[0]} datapoints with {pam.shape[1]} independent concepts.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No output requested.\n"
     ]
    }
   ],
   "source": [
    "# extract the concepts as a list\n",
    "concepts = concept_dict_to_list(concept_dict)\n",
    "show_concepts = False\n",
    "if show_concepts:\n",
    "    print(f\"concepts[:20] = {concepts[:20]}\")\n",
    "\n",
    "    print(f\"len(concept_dict) = {len(concept_dict)}\")\n",
    "    print(f\"concept_dict['it hit the ground before being fielded'] = {concept_dict['it hit the ground before being fielded']}\")\n",
    "else:\n",
    "    print(\"No output requested.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "labels_as_indices[:10] = [0 1 4 5 1 5 1 3 5 3]\n",
      "categories: ['ball' 'foul' 'none' 'out' 'play' 'strike']\n"
     ]
    }
   ],
   "source": [
    "## convert the labels to one hot vectors\n",
    "# there is one odd label that shouldn't be there. We change this to 'strike'.\n",
    "label_strs[label_strs == ' it could be called a strike because the pitch landed in the strike zone before being hit'] = 'strike'\n",
    "labels_as_indices, categories = create_labels_as_indices(label_strs)\n",
    "\n",
    "print(f\"labels_as_indices[:10] = {labels_as_indices[:10]}\")\n",
    "print(f\"categories: {categories}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### What do we have\n",
    " \n",
    "At this stage we have the following parameters:\n",
    "\n",
    "* `concept_dict` (dict) - a dictionary of `C` concepts. Each key is a string representing the raw concept, each value is the index.\n",
    "* `concepts` (list) - the reverse lookup version of concept dict (`concepts[i] = k` where `concept_dict[k]=i`)\n",
    "* `ids` - a list of `N` uniquely identifying labels for the datapoints\n",
    "* `label_strs` - a list of `N` labels stored as strings. One per datapoint.\n",
    "* `categories` - an ordered list of category strings, where `categories[i]` is the string representation of category `i`.\n",
    "* `labels_as_indices` - labels as an index which maps to the label str via the category.\n",
    "* `raw_bofs` - a list of lists of \n",
    "* `pam` - a presence/absence matrix where `pam[i,j]=1` if `i`th datapoint contains `j`th concept.\n",
    "\n",
    "We discard `concept_dict`, `label_strs`, `raw_bofs`, to avoid duplication and as we can recover these from the others. The remainder are passed to the store function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "concepts[688] = video would not load\n",
      "rows_to_remove = [  44  105  113  155  159  183  188  189  196  225  250  255  378  406\n",
      "  460  530  587  661  676  678  799  823  865  910  935  995 1011 1067\n",
      " 1122 1138 1167 1174 1194 1214 1233 1273 1309 1314 1418 1493 1497 1514\n",
      " 1526 1545 1696 1743 1798 1808 1823 1961 1970 1996 2042 2059 2099]\n",
      "concepts_to_remove = [\"catcher's interference\", 'there is no video', 'no video with supported format and mime type found error message', 'then it was a groundout', 'then it is another groundout that is not listed', 'the batter hit it foul', 'the count is already 22', 'the video is broken', 'line drive', 'video did not display or', 'there is an error message', 'no video with supported format and mime type found', \"the pitcher didn't throw the ball in this clip\", \"video didn't\", 'video is not playing', 'he readied for the next pitch', 'the pitcher did not throw the ball', 'video would not load', 'i did not see this as a choice though', 'video did not', 'the video failed to load', 'the baserunner returns to the base after attempting to steal a base', 'there was no activity of a pitch in the video clip', 'this video would not play', 'there is no video', \"the video doesn't load\", 'the video did not load', 'the pitcher did not throw the ball', 'video did not', 'what happened', 'the network showing the game', 'the pitcher did throw the ball', 'they go back to the game', 'video did not load', 'the video did not play', 'and the batter hit it out for a homerun', 'screen is black', 'video did not load', 'video did not work', 'there was no video of the the aftermath of the', 'the video is not working', 'the video did not load', \"the pitcher didn't throw the ball\", 'the batter was awarded an intentional walk', 'the umpire called it a ball or a strike', 'the umpires decision', 'i have to rate this as no activity', 'the clip had lasted longer', \"the umpire's decision\", \"video doesn't\", 'the video is not loading at all', 'once again video wouldnt load', 'it to load', 'video did not', 'the video does not', 'video did not load', 'the video did not load']\n"
     ]
    }
   ],
   "source": [
    "print(f\"concepts[688] = {concepts[688]}\")\n",
    "rows_to_remove = np.where(label_strs=='none')[0]\n",
    "_, cols_to_remove = np.where(pam[\n",
    "    label_strs == 'none',:])\n",
    "concepts_to_remove = [ concepts[col] for col in cols_to_remove ]\n",
    "print(f\"rows_to_remove = {rows_to_remove}\")\n",
    "print(f\"concepts_to_remove = {concepts_to_remove}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "datadir = form_data_dirname(datastem)\n",
    "csvfname = f\"{datastem}.csv\"\n",
    "file_info = np.array(get_file_info(datadir, csvfname))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([\"the umpire called catcher's interference.\", 'there is no video.',\n",
       "       'No video with supported format and MIME type found error message.',\n",
       "       'then it was a groundout but none of those are listed.',\n",
       "       'then it is another groundout that is not listed.', 'Broken video',\n",
       "       'Broken video', 'Broken video',\n",
       "       'the batter hit it foul but the count is already 2-2.',\n",
       "       'The video is broken.', 'the batter got a line drive single.',\n",
       "       'video did not display or load',\n",
       "       'there is an error message. No video with supported format and MIME type found.',\n",
       "       \"the pitcher didn't throw the ball in this clip.\",\n",
       "       \"video didn't load\", 'Video is not playing :(',\n",
       "       'got the ball back from the catcher. He readied for the next pitch.',\n",
       "       'the pitcher did not throw the ball.', 'video would not load.',\n",
       "       'Intentional walk. I did not see this as a choice though.',\n",
       "       'Video did not load',\n",
       "       'the video failed to load and the play button could not be clicked.',\n",
       "       'The baserunner returns to the base after attempting to steal a base.',\n",
       "       'there was no activity of a pitch in the video clip.',\n",
       "       'this video would not play.', 'No video',\n",
       "       'there is no video/will not play', \"The video doesn't load.\",\n",
       "       'This is labeled as none because the video did not load.',\n",
       "       'Bad format on the video.',\n",
       "       '*the pitcher did not throw the ball...', 'video did not load',\n",
       "       \"we don't know what happened. The network showing the game was airing a commercial. If the pitcher did throw the ball, we'll have to wait until they go back to the game to see what happened/\",\n",
       "       'video did not load.',\n",
       "       \"This is a ground out, which isn't an option.\", '',\n",
       "       'The video did not play.', '',\n",
       "       'and the batter hit it out for a homerun.',\n",
       "       'Screen is black. No activity', 'Screen black, No activity.',\n",
       "       'video did not load.', 'video did not work.',\n",
       "       'there was no video of the the aftermath of the pitch',\n",
       "       'the video is not working.', 'The video did not load.',\n",
       "       \"The pitcher didn't throw the ball. The batter was awarded an intentional walk.\",\n",
       "       'the clip ended before we saw if the umpire called it a \"ball\" or a \"strike.\"  this looked like a \"borderline\" pitch, meaning it may have been called a \"strike\" or a \"ball.\" the clip cut off before we could see the umpires decision so i have to rate this as \"no activity.\"  if the clip had lasted longer, we would have seen the umpire\\'s decision.',\n",
       "       \"video doesn't play\",\n",
       "       'The video is not loading at all. Been trying for a few minutes.',\n",
       "       'Once again video wouldn�t load. Spent a few minutes trying to get it to load. Videos 1 and 10 don�t work. 2 through 9 worked great though.',\n",
       "       'video did not play', 'the video does not load',\n",
       "       'video did not load.', 'The video did not load.'], dtype='<U368')"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "file_info[rows_to_remove,-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['6WBL9G9QUQT7', 'QY1T0U8H5SR6', '7F0T5YP684G5', ...,\n",
       "       'E5F1WUMRQZMY', 'K1VN9P1M4P44', 'RGJQBHE7IQGU'], dtype='<U12')"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filter_ = (label_strs=='none')[:10]\n",
    "ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Before removing none label\n",
      "pam.shape = (2181, 1963)\n",
      "len(concepts) = 1963\n",
      "ids.shape = (2181,)\n",
      "labels_as_indices.shape = (2181,)\n",
      "categories = ['ball' 'foul' 'none' 'out' 'play' 'strike']\n",
      "After removing none label\n",
      "pam.shape = (2126, 1915)\n",
      "len(concepts) = 1915\n",
      "ids.shape = (2126,)\n",
      "labels_as_indices.shape = (2126,)\n",
      "categories = ['ball' 'foul' 'out' 'play' 'strike']\n"
     ]
    }
   ],
   "source": [
    "resultsdict = dict(concepts=concepts, ids=ids, categories=categories,\n",
    "                   labels_as_indices=labels_as_indices, pam=pam)\n",
    "remove_none_labels = True\n",
    "if remove_none_labels:\n",
    "    print(\"Before removing none label\")\n",
    "    print(f\"pam.shape = {pam.shape}\")\n",
    "    print(f\"len(concepts) = {len(concepts)}\")\n",
    "    print(f\"ids.shape = {ids.shape}\")\n",
    "    print(f\"labels_as_indices.shape = {labels_as_indices.shape}\")\n",
    "    print(f\"categories = {categories}\")\n",
    "    rows_to_remove = (label_strs=='none')\n",
    "    pam = pam[~rows_to_remove,:]\n",
    "    cols_to_remove = (count_datapoints_in_each_feature(pam) == 0)\n",
    "    pam = pam[:,~cols_to_remove]\n",
    "    concepts = [ concept for concept, to_remove in zip(concepts, cols_to_remove) if not to_remove ]\n",
    "    label_strs = label_strs[~rows_to_remove]\n",
    "    labels_as_indices, categories = create_labels_as_indices(label_strs)\n",
    "    ids = ids[~rows_to_remove]                                                   \n",
    "    resultsdict = dict(concepts=concepts, ids=ids, categories=categories,\n",
    "                       labels_as_indices=labels_as_indices, pam=pam)\n",
    "    resultsdict['rows_to_remove'] = rows_to_remove\n",
    "    #labels_as_indices = labels_as_indices[~rows_to_remove]                                                   \n",
    "    #categories = categories[categories != 'none'] \n",
    "    print(\"After removing none label\")\n",
    "    print(f\"pam.shape = {pam.shape}\")\n",
    "    print(f\"len(concepts) = {len(concepts)}\")\n",
    "    print(f\"ids.shape = {ids.shape}\")\n",
    "    print(f\"labels_as_indices.shape = {labels_as_indices.shape}\")\n",
    "    print(f\"categories = {categories}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "for c in concepts:\n",
    "    if 'video' in c:\n",
    "        print(f\"video in {c}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "procpath = form_processed_fname(datastem, modelstem, 'pkl', 'raw_concepts')\n",
    "#store_concept_objects(procpath, resultsdict)\n",
    "pickle.dump(resultsdict, open( procpath, \"wb\" ))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#concepts, ids, categories, labels_as_indices, pam = load_concept_objects(procpath)\n",
    "resultsdict = pickle.load(open( procpath, \"rb\" ))\n",
    "\n",
    "#labels = label_indices_to_one_hot(labels_as_indices, num_categories=len(categories))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Briefly analyse discovered concepts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(data.shape[0],data.shape[1],concept_counts.shape) = (2126, 1915, (1915,))\n",
      "np.sort(concept_counts)[-1:-K:-1] = [109  66  48  46  43  28  20  20  17  17  16  15  13  13  12  12  12  11\n",
      "  11   9   9   9   9   9   9   8   8   8   8   8   8   8   8   7   7   7\n",
      "   7   7   7   7   7   7   7   7   7   7   6   6   6   6   6   6   6   6\n",
      "   6   6   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5   4\n",
      "   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   3   3\n",
      "   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3\n",
      "   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3\n",
      "   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3\n",
      "   3   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2\n",
      "   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2\n",
      "   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2\n",
      "   2]\n"
     ]
    }
   ],
   "source": [
    "concept_counts = count_datapoints_in_each_feature(pam)\n",
    "K = 200\n",
    "print(f\"(data.shape[0],data.shape[1],concept_counts.shape) = {(pam.shape[0],pam.shape[1],concept_counts.shape)}\")\n",
    "print(f\"np.sort(concept_counts)[-1:-K:-1] = {np.sort(concept_counts)[-1:-K:-1]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5, 1.0, \"Raw ranked concept counts Vikranth's data\")"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEaCAYAAAD+E0veAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8+yak3AAAACXBIWXMAAAsTAAALEwEAmpwYAAAm2klEQVR4nO3dd5xU1f3/8ddnZyvLsvTem4gooCsgkIixYUGNXbHFghhL8tPkm+Y38ZumJsYYDWpsQUyiMcSoaGKLiigILEUFLCBdpNel7+75/XHvLsO6Ozu7zMyd8n4+HjyYuffOnc/MnZ33nHvuPdecc4iIiABkBV2AiIgkD4WCiIhUUyiIiEg1hYKIiFRTKIiISDWFgoiIVFMoyFeY2R1m9pcYrGeUma2ORU2SWGb2YzN7zL/d3cycmWUHVEtCPkdmttzMTor38yQ7hUKM+R+s3WZWZmZrzWyimTUNui5pvKC/FOvS2PA2s4v9z6nVmJ5tZuvN7Ezn3K+dc9fGrtoG1efMrHcDlp9oZlfFsaTanrNBNaYShUJ8jHHONQUGAYOBHyXyyc0slMjnk5TzPNAcOL7G9NGAA15p7IqTLTil4RQKceScWwu8ihcOAJjZD83sczPbYWaLzOybYfNWmNkx/u2x/q+RI/z715jZ87U9j/9L6SEz+7eZ7QROMLMzzGyemW03s1VmdkfY8lW/fK80s5VmttHMflLHunPM7Gkz+6eZ5ZpZR//2BjNbZma3hC1b4NeyxcwWAcdGen/M7Agze93MNpvZOjP7sT89z8zuM7M1/r/7zCzPnzfKzFab2W3+r9ovzexbNWr4nf9ebjOzd82swJ83zMymm9lWM/vAzEaFPe5tM7vTzGb579kLZtbSn/2O//9WvwV4XC2vJeTvcqnatnPMrIs/b7iZzfbrmW1mw8Med9Aui/Bf/5G2k5mNBn4MXOTX9IE//SozW+rXsMzMxtas1Tm3B3gWuKLGrCuAvznnyi1CK8TMzvPrHhBW4zVmthJ401/mH+a1lLeZ2TtVn2N/3kQzm2BmL/t1zjSzXv68qvf6A/91XRT2uFq3eY3aepvZVP95N5rZ32tbzl/2cv9zsqnm59/MhpjZDP+z8qWZ/dHMcuuq0cxamNlL5v1dbPFvd67ruZOac07/YvgPWA6c5N/uDHwE/CFs/gVAR7xAvgjYCXTw500CbvNvPwJ8DtwQNu//1fGcE4FtwAh/vfnAKOBI//5RwDrgHH/57ni/CB8FCoCBwF7gcH/+HcBf/Hkv++sP+euaA/wUyAV6AkuBU/3H3QVMA1oCXYAFwOo6ai4CvgRu8+stAob6834OvA+0BdoA04Ff+PNGAeX+MjnA6cAuoIU/fwLwNtDJr3k4kOff3+QvnwWc7N9v4z/ubeALYABQCPwT+EuN9ys7wnb/vr+tDwPMf09b+e/FFuByIBu4xL/fqubnJfy9b8h2CntsIbAdOMy/3wE4oo56R/jLFvj3i4HdwKAIdWQD3wKWAL1rzJvkP3/V+q72t2kecB8wv8bndRMwxF/nX4Fnwua7qvVHs81rvK6ngZ9w4O9gZB2vvz9QBnzdr/Fe/zmq/naPAYb59XUHPga+G6HGVsB5QBP/df8DeD7o76NGfYcFXUC6/fP/yMuAHf4H579A8wjLzwfO9m9fA7zo3/4YuLbqjwVYARxdxzomApPqqes+4Pf+7ao/5M5h82cBF/u37wBeBKYC9wPmTx8KrKyx3h8Bf/ZvLwVGh80bR92hcAkwr455nwOnh90/FVju3x6F9+WVHTZ/vf8HnOXPG1jLOn8APFVj2qvAlf7tt4G7wub1B/bhBUvV+xUpFD6t2o41pl8OzKoxbQZwVdjnpb5QiLSdaobCVrwvp4IoPquLgUv929cBH9RTx/eARTXqqZrXM8LzNPeXKQ77vD4WNv904JOw+7WFQq3bvJbnmoT3g6pzXfX4y/2Ug4Oo0N/eJ9Wx/HeBf9VVYy3L
DwK21LcNkvGfdh/FxznOuSK8D3M/oHXVDDO7wszm+83SrXi/TKvmTwW+ZmYd8L6MngVGmFl3vF9y8yM856rwO2Y21Mze8puz24Dx4XX41obd3gWEd4gPw2th3OX8TznQDehYVbtf/4+Bdv78jjXqWBGh3i54X/616VjjsSv8aVU2OefKa6m9Nd6vw9rW2w24oEbtI/F+TVepWXsOX33P6lLX66n5WqrW3SnK9ULk7VTNObcTr/U5HvjS3z3TL8J6J3FgF9Ll/v1Ivg9McM7VdiRQ9Xvn70q7y9+Vth0v+ODg9zKq1xSmrm1e0//gtdRmmdlCM7u6jvUd9Fn137tNYa+hr78LaK3/Gn5NhM+CmTUxsz/5u6O24+1ybG4p2L+nUIgj59xUvF9F9wCYWTe8XQE34e0+aI63i8X85ZfgfdhvBt5xzm3H++MZB7zrnKuM9HQ17v8N79d+F+dcMfBw1fNE6TXgTuC/Zlb1pb8KWOacax72r8g5d7o//0u8L8cqXSOsfxXe7qfarMH7Eg9fz5ooat4I7AF61fF8T9WovdA5d1fYMjVr3++vM5qhhFfV8bw1X0vVur/wb+/E2+VQpX0Uz1XlK3U55151zp2MF3af4H3e6vIUcKLfRzIMbzdOJKcAt5vZefXUcilwNnAS3o+Z7v70hnz+GsU5t9Y5d51zriNwPfCg1X6U0EGfVTNrgrcLqMpDeO9fH+dcM7wfP5Hqvw1v1+FQf/mvV6260S8mIAqF+LsPONnMBuI1UR2wAcDvLBtQY/mpeKEx1b//do370SoCNjvn9pjZELw/1AZxzv0GL1z+a2at8XZd7DCzH5jXoRvyOxurOpSfBX7kd7p1xgu3urwEdDCz75rXsVxkZkP9eU/jffm08Z/3p3h9HPXVWwk8AdxrXod4yMyOM6+T+i/AGDM71Z+eb16ndXhn4GVm1t//gvg5MNk5V4G3vSqpO8QAHgN+YWZ9zHOUmbUC/g30NbNLzTvk8yK8XVMv+Y+bD1xsXod+CXB+fa8zzDqgu5llAZhZOzM728wK8foeyvy663q/lgPv4r3frzvvwIhIFuIdoTTBzM6KsFyR//yb8ALv19G9nGrriPxe18nMLgjbplvw/t5qew8mA2ea2Ui/A/nnHPx9WITX51Lmt7ZuqKfGIrxdXFvNO0DhZ42pPxkoFOLMObcBr1n+U+fcIuB3ePuU1+F1BL9X4yFT8T5g79RxP1rfBn5uZjvwvlSfbWT9v8A7hPENvF99Z+LtL12G9yv6MX86wP/h7RpZhtfSeCrCenfgdfaOwWsNLQZO8Gf/EigFPsTrvJ3rT4vG9/zHzAY2A3cDWc65VXi/Xn+M9yW/Cm93SPjfwFN4Lbu1eLuhbvFr3QX8CnjP3/U0rJbnvRfvPX4N78vkcbz9+pvw3rPb8L4k/wc40zm30X/c/+K1MLbgvX9/i/J1gteZCbDJzOb6r+VWvNbJZrxDTmt+mdX0JF5Lpr5dRwA45z7Aez2PmtlpdSw2Ce9z8AVeH8T70aw7zB3Ak/57fWEDH3ssMNPMyvBayt9xzi2tuZBzbiFwI977/SXe+x++W+x7eD+kduC1tmoexVSzxvvwDgbYiPd6G31Yb9CqOhBFMpqZvY3XsfpY0LWIBEktBRERqaZQEBGRatp9JCIi1dRSEBGRagoFERGpltIjGrZu3dp179496DJERFLKnDlzNjrn2tQ2L6VDobxJa/712jt0admk/oVFRATwRmSua15S7T4ys3PM7FEz+7uZnVLf8nvLKznj/mm8vmhdIsoTEUl7cQ8FM3vCvDHQF9SYPtrMPjWzJWb2QwDn3PPOuevwBvS6qLb1hevTtildWzXhukml/OrlReyviDQ0kIiI1CcRLYWJeOOlVPNHDpwAnIY3DswlZtY/bJHb/fkR5WZnMXn8cC4f1o1Hpy3joj/NYM3W3bGrXEQkw8Q9FJxz7+CNwxJuCLDEObfUObcPeAY42x9I7G7gP865udGsPz8n
xC/OGcADlwzms3VlnHH/NN76ZH1sX4SISIYIqk+hEwePXb/an3Yz3nC755vZ+NoeaGbjzKzUzEo3bNhQPX3MwI68eNMI2jXL51sTZ3P3K59Qrt1JIiINklQdzc65+51zxzjnxjvnHq5jmUeccyXOuZI2bQ4+oqpnm6Y8f+MILhnSlYfe/pxLH53J2m17ElK7iEg6CCoUvuDgC5p05sBFRw5Jfk6IO889kvsuGsSCNds44/5pvPPZhvofKCIigYXCbKCPmfXwL3BxMd7Y5zFzzuBOvHjTSFo1zeXKP8/id699SkWlxnkSEYkkEYekPo13UZnDzGy1mV3jX2v1JrwLp38MPOtf9CKmerdtygs3juT8ozvzwJtLGPvY+6zfrt1JIiJ1SclRUs1sDDCmd+/e1y1evDiqx0yes5rbn/+Ipnk53H/xIIb3jvZ67CIi6cXM5jjnSmqbl1QdzdFyzk1xzo0rLi6uf2Hf+cd05oUbR1JckM3Yx2fyhzcWa3eSiEgNKRkKjXVY+yJevGkk5wzqxO/f+Iwrn5jFxrK9QZclIpI0MioUAArzsrn3woHcde6RzF6+mdP/MI33l24KuiwRkaSQcaEAYGZcPKQrz984gsK8bC599H0mvLWESu1OEpEMl5KhYGZjzOyRbdu2HdJ6Du/QjCk3j+SMozry21c/5VsTZ7N5574YVSkiknpSMhQa09Fcl6Z52dx/8SB+ec4AZny+idP/MI3S5TWHahIRyQwpGQqxZmZcNqwbz317OLnZWVz0yPs8PPVz7U4SkYyjUAgzoFMxL90yklP6t+Ou/3zCtZNK2aLdSSKSQRQKNTTLz+HBsUdzx5j+TFu8gTMfeJe5K7cEXZaISEIoFGphZlw1ogeTxw/HDC58eAaPTVtKKp79LSLSECkZCrE6+qg+A7s05+Wbv8YJ/dryy5c/5vqn5rBt1/64PqeISJBSMhRiefRRfYqb5PDI5cdw+xmH8+Yn6znjgWl8sGpr3J9XRCQIKRkKiWZmXPu1njw7/jicg/Mfns7Ts1YGXZaISMwpFBrg6K4tePmWkQzr2YofPfcRP3thAft1yU8RSSMKhQZq3iSXP191LNeO7MGTM1Zw5ROzdNiqiKQNhUIjZIeyuP3M/txzwUBKl2/h7Anv8enaHUGXJSJyyBQKh+D8YzrzzPXD2L2/gnMffI/XF60LuiQRkUOSkqGQqENSo3F01xZMuWkkvdo2ZdxTpfzxzcU6n0FEUlZKhkIiD0mNRvvifJ69/jjOGtiRe177jJufnsfufRVBlyUi0mDZQReQLvJzQtx30SAO79CMu1/5hGUbd/LoFSV0bF4QdGkiIlFLyZZCsjIzxh/fi8evLGHFpl2c9cd3NQy3iKQUhUIcfKNfO56/cThN87K55NH3+ftsnegmIqlBoRAnvdsW8cKN3oluP/jnR9zx4kLKdaKbiCQ5hUIcFTfJ4c9XHcs1I3swcfpyrvrzbLbu0oluIpK8FApxlh3K4n/P7M9vzj+KWcs2c/aE91i8Tie6iUhyUigkyIUlXXh63DB27q3gmw9O5w2d6CYiSSglQyGZTl5riGO6tWDKzSPo0bqQ654q5d7XP2PRmu06p0FEkoal8tm3JSUlrrS0NOgyGmzP/gr+Z/KHvPjBmuppnZoX0LNNIb3aNKVXm0J6tmlKrzZNadcsDzMLsFoRSTdmNsc5V1LrPIVCMJxzLF5fxuJ1ZXy+oYylG8r4fMNOlm4oY2dYy6EwN0QPPyx6tm7KUZ2LOa5XK/JzQgFWLyKpLFIo6IzmgJgZfdsV0bdd0UHTnXOs277XDwkvKD7fUEbp8i28MN9rWRTmhhjVry2n9G/HCf3a0iw/J4iXICJpSKGQZMyM9sX5tC/OZ3jv1gfN27WvnJnLNvPawnW8vmgdL3/4JTkhY3iv1pxyRDtO7t+OtkX5AVUuIulAu49SVEWlY97KLby2aB2vLlzLik27MIPBXZpz6hHtufjYrhQ3UQtCRL5KfQpp
zjnHp+t28NpCLyAWrtnOSYe347Era93mIpLh1KeQ5syMfu2b0a99M245sQ+/ffUTHnz7c1Zv2UXnFk2CLk9EUkhKnqcgkV06tBsGPDNrVdCliEiKUSikoU7NCzjhsLY8M3sV+zUIn4g0QEqGQqqe0ZxIlw3rxsayvby2UMNpiEj0UjIUku1ynMno633b0Kl5AX+duSLoUkQkhaRkKEj9QlnGpUO7Mv3zTXy+oSzockQkRSgU0tiFJV3IzjL+NlNXfhOR6CgU0libojxOHdCeyXNWs2e/RmIVkfopFNLcZUO7sW33fl768MugSxGRFKBQSHPDerakV5tCdTiLSFQUCmnOzBg7tBvzVm5l4RodwisikSkUMsB5R3cmLzuLv6rDWUTqoVDIAMVNchgzsCMvzPuCsr3lQZcjIklMoZAhLhvWjZ37Knh+3hdBlyIiSUyhkCEGdi7miI7N+Mv7K0jl4dJFJL4UChnCzLhsWDc+WbuDuSu3Bl2OiCSplAwFDYjXOGcN7EjTvGwdnioidUrJUNCAeI1TmJfNuUd34qUPv2TLzn1BlyMiSSglQ0Ea79KhXdlXXsnkOauDLkVEkpBCIcP0a9+MId1bMun95VRUqsNZRA6mUMhAV4/szqrNu3l9kS7AIyIHUyhkoJP7t6dziwKeeG9Z0KWISJJRKGSgUJZx1fDuzFq2mQVf6AguETlAoZChLjy2C4W5IZ54V60FETlAoZChmuXncOGxXZjy4RrWb98TdDkikiQUChnsquHdKa90TJqhk9lExKNQyGDdWhVy8uHt+OvMFbpcp4gACoWMd/XIHmzZtZ9/afRUEUGhkPGG9mjJER2b8cS7yzR6qogoFDKdmXH1iB4sXl/GtMUbgy5HRAKmUBDOHNiBNkV5OplNRBQKAnnZIS4f1o23P93AkvU7gi5HRAKkUBAAxg7tSm52Fk+8tzzoUkQkQNlBFyDJoVXTPL45qBPPzV3N+cd0Jj87dND8wrwQ3VoVBlSdiCSKQkGqXT2yB38vXcW5D06vdf7Pzz6CK47rntiiRCShUjIUzGwMMKZ3795Bl5JWDmtfxD9vGM6GHXu/Mu+vM1fwq5c/ZnivVvRuWxRAdSKSCJbKx6aXlJS40tLSoMvICOt37OHU379DpxYFPHfDCHKz1R0lkqrMbI5zrqS2efrLlqi0LcrnznOPZMEX23ngzcVBlyMicaJQkKiNHtCB84/pzIS3ljBnxZagyxGROFAoSIP8bEx/OhQXcOuz89m5tzzockQkxhQK0iBF+Tnce+FAVm7exS9f/jjockQkxlLy6CMJ1tCerRj39Z78aepSlm/cSXbIquc1y8/hOyf1oW87HaEkkooUCtIot57cl01l+1i6oYzwvUgfrt7G64vW8Z2T+nD913uSHVJjVCSVKBSkUfKyQ9xzwcCvTN9YtpefvbCQ3776Ka8sWMtvLziKfu2bBVChiDSGfsZJTLVumseEsUfz4NijWbN1N2MeeFfXahBJIQoFiYvTj+zA67cez/F92/LzlxZx89PzdLSSSApQKEjctCzM5ZHLj+EHo/vx74++5OwJ72lobpEkp1CQuMrKMm4Y1Yu/XDuUrbv2cfYf32PFpp1BlyUidVAoSEIM79Waf317BJUO7nnts6DLEZE6KBQkYbq0bMI1I3sw5YM1LPhiW9DliEgtFAqSUOOO70mLJjnc/conQZciIrWIKhTMbEQ000Tq0yw/hxtP6M20xRt5d/HGoMsRkRqibSk8EOU0kXpdNqwbnZoXcPcrn1BZqfMXRJJJxDOazew4YDjQxsxuDZvVDAjV/iiRyPJzQtx6cl9u+8cH/PrfH9OlZRMA+rRtyvDerQOuTiSz1TfMRS7Q1F8ufISz7cD58SpK0t85gzvx9KyVPPbusoOmP3zZMYwe0D6gqkQkqstxmlk359yKBNTTILocZ2qrqHRs270fgPLKSsZNmsNn63Ywefxw+nfUeEki8RKLy3HmmdkjZvaamb1Z9S+G
NUoGCmUZLQtzaVmYS9uifB65/Bia5edw3aRSNpbtDbo8kYwUbUvhA+BhYA5QUTXdOTcnfqXVTy2F9PPR6m1c8KfptG6aR6fmBQfNyw4Z3zmxL0N6tAyoOpH0EIuWQrlz7iHn3Czn3JyqfzGsUQSAIzsX8+DYo+nqdz6H+2xdGd95Zh5lGlhPJG6ivZ7CFDP7NvAvoLpd75zbHJeqJKN9o187vtGv3Vemz1u5hXMfms49r37KHWcdEUBlIukv2lC40v//+2HTHNAztuWI1G1w1xZcMawbT85YzjmDOzGoS/OgSxJJO1GFgnOuR7wLEYnG9049jFcXruO6SaX0aFVIdsi446wjdE1okRiJKhTM7IrapjvnJsW2HJHIivJzuP+SwTzw5mLKKxyzlm3mublf8MPT+gVdmkhaiHb30bFht/OBE4G5QMxCwcx6Aj8Bip1zOjFO6jSkR0ueumYoAOc9NJ2ZyzYFXJFI+ojq6CPn3M1h/64DjsY70zkiM3vCzNab2YIa00eb2admtsTMfug/x1Ln3DWNeRGSuYb0aMlHq7exa5+OSBKJhcYOnb0TiKafYSIwOnyCmYWACcBpQH/gEjPr38g6JMMN7dGS8krH3BVbgy5FJC1E26cwBe9oI/AGwjsceLa+xznn3jGz7jUmDwGWOOeW+ut+BjgbWBRlLeOAcQBdu3aN5iGSxkq6tySUZcxctomRfTSYnsihirZP4Z6w2+XACufc6kY+ZydgVdj91cBQM2sF/AoYbGY/cs7dWduDnXOPAI+Ad0ZzI2uQNNE0L5sBHZsxc6lOmRGJhWgPSZ1qZu040OG8ONaFOOc2AeNjvV5Jf0N6tOTJ6SvYs7+C/ByN6C5yKKK98tqFwCzgAuBCYKaZNfYIoS+ALmH3O/vTRBplaI9W7KuoZP6qrUGXIpLyot199BPgWOfcegAzawO8AUxuxHPOBvqYWQ+8MLgYuLQR6xEB4NgeLTGDP7yxmDc6rou4bCjLuGxYt+oL+4jIwaINhayqQPBtIopWhpk9DYwCWpvZauBnzrnHzewm4FW8TusnnHMLG1K0mY0BxvTu3bshD5M0VVyQwyn92/Hu4o18uHprxGV37a9gx95yfv3NIxNTnEiKiXbo7N8CRwFP+5MuAj50zv0gjrXVS0NnS0Pd+Le5zFy6mZk/PpFQlgVdjkggGj10tpn1NrMRzrnvA3/CC4ajgBn4RwCJpJLRR7RnY9le5q7cEnQpIkmpvl1A9+Fdjxnn3HPOuVudc7fiDaF9X3xLE4m9UYe1ITeUxSsL1gZdikhSqq9PoZ1z7qOaE51zH9VyUppI0ivKz2Fkn9a8smAtY4dGd/Jj5xZNyM1u7Mn/IqmlvlBoHmFeQYR5Iklr9ID2vPnJer7xu6lRLX/OoI7cd/HgOFclkhzqC4VSM7vOOfdo+EQzuxbves2B0NFHcii+ObgTzfKz2VteWe+yT89ayYylGoVVMkd9ofBd4F9mNpYDIVAC5ALfjGNdETnnpgBTSkpKrguqBkldOaEsRg/oENWym3fu4/+mLOLLbbvpUKzGsaS/iKHgnFsHDDezE4AB/uSXnXNvxr0ykSRQdcnP+Su30uFIhYKkv2jHPnoLeCvOtYgknf4dm5EbymL+qq2cdmR0rQuRVBbtGc0iGSkvO8ThHZsxb+VW9lfU3wcRLstMJ8hJylEoiNRjcJfmTJy+nD4/+U+DHpcbyuKfNwznyM7FcapMJPZSMhR09JEk0vXH96RNUR7RDAlTZff+Cia89TnzV29VKEhKSclQ0NFHkkgdigu48YSG/QCprHQ8Nm0ZKzftjFNVIvGh0zRF4iAry+jasgkrN+8KuhSRBlEoiMSJFwq7gy5DpEEUCiJx0qVlE1Zu2tmgvgiRoCkUROKka8sm7NxXwead+4IuRSRqKdnRLJIKurXyLvk58u63qOt0hSM6FvPs+OMSWJVIZCkZCjokVVLBiN6tueXEPuzaW17r/A9Xb2PW8s3s2V9Bfk4o
wdWJ1C4lQ0GHpEoqyM8JcevJfeuc//fZK5m1fDMby/bSuUWTBFYmUjf1KYgEpE1RHgAbduwNuBKRAxQKIgFp0zQfUChIclEoiASkdVEuABvKFAqSPBQKIgFpVajdR5J8FAoiAcnNzqJFkxyFgiSVlDz6SCRdtCnK490lG7n9+Y8iLpedlcX1x/fUJUEl7lIyFHSegqSLUYe15Z9zVvOfj9bWuYzDu1Z091ZNuGpEj8QVJxnJUnlclpKSEldaWhp0GSJxtb+ikj4/+Q+3ntyXW07sE3Q5kgbMbI5zrqS2eepTEElyOaEsCnJC7NizP+hSJAMoFERSQLOCbLbvrn24DJFYUiiIpICi/Bx27FVLQeJPoSCSAory1VKQxFAoiKSAZvk56lOQhFAoiKSAovxstu9RS0HiT6EgkgKK1FKQBEnJk9dEMk2zgmy27d7PC/O/iLjc4R2a0bddUYKqknSUkqGgM5ol03Ru0YT9FY7vPDM/4nJ92jbl9VuPT0xRkpZSMhR05TXJNJcN7crXeremIsIIBPe+9hmzl29OYFWSjlIyFEQyjZnRvXVhxGXaFOWxZ39FgiqSdKWOZpE0kZ8TYs/+yqDLkBSnUBBJEwU5IfZVVFJRmbqDXErwFAoiaaIg1/tz1i4kORQKBZE0kZ8TAmC3QkEOgUJBJE1UhYJaCnIoFAoiaaJAoSAxoFAQSRMHWgo6AkkaT6EgkiYK1KcgMaBQEEkTVUcf7d6nUJDG0xnNImkiL9trKazYvIulG8piuu72xfk0ydXXRSbQVhZJE8UFOQD87/MLYr7uY7u34B/jh8d8vZJ8UjIUNEqqyFd1admESVcPYcuufTFd76QZK9hYFtt1SvJKyVDQKKkitft63zYxX+fUTzewbvuemK9XkpM6mkUkolCWaTylDKJQEJGIskNGuUIhYygURCSi7KwstRQyiEJBRCIKZRnlFTpLOlMoFEQkomz1KWQUhYKIRBRSn0JGUSiISERqKWQWhYKIRBTKyqK80uGcgiETKBREJKLsLANQayFDKBREJKLskBcK6lfIDAoFEYlILYXMolAQkYhCWd7XhFoKmUGhICIRqaWQWRQKIhJRKKuqT0FnNWcChYKIRFTVUiivUEshE6Tk9RREJHGqWgp/eGMxRfnBfGW0aprH+ON7YmaBPH8mSclQ0JXXRBKnT7siWhbm8tKHawJ5/v2Vjn3llZx5VAe6tGwSSA2ZxFL5LMWSkhJXWloadBkiEkfPzV3Nrc9+wNTvj6Jbq8Kgy0kLZjbHOVdS2zz1KYhIUqvaY6SDnxJDoSAiSc3wUiGV92qkEoWCiCS1qpaCIiExFAoikhLUUEgMhYKIJLWs6sNQlQqJoFAQkaRWvftImZAQCgURSWpVHc06+igxFAoiktQOdDQrFRJBoSAiSa26R0GZkBAKBRFJaupTSCyFgogkOf/kNe0+SgiFgogktSy1FBJKoSAiSa1quGyFQmIoFEQkqR04dU2pkAgKBRFJaupoTiyFgogkNQ2Il1gKBRFJaho6O7EUCiKS1NRSSCyFgogkNR19lFgKBRFJageGuVAqJIJCQUSSmnYfJZZCQUSS2oGO5oALyRAKBRFJagfOU1AqJIJCQUSSmnYfJZZCQUSSmnYfJZZCQUSSmnYfJZZCQUSS2oEB8SQRsoMuoIqZFQIPAvuAt51zfw24JBFJAjp5LbHi2lIwsyfMbL2ZLagxfbSZfWpmS8zsh/7kc4HJzrnrgLPiWZeIpI4DHc1KhUSId0thIvBHYFLVBDMLAROAk4HVwGwzexHoDHzkL1YR57pEJEVUXXntVy9/THHBkmCLSXFXDu/O6Ud2iLhMXEPBOfeOmXWvMXkIsMQ5txTAzJ4BzsYLiM7AfCK0YMxsHDDOv7u3ZiskgYqBbQGsI9rH1LdcpPl1zattes1prYGNUdQXD7HYJo1ZT7JvEwhuu8Rsm6xIvr+VlNsmzx642afOhZxzcf0HdAcWhN0/H3gs7P7leK2J
QuDPwEPA2CjXXRrv+iM89yNBrCPax9S3XKT5dc2rbXrNaam+TRqznmTfJkFul6C2SSK2S6puk/pec9J0NDvndgLfCrqOBpgS0DqifUx9y0WaX9e82qbH4n2IlVjV0tD1aJvULaht0pDHNHa7pOo2gQj1mJ8acePvPnrJOTfAv38ccIdz7lT//o8AnHN3NmLdpc65khiWK4dI2yQ5abskn2TdJkGcpzAb6GNmPcwsF7gYeLGR63okdmVJjGibJCdtl+STlNskri0FM3saGIXXobIO+Jlz7nEzOx24DwgBTzjnfhW3IkREJGpx330kIiKpQ8NciIhINYWCiIhUS6tQMLNCM3vSzB41s7FB1yNgZj3N7HEzmxx0LeIxs3P8v5G/m9kpQdcjHjM73MweNrPJZnZDUHUkfSho/KTk05Bt4pxb6py7JphKM0cDt8nz/t/IeOCiIOrNFA3cLh8758YDFwIjgqgXUiAU8MZPGh0+IWz8pNOA/sAlZtYfb5iMVf5iGj8pfiYS/TaRxJhIw7fJ7f58iZ+JNGC7mNlZwMvAvxNb5gFJHwrOuXeAzTUmV4+f5JzbB9QcPwlS4LWlqgZuE0mAhmwT89wN/Mc5NzfRtWaShv6tOOdedM6dBgS2+ztVvzg7caBFAF4YdAKeA84zs4dIvtPK012t28TMWpnZw8DgqrPXJWHq+ju5GTgJON/MxgdRWIar629llJndb2Z/IsCWQtKMfRQLKTh+Utpzzm3C23ctScI5dz9wf9B1yMGcc28DbwdcRsq2FL4AuoTd7+xPk+BomyQfbZPklNTbJVVDIZbjJ0lsaJskH22T5JTU2yXpQ8EfP2kGcJiZrTaza5xz5cBNwKvAx8CzzrmFQdaZSbRNko+2SXJKxe2isY9ERKRa0rcUREQkcRQKIiJSTaEgIiLVFAoiIlJNoSAiItUUCiIiUk2hIBIlM6sws/lmtsDMpphZ80NYV1kMSxOJGYWCSPR2O+cGOecG4I18eWPQBYnEmkJBpHFm4I12iZkNMbMZZjbPzKab2WH+9KvM7Dkze8XMFpvZb2quxMxa+489I8H1i9QqrUZJFUkE/yIpJwKP+5M+Ab7mnCs3s5OAXwPn+fMGAYOBvcCnZvaAc26Vv552eGPe3O6cez2BL0GkTgoFkegVmNl8vBbCx0DVF3kx8KSZ9QEckBP2mP8657YBmNkioBveWPo5wH+BG51zUxNTvkj9tPtIJHq7nXOD8L7YjQN9Cr8A3vL7GsYA+WGP2Rt2u4IDP8TKgTnAqfEsWKShFAoiDeSc2wXcAtxmZtl4LYWq8fCvinY1wNVAPzP7QcyLFGkkhYJIIzjn5gEfApcAvwHuNLN5NGCXrHOuwn/8N8zs23EpVKSBNHS2iIhUU0tBRESqKRRERKSaQkFERKopFEREpJpCQUREqikURESkmkJBRESqKRRERKTa/weSKkpFrI2MoQAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Rank-versus-frequency plot of the concept counts (helper defined elsewhere in the\n",
    "# notebook; behaviour inferred from its name -- TODO confirm), followed by a title\n",
    "# set through pyplot.\n",
    "plot_rank_versus_freq(concept_counts)\n",
    "plt.title(\"Raw ranked concept counts Survey data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Top 40 concepts:\n",
      "\tRank\tID\tCount\tConcept\n",
      "\t0\t82\t109 : the batter did not swing\n",
      "\t1\t170\t66 : the batter hit the ball\n",
      "\t2\t19\t48 : the batter didn't swing\n",
      "\t3\t160\t46 : it hit the ground\n",
      "\t4\t86\t43 : the batter swung and missed\n",
      "\t5\t151\t28 : the pitcher threw the ball\n",
      "\t6\t166\t20 : the batter did not swing at the ball\n",
      "\t7\t328\t20 : the batter did not swing at the pitch\n",
      "\t8\t39\t17 : the batter made contact\n",
      "\t9\t60\t17 : it was a strike\n",
      "\t10\t150\t16 : the umpire called it a ball\n",
      "\t11\t44\t15 : the batter did not swing at it\n",
      "\t12\t220\t13 : it was outside the strike zone\n",
      "\t13\t350\t13 : the batter hit the ball in the air\n",
      "\t14\t264\t12 : the hitter hit the ball\n",
      "\t15\t124\t12 : it was in the strike zone\n",
      "\t16\t421\t12 : the ball was in the strike zone\n",
      "\t17\t261\t11 : the batter hit the ball into foul territory\n",
      "\t18\t165\t11 : the ball landed outside of the strike zone\n",
      "\t19\t133\t9 : the umpire called it a strike\n",
      "\t20\t1276\t9 : the batter hit the\n",
      "\t21\t69\t9 : the batter hit it into foul territory\n",
      "\t22\t1520\t9 : the ball was not thrown through the strike zone\n",
      "\t23\t222\t9 : the batter made contact with the ball\n",
      "\t24\t217\t9 : the batter hit it\n",
      "\t25\t55\t8 : it touched the ground\n",
      "\t26\t114\t8 : it was called a ball\n",
      "\t27\t252\t8 : it was a ball\n",
      "\t28\t1521\t8 : additionally the batter did not swing at the pitch\n",
      "\t29\t45\t8 : the batter hit the ball on the ground\n",
      "\t30\t339\t8 : who caught the ball in the air\n",
      "\t31\t80\t8 : it was caught in the air by the center fielder\n",
      "\t32\t403\t8 : the batter swung and missed the ball\n",
      "\t33\t653\t7 : the batter hit the ball on the ground to the third baseman\n",
      "\t34\t90\t7 : the batter swung at the pitch and missed for a strike\n",
      "\t35\t83\t7 : the umpire called the pitch a ball\n",
      "\t36\t257\t7 : it was below the strike zone\n",
      "\t37\t254\t7 : the right fielder caught the ball\n",
      "\t38\t1125\t7 : it was caught\n",
      "\t39\t250\t7 : the batter didn't swing at it\n"
     ]
    }
   ],
   "source": [
    "# Print a Rank / ID / Count / Concept table for the K=40 most frequent concepts.\n",
    "display_most_frequent_concepts(concepts, pam, K=40)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Miscellaneous"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the batter: with rawid 2 is not a concept\n",
      "a line drive: with rawid 168 is not a concept\n",
      "that ball hit into the air: with rawid 207 is not a concept\n",
      "line drive: with rawid 272 is not a concept\n",
      "the batter was: with rawid 327 is not a concept\n",
      "it did was not a fair ball: with rawid 490 is not a concept\n",
      "batter swung and pitch and missed the ball which is a: with rawid 718 is not a concept\n",
      "batter swung at pitch and missed ball which is a strike: with rawid 719 is not a concept\n",
      "a base hit: with rawid 772 is not a concept\n",
      "batter swung and missed the ball: with rawid 968 is not a concept\n",
      "fielder's: with rawid 1055 is not a concept\n",
      "therefore the batter is: with rawid 1346 is not a concept\n",
      "hit the ball to left center field: with rawid 1446 is not a concept\n",
      "batter swung and: with rawid 1470 is not a concept\n",
      "running was thrown out at first: with rawid 1518 is not a concept\n",
      "s pop up fly: with rawid 1904 is not a concept\n"
     ]
    }
   ],
   "source": [
    "# are concepts actually concepts\n",
    "not_concepts = []\n",
    "for rawid, concept in enumerate(concepts):\n",
    "    if not is_concept(nlp(concept), incmatcher, excmatcher):\n",
    "        print(f\"{concept}: with rawid {rawid} is not a concept\")\n",
    "        not_concepts.append(rawid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot check of is_concept on a single hand-written sentence (recorded result: True).\n",
    "is_concept(nlp(\"it is a foul ball\"), incmatcher, excmatcher)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "## tests for inputs that break the nlp model\n",
    "#texts = []\n",
    "#texts.append(\"there was not a hit but rather a ground ball to the first baseman.\")\n",
    "#texts.append(\"The first baseman fielded the grounder and threw the ball to the covering pitcher for an out at first base.\")\n",
    "#texts.append(\"there was not a hit but rather a ground ball to the first baseman. The first baseman fielded the grounder.\")\n",
    "#texts.append(\"The batter took a �check swing,�.\")\n",
    "#texts.append(\"The batter completed a �full swing.� The first base umpire declared that the batter did not take a full swing.\")\n",
    "#texts.append(\"the ball was not thrown through the strike zone. The batter took a �check swing,� and the umpire behind the plate appealed to the first base umpire, who had a better view, to determine whether the batter completed a �full swing.� The first base umpire declared that the batter did not take a full swing, and therefore the pitch was called a ball, rather than a strike.\")\n",
    "##texts.append(\"there was not a hit but rather a ground ball to the first baseman.  The first baseman fielded the grounder.\")\n",
    "##text = \"The cat sat on the mat.\"\n",
    "##texts.append(\"there was not a hit but rather a ground ball to the first baseman.  The first baseman fielded the grounder and threw the ball to the covering pitcher for an out at first base.\")\n",
    "#for text in texts:\n",
    "#    print(f\"text = {text}\")\n",
    "#    doc = nlp(text)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
