{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import csv\n",
    "import numpy as np\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "# nlp\n",
    "import spacy\n",
    "import benepar\n",
    "import nltk\n",
    "# benepar_en didn't work\n",
    "fresh = False\n",
    "if fresh:\n",
    "    benepar.download('benepar_en3')\n",
    "from benepar import BeneparComponent\n",
    "#from benepar.spacy_plugin import BeneparComponent\n",
    "from spacy.matcher import Matcher\n",
    "\n",
    "## machine learning\n",
    "from sklearn.metrics import pairwise_distances\n",
    "from sklearn.metrics.pairwise import pairwise_kernels\n",
    "from sklearn.decomposition import PCA\n",
    "from sentence_transformers import SentenceTransformer\n",
    "# clustering and dendrogram\n",
    "from scipy.cluster.hierarchy import dendrogram\n",
    "from scipy.cluster.hierarchy import linkage\n",
    "from scipy.cluster.hierarchy import cophenet\n",
    "from scipy.spatial.distance import pdist\n",
    "from sklearn.cluster import AgglomerativeClustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "## homegrown library functions\n",
    "from imp import reload\n",
    "import concept_processing\n",
    "\n",
    "import concept_processing.io\n",
    "reload(concept_processing.io)\n",
    "# for loading original data\n",
    "from concept_processing.io import get_datapoint_iterator\n",
    "from concept_processing.io import load_concept_examples\n",
    "from concept_processing.io import capture_all_concepts\n",
    "from concept_processing.io import get_datapoint_iterator\n",
    "\n",
    "# nlp functionality for extracting concepts from text\n",
    "import concept_processing.extraction\n",
    "reload(concept_processing.extraction)\n",
    "from concept_processing.extraction import add_inc_and_exc_matchers\n",
    "from concept_processing.extraction import is_concept\n",
    "from concept_processing.extraction import iterate_concepts\n",
    "from concept_processing.extraction import iterate_concepts_in_span\n",
    "from concept_processing.extraction import extract_concepts\n",
    "\n",
    "# dealing with the concepts as strings\n",
    "import concept_processing.concepts\n",
    "reload(concept_processing.concepts)\n",
    "from concept_processing.concepts import concept_dict_to_list\n",
    "from concept_processing.concepts import display_most_frequent_concepts\n",
    "from concept_processing.concepts import build_embedding_matrix\n",
    "\n",
    "# manipulating data as a presence/absence matrix\n",
    "import concept_processing.pam\n",
    "reload(concept_processing.pam)\n",
    "from concept_processing.pam import convert_raw_bof_to_pam\n",
    "from concept_processing.pam import prune_and_reindex_concepts\n",
    "from concept_processing.pam import count_features_in_each_datapoint\n",
    "from concept_processing.pam import count_datapoints_in_each_feature\n",
    "\n",
    "import concept_processing.io\n",
    "reload(concept_processing.io)\n",
    "from concept_processing.io import store_concept_objects\n",
    "from concept_processing.io import load_concept_objects\n",
    "from concept_processing.io import form_data_dirname\n",
    "from concept_processing.io import form_processed_fname\n",
    "from concept_processing.io import get_file_info\n",
    "\n",
    "\n",
    "import concept_processing.labels\n",
    "reload(concept_processing.labels)\n",
    "from concept_processing.labels import create_labels_as_indices\n",
    "from concept_processing.labels import label_indices_to_one_hot\n",
    "\n",
    "\n",
    "# Dendrogram/hierarchical clustering\n",
    "import concept_processing.plot_support\n",
    "reload(concept_processing.plot_support)\n",
    "from concept_processing.plot_support import plot_rank_versus_freq\n",
    "from concept_processing.plot_support import plot_dendrogram\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## File data from Surveys\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# load nlp model (for constituency parser)\n",
    "# and create inclusion and exclusion matchers adding\n",
    "# these to the nlp pipeline.\n",
    "\n",
    "# python -m spacy download en_core_web_sm\n",
    "\n",
    "datastem = '20210504'\n",
    "modelstem = 'lg'\n",
    "if modelstem == 'sm':\n",
    "    nlp = spacy.load(\"en_core_web_sm\")\n",
    "elif modelstem == 'md':\n",
    "    nlp = spacy.load(\"en_core_web_md\")\n",
    "elif modelstem == 'lg':\n",
    "    nlp = spacy.load(\"en_core_web_lg\")\n",
    "else:\n",
    "    raise ValueError(f\"Unrecognised modelstem: {modelstem}\")\n",
    "incmatcher, excmatcher = add_inc_and_exc_matchers(nlp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# path location of files\n",
    "\n",
    "datapath = 'surveys'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0_imfn3fsnqb.csv, (id_, label, text) = ('ZK1QJKN7QT2Q', 'ball', 'the ball was not thrown through the strike zone. Additionally, the batter did not swing at the pitch.')\n",
      "(id_, label, text) = ('WKIAO1SNZ7O9', 'ball', 'the ball was not thrown through the strike zone. The batter took a ï¿½check swing,ï¿½ and the umpire behind the plate appealed to the first base umpire, who had a better view, to determine whether the batter completed a ï¿½full swing.ï¿½ The first base umpire declared that the batter did not take a full swing, and therefore the pitch was called a ball, rather than a strike.')\n",
      "(id_, label, text) = ('75XVC2M6XM6Z', 'ball', 'the ball was not thrown through the strike zone. Additionally, the batter did not swing at the pitch.')\n",
      "(id_, label, text) = ('BCNTSLUNUBJQ', 'ball', 'the ball was not thrown through the strike zone. Additionally, the batter did not swing at the pitch.')\n",
      "(id_, label, text) = ('DVTVF8YNKA86', 'ball', 'the ball was not thrown through the strike zone. Additionally, the batter did not swing at the pitch.')\n",
      "(id_, label, text) = ('FJXFI7C0SQGI', 'ball', 'the ball was not thrown through the strike zone. Additionally, the batter did not swing at the pitch.')\n",
      "(id_, label, text) = ('GAXSXT1YWAAR', 'ball', 'the ball was not thrown through the strike zone. Additionally, the batter did not swing at the pitch.')\n",
      "(id_, label, text) = ('U7I8HGLX1WWG', 'ball', 'the ball was not thrown through the strike zone. Additionally, the batter did not swing at the pitch.')\n",
      "(id_, label, text) = ('ERZUTNWJUF2Q', 'ball', 'the ball was not thrown through the strike zone. Additionally, the batter did not swing at the pitch.')\n",
      "(id_, label, text) = ('09IIL1GTV6QD', 'ball', 'the ball appeared to be thrown in the strike zone, but the umpire called it a ball (and the batter did not swing).')\n",
      "100_5397borrxc.csv, (id_, label, text) = ('8WY2QMY5FIK1', 'strike', 'the batter swung and missed.')\n",
      "(id_, label, text) = ('1TWEE7PPIH4B', 'strike', 'it was in the strike zone.')\n",
      "Fails with line: 1TWEE7PPIH4B,strike,it was in the strike zone.\n"
     ]
    }
   ],
   "source": [
    "# show some of the data\n",
    "show_example_data = True\n",
    "if show_example_data:\n",
    "    for i, (id_, label, text) in enumerate(get_datapoint_iterator(datapath)):\n",
    "            print(f\"(id_, label, text) = {(id_, label, text)}\")\n",
    "            if i > 10:\n",
    "                break\n",
    "else:\n",
    "    print(\"No output requested.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0_imfn3fsnqb.csv, 100_5397borrxc.csv, 101_9l21auewdi.csv, 102_r9vbr9i6qn.csv, 103_v129kmq3c5.csv, 104_jos9y36cxr.csv, 105_3acmrslh5t.csv, 106_oa27xk4eqt.csv, 107_jfii3uf345.csv, 108_qkxdz8djgt.csv, 109_ehl2s3ah2w.csv, 10_ten3l2nuwn.csv, 110_zq7vq985bq.csv, 111_4dftcjzv4j.csv, 112_7g35kmtkhr.csv, 113_s8v38lnghb.csv, 114_fkhid5yyst.csv, 115_iocn2dpj2o.csv, 116_wf72qb1ctj.csv, 117_mgq0ymf5hc.csv, 118_66k0pevslq.csv, 119_hdkgwx4tgr.csv, 11_0na6xsa4cd.csv, 11_7vk8oa1o24.csv, 120_tfdaxlk9n2.csv, 121_aobrexg5sv.csv, 122_hmocfdllhz.csv, 123_g66af1qsqo.csv, 124_4p9vzltfyn.csv, 125_onkofvwzgk.csv, 126_s6vu9wsf2o.csv, 127_q2ifamcjop.csv, 128_y3nqk9k0x4.csv, 129_h8c8ny4x7u.csv, 12_i0qb55ygnr.csv, 130_7a6m1my07s.csv, 131_bivn5or23h.csv, 132_s5i7ngx7bu.csv, 133_c0kk72b3fm.csv, 134_behtf6grhp.csv, 135_7g73f18y8h.csv, 136_dd735psc8g.csv, 137_eusnnkienm.csv, 138_yx53ai19ar.csv, 139_1qp7vpj70o.csv, 13_xb7ph2k0m6.csv, 13_y1i6zh8y9w.csv, 140_rpfb54del9.csv, 141_tvldgnl6pt.csv, 142_d8jaik03id.csv, 143_w11kkydg7n.csv, 144_6g4qagf0aw.csv, 145_1p84lqxs5t.csv, 146_umxfdzx3sg.csv, 147_qz39am2soc.csv, 148_nekma0liim.csv, 149_tvntt1n3zf.csv, 14_apv4u3qebs.csv, 14_mx46s0fx52.csv, 150_hvfxdsp6uq.csv, 151_8k3mbw3phz.csv, 152_uhh1hem3ei.csv, 153_mwu7z89cr2.csv, 154_i7n85wzpa3.csv, 155_97dannvd9q.csv, 156_yk739g3286.csv, 157_7onih683qj.csv, 158_lhrhfgkazf.csv, 159_wmf9y5lb3d.csv, 15_j7tl3bmirm.csv, 15_jzm9dvtw73.csv, 160_9xq61i8z5q.csv, 161_qu7m9mil3a.csv, 162_gzcq0eli5q.csv, 163_g4ownm07q9.csv, 164_qcoq7yni4y.csv, 165_vcamdtt7pl.csv, 166_5lv4sjg1qv.csv, 167_bk26q01ex6.csv, 168_39vz7oesxk.csv, 169_2y8d92hhek.csv, 16_j1uppkkwpc.csv, 16_omnxwedc88.csv, 170_szpuak9x3b.csv, 171_evt2lbu5wz.csv, 172_gg34wiyfxn.csv, 173_op7k51t6eh.csv, 174_vw5a8s9vzj.csv, 175_954mvwcwet.csv, 176_cl4orgir7l.csv, 178_7rflrddsnn.csv, 179_enmmbxkd9b.csv, 17_qvtzqld86i.csv, 180_jm6cn0n08c.csv, 181_wk573tzjde.csv, 182_icy94tmqbq.csv, 183_ex67vzlp8e.csv, 184_hzlbex83q3.csv, 185_9bjzxvbwca.csv, 186_77g5px55z1.csv, 
187_kp3lfo9rti.csv, 188_5sjzeb4w03.csv, 189_4mh7pjb6hj.csv, 18_o65lxoj85y.csv, 18_omkydub7qy.csv, 191_yxm7fc16uc.csv, 192_isa0drpmu4.csv, 193_kjtylxs5x6.csv, 194_tnwwh3zbnj.csv, 195_nwk7f5mq3h.csv, 196_h8ebmn0b4i.csv, 197_ni25hikvxf.csv, 198_shm86f7xzz.csv, 199_hzu91su6jo.csv, 19_sevtqojtws.csv, 1_ofoaso8yxd.csv, 1_z949c1i541.csv, 20_1zahygarmq.csv, 20_z3zjd5sgm4.csv, 21_x8u5ofrlzo.csv, 21_x90g7ie9t7.csv, 22_udsmb087i5.csv, 23_lce4ue1yit.csv, 24_1y633kulxy.csv, 24_pa58yc2bsn.csv, 25_ceghr7ntgk.csv, 25_zdmhvwm1pk.csv, 26_2k5thw2ykv.csv, 26_kcs08kmlzv.csv, 27_adgb2e3ry7.csv, 27_r6anjd4ie1.csv, 28_7svph4yobz.csv, 29_juqgc9mrcx.csv, 2_9actgs9mi7.csv, 2_wrrttduqx0.csv, 30_rb578trmsj.csv, 31_jhmyyhmjh5.csv, 32_jitoqenvci.csv, 33_zybtjc12ef.csv, 34_egy98kczu6.csv, 35_g0ldvn0rjm.csv, 36_4bg4d4bf4q.csv, 37_7swqi7vgnl.csv, 38_38doqz3unu.csv, 39_3064xz3lgl.csv, 3_5uxjfwf4c9.csv, 3_j0hn3a0xzt.csv, 40_zh8fjx6tiy.csv, 41_rrwls6y9fl.csv, 42_kr1ailiclk.csv, 43_1nemhlm74b.csv, 44_zsqcy1dp0n.csv, 45_4g2awfbbtn.csv, 46_eg7yytwm57.csv, 47_qrdef2nnjn.csv, 48_u6mx1k6p64.csv, 49_8fcz4c6oqy.csv, 4_vet6mgh5md.csv, 4_xu9afrhg0o.csv, 50_tmt7xio73x.csv, 51_km3hwahw1m.csv, 52_wfyl6gdoxn.csv, 53_4t2mju8m3y.csv, 54_07djfakfxh.csv, 55_o97cc78tyd.csv, 56_lxflttkcgn.csv, 57_l8ux51os5l.csv, 58_lvcsk0yy8g.csv, 59_tvvx1filo9.csv, 5_lcampqa5d8.csv, 5_wa4f0flvdd.csv, 60_zry6xyk2v2.csv, 61_l3jkp84irv.csv, 62_6v7alh2xza.csv, 63_j79b3nmr83.csv, 64_gjf3ihxyfj.csv, 65_5y9bhpar44.csv, 66_8gquz8hbpz.csv, 67_ip2ys2xhib.csv, 68_4fxf0cl1sa.csv, 69_4tlegp20n1.csv, 6_blz9uaf562.csv, 6_qyv4vhzoe.csv, 70_0b98yn7zuy.csv, 71_jjxizibivj.csv, 72_37k8qe6uf3.csv, 73_4s2wt6mgps.csv, 74_08dh1ae49m.csv, 75_gginvgfabf.csv, 76_f4jqlrtl0d.csv, 77_gup8yvrjg8.csv, 78_tshg4zfas1.csv, 79_wkvkhcnsn0.csv, 7_swfaqmk6y6.csv, 7_viucp080gt.csv, 80_zbkdixjx1u.csv, 81_dbvr7ry56o.csv, 82_3qr5zxdcxe.csv, 83_slgfcmc4xj.csv, 84_q4rh6yklie.csv, 85_i7f88e7b7g.csv, 86_kvt6l8gw8d.csv, 87_zzzdc841sd.csv, 88_p169fqr8hl.csv, 89_28gvnlaz8j.csv, 
8_hfxm8dkunn.csv, 8_v4bqc7mvzy.csv, 90_60znzzhvt3.csv, 91_6nvvo85k2r.csv, 92_u8wrf1zhh5.csv, 93_7s7jo61qgr.csv, 94_gcso78g1mh.csv, 95_6ikda3av09.csv, 96_fe55r9bnfd.csv, 97_ksnb8zg7e2.csv, 98_8et5hi6nqf.csv, 99_7fcky2iv54.csv, 9_m11ut0fzvo.csv, "
     ]
    }
   ],
   "source": [
    "# discover all concepts in the directory chosen.\n",
    "concept_dict, ids, label_strs, raw_bofs = capture_all_concepts(nlp, datapath, incmatcher, excmatcher)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data.shape = (2181, 1958)\n",
      "We have 2181 datapoints with 1958 independent concepts.\n"
     ]
    }
   ],
   "source": [
    "# now we convert the raw bag of concepts list-of-lists\n",
    "# to a presence absence matrix (pam)\n",
    "pam =  convert_raw_bof_to_pam(raw_bofs, C=len(concept_dict))\n",
    "print(f\"data.shape = {pam.shape}\")\n",
    "print(f\"We have {pam.shape[0]} datapoints with {pam.shape[1]} independent concepts.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No output requested.\n"
     ]
    }
   ],
   "source": [
    "# extract the concepts as a list\n",
    "concepts = concept_dict_to_list(concept_dict)\n",
    "show_concepts = False\n",
    "if show_concepts:\n",
    "    print(f\"concepts[:20] = {concepts[:20]}\")\n",
    "\n",
    "    print(f\"len(concept_dict) = {len(concept_dict)}\")\n",
    "    print(f\"concept_dict['it hit the ground before being fielded'] = {concept_dict['it hit the ground before being fielded']}\")\n",
    "else:\n",
    "    print(\"No output requested.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "labels_as_indices[:10] = [0 0 0 0 0 0 0 0 0 0]\n",
      "categories: ['ball' 'foul' 'none' 'out' 'play' 'strike']\n"
     ]
    }
   ],
   "source": [
    "## convert the labels to one hot vectors\n",
    "# there is one odd label that shouldn't be there. We change this to 'strike'.\n",
    "label_strs[label_strs == ' it could be called a strike because the pitch landed in the strike zone before being hit'] = 'strike'\n",
    "labels_as_indices, categories = create_labels_as_indices(label_strs)\n",
    "\n",
    "print(f\"labels_as_indices[:10] = {labels_as_indices[:10]}\")\n",
    "print(f\"categories: {categories}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### What do we have\n",
    " \n",
    "At this stage we have the following parameters:\n",
    "\n",
    "* `concept_dict` (dict) - a dictionary of `C` concepts. Each key is a string representing the raw concept, each value is the index.\n",
    "* `concepts` (list) - the reverse lookup version of concept dict (`concepts[i] = k` where `concept_dict[k]=i`)\n",
    "* `ids` - a list of `N` uniquely identifying labels for the datapoints\n",
    "* `label_strs` - a list of `N` labels stored as strings. One per datapoint.\n",
    "* `categories` - an ordered list of category strings, where `categories[i]` is the string representation of category `i`.\n",
    "* `labels_as_indices` - labels as an index which maps to the label str via the category.\n",
    "* `raw_bofs` - a list of lists of \n",
    "* `pam` - a presence/absence matrix where `pam[i,j]=1` if `i`th datapoint contains `j`th concept.\n",
    "\n",
    "We discard `concept_dict`, `label_strs`, `raw_bofs`, to avoid duplication and as we can recover these from the others. The remainder are passed to the store function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "concepts[688] = it was caught by the first basemen\n",
      "rows_to_remove = [  15   36   71  131  245  254  380  389  475  485  494  515  545  590\n",
      "  725  731  756  874  887  923 1074 1213 1217 1231 1236 1246 1364 1371\n",
      " 1424 1459 1469 1489 1559 1564 1569 1570 1646 1650 1728 1768 1805 1813\n",
      " 1823 1850 1868 1873 1885 1963 1998 2001 2048 2119 2134 2147 2149]\n",
      "concepts_to_remove = ['there is no video', \"the pitcher didn't throw the ball in this clip\", 'there is no video', 'the pitcher did not throw the ball', 'video did not work', 'there was no video of the the aftermath of the', 'the video is not loading at all', 'it to load', 'this video would not play', 'the video is broken', \"catcher's interference\", 'video did not', \"video didn't\", 'the baserunner returns to the base after attempting to steal a base', 'the video does not', 'the batter hit it foul', 'the count is already 22', 'video did not', 'the umpire called it a ball or a strike', 'the umpires decision', 'i have to rate this as no activity', 'the clip had lasted longer', \"the umpire's decision\", 'screen is black', 'line drive', 'video did not display or', 'the video is not working', 'no video with supported format and mime type found error message', 'video is not playing', 'the video failed to load', 'there is an error message', 'no video with supported format and mime type found', 'video did not load', 'the video did not play', 'then it was a groundout', 'then it is another groundout that is not listed', 'and the batter hit it out for a homerun', 'the pitcher did not throw the ball', 'video did not load', 'the video did not load', 'video did not', 'he readied for the next pitch', 'the video did not load', 'what happened', 'the network showing the game', 'the pitcher did throw the ball', 'they go back to the game', \"video doesn't\", \"the video doesn't load\", 'there was no activity of a pitch in the video clip', \"the pitcher didn't throw the ball\", 'the batter was awarded an intentional walk', 'the video did not load', 'video did not load', 'video would not load', 'i did not see this as a choice though']\n"
     ]
    }
   ],
   "source": [
    "print(f\"concepts[688] = {concepts[688]}\")\n",
    "rows_to_remove = np.where(label_strs=='none')[0]\n",
    "_, cols_to_remove = np.where(pam[\n",
    "    label_strs == 'none',:])\n",
    "concepts_to_remove = [ concepts[col] for col in cols_to_remove ]\n",
    "print(f\"rows_to_remove = {rows_to_remove}\")\n",
    "print(f\"concepts_to_remove = {concepts_to_remove}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "datadir = datapath\n",
    "csvfname = f\"{datastem}.csv\"\n",
    "file_info = np.array(get_file_info(datadir, csvfname))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['there is no video.',\n",
       "       \"the pitcher didn't throw the ball in this clip.\",\n",
       "       'there is no video/will not play',\n",
       "       'the pitcher did not throw the ball.', 'video did not work.',\n",
       "       'there was no video of the the aftermath of the pitch',\n",
       "       'The video is not loading at all. Been trying for a few minutes.',\n",
       "       'Once again video wouldnï¿½t load. Spent a few minutes trying to get it to load. Videos 1 and 10 donï¿½t work. 2 through 9 worked great though.',\n",
       "       'this video would not play.', 'The video is broken.',\n",
       "       \"the umpire called catcher's interference.\", 'video did not play',\n",
       "       'No video', \"video didn't load\",\n",
       "       'The baserunner returns to the base after attempting to steal a base.',\n",
       "       'the video does not load',\n",
       "       'the batter hit it foul but the count is already 2-2.',\n",
       "       'video did not load',\n",
       "       'the clip ended before we saw if the umpire called it a \"ball\" or a \"strike.\"  this looked like a \"borderline\" pitch, meaning it may have been called a \"strike\" or a \"ball.\" the clip cut off before we could see the umpires decision so i have to rate this as \"no activity.\"  if the clip had lasted longer, we would have seen the umpire\\'s decision.',\n",
       "       \"This is a ground out, which isn't an option.\", '',\n",
       "       'Screen is black. No activity', 'Screen black, No activity.',\n",
       "       'the batter got a line drive single.',\n",
       "       'video did not display or load', 'the video is not working.',\n",
       "       'No video with supported format and MIME type found error message.',\n",
       "       'Video is not playing :(',\n",
       "       'the video failed to load and the play button could not be clicked.',\n",
       "       'there is an error message. No video with supported format and MIME type found.',\n",
       "       'video did not load.', 'Bad format on the video.',\n",
       "       'The video did not play.', 'Broken video', 'Broken video',\n",
       "       'Broken video',\n",
       "       'then it was a groundout but none of those are listed.',\n",
       "       'then it is another groundout that is not listed.',\n",
       "       'and the batter hit it out for a homerun.',\n",
       "       '*the pitcher did not throw the ball...', 'video did not load.',\n",
       "       'The video did not load.', '', 'Video did not load',\n",
       "       'got the ball back from the catcher. He readied for the next pitch.',\n",
       "       'This is labeled as none because the video did not load.',\n",
       "       \"we don't know what happened. The network showing the game was airing a commercial. If the pitcher did throw the ball, we'll have to wait until they go back to the game to see what happened/\",\n",
       "       \"video doesn't play\", \"The video doesn't load.\",\n",
       "       'there was no activity of a pitch in the video clip.',\n",
       "       \"The pitcher didn't throw the ball. The batter was awarded an intentional walk.\",\n",
       "       'The video did not load.', 'video did not load.',\n",
       "       'video would not load.',\n",
       "       'Intentional walk. I did not see this as a choice though.'],\n",
       "      dtype='<U376')"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "file_info[rows_to_remove,-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['ZK1QJKN7QT2Q', 'WKIAO1SNZ7O9', '75XVC2M6XM6Z', ...,\n",
       "       'SWJI1NIX8WOL', 'XVWY30KEG31J', 'W1ND57WTYJNE'], dtype='<U12')"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filter_ = (label_strs=='none')[:10]\n",
    "ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Before removing none label\n",
      "pam.shape = (2181, 1958)\n",
      "len(concepts) = 1958\n",
      "ids.shape = (2181,)\n",
      "labels_as_indices.shape = (2181,)\n",
      "categories = ['ball' 'foul' 'none' 'out' 'play' 'strike']\n",
      "After removing none label\n",
      "pam.shape = (2126, 1911)\n",
      "len(concepts) = 1911\n",
      "ids.shape = (2126,)\n",
      "labels_as_indices.shape = (2126,)\n",
      "categories = ['ball' 'foul' 'out' 'play' 'strike']\n"
     ]
    }
   ],
   "source": [
    "resultsdict = dict(concepts=concepts, ids=ids, categories=categories,\n",
    "                   labels_as_indices=labels_as_indices, pam=pam)\n",
    "remove_none_labels = True\n",
    "if remove_none_labels:\n",
    "    print(\"Before removing none label\")\n",
    "    print(f\"pam.shape = {pam.shape}\")\n",
    "    print(f\"len(concepts) = {len(concepts)}\")\n",
    "    print(f\"ids.shape = {ids.shape}\")\n",
    "    print(f\"labels_as_indices.shape = {labels_as_indices.shape}\")\n",
    "    print(f\"categories = {categories}\")\n",
    "    rows_to_remove = (label_strs=='none')\n",
    "    pam = pam[~rows_to_remove,:]\n",
    "    cols_to_remove = (count_datapoints_in_each_feature(pam) == 0)\n",
    "    pam = pam[:,~cols_to_remove]\n",
    "    concepts = [ concept for concept, to_remove in zip(concepts, cols_to_remove) if not to_remove ]\n",
    "    label_strs = label_strs[~rows_to_remove]\n",
    "    labels_as_indices, categories = create_labels_as_indices(label_strs)\n",
    "    ids = ids[~rows_to_remove]                                                   \n",
    "    resultsdict = dict(concepts=concepts, ids=ids, categories=categories,\n",
    "                       labels_as_indices=labels_as_indices, pam=pam)\n",
    "    resultsdict['rows_to_remove'] = rows_to_remove\n",
    "    #labels_as_indices = labels_as_indices[~rows_to_remove]                                                   \n",
    "    #categories = categories[categories != 'none'] \n",
    "    print(\"After removing none label\")\n",
    "    print(f\"pam.shape = {pam.shape}\")\n",
    "    print(f\"len(concepts) = {len(concepts)}\")\n",
    "    print(f\"ids.shape = {ids.shape}\")\n",
    "    print(f\"labels_as_indices.shape = {labels_as_indices.shape}\")\n",
    "    print(f\"categories = {categories}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "for c in concepts:\n",
    "    if 'video' in c:\n",
    "        print(f\"video in {c}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "procpath = form_processed_fname(datastem, modelstem, 'pkl', 'raw_concepts')\n",
    "#store_concept_objects(procpath, resultsdict)\n",
    "pickle.dump(resultsdict, open( procpath, \"wb\" ))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#concepts, ids, categories, labels_as_indices, pam = load_concept_objects(procpath)\n",
    "resultsdict = pickle.load(open( procpath, \"rb\" ))\n",
    "\n",
    "#labels = label_indices_to_one_hot(labels_as_indices, num_categories=len(categories))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Briefly analyse discovered concepts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(data.shape[0],data.shape[1],concept_counts.shape) = (2126, 1911, (1911,))\n",
      "np.sort(concept_counts)[-1:-K:-1] = [109  66  48  46  43  28  20  20  17  17  16  15  13  13  12  12  12  11\n",
      "  11   9   9   9   9   9   9   8   8   8   8   8   8   8   8   7   7   7\n",
      "   7   7   7   7   7   7   7   7   7   7   6   6   6   6   6   6   6   6\n",
      "   6   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5   4   4\n",
      "   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   3   3\n",
      "   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3\n",
      "   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3\n",
      "   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3\n",
      "   3   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2\n",
      "   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2\n",
      "   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2\n",
      "   2]\n"
     ]
    }
   ],
   "source": [
    "concept_counts = count_datapoints_in_each_feature(pam)\n",
    "K = 200\n",
    "print(f\"(data.shape[0],data.shape[1],concept_counts.shape) = {(pam.shape[0],pam.shape[1],concept_counts.shape)}\")\n",
    "print(f\"np.sort(concept_counts)[-1:-K:-1] = {np.sort(concept_counts)[-1:-K:-1]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5, 1.0, 'Raw ranked concept counts Survey data')"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEaCAYAAAD+E0veAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAnDElEQVR4nO3deXxU9b3/8dcnk4SQBQKEfd9UFBEkggJ69arVolaruKDWpS61it62tr/Wapd7W6v2XltErdat1L1q1YorahUVREhcAUH2fd93SPL9/XFOwhCSySRk5pzJvJ+PRx6ZOdt8Zs7MvOd7vmcx5xwiIiIAGUEXICIi4aFQEBGRKgoFERGpolAQEZEqCgUREamiUBARkSoKBWkwM/utmT3ZCMs50cyWNUZNIjUxs/Fm9vug60gFCoWAmNkiM9tpZtvMbJX/ps0Pui5pODPrYWbOzDKDriXawYS3mRWa2WP+e3SrmX1jZj9v7BrDxMzeN7Org64jKAqFYJ3lnMsHBgKDgFuS+eBmFknm40lK+jOQD/QDWgLfAeY3ZEFhC0upmUIhBJxzq4C38MIBADP7hZnN93+dzTKz70aNW2xmg/3bl/q/Tg/3719tZi/X9Dh+a+QBM3vdzLYDJ5nZGWb2mZltMbOlZvbbqOkrf/lebmZLzGydmd1ay7KzzOwZM/unmWWbWSf/9lozW2hmN0VN29yvZaOZzQKOifX6mNkRZva2mW0ws9Vm9kt/eDMzG2tmK/y/sWbWzB93opktM7ObzWyNma00syur1XC3/1puNrOPzKy5P+5YM5tiZpvM7AszOzFqvvfN7A4zm+bP9y8za+2P/sD/v8lvAR5Xw3OJmNkvo9ZtqZl19ccNM7Pp/nKnm9mwqPkWmdkpUferfv3HWk9mdjrwS+BCv6Yv/OFXmNkCv4aFZnZJLS//McDTzrmNzrkK59xs59wL1R636ss++le2/xiTzezPZrYB+J3/mvaPmr6teS3mdv79M83sc3+6KWY2wB/+MzP7Z7XX8l4zG1tT0WY2yMw+9Z/fP4CcqHGtzOxV/7250b/dxR93O3A8cJ//et3nD7/HvM/HFn+dHV/L65X6nHP6C+APWASc4t/uAnwF3BM1/nygE15wXwhsBzr64x4HbvZvP4T3y+2HUeN+XMtjjgc2A8P95eYAJwJH+vcHAKuBc/zpewAOeBhoDhwF7Ab6+eN/Czzpj3vNX37EX1Yp8GsgG+gFLABO8+e7E/gQaA10BWYAy2qpuQBYCdzs11sADPXH/Q8wFWgHtAWmAL/zx50IlPnTZAEjgR1AK3/8/cD7QGe/5mFAM//+en/6DOBU/35bf773geVAfyAP+CfwZLXXKzPGev+Zv64PBcx/Tdv4r8VG4HtAJjDav9+m+vsl+rWvz3qKmjcP2AIc6t/vCBxRS72PADOBK4G+1cYd8Hz91+dq//YV/jq40X9OzYHHgNujpr8BeNO/fTSwBhjqr5PL/efdzK9xO1DoT5vpTzu4hpqzgcXAj/11PwrYC/zeH98GOA/IxXs/PQ+8XNNziBp2qT9fJt57cRWQE/T3SEK+m4IuIF3//Df7NmCr/8F6t/INX8v0nwNn+7evAl7xb38NXA08699fDBxdyzLGA4/XUddY4M/+7coPfZeo8dOAi/zbvwVeASYB4wDzhw8FllRb7i3A3/zbC4DTo8ZdS+2hMBr4rJZx84GRUfdPAxb5t08EdrL/F9Ya4Fi8L/udwFE1LPPnwBPVhr0FXO7ffh+4M2rc4cAe/0us8vWKFQpzKtdjteHfA6ZVG/YxcEXU+6WuUIi1nqqHwia8L8bmdbwfmuO1NErxvljnAd+u9rixQqH6++AUYEHU/cnAZf7tB/BDvdrr9R/+7TeAa/zbZwKzaqn5BGBF5fvRHzYFPxRqmH4gsLGm5xDjddlY0/unKfxp81GwznHOFeB9gR0GFFWOMLPLoprRm/B+mVaOnwQcb2Yd8L6M
/gEMN7MeeNt9P4/xmEuj75jZUDN7z29Kbwaui67Dtyrq9g68bcyVjsVrYdzp/E8L0B3oVFm7X/8vgfb++E7V6lgco96u1L4Nu1O1eRf7wyqtd86V1VB7EV6ro6bldgfOr1b7CLxfqpWq157Fga9ZbWp7PtWfS+WyO8e5XIi9nqo457bjtT6vA1aa2Wtmdlgt0+50zv3BOTcY75fyc8DzUZvM6rK02v1/A8399113vC/kl/xx3YGbq732Xdm3Tv+O94sd//8TtTxmJ2B51PsRol5bM8s1s7/6mw634G32K7QYfWzmbYb82t+0twnvcxbvOk8pCoUQcM5NwvsV/38A/oflYWAM3uaDQrxNLOZPPw/vQ38T8IFzbiveF8K1wEfOuYpYD1ft/tN4v/a7OudaAg9WPk6cJgJ3AO+aWeWX/lJgoXOuMOqvwDk30h+/Eu/DXqlbjOUvBXrXMm4F3hdJ9HJWxFHzOmBXLctditdSiK49zzl3Z9Q01Wvf6y8znlMO1/Z8qj+XymUv929vx9vcUalDHI9V6YC6nHNvOedOxQu72Xjvt9gLcW4L8Ae8lkZPvybqqGu/x/bfm8/htQAvBl7137/gvTa3V3vtc51zz/jjXwYG+H0SZwJP1VLqSqCzmUW/j6PfYzfjbb4b6pxrgdeygH3v+/1q9vsPfg5cgLf5sRBvM2x9PicpQ6EQHmOBU81sIN6HzgFrAczrIO1fbfpJeKExyb//frX78SoANjjndpnZELwPar045/6IFy7vmlkR3qaLLWb2c/M6dCNm1t/MKjuUnwNu8Tv8uuBtc67Nq0AHM/uReR3LBWY21B/3DHCb31lZhNeHUeeul/4X02PAn8zrEI+Y2XHmdVI/CZxlZqf5w3PM67TuErWIS83scDPLxeuzeME5V463virw+lBq8wheh2tf8wwwszbA68AhZnaxmWWa2YV4m6Ze9ef7HLjIvA79Yrzt5PFaDfQwswwAM2tvZt8xszy8vodtQHlNM5rZr8zsGPN2HsgB/gtv09Mc59xavNC61H+tvk/tAR7tabyWyiX+7UoPA9f5rQgzszzzdoQoAHDO7QJe8OeZ5pxbUsvyP8bry7jJfy3PBYZEjS/A23y4yW/x/Kba/KvZfx0W+MtbC2Sa2a+BFnE8z5SkUAgJ/wP2OPAr59ws4G68N/dqvI7gydVmmYT3Zv2glvvxuh74HzPbivel+lwD6/8d3i+5d/Ca1mfhbRpYiPcr+hF/OMB/4zXnF+K1NGrbDID/K/JUf3mrgLnASf7o3wMlwJd4nbef+sPi8VN/nunABuAuIMM5txQ4G29z11q8X68/Y//PyhN4LbtVeJuhbvJr3QHcDkz2N38cW8Pj/gnvNZ6I19n7KN52/fV4v35vxuvY/n/Amc65df58v8L7wt2I9/o9Tfye9/+vN7NP/edyM17rZAPwH3jvg5o44G9463AF3ro4wzm3zR9/Dd7rsx44Am/bfUzOuU/wWhmd8PoJKoeX+Mu7D+95zsPrl4j2d7zPQ6z3zB7gXH/ejXgB9GLUJGPx+krW4e2o8Ga1RdwDjPL3TBqH16f0BvAN3vt2FwduFmsyKjsGRSQOZvY+XqftI0HXko7MrBve5q4O/uYsaWRqKYhISvA3f/0Eb087BUKC6AhDEQk9v/9jNd7mm9MDLqdJ0+YjERGpos1HIiJSRaEgIiJVUrpPoaioyPXo0SPoMkREUkppaek651zbmsaldCiU5Rbx0sQP6No6t+6JRUQE8M60XNu4UG0+MrNzzOxh805H/K26pt9dVsEZ4z7k7Vmrk1GeiEiTl/BQMO+qTWvMbEa14aeb2Rwzm2dmvwBwzr3snLsG70jEC+tadt92+XRrk8s1j5dw+2uz2Fse65Q/IiJSl2S0FMZTbb9i/2yE9wPfxju/y2jzLxLju80fH1N2ZgYvXDeM7x3bnYc/XMiFf/2YFZt2Nl7lIiJpJuGh4Jz7AO/8KtGGAPOccwv885Q8C5ztnwTr
LuAN59yn8Sw/JyvC787pz72jB/HN6m2MHPch781e07hPQkQkTQTVp9CZ/U8otcwfdiPeRThGmdl1Nc1oZteaWYmZlaxdu7Zq+FlHdWLCjSPo2LI5V46fzp1vzKZMm5NEROolqFCo6Tzkzjk3zjk32Dl3nXPuwZpmdM495Jwrds4Vt227/x5VPYvyeOn6YYwe0o0HJ81n9MNTWbV5VyLqFxFpkoIKhWXsf6GSLsR3cZQ65WRFuOPcIxl74UBmrtjCyHEfMumbtXXPKCIigYXCdKCvmfU0s2zgIryrfzWacwZ15pUxI2ib34wr/jaNuyfOobxC53kSEYklGbukPoN3sZhDzWyZmV3lXzd3DN7FK74GnnPOzWzsx+7TLp+XbxjO+YO7cO+/53HJI1NZs0Wbk0REapOSZ0k1s7OAs/r06XPN3Llz45rnhdJl3PbyV+Q3y+KeiwYyvE+TvOa2iEidzKzUOVdc07hQHdEcL+fcBOfctS1btqx7Yt+owV14ZcwICnOzuPTRTxj7zjfanCQiUk1KhkJDHdK+gFfGDOe7Azsz9p25XP7YNNZu3R10WSIioZFWoQCQm53J3RccxV3nHcn0RRsYOe5Dpi5YH3RZIiKhkHahAGBmXHhMN16+YTgFzTK5+OGp3PfvuVRoc5KIpLmUDAUzO8vMHtq8efNBLadfxxa8cuMIzhzQif+b+A1XjJ/O+m3anCQi6SslQ6EhHc21yW+WyT0XDeT27/Zn6oL1nDHuI6Yvqn6qJhGR9JCSodDYzIxLhnbnxR8OIycrg4semsoD78/X5iQRSTsKhSj9O7dkwo0jOP2IDtz15myufryEjdv3BF2WiEjSKBSqKcjJ4r6LB/E/Zx/BR3PXcca4DyldvDHoskREkkKhUAMz47LjevDPHw4jEjEu/OvHPPzBAlLx6G8RkfpIyVBorL2P6nJkl5a8euPxnNyvHbe//jXXPF7K5h17E/qYIiJBSslQaMy9j+rSsnkWD146mF+feTiTvlnDyHEf8vnSTQl/XBGRIKRkKCSbmfH9ET15/rphAJz/4BSe/mRJwFWJiDQ+hUI9DOxayGs3jWBY7yJ++dJX/PpfM9irS36KSBOiUKinwtxsHrviGH5wQi8e/3gxlz06jQ3abVVEmgiFQgNEMoxbRvbjTxccRemSjZx9/0fMXrUl6LJERA6aQuEgnHt0F577wXHs3lvBuX+ZwlszVwVdkojIQUnJUEjWLqnxGNi1kAk3jqBv+wJ+8EQp9747V8cziEjKSslQSOYuqfFo3yKHf1x7LOcO6szdb3/DmKc/Y8eesqDLEhGpt8ygC2gqcrIi3H3BURzWsYA735jNwnXbeeiywXRplRt0aSIicUvJlkJYmRnXntCbR684hqUbd3D2fZN1Gm4RSSkKhQQ46dB2vHzDcFo2z+Lih6fyzDQd6CYiqUGhkCC92+bz0g3DGda7iFte/Irf6EA3EUkBCoUEatk8i8euOIZrT+jF3/0D3XR9BhEJM4VCgkUyjF+O7Mfd53sHun3n/o+Ys2pr0GWJiNRIoZAk5w3uwj+uPdY/0G0yE3Wgm4iEUEqGQpgOXquPQd1aMeHGEfRpl8+1T5Ry98Q5zFyxmZ17yoMuTUQEAEvlo2+Li4tdSUlJ0GXU26695dzy4le89NnyqmGdC5vTq20evdvm7/e/Q4sczCzAakWkqTGzUudccY3jFArBcM4xd8025q7exoK125i/dhsL1m1n/pptbI9qOeRmR+jVNo9eRV5IDOjSkmG9i8jJigRYvYikslihoCOaA2JmHNK+gEPaF+w33DnHmq27mb9mG/P9kFiwbjulizcy4csVOOcFxYmHtuW0Izpw4qHtaNk8K6BnISJNjUIhZMyM9i1yaN8ih2F9ivYbt3NPOZ8sXM/EWat5e9ZqXv9qFVkR49hebTjtiA6cenh72rfICahyEWkKtPkoRVVUOD5buomJM1fx1sxVLFq/A4BB3Qr51uEduOiYrrTKyw64
ShEJI/UpNHGV/RNvzVjFxFmr+Wr5Zk48tC3jrxwSdGkiEkLqU2jiovsnbjy5L39++xvueXcui9dvp3ubvKDLE5EUkpLHKUhso4d0I5JhPK0T8YlIPSkUmqAOLXM4pV87ni9Zxu4yHRgnIvFLyVBI1SOak+nSY7uzYfse3pyh02mISPxSMhTCdjnOMBreu4jubXJ5aqo2IYlI/FIyFKRuGRnGJUO7MW3RBp2VVUTiplBowkYN7kp2JIOnP1kcdCkikiIUCk1Y67xsRh7ZgRc/Xc723WVBlyMiKUCh0MRdemx3tu4uY8IXK4IuRURSgEKhiRvcvRWHti/gyU8Wk8pHr4tIcigUmjgz49JjuzFj+Ra+XKZdeEUkNoVCGjhnUGdysyM8pQ5nEamDQiENFORkcfbATrzyxQo279gbdDkiEmIKhTRxydDu7NpbwYufLQu6FBEJMYVCmujfuSVHdS3kqU+WqMNZRGqlUEgjlw7txrw12/hk4YagSxGRkErJUNAJ8RrmzAGdaJGTyVOf6HxIIlKzlAwFnRCvYZpnRxg1uCtvzljJ2q27gy5HREIoJUNBGu7iod3YW+54rmRp0KWISAgpFNJMn3b5DOvdhienLmZveUXQ5YhIyCgU0tD3h/dk5eZdugCPiBxAoZCG/vOwdvRok8tjkxcGXYqIhIxCIQ1lZBhXDu/JZ0s28emSjUGXIyIholBIU6MGd6EgJ5PHPlJrQUT2USikqbxmmYwe0o03Zqxi+aadQZcjIiGhUEhjlw/rAcDjUxYFWoeIhIdCIY11LmzO6Ud04JlpS3S5ThEBFApp7/sjerJlVxn//FRnTxURhULaO7pbIUd1LeRvkxdRUaGzp4qkO4VCmjMzrhrRk4XrtvPenDVBlyMiAVMoCN/u34GOLXN0MJuIKBQEsiIZXHZcDybPW8/XK7cEXY6IBEihIACMHtKV5lkRHcwmkuYygy5AwqEwN5vzBnfmuenLuPCYruRm7//WyM2O0KMoL6DqRCRZFApS5crhPXnqkyWMevDjGsffdkY/rj6+V5KrEpFkSslQMLOzgLP69OkTdClNSu+2+bz4w2Gs3nLgVdmenb6EP745hxF9izisQ4sAqhORZDDnUnff9OLiYldSUhJ0GWlh/bbdnDb2Q4rys/nXmOE0y4wEXZKINJCZlTrnimsap45miUub/Gb8cdSRzF61lT9N/CbockQkQRQKErf/PKw9Fw/txkMfLmDqgvVBlyMiCaBQkHq5dWQ/urfO5ebnvmDLrr1BlyMijUyhIPWS1yyTP104kJWbd/Lfr8wKuhwRaWQpufeRBOvobq0Yc1Ifxv17Hss37SArsu+3RX6zTG46uS/9OmoPJZFUpFCQBrnx5L6s3rKbuWu2sqesomr4jOWbeefr1Yw5qS/Xn9R7v8AQkfBTKEiDZEUyuGvUgAOGb9i+h/+eMJM/v/MNb81cxf+eP4AjOrUMoEIRaQj9jJNG1Tovm3suGsRfvzeYNVt3c/Z9k3nog/mk8vEwIulEoSAJcdoRHXjnJydw6uHt+cPrs/nhk59qbyWRFKBQkIQpzM3mL5cczW1n9OPtr1dz9n2Tmb1Kp+YWCTOFgiSUmXH18b145ppj2b67jO/eP4V5a7YGXZaI1EKhIEkxpGdrXhkzgkiG8cc35wRdjojUQqEgSdOhZQ4/OKEXE2etpnTxxqDLEZEaKBQkqa46vidF+c24643Z2iNJJITiCgUzGx7PMJG65GZn8l+n9GXaog28N2dN0OWISDXxthTujXOYSJ0uOqYrPdrkctcbcyivUGtBJExiHtFsZscBw4C2ZvaTqFEtAF1lRRokK5LBT087lDFPf8bvXp1FT//az73a5nF837YBVyeS3uo6zUU2kO9PVxA1fAswKlFFSdM3sn9HhvVewvgpi/Ybfu/oQZx1VKdgihKR2KHgnJsETDKz8c65xUmqSdJARobxxFVD2bzTO8q5vMJxw1Of8tPnv6B7m1wGdCkMtkCR
NBVvn0IzM3vIzCaa2b8r/xJamTR5kQyjdV42rfOyaVvQjAcuPZqi/GZc83gJq7fsCro8kbRk8ewWaGZfAA8CpUB55XDnXGniSqtbcXGxKykpCbIEaWRfr9zCeQ9MoVVuNl1aNd9vXCTDGHNSH4b1KQqoOpGmwcxKnXPFNY2Lt6VQ5px7wDk3zTlXWvnXiDWKANCvYwsevHQwPYpyDxi3YO12bnr2Mzbv0In1RBIl3uspTDCz64GXgN2VA51zGxJSlaS1Ew5pywmHHLgX0swVm/nOfZO5882vuePcA6/lICIHL96WwuXAz4ApeJuQSgFtt5GkOqJTS64a0ZNnpi1l2kL9HhFJhLhaCs65nokuRCQePzqlL699uZLrnyqlV1E+kQzj1jP60b+zru4m0hjiCgUzu6ym4c65xxu3HJHYcrMzGTd6EGPf+Yayckfp4o28ULpMoSDSSOLtUzgm6nYOcDLwKdBooWBmvYBbgZbOOR0YJ7Ua3L0VT1w1FIBLHpnKJ9qUJNJo4upTcM7dGPV3DTAI72jnmMzsMTNbY2Yzqg0/3czmmNk8M/uF/xgLnHNXNeRJSPoa0qMNs1dt0R5JIo2koafO3gH0jWO68cDp0QPMLALcD3wbOBwYbWaHN7AOSXNDe7XGOZi+SK0FkcYQb5/CBKDyKLcI0A94rq75nHMfmFmPaoOHAPOccwv8ZT8LnA3MirOWa4FrAbp16xbPLNKEDexaSHZmBp8sXM8ph7cPuhyRlBdvn8L/Rd0uAxY755Y18DE7A0uj7i8DhppZG+B2YJCZ3eKcu6OmmZ1zDwEPgXdEcwNrkCYiJyvCwK6F6lcQaSTx9ilMAmbjnSm1FbDnIB7Tan4It945d51zrndtgSBSk6E9WzNj+Wa27lK/gsjBivfKaxcA04DzgQuAT8ysoXsILQO6Rt3vAqxo4LJEGNqzDRUOXfdZpBHEu/noVuAY59waADNrC7wDvNCAx5wO9DWznsBy4CLg4gYsRwSAo7sXkplh/OW9+Xw0d13MaTMyjNFDulVd2EdE9hdvKGRUBoJvPXG0MszsGeBEoMjMlgG/cc49amZjgLfwOq0fc87NrE/RZnYWcFafPn3qM5s0UbnZmZwxoCPvzFrNzBWbY067Y28567bu5k8XDkxOcSIpJt5TZ/8vMAB4xh90IfClc+7nCaytTjp1ttTXT5//gokzV1Fy26lkZzZ0j2yR1NbgU2ebWR8zG+6c+xnwV7xgOAr4GH8PIJFUcvoRHdiyq4ypC9YHXYpIKNX1U2kssBXAOfeic+4nzrkfA6/740RSyoi+ReRmR3hz5qqgSxEJpbr6FHo4576sPtA5V1LDQWkioZeTFeGkQ9sxceZqrhrRs8b9o6vrVNicnKxIwmsTCYO6QiEnxrjmMcaJhNbp/Tvw2lcrOfnuSXFN/63D2/PQZTVufhVpcuoKhelmdo1z7uHogWZ2Fd6FdgKhvY/kYIw8siNZkQx2l5XXOe0/P13O1AXrqahwZGTE064QSW11hcKPgJfM7BL2hUAx3hlSv5vAumJyzk0AJhQXF18TVA2SuiIZxun9O8Q17a695XzwzVoWrt9O77b5Ca5MJHgxQ8E5txoYZmYnAf39wa855/6d8MpEQmBg11YAfL5kk0JB0kK8l+N8D3gvwbWIhE6fdvnkZUf4fOkmzhvcJehyRBIu3iOaRdJSJMMY0KWQz5ZuZG95Rb3mNSAzogPkJLUoFETqMKhbIX95fz59b32jXvNlZhhPXT2Uob3aJKgykcaXkqGgvY8kma4Y3oP8nEwqKuK/fEdZhWPsO3P5fOkmhYKklJQMBe19JMnUriCH60+s/w+Q8VMWsXjDjgRUJJI42uApkiDdW+eyVKEgKUahIJIgXVvnskShIClGoSCSIN3b5LJ8407K6rnXkkiQFAoiCdKtdS5lFY6Vm3cFXYpI3FKyo1kkFXRr7V3y89Q/TyJiB543KT8nk5dvGE7Hljq3pIRHSoaCdkmVVFDc
oxU/OqUv23aVHTBu3bbdvPz5Cmav2qpQkFBJyVDQLqmSCrIiGfzolENqHLd0ww5e/nwFa7fuTnJVIrGpT0EkAG0LmgEoFCR0FAoiAcjJilCQk6lQkNBRKIgEpG1BM4WChI5CQSQgbfMVChI+CgWRgLQtaMbabQoFCZeU3PtIpCloV5DDxE2rue3lr+qc9rheRZwxoGMSqpJ0l5KhoOMUpCk4tldrJny5gje+WhVzuq27y5g8b71CQZLCnIv/HPFhU1xc7EpKSoIuQyShbn3pK96csYrSX50adCnSRJhZqXOuuKZx6lMQCbkWzbPYsmsvqfwDTlKHQkEk5FrkZLG33LFrr862KomnUBAJuYIcr+tv6669AVci6UChIBJyLZpnAbBFoSBJoFAQCbkWfkthSw1nWxVpbAoFkZAryPFbCjvVUpDEUyiIhFzL5mopSPKk5MFrIumksqUwZd66OndL7dMunyM6tUxGWdJEpWQo6IhmSSeFuVnkZUd4dvpSnp2+NOa0HVvm8PEtJyepMmmKUjIUdOU1SSfNMiN89PP/ZMOOPTGne/D9+bz21cokVSVNVUqGgki6aZWXTau87JjTtG+Rw6695TjnMLMkVSZNjTqaRZqI5tkRKhzsKdeRz9JwCgWRJiInKwKg02HIQVEoiDQRzatCoTzgSiSVKRREmoicLO/jvHOPQkEaTqEg0kRUtRTKFArScAoFkSYiJ9sLBbUU5GAoFESaiJxMdTTLwVMoiDQRzbPV0SwHT6Eg0kRU9insVCjIQdARzSJNROXeR0s37GDB2m2Nuux2LXLIb6avi3SgtSzSRFSeTfWON2ZzxxuzG3XZh7Yv4K0fn9Coy5RwSslQ0FlSRQ7UOi+bp68Zytqtuxt1uc+VLGXOqq2NukwJr5QMBZ0lVaRmw3oXNfoyP128kRnLtzT6ciWc1NEsIjFFMjIor4h9cR9pOhQKIhJTZsQoq9CxD+lCoSAiMWVmmFoKaUShICIxZWYYZQqFtKFQEJGYIhkZOAcVCoa0oFAQkZgyI96lPdVaSA8KBRGJKZLhhYL6FdKDQkFEYsr0Q2Gv9kBKCwoFEYmpqqVQrpZCOlAoiEhMmRHva0J9CulBoSAiMWWqTyGtKBREJKbKzUc6qjk9KBREJCa1FNKLQkFEYtrXUlAopAOFgojElJnhdzRr76O0kJLXUxCR5KlsKTw4aT5t8rIDqaFLq+ZcMbxnII+dblIyFHTlNZHk6d02j6L8bCbOXBXI4+8td+wpr2BUcVddJzoJzLnUbRIWFxe7kpKSoMsQkQR65MMF/P61r/nyt9+ihX8dajk4ZlbqnCuuaZz6FEQk1My8zVdOe8QmhUJBRELN/P+O1N2qkUoUCiISan5DgRTe0p1SFAoiEmoZlZuPAq4jXSgURCTU9rUUFAvJoFAQkVCr7FPQAdXJoVAQkXCr2nykVEgGhYKIhFplS0GZkBwKBREJtao+hWDLSBsKBREJtaq9j5QKSaFQEJFQ29fRrFRIBoWCiISaNh8ll0JBRELNqNx8pFhIBoWCiISbTnORVAoFEQm1yo5mSQ6FgoiEmjqak0uhICKhprOkJpdCQURCTXsfJZdCQURCTXsfJZdCQURCTS2F5FIoiEiomU5zkVQKBREJtaprNCsVkkKhICKhps1HyaVQEJFQ29fRHHAhaUKhICKhllHVUlAqJINCQURCTQevJZdCQURCzksFneYiOTKDLqCSmeUBfwH2AO87554KuCQRCQG1FJIroS0FM3vMzNaY2Yxqw083szlmNs/MfuEPPhd4wTl3DfCdRNYlIqlD50hNrkS3FMYD9wGPVw4wswhwP3AqsAyYbmavAF2Ar/zJyhNcl4ikiMpTZ//ixS/Jyw7Nxo2UdPmwHow8smPMaRL6CjvnPjCzHtUGDwHmOecWAJjZs8DZeAHRBficGC0YM7sWuNa/u7t6KySJWgKbA1hGvPPUNV2s8bWNq2l49WFFwLo46kuExlgnDVlO2NcJ
BLdeGm2dLA7fZ+Vghyd9nTy372bfWidyziX0D+gBzIi6Pwp4JOr+9/BaE3nA34AHgEviXHZJouuP8dgPBbGMeOepa7pY42sbV9Pw6sNSfZ00ZDlhXydBrpeg1kky1svBDg/rZyWItlhNmwidc247cGWyizkIEwJaRrzz1DVdrPG1jatpeGO8Do2lsWqp73K0TmoX1DqpzzwNXS+NNTwItdZifmokjL/56FXnXH///nHAb51zp/n3bwFwzt3RgGWXOOeKG7FcOUhaJ+Gk9RI+YV0nQRynMB3oa2Y9zSwbuAh4pYHLeqjxypJGonUSTlov4RPKdZLQloKZPQOciNehshr4jXPuUTMbCYwFIsBjzrnbE1aEiIjELeGbj0REJHXoNBciIlJFoSAiIlWaVCiYWZ6Z/d3MHjazS4KuR8DMepnZo2b2QtC1yD5mdo7/OfmXmX0r6HoEzKyfmT1oZi+Y2Q+DqiP0oaDzJ4VPfdaJc26Bc+6qYCpNL/VcLy/7n5MrgAsDKDct1HOdfO2cuw64AAhsV9XQhwLe+ZNOjx4Qdf6kbwOHA6PN7HC802Qs9SfT+ZMSZzzxrxNJnvHUf73c5o+XxBhPPdaJmX0H+Ah4N7ll7hP6UHDOfQBsqDa46vxJzrk9QPXzJ0EKPLdUVc91IklSn/VinruAN5xznya71nRR38+Kc+4V59wwILDN36n6xdmZfS0C8MKgM/AicJ6ZPUC4DilPBzWuEzNrY2YPAoMqj16XpKrts3IjcAowysyuC6KwNFbbZ+VEMxtnZn8FXg+mtBBdZKeemsr5k5qS2tbJekBfOsGpbb2MA8YluxgBal8n7wPvJ7eUA6VqS2EZ0DXqfhdgRUC1iEfrJJy0XsIn1OskVUOhMc+fJI1D6ySctF7CJ9TrJPSh4J8/6WPgUDNbZmZXOefKgDHAW8DXwHPOuZlB1plOtE7CSeslfFJxnejcRyIiUiX0LQUREUkehYKIiFRRKIiISBWFgoiIVFEoiIhIFYWCiIhUUSiIxMnMys3sczObYWYTzKzwIJa1rRFLE2k0CgWR+O10zg10zvXHO/PlDUEXJNLYFAoiDfMx3tkuMbMhZjbFzD7z/x/qD7/CzF40szfNbK6Z/bH6QsysyMw+NrMzkly/SI1S9SypIoHxL5JyMvCoP2g2cIJzrszMTgH+AJznjxsIDAJ2A3PM7F7n3FJ/Oe3xznlzm3Pu7SQ+BZFaKRRE4tfczD4HegClQOUXeUvg72bWF3BAVtQ87zrnNgOY2SygO9659LPwrq51g3NuUlKqF4mDNh+JxG+nc24g3hd7Nvv6FH4HvOf3NZwF5ETNszvqdjn7foiV4QXLaYksWKS+FAoi9eT/8r8J+KmZZeG1FJb7o6+IdzHA94HDKi/cLhIGCgWRBnDOfQZ8gXcu/D8Cd5jZZCBSj2WU+/OfZGbXJ6RQkXrSqbNFRKSKWgoiIlJFoSAiIlUUCiIiUkWhICIiVRQKIiJSRaEgIiJVFAoiIlJFoSAiIlX+P12BqbIkq6eKAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plot the raw concept counts with plot_rank_versus_freq and title the figure.\n",
    "plot_rank_versus_freq(concept_counts)\n",
    "# The trailing semicolon keeps the Text(...) repr returned by plt.title out of the cell output.\n",
    "plt.title(\"Raw ranked concept counts Survey data\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Top 40 concepts:\n",
      "\tRank\tID\tCount\tConcept\n",
      "\t0\t6\t109 : the batter did not swing\n",
      "\t1\t62\t66 : the batter hit the ball\n",
      "\t2\t34\t48 : the batter didn't swing\n",
      "\t3\t63\t46 : it hit the ground\n",
      "\t4\t7\t43 : the batter swung and missed\n",
      "\t5\t203\t28 : the pitcher threw the ball\n",
      "\t6\t984\t20 : the batter did not swing at the ball\n",
      "\t7\t50\t20 : the batter did not swing at the pitch\n",
      "\t8\t42\t17 : it was a strike\n",
      "\t9\t249\t17 : the batter made contact\n",
      "\t10\t532\t16 : the umpire called it a ball\n",
      "\t11\t124\t15 : the batter did not swing at it\n",
      "\t12\t79\t13 : the batter hit the ball in the air\n",
      "\t13\t404\t13 : it was outside the strike zone\n",
      "\t14\t41\t12 : the ball was in the strike zone\n",
      "\t15\t93\t12 : the hitter hit the ball\n",
      "\t16\t8\t12 : it was in the strike zone\n",
      "\t17\t983\t11 : the ball landed outside of the strike zone\n",
      "\t18\t231\t11 : the batter hit the ball into foul territory\n",
      "\t19\t236\t9 : the batter made contact with the ball\n",
      "\t20\t82\t9 : the batter hit the\n",
      "\t21\t0\t9 : the ball was not thrown through the strike zone\n",
      "\t22\t413\t9 : the umpire called it a strike\n",
      "\t23\t160\t9 : the batter hit it\n",
      "\t24\t521\t9 : the batter hit it into foul territory\n",
      "\t25\t1025\t8 : it touched the ground\n",
      "\t26\t400\t8 : who caught the ball in the air\n",
      "\t27\t398\t8 : the batter hit the ball on the ground\n",
      "\t28\t222\t8 : the batter swung and missed the ball\n",
      "\t29\t259\t8 : it was called a ball\n",
      "\t30\t225\t8 : it was a ball\n",
      "\t31\t552\t8 : it was caught in the air by the center fielder\n",
      "\t32\t1\t8 : additionally the batter did not swing at the pitch\n",
      "\t33\t45\t7 : it was caught\n",
      "\t34\t71\t7 : the batter's knees\n",
      "\t35\t101\t7 : the batter swung and\n",
      "\t36\t38\t7 : it was outside of the strike zone\n",
      "\t37\t176\t7 : it was below the strike zone\n",
      "\t38\t39\t7 : the batter didn't swing at it\n",
      "\t39\t91\t7 : the umpire called the pitch a ball\n"
     ]
    }
   ],
   "source": [
    "# Print the K=40 most frequent concepts; the saved output lists rank, ID, count and concept text.\n",
    "display_most_frequent_concepts(concepts, pam, K=40)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Miscellaneous"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "line drive: with rawid 22 is not a concept\n",
      "therefore the batter is: with rawid 31 is not a concept\n",
      "batter swung and: with rawid 102 is not a concept\n",
      "s pop up fly: with rawid 103 is not a concept\n",
      "hit the ball to left center field: with rawid 213 is not a concept\n",
      "that ball hit into the air: with rawid 445 is not a concept\n",
      "the batter: with rawid 468 is not a concept\n",
      "batter swung and missed the ball: with rawid 573 is not a concept\n",
      "it did was not a fair ball: with rawid 640 is not a concept\n",
      "a line drive: with rawid 986 is not a concept\n",
      "a base hit: with rawid 1055 is not a concept\n",
      "fielder's: with rawid 1558 is not a concept\n",
      "batter swung and pitch and missed the ball which is a: with rawid 1754 is not a concept\n",
      "batter swung at pitch and missed ball which is a strike: with rawid 1755 is not a concept\n",
      "the batter was: with rawid 1792 is not a concept\n",
      "running was thrown out at first: with rawid 1906 is not a concept\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: pass every entry of `concepts` through is_concept and\n",
    "# record the position (rawid) of each one that is rejected.\n",
    "not_concepts = []\n",
    "for rawid, concept in enumerate(concepts):\n",
    "    # accepted concepts need no further handling\n",
    "    if is_concept(nlp(concept), incmatcher, excmatcher):\n",
    "        continue\n",
    "    print(f\"{concept}: with rawid {rawid} is not a concept\")\n",
    "    not_concepts.append(rawid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot check of is_concept on one hand-written sentence; the saved output for this cell is True.\n",
    "is_concept(nlp(\"it is a foul ball\"), incmatcher, excmatcher)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
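    "## tests for inputs that break the nlp model (disabled: every line below is commented out)\n",
    "# NOTE(review): the U+FFFD replacement characters in the texts below look like mis-decoded curly quotes;\n",
    "# they are left verbatim because they may be part of what breaks the model -- confirm before repairing.\n",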
    "#texts = []\n",
    "#texts.append(\"there was not a hit but rather a ground ball to the first baseman.\")\n",
    "#texts.append(\"The first baseman fielded the grounder and threw the ball to the covering pitcher for an out at first base.\")\n",
    "#texts.append(\"there was not a hit but rather a ground ball to the first baseman. The first baseman fielded the grounder.\")\n",
    "#texts.append(\"The batter took a �check swing,�.\")\n",
    "#texts.append(\"The batter completed a �full swing.� The first base umpire declared that the batter did not take a full swing.\")\n",
    "#texts.append(\"the ball was not thrown through the strike zone. The batter took a �check swing,� and the umpire behind the plate appealed to the first base umpire, who had a better view, to determine whether the batter completed a �full swing.� The first base umpire declared that the batter did not take a full swing, and therefore the pitch was called a ball, rather than a strike.\")\n",
    "##texts.append(\"there was not a hit but rather a ground ball to the first baseman.  The first baseman fielded the grounder.\")\n",
    "##text = \"The cat sat on the mat.\"\n",
    "##texts.append(\"there was not a hit but rather a ground ball to the first baseman.  The first baseman fielded the grounder and threw the ball to the covering pitcher for an out at first base.\")\n",
    "#for text in texts:\n",
    "#    print(f\"text = {text}\")\n",
    "#    doc = nlp(text)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
