{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d9a16000-e874-4d24-a5c1-cf7226195bb8",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1611b7b9-cb00-4405-a5cb-e63e3047eb4b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "all = []\n",
    "\n",
    "country = 'India'\n",
    "basePath = './ICE India/Corpus/'\n",
    "fname = basePath + '/W1a-001.txt'\n",
    "all.append([country, basePath, fname])\n",
    "\n",
    "country = 'Philippines'\n",
    "basePath = './ICE Philippines/Corpus/'\n",
    "fname = basePath + '/W1A-001.txt'\n",
    "all.append([country, basePath, fname])\n",
    "\n",
    "country = 'Singapore'\n",
    "basePath = './ICE SINGAPORE/Corpus/'\n",
    "fname = basePath + '/W1A-001.TXT'\n",
    "all.append([country, basePath, fname])\n",
    "\n",
    "country = 'Canada'\n",
    "basePath = './ICE-CAN/Corpus/'\n",
    "fname = basePath + '/W1A-001.TXT'\n",
    "all.append([country, basePath, fname])\n",
    "\n",
    "country = 'HongKong'\n",
    "basePath = './ICE-HK/Corpus/'\n",
    "fname = basePath + '/w1a-001.txt'\n",
    "all.append([country, basePath, fname])\n",
    "\n",
    "country = 'Ireland'\n",
    "basePath = './ICE-IRL/Corpus/'\n",
    "fname = basePath + '/W1A-001 Linguistics essays.txt'\n",
    "all.append([country, basePath, fname])\n",
    "\n",
    "country = 'Jamaica'\n",
    "basePath = './ICE-JA/Corpus/'\n",
    "fname = basePath + '/W1A-001.TXT'\n",
    "all.append([country, basePath, fname])\n",
    "\n",
    "country = 'SriLanka'\n",
    "basePath = './ICE-SL/Corpus/'\n",
    "fname = basePath + '/W2B-010.txt'\n",
    "all.append([country, basePath, fname])\n",
    "\n",
    "country = 'USA'\n",
    "basePath = './ICE-USA/Corpus/'\n",
    "fname = basePath + '/W1A-001.TXT'\n",
    "all.append([country, basePath, fname])\n",
    "\n",
    "country = 'Nigeria'\n",
    "basePath = './ICE-Nigeria/Corpus/'\n",
    "fname = basePath + '/bl_93.txt'\n",
    "all.append([country, basePath, fname])\n",
    "\n",
    "# Just for the sample\n",
    "country = 'Canada'\n",
    "basePath = './ICE-CAN/Corpus/'\n",
    "fname = basePath + '/W2B-031.TXT'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "64d62a05-1cdd-411f-822f-fd5e9a443a7a",
   "metadata": {},
   "source": [
    "## Sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8cd9636d-4e31-4723-829f-8a095a59749e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "pd.set_option('display.max_colwidth', 400)\n",
    "pd.set_option('display.max_rows', 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e8327522-2b71-487a-b7d7-9b192c02a664",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<I>\n",
      "<ICE-CAN:W2B-031#1:1> <h> <bold> S'ware vendor callbacks unsatisfactory, says IDC </bold>\n",
      "\n",
      "</h>\n",
      "\n",
      "<p> <ICE-CAN:W2B-031#2:1> TORONTO - International Data Corp. ( Canada) Ltd. ( IDC) has\n",
      "\n",
      "completed a survey of user operating system soft <l> ware support needs and\n",
      "\n",
      "expec <l> tations. </p>\n",
      "\n",
      "<p> <ICE-CAN:W2B-031#3:1> The purpose of the <it> 1990 Canadian User Satisfaction with\n",
      "\n",
      "Software Support </it> survey is to let vendors know how successful they are\n",
      "\n",
      "in the eyes of customers.\n",
      "<ICE-CAN:W2B-031#4:1> Vendors can compare customer satisfaction ratings in 1990 to those\n",
      "\n",
      "from the <it> 1989 Canadian User Satis <l> faction with Software Support\n",
      "\n",
      "</it> survey to discover where they have made progress and where they should\n",
      "\n",
      "concentrate their efforts in the upcoming year. </p>\n",
      "\n",
      "<p> <ICE-CAN:W2B-031#5:1> It is apparent customers still expect total service from a\n",
      "\n",
      "software service provider, based on average importance ratings of individual\n",
      "\n",
      "software support criteria.\n",
      "< ...\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['<I>\\r\\n<ICE-CAN:W2B-031#1:1> <h> <bold> S\\'ware vendor callbacks unsatisfactory, says IDC </bold>\\r\\n\\r\\n</h>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#2:1> TORONTO - International Data Corp. ( Canada) Ltd. ( IDC) has\\r\\n\\r\\ncompleted a survey of user operating system soft <l> ware support needs and\\r\\n\\r\\nexpec <l> tations. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#3:1> The purpose of the <it> 1990 Canadian User Satisfaction with\\r\\n\\r\\nSoftware Support </it> survey is to let vendors know how successful they are\\r\\n\\r\\nin the eyes of customers.\\r\\n<ICE-CAN:W2B-031#4:1> Vendors can compare customer satisfaction ratings in 1990 to those\\r\\n\\r\\nfrom the <it> 1989 Canadian User Satis <l> faction with Software Support\\r\\n\\r\\n</it> survey to discover where they have made progress and where they should\\r\\n\\r\\nconcentrate their efforts in the upcoming year. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#5:1> It is apparent customers still expect total service from a\\r\\n\\r\\nsoftware service provider, based on average importance ratings of individual\\r\\n\\r\\nsoftware support criteria.\\r\\n<ICE-CAN:W2B-031#6:1> According to the survey: <l>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#7:1> &bullet; Overall support and quality control of software received\\r\\n\\r\\nwere deemed the most impor <l> tant initial support criteria. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#8:1> &bullet; The ease of reporting a prob <l> lem and receiving\\r\\n\\r\\nongoing feed <l> back on the status of a prob <l> lem / solution were rated\\r\\n\\r\\nthe most significant criteria of tele <l> phone support. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#9:1> &bullet; The ability to provide per <l> manent fixes, quality of\\r\\n\\r\\nup <l> dates / revisions and their ease of installation, and the ability to\\r\\n\\r\\nprovide workarounds were all judged as equally important on <l> going support\\r\\n\\r\\ncriteria. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#10:1> &bullet; The ease of maintenance and quality of remote\\r\\n\\r\\ndiagnostics were rated as the most signifi <l> cant additional services. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#11:1> One industry trend observed is that vendors, for the most part,\\r\\n\\r\\nare not meeting customer expectations in regards to call <l> back times in\\r\\n\\r\\nboth emergencies and under normal circumstances. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#12:1> Hewlett-Packard is the only software support provider meeting its\\r\\n\\r\\n<w> customers\\' </w> average acceptable callback times in emergencies and\\r\\n\\r\\nunder normal circumstances.\\r\\n<ICE-CAN:W2B-031#13:1> In fact, average callback times under normal circumstances have\\r\\n\\r\\nincreased; only Hewlett-Packard and NCR achieved decreases in their average\\r\\n\\r\\ncallback times under normal circumstances since 1989.\\r\\n<ICE-CAN:W2B-031#14:1> However, average call <l> back times in emergencies have decreased;\\r\\n\\r\\nonly IBM and Bull achieved increases in their average callback times in\\r\\n\\r\\nemergencies since 1989.\\r\\n<ICE-CAN:W2B-031#15:1> Vendors are concentrating too much on quicker responses in emergen\\r\\n\\r\\n<l> cies to the neglect of their re <l> sponse time under normal cir <l>\\r\\n\\r\\ncumstances. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#16:1> Customer ratings of in <l> dividual <w> vendors\\' </w> performance\\r\\n\\r\\nare included in the report.\\r\\n<ICE-CAN:W2B-031#17:1> Ven <l> dors assessed include Bull, Digital, Hewlett-Packard, IBM,\\r\\n\\r\\nNCR, and Unisys. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#18:1> For more information about IDC\\'s <it> 1990 Canadian User\\r\\n\\r\\nSatisfaction with Software Sup <l> port </it> survey, contact Mark Pel <l>\\r\\n\\r\\nlettier at ( 416) 369-0033. </p> </I>\\r\\n\\r\\n\\r\\n<I>\\r\\n<ICE-CAN:W2B-031#19:2> <h> <bold> Win over friends, neighbors with community marketing\\r\\n\\r\\n</bold> </h>\\r\\n<ICE-CAN:W2B-031#20:2> <it> by Catherine Callaghan Special to CDN </it>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#21:2> Common sense dictates that some of your most important business\\r\\n\\r\\ncomes from buyers in your area.\\r\\n<ICE-CAN:W2B-031#22:2> But according to Tracy Groves, marketing manager for Concord,\\r\\n\\r\\nOnt.-based Computer Brokers of Canada ( CBC), too many dealers overlook their\\r\\n\\r\\nown backyard when it comes to planning a marketing campaign.\\r\\n<ICE-CAN:W2B-031#23:2> <quote> \" Community-based marketing is vastly underutilized,\"\\r\\n\\r\\n</quote> she says. <X>\\r\\n<ICE-CAN:W2B-031#X24:2> <quote> \" Dealers have to be shown just how cost-effective it can\\r\\n\\r\\nbe.\" </quote> </X> </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#25:2> There are several good rea <l> sons to act locally when it comes\\r\\n\\r\\nto your overall market <l> ing strategy.\\r\\n<ICE-CAN:W2B-031#26:2> Cost is a major consideration. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#27:2> <quote> \" For anything other than a local campaign, the costs\\r\\n\\r\\nwill just destroy you,\" </quote> says Robert Cohen, president of The Cohen\\r\\n\\r\\nGroup, an integrated market <l> ing/advertising firm in Rich <l> mond Hill,\\r\\n\\r\\nOnt. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#28:2> It\\'s also a chance for you to differentiate yourself from the\\r\\n\\r\\ncompetition, using the home <l> team advantage, or \" buy local\" mentality to\\r\\n\\r\\nyour benefit.\\r\\n<ICE-CAN:W2B-031#29:2> Over the long haul, it\\'s a chance for you to generate goodwill - to\\r\\n\\r\\nestablish that your firm is a good corporate citizen.\\r\\n<ICE-CAN:W2B-031#30:2> In a competitive marketplace, the dealer that earns the con <l>\\r\\n\\r\\nsumer\\'s trust gets the sales. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#31:2> Not all businesses can ap <l> proach community-based marketing\\r\\n\\r\\nthe same way.\\r\\n<ICE-CAN:W2B-031#32:2> The vehicles you choose depend on who your customers are, says\\r\\n\\r\\nGroves.\\r\\n<ICE-CAN:W2B-031#33:2> <quote> \" Community market <l> ing is especially important if you\\r\\n\\r\\nserve the retail customer,\" </quote> she says.\\r\\n<ICE-CAN:W2B-031#34:2> Buying a computer represents a big investment - and a major risk -\\r\\n\\r\\nfor most, so building trust and credibility over time is important. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#35:2> The scope of your campaign also depends on your size.\\r\\n<ICE-CAN:W2B-031#36:2> A dealer like ComputerLand can afford a major outdoor advertis <l>\\r\\n\\r\\ning campaign, says Groves, but one-man shops may have to settle for\\r\\n\\r\\nbus-shelter ads out <l> side the store. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#37:2> Finally, location plays a big part in how you stage your\\r\\n\\r\\ncampaign.\\r\\n<ICE-CAN:W2B-031#38:2> In major centres, says Cohen, it\\'s critical to es <l> tablish\\r\\n\\r\\nyourself as a specialist in some area - say, networking or peripherals.\\r\\n<ICE-CAN:W2B-031#39:2> In smaller com <l> munities, where you\\'re up against a handful of\\r\\n\\r\\ncompeti <l> tors, you want to establish yourself as an expert in all areas.\\r\\n\\r\\n</p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#40:2> The tools of the community-marketing trade will look famil <l>\\r\\n\\r\\niar.\\r\\n<ICE-CAN:W2B-031#41:2> CBC\\'s Groves points to local print and radio advertising as two of\\r\\n\\r\\nthe most common ways dealers reach out to their neighbors.\\r\\n<ICE-CAN:W2B-031#42:2> <quote> \" Community news <l> papers are extremely well-read outside\\r\\n\\r\\nmajor centres like Toronto,\" </quote> agrees Cohen.\\r\\n<ICE-CAN:W2B-031#43:2> And while TV advertising is often out of the question for compa <l>\\r\\n\\r\\nnies on tight budgets, he says, spots on local cable channels can be highly\\r\\n\\r\\ncost-effective. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#44:2> Richard Dexter, co-owner of Basys Consulting Ltd. in Dart <l>\\r\\n\\r\\nmouth, N.S., is a firm believer in the power of local print media.\\r\\n<ICE-CAN:W2B-031#45:2> He and partner Roy Drinnan advertise regularly in the monthly paper\\r\\n\\r\\nthat goes to the 2,000 neighbors in their in <l> dustrial park. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#46:2> <quote> \" One of our target markets is small and mid-sized busi\\r\\n\\r\\n<l> ness,\" </quote> says Dexter, <quote> \" and the majority of them fit the\\r\\n\\r\\nprofile.\" </quote>\\r\\n<ICE-CAN:W2B-031#47:2> Basys pays between &dollar;300 and &dollar;400 for an ad in the\\r\\n\\r\\nmonthly <it> Burnside News </it> , and Dexter says the campaign plays an im\\r\\n\\r\\n<l> portant part in the firm\\'s efforts to overcome price competition from\\r\\n\\r\\ndealers in Toronto. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#48:2> Local business and commu <l> nity associations offer another way\\r\\n\\r\\nto reinforce your home <l> team advantage.\\r\\n<ICE-CAN:W2B-031#49:2> Time, of course, is a limiting factor, says Cohen, and the extent of\\r\\n\\r\\nyour personal involvement has to balance against potential re <l> turns:\\r\\n\\r\\n<quote> \" You need to get your name known in the community without taking up\\r\\n\\r\\na lot of your time.\" </quote> </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#50:2> Membership in a local Chamber of Commerce is a good way to start.\\r\\n\\r\\n</p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#51:2> Isaac Ehrlich, owner of Richmond Hill, Ont.-based lap <l> top and\\r\\n\\r\\nperipheral reseller Keysoft Network Inc., belongs to three local Chambers.\\r\\n<ICE-CAN:W2B-031#52:2> Listings <quote> \" tell Chamber mem <l> bers who we are and what we\\r\\n\\r\\nare,\" </quote> says Keysoft general manager Barbara Smith, <quote> \" and\\r\\n\\r\\nthat\\'s reinforced when we at <l> tend local functions.\\r\\n<ICE-CAN:W2B-031#53:2> Most of these people really make an ef <l> fort to buy local.\"\\r\\n\\r\\n</quote> </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#54:2> The networking paid off re <l> cently when Keysoft landed a deal\\r\\n\\r\\nto sell 10 laptops to the municipality of Vaughn, north of Toronto.\\r\\n<ICE-CAN:W2B-031#55:2> The deal came about when Ehrlich, who lives in Vaughn, met with a\\r\\n\\r\\nmunicipal representative at a recent Chamber meeting. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#56:2> Event marketing - hosting events ranging from wine-and-cheese\\r\\n\\r\\nopen houses to champagne-splashed product launches - has become increas <l>\\r\\n\\r\\ningly popular in recent years, says CBC\\'s Groves.\\r\\n<ICE-CAN:W2B-031#57:2> Event marketing can work very well, she says, particularly in the\\r\\n\\r\\ncomputer industry. <X>\\r\\n<ICE-CAN:W2B-031#X58:2> <quote> \" Because it\\'s a very technical business, you need to show\\r\\n\\r\\npeople a human face when you get them in the door.\" </quote> </X> </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#59:2> But she warns that custom <l> ers have become more <w>\\r\\n\\r\\nblas&eacute;\\' </w> about events over the past couple of years, simply because\\r\\n\\r\\nthere are so many of them.\\r\\n<ICE-CAN:W2B-031#60:2> As a result, businesses are having to go to even greater lengths to\\r\\n\\r\\ndraw a crowd.\\r\\n<ICE-CAN:W2B-031#61:2> <quote> \" You\\'ll have <it> an event </it> and no one\\'ll show up.\\r\\n<ICE-CAN:W2B-031#62:2> Then you find out that Joe down the street held one last week, and he\\r\\n\\r\\nhad champagne and you only had beer - you get into that sort of competition,\"\\r\\n\\r\\n</quote> she says. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#63:2> One response, says Groves, has been for smaller firms to pool\\r\\n\\r\\ntheir resources and stage \" mini trade shows.\"\\r\\n<ICE-CAN:W2B-031#64:2> Renting space in a local hotel or meet <l> ing hall, a group can set\\r\\n\\r\\nup booths and invite members of the local community to browse and partake of\\r\\n\\r\\nfood and drink. <O> photograph </O> <O> Robert Cohen </O>\\r\\n<ICE-CAN:W2B-031#65:2> With the combined muscle - and budget - of a number of firms, says\\r\\n\\r\\nGroves, mini trade shows offer one way to get your name in front of local\\r\\n\\r\\nbuyers. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#66:2> Direct mail offers a very precise way to carry your mes <l> sage\\r\\n\\r\\nto the people who count.\\r\\n<ICE-CAN:W2B-031#67:2> You can buy lists from list brokers, local associations like your\\r\\n\\r\\nChamber of Commerce, or even church and community groups. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#68:2> Roland Lau, owner of Cal <l> gary\\'s The Home Computing Centre\\r\\n\\r\\nInc., used postal code data to send flyers to 40,000 households in his\\r\\n\\r\\nimmediate area.\\r\\n<ICE-CAN:W2B-031#69:2> The effort cost him about &dollar;7,500, and while he can\\'t at <l>\\r\\n\\r\\ntribute a specific increase in sales to the effort, he will say the mailing\\r\\n\\r\\nincreased his visi <l> bility. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#70:2> When it comes to competi <l> tion, says Lau, dealers have two\\r\\n\\r\\nchoices: <quote> \" You can be the biggest, or you can be closest to your\\r\\n\\r\\ncommunity.\" </quote>\\r\\n<ICE-CAN:W2B-031#71:2> Lau esti <l> mates that more than 70 per cent of sales come from\\r\\n\\r\\nbuyers within a 10-kilometre radius of his retail outlet. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#72:2> Depending on the size of your company, outdoor adver <l> tising\\r\\n\\r\\ncan be a good way to keep in touch with your com <l> munity.\\r\\n<ICE-CAN:W2B-031#73:2> But both Groves and Cohen warn that, like any advertising,\\r\\n\\r\\nbillboards, bus shelters and taxi tops only work if the ads emphasize some\\r\\n\\r\\nunique selling point. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#74:2> <quote> \" There is a place for <it> outdoor advertising </it> if\\r\\n\\r\\nyou can afford it, and can play up some unique selling feature - if you\\'ve\\r\\n\\r\\ngot an exclusive on a product, for ex <l> ample,\" </quote> says Groves. <X>\\r\\n<ICE-CAN:W2B-031#X75:2> <quote> \" Every <l> one\\'s got basically the same bill of goods, so\\r\\n\\r\\nyou have to focus on what you do better.\" </quote> </X> </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#76:2> Whether your community marketing campaign pays off in direct\\r\\n\\r\\nsales today, or paves the way for tomorrow\\'s orders, the push to act locally\\r\\n\\r\\nremains the same: developing a relationship based on trust. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#77:2> <quote> \" Being a good corporate citi <l> zen is very important,\\r\\n\\r\\nespe <l> cially when you\\'re talking about a technical business like\\r\\n\\r\\ncomputers,\" </quote> says Groves. <X>\\r\\n<ICE-CAN:W2B-031#X78:2> <quote> \" When you build that association with the community, they (\\r\\n\\r\\nlocal buyers) get to know you and trust you.\\r\\n<ICE-CAN:W2B-031#X79:2> That\\'s what IBM did - everyone trusts IBM, and look where they are\\r\\n\\r\\ntoday.\" </quote> </X> </p> </I>\\r\\n\\r\\n\\r\\n<I>\\r\\n<ICE-CAN:W2B-031#80:3> <h> <bold> Quebec makes moves to boost s\\'ware profiles </bold> </h>\\r\\n<ICE-CAN:W2B-031#81:3> <it> by Nina Gilbert Special to CDN </it>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#82:3> How many software companies with an excellent product that meet a\\r\\n\\r\\nreal demand remain vir <l> tually invisible despite their technical\\r\\n\\r\\nexpertise? </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#83:3> In Quebec, the number is far too high, according to Yvon Blais,\\r\\n\\r\\nmanager of the Quebec computer sector for the Minis <l> try of Industry,\\r\\n\\r\\nScience, and Technology. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#84:3> The problem is lack of marketing expertise, and many, including\\r\\n\\r\\nBlais, hope that the recently established Centre de Promotion du Logi <l>\\r\\n\\r\\nciel Qu&eacute;b&eacute;cois ( CPLQ) will address this concern. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#85:3> The centre, which began operations in September 1990, is a\\r\\n\\r\\nnon-profit organization funded by the federal Depart <l> ment of\\r\\n\\r\\nCommunications and the provincial Ministry of Com <l> munications. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#86:3> The ministries are contribut <l> ing &dollar;200,000 and\\r\\n\\r\\n&dollar;400,000 re <l> spectively over a period of two years, after which\\r\\n\\r\\ntime the centre expects to be self supporting. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#87:3> As the name implies, the centre aims to promote the\\r\\n\\r\\ncommercialization of Quebec software. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#88:3> Blais recently threw his sup <l> port behind the new organiza <l>\\r\\n\\r\\ntion by inviting the CPLQ to co-sponsor a fall workshop that prepared Quebec\\r\\n\\r\\ncompanies for Comdex. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#89:3> According to Mich&egrave;le Guay, general director of the CPLQ,\\r\\n\\r\\nthe Comdex workshop was the first of many that the centre plans to be\\r\\n\\r\\ninvolved in. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#90:3> <quote> \" Within a few months we expect to be hosting one work\\r\\n\\r\\n<l> shop per month and one show <l> case per week,\" </quote> said Guay. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#91:3> Showcases, held in the centre\\'s Montreal showroom, will allow\\r\\n\\r\\nsoftware producers to present products to prospec <l> tive clients and\\r\\n\\r\\ndistributors. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#92:3> Guay\\'s strategy for achiev <l> ing the centre\\'s goal is two-fold.\\r\\n<ICE-CAN:W2B-031#93:3> On the home front, she in <l> tends to help Quebec software producers\\r\\n\\r\\nfully exploit their local markets.\\r\\n<ICE-CAN:W2B-031#94:3> As part of this effort, the CPLQ will try to get large Quebec\\r\\n\\r\\ncompanies to consider locally developed soft <l> ware before turning to the\\r\\n\\r\\nUnited States or elsewhere.\\r\\n<ICE-CAN:W2B-031#95:3> Guay believes a strong local base gives companies a real advantage in\\r\\n\\r\\nthe international market. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#96:3> The CPLQ also has projects underway for companies targeting\\r\\n\\r\\nmarkets outside of the province.\\r\\n<ICE-CAN:W2B-031#97:3> The centre plans to collaborate on a national marketing network for\\r\\n\\r\\nedu <l> cational and training software. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#98:3> Will Dubitsky, an industry policy analyst with Communica <l>\\r\\n\\r\\ntions Canada, has organized a symposium for April in Vancou <l> ver.\\r\\n<ICE-CAN:W2B-031#99:3> At that time, groups from across Canada will meet with Dubitsky and\\r\\n\\r\\nB.C. Tel to dis <l> cuss implementation. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#100:3> According to Dubitsky, the CPLQ is the only centre of its kind\\r\\n\\r\\nin Canada right now.\\r\\n<ICE-CAN:W2B-031#101:3> They were the obvious choice to rep <l> resent Quebec in the cour\\r\\n\\r\\n<l> seware network. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#102:3> The CPLQ is also collaborat <l> ing on a plan to give 10 Quebec\\r\\n\\r\\ncompanies a boost into the in <l> ternational market.\\r\\n<ICE-CAN:W2B-031#103:3> The compa <l> nies will benefit from the ex <l> pertise of a\\r\\n\\r\\nmarketing firm and the involvement of government agencies.\\r\\n<ICE-CAN:W2B-031#104:3> For the relatively low cost of &dollar;5,000, the firms will be\\r\\n\\r\\nhelped to launch an inter <l> national campaign. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#105:3> The CPLQ intends to play an important role as a focus for\\r\\n\\r\\ninformation about what is avail <l> able to software producers and\\r\\n\\r\\ndistributors.\\r\\n<ICE-CAN:W2B-031#106:3> <quote> \" We want to be the hub of a net work made up of producers,\\r\\n\\r\\ndistributors, buy <l> ers, associations, and govern <l> ment agencies\\r\\n\\r\\ninvolved in the Quebec software industry,\" </quote> ex <l> plained Guay. </p>\\r\\n\\r\\n<p> <ICE-CAN:W2B-031#107:3> Many software producers feel a dire need for just such an\\r\\n\\r\\ninformation centre. </p> </I>\\r\\n']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "txt = \"\"\n",
    "with open(fname,\"rb\") as f:\n",
    "    txt = f.read().decode(\"latin-1\")\n",
    "print(txt[:1000], \"...\")\n",
    "sent = re.split(r'\\<ICE\\-\\w+:\\s\\w+-\\d+#\\d+:\\d+>', txt)\n",
    "sent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "feacb123-a8da-4c43-b8c2-7bfa6bc1277c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'S\\'ware vendor callbacks unsatisfactory, says IDC    - International Data Corp. ( ) Ltd. ( IDC) has completed a survey of user operating system software support needs and expectations.    The purpose of the  1990  User Satisfaction with Software Support  survey is to let vendors know how successful they are in the eyes of customers.  Vendors  compare customer satisfaction ratings in 1990 to those from the  1989  User Satisfaction with Software Support  survey to discover where they have made progress and where they should concentrate their efforts in the upcoming year.    It is apparent customers still expect total service from a software service provider, based on average importance ratings of individual software support criteria.  According to the survey:     Overall support and quality control of software received were deemed the most important initial support criteria.     The ease of reporting a problem and receiving ongoing feedback on the status of a problem / solution were rated the most significant criteria of telephone support.     The ability to provide permanent fixes, quality of updates / revisions and their ease of installation, and the ability to provide workarounds were all judged as equally important ongoing support criteria.     The ease of maintenance and quality of remote diagnostics were rated as the most significant additional services.    One industry trend observed is that vendors, for the most part, are not meeting customer expectations in regards to callback times in both emergencies and under normal circumstances.    Hewlett-Packard is the only software support provider meeting its  customers\\'  average acceptable callback times in emergencies and under normal circumstances.  In fact, average callback times under normal circumstances have increased; only Hewlett-Packard and NCR achieved decreases in their average callback times under normal circumstances since 1989.  However, average callback times in emergencies have decreased; only IBM and Bull achieved increases in their average callback times in emergencies since 1989.  Vendors are concentrating too much on quicker responses in emergencies to the neglect of their response time under normal circumstances.    Customer ratings of individual  vendors\\'  performance are included in the report.  Vendors assessed include Bull, Digital, Hewlett-Packard, IBM, NCR, and Unisys.    For more information about IDC\\'s  1990  User Satisfaction with Software Support  survey, contact Mark Pellettier at ( 416) 369-0033.       Win over friends, neighbors with community marketing     by Catherine Callaghan Special to CDN    Common sense dictates that some of your most important business comes from buyers in your area.  But according to Tracy Groves, marketing manager for Concord, Ont.-based Computer Brokers of  ( CBC), too many dealers overlook their own backyard when it comes to planning a marketing campaign.  \" Community-based marketing is vastly underutilized,\" she says.   \" Dealers have to be shown just how cost-effective it  be.\"     There are several good reasons to act locally when it comes to your overall marketing strategy.  Cost is a major consideration.    \" For anything other than a local campaign, the costs will just destroy you,\" says Robert Cohen, president of The Cohen Group, an integrated marketing/advertising firm in , Ont.    It\\'s also a chance for you to differentiate yourself from the competition, using the hometeam advantage, or \" buy local\" mentality to your benefit.  Over the long haul, it\\'s a chance for you to generate goodwill - to establish that your firm is a good corporate citizen.  In a competitive marketplace, the dealer that earns the consumer\\'s trust gets the sales.    Not all businesses  approach community-based marketing the same way.  The vehicles you choose depend on who your customers are, says Groves.  \" Community marketing is especially important if you serve the retail customer,\" she says.  Buying a computer represents a big investment - and a major risk - for most, so building trust and credibility over time is important.    The scope of your campaign also depends on your size.  A dealer like ComputerLand  afford a major outdoor advertising campaign, says Groves, but one-man shops may have to settle for bus-shelter ads outside the store.    Finally, location plays a big part in how you stage your campaign.  In major centres, says Cohen, it\\'s critical to establish yourself as a specialist in some area - say, networking or peripherals.  In smaller communities, where you\\'re up against a handful of competitors, you want to establish yourself as an expert in all areas.    The tools of the community-marketing trade will look familiar.  CBC\\'s Groves points to local print and radio advertising as two of the most common ways dealers reach out to their neighbors.  \" Community newspapers are extremely well-read outside major centres like ,\" agrees Cohen.  And while TV advertising is often out of the question for companies on tight budgets, he says, spots on local cable channels  be highly cost-effective.    Richard Dexter, co-owner of Basys Consulting Ltd. in Dartmouth, N.S., is a firm believer in the power of local print media.  He and partner Roy Drinnan advertise regularly in the monthly paper that goes to the 2,000 neighbors in their industrial park.    \" One of our target markets is small and mid-sized business,\" says Dexter, \" and the majority of them fit the profile.\"  Basys pays between $300 and $400 for an ad in the monthly  Burnside News  , and Dexter says the campaign plays an important part in the firm\\'s efforts to overcome price competition from dealers in .    Local business and community associations offer another way to reinforce your hometeam advantage.  Time, of course, is a limiting factor, says Cohen, and the extent of your personal involvement has to balance against potential returns: \" You need to get your name known in the community without taking up a lot of your time.\"    Membership in a local Chamber of Commerce is a good way to start.    Isaac Ehrlich, owner of , Ont.-based laptop and peripheral reseller Keysoft Network Inc., belongs to three local Chambers.  Listings \" tell Chamber members who we are and what we are,\" says Keysoft general manager Barbara Smith, \" and that\\'s reinforced when we attend local functions.  Most of these people really make an effort to buy local.\"    The networking paid off recently when Keysoft landed a deal to sell 10 laptops to the municipality of Vaughn, north of .  The deal came about when Ehrlich, who lives in Vaughn, met with a municipal representative at a recent Chamber meeting.    Event marketing - hosting events ranging from wine-and-cheese open houses to champagne-splashed product launches - has become increasingly popular in recent years, says CBC\\'s Groves.  Event marketing  work very well, she says, particularly in the computer industry.   \" Because it\\'s a very technical business, you need to show people a human face when you get them in the door.\"     But she warns that customers have become more  blase\\'  about events over the past couple of years, simply because there are so many of them.  As a result, businesses are having to go to even greater lengths to draw a crowd.  \" You\\'ll have  an event  and no one\\'ll show up.  Then you find out that Joe down the street held one last week, and he had champagne and you only had beer - you get into that sort of competition,\" she says.    One response, says Groves, has been for smaller firms to pool their resources and stage \" mini trade shows.\"  Renting space in a local hotel or meeting hall, a group  set up booths and invite members of the local community to browse and partake of food and drink.    With the combined muscle - and budget - of a number of firms, says Groves, mini trade shows offer one way to get your name in front of local buyers.    Direct mail offers a very precise way to carry your message to the people who count.  You  buy lists from list brokers, local associations like your Chamber of Commerce, or even church and community groups.    Roland Lau, owner of \\'s The Home Computing Centre Inc., used postal code data to send flyers to 40,000 households in his immediate area.  The effort cost him about $7,500, and while he \\'t attribute a specific increase in sales to the effort, he will say the mailing increased his visibility.    When it comes to competition, says Lau, dealers have two choices: \" You  be the biggest, or you  be closest to your community.\"  Lau estimates that more than 70 per cent of sales come from buyers within a 10-kilometre radius of his retail outlet.    Depending on the size of your company, outdoor advertising  be a good way to keep in touch with your community.  But both Groves and Cohen warn that, like any advertising, billboards, bus shelters and taxi tops only work if the ads emphasize some unique selling point.    \" There is a place for  outdoor advertising  if you  afford it, and  play up some unique selling feature - if you\\'ve got an exclusive on a product, for example,\" says Groves.   \" Everyone\\'s got basically the same bill of goods, so you have to focus on what you do better.\"     Whether your community marketing campaign pays off in direct sales today, or paves the way for tomorrow\\'s orders, the push to act locally remains the same: developing a relationship based on trust.    \" Being a good corporate citizen is very important, especially when you\\'re talking about a technical business like computers,\" says Groves.   \" When you build that association with the community, they ( local buyers) get to know you and trust you.  That\\'s what IBM did - everyone trusts IBM, and look where they are today.\"     makes moves to boost s\\'ware profiles     by Nina Gilbert Special to CDN    How many software companies with an excellent product that meet a real demand remain virtually invisible despite their technical expertise?    In , the number is far too high, according to Yvon Blais, manager of the  computer sector for the Ministry of Industry, Science, and Technology.    The problem is lack of marketing expertise, and many, including Blais,  that the recently established Centre de Promotion du Logiciel Quebecois ( CPLQ) will address this concern.    The centre, which began operations in September 1990, is a non-profit organization funded by the federal Department of Communications and the provincial Ministry of Communications.    The ministries are contributing $200,000 and $400,000 respectively over a period of two years, after which time the centre expects to be self supporting.    As the name implies, the centre aims to promote the commercialization of  software.    Blais recently threw his support behind the new organization by inviting the CPLQ to co-sponsor a fall workshop that prepared  companies for Comdex.    According to Michele Guay, general director of the CPLQ, the Comdex workshop was the first of many that the centre plans to be involved in.    \" Within a few months we expect to be hosting one workshop per month and one showcase per week,\" said Guay.    Showcases, held in the centre\\'s  showroom, will allow software producers to present products to prospective clients and distributors.    Guay\\'s strategy for achieving the centre\\'s goal is two-fold.  On the home front, she intends to help  software producers fully exploit their local markets.  As part of this effort, the CPLQ will try to get large  companies to consider locally developed software before turning to the United States or elsewhere.  Guay believes a strong local base gives companies a real advantage in the international market.    The CPLQ also has projects underway for companies targeting markets outside of the province.  The centre plans to collaborate on a national marketing network for educational and training software.    Will Dubitsky, an industry policy analyst with Communications , has organized a symposium for April in .  At that time, groups from across  will meet with Dubitsky and B.C. Tel to discuss implementation.    According to Dubitsky, the CPLQ is the only centre of its kind in  right now.  They were the obvious choice to represent  in the courseware network.    The CPLQ is also collaborating on a plan to give 10  companies a boost into the international market.  The companies will benefit from the expertise of a marketing firm and the involvement of government agencies.  For the relatively low cost of $5,000, the firms will be helped to launch an international campaign.    The CPLQ intends to play an important role as a focus for information about what is available to software producers and distributors.  \" We want to be the hub of a net work made up of producers, distributors, buyers, associations, and government agencies involved in the  software industry,\" explained Guay.    Many software producers feel a dire need for just such an information centre.'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import html\n",
    "import unicodedata\n",
    "\n",
    "nat_df = pd.read_csv('../util/countries_and_nationality.csv', encoding='latin-1')\n",
    "nat_df = nat_df[nat_df.Nationality != \"-\"]\n",
    "\n",
    "extra = {\n",
    "    \"Canada\": [\"Ian\", \"quebec\", \"ontario\", \"laura\", \"Marilyn\", \"Pauline\", \"canadians\", \"hannah\", \"ned\", \"cegep\", \"alberta\", \"hume\"],\n",
    "    \"HongKong\" : [\"lee\", \"chan\", \"wong\", \"hkac\"],\n",
    "    \"India\": [\"Sd/\", \"shreya\", \"vishal\", \"shivaji\", \"rukmini\", \"shastri\", \"renuka\", \"shivaji\", \"ramesh\", \"ashok\", \"pakistan\", \"hilda\", \"kulkarni\", \"bisnu\", \n",
    "              \"ammu\", \"souza\", \"hilda\", \"shannon\", \"maharashtra\", \"narasimha\", \"singh\", \"shri\"],\n",
    "    \"Ireland\": [\"fianna\", \"Cissie\", \"rosaleen\"],\n",
    "    \"Jamaica\": [\"mona\", \"manley\", \"gleaner\", \"uwi\"],\n",
    "    \"Nigeria\": [\"bukola\", \"esp\", \"kehinde\"],\n",
    "    \"Philippines\" : [\"alejandro\", \"miguel\", \"estrada\", \"nog\", \"edsa\", \"tagalog\", \"Mindanao\", \"\"],\n",
    "    \"Singapore\" : [\"NUS\", \"Goh\", \"Tan\", \"Gulliver\", \"Lin\", \"Lee\", \"Sui\", \"Lim\", \"CPF\", \"citibank\"],\n",
    "    \"SriLanka\": [\"Zn\", \"Samira\", \"Sadhana\", \"OUSL\", \"wijewardene\", \"saman\", \"rajiv\", \"ugc\", \"kelaniya\", \"tamils\", \"ltte\", \"unp\"],\n",
    "    \"USA\" : [\"lucy\", \"florida\", \"irish\", \"argyst\", \"aristotle\", \"caribbean\", \"parc\", \"leila\"]\n",
    "}\n",
    "\n",
    "def getExtra(country):\n",
    "    if country in extra:\n",
    "        return set(extra[country])\n",
    "    else:\n",
    "        return set()\n",
    "\n",
    "def getNationality(country):\n",
    "    df = nat_df[nat_df.Country == country]\n",
    "    df = df['Nationality']\n",
    "    return set(df.values.flatten())\n",
    "\n",
    "cities_df = pd.read_csv('../util/worldcities.csv')\n",
    "cities_df = cities_df[['city', 'city_ascii', 'country', 'iso2', 'iso3']]\n",
    "def getCities(country):\n",
    "    df = cities_df[cities_df.country == country]\n",
    "    cities = df.values.flatten()\n",
    "    cities = set(x for x in cities if str(x) != 'nan')\n",
    "    return cities\n",
    "\n",
    "# cities, nationality, language, currency\n",
    "india = set([\"India\", \"Indian\", \"Indians\", \"INR\", \"Rs\", \"₨\", \"Rupee\", \"Bombay\",\"Bengali\",\"Hindi\",\"Maithili\",\"Nepalese\",\"Sanskrit\",\"Tamil\",\"Urdu\",\"Assamese\",\"Dogri\",\n",
    "             \"Kannada\",\"Gujarati\",\"Bodo\",\"Manipur\",\"Meitei\",\"Oriya\",\"Marathi\",\"Santali\",\"Telugu\",\"Punjabi\",\"Sindhi\",\"Malayalam\",\"Konkani\", \"Kashmiri\", \"English\"])\n",
    "india = india | getCities(\"India\") | getNationality(\"India\") | getExtra(\"India\")\n",
    "\n",
    "philippines = set([\"Filipino\",\"Filipinos\", \"Mga\", \"Philippines\", \"Philippine\", \"₱\", \"Peso\", \"PHP\", \"English\"])\n",
    "philippines = philippines | getCities(\"Philippines\") | getNationality(\"the Philippines\") | getExtra(\"Philippines\")\n",
    "\n",
    "singapore = set([\"Singapore\", \"Singaporeans\", \"Singaporean\", \"$\", \"S$\", \"SGD\", \"English\", \"Malay\", \"Mandarin\", \"Tamil\", \"Chinese\", \"China\"])\n",
    "singapore = singapore | getCities(\"Singapore\") | getNationality(\"Singapore\") | getExtra(\"Singapore\")\n",
    "\n",
    "canada = set([\"Canada\", \"Canadian\", \"Canadiens\", \"CAD\", \"$\", \"CA$\", \"Can$\", \"C$\", \"English\", \"French\"])\n",
    "canada = canada | getCities(\"Canada\") | getNationality(\"Canada\") | getExtra(\"Canada\")\n",
    "\n",
    "hongkong = set([\"Hong Kong\", \"HongKong\", \"Hong Konger\", \"Hong Kongers\", \"Hong Kongese\", \"HongKongers\", \"HongKonger\", \"HongKongese\", \"HKD\", \"HK$\", \"$\", \"Mainland\", \"Chinese\", \"China\", \"English\"])\n",
    "hongkong = hongkong | getCities(\"Hong Kong\") | getNationality(\"Hong Kong\") | getExtra(\"HongKong\")\n",
    "\n",
    "ireland = set([\"Ireland\", \"Euro\", \"IEP\", \"pounds\", \"pound\", \"Irish pound\", \"Belfast\", \"punt Éireannach\", \"£\", \"IR£\", \"Irish\", \"English\"])\n",
    "ireland = ireland | getCities(\"Ireland\") | getNationality(\"Ireland, Republic of\") | getExtra(\"Ireland\")\n",
    "\n",
    "jamaica = set([\"Jamaica\", \"Jumieka\", \"JMD\", \"$\", \"J$\", \"Jamaican\", \"Caribbean\", \"Jamaicans\", \"Jamaican Patois\", \"Patois\", \"Patwa\", \"Patwah\", \"Jamaican Creole\", \"English\"])\n",
    "jamaica = jamaica | getCities(\"Jamaica\") | getNationality(\"Jamaica\") | getExtra(\"Jamaica\")\n",
    "\n",
    "srilanka = set([\"Sri Lanka\", \"SriLanka\", \"Sri Lankan\", \"SriLankan\", \"SriLankans\", \"LKR\", \"Rupee\", \"Rs\", \"Sinhala\", \"Tamil\", \"English\"])\n",
    "srilanka = srilanka | getCities(\"Sri Lanka\") | getNationality(\"Sri Lanka\") | getExtra(\"SriLanka\")\n",
    "\n",
    "usa = set([\"United States\", \"U.S.A.\",\"American\", \"Americans\", \"USA\", \"USD\", \"US$\", \"$\",\"English\",  (\"US\")])\n",
    "usa = usa | getCities(\"United States\") | getNationality(\"United States\") | getExtra(\"USA\")\n",
    "\n",
    "nigeria = set([\"Nigeria\", \"Nigerian\", \"Nigerians\",  \"₦\", \"NGN\", \"Africa\",  \"African\", \"Africans\", \"Naira\", \"Hausa\", \"Igbo\", \"Yoruba\", \"English\"])\n",
    "nigeria = nigeria | getCities(\"Nigeria\") | getNationality(\"Nigeria\") | getExtra(\"Nigeria\")\n",
    "\n",
    "keywords = {\n",
    "    \"India\" : india,\n",
    "    \"Philippines\" : philippines,\n",
    "    \"Singapore\" : singapore,\n",
    "    \"Canada\" : canada,\n",
    "    \"HongKong\" : hongkong,\n",
    "    \"Ireland\" : ireland,\n",
    "    \"Jamaica\" : jamaica,\n",
    "    \"SriLanka\" : srilanka,\n",
    "    \"USA\" : usa,\n",
    "    \"Nigeria\" : nigeria,\n",
    "}\n",
    "\n",
    "def match_replace(m):\n",
    "    replaceText = \"\"\n",
    "    if m.group(1):\n",
    "        replaceText = (\"\".join(\" \") + m.group(1)).replace(\"  \", \" \") \n",
    "    elif m.group(2):\n",
    "        replaceText = (m.group(2) + \"\".join(\" \")).replace(\"  \", \" \") \n",
    "    elif m.group(3):\n",
    "        replaceText = (m.group(3) + \"\".join(\" \") + m.group(4)).replace(\"  \", \" \") \n",
    "    elif m.group(5):\n",
    "        replaceText = (m.group(5) + \"\".join(\" \") + m.group(6)).replace(\"  \", \" \") \n",
    "    elif m.group(7):\n",
    "        replaceText = (m.group(7) + \"\".join(\" \") + m.group(8)).replace(\"  \", \" \") \n",
    "    else:\n",
    "        replaceText = \" \"\n",
    "#     print(\"Replacing: '\" + m.group(0) + \"' with '\"+ replaceText + \"'\")\n",
    "    return replaceText\n",
    "\n",
    "def strip_keywords(text, country):\n",
    "    for k in sorted(keywords[country], key = len, reverse=True):\n",
    "        if isinstance(k, tuple):\n",
    "            pk = re.escape(k[0])\n",
    "        else:\n",
    "            pk = re.escape(k)\n",
    "        \n",
    "        # handles \"<keyword> \" or \" <keyword>\" or \" <keyword> \" or \" <keyword> !@#$%\"\n",
    "        pattern = \"^\"+pk+\"([\\W{0,1}\\s+])|(\\s+)\"+pk+\"$|(\\s+\\W+)\"+pk+\"([\\W+\\s+])|(\\s+)\"+pk+\"([\\W+\\s+])|(\\w\\W{1})\"+pk+\"([\\W+\\s+])\"\n",
    "        \n",
    "        if isinstance(k, tuple):\n",
    "            text = re.sub(pattern, match_replace, text)\n",
    "        else:\n",
    "            text = re.sub(pattern, match_replace, text, flags=re.IGNORECASE)\n",
    "    return text\n",
    "    \n",
    "def strip_accent(text):\n",
    "    text = text.replace(\"&acircumflex;\", \"â\")\n",
    "    text = text.replace(\"&icircumflex;\", \"î\")\n",
    "    text = text.replace(\"&ecircumflex;\", \"ê\")\n",
    "    text = text.replace(\"&ocircumflex;\", \"ô\")\n",
    "    text = text.replace(\"&ucircumflex;\", \"û\")\n",
    "    text = text.replace(\"&aumlaut;\", \"ä\")\n",
    "    text = text.replace(\"&dotted-line;\", \"...\")\n",
    "    \n",
    "    text = text.replace(\"&A-ACUTE;\", \"Â\")\n",
    "    text = text.replace(\"&I-ACUTE;\", \"Î\")\n",
    "    text = text.replace(\"&E-ACUTE;\", \"Ê\")\n",
    "    text = text.replace(\"&O-ACUTE;\", \"Ó\")\n",
    "    text = text.replace(\"&U-ACUTE;\", \"Û\")\n",
    "\n",
    "    text = text.replace(\"&eumlaut;\", \"ë\")\n",
    "    text = text.replace(\"&iumlaut;\", \"ï\")\n",
    "    text = text.replace(\"&oumlaut;\", \"ö\")\n",
    "    text = text.replace(\"&uumlaut;\", \"ü\")\n",
    "\n",
    "    text = text.replace(\"&aeligature;\", \"Æ\")\n",
    "    text = text.replace(\"&oeligature;\", \"Œ\")\n",
    "    text = text.replace(\"&ccedille;\", \"Ç\")\n",
    "    text = text.replace(\"&ntidle;\", \"ñ\")\n",
    "    text = text.replace(\"&obrack;\", \"[\")\n",
    "    text = text.replace(\"&cbrack;\", \"]\")\n",
    "\n",
    "    text = text.replace(\"&lsqbrack;\", \"{\")\n",
    "    text = text.replace(\"&rsqbrack;\", \"}\")\n",
    "    text = text.replace(\"&ampersand;\", \"&\")\n",
    "\n",
    "    text = text.replace(\"&degree;\", \"°\")\n",
    "    text = text.replace(\"&degree-sign;\", \"°\")\n",
    "    text = text.replace(\"&percent;\", \"%\")\n",
    "    text = text.replace(\"&scol;\", \";\")\n",
    "\n",
    "    text = text.replace(\"&plus-or-minus;\", \"±\")\n",
    "    text = text.replace(\"&curved-dash;\", \"~\")\n",
    "    text = text.replace(\"&very-long-dash;\", \"—\")\n",
    "    text = text.replace(\"&long-dash;\", \"—\")\n",
    "    text = text.replace(\"&dotted-line;\", \"┄\")\n",
    "    text = text.replace(\"&dotted-line;\", \"┄\")\n",
    "    text = text.replace(\"&arrowhead;\", \"➤\")\n",
    "    text = text.replace(\"&right-arrow;\", \"→\")\n",
    "    text = text.replace(\"&black-square;\", \"■\")\n",
    "    text = text.replace(\"&peso;\", \"₱\")\n",
    "    text = text.replace(\"&centavo;\", \"￠\")\n",
    "    text = text.replace(\"&pound-sign;\", \"£\")\n",
    "    text = text.replace(\"&club;\", \"♣\")\n",
    "    text = text.replace(\"&heart;\", \"♥\")\n",
    "    text = text.replace(\"&spade;\", \"♠\")\n",
    "    text = text.replace(\"&diamond;\", \"♦\")\n",
    "    \n",
    "    # new set\n",
    "    text = text.replace(\"&hash;\", \"#\")\n",
    "    text = text.replace(\"&unch;\", \"उञ्छ्\")#\n",
    "    text = text.replace(\"&DELTA;\", \"Δ\")\n",
    "    text = text.replace(\"&THETA;\", \"Θ\")\n",
    "    text = text.replace(\"&asterisk;\", \"!\")\n",
    "    text = text.replace(\"&line;\", \"-\")\n",
    "    text = text.replace(\"&ggt;\", \"\")#\n",
    "    text = text.replace(\"&infinity;\", \"∞\")\n",
    "    text = text.replace(\"&obrack;\", \"(\")\n",
    "    text = text.replace(\"&cbrack;\", \")\")\n",
    "    text = text.replace(\"&percent;\", \"%\")\n",
    "    text = text.replace(\"&scol;\", \"\")#\n",
    "    text = text.replace(\"&crback;\", \"\")#\n",
    "    text = text.replace(\"&swungdash;\", \"\")#\n",
    "    text = text.replace(\"&atsign;\", \"@\")\n",
    "    text = text.replace(\"&longdash;\", \"—\")\n",
    "    text = text.replace(\"&rqduo;\", \"\\\"\")\n",
    "    text = text.replace(\"&arrow;\", \"\")\n",
    "    text = text.replace(\"&EACUTE;\", \"\")\n",
    "    text = text.replace(\"&esszett;\", \"\")\n",
    "    text = text.replace(\"&ldquuo;\", \"\\\"\")\n",
    "    text = text.replace(\"&rsdquo;\", \"\\\"\")\n",
    "    text = text.replace(\"&squared;\", \"²\")\n",
    "    text = text.replace(\"&plusminus;\", \"±\")\n",
    "    text = text.replace(\"&bullelt;\", \"•\")\n",
    "    text = text.replace(\"&circle;\", \"○\")\n",
    "    text = text.replace(\"&multiply;\", \"×\")\n",
    "    text = text.replace(\"&dotted;\", \".\")\n",
    "    text = text.replace(\"&dottedline;\", \"...\")\n",
    "    text = text.replace(\"&Amacron;\", \"Ā\")\n",
    "    text = text.replace(\"&amacron;\", \"ā\")\n",
    "    text = text.replace(\"&emacron;\", \"ē\")\n",
    "    text = text.replace(\"&imacron;\", \"ī\")\n",
    "    text = text.replace(\"&scircumflex;\", \"ˆ\")\n",
    "    text = text.replace(\"&PSI;\", \"Ψ\")\n",
    "    text = text.replace(\"&umacron;\", \"ū\")\n",
    "    text = text.replace(\"&rcedille;\", \"ç\")\n",
    "    text = text.replace(\"&lcedille;\", \"ç\")\n",
    "    text = text.replace(\"&ncedille;\", \"ç\")\n",
    "    text = text.replace(\"&omacron;\", \"ō\")\n",
    "    text = text.replace(\"&mdot;\", \".\")\n",
    "    text = text.replace(\"&ldot;\", \".\")\n",
    "    text = text.replace(\"&ndot;\", \".\")\n",
    "    text = text.replace(\"&ddot;\", \".\")\n",
    "    text = text.replace(\"&lline;\", \"-\")\n",
    "    text = text.replace(\"&ndotabove;\", \"⋵̸\")\n",
    "    text = text.replace(\"&eaeute;\", \"\")\n",
    "    text = text.replace(\"&eeedille;\", \"\")\n",
    "    text = text.replace(\"&uum1aut;\", \"ü\")\n",
    "    text = text.replace(\"&oeligature;\", \"Œ\")\n",
    "    text = text.replace(\"&ersand;\", \"\")\n",
    "    text = text.replace(\"&dolalr;\", \"$\")\n",
    "    text = text.replace(\"&peso;\", \"₱\")\n",
    "    text = text.replace(\"&ntidle;\", \"~\")\n",
    "    text = text.replace(\"&lsqbrack;\", \"[\")\n",
    "    text = text.replace(\"&rsqbrack;\", \"]\")\n",
    "    text = text.replace(\"&arrowhead;\", \"➤\")\n",
    "    text = text.replace(\"&club;\", \"♣\")\n",
    "    text = text.replace(\"&heart;\", \"♥\")\n",
    "    text = text.replace(\"&spade;\", \"♠\")\n",
    "    \n",
    "    text = html.unescape(text)\n",
    "\n",
    "    try:\n",
    "        text = str(text, 'utf-8')\n",
    "    except (TypeError, NameError):  # unicode is a default on python 3\n",
    "        pass\n",
    "    text = unicodedata.normalize('NFKD', text)\n",
    "    text = text.encode('ascii', 'ignore')\n",
    "    text = text.decode(\"utf-8\")\n",
    "    text = str(text)\n",
    "    return text\n",
    "\n",
    "def parse(txt, country):\n",
    "    txt = re.sub(r'<O>.*?<\\/O>', '', txt, flags=re.DOTALL) # remove photographs info Untranscribed text <O> </O> \n",
    "    txt = re.sub(r'<\\+>.*?<\\/\\+>', '', txt) # remove corrected words\n",
    "    txt = re.sub(r'<\\&>.*?<\\/\\&>', '', txt, flags=re.DOTALL) # Editorial comments <&> </&> \n",
    "    txt = re.sub(r'<@>\\s*[^<]*?\\s*<\\/@>', '', txt) # Person names and entities\n",
    "    \n",
    "    txt = re.sub(r'{\\s*?#\\s*?}', '', txt) # Remove anonimized text {#} {\\n#}\n",
    "    txt = re.sub(r'<\\->(.*?)<\\/\\->', r'\\1', txt) # replace <->[text]</-> with [text]\n",
    "    txt = re.sub(r'<quote>\\s*?[\\'`\"]\\s*?', '\"', txt) # replace <quote> tags with \"\n",
    "    txt = re.sub(r'[\\'`\"]\\s*?</quote>', '\"', txt) # replace </quote> tags with \"\n",
    "    txt = re.sub(r'XXX|Mr\\.\\s+X\\.\\s*X\\.|Dr\\.\\s+X\\.\\s*X\\.|Prof\\.\\s+X\\.\\s*X\\.|\\s+X\\.\\s*X\\.', '', txt) # replace XXX\n",
    "    \n",
    "    txt = re.sub(r'<\\*>\\s*dotted line\\s*<\\/\\*>', '...', txt) # replace dotted-line\n",
    "    txt = re.sub(r'<\\*>\\s*ampersand\\s*<\\/\\*>|<\\*>\\s*ampersand sign\\s*<\\/\\*>', '&', txt) # replace ampersand\n",
    "    txt = re.sub(r'<\\*>\\s*degrees\\s*<\\/\\*>|<\\*>\\s*degrees sign\\s*<\\/\\*>', '°', txt) # replace degrees\n",
    "    txt = re.sub(r'<\\*>\\s*minus\\s*<\\/\\*>|<\\*>\\s*minus sign\\s*<\\/\\*>', '-', txt) # replace minus\n",
    "    txt = re.sub(r'<\\*>\\s*plus\\s*<\\/\\*>|<\\*>\\s*plus sign\\s*<\\/\\*>', '+', txt) # replace minus\n",
    "    txt = re.sub(r'<\\*>\\s*approximate-sign\\s*<\\/\\*>', '~', txt)\n",
    "    txt = re.sub(r'<\\*>\\s*theta\\s*<\\/\\*>|<\\*>\\s*theta sign\\s*<\\/\\*>', 'Θ', txt)\n",
    "    txt = re.sub(r'<\\*>\\s*mu\\s*<\\/\\*>|<\\*>\\s*mu sign\\s*<\\/\\*>', 'μ', txt)\n",
    "    txt = re.sub(r'<\\*>\\s*star\\s*<\\/\\*>', '*', txt)\n",
    "    txt = re.sub(r'<\\*>\\s*plus or minus\\s*<\\/\\*>|<\\*>\\s*plus or minus sign\\s*<\\/\\*>', '±', txt)\n",
    "    txt = re.sub(r'<\\*>\\s*pound\\s*<\\/\\*>|<\\*>\\s*pound sign\\s*<\\/\\*>', '£', txt)\n",
    "    txt = re.sub(r'<\\*>\\s*per cent\\s*<\\/\\*>|<\\*>\\s*per cent sign\\s*<\\/\\*>', '%', txt)\n",
    "    txt = re.sub(r'<\\*>\\s*alpha\\s*<\\/\\*>|<\\*>\\s*alpha sign\\s*<\\/\\*>', 'α', txt)\n",
    "    txt = re.sub(r'<\\*>\\b.*\\b<\\/\\*>', '-', txt) # replace all <*> </*>\n",
    "    \n",
    "    txt = re.sub(r'(\\w)\\s*<l>\\s*(\\w)', r'\\1\\2', txt) # replaces \"play <l> ing\" and \"eat<l>ing\"\n",
    "    \n",
    "    # Divide into sentences\n",
    "    sent = re.split(r'\\<ICE\\-\\w+:s\\w+-\\d+#\\d+:\\d+>', txt)\n",
    "    sent = [re.sub(r'^\\s*[\\n]+|[\\n]+\\s*$|\\r\\s*', ' ', s) for s in sent] # new lines in start and end stripped off\n",
    "    sent = [s.replace('\\n', ' ') for s in sent] # new lines in the middle of sent is replaced with a whitespace\n",
    "    sent = [re.sub(r'<[^<]*>', '', s) for s in sent] # remove all other XML entities\n",
    "    sent = [strip_keywords(s, country) for s in sent] # strip cities\n",
    "    sent = [strip_accent(s) for s in sent] # strip accent\n",
    "    # sent\n",
    "    # collec sentences into a single text adding dots[.] if needed\n",
    "    res = \"\"\n",
    "    for idx, s in enumerate(sent):\n",
    "        prevSent = sent[idx - 1] if idx > 0 else None\n",
    "        if prevSent and not prevSent.isspace() and not prevSent.endswith(\".\"):\n",
    "            res += \". \" + s.strip()\n",
    "        else:\n",
    "            res += s.strip()\n",
    "\n",
    "    return res\n",
    "\n",
    "with open(fname,\"rb\") as f:\n",
    "    txt = f.read().decode(\"latin-1\")\n",
    "parse(txt, country)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0bfc6c92-5377-4d10-813d-8f139e5d49c1",
   "metadata": {},
   "source": [
    "## Dump All to a CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "945bb682-6822-4532-9f1b-384a94b1ce65",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing  India\n",
      "Processing  Philippines\n",
      "Processing  Singapore\n",
      "Processing Processing  Canada\n",
      "Processing  Ireland\n",
      " HongKong\n",
      "Processing Processing  Jamaica\n",
      " SriLanka\n",
      "Processing  USA\n",
      "Processing  Nigeria\n",
      "Writing  './singapore.csv'\n",
      "Writing  './hongkong.csv'\n",
      "Writing  './jamaica.csv'\n",
      "Writing  './srilanka.csv'\n",
      "Writing  './ireland.csv'\n",
      "Writing  './nigeria.csv'\n",
      "Writing  './canada.csv'\n",
      "Writing  './india.csv'\n",
      "Writing  './philippines.csv'\n",
      "Writing  './usa.csv'\n",
      "Writing ./merged_ice.csv\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>document</th>\n",
       "      <th>language</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>W2B-004.TXT</td>\n",
       "      <td>THE NATION-PORT   The story of the port is really the story of .  The existence of Temasek and the arrival of Stamford Raffles are of such momentous importance to the history and economic development of  that they form part of every child's education.  Even visitors get to know about it when they come to .  This familiar tale has been told and retold so many times that the role of the port in ...</td>\n",
       "      <td>Singapore</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>W2A-016.txt</td>\n",
       "      <td>Mortality and Morbidity Trends and Poverty in  LAKSHMAN DISSANAYAKE Senior Lecturer Department of Demography University of  Colombo,  Introduction In general, demographers or other social scientists who have examined the demographic transition in  have failed to explore the importance of the contribution made by changes in mortality on the major demographic changes occurred in the country. The...</td>\n",
       "      <td>SriLanka</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>W2D-001.TXT</td>\n",
       "      <td>NATIONAL UNIVERSITY OF   Telephone 7756666 Telegrams UNIVSPORE  Telex NUSPER RS51111  Telefax 7783948  Bitnet PERSDEPT NUSVM  19 February 1990  Teaching, Research, Administrative and Professional Library &amp; Computer Staff   UNIVERSITY HOLIDAY CHALETS SCHEME   I am pleased to inform you that the University has obtained leases for three UDMC double-story chalets at East Coast Parkway and one NTUC...</td>\n",
       "      <td>Singapore</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>W1B-007.TXT</td>\n",
       "      <td>Dear ,   How are you?    as you might have guessed, we have again been having problems with our internet connection.  I have been trying to send you mail and   i just keep on losing them.    i am now typing this in notepad first before   i paste it into my email.  You might also notice that   i am using a different email account because yahoo is down right now.  I am not sure    i will be able...</td>\n",
       "      <td>Jamaica</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>W2A-032.TXT</td>\n",
       "      <td>Seismic response of concentrically braced steel frames    Richard G. Redwood AND Feng Lu  Gilles Bouchard and Patrick Paultre   Braced frame structures designed according to the 1990 edition of the National Building Code of  and the CSA standard for steel structures ( /CSA-S16.1-M89) are analyzed under a number of different earthquake motions.  The nonlinear response is studied in the light of...</td>\n",
       "      <td>Canada</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1950</th>\n",
       "      <td>W2f-011.txt</td>\n",
       "      <td>The Drunk Tantra       A Prosperity of Cousins     What does one do when one doesn't do?      Returning from the learned conference at the port town, which he had extended by several days, combining it with holy pilgrimages to various shrines  the region, Hairy looked a different man, a changed man: subdued, moody, quite out of character.  He appeared alarmingly tamed.  But the change appealed...</td>\n",
       "      <td>India</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1951</th>\n",
       "      <td>W2E-008.TXT</td>\n",
       "      <td>Domestic budgeting in the region     By Lisa McGregor    MEXICO has the second largest economy in the Latin American region with an average gross domestic product of US$3,412 per capita.  Nonetheless, the country does not escape the problem of rampant poverty.  According to 1991 figures of the United Nations Economic Commission for Latin America and the , 48.8% of Mexico's 82 million inhabitan...</td>\n",
       "      <td>Jamaica</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1952</th>\n",
       "      <td>W1B-024 Tunnel - TD.txt</td>\n",
       "      <td>17TH JULY 1996  CLLR. D. MCDOWELL, T.D.  107C MALAHIDE ROAD,  DONNYCARNEY,  3.     RE:  PORT TUNNEL (D.P.T.)     Dear Councillor,  I refer to your letter dated 21st May 1996 to the National Roads Authority, and to Mr. J. Fitzgerald, City Manager.    In your letter the first three paragraphs list some of the genuine concerns that residents along, and close to, the route have in relation to the ...</td>\n",
       "      <td>Ireland</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1953</th>\n",
       "      <td>sl_01_ed_03_SkHo_25.txt</td>\n",
       "      <td>Monday, November 5, 2007 7:26 PM from: \"  \"  To: \"  \"  Hello Sir, So nice to hear from you. I got confused with the  post graduate programme and  abandoned it. The lecturers were too slow, and the it was quite  uninteresting. I am presently working as at  Reporter with  Independent Television, AIT and Rapower FM  station in . I'm enjoying the job, but I still want to get my  Masters Degree. Me...</td>\n",
       "      <td>Nigeria</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1954</th>\n",
       "      <td>W2B-037.TXT</td>\n",
       "      <td>Video-on-demand: still more to be done   About 500 households from different parts of  will be involved in the video-on-demand trial conducted by  Telecom  by Kevin   Comfortably lounging on a sofa in his three-room HDB apartment, Mr  reaches for his TV remote control to check which Sharon Stone movies are available on TV.  He considers Basic Instinct but finally chooses Crocodile Dundee.  Hal...</td>\n",
       "      <td>Singapore</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1955 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                         name  \\\n",
       "0                 W2B-004.TXT   \n",
       "1                 W2A-016.txt   \n",
       "2                 W2D-001.TXT   \n",
       "3                 W1B-007.TXT   \n",
       "4                 W2A-032.TXT   \n",
       "...                       ...   \n",
       "1950              W2f-011.txt   \n",
       "1951              W2E-008.TXT   \n",
       "1952  W1B-024 Tunnel - TD.txt   \n",
       "1953  sl_01_ed_03_SkHo_25.txt   \n",
       "1954              W2B-037.TXT   \n",
       "\n",
       "                                                                                                                                                                                                                                                                                                                                                                                                             document  \\\n",
       "0     THE NATION-PORT   The story of the port is really the story of .  The existence of Temasek and the arrival of Stamford Raffles are of such momentous importance to the history and economic development of  that they form part of every child's education.  Even visitors get to know about it when they come to .  This familiar tale has been told and retold so many times that the role of the port in ...   \n",
       "1     Mortality and Morbidity Trends and Poverty in  LAKSHMAN DISSANAYAKE Senior Lecturer Department of Demography University of  Colombo,  Introduction In general, demographers or other social scientists who have examined the demographic transition in  have failed to explore the importance of the contribution made by changes in mortality on the major demographic changes occurred in the country. The...   \n",
       "2     NATIONAL UNIVERSITY OF   Telephone 7756666 Telegrams UNIVSPORE  Telex NUSPER RS51111  Telefax 7783948  Bitnet PERSDEPT NUSVM  19 February 1990  Teaching, Research, Administrative and Professional Library & Computer Staff   UNIVERSITY HOLIDAY CHALETS SCHEME   I am pleased to inform you that the University has obtained leases for three UDMC double-story chalets at East Coast Parkway and one NTUC...   \n",
       "3     Dear ,   How are you?    as you might have guessed, we have again been having problems with our internet connection.  I have been trying to send you mail and   i just keep on losing them.    i am now typing this in notepad first before   i paste it into my email.  You might also notice that   i am using a different email account because yahoo is down right now.  I am not sure    i will be able...   \n",
       "4     Seismic response of concentrically braced steel frames    Richard G. Redwood AND Feng Lu  Gilles Bouchard and Patrick Paultre   Braced frame structures designed according to the 1990 edition of the National Building Code of  and the CSA standard for steel structures ( /CSA-S16.1-M89) are analyzed under a number of different earthquake motions.  The nonlinear response is studied in the light of...   \n",
       "...                                                                                                                                                                                                                                                                                                                                                                                                               ...   \n",
       "1950  The Drunk Tantra       A Prosperity of Cousins     What does one do when one doesn't do?      Returning from the learned conference at the port town, which he had extended by several days, combining it with holy pilgrimages to various shrines  the region, Hairy looked a different man, a changed man: subdued, moody, quite out of character.  He appeared alarmingly tamed.  But the change appealed...   \n",
       "1951  Domestic budgeting in the region     By Lisa McGregor    MEXICO has the second largest economy in the Latin American region with an average gross domestic product of US$3,412 per capita.  Nonetheless, the country does not escape the problem of rampant poverty.  According to 1991 figures of the United Nations Economic Commission for Latin America and the , 48.8% of Mexico's 82 million inhabitan...   \n",
       "1952  17TH JULY 1996  CLLR. D. MCDOWELL, T.D.  107C MALAHIDE ROAD,  DONNYCARNEY,  3.     RE:  PORT TUNNEL (D.P.T.)     Dear Councillor,  I refer to your letter dated 21st May 1996 to the National Roads Authority, and to Mr. J. Fitzgerald, City Manager.    In your letter the first three paragraphs list some of the genuine concerns that residents along, and close to, the route have in relation to the ...   \n",
       "1953  Monday, November 5, 2007 7:26 PM from: \"  \"  To: \"  \"  Hello Sir, So nice to hear from you. I got confused with the  post graduate programme and  abandoned it. The lecturers were too slow, and the it was quite  uninteresting. I am presently working as at  Reporter with  Independent Television, AIT and Rapower FM  station in . I'm enjoying the job, but I still want to get my  Masters Degree. Me...   \n",
       "1954  Video-on-demand: still more to be done   About 500 households from different parts of  will be involved in the video-on-demand trial conducted by  Telecom  by Kevin   Comfortably lounging on a sofa in his three-room HDB apartment, Mr  reaches for his TV remote control to check which Sharon Stone movies are available on TV.  He considers Basic Instinct but finally chooses Crocodile Dundee.  Hal...   \n",
       "\n",
       "       language  \n",
       "0     Singapore  \n",
       "1      SriLanka  \n",
       "2     Singapore  \n",
       "3       Jamaica  \n",
       "4        Canada  \n",
       "...         ...  \n",
       "1950      India  \n",
       "1951    Jamaica  \n",
       "1952    Ireland  \n",
       "1953    Nigeria  \n",
       "1954  Singapore  \n",
       "\n",
       "[1955 rows x 3 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "\n",
    "import threading\n",
    "from threading import Thread\n",
    "import time\n",
    "\n",
    "class myThread(threading.Thread):\n",
    "    def __init__(self, country, basePath):\n",
    "        threading.Thread.__init__(self)\n",
    "        self.country = country\n",
    "        self.basePath = basePath\n",
    "    def run(self):\n",
    "        print(\"Processing \", self.country)\n",
    "        files = os.listdir(self.basePath)\n",
    "        parsedList = []\n",
    "        nameList = []\n",
    "        for file in files:\n",
    "            if file.lower().endswith(\".txt\"):\n",
    "                txt = \"\"\n",
    "                fname = self.basePath + file\n",
    "                with open(fname,\"rb\") as f:\n",
    "                    txt = f.read().decode(\"latin-1\")\n",
    "                parsedList.append(parse(txt, self.country))\n",
    "                nameList.append(file)\n",
    "        # parsedList\n",
    "        df = pd.DataFrame({\"name\" : nameList, \"document\" : parsedList})\n",
    "        df = df.assign(language=self.country)\n",
    "        print(\"Writing \", \"'./\" + self.country.lower() + \".csv\" + \"'\")\n",
    "        df.to_csv(\"./\" + self.country.lower() + \".csv\", index = False)\n",
    "\n",
    "threads = []\n",
    "for country, basePath, _ in all:\n",
    "    thread1 = myThread(country, basePath)\n",
    "    thread1.start()\n",
    "    threads.append(thread1)\n",
    "for t in threads:\n",
    "    t.join()\n",
    "\n",
    "# threads = []\n",
    "# thread1 = myThread('India', './ICE India/Corpus/')\n",
    "# thread1.start()\n",
    "# threads.append(thread1)\n",
    "# for t in threads:\n",
    "#     t.join()\n",
    "\n",
    "print(\"Writing ./merged_ice.csv\")\n",
    "df = None\n",
    "for country, _, _ in all:\n",
    "    csv_file = \"./\" + country.lower() + \".csv\"\n",
    "    df_tmp = pd.read_csv(csv_file)\n",
    "    if df is None:\n",
    "        df = df_tmp\n",
    "    else:\n",
    "        df = df.append(df_tmp, ignore_index = True)\n",
    "df = df.sample(frac=1).reset_index(drop=True)\n",
    "df.to_csv(\"./merged_ice.csv\", index = False)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21d916c8-e30c-4ae3-84b8-ef448d8222f0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
