## Split texts into sentences
#from pycorenlp import StanfordCoreNLP
import json
import os
import re
import subprocess
import sys
import time

#stdout = sys.stdout
#reload(sys)
#sys.setdefaultencoding('utf-8')
#sys.stdout = stdout
#nlp = StanfordCoreNLP('http://localhost:9000')
#nlp = StanfordCoreNLP()

# Root directory that holds the `webbase_all` input folder and the
# `webbase_processed` output folder used by the processing loop.
CORPUS_PATH = "."

# Directory of the Stanford CoreNLP distribution; everything inside it is
# put on the Java classpath when the pipeline is launched.
CORENLP_PATH = '../stanford-corenlp-4.4.0'

# File-name prefixes of the corpus slices; a numeric index and `.txt` are
# appended to each prefix to form the actual file names.
texts = [
    "delorme.com_shu.pages_",
    "mbta.com_mtu.pages_",
    "ucdavis_wnba.pages_",
    "utexas_iit.pages_",
    "weather.yahoo_bbk.ac.pages_",
]

# Parallel to `texts`: the last file index for each prefix (the processing
# loop iterates indices 0..count inclusive).
counts = [
    131,
    83,
    74,
    96,
    19,
]

# Matches a single digit; compiled once because it is applied to every token.
DIGIT_PATTERN = re.compile(r'\d')

# For every file of every corpus slice: run Stanford CoreNLP (tokenize +
# sentence split) on it, then write one sentence per line to the output
# folder. Tokens are lower-cased, every digit is replaced by '0', and
# sentences with fewer than 5 tokens are dropped.
for t, c in zip(texts, counts):

    # Iterate file indices 0..c inclusive.
    for i in range(c + 1):

        infilePath = f'{CORPUS_PATH}/webbase_all/{t}{i}.txt'
        # JSON file produced by CoreNLP in the current directory
        # (see `-outputDirectory .` below).
        tempfilePath = f'{t}{i}.txt.json'
        outfilePath = f'{CORPUS_PATH}/webbase_processed/{t}{i}.txt'

        start_time = time.time()

        # Launch CoreNLP with an argument list instead of a shell string, so
        # file names are never interpreted by a shell. Only stdout is
        # discarded; stderr stays visible, as before. A non-zero exit status
        # is tolerated here: the missing/invalid JSON is reported below.
        subprocess.run(
            [
                'java',
                '-cp', f'{CORENLP_PATH}/*',
                'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                '-annotators', 'tokenize,ssplit',
                '-tokenize.options',
                'normalizeParentheses=false, normalizeOtherBrackets=false',
                '-outputFormat', 'json',
                '--file', infilePath,
                '-outputDirectory', '.',
            ],
            stdout=subprocess.DEVNULL,
            check=False,
        )

        # Stays empty when the CoreNLP output cannot be read or parsed, so the
        # output file is still created (empty) for that input.
        sents = []
        try:
            # NOTE(review): assumes CoreNLP writes its JSON as UTF-8 (its
            # documented default encoding) - confirm if `-encoding` is changed.
            with open(tempfilePath, encoding='utf-8') as js:
                output = json.load(js)
            sents = [
                [DIGIT_PATTERN.sub('0', token['word'].lower())
                 for token in sent['tokens']]
                for sent in output['sentences']
                if len(sent['tokens']) >= 5
            ]
        except Exception as e:
            # Deliberate per-file best-effort boundary: report the problem
            # and carry on with the remaining files of the batch.
            print(e)

        with open(outfilePath, 'w', encoding='utf-8') as outfile:
            outfile.writelines(' '.join(sent) + '\n' for sent in sents)

        # Clean up the intermediate JSON; it may not exist if CoreNLP failed.
        try:
            os.remove(tempfilePath)
        except FileNotFoundError:
            pass

        elapsed_time = time.time() - start_time
        print (infilePath.split('/')[-1], 'Elapsed time:', elapsed_time, 's')
        sys.stdout.flush()
