import urllib.request
import tarfile
import os
import pickle
import bs4
from bs4 import BeautifulSoup
import pandas as pd

# Download the Usim ("meaning in context") archive and unpack it into the
# project's data directory.
#
# NOTE(review): the archive is fetched over plain HTTP, so its contents cannot
# be trusted; member paths are therefore filtered during extraction whenever
# the running interpreter supports tarfile extraction filters.
usim_archive_url = "http://www.dianamccarthy.co.uk/downloads/WordMeaningAnno2012/cl-meaningincontext.tgz"

# Switch to the target directory first so a missing directory fails before any
# network traffic happens. The archive is extracted relative to the cwd.
os.chdir('/makesense_dir_path/data/usim/')

# Both the HTTP response and the tar stream are context managers; using them
# as such guarantees they are closed even if extraction raises.
with urllib.request.urlopen(usim_archive_url) as ftpstream:
    # "r|gz" reads the gzip stream sequentially, without seeking, which is
    # what a network response object supports.
    with tarfile.open(fileobj=ftpstream, mode="r|gz") as thetarfile:
        if hasattr(tarfile, "data_filter"):
            # Rejects absolute paths, "../" escapes, links pointing outside
            # the destination and special files.
            thetarfile.extractall(filter="data")
        else:
            # Older interpreter without extraction filters: unfiltered
            # extraction, exactly as the script did before.
            thetarfile.extractall()


# Parse the lexical-substitution contexts and build one row per <instance>:
# its id, the context text, the <head> word, and the word's whitespace-token
# position inside the context.
os.chdir("/makesense_dir_path/experiments/")
os.chdir('../data/usim/Data/')

# Read raw bytes so the XML parser can honour the encoding declared in the
# document instead of the platform's default text encoding.
with open('lexsub_wcdata.xml', 'rb') as f:
    data = f.read()

Bs_data = BeautifulSoup(data, "xml")
items = Bs_data.find_all('lexelt')


def _head_token_index(context, head):
    """Return the index of *head*'s word among the whitespace tokens of *context*.

    The index is derived from the text that precedes the ``<head>`` element,
    so a context in which the same word also occurs earlier still yields the
    marked occurrence. When that position cannot be confirmed (for example
    the head is glued to neighbouring characters), the first token equal to
    the word is used instead.

    Raises:
        ValueError: if no whitespace token of the context equals the head word.
    """
    tokens = context.text.split()
    word = head.text

    # Collect every text node that appears before the <head> tag.
    preceding = []
    for node in context.descendants:
        if node is head:
            break
        if isinstance(node, bs4.NavigableString):
            preceding.append(str(node))

    candidate = len("".join(preceding).split())
    if candidate < len(tokens) and tokens[candidate] == word:
        return candidate
    # Could not confirm the marked occurrence; use the first matching token.
    return tokens.index(word)


# The four lists are filled together, one entry per instance, so they can
# never drift out of alignment with each other.
ins_all = []
context_all = []
words_all = []
word_pos = []
for item in items:
    for instance in item.find_all('instance'):
        contexts = instance.find_all('context')
        heads = contexts[0].find_all('head') if len(contexts) == 1 else []
        if len(heads) != 1:
            # Fail loudly rather than pairing an id with another instance's
            # context or word.
            raise ValueError(
                "instance {!r} must contain exactly one <context> with "
                "exactly one <head>".format(instance.get('id'))
            )
        context, head = contexts[0], heads[0]

        ins_all.append(instance['id'])
        context_all.append(context.text)
        words_all.append(head.text)
        word_pos.append(_head_token_index(context, head))


word_context = pd.DataFrame(
    {'lexsub_id1': ins_all, 'context': context_all, 'word': words_all}
)
# Ids come out of the XML as strings; cast them to int64.
word_context = word_context.astype({'lexsub_id1': 'int64'})
    
# Load the usage-similarity ratings, collect every (sentence id, lemma) pair
# that occurs on either side of a rated pair, attach the lemma to the parsed
# contexts and write the result to disk.
os.chdir("/makesense_dir_path/experiments/")
os.chdir('../data/usim/Markup/UsageSimilarity/')

data = pd.read_csv('usim2ratings.csv')

# Ids appearing as the first and as the second member of a rated pair.
data_filt1 = data[['lexsub_id1', 'lemma']].drop_duplicates()
data_filt2 = (
    data[['lexsub_id2', 'lemma']]
    .drop_duplicates()
    # Relabel so both frames share the same key column and stack cleanly.
    .rename(columns={'lexsub_id2': 'lexsub_id1'})
)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
# and behaves the same on older releases.
data_filt = pd.concat([data_filt1, data_filt2], ignore_index=True).drop_duplicates()
data_filt = data_filt.sort_values(by=['lexsub_id1']).reset_index(drop=True)

word_context['word_position'] = word_pos
# pd.merge defaults to an inner join, so only ids present in both frames are
# kept.
data_all = pd.merge(word_context, data_filt, on="lexsub_id1")
data_all.to_csv("/makesense_dir_path/data/usim_sents.csv", index=False)
    
    