import pandas as pd


# Columns kept in the aggregated output, in output order.
_MAIN_KEYS = ['GLOBALEVENTID', 'SQLDATE', 'MonthYear', 'Year',
              'Actor1Name', 'Actor1CountryCode',
              'Actor2Name', 'Actor2CountryCode',
              'IsRootEvent', 'EventCode', 'EventRootCode']

# Columns that together identify a duplicate row within a single input file.
_DEDUP_KEYS = ['GLOBALEVENTID', 'SQLDATE', 'EventCode',
               'Actor1Name', 'Actor1CountryCode',
               'Actor2Name', 'Actor2CountryCode',
               'ActionGeo_Lat', 'ActionGeo_Long']

# Code columns are parsed as text instead of letting pandas infer a numeric
# type for them.
_CODE_COLUMNS = ('EventCode', 'EventRootCode', 'EventBaseCode')


def _load_events(path):
    """Read one raw CSV, keep 2017-2019 rows, de-duplicate, project to ``_MAIN_KEYS``."""
    df = pd.read_csv(path, delimiter=',',
                     converters=dict.fromkeys(_CODE_COLUMNS, str))
    df = df[(df['Year'] >= 2017) & (df['Year'] <= 2019)]
    # De-duplicate before projecting: the geo columns are part of the
    # duplicate key but are not kept in the output.
    df = df.drop_duplicates(subset=_DEDUP_KEYS, keep='first')
    return df[_MAIN_KEYS]


def _aggregate_files(indices, out_path):
    """Combine the ``<index>.csv`` files for *indices* and write them to *out_path*.

    Prints the row count before and after dropping repeated ``GLOBALEVENTID``
    values across files.
    """
    frames = [_load_events('%d.csv' % i) for i in indices]
    events = pd.concat(frames, ignore_index=True)
    print(len(events))
    events = events.drop_duplicates(subset=['GLOBALEVENTID'], keep='first')
    print(len(events))
    # The DataFrame index is written as the first column (pandas default).
    events.to_csv(out_path)


def aggregate_all():
    """Aggregate the numbered raw event CSVs into ``first.csv`` and ``second.csv``.

    Files ``0.csv, 10.csv, ..., 730.csv`` are combined into ``first.csv`` and
    files ``731.csv, 741.csv, ..., 991.csv`` into ``second.csv``. All paths are
    relative to the current working directory. Each input is restricted to
    rows with ``Year`` between 2017 and 2019 (inclusive), and each output keeps
    the first row seen for every ``GLOBALEVENTID``.
    """
    _aggregate_files(range(0, 731, 10), 'first.csv')
    _aggregate_files(range(731, 992, 10), 'second.csv')


def preprocess():
    """Merge the two aggregated event files and write the result to ``gdelt.csv``.

    Reads ``first.csv`` and ``second.csv`` from the current working directory,
    concatenates them, keeps the first row for every ``GLOBALEVENTID``, adds an
    ``Event Date`` datetime column parsed from ``SQLDATE`` (``YYYYMMDD``),
    renames the GDELT actor and event-code columns to ICEWS-style names, and
    writes the table to ``gdelt.csv``.
    """
    # Parse the code columns as text instead of letting pandas infer a
    # numeric type for them.
    as_text = dict.fromkeys(('EventCode', 'EventRootCode', 'EventBaseCode'), str)

    # concat two files
    # NOTE(review): no index_col is given, so if the inputs were written with
    # their index (the pandas default), it arrives as an extra unnamed column
    # and is carried through to the output -- confirm whether that is wanted.
    parts = [pd.read_csv(path, delimiter=',', converters=as_text)
             for path in ('first.csv', 'second.csv')]
    events = pd.concat(parts, ignore_index=True)
    # The same event may appear in both halves; keep its first occurrence.
    events = events.drop_duplicates(subset=['GLOBALEVENTID'], keep='first')

    # process date
    events['Event Date'] = pd.to_datetime(events['SQLDATE'], format='%Y%m%d')

    # rename columns: convert gdelt keys to ICEWS keys
    KEY_MAP = {'Actor1Name': 'Source Name', 'Actor1CountryCode': 'SActor',
               'Actor2Name': 'Target Name', 'Actor2CountryCode': 'TActor',
               'EventCode': 'Cameo Code'}
    events = events.rename(columns=KEY_MAP)

    # save the file
    events.to_csv('gdelt.csv')


# Script entry point. There is no ``__main__`` guard, so this also runs when
# the module is imported. preprocess() reads first.csv and second.csv, which
# aggregate_all() writes; aggregate_all() is not called here, so those files
# must already exist in the working directory.
preprocess()





