############################################################
#
# lobster_to_dataset.py
# script to process Lobster data (which comes in CSV format)
# into a useful form for our models
# February 2020
#
############################################################

import torch
import numpy as np
import argparse
from utility import now
import os
import pandas as pd
import glob
import _pickle as pickle


# Running counter used to number output files as they get saved. It starts at
# -1 so that the first call to next_file_name() yields file number 0; main()
# resets it to -1 at the start of every run. (A bare `global` statement at
# module level does not create the variable, so it is assigned here instead.)
file_number = -1


def next_file_name(out_dir):
    """Advance the global file counter and return the output paths for it.

    The module-level ``file_number`` is incremented by one on every call and
    zero-padded to eight digits to form the file stem.

    Returns a 3-tuple of paths:
        * ``<out_dir>/long_data/<stem>``            -- raw data array
        * ``<out_dir>/transactions/<stem>``         -- transaction price lists
        * ``<out_dir>/transactions/<stem>running``  -- running price tensor
    """
    global file_number
    file_number += 1

    # Build the zero-padded stem once and reuse it for all three paths.
    stem = str(file_number).zfill(8)
    long_data_path = os.path.join(out_dir, 'long_data', stem)
    transactions_path = os.path.join(out_dir, 'transactions', stem)
    running_path = os.path.join(out_dir, 'transactions', stem + 'running')
    return long_data_path, transactions_path, running_path


def get_transaction_price_lists(m_fh, row_to_start, timestamps):
    """Collect the prices at which the asset was transacted around each timestamp.

    Args:
        m_fh: Path (or file-like object) of a Lobster message CSV without a
            header row. Column 0 is read as the event time, column 1 as the
            event type and column 4 as the price in units of 1e-4 dollars.
        row_to_start: Index of the first message row to consider.
        timestamps: Sequence of snapshot times. The single forward pass below
            assumes both the timestamps and the message times are ascending.

    Returns:
        A tuple ``(list_of_transaction_prices, running_price)``:
            * ``list_of_transaction_prices[i]`` is the set of dollar prices of
              all transactions with time <= ``timestamps[i]`` that were not
              already assigned to an earlier timestamp.
            * ``running_price[i]`` is the dollar price of the first
              transaction after ``timestamps[i]``. When no later transaction
              exists, the price of the last transaction is used; when the file
              contains no transactions at all, the entry stays 0.
    """

    # read in messages from the Lobster file, starting at the correct row
    messages = pd.read_csv(m_fh, header=None, usecols=[0, 1, 2, 3, 4, 5]).values
    messages = messages[row_to_start:, :]

    # identify all rows with a transaction: event type 4 (visible) or 5 (hidden)
    event_type = messages[:, 1]
    transactions = messages[(event_type == 4) | (event_type == 5)]
    n_transactions = transactions.shape[0]

    # Running price tracker
    running_price = torch.zeros_like(torch.tensor(timestamps))

    # find transactions for each snapshot in time
    list_of_transaction_prices = []
    current_row = 0  # index into `transactions`, never into `messages`
    for i, t in enumerate(timestamps):
        current_prices = set()

        # consume every not-yet-used transaction that happened up to time t
        while current_row < n_transactions and transactions[current_row, 0] <= t:
            current_prices.add(transactions[current_row, 4] / 1e4)  # change prices to dollar amounts
            current_row += 1

        # NOTE(review): this records the price of the first transaction *after*
        # t, matching the original indexing; confirm that this is intended
        # rather than the last price traded at or before t.
        if current_row < n_transactions:
            running_price[i] = transactions[current_row, 4] / 1e4
        elif n_transactions > 0:
            # no later transaction exists; fall back to the last one seen
            running_price[i] = transactions[-1, 4] / 1e4

        list_of_transaction_prices.append(current_prices)
    return list_of_transaction_prices, running_price


def _resample_orders(orders, out_dim, orders_per_sec):
    """Place order book rows onto a uniform time grid with ``out_dim`` rows.

    Args:
        orders: 2-D array whose column 0 is the event time and whose remaining
            columns are the order book state at that event.
        out_dim: Number of rows in the returned grid.
        orders_per_sec: Number of grid rows per second.

    Returns:
        Array of shape ``(out_dim, orders.shape[1])``. Row 0 is the first
        order; column 0 of the later rows is the first order's time plus
        multiples of ``1 / orders_per_sec``. Each later row holds the last
        order that falls within its time step, or a copy of the previous row
        when no order does (including when the orders run out).
    """
    step = 1 / orders_per_sec
    n_orders = orders.shape[0]

    out_array = np.zeros([out_dim, orders.shape[1]])
    out_array[0, :] = orders[0, :]
    out_array[1:, 0] = step * np.arange(1, out_array.shape[0]) + out_array[0, 0]

    # fill row by row
    order_idx = 1
    out_row = 1
    out_row_filled = False
    while out_row < out_array.shape[0]:
        # The bounds check keeps the loop from indexing past the last order
        # when the file holds fewer events than the grid needs.
        if order_idx < n_orders and orders[order_idx, 0] - out_array[out_row - 1, 0] <= step:
            # order belongs to this time step; later orders overwrite earlier ones
            out_array[out_row, 1:] = orders[order_idx, 1:]
            order_idx += 1
            out_row_filled = True
        else:
            if not out_row_filled:
                # nothing happened in this time step: carry the previous state forward
                out_array[out_row, 1:] = out_array[out_row - 1, 1:]
            out_row += 1
            out_row_filled = False
    return out_array


def main():
    """Convert the raw Lobster CSV files into saved tensors and transaction lists.

    For every message/orderbook file pair found in ``--raw_data_dir`` (starting
    with the first file whose name contains ``--start_date``) this writes
    the resampled order book tensor, the per-row transaction price sets
    (pickled) and the running price tensor under ``--out_data_dir``. Finally
    the number of rows per tensor is saved to ``<out_data_dir>/meta``.
    Nothing is generated if both output sub-directories already exist.
    """
    # global file_number for naming output files, start at -1
    global file_number
    file_number = -1

    print(now(), "lobster_to_dataset.py running...")

    # Argument parser
    parser = argparse.ArgumentParser(description="Python")
    parser.add_argument("--raw_data_dir", default="JPMData", type=str, help="Where is the raw data?")
    parser.add_argument("--out_data_dir", default="junk_data", type=str, help="Where should the clean data go?")
    parser.add_argument("--orders_per_sec", default=100, type=int, help="How many orders per second?")
    parser.add_argument("--start_date", default='', type=str,
                        help="Skip files until one whose name contains this string; "
                             "empty means start at the first file.")
    args = parser.parse_args()

    # make output directories (abort if both of them already exist)
    long_data_dir = os.path.join(args.out_data_dir, 'long_data')
    transactions_dir = os.path.join(args.out_data_dir, 'transactions')
    if os.path.isdir(long_data_dir) and os.path.isdir(transactions_dir):
        print("Files exist, aborting data generation in lobster_to_dataset.py.")
        return
    os.makedirs(long_data_dir, exist_ok=True)
    os.makedirs(transactions_dir, exist_ok=True)

    # get file names; sorting pairs each message file with its orderbook file
    message_file_name_list = sorted(glob.glob(os.path.join(args.raw_data_dir, "*message*.csv")))
    orderbook_file_name_list = sorted(glob.glob(os.path.join(args.raw_data_dir, "*orderbook*.csv")))

    # useful variables
    reached_start_date = False
    out_dim = args.orders_per_sec * 60 * 60  # rows required to represent the first hour of activity

    # march through the files, building a tensor for each snapshot
    for m_fh, o_fh in zip(message_file_name_list, orderbook_file_name_list):

        # match against the file name only, independent of the path separator
        if args.start_date in os.path.basename(m_fh):
            reached_start_date = True

        if not reached_start_date:
            continue

        print(m_fh, o_fh)

        # times from messages file
        timestamps = pd.read_csv(m_fh, header=None, usecols=[0]).values

        # combine orders with times
        orders_without_times = pd.read_csv(o_fh, header=None).values
        orders = np.concatenate((timestamps, orders_without_times), axis=1)

        # resample the orders onto a uniform time grid
        out_array = _resample_orders(orders, out_dim, args.orders_per_sec)

        # save as a tensor
        out_tensor = torch.as_tensor(out_array).float()
        out_tensor[:, 1::2] *= 1e-4     # change prices to dollar amounts
        raw_file_out, trans_file_out, running_file_out = next_file_name(args.out_data_dir)
        torch.save(out_tensor, raw_file_out)
        list_of_transaction_prices, running_price = get_transaction_price_lists(m_fh, 0, out_array[:, 0])
        torch.save(running_price, running_file_out)
        with open(trans_file_out, 'wb') as pickle_file:
            pickle.dump(list_of_transaction_prices, pickle_file)

    if not reached_start_date:
        print("No data processed, start date not reached.")
        return

    # every saved tensor has out_dim rows, so that is the input size
    input_size = out_dim
    torch.save(input_size, os.path.join(args.out_data_dir, 'meta'))
    print("\n", now(), "lobster_to_dataset.py done.")


if __name__ == "__main__":
    main()

