import pandas as pd
import numpy as np
import glob
import os
from os.path import isfile, join
    
def _select_spoof_ladder(price_span, lengths=(10, 11, 12, 13, 14)):
    '''
    Chooses the number of non-bona-fide orders and how their prices are stepped.
    For every candidate order count in `lengths` this computes a price increment
    (floored to cents, at least 0.01), the number of consecutive orders that share
    one price level (`rate`), and the signed residual between `price_span` and the
    distance the resulting ladder covers. The candidate with the smallest residual
    is kept; on ties the earlier candidate wins.
    Parameters:
    - price_span: float, the price distance the ladder should cover.
    - lengths: iterable of int, candidate numbers of orders.
    Returns:
    - (best_length, best_price_increment, best_rate); all zero when no candidate
      compares below infinity (e.g. `price_span` is NaN).
    '''
    fit = np.inf
    best_length = 0
    best_price_increment = 0
    best_rate = 0
    for length in lengths:
        price_increment = np.maximum(np.floor((price_span / length) * 100) / 100, 0.01)
        rate = np.ceil(length / np.maximum(price_span / price_increment, 1))
        length_fit = price_span - np.ceil((length / rate)) * price_increment
        if length_fit < fit:
            best_length = length
            best_price_increment = price_increment
            best_rate = rate
            fit = length_fit
    return best_length, best_price_increment, best_rate


def add_spoofing_data(data, fraud_num, fraction, mean_bid_order, mean_ask_order,inject_onelevel=False):
    '''
    Adds spoofing data to the given LOB data.
    Each injected cluster consists of: one bona-fide order on the opposite side,
    a ladder of non-bona-fide orders on the manipulated side, a trade that fills
    the bona-fide order, and the cancellation of the ladder in reverse order.
    Timestamps of all rows after a cluster are shifted by the cluster's duration.
    The input frame is not modified.
    Parameters:
    - data: pd.DataFrame, the original LOB data. The code reads the columns
      'ClusterNo', 'OriginalSequenceNumber', 'index', 'TimeInMilliSecs',
      'AskPrice1', 'BidPrice1' and, unless inject_onelevel is True, 'AskPrice5'
      and 'BidPrice5' (plus whatever generate_anomaly touches).
    - fraud_num: int, the number of spoofing events to inject.
    - fraction: float, the fraction of spoofing events that will be on the ask side.
    - mean_bid_order: float, the mean size of bid orders from original data.
    - mean_ask_order: float, the mean size of ask orders from original data.
    - inject_onelevel: bool, if True, injects spoofing data on the first level only.
    Returns:
    - data: pd.DataFrame, original and injected rows sorted by
      ('index', 'OriginalSequenceNumber') with a fresh 0..n-1 index.
    - anomalies: pd.DataFrame, the rows of `data` whose 'ClusterNo' is not NaN.
    Raises:
    - ValueError: if fewer than fraud_num rows qualify as starting points.
    '''
    # Getting last anomaly cluster number. Series.max() skips NaN; sorting the
    # unique values instead would put NaN last and always report "no cluster".
    previous_last_cluster = data['ClusterNo'].max()
    if pd.isna(previous_last_cluster):
        previous_last_cluster = 0
    current_cluster = previous_last_cluster + 1

    # Finding n random possible starting rows: a spread of at least 8 bps and
    # no existing anomaly on the row itself or on either neighbour.
    cluster_no = data['ClusterNo']
    candidate_mask = (
        (data['AskPrice1'] / data['BidPrice1'] >= 1.0008) &
        cluster_no.isna() &
        cluster_no.shift(1).isna() &
        cluster_no.shift(-1).isna()
    )
    # Positions (not labels), because rows are fetched with .iloc below.
    possible_starting_indexes = np.flatnonzero(candidate_mask.to_numpy())

    # Check if we have enough possible starting indexes
    if len(possible_starting_indexes) < fraud_num:
        raise ValueError(f"Not enough possible starting indexes: {len(possible_starting_indexes)} available, {fraud_num} required")

    starting_indexes = np.sort(np.random.choice(possible_starting_indexes, size=fraud_num, replace=False))
    ask_lob_num = int(fraud_num * fraction)
    shuffled_indexes = np.random.permutation(starting_indexes)
    bid_manipulate_indexes = shuffled_indexes[ask_lob_num:]
    ask_manipulate_indexes = shuffled_indexes[:ask_lob_num]
    # Synthetic rows are collected here and turned into a frame once at the end
    # (concatenating inside the loops copies the whole frame on every event).
    inserted_rows = []

    # manipulate on bid side
    for index in bid_manipulate_indexes:
        new_data = data.iloc[index].copy()
        extra_lob = {'bid_stack': [], 'ask_stack': []}
        max_bid_price = np.round(np.minimum(np.ceil(new_data['BidPrice1'] * 1.0007 * 100) / 100, new_data['AskPrice1'] - 0.01), 2)
        min_ask_price = np.round(np.maximum(np.floor(new_data['AskPrice1'] * 0.9997 * 100) / 100, max_bid_price + 0.01), 2)
        # "<original seq>-<7-digit row position>-"; a 3-digit event counter is
        # appended so that the synthetic rows sort right after their source row.
        # str() works whether the stored value is numeric or already a string.
        sequence_prefix = str(new_data['OriginalSequenceNumber']) + '-' + ("0000000" + str(index))[-7:] + '-'

        # 1. generate a bona-fide new order event from manipulator
        bona_ask_size = np.ceil(mean_ask_order * np.random.uniform(2, 3))
        event = {
            'type': 1,
            'price': min_ask_price,
            'size': bona_ask_size,
            'direction': -1,
            'deltaTime': np.random.randint(1, 6),
            'ClusterNo': current_cluster,
            'FraudType': 1,
            'OriginalSequenceNumber': sequence_prefix + "000"
        }
        new_data, extra_lob = generate_anomaly(new_data, event, extra_lob)
        inserted_rows.append(new_data)

        # 2. generate none-bona-fide new order events from manipulator
        # Selecting length, price increment and rate of orders per level of price upgrade that best achieves increase in bid price
        anchor_column = 'BidPrice1' if inject_onelevel else 'BidPrice5'
        best_length, best_price_increment, best_rate = _select_spoof_ladder(max_bid_price - new_data[anchor_column])

        # Sending non-bona fide bid orders
        nonbona_order_size = 100 * np.ceil((mean_bid_order * np.random.uniform(5, 6) // best_length) / 100)
        nonbona_order_size += np.ceil(nonbona_order_size * np.random.uniform(-0.1, 0.1))
        nonbona_order_price = new_data[anchor_column]
        for j in range(best_length):
            new_data = new_data.copy()
            # The price moves up one increment every `best_rate` orders.
            nonbona_order_price = nonbona_order_price + best_price_increment * ((j % best_rate) == 0)
            event = {
                'type': 1,
                'price': np.round(nonbona_order_price, 2),
                'size': nonbona_order_size,
                'direction': 1,
                'deltaTime': np.random.randint(10, 21),
                'ClusterNo': current_cluster,
                'FraudType': 1,
                'OriginalSequenceNumber': sequence_prefix + ("00" + str(j + 1))[-3:]
            }
            new_data, extra_lob = generate_anomaly(new_data, event, extra_lob)
            inserted_rows.append(new_data)

        # 3. Adding a trade of bona fide sell order
        new_data = new_data.copy()
        event = {
            'type': 3,
            'price': min_ask_price,
            'size': bona_ask_size,
            'direction': -1,
            'deltaTime': np.random.randint(10, 21),
            'ClusterNo': current_cluster,
            'FraudType': 1,
            'OriginalSequenceNumber': sequence_prefix + ("00" + str(best_length + 1))[-3:]
        }
        new_data, extra_lob = generate_anomaly(new_data, event, extra_lob)
        inserted_rows.append(new_data)

        # 4. cancelling the non-bona-fide orders (last sent is cancelled first)
        for j in reversed(range(best_length)):
            new_data = new_data.copy()
            event = {
                'type': 2,
                'price': np.round(nonbona_order_price, 2),
                'size': nonbona_order_size,
                'direction': 1,
                # Only the first cancellation waits; the rest share its timestamp.
                'deltaTime': np.random.randint(100, 500) if j == best_length - 1 else 0,
                'ClusterNo': current_cluster,
                'FraudType': 1,
                'OriginalSequenceNumber': sequence_prefix + ("00" + str(2 * best_length - j + 1))[-3:]
            }
            new_data, extra_lob = generate_anomaly(new_data, event, extra_lob)
            inserted_rows.append(new_data)
            # Undo the price step taken when order j was sent.
            nonbona_order_price = nonbona_order_price - best_price_increment * ((j % best_rate) == 0)

        current_cluster += 1

    # manipulate on ask side
    for index in ask_manipulate_indexes:
        new_data = data.iloc[index].copy()
        extra_lob = {'bid_stack': [], 'ask_stack': []}
        min_ask_price = np.round(np.maximum(np.floor(new_data['AskPrice1'] * 0.9993 * 100) / 100, new_data['BidPrice1'] + 0.01), 2)
        max_bid_price = np.round(np.minimum(np.ceil(new_data['BidPrice1'] * 1.0003 * 100) / 100, min_ask_price - 0.01), 2)
        sequence_prefix = str(new_data['OriginalSequenceNumber']) + '-' + ("0000000" + str(index))[-7:] + '-'

        # 1. generate a bona-fide new order event from manipulator
        bona_bid_size = np.ceil(mean_bid_order * np.random.uniform(2, 3))
        event = {
            'type': 1,
            'price': max_bid_price,
            'size': bona_bid_size,
            'direction': 1,
            'deltaTime': np.random.randint(1, 6),
            'ClusterNo': current_cluster,
            'FraudType': 1,
            'OriginalSequenceNumber': sequence_prefix + "000"
        }
        new_data, extra_lob = generate_anomaly(new_data, event, extra_lob)
        inserted_rows.append(new_data)

        # 2. generate none-bona-fide new order events from manipulator
        # Selecting length, price increment and rate of orders per level of price upgrade that best achieves decrease in ask price
        if inject_onelevel:
            anchor_column = 'AskPrice1'
            # NOTE(review): this span is measured down to max_bid_price, whereas the
            # multi-level branch and both bid-side branches measure to the side's own
            # target (min_ask_price here). Kept as-is to preserve the generated data;
            # confirm whether min_ask_price was intended.
            price_span = new_data['AskPrice1'] - max_bid_price
        else:
            anchor_column = 'AskPrice5'
            price_span = new_data['AskPrice5'] - min_ask_price
        best_length, best_price_increment, best_rate = _select_spoof_ladder(price_span)

        # Sending non-bona fide ask orders
        nonbona_order_size = 100 * np.ceil((mean_ask_order * np.random.uniform(5, 6) // best_length) / 100)
        nonbona_order_size += np.ceil(nonbona_order_size * np.random.uniform(-0.1, 0.1))
        nonbona_order_price = new_data[anchor_column]
        for j in range(best_length):
            new_data = new_data.copy()
            # The price moves down one increment every `best_rate` orders.
            nonbona_order_price = nonbona_order_price - best_price_increment * ((j % best_rate) == 0)
            event = {
                'type': 1,
                'price': np.round(nonbona_order_price, 2),
                'size': nonbona_order_size,
                'direction': -1,
                'deltaTime': np.random.randint(10, 21),
                'ClusterNo': current_cluster,
                'FraudType': 1,
                'OriginalSequenceNumber': sequence_prefix + ("00" + str(j + 1))[-3:]
            }
            new_data, extra_lob = generate_anomaly(new_data, event, extra_lob)
            inserted_rows.append(new_data)

        # 3. Adding a trade of bona fide buy order
        new_data = new_data.copy()
        event = {
            'type': 3,
            'price': max_bid_price,
            'size': bona_bid_size,
            'direction': 1,
            'deltaTime': np.random.randint(10, 21),
            'ClusterNo': current_cluster,
            'FraudType': 1,
            'OriginalSequenceNumber': sequence_prefix + ("00" + str(best_length + 1))[-3:]
        }
        new_data, extra_lob = generate_anomaly(new_data, event, extra_lob)
        inserted_rows.append(new_data)

        # 4. cancelling the non-bona-fide orders (last sent is cancelled first)
        for j in reversed(range(best_length)):
            new_data = new_data.copy()
            event = {
                'type': 2,
                'price': np.round(nonbona_order_price, 2),
                'size': nonbona_order_size,
                'direction': -1,
                'deltaTime': np.random.randint(100, 500) if j == best_length - 1 else 0,
                'ClusterNo': current_cluster,
                'FraudType': 1,
                'OriginalSequenceNumber': sequence_prefix + ("00" + str(2 * best_length - j + 1))[-3:]
            }
            new_data, extra_lob = generate_anomaly(new_data, event, extra_lob)
            inserted_rows.append(new_data)
            nonbona_order_price = nonbona_order_price + best_price_increment * ((j % best_rate) == 0)

        current_cluster += 1

    # adjust the time and index of normal data
    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    data = data.assign(OriginalSequenceNumber=data['OriginalSequenceNumber'].astype(str))
    if inserted_rows:
        data = pd.concat((data, pd.DataFrame(inserted_rows)), axis=0)
    # A source row's sequence number is a strict prefix of its synthetic rows'
    # numbers, so the string sort keeps the source row first.
    data = data.sort_values(by=['index', 'OriginalSequenceNumber']).reset_index(drop=True)
    for cluster in range(int(current_cluster - fraud_num), int(current_cluster)):
        cluster_data = data[data['ClusterNo'] == cluster]
        cluster_data_indexes = cluster_data.index.values.astype(int)
        cluster_start_time = data.loc[cluster_data_indexes[0] - 1, 'TimeInMilliSecs'] if cluster_data_indexes[0] > 0 else 0
        cluster_end_time = data.loc[cluster_data_indexes[-1], 'TimeInMilliSecs']
        duration = cluster_end_time - cluster_start_time

        # Adding time delta to data occurring after cluster to account for synthesized data
        data.loc[cluster_data_indexes[-1] + 1:, 'TimeInMilliSecs'] += duration

    # Getting anomalies with modified timestamps
    anomalies = data[data['ClusterNo'].notna()]

    return data, anomalies


def generate_anomaly(starting_data, event, extra_lob, level_num=5):
    """
    Build the LOB row that results from applying one synthetic event.

    The starting row is copied, tagged with the event's cluster / fraud /
    sequence identifiers, moved forward in time by ``event['deltaTime']``,
    passed through ``update_lob`` and finally given its trade and
    cancellation indicator fields.

    event: dict
        {
            'type': 1 = new order, 2 = cancel order, anything else = trade,
            'price': order price,
            'size': order size,
            'direction': 1 = bid, -1 = ask,
            'deltaTime': amount added to 'TimeInMilliSecs',
            'ClusterNo': identifier shared by all events of one cluster,
            'FraudType': fraud label copied onto the row,
            'OriginalSequenceNumber': sequence identifier copied onto the row
        }

    Returns the updated row and the (possibly mutated) ``extra_lob`` stacks.
    """
    snapshot = starting_data.copy()
    for tag in ('ClusterNo', 'FraudType', 'OriginalSequenceNumber'):
        snapshot[tag] = event[tag]
    snapshot['TimeInMilliSecs'] += event['deltaTime']

    updated_data, extra_lob, updated_level = update_lob(snapshot, event, extra_lob, level_num)
    updated_data['ManipulatedLevel'] = updated_level

    kind = event['type']
    is_trade = kind not in (1, 2)
    bid_cancel = kind == 2 and event['direction'] == 1
    ask_cancel = kind == 2 and event['direction'] == -1

    # Trades carry the direction and size; cancels flag the affected side only.
    indicator_fields = {
        'TradeIndicator': event['direction'] if is_trade else 0,
        'TradeSize': event['size'] if is_trade else 0,
        'CancelledBidIndicator': 1 if bid_cancel else 0,
        'CancelledAskIndicator': 1 if ask_cancel else 0,
        'CancelledBidSize': event['size'] if bid_cancel else 0,
        'CancelledAskSize': event['size'] if ask_cancel else 0,
    }
    for column, value in indicator_fields.items():
        updated_data[column] = value
    return updated_data, extra_lob

def update_lob(old_lob, event, extra_lob, level_num=5):
    """
    Apply one order event to a copy of a LOB snapshot.

    old_lob: mapping with '<Side>Price<k>' / '<Side>Size<k>' entries for
        k = 1..level_num and Side in {Ask, Bid}; it is copied, not modified.
    event: dict providing 'price', 'size', 'direction' (1 = bid, -1 = ask)
        and 'type' (1 = new order, 2 = cancel, 3 = trade).
    extra_lob: dict with 'bid_stack' / 'ask_stack' lists. A level pushed out
        of the visible book by an insertion is appended to the matching
        stack, and popped back in when a visible level is emptied. Mutated
        in place.

    Returns (updated snapshot, extra_lob, touched level), where the level is
    1-based and 0 means the event matched no visible level.
    """
    book = old_lob.copy()
    price = event['price']
    size = event['size']
    direction = event['direction']
    event_type = event['type']
    touched_level = 0

    if direction == 1:
        side, stack_key = 'Bid', 'bid_stack'
    elif direction == -1:
        side, stack_key = 'Ask', 'ask_stack'
    else:
        # Unknown direction: the book is returned unchanged.
        return book, extra_lob, touched_level

    price_cols = [f'{side}Price{lvl}' for lvl in range(1, level_num + 1)]
    size_cols = [f'{side}Size{lvl}' for lvl in range(1, level_num + 1)]
    deepest = level_num - 1

    if event_type == 1:
        # New order: open a new level where it improves on a resting price,
        # or add to the size of the level with the same price.
        for pos in range(level_num):
            resting_price = book[price_cols[pos]]
            improves = price > resting_price if direction == 1 else price < resting_price
            if improves:
                touched_level = pos + 1
                # Remember the level that is about to fall off the book.
                extra_lob[stack_key].append((book[price_cols[deepest]], book[size_cols[deepest]]))
                for dst in range(deepest, pos, -1):
                    book[price_cols[dst]] = book[price_cols[dst - 1]]
                    book[size_cols[dst]] = book[size_cols[dst - 1]]
                book[price_cols[pos]] = price
                book[size_cols[pos]] = size
                break
            if price == resting_price:
                touched_level = pos + 1
                book[size_cols[pos]] += size
                break
    elif event_type in (2, 3):
        # Cancel and trade both remove size from the level with this price.
        # TODO: if the size is larger than the current size, a trade should move on to consume the next level
        for pos in range(level_num):
            if not (price == book[price_cols[pos]]):
                continue
            touched_level = pos + 1
            book[size_cols[pos]] = max(0, book[size_cols[pos]] - size)
            if book[size_cols[pos]] == 0:
                # Level emptied: shift deeper levels up by one ...
                for dst in range(pos, deepest):
                    book[price_cols[dst]] = book[price_cols[dst + 1]]
                    book[size_cols[dst]] = book[size_cols[dst + 1]]
                # ... and refill the last level from the stack, or zero it.
                if len(extra_lob[stack_key]) > 0:
                    refill_price, refill_size = extra_lob[stack_key].pop()
                else:
                    refill_price, refill_size = 0, 0
                book[price_cols[deepest]] = refill_price
                book[size_cols[deepest]] = refill_size
            break
    return book, extra_lob, touched_level

def rawdata_to_formatted(stock, date, level_num):
    """
    Load one stock/day of raw message and order-book CSVs and format them.

    The two files are read from the module-level ``path`` directory, joined
    column-wise, stripped of type-5 messages, given a millisecond timestamp
    plus trade / cancellation indicator columns, and written to
    ``dataset/formatted_LOBSTER/<stock>_<date>_formatted.csv``.

    Parameters:
    - stock: str, ticker used in the file names.
    - date: str, date string used in the file names.
    - level_num: int, number of book levels to keep (4 columns per level).

    Returns:
    - stock_data: pd.DataFrame, the formatted data (also saved to disk).
    - mean_bid_order: float, mean size of type-1 messages with direction 1.
    - mean_ask_order: float, mean size of type-1 messages with direction -1.
    """
    global path  # raw-data directory; must exist at module level before the call
    file_prefix = path + stock + "_" + date
    # Importing data sets
    messages = pd.read_csv(file_prefix + "_34200000_57600000_message_10.csv", header=None, usecols=range(6))
    lobs = pd.read_csv(file_prefix + "_34200000_57600000_orderbook_10.csv", header=None, usecols=range(level_num * 4))

    levels = range(1, level_num + 1)
    messages.columns = ['time', 'type', 'id', 'size', 'price', 'direction']
    lobs.columns = [
        name
        for lvl in levels
        for name in (f'AskPrice{lvl}', f'AskSize{lvl}', f'BidPrice{lvl}', f'BidSize{lvl}')
    ]

    # Raw prices are divided by 10000 and rounded to two decimals.
    price_columns = [name for lvl in levels for name in (f'AskPrice{lvl}', f'BidPrice{lvl}')]
    lobs[price_columns] = (lobs[price_columns] / 10000).round(2)

    # Combining LOB data with messages
    combined = pd.concat((messages, lobs), axis=1)
    stock_data = combined[combined['type'] != 5].reset_index()  # get rid of hidden executions
    stock_data['Date'] = date
    stock_data['StockSymbol'] = stock
    stock_data['OriginalSequenceNumber'] = stock_data['id']

    # Reworking time: split the scaled timestamp into h / min / s / ms parts.
    scaled_time = stock_data['time'] * 1000000
    hours = (scaled_time // (1000000 * 3600)).astype(int)
    after_hours = scaled_time - hours * 1000000 * 3600
    minutes = (after_hours // (1000000 * 60)).astype(int)
    after_minutes = after_hours - minutes * 1000000 * 60
    seconds = (after_minutes // 1000000).astype(int)
    # The "+ 1" offset on the millisecond part is carried over unchanged.
    milliseconds = ((after_minutes - seconds * 1000000) // 1000 + 1).astype(int)
    stock_data['TimeInMilliSecs'] = hours * 3600 * 1000 + minutes * 60 * 1000 + seconds * 1000 + milliseconds

    # Creating a trade indicator and trade size columns
    is_execution = stock_data['type'] == 4
    stock_data['TradeIndicator'] = 0
    stock_data.loc[is_execution, 'TradeIndicator'] = stock_data.loc[is_execution, 'direction']
    stock_data['TradeSize'] = stock_data['TradeIndicator'] * stock_data['size']

    # Message types 2 and 3 are both counted as cancellations.
    is_cancel = (stock_data['type'] == 2) | (stock_data['type'] == 3)
    stock_data['CancelledBidIndicator'] = 0
    stock_data['CancelledAskIndicator'] = 0
    stock_data.loc[is_cancel & (stock_data['direction'] == 1), 'CancelledBidIndicator'] = 1
    stock_data.loc[is_cancel & (stock_data['direction'] == -1), 'CancelledAskIndicator'] = 1
    stock_data['CancelledBidSize'] = stock_data['CancelledBidIndicator'] * stock_data['size']
    stock_data['CancelledAskSize'] = stock_data['CancelledAskIndicator'] * stock_data['size']

    is_new_order = stock_data['type'] == 1
    mean_bid_order = stock_data.loc[is_new_order & (stock_data['direction'] == 1), 'size'].mean()
    mean_ask_order = stock_data.loc[is_new_order & (stock_data['direction'] == -1), 'size'].mean()

    stock_data = stock_data.drop(columns=['index', 'time', 'type', 'id', 'size', 'price', 'direction'])
    for label_column in ('ClusterNo', 'FraudType', 'ManipulatedLevel'):
        stock_data[label_column] = np.nan

    # Expose the row position as an 'index' column.
    stock_data = stock_data.reset_index()
    # Saving the processed data to a CSV file
    output_path = "dataset/formatted_LOBSTER/"
    os.makedirs(output_path, exist_ok=True)  # Ensure directory exists
    output_file = f"{output_path}{stock}_{date}_formatted.csv"
    stock_data.to_csv(output_file, index=False)
    print(f"Saved formatted data to {output_file}")
    return stock_data, mean_bid_order, mean_ask_order

def concat_all_csv(data_dir="dataset/formatted_LOBSTER/", input_file_name="*_Injected.csv", fraud_num=50):
    """
    Concatenate every per-stock CSV matching a pattern into one combined CSV.

    Files are processed in sorted name order and the k-th file (0-based) has
    ``k * fraud_num`` added to its non-NaN 'ClusterNo' values so that cluster
    numbers stay distinct across files. The combined frame is sorted by
    stock, date, time, 'index' and sequence number, then written next to the
    inputs as ``All_Stocks`` + ``input_file_name[1:]`` (';'-separated).

    Parameters:
    - data_dir: str, directory holding the input files and the output file.
    - input_file_name: str, glob pattern; its first character (the '*') is
      dropped to build the output file name.
    - fraud_num: int, number of clusters assumed per file (the offset step).

    Returns:
    - pd.DataFrame, the combined data.

    Raises:
    - ValueError: if no input file matches the pattern.
    """
    output_file = os.path.join(data_dir, "All_Stocks"+input_file_name[1:])
    output_abs = os.path.abspath(output_file)
    # glob returns files in arbitrary order; sorting makes the ClusterNo offsets
    # reproducible and identical across calls with parallel file sets. The
    # output file itself matches the pattern, so it is excluded to keep a
    # re-run from ingesting the previous result.
    files = sorted(
        f for f in glob.glob(os.path.join(data_dir, input_file_name))
        if os.path.abspath(f) != output_abs
    )
    if not files:
        raise ValueError(f"No files matching {input_file_name} found in {data_dir}")

    all_data = []
    for file_nb, f in enumerate(files):
        df = pd.read_csv(f, sep=';', dtype={'OriginalSequenceNumber': str})
        # Only adjust ClusterNo for non-NaN values to avoid errors
        mask = df['ClusterNo'].notna()
        df.loc[mask, 'ClusterNo'] += file_nb * fraud_num
        all_data.append(df)
    all_df = pd.concat(all_data, ignore_index=True)
    all_df = all_df.sort_values(by=['StockSymbol', 'Date', 'TimeInMilliSecs', 'index', 'OriginalSequenceNumber']).reset_index(drop=True)
    all_df['OriginalSequenceNumber'] = all_df['OriginalSequenceNumber'].astype(str)
    all_df.to_csv(output_file, index=False, sep=';')
    print(f"Concatenated all {input_file_name} data into one csv.")
    return all_df

if __name__ == "__main__":
    # Raw-data directory. rawdata_to_formatted() reads this module-level name.
    path = "dataset/rawdata_LOBSTER/"
    # Suffix of the message files that rawdata_to_formatted() opens.
    message_suffix = "_34200000_57600000_message_10.csv"
    level_num = 5

    # Getting the (stock, date) pairs that actually exist on disk. File names
    # are "<stock>_<date><message_suffix>", so the ticker is everything before
    # the first underscore (tickers need not be exactly four characters), and
    # only combinations with a real message file are processed.
    dates_by_stock = {}
    for file in os.listdir(path):
        if not (isfile(join(path, file)) and file.endswith(message_suffix)):
            continue
        stock, _, date = file[:-len(message_suffix)].partition("_")
        dates_by_stock.setdefault(stock, set()).add(date)

    # Sorted iteration keeps runs reproducible.
    for stock in sorted(dates_by_stock):
        print(stock)
        for date in sorted(dates_by_stock[stock]):
            print(date)
            format_data, mean_bid_order, mean_ask_order = rawdata_to_formatted(stock, date, level_num)
            # Adding spoofing data into 5-level LOB data
            # injection_data, anomalies = add_spoofing_data(format_data, fraud_num=50, fraction=0.5, mean_bid_order=mean_bid_order, mean_ask_order=mean_ask_order)
            # injection_data.to_csv("dataset/formatted_LOBSTER/" + stock + "_" + date + "_Injected.csv", index=False, sep=';')
            # anomalies.to_csv("dataset/formatted_LOBSTER/" + stock + "_" + date + "_Anomalies.csv", index=False, sep=';')

            # Adding spoofing data into 1-level LOB data
            injection_data, anomalies = add_spoofing_data(format_data, fraud_num=50, fraction=0.5, mean_bid_order=mean_bid_order, mean_ask_order=mean_ask_order, inject_onelevel=True)
            injection_data.to_csv("dataset/formatted_LOBSTER/" + stock + "_" + date + "_InjectedL1.csv", index=False, sep=';')
            anomalies.to_csv("dataset/formatted_LOBSTER/" + stock + "_" + date + "_AnomaliesL1.csv", index=False, sep=';')
            print("Save the dataset injected by anomalies.")

    # concat_all_csv(data_dir="dataset/formatted_LOBSTER/", input_file_name ="*_Injected.csv")
    # concat_all_csv(data_dir="dataset/formatted_LOBSTER/", input_file_name = "*_Anomalies.csv")
    concat_all_csv(data_dir="dataset/formatted_LOBSTER/", input_file_name ="*_InjectedL1.csv")
    concat_all_csv(data_dir="dataset/formatted_LOBSTER/", input_file_name = "*_AnomaliesL1.csv")