import os
import pandas as pd 
import numpy as np 

payload = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
df = pd.DataFrame(payload[0])[['Symbol', 'GICS Sector', 'Date added']]
df["Date added"] = pd.to_datetime(df["Date added"])
print(df.head())
df = df[df["Date added"] < "2019-01-01"]
all_stocks = df['Symbol'].values.tolist()
print(len(all_stocks))
print(all_stocks)

weights = pd.DataFrame({"Symbol" : [], "GICS Sector" : [], "sort" : [], "weight" : []})
categories = pd.read_csv("stock_categories.csv")

# custom sorting of the labels
sorting = {
    "Communication Services" : 1,
    "Information Technology" : 2,
    "Financials" : 3,
    "Consumer Discretionary" : 4,
    "Consumer Staples" : 5,
    "Health Care" : 6,
    "Industrials" : 7,
    "Energy" : 8,
    "Materials" : 9,
    "Utilities" : 10,
    "Real Estate" : 11
}

if __name__ == '__main__':
    files = []
    dividends = []
    lst = os.listdir(".")
    lst.sort()
    for r, filename_data in enumerate(lst):
        if filename_data.split(".")[0] in all_stocks and filename_data.split(".")[-1] == "csv" and filename_data.split("_")[0] not in ["X", "stock"]:
            stock_abbrv = filename_data.split("_")[-1].split(".")[0]
            CSV = pd.read_csv("{}".format(filename_data))
            # computes the weight of the stock -- strength of the stock
            weight = CSV.iloc[-1]["close"] * CSV.iloc[-1]["volume"]
            category = categories.loc[categories["Symbol"] == stock_abbrv]["GICS Sector"].values[0]
            weights.loc[len(weights)] = [stock_abbrv, category, sorting[category], weight]
            files.append([CSV, stock_abbrv])
            print(stock_abbrv, filename_data)

            # taking the dividend
            CSV = pd.read_csv("{}_dividend.csv".format(stock_abbrv))
            dividends.append([CSV, stock_abbrv])

    weights = weights.sort_values(by="Symbol")
    weights["alphabetical_index"] = np.arange(len(weights)) 
    weights = weights.sort_values(by=["sort", "Symbol"])
    weights["categorical_index"] = np.arange(len(weights)) 
    weights = weights.sort_values(by="weight", ascending=False)
    weights.to_csv("stock_weights.csv", index=False)


    for argument in ["open", "close", "high", "low"]:
        X = files[0][0][["formatted_date", argument]] # takes the first stock, CSV and then chooses 2 columns
        X_index = files[0][1]  # takes abbreviation of the first stock
        # renaming date column and closing value (or other argument)
        X = X.rename(columns={"formatted_date" : "Date", argument : argument + "_{}".format(X_index)})
        # print(X.head())
        for i in range(len(files) - 1):
            Y = files[i + 1][0][["formatted_date", argument]]
            Y_index = files[i + 1][1]
            Y = Y.rename(columns={"formatted_date" : "Date", argument : argument + "_{}".format(Y_index)})
            # print(Y.head())
            X = pd.merge(X, Y, on="Date", how="inner") #considers the intersection of keys

        # X = X.iloc[::-1] # reverse the order of rows so the oldest stock value is first
        X.to_csv("X_{}.csv".format(argument), index=False)

        if (argument == "open"):
            dfOpen = X
        elif (argument == "close"):
            dfClose = X

    df = pd.merge(dfOpen, dfClose, on="Date", how="inner")
    df.to_csv("X_OpenClose.csv", index=False)

    allDates = X[["Date"]]
    D = dividends[0][0][["formatted_date", "amount"]] # takes the date + amount of dividents (empty if none)
    D_index = dividends[0][1]  # takes abbreviation of the first stock

    # columns will be date, the abbrv of the stock
    D = D.rename(columns={"formatted_date" : "Date", "amount" : "{}".format(D_index)})
    # expanding to all dates
    D = pd.merge(allDates, D, on="Date", how="left").fillna(0) # consider the keys of left dataframe

    for i in range(len(files) - 1):
        E = dividends[i + 1][0][["formatted_date", "amount"]]
        E_index = dividends[i + 1][1]
        E = E.rename(columns={"formatted_date" : "Date", "amount" : "{}".format(E_index)})
        # print(Y.head())
        D = pd.merge(D, E, on="Date", how="left").fillna(0) # consider the keys of left dataframe

    D.to_csv("Dividends.csv", index=False)
    