{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [
        "riPDxNkXb0rN",
        "6lI3JYbIhSpE",
        "GkeiN3s0KP6C",
        "Uu2z7ejxqi2P",
        "nrvKcj5OuSp2"
      ]
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "#1. Part 1 - Synthetic Experimentation"
      ],
      "metadata": {
        "id": "riPDxNkXb0rN"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "from scipy.optimize import minimize\n",
        "import math\n",
        "import functools\n",
        "import operator\n",
        "import time\n",
        "from tqdm import tqdm\n",
        "import os\n",
        "import random\n",
        "import itertools\n",
        "from typing import List, Tuple\n",
        "\n",
        "random.seed(42)\n",
        "np.random.seed(42)\n"
      ],
      "metadata": {
        "id": "lXb0uwKGcFYQ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "##1.1 - preparations"
      ],
      "metadata": {
        "id": "YAcMFZ_zb7J5"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Nk91eA6sbu3s"
      },
      "outputs": [],
      "source": [
        "def nyse_n(folder_path):\n",
        "    \"\"\"\n",
        "    Reads CSV files from the specified folder and returns a DataFrame with 21 columns and 5526 rows,\n",
        "    where each column is named after its corresponding CSV file (without the .csv extension),\n",
        "    and contains data from rows 5652 to 11177.\n",
        "\n",
        "    Parameters:\n",
        "    folder_path (str): The path to the folder containing the CSV files.\n",
        "\n",
        "    Returns:\n",
        "    pd.DataFrame: The processed DataFrame.\n",
        "    \"\"\"\n",
        "    # List to hold each column of data and their corresponding column names\n",
        "    data_columns = []\n",
        "    column_names = []\n",
        "\n",
        "    # Get a sorted list of CSV files in the folder to ensure consistent column order\n",
        "    csv_files = sorted([f for f in os.listdir(folder_path) if f.endswith('.csv')])\n",
        "\n",
        "    # Check that there are exactly 21 CSV files\n",
        "    if len(csv_files) != 21:\n",
        "        raise ValueError(f\"Expected 21 CSV files, but found {len(csv_files)}.\")\n",
        "\n",
        "    # Loop through each CSV file\n",
        "    for file_name in csv_files:\n",
        "        file_path = os.path.join(folder_path, file_name)\n",
        "\n",
        "        # Read the CSV file\n",
        "        df = pd.read_csv(file_path, header=None)\n",
        "\n",
        "        # Check if the CSV file has at least two columns\n",
        "        if df.shape[1] < 2:\n",
        "            raise ValueError(f\"File {file_name} does not have at least two columns.\")\n",
        "\n",
        "        # Extract the second column (index 1)\n",
        "        second_column = df.iloc[:, 1].astype(float).reset_index(drop=True)\n",
        "\n",
        "        # Append the column to the list\n",
        "        data_columns.append(second_column)\n",
        "\n",
        "        # Extract the file name without the '.csv' extension to use as column name\n",
        "        column_name = os.path.splitext(file_name)[0]\n",
        "        column_names.append(column_name)\n",
        "\n",
        "    # Combine all columns into a DataFrame\n",
        "    combined_df = pd.concat(data_columns, axis=1)\n",
        "\n",
        "    # Set the column names based on the file names\n",
        "    combined_df.columns = column_names\n",
        "\n",
        "    # Check if the DataFrame has the expected number of rows\n",
        "    if combined_df.shape[0] != 11178:\n",
        "        raise ValueError(\"The combined DataFrame does not have 11178 rows.\")\n",
        "\n",
        "    # Slice the DataFrame to keep rows from index 5652 to 11177 (zero-based indexing)\n",
        "    processed_df = combined_df.iloc[5651:11178].reset_index(drop=True)\n",
        "\n",
        "    # Return the processed DataFrame\n",
        "    return processed_df\n",
        "\n",
        "def load_and_construct_nyseo_dataframe(file_path, num_columns):\n",
        "    column_names = [str(i) for i in range(1, num_columns + 1)]\n",
        "    df = pd.read_csv(file_path, delimiter='\\s+', header=None, names=column_names)\n",
        "    return df\n",
        "def extract_columns(df, x, y):\n",
        "    new_df = df[[str(x), str(y)]]\n",
        "    return new_df\n",
        "\n",
        "########## NYSE(O) ##########\n",
        "folder_path_nyseo = './NYSE(O).txt'\n",
        "df_O = load_and_construct_nyseo_dataframe(folder_path_nyseo, 36)\n",
        "print(\"\\nDataFrame with row ids from 1 to K: \\n\", df_O)\n",
        "\n",
        "comb1_2_stocks = [20, 23] # Iroquois & Kin_Ark\n",
        "comb2_2_stocks = [6, 23] # Comm_Metals & Kin_Ark\n",
        "comb3_2_stocks = [6, 26] # Comm_Metals & Meicco\n",
        "comb1_3_stocks = [\"25\", \"23\", \"4\"]\n",
        "comb2_3_stocks = [\"35\", \"13\", \"23\"]\n",
        "comb3_3_stocks = [\"16\", \"9\", \"32\"]\n",
        "\n",
        "# [NYSEO] comb 1-3, extensive in UP, EG, etc.\n",
        "comb1_two = extract_columns(df_O, comb1_2_stocks[0], comb1_2_stocks[1])\n",
        "comb2_two = extract_columns(df_O, comb2_2_stocks[0], comb2_2_stocks[1])\n",
        "comb3_two = extract_columns(df_O, comb3_2_stocks[0], comb3_2_stocks[1])\n",
        "# [NYSEO] comb 4-6, used in Yang et al..\n",
        "comb4_three = df_O[comb1_3_stocks]\n",
        "comb5_three = df_O[comb2_3_stocks]\n",
        "comb6_three = df_O[comb3_3_stocks]"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "########## NYSE(N) ##########\n",
        "folder_path_nysen = \"./NYSE(N)\"\n",
        "df_N = nyse_n(folder_path)\n",
        "\n",
        "# Mock df_N with placeholder numbers (remove this block in real usage)\n",
        "columns = ['Bond', 'Cash', 'ahp', 'alcoa', 'amerb', 'coke', 'dow', 'dupont',\n",
        "           'ford', 'ge', 'gm', 'hp', 'ibm', 'inger', 'jnj', 'kimbc', 'merck',\n",
        "           'mmm', 'morris', 'pandg', 'schlum']\n",
        "\n",
        "asset_cols = [c for c in df_N.columns if c not in ('Bond', 'Cash')]\n",
        "\n",
        "# 1) comb7_5 : 5 random asset columns\n",
        "comb7_5_cols = random.sample(asset_cols, 5)\n",
        "comb7_5 = df_N[comb7_5_cols]\n",
        "\n",
        "# 2) comb8_5 : 5 other columns disjoint from comb7_5\n",
        "remaining_after_7 = [c for c in asset_cols if c not in comb7_5_cols]\n",
        "comb8_5_cols = random.sample(remaining_after_7, 5)\n",
        "comb8_5 = df_N[comb8_5_cols]\n",
        "\n",
        "# 3) comb9_8 : 8 random asset columns\n",
        "comb9_8_cols = random.sample(asset_cols, 8)\n",
        "comb9_8 = df_N[comb9_8_cols]\n",
        "\n",
        "# 4) comb10_8 : 8 other columns disjoint from comb9_8\n",
        "remaining_after_9 = [c for c in asset_cols if c not in comb9_8_cols]\n",
        "comb10_8_cols = random.sample(remaining_after_9, 8)\n",
        "comb10_8 = df_N[comb10_8_cols]\n",
        "\n",
        "# 5) comb11_11 : 11 random asset columns\n",
        "comb11_11_cols = random.sample(asset_cols, 11)\n",
        "comb11_11 = df_N[comb11_11_cols]\n",
        "\n",
        "# 6) comb12_11 : must include every asset not in comb11_11, then fill to 11\n",
        "leftover = [c for c in asset_cols if c not in comb11_11_cols]\n",
        "extra_needed = 11 - len(leftover)\n",
        "extra_cols = random.sample(comb11_11_cols, extra_needed) if extra_needed > 0 else []\n",
        "comb12_11_cols = leftover + extra_cols\n",
        "comb12_11 = df_N[comb12_11_cols]\n",
        "\n",
        "# Show the selected column names for verification\n",
        "print(\"comb7_5 cols :\", comb7_5_cols)\n",
        "print(\"comb8_5 cols :\", comb8_5_cols)\n",
        "print(\"comb9_8 cols :\", comb9_8_cols)\n",
        "print(\"comb10_8 cols:\", comb10_8_cols)\n",
        "print(\"comb11_11 cols:\", comb11_11_cols)\n",
        "print(\"comb12_11 cols:\", comb12_11_cols)\n",
        "\n",
        "# comb1_two,comb2_two,comb3_two,comb4_three,comb5_three,comb6_three\n",
        "# comb7_5, comb8_5, comb9_8, comb10_8, comb11_11, comb12_11"
      ],
      "metadata": {
        "id": "RFxr64qXcxPJ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "##1.2 - utilities"
      ],
      "metadata": {
        "id": "J8QUw2i0b9X7"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def bcrp(df):\n",
        "    ratio_matrix = df.values  # Shape: (Time Steps, Assets)\n",
        "    N = ratio_matrix.shape[1]  # Number of assets\n",
        "\n",
        "    def objective(weights):\n",
        "        # Calculate portfolio returns over time\n",
        "        V = np.dot(ratio_matrix, weights)\n",
        "        # Ensure no zero or negative values in V to avoid log(0) or log of negative numbers\n",
        "        if np.any(V <= 0):\n",
        "            return np.inf  # Penalize invalid weights leading to non-positive returns\n",
        "        # Calculate cumulative log return\n",
        "        log_V = np.log(V)\n",
        "        CR = np.sum(log_V)\n",
        "        return -CR  # Negative sign because we are minimizing\n",
        "\n",
        "    # Constraints: weights sum to 1\n",
        "    constraints = {'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1}\n",
        "    # Bounds: weights between 0 and 1\n",
        "    bounds = [(0, 1) for _ in range(N)]\n",
        "    # Initial guess: equally distributed weights\n",
        "    initial_guess = np.ones(N) / N\n",
        "\n",
        "    # Optimize using SLSQP (Sequential Least Squares Programming)\n",
        "    result = minimize(\n",
        "        objective,\n",
        "        initial_guess,\n",
        "        method='SLSQP',\n",
        "        bounds=bounds,\n",
        "        constraints=constraints,\n",
        "        options={'disp': False}\n",
        "    )\n",
        "\n",
        "    if not result.success:\n",
        "        print(\"Optimization failed:\", result.message)\n",
        "        return None\n",
        "\n",
        "    optimal_weights = result.x\n",
        "    dot_product_result = df.values @ optimal_weights\n",
        "    wealth = np.prod(dot_product_result)\n",
        "\n",
        "    # print(f\"\\nBCRP Wealth:{wealth}\")\n",
        "    # print(f\"Optimal weights: {optimal_weights}\")\n",
        "    return optimal_weights, wealth\n",
        "\n",
        "def opt(df: pd.DataFrame) -> float:\n",
        "    top_values = df.max(axis=1)\n",
        "    final_wealth = top_values.prod()\n",
        "    return final_wealth\n",
        "\n",
        "def adv(df: pd.DataFrame) -> float:\n",
        "    top_values = df.min(axis=1)\n",
        "    final_wealth = top_values.prod()\n",
        "    return final_wealth\n",
        "\n",
        "def ucrp(df):\n",
        "  # Convert DataFrame to numpy array\n",
        "  price_relatives = df.values\n",
        "  num_steps, num_assets = price_relatives.shape\n",
        "  # Initialize portfolio weights (uniform distribution)\n",
        "  weights = np.ones(num_assets) / num_assets\n",
        "  # Initialize portfolio value\n",
        "  portfolio_values = np.ones(num_steps)\n",
        "\n",
        "  # Iterate through each time step\n",
        "  cur_wealth = 1\n",
        "  for t in range(num_steps):\n",
        "    # Calculate the portfolio value at the current time step\n",
        "    portfolio_values[t] = np.dot(weights, price_relatives[t])\n",
        "    # get current wealth\n",
        "    sum = 0\n",
        "    for i,j in zip(weights, price_relatives[t]):\n",
        "      sum += i * j\n",
        "    cur_wealth = cur_wealth * sum\n",
        "    # Update the weights based on the price relatives\n",
        "    weights = (weights * price_relatives[t]) / np.dot(weights, price_relatives[t])\n",
        "  return cur_wealth\n",
        "\n",
        "def am(df):\n",
        "    price_relatives = df.values  # shape = (T, N)\n",
        "    T, N = price_relatives.shape\n",
        "\n",
        "    # Initial weights for uniform buy-and-hold\n",
        "    weights = np.ones(N) / N\n",
        "\n",
        "    # Each asset’s cumulative product of price relatives\n",
        "    # => how much each $1 in that asset grows over time\n",
        "    cumulative_growth = np.cumprod(price_relatives, axis=0)  # shape = (T, N)\n",
        "\n",
        "    # At the final time step (T-1), the value of the portfolio is\n",
        "    # sum_i [ initial_fraction_in_asset_i * cumulative_growth(T-1, i) ]\n",
        "    final_wealth = np.sum(weights * cumulative_growth[-1, :])\n",
        "    return final_wealth\n",
        "\n",
        "def best(df):\n",
        "    # Convert DataFrame to NumPy array: shape = (T, N)\n",
        "    price_relatives = df.values\n",
        "    # Initialize wealth for each stock\n",
        "    # wealth[i] will hold the product of all price relatives for stock i\n",
        "    _, num_assets = price_relatives.shape\n",
        "    wealth = np.ones(num_assets)\n",
        "    # Accumulate final wealth for each single-stock buy-and-hold\n",
        "    for row in price_relatives:\n",
        "        wealth *= row  # multiply each stock's wealth by that stock's price relative at this time step\n",
        "    # Pick the best single stock (largest final wealth)\n",
        "    return wealth.max()  # same as wealth[np.argmax(wealth)]\n",
        "\n",
        "def worst(df):\n",
        "    # Convert DataFrame to NumPy array: shape = (T, N)\n",
        "    price_relatives = df.values\n",
        "    # Initialize wealth for each stock\n",
        "    # wealth[i] will hold the product of all price relatives for stock i\n",
        "    _, num_assets = price_relatives.shape\n",
        "    wealth = np.ones(num_assets)\n",
        "    # Accumulate final wealth for each single-stock buy-and-hold\n",
        "    for row in price_relatives:\n",
        "        wealth *= row  # multiply each stock's wealth by that stock's price relative at this time step\n",
        "    # Pick the best single stock (largest final wealth)\n",
        "    return wealth.min()  # same as wealth[np.argmax(wealth)]\n",
        "\n",
        "def gm(df):\n",
        "    # Convert to a NumPy array for convenience\n",
        "    arr = df.values  # shape = (T, N)\n",
        "    # final_wealth[i] = product of all price relatives for asset i\n",
        "    final_wealths = arr.prod(axis=0)  # shape = (N,)\n",
        "    # geometric mean of these final wealths\n",
        "    gm_value = np.prod(final_wealths) ** (1.0 / len(final_wealths))\n",
        "    return gm_value\n",
        "\n",
        "def generate_simplex(m, step_size=0.01):\n",
        "    \"\"\"\n",
        "    Generate all m-dimensional weight vectors (w1,...,wm) with increments step_size\n",
        "    such that sum(wi) = 1. Returns a list of lists of length m.\n",
        "    \"\"\"\n",
        "    def dfs(dim, total):\n",
        "        if dim == 1:\n",
        "            yield [total]\n",
        "        else:\n",
        "            steps = int(round(total / step_size))\n",
        "            for i in range(steps + 1):\n",
        "                w = i * step_size\n",
        "                for rest in dfs(dim - 1, total - w):\n",
        "                    yield [w] + rest\n",
        "\n",
        "    return list(dfs(m, 1.0))\n",
        "\n",
        "def up(df, step_size=0.01, plot=False):\n",
        "    \"\"\"\n",
        "    Computes the Universal Portfolio using a discrete set of weight vectors\n",
        "    on the simplex with granularity 'step_size'.\n",
        "\n",
        "    df: A pandas DataFrame of prices/returns, shape (T, m)\n",
        "    step_size: The increment for generating the simplex.\n",
        "\n",
        "    Returns:\n",
        "        to_plot: The log wealth over time (list of length T+1)\n",
        "        cur_wealth: Final wealth (scalar)\n",
        "    \"\"\"\n",
        "    # 1) Enumerate the simplex\n",
        "    simplex_list = generate_simplex(df.shape[1], step_size)\n",
        "    # Convert to a NumPy array of shape (N, m)\n",
        "    simplex = np.array(simplex_list, dtype=np.float64)\n",
        "    N = simplex.shape[0]\n",
        "\n",
        "    # 2) Initialize each combination's cumulative performance\n",
        "    cumulative_simplex = np.ones(N, dtype=np.float64)\n",
        "\n",
        "    # 3) Initialize portfolio weights (e.g. uniform)\n",
        "    m = df.shape[1]\n",
        "    weights = np.ones(m, dtype=np.float64) / m\n",
        "\n",
        "    to_plot = [0.0]     # we keep track of log-wealth\n",
        "    cur_wealth = 1.0    # actual wealth\n",
        "\n",
        "    # Convert df to NumPy for faster iteration\n",
        "    data = df.values\n",
        "\n",
        "    for t in range(len(data)):\n",
        "        row = data[t, :]  # shape (m,)\n",
        "\n",
        "        # (A) Compute current portfolio return\n",
        "        ret_t = weights @ row  # dot product\n",
        "        cur_wealth *= ret_t\n",
        "        # log-wealth\n",
        "        to_plot.append(to_plot[-1] + np.log(ret_t))\n",
        "\n",
        "        # (B) Update each weight vector's performance\n",
        "        #     payoff_j = sum_j ( w_j[i] * row[i] )\n",
        "        payoff = simplex @ row  # shape (N,)\n",
        "        cumulative_simplex *= payoff\n",
        "\n",
        "        # (C) Compute the new mixture distribution\n",
        "        #     (the probability of each weight vector)\n",
        "        total_perf = cumulative_simplex.sum()\n",
        "        performance = cumulative_simplex / total_perf  # shape (N,)\n",
        "\n",
        "        # (D) Derive the new portfolio by mixing over all weight vectors\n",
        "        #     weights[i] = sum_j (performance_j * simplex[j, i])\n",
        "        #     which is a matrix multiplication\n",
        "        weights = performance @ simplex  # shape (m,)\n",
        "\n",
        "    return to_plot, cur_wealth\n",
        "\n",
        "def eg(df, eta, plot=False):\n",
        "  weights = [1/df.shape[1] for i in range(df.shape[1])]\n",
        "  cur_wealth = 1\n",
        "  to_plot = [0]\n",
        "  for _, row in df.iterrows():\n",
        "    stock_vector = row.to_list()\n",
        "\n",
        "    #compute current wealth\n",
        "    sum = 0\n",
        "    for i,j in zip(weights, stock_vector):\n",
        "      sum += i * j\n",
        "    cur_wealth = cur_wealth * sum\n",
        "    #compute cumulative return of current weights\n",
        "    dR = np.log(sum)\n",
        "    to_plot.append(to_plot[-1]+dR)\n",
        "\n",
        "    sum = 0\n",
        "    for idx, i in enumerate(stock_vector):\n",
        "      sum += weights[idx]*i\n",
        "\n",
        "    for idx in range(len(weights)):\n",
        "      weights[idx] = weights[idx]*math.exp(eta*stock_vector[idx]/sum)\n",
        "    sum = np.sum(weights)\n",
        "\n",
        "    for idx in range(len(weights)):\n",
        "      weights[idx] = weights[idx]/sum\n",
        "\n",
        "  print(f\"Total accumulated wealth is: {cur_wealth}\")\n",
        "  print(f\"Final Cumulative Lograthimic Wealth: {to_plot[-1]}\")\n",
        "  print(weights)\n",
        "  if plot:\n",
        "    plt.figure(figsize=(20, 8))\n",
        "    plt.plot(to_plot)\n",
        "    # plt.plot(comb1.iloc[:, 0].tolist())\n",
        "    # plt.plot(comb1.iloc[:, 1].tolist())\n",
        "    plt.grid(True)\n",
        "    plt.show()\n",
        "  return to_plot, cur_wealth\n",
        "\n",
        "def set_seed(seed):\n",
        "    random.seed(seed)\n",
        "    np.random.seed(seed)\n",
        "\n",
        "def forecast_perm(values, p):\n",
        "    n = len(values)\n",
        "    asc  = sorted(range(n), key=lambda i: values[i])\n",
        "    desc = asc[::-1]\n",
        "\n",
        "    u = random.random()\n",
        "    if p <= 0.5:\n",
        "        if u < 1 - 2*p:                # Dirac on ascending\n",
        "            return asc\n",
        "        else:                           # uniform among n! perms\n",
        "            return random.sample(asc, k=n)\n",
        "    else:\n",
        "        if u < 2*(1 - p):              # uniform block\n",
        "            return random.sample(asc, k=n)\n",
        "        else:                           # Dirac on descending\n",
        "            return desc\n",
        "\n",
        "def _simplex_grid(m, step):\n",
        "    K = round(1.0 / step)\n",
        "    partitions: List[Tuple[int, ...]] = []\n",
        "    for cut_indices in itertools.combinations(range(K + m - 1), m - 1):\n",
        "        parts = []\n",
        "        last = -1\n",
        "        for c in cut_indices + (K + m - 1,):\n",
        "            parts.append(c - last - 1)\n",
        "            last = c\n",
        "        partitions.append(tuple(parts))\n",
        "    grid = np.asarray(partitions, dtype=np.float64) / K\n",
        "    return grid\n",
        "\n",
        "\n",
        "def upsi(df, up_granularity):\n",
        "    n, m = df.shape\n",
        "    grid = _simplex_grid(m, up_granularity)\n",
        "    G = grid.shape[0]\n",
        "    wealth = np.ones((m, G), dtype=np.float64)\n",
        "    state_tot = np.full(m, G, dtype=np.float64)\n",
        "    algo_wealth = 1.0\n",
        "\n",
        "    for _, row in df.iterrows():\n",
        "        r_t = row.to_numpy(dtype=np.float64)         # (m,)\n",
        "\n",
        "        # Side information revealed for this period\n",
        "        s = 1 # random.randrange(m)                      # uniform in {0,…,m-1}\n",
        "\n",
        "        # Update every grid portfolio *inside* state s\n",
        "        gains = grid @ r_t                           # (G,)\n",
        "        wealth[s] *= gains\n",
        "\n",
        "        # Capital fraction residing in state s before / after update\n",
        "        before = state_tot[s]\n",
        "        after = wealth[s].sum()\n",
        "        state_tot[s] = after\n",
        "\n",
        "        # Only state-s capital is re-invested this round\n",
        "        algo_wealth *= after / before\n",
        "\n",
        "    return algo_wealth\n",
        "\n",
        "import numpy as np\n",
        "\n",
        "def ram_rand(df):\n",
        "    n, m = df.shape\n",
        "    cur_wealth = 1.0\n",
        "    weights = np.ones(m) / m\n",
        "    for _, row in df.iterrows():\n",
        "        # clairvoyant return at i\n",
        "        x = row.to_numpy(dtype=np.float64)\n",
        "        # predicted return\n",
        "        accuracy = 1\n",
        "        y_idx = forecast_perm(x, accuracy)\n",
        "        y = np.array([x[j] for j in y_idx])\n",
        "        # permutation layer\n",
        "        w_desc = np.sort(weights)[::-1]\n",
        "        w_new = np.empty_like(weights)\n",
        "        for rank, asset_idx in enumerate(y_idx):\n",
        "            w_new[asset_idx] = w_desc[rank]\n",
        "        # get wealth\n",
        "        dR = w_new @ x\n",
        "        cur_wealth *= dR\n",
        "        # get new weights\n",
        "        weights = (w_new * x) / dR\n",
        "    return cur_wealth\n",
        "\n",
        "def ram_upsi_rand(df, accuracy, up_granularity):\n",
        "    n, m = df.shape\n",
        "    # RAM init\n",
        "    ram_wealth = 1.0\n",
        "    weights = np.ones(m) / m\n",
        "\n",
        "    # UPSI init\n",
        "    grid = _simplex_grid(m, up_granularity)\n",
        "    G = grid.shape[0]\n",
        "    wealth = np.ones((m, G), dtype=np.float64)\n",
        "    state_tot = np.full(m, G, dtype=np.float64)\n",
        "    upsi_wealth = 1.0\n",
        "\n",
        "    for _, row in df.iterrows():\n",
        "        # clairvoyant return at i\n",
        "        x = row.to_numpy(dtype=np.float64)\n",
        "\n",
        "        # predicted return\n",
        "        y_idx = forecast_perm(x, accuracy)\n",
        "        y = np.array([x[j] for j in y_idx])\n",
        "        s = y_idx[0]\n",
        "        # permutation layer\n",
        "        w_desc = np.sort(weights)[::-1]\n",
        "        w_new = np.empty_like(weights)\n",
        "        for rank, asset_idx in enumerate(y_idx):\n",
        "            w_new[asset_idx] = w_desc[rank]\n",
        "        # get wealth\n",
        "        dR = w_new @ x\n",
        "        ram_wealth *= dR\n",
        "        # get new weights\n",
        "        weights = (w_new * x) / dR\n",
        "\n",
        "        ####### UPSI #########\n",
        "        gains = grid @ x\n",
        "        wealth[s] *= gains\n",
        "\n",
        "        before = state_tot[s]\n",
        "        after = wealth[s].sum()\n",
        "        state_tot[s] = after\n",
        "        upsi_wealth *= after / before\n",
        "\n",
        "    return ram_wealth, upsi_wealth\n",
        "\n",
        "def eg_with_states(df,eta):\n",
        "\n",
        "    n, m = df.shape\n",
        "\n",
        "    # one EG weight vector per state\n",
        "    weights_eg = np.full((m, m), 1.0 / m, dtype=np.float64)\n",
        "    wealth_eg  = 1.0\n",
        "\n",
        "    for _, row in df.iterrows():\n",
        "        x_t = row.to_numpy(dtype=np.float64)      # (m,)\n",
        "\n",
        "        s = 1\n",
        "\n",
        "        # ---- use & update only the active state's weights --------------------\n",
        "        w_s      = weights_eg[s]\n",
        "        port_ret = np.dot(w_s, x_t)\n",
        "        wealth_eg  *= port_ret\n",
        "\n",
        "        # Exponential-Gradient update\n",
        "        w_s *= np.exp(eta * x_t / port_ret)\n",
        "        w_s /= w_s.sum()\n",
        "\n",
        "    return wealth_eg\n",
        "\n",
        "def ram_eg_rand(df, accuracy, eta):\n",
        "    n, m = df.shape\n",
        "    # RAM init\n",
        "    ram_wealth = 1.0\n",
        "    weights = np.ones(m) / m\n",
        "\n",
        "    # EG init\n",
        "    weights_eg = np.full((m, m), 1.0 / m, dtype=np.float64)\n",
        "    wealth_eg  = 1.0\n",
        "\n",
        "    for _, row in df.iterrows():\n",
        "        # clairvoyant return at i\n",
        "        x = row.to_numpy(dtype=np.float64)\n",
        "\n",
        "        # predicted return\n",
        "        y_idx = forecast_perm(x, accuracy)\n",
        "        y = np.array([x[j] for j in y_idx])\n",
        "        s = y_idx[0]\n",
        "        # permutation layer\n",
        "        w_desc = np.sort(weights)[::-1]\n",
        "        w_new = np.empty_like(weights)\n",
        "        for rank, asset_idx in enumerate(y_idx):\n",
        "            w_new[asset_idx] = w_desc[rank]\n",
        "        # get wealth\n",
        "        dR = w_new @ x\n",
        "        ram_wealth *= dR\n",
        "        # get new weights\n",
        "        weights = (w_new * x) / dR\n",
        "\n",
        "        ####### EG #########\n",
        "        w_s      = weights_eg[s]\n",
        "        port_ret = np.dot(w_s, x)\n",
        "        wealth_eg  *= port_ret\n",
        "\n",
        "        # Exponential-Gradient update\n",
        "        w_s *= np.exp(eta * x / port_ret)\n",
        "        w_s /= w_s.sum()\n",
        "\n",
        "    return ram_wealth, wealth_eg"
      ],
      "metadata": {
        "id": "LgP5HzVOcDcL"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "##1.3 - Table 1"
      ],
      "metadata": {
        "id": "mjlXlCbIb_7Y"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "for i in [comb1_two,comb2_two,comb3_two,comb4_three,comb5_three,comb6_three]:\n",
        "    gm_w = gm(i)\n",
        "    _, up_w = up(i, 0.01)\n",
        "    ram_adv = ram_rand(i)\n",
        "    print(f\"gm: {gm_w}, ram_adv: {ram_adv}, up: {up_w}\")"
      ],
      "metadata": {
        "id": "cq-au9j1fe8s"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "dataset = [comb1_two,comb2_two,comb3_two,comb4_three,comb5_three,comb6_three]\n",
        "\n",
        "ram_list = [[],[],[],[],[],[]]\n",
        "upsi_list = [[],[],[],[],[],[]]\n",
        "accuracy = 0.5\n",
        "set_seed(42)\n",
        "runs = 1000\n",
        "\n",
        "for idx,i in enumerate(dataset):\n",
        "    for j in tqdm(range(runs)):\n",
        "        ram_w, upsi_w = ram_upsi_rand(i, accuracy, 0.01)\n",
        "        ram_list[idx].append(ram_w)\n",
        "        upsi_list[idx].append(upsi_w)"
      ],
      "metadata": {
        "id": "InYRC_I9fnCS"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "rows = []\n",
        "for idx, (ram_vals, upsi_vals) in enumerate(zip(ram_list, upsi_list), start=1):\n",
        "    ram_arr = np.array(ram_vals, dtype=float)\n",
        "    upsi_arr = np.array(upsi_vals, dtype=float)\n",
        "\n",
        "    # Use NaN for empty sequences so that the table still renders\n",
        "    ram_mean   = ram_arr.mean()\n",
        "    ram_sd     = ram_arr.std(ddof=1)\n",
        "    ram_med    = np.median(ram_arr)\n",
        "\n",
        "    upsi_mean  = upsi_arr.mean()\n",
        "    upsi_sd    = upsi_arr.std(ddof=1)\n",
        "    upsi_med   = np.median(upsi_arr)\n",
        "\n",
        "    rows.append(\n",
        "        {\n",
        "            \"Row\": idx,\n",
        "            \"RAM mean\":   ram_mean,\n",
        "            \"RAM sd\":     ram_sd,\n",
        "            \"RAM median\": ram_med,\n",
        "            \"UPSI mean\":  upsi_mean,\n",
        "            \"UPSI sd\":    upsi_sd,\n",
        "            \"UPSI median\": upsi_med,\n",
        "        }\n",
        "    )\n",
        "\n",
        "df = pd.DataFrame(rows).set_index(\"Row\")\n",
        "\n",
        "df"
      ],
      "metadata": {
        "id": "XTL0GQ_YfqLG"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "for i in [comb1_two,comb2_two,comb3_two,comb4_three,comb5_three,comb6_three]:\n",
        "    print(eg_with_states(i,0.05))\n",
        "    print(ram_rand(i))\n",
        "    # print(gm(i))\n",
        "    print(\"############\")"
      ],
      "metadata": {
        "id": "r3puuIN7frf9"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "for i in [comb7_5, comb8_5, comb9_8, comb10_8, comb11_11, comb12_11]:\n",
        "    a,b = ram_eg_rand(i, 1, 0.05)\n",
        "    print(f\"EG: {b}     RAM: {a}\")\n",
        "    print(\"############\")"
      ],
      "metadata": {
        "id": "W4Djo2s5f0J9"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "for i in [comb7_5, comb8_5, comb9_8, comb10_8, comb11_11, comb12_11]:\n",
        "    print(f\"GM: {gm(i)}  EG: {eg_with_states(i,0.05)}\")"
      ],
      "metadata": {
        "id": "09UMnkJcf7XR"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "dataset = [comb7_5, comb8_5, comb9_8, comb10_8, comb11_11, comb12_11]\n",
        "\n",
        "ram_list = [[],[],[],[],[],[]]\n",
        "upsi_list = [[],[],[],[],[],[]]\n",
        "accuracy = 0.53\n",
        "set_seed(42)\n",
        "runs = 1000\n",
        "\n",
        "for idx,i in enumerate(dataset):\n",
        "    for j in tqdm(range(runs)):\n",
        "        ram_w, upsi_w = ram_eg_rand(i, accuracy, 0.05)\n",
        "        ram_list[idx].append(ram_w)\n",
        "        upsi_list[idx].append(upsi_w)"
      ],
      "metadata": {
        "id": "f2nijbJpf9d0"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "rows = []\n",
        "for idx, (ram_vals, upsi_vals) in enumerate(zip(ram_list, upsi_list), start=1):\n",
        "    ram_arr = np.array(ram_vals, dtype=float)\n",
        "    upsi_arr = np.array(upsi_vals, dtype=float)\n",
        "\n",
        "    # Use NaN for empty sequences so that the table still renders\n",
        "    ram_mean   = ram_arr.mean()\n",
        "    ram_sd     = ram_arr.std(ddof=1)\n",
        "    ram_med    = np.median(ram_arr)\n",
        "\n",
        "    upsi_mean  = upsi_arr.mean()\n",
        "    upsi_sd    = upsi_arr.std(ddof=1)\n",
        "    upsi_med   = np.median(upsi_arr)\n",
        "\n",
        "    rows.append(\n",
        "        {\n",
        "            \"Row\": idx,\n",
        "            \"RAM mean\":   ram_mean,\n",
        "            \"RAM sd\":     ram_sd,\n",
        "            \"RAM median\": ram_med,\n",
        "            \"EGSI mean\":  upsi_mean,\n",
        "            \"EGSI sd\":    upsi_sd,\n",
        "            \"EGSI median\": upsi_med,\n",
        "        }\n",
        "    )\n",
        "\n",
        "df = pd.DataFrame(rows).set_index(\"Row\")\n",
        "\n",
        "df"
      ],
      "metadata": {
        "id": "QSxwv16YgBpU"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "dataset = [comb7_5, comb8_5, comb9_8, comb10_8, comb11_11, comb12_11]\n",
        "\n",
        "ram_list = [[],[],[],[],[],[]]\n",
        "upsi_list = [[],[],[],[],[],[]]\n",
        "accuracy = 0.6\n",
        "set_seed(42)\n",
        "runs = 1000\n",
        "\n",
        "for idx,i in enumerate(dataset):\n",
        "    for j in tqdm(range(runs)):\n",
        "        ram_w, upsi_w = ram_eg_rand(i, accuracy, 0.05)\n",
        "        ram_list[idx].append(ram_w)\n",
        "        upsi_list[idx].append(upsi_w)"
      ],
      "metadata": {
        "id": "cmwfsmHugKsh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "rows = []\n",
        "for idx, (ram_vals, upsi_vals) in enumerate(zip(ram_list, upsi_list), start=1):\n",
        "    ram_arr = np.array(ram_vals, dtype=float)\n",
        "    upsi_arr = np.array(upsi_vals, dtype=float)\n",
        "\n",
        "    # Use NaN for empty sequences so that the table still renders\n",
        "    ram_mean   = ram_arr.mean()\n",
        "    ram_sd     = ram_arr.std(ddof=1)\n",
        "    ram_med    = np.median(ram_arr)\n",
        "\n",
        "    upsi_mean  = upsi_arr.mean()\n",
        "    upsi_sd    = upsi_arr.std(ddof=1)\n",
        "    upsi_med   = np.median(upsi_arr)\n",
        "\n",
        "    rows.append(\n",
        "        {\n",
        "            \"Row\": idx,\n",
        "            \"RAM mean\":   ram_mean,\n",
        "            \"RAM sd\":     ram_sd,\n",
        "            \"RAM median\": ram_med,\n",
        "            \"EGSI mean\":  upsi_mean,\n",
        "            \"EGSI sd\":    upsi_sd,\n",
        "            \"EGSI median\": upsi_med,\n",
        "        }\n",
        "    )\n",
        "\n",
        "df = pd.DataFrame(rows).set_index(\"Row\")\n",
        "\n",
        "df"
      ],
      "metadata": {
        "id": "zMdXaIN9gQca"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "for i in [comb7_5, comb8_5, comb9_8, comb10_8, comb11_11, comb12_11]:\n",
        "    ram_w, upsi_w = ram_eg_rand(i, 1, 0.05)\n",
        "    print(f\"RAM: {ram_w}  EGSI: {upsi_w}\")"
      ],
      "metadata": {
        "id": "BiB4FXe_gUv3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "#2. Part 2 - Real ML Model"
      ],
      "metadata": {
        "id": "iie35Kc2gX-k"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "##2.1) preparation"
      ],
      "metadata": {
        "id": "6lI3JYbIhSpE"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install lightgbm"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "aBjIy-_DBM8_",
        "outputId": "7fef118e-9fb0-4468-eaf5-1b9d649c9b9c"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: lightgbm in /usr/local/lib/python3.11/dist-packages (4.5.0)\n",
            "Requirement already satisfied: numpy>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from lightgbm) (2.0.2)\n",
            "Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (from lightgbm) (1.15.2)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip -q install kaggle --upgrade\n",
        "\n",
        "from google.colab import files, drive\n",
        "uploaded = files.upload()          # choose kaggle.json\n",
        "!mkdir -p ~/.kaggle\n",
        "!mv kaggle.json ~/.kaggle/\n",
        "!chmod 600 ~/.kaggle/kaggle.json"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 95
        },
        "id": "sGBP8JD9hUo4",
        "outputId": "fe5ba77f-7f5d-4ea5-c200-b7e40ca4f806"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/181.2 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K   \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━\u001b[0m \u001b[32m174.1/181.2 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m181.2/181.2 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": [
              "\n",
              "     <input type=\"file\" id=\"files-3a0d0b35-1452-4efb-9a11-9b6298084625\" name=\"files[]\" multiple disabled\n",
              "        style=\"border:none\" />\n",
              "     <output id=\"result-3a0d0b35-1452-4efb-9a11-9b6298084625\">\n",
              "      Upload widget is only available when the cell has been executed in the\n",
              "      current browser session. Please rerun this cell to enable.\n",
              "      </output>\n",
              "      <script>// Copyright 2017 Google LLC\n",
              "//\n",
              "// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
              "// you may not use this file except in compliance with the License.\n",
              "// You may obtain a copy of the License at\n",
              "//\n",
              "//      http://www.apache.org/licenses/LICENSE-2.0\n",
              "//\n",
              "// Unless required by applicable law or agreed to in writing, software\n",
              "// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
              "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
              "// See the License for the specific language governing permissions and\n",
              "// limitations under the License.\n",
              "\n",
              "/**\n",
              " * @fileoverview Helpers for google.colab Python module.\n",
              " */\n",
              "(function(scope) {\n",
              "function span(text, styleAttributes = {}) {\n",
              "  const element = document.createElement('span');\n",
              "  element.textContent = text;\n",
              "  for (const key of Object.keys(styleAttributes)) {\n",
              "    element.style[key] = styleAttributes[key];\n",
              "  }\n",
              "  return element;\n",
              "}\n",
              "\n",
              "// Max number of bytes which will be uploaded at a time.\n",
              "const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
              "\n",
              "function _uploadFiles(inputId, outputId) {\n",
              "  const steps = uploadFilesStep(inputId, outputId);\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  // Cache steps on the outputElement to make it available for the next call\n",
              "  // to uploadFilesContinue from Python.\n",
              "  outputElement.steps = steps;\n",
              "\n",
              "  return _uploadFilesContinue(outputId);\n",
              "}\n",
              "\n",
              "// This is roughly an async generator (not supported in the browser yet),\n",
              "// where there are multiple asynchronous steps and the Python side is going\n",
              "// to poll for completion of each step.\n",
              "// This uses a Promise to block the python side on completion of each step,\n",
              "// then passes the result of the previous step as the input to the next step.\n",
              "function _uploadFilesContinue(outputId) {\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  const steps = outputElement.steps;\n",
              "\n",
              "  const next = steps.next(outputElement.lastPromiseValue);\n",
              "  return Promise.resolve(next.value.promise).then((value) => {\n",
              "    // Cache the last promise value to make it available to the next\n",
              "    // step of the generator.\n",
              "    outputElement.lastPromiseValue = value;\n",
              "    return next.value.response;\n",
              "  });\n",
              "}\n",
              "\n",
              "/**\n",
              " * Generator function which is called between each async step of the upload\n",
              " * process.\n",
              " * @param {string} inputId Element ID of the input file picker element.\n",
              " * @param {string} outputId Element ID of the output display.\n",
              " * @return {!Iterable<!Object>} Iterable of next steps.\n",
              " */\n",
              "function* uploadFilesStep(inputId, outputId) {\n",
              "  const inputElement = document.getElementById(inputId);\n",
              "  inputElement.disabled = false;\n",
              "\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  outputElement.innerHTML = '';\n",
              "\n",
              "  const pickedPromise = new Promise((resolve) => {\n",
              "    inputElement.addEventListener('change', (e) => {\n",
              "      resolve(e.target.files);\n",
              "    });\n",
              "  });\n",
              "\n",
              "  const cancel = document.createElement('button');\n",
              "  inputElement.parentElement.appendChild(cancel);\n",
              "  cancel.textContent = 'Cancel upload';\n",
              "  const cancelPromise = new Promise((resolve) => {\n",
              "    cancel.onclick = () => {\n",
              "      resolve(null);\n",
              "    };\n",
              "  });\n",
              "\n",
              "  // Wait for the user to pick the files.\n",
              "  const files = yield {\n",
              "    promise: Promise.race([pickedPromise, cancelPromise]),\n",
              "    response: {\n",
              "      action: 'starting',\n",
              "    }\n",
              "  };\n",
              "\n",
              "  cancel.remove();\n",
              "\n",
              "  // Disable the input element since further picks are not allowed.\n",
              "  inputElement.disabled = true;\n",
              "\n",
              "  if (!files) {\n",
              "    return {\n",
              "      response: {\n",
              "        action: 'complete',\n",
              "      }\n",
              "    };\n",
              "  }\n",
              "\n",
              "  for (const file of files) {\n",
              "    const li = document.createElement('li');\n",
              "    li.append(span(file.name, {fontWeight: 'bold'}));\n",
              "    li.append(span(\n",
              "        `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
              "        `last modified: ${\n",
              "            file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
              "                                    'n/a'} - `));\n",
              "    const percent = span('0% done');\n",
              "    li.appendChild(percent);\n",
              "\n",
              "    outputElement.appendChild(li);\n",
              "\n",
              "    const fileDataPromise = new Promise((resolve) => {\n",
              "      const reader = new FileReader();\n",
              "      reader.onload = (e) => {\n",
              "        resolve(e.target.result);\n",
              "      };\n",
              "      reader.readAsArrayBuffer(file);\n",
              "    });\n",
              "    // Wait for the data to be ready.\n",
              "    let fileData = yield {\n",
              "      promise: fileDataPromise,\n",
              "      response: {\n",
              "        action: 'continue',\n",
              "      }\n",
              "    };\n",
              "\n",
              "    // Use a chunked sending to avoid message size limits. See b/62115660.\n",
              "    let position = 0;\n",
              "    do {\n",
              "      const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
              "      const chunk = new Uint8Array(fileData, position, length);\n",
              "      position += length;\n",
              "\n",
              "      const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
              "      yield {\n",
              "        response: {\n",
              "          action: 'append',\n",
              "          file: file.name,\n",
              "          data: base64,\n",
              "        },\n",
              "      };\n",
              "\n",
              "      let percentDone = fileData.byteLength === 0 ?\n",
              "          100 :\n",
              "          Math.round((position / fileData.byteLength) * 100);\n",
              "      percent.textContent = `${percentDone}% done`;\n",
              "\n",
              "    } while (position < fileData.byteLength);\n",
              "  }\n",
              "\n",
              "  // All done.\n",
              "  yield {\n",
              "    response: {\n",
              "      action: 'complete',\n",
              "    }\n",
              "  };\n",
              "}\n",
              "\n",
              "scope.google = scope.google || {};\n",
              "scope.google.colab = scope.google.colab || {};\n",
              "scope.google.colab._files = {\n",
              "  _uploadFiles,\n",
              "  _uploadFilesContinue,\n",
              "};\n",
              "})(self);\n",
              "</script> "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Saving kaggle.json to kaggle.json\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "DATA_ROOT = \"/content/sp500_raw\"\n",
        "!mkdir -p $DATA_ROOT\n",
        "!kaggle datasets download andrewmvd/sp-500-stocks --unzip -p $DATA_ROOT"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Q9dQIyqriUcE",
        "outputId": "b2560b96-79b4-43c9-c36a-e187835b5a54"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Dataset URL: https://www.kaggle.com/datasets/andrewmvd/sp-500-stocks\n",
            "License(s): CC0-1.0\n",
            "Downloading sp-500-stocks.zip to /content/sp500_raw\n",
            "  0% 0.00/18.7M [00:00<?, ?B/s]\n",
            "100% 18.7M/18.7M [00:00<00:00, 778MB/s]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "##2.2) dataset"
      ],
      "metadata": {
        "id": "GkeiN3s0KP6C"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "###a) pre processing"
      ],
      "metadata": {
        "id": "xeIh7Jc8SVlY"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import pathlib\n",
        "\n",
        "insp1, insp2, insp3 = \"sp500_companies.csv\", \"sp500_index.csv\", \"sp500_stocks.csv\"\n",
        "\n",
        "CSV_PATH = pathlib.Path(\"/content/sp500_raw/\"+insp3)\n",
        "\n",
        "# 1. Read a *small* sample first (fast, avoids OOM on huge files)\n",
        "sample = pd.read_csv(CSV_PATH, nrows=5)\n",
        "print(\"First five rows:\\n\", sample, \"\\n\")\n",
        "\n",
        "# 2. Inspect the full schema without loading the data\n",
        "df_info = pd.read_csv(CSV_PATH, nrows=0)\n",
        "print(\"Columns:\", list(df_info.columns))\n",
        "\n",
        "# 3. Load the full DataFrame (if size is reasonable)\n",
        "df = pd.read_csv(CSV_PATH, parse_dates=True)\n",
        "\n",
        "# 4. Quick overview\n",
        "df.info()          # dtypes, non-null counts, memory footprint\n",
        "print(df.describe(include=\"all\").T.head())   # stats per column\n",
        "\n",
        "# 5. Peek at unique values / missing ratios\n",
        "for col in df.columns[:10]:                  # ⇐ limit to first 10\n",
        "    nunique = df[col].nunique(dropna=True)\n",
        "    missing = df[col].isna().mean()\n",
        "    print(f\"{col:<20} uniques={nunique:<7} missing={missing:>.2%}\")\n",
        "\n",
        "print(df[:20])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "S66N51lOjhov",
        "outputId": "b74c3948-59fa-428f-c1d9-6faee8c97759"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "First five rows:\n",
            "          Date Symbol  Adj Close  Close  High  Low  Open  Volume\n",
            "0  2010-01-04    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "1  2010-01-05    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "2  2010-01-06    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "3  2010-01-07    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "4  2010-01-08    MMM        NaN    NaN   NaN  NaN   NaN     NaN \n",
            "\n",
            "Columns: ['Date', 'Symbol', 'Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']\n",
            "<class 'pandas.core.frame.DataFrame'>\n",
            "RangeIndex: 1891536 entries, 0 to 1891535\n",
            "Data columns (total 8 columns):\n",
            " #   Column     Dtype  \n",
            "---  ------     -----  \n",
            " 0   Date       object \n",
            " 1   Symbol     object \n",
            " 2   Adj Close  float64\n",
            " 3   Close      float64\n",
            " 4   High       float64\n",
            " 5   Low        float64\n",
            " 6   Open       float64\n",
            " 7   Volume     float64\n",
            "dtypes: float64(6), object(2)\n",
            "memory usage: 115.5+ MB\n",
            "              count unique         top  freq       mean         std       min  \\\n",
            "Date        1891536   3768  2024-12-20   502        NaN         NaN       NaN   \n",
            "Symbol      1891536    502         ZTS  3768        NaN         NaN       NaN   \n",
            "Adj Close  617831.0    NaN         NaN   NaN  79.672357  102.742931  0.203593   \n",
            "Close      617831.0    NaN         NaN   NaN   87.47154  104.521901     0.222   \n",
            "High       617831.0    NaN         NaN   NaN  88.417844  105.684113   0.22625   \n",
            "\n",
            "                 25%        50%         75%          max  \n",
            "Date             NaN        NaN         NaN          NaN  \n",
            "Symbol           NaN        NaN         NaN          NaN  \n",
            "Adj Close  26.572459  49.821613   94.831036  1702.530029  \n",
            "Close      32.700001  59.139999  105.019997  1702.530029  \n",
            "High       33.060001  59.720001  106.129997      1714.75  \n",
            "Date                 uniques=3768    missing=0.00%\n",
            "Symbol               uniques=502     missing=0.00%\n",
            "Adj Close            uniques=525360  missing=67.34%\n",
            "Close                uniques=123637  missing=67.34%\n",
            "High                 uniques=123036  missing=67.34%\n",
            "Low                  uniques=122480  missing=67.34%\n",
            "Open                 uniques=121187  missing=67.34%\n",
            "Volume               uniques=208513  missing=67.34%\n",
            "          Date Symbol  Adj Close  Close  High  Low  Open  Volume\n",
            "0   2010-01-04    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "1   2010-01-05    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "2   2010-01-06    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "3   2010-01-07    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "4   2010-01-08    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "5   2010-01-11    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "6   2010-01-12    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "7   2010-01-13    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "8   2010-01-14    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "9   2010-01-15    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "10  2010-01-19    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "11  2010-01-20    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "12  2010-01-21    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "13  2010-01-22    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "14  2010-01-25    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "15  2010-01-26    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "16  2010-01-27    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "17  2010-01-28    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "18  2010-01-29    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n",
            "19  2010-02-01    MMM        NaN    NaN   NaN  NaN   NaN     NaN\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "CSV_PATH  = pathlib.Path(\"/content/sp500_raw/sp500_stocks.csv\")\n",
        "DATE_COL  = \"Date\"\n",
        "TICKER    = \"Symbol\"          # older snapshots use \"Ticker\"\n",
        "PRICE_COL = \"Adj Close\"       # fallback to \"Close\" where missing\n",
        "\n",
        "# ── 1  Load only the needed columns ────────────────────────────────────────────\n",
        "use_cols = [DATE_COL, TICKER, PRICE_COL, \"Close\"]      # pick both, pick whichever exists\n",
        "df_raw   = pd.read_csv(CSV_PATH, usecols=lambda c: c in use_cols,\n",
        "                       parse_dates=[DATE_COL])\n",
        "\n",
        "# choose Adjusted if available else Close\n",
        "if PRICE_COL not in df_raw.columns:\n",
        "    PRICE_COL = \"Close\"\n",
        "\n",
        "# drop rows with no price at all (≈ 67 %)\n",
        "df_raw = df_raw.dropna(subset=[PRICE_COL])\n",
        "\n",
        "# ── 2  Compute price factors per ticker ────────────────────────────────────────\n",
        "df_raw = df_raw.sort_values([TICKER, DATE_COL])\n",
        "df_raw[\"factor\"] = df_raw.groupby(TICKER)[PRICE_COL].pct_change().add(1.0)\n",
        "\n",
        "# some tickers start later than others ⇒ first row factor = NaN\n",
        "df_raw = df_raw.dropna(subset=[\"factor\"])\n",
        "\n",
        "# ── 3  Forward-fill occasional holiday gaps (optional) ─────────────────────────\n",
        "# maximum gap tolerated = 5 business days\n",
        "df_raw[\"factor_ffill\"] = (\n",
        "    df_raw\n",
        "        .groupby(TICKER)[\"factor\"]\n",
        "        .transform(lambda s: s.fillna(method=\"ffill\", limit=5))\n",
        ")\n",
        "\n",
        "# ── 4  Pivot to wide matrix and keep only fully-observed days & tickers ────────\n",
        "df_wide = (\n",
        "    df_raw.pivot(index=DATE_COL, columns=TICKER, values=\"factor_ffill\")\n",
        "          .astype(\"float32\")\n",
        ")\n",
        "\n",
        "# drop columns (tickers) that still have NaNs\n",
        "na_by_col = df_wide.isna().mean()\n",
        "cols_keep = na_by_col[na_by_col == 0].index\n",
        "df_wide   = df_wide[cols_keep]\n",
        "\n",
        "# drop any day that now has a NaN (should be none, but safe-guard)\n",
        "df_factor_clean = df_wide.dropna(axis=0, how=\"any\").sort_index()\n",
        "\n",
        "print(\"final shape =\", df_factor_clean.shape)\n",
        "print(\"any NaN left = \", df_factor_clean.isna().any().any())\n",
        "# final shape ≈ (n_days, m_clean)   e.g. (3640, 485)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "x4Zedy-VjmHd",
        "outputId": "80f471f7-f83c-4879-d352-8d94b50fa89b"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-36-83c06b319fa0>:30: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.\n",
            "  .transform(lambda s: s.fillna(method=\"ffill\", limit=5))\n",
            "<ipython-input-36-83c06b319fa0>:27: SettingWithCopyWarning: \n",
            "A value is trying to be set on a copy of a slice from a DataFrame.\n",
            "Try using .loc[row_indexer,col_indexer] = value instead\n",
            "\n",
            "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
            "  df_raw[\"factor_ffill\"] = (\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "final shape = (3767, 150)\n",
            "any NaN left =  False\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(df_factor_clean.describe(include=\"all\").T.head())"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "NiAKArhHoDpH",
        "outputId": "bcd4b726-0f53-4fe8-ae2c-de23f754ec23"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "         count      mean       std       min       25%       50%       75%  \\\n",
            "Symbol                                                                       \n",
            "ABT     3767.0  1.000571  0.013512  0.902143  0.993946  1.000594  1.007652   \n",
            "ADM     3767.0  1.000365  0.016334  0.758029  0.992638  1.000828  1.008962   \n",
            "AES     3767.0  1.000290  0.020065  0.826989  0.990316  1.000778  1.010753   \n",
            "AJG     3767.0  1.000861  0.012821  0.847648  0.994702  1.001112  1.007294   \n",
            "ALB     3767.0  1.000603  0.025314  0.800894  0.989013  1.001058  1.012985   \n",
            "\n",
            "             max  \n",
            "Symbol            \n",
            "ABT     1.109360  \n",
            "ADM     1.102695  \n",
            "AES     1.149582  \n",
            "AJG     1.132327  \n",
            "ALB     1.137381  \n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "###b) post processing"
      ],
      "metadata": {
        "id": "Mj4_3rRqSYjk"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "comp_a_names = [\"T\", \"MSFT\", \"NVDA\", \"AMZN\", \"V\"]\n",
        "comp_b_names = [\"CSCO\", \"MSFT\",   # technology\n",
        "                \"WRB\", \"RF\",      # financial\n",
        "                \"T\", \"NFLX\",      # communication\n",
        "                \"SBUX\", \"BBY\",    # consumer\n",
        "                \"ABT\", \"BAX\"]     # healthcare\n",
        "comp_c_names = [\"CMCSA\", \"AMP\", \"HSIC\", \"FSLR\", \"FCX\", \"DE\", \"CE\", \"VLO\", \"BWA\", \"PH\", \"ANSS\", \"AMZN\", \"C\", \"EXPE\", \"FDX\", \"TJX\", \"WST\", \"EMN\", \"PGR\", \"FAST\", \"PODD\", \"HST\", \"ADM\", \"NVDA\", \"PAYX\", \"BRO\", \"MO\", \"ESS\", \"DTE\", \"WEC\"]\n",
        "\n",
        "comp_a_pre = df_factor_clean[comp_a_names]\n",
        "comp_b_pre = df_factor_clean[comp_b_names]\n",
        "comp_c_pre = df_factor_clean[comp_c_names]\n",
        "\n",
        "time_A = [2150,2600] # [2400:2600]      2019-July to 2020-May\n",
        "time_B = [3012,3767] # [3012,3767]      2022-December to 2024-December\n",
        "\n",
        "a_A = comp_a_pre[time_A[0]:time_A[1]]\n",
        "b_A = comp_b_pre[time_A[0]:time_A[1]]\n",
        "c_A = comp_c_pre[time_A[0]:time_A[1]]\n",
        "a_B = comp_a_pre[time_B[0]:time_B[1]]\n",
        "b_B = comp_b_pre[time_B[0]:time_B[1]]\n",
        "c_B = comp_c_pre[time_B[0]:time_B[1]]"
      ],
      "metadata": {
        "id": "ZP0OXqRzSanP"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "############## Sample Testing ########################\n",
        "def ram_eg_ml(df, eta, ml_pred):\n",
        "    \"\"\"Back-test the rank-driven RAM portfolio and the EG portfolio with side information.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    df : pandas.DataFrame\n",
        "        One row per period, one column per asset; the entries are used as\n",
        "        multiplicative returns.\n",
        "    eta : float\n",
        "        Learning rate of the Exponential-Gradient update.\n",
        "    ml_pred : sequence of sequences of int\n",
        "        ml_pred[t] orders the column positions for period t, best asset\n",
        "        first. Every entry must be a permutation of range(n_assets).\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    (ram_wealth, wealth_eg)\n",
        "        Final wealth of the two strategies, both started at 1.0.\n",
        "\n",
        "    Raises\n",
        "    ------\n",
        "    ValueError\n",
        "        If a predicted ordering is not a permutation of the asset positions.\n",
        "    IndexError\n",
        "        If ml_pred has fewer entries than df has rows.\n",
        "    \"\"\"\n",
        "    m = df.shape[1]\n",
        "    asset_ids = np.arange(m)\n",
        "\n",
        "    # RAM state: the current (drifted) weight vector.\n",
        "    ram_wealth = 1.0\n",
        "    weights = np.ones(m) / m\n",
        "\n",
        "    # EG state: one weight vector per possible predicted-best asset.\n",
        "    weights_eg = np.full((m, m), 1.0 / m, dtype=np.float64)\n",
        "    wealth_eg = 1.0\n",
        "\n",
        "    for t, (_, row) in enumerate(df.iterrows()):\n",
        "        # realised returns of period t\n",
        "        x = row.to_numpy(dtype=np.float64)\n",
        "\n",
        "        # predicted ordering, best asset first\n",
        "        y_idx = np.asarray(ml_pred[t])\n",
        "        if y_idx.shape != (m,) or not np.array_equal(np.sort(y_idx), asset_ids):\n",
        "            # An incomplete ordering would leave entries of w_new uninitialised.\n",
        "            raise ValueError(\n",
        "                f\"ml_pred[{t}] must be a permutation of range({m}), got {ml_pred[t]!r}\"\n",
        "            )\n",
        "        s = y_idx[0]  # predicted best asset selects the EG weight vector\n",
        "\n",
        "        ####### RAM #########\n",
        "        # permutation layer: k-th largest weight goes to the k-th ranked asset\n",
        "        w_new = np.empty_like(weights)\n",
        "        w_new[y_idx] = np.sort(weights)[::-1]\n",
        "        dR = w_new @ x\n",
        "        ram_wealth *= dR\n",
        "        # weights after the period's price moves\n",
        "        weights = (w_new * x) / dR\n",
        "\n",
        "        ####### EG #########\n",
        "        w_s = weights_eg[s]  # view: the in-place update below persists in weights_eg\n",
        "        port_ret = np.dot(w_s, x)\n",
        "        wealth_eg *= port_ret\n",
        "\n",
        "        # Exponential-Gradient update\n",
        "        w_s *= np.exp(eta * x / port_ret)\n",
        "        w_s /= w_s.sum()\n",
        "\n",
        "    return ram_wealth, wealth_eg"
      ],
      "metadata": {
        "id": "mQetWwGHzv7r"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def pure_ml(df, ml_pred):\n",
        "    \"\"\"Wealth from holding, each period, only the asset the model ranks first.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    df : pandas.DataFrame\n",
        "        One row per period, one column per asset (multiplicative returns).\n",
        "    ml_pred : sequence of sequences of int\n",
        "        ml_pred[k][0] is the column *position* of the asset held in row k.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    Final wealth, starting from 1.\n",
        "    \"\"\"\n",
        "    wealth = 1\n",
        "    for k, (_, row) in enumerate(df.iterrows()):\n",
        "        # Positional access: row[int] on a label-indexed Series is deprecated\n",
        "        # and is a label lookup when the columns are integers.\n",
        "        wealth *= row.iloc[ml_pred[k][0]]\n",
        "    return wealth"
      ],
      "metadata": {
        "id": "3V_8oE6e1RPc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## 2.3) Model training"
      ],
      "metadata": {
        "id": "Uu2z7ejxqi2P"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "\"\"\"\n",
        "sliding_rank_lgbm.py\n",
        "--------------------\n",
        "Produce daily cross-section ranks with LightGBM LambdaMART.\n",
        "\n",
        "• sliding window  (RECENT_WINDOW days)\n",
        "• exponential decay  (DECAY_GAMMA)\n",
        "• integer relevance labels  (mandatory for ranking)\n",
        "• warm-starts each booster for speed (init_model)\n",
        "\n",
        "Returns\n",
        "-------\n",
        "start_day   : int    (date_idx of first prediction)\n",
        "rank_matrix : ndarray, shape = (n - start_day - 1, m)   (the last day has no next-day target)\n",
        "\"\"\"\n",
        "\n",
        "# ---------- 0. imports ----------\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import lightgbm as lgb\n",
        "from pathlib import Path\n",
        "from tqdm import trange\n",
        "\n",
        "# ---------- 1. user-configurable ----------\n",
        "N_LAGS = 3               # lagged returns per asset used as features\n",
        "RECENT_WINDOW = 250      # days of history in each sliding window\n",
        "DECAY_GAMMA = 0.995      # sample-weight factor per day of age (1 -> uniform)\n",
        "VAL_DAYS = 60            # newest days of the window held out for early stopping\n",
        "N_TREES = 300            # num_boost_round passed to each lgb.train call\n",
        "EARLY_STOP = 20          # early-stopping patience\n",
        "VERBOSE_EVAL = False     # LightGBM stdout; not referenced by the functions in this cell\n",
        "\n",
        "# ---------- 2. wide → long panel w/ lags & target ----------\n",
        "def prepare_long(df_wide: pd.DataFrame, n_lags: int) -> pd.DataFrame:\n",
        "    \"\"\"Reshape a wide return table into a long panel with lags and a target.\n",
        "\n",
        "    The result has one row per (date_idx, asset) pair with the columns\n",
        "    date_idx, asset, ret_t, ret_lag1..ret_lag<n_lags> and target_ret, where\n",
        "    target_ret is the same asset's ret_t of the next row. Rows containing\n",
        "    any NaN (the first n_lags rows and the last row of every asset, plus\n",
        "    rows with missing inputs) are removed. The input frame is not modified.\n",
        "    \"\"\"\n",
        "    panel = (\n",
        "        df_wide\n",
        "        .assign(date_idx=np.arange(len(df_wide)))\n",
        "        .melt(id_vars=\"date_idx\", var_name=\"asset\", value_name=\"ret_t\")\n",
        "    )\n",
        "\n",
        "    # Shifts are taken within each asset so values never cross asset boundaries.\n",
        "    ret_by_asset = panel.groupby(\"asset\")[\"ret_t\"]\n",
        "    for lag in range(1, n_lags + 1):\n",
        "        panel[f\"ret_lag{lag}\"] = ret_by_asset.shift(lag)\n",
        "    panel[\"target_ret\"] = ret_by_asset.shift(-1)\n",
        "\n",
        "    return panel.dropna().reset_index(drop=True)\n",
        "\n",
        "# ---------- 3. integer relevance labels per day ----------\n",
        "def add_relevance(long: pd.DataFrame) -> pd.DataFrame:\n",
        "    \"\"\"Add an integer ``relevance`` column: the within-day rank of target_ret.\n",
        "\n",
        "    Per date_idx the lowest target_ret gets 0 and the highest gets\n",
        "    (group size - 1); ties are broken by row order. The column is written\n",
        "    into ``long`` itself, which is also returned.\n",
        "    \"\"\"\n",
        "    daily_rank = long.groupby(\"date_idx\")[\"target_ret\"].rank(method=\"first\", ascending=True)\n",
        "    long[\"relevance\"] = (daily_rank - 1).astype(int)\n",
        "    return long\n",
        "\n",
        "# ---------- 4. utility to build LightGBM Dataset ----------\n",
        "def build_lgb_dataset(block: pd.DataFrame,\n",
        "                      decay_gamma: float):\n",
        "    \"\"\"Turn a slice of the long panel into a LightGBM ranking Dataset.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    block : pandas.DataFrame\n",
        "        Rows of the long panel; must contain ``date_idx``, ``relevance``,\n",
        "        ``age`` and the ``ret_*`` feature columns. Not modified.\n",
        "    decay_gamma : float\n",
        "        Base of the exponential sample weight ``decay_gamma ** age``.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    (lgb.Dataset, list of str)\n",
        "        The dataset, with one query group per ``date_idx``, and the names\n",
        "        of the feature columns in the order they were used.\n",
        "    \"\"\"\n",
        "    # LightGBM reads `group` as the sizes of consecutive runs of rows, so all\n",
        "    # rows of one day must sit next to each other. The long panel is laid out\n",
        "    # asset by asset (melt order), hence the sort; a stable sort keeps the\n",
        "    # existing asset order inside each day.\n",
        "    block = block.sort_values(\"date_idx\", kind=\"mergesort\")\n",
        "\n",
        "    feat_cols = [c for c in block.columns if c.startswith(\"ret_\")]\n",
        "    X, y = block[feat_cols], block[\"relevance\"]\n",
        "\n",
        "    # groupby sorts its keys ascending, matching the row order set above\n",
        "    group = block.groupby(\"date_idx\").size().values\n",
        "    wts   = (decay_gamma ** block[\"age\"]).values\n",
        "    return lgb.Dataset(X, y, group=group, weight=wts, free_raw_data=False), feat_cols\n",
        "\n",
        "# ---------- 5. main pipeline ----------\n",
        "def sliding_rank_predictions(df_factor_clean: pd.DataFrame):\n",
        "    \"\"\"Walk forward through the sample, refitting the ranker once per day.\n",
        "\n",
        "    For every day from ``start_day`` to the last day that has a next-day\n",
        "    target, the model is fitted on the preceding RECENT_WINDOW days (the\n",
        "    newest VAL_DAYS of them are used only for early stopping) and then\n",
        "    scores that day's cross-section.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    df_factor_clean : pandas.DataFrame\n",
        "        Wide table, one row per day and one column per asset.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    start_day : int\n",
        "        date_idx (row position) of the first scored day.\n",
        "    rank_matrix : numpy.ndarray of int16, shape (max_day - start_day + 1, m)\n",
        "        Row i holds the column positions of the assets for day\n",
        "        ``start_day + i``, highest score first. The model is trained on the\n",
        "        return of the following row, so row i applies to input row\n",
        "        ``start_day + i + 1``.\n",
        "\n",
        "    Raises\n",
        "    ------\n",
        "    ValueError\n",
        "        If no complete rows exist, or the last usable day lies more than one\n",
        "        day before ``start_day``.\n",
        "\n",
        "    Notes\n",
        "    -----\n",
        "    Each fit is warm-started from the previous day's booster through\n",
        "    ``init_model``, so boosting rounds accumulate from day to day.\n",
        "    \"\"\"\n",
        "    long = add_relevance(prepare_long(df_factor_clean, N_LAGS))\n",
        "    if long.empty:\n",
        "        raise ValueError(\"no complete rows left after adding lags and targets\")\n",
        "\n",
        "    m          = df_factor_clean.shape[1]\n",
        "    max_day    = long[\"date_idx\"].max()\n",
        "    start_day  = max(RECENT_WINDOW, N_LAGS)          # first day that can be scored\n",
        "    n_days_out = max_day - start_day + 1\n",
        "    if n_days_out < 0:\n",
        "        raise ValueError(\n",
        "            f\"input too short: last usable date_idx is {max_day}, \"\n",
        "            f\"first scored day would be {start_day}\")\n",
        "\n",
        "    rank_matrix = np.empty((n_days_out, m), dtype=np.int16)\n",
        "\n",
        "    # The settings do not change from day to day, so build them once.\n",
        "    params = dict(objective=\"lambdarank\",\n",
        "                  metric=\"ndcg\",\n",
        "                  learning_rate=0.05,\n",
        "                  num_leaves=63,\n",
        "                  min_data_in_leaf=30,\n",
        "                  verbosity=-1)\n",
        "\n",
        "    booster     = None                               # warm-start handle\n",
        "    feat_cols   = None\n",
        "\n",
        "    # -------------------------------------------------------------\n",
        "    for idx, day in enumerate(                        # daily loop\n",
        "            trange(start_day, max_day + 1, ncols=80)):\n",
        "        # --------  build sliding window  ------------------------\n",
        "        # rows dated day-RECENT_WINDOW .. day-1; their targets are returns up to `day`\n",
        "        win_start = day - RECENT_WINDOW\n",
        "        win_mask  = long[\"date_idx\"].between(win_start, day - 1)\n",
        "\n",
        "        block        = long.loc[win_mask].copy()\n",
        "        block[\"age\"] = day - 1 - block[\"date_idx\"]    # 0 for the newest training day\n",
        "\n",
        "        # the newest VAL_DAYS days of the window validate, the rest trains\n",
        "        val_cut  = max(block[\"date_idx\"].min(), day - VAL_DAYS)\n",
        "        val_mask = block[\"date_idx\"] >= val_cut\n",
        "        trn_mask = ~val_mask\n",
        "\n",
        "        dtrain, feat_cols = build_lgb_dataset(block.loc[trn_mask], DECAY_GAMMA)\n",
        "        dvalid, _         = build_lgb_dataset(block.loc[val_mask], DECAY_GAMMA)\n",
        "\n",
        "        booster = lgb.train(\n",
        "            params,\n",
        "            dtrain,\n",
        "            num_boost_round=N_TREES,\n",
        "            valid_sets=[dvalid],\n",
        "            init_model=booster,          # warm-start from the previous day\n",
        "            callbacks=[\n",
        "                lgb.early_stopping(EARLY_STOP, first_metric_only=True, verbose=False),\n",
        "                lgb.log_evaluation(-1)\n",
        "            ]\n",
        "        )\n",
        "\n",
        "        # --------  inference for current day  -------------------\n",
        "        # rows of one day appear in the column order of df_factor_clean,\n",
        "        # so argsort positions are column positions\n",
        "        X_today  = long[long[\"date_idx\"] == day][feat_cols]\n",
        "        scores   = booster.predict(X_today,\n",
        "                                   num_iteration=booster.best_iteration)\n",
        "        rank_matrix[idx] = scores.argsort()[::-1]      # high -> low\n",
        "\n",
        "\n",
        "    return start_day, rank_matrix"
      ],
      "metadata": {
        "id": "MyTkW--Y422y"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "for dataset in [a_A, b_A, c_A, a_B, b_B, c_B]:\n",
        "\n",
        "    start_day, rank_mat = sliding_rank_predictions(dataset)\n",
        "    # rank_mat[i] is computed from row start_day + i and ranks the returns of\n",
        "    # the next row, so the evaluation sample starts one row after start_day\n",
        "    # (derived from the model settings instead of a hard-coded 251).\n",
        "    to_run = dataset.iloc[start_day + 1:]\n",
        "    assert len(to_run) == len(rank_mat), \"predictions and evaluation rows are misaligned\"\n",
        "    ml_pred = rank_mat.tolist()\n",
        "\n",
        "    best_w = best(to_run)\n",
        "    gm_w = gm(to_run)\n",
        "    ram_w, egsi_w = ram_eg_ml(to_run, 0.05, ml_pred)\n",
        "    ml_w = pure_ml(to_run, ml_pred)\n",
        "\n",
        "    print(f\"Best: {best_w}\\t GM: {gm_w}\\t RAM: {ram_w}\\t EGSI: {egsi_w}\\t ML: {ml_w}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6gOb5vIv1T5T",
        "outputId": "1c7dad93-dcf0-4590-8a69-eb6f346d93b2"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|█████████████████████████████████████████| 199/199 [01:09<00:00,  2.86it/s]\n",
            "<ipython-input-246-aaac90bc78eb>:5: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
            "  wealth *= row[ml_pred[k][0]]\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Best: 1.7333662099241787\t GM: 1.2061561346054077\t RAM: 1.2397790670660627\t EGSI: 1.2301902932977695\t ML: 1.305495262145996\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|█████████████████████████████████████████| 199/199 [02:51<00:00,  1.16it/s]\n",
            "<ipython-input-246-aaac90bc78eb>:5: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
            "  wealth *= row[ml_pred[k][0]]\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Best: 1.3587748578569965\t GM: 0.9534178376197815\t RAM: 1.0038427382621955\t EGSI: 0.9826716571848239\t ML: 1.6783100366592407\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|█████████████████████████████████████████| 199/199 [05:19<00:00,  1.60s/it]\n",
            "<ipython-input-246-aaac90bc78eb>:5: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
            "  wealth *= row[ml_pred[k][0]]\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Best: 1.7333662099241787\t GM: 0.8853138089179993\t RAM: 0.9151650356262963\t EGSI: 0.9246241413254339\t ML: 0.5327281355857849\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|█████████████████████████████████████████| 504/504 [07:01<00:00,  1.20it/s]\n",
            "<ipython-input-246-aaac90bc78eb>:5: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
            "  wealth *= row[ml_pred[k][0]]\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Best: 8.293215670107248\t GM: 2.470444679260254\t RAM: 2.5128635292415082\t EGSI: 2.629362813015742\t ML: 1.6762250661849976\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|█████████████████████████████████████████| 504/504 [20:59<00:00,  2.50s/it]\n",
            "<ipython-input-246-aaac90bc78eb>:5: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
            "  wealth *= row[ml_pred[k][0]]\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Best: 3.1531377311637616\t GM: 1.3003515005111694\t RAM: 1.3779589073517693\t EGSI: 1.3731261614127162\t ML: 0.5721277594566345\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|█████████████████████████████████████████| 504/504 [45:07<00:00,  5.37s/it]"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Best: 8.293215670107248\t GM: 1.4015233516693115\t RAM: 1.4930821532269538\t EGSI: 1.5013096825154844\t ML: 1.3915815353393555\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "\n",
            "<ipython-input-246-aaac90bc78eb>:5: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
            "  wealth *= row[ml_pred[k][0]]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Final wealth per run, copied from the output above (truncated to 3 decimals),\n",
        "# in loop order a_A, b_A, c_A, a_B, b_B, c_B:\n",
        "# a_A  Best: 1.733\t GM: 1.206\t RAM: 1.239\t EGSI: 1.230\t ML: 1.305\n",
        "# b_A  Best: 1.358\t GM: 0.953\t RAM: 1.003\t EGSI: 0.982\t ML: 1.678\n",
        "# c_A  Best: 1.733\t GM: 0.885\t RAM: 0.915\t EGSI: 0.924\t ML: 0.532\n",
        "# a_B  Best: 8.293\t GM: 2.470\t RAM: 2.512\t EGSI: 2.629\t ML: 1.676\n",
        "# b_B  Best: 3.153\t GM: 1.300\t RAM: 1.377\t EGSI: 1.373\t ML: 0.572\n",
        "# c_B  Best: 8.293\t GM: 1.401\t RAM: 1.493\t EGSI: 1.501\t ML: 1.391\n"
      ],
      "metadata": {
        "id": "mYo9o4k8Jizd"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## 2.4) Statistics"
      ],
      "metadata": {
        "id": "nrvKcj5OuSp2"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def get_dom(ml, gm, ram, egsi):\n",
        "    \"\"\"Gap between RAM's and EGSI's excess wealth over GM, scaled by the ML gain.\n",
        "\n",
        "    Both excesses (ram - gm and egsi - gm) are divided by the relative gain of\n",
        "    the ML strategy over GM, (ml - gm) / gm, and the EGSI figure is subtracted\n",
        "    from the RAM figure. Raises ZeroDivisionError when ml equals gm.\n",
        "    \"\"\"\n",
        "    scale = (ml - gm) / gm\n",
        "    ram_score, egsi_score = (ram - gm) / scale, (egsi - gm) / scale\n",
        "    return ram_score - egsi_score\n",
        "\n",
        "# Dominance scores of the two period-A runs with a positive ML gain. They get\n",
        "# their own names so the a_A / b_A DataFrames built in the post-processing\n",
        "# cell are not overwritten (the evaluation loop can then be re-run).\n",
        "dom_a_A = get_dom(1.305, 1.206, 1.239, 1.230)\n",
        "dom_b_A = get_dom(1.678, 0.953, 1.003, 0.982)\n",
        "print(dom_a_A)\n",
        "print(dom_b_A)\n",
        "# Remaining runs: plain ratios of the final wealth figures.\n",
        "print(f\"c_A: ram-egsi gap {0.915/0.924}\")\n",
        "print(f\"c_A: ram-eg dominance {(0.915-0.885)/0.885}\")\n",
        "print(f\"a_B: {2.512/2.629}\")\n",
        "print(f\"b_B: {1.377/1.373}\")\n",
        "print(f\"c_B: {1.493/1.501}\")\n",
        "# b_B: ML vs GM, RAM vs EGSI and RAM vs GM, as relative differences.\n",
        "print((0.572-1.3)/1.3)\n",
        "print((1.377-1.373)/1.373)\n",
        "print((1.377-1.3)/1.3)"
      ],
      "metadata": {
        "id": "9ZgQZE6zuUTp"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}