{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from collections import defaultdict\n",
    "import numpy as np\n",
    "\n",
    "def process_and_parse_log(input_file):\n",
    "    \"\"\"Parse a profiling log into a per-step pandas DataFrame.\n",
    "\n",
    "    Three kinds of lines are recognised; every other line is ignored:\n",
    "\n",
    "    * status lines of the form\n",
    "      '#wait: A/ B #run: C #prefill: D #decode: E memory_util: F %',\n",
    "      each of which becomes one row;\n",
    "    * 'Worker rank R step time: T', collected per rank R;\n",
    "    * 'sample time: T', collected into the sample_time column.\n",
    "\n",
    "    Args:\n",
    "        input_file: Path of the log file to read.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with the columns time (0-based row counter), wait_seq,\n",
    "        wait_token, run, prefill, decode, memory_util and sample_time, plus\n",
    "        one worker_<rank>_time column for every rank whose number of step\n",
    "        times equals the number of status rows (other ranks are skipped\n",
    "        with a printed warning).\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the number of 'sample time' lines differs from the\n",
    "            number of status lines, so the columns cannot be aligned.\n",
    "    \"\"\"\n",
    "    # Compile the patterns once instead of passing pattern strings to\n",
    "    # re.search for every single line.\n",
    "    status_re = re.compile(\n",
    "        r'#wait:\\s+(\\d+)/\\s+(\\d+)\\s+#run:\\s+(\\d+)\\s+#prefill:\\s+(\\d+)\\s+#decode:\\s+(\\d+)\\s+memory_util:\\s+([\\d.]+)\\s+%'\n",
    "    )\n",
    "    worker_re = re.compile(r'Worker rank (\\d+) step time: ([\\d.]+)')\n",
    "    sample_re = re.compile(r'sample time: ([\\d.]+)')\n",
    "\n",
    "    # Per-column accumulators, filled in log order.\n",
    "    wait_seq_list, wait_token_list = [], []\n",
    "    run_list, prefill_list, decode_list = [], [], []\n",
    "    mem_util_list, sample_time_list = [], []\n",
    "    worker_time_dict = defaultdict(list)  # rank -> list of step times\n",
    "\n",
    "    # Decode explicitly as UTF-8 and replace undecodable bytes: the patterns\n",
    "    # are pure ASCII, so a stray byte elsewhere in the log must not abort\n",
    "    # parsing or make the result depend on the platform default encoding.\n",
    "    with open(input_file, 'r', encoding='utf-8', errors='replace') as f:\n",
    "        for line in f:\n",
    "            # Status line -> one value for every status column.\n",
    "            m_status = status_re.search(line)\n",
    "            if m_status:\n",
    "                wait_seq_list.append(int(m_status.group(1)))\n",
    "                wait_token_list.append(int(m_status.group(2)))\n",
    "                run_list.append(int(m_status.group(3)))\n",
    "                prefill_list.append(int(m_status.group(4)))\n",
    "                decode_list.append(int(m_status.group(5)))\n",
    "                mem_util_list.append(float(m_status.group(6)))\n",
    "                continue\n",
    "\n",
    "            # Worker step time, grouped by rank.\n",
    "            m_worker = worker_re.search(line)\n",
    "            if m_worker:\n",
    "                rank = int(m_worker.group(1))\n",
    "                worker_time_dict[rank].append(float(m_worker.group(2)))\n",
    "                continue\n",
    "\n",
    "            # Sample time.\n",
    "            m_sample = sample_re.search(line)\n",
    "            if m_sample:\n",
    "                sample_time_list.append(float(m_sample.group(1)))\n",
    "\n",
    "    # Fail with an explicit message instead of pandas' generic\n",
    "    # 'All arrays must be of the same length' when the log is inconsistent.\n",
    "    n_steps = len(wait_seq_list)\n",
    "    if len(sample_time_list) != n_steps:\n",
    "        raise ValueError(\n",
    "            f'found {len(sample_time_list)} sample-time lines but {n_steps} '\n",
    "            f'status lines in {input_file!r}; columns cannot be aligned'\n",
    "        )\n",
    "\n",
    "    # Build the DataFrame; 'time' is simply the row counter.\n",
    "    df = pd.DataFrame({\n",
    "        \"time\": range(n_steps),\n",
    "        \"wait_seq\": wait_seq_list,\n",
    "        \"wait_token\": wait_token_list,\n",
    "        \"run\": run_list,\n",
    "        \"prefill\": prefill_list,\n",
    "        \"decode\": decode_list,\n",
    "        \"memory_util\": mem_util_list,\n",
    "        \"sample_time\": sample_time_list\n",
    "    })\n",
    "\n",
    "    # Worker columns are optional: attach only those that line up with the\n",
    "    # status rows and warn about the rest.\n",
    "    for rank, times in worker_time_dict.items():\n",
    "        if len(times) == len(df):\n",
    "            df[f\"worker_{rank}_time\"] = times\n",
    "        else:\n",
    "            print(f\"⚠️ 警告：Worker {rank} 时间步数 {len(times)} 与主数据长度 {len(df)} 不一致，跳过该列。\")\n",
    "\n",
    "    return df\n",
    "\n",
    "# Example run: parse one profiling log and derive the cost-ratio columns.\n",
    "if __name__ == \"__main__\":\n",
    "    input_log_path = \"/mnt/sda/2022-0526/home/xuhx/projects/gLLM/experiments/introduction/rate8/profile_52_sharegpt_32b.log\"\n",
    "\n",
    "    # Parse the log, then keep a reproducible 1% random subsample of the rows.\n",
    "    df = process_and_parse_log(input_log_path).sample(frac=0.01, random_state=42)\n",
    "\n",
    "    # Worker-1 step time divided by 16\n",
    "    # (presumably the number of layers handled by that worker -- TODO confirm).\n",
    "    df['each_layer_time'] = df['worker_1_time'].div(16)\n",
    "\n",
    "    # Keep only rows with sample_time < 10 and worker_1_time < 100.\n",
    "    df = df.loc[df['sample_time'].lt(10) & df['worker_1_time'].lt(100)]\n",
    "\n",
    "    # Ratio of sample time to per-layer time; keep only rows with a ratio below 10.\n",
    "    df['cost_ratio'] = df['sample_time'].div(df['each_layer_time'])\n",
    "    df = df.loc[df['cost_ratio'].lt(10)]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 预测的敏感性分析\n",
    "# def predict_worker_time(prefill, decode,memory_util):     return 5.994919      + 0.029744 * prefill      + 0.110212 * decode # 14b\n",
    "# def predict_sample_time(prefill, decode,memory_util):     return 1.795752      + 0.044437 * decode # 14b\n",
    "def predict_worker_time(prefill, decode, memory_util):\n",
    "    \"\"\"Linear estimate of the worker step time from the prefill and decode values.\n",
    "\n",
    "    memory_util is accepted so all predictors share one call signature,\n",
    "    but it does not enter the formula.\n",
    "    \"\"\"\n",
    "    intercept = 11.238080\n",
    "    prefill_term = 0.075143 * prefill\n",
    "    decode_term = 0.140392 * decode\n",
    "    return intercept + prefill_term + decode_term\n",
    "def predict_sample_time(prefill, decode, memory_util):\n",
    "    \"\"\"Linear estimate of the sample time from the decode value.\n",
    "\n",
    "    prefill and memory_util are accepted so all predictors share one call\n",
    "    signature, but they do not enter the formula.\n",
    "    \"\"\"\n",
    "    intercept = 1.808118\n",
    "    decode_coef = 0.044461\n",
    "    return intercept + decode_coef * decode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "全体 MSE: 0.11222906004327943\n"
     ]
    }
   ],
   "source": [
    "# Vectorised prediction: the predictor is plain arithmetic, so it is applied\n",
    "# to whole columns at once instead of row by row through DataFrame.apply.\n",
    "# This is faster and also works when df has no rows.\n",
    "df['predict_worker_time'] = predict_worker_time(\n",
    "    df['prefill'],\n",
    "    df['decode'],\n",
    "    df['memory_util'],\n",
    ")\n",
    "# NOTE: despite the 'mse' name, this column holds the absolute relative error\n",
    "# |predicted - measured| / measured per row; its mean is printed below.\n",
    "df['layer_mse'] = ((df['predict_worker_time'] - df['worker_1_time']) / df['worker_1_time']).abs()\n",
    "print(\"全体 MSE:\", df['layer_mse'].mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "全体 MSE: 0.035544036261901\n"
     ]
    }
   ],
   "source": [
    "# Vectorised prediction: the predictor is plain arithmetic, so it is applied\n",
    "# to whole columns at once instead of row by row through DataFrame.apply.\n",
    "# This is faster and also works when df has no rows.\n",
    "df['predict_sample_time'] = predict_sample_time(\n",
    "    df['prefill'],\n",
    "    df['decode'],\n",
    "    df['memory_util'],\n",
    ")\n",
    "# NOTE: despite the 'mse' name, this column holds the absolute relative error\n",
    "# |predicted - measured| / measured per row; its mean is printed below.\n",
    "df['sample_mse'] = ((df['predict_sample_time'] - df['sample_time']) / df['sample_time']).abs()\n",
    "print(\"全体 MSE:\", df['sample_mse'].mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3139    3.008565\n",
       "3409    2.875182\n",
       "1374    2.741799\n",
       "1545    2.875182\n",
       "8167    1.852579\n",
       "          ...   \n",
       "533     2.386111\n",
       "7094    2.119345\n",
       "2375    3.275331\n",
       "1897    3.053026\n",
       "879     2.608416\n",
       "Name: predict_sample_time, Length: 82, dtype: float64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the predicted sample-time column as the cell's result.\n",
    "df.loc[:, 'predict_sample_time']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import matplotlib.pyplot as plt\n",
    "\n",
    "# # 设置 NeurIPS 风格参数\n",
    "# plt.rcParams.update({\n",
    "#     'font.family': 'serif',\n",
    "#     'font.serif': ['Times New Roman', 'DejaVu Serif'],\n",
    "#     'mathtext.fontset': 'stix',\n",
    "#     'axes.labelsize': 7,\n",
    "#     'axes.titlesize': 8,\n",
    "#     'xtick.labelsize': 6,\n",
    "#     'ytick.labelsize': 6,\n",
    "#     'figure.dpi': 300,\n",
    "#     'savefig.dpi': 300,\n",
    "#     'figure.autolayout': True\n",
    "# })\n",
    "\n",
    "# # 创建点状图\n",
    "# plt.figure(figsize=(3.4, 2.4))  # 单栏尺寸\n",
    "# plt.scatter(df.index, df['cost_ratio'], color='#1f77b4', s=9, alpha=0.7)\n",
    "\n",
    "# plt.xlabel('Index (Time)', fontsize=7, labelpad=6)\n",
    "# plt.ylabel('Cost Ratio', fontsize=7, labelpad=6)\n",
    "# plt.title('Cost Ratio Over Time', fontsize=8, pad=8)\n",
    "\n",
    "# # 添加均值线\n",
    "# mean_val = df['cost_ratio'].mean()\n",
    "# plt.axhline(mean_val, color='#2ca02c', linestyle='--', linewidth=1, alpha=0.8)\n",
    "# plt.text(df.index[-1] * 0.9, mean_val * 1.03,\n",
    "#          f'Mean: {mean_val:.2f}', color='#2ca02c', fontsize=5.5)\n",
    "\n",
    "# # 样式调整\n",
    "# plt.grid(axis='y', linestyle=':', alpha=0.35)\n",
    "# plt.tick_params(axis='both', which='both', length=2.8, width=0.4)\n",
    "\n",
    "# # 保存或展示\n",
    "# # plt.savefig(\"cost_ratio_over_time.pdf\", format='pdf', bbox_inches='tight')\n",
    "# plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import seaborn as sns\n",
    "# import matplotlib.pyplot as plt\n",
    "# import numpy as np\n",
    "\n",
    "# # 设置学术风格参数\n",
    "# plt.rcParams.update({\n",
    "#     'font.family': 'serif',\n",
    "#     'font.serif': ['Times New Roman', 'DejaVu Serif'],\n",
    "#     'mathtext.fontset': 'stix',\n",
    "#     'axes.labelsize': 8,\n",
    "#     'axes.titlesize': 9,\n",
    "#     'xtick.labelsize': 7,\n",
    "#     'ytick.labelsize': 7,\n",
    "#     'figure.dpi': 300,\n",
    "#     'savefig.dpi': 300,\n",
    "#     'figure.autolayout': True\n",
    "# })\n",
    "\n",
    "# # 创建图形，单栏尺寸建议：宽度约3.4英寸，高度2.4英寸\n",
    "# fig, ax1 = plt.subplots(figsize=(3.4, 2.4))\n",
    "\n",
    "# # 绘制直方图（左轴: count）\n",
    "# bins = np.linspace(df['cost_ratio'].min(), df['cost_ratio'].max(), 25)\n",
    "# ax1.hist(df['cost_ratio'], bins=bins,\n",
    "#          color='#1f77b4', edgecolor='w', linewidth=0.6, alpha=0.85)\n",
    "# ax1.set_xlabel('Sample Time / Layer Time', fontsize=9, labelpad=6)\n",
    "# ax1.set_ylabel('Count', fontsize=9, labelpad=6, color='#1f77b4')\n",
    "# ax1.tick_params(axis='y', labelcolor='#1f77b4')\n",
    "\n",
    "# # 创建右轴绘制 KDE（密度）\n",
    "# ax2 = ax1.twinx()\n",
    "# sns.kdeplot(df['cost_ratio'], ax=ax2, color='#d62728', linewidth=1.4)\n",
    "# ax2.set_ylabel('Density', fontsize=9, labelpad=6, color='#d62728')\n",
    "# ax2.tick_params(axis='y', labelcolor='#d62728')\n",
    "\n",
    "# # 添加均值标注\n",
    "# mean_val = df['cost_ratio'].mean()\n",
    "# ax1.axvline(mean_val, color='#2ca02c', linestyle='--', linewidth=1, alpha=0.8)\n",
    "# ax1.text(mean_val * 1.03, ax1.get_ylim()[1] * 0.85,\n",
    "#          f'Mean: {mean_val:.2f}', color='#2ca02c', fontsize=6)  # 原为7\n",
    "\n",
    "\n",
    "# # 样式调整\n",
    "# for spine in ['top', 'right']:\n",
    "#     ax1.spines[spine].set_visible(False)\n",
    "# for spine in ['left', 'bottom']:\n",
    "#     ax1.spines[spine].set_linewidth(0.7)\n",
    "\n",
    "# plt.title('Distribution of Sample Cost Ratios', fontsize=10, pad=10)\n",
    "# ax1.grid(axis='y', linestyle=':', alpha=0.35)\n",
    "# plt.tick_params(axis='both', which='both', length=3.2, width=0.4)\n",
    "\n",
    "\n",
    "\n",
    "# # 保存图形为 PDF 文件\n",
    "# # plt.savefig(\"sample_cost.pdf\", format='pdf', bbox_inches='tight')\n",
    "# plt.show()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "gllm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
