%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{subcaption}
\usepackage{bibentry}
% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros

%\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}{Corollary}[theorem]


\title{Keep-Alive Caching for Hawkes Processes\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[]{Sushirdeep Narayana}
\author[]{Ian A. Kash}
\author[]{\\snaray25@uic.edu}
\author[]{iankash@uic.edu}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science,\\
    University of Illinois at Chicago,\\
     Chicago, Illinois, USA
}
  
  
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

This Supplementary Material contains proofs and other material omitted from the main manuscript. 


\appendix
\section{Omitted Proofs}

\begin{lemma} \label{expect_cost}
The expected cost of a cache policy over an inter-arrival is
\begin{equation*}
\mathbb{E}[cost(\pi(\cdot |\mathcal{H}_{m-1}))] = c_{cs} + \displaystyle \int_{0}^{\infty} \pi(x|\mathcal{H}_{m-1}) \cdot g(x|\mathcal{H}_{m-1})\; dx , 
\end{equation*} 
where the instantaneous cost at \textit{x} units after the most recent arrival at $t_{m-1}$ is
\begin{equation*}
g(x|\mathcal{H}_{m-1}) = c_{p} \cdot \big(1 - F(x|\mathcal{H}_{m-1})\big) - c_{cs} \cdot f(x| \mathcal{H}_{m-1}).
\end{equation*} 
\end{lemma}


\begin{proof}
Let $\mathcal{L} = \{L_{0}, L_{1}, L_{2}, \ldots, L_{2k-1}\}$ denote the endpoints of the sequence of $k$ keep-alive windows for policy $\pi(\cdot | \mathcal{H}_{m-1})$, where even indices mark the starts of the windows and odd indices mark their ends. Let $Z(\mathcal{L}, j) = \sum_{p = 0}^{j} (L_{2p+1} - L_{2p})$ for $j \geq 0$, and $Z(\mathcal{L},j) = 0$ for $j < 0$. The function $Z(\mathcal{L}, j)$ represents the total time spent in the cache through the end of the $j$-th keep-alive window (counting from $j = 0$). Then we have

\begin{align*}
\begin{split}
& \mathbb{E}[cost(\pi(\cdot|\mathcal{H}_{m-1}))]\\ 
& =  c_{cs} \cdot \displaystyle \int_{0}^{L_{0}} f(x | \mathcal{H}_{m-1}) \; dx \; + \;  \displaystyle \sum_{j = 0}^{k-1} \int_{L_{2j}}^{L_{2j+1}} \; c_{p} \cdot \Big( Z(\mathcal{L}, j-1) + x - L_{2j} \Big) \cdot f(x|\mathcal{H}_{m-1}) \; dx \; \\ & \quad + \sum_{j = 0}^{k-2} \int_{L_{2j+1}}^{L_{2j+2}} \; \Big( c_{cs} + c_{p} Z(\mathcal{L}, j) \Big) f(x|\mathcal{H}_{m-1}) \; dx \; + \int_{L_{2k-1}}^{\infty} \; \Big( c_{cs} + c_{p} Z(\mathcal{L}, k-1) \Big) f(x|\mathcal{H}_{m-1}) \; dx  \quad \text{(1)}\\
& = c_{cs} \cdot F(L_{0} | \mathcal{H}_{m-1}) + \displaystyle \sum_{j = 0}^{k-1} \Bigg( c_{p} \cdot \Big( Z(\mathcal{L}, j-1) + x - L_{2j} \Big) \cdot F(x|\mathcal{H}_{m-1})\Big|_{L_{2j}}^{L_{2j+1}}  - \int_{L_{2j}}^{L_{2j+1}} c_{p} F(x|\mathcal{H}_{m-1}) \; dx\Bigg) \;  \\  & \quad + \sum_{j = 0}^{k-2} \; \Big( c_{cs} + c_{p} \cdot Z(\mathcal{L}, j) \Big) \cdot F(x|\mathcal{H}_{m-1})\Big|_{L_{2j+1}}^{L_{2j+2}} + \Big( c_{cs} + c_{p} \cdot Z(\mathcal{L}, k-1) \Big) \cdot F(x|\mathcal{H}_{m-1})\Big|_{L_{2k-1}}^{\infty}  \qquad  \text{(2)}\\
\end{split}    
\end{align*}

\begin{align*}
\begin{split}
 &\mathbb{E}[cost(\pi(\cdot|\mathcal{H}_{m-1}))] \\
& =  c_{cs} F(L_{0} | \mathcal{H}_{m-1}) + \displaystyle \sum_{j = 0}^{k-1} \; c_{p} \Big( Z(\mathcal{L}, j-1) + L_{2j+1} - L_{2j} \Big) F(L_{2j+1}|\mathcal{H}_{m-1})   - \sum_{j = 0}^{k-1} \; c_{p} Z(\mathcal{L}, j-1) F(L_{2j}|\mathcal{H}_{m-1}) \\ & \quad - \sum_{j = 0}^{k-1} \; \int_{L_{2j}}^{L_{2j+1}} c_{p} F(x|\mathcal{H}_{m-1}) \; dx + \sum_{j = 0}^{k-2} \; \Big( c_{cs} + c_{p} \cdot Z(\mathcal{L}, j) \Big) \cdot F(L_{2j+2}|\mathcal{H}_{m-1}) \\ & \quad - \sum_{j = 0}^{k-2} \; \Big( c_{cs} + c_{p} \cdot Z(\mathcal{L}, j) \Big) F(L_{2j+1}|\mathcal{H}_{m-1}) + \Big( c_{cs} + c_{p} \cdot Z(\mathcal{L}, k-1) \Big) \cdot 1 \\ & \quad - \Big( c_{cs} + c_{p} \cdot Z(\mathcal{L}, k-1) \Big) F(L_{2k-1}|\mathcal{H}_{m-1}) \qquad \qquad \text{(3)}\\
& =  c_{cs} F(L_{0} | \mathcal{H}_{m-1}) \; + \displaystyle \sum_{j = 0}^{k-1} \; c_{p} Z(\mathcal{L}, j) F(L_{2j+1}| \mathcal{H}_{m-1})   - \sum_{j = 0}^{k-1} \; c_{p} Z(\mathcal{L}, j-1) F(L_{2j}| \mathcal{H}_{m-1}) \; \\ & \quad - \sum_{j = 0}^{k-1} \; \int_{L_{2j}}^{L_{2j+1}} c_{p} F(x|\mathcal{H}_{m-1}) \; dx \quad + \sum_{j = 0}^{k-2} \; c_{cs}  \cdot F(L_{2j+2}| \mathcal{H}_{m-1}) + \sum_{j = 0}^{k-2} \; c_{p} \cdot Z(\mathcal{L}, j) \cdot F(L_{2j+2}|\mathcal{H}_{m-1}) \; \\ & \quad - \sum_{j = 0}^{k-2} \;  c_{cs} \cdot F(L_{2j+1}|\mathcal{H}_{m-1}) \quad - \sum_{j = 0}^{k-2} c_{p} \cdot Z(\mathcal{L}, j) \cdot F(L_{2j+1}|\mathcal{H}_{m-1}) \\& \quad +  c_{cs} + c_{p} \cdot Z(\mathcal{L}, k-1) \; - c_{cs} \cdot F(L_{2k-1}|\mathcal{H}_{m-1}) \quad - c_{p} \cdot Z(\mathcal{L}, k-1) \cdot F(L_{2k-1}|\mathcal{H}_{m-1}) \qquad \qquad \text{(4)}\\
\end{split}    
\end{align*}

We apply integration by parts to the term $\int_{L_{2j}}^{L_{2j+1}} \; c_{p} \cdot \big( Z(\mathcal{L}, j-1) + x - L_{2j} \big) \cdot f(x|\mathcal{H}_{m-1}) \; dx \;$ in Equation (1) to get the terms $c_{p} \cdot \big( Z(\mathcal{L}, j-1) + x - L_{2j} \big) \cdot F(x|\mathcal{H}_{m-1})\big|_{L_{2j}}^{L_{2j+1}}  - \int_{L_{2j}}^{L_{2j+1}} c_{p} F(x|\mathcal{H}_{m-1}) \; dx \;$ in Equation (2). In Equation (4), we have substituted $ \; Z(\mathcal{L},j) \;$ for the terms $\; Z(\mathcal{L},j-1) + (L_{2j+1} - L_{2j})$ in Equation (3) since, $\; Z(\mathcal{L},j) = Z(\mathcal{L},j-1) + (L_{2j+1} - L_{2j})$. The remainder of the proof consists of combining and canceling terms to simplify (4), then applying the fundamental theorem of calculus.


\begin{align*}
\begin{split}
&\mathbb{E}[cost(\pi(\cdot|\mathcal{H}_{m-1}))] \\
& = \; c_{cs} F(L_{0} | \mathcal{H}_{m-1}) \; + \displaystyle \sum_{j = 0}^{k-1} \; c_{p} Z(\mathcal{L}, j) \cdot F(L_{2j+1}| \mathcal{H}_{m-1}) - \sum_{j = 0}^{k-1} \; c_{p} Z(\mathcal{L}, j-1) \cdot F(L_{2j}|\mathcal{H}_{m-1}) \; \\ & \quad - \sum_{j = 0}^{k-1} \; \int_{L_{2j}}^{L_{2j+1}} c_{p} F(x|\mathcal{H}_{m-1}) \; dx  \quad + \sum_{j = 0}^{k-2} \;  c_{cs} \cdot F(L_{2j+2}|\mathcal{H}_{m-1}) \quad + \sum_{j = 0}^{k-2} \;  c_{p} \cdot Z(\mathcal{L}, j) \cdot F(L_{2j+2}|\mathcal{H}_{m-1}) \; \\ & \quad  - \sum_{j = 0}^{k-1} \;  c_{cs} \cdot F(L_{2j+1}|\mathcal{H}_{m-1}) \quad- \sum_{j = 0}^{k-1} c_{p} \cdot Z(\mathcal{L}, j) \cdot F(L_{2j+1}|\mathcal{H}_{m-1}) +  c_{cs} + c_{p} \cdot Z(\mathcal{L}, k-1)\\
& = \; \displaystyle - \sum_{j=0}^{k-1} \int_{L_{2j}}^{L_{2j+1}} c_{p} F(x|\mathcal{H}_{m-1}) \;dx + \; \sum_{j= 0}^{k-1} c_{cs} F(L_{2j}|\mathcal{H}_{m-1}) - \sum_{j = 0}^{k-1} c_{cs} F(L_{2j+1}|\mathcal{H}_{m-1}) + c_{cs} + c_{p} Z(\mathcal{L}, k-1) \quad \text{(5)}\\
& = \quad \displaystyle   c_{p} \cdot Z(\mathcal{L}, k-1) - \sum_{j = 0}^{k-1} \; \int_{L_{2j}}^{L_{2j+1}} c_{p} F(x|\mathcal{H}_{m-1}) \; dx  - \sum_{j = 0}^{k-1} \;  c_{cs} \cdot \Big( F(L_{2j+1}|\mathcal{H}_{m-1}) - F(L_{2j}|\mathcal{H}_{m-1}) \Big) + c_{cs}\\
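& = \quad \displaystyle \int_{\mathcal{I}} \Big( c_{p}(1 - F(x|\mathcal{H}_{m-1})) -c_{cs} \cdot f(x|\mathcal{H}_{m-1}) \Big) \; dx \; + \; c_{cs} \\
& = \quad \displaystyle \int_{0}^{\infty} \pi(x|\mathcal{H}_{m-1}) \cdot g(x|\mathcal{H}_{m-1}) \; dx \; + \; c_{cs}
\end{split}    
\end{align*}


\noindent where $\mathcal{I} = [L_{0}, L_{1}] \cup \cdots \cup [L_{2k-2}, L_{2k-1}]$ is the set of times at which the object is kept in the cache, that is, $\pi(x|\mathcal{H}_{m-1}) = 1$ for $x \in \mathcal{I}$ and $\pi(x|\mathcal{H}_{m-1}) = 0$ otherwise. The last two steps use $c_{p} \cdot Z(\mathcal{L}, k-1) = \int_{\mathcal{I}} c_{p} \; dx$ and the fundamental theorem of calculus.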

\noindent In Equation (5),  we combine \; $c_{cs} \cdot F(L_{0} | \mathcal{H}_{m-1})$ \; and \; $\displaystyle \sum_{j=0}^{k-2} \; c_{cs} F(L_{2j+2} | \mathcal{H}_{m-1})$ \; to obtain $\displaystyle \sum_{j=0}^{k-1} c_{cs} F(L_{2j} | \mathcal{H}_{m-1})$.

\end{proof}


\begin{theorem} \label{opt_policy}
The points $L_i$ of the sequence of keep-alive windows over an inter-arrival for the optimal policy $\pi_{\text{opt}}(\cdot | \mathcal{H}_{m-1})$ are at 0, $\infty$, or solutions to the equation $c_{p} - (c_{cs} \cdot \lambda(x | \mathcal{H}_{m-1})) = 0 \;$ where the sign changes.
\end{theorem}

\begin{proof}
From Lemma \ref{expect_cost} we know that the expected cost is given by $\mathbb{E}[cost(\pi(\cdot|\mathcal{H}_{m-1}))] = \int_{0}^{\infty} \pi(x|\mathcal{H}_{m-1}) \cdot g(x|\mathcal{H}_{m-1}) \; dx +c_{cs}$. The points of the sequence of keep-alive windows $L_{k}\; \text{for} \; k = 0, 1, 2, \cdots$ for the optimal policy are the points where the first order partial derivative of the expected cost is zero, that is, $\displaystyle \frac{\partial\; \mathbb{E}[cost(\pi(\cdot|\mathcal{H}_{m-1}))]}{\partial x} = \; 0$ at $x = L_{k}\; \forall k$. The first order derivative of the expected cost is $\displaystyle \frac{\partial\; \mathbb{E}[cost(\pi(\cdot|\mathcal{H}_{m-1}))]}{\partial x} \; = \; g(x |\mathcal{H}_{m-1})$. We simplify $g(x |\mathcal{H}_{m-1})$ as follows,

\begin{align*}
\begin{split}
& g(x | \mathcal{H}_{m-1}) \\ 
& = c_{p}(1 - F(x | \mathcal{H}_{m-1})) - c_{cs} f(x | \mathcal{H}_{m-1}) \\
& = c_{p} (1 - F(x | \mathcal{H}_{m-1})) - c_{cs} \lambda(x | \mathcal{H}_{m-1}) (1 - F(x|\mathcal{H}_{m-1})) \\
& = (1 - F(x | \mathcal{H}_{m-1})) \cdot (c_{p} - c_{cs}\lambda(x | \mathcal{H}_{m-1}))
\end{split}    
\end{align*}

\begin{align*}
& g(x = L_{k} |\mathcal{H}_{m-1}) \; = \; 0   \\
\implies & 1 - F(x = L_{k} |\mathcal{H}_{m-1}) = 0 \quad \text{or} \quad  c_{p} - c_{cs} \lambda(x = L_{k} |\mathcal{H}_{m-1}) = 0 \\
\implies & F(x = L_{k}|\mathcal{H}_{m-1}) = 1 \quad \text{or} \quad \displaystyle \frac{c_{p}}{c_{cs}} = \lambda(x = L_{k} |\mathcal{H}_{m-1}) = \frac{f(x = L_{k}|\mathcal{H}_{m-1})}{1 - F(x = L_{k}| \mathcal{H}_{m-1})}
\end{align*}

\noindent Hence, apart from points where $F(x | \mathcal{H}_{m-1}) = 1$, the points where $g(x | \mathcal{H}_{m-1}) = 0$ are also the points where $c_{p} - c_{cs} \lambda(x | \mathcal{H}_{m-1}) = 0$. We know that the instantaneous cost of the policy over an inter-arrival is given by $g(x | \mathcal{H}_{m-1})$.  Let $x = L_{k}$ be an arbitrary root of the equation $c_{p}- c_{cs} \lambda(x |\mathcal{H}_{m-1}) = 0$. If $g(x |\mathcal{H}_{m-1})$ changes sign from positive to negative as it passes through $x = L_{k}$, that is, $g(x |\mathcal{H}_{m-1}) >\; 0\;$ for $x < L_{k}$ and $g(x | \mathcal{H}_{m-1}) < \; 0\;$ for $x > L_{k}$, then it is optimal for the cache policy to start a keep-alive window at $x = L_{k}$, since caching the object from $x = L_{k}$ onward reduces the expected cost. Similarly, if $g(x |\mathcal{H}_{m-1})$ changes sign from negative to positive as it passes through $x = L_{k}$, that is, $g(x |\mathcal{H}_{m-1}) <\; 0\;$ for $x < L_{k}$ and $g(x | \mathcal{H}_{m-1}) > \; 0\;$ for $x > L_{k}$, then it is optimal to stop the keep-alive window at $x = L_{k}$, since caching the object after $x = L_{k}$ would increase the expected cost.  Since $1 - F(x | \mathcal{H}_{m-1}) \; \geq \; 0,  \quad \forall x,\;$ the sign of $c_{p} - c_{cs} \cdot \lambda(x | \mathcal{H}_{m-1})$ determines the sign of $g(x |\mathcal{H}_{m-1})$. Therefore, the sign of $c_{p} - c_{cs} \lambda(x | \mathcal{H}_{m-1})$ must change as $x$ passes through the root $x = L_{k}$ for $x = L_{k}$ to be a point where a keep-alive window of the optimal policy starts or ends.

This leaves the end cases where there is no solution to the equation $c_{p} - c_{cs} \lambda(x | \mathcal{H}_{m-1}) = 0$ or the sign of $c_{p} - c_{cs} \lambda(x | \mathcal{H}_{m-1})$ does not change $\forall \; x$. When $g(x |\mathcal{H}_{m-1})$ is always non-negative, it is optimal to have a keep-alive window length of 0. This is because having an active keep-alive window length in this case would be more expensive than a cold start. On the other hand, when $g(x |\mathcal{H}_{m-1})$ is always non-positive, it is optimal for the keep-alive window to always be active since keeping the object in cache is beneficial to the policy. 
    
\end{proof}



\begin{corollary} \label{optimal_decreasing}
If the hazard rate is weakly decreasing, the optimal policy $\pi_{\text{opt}}(\cdot|\mathcal{H}_{m-1})$  is a single keep-alive window starting at $\tau_{pw}=0$, and is given by \\ $\pi_{\text{opt}}(x|\mathcal{H}_{m-1}) =  \begin{cases}1 \; , \quad \forall x \in [0 , \; \tau_{\text{opt}, \mathcal{H}_{m-1}}]\\ 0 \; , \quad \text{otherwise}\end{cases}$ where,
\begin{enumerate}
    \item $\tau_{\text{opt}, \mathcal{H}_{m-1}} = \infty$, i.e., the optimal policy is to have the keep-alive window always be active when \\ $\forall x, \; \displaystyle \frac{c_{p}}{c_{cs}} < \; \lambda(x | \mathcal{H}_{m-1}), \; $
    \item $\tau_{\text{opt}, \mathcal{H}_{m-1}} = 0$, i.e., the optimal policy would be to not cache and always have a cold start when \\ $\displaystyle \frac{c_{p}}{c_{cs}} >\; \lambda(x = 0 | \mathcal{H}_{m-1})\;$
    \item The optimal policy is a keep-alive window of length $\tau_{\text{opt}, \mathcal{H}_{m-1}}$ given by the solution to the equation \\ $ \displaystyle  \frac{c_{p}}{c_{cs}} = \frac{f(x = \tau_{\text{opt}, \mathcal{H}_{m-1}} | \mathcal{H}_{m-1})}{1 - F(x = \tau_{\text{opt}, \mathcal{H}_{m-1}} | \mathcal{H}_{m-1})}$, otherwise. 
\end{enumerate}
\end{corollary}


\begin{proof}
Assume that the hazard rate of the arrival of function invocations over an inter-arrival $\lambda (x|\mathcal{H}_{m-1})$ is (weakly) decreasing. From Lemma \ref{expect_cost} we know that the expected cost is given by $\mathbb{E}[cost(\pi(\cdot|\mathcal{H}_{m-1}))] = \int_{0}^{\infty} \pi(x|\mathcal{H}_{m-1}) \cdot g(x|\mathcal{H}_{m-1}) \; dx +c_{cs}$. We prove a single keep-alive window is optimal by showing that $g(x|\mathcal{H}_{m-1})$ can only change its sign from negative to positive at most once. 
Thus the expected cost is minimized by a single-window policy that keeps the object in cache until the transition from negative to positive occurs.
To begin,
\begin{equation*}
g(x |\mathcal{H}_{m-1})  = \; (1 - F(x|\mathcal{H}_{m-1})) \cdot (c_{p} - c_{cs} \lambda(x| \mathcal{H}_{m-1}))
\end{equation*}

\noindent Since $\lambda(x|\mathcal{H}_{m-1})$ is weakly decreasing, $c_{p} - c_{cs} \lambda(x| \mathcal{H}_{m-1})$ is weakly increasing. Also, $1 - F(x|\mathcal{H}_{m-1})$ is always positive. If $g(x|\mathcal{H}_{m-1}) \; \geq 0$ for all $x$, then the optimal policy is to have a keep-alive window length of 0. This is because having an active keep-alive window in this case would be more expensive than a cold start. If $g(x|\mathcal{H}_{m-1})$ is always negative, then it is always beneficial for the keep-alive window to be active. Otherwise, $g(x|\mathcal{H}_{m-1})$ can change its sign at most once and such a change must be from negative to positive. It is no longer beneficial for the provider to keep the object in memory after $g(x|\mathcal{H}_{m-1})$ has changed from negative to positive, because the expected cost of keeping it in memory outweighs the expected cost of a cold start. Thus, the optimal policy is of the form of a single keep-alive window. Now, it only remains to determine the point $\tau_{\text{opt}, \mathcal{H}_{m-1}}$ at which $g(x|\mathcal{H}_{m-1})$ changes from negative to positive.

\begin{align*}
& g(x = \tau_{\text{opt}, \mathcal{H}_{m-1}}|\mathcal{H}_{m-1}) \; = \; 0   \\
\implies & F(x = \tau_{\text{opt},\mathcal{H}_{m-1}}|\mathcal{H}_{m-1}) = 1 \quad \text{or} \quad \displaystyle \frac{c_{p}}{c_{cs}} = \lambda(x = \tau_{\text{opt},\mathcal{H}_{m-1}}|\mathcal{H}_{m-1}) = \frac{f(x =\tau_{\text{opt},\mathcal{H}_{m-1}}|\mathcal{H}_{m-1})}{1 - F(x = \tau_{\text{opt},\mathcal{H}_{m-1}}| \mathcal{H}_{m-1})}
\end{align*}

\end{proof}


\begin{corollary} \label{optimal_increasing}
If the hazard rate is weakly increasing, the optimal policy $\pi_{\text{opt}}(\cdot|\mathcal{H}_{m-1})$  is a single keep-alive window with $\tau_{ka} = \infty$ and a pre-warming window, and is given by \\ $\pi_{\text{opt}}(x|\mathcal{H}_{m-1}) =  \begin{cases}1 , \; \tau_{\text{pw}, \mathcal{H}_{m-1}} \leq x \\ 0, \; \text{otherwise}\end{cases}$ where,
\begin{enumerate}
    \item $\tau_{\text{pw}, \mathcal{H}_{m-1}} = 0$, i.e., the optimal policy is to have the keep-alive window always be active when\\ $\forall x, \quad \displaystyle \frac{c_{p}}{c_{cs}} < \; \lambda(x | \mathcal{H}_{m-1}), \; $
    \item $\tau_{\text{pw}, \mathcal{H}_{m-1}} = \infty$, i.e., the optimal policy is to always have a cold start when $\forall x, \; \displaystyle \frac{c_{p}}{c_{cs}} > \; \lambda(x | \mathcal{H}_{m-1}) \;$. 
    \item $\tau_{\text{pw}, \mathcal{H}_{m-1}}$ satisfies the equation\\ $\; \displaystyle \frac{c_{p}}{c_{cs}} = \displaystyle \frac{f(x = \tau_{\text{pw}, \mathcal{H}_{m-1}} |  \mathcal{H}_{m-1})}{1 - F(x = \tau_{\text{pw}, \mathcal{H}_{m-1}} |  \mathcal{H}_{m-1})}$, i.e., an infinite keep-alive window after a pre-warming window of length $\tau_{\text{pw}, \mathcal{H}_{m-1}}$ when $c_{p} - c_{cs} \lambda(x = 0 | \mathcal{H}_{m-1}) \; > 0$ and changes sign. 
\end{enumerate}
\end{corollary}


\begin{proof}

Following the proof of Theorem \ref{opt_policy}, we know that $g(x | \mathcal{H}_{m-1}) = (1 - F(x| \mathcal{H}_{m-1}))\cdot (c_{p} - c_{cs} \cdot \lambda(x| \mathcal{H}_{m-1}))$. Since $\lambda(x | \mathcal{H}_{m-1})$ is weakly increasing, $c_{p} - c_{cs} \lambda(x|\mathcal{H}_{m-1})$ is weakly decreasing. Also, $1 - F(x|\mathcal{H}_{m-1})$ is always positive. If initially $g(x | \mathcal{H}_{m-1}) < \; 0$, then $g(x |\mathcal{H}_{m-1})$ will always be negative. Hence, it is optimal to have the keep-alive window always be active. If $g(x|\mathcal{H}_{m-1}) > 0, \quad \forall x$, that is, $g$ is always positive, then the optimal policy is to encounter a cold start. If $g(x |\mathcal{H}_{m-1})$ is initially positive, then changes to a negative sign as $\lambda(x | \mathcal{H}_{m-1})$ is weakly increasing, then the optimal policy will be a pre-warming window of length decided by the position of the change of sign. We obtain $\tau_{\text{pw}, \mathcal{H}_{m-1}}\;$ from solving $g(x = \tau_{\text{pw},\mathcal{H}_{m-1}}|\mathcal{H}_{m-1}) = 0$ as follows.

\begin{align*}
& g(x = \tau_{\text{pw}, \mathcal{H}_{m-1}}|\mathcal{H}_{m-1}) \; = \; 0   \\
\implies & F(x = \tau_{\text{pw},\mathcal{H}_{m-1}}|\mathcal{H}_{m-1}) = 1 \quad \text{or} \quad \displaystyle \frac{c_{p}}{c_{cs}} = \lambda(x = \tau_{\text{pw},\mathcal{H}_{m-1}}|\mathcal{H}_{m-1}) = \frac{f(x =\tau_{\text{pw},\mathcal{H}_{m-1}}|\mathcal{H}_{m-1})}{1 - F(x = \tau_{\text{pw},\mathcal{H}_{m-1}}| \mathcal{H}_{m-1})}
\end{align*}

After the sign changes to negative, the keep-alive window should always be active, that is, $\tau_{\text{ka},\mathcal{H}_{m-1}} = \infty$. 
\end{proof}


When the arrival requests follow a Hawkes process, Corollary~\ref{optimal_decreasing} characterizes the optimal policy as one of the following:
\begin{itemize}
    \item The keep-alive window is to always be active with $\tau_{\text{opt}, \mathcal{H}_{m-1}} = \infty$ when, as in the Poisson case, the background intensity is sufficiently high: $\displaystyle \frac{c_{p}}{c_{cs}} < \; \lambda_{0}$.
    \item Experience a cold start with $\tau_{\text{opt}, \mathcal{H}_{m-1}} = 0$ when $\displaystyle \frac{c_{p}}{c_{cs}} > \; \lambda(x = 0 | \mathcal{H}_{m-1})$, that is, immediately after the most recent arrival request.
    \item The keep-alive window is given by the expression 
    \begin{equation*}
        \tau_{\text{opt}, \mathcal{H}_{m-1}} = \frac{1}{\beta} \Big( \log \alpha + \log \Big(\sum_{j=1}^{m-1} e^{\beta  (t_{j}-t_{m-1})} \Big) - \log \Big( \frac{c_{p}}{c_{cs}} - \lambda_{0}\Big) \Big)
    \end{equation*}
    otherwise. \\To compute $\tau_{\text{opt}, \mathcal{H}_{m-1}}$, note that the length of the optimal keep-alive window is $\tau_{\text{opt}, \mathcal{H}_{m-1}}  = t_{\text{opt}, \mathcal{H}_{m-1}} - \; t_{m-1}$, where $t_{m-1}$ is the time of the most recent arrival request. The expression above is obtained by substituting the conditional intensity of the Hawkes process into Corollary~\ref{optimal_decreasing} and solving for $t_{\text{opt}, \mathcal{H}_{m-1}}$, as shown below.
\end{itemize}


\begin{align*}
\frac{c_{p}}{c_{cs}} & = \lambda_{0} + \sum_{t_{j} \in \mathcal{H}_{m-1}} \alpha \cdot e^{- \beta \cdot (t_{\text{opt}, \mathcal{H}_{m-1}} - t_{j})}\\
\frac{1}{\alpha}\Big(\frac{c_{p}}{c_{cs}} - \lambda_{0} \Big) & = \sum_{t_{j} \in \mathcal{H}_{m-1}} e^{- \beta \cdot (t_{\text{opt}, \mathcal{H}_{m-1}} - t_{j})}\\
\frac{1}{\alpha}\Big(\frac{c_{p}}{c_{cs}} - \lambda_{0} \Big) & = e^{- \beta \cdot t_{\text{opt}, \mathcal{H}_{m-1}}} \cdot \sum_{t_{j} \in \mathcal{H}_{m-1}} e^{\beta \cdot t_{j}} \\
\log \left(\frac{1}{\alpha}\Big(\frac{c_{p}}{c_{cs}} - \lambda_{0} \Big)\right) & = - \beta \cdot t_{\text{opt}, \mathcal{H}_{m-1}} + \log \left(\sum_{t_{j} \in \mathcal{H}_{m-1}} e^{\beta \cdot t_{j}}\right) \\
\beta \cdot t_{\text{opt}, \mathcal{H}_{m-1}} & = \log \alpha + \log \left(\sum_{t_{j} \in \mathcal{H}_{m-1}} e^{\beta \cdot t_{j}}\right) - \log \Big(\frac{c_{p}}{c_{cs}} - \lambda_{0}\Big)\\
\beta \cdot t_{\text{opt}, \mathcal{H}_{m-1}} & = \log \alpha + \log \left(\Big(e^{\beta \cdot t_{m-1}} \Big) \cdot \Big(\sum_{t_{j} \in \mathcal{H}_{m-1}} e^{\beta \cdot t_{j} - \beta \cdot t_{m-1}}\Big) \right) - \log \Big(\frac{c_{p}}{c_{cs}} - \lambda_{0}\Big)\\
\beta \cdot t_{\text{opt}, \mathcal{H}_{m-1}} & = \log \alpha + \beta \cdot t_{m-1} + \log \left(\sum_{t_{j} \in \mathcal{H}_{m-1}} e^{\beta \cdot (t_{j} - t_{m-1})}\right) - \log \Big(\frac{c_{p}}{c_{cs}} - \lambda_{0}\Big)\\
t_{\text{opt}, \mathcal{H}_{m-1}} & = t_{m-1} + \frac{1}{\beta} \left(\log \alpha +  \log \left(\sum_{t_{j} \in \mathcal{H}_{m-1}} e^{\beta \cdot (t_{j} - t_{m-1})}\right) - \log \Big(\frac{c_{p}}{c_{cs}} - \lambda_{0}\Big)\right)\\
\tau_{\text{opt}, \mathcal{H}_{m-1}}  & = \frac{1}{\beta} \cdot \left( \log \alpha + \log \big(\sum_{t_{j} \in \mathcal{H}_{m-1}} e^{\beta \cdot (t_{j}-t_{m-1})} \big) - \log \Big( \frac{c_{p}}{c_{cs}} - \lambda_{0}\Big) \right)
\end{align*}

    
\begin{corollary}
When the parameters of the Hawkes process are such that $c_{p} - (c_{cs} \cdot \lambda(x | \mathcal{H})) = 0$ has a solution, the optimal policy has a history independent lower bound,  and an upper bound expressed as follows
\begin{align*}
\tau_{\text{opt}, \mathcal{H}} & \geq \frac{1}{\beta} \cdot \left(\log \alpha - \log \Big(\frac{c_{p}}{c_{cs}} - \lambda_{0}\Big)\right) \\
\tau_{\text{opt}, \mathcal{H}} & \leq \frac{1}{\beta} \cdot \left(\log \alpha + \log \delta + 1 - \log \Big(\frac{c_{p}}{c_{cs}} - \lambda_{0}\Big)\right)
\end{align*}
\noindent where $\delta$ satisfies
\begin{equation*}
\sum_{i=m-\delta}^{m-1} e^{\beta \cdot (t_i - t_{m-1})} 
\geq \frac{1}{2} \sum_{i=1}^{m-1} e^{\beta \cdot (t_i - t_{m-1})} 
\end{equation*}
\end{corollary}


\begin{proof}
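For notational convenience, let the history be $\mathcal{H} = \{t_{1}, t_{2}, \ldots, t_{m}\}$, so that $t_{m}$ is the most recent arrival. We can rewrite the formula for the optimal policy for a Hawkes process with this history as: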
 
\begin{align*}
\tau_{\text{opt}, \mathcal{H}} & = t_{\text{opt},\mathcal{H}} - t_{m} \\
& = \frac{1}{\beta} \cdot \left(\log \alpha + \log \left(\sum_{t_{j} \in \mathcal{H}} e^{\beta \cdot t_{j}}\right) - \log \Big(\frac{c_{p}}{c_{cs}} - \lambda_{0}\Big)\right) - t_{m} \\
& = \frac{1}{\beta} \cdot \left(\log \alpha + \log \left( e^{\beta \cdot t_{1}} + e^{\beta \cdot t_{2}} + \cdots + e^{\beta \cdot t_{m}}\right) - \log \Big(\frac{c_{p}}{c_{cs}} - \lambda_{0}\Big)\right) - t_{m}\\
& = \frac{1}{\beta} \cdot \left(\log \alpha + \log \left( (e^{\beta \cdot t_{m}}) \cdot ( e^{\beta \cdot (t_{1} - t_{m})} + e^{\beta \cdot (t_{2} - t_{m})} + \cdots + e^{\beta \cdot (t_{m} - t_{m})})\right) - \log \Big(\frac{c_{p}}{c_{cs}} - \lambda_{0}\Big)\right) - t_{m}\\
& = \frac{1}{\beta} \cdot \left(\log \alpha + \log \left( e^{\beta \cdot (t_{1} - t_{m})} + e^{\beta \cdot (t_{2} - t_{m})} + \cdots + e^{\beta \cdot (t_{m} - t_{m})}\right) - \log \Big(\frac{c_{p}}{c_{cs}} - \lambda_{0}\Big)\right)\\
& = \frac{1}{\beta} \cdot \left(\log \alpha + \log \left( e^{\beta \cdot (t_{1} - t_{m})} + e^{\beta \cdot (t_{2} - t_{m})} + \cdots + 1 \right) - \log \Big(\frac{c_{p}}{c_{cs}} - \lambda_{0}\Big)\right)\\
\end{align*}

This has three terms, two of which are independent of the history.  Since the history-dependent sum is at least 1 (its last term equals 1), its logarithm is non-negative. Thus we can obtain a lower bound on the optimal policy for \emph{any} history as $\tau_{\text{opt}, \mathcal{H}} \geq \frac{1}{\beta} \cdot \left(\log \alpha - \log \Big(\frac{c_{p}}{c_{cs}} - \lambda_{0}\Big)\right)$.  In fact, this is the optimal policy for the empty history. 

For the term that depends on history, all exponents are non-positive, so each of the $m$ terms of the sum is at most 1.  This yields a trivial upper bound of $\tau_{\text{opt}, \mathcal{H}} \leq \frac{1}{\beta} \cdot \left(\log \alpha + \log m - \log \Big(\frac{c_{p}}{c_{cs}} - \lambda_{0}\Big)\right)$. 
While it grows slowly due to the log,
this bound is unappealing to apply directly because it grows with the length of the history.  In reality, many of the terms of the sum are close to 0 because $t_i - t_m$ is very negative for $t_i$ substantially in the past.

To get a better estimate, let $\delta$ be such that 
\begin{equation*}
\sum_{i=m-\delta+1}^m e^{\beta \cdot (t_i - t_m)} 
\geq \frac{1}{2} \sum_{i=1}^m e^{\beta \cdot (t_i - t_m)} 
\end{equation*}

That is, the most recent $\delta$ arrivals provide at least half the total weight.  This can be thought of as only having $\delta$ arrivals that are recent enough to matter.  Since each of these $\delta$ terms is at most 1, the full sum is at most $2\delta$, and $\log (2\delta) = \log \delta + \log 2 \leq \log \delta + 1$.  Then we have the 
upper bound of $\tau_{\text{opt}, \mathcal{H}} \leq \frac{1}{\beta} \cdot \left(\log \alpha + \log \delta + 1 - \log \Big(\frac{c_{p}}{c_{cs}} - \lambda_{0}\Big)\right)$.  
\end{proof}

\section{Omitted Figures from Section 5.1}

\begin{figure*}[h!]
    \centering
    \begin{subfigure}[t]{0.3\textwidth}
        \centering
        \includegraphics[height=1.3in]{images/Simulation_Alpha1.png}
        \caption{$\alpha = 0.8$}
    \end{subfigure}
    ~ 
    \begin{subfigure}[t]{0.3\textwidth}
        \centering
        \includegraphics[height=1.3in]{images/Simulation_Alpha2.png}
        \caption{$\alpha = 1.4$}
    \end{subfigure}
    ~
    \begin{subfigure}[t]{0.3\textwidth}
        \centering
        \includegraphics[height=1.3in]{images/Simulation_Alpha3.png}
        \caption{$\alpha = 2.0$}
    \end{subfigure}    
    \caption{Plots of average costs for policies when $\alpha$ is increased, given $\lambda_{0} = 0.6, \beta = 2.4, c_{p} = 1.0, c_{cs} = 1.25$}
    \label{cost_compare_alpha}
\end{figure*}



\begin{figure*}[h!]
    \centering
    \begin{subfigure}[t]{0.3\textwidth}
        \centering
        \includegraphics[height=1.3in]{images/Simulation_Lambda0_1.png}
        \caption{$\lambda_{0} = 0.5$}
    \end{subfigure}
    ~ 
    \begin{subfigure}[t]{0.3\textwidth}
        \centering
        \includegraphics[height=1.3in]{images/Simulation_Lambda0_2.png}
        \caption{$\lambda_{0} = 0.75$}
    \end{subfigure}
    ~
    \begin{subfigure}[t]{0.3\textwidth}
        \centering
        \includegraphics[height=1.3in]{images/Simulation_Lambda0_3.png}
        \caption{$\lambda_{0} = 1.0$}
    \end{subfigure}    
    \caption{Plots of average costs for policies when $\lambda_{0}$ is increased, given $\alpha = 1.2, \beta = 2.4, c_{p} = 1.0, c_{cs} = 1.25$}
    \label{cost_compare_lambda}
\end{figure*}


\begin{figure*}[h!]
    \centering
    \begin{subfigure}[t]{0.3\textwidth}
        \centering
        \includegraphics[height=1.3in]{images/Simulation_Beta1.png}
        \caption{$\beta = 1.8$}
    \end{subfigure}
    ~ 
    \begin{subfigure}[t]{0.3\textwidth}
        \centering
        \includegraphics[height=1.3in]{images/Simulation_Beta2.png}
        \caption{$\beta = 2.4$}
    \end{subfigure}
    ~
    \begin{subfigure}[t]{0.3\textwidth}
        \centering
        \includegraphics[height=1.3in]{images/Simulation_Beta3.png}
        \caption{$\beta = 3.0$}
    \end{subfigure}    
    \caption{Plots of average costs for policies when $\beta$ is increased, given $\lambda_{0} = 0.6, \alpha = 1.2, c_{p} = 1.0, c_{cs} = 1.25$}
    \label{cost_compare_beta}
\end{figure*}


These additional figures demonstrate the robustness of the performance of Optimized-TTL with respect to a range of parameters. In them, we examine how the average costs behave with respect to the Hawkes process parameters $\lambda_{0}, \; \alpha, \text{ and} \; \beta$ while holding the costs fixed. Figure~\ref{cost_compare_alpha} shows the behavior of the average cost of the policies for different values of $\alpha$ of the Hawkes process. We see that as $\alpha$ increases, the average length of the optimal keep-alive window increases. This is because for higher values of $\alpha$, the intensity of subsequent arrivals will be larger, making longer keep-alive windows more desirable. This intuition can be made more precise with Corollary~\ref{optimal_decreasing} since $g(x|\mathcal{H}_{m-1}) = (1 - F(x|\mathcal{H}_{m-1})) \cdot (c_{p} - c_{cs}\lambda(x|\mathcal{H}_{m-1}))$. Increasing $\alpha$ increases $\lambda(x|\mathcal{H}_{m-1})$, causing $g(x|\mathcal{H}_{m-1})$ to be more negative. Therefore, the point $\tau_{\text{opt}}$ where $g(x|\mathcal{H}_{m-1})$ changes sign from negative to positive is larger for a larger $\alpha$. The behavior of the average costs of the policies when $\lambda_{0}$ increases is similar to that of $\alpha$, as shown in Figure~\ref{cost_compare_lambda}. From Figure~\ref{cost_compare_beta} we see that as $\beta$ increases, the average length of the optimal keep-alive window decreases. The decay rate of the arrivals' influence is larger for a larger $\beta$, which makes shorter keep-alive windows preferable. This connects to Corollary~\ref{optimal_decreasing}, where a higher value of $\beta$ causes $g(x|\mathcal{H}_{m-1}) = (1 - F(x|\mathcal{H}_{m-1})) \cdot (c_{p} - c_{cs}\lambda(x|\mathcal{H}_{m-1}))$ to change from negative to positive earlier.


\section{Extension: worst-case guarantees for Hawkes processes}

We know from Theorem~\ref{opt_policy} that computing the optimal keep-alive policy requires the history $\mathcal{H}_{m-1}$ of the previous $m-1$ invocations. As described in Section 4.2, the computational complexity of the optimal policy increases as the history of invocations grows. Hence, we propose history independent policies that do not require any information regarding past arrival requests. This problem is similar in spirit to the \textit{Ski Rental} problem in online algorithms, where the customer can buy an item for \$$B$ or rent the item for \$$R$ per unit of time. There, a 2-approximation results from renting until the cost of buying has been paid in rental fees: if the input ends during the rental period, the policy was optimal; otherwise, buying immediately would have been optimal, so the policy overpaid by a factor of 2.  Similarly, in our setting a fixed keep-alive policy can achieve a 2-approximation (Theorem~\ref{fixed_policy}). This bound does not use any information about the parameters of the Hawkes process. In Theorem~\ref{approx_policy}, we propose a history independent approximate policy that requires only the parameters of the Hawkes process, and approximates the optimal cost by a factor of $ \displaystyle 1 + \left( \frac{1}{ \frac{c_{p}}{c_{cs}} \cdot \tau_{\text{opt}, \mathcal{H} = \phi} + 1} \right)^{\frac{1}{2}} $. Both results follow from the following lemma, which bounds the performance of arbitrary history independent keep-alive policies.  


\begin{lemma} \label{simple_policy}
A policy with keep-alive window $\tau$ which does not depend on the history of arrivals of invocations has cost at most \quad $\displaystyle \max \Big\{\frac{c_{p} \cdot \tau}{c_{p} \cdot \tau_{\text{opt}, \mathcal{H} = \phi} +c_{cs}} + 1, 1 + \frac{c_{cs}}{c_{p} \cdot \tau} \Big\} \;$ times the cost of the optimal policy $\tau_{\text{opt}, \mathcal{H}_{m-1}}$ for any history $\mathcal{H}_{m-1}$. 
\end{lemma} 



\begin{proof}
Given history of application invocations $\mathcal{H}_{m-1}$, we denote the length of the optimal keep-alive window  by $\tau_{\text{opt}, \mathcal{H}_{m-1}}$. Let $\tau$ denote the length of a history independent policy. We examine the upper bound of the ratio of the cost of the history independent policy to the cost of the optimal policy when the application is invoked at the $m$-th inter-arrival $x_{m}$, that is, $\displaystyle \frac{cost(x_{m}, \tau)}{cost(x_{m}, \tau_{\text{opt}, \mathcal{H}_{m-1}})}$, where $x_{m} = t_{m} - t_{m-1}$ is the length of the $m$-th inter-arrival. There are three possibilities when comparing keep-alive window $\tau$ with $\tau_{\text{opt}, \mathcal{H}_{m-1}}$. They are,
\begin{enumerate}
    \item $\tau \; = \; \tau_{\text{opt}, \mathcal{H}_{m-1}}$
    \item $\tau \;  < \; \tau_{\text{opt}, \mathcal{H}_{m-1}}$
    \item $\tau \;  > \; \tau_{\text{opt}, \mathcal{H}_{m-1}}$
\end{enumerate}

We examine the upper bound of the ratio of the cost of the history independent policy to the cost of the optimal policy for each case listed above.

\noindent \textbf{Case 1}: When $\tau = \; \tau_{\text{opt}, \mathcal{H}_{m-1}}$, both the history independent policy and the optimal policy have the same cost. That is,
\begin{equation*}
    cost(x_{m}, \tau) \; = \; cost(x_{m}, \tau_{\text{opt}, \mathcal{H}_{m-1}})
\end{equation*}

\noindent \textbf{Case 2}: When $\tau \; < \; \tau_{\text{opt}, \mathcal{H}_{m-1}}$, the cost of the policies can be compared based on when the application invocation occurs.

\begin{itemize}
    \item When the application invocation occurs before the end of the history independent keep-alive window, that is, $x_{m} \leq \tau$,  then both policies encounter a warm start. Hence, both policies have the same cost. That is, 
    \begin{equation*}
    cost(x_{m}, \tau) \; = \; cost(x_{m}, \tau_{\text{opt}, \mathcal{H}_{m-1}}) = c_{p} \cdot x_{m}
    \end{equation*}
    \item When the application invocation is after the keep-alive window $\tau$, but before the end of the optimal policy, that is, $\tau < \; x_{m} \; \leq \; \tau_{\text{opt}, \mathcal{H}_{m-1}}$, then the history independent policy encounters a cold start whereas the optimal policy experiences a warm start. The ratio of the cost of the history independent policy to the cost of the optimal policy is expressed as follows, 
    \begin{align*}
    \frac{cost(x_{m}, \tau)}{cost(x_{m}, \tau_{\text{opt}, \mathcal{H}_{m-1}})} & = \; \frac{c_{p} \cdot \tau + c_{cs}}{c_{p} \cdot x_{m}} \\
    & \leq \; \frac{c_{p} \cdot \tau + c_{cs}}{c_{p} \cdot \tau} \quad \qquad \text{(6)}\\
    & = \; 1 + \frac{c_{cs}}{c_{p} \cdot \tau}
    \end{align*}
    In Equation (6) above, we use the fact that the cost of the optimal policy in this scenario, $c_{p} \cdot x_{m}$, is smallest when the application is invoked just after the end of the fixed keep-alive window, that is, as $x_{m}$ approaches $\tau$ from above.
    \item When the application invocation is after the optimal keep-alive policy, that is, $\tau \; < \; \tau_{\text{opt}, \mathcal{H}_{m-1}} \; <\; x_{m}$, then both policies encounter a cold start. Here, the cost of the optimal policy is larger than the cost of the history independent keep-alive policy because the cost of a policy when a cold start occurs is proportional to the length of the keep-alive window. That is,
    \begin{align*}
    \tau \; & \leq \; \tau_{\text{opt}, \mathcal{H}_{m-1}} \\
    \implies c_{p} \cdot \tau  + c_{cs} \; & \leq \; c_{p} \cdot \tau_{\text{opt}, \mathcal{H}_{m-1}} + c_{cs} \\
    \implies cost(x_{m}, \tau) \; & \; \leq \; cost(x_{m}, \tau_{\text{opt}, \mathcal{H}_{m-1}})
    \end{align*}
\end{itemize}

\noindent \textbf{Case 3}: When $\tau \; > \; \tau_{\text{opt}, \mathcal{H}_{m-1}}$, the cost of the policies can be compared based on the arrival of application invocations.
\begin{itemize}
    \item When the application is invoked before the end of the optimal keep-alive window, that is, $x_{m} \; \leq \; \tau_{\text{opt},\mathcal{H}_{m-1}}$, then both the policies encounter a warm start. Hence, both the policies have the same costs. That is,
    \begin{equation*}
    cost(x_{m}, \tau) = \; cost(x_{m}, \tau_{\text{opt}, \mathcal{H}_{m-1}}) = \; c_{p} \cdot x_{m} 
    \end{equation*}
    \item When the application invocation occurs after the optimal keep-alive window, that is, $x_{m} \; > \; \tau_{\text{opt}, \mathcal{H}_{m-1}}$, then the optimal policy experiences a cold start. The ratio of the cost of the history independent policy to the cost of the optimal policy is upper bounded when the history independent policy has a cold start. We compute the upper bound on the ratio of costs as follows,
    \begin{align*}
    \frac{cost(x_{m}, \tau)}{cost(x_{m}, \tau_{\text{opt},\mathcal{H}_{m-1}})} \;& \leq \; \frac{c_{p} \cdot \tau + c_{cs}}{c_{p} \cdot \tau_{\text{opt}, \mathcal{H}_{m-1}}+ c_{cs}} \\
    %\; & \leq \; \frac{c_{p} \cdot \tau}{c_{p} \cdot \tau_{\text{opt}, \mathcal{H}_{m-1}}+ c_{cs}} + \frac{c_{cs}}{c_{p} \cdot \tau_{\text{opt}, \mathcal{H}_{m-1}}+ c_{cs}} \\
    %\; & \leq \; \frac{c_{p} \cdot \tau}{c_{p} \cdot 0 + c_{cs}} + \frac{c_{cs}}{c_{p} \cdot 0 + c_{cs}} \quad \qquad \text{(b)}\\
    \; & \leq \; \frac{c_{p} \cdot \tau}{c_{p} \cdot \tau_{\text{opt}, \mathcal{H} = \phi} + c_{cs}} + 1  \quad \qquad \text{(7)}
    \end{align*}
    where in Equation (7) we have substituted $\mathcal{H} = \phi$ to compute the upper bound.
\end{itemize}

We have now established two separate upper bounds for cases 2 and 3. The approximation factor of the history independent policy with respect to the optimal policy for an arbitrary history is the maximum of the two upper bounds, that is,\\ $\displaystyle \max \Big\{\frac{c_{p} \cdot \tau}{ c_{p} \cdot \tau_{\text{opt}, \mathcal{H} = \phi} + c_{cs}} + 1, 1 + \frac{c_{cs}}{c_{p} \cdot \tau} \Big\}$.

\end{proof}


\subsection{Fixed Keep-Alive Policy}


We first show our Ski rental style result.

\begin{theorem}
The cost of the fixed keep-alive policy $\tau_{\text{fixed}}=c_{cs}/c_{p}$ is at most twice the cost of the optimal keep-alive policy $\tau_{\text{opt}, \mathcal{H}_{m-1}}$. That is, when a function is invoked at time $t_{m}$ after previous $m-1$ arrivals we have, $cost(x_{m}, \tau_{\text{fixed}}) \; \leq \;  2 \cdot cost(x_{m}, \tau_{\text{opt}, \mathcal{H}_{m-1}})$, where $x_{m} = t_{m} - t_{m-1}$ is the length of the $m$-th inter-arrival, and $c_{p} \cdot \tau_{\text{fixed}} = c_{cs}$.
\label{fixed_policy}
\end{theorem} 

While this has a simple direct proof, we illustrate how it follows from Lemma~\ref{simple_policy}.

\begin{proof}

From Lemma \ref{simple_policy}, we know that the approximation factor of the fixed policy with respect to the optimal policy is $\displaystyle \max \Big\{\frac{c_{p} \cdot \tau_{\text{fixed}}}{c_{p} \cdot \tau_{\text{opt}, \mathcal{H} = \phi}+ c_{cs}} + 1, 1 + \frac{c_{cs}}{c_{p} \cdot \tau_{\text{fixed}}} \Big\}$. The upper bound $\displaystyle \frac{c_{p} \cdot \tau_{\text{fixed}}}{ c_{p} \cdot \tau_{\text{opt}, \mathcal{H} = \phi}+ c_{cs}} + 1$ can be relaxed further to $\displaystyle \frac{c_{p} \cdot \tau_{\text{fixed}}}{ c_{p} \cdot \tau_{\text{opt}, \mathcal{H} = \phi}+ c_{cs}} + 1 \; \leq \; \frac{c_{p} \cdot \tau_{\text{fixed}}}{c_{cs}} + 1$ by substituting $\tau_{\text{opt}, \mathcal{H} = \phi} = 0$. We take this worst case because the fixed policy uses no information about the arrival process, so its guarantee must hold for every possible value of $\tau_{\text{opt}, \mathcal{H} = \phi} \geq 0$.


The best length of the keep-alive window for the fixed policy is the length which minimizes the maximum of the two upper bounds on the ratio of the cost of the fixed policy and the optimal policy. Mathematically, the best length of the fixed policy is expressed as 
\begin{equation*}
    \arg \min_{\tau_{\text{fixed}}} \max \Big\{\frac{c_{p} \cdot \tau_{\text{fixed}}}{c_{cs}} + 1, 1 + \frac{c_{cs}}{c_{p} \cdot \tau_{\text{fixed}}} \Big\}
\end{equation*}

We obtain the length of the fixed policy by solving the above expression,

\begin{align*}
    \frac{c_{p} \cdot \tau_{\text{fixed}}}{c_{cs}} + 1 \; & = \; 1 + \frac{c_{cs}}{c_{p}\cdot \tau_{\text{fixed}}}\\
    \frac{c_{p} \cdot \tau_{\text{fixed}}}{c_{cs}} \; & = \;  \frac{c_{cs}}{c_{p}\cdot \tau_{\text{fixed}}} \\
    \big(c_{p} \cdot \tau_{\text{fixed}}\big)^{2} \; & = \; (c_{cs})^{2}\\
    c_{p} \cdot \tau_{\text{fixed}} \; & = \;  c_{cs} 
\end{align*}

Substituting this back to the upper bound on the ratio of the cost of the fixed policy to the cost of the optimal policy, we get
\begin{align*}
    \frac{cost(x_{m}, \tau_{\text{fixed}})}{cost(x_{m}, \tau_{\text{opt}, \mathcal{H}_{m-1}})} \; & \leq \; 1 + \frac{c_{p} \cdot \tau_{\text{fixed}}}{c_{cs}}\\
    & = \; 1  + \frac{c_{cs}}{c_{cs}} \\
    & = \; 2
\end{align*}
\end{proof}

\subsection{History Independent Keep-Alive Policies}


More generally, we can take advantage of Lemma~\ref{simple_policy} to achieve a tighter bound that makes use of the parameters of the Hawkes process only through the policy they induce given the empty history.

\begin{theorem} \label{approx_policy}
There exists a policy with keep-alive window $\tau_{\text{approx}}$ which does not require the history of arrivals of application invocations with its cost upper bounded by  a factor of $ \; 1 + \left(\frac{1}{ \frac{c_{p}}{c_{cs}} \cdot \tau_{\text{opt}, \mathcal{H} = \phi} + 1} \right)^{\frac{1}{2}}$ with respect to the cost of the optimal keep-alive policy $\tau_{\text{opt}, \mathcal{H}_{m-1}}$. In other words, for a given history of invocations $\mathcal{H}_{m-1}$, when the application invocation has an inter-arrival of length $x_{m}$,  $ \displaystyle \frac{cost(x_{m}, \tau_{\text{approx}})}{cost(x_{m}, \tau_{\text{opt}, \mathcal{H}_{m-1}})} \; \leq 1 + \left( \frac{1}{ \frac{c_{p}}{c_{cs}} \cdot \tau_{\text{opt}, \mathcal{H} = \phi} + 1} \right)^{\frac{1}{2}} \leq 2$.
\end{theorem}

\begin{proof}

From Lemma \ref{simple_policy}, we know that the approximation factor of the approximate policy with respect to the optimal policy is $\displaystyle \max \Big\{\frac{c_{p} \cdot \tau_{\text{approx}}}{c_{p} \cdot \tau_{\text{opt}, \mathcal{H} = \phi}+ c_{cs}} + 1, 1 + \frac{c_{cs}}{c_{p} \cdot \tau_{\text{approx}}} \Big\}$. The best length of the keep-alive window of the approximate policy would minimize the maximum of the upper bounds of the ratio of the costs of the approximate policy and the optimal policy. Mathematically, the best approximate policy keep-alive window is expressed as,
\begin{equation*}
    \arg \min_{\tau_{\text{approx}}} \max \Big\{\frac{c_{p} \cdot \tau_{\text{approx}}}{c_{p} \cdot \tau_{\text{opt}, \mathcal{H} = \phi} +c_{cs}} + 1, 1 + \frac{c_{cs}}{c_{p} \cdot \tau_{\text{approx}}} \Big\}
\end{equation*}

\noindent We can obtain the length of the approximate keep-alive window by solving the above expression. 
\begin{align*}
    \frac{c_{p} \cdot \tau_{\text{approx}}}{c_{p} \cdot \tau_{\text{opt}, \mathcal{H} = \phi} + c_{cs}} + 1  & =  1 + \frac{c_{cs}}{c_{p} \cdot \tau_{\text{approx}}} \\
    \frac{c_{p} \cdot \tau_{\text{approx}}}{c_{p} \cdot \tau_{\text{opt}, \mathcal{H} = \phi} + c_{cs}}   & =  \frac{c_{cs}}{c_{p} \cdot \tau_{\text{approx}}} \\
    (c_{p} \cdot \tau_{\text{approx}})^{2} & = c_{cs} \cdot (c_{p} \cdot \tau_{\text{opt}, \mathcal{H} = \phi} + c_{cs}) \\
    \tau_{\text{approx}} & = \left( \frac{c_{cs}}{c_{p}} \cdot \Big( \tau_{\text{opt}, \mathcal{H} = \phi} + \frac{c_{cs}}{c_{p}} \Big) \right)^{\frac{1}{2}}
\end{align*}

\noindent Substituting $\tau_{\text{approx}}$ in the expression for the upper bound of the ratio of the cost of the approximate policy to the cost of the optimal policy, we get


\begin{align*}
    \frac{cost(x_{m}, \tau_{\text{approx}})}{cost(x_{m}, \tau_{\text{opt}, \mathcal{H}_{m-1}})} & \leq \;  1 + \frac{c_{p} \cdot \tau_{\text{approx}}}{c_{p} \cdot \tau_{\text{opt}, \mathcal{H} = \phi} + c_{cs}} \\
    & = \;  1 + \frac{\tau_{\text{approx}}}{ \tau_{\text{opt}, \mathcal{H} = \phi} + \displaystyle \frac{c_{cs}}{c_{p}}} \\
    & = \;  1 + \frac{\displaystyle \left( \frac{c_{cs}}{c_{p}} \cdot \Big( \tau_{\text{opt}, \mathcal{H} = \phi} + \frac{c_{cs}}{c_{p}} \Big) \right)^{\frac{1}{2}}}{ \tau_{\text{opt}, \mathcal{H} = \phi} + \displaystyle \frac{c_{cs}}{c_{p}}}\\[6 pt]
    & = \; 1 + \left( \frac{c_{cs}}{c_{p} \cdot \tau_{\text{opt}, \mathcal{H} = \phi} + c_{cs}} \right)^{\frac{1}{2}} \\
    & = \; 1 + \left( \frac{1}{ \displaystyle \frac{c_{p}}{c_{cs}} \cdot \tau_{\text{opt}, \mathcal{H} = \phi} + 1} \right)^{\frac{1}{2}} \\
    & \leq \; 1 + 1 = 2
\end{align*}


\end{proof}

\subsection{Application to Poisson and Hawkes Processes}
As previously observed, the fixed policy from Theorem~\ref{fixed_policy} is independent of the process and so has a keep alive window of $\tau_{\text{fixed}}=c_{cs}/c_{p}$ for both Poisson and Hawkes processes. The behavior of $\tau_{\text{approx}}$
from Theorem~\ref{approx_policy} is more interesting.  For Poisson processes, we know that $\tau_{\text{opt}, \mathcal{H} = \phi}$ is either 0 or $\infty$.  In the former case, $\tau_{\text{approx}} = \tau_{\text{fixed}}$ and the approximation ratio of 2 is tight.  (Consider any input where $x_{m} > c_{cs} / c_{p}$.)  In the latter case, however, $\tau_{\text{approx}} = \infty =  \tau_{\text{opt}}$ and so the approximation ratio is 1.

For Hawkes processes, the length of the keep-alive window for the approximate policy is, 
\begin{align*}
    \tau_{\text{approx}}  = \left( \frac{c_{cs}}{c_{p}} \cdot \Big( \tau_{\text{opt}, \mathcal{H} = \phi} + \frac{c_{cs}}{c_{p}} \Big) \right)^{\frac{1}{2}} 
\end{align*}

From the more general expression for $\tau_{\text{opt}, \mathcal{H}}$ for Hawkes processes,
\begin{align*}
    \tau_{\text{opt}, \mathcal{H} = \phi} = \frac{1}{\beta} \cdot \Big( \log \alpha - \log \big( \frac{c_{p}}{c_{cs}} - \lambda_{0}\big) \Big)
\end{align*}

Combining these, we have 
\begin{align*}
    \tau_{\text{approx}}  = \left( \frac{c_{cs}}{c_{p}} \cdot \Bigg( \frac{1}{\beta} \cdot \Big( \log \alpha - \log \big( \frac{c_{p}}{c_{cs}} - \lambda_{0}\big) \Big) + \frac{c_{cs}}{c_{p}} \Bigg) \right)^{\frac{1}{2}} 
\end{align*}
This illustrates how $\tau_{\text{opt}, \mathcal{H} = \phi}$ implicitly brings the parameters of the Hawkes process into $\tau_{\text{approx}}$.


\subsection{Performances on Simulated Hawkes Processes} 


We present simulations similar to those from before, with two additional policies included (the approximate policy and the fixed policy of length $c_{cs}/c_{p}$).   Generally, they demonstrate the conservative approach these policies take to achieve their worst case guarantees.  In general, both perform worse than both the optimal policy (blue) and best fixed policy (red).  The relative performance of the two policies is not consistent, with each better in some cases.  The gap between the yellow line and the blue line is what provided room for the improvement of the Optimized-TTL policy over the fixed policy.

% Results and Insights
% Figures

\begin{figure*}[h!]
    \centering
    \begin{subfigure}[t]{0.45\textwidth}
        \centering
        \includegraphics[height=2.0in]{images/Cost_Simulation_Instance1.png}
        \caption{$\lambda_{0} = 0.6, \alpha = 1.2, \beta = 2.4, c_{p} = 1.0, c_{cs} = 1.25$}
    \end{subfigure}
    ~ 
    \begin{subfigure}[t]{0.5\textwidth}
        \centering
        \includegraphics[height=2.0in]{images/Cost_Simulation_Instance2.png}
        \caption{$\lambda_{0} = 0.45, \alpha = 0.8, \beta = 1.2, c_{p} = 1.0, c_{cs} = 3.0$}
    \end{subfigure}
    \caption{Plots of average cost comparisons between different policies for cases where $c_{p} \; \leq c_{cs}$}
    \label{cost_compare_sim1b}
\end{figure*}

\begin{figure*}[h!]
    \centering
    \begin{subfigure}[t]{0.45\textwidth}
        \centering
        \includegraphics[height=2.0in]{images/Cost_Simulation_Instance3.png}
        \caption{$\lambda_{0} = 0.65, \alpha = 1.4, \beta = 2.2, c_{p} = 1.0, c_{cs} = 0.5$}
    \end{subfigure}
    ~ 
    \begin{subfigure}[t]{0.5\textwidth}
        \centering
        \includegraphics[height=2.0in]{images/Cost_Simulation_Instance4.png}
        \caption{$\lambda_{0} = 0.5, \alpha = 0.6, \beta = 1.5, c_{p} = 1.0, c_{cs} = 0.4$}
    \end{subfigure}
    \caption{Plots of average cost comparisons between different policies for cases where $c_{p} \; \geq c_{cs}$}
    \label{cost_compare_sim2b}
\end{figure*}

As Figures \ref{cost_compare_sim1b}  and \ref{cost_compare_sim2b} illustrate, $\tau_{\text{approx}}$ is always more conservative than $\tau_{\text{fixed}}$ in that it chooses a weakly longer window length (which is how it achieves its stronger worst case performance guarantee).  Despite this, their average performance is often quite similar.  In some situations, like Figure \ref{cost_compare_sim1b} (a), both policies are excessively conservative and so the extra conservatism of $\tau_{\text{approx}}$ causes it to perform worse.  This effect is bounded, however, because in situations like Figure \ref{cost_compare_sim2b} (b) where $\tau_{\text{opt}, \mathcal{H} = \phi}$ is close to zero they become essentially the same policy.  In contrast, when the optimal window length is long, like Figure \ref{cost_compare_sim1b} (b), $\tau_{\text{approx}}$ performs better.  Again, however, the effect is small, this time because when optimal window lengths are relatively long it is typically unlikely that it will actually be a long time until the next arrival.


\begin{figure*}[h!]
    \centering
    \begin{subfigure}[t]{0.32\textwidth}
        \centering
        \includegraphics[height=1.4in]{images/Cost_Simulation_alpha1.png}
        \caption{$\alpha = 0.8$}
    \end{subfigure}
    ~ 
    \begin{subfigure}[t]{0.32\textwidth}
        \centering
        \includegraphics[height=1.4in]{images/Cost_Simulation_alpha2.png}
        \caption{$\alpha = 1.4$}
    \end{subfigure}
    ~
    \begin{subfigure}[t]{0.32\textwidth}
        \centering
        \includegraphics[height=1.4in]{images/Cost_Simulation_alpha3.png}
        \caption{$\alpha = 2.0$}
    \end{subfigure}    
    \caption{Plots of average costs for policies when $\alpha$ is increased, given $\lambda_{0} = 0.6, \beta = 2.4, c_{p} = 1.0, c_{cs} = 1.25$}
    \label{cost_compare_alphab}
\end{figure*}

\begin{figure*}[h!]
    \centering
    \begin{subfigure}[t]{0.32\textwidth}
        \centering
        \includegraphics[height=1.4in]{images/Cost_Simulation_lambda1.png}
        \caption{$\lambda_{0} = 0.5$}
    \end{subfigure}
    ~ 
    \begin{subfigure}[t]{0.32\textwidth}
        \centering
        \includegraphics[height=1.4in]{images/Cost_Simulation_lambda2.png}
        \caption{$\lambda_{0} = 0.75$}
    \end{subfigure}
    ~
    \begin{subfigure}[t]{0.32\textwidth}
        \centering
        \includegraphics[height=1.4in]{images/Cost_Simulation_lambda3.png}
        \caption{$\lambda_{0} = 1.0$}
    \end{subfigure}    
    \caption{Plots of average costs for policies when $\lambda_{0}$ is increased, given $\alpha = 1.2, \beta = 2.4, c_{p} = 1.0, c_{cs} = 1.25$}
    \label{cost_compare_lambdab}
\end{figure*}

\begin{figure*}[h!]
    \centering
    \begin{subfigure}[t]{0.32\textwidth}
        \centering
        \includegraphics[height=1.4in]{images/Cost_Simulation_beta1.png}
        \caption{$\beta = 1.8$}
    \end{subfigure}
    ~ 
    \begin{subfigure}[t]{0.32\textwidth}
        \centering
        \includegraphics[height=1.4in]{images/Cost_Simulation_beta2.png}
        \caption{$\beta = 2.4$}
    \end{subfigure}
    ~
    \begin{subfigure}[t]{0.32\textwidth}
        \centering
        \includegraphics[height=1.4in]{images/Cost_Simulation_beta3.png}
        \caption{$\beta = 3.0$}
    \end{subfigure}    
    \caption{Plots of average costs for policies when $\beta$ is increased, given $\lambda_{0} = 0.6, \alpha = 1.2, c_{p} = 1.0, c_{cs} = 1.25$}
    \label{cost_compare_betab}
\end{figure*}



\subsection{Azure Datatrace Performance Results} \label{azure_results_appendix}

% Describe the insights of results
Figure \ref{azure_fig1} plots the trade-off curve between the average number of cold starts per application vs the normalized wasted memory for optimal, optimized-TTL, approximate and fixed policies. In Figure \ref{azure_fig1} (a), the trade-off curve is plotted when including only those applications that follow the Hawkes process during day 9. 
The trade-off Pareto curve of Figure \ref{azure_fig1} (b) plots the average number of cold starts per application vs the normalized memory for all applications invoked during day 9. The plots in Figure \ref{azure_fig1} show that the trade-off Pareto curve of the approximate policy is very slightly better than the fixed policy, but substantially worse than the optimal policy, and thus Optimized-TTL as well.



\begin{figure*}[h!]
    \centering
    \begin{subfigure}[t]{0.5\textwidth}
        \centering
        \includegraphics[height=2.1in]{images/TradeoffCurve_fitted_apps_all_policies_updated_day9.png}
        \caption{Evaluation only on Hawkes process applications}
    \end{subfigure}%
    ~ 
    \begin{subfigure}[t]{0.5\textwidth}
        \centering
        \includegraphics[height=2.1in]{images/TradeoffCurve_all_apps_all_policies_updated_day9.png}
        \caption{Evaluation on all applications}
    \end{subfigure}
    \caption{Trade-off curve of average number of cold starts vs normalized wasted memory}
    \label{azure_fig1}
\end{figure*}

\section{Use of separate data for goodness of fit}

The goodness of fit test is known to have a few limitations when the same data is used both to estimate the parameters and to compute the KS statistic. \citet{reynaud2014goodness} show that examining the goodness of fit of the Hawkes process parameters on the same dataset that was used for parameter estimation leads to a high bias. The authors propose sub-sampling as a reasonable solution to this problem. Rather than sub-sampling, we took advantage of additional data we are not currently using (e.g., day 7). \citet{van2016deep, kash2019combining} show a similar problem and solution for training and applying double Deep Q-learning Networks (DQNs). 

We report the results for the Optimized-TTL policy. We refer to the procedure where the goodness of fit is based on arrivals of application invocations on day 7 as ``fix'', whereas the procedure where the goodness of fit is based on arrivals of application invocations on day 8 (the same day on which the parameters are estimated) is referred to as ``no-fix''. To compare the ``fix'' and ``no-fix'' procedures of selecting appropriate Hawkes process applications, we collect the common pool of applications invoked on day 7, day 8 and day 9. The Hawkes process applications in ``fix'' refer to applications where the parameters were estimated on day 8, and the KS test was performed on day 7 of the corresponding applications. The Hawkes process applications in ``no-fix'' refer to applications where the parameters were estimated on day 8, and the KS test was performed on the same day 8 of the corresponding applications (these are the common pool of applications present on day 7 and day 8). The common pool of applications on day 7, day 8, and day 9 contains 14,788 applications. Of these, 3,694 applications fall into the 25th-percentile group that was selected as Hawkes process apps for each procedure (``fix'' and ``no-fix''). The overlap between the two tests, that is, the number of applications that passed both the test on day 7 and the test on day 8, is 2,754, giving an overlap fraction of $2754/3694 = 0.745$. We show the plots of the trade-off curve between the fixed policy and the optimized-TTL policies for the overlapped apps in Figures \ref{azure_fix_test_figures} and \ref{azure_no_fix_test_figures} for the ``fix'' and ``no-fix'' procedures. Figures \ref{azure_fix_test_figures} (a) and \ref{azure_no_fix_test_figures} (a) show the trade-off curves for treated apps, whereas Figures \ref{azure_fix_test_figures} (b) and \ref{azure_no_fix_test_figures} (b) show the trade-off curves for all apps. 
We compute the cold start savings as the area between the optimized-TTL curve and the fixed policy curve divided by the maximum amount of wasted memory. Similarly, the wasted memory savings is the area between the optimized-TTL curve and the fixed policy curve divided by the maximum number of average cold-starts. The corresponding versions of the cold start savings and memory savings for the optimized-TTL policy are given in Table \ref{tab_appendix}. The ``fix'' version shows a slightly weaker performance on the treated apps, but a noticeably better performance on all apps than the ``no-fix'' version.  So we view this as a demonstration that the ``fix'' does improve the selection of which apps to treat as Hawkes process and proceed to test the goodness of fit based on the arrivals of application invocations on day 7. 


% Figures of Tradeoff curves involving common pool of apps for "fix" and "no-fix" procedures

\begin{figure*}[h!]
    \centering
    \begin{subfigure}[t]{0.44\textwidth}
        \centering
        \includegraphics[height=2.3in]{images/TradeoffCurve_among_fitted_apps_common_pool_with_fix_day7_evaluated_day9.png}
        \caption{Hawkes process applications}
    \end{subfigure}%
    ~ 
    \begin{subfigure}[t]{0.44\textwidth}
        \centering
        \includegraphics[height=2.3in]{images/TradeoffCurve_among_all_common_pool_with_fix_day7_evaluated_day9.png}
        \caption{All applications}
    \end{subfigure}
\caption{Trade-off curve for optimized-TTL and fixed policies where goodness of fit is evaluated on day 7}
\label{azure_fix_test_figures}
\end{figure*}


\begin{figure*}[t!]
    \centering    
    \begin{subfigure}[t]{0.44\textwidth}
        \centering
        \includegraphics[height=2.3in]{images/TradeoffCurve_among_fitted_apps_common_pool_no_fix_day8_evaluated_day9.png}
        \caption{Hawkes process applications}
    \end{subfigure}%
    ~ 
    \begin{subfigure}[t]{0.44\textwidth}
        \centering
        \includegraphics[height=2.3in]{images/TradeoffCurve_among_all_common_pool_nofixday8_evaluated_day9.png}
        \caption{All applications}
    \end{subfigure}
   \caption{Trade-off curve for optimized-TTL and fixed policies where goodness of fit is evaluated on day 8}
    \label{azure_no_fix_test_figures}
\end{figure*}


\begin{table*}[t]
\centering
\begin{tabular}{|c|c|c|c|c|}
\hline
Procedure & Avg. Cold Start Savings (Hawkes) & (All) & Avg. Memory Savings (Hawkes) & (All)\\
\hline
Optimized-TTL (fix) & 0.834 &  0.1393 &  0.043  & 0.0085 \\
\hline
Optimized-TTL (no-fix) & 1.037  & 0.0574 & 0.053 & 0.0035 \\
\hline
\end{tabular}
\caption{\label{tab_appendix} Average performance improvement over fixed policy}
\end{table*}

\nobibliography{narayana_621}


\end{document}