slides.tex - hpcs-17-subord - Subordination: Providing Resilience to Simultaneous Failure of Multiple Cluster Nodes

slides.tex (16024B)
      1 % Created 2017-07-14 Пт 11:23
      2 % Intended LaTeX compiler: pdflatex
      3 \documentclass[12pt,aspectratio=169]{beamer}
      4 \usepackage{graphicx}
      5 \usepackage{booktabs}
      6 \usepackage{amsmath}
      7 \usepackage{amssymb}
      8 \usepackage{hyperref}
      9 \usepackage{tikz}
     10 \usetikzlibrary{shapes}
     11 \usepackage{cite}
     12 \usepackage{url}
     13 \usepackage{polyglossia}
     14 \input{preamble}
     15 \AtBeginSection[]{\frame{\sectionpage}}
     16 \usetheme{SaintPetersburg}
     17 
     18 \author{I.\,Gankevich \quad Yu.\,Tipikin \quad V.\,Korkhov}
     19 \date{July, 2017}
     20 \title{Subordination: Providing resilience to simultaneous failure of multiple cluster nodes}
     21 \institute{Saint Petersburg State University}
     22 \setdefaultlanguage{english}
     23 
     24 
     25 \definecolor{spbuGreen}{RGB}{40,160,40}
     26 \definecolor{spbuBlue}{RGB}{40,40,160}
     27 
     28 % physical layer
     29 \tikzset{Node/.style={rectangle,draw=spbuDarkGray,line width=3pt}}
     30 \tikzset{Switch/.style={circle,draw=gray,fill=gray,line width=7pt,solid}}
     31 \tikzset{PhysicalLink/.style={draw=gray,line width=7pt,solid}}
     32 \tikzset{NetworkLink/.style={draw=black,line width=3pt,dashed}}
     33 
     34 % logical layer
     35 \tikzset{Daemon/.style={ellipse,draw=spbuBlue,line width=3pt,solid,scale=0.65}}
     36 \tikzset{DaemonLink/.style={draw=spbuBlue,line width=3pt,solid}}
     37 
     38 % application layer
     39 \tikzset{Process/.style={ellipse,draw=spbuGreen,line width=3pt,solid}}
     40 \tikzset{ProcessEdge/.style={draw=spbuGreen,line width=3pt,solid,->}}
     41 
     42 
     43 \tikzset{Node/.style={rectangle,draw=spbuDarkGray,thick}}
     44 \tikzset{Task/.style={
     45 	ellipse,
     46 	draw=spbuGreen,
     47 	line width=3pt,
     48 	dashed,
     49 	anchor=center,
     50 	text=spbuGreen
     51 }}
     52 \tikzset{Kernel/.style={Task,solid,line width=2pt}}
     53 \tikzset{TaskEdge/.style={
     54 	ProcessEdge,
     55 	draw=spbuGreen,
     56 	solid,
     57 	->,
     58 	line width=2pt
     59 }}
     60 \tikzset{PseudoKernelEdge/.style={draw=spbuGreen,line width=3pt,dashed}}
     61 \tikzset{Label/.style={label distance=0.1cm,text=spbuTerracotta}}
     62 \tikzset{TaskLabel/.style={label distance=0.1cm,text=spbuGreen}}
     63 \tikzset{KernelPool/.style={
     64 	rectangle,
     65 	draw=spbuDarkGray,
     66 	minimum width=4.7cm,
     67 	minimum height=0.9cm,
     68 	line width=2pt,
     69 	thick
     70 }}
     71 
     72 \begin{document}
     73 
     74 \maketitle
     75 
     76 \begin{frame}{Motivation}
     77 	Looking for ways to develop fault-tolerant HPC applications without
     78 	checkpoints.
     79 	\vfill\pause
     80 	The outcome:
     81 
     82 	a scheduler with C++ API which replaces both batch job scheduler and
     83 	message passing interface.
     84 \end{frame}
     85 
     86 \begin{frame}{System architecture}
     87 	\begin{columns}[T]
     88 		\begin{column}{0.2\textwidth}
     89 			\input{tex/cluster-0}
     90 		\end{column}
     91 		\begin{column}{0.2\textwidth}
     92 			\input{tex/frame-0}
     93 		\end{column}
     94 		\begin{column}{0.2\textwidth}
     95 			\input{tex/frame-3}
     96 		\end{column}
     97 		\begin{column}{0.2\textwidth}
     98 			\input{tex/frame-4}
     99 		\end{column}
    100 	\end{columns}
    101 	\vfill
    102 	\begin{center}
    103 		\input{tex/legend}
    104 	\end{center}
    105 \end{frame}
    106 
    107 \begin{frame}{Application architecture}
    108 	\centering
    109 	\includegraphics{build/ppl.pdf}
    110 \end{frame}
    111 
    112 \section{Node discovery algorithm}
    113 
    114 \begin{frame}{Hierarchy of cluster nodes}
    115 	\begin{equation*}
    116 		\forall n_1 \forall n_2 \in \mathcal{N},
    117 		\forall f \colon \mathcal{N} \rightarrow \mathcal{R}^n 
    118 		\Rightarrow (f(n_1) < f(n_2) \Leftrightarrow \neg (f(n_1) \geq f(n_2)))
    119 	\end{equation*}
    120 	Here \(n\) is node rank~--- node's position in the network IP-address
    121 	range.
    122 	\vfill
    123 	\begin{columns}[T]
    124 		\begin{column}{0.35\textwidth}
    125 			\begin{itemize}
    126 				\item Strict total order.
    127 				\item Static node positions in the hierarchy.
    128 			\end{itemize}
    129 		\end{column}
    130 		\begin{column}{0.55\textwidth}
    131 			\begin{tikzpicture}[x=2.5cm,y=-1cm]
    132 				\node[Node] (M) at (2,0) {\texttt{10.0.0.1}};
    133 				\node[Node] (S1) at (1.25,1) {\texttt{10.0.0.2}};
    134 				\node[Node] (S2) at (2.75,1) {\texttt{10.0.0.3}};
    135 				\node[Node] (S3) at (0.75,2) {\texttt{10.0.0.4}};
    136 				\node[Node] (S4) at (1.75,2) {\texttt{10.0.0.5}};
    137 				\node[Node] (S5) at (2.75,2) {\texttt{10.0.0.6}};
    138 				\path[thick] (M) edge (S1);
    139 				\path[thick] (M) edge (S2);
    140 				\path[thick] (S1) edge (S3);
    141 				\path[thick] (S1) edge (S4);
    142 				\path[thick] (S2) edge (S5);
    143 			\end{tikzpicture}
    144 			\begin{center}
    145 				\texttt{fanout=2, nodes=6}
    146 			\end{center}
    147 		\end{column}
    148 	\end{columns}
    149 \end{frame}
    150 
    151 \begin{frame}{Example}
    152 	\framesubtitle{\texttt{fanout=2, nodes=100, network=10.0.0.0/8}}
    153 	\centering
    154 	\includegraphics<1>[width=0.9\linewidth]{build/graph.eps}
    155 	\includegraphics<2>[width=0.9\linewidth,trim={20cm 10cm 10cm 5cm},clip]{build/graph.eps}
    156 \end{frame}
    157 
    158 \begin{frame}{Performance of node discovery algorithm}
    159 	\framesubtitle{4 physical nodes, 400 virtual nodes, fanout=2}
    160 	\centering
    161 	\includegraphics[width=0.8\linewidth]{figures/node-discovery.eps}
    162 \end{frame}
    163 
    164 \begin{frame}{Node discovery algorithm: summary}
    165 	\begin{itemize}
    166 		\item Arranges cluster nodes into the hierarchy with strict total
    167 			order.
    168 		\item Scalable to a large number of cluster nodes.
    169 		\item Simple and reliable.
    170 	\end{itemize}
    171 	\vfill
    172 	\begin{center}
    173 		The purpose of node hierarchy is to distribute the load on the cluster.
    174 	\end{center}
    175 \end{frame}
    176 
    177 \section{Kernel processing algorithm}
    178 
    179 \begin{frame}{Example}
    180 	\begin{tikzpicture}[x=3cm,y=-0.9cm]
    181 		\node (dummy) at (0,0) {};
    182 
    183 		\node<1-5> (kernellabel) at (2,-1) {Kernels};
    184 		\node<1-5> (kernelpoollabel) at (4,-1) {Kernel pool};
    185 
    186 		\node<1-5>[Kernel] (task1) at (2,0) {task1};
    187 		\node<1>[KernelPool] (pool0) at (4,0) {\small\texttt{task1.act()}};
    188 		\node<1>[KernelPool] (pool1) at (4,1) {};
    189 		\node<1>[KernelPool] (pool2) at (4,2) {};
    190 		\node<1>[KernelPool] (pool3) at (4,3) {};
    191 		\node<1>[KernelPool] (pool4) at (4,4) {};
    192 
    193 		\node<2-5>[Kernel] (sub1) at (1.25,1.5) {sub1};
    194 		\node<2-4>[Kernel] (sub2) at (2.75,1.5) {sub2};
    195 		\path<2-5>[TaskEdge] (task1) edge (sub1);
    196 		\path<2-4>[TaskEdge] (task1) edge (sub2);
    197 		\node<2>[KernelPool] (pool0) at (4,0) {\small\texttt{sub1.act()}};
    198 		\node<2>[KernelPool] (pool1) at (4,1) {\small\texttt{sub2.act()}};
    199 		\node<2>[KernelPool] (pool2) at (4,2) {};
    200 		\node<2>[KernelPool] (pool3) at (4,3) {};
    201 		\node<2>[KernelPool] (pool4) at (4,4) {};
    202 
    203 		\node<3-4>[Kernel] (subsub1) at (0.75,3) {subsub1};
    204 		\node<3-4>[Kernel] (subsub2) at (1.75,3) {subsub2};
    205 		\path<3-4>[TaskEdge] (sub1) edge (subsub1);
    206 		\path<3-4>[TaskEdge] (sub1) edge (subsub2);
    207 		\node<3>[KernelPool] (pool0) at (4,0) {\small\texttt{sub2.act()}};
    208 		\node<3>[KernelPool] (pool1) at (4,1) {\small\texttt{subsub1.act()}};
    209 		\node<3>[KernelPool] (pool2) at (4,2) {\small\texttt{subsub2.act()}};
    210 		\node<3>[KernelPool] (pool3) at (4,3) {};
    211 		\node<3>[KernelPool] (pool4) at (4,4) {};
    212 
    213 		\path<4>[TaskEdge] (subsub1) edge[bend left] (sub1);
    214 		\path<4>[TaskEdge] (subsub2) edge[bend right] (sub1);
    215 		\path<4>[TaskEdge] (sub2) edge[bend right] (task1);
    216 		\node<4>[KernelPool] (pool0) at (4,0) {\small\texttt{task1.react(sub2)}};
    217 		\node<4>[KernelPool] (pool1) at (4,1) {\small\texttt{sub1.react(subsub1)}};
    218 		\node<4>[KernelPool] (pool2) at (4,2) {\small\texttt{sub1.react(subsub2)}};
    219 		\node<4>[KernelPool] (pool3) at (4,3) {};
    220 		\node<4>[KernelPool] (pool4) at (4,4) {};
    221 
    222 		\path<5>[TaskEdge] (sub1) edge[bend left] (task1);
    223 		\node<5>[KernelPool] (pool0) at (4,0) {\small\texttt{task1.react(sub1)}};
    224 		\node<5>[KernelPool] (pool1) at (4,1) {};
    225 		\node<5>[KernelPool] (pool2) at (4,2) {};
    226 		\node<5>[KernelPool] (pool3) at (4,3) {};
    227 		\node<5>[KernelPool] (pool4) at (4,4) {};
    228 
    229 		\node<6> (task1) at (2,0) {Programme ends};
    230 
    231 	\end{tikzpicture}
    232 	\note<1>{
    233 		\begin{itemize}
    234 			\item Left --- kernels, right --- execution queue (pool). Animation shows how kernels create subordinates and collect results from them.
    235 			\item The main difference compared to the jobs is that a task completes only when all its subordinates complete (it can not complete earlier). So, this model is more like asynchronous execution of subroutines (coroutines), but coroutines also do not have hierarchy.
    236 		\end{itemize}
    237 	}
    238 \end{frame}
    239 
    240 \begin{frame}{Kernel processing algorithm: summary}
    241 	\begin{itemize}
    242 		\item Kernels are like procedure calls, but asynchronous.
    243 		\item The number of kernels is decoupled from the number of cluster
    244 			nodes to aid in load balancing.
    245 	\end{itemize}
    246 	\vfill
    247 	\begin{center}
    248 		The purpose of kernel hierarchy is to determine who is responsible for
    249 		re-executing kernels from failed cluster nodes.
    250 	\end{center}
    251 \end{frame}
    252 
    253 \section{Recovery from master node failure}
    254 
    255 \begin{frame}
    256 	\frametitle{Handling master node failure}
    257 	\centering
    258 	\begin{tikzpicture}[remember picture,x=3cm,y=-3cm]
    259 		\node[Node] (A1) at (0,0) {A};
    260 		\node[Node] (B1) at (1,0) {B};
    261 		\node[Node] (C1) at (0,1) {C};
    262 		\node[Node] (D1) at (1,1) {D};
    263 	\end{tikzpicture}
    264 	\begin{tikzpicture}[remember picture,overlay]
    265 		\path[thick] (A1) edge (B1);
    266 		%\path[thick] (A1) edge (C1);
    267 		%\path[thick] (A1) edge (D1);
    268 		\path[thick] (B1) edge (C1);
    269 		\path[thick] (B1) edge (D1);
    270 		%\path[thick] (C1) edge (D1);
    271 	\end{tikzpicture}
    272 	\only<2->{%
    273 		\begin{tikzpicture}[remember picture,overlay]
    274 			\node[Task,label={[TaskLabel]90:Master\vphantom{p}}] (Master) at (A1.center) {\phantom{A}};
    275 		\end{tikzpicture}%
    276 	}
    277 	\only<3->{%
    278 		\begin{tikzpicture}[remember picture,overlay]
    279 			\node[Task,label={[TaskLabel]90:Backup}] (MasterCopy) at (B1.center) {\phantom{A}};
    280             \path[TaskEdge] (Master) edge (MasterCopy);
    281         \end{tikzpicture}%
    282     }
    283     \only<4->{%
    284         \begin{tikzpicture}[remember picture,overlay]
    285             \node[Task,label={[TaskLabel]0:Task1}] (Task1) at (B1.center) {\phantom{A}};
    286         \end{tikzpicture}%
    287     }
    288     \only<5->{%
    289         \begin{tikzpicture}[remember picture,overlay]
    290             \node[Task,label={[TaskLabel]180:Sub1}] (Sub1) at (A1.center) {\phantom{A}};
    291             \node[Task,label={[TaskLabel]180:Sub2}] (Sub2) at (C1.center) {\phantom{A}};
    292             \node[Task,label={[TaskLabel]0:Sub3}] (Sub3) at (D1.center) {\phantom{A}};
    293             \path[TaskEdge] (Task1) edge[bend left] (Sub1);
    294             \path[TaskEdge] (Task1) edge (Sub2);
    295             \path[TaskEdge] (Task1) edge (Sub3);
    296         \end{tikzpicture}%
    297     }
    298     \note<5>{
    299         \begin{itemize}
    300             \item Here A, B, C, D --- cluster nodes and Master, Backup, Task1, Sub1, Sub2, Sub3 --- tasks.
    301             \item Backup is a copy of Master which is sent along with Task1 to the subordinate node.
    302             \item Task1 represent one sequential step of a programme. There can be any number of seq. steps in a programme, and when Backup node fails, the current step is restarted from the beginning.
    303             \item When node B, C or D fails, corresponding master task restarts failed subordinates (Task1 restart Sub1, Master restarts Task1 etc.). When node A fails, Master task is recovered from Backup (its copy).
    304         \end{itemize}
    305     }
    306 \end{frame}
    307 
    308 \begin{frame}{Case study: ARMA ocean wavy surface generator}
    309 	\begin{columns}[T]
    310 		\begin{column}{0.25\textwidth}
    311 			MA model:
    312 			\begin{equation*}
    313 				\zeta_{\vec{x}} =
    314 				\sum\limits_{\vec{y}=\vec{0}}^{N}
    315 				\Theta_{\vec{y}}\,\epsilon_{\vec{x} - \vec{y}}
    316 			\end{equation*}
    317 			\vskip1cm
    318 			MA coefficients:
    319 			\begin{equation*}
    320 				K_{i,j,k} = 
    321 				\left[
    322 					\displaystyle
    323 					\sum\limits_{l=i}^{q_1}
    324 					\sum\limits_{m=j}^{q_2}
    325 					\sum\limits_{n=k}^{q_3}
    326 					\Theta_{l,m,n}\Theta_{l-i,m-j,n-k}
    327 				\right]
    328 				\sigma_\epsilon^2
    329 			\end{equation*}
    330 		\end{column}
    331 		\begin{column}{0.65\textwidth}
    332 			\vskip1cm
    333 			\includegraphics[width=\linewidth]{figures/wavy.eps}
    334 		\end{column}
    335 	\end{columns}
    336 \end{frame}
    337 
    338 \begin{frame}{Performance with master node failure (ARMA)}
    339 	\centering
    340 	\begin{columns}
    341 		\begin{column}{0.49\textwidth}
    342 			\includegraphics[width=\linewidth]{figures/performance-arma.eps}
    343 		\end{column}
    344 		\begin{column}{0.49\textwidth}
    345 			\includegraphics[width=\linewidth]{figures/overhead-arma.eps}
    346 		\end{column}
    347 	\end{columns}
    348 \end{frame}
    349 
    350 \begin{frame}
    351 	\frametitle{Case study: NDBC dataset preprocessing}
    352 	\begin{center}
    353 		\begin{tabular}{ll}
    354 			\toprule
    355 			Dataset size & 144MB \\
    356 			Dataset size (uncompressed) & 770MB \\
    357 			No.~of wave stations & 24 \\
    358 			Time span & 3 years (2010--2012) \\
    359 			Total no.~of spectra & 445422 \\
    360 			\bottomrule
    361 		\end{tabular}
    362 	\end{center}
    363 	\vfill
    364 	Spectrum reconstruction formula:
    365 	\begin{equation*}
    366 		S(\omega, \theta) = \frac{1}{\pi}\!
    367 		\left[
    368 			\frac{1}{2} + 
    369 			r_1 \cos \left( \theta - \alpha_1 \right) +
    370 			r_2 \sin \left( 2 \left( \theta - \alpha_2 \right) \right)
    371 		\right]\!
    372 		S_0(\omega).
    373 	\end{equation*}
    374 	\note{
    375 		Master node fail over technique is evaluated on the example of wave energy spectra processing application. This programme uses NDBC (\href{http://www.ndbc.noaa.gov/}{National Data Buoy Center}) dataset to reconstruct frequency-directional spectra from wave rider buoy measurements and compute variance. Each spectrum is reconstructed from five variables using the following formula.
    376 		\begin{equation*}
    377 			S(\omega, \theta) = \frac{1}{\pi}
    378 			\left[
    379 				\frac{1}{2} + 
    380 				r_1 \cos \left( \theta - \alpha_1 \right) +
    381 				r_2 \sin \left( 2 \left( \theta - \alpha_2 \right) \right)
    382 			\right]
    383 			S_0(\omega).
    384 		\end{equation*}
    385 		Here $\omega$ denotes frequency, $\theta$ is wave direction, $r_{1,2}$ and $\alpha_{1,2}$ are parameters of spectrum decomposition and $S_0$ is non-directional spectrum; $r_{1,2}$, $\alpha_{1,2}$ and $S_0$ are acquired through measurements. Properties of the dataset which is used in evaluation are listed in Table.
    386 	}
    387 \end{frame}
    388 
    389 \begin{frame}{Performance with master node failure (NDBC)}
    390 	\centering
    391 	\begin{columns}
    392 		\begin{column}{0.49\textwidth}
    393 			\includegraphics[width=\linewidth]{figures/performance-ndbc.eps}
    394 		\end{column}
    395 		\begin{column}{0.49\textwidth}
    396 			\includegraphics[width=\linewidth]{figures/overhead-ndbc.eps}
    397 		\end{column}
    398 	\end{columns}
    399 \end{frame}
    400 
    401 \begin{frame}{Recovery from master node failure: summary}
    402 	\centering
    403 	Works only when master kernel has at most one subordinate at a time.
    404 	\vfill
    405 	How to adapt it for multiple node failures?
    406 \end{frame}
    407 
    408 \section{Recovery from multiple node failures}
    409 
    410 \begin{frame}[t]
    411 	\frametitle{Transform kernel hierarchy}
    412 	\vskip1cm
    413 	\begin{tikzpicture}[x=2cm,y=-1.5cm]
    414 		\node[Kernel] (task1) at (2,0) {task1};
    415 		\node[Kernel] (sub1) at (1,1) {sub1};
    416 		\node[Kernel] (sub2) at (2,1) {sub2};
    417 		\node[Kernel] (sub3) at (3,1) {sub3};
    418 		\path[TaskEdge] (task1) edge (sub1);
    419 		\path[TaskEdge] (task1) edge (sub2);
    420 		\path[TaskEdge] (task1) edge (sub3);
    421 		\node<2->[
    422 			single arrow,
    423 			fill=spbuGray,
    424 			minimum height=1.5cm,
    425 			minimum width=0.5cm
    426 		] at (4,0.5) {};
    427 		\node<2->[Kernel] (xtask1) at (6,0) {task1};
    428 		\node<2->[Kernel] (xsub1) at (5,1) {sub1};
    429 		\node<2->[Kernel] (xsub2) at (6,1.5) {sub2};
    430 		\node<2->[Kernel] (xsub3) at (7,2) {sub3};
    431 		\path<2->[TaskEdge] (xtask1) edge (xsub1);
    432 		\path<2->[TaskEdge] (xsub1) edge[bend right] (xsub2);
    433 		\path<2->[TaskEdge] (xsub2) edge[bend right] (xsub3);
    434 		\path<2->[PseudoKernelEdge] (xtask1) edge (xsub2);
    435 		\path<2->[PseudoKernelEdge] (xtask1) edge (xsub3);
    436 	\end{tikzpicture}
    437 	\begin{center}
    438 		\only<3>{\alert{Very inefficient}}
    439 		\only<4->{%
    440 			Solution: infer relationship between subordinate kernels by
    441 			recording IP-addresses of cluster nodes they were sent to.
    442 		}
    443 	\end{center}
    444 \end{frame}
    445 
    446 \begin{frame}{Overhead of recovery from multiple node failures}
    447 	\begin{columns}[T]
    448 		\begin{column}{0.45\textwidth}
    449 			\vskip-1cm
    450 			\includegraphics[width=\linewidth]{test-1-phys}\newline\vspace{-1cm}
    451 			\includegraphics[width=\linewidth]{test-2-phys}
    452 		\end{column}
    453 		\begin{column}{0.45\textwidth}
    454 			\vskip-1cm
    455 			\includegraphics[width=\linewidth]{test-1-virt}\newline\vspace{-1cm}
    456 			\includegraphics[width=\linewidth]{test-2-virt}
    457 		\end{column}
    458 	\end{columns}
    459 \end{frame}
    460 
    461 \begin{frame}{Conclusions}
    462 	\begin{itemize}
    463 		\item Having hierarchy (strict total order) between tasks (kernels) is
    464 			sufficient to devise algorithms for recovery from cluster node
    465 			failures.
    466 		\item Each cluster node having an IP-address (or any other unique
    467 			identifier) is the only assumption of these algorithms.
    468 		\item The whole system acts as the router for kernels which
    469 			reconfigures itself upon node failures.
    470 	\end{itemize}
    471 \end{frame}
    472 
    473 \begin{frame}
    474 	\centering%
    475 	\vspace{1.0cm}%
    476 	\Large\textbf{Thank you for attention!}
    477 \end{frame}
    478 
    479 \end{document}
	hpcs-17-subord Subordination: Providing Resilience to Simultaneous Failure of Multiple Cluster Nodes
	git clone https://git.igankevich.com/hpcs-17-subord.git
	Log \| Files \| Refs