slides.tex (16024B)
1 % Created 2017-07-14 Пт 11:23 2 % Intended LaTeX compiler: pdflatex 3 \documentclass[12pt,aspectratio=169]{beamer} 4 \usepackage{graphicx} 5 \usepackage{booktabs} 6 \usepackage{amsmath} 7 \usepackage{amssymb} 8 \usepackage{hyperref} 9 \usepackage{tikz} 10 \usetikzlibrary{shapes} 11 \usepackage{cite} 12 \usepackage{url} 13 \usepackage{polyglossia} 14 \input{preamble} 15 \AtBeginSection[]{\frame{\sectionpage}} 16 \usetheme{SaintPetersburg} 17 18 \author{I.\,Gankevich \quad Yu.\,Tipikin \quad V.\,Korkhov} 19 \date{July, 2017} 20 \title{Subordination: Providing resilience to simultaneous failure of multiple cluster nodes} 21 \institute{Saint Petersburg State University} 22 \setdefaultlanguage{english} 23 24 25 \definecolor{spbuGreen}{RGB}{40,160,40} 26 \definecolor{spbuBlue}{RGB}{40,40,160} 27 28 % physical layer 29 \tikzset{Node/.style={rectangle,draw=spbuDarkGray,line width=3pt}} 30 \tikzset{Switch/.style={circle,draw=gray,fill=gray,line width=7pt,solid}} 31 \tikzset{PhysicalLink/.style={draw=gray,line width=7pt,solid}} 32 \tikzset{NetworkLink/.style={draw=black,line width=3pt,dashed}} 33 34 % logical layer 35 \tikzset{Daemon/.style={ellipse,draw=spbuBlue,line width=3pt,solid,scale=0.65}} 36 \tikzset{DaemonLink/.style={draw=spbuBlue,line width=3pt,solid}} 37 38 % application layer 39 \tikzset{Process/.style={ellipse,draw=spbuGreen,line width=3pt,solid}} 40 \tikzset{ProcessEdge/.style={draw=spbuGreen,line width=3pt,solid,->}} 41 42 43 \tikzset{Node/.style={rectangle,draw=spbuDarkGray,thick}} 44 \tikzset{Task/.style={ 45 ellipse, 46 draw=spbuGreen, 47 line width=3pt, 48 dashed, 49 anchor=center, 50 text=spbuGreen 51 }} 52 \tikzset{Kernel/.style={Task,solid,line width=2pt}} 53 \tikzset{TaskEdge/.style={ 54 ProcessEdge, 55 draw=spbuGreen, 56 solid, 57 ->, 58 line width=2pt 59 }} 60 \tikzset{PseudoKernelEdge/.style={draw=spbuGreen,line width=3pt,dashed}} 61 \tikzset{Label/.style={label distance=0.1cm,text=spbuTerracotta}} 62 \tikzset{TaskLabel/.style={label distance=0.1cm,text=spbuGreen}} 63 \tikzset{KernelPool/.style={ 64 rectangle, 65 draw=spbuDarkGray, 66 minimum width=4.7cm, 67 minimum height=0.9cm, 68 line width=2pt, 69 thick 70 }} 71 72 \begin{document} 73 74 \maketitle 75 76 \begin{frame}{Motivation} 77 Looking for ways to develop fault-tolerant HPC applications without 78 checkpoints. 79 \vfill\pause 80 The outcome: 81 82 a scheduler with C++ API which replaces both batch job scheduler and 83 message passing interface. 84 \end{frame} 85 86 \begin{frame}{System architecture} 87 \begin{columns}[T] 88 \begin{column}{0.2\textwidth} 89 \input{tex/cluster-0} 90 \end{column} 91 \begin{column}{0.2\textwidth} 92 \input{tex/frame-0} 93 \end{column} 94 \begin{column}{0.2\textwidth} 95 \input{tex/frame-3} 96 \end{column} 97 \begin{column}{0.2\textwidth} 98 \input{tex/frame-4} 99 \end{column} 100 \end{columns} 101 \vfill 102 \begin{center} 103 \input{tex/legend} 104 \end{center} 105 \end{frame} 106 107 \begin{frame}{Application architecture} 108 \centering 109 \includegraphics{build/ppl.pdf} 110 \end{frame} 111 112 \section{Node discovery algorithm} 113 114 \begin{frame}{Hierarchy of cluster nodes} 115 \begin{equation*} 116 \forall n_1 \forall n_2 \in \mathcal{N}, 117 \forall f \colon \mathcal{N} \rightarrow \mathcal{R}^n 118 \Rightarrow (f(n_1) < f(n_2) \Leftrightarrow \neg (f(n_1) \geq f(n_2))) 119 \end{equation*} 120 Here \(n\) is node rank~--- node's position in the network IP-address 121 range. 122 \vfill 123 \begin{columns}[T] 124 \begin{column}{0.35\textwidth} 125 \begin{itemize} 126 \item Strict total order. 127 \item Static node positions in the hierarchy. 128 \end{itemize} 129 \end{column} 130 \begin{column}{0.55\textwidth} 131 \begin{tikzpicture}[x=2.5cm,y=-1cm] 132 \node[Node] (M) at (2,0) {\texttt{10.0.0.1}}; 133 \node[Node] (S1) at (1.25,1) {\texttt{10.0.0.2}}; 134 \node[Node] (S2) at (2.75,1) {\texttt{10.0.0.3}}; 135 \node[Node] (S3) at (0.75,2) {\texttt{10.0.0.4}}; 136 \node[Node] (S4) at (1.75,2) {\texttt{10.0.0.5}}; 137 \node[Node] (S5) at (2.75,2) {\texttt{10.0.0.6}}; 138 \path[thick] (M) edge (S1); 139 \path[thick] (M) edge (S2); 140 \path[thick] (S1) edge (S3); 141 \path[thick] (S1) edge (S4); 142 \path[thick] (S2) edge (S5); 143 \end{tikzpicture} 144 \begin{center} 145 \texttt{fanout=2, nodes=6} 146 \end{center} 147 \end{column} 148 \end{columns} 149 \end{frame} 150 151 \begin{frame}{Example} 152 \framesubtitle{\texttt{fanout=2, nodes=100, network=10.0.0.0/8}} 153 \centering 154 \includegraphics<1>[width=0.9\linewidth]{build/graph.eps} 155 \includegraphics<2>[width=0.9\linewidth,trim={20cm 10cm 10cm 5cm},clip]{build/graph.eps} 156 \end{frame} 157 158 \begin{frame}{Performance of node discovery algorithm} 159 \framesubtitle{4 physical nodes, 400 virtual nodes, fanout=2} 160 \centering 161 \includegraphics[width=0.8\linewidth]{figures/node-discovery.eps} 162 \end{frame} 163 164 \begin{frame}{Node discovery algorithm: summary} 165 \begin{itemize} 166 \item Arranges cluster nodes into the hierarchy with strict total 167 order. 168 \item Scalable to a large number of cluster nodes. 169 \item Simple and reliable. 170 \end{itemize} 171 \vfill 172 \begin{center} 173 The purpose of node hierarchy is to distribute the load on the cluster. 174 \end{center} 175 \end{frame} 176 177 \section{Kernel processing algorithm} 178 179 \begin{frame}{Example} 180 \begin{tikzpicture}[x=3cm,y=-0.9cm] 181 \node (dummy) at (0,0) {}; 182 183 \node<1-5> (kernellabel) at (2,-1) {Kernels}; 184 \node<1-5> (kernelpoollabel) at (4,-1) {Kernel pool}; 185 186 \node<1-5>[Kernel] (task1) at (2,0) {task1}; 187 \node<1>[KernelPool] (pool0) at (4,0) {\small\texttt{task1.act()}}; 188 \node<1>[KernelPool] (pool1) at (4,1) {}; 189 \node<1>[KernelPool] (pool2) at (4,2) {}; 190 \node<1>[KernelPool] (pool3) at (4,3) {}; 191 \node<1>[KernelPool] (pool4) at (4,4) {}; 192 193 \node<2-5>[Kernel] (sub1) at (1.25,1.5) {sub1}; 194 \node<2-4>[Kernel] (sub2) at (2.75,1.5) {sub2}; 195 \path<2-5>[TaskEdge] (task1) edge (sub1); 196 \path<2-4>[TaskEdge] (task1) edge (sub2); 197 \node<2>[KernelPool] (pool0) at (4,0) {\small\texttt{sub1.act()}}; 198 \node<2>[KernelPool] (pool1) at (4,1) {\small\texttt{sub2.act()}}; 199 \node<2>[KernelPool] (pool2) at (4,2) {}; 200 \node<2>[KernelPool] (pool3) at (4,3) {}; 201 \node<2>[KernelPool] (pool4) at (4,4) {}; 202 203 \node<3-4>[Kernel] (subsub1) at (0.75,3) {subsub1}; 204 \node<3-4>[Kernel] (subsub2) at (1.75,3) {subsub2}; 205 \path<3-4>[TaskEdge] (sub1) edge (subsub1); 206 \path<3-4>[TaskEdge] (sub1) edge (subsub2); 207 \node<3>[KernelPool] (pool0) at (4,0) {\small\texttt{sub2.act()}}; 208 \node<3>[KernelPool] (pool1) at (4,1) {\small\texttt{subsub1.act()}}; 209 \node<3>[KernelPool] (pool2) at (4,2) {\small\texttt{subsub2.act()}}; 210 \node<3>[KernelPool] (pool3) at (4,3) {}; 211 \node<3>[KernelPool] (pool4) at (4,4) {}; 212 213 \path<4>[TaskEdge] (subsub1) edge[bend left] (sub1); 214 \path<4>[TaskEdge] (subsub2) edge[bend right] (sub1); 215 \path<4>[TaskEdge] (sub2) edge[bend right] (task1); 216 \node<4>[KernelPool] (pool0) at (4,0) {\small\texttt{task1.react(sub2)}}; 217 \node<4>[KernelPool] (pool1) at (4,1) {\small\texttt{sub1.react(subsub1)}}; 218 \node<4>[KernelPool] (pool2) at (4,2) {\small\texttt{sub1.react(subsub2)}}; 219 \node<4>[KernelPool] (pool3) at (4,3) {}; 220 \node<4>[KernelPool] (pool4) at (4,4) {}; 221 222 \path<5>[TaskEdge] (sub1) edge[bend left] (task1); 223 \node<5>[KernelPool] (pool0) at (4,0) {\small\texttt{task1.react(sub1)}}; 224 \node<5>[KernelPool] (pool1) at (4,1) {}; 225 \node<5>[KernelPool] (pool2) at (4,2) {}; 226 \node<5>[KernelPool] (pool3) at (4,3) {}; 227 \node<5>[KernelPool] (pool4) at (4,4) {}; 228 229 \node<6> (task1) at (2,0) {Programme ends}; 230 231 \end{tikzpicture} 232 \note<1>{ 233 \begin{itemize} 234 \item Left --- kernels, right --- execution queue (pool). Animation shows how kernels create subordinates and collect results from them. 235 \item The main difference compared to the jobs is that a task completes only when all its subordinates complete (it can not complete earlier). So, this model is more like asynchronous execution of subroutines (coroutines), but coroutines also do not have hierarchy. 236 \end{itemize} 237 } 238 \end{frame} 239 240 \begin{frame}{Kernel processing algorithm: summary} 241 \begin{itemize} 242 \item Kernels are like procedure calls, but asynchronous. 243 \item The number of kernels is decoupled from the number of cluster 244 nodes to aid in load balancing. 245 \end{itemize} 246 \vfill 247 \begin{center} 248 The purpose of kernel hierarchy is to determine who is responsible for 249 re-executing kernels from failed cluster nodes. 250 \end{center} 251 \end{frame} 252 253 \section{Recovery from master node failure} 254 255 \begin{frame} 256 \frametitle{Handling master node failure} 257 \centering 258 \begin{tikzpicture}[remember picture,x=3cm,y=-3cm] 259 \node[Node] (A1) at (0,0) {A}; 260 \node[Node] (B1) at (1,0) {B}; 261 \node[Node] (C1) at (0,1) {C}; 262 \node[Node] (D1) at (1,1) {D}; 263 \end{tikzpicture} 264 \begin{tikzpicture}[remember picture,overlay] 265 \path[thick] (A1) edge (B1); 266 %\path[thick] (A1) edge (C1); 267 %\path[thick] (A1) edge (D1); 268 \path[thick] (B1) edge (C1); 269 \path[thick] (B1) edge (D1); 270 %\path[thick] (C1) edge (D1); 271 \end{tikzpicture} 272 \only<2->{% 273 \begin{tikzpicture}[remember picture,overlay] 274 \node[Task,label={[TaskLabel]90:Master\vphantom{p}}] (Master) at (A1.center) {\phantom{A}}; 275 \end{tikzpicture}% 276 } 277 \only<3->{% 278 \begin{tikzpicture}[remember picture,overlay] 279 \node[Task,label={[TaskLabel]90:Backup}] (MasterCopy) at (B1.center) {\phantom{A}}; 280 \path[TaskEdge] (Master) edge (MasterCopy); 281 \end{tikzpicture}% 282 } 283 \only<4->{% 284 \begin{tikzpicture}[remember picture,overlay] 285 \node[Task,label={[TaskLabel]0:Task1}] (Task1) at (B1.center) {\phantom{A}}; 286 \end{tikzpicture}% 287 } 288 \only<5->{% 289 \begin{tikzpicture}[remember picture,overlay] 290 \node[Task,label={[TaskLabel]180:Sub1}] (Sub1) at (A1.center) {\phantom{A}}; 291 \node[Task,label={[TaskLabel]180:Sub2}] (Sub2) at (C1.center) {\phantom{A}}; 292 \node[Task,label={[TaskLabel]0:Sub3}] (Sub3) at (D1.center) {\phantom{A}}; 293 \path[TaskEdge] (Task1) edge[bend left] (Sub1); 294 \path[TaskEdge] (Task1) edge (Sub2); 295 \path[TaskEdge] (Task1) edge (Sub3); 296 \end{tikzpicture}% 297 } 298 \note<5>{ 299 \begin{itemize} 300 \item Here A, B, C, D --- cluster nodes and Master, Backup, Task1, Sub1, Sub2, Sub3 --- tasks. 301 \item Backup is a copy of Master which is sent along with Task1 to the subordinate node. 302 \item Task1 represent one sequential step of a programme. There can be any number of seq. steps in a programme, and when Backup node fails, the current step is restarted from the beginning. 303 \item When node B, C or D fails, corresponding master task restarts failed subordinates (Task1 restart Sub1, Master restarts Task1 etc.). When node A fails, Master task is recovered from Backup (its copy). 304 \end{itemize} 305 } 306 \end{frame} 307 308 \begin{frame}{Case study: ARMA ocean wavy surface generator} 309 \begin{columns}[T] 310 \begin{column}{0.25\textwidth} 311 MA model: 312 \begin{equation*} 313 \zeta_{\vec{x}} = 314 \sum\limits_{\vec{y}=\vec{0}}^{N} 315 \Theta_{\vec{y}}\,\epsilon_{\vec{x} - \vec{y}} 316 \end{equation*} 317 \vskip1cm 318 MA coefficients: 319 \begin{equation*} 320 K_{i,j,k} = 321 \left[ 322 \displaystyle 323 \sum\limits_{l=i}^{q_1} 324 \sum\limits_{m=j}^{q_2} 325 \sum\limits_{n=k}^{q_3} 326 \Theta_{l,m,n}\Theta_{l-i,m-j,n-k} 327 \right] 328 \sigma_\epsilon^2 329 \end{equation*} 330 \end{column} 331 \begin{column}{0.65\textwidth} 332 \vskip1cm 333 \includegraphics[width=\linewidth]{figures/wavy.eps} 334 \end{column} 335 \end{columns} 336 \end{frame} 337 338 \begin{frame}{Performance with master node failure (ARMA)} 339 \centering 340 \begin{columns} 341 \begin{column}{0.49\textwidth} 342 \includegraphics[width=\linewidth]{figures/performance-arma.eps} 343 \end{column} 344 \begin{column}{0.49\textwidth} 345 \includegraphics[width=\linewidth]{figures/overhead-arma.eps} 346 \end{column} 347 \end{columns} 348 \end{frame} 349 350 \begin{frame} 351 \frametitle{Case study: NDBC dataset preprocessing} 352 \begin{center} 353 \begin{tabular}{ll} 354 \toprule 355 Dataset size & 144MB \\ 356 Dataset size (uncompressed) & 770MB \\ 357 No.~of wave stations & 24 \\ 358 Time span & 3 years (2010--2012) \\ 359 Total no.~of spectra & 445422 \\ 360 \bottomrule 361 \end{tabular} 362 \end{center} 363 \vfill 364 Spectrum reconstruction formula: 365 \begin{equation*} 366 S(\omega, \theta) = \frac{1}{\pi}\! 367 \left[ 368 \frac{1}{2} + 369 r_1 \cos \left( \theta - \alpha_1 \right) + 370 r_2 \sin \left( 2 \left( \theta - \alpha_2 \right) \right) 371 \right]\! 372 S_0(\omega). 373 \end{equation*} 374 \note{ 375 Master node fail over technique is evaluated on the example of wave energy spectra processing application. This programme uses NDBC (\href{http://www.ndbc.noaa.gov/}{National Data Buoy Center}) dataset to reconstruct frequency-directional spectra from wave rider buoy measurements and compute variance. Each spectrum is reconstructed from five variables using the following formula. 376 \begin{equation*} 377 S(\omega, \theta) = \frac{1}{\pi} 378 \left[ 379 \frac{1}{2} + 380 r_1 \cos \left( \theta - \alpha_1 \right) + 381 r_2 \sin \left( 2 \left( \theta - \alpha_2 \right) \right) 382 \right] 383 S_0(\omega). 384 \end{equation*} 385 Here $\omega$ denotes frequency, $\theta$ is wave direction, $r_{1,2}$ and $\alpha_{1,2}$ are parameters of spectrum decomposition and $S_0$ is non-directional spectrum; $r_{1,2}$, $\alpha_{1,2}$ and $S_0$ are acquired through measurements. Properties of the dataset which is used in evaluation are listed in Table. 386 } 387 \end{frame} 388 389 \begin{frame}{Performance with master node failure (NDBC)} 390 \centering 391 \begin{columns} 392 \begin{column}{0.49\textwidth} 393 \includegraphics[width=\linewidth]{figures/performance-ndbc.eps} 394 \end{column} 395 \begin{column}{0.49\textwidth} 396 \includegraphics[width=\linewidth]{figures/overhead-ndbc.eps} 397 \end{column} 398 \end{columns} 399 \end{frame} 400 401 \begin{frame}{Recovery from master node failure: summary} 402 \centering 403 Works only when master kernel has at most one subordinate at a time. 404 \vfill 405 How to adapt it for multiple node failures? 406 \end{frame} 407 408 \section{Recovery from multiple node failures} 409 410 \begin{frame}[t] 411 \frametitle{Transform kernel hierarchy} 412 \vskip1cm 413 \begin{tikzpicture}[x=2cm,y=-1.5cm] 414 \node[Kernel] (task1) at (2,0) {task1}; 415 \node[Kernel] (sub1) at (1,1) {sub1}; 416 \node[Kernel] (sub2) at (2,1) {sub2}; 417 \node[Kernel] (sub3) at (3,1) {sub3}; 418 \path[TaskEdge] (task1) edge (sub1); 419 \path[TaskEdge] (task1) edge (sub2); 420 \path[TaskEdge] (task1) edge (sub3); 421 \node<2->[ 422 single arrow, 423 fill=spbuGray, 424 minimum height=1.5cm, 425 minimum width=0.5cm 426 ] at (4,0.5) {}; 427 \node<2->[Kernel] (xtask1) at (6,0) {task1}; 428 \node<2->[Kernel] (xsub1) at (5,1) {sub1}; 429 \node<2->[Kernel] (xsub2) at (6,1.5) {sub2}; 430 \node<2->[Kernel] (xsub3) at (7,2) {sub3}; 431 \path<2->[TaskEdge] (xtask1) edge (xsub1); 432 \path<2->[TaskEdge] (xsub1) edge[bend right] (xsub2); 433 \path<2->[TaskEdge] (xsub2) edge[bend right] (xsub3); 434 \path<2->[PseudoKernelEdge] (xtask1) edge (xsub2); 435 \path<2->[PseudoKernelEdge] (xtask1) edge (xsub3); 436 \end{tikzpicture} 437 \begin{center} 438 \only<3>{\alert{Very inefficient}} 439 \only<4->{% 440 Solution: infer relationship between subordinate kernels by 441 recording IP-addresses of cluster nodes they were sent to. 442 } 443 \end{center} 444 \end{frame} 445 446 \begin{frame}{Overhead of recovery from multiple node failures} 447 \begin{columns}[T] 448 \begin{column}{0.45\textwidth} 449 \vskip-1cm 450 \includegraphics[width=\linewidth]{test-1-phys}\newline\vspace{-1cm} 451 \includegraphics[width=\linewidth]{test-2-phys} 452 \end{column} 453 \begin{column}{0.45\textwidth} 454 \vskip-1cm 455 \includegraphics[width=\linewidth]{test-1-virt}\newline\vspace{-1cm} 456 \includegraphics[width=\linewidth]{test-2-virt} 457 \end{column} 458 \end{columns} 459 \end{frame} 460 461 \begin{frame}{Conclusions} 462 \begin{itemize} 463 \item Having hierarchy (strict total order) between tasks (kernels) is 464 sufficient to devise algorithms for recovery from cluster node 465 failures. 466 \item Each cluster node having an IP-address (or any other unique 467 identifier) is the only assumption of these algorithms. 468 \item The whole system acts as the router for kernels which 469 reconfigures itself upon node failures. 470 \end{itemize} 471 \end{frame} 472 473 \begin{frame} 474 \centering% 475 \vspace{1.0cm}% 476 \Large\textbf{Thank you for attention!} 477 \end{frame} 478 479 \end{document}