commit 84c6d4c3e7baf081a48097f2d4f91866d9b5fc70
parent 48ca5657878d0a48fa8835e9077aa079d128ffa3
Author: Ivan Gankevich <igankevich@ya.ru>
Date: Fri, 12 May 2017 18:52:25 +0300
Add a diagram of fail over algorithm.
Diffstat:
14 files changed, 169 insertions(+), 17 deletions(-)
diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-build/main.pdf: *.tex *.bib bib/* build src/* \
+build/main.pdf: *.tex *.bib bib/* build src/* tex/* dot/* \
build/sc12.pdf \
build/sc1.pdf \
build/sc2.pdf \
@@ -14,7 +14,6 @@ build/test-2-virt.pdf
-pdf \
-bibtex \
-shell-escape \
- -quiet \
-f main.tex
build/test-1-phys.pdf: R/test-1.R R/common.R
diff --git a/dot/ppl.dot b/dot/ppl.dot
@@ -1,8 +1,8 @@
graph Pipeline {
- node [fontname="Times",fontsize=8,margin="0.01,0.01",shape=box,height="0.1",width="0.1"]
+ node [fontname="Times",fontsize=7,margin="0.01,0.01",shape=box,height="0.1",width="0.1",penwidth="0.5"]
graph [fontname="Times",fontsize=8,nodesep="0.07",ranksep="0.05",rankdir="LR",margin="-0.1,-0.1"]
- edge [arrowsize=0.66]
+ edge [arrowsize=0.66,penwidth="0.5"]
subgraph cluster_daemon {
label="Daemon process"
diff --git a/main.tex b/main.tex
@@ -10,6 +10,7 @@
\input{preamble}
+\input{tex/preamble}
\begin{document}
@@ -26,7 +27,7 @@ cluster nodes}
}%
}%
-%\IEEEspecialpapernotice{(Poster Paper)}
+\IEEEspecialpapernotice{(WORK-IN-PROGRESS)}
diff --git a/preamble.tex b/preamble.tex
@@ -1,2 +1,10 @@
\newcommand{\Method}[1]{\texttt{#1}}
+\newcommand*{\spbuInsertFigure}[1]{%
+\vspace{1\baselineskip}%
+\begin{minipage}{0.5\linewidth}%
+ \normalsize%
+ \input{#1}%
+\end{minipage}%
+}%
+
diff --git a/src/body.tex b/src/body.tex
@@ -66,7 +66,7 @@ their parents, and a thread pool that processes kernels in accordance with
rules outlined in the previous paragraph. A~separate pipeline exists for each
compute device: There are pipelines for parallel processing, schedule-based
processing (periodic and delayed tasks), and a proxy pipeline for processing of
-kernels on other cluster nodes.
+kernels on other cluster nodes (see~fig.~\ref{fig:pipeline}).
In principle, kernels and pipelines machinery reflect the one of procedures and
call stacks, with the advantage that kernel methods are called asynchronously
@@ -131,13 +131,12 @@ nodes, they are used interchangeably in the paper.
The main purpose of the system is to provide continuous execution of kernels in
the presence of daemon (and consequently node) failures. There are three types
of such failures.
-
-\begin{enumerate}
+\begin{itemize}
\item Simultaneous failure of at most one node.
\item Simultaneous failure of more than one node but less than total number
of nodes.
\item Simultaneous failure of all nodes (electricity outage).
-\end{enumerate}
+\end{itemize}
For the sake of simplicity, it is assumed that parallel programme runs on all
cluster nodes.
@@ -147,9 +146,10 @@ principal divides the task into parts and creates a subordinate to compute each
of them. The principal copies itself to each subordinate in the order of their
creation, and includes in each subordinate a list of all node IP addresses to
which previously created subordinates were sent (a list of \emph{neighbours}).
-When a daemon fails or goes offline, all kernels which reside on the
-corresponding cluster node are considered failed, and recovery process is
-triggered in accordance with the following scenarios.
+When a connection from master node to slave node closes either as a results of
+node failure, or as a consequence of the daemon hierarchy change, all kernels
+which reside on the corresponding cluster node are considered failed, and
+recovery process is triggered in accordance with the following scenarios.
\paragraph*{Scenario~1} With respect to kernel hierarchy, there are two
possible variants of this failure: when a principal fails and when a
@@ -158,7 +158,7 @@ cluster node.
Since a subordinate is a simple worker, rather than a valuable part of
execution, a copy of it (in initial state) is simply restored from the node
-where its parent is located (Figure~\ref{fig:subordinate-fails}). Each daemon
+where its parent is located (fig.~\ref{fig:subordinate-fails}). Each daemon
maintains a list of kernels, that were sent to a particular subordinate node.
When the corresponding network connection closes all kernels from the list are
automatically re-sent to available node, or executed locally of there no
@@ -168,7 +168,7 @@ When a principal fails every subordinate has its copy, but we need to restore
it only once and only on one node to correctly continue programme execution. To
ensure that the principal is restored only once, each subordinate tries to find
the first surviving node from the IP address list of neighbours
-(Figure~\ref{fig:principal-fails}
+(fig.~\ref{fig:principal-fails}
and~\ref{fig:subordinate-and-principal-fail}). If such node is online, the
search stops and the subordinate kernel is deleted. If the node is not found,
the subordinate restores the principal from the copy and deletes itself. Kernel
@@ -261,6 +261,15 @@ principal kernels in the same hierarchy branch have failed. If a node fails in
the middle of recovery process, the whole process is restarted from the
beginning.
+\begin{figure}
+ \noindent%
+ \spbuInsertFigure{tex/cluster-0}~\spbuInsertFigure{tex/frame-0}\newline
+ \spbuInsertFigure{tex/frame-3}~\spbuInsertFigure{tex/frame-4}\newline
+ \spbuInsertFigure{tex/legend}%
+ \caption{An example of fail over algorithm in
+ action.\label{fig:failover-example}}
+\end{figure}
+
\section{Evaluation}
Proposed node failure handling approach was evaluated on the example of
diff --git a/src/head.tex b/src/head.tex
@@ -12,8 +12,22 @@ parallel and sequential parts. Using different fault tolerant scenarios based
on hierarchy interactions, this framework provides continuous execution of a
parallel programme in case of hardware errors or electricity outages.
-In this paper we present an algorithm that guarantees continuous execution of a
-parallel programme upon failure of all nodes except one. This algorithm is
-based on the one developed in previous
+The aim of the research reported here is to investigate how continuous
+execution of parallel programmes can be provided on the level of software
+framework. This framework replaces both MPI library and batch job scheduler by
+intoriducing the notion of a kernel~--- a unit of work which can be copied
+between cluster nodes and re-executed any number of times~--- if it is required
+to provide resilience to node failures. In this paper we present an algorithm
+that guarantees continuous execution of a parallel programme upon failure of
+all nodes except one. This algorithm is based on the one developed in previous
papers~\cite{gankevich2015subordination,gankevich2016factory}, where only one
node failure at a time is guaranteed to not interrupt programme execution.
+
+In this paper failure detection methods are not studied, and node failure is
+assumed if the corresponding network connection prematurely closes. Node
+failure handling, provided by our algorithm, is transparent for a programmer:
+there is no need explicitly specify which kernels should be copied to other
+cluster nodes. However, its implementation cannot be used to provide fault
+tolerance to existing parallel programmes based on MPI or other libraries: the
+purpose of software framework developed here is to seamlessly provide fault
+tolerance for new parallel applications.
diff --git a/tex/cluster-0.tex b/tex/cluster-0.tex
@@ -0,0 +1,21 @@
+%\centering%
+\begin{tikzpicture}[remember picture,x=1.5cm,y=-1.5cm]%
+ \node[Node] (A1) at (0,0) {B};
+ \node[Node] (B1) at (1,0) {A};
+ \node[Node] (C1) at (0,1) {C};
+ \node[Node] (D1) at (1,1) {D};
+
+ \node[Switch] (S1) at (.5,.5) {};
+
+ \path[PhysicalLink] (A1) edge (S1);
+ \path[PhysicalLink] (B1) edge (S1);
+ \path[PhysicalLink] (C1) edge (S1);
+ \path[PhysicalLink] (D1) edge (S1);
+
+ \path[NetworkLink] (A1) edge (B1);
+ \path[NetworkLink] (A1) edge (C1);
+ \path[NetworkLink] (A1) edge (D1);
+ \path[NetworkLink] (B1) edge (C1);
+ \path[NetworkLink] (B1) edge (D1);
+ \path[NetworkLink] (C1) edge (D1);
+\end{tikzpicture}%
diff --git a/tex/frame-0.tex b/tex/frame-0.tex
@@ -0,0 +1,15 @@
+\input{tex/cluster-0}
+\begin{tikzpicture}[remember picture,overlay]
+
+ \node[Daemon] (Ax) at (A1) {\phantom{A}};
+ \node[Daemon] (Bx) at (B1) {\phantom{B}};
+ \node[Daemon] (Cx) at (C1) {\phantom{C}};
+ \node[Daemon] (Dx) at (D1) {\phantom{D}};
+
+ \path[DaemonLink] (Ax) edge (Bx);
+ %\path[thick] (A1) edge (C1);
+ %\path[thick] (A1) edge (D1);
+ \path[DaemonLink] (Bx) edge (Cx);
+ \path[DaemonLink] (Bx) edge (Dx);
+ %\path[thick] (C1) edge (D1);
+\end{tikzpicture}
diff --git a/tex/frame-1.tex b/tex/frame-1.tex
@@ -0,0 +1,4 @@
+\input{tex/frame-0}
+\begin{tikzpicture}[remember picture,overlay]
+ \node[Task,label={[TaskLabel,label distance=0.001cm]90:\textbf{Main\vphantom{p}}}] (Master) at (A1.center) {\phantom{A}};
+\end{tikzpicture}%
diff --git a/tex/frame-2.tex b/tex/frame-2.tex
@@ -0,0 +1,5 @@
+\input{tex/frame-1}
+\begin{tikzpicture}[remember picture,overlay]
+ \node[Task,label={[TaskLabel,label distance=0.001cm]90:\textbf{Backup}}] (MasterCopy) at (B1.center) {\phantom{A}};
+ \path[TaskEdge] (Master) edge (MasterCopy);
+\end{tikzpicture}%
diff --git a/tex/frame-3.tex b/tex/frame-3.tex
@@ -0,0 +1,4 @@
+\input{tex/frame-2}
+\begin{tikzpicture}[remember picture,overlay]
+ \node[Task,label={[TaskLabel,label distance=0.001cm]0:\textbf{T\textsubscript{\color{spbuGreen}1}}}] (Task1) at (B1.center) {\phantom{A}};
+\end{tikzpicture}%
diff --git a/tex/frame-4.tex b/tex/frame-4.tex
@@ -0,0 +1,10 @@
+\input{tex/frame-3}
+\begin{tikzpicture}[remember picture,overlay]
+ \node[Task,label={[TaskLabel,label distance=0.001cm]180:\textbf{S\textsubscript{\color{spbuGreen}1}}}] (Sub1) at (A1.center) {\phantom{A}};
+ \node[Task,label={[TaskLabel,label distance=0.001cm]180:\textbf{S\textsubscript{\color{spbuGreen}2}}}] (Sub2) at (C1.center) {\phantom{A}};
+ \node[Task,label={[TaskLabel,label distance=0.001cm]0:\textbf{S\textsubscript{\color{spbuGreen}3}}}] (Sub3) at (D1.center) {\phantom{A}};
+ %\node[Task,label={[TaskLabel]0:Task1}] (Task1) at (B1.center) {\phantom{A}};
+ \path[TaskEdge] (Task1) edge[bend left] (Sub1);
+ \path[TaskEdge] (Task1) edge (Sub2);
+ \path[TaskEdge] (Task1) edge (Sub3);
+\end{tikzpicture}%
diff --git a/tex/legend.tex b/tex/legend.tex
@@ -0,0 +1,22 @@
+\small
+\begin{tikzpicture}[x=1.0cm,y=-0.5cm,framed]
+
+ \node[Node] at (0,0) {\phantom{A}};
+ \node[anchor=west] (X2) at (0.4,0) {\strut cluster node};
+ \node[Switch] at (4,0) {};
+ \node[anchor=west] (X2) at (4.4,0) {\strut network switch};
+ \draw[PhysicalLink] (3.8,1) -- (4.2,1);
+ \node[anchor=west] (X2) at (4.4,1) {\strut physical link};
+ \draw[NetworkLink] (3.8,2) -- (4.2,2);
+ \node[anchor=west] (X2) at (4.4,2) {\strut network link};
+
+ \node[Daemon,scale=0.6] at (0,3) {\phantom{A}};
+ \node[anchor=west] (X2) at (0.4,3) {\strut daemon process};
+ \draw[DaemonLink] (3.8,3) -- (4.2,3);
+ \node[anchor=west] (X2) at (4.4,3) {\strut node hierarchy link};
+
+ \node[Process,scale=0.6] (X3) at (0,4) {\phantom{A}};
+ \node[baseline=(X3.base),anchor=west] at (0.4,4) {\strut kernel};
+ \draw[ProcessEdge] (3.8,4) -- (4.2,4);
+ \node[anchor=west] (X2) at (4.4,4) {\strut kernel hierarchy link};
+\end{tikzpicture}
diff --git a/tex/preamble.tex b/tex/preamble.tex
@@ -0,0 +1,40 @@
+\usepackage{tikz}
+\usetikzlibrary{shapes}
+\usetikzlibrary{backgrounds}
+
+% corporate colors
+\definecolor{spbuTerracotta}{cmyk}{.08,.91,.92,.33}
+\definecolor{spbuGray}{cmyk}{.21,.11,.09,.22}
+
+\definecolor{spbuWhite1}{RGB}{245,246,245}
+\definecolor{spbuWhite2}{RGB}{230,231,230}
+\definecolor{spbuWhite3}{RGB}{217,218,217}
+
+\definecolor{spbuWhiteRed2}{RGB}{255,231,230}
+\definecolor{spbuWhiteRed3}{RGB}{255,160,160}
+\definecolor{spbuRed}{RGB}{200,40,40}
+
+\definecolor{spbuDarkGray}{HTML}{404040}
+\definecolor{spbuDarkGray2}{HTML}{5F7177}
+
+\definecolor{spbuGreen}{RGB}{40,160,40}
+\definecolor{spbuBlue}{RGB}{40,40,160}
+
+% physical layer
+\tikzset{Node/.style={rectangle,draw=spbuDarkGray,line width=2pt}}
+\tikzset{Switch/.style={circle,draw=gray,fill=gray,line width=4pt,solid}}
+\tikzset{PhysicalLink/.style={draw=gray,line width=4pt,solid}}
+\tikzset{NetworkLink/.style={draw=black,line width=2pt,dashed}}
+
+% logical layer
+\tikzset{Daemon/.style={ellipse,draw=spbuBlue,line width=2pt,solid,scale=0.65}}
+\tikzset{DaemonLink/.style={draw=spbuBlue,line width=2pt,solid}}
+
+% application layer
+\tikzset{Process/.style={ellipse,draw=spbuGreen,line width=2pt,solid}}
+\tikzset{ProcessEdge/.style={draw=spbuGreen,line width=2pt,solid}}
+
+\tikzset{Task/.style={Process,anchor=center,draw=spbuGreen}}
+\tikzset{TaskEdge/.style={ProcessEdge,draw=spbuGreen,solid,->}}
+\tikzset{Label/.style={label distance=0.1cm,text=spbuTerracotta}}
+\tikzset{TaskLabel/.style={label distance=0.1cm,text=spbuGreen}}