commit f717550ed77b75bce20fd9f00898b0809167c5e6
parent 27d412fd40a3ccc4322f412eec7f80ccfb85e8f2
Author: Ivan Gankevich <igankevich@ya.ru>
Date: Fri, 24 Mar 2017 17:33:38 +0300
Refer to the previous papers.
Diffstat:
3 files changed, 32 insertions(+), 2 deletions(-)
diff --git a/bib/refs.bib b/bib/refs.bib
@@ -39,3 +39,25 @@
year={2015},
publisher={IEEE}
}
+
+@inproceedings{gankevich2015subordination,
+ title={Subordination: Cluster management without distributed consensus},
+ author={Gankevich, Ivan and Tipikin, Yuri and Gaiduchok, Vladimir},
+ booktitle={High Performance Computing \& Simulation (HPCS), 2015
+ International Conference on},
+ pages={639--642},
+ year={2015},
+ organization={IEEE}
+}
+
+
+@inproceedings{gankevich2016factory,
+ title={Factory: Non-stop batch jobs without checkpointing},
+ author={Gankevich, Ivan and Tipikin, Yuri and Korkhov, Vladimir and
+ Gaiduchok, Vladimir},
+ booktitle={High Performance Computing \& Simulation (HPCS), 2016
+ International Conference on},
+ pages={979--984},
+ year={2016},
+ organization={IEEE}
+}
diff --git a/src/body.tex b/src/body.tex
@@ -1,7 +1,7 @@
\section{Cluster scheduler architecture}
\subsection{Overview}
-This scheduler has layered architecture:
+Our framework has a layered architecture:
\begin{itemize}
\item \textit{Physical layer.} Consists of nodes and direct/routed network
diff --git a/src/head.tex b/src/head.tex
@@ -12,6 +12,8 @@ parallel and sequential parts. Using different fault tolerant scenarios based
on hierarchy interactions, this framework provides continuous execution of a
parallel programme in case of hardware errors or electricity outages.
+\subsection{Computational kernel hierarchy}
+
The framework provides classes and methods to simplify development of
distributed applications and middleware. The focus is to make distributed
application resilient to failures, i.e.~make it fault tolerant and highly
@@ -58,7 +60,7 @@ are necessary because calls are asynchronous and one must wait before
subordinate kernels complete their work. Pipelines allow circumventing active
wait, and call correct kernel methods by analysing their internal state.
-\section{Related work}
+\subsection{Related work}
The feature that distinguishes our research with respect to some others, is the
use of hierarchy as the only possible way of defining dependencies between
@@ -108,3 +110,9 @@ hierarchy~--- an abstraction which defines strict total order on a set of
kernels (their execution order) and, consequently, defines for each kernel a
principal kernel, responsibility of which is to re-execute failed subordinate
kernels upon a failure.
+
+In this paper we present an algorithm that guarantees continuous execution of a
+parallel programme upon failure of all nodes except one. This algorithm is
+based on the one developed in previous
+papers~\cite{gankevich2015subordination,gankevich2016factory}, which tolerates
+only one node failure at a time without interrupting programme execution.