git clone https://git.igankevich.com/hpcs-17-subord.git
Log | Files | Refs

commit 5fa5a12c1a5a10b39bb508d998732d50e68be6c5
parent 16308bcb1fb776d31e823d57f7458a5f4746baa3
Author: Ivan Gankevich <igankevich@ya.ru>
Date:   Mon, 15 May 2017 15:38:35 +0300

Add more references.

bib/refs.bib | 36++++++++++++++++++++++++++++++++++++
src/tail.tex | 16++++++++++++++++
2 files changed, 52 insertions(+), 0 deletions(-)

diff --git a/bib/refs.bib b/bib/refs.bib @@ -73,3 +73,39 @@ year={2007}, organization={IOP Publishing} } + +@inproceedings{robertson2000linux, + title={{Linux-HA} Heartbeat System Design}, + author={Robertson, Alan}, + booktitle={Proc. of 4\textsuperscript{th} Annual Linux Showcase \& + Conference}, + year={2000}, + pages={305--316}, + address={Atlanta, Georgia}, + organization={USENIX}, + url={http://static.usenix.org/publications/library/proceedings/als00/2000papers/papers/full_papers/robertson/robertson_html/} +} + +@article{haddad2003ha, + title={{HA-OSCAR}: the birth of highly available {OSCAR}}, + author={Haddad, Ibrahim and Leangsuksun, Chokchai and Scott, Stephen L}, + journal={Linux Journal}, + volume={2003}, + number={115}, + pages={1}, + year={2003}, + publisher={Belltown Media} +} + +@article{leangsuksun2005achieving, + title={Achieving high availability and performance computing with an {HA-OSCAR} + cluster}, + author={Leangsuksun, Chokchai Box and Shen, Lixin and Liu, Tong and Scott, + Stephen L}, + journal={Future Generation Computer Systems}, + volume={21}, + number={4}, + pages={597--606}, + year={2005}, + publisher={Elsevier} +} diff --git a/src/tail.tex b/src/tail.tex @@ -49,6 +49,22 @@ kernels (their execution order) and, consequently, defines for each kernel a principal kernel, responsibility of which is to re-execute failed subordinate kernels upon a failure. +With respect to various high-availability cluster +projects~\cite{robertson2000linux,haddad2003ha,leangsuksun2005achieving} our +approach has the following advantages. First, it scales to a large number +of nodes, as only point-to-point communication between slave and master nodes is +used instead of broadcast messages (which has been shown in the previous +work~\cite{gankevich2015subordination}); hence, the use of several switches and +routers is possible within a single cluster. 
Second, our approach does not +require the use of standby servers to provide high availability of a master +node: we provide fault tolerance at the kernel layer instead. As the computation +progresses, kernels copy themselves to nodes that are directly connected to the +current one, and these can be any nodes from the cluster. Finally, +high-availability cluster projects do not deal with parallel programme +failures; they aim to provide high availability for services running on the master +node (NFS, SMB, DHCP, etc.), whereas our approach is specifically targeted at +providing continuous execution of parallel applications. + \section{Conclusion} In the paper we propose a system architecture consisting of two tree