commit b0b499cdb266a7a6ee0e0b0da2e7cc321f154c4a
parent d2b8c47cdfd8468780493058b7a1e1fdd241c85e
Author: Ivan Gankevich <igankevich@ya.ru>
Date: Tue, 24 Oct 2017 20:24:04 +0300
Add master/slave node failure benchmarks.
Diffstat:
 R/benchmarks.R  | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 arma-thesis.org | 99 +++++++++++++++++++++++++++++++++++--------------------------------------------
 setup.org       |  2 +-
3 files changed, 104 insertions(+), 61 deletions(-)
diff --git a/R/benchmarks.R b/R/benchmarks.R
@@ -263,10 +263,7 @@ arma.print_table_for_realtime_data <- function (data, routine_names, column_name
ascii(all_data, include.rownames=FALSE, digits=4)
}
-arma.load_bscheduler_data <- function () {
- all_test_cases <- list(c("a9-single-node-direct", "openmp", "m1"),
- c("a9-single-node-direct", "bscheduler", "m1"),
- c("a9-two-nodes-direct", "bscheduler", "m1"))
+arma.load_bscheduler_data <- function (all_test_cases) {
all_data = data.frame(
framework=rep(NA,0),
size=rep(NA,0),
@@ -301,7 +298,21 @@ arma.load_bscheduler_data <- function () {
all_data
}
-arma.plot_bscheduler_data <- function (all_data, names) {
+arma.load_bscheduler_performance_data <- function() {
+ all_test_cases <- list(c("a9-single-node-direct", "openmp", "m1"),
+ c("a9-single-node-direct", "bscheduler", "m1"),
+ c("a9-two-nodes-direct", "bscheduler", "m1"))
+ arma.load_bscheduler_data(all_test_cases)
+}
+
+arma.load_master_slave_failure_data <- function() {
+ all_test_cases <- list(c("a10-failure-direct-slave", "bscheduler", "m1"),
+ c("a9-single-node-direct", "bscheduler", "m1"),
+ c("a10-failure-direct-master", "bscheduler", "m1"))
+ arma.load_bscheduler_data(all_test_cases)
+}
+
+arma.plot_bscheduler_performance_data <- function (all_data, names) {
plot.new()
plot.window(xlim=range(all_data$size), ylim=range(0,all_data$t))
conf <- list(
@@ -343,3 +354,46 @@ arma.plot_bscheduler_data <- function (all_data, names) {
axis(2)
box()
}
+
+arma.plot_master_slave_failure_data <- function (all_data, names) {
+ plot.new()
+ plot.window(xlim=range(all_data$size), ylim=range(0,all_data$t))
+ conf <- list(
+ a=list(
+ framework='a10-failure-direct-master-bscheduler',
+ color='#000000',
+ lty="solid",
+ lwd=1,
+ name=names$master
+ ),
+ b=list(
+ framework='a10-failure-direct-slave-bscheduler',
+ color='#000000',
+ lty="dashed",
+ lwd=1,
+ name=names$slave
+ ),
+ c=list(
+ framework='a9-single-node-direct-bscheduler',
+ color='#000000',
+ lty="dotted",
+ lwd=1,
+ name=names$nofailures
+ )
+ )
+ for (c in conf) {
+ data <- all_data[all_data$framework==c$framework, ]
+    lines(data$size, data$t, col=c$color, lty=c$lty, lwd=c$lwd)
+ points(data$size, data$t, col=c$color)
+ }
+ legend(
+ "bottomright",
+ legend=sapply(conf, function (c) c$name),
+ col=sapply(conf, function (c) c$color),
+ lty=sapply(conf, function (c) c$lty),
+ lwd=sapply(conf, function (c) c$lwd)
+ )
+ axis(1)
+ axis(2)
+ box()
+}
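
With this change the loader is parameterised by the list of test cases, so any
new figure can reuse it with its own set of runs. A minimal sketch of such a
reuse, assuming the same (attempt, framework, node) triple layout as in the
wrappers above; the "a11-hypothetical" attempt name is invented for
illustration:

#+begin_src R
# Sketch: reuse the generic loader with a custom list of test cases.
# Each triple is (attempt, framework, node), as in the wrappers above;
# "a11-hypothetical" is an invented attempt name, not a real run.
source(file.path("R", "benchmarks.R"))
all_test_cases <- list(
  c("a9-single-node-direct", "bscheduler", "m1"),
  c("a11-hypothetical", "bscheduler", "m1")
)
data <- arma.load_bscheduler_data(all_test_cases)
#+end_src
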
diff --git a/arma-thesis.org b/arma-thesis.org
@@ -3341,7 +3341,7 @@ which only demands explicit marking of replicated kernels.
In a series of experiments, the performance of the new version of the
application in the presence of different types of failures was benchmarked
(numbers correspond
-to the graphs in fig.\nbsp{}[[fig-benchmark]]):
+to the graphs in fig.\nbsp{}[[fig-master-slave-failure]]):
1) no failures,
2) failure of a subordinate node (a node where a part of the wavy surface is
generated),
the total run time without failures on a single node. The application
immediately recognised the node as offline, because the corresponding
connection was closed; in a real-world scenario, however, the failure is
detected after a configurable time-out. The results of these runs were
compared to the run
-without node failures (fig.\nbsp{}[[fig-benchmark]] and\nbsp{}[[fig-slowdown]]).
+without node failures (fig.\nbsp{}[[fig-master-slave-failure]]).
There is a considerable difference in overall application performance for
different types of failures. Graphs\nbsp{}2 and\nbsp{}3 in
-fig.\nbsp{}[[fig-benchmark]] show that performance in case of principal and
-subordinate node failure is the same. In case of principal node failure a backup
-node stores a copy of the main kernel and uses this copy when it detects failure
-of the principal node. In case of subordinate node failure, the principal node
-redistributes the non-returning kernels between remaining subordinate nodes. In
-both cases the state of the main kernel is not lost and no time is spent to
-restore it, which explains similar performance.
-
-Graph\nbsp{}4 in fig.\nbsp{}[[fig-benchmark]] shows that performance in case of a
-backup node failure is much lower than in other cases. It happens because
-principal node stores only the state of the current step of the computation plus
-some additional fixed amount of data, whereas a backup node not only stores the
-copy of this data, but executes the step in parallel with other subordinate
-nodes. So, when a backup node fails, the principal node executes the whole step
-once again on arbitrarily chosen survived node.
-
-To measure how much time is lost due to a node failure the total execution time
-with a failure was divided by the total execution time without the failure but
-with the number of nodes minus one. This relation is obtained from the same
-benchmark and presented in fig.\nbsp{}[[fig-slowdown]]. The difference in
-performance in case of principal and subordinate node failures lies within 5%
-margin, and in case of backup node failure within 50% margin for the number of
-node less than 6[fn::Measuring this margin for higher number of nodes does not
-make sense since time before failure is greater than total execution time with
-these numbers of nodes, and programme's execution finishes before a failure
-occurs.]. Increase in execution time of 50% is more than \(1/3\) of execution
-time after which a failure occurs, but backup node failure needs some time to be
-discovered: it is detected only when subordinate kernel carrying the copy of the
-main kernel finishes its execution and tries to reach its parent. Instant
-detection requires abrupt stopping of the subordinate kernel which may be
-inapplicable for programmes with complicated logic.
-
-#+name: fig-benchmark
-#+begin_src R :file build/benchmark-xxx.pdf
-# TODO
-plot(c(1:10))
+fig.\nbsp{}[[fig-master-slave-failure]] show that performance in case of principal
+and subordinate node failures is the same. In case of a principal node failure,
+a backup node stores a copy of the main kernel and uses this copy when it
+detects failure of the principal node. In case of a subordinate node failure,
+the principal node redistributes the non-returning kernels among the remaining
+subordinate nodes. In both cases the state of the main kernel is not lost and
+no time is spent restoring it, which explains the similar performance.
+
+Graph\nbsp{}4 in fig.\nbsp{}[[fig-master-slave-failure]] shows that performance in
+case of a backup node failure is much lower than in the other cases. This
+happens because the principal node stores only the state of the current step
+of the computation plus some additional fixed amount of data, whereas a backup
+node not only stores a copy of this data, but also executes the step in
+parallel with other subordinate nodes. So, when a backup node fails, the
+principal node executes the whole step once again on an arbitrarily chosen
+surviving node.
+
+A backup node failure takes some time to discover: it is detected only when
+the subordinate kernel carrying the copy of the main kernel finishes its
+execution and tries to reach its parent. Instant detection requires abrupt
+stopping of the subordinate kernel, which may be inapplicable for programmes
+with complicated logic.
+
+#+name: fig-master-slave-failure
+#+begin_src R :file build/master-slave-failure.pdf
+source(file.path("R", "benchmarks.R"))
+par(family="serif")
+data <- arma.load_master_slave_failure_data()
+arma.plot_master_slave_failure_data(
+ data,
+ list(
+ master="Bscheduler (master failure)",
+ slave="Bscheduler (slave failure)",
+ nofailures="Bscheduler (no failures)"
+ )
+)
+title(xlab="Wavy surface size", ylab="Time, s")
#+end_src
-#+caption: Performance of hydrodynamics HPC application in the presence of node failures.
-#+name: fig-benchmark
-#+RESULTS: fig-benchmark
-[[file:build/benchmark-xxx.pdf]]
+#+caption: Performance of AR model in the presence of node failures.
+#+name: fig-master-slave-failure
+#+RESULTS: fig-master-slave-failure
+[[file:build/master-slave-failure.pdf]]
The results of the benchmark lead to the conclusion that no matter whether a
principal or a subordinate node fails, the overall performance of a parallel
programme roughly equals that of a run without failures on the number of nodes
minus one; when a backup node fails, however, the performance penalty is much
higher.
-#+name: fig-slowdown
-#+begin_src R :file build/slowdown-xxx.pdf
-# TODO
-plot(c(1:10))
-#+end_src
-
-#+caption: Slowdown of the hydrodynamics HPC application in the presence of different types of node failures compared to execution without failures but with the number of nodes minus one.
-#+name: fig-slowdown
-#+RESULTS: fig-slowdown
-[[file:build/slowdown-xxx.pdf]]
-
**** Discussion of test results.
The fail over algorithm guarantees handling of one failure per sequential
programme step; more failures can be tolerated if they do not affect the
principal node.
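
The failure handling described above can be condensed into a toy sketch. This
is a model of the described behaviour only, not Bscheduler code; the fixed
node labels and the =handle_failure= helper are invented for illustration:

#+begin_src R
# Toy model of the failover rules described in the text above.
# kernels is a named vector mapping kernel name -> node it runs on.
handle_failure <- function(failed, nodes, kernels) {
  survivors <- setdiff(nodes, failed)
  if (failed == "principal") {
    # a backup node holds a copy of the main kernel: no state is lost
    list(redo_step=FALSE, nodes=survivors, kernels=kernels)
  } else if (failed == "backup") {
    # only the current-step state on the principal survives: redo the step
    list(redo_step=TRUE, nodes=survivors, kernels=kernels)
  } else {
    # a subordinate failed: redistribute its non-returning kernels
    n <- sum(kernels == failed)
    kernels[kernels == failed] <- sample(survivors, n, replace=TRUE)
    list(redo_step=FALSE, nodes=survivors, kernels=kernels)
  }
}

nodes <- c("principal", "backup", "s1", "s2")
kernels <- c(k1="s1", k2="s2", k3="s1")
handle_failure("s1", nodes, kernels)  # k1 and k3 move to surviving nodes
#+end_src

Only the backup-node branch sets =redo_step=, which mirrors why graph\nbsp{}4
is the slowest case in the figure above.
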
@@ -3635,8 +3624,8 @@ to small data transmission overhead of direct network link.
#+begin_src R :file build/bscheduler-performance.pdf
source(file.path("R", "benchmarks.R"))
par(family="serif")
-data <- arma.load_bscheduler_data()
-arma.plot_bscheduler_data(
+data <- arma.load_bscheduler_performance_data()
+arma.plot_bscheduler_performance_data(
data,
list(
openmp="OpenMP",
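
The benchmark conclusion earlier in the section compares each failure run with
a failure-free run on one node fewer. A sketch of that slowdown metric,
assuming vectors of run times indexed by node count (the names =t_failure= and
=t_nofailure= are hypothetical, not produced by the loader):

#+begin_src R
# Slowdown: run time with a failure on n nodes divided by run time
# without failures on n-1 nodes, as in the comparison described above.
arma.slowdown <- function(t_failure, t_nofailure, n) {
  t_failure[n] / t_nofailure[n - 1]
}
#+end_src
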
diff --git a/setup.org b/setup.org
@@ -1239,7 +1239,7 @@ fi
cd $dir
git checkout master
git pull
-git checkout 82c6f79f6c7bab3d92672edf2cdc6ccec56eee6d
+git checkout e478211f2682faabf32cf7a95cf21fb361ddcd48
#+end_src
#+RESULTS: