commit b0b499cdb266a7a6ee0e0b0da2e7cc321f154c4a
parent d2b8c47cdfd8468780493058b7a1e1fdd241c85e
Author: Ivan Gankevich <igankevich@ya.ru>
Date: Tue, 24 Oct 2017 20:24:04 +0300
Add master/slave node failure benchmarks.
Diffstat:
 R/benchmarks.R  | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 arma-thesis.org | 99 +++++++++++++++++++++++++++++++++++--------------------------------------------
 setup.org       |  2 +-
3 files changed, 104 insertions(+), 61 deletions(-)
diff --git a/R/benchmarks.R b/R/benchmarks.R
@@ -263,10 +263,7 @@ arma.print_table_for_realtime_data <- function (data, routine_names, column_name
ascii(all_data, include.rownames=FALSE, digits=4)
}
-arma.load_bscheduler_data <- function () {
- all_test_cases <- list(c("a9-single-node-direct", "openmp", "m1"),
- c("a9-single-node-direct", "bscheduler", "m1"),
- c("a9-two-nodes-direct", "bscheduler", "m1"))
+arma.load_bscheduler_data <- function (all_test_cases) {
all_data = data.frame(
framework=rep(NA,0),
size=rep(NA,0),
@@ -301,7 +298,21 @@ arma.load_bscheduler_data <- function () {
all_data
}
-arma.plot_bscheduler_data <- function (all_data, names) {
+arma.load_bscheduler_performance_data <- function() {
+ all_test_cases <- list(c("a9-single-node-direct", "openmp", "m1"),
+ c("a9-single-node-direct", "bscheduler", "m1"),
+ c("a9-two-nodes-direct", "bscheduler", "m1"))
+ arma.load_bscheduler_data(all_test_cases)
+}
+
+arma.load_master_slave_failure_data <- function() {
+ all_test_cases <- list(c("a10-failure-direct-slave", "bscheduler", "m1"),
+ c("a9-single-node-direct", "bscheduler", "m1"),
+ c("a10-failure-direct-master", "bscheduler", "m1"))
+ arma.load_bscheduler_data(all_test_cases)
+}
+
+arma.plot_bscheduler_performance_data <- function (all_data, names) {
plot.new()
plot.window(xlim=range(all_data$size), ylim=range(0,all_data$t))
conf <- list(
@@ -343,3 +354,46 @@ arma.plot_bscheduler_data <- function (all_data, names) {
axis(2)
box()
}
+
+arma.plot_master_slave_failure_data <- function (all_data, names) {
+ plot.new()
+ plot.window(xlim=range(all_data$size), ylim=range(0,all_data$t))
+ conf <- list(
+ a=list(
+ framework='a10-failure-direct-master-bscheduler',
+ color='#000000',
+ lty="solid",
+ lwd=1,
+ name=names$master
+ ),
+ b=list(
+ framework='a10-failure-direct-slave-bscheduler',
+ color='#000000',
+ lty="dashed",
+ lwd=1,
+ name=names$slave
+ ),
+ c=list(
+ framework='a9-single-node-direct-bscheduler',
+ color='#000000',
+ lty="dotted",
+ lwd=1,
+ name=names$nofailures
+ )
+ )
+ for (c in conf) {
+ data <- all_data[all_data$framework==c$framework, ]
+    lines(data$size, data$t, col=c$color, lty=c$lty, lwd=c$lwd)
+ points(data$size, data$t, col=c$color)
+ }
+ legend(
+ "bottomright",
+ legend=sapply(conf, function (c) c$name),
+ col=sapply(conf, function (c) c$color),
+ lty=sapply(conf, function (c) c$lty),
+ lwd=sapply(conf, function (c) c$lwd)
+ )
+ axis(1)
+ axis(2)
+ box()
+}
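
With this change the loader is parameterised by the list of test cases, so any
new figure can reuse it with its own set of runs. A minimal sketch of such a
reuse, assuming the same (attempt, framework, node) triple layout as in the
wrappers above; the "a11-hypothetical" attempt name is invented for
illustration:

#+begin_src R
# Sketch: reuse the generic loader with a custom list of test cases.
# Each triple is (attempt, framework, node), as in the wrappers above;
# "a11-hypothetical" is an invented attempt name, not a real run.
source(file.path("R", "benchmarks.R"))
all_test_cases <- list(
  c("a9-single-node-direct", "bscheduler", "m1"),
  c("a11-hypothetical", "bscheduler", "m1")
)
data <- arma.load_bscheduler_data(all_test_cases)
#+end_src
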
diff --git a/arma-thesis.org b/arma-thesis.org
@@ -3341,7 +3341,7 @@ which only demands explicit marking of replicated kernels.
In a series of experiments, the performance of the new version of the
application in the presence of different types of failures was benchmarked
(numbers correspond
-to the graphs in fig.\nbsp{}[[fig-benchmark]]):
+to the graphs in fig.\nbsp{}[[fig-master-slave-failure]]):
1) no failures,
2) failure of a subordinate node (a node where a part of the wavy surface is
generated),
the total run time without failures on a single node. The application
immediately recognised the node as offline, because the corresponding
connection was closed; in a real-world scenario, however, the failure is
detected after a configurable time-out. The results of these runs were
compared to the run
-without node failures (fig.\nbsp{}[[fig-benchmark]] and\nbsp{}[[fig-slowdown]]).
+without node failures (fig.\nbsp{}[[fig-master-slave-failure]]).
There is a considerable difference in overall application performance for
different types of failures. Graphs\nbsp{}2 and\nbsp{}3 in
-fig.\nbsp{}[[fig-benchmark]] show that performance in case of principal and
-subordinate node failure is the same. In case of principal node failure a backup
-node stores a copy of the main kernel and uses this copy when it detects failure
-of the principal node. In case of subordinate node failure, the principal node
-redistributes the non-returning kernels between remaining subordinate nodes. In
-both cases the state of the main kernel is not lost and no time is spent to
-restore it, which explains similar performance.
-
-Graph\nbsp{}4 in fig.\nbsp{}[[fig-benchmark]] shows that performance in case of a
-backup node failure is much lower than in other cases. It happens because
-principal node stores only the state of the current step of the computation plus
-some additional fixed amount of data, whereas a backup node not only stores the
-copy of this data, but executes the step in parallel with other subordinate
-nodes. So, when a backup node fails, the principal node executes the whole step
-once again on arbitrarily chosen survived node.
-
-To measure how much time is lost due to a node failure the total execution time
-with a failure was divided by the total execution time without the failure but
-with the number of nodes minus one. This relation is obtained from the same
-benchmark and presented in fig.\nbsp{}[[fig-slowdown]]. The difference in
-performance in case of principal and subordinate node failures lies within 5%
-margin, and in case of backup node failure within 50% margin for the number of
-node less than 6[fn::Measuring this margin for higher number of nodes does not
-make sense since time before failure is greater than total execution time with
-these numbers of nodes, and programme's execution finishes before a failure
-occurs.]. Increase in execution time of 50% is more than \(1/3\) of execution
-time after which a failure occurs, but backup node failure needs some time to be
-discovered: it is detected only when subordinate kernel carrying the copy of the
-main kernel finishes its execution and tries to reach its parent. Instant
-detection requires abrupt stopping of the subordinate kernel which may be
-inapplicable for programmes with complicated logic.
-
-#+name: fig-benchmark
-#+begin_src R :file build/benchmark-xxx.pdf
-# TODO
-plot(c(1:10))
+fig.\nbsp{}[[fig-master-slave-failure]] show that performance in case of principal
+and subordinate node failures is the same. In case of a principal node failure,
+a backup node stores a copy of the main kernel and uses this copy when it
+detects failure of the principal node. In case of a subordinate node failure,
+the principal node redistributes the non-returning kernels among the remaining
+subordinate nodes. In both cases the state of the main kernel is not lost and
+no time is spent restoring it, which explains the similar performance.
+
+Graph\nbsp{}4 in fig.\nbsp{}[[fig-master-slave-failure]] shows that performance in
+case of a backup node failure is much lower than in the other cases. This
+happens because the principal node stores only the state of the current step
+of the computation plus some additional fixed amount of data, whereas a backup
+node not only stores a copy of this data, but also executes the step in
+parallel with other subordinate nodes. So, when a backup node fails, the
+principal node executes the whole step once again on an arbitrarily chosen
+surviving node.
+
+A backup node failure takes some time to discover: it is detected only when
+the subordinate kernel carrying the copy of the main kernel finishes its
+execution and tries to reach its parent. Instant detection requires abrupt
+stopping of the subordinate kernel, which may be inapplicable for programmes
+with complicated logic.
+
+#+name: fig-master-slave-failure
+#+begin_src R :file build/master-slave-failure.pdf
+source(file.path("R", "benchmarks.R"))
+par(family="serif")
+data <- arma.load_master_slave_failure_data()
+arma.plot_master_slave_failure_data(
+ data,
+ list(
+ master="Bscheduler (master failure)",
+ slave="Bscheduler (slave failure)",
+ nofailures="Bscheduler (no failures)"
+ )
+)
+title(xlab="Wavy surface size", ylab="Time, s")
#+end_src
-#+caption: Performance of hydrodynamics HPC application in the presence of node failures.
-#+name: fig-benchmark
-#+RESULTS: fig-benchmark
-[[file:build/benchmark-xxx.pdf]]
+#+caption: Performance of AR model in the presence of node failures.
+#+name: fig-master-slave-failure
+#+RESULTS: fig-master-slave-failure
+[[file:build/master-slave-failure.pdf]]
The results of the benchmark lead to the conclusion that no matter whether a
principal or a subordinate node fails, the overall performance of a parallel
programme roughly equals that of a run without failures on the number of nodes
minus one; when a backup node fails, however, the performance penalty is much
higher.
-#+name: fig-slowdown
-#+begin_src R :file build/slowdown-xxx.pdf
-# TODO
-plot(c(1:10))
-#+end_src
-
-#+caption: Slowdown of the hydrodynamics HPC application in the presence of different types of node failures compared to execution without failures but with the number of nodes minus one.
-#+name: fig-slowdown
-#+RESULTS: fig-slowdown
-[[file:build/slowdown-xxx.pdf]]
-
**** Discussion of test results.
The fail over algorithm guarantees handling of one failure per sequential
programme step; more failures can be tolerated if they do not affect the
principal node.
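
The failure handling described above can be condensed into a toy sketch. This
is a model of the described behaviour only, not Bscheduler code; the fixed
node labels and the =handle_failure= helper are invented for illustration:

#+begin_src R
# Toy model of the failover rules described in the text above.
# kernels is a named vector mapping kernel name -> node it runs on.
handle_failure <- function(failed, nodes, kernels) {
  survivors <- setdiff(nodes, failed)
  if (failed == "principal") {
    # a backup node holds a copy of the main kernel: no state is lost
    list(redo_step=FALSE, nodes=survivors, kernels=kernels)
  } else if (failed == "backup") {
    # only the current-step state on the principal survives: redo the step
    list(redo_step=TRUE, nodes=survivors, kernels=kernels)
  } else {
    # a subordinate failed: redistribute its non-returning kernels
    n <- sum(kernels == failed)
    kernels[kernels == failed] <- sample(survivors, n, replace=TRUE)
    list(redo_step=FALSE, nodes=survivors, kernels=kernels)
  }
}

nodes <- c("principal", "backup", "s1", "s2")
kernels <- c(k1="s1", k2="s2", k3="s1")
handle_failure("s1", nodes, kernels)  # k1 and k3 move to surviving nodes
#+end_src

Only the backup-node branch sets =redo_step=, which mirrors why graph\nbsp{}4
is the slowest case in the figure above.
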
@@ -3635,8 +3624,8 @@ to small data transmission overhead of direct network link.
#+begin_src R :file build/bscheduler-performance.pdf
source(file.path("R", "benchmarks.R"))
par(family="serif")
-data <- arma.load_bscheduler_data()
-arma.plot_bscheduler_data(
+data <- arma.load_bscheduler_performance_data()
+arma.plot_bscheduler_performance_data(
data,
list(
openmp="OpenMP",
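
The benchmark conclusion earlier in the section compares each failure run with
a failure-free run on one node fewer. A sketch of that slowdown metric,
assuming vectors of run times indexed by node count (the names =t_failure= and
=t_nofailure= are hypothetical, not produced by the loader):

#+begin_src R
# Slowdown: run time with a failure on n nodes divided by run time
# without failures on n-1 nodes, as in the comparison described above.
arma.slowdown <- function(t_failure, t_nofailure, n) {
  t_failure[n] / t_nofailure[n - 1]
}
#+end_src
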
diff --git a/setup.org b/setup.org
@@ -1239,7 +1239,7 @@ fi
cd $dir
git checkout master
git pull
-git checkout 82c6f79f6c7bab3d92672edf2cdc6ccec56eee6d
+git checkout e478211f2682faabf32cf7a95cf21fb361ddcd48
#+end_src
#+RESULTS: