arma-thesis

git clone https://git.igankevich.com/arma-thesis.git

commit b0b499cdb266a7a6ee0e0b0da2e7cc321f154c4a
parent d2b8c47cdfd8468780493058b7a1e1fdd241c85e
Author: Ivan Gankevich <igankevich@ya.ru>
Date:   Tue, 24 Oct 2017 20:24:04 +0300

Add master/slave node failure benchmarks.

Diffstat:
R/benchmarks.R | 64+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
arma-thesis.org | 99+++++++++++++++++++++++++++++++++++--------------------------------------------
setup.org | 2+-
3 files changed, 104 insertions(+), 61 deletions(-)

diff --git a/R/benchmarks.R b/R/benchmarks.R
@@ -263,10 +263,7 @@ arma.print_table_for_realtime_data <- function (data, routine_names, column_name
     ascii(all_data, include.rownames=FALSE, digits=4)
 }
 
-arma.load_bscheduler_data <- function () {
-    all_test_cases <- list(c("a9-single-node-direct", "openmp", "m1"),
-                           c("a9-single-node-direct", "bscheduler", "m1"),
-                           c("a9-two-nodes-direct", "bscheduler", "m1"))
+arma.load_bscheduler_data <- function (all_test_cases) {
     all_data = data.frame(
         framework=rep(NA,0),
         size=rep(NA,0),
@@ -301,7 +298,21 @@ arma.load_bscheduler_data <- function () {
     all_data
 }
 
-arma.plot_bscheduler_data <- function (all_data, names) {
+arma.load_bscheduler_performance_data <- function() {
+    all_test_cases <- list(c("a9-single-node-direct", "openmp", "m1"),
+                           c("a9-single-node-direct", "bscheduler", "m1"),
+                           c("a9-two-nodes-direct", "bscheduler", "m1"))
+    arma.load_bscheduler_data(all_test_cases)
+}
+
+arma.load_master_slave_failure_data <- function() {
+    all_test_cases <- list(c("a10-failure-direct-slave", "bscheduler", "m1"),
+                           c("a9-single-node-direct", "bscheduler", "m1"),
+                           c("a10-failure-direct-master", "bscheduler", "m1"))
+    arma.load_bscheduler_data(all_test_cases)
+}
+
+arma.plot_bscheduler_performance_data <- function (all_data, names) {
     plot.new()
     plot.window(xlim=range(all_data$size), ylim=range(0,all_data$t))
     conf <- list(
@@ -343,3 +354,46 @@ arma.plot_bscheduler_data <- function (all_data, names) {
     axis(2)
     box()
 }
+
+arma.plot_master_slave_failure_data <- function (all_data, names) {
+    plot.new()
+    plot.window(xlim=range(all_data$size), ylim=range(0,all_data$t))
+    conf <- list(
+        a=list(
+            framework='a10-failure-direct-master-bscheduler',
+            color='#000000',
+            lty="solid",
+            lwd=1,
+            name=names$master
+        ),
+        b=list(
+            framework='a10-failure-direct-slave-bscheduler',
+            color='#000000',
+            lty="dashed",
+            lwd=1,
+            name=names$slave
+        ),
+        c=list(
+            framework='a9-single-node-direct-bscheduler',
+            color='#000000',
+            lty="dotted",
+            lwd=1,
+            name=names$nofailures
+        )
+    )
+    for (c in conf) {
+        data <- all_data[all_data$framework==c$framework, ]
+        lines(data$size, data$t, col=c$color, lty=c$lty)
+        points(data$size, data$t, col=c$color)
+    }
+    legend(
+        "bottomright",
+        legend=sapply(conf, function (c) c$name),
+        col=sapply(conf, function (c) c$color),
+        lty=sapply(conf, function (c) c$lty),
+        lwd=sapply(conf, function (c) c$lwd)
+    )
+    axis(1)
+    axis(2)
+    box()
+}
diff --git a/arma-thesis.org b/arma-thesis.org
@@ -3341,7 +3341,7 @@ which only demands explicit marking of replicated kernels.
 In a series of experiments performance of the new version of the application in
 the presence of different types of failures was benchmarked (numbers correspond
-to the graphs in fig.\nbsp{}[[fig-benchmark]]):
+to the graphs in fig.\nbsp{}[[fig-master-slave-failure]]):
 1) no failures,
 2) failure of a subordinate node (a node where a part of wavy surface is
    generated),
@@ -3355,69 +3355,58 @@ the total run time without failures on a single node. The application
 immediately recognised node as offline, because the corresponding connection
 was closed; in real-world scenario, however, the failure is detected after a
 configurable time-out. The results of these runs were compared to the run
-without node failures (fig.\nbsp{}[[fig-benchmark]] and\nbsp{}[[fig-slowdown]]).
+without node failures (fig.\nbsp{}[[fig-master-slave-failure]]).
 
 There is considerable difference in overall application performance for
 different types of failures. Graphs\nbsp{}2 and\nbsp{}3 in
-fig.\nbsp{}[[fig-benchmark]] show that performance in case of principal and
-subordinate node failure is the same. In case of principal node failure a backup
-node stores a copy of the main kernel and uses this copy when it detects failure
-of the principal node. In case of subordinate node failure, the principal node
-redistributes the non-returning kernels between remaining subordinate nodes. In
-both cases the state of the main kernel is not lost and no time is spent to
-restore it, which explains similar performance.
-
-Graph\nbsp{}4 in fig.\nbsp{}[[fig-benchmark]] shows that performance in case of a
-backup node failure is much lower than in other cases. It happens because
-principal node stores only the state of the current step of the computation plus
-some additional fixed amount of data, whereas a backup node not only stores the
-copy of this data, but executes the step in parallel with other subordinate
-nodes. So, when a backup node fails, the principal node executes the whole step
-once again on arbitrarily chosen survived node.
-
-To measure how much time is lost due to a node failure the total execution time
-with a failure was divided by the total execution time without the failure but
-with the number of nodes minus one. This relation is obtained from the same
-benchmark and presented in fig.\nbsp{}[[fig-slowdown]]. The difference in
-performance in case of principal and subordinate node failures lies within 5%
-margin, and in case of backup node failure within 50% margin for the number of
-node less than 6[fn::Measuring this margin for higher number of nodes does not
-make sense since time before failure is greater than total execution time with
-these numbers of nodes, and programme's execution finishes before a failure
-occurs.]. Increase in execution time of 50% is more than \(1/3\) of execution
-time after which a failure occurs, but backup node failure needs some time to be
-discovered: it is detected only when subordinate kernel carrying the copy of the
-main kernel finishes its execution and tries to reach its parent. Instant
-detection requires abrupt stopping of the subordinate kernel which may be
-inapplicable for programmes with complicated logic.
-
-#+name: fig-benchmark
-#+begin_src R :file build/benchmark-xxx.pdf
-# TODO
-plot(c(1:10))
+fig.\nbsp{}[[fig-master-slave-failure]] show that performance in case of principal
+and subordinate node failure is the same. In case of principal node failure a
+backup node stores a copy of the main kernel and uses this copy when it detects
+failure of the principal node. In case of subordinate node failure, the
+principal node redistributes the non-returning kernels between remaining
+subordinate nodes. In both cases the state of the main kernel is not lost and no
+time is spent to restore it, which explains similar performance.
+
+Graph\nbsp{}4 in fig.\nbsp{}[[fig-master-slave-failure]] shows that performance in
+case of a backup node failure is much lower than in other cases. It happens
+because principal node stores only the state of the current step of the
+computation plus some additional fixed amount of data, whereas a backup node not
+only stores the copy of this data, but executes the step in parallel with other
+subordinate nodes. So, when a backup node fails, the principal node executes the
+whole step once again on arbitrarily chosen survived node.
+
+Backup node failure needs some time to be discovered: it is detected only when
+subordinate kernel carrying the copy of the main kernel finishes its execution
+and tries to reach its parent. Instant detection requires abrupt stopping of the
+subordinate kernel which may be inapplicable for programmes with complicated
+logic.
+
+#+name: fig-master-slave-failure
+#+begin_src R :file build/master-slave-failure.pdf
+source(file.path("R", "benchmarks.R"))
+par(family="serif")
+data <- arma.load_master_slave_failure_data()
+arma.plot_master_slave_failure_data(
+    data,
+    list(
+        master="Bscheduler (master failure)",
+        slave="Bscheduler (slave failure)",
+        nofailures="Bscheduler (no failures)"
+    )
+)
+title(xlab="Wavy surface size", ylab="Time, s")
 #+end_src
 
-#+caption: Performance of hydrodynamics HPC application in the presence of node failures.
-#+name: fig-benchmark
-#+RESULTS: fig-benchmark
-[[file:build/benchmark-xxx.pdf]]
+#+caption: Performance of AR model in the presence of node failures.
+#+name: fig-master-slave-failure
+#+RESULTS: fig-master-slave-failure
+[[file:build/master-slave-failure.pdf]]
 
 The results of the benchmark allows to conclude that no matter a principal or a
 subordinate node fails, the overall performance of a parallel programme roughly
 equals to the one without failures with the number of nodes minus one, however,
 when a backup node fails performance penalty is much higher.
 
-#+name: fig-slowdown
-#+begin_src R :file build/slowdown-xxx.pdf
-# TODO
-plot(c(1:10))
-#+end_src
-
-#+caption: Slowdown of the hydrodynamics HPC application in the presence of different types of node failures compared to execution without failures but with the number of nodes minus one.
-#+name: fig-slowdown
-#+RESULTS: fig-slowdown
-[[file:build/slowdown-xxx.pdf]]
-
 **** Discussion of test results.
 Fail over algorithm guarantees to handle one failure per sequential programme
 step, more failures can be tolerated if they do not affect the principal node.
@@ -3635,8 +3624,8 @@ to small data transmission overhead of direct network link.
 #+begin_src R :file build/bscheduler-performance.pdf
 source(file.path("R", "benchmarks.R"))
 par(family="serif")
-data <- arma.load_bscheduler_data()
-arma.plot_bscheduler_data(
+data <- arma.load_bscheduler_performance_data()
+arma.plot_bscheduler_performance_data(
     data,
     list(
         openmp="OpenMP",
diff --git a/setup.org b/setup.org
@@ -1239,7 +1239,7 @@ fi
 cd $dir
 git checkout master
 git pull
-git checkout 82c6f79f6c7bab3d92672edf2cdc6ccec56eee6d
+git checkout e478211f2682faabf32cf7a95cf21fb361ddcd48
 #+end_src
 
 #+RESULTS:
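Note: the refactoring above splits the old arma.load_bscheduler_data into a
generic loader, parameterised by a list of test-case triplets
c(experiment, framework, machine), plus two thin wrappers. A minimal usage
sketch, equivalent to calling the new arma.load_master_slave_failure_data
wrapper; it only illustrates how the pieces fit together and assumes the
benchmark output files read by benchmarks.R are present:

    # Equivalent to arma.load_master_slave_failure_data(): pass the
    # test-case triplets c(experiment, framework, machine) explicitly.
    source(file.path("R", "benchmarks.R"))
    data <- arma.load_bscheduler_data(list(
        c("a10-failure-direct-slave", "bscheduler", "m1"),
        c("a9-single-node-direct", "bscheduler", "m1"),
        c("a10-failure-direct-master", "bscheduler", "m1")
    ))
    # Legend labels are supplied through the names$master, names$slave and
    # names$nofailures fields expected by the new plotting routine.
    arma.plot_master_slave_failure_data(
        data,
        list(
            master="Bscheduler (master failure)",
            slave="Bscheduler (slave failure)",
            nofailures="Bscheduler (no failures)"
        )
    )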