Add discovery benchmark data.

commit f6efccda05e9516ba22d526f602eada09fb10001
parent 8fc48b5610949d86bfd9dea7d29f9aba180c1cd4
Author: Ivan Gankevich <igankevich@ya.ru>
Date:   Sat, 30 Sep 2017 13:22:40 +0300

Add discovery benchmark data.

Diffstat:
R/discovery.R  | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
arma-thesis.org  | 15 +++++++++++----
setup.org  | 18 ++++++++++++++++++

3 files changed, 133 insertions(+), 4 deletions(-)
diff --git a/R/discovery.R b/R/discovery.R
@@ -0,0 +1,104 @@
+# Load the node discovery benchmark logs and summarise the discovery time.
+#
+# Recursively scans build/bscheduler-benchmarks/output/m12 for daemon logs
+# whose relative path starts with d<daemons>/n<nodes>/a<attempt>/. For every
+# log the discovery time is the offset (in ms, taken from the 'time since
+# epoch' records) of the last event before termination relative to the first
+# event, minus the artificial timeout multiplied by nodes*daemons.
+#
+# Side effects: prints the per-attempt table to stdout; writes a diagnostic
+# to stderr for every log that is skipped (unexpected path, no usable
+# timestamps) or that has an unexpected number of 'add subordinate' records
+# (together with an 'rm -rf' command line for its directory).
+#
+# Returns a data frame with one row per (nodes, daemons) pair and the columns
+# nodes, daemons, t_avg, t_min, t_max. Stops with an error when no usable
+# log is found.
+bscheduler.load_node_discovery_data <- function () {
+	dir <- file.path('build', 'bscheduler-benchmarks', 'output', 'm12')
+	# 'pattern' is a regular expression, hence the escaped dots
+	all_files <- list.files(
+		dir,
+		pattern='bsc\\.10\\.1\\.0\\.1\\.log',
+		recursive=TRUE
+	)
+	rows <- list()
+	for (file in all_files) {
+		f <- file.path(dir, file)
+		daemons <- as.numeric(gsub('^d([0-9]+)/.*$', '\\1', file, perl=TRUE))
+		nodes <- as.numeric(gsub('^d[0-9]+/n([0-9]+)/.*$', '\\1', file, perl=TRUE))
+		attempt <- as.numeric(gsub('^d[0-9]+/n[0-9]+/a([0-9]+)/.*$', '\\1', file, perl=TRUE))
+		if (is.na(daemons) || is.na(nodes)) {
+			# the comparisons below would fail on NA with an obscure error
+			write(paste('# Unexpected log path, skipped:', f), stderr())
+			next
+		}
+		lines <- readLines(f)
+		nsubordinates <- sum(grepl('add subordinate', lines))
+		stamps <- lines[grepl('time since epoch', lines)]
+		stamps <- gsub('^.*time since epoch ([0-9]+)ms.*$', '\\1', stamps, perl=TRUE)
+		stamps <- as.numeric(stamps)
+		stamps <- stamps[!is.na(stamps)]
+		if (length(stamps) == 0) {
+			# without this guard the attempt gets t=NA, which turns the
+			# average/minimum/maximum of the whole group into NA
+			write(paste('# No timestamps, skipped:', f), stderr())
+			next
+		}
+		stamps <- stamps - min(stamps)
+		# find termination time point: the first event that is followed by
+		# a pause longer than 4000 ms; with one daemon per node all events
+		# are kept
+		last <- length(stamps)
+		pauses <- which(diff(stamps) > 4000)
+		if (length(pauses) > 0 && daemons != 1) {
+			last <- pauses[[1]]
+		}
+		if (daemons == 1) {
+			timeout <- 190
+		} else {
+			timeout <- 100
+		}
+		rows[[length(rows) + 1]] <- data.frame(
+			daemons=daemons,
+			nodes=nodes,
+			attempt=attempt,
+			# remove all events after termination and
+			# subtract artificial timeout
+			t=max(stamps[seq_len(last)]) - (nodes*daemons)*timeout
+		)
+		if (nsubordinates != nodes*daemons-1) {
+			write(paste('# Bad no. of subordinates:', f, nsubordinates), stderr())
+			write(paste('rm -rf', dirname(f)), stderr())
+		}
+	}
+	if (length(rows) == 0) {
+		stop(paste('No usable discovery benchmark logs found in', dir), call.=FALSE)
+	}
+	# bind once instead of growing the data frame cell by cell
+	all_data <- do.call(rbind, rows)
+	write('All data:', stdout())
+	print(all_data[order(all_data$daemons, all_data$nodes, all_data$attempt), ])
+	# every aggregate() call uses the same grouping, so the rows of the
+	# three results are in the same order
+	groups <- list(nodes=all_data$nodes, daemons=all_data$daemons)
+	result <- aggregate(all_data$t, by=groups, FUN=mean)
+	result$t_avg <- result$x
+	result$x <- NULL
+	result$t_min <- aggregate(all_data$t, by=groups, FUN=min)$x
+	result$t_max <- aggregate(all_data$t, by=groups, FUN=max)$x
+	result
+}
+
+# Plot the discovery time against the number of daemon processes.
+#
+# Loads the summary with bscheduler.load_node_discovery_data() and draws a
+# 2x2 grid of panels, one for each of 1, 8, 16 and 32 processes per node:
+# points show the average time, dashed lines the minimum and the maximum.
+#
+# xlabel, ylabel -- axis titles of every panel.
+# toplabel       -- panel title prefix; ': ' and the number of processes
+#                   per node are appended to it.
+#
+# The panel layout of the graphics device is restored on exit.
+bscheduler.plot_discovery <- function (xlabel='No. of daemon processes',
+                                       ylabel='Time, ms',
+                                       toplabel='processes per node') {
+	result <- bscheduler.load_node_discovery_data()
+	old_par <- par(mfrow=c(2,2))
+	on.exit(par(old_par), add=TRUE)
+	# the y range is computed from all groups, so every panel uses the
+	# same scale
+	ylim <- range(0, result$t_min, result$t_max)
+	for (n in c(1, 8, 16, 32)) {
+		res <- result[result$daemons==n,]
+		if (nrow(res) == 0) {
+			# plot.window() needs finite limits, so a group without data
+			# is reported and skipped instead of aborting the whole plot
+			write(paste('# No discovery data for', n, 'processes per node'), stderr())
+			next
+		}
+		x <- res$nodes*n
+		plot.new()
+		plot.window(xlim=range(x), ylim=ylim)
+		points(x, res$t_avg, pch=19)
+		lines(x, res$t_min, lty='dashed')
+		lines(x, res$t_max, lty='dashed')
+		# one tick per node count, expressed as the total number of processes
+		xlabels <- seq_len(max(res$nodes))*n
+		axis(1, at=xlabels, labels=xlabels)
+		axis(2)
+		title(
+			paste(toplabel, n, sep=': '),
+			xlab=xlabel,
+			ylab=ylabel
+		)
+		box()
+	}
+}
diff --git a/arma-thesis.org b/arma-thesis.org
@@ -2993,12 +2993,19 @@ finds its principal on the first try.
 
 The benchmark was run varying the number of daemons per cluster node. The
 experiment showed that mutual discovery of up to 400 nodes takes no more
-than 2 seconds (fig.\nbsp{}[[fig-bootstrap-local]]). This value does not change
+than 2 seconds (fig.\nbsp{}[[fig-discovery-benchmark]]). This value does not change
 significantly with the increase in number of physical nodes.
 
-#+name: fig-bootstrap-local
-#+caption: Time to discover all daemon processes running on the cluster depending on the number of daemon processes.
-[[file:graphics/discovery.eps]]
+#+name: fig-discovery-benchmark
+#+begin_src R :file build/discovery-benchmark.pdf
+source(file.path("R", "discovery.R"))
+bscheduler.plot_discovery()
+#+end_src
+
+#+caption: Time to discover all daemon processes running on the cluster depending on the number of daemon processes. Dashed lines represent minimum and maximum values.
+#+name: fig-discovery-benchmark
+#+RESULTS: fig-discovery-benchmark
+[[file:build/discovery-benchmark.pdf]]
 
 **** Discussion.
 Node discovery scales to a large number of nodes, because in order to determine
diff --git a/setup.org b/setup.org
@@ -1245,3 +1245,21 @@ git checkout 1ed6679387f0b79d8495c8bf55a6b0b304347e48
 #+RESULTS:
 : Ваша ветка обновлена в соответствии с «origin/master».
 : Already up-to-date.
+** Download bscheduler-benchmarks data from repository
+#+begin_src sh :exports none :results verbatim
+# Fetch the benchmark data repository into build/ and check out a fixed
+# revision; any failing command aborts the script.
+set -e
+repo=https://github.com/igankevich/bscheduler-benchmarks
+dir=build/bscheduler-benchmarks
+mkdir -p "$dir"
+# clone only when there is no working copy yet; later runs reuse it
+[ -d "$dir/.git" ] || git clone "$repo" "$dir"
+cd "$dir"
+# update master first, then pin the working copy to the fixed revision
+git checkout master
+git pull
+git checkout d4eaeab314524a7ad0b2bf244ffc19dc7b41bd59
+#+end_src
+
+#+RESULTS:
+: Ваша ветка обновлена в соответствии с «origin/master».
+: Уже обновлено.

	arma-thesis
	git clone https://git.igankevich.com/arma-thesis.git
	Log \| Files \| Refs \| LICENSE

R/discovery.R	\|	104	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
arma-thesis.org	\|	15	+++++++++++----
setup.org	\|	18	++++++++++++++++++