commit c45575d6a662eba2c8064ee0148ea1836ccd83ad
parent fee2baa5c4f6535bb03a738cc2baa1ff9527fe04
Author: Ivan Gankevich <igankevich@ya.ru>
Date: Tue, 15 Aug 2017 18:06:20 +0300
Add graph and table from arma-realtime benchmarks.
Diffstat:
.Rprofile       |  2 ++
R/benchmarks.R  | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
arma-thesis.org | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
3 files changed, 146 insertions(+), 18 deletions(-)
diff --git a/.Rprofile b/.Rprofile
@@ -1,3 +1,5 @@
# Replace pdf with cairo_pdf to be able to print Russian characters on graphs.
require("grDevices")
pdf <- cairo_pdf
+library(ascii)
+options(asciiType="org")
diff --git a/R/benchmarks.R b/R/benchmarks.R
@@ -25,7 +25,6 @@ arma.load_benchmark_data <- function(attempt, framework, models, tags) {
}
arma.print_openmp_vs_opencl <- function(model_names, row_names) {
- library(ascii)
options(asciiType="org")
models <- c("ar", "ma", "lh");
frameworks <- c("openmp", "opencl")
@@ -82,7 +81,6 @@ arma.load_io_benchmark_data <- function(attempt, filesystems, suffix, tags) {
}
arma.print_sync_vs_async_io <- function(suffix_names, row_names, top_names) {
- library(ascii)
options(asciiType="org")
tags <- list("generate_surface", "write_all")
filesystems <- c("xfs", "nfs", "gfs")
@@ -172,3 +170,83 @@ arma.plot_io_events <- function (fsnames) {
xpd=TRUE
)
}
+
+# Load real-time benchmark timings from attempt "a6" of the arma-benchmarks
+# runs on node "storm".  For every framework (OpenMP/OpenCL), surface size
+# (2^7 .. 2^14) and profiling tag, the per-run microsecond figures are
+# averaged and converted to seconds.  Returns a long-format data frame with
+# columns: framework, size (stored as character), t (mean time, seconds),
+# routine (the tag name).
+arma.load_realtime_data <- function () {
+  tags <- c(
+    "harts_g1",
+    "harts_g2",
+    "harts_fft",
+    "harts_copy_to_host"
+  )
+  sizes <- 2^c(7:14)
+  frameworks <- c("openmp", "opencl")
+  attempt <- "a6"
+  data <- data.frame()
+  row <- 1
+  for (framework in frameworks) {
+    for (m in sizes) {
+      for (t in tags) {
+        # arma.load (defined earlier in this file, outside this hunk)
+        # extracts the microsecond values captured by the regexp from the
+        # benchmark log for tag `t`.
+        all_data <- arma.load(
+          file.path("build", "arma-benchmarks", "output", "storm", attempt, m, framework),
+          t,
+          ".*\\s+([0-9]+)us.*"
+        )
+        data[row,"framework"] <- framework
+        data[row,"size"] <- as.character(m)
+        # microseconds -> seconds
+        data[row,"t"] <- mean(all_data/1000/1000)
+        data[row,"routine"] <- t
+        row <- row + 1
+      }
+    }
+  }
+  data
+}
+
+# Collapse the long-format benchmark table produced by
+# arma.load_realtime_data to the total solver time per surface size for one
+# framework: the per-routine mean times are summed within each size.
+# Returns a data frame with columns: size, t.
+arma.aggregate_by_size <- function (data, framework) {
+  fwdata <- data[data["framework"] == framework,]
+  # Total time per size = sum of the per-routine means.
+  fwdata <- aggregate(
+    fwdata$t,
+    by=list(size=fwdata$size),
+    FUN=sum
+  )
+  # aggregate() names the aggregated column "x"; restore a meaningful name.
+  fwdata <- setNames(fwdata, c("size", "t"))
+  fwdata
+}
+
+# Plot total solver time against surface size: OpenMP as a solid line,
+# OpenCL as a dashed one, each labelled near its second-to-last point.
+# Extra arguments are captured but currently unused (the caller in
+# arma-thesis.org passes a `params` list that is ignored here).
+# NOTE(review): axis limits are taken from the OpenMP series only; the
+# OpenCL curve is assumed to stay inside that range -- confirm.
+arma.plot_realtime_data <- function (data, ...) {
+  args <- list(...)  # captured for future use; not read below
+  openmp <- arma.aggregate_by_size(data, "openmp")
+  opencl <- arma.aggregate_by_size(data, "opencl")
+  openmp_len <- length(openmp$t)
+  opencl_len <- length(opencl$t)
+  plot.new()
+  plot.window(xlim=range(openmp$size), ylim=range(openmp$t))
+  lines(openmp$size, openmp$t, lty="solid", type="b")
+  lines(opencl$size, opencl$t, lty="dashed", type="b")
+  # Tick marks at the benchmarked surface sizes.
+  axis(1, at=2^c(7:14))
+  axis(2)
+  box()
+  # Place each label to the right of the next-to-last data point.
+  text(openmp$size[[openmp_len-1]], openmp$t[[openmp_len-1]], "OpenMP", pos=4, offset=1)
+  text(opencl$size[[opencl_len-1]], opencl$t[[opencl_len-1]], "OpenCL", pos=4, offset=1)
+}
+
+# Select the rows of one framework at one surface size, keeping only the
+# routine name and its mean time.  Row names are reset to 1..n so that
+# merge(..., by="row.names") in the table printer aligns rows by position.
+arma.filter_by_framework_and_size <- function (data, size, framework) {
+  data <- data[data["framework"]==framework & data["size"] == size, ]
+  data <- data[c("routine", "t")]
+  # NOTE(review): 1:length(...) misbehaves for an empty selection
+  # (seq_len would be safer); assumes at least one row matched.
+  rownames(data) <- c(1:length(data$t))
+  data
+}
+
+# Print an org-mode table (via the ascii package, configured in .Rprofile)
+# comparing OpenMP and OpenCL running time of each solver subroutine for
+# the largest benchmarked surface size (2^14).
+#   routine_names -- list mapping internal tag -> printable routine name;
+#   column_names  -- character vector used as the table header.
+arma.print_table_for_realtime_data <- function (data, routine_names, column_names) {
+  # NOTE(review): par() is a graphics setting and has no effect on the
+  # ascii table below -- looks like a leftover from a plotting function.
+  par(family="serif")
+  openmp <- arma.filter_by_framework_and_size(data, 2^14, "openmp")
+  opencl <- arma.filter_by_framework_and_size(data, 2^14, "opencl")
+  # Both frames were renumbered 1..n identically, so merging on row names
+  # pairs the same routine from both frameworks.
+  all_data <- merge(openmp, opencl, by="row.names")
+  all_data <- all_data[c("routine.x", "t.x", "t.y")]
+  all_data <- setNames(all_data, c("routine", "openmp", "opencl"))
+  # remove non-existent data copying
+  all_data[all_data$routine=="harts_copy_to_host", "openmp"] <- NA
+  # Replace internal tag names with their printable equivalents.
+  all_data$routine <- sapply(all_data$routine, function (c) get(c, routine_names))
+  all_data <- setNames(all_data, column_names)
+  ascii(all_data, include.rownames=FALSE, digits=4)
+}
diff --git a/arma-thesis.org b/arma-thesis.org
@@ -3745,7 +3745,7 @@ arma.plot_io_events(fsnames)
**** Parallel velocity potential field computation.
The benchmarks for AR, MA and LH models showed that velocity potential field
computation consume only a fraction of total programme execution time, however,
-the absolute computation time over a large $XY$ domain may still be high. One
+the absolute computation time over a large \(XY\) domain may still be high. One
application where faster computation is needed is real-time simulation and
visualisation of wavy surface. The purpose of real-time visualisation is
two-fold:
@@ -3786,10 +3786,10 @@ Velocity potential solver was rewritten in OpenCL and its performance was
compared to an existing OpenMP implementation.
For each implementation the overall performance of the solver for a particular
-time instant was measured. Velocity field was computed for one $t$ point, for
-128 $z$ points below wavy surface and for each $x$ and $y$ point of
-four-dimensional $(t,x,y,z)$ grid. The only parameter that was varied between
-subsequent programme runs is the size of the grid along $x$ dimension.
+time instant was measured. Velocity field was computed for one \(t\) point, for
+128 \(z\) points below wavy surface and for each \(x\) and \(y\) point of
+four-dimensional \((t,x,y,z)\) grid. The only parameter that was varied between
+subsequent programme runs is the size of the grid along \(x\) dimension.
A different FFT library was used for each version of the solver: GNU Scientific
Library (GSL)\nbsp{}cite:galassi2015gnu for OpenMP and clFFT
@@ -3798,7 +3798,7 @@ routines from these libraries.
- The order of frequencies in Fourier transforms is different and clFFT library
requires reordering the result of\nbsp{}eqref:eq:phi-linear whereas GSL does
not.
-- Discontinuity at $(x,y)=(0,0)$ of velocity potential field grid is handled
+- Discontinuity at \((x,y)=(0,0)\) of velocity potential field grid is handled
automatically by clFFT library, whereas GSL library produce skewed values at
this point.
For GSL library an additional interpolation from neighbouring points was used to
@@ -3811,19 +3811,42 @@ efficient data copying between host and device is in OpenCL implementation, and
how one implementation corresponds to the other in terms of performance.
**** Performance of velocity potential OpenCL solver.
+:PROPERTIES:
+:header-args:R: :results output raw :exports results
+:END:
+
The experiments showed that OpenCL outperforms OpenMP implementation by a factor
-of 10--15 (fig.\nbsp{}), however, distribution of time between computation
-stages is different for each implementation (fig.\nbsp{}). The major time
-consumer on CPU is $g_1$, whereas in GPU its running time is comparable to
-$g_2$. Copying the resulting velocity potential field between CPU and GPU
-consumes $\approx{}20\%$ of solver execution time. \(g_2\) consumes the most of
-the execution time for OpenCL solver, and \(g_1\) for OpenMP solver. In both
-implementations \(g_2\) is computed on CPU, but for GPU implementation the
-result is duplicated for each \(z\) grid point in order to perform
-multiplication of all \(XYZ\) planes along \(z\) dimension in single OpenCL
-kernel, and, subsequently copied to GPU memory which severely hinders the
+of 10--15 (fig.\nbsp{}[[fig-arma-realtime-graph]]), however, distribution of time
+between computation stages is different for each implementation (fig.\nbsp{}).
+The major time consumer on CPU is \(g_1\), whereas in GPU its running time is
+comparable to \(g_2\). Copying the resulting velocity potential field between
+CPU and GPU consumes \(\approx{}20\%\) of solver execution time. \(g_2\)
+consumes most of the execution time for OpenCL solver, and \(g_1\) for
+OpenMP solver. In both implementations \(g_2\) is computed on CPU, but for GPU
+implementation the result is duplicated for each \(z\) grid point in order to
+perform multiplication of all \(XYZ\) planes along \(z\) dimension in single
+OpenCL kernel, and, subsequently copied to GPU memory which severely hinders the
overall performance.
+#+name: fig-arma-realtime-graph
+#+header: :results output graphics
+#+begin_src R :file build/realtime-performance.pdf
+source(file.path("R", "benchmarks.R"))
+par(family="serif")
+data <- arma.load_realtime_data()
+# Plot parameters.  Use `=` (not `<-`) inside list(): `titles <- c(...)`
+# would assign `titles` in the calling environment and leave the list
+# element UNNAMED, so params$titles would be NULL.
+params <- list(
+  titles = c("OpenMP", "OpenCL"),
+  linetypes = c("solid", "dashed")
+)
+arma.plot_realtime_data(data, params)
+title(xlab="Wavy surface size", ylab="Time, s")
+#+end_src
+
+#+name: fig-arma-realtime-graph
+#+caption: Performance comparison of CPU (OpenMP) and GPU (OpenCL) versions of velocity potential solver.
+#+RESULTS: fig-arma-realtime-graph
+[[file:build/realtime-performance.pdf]]
+
The reason for different distribution of time between computation stages is the
same as for different AR model performance on CPU and GPU: GPU has more floating
point units and modules for transcendental mathematical functions, which are
@@ -3836,6 +3859,31 @@ due to unavailability of such library it was not done in this work.
Additionally, such library may allow to efficiently compute the non-simplified
formula entirely on GPU, since omitted terms also contain derivatives.
+#+name: fig-arma-realtime-table
+#+begin_src R
+source(file.path("R", "benchmarks.R"))
+routine_names <- list(
+ harts_g1="\\(g_1\\)",
+ harts_g2="\\(g_2\\)",
+ harts_fft="FFT",
+ harts_copy_to_host="Copy data from GPU"
+)
+column_names <- c("Subroutine", "OpenMP time, s", "OpenCL time, s")
+data <- arma.load_realtime_data()
+arma.print_table_for_realtime_data(data, routine_names, column_names)
+#+end_src
+
+#+name: fig-arma-realtime-table
+#+caption: Running time of real-time velocity potential solver subroutines.
+#+attr_latex: :booktabs t
+#+RESULTS: fig-arma-realtime-table
+| Subroutine | OpenMP time, s | OpenCL time, s |
+|--------------------+----------------+----------------|
+| \(g_1\) | 4.6730 | 0.0038 |
+| \(g_2\) | 0.0002 | 0.8253 |
+| FFT | 2.8560 | 0.3585 |
+| Copy data from GPU | | 2.6357 |
+
As expected, sharing the same buffer between OpenCL and OpenGL contexts
increases overall solver performance by eliminating data transfer between CPU
and GPU memory, but also requires for the data to be in vertex buffer object