Insert the last big section.

commit 46ee797152c7747d270d56b32e0b9c35601988f5
parent 65c6d9394388d2ff9da5e4575e029f3ef91ee1c4
Author: Ivan Gankevich <igankevich@ya.ru>
Date:   Mon, 29 May 2017 13:30:35 +0300

Insert the last big section.

Diffstat:
arma.org  | 331 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
figures/bench-cpu-gpu.pdf  | 0 
figures/breakdown-cpu-gpu.pdf  | 0 
preamble.tex  | 2 ++
refs.bib  | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

5 files changed, 427 insertions(+), 3 deletions(-)
diff --git a/arma.org b/arma.org
@@ -1102,7 +1102,7 @@ and plugging the result into eqref:eq-guessed-sol-3d yields formula for
         \frac{ \Sinh{\smash{2\pi \Kveclen (z+h)}} }{ 2\pi\Kveclen }
         \frac{ \FourierY{ \zeta_t / \left( i f_1(x,y) + i f_2(x,y) - 1 \right)}{u,v} }
         { \FourierY{\mathcal{D}_3\left( x,y,\zeta\left(x,y\right) \right)}{u,v} }
-    }{x,y},
+    }{x,y}\label{eq-phi-high-amp},
 \end{equation*}
 where \(\FourierY{\mathcal{D}_3\left(x,y,z\right)}{u,v}=\Sinh{\smash{2\pi\Kveclen{}z}}\).
 
@@ -1226,11 +1226,336 @@ arma.plot_velocity(
 [[file:build/large-and-small-amplitude-velocity-field-comparison.pdf]]
 
 * High-performance software implementation for heterogeneous platforms
+:PROPERTIES:
+:CUSTOM_ID: sec:arma-algorithms
+:END:
+** Wave elevation distribution approximation
+One of the parameters of ocean wavy surface generator is probability density
+function (PDF) of the surface elevation. This distribution is given by either
+polynomial approximation of /in situ/ data or analytic formula.
+
+**** Gram---Charlier series expansion.
+In\nbsp{}cite:huang1980experimental the authors experimentally show, that PDF of sea
+surface elevation is distinguished from normal distribution by non-nought
+kurtosis and skewness. In\nbsp{}cite:рожков1996теория the authors show, that this type
+of PDF expands in Gram---Charlier series:
+\begin{align}
+    \label{eq-skew-normal-1}
+    F(z; \gamma_1, \gamma_2) & = \phi(z)
+        - \gamma_1 \frac{\phi'''(z)}{3!}
+        + \gamma_2 \frac{\phi''''(z)}{4!} \nonumber \\
+    & =
+    \frac{1}{2} \text{erf}\left[\frac{z}{\sqrt{2}}\right]
+    -
+    \frac{e^{-\frac{z^2}{2}}}{\sqrt{2\pi}}
+    \left[
+        \frac{1}{6} \gamma_1 \left(z^2-1\right)
+        + \frac{1}{24} \gamma_2 z \left(z^2-3\right)
+    \right]
+    ,\nonumber \\
+    f(z; \gamma_1, \gamma_2) & =
+    \frac{e^{-\frac{z^2}{2}}}{\sqrt{2 \pi }}
+    \left[
+        \frac{1}{6} \gamma_1 z \left(z^2-3\right)
+        + \frac{1}{24} \gamma_2 \left(z^4-6z^2+3\right)
+        +1
+    \right],
+\end{align}
+where \(\phi(z)=\frac{1}{2}\mathrm{erf}(z/\sqrt{2})\), \(\gamma_1\)\nbsp{}--- skewness,
+\(\gamma_2\)\nbsp{}--- kurtosis, \(f\)\nbsp{}--- PDF, \(F\)\nbsp{}--- cumulative distribution function
+(CDF). According to\nbsp{}cite:рожков1990вероятностные for ocean waves skewness is
+selected from interval \(0.1\leq\gamma_1\leq{0.52}]\) and kurtosis from interval
+\(0.1\leq\gamma_2\leq{0.7}\). Family of probability density functions for
+different parameters is shown in fig.\nbsp{}[[fig-skew-normal-1]].
+
+#+NAME: fig-skew-normal-1
+#+begin_src R :file build/skew-normal-1.pdf
+source(file.path("R", "common.R"))
+x <- seq(-3, 3, length.out=100)
+params <- data.frame(
+  skewness = c(0.00, 0.52, 0.00, 0.52),
+  kurtosis = c(0.00, 0.00, 0.70, 0.70),
+  linetypes = c("solid", "dashed", "dotdash", "dotted")
+)
+arma.skew_normal_1_plot(x, params)
+legend(
+  "topleft",
+  mapply(
+    function (s, k) {
+      as.expression(bquote(list(
+        gamma[1] == .(arma.fmt(s, 2)),
+        gamma[2] == .(arma.fmt(k, 2))
+      )))
+    },
+    params$skewness,
+    params$kurtosis
+  ),
+  lty = paste(params$linetypes)
+)
+#+end_src
+
+#+caption: Probability density function eqref:eq-skew-normal-1 of ocean wavy surface elevation for different values of skewness \(\gamma_1\) and kurtosis \(\gamma_2\).
+#+label: fig-skew-normal-1
+#+RESULTS: fig-skew-normal-1
+[[file:build/skew-normal-1.pdf]]
+
+**** Skew-normal distribution.
+Alternative approach is to approximate distribution of ocean wavy surface
+elevation by skew-normal distribution:
+\begin{align}
+    \label{eq-skew-normal-2}
+    F(z; \alpha) & = \frac{1}{2}
+   \mathrm{erfc}\left[-\frac{z}{\sqrt{2}}\right]-2 T(z,\alpha ), \nonumber \\
+    f(z; \alpha) & = \frac{e^{-\frac{z^2}{2}}}{\sqrt{2 \pi }}
+   \mathrm{erfc}\left[-\frac{\alpha z}{\sqrt{2}}\right],
+\end{align}
+where \(T\)\nbsp{}--- Owen \(T\)-function\nbsp{}cite:owen1956tables. Using this formula it is
+impossible to specify skewness and kurtosis separately\nbsp{}--- both values are
+adjusted via \(\alpha\) parameter. The only advantage of the formula is its
+relative computational simplicity: this function is available in some programmes
+and mathematical libraries. Its graph for different values of \(\alpha\) is shown
+in fig.\nbsp{}[[fig-skew-normal-2]].
+
+#+name: fig-skew-normal-2
+#+begin_src R :file build/skew-normal-2.pdf
+source(file.path("R", "common.R"))
+x <- seq(-3, 3, length.out=100)
+alpha <- c(0.00, 0.87, 2.25, 4.90)
+params <- data.frame(
+  alpha = alpha,
+  skewness = arma.bits.skewness_2(alpha),
+  kurtosis = arma.bits.kurtosis_2(alpha),
+  linetypes = c("solid", "dashed", "dotdash", "dotted")
+)
+arma.skew_normal_2_plot(x, params)
+legend(
+  "topleft",
+  mapply(
+    function (a, s, k) {
+      as.expression(bquote(list(
+        alpha == .(arma.fmt(a, 2)),
+        gamma[1] == .(arma.fmt(s, 2)),
+        gamma[2] == .(arma.fmt(k, 2))
+      )))
+    },
+    params$alpha,
+    params$skewness,
+    params$kurtosis
+  ),
+  lty = paste(params$linetypes)
+)
+#+end_src
+
+#+caption: Probability density function eqref:eq-skew-normal-2 of ocean wavy surface for different values of skewness coefficient \(\alpha\).
+#+label: fig-skew-normal-2
+#+RESULTS: fig-skew-normal-2
+[[file:build/skew-normal-2.pdf]]
+
+**** Evaluation.
+Equation eqref:eq-distribution-transformation with selected wave elevation
+distribution may be solved either in every point of generated wavy surface,
+which gives the most accurate results, or in every fixed grid point
+interpolating result via least-squares (LS) polynomial. In the second case
+precision is lower. For example, interpolating 12^th order polynomial on a fixed
+grid of 500 points on interval \(-5\sigma_z\leq{z}\leq{5}\sigma_z\) gives error of
+\(\approx{0.43}\cdot10^{-3}\). Increasing polynomial order leads to either numeric
+overflows during LS interpolation, or more coefficient close to nought;
+increasing the size of the grid has insignificant effect on the result. In the
+majority of cases three Gram---Charlier series coefficients is enough to
+transform ACF; relative error without interpolation is \(10^{-5}\).
+
 ** White noise generation
+In order to eliminate periodicity from generated wavy surface, it is imperative
+to use PRNG with sufficiently large period to generate white noise. Parallel
+Mersenne Twister\nbsp{}cite:matsumoto1998mersenne with a period of \(2^{19937}-1\) is
+used as a generator in this work. It allows to produce aperiodic ocean wavy
+surface realisations in any practical usage scenarios.
+
+There is no guarantee that multiple Mersenne Twisters executed in parallel
+threads with distinct initial states produce uncorrelated pseudo-random number
+sequences, however, algorithm of dynamic creation of Mersenne Twisters\nbsp{}cite:matsumoto1998dynamic may be used to provide such guarantee. The essence of
+the algorithm is to find matrices of initial generator states, that give
+maximally uncorrelated pseudo-random number sequences when Mersenne Twisters are
+executed in parallel with these initial states. Since finding such initial
+states consumes considerable amount of processor time, vector of initial states
+is created preliminary with knowingly larger number of parallel threads and
+saved to a file, which is then read before starting white noise generation.
+
 ** Wavy surface generation
-** Velocity potential field computation
+In ARMA model value of wavy surface elevation at a particular point depends on
+previous in space and time points, as a result the so called /ramp-up interval/
+(see fig.\nbsp{}[[fig-ramp-up-interval]]), in which realisation does not correspond to
+specified ACF, forms in the beginning of the realisation. There are several
+solutions to this problem which depend on the simulation context.
+
+If realisation is used in the context of ship stability simulation without
+manoeuvring, ramp-up interval will not affect results of the simulation, because
+it is located on the border (too far away from the studied marine object). If
+ship stability with manoeuvring is studied, then the interval may be simply
+discarded from the realisation (the size of the interval approximately equals
+the number of AR coefficients in each dimension). However, this may lead to loss
+of a very large number of points, because discarding occurs for each dimension.
+Alternative approach is to generate ocean wavy surface on ramp-up interval with
+LH model and generate the rest of the realisation with ARMA model.
+
+Algorithm of wavy surface generation is data-parallel: realisation is divided
+into equal parts each of which is generated independently, however, in the
+beginning of each realisation there is ramp-up interval. To eliminate it
+/overlap-add/ method\nbsp{}cite:oppenheim1989discrete,svoboda2011efficient,pavel2013algorithms (a popular
+method in signal processing) is used. The essence of the method is to add
+another interval, size of which is equal to the ramp-up interval size, to the
+end of each part. Then wavy surface is generated in each point of each part
+(including points from the added interval), the interval at the end of part \(N\)
+is superimposed on the ramp-up interval at the beginning of the part \(N+1\), and
+values in corresponding points are added.
+
+#+name: fig-ramp-up-interval
+#+begin_src R :file build/ramp-up-interval.pdf
+source(file.path("R", "common.R"))
+arma.plot_ramp_up_interval()
+#+end_src
+
+#+caption: Ramp-up interval at the beginning of the \(OX\) axis of the realisation.
+#+label: fig-ramp-up-interval
+#+RESULTS: fig-ramp-up-interval
+[[file:build/ramp-up-interval.pdf]]
+
+** Velocity potential computation
+:PROPERTIES:
+:CUSTOM_ID: sec:compute-delta
+:END:
+
+In solutions eqref:eq-solution-2d and eqref:eq-solution-2d-full to
+two-dimensional pressure determination problem there are functions
+\(\Fun{z}=\InverseFourierY{e^{2\pi{u}{z}}}{x}\) and
+\(\FunSecond{z}=\InverseFourierY{\Sinh{2\pi{u}{z}}}{x}\) which has multiple
+analytic representations and are difficult to compute. Each function is a
+Fourier transform of linear combination of exponents which reduces to poorly
+defined Dirac delta function of a complex argument (see
+table\nbsp{}[[tab-delta-functions]]). The usual way of handling this type of
+functions is to write them as multiplication of Dirac delta functions of real
+and imaginary part, however, this approach does not work here, because applying
+inverse Fourier transform to this representation does not produce exponent,
+which severely warp resulting velocity field. In order to get unique analytic
+definition normalisation factor \(1/\Sinh{2\pi{u}{h}}\) (which is also included
+in formula for \(E(u)\)) may be used. Despite the fact that normalisation allows
+to obtain adequate velocity potential field, numerical experiments show that
+there is little difference between this field and the one produced by formulae
+from linear wave theory, in which terms with \(\zeta\) are omitted. As a result,
+the formula for three-dimensional case was not derived.
+
+#+name: tab-delta-functions
+#+caption: Formulae for computing \(\Fun{z}\) and \(\FunSecond{z}\) from [[#sec:pressure-2d]], that use normalisation to eliminate uncertainty from definition of Dirac delta function of complex argument.
+#+attr_latex: :booktabs t
+| Function          | Without normalisation                                        | Normalised                                                                                                                             |
+|-------------------+--------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------|
+| \(\Fun{z}\)       | \(\delta (x+i z)\)                                           | \(\frac{1}{2 h}\mathrm{sech}\left(\frac{\pi  (x-i (h+z))}{2 h}\right)\)                                                                |
+| \(\FunSecond{z}\) | \(\frac{1}{2}\left[\delta (x-i z) + \delta (x+i z) \right]\) | \(\frac{1}{4 h}\left[\text{sech}\left(\frac{\pi  (x-i (h+z))}{2 h}\right)+\text{sech}\left(\frac{\pi  (x+i(h+z))}{2 h}\right)\right]\) |
+
 ** Evaluation
-** Discussion
+ARMA model does not require highly optimised software implementation to be
+efficient, its performance is high even without use of co-processors; there are
+two main causes of that. First, ARMA model itself does not use transcendental
+functions (sines, cosines and exponents) as opposed to LH model. All
+calculations (except model coefficients) are done via polynomials, which can be
+efficiently computed on modern processors using a series of FMA instructions.
+Second, pressure computation is done via explicit analytic formula using nested
+FFTs. Since two-dimensional FFT of the same size is repeatedly applied to every
+time slice, its coefficients (complex exponents) are pre-computed for all
+slices, and computations are performed with only a few transcendental functions.
+In case of MA model, performance is also increased by doing convolution with FFT
+transforms. So, high performance of ARMA model is due to scarce use of
+transcendental functions and heavy use of FFT, not to mention that high
+convergence rate and non-existence of periodicity allows to use far fewer
+coefficients compared to LH model.
+
+ARMA implementation uses several libraries of reusable mathematical functions
+and numerical algorithms (listed in table\nbsp{}[[tab-arma-libs]]), and was
+implemented using several parallel programming technologies (OpenMP,
+OpenCL) to find the most efficient one.
+
+#+name: tab-arma-libs
+#+caption: A list of mathematical libraries used in ARMA model implementation.
+#+attr_latex: :booktabs t
+| Library                                                      | What it is used for             |
+|--------------------------------------------------------------+---------------------------------|
+| DCMT\nbsp{}cite:matsumoto1998dynamic                         | parallel PRNG                   |
+| Blitz\nbsp{}cite:veldhuizen1997will,veldhuizen2000techniques | multidimensional arrays         |
+| GSL\nbsp{}cite:galassi2015gnu                                | PDF, CDF, FFT computation       |
+|                                                              | checking process stationarity   |
+| LAPACK, GotoBLAS\nbsp{}cite:goto2008high,goto2008anatomy     | finding AR coefficients         |
+| GL, GLUT\nbsp{}cite:kilgard1996opengl                        | three-dimensional visualisation |
+| CGAL\nbsp{}cite:fabri2009cgal                                | wave numbers triangulation      |
+
+For the purpose of evaluation we use simplified version of\nbsp{}eqref:eq-phi-high-amp:
+\begin{align}
+    \label{eq-phi-linear}
+    \phi(x,y,z,t) &= \InverseFourierY{
+        \frac{ \Sinh{\smash{2\pi \Kveclen (z+h)}} }
+        { 2\pi\Kveclen \Sinh{\smash{2\pi \Kveclen h}} }
+        \FourierY{ \zeta_t }{u,v}
+    }{x,y}\nonumber \\
+    &= \InverseFourierY{ g_1(u,v) \FourierY{ g_2(x,y) }{u,v} }{x,y}.
+\end{align}
+Since standing sea wave generator does not allow efficient GPU implementation
+due to autoregressive dependencies between wavy surface points, only velocity
+potential solver was rewritten in OpenCL and its performance was compared to
+existing OpenMP implementation.
+
+For each implementation the overall performance of the solver for a particular
+time instant was measured. Velocity field was computed for one $t$ point, for
+128 $z$ points below wavy surface and for each $x$ and $y$ point of
+four-dimensional $(t,x,y,z)$ grid. The only parameter that was varied between
+subsequent programme runs is the size of the grid along $x$ dimension. A total
+of 10 runs were performed and average time of each stage was computed.
+
+A different FFT library was used for each version of the solver. For OpenMP
+version FFT routines from GNU Scientific Library (GSL)\nbsp{}cite:galassi2015gnu
+were used, and for OpenCL version clFFT library\nbsp{}cite:clfft was used instead.
+There are two major differences in the routines from these libraries.
+
+- The order of frequencies in Fourier transforms is different and clFFT library
+  requires reordering the result of\nbsp{}eqref:eq-phi-linear whereas GSL does not.
+- Discontinuity at $(x,y) = (0,0)$ of velocity potential field grid is handled
+  automatically by clFFT library, whereas GSL library produce skewed values at
+  this point.
+
+For GSL library an additional interpolation from neighbouring points was used to
+smooth velocity potential field at these points. We have not spotted other
+differences in FFT implementations that have impact on the overall performance.
+
+In the course of the numerical experiments we have measured how much time each
+solver's implementation spends in each computation stage to explain find out how
+efficient data copying between host and device is in OpenCL implementation, and
+how one implementation corresponds to the other in terms of performance.
+
+** Results
+The experiments showed that GPU implementation outperforms CPU implementation by
+a factor of 10--15 (fig.~\ref{fig:bench-cpu-gpu}), however, distribution of time
+between computation stages is different for each implementation
+(fig.~\ref{fig:breakdown-cpu-gpu}). The major time consumer in CPU
+implementation is computation of $g_1$, whereas in GPU implementation its
+running time is comparable to computation of $g_2$. GPU computes $g_1$ much
+faster than CPU due to a large amount of modules for transcendental mathematical
+function computation. In both implementations $g_2$ is computed on CPU, but for
+GPU implementation the result is duplicated for each $z$ grid point in order to
+perform multiplication of all $XYZ$ planes along $z$ dimension in single OpenCL
+kernel, and, subsequently copied to GPU memory which severely hinders overall
+stage performance. Copying the resulting velocity potential field between CPU
+and GPU consumes $\approx{}20\%$ of velocity potential solver execution time.
+
+\begin{figure}
+	\centering
+	\includegraphics{bench-cpu-gpu}
+	\caption{Performance comparison of CPU (OpenMP) and GPU (OpenCL) versions of velocity potential solver.\label{fig:bench-cpu-gpu}}
+\end{figure}
+
+\begin{figure}
+	\centering
+	\includegraphics{breakdown-cpu-gpu}
+	\caption{Performance breakdown for GPU (OpenCL) and CPU (OpenMP) versions of velocity potential solver.\label{fig:breakdown-cpu-gpu}}
+\end{figure}
+
 * Conclusion
 Research results allow to conclude that a problem of determining pressures under
 sea surface can be solved analytically without assumptions of linear and
diff --git a/figures/bench-cpu-gpu.pdf b/figures/bench-cpu-gpu.pdf
Binary files differ.
diff --git a/figures/breakdown-cpu-gpu.pdf b/figures/breakdown-cpu-gpu.pdf
Binary files differ.
diff --git a/preamble.tex b/preamble.tex
@@ -2,6 +2,8 @@
 \usepackage{cite}
 \usepackage{latexsym} % \Box macro
 \usepackage{mathtools} % fancy dots in matrices
+\usepackage{graphicx}
+\graphicspath{{figures/}}
 
 % custom mathematical expressions
 \newcommand{\Var}[1]{\sigma_{#1}^2}
diff --git a/refs.bib b/refs.bib
@@ -191,4 +191,101 @@
   pages={416--423},
   booktitle={Proc. of II int. conf. on shipbuilding (ISC'98)},
   year={1998}
+}
+
+@book{galassi2015gnu,
+ author = {Galassi, M and Davies, J and Theiler, J and Gough, B and Jungman, G
+                  and Alken, P and Booth, M and Rossi, F and Ulerich, R},
+ title = {GNU Scientific Library Reference Manual},
+ year = {2009},
+ isbn = {0954612078, 9780954612078},
+ edition = {3},
+ publisher = {Network Theory Ltd.},
+ note = {Eds. Brian Gough}
+}
+
+@misc{clfft,
+  author = {{clFFT developers}},
+  title = {{clFFT: OpenCL Fast Fourier Transforms (FFTs)}},
+  howpublished = {\url{https://clmathlibraries.github.io/clFFT/}}
+}
+
+@Article{	  goto2008anatomy,
+  title		= {Anatomy of high-performance matrix multiplication},
+  author	= {Goto, Kazushige and Geijn, Robert A},
+  journal	= {ACM Transactions on Mathematical Software (TOMS)},
+  volume	= {34},
+  number	= {3},
+  pages		= {12},
+  year		= {2008},
+  publisher	= {ACM}
+}
+
+@Article{	  goto2008high,
+  title		= {High-performance implementation of the level-3 BLAS},
+  author	= {Goto, Kazushige and Van De Geijn, Robert},
+  journal	= {ACM Transactions on Mathematical Software (TOMS)},
+  volume	= {35},
+  number	= {1},
+  pages		= {4},
+  year		= {2008},
+  publisher	= {ACM}
+}
+
+@InProceedings{	  fabri2009cgal,
+  title		= {CGAL: The computational geometry algorithms library},
+  author	= {Fabri, Andreas and Pion, Sylvain},
+  booktitle	= {Proceedings of the 17th ACM SIGSPATIAL international
+		  conference on advances in geographic information systems},
+  pages		= {538--539},
+  year		= {2009},
+  organization	= {ACM}
+}
+
+@article{kilgard1996opengl,
+  title = {The OpenGL Utility Toolkit (GLUT) Programming Interface},
+  author = {Kilgard, Mark J},
+  year = {1996},
+  publisher = {Citeseer}
+}
+
+@article{veldhuizen2000techniques,
+  title = {Techniques for scientific C++},
+  author = {Veldhuizen, Todd},
+  journal = {Computer science technical report},
+  volume = {542},
+  pages = {60},
+  year = {2000},
+  publisher = {Citeseer}
+}
+
+@inproceedings{veldhuizen1997will,
+  title = {Will C++ be faster than Fortran?},
+  author = {Veldhuizen, Todd L and Jernigan, M Ed},
+  booktitle = {International Conference on Computing in Object-Oriented Parallel Environments},
+  pages = {49--56},
+  year = {1997},
+  organization = {Springer}
+}
+
+@Article{	  matsumoto1998dynamic,
+  title		= {Dynamic creation of pseudorandom number generators},
+  author	= {Matsumoto, Makoto and Nishimura, Takuji},
+  journal	= {Monte Carlo and Quasi-Monte Carlo Methods},
+  volume	= {2000},
+  pages		= {56--69},
+  year		= {1998}
+}
+
+@Article{	  matsumoto1998mersenne,
+  title		= {Mersenne twister: a 623-dimensionally equidistributed
+		  uniform pseudo-random number generator},
+  author	= {Matsumoto, Makoto and Nishimura, Takuji},
+  journal	= {ACM Transactions on Modeling and Computer Simulation
+		  (TOMACS)},
+  volume	= {8},
+  number	= {1},
+  pages		= {3--30},
+  year		= {1998},
+  publisher	= {ACM}
 } 
\ No newline at end of file

	waves-16-arma
	git clone https://git.igankevich.com/waves-16-arma.git
	Log \| Files \| Refs

arma.org	\|	331	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
figures/bench-cpu-gpu.pdf	\|	0
figures/breakdown-cpu-gpu.pdf	\|	0
preamble.tex	\|	2	++
refs.bib	\|	97	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

waves-16-arma