git clone https://git.igankevich.com/waves-16-arma.git
Log | Files | Refs

commit 46ee797152c7747d270d56b32e0b9c35601988f5
parent 65c6d9394388d2ff9da5e4575e029f3ef91ee1c4
Author: Ivan Gankevich <igankevich@ya.ru>
Date:   Mon, 29 May 2017 13:30:35 +0300

Insert the last big section.

arma.org | 331++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
figures/bench-cpu-gpu.pdf | 0
figures/breakdown-cpu-gpu.pdf | 0
preamble.tex | 2++
refs.bib | 97+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 427 insertions(+), 3 deletions(-)

diff --git a/arma.org b/arma.org @@ -1102,7 +1102,7 @@ and plugging the result into eqref:eq-guessed-sol-3d yields formula for \frac{ \Sinh{\smash{2\pi \Kveclen (z+h)}} }{ 2\pi\Kveclen } \frac{ \FourierY{ \zeta_t / \left( i f_1(x,y) + i f_2(x,y) - 1 \right)}{u,v} } { \FourierY{\mathcal{D}_3\left( x,y,\zeta\left(x,y\right) \right)}{u,v} } - }{x,y}, + }{x,y}\label{eq-phi-high-amp}, \end{equation*} where \(\FourierY{\mathcal{D}_3\left(x,y,z\right)}{u,v}=\Sinh{\smash{2\pi\Kveclen{}z}}\). @@ -1226,11 +1226,336 @@ arma.plot_velocity( [[file:build/large-and-small-amplitude-velocity-field-comparison.pdf]] * High-performance software implementation for heterogeneous platforms +:PROPERTIES: +:CUSTOM_ID: sec:arma-algorithms +:END: +** Wave elevation distribution approximation +One of the parameters of ocean wavy surface generator is probability density +function (PDF) of the surface elevation. This distribution is given by either +polynomial approximation of /in situ/ data or analytic formula. + +**** Gram---Charlier series expansion. +In\nbsp{}cite:huang1980experimental the authors experimentally show, that PDF of sea +surface elevation is distinguished from normal distribution by non-nought +kurtosis and skewness. In\nbsp{}cite:рожков1996теория the authors show, that this type +of PDF expands in Gram---Charlier series: +\begin{align} + \label{eq-skew-normal-1} + F(z; \gamma_1, \gamma_2) & = \phi(z) + - \gamma_1 \frac{\phi'''(z)}{3!} + + \gamma_2 \frac{\phi''''(z)}{4!} \nonumber \\ + & = + \frac{1}{2} \text{erf}\left[\frac{z}{\sqrt{2}}\right] + - + \frac{e^{-\frac{z^2}{2}}}{\sqrt{2\pi}} + \left[ + \frac{1}{6} \gamma_1 \left(z^2-1\right) + + \frac{1}{24} \gamma_2 z \left(z^2-3\right) + \right] + ,\nonumber \\ + f(z; \gamma_1, \gamma_2) & = + \frac{e^{-\frac{z^2}{2}}}{\sqrt{2 \pi }} + \left[ + \frac{1}{6} \gamma_1 z \left(z^2-3\right) + + \frac{1}{24} \gamma_2 \left(z^4-6z^2+3\right) + +1 + \right], +\end{align} +where \(\phi(z)=\frac{1}{2}\mathrm{erf}(z/\sqrt{2})\), \(\gamma_1\)\nbsp{}--- skewness, +\(\gamma_2\)\nbsp{}--- kurtosis, \(f\)\nbsp{}--- PDF, \(F\)\nbsp{}--- cumulative distribution function +(CDF). According to\nbsp{}cite:рожков1990вероятностные for ocean waves skewness is +selected from interval \(0.1\leq\gamma_1\leq{0.52}]\) and kurtosis from interval +\(0.1\leq\gamma_2\leq{0.7}\). Family of probability density functions for +different parameters is shown in fig.\nbsp{}[[fig-skew-normal-1]]. + +#+NAME: fig-skew-normal-1 +#+begin_src R :file build/skew-normal-1.pdf +source(file.path("R", "common.R")) +x <- seq(-3, 3, length.out=100) +params <- data.frame( + skewness = c(0.00, 0.52, 0.00, 0.52), + kurtosis = c(0.00, 0.00, 0.70, 0.70), + linetypes = c("solid", "dashed", "dotdash", "dotted") +) +arma.skew_normal_1_plot(x, params) +legend( + "topleft", + mapply( + function (s, k) { + as.expression(bquote(list( + gamma[1] == .(arma.fmt(s, 2)), + gamma[2] == .(arma.fmt(k, 2)) + ))) + }, + params$skewness, + params$kurtosis + ), + lty = paste(params$linetypes) +) +#+end_src + +#+caption: Probability density function eqref:eq-skew-normal-1 of ocean wavy surface elevation for different values of skewness \(\gamma_1\) and kurtosis \(\gamma_2\). +#+label: fig-skew-normal-1 +#+RESULTS: fig-skew-normal-1 +[[file:build/skew-normal-1.pdf]] + +**** Skew-normal distribution. +Alternative approach is to approximate distribution of ocean wavy surface +elevation by skew-normal distribution: +\begin{align} + \label{eq-skew-normal-2} + F(z; \alpha) & = \frac{1}{2} + \mathrm{erfc}\left[-\frac{z}{\sqrt{2}}\right]-2 T(z,\alpha ), \nonumber \\ + f(z; \alpha) & = \frac{e^{-\frac{z^2}{2}}}{\sqrt{2 \pi }} + \mathrm{erfc}\left[-\frac{\alpha z}{\sqrt{2}}\right], +\end{align} +where \(T\)\nbsp{}--- Owen \(T\)-function\nbsp{}cite:owen1956tables. Using this formula it is +impossible to specify skewness and kurtosis separately\nbsp{}--- both values are +adjusted via \(\alpha\) parameter. The only advantage of the formula is its +relative computational simplicity: this function is available in some programmes +and mathematical libraries. Its graph for different values of \(\alpha\) is shown +in fig.\nbsp{}[[fig-skew-normal-2]]. + +#+name: fig-skew-normal-2 +#+begin_src R :file build/skew-normal-2.pdf +source(file.path("R", "common.R")) +x <- seq(-3, 3, length.out=100) +alpha <- c(0.00, 0.87, 2.25, 4.90) +params <- data.frame( + alpha = alpha, + skewness = arma.bits.skewness_2(alpha), + kurtosis = arma.bits.kurtosis_2(alpha), + linetypes = c("solid", "dashed", "dotdash", "dotted") +) +arma.skew_normal_2_plot(x, params) +legend( + "topleft", + mapply( + function (a, s, k) { + as.expression(bquote(list( + alpha == .(arma.fmt(a, 2)), + gamma[1] == .(arma.fmt(s, 2)), + gamma[2] == .(arma.fmt(k, 2)) + ))) + }, + params$alpha, + params$skewness, + params$kurtosis + ), + lty = paste(params$linetypes) +) +#+end_src + +#+caption: Probability density function eqref:eq-skew-normal-2 of ocean wavy surface for different values of skewness coefficient \(\alpha\). +#+label: fig-skew-normal-2 +#+RESULTS: fig-skew-normal-2 +[[file:build/skew-normal-2.pdf]] + +**** Evaluation. +Equation eqref:eq-distribution-transformation with selected wave elevation +distribution may be solved either in every point of generated wavy surface, +which gives the most accurate results, or in every fixed grid point +interpolating result via least-squares (LS) polynomial. In the second case +precision is lower. For example, interpolating 12^th order polynomial on a fixed +grid of 500 points on interval \(-5\sigma_z\leq{z}\leq{5}\sigma_z\) gives error of +\(\approx{0.43}\cdot10^{-3}\). Increasing polynomial order leads to either numeric +overflows during LS interpolation, or more coefficient close to nought; +increasing the size of the grid has insignificant effect on the result. In the +majority of cases three Gram---Charlier series coefficients is enough to +transform ACF; relative error without interpolation is \(10^{-5}\). + ** White noise generation +In order to eliminate periodicity from generated wavy surface, it is imperative +to use PRNG with sufficiently large period to generate white noise. Parallel +Mersenne Twister\nbsp{}cite:matsumoto1998mersenne with a period of \(2^{19937}-1\) is +used as a generator in this work. It allows to produce aperiodic ocean wavy +surface realisations in any practical usage scenarios. + +There is no guarantee that multiple Mersenne Twisters executed in parallel +threads with distinct initial states produce uncorrelated pseudo-random number +sequences, however, algorithm of dynamic creation of Mersenne Twisters\nbsp{}cite:matsumoto1998dynamic may be used to provide such guarantee. The essence of +the algorithm is to find matrices of initial generator states, that give +maximally uncorrelated pseudo-random number sequences when Mersenne Twisters are +executed in parallel with these initial states. Since finding such initial +states consumes considerable amount of processor time, vector of initial states +is created preliminary with knowingly larger number of parallel threads and +saved to a file, which is then read before starting white noise generation. + ** Wavy surface generation -** Velocity potential field computation +In ARMA model value of wavy surface elevation at a particular point depends on +previous in space and time points, as a result the so called /ramp-up interval/ +(see fig.\nbsp{}[[fig-ramp-up-interval]]), in which realisation does not correspond to +specified ACF, forms in the beginning of the realisation. There are several +solutions to this problem which depend on the simulation context. + +If realisation is used in the context of ship stability simulation without +manoeuvring, ramp-up interval will not affect results of the simulation, because +it is located on the border (too far away from the studied marine object). If +ship stability with manoeuvring is studied, then the interval may be simply +discarded from the realisation (the size of the interval approximately equals +the number of AR coefficients in each dimension). However, this may lead to loss +of a very large number of points, because discarding occurs for each dimension. +Alternative approach is to generate ocean wavy surface on ramp-up interval with +LH model and generate the rest of the realisation with ARMA model. + +Algorithm of wavy surface generation is data-parallel: realisation is divided +into equal parts each of which is generated independently, however, in the +beginning of each realisation there is ramp-up interval. To eliminate it +/overlap-add/ method\nbsp{}cite:oppenheim1989discrete,svoboda2011efficient,pavel2013algorithms (a popular +method in signal processing) is used. The essence of the method is to add +another interval, size of which is equal to the ramp-up interval size, to the +end of each part. Then wavy surface is generated in each point of each part +(including points from the added interval), the interval at the end of part \(N\) +is superimposed on the ramp-up interval at the beginning of the part \(N+1\), and +values in corresponding points are added. + +#+name: fig-ramp-up-interval +#+begin_src R :file build/ramp-up-interval.pdf +source(file.path("R", "common.R")) +arma.plot_ramp_up_interval() +#+end_src + +#+caption: Ramp-up interval at the beginning of the \(OX\) axis of the realisation. +#+label: fig-ramp-up-interval +#+RESULTS: fig-ramp-up-interval +[[file:build/ramp-up-interval.pdf]] + +** Velocity potential computation +:PROPERTIES: +:CUSTOM_ID: sec:compute-delta +:END: + +In solutions eqref:eq-solution-2d and eqref:eq-solution-2d-full to +two-dimensional pressure determination problem there are functions +\(\Fun{z}=\InverseFourierY{e^{2\pi{u}{z}}}{x}\) and +\(\FunSecond{z}=\InverseFourierY{\Sinh{2\pi{u}{z}}}{x}\) which has multiple +analytic representations and are difficult to compute. Each function is a +Fourier transform of linear combination of exponents which reduces to poorly +defined Dirac delta function of a complex argument (see +table\nbsp{}[[tab-delta-functions]]). The usual way of handling this type of +functions is to write them as multiplication of Dirac delta functions of real +and imaginary part, however, this approach does not work here, because applying +inverse Fourier transform to this representation does not produce exponent, +which severely warp resulting velocity field. In order to get unique analytic +definition normalisation factor \(1/\Sinh{2\pi{u}{h}}\) (which is also included +in formula for \(E(u)\)) may be used. Despite the fact that normalisation allows +to obtain adequate velocity potential field, numerical experiments show that +there is little difference between this field and the one produced by formulae +from linear wave theory, in which terms with \(\zeta\) are omitted. As a result, +the formula for three-dimensional case was not derived. + +#+name: tab-delta-functions +#+caption: Formulae for computing \(\Fun{z}\) and \(\FunSecond{z}\) from [[#sec:pressure-2d]], that use normalisation to eliminate uncertainty from definition of Dirac delta function of complex argument. +#+attr_latex: :booktabs t +| Function | Without normalisation | Normalised | +|-------------------+--------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------| +| \(\Fun{z}\) | \(\delta (x+i z)\) | \(\frac{1}{2 h}\mathrm{sech}\left(\frac{\pi (x-i (h+z))}{2 h}\right)\) | +| \(\FunSecond{z}\) | \(\frac{1}{2}\left[\delta (x-i z) + \delta (x+i z) \right]\) | \(\frac{1}{4 h}\left[\text{sech}\left(\frac{\pi (x-i (h+z))}{2 h}\right)+\text{sech}\left(\frac{\pi (x+i(h+z))}{2 h}\right)\right]\) | + ** Evaluation -** Discussion +ARMA model does not require highly optimised software implementation to be +efficient, its performance is high even without use of co-processors; there are +two main causes of that. First, ARMA model itself does not use transcendental +functions (sines, cosines and exponents) as opposed to LH model. All +calculations (except model coefficients) are done via polynomials, which can be +efficiently computed on modern processors using a series of FMA instructions. +Second, pressure computation is done via explicit analytic formula using nested +FFTs. Since two-dimensional FFT of the same size is repeatedly applied to every +time slice, its coefficients (complex exponents) are pre-computed for all +slices, and computations are performed with only a few transcendental functions. +In case of MA model, performance is also increased by doing convolution with FFT +transforms. So, high performance of ARMA model is due to scarce use of +transcendental functions and heavy use of FFT, not to mention that high +convergence rate and non-existence of periodicity allows to use far fewer +coefficients compared to LH model. + +ARMA implementation uses several libraries of reusable mathematical functions +and numerical algorithms (listed in table\nbsp{}[[tab-arma-libs]]), and was +implemented using several parallel programming technologies (OpenMP, +OpenCL) to find the most efficient one. + +#+name: tab-arma-libs +#+caption: A list of mathematical libraries used in ARMA model implementation. +#+attr_latex: :booktabs t +| Library | What it is used for | +|--------------------------------------------------------------+---------------------------------| +| DCMT\nbsp{}cite:matsumoto1998dynamic | parallel PRNG | +| Blitz\nbsp{}cite:veldhuizen1997will,veldhuizen2000techniques | multidimensional arrays | +| GSL\nbsp{}cite:galassi2015gnu | PDF, CDF, FFT computation | +| | checking process stationarity | +| LAPACK, GotoBLAS\nbsp{}cite:goto2008high,goto2008anatomy | finding AR coefficients | +| GL, GLUT\nbsp{}cite:kilgard1996opengl | three-dimensional visualisation | +| CGAL\nbsp{}cite:fabri2009cgal | wave numbers triangulation | + +For the purpose of evaluation we use simplified version of\nbsp{}eqref:eq-phi-high-amp: +\begin{align} + \label{eq-phi-linear} + \phi(x,y,z,t) &= \InverseFourierY{ + \frac{ \Sinh{\smash{2\pi \Kveclen (z+h)}} } + { 2\pi\Kveclen \Sinh{\smash{2\pi \Kveclen h}} } + \FourierY{ \zeta_t }{u,v} + }{x,y}\nonumber \\ + &= \InverseFourierY{ g_1(u,v) \FourierY{ g_2(x,y) }{u,v} }{x,y}. +\end{align} +Since standing sea wave generator does not allow efficient GPU implementation +due to autoregressive dependencies between wavy surface points, only velocity +potential solver was rewritten in OpenCL and its performance was compared to +existing OpenMP implementation. + +For each implementation the overall performance of the solver for a particular +time instant was measured. Velocity field was computed for one $t$ point, for +128 $z$ points below wavy surface and for each $x$ and $y$ point of +four-dimensional $(t,x,y,z)$ grid. The only parameter that was varied between +subsequent programme runs is the size of the grid along $x$ dimension. A total +of 10 runs were performed and average time of each stage was computed. + +A different FFT library was used for each version of the solver. For OpenMP +version FFT routines from GNU Scientific Library (GSL)\nbsp{}cite:galassi2015gnu +were used, and for OpenCL version clFFT library\nbsp{}cite:clfft was used instead. +There are two major differences in the routines from these libraries. + +- The order of frequencies in Fourier transforms is different and clFFT library + requires reordering the result of\nbsp{}eqref:eq-phi-linear whereas GSL does not. +- Discontinuity at $(x,y) = (0,0)$ of velocity potential field grid is handled + automatically by clFFT library, whereas GSL library produce skewed values at + this point. + +For GSL library an additional interpolation from neighbouring points was used to +smooth velocity potential field at these points. We have not spotted other +differences in FFT implementations that have impact on the overall performance. + +In the course of the numerical experiments we have measured how much time each +solver's implementation spends in each computation stage to explain find out how +efficient data copying between host and device is in OpenCL implementation, and +how one implementation corresponds to the other in terms of performance. + +** Results +The experiments showed that GPU implementation outperforms CPU implementation by +a factor of 10--15 (fig.~\ref{fig:bench-cpu-gpu}), however, distribution of time +between computation stages is different for each implementation +(fig.~\ref{fig:breakdown-cpu-gpu}). The major time consumer in CPU +implementation is computation of $g_1$, whereas in GPU implementation its +running time is comparable to computation of $g_2$. GPU computes $g_1$ much +faster than CPU due to a large amount of modules for transcendental mathematical +function computation. In both implementations $g_2$ is computed on CPU, but for +GPU implementation the result is duplicated for each $z$ grid point in order to +perform multiplication of all $XYZ$ planes along $z$ dimension in single OpenCL +kernel, and, subsequently copied to GPU memory which severely hinders overall +stage performance. Copying the resulting velocity potential field between CPU +and GPU consumes $\approx{}20\%$ of velocity potential solver execution time. + +\begin{figure} + \centering + \includegraphics{bench-cpu-gpu} + \caption{Performance comparison of CPU (OpenMP) and GPU (OpenCL) versions of velocity potential solver.\label{fig:bench-cpu-gpu}} +\end{figure} + +\begin{figure} + \centering + \includegraphics{breakdown-cpu-gpu} + \caption{Performance breakdown for GPU (OpenCL) and CPU (OpenMP) versions of velocity potential solver.\label{fig:breakdown-cpu-gpu}} +\end{figure} + * Conclusion Research results allow to conclude that a problem of determining pressures under sea surface can be solved analytically without assumptions of linear and diff --git a/figures/bench-cpu-gpu.pdf b/figures/bench-cpu-gpu.pdf Binary files differ. diff --git a/figures/breakdown-cpu-gpu.pdf b/figures/breakdown-cpu-gpu.pdf Binary files differ. diff --git a/preamble.tex b/preamble.tex @@ -2,6 +2,8 @@ \usepackage{cite} \usepackage{latexsym} % \Box macro \usepackage{mathtools} % fancy dots in matrices +\usepackage{graphicx} +\graphicspath{{figures/}} % custom mathematical expressions \newcommand{\Var}[1]{\sigma_{#1}^2} diff --git a/refs.bib b/refs.bib @@ -191,4 +191,101 @@ pages={416--423}, booktitle={Proc. of II int. conf. on shipbuilding (ISC'98)}, year={1998} +} + +@book{galassi2015gnu, + author = {Galassi, M and Davies, J and Theiler, J and Gough, B and Jungman, G + and Alken, P and Booth, M and Rossi, F and Ulerich, R}, + title = {GNU Scientific Library Reference Manual}, + year = {2009}, + isbn = {0954612078, 9780954612078}, + edition = {3}, + publisher = {Network Theory Ltd.}, + note = {Eds. Brian Gough} +} + +@misc{clfft, + author = {{clFFT developers}}, + title = {{clFFT: OpenCL Fast Fourier Transforms (FFTs)}}, + howpublished = {\url{https://clmathlibraries.github.io/clFFT/}} +} + +@Article{ goto2008anatomy, + title = {Anatomy of high-performance matrix multiplication}, + author = {Goto, Kazushige and Geijn, Robert A}, + journal = {ACM Transactions on Mathematical Software (TOMS)}, + volume = {34}, + number = {3}, + pages = {12}, + year = {2008}, + publisher = {ACM} +} + +@Article{ goto2008high, + title = {High-performance implementation of the level-3 BLAS}, + author = {Goto, Kazushige and Van De Geijn, Robert}, + journal = {ACM Transactions on Mathematical Software (TOMS)}, + volume = {35}, + number = {1}, + pages = {4}, + year = {2008}, + publisher = {ACM} +} + +@InProceedings{ fabri2009cgal, + title = {CGAL: The computational geometry algorithms library}, + author = {Fabri, Andreas and Pion, Sylvain}, + booktitle = {Proceedings of the 17th ACM SIGSPATIAL international + conference on advances in geographic information systems}, + pages = {538--539}, + year = {2009}, + organization = {ACM} +} + +@article{kilgard1996opengl, + title = {The OpenGL Utility Toolkit (GLUT) Programming Interface}, + author = {Kilgard, Mark J}, + year = {1996}, + publisher = {Citeseer} +} + +@article{veldhuizen2000techniques, + title = {Techniques for scientific C++}, + author = {Veldhuizen, Todd}, + journal = {Computer science technical report}, + volume = {542}, + pages = {60}, + year = {2000}, + publisher = {Citeseer} +} + +@inproceedings{veldhuizen1997will, + title = {Will C++ be faster than Fortran?}, + author = {Veldhuizen, Todd L and Jernigan, M Ed}, + booktitle = {International Conference on Computing in Object-Oriented Parallel Environments}, + pages = {49--56}, + year = {1997}, + organization = {Springer} +} + +@Article{ matsumoto1998dynamic, + title = {Dynamic creation of pseudorandom number generators}, + author = {Matsumoto, Makoto and Nishimura, Takuji}, + journal = {Monte Carlo and Quasi-Monte Carlo Methods}, + volume = {2000}, + pages = {56--69}, + year = {1998} +} + +@Article{ matsumoto1998mersenne, + title = {Mersenne twister: a 623-dimensionally equidistributed + uniform pseudo-random number generator}, + author = {Matsumoto, Makoto and Nishimura, Takuji}, + journal = {ACM Transactions on Modeling and Computer Simulation + (TOMACS)}, + volume = {8}, + number = {1}, + pages = {3--30}, + year = {1998}, + publisher = {ACM} } \ No newline at end of file