commit 4f85ba1eba0fad7aae2d5618b8bfa560735a3c12
parent c4bd3a73cb5cec7f77996e018435632373e381ec
Author: Ivan Gankevich <igankevich@ya.ru>
Date: Thu, 28 Mar 2019 12:11:22 +0300
Our approach.
Diffstat:
3 files changed, 134 insertions(+), 17 deletions(-)
diff --git a/gnuplot/histogram.gnuplot b/gnuplot/histogram.gnuplot
@@ -20,19 +20,22 @@ plot \
using 'waves':xtic(1) ls 1, \
'' using 'velocity' ls 2, \
'' using 'wetted' ls 3, \
- '' using 'pressure' ls 4, \
- '' using 'exchange' ls 5, \
+ '' using 'clamp' ls 4, \
+ '' using 'pressure' ls 5, \
+ '' using 'exchange' ls 6, \
newhistogram "GPUlab", \
'build/gpulab1.histogram.dat' \
using 'waves':xtic(1) ls 1 notitle, \
'' using 'velocity' ls 2 notitle, \
'' using 'wetted' ls 3 notitle, \
- '' using 'pressure' ls 4 notitle, \
- '' using 'exchange' ls 5 notitle, \
+ '' using 'clamp' ls 4 notitle, \
+ '' using 'pressure' ls 5 notitle, \
+ '' using 'exchange' ls 6 notitle, \
newhistogram "Capybara", \
'build/capybara1.histogram.dat' \
using 'waves':xtic(1) ls 1 notitle, \
'' using 'velocity' ls 2 notitle, \
'' using 'wetted' ls 3 notitle, \
- '' using 'pressure' ls 4 notitle, \
- '' using 'exchange' ls 5 notitle
+ '' using 'clamp' ls 4 notitle, \
+ '' using 'pressure' ls 5 notitle, \
+ '' using 'exchange' ls 6 notitle
diff --git a/main.tex b/main.tex
@@ -37,8 +37,7 @@
\begin{document}
-\title{Virtual testbed%
-\thanks{Supported by Saint Petersburg State University (grant no.~26520170)}}
+\title{Virtual testbed: Ship motion simulation for~personal workstations}
\author{%
Alexander Degtyarev \and
Vasily Khramushin \and
@@ -71,7 +70,7 @@ in and out of accelerator's main memory has major impact on performance when
done in a loop, and the best performance is achieved when copying in and out is
done outside the loop (when data copying inside the loop involves accelerator's
main memory only). This result comes in line with how distributed computations
-are performed on a set of cluster nodes, which suggests using similar
+are performed on a set of cluster nodes, and suggests using similar
approaches for single heterogeneous node with a graphical accelerator.
\keywords{%
@@ -79,6 +78,7 @@ wavy surface
\and pressure field
\and pressure force
\and ship
+\and wetted surface
\and OpenCL
\and GPGPU.
}
@@ -469,8 +469,8 @@ graphical accelerator for visualisation.
\subsection{Benchmark results}
The main result of the benchmarks is that Virtual testbed is capable of running
-on a regular workstation with a graphical accelerator in real-time with high
-frame rate and small simulation time steps.
+on a regular workstation with or without a graphical accelerator in real time
+with a high frame rate and small simulation time steps.
\begin{itemize}
@@ -496,8 +496,8 @@ frame rate and small simulation time steps.
has comparable but negative difference.
\item Usage of graphical accelerator increases time needed to synchronise
- simulation step with the visualisation frame ("exchange" in
- fig.~\ref{fig:histogram}).
+ simulation step with the visualisation frame (\textit{exchange} stage
+ in fig.~\ref{fig:histogram}).
\end{itemize}
@@ -554,13 +554,92 @@ simulation; however, it gives performance reserve for further increase in
detail and scale of simulated physical phenomena. We manually limit simulation
time step to a minimum of \(1/30\) of the second to prevent floating-point
numerical errors due to small time steps. Also, we limit maximum time step to
-maintain Nyquist frequency for precise computation of time partial derivatives.
+keep the sampling frequency greater than or equal to the Nyquist frequency
+for precise computation of partial time derivatives.
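+As a minimal illustration, with \(f_{\max}\) denoting the highest temporal
+frequency resolved by the simulation (a symbol introduced here only for this
+sketch), the two limits amount to clamping the step as
+\[
+\Delta t \leftarrow
+\min\left(\max\left(\Delta t,\; 1/30\right),\; 1/(2 f_{\max})\right),
+\]
+so that the sampling frequency \(1/\Delta t\) never drops below the Nyquist
+rate \(2 f_{\max}\).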
+
+Real-time simulation is essential not only for educational purposes, but also
+for on-board intelligent systems. These systems analyse data coming from a
+multitude of sensors the ship is equipped with, calculate the probability of
+a particular dangerous situation (e.g.~a large roll angle) and try to prevent
+it by notifying the ship's crew and an operator ashore. This is one of the
+topics of future work.
+
+Despite the fact that Capybara has the highest floating-point performance
+across all workstations in the benchmarks, Virtual testbed runs faster on
+its processor than on its graphical accelerator. Routine-by-routine
+investigation showed that the accelerator is simply slower even at the fully
+parallel Stokes wave generator kernel. This kernel fills a three-dimensional
+array using an explicit formula for the wave profile; it has a linear memory
+access pattern and no information dependencies between array elements. It
+seems that the P5000 is not optimised for general-purpose computations. We
+did not conduct visualisation benchmarks, so we do not know whether it is
+more efficient in that case.
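+
+The following C++/OpenMP sketch (not the actual Virtual testbed code; the
+array layout, names and the first-order cosine wave profile are chosen here
+only for illustration) shows the shape of such a kernel: every element is
+computed independently from an explicit formula and memory is written with
+unit stride.
+\begin{verbatim}
+#include <cmath>
+#include <vector>
+
+// Fill an nt*ny*nx array with a first-order (cosine) wave profile.
+// Every element depends only on its own indices, so the loop nest is
+// fully parallel and memory is written sequentially.
+void generate_waves(std::vector<float>& zeta, int nt, int ny, int nx,
+                    float a, float kx, float ky, float omega,
+                    float dx, float dy, float dt) {
+    #pragma omp parallel for collapse(2)
+    for (int t = 0; t < nt; ++t)
+        for (int j = 0; j < ny; ++j)
+            for (int i = 0; i < nx; ++i)
+                zeta[(t*ny + j)*nx + i] =
+                    a*std::cos(kx*i*dx + ky*j*dy - omega*t*dt);
+}
+\end{verbatim}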
+
+Although Capybara's processor has 20 hardware threads (2 threads per core),
+OpenMP performance does not scale beyond 10 threads. Parallel threads in our
+code do mostly the same operations but with different data, so switching
+between hardware threads running on the same core, in the hope that the
+second thread performs useful work while the first one stalls on an
+input/output or load/store operation, is not efficient. This problem is
+usually solved by turning the main loop into a pipeline in which each stage
+is executed in parallel and data constantly flows between subsequent stages.
+This approach is easy to implement when the computational grid can be divided
+into distinct parts, which is not the case for Virtual testbed: there are too
+many dependencies between the parts, and the position and size of each part
+can differ from stage to stage. OpenCL does not have these limitations, and a
+pipeline would probably not improve graphical accelerator performance, so we
+did not take this approach.
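+
+The following self-contained sketch (with a hypothetical \texttt{Chunk} type
+and trivial stage bodies; Virtual testbed does not use this scheme)
+illustrates the idea: one thread produces chunks of work while another
+consumes them through a shared queue, so the two stages overlap in time.
+\begin{verbatim}
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+#include <thread>
+
+// Hypothetical unit of work; a real pipeline would carry grid data.
+struct Chunk { int id; };
+
+int main() {
+    std::queue<Chunk> channel;   // connects the two stages
+    std::mutex m;
+    std::condition_variable cv;
+    bool done = false;
+    std::thread stage1([&] {     // e.g. generate the wavy surface
+        for (int i = 0; i < 100; ++i) {
+            std::lock_guard<std::mutex> lock(m);
+            channel.push(Chunk{i});
+            cv.notify_one();
+        }
+        std::lock_guard<std::mutex> lock(m);
+        done = true;
+        cv.notify_one();
+    });
+    std::thread stage2([&] {     // e.g. compute the pressure field
+        for (;;) {
+            std::unique_lock<std::mutex> lock(m);
+            cv.wait(lock, [&] { return !channel.empty() || done; });
+            if (channel.empty()) break;
+            Chunk c = channel.front();
+            channel.pop();
+            (void)c;             // process the chunk here
+        }
+    });
+    stage1.join();
+    stage2.join();
+}
+\end{verbatim}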
+
+Our approach to performing computations on a heterogeneous node (a node with
+both a processor and a graphical accelerator) is similar to the approach
+followed by the authors of the Spark distributed data processing
+framework~\cite{zaharia2016spark}. In this framework data is first loaded
+into the main memory of each cluster node and then processed in a loop. Each
+iteration of this loop is run by all nodes in parallel, and synchronisation
+occurs at the end of each iteration. This is in contrast to the MapReduce
+framework~\cite{dean2008mapreduce}, where after each iteration the data is
+written to stable storage and then read back into the main memory to continue
+processing. Not interacting with slow stable storage on every iteration
+allows Spark to achieve an order of magnitude higher performance than Hadoop
+(the open-source implementation of MapReduce) on iterative algorithms.
+
+On a heterogeneous node the analogue of stable storage is graphical
+accelerator memory: reads from and writes to it are much slower than accesses
+to the main memory. To minimise interaction with this memory, we do not read
+intermediate results of our computations from it, but reuse arrays that
+already reside there. (As a concrete example, we do not copy the pressure
+field from the graphical accelerator, only the forces for each panel.) This
+allows us to eliminate expensive data transfer between CPU and GPU memory. In
+early versions of our programme this copying significantly slowed down the
+simulation.
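+
+A minimal host-side sketch of this pattern, written against the OpenCL C++
+bindings, is shown below; the kernel names, buffer sizes and placeholder
+physics are hypothetical and merely stand in for the pressure and force
+stages. Both buffers live in accelerator memory for the whole loop, and only
+the small force array is read back on every iteration.
+\begin{verbatim}
+#include <CL/cl.hpp>
+#include <vector>
+
+int main() {
+    cl::Context context(CL_DEVICE_TYPE_GPU);
+    auto devices = context.getInfo<CL_CONTEXT_DEVICES>();
+    cl::CommandQueue queue(context, devices.front());
+    const char* src = R"(
+        kernel void compute_pressure(global float* pressure) {
+            pressure[get_global_id(0)] = 1.0f; /* placeholder */
+        }
+        kernel void compute_forces(global const float* pressure,
+                                   global float* forces) {
+            forces[get_global_id(0)] = pressure[get_global_id(0)];
+        }
+    )";
+    cl::Program program(context, src, /*build=*/true);
+    const size_t npoints = 1 << 20, npanels = 1 << 10;
+    // Both arrays stay in accelerator memory between iterations.
+    cl::Buffer pressure(context, CL_MEM_READ_WRITE, npoints*sizeof(float));
+    cl::Buffer forces(context, CL_MEM_READ_WRITE, npanels*sizeof(float));
+    cl::Kernel k1(program, "compute_pressure");
+    cl::Kernel k2(program, "compute_forces");
+    k1.setArg(0, pressure);
+    k2.setArg(0, pressure);
+    k2.setArg(1, forces);
+    std::vector<float> host_forces(npanels);
+    for (int step = 0; step < 100; ++step) {
+        queue.enqueueNDRangeKernel(k1, cl::NullRange, cl::NDRange(npoints));
+        queue.enqueueNDRangeKernel(k2, cl::NullRange, cl::NDRange(npanels));
+        // Only the small force array crosses the CPU-GPU boundary;
+        // the pressure field never leaves the accelerator.
+        queue.enqueueReadBuffer(forces, CL_TRUE, 0,
+                                npanels*sizeof(float), host_forces.data());
+    }
+}
+\end{verbatim}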
+
+Although a heterogeneous node is not a cluster, the approach to programming
+it is similar to that of distributed data processing systems: we process data
+only on the device whose main memory contains it, and we never transfer
+intermediate computation results between devices. To employ this approach,
+the whole iteration of the programme's main loop has to be executed either on
+the processor or on the graphical accelerator. Given the time constraints,
+the future maintenance burden and the programme's code size, it was difficult
+to follow this approach fully, but we came to a reasonable approximation of
+it. We still have functions in Virtual testbed that work with intermediate
+results on the processor (the \textit{clamp} stage in
+fig.~\ref{fig:histogram}, which reduces the size of the computational grid to
+the points near the ship), but the amount of data that is copied to and from
+the graphical accelerator is relatively small.
\section{Conclusion}
-Wind simulation, rudder and propeller, compartment flooding and fire,
-trochoidal waves.
+We showed that ship motion simulation can be performed on a regular
+workstation with or without a graphical accelerator. Our programme includes
+only the minimal set of mathematical models needed to compute ship motions,
+but has a performance reserve for the inclusion of additional models. We plan
+to implement simulation of wind, rudder and propeller, compartment flooding
+and fire, and trochoidal waves. Apart from that, the main direction of future
+research is the creation of an on-board intelligent system that would include
+Virtual testbed as an integral part for simulating and predicting physical
+phenomena.
+
+\section*{Acknowledgements}
+
+This research was supported by Saint Petersburg State University
+(grant no.~26520170).
\bibliographystyle{splncs04}
diff --git a/references.bib b/references.bib
@@ -63,6 +63,21 @@
number = {6}
}
+@Article{ dean2008mapreduce,
+ author = {Dean, Jeffrey and Ghemawat, Sanjay},
+ title = {MapReduce: Simplified Data Processing on Large Clusters},
+ journal = {Communications of the ACM},
+ volume = {51},
+ number = {1},
+ month = jan,
+ year = {2008},
+ issn = {0001-0782},
+ pages = {107--113},
+ doi = {10.1145/1327452.1327492},
+ publisher = {ACM},
+ address = {New York, NY, USA}
+}
+
@InProceedings{ micikevicius2009derivative,
author = {Micikevicius, Paulius},
title = {{3D} Finite Difference Computation on {GPUs} Using {CUDA}},
@@ -120,7 +135,8 @@
@Misc{ vessel2015,
title = {Vessel: Blueprints for the analysis of hydrostatic
- characteristics, stability and propulsion of the ship (in Russian)},
+ characteristics, stability and propulsion of the ship (in
+ Russian)},
author = {Alexander Bogdanov and Vasily Khramushin},
year = {2015},
url = {http://www1.fips.ru/fips_servl/fips_servlet?DB=EVM&DocNumber=2015621368&TypeFile=html},
@@ -140,6 +156,25 @@
Pérez-Rojas and Alberto Francescutto}
}
+@Article{ zaharia2016spark,
+ author = {Zaharia, Matei and Xin, Reynold S. and Wendell, Patrick and
+ Das, Tathagata and Armbrust, Michael and Dave, Ankur and Meng,
+ Xiangrui and Rosen, Josh and Venkataraman, Shivaram and
+ Franklin, Michael J. and Ghodsi, Ali and Gonzalez, Joseph and
+ Shenker, Scott and Stoica, Ion},
+ title = {Apache Spark: A Unified Engine for Big Data Processing},
+ journal = {Communications of the ACM},
+ volume = {59},
+ number = {11},
+ month = oct,
+ year = {2016},
+ issn = {0001-0782},
+ pages = {56--65},
+ doi = {10.1145/2934664},
+ publisher = {ACM},
+ address = {New York, NY, USA}
+}
+
@InBook{ gankevich2018ocean,
author = {Gankevich, Ivan and Degtyarev, Alexander},
editor = {Velarde, Manuel G. and Tarakanov, Roman Yu. and Marchenko,