Add makefile to build asynchronously.

commit 5af5d43ff5ef45bc60bc896a69f4087a9a77ebee
parent fbd105a080937baf22405c669bb1421df17ab463
Author: Ivan Gankevich <igankevich@ya.ru>
Date:   Tue, 28 Feb 2017 22:03:27 +0300

Add makefile to build asynchronously.

Diffstat:
.gitignore  | 8 ++++----
Makefile  | 22 ++++++++++++++++++++++
arma-thesis-ru.org  | 3434 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
arma-thesis.org  | 3226 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
phd-diss-ru.org  | 3434 -------------------------------------------------------------------------------
phd-diss.org  | 3226 -------------------------------------------------------------------------------

6 files changed, 6686 insertions(+), 6664 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,7 @@
 build
 ltxpng
 auto/
-phd-diss-ru.pdf
-phd-diss-ru.tex
-phd-diss.pdf
-phd-diss.tex
+arma-thesis-ru.pdf
+arma-thesis-ru.tex
+arma-thesis.pdf
+arma-thesis.tex
diff --git a/Makefile b/Makefile
@@ -0,0 +1,22 @@
+# Build both thesis PDFs (Russian and English) with latexmk into build/.
+PHD_RU := arma-thesis-ru
+PHD_EN := arma-thesis
+FLAGS := -interaction=nonstopmode -output-directory=build \
+	-pdf -xelatex -bibtex -shell-escape
+
+# Search the project tree recursively (//), then TeX defaults (trailing :).
+# CURDIR is set by make itself; PWD is inherited and can be stale (make -C).
+export TEXINPUTS := $(CURDIR)//:
+
+# all/clean name actions, not files.
+.PHONY: all clean
+all: build/$(PHD_RU).pdf build/$(PHD_EN).pdf
+
+# -f: keep going past errors. The .tex inputs are not built by this Makefile.
+build/$(PHD_RU).pdf: $(PHD_RU).tex bib/*
+	latexmk $(FLAGS) -f $<
+build/$(PHD_EN).pdf: $(PHD_EN).tex bib/*
+	latexmk $(FLAGS) -f $<
+
+clean:
+	rm -f build/$(PHD_EN)* build/$(PHD_RU)*
diff --git a/arma-thesis-ru.org b/arma-thesis-ru.org
@@ -0,0 +1,3434 @@
+# Local Variables:
+# org-ref-default-bibliography ("bib/refs.bib")
+# org-latex-image-default-width nil
+# org-latex-caption-above nil
+# org-latex-hyperref-template "\\hypersetup{\n pdfauthor={%a},\n pdftitle={%t},\n pdfkeywords={%k},\n pdfsubject={%d},\n pdfcreator={%c},\n pdflang={%L},\n unicode={true}\n}\n\\setdefaultlanguage{%l}\n"
+# org-export-latex-tables-hline "\\midrule"
+# org-export-latex-tables-tstart "\\toprule"
+# org-export-latex-tables-tend "\\bottomrule"
+# eval: (add-to-list 'org-latex-classes '("gost" "\\documentclass{gost} [DEFAULT-PACKAGES] [PACKAGES] [EXTRA]" ("\\section{%s}" . "\\section*{%s}") ("\\subsection{%s}" . "\\subsection*{%s}") ("\\subsubsection{%s}" . "\\subsubsection*{%s}") ("\\paragraph{%s}" . "\\paragraph*{%s}") ("\\subparagraph{%s}" . "\\subparagraph*{%s}")))
+# End:
+
+#+TITLE: Высокопроизводительная модель морского волнения для программ динамики морских объектов
+#+AUTHOR: Иван Ганкевич
+#+DATE: Санкт-Петербург, 2017
+#+LANGUAGE: ru
+#+LATEX_CLASS: gost
+#+LATEX_CLASS_OPTIONS: [hidelinks,fontsize=14pt,paper=a4,pagesize,DIV=calc,noenddot]
+#+LATEX_HEADER_EXTRA: \input{preamble}
+#+LATEX_HEADER_EXTRA: \organization{Санкт-Петербургский государственный университет}
+#+LATEX_HEADER_EXTRA: \manuscript{на правах рукописи}
+#+LATEX_HEADER_EXTRA: \degree{Диссертация на соискание ученой степени\\кандидата физико-математических наук}
+#+LATEX_HEADER_EXTRA: \speciality{Специальность 05.13.18\\Математическое моделирование, численные методы и комплексы программ}
+#+LATEX_HEADER_EXTRA: \supervisor{Научный руководитель\\д.т.н. Дегтярев Александр Борисович}
+#+LATEX_HEADER_EXTRA: \newcites{published}{Список опубликованных по теме диссертации работ}
+#+OPTIONS: todo:nil title:nil ':t H:5
+#+STARTUP: indent
+#+PROPERTY: header-args:R :results graphics :exports results
+
+* Config                                                           :noexport:
+** Produce data for Q-Q and ACF plots
+#+begin_src sh :exports none :results verbatim
+root=$(pwd)
+for testname in propagating_wave standing_wave
+do
+    wd=$root/build/$testname
+    rm -rf $wd
+    mkdir -p $wd
+    cd $wd
+    arma -c $root/config/$testname.arma 2>&1
+done
+#+end_src
+
+#+RESULTS:
+#+begin_example
+Input file                     = /home/igankevich/workspace/phd-diss/config/propagating_wave.arma
+ACF grid size                  = (20,10,10)
+ACF grid patch size            = (0.526316,0.555556,0.555556)
+Output grid size               = (200,40,40)
+Output grid patch size         = (1,1,1)
+AR order                       = (10,10,10)
+Do least squares               = 0
+ACF function                   = propagating_wave
+Model                          = MA
+MA algorithm                   = fixed_point_iteration
+Verification scheme            = manual
+ACF variance = 5
+fixed_point_iteration:Iteration=0, var_wn=2.70831
+fixed_point_iteration:Iteration=1, var_wn=1.93791
+fixed_point_iteration:Iteration=2, var_wn=1.54801
+fixed_point_iteration:Iteration=3, var_wn=1.31202
+fixed_point_iteration:Iteration=4, var_wn=1.15328
+fixed_point_iteration:Iteration=5, var_wn=1.0386
+fixed_point_iteration:Iteration=6, var_wn=0.951442
+fixed_point_iteration:Iteration=7, var_wn=0.882674
+fixed_point_iteration:Iteration=8, var_wn=0.82688
+fixed_point_iteration:Iteration=9, var_wn=0.780623
+fixed_point_iteration:Iteration=10, var_wn=0.74161
+fixed_point_iteration:Iteration=11, var_wn=0.708244
+fixed_point_iteration:Iteration=12, var_wn=0.679374
+fixed_point_iteration:Iteration=13, var_wn=0.654145
+fixed_point_iteration:Iteration=14, var_wn=0.63191
+fixed_point_iteration:Iteration=15, var_wn=0.612168
+fixed_point_iteration:Iteration=16, var_wn=0.594523
+fixed_point_iteration:Iteration=17, var_wn=0.578663
+fixed_point_iteration:Iteration=18, var_wn=0.564333
+fixed_point_iteration:Iteration=19, var_wn=0.551325
+fixed_point_iteration:Iteration=20, var_wn=0.539469
+fixed_point_iteration:Iteration=21, var_wn=0.528623
+fixed_point_iteration:Iteration=22, var_wn=0.518666
+fixed_point_iteration:Iteration=23, var_wn=0.509497
+fixed_point_iteration:Iteration=24, var_wn=0.50103
+fixed_point_iteration:Iteration=25, var_wn=0.493191
+fixed_point_iteration:Iteration=26, var_wn=0.485916
+fixed_point_iteration:Iteration=27, var_wn=0.479148
+fixed_point_iteration:Iteration=28, var_wn=0.472841
+fixed_point_iteration:Iteration=29, var_wn=0.466951
+fixed_point_iteration:Iteration=30, var_wn=0.461442
+fixed_point_iteration:Iteration=31, var_wn=0.456279
+fixed_point_iteration:Iteration=32, var_wn=0.451435
+fixed_point_iteration:Iteration=33, var_wn=0.446882
+fixed_point_iteration:Iteration=34, var_wn=0.442597
+fixed_point_iteration:Iteration=35, var_wn=0.43856
+fixed_point_iteration:Iteration=36, var_wn=0.434752
+fixed_point_iteration:Iteration=37, var_wn=0.431155
+fixed_point_iteration:Iteration=38, var_wn=0.427755
+fixed_point_iteration:Iteration=39, var_wn=0.424538
+fixed_point_iteration:Iteration=40, var_wn=0.42149
+fixed_point_iteration:Iteration=41, var_wn=0.418601
+fixed_point_iteration:Iteration=42, var_wn=0.415859
+fixed_point_iteration:Iteration=43, var_wn=0.413256
+fixed_point_iteration:Iteration=44, var_wn=0.410782
+fixed_point_iteration:Iteration=45, var_wn=0.40843
+fixed_point_iteration:Iteration=46, var_wn=0.406191
+fixed_point_iteration:Iteration=47, var_wn=0.404059
+fixed_point_iteration:Iteration=48, var_wn=0.402029
+fixed_point_iteration:Iteration=49, var_wn=0.400092
+fixed_point_iteration:Iteration=50, var_wn=0.398246
+fixed_point_iteration:Iteration=51, var_wn=0.396483
+fixed_point_iteration:Iteration=52, var_wn=0.3948
+fixed_point_iteration:Iteration=53, var_wn=0.393193
+fixed_point_iteration:Iteration=54, var_wn=0.391656
+fixed_point_iteration:Iteration=55, var_wn=0.390188
+fixed_point_iteration:Iteration=56, var_wn=0.388782
+fixed_point_iteration:Iteration=57, var_wn=0.387438
+fixed_point_iteration:Iteration=58, var_wn=0.386151
+fixed_point_iteration:Iteration=59, var_wn=0.384918
+fixed_point_iteration:Iteration=60, var_wn=0.383738
+fixed_point_iteration:Iteration=61, var_wn=0.382606
+fixed_point_iteration:Iteration=62, var_wn=0.381522
+fixed_point_iteration:Iteration=63, var_wn=0.380482
+fixed_point_iteration:Iteration=64, var_wn=0.379485
+fixed_point_iteration:Iteration=65, var_wn=0.378528
+fixed_point_iteration:Iteration=66, var_wn=0.37761
+fixed_point_iteration:Iteration=67, var_wn=0.376729
+fixed_point_iteration:Iteration=68, var_wn=0.375882
+fixed_point_iteration:Iteration=69, var_wn=0.37507
+fixed_point_iteration:Iteration=70, var_wn=0.374289
+fixed_point_iteration:Iteration=71, var_wn=0.373539
+fixed_point_iteration:Iteration=72, var_wn=0.372818
+fixed_point_iteration:Iteration=73, var_wn=0.372126
+fixed_point_iteration:Iteration=74, var_wn=0.37146
+fixed_point_iteration:Iteration=75, var_wn=0.37082
+fixed_point_iteration:Iteration=76, var_wn=0.370204
+fixed_point_iteration:Iteration=77, var_wn=0.369612
+fixed_point_iteration:Iteration=78, var_wn=0.369042
+fixed_point_iteration:Iteration=79, var_wn=0.368494
+fixed_point_iteration:Iteration=80, var_wn=0.367966
+fixed_point_iteration:Iteration=81, var_wn=0.367458
+fixed_point_iteration:Iteration=82, var_wn=0.366969
+fixed_point_iteration:Iteration=83, var_wn=0.366499
+fixed_point_iteration:Iteration=84, var_wn=0.366046
+fixed_point_iteration:Iteration=85, var_wn=0.36561
+fixed_point_iteration:Iteration=86, var_wn=0.365189
+fixed_point_iteration:Iteration=87, var_wn=0.364785
+fixed_point_iteration:Iteration=88, var_wn=0.364395
+fixed_point_iteration:Iteration=89, var_wn=0.364019
+fixed_point_iteration:Iteration=90, var_wn=0.363657
+fixed_point_iteration:Iteration=91, var_wn=0.363309
+fixed_point_iteration:Iteration=92, var_wn=0.362973
+fixed_point_iteration:Iteration=93, var_wn=0.362649
+fixed_point_iteration:Iteration=94, var_wn=0.362337
+fixed_point_iteration:Iteration=95, var_wn=0.362036
+fixed_point_iteration:Iteration=96, var_wn=0.361746
+fixed_point_iteration:Iteration=97, var_wn=0.361466
+fixed_point_iteration:Iteration=98, var_wn=0.361197
+fixed_point_iteration:Iteration=99, var_wn=0.360937
+fixed_point_iteration:Iteration=100, var_wn=0.360686
+fixed_point_iteration:Iteration=101, var_wn=0.360444
+fixed_point_iteration:Iteration=102, var_wn=0.360211
+fixed_point_iteration:Iteration=103, var_wn=0.359986
+fixed_point_iteration:Iteration=104, var_wn=0.359769
+fixed_point_iteration:Iteration=105, var_wn=0.35956
+fixed_point_iteration:Iteration=106, var_wn=0.359358
+fixed_point_iteration:Iteration=107, var_wn=0.359163
+fixed_point_iteration:Iteration=108, var_wn=0.358975
+fixed_point_iteration:Iteration=109, var_wn=0.358794
+fixed_point_iteration:Iteration=110, var_wn=0.358619
+fixed_point_iteration:Iteration=111, var_wn=0.35845
+fixed_point_iteration:Iteration=112, var_wn=0.358288
+fixed_point_iteration:Iteration=113, var_wn=0.35813
+fixed_point_iteration:Iteration=114, var_wn=0.357979
+fixed_point_iteration:Iteration=115, var_wn=0.357832
+fixed_point_iteration:Iteration=116, var_wn=0.357691
+fixed_point_iteration:Iteration=117, var_wn=0.357555
+fixed_point_iteration:Iteration=118, var_wn=0.357423
+fixed_point_iteration:Iteration=119, var_wn=0.357296
+fixed_point_iteration:Iteration=120, var_wn=0.357173
+fixed_point_iteration:Iteration=121, var_wn=0.357055
+fixed_point_iteration:Iteration=122, var_wn=0.356941
+fixed_point_iteration:Iteration=123, var_wn=0.356831
+fixed_point_iteration:Iteration=124, var_wn=0.356724
+fixed_point_iteration:Iteration=125, var_wn=0.356621
+fixed_point_iteration:Iteration=126, var_wn=0.356522
+fixed_point_iteration:Iteration=127, var_wn=0.356426
+fixed_point_iteration:Iteration=128, var_wn=0.356334
+fixed_point_iteration:Iteration=129, var_wn=0.356244
+fixed_point_iteration:Iteration=130, var_wn=0.356158
+fixed_point_iteration:Iteration=131, var_wn=0.356075
+fixed_point_iteration:Iteration=132, var_wn=0.355994
+fixed_point_iteration:Iteration=133, var_wn=0.355917
+fixed_point_iteration:Iteration=134, var_wn=0.355842
+fixed_point_iteration:Iteration=135, var_wn=0.355769
+fixed_point_iteration:Iteration=136, var_wn=0.355699
+fixed_point_iteration:Iteration=137, var_wn=0.355632
+fixed_point_iteration:Iteration=138, var_wn=0.355567
+fixed_point_iteration:Iteration=139, var_wn=0.355504
+fixed_point_iteration:Iteration=140, var_wn=0.355443
+fixed_point_iteration:Iteration=141, var_wn=0.355384
+fixed_point_iteration:Iteration=142, var_wn=0.355327
+fixed_point_iteration:Iteration=143, var_wn=0.355273
+fixed_point_iteration:Iteration=144, var_wn=0.35522
+fixed_point_iteration:Iteration=145, var_wn=0.355169
+fixed_point_iteration:Iteration=146, var_wn=0.355119
+fixed_point_iteration:Iteration=147, var_wn=0.355072
+fixed_point_iteration:Iteration=148, var_wn=0.355026
+fixed_point_iteration:Iteration=149, var_wn=0.354981
+fixed_point_iteration:Iteration=150, var_wn=0.354938
+fixed_point_iteration:Iteration=151, var_wn=0.354897
+fixed_point_iteration:Iteration=152, var_wn=0.354856
+fixed_point_iteration:Iteration=153, var_wn=0.354818
+fixed_point_iteration:Iteration=154, var_wn=0.35478
+fixed_point_iteration:Iteration=155, var_wn=0.354744
+fixed_point_iteration:Iteration=156, var_wn=0.354709
+fixed_point_iteration:Iteration=157, var_wn=0.354676
+fixed_point_iteration:Iteration=158, var_wn=0.354643
+fixed_point_iteration:Iteration=159, var_wn=0.354612
+fixed_point_iteration:Iteration=160, var_wn=0.354581
+fixed_point_iteration:Iteration=161, var_wn=0.354552
+fixed_point_iteration:Iteration=162, var_wn=0.354524
+fixed_point_iteration:Iteration=163, var_wn=0.354496
+fixed_point_iteration:Iteration=164, var_wn=0.35447
+fixed_point_iteration:Iteration=165, var_wn=0.354444
+fixed_point_iteration:Iteration=166, var_wn=0.35442
+fixed_point_iteration:Iteration=167, var_wn=0.354396
+fixed_point_iteration:Iteration=168, var_wn=0.354373
+fixed_point_iteration:Iteration=169, var_wn=0.35435
+fixed_point_iteration:Iteration=170, var_wn=0.354329
+fixed_point_iteration:Iteration=171, var_wn=0.354308
+fixed_point_iteration:Iteration=172, var_wn=0.354288
+fixed_point_iteration:Iteration=173, var_wn=0.354269
+fixed_point_iteration:Iteration=174, var_wn=0.35425
+fixed_point_iteration:Iteration=175, var_wn=0.354232
+fixed_point_iteration:Iteration=176, var_wn=0.354214
+fixed_point_iteration:Iteration=177, var_wn=0.354198
+fixed_point_iteration:Iteration=178, var_wn=0.354181
+fixed_point_iteration:Iteration=179, var_wn=0.354165
+fixed_point_iteration:Iteration=180, var_wn=0.35415
+fixed_point_iteration:Iteration=181, var_wn=0.354136
+fixed_point_iteration:Iteration=182, var_wn=0.354121
+fixed_point_iteration:Iteration=183, var_wn=0.354108
+fixed_point_iteration:Iteration=184, var_wn=0.354094
+fixed_point_iteration:Iteration=185, var_wn=0.354082
+fixed_point_iteration:Iteration=186, var_wn=0.354069
+fixed_point_iteration:Iteration=187, var_wn=0.354057
+fixed_point_iteration:Iteration=188, var_wn=0.354046
+fixed_point_iteration:Iteration=189, var_wn=0.354034
+fixed_point_iteration:Iteration=190, var_wn=0.354024
+fixed_point_iteration:Iteration=191, var_wn=0.354013
+fixed_point_iteration:Iteration=192, var_wn=0.354003
+fixed_point_iteration:Iteration=193, var_wn=0.353994
+WN variance = 0.353994
+Input file                     = /home/igankevich/workspace/phd-diss/config/standing_wave.arma
+ACF grid size                  = (10,10,10)
+ACF grid patch size            = (0.277778,0.555556,0.555556)
+Output grid size               = (200,40,40)
+Output grid patch size         = (1,1,1)
+AR order                       = (7,7,7)
+Do least squares               = 0
+ACF function                   = standing_wave
+Model                          = AR
+MA algorithm                   = fixed_point_iteration
+Verification scheme            = manual
+ACF variance = 5
+WN variance = 0.00261323
+Zeta size = (193,33,33)
+NaN: 29, -nan, 1.798e+36, -1.04284e+38, inf, -1.798e+36, -1.798e+36
+#+end_example
+
+* Введение
+**** Актуальность темы.
+Программы, моделирующие поведение судна на морских волнах, широко применяются
+для расчета качки судна, оценки величины воздействия внешних сил на плавучую
+платформу или другой морской объект, а также для оценки вероятности
+опрокидывания судна при заданных погодных условиях; однако, большинство из них
+используют линейную теорию для моделирования морского волнения\nbsp{}cite:shin2003nonlinear,van2007forensic,kat2001prediction,van2002development, в
+рамках которой сложно воспроизвести определенные особенности ветроволнового
+климата. Среди них можно выделить переход от нормальных погодных условий к
+шторму и волнение, вызванное наложением множества систем ветровых волн и волн
+зыби, распространяющихся в нескольких направлениях. Другой недостаток линейной
+теории волн заключается в предположении, что высота волн много меньше их длины.
+Это делает расчеты грубыми при моделировании качки судна в условиях
+нерегулярного волнения, когда такое предположение несправедливо. Разработка
+новых и более совершенных моделей и методов, используемых при расчете динамики
+судна, может увеличить количество сценариев ее применения и, в частности,
+способствовать исследованию поведения судна в экстремальных условиях.
+
+**** Степень разработанности.
+Модель авторегрессии скользящего среднего (АРСС) возникла как ответ на
+сложности, с которыми на практике сталкиваются ученые, использующие в своей
+работе модели морского волнения, разработанные в рамках линейной теории волн.
+Проблемы, с которыми они сталкиваются при использовании модели Лонге---Хиггинса
+(которая полностью основана на линейной теории волн) перечислены ниже.
+1. /Периодичность/. В рамках линейной теории волны аппроксимируются суммой
+   гармоник, а период реализации взволнованной поверхности зависит от их
+   количества. Чем больше размер реализации, тем больше коэффициентов требуется
+   для исключения периодичности, поэтому с увеличением размера реализации время
+   ее генерации растет нелинейно. Это приводит к тому, что любая модель,
+   основанная на линейной теории, неэффективна при генерации больших реализаций
+   взволнованной поверхности, независимо от того, насколько оптимизирован
+   исходный код программы.
+2. /Линейность/. В рамках линейной теории волн дается математическое определение
+   морским волнам в предположении малости их амплитуд по сравнению с длинами.
+   Такие волны, в основном, характерны для открытого моря и океана, а волны в
+   прибрежных районах и штормовые волны, для которых это предположение
+   несправедливо, грубо описываются в рамках линейной теории.
+3. /Вероятностная сходимость/. Фаза волны, значение которой обычно получается с
+   помощью генератора псевдослучайных чисел (ГПСЧ), имеет равномерное
+   распределение, что иногда приводит к медленной сходимости интегральных
+   характеристик взволнованной поверхности (таких как распределение высот волн,
+   их периодов, длин и т.п.). Скорость сходимости зависит от значений,
+   полученных от ГПСЧ, поэтому быстрая сходимость не гарантируется.
+
+Эти сложности стали отправной точкой в поиске модели, не основанной на линейной
+теории волн, и в исследованиях процесса АРСС был найден необходимый
+математический аппарат.
+1. Параметром процесса АРСС является автоковариационная функция (АКФ), которая
+   может быть напрямую получена из энергетического или частотно-направленного
+   спектра морского волнения (который, в свою очередь, является входным
+   параметром для модели Лонге---Хиггинса). Так что входные параметры одной
+   модели могут быть легко преобразованы во входные параметры другой.
+2. Процесс АРСС не имеет ограничений на амплитуду генерируемых волн: их крутизна
+   может быть увеличена настолько, насколько это позволяет АКФ реальных
+   морских волн.
+3. Период реализации равен периоду ГПСЧ, поэтому время генерации растет линейно
+   с увеличением размера реализации.
+4. Белый шум, который является единственным вероятностным членом формулы
+   процесса АРСС, имеет нормальное распределение; так что скорость сходимости не
+   носит вероятностный характер.
+
+**** Цели и задачи.
+Процесс АРСС стал основой модели ветрового волнения АРСС, однако он нуждался в
+доработке перед тем, как его можно было бы использовать на практике.
+1. Необходимо было исследовать, как различные формы АКФ влияют на выбор
+   параметров процесса АРСС (количество коэффициентов процесса скользящего
+   среднего и процесса авторегрессии).
+2. Затем исследовать возможность генерации волн с произвольным профилем, а не
+   только с профилем синусоиды (учесть асимметричность распределения волновых
+   аппликат взволнованной поверхности).
+3. Затем вывести формулы для определения поля давлений под взволнованной
+   поверхностью. Такие формулы обычно выводятся для конкретной модели путем
+   подстановки формулы профиля волны в eqref:eq-problem, однако процесс АРСС не
+   содержит в себе формулу профиля волны в явном виде, поэтому для него
+   необходимо было получить решение для взволнованной поверхности общего вида
+   (для которой не существует аналитического выражения) без линеаризации
+   граничных условий (ГУ) и предположения о малости амплитуд волн.
+4. Наконец, верифицировать интегральные характеристики взволнованной поверхности
+   на соответствие реальным морским волнам.
+5. Заключительный этап состоял в разработке комплекса программ, реализующего
+   созданную модель и метод расчета давлений и позволяющего проводить расчеты
+   как на многопроцессорной машине с общей памятью (SMP), так и на компьютерном
+   кластере (MPP).
+
+**** Научная новизна.
+Модель АРСС в отличие от других моделей ветрового волнения не основана на
+линейной теории волн, что позволяет
+- генерировать волны произвольной амплитуды, регулируя крутизну посредством АКФ;
+- генерировать волны произвольной формы, регулируя асимметричность распределения
+  волновых аппликат посредством нелинейного безынерционного преобразования
+  (НБП).
+В то же время математический аппарат этого процесса АРСС хорошо изучен в
+других научных областях, что позволяет его обобщить для моделирования развития
+морского волнения в условиях шторма с учетом климатических спектров и данных
+ассимиляции определенных районов мирового океана, что невозможно сделать с
+помощью модели, основанной на линейной теории волн.
+
+**** Теоретическая и практическая значимость работы.
+Применение модели АРСС и формулы поля давлений, не использующей предположения
+линейной теории волн, качественно повысит работу комплексов программ для расчета
+воздействия океанских волн на морские объекты.
+
+1. Поскольку формула для поля давлений выводится для дискретно заданной
+   взволнованной поверхности и без каких-либо предположений об амплитудах волн,
+   то она применима для любой взволнованной поверхности невязкой несжимаемой
+   жидкости (в частности она применима для поверхности, генерируемой моделью
+   Лонге---Хиггинса). Это позволяет использовать формулу поля давлений без
+   привязки к модели АРСС.
+2. С вычислительной точки зрения эта формула более эффективна, чем
+   соответствующая формула для модели ЛХ, поскольку интегралы в формуле сводятся
+   к преобразованиям Фурье, для которых существует семейство алгоритмов быстрого
+   преобразования Фурье (БПФ), оптимизированных под разные архитектуры
+   процессоров.
+3. Поскольку формула явная, то обмена данными между параллельными процессами
+   можно избежать, что позволяет достичь высокой масштабируемости на
+   компьютерном кластере.
+4. Наконец, сама модель АРСС более эффективна, чем модель ЛХ, ввиду отсутствия
+   тригонометрических функций в ее формуле. Взволнованная поверхность
+   вычисляется как сумма большого числа многочленов, для которых существует
+   низкоуровневая ассемблерная инструкция (Fused Multiply-Add), показывающая
+   высокую производительность на процессорах.
+
+**** Методология и методы исследования.
+Программная реализация модели АРСС и формулы вычисления давлений создавалась
+поэтапно: прототип, написанный на высокоуровневом инженерном языке\nbsp{}cite:mathematica10,octave2015, был преобразован в программу на языке более
+низкого уровня (C++). Реализация одних и тех же формул и алгоритмов на языках
+разного уровня (ввиду использования различных абстракций и языковых примитивов)
+позволяет выявить и исправить ошибки, которые остались бы незамеченными в случае
+одного языка. Генерируемая моделью АРСС взволнованная поверхность, а также все
+входные параметры (АКФ, формула распределения волновых аппликат и т.п.) были
+проверены с помощью встроенных в язык программирования графических средств для
+визуального контроля корректности работы программы.
+
+**** Положения, выносимые на защиту.
+- Модель ветрового волнения, способная генерировать реализации взволнованной
+  морской поверхности, имеющие большой период и состоящие из волн произвольной
+  амплитуды;
+- Формула для поля давлений, выведенная для этой модели без предположений
+  линейной теории волн;
+- Программная реализация созданной модели и формулы для вычислительных систем с
+  общей (SMP) и с распределенной памятью (MPP).
+
+**** Степень достоверности и апробация результатов.
+Верификация модели АРСС проводится путем сравнения интегральных характеристик
+(распределений волновых аппликат, высот и длин волн и т.п.) генерируемой
+взволнованной поверхности с характеристиками реальных морских волн. Формула для
+поля давлений выводится с помощью языка Mathematica, в котором полученные
+выражения проверяются с помощью встроенных в язык графических средств.
+
+Модель АРСС и формула для поля давлений были реализованы в Large Amplitude
+Motion Programme (LAMP), программе для моделирования качки судна, и сопоставлены
+с используемой ранее моделью ЛХ. Предварительные численные эксперименты показали
+более высокую вычислительную эффективность модели АРСС.
+
+* Постановка задачи
+Задача состоит в исследовании возможности применения математического аппарата
+процесса АРСС для моделирования морских волн и в выводе формулы для поля
+давлений под генерируемой взволнованной морской поверхностью для случая
+идеальной несжимаемой жидкости без предположений линейной теории волн.
+- Для случая волн малых амплитуд полученная формула должна быть сопоставима с
+  соответствующей формулой линейной теории волн; для остальных случаев формула
+  не должна расходиться.
+- Интегральные характеристики генерируемой взволнованной поверхности должны
+  совпадать с характеристиками реальных морских волн.
+- Программная реализация модели АРСС и формулы вычисления давлений должна
+  работать на системах с общей (SMP) и распределенной памятью (MPP).
+
+**** Формула для поля давлений.
+Задача определения поля давлений под взволнованной морской поверхностью
+представляет собой обратную задачу гидродинамики для несжимаемой невязкой
+жидкости. Система уравнений для нее в общем виде записывается как\nbsp{}cite:kochin1966theoretical
+\begin{align}
+    & \nabla^2\phi = 0,\nonumber\\
+    & \phi_t+\frac{1}{2} |\vec{\upsilon}|^2 + g\zeta=-\frac{p}{\rho}, & \text{на }z=\zeta(x,y,t),\label{eq-problem}\\
+    & D\zeta = \nabla \phi \cdot \vec{n}, & \text{на }z=\zeta(x,y,t),\nonumber
+\end{align}
+где \(\phi\)\nbsp{}--- потенциал скорости, \(\zeta\)\nbsp{}--- подъем (аппликата)
+взволнованной поверхности, \(p\)\nbsp{}--- давление жидкости, \(\rho\)\nbsp{}--- плотность
+жидкости, \(\vec{\upsilon}=(\phi_x,\phi_y,\phi_z)\)\nbsp{}--- вектор скорости, \(g\)\nbsp{}--- ускорение свободного падения и \(D\)\nbsp{}--- субстанциональная производная
+(производная Лагранжа). Первое уравнение является уравнением неразрывности
+(уравнение Лапласа), второе\nbsp{}--- законом сохранения импульса (которое иногда
+называют динамическим граничным условием); третье уравнение\nbsp{}--- кинематическое
+граничное условие, которое сводится к равенству скорости перемещения этой
+поверхности (\(D\zeta\)) нормальной составляющей скорости жидкости
+(\(\nabla\phi\cdot\vec{n}\)).
+
+Обратная задача гидродинамики заключается в решении этой системы уравнений
+относительно \(\phi\). В такой постановке динамическое ГУ становится явной
+формулой для определения поля давлений по значениям производных потенциалов
+скорости, полученных из оставшихся уравнений. Таким образом, с математической
+точки зрения обратная задача гидродинамики сводится к решению уравнения Лапласа
+со смешанным ГУ\nbsp{}--- задаче Робена.
+
+* Обзор литературы
+** Анализ моделей морского волнения
+Вычисление давлений возможно только при условии знания формы взволнованной
+поверхности, которая задается либо дискретно в каждой точке пространственной
+сетки, либо непрерывно с помощью аналитической формулы. Как будет показано в
+разделе [[#linearisation]], знание такой формулы может упростить вычисление
+давлений, фактически сведя задачу к генерации поля давлений, а не самой
+взволнованной поверхности.
+
+**** Модель Лонге---Хиггинса.
+Наиболее простой моделью, формула которой выводится в рамках линейной теории
+волн (см.\nbsp{}разд.\nbsp{}[[#longuet-higgins-derivation]]), является модель
+Лонге---Хиггинса (ЛХ)\nbsp{}cite:longuet1957statistical. Подробный сравнительный
+анализ этой модели и модели АРСС проведен в
+работах\nbsp{}cite:degtyarev2011modelling,boukhanovsky1997thesis.
+
+Модель ЛХ представляет взволнованную морскую поверхность в виде суперпозиции
+элементарных гармонических волн случайных амплитуд \(c_n\) и фаз \(\epsilon_n\),
+непрерывно распределенных на интервале \([0,2\pi]\). Подъем (координата \(z\))
+поверхности определяется формулой
+#+name: eq-longuet-higgins
+\begin{equation}
+    \zeta(x,y,t) = \sum\limits_n c_n \cos(u_n x + v_n y - \omega_n t + \epsilon_n).
+\end{equation}
+Здесь волновые числа \((u_n,v_n)\) непрерывно распределены на плоскости \((u,v)\),
+т.е. площадка \(du \times dv\) содержит бесконечно большое количество волновых
+чисел. Частота связана с волновыми числами дисперсионным соотношением
+\(\omega_n=\omega(u_n,v_n)\). Функция \(\zeta(x,y,t)\) является трехмерным
+эргодическим стационарным однородным гауссовым процессом, определяемым
+соотношением
+\begin{equation*}
+    2E_\zeta(u,v)\, du\,  dv = \sum\limits_n c_n^2,
+\end{equation*}
+где \(E_\zeta(u,v)\)\nbsp{}--- двумерная спектральная плотность энергии волн.
+Коэффициенты \(c_n\) определяются из энергетического спектра волнения \(S(\omega)\)
+по формуле
+\begin{equation*}
+    c_n = \sqrt{ \textstyle\int\limits_{\omega_n}^{\omega_{n+1}} S(\omega) d\omega}.
+\end{equation*}
+
+**** Основные недостатки модели Лонге---Хиггинса.
+Модель Лонге---Хиггинса отличается простотой численного алгоритма и
+наглядностью, однако, на практике она обладает рядом недостатков.
+
+1. Модель рассчитана на представление стационарного гауссова поля. Это является
+   следствием центральной предельной теоремы (ЦПТ): сумма большого числа
+   гармоник со случайными амплитудами и фазами имеет нормальное распределение вне
+   зависимости от спектра, подаваемого на вход модели. Использование меньшего
+   количества коэффициентов может решить проблему, но также уменьшит период
+   реализации. Таким образом, использование модели ЛХ для генерации волн с
+   негауссовым распределением аппликат (которое имеют реальные морские волны\nbsp{}cite:huang1980experimental,рожков1996теория) не реализуемо на практике.
+2. С вычислительной точки зрения, недостатком модели является нелинейный рост
+   времени генерации поверхности с увеличением размера реализации. Чем больше
+   размер реализации, тем больше коэффициентов (дискретных точек
+   частотно-направленного спектра) требуется для исключения периодичности. Это
+   делает модель неэффективной для проведения длительных численных
+   экспериментов.
+3. Наконец, с инженерной точки зрения, модель обладает рядом особенностей,
+   которые не позволяют использовать ее в качестве фундамента для построения
+   более совершенных моделей.
+   - В программной реализации скорость сходимости выражения
+     ур.\nbsp{}[[eq-longuet-higgins]] может быть низкой, т.к. фазы \(\epsilon_n\)
+     имеют вероятностный характер.
+   - Обобщение модели для негауссовых и нелинейных процессов возможно при
+     включении нелинейных членов в ур.\nbsp{}[[eq-longuet-higgins]], для которого не
+     известна формула вычисления
+     коэффициентов\nbsp{}cite:рожков1990вероятностные.
+ 
+Таким образом, модель ЛХ применима для решения задачи генерации взволнованной
+морской поверхности только в рамках линейной теории волн, неэффективна для
+длительных экспериментов и имеет ряд недостатков, не позволяющих использовать ее
+в качестве основы для построения более совершенных моделей.
+
+**** Модель АРСС.
+В\nbsp{}cite:spanos1982arma модель АРСС используется для генерации временного ряда,
+спектр которого совпадает с аппроксимацией Пирсона---Московица для спектров
+морского волнения. Авторы проводят эксперименты для одномерных моделей АР, СС и
+АРСС. Они отмечают превосходное совпадение полученного и исходного спектров и
+более высокую вычислительную эффективность модели АРСС по сравнению с
+моделями, основанными на суммировании большого числа гармоник со случайными
+фазами. Также отмечается, что для того чтобы спектр полученного временного ряда
+совпадал с заданным, модели СС требуется меньшее количество коэффициентов, чем
+модели АР. В\nbsp{}cite:spanos1996efficient автор обобщает формулы для нахождения
+коэффициентов модели АРСС для случая нескольких (векторов) переменных.
+
+Отличие данной работы от вышеперечисленных заключается в исследовании трехмерной
+модели АРСС (два пространственных и одно временное измерение), что во многом
+является другой задачей.
+1. Система уравнений Юла---Уокера, используемая для определения коэффициентов
+   АР, имеет более сложную блочно-блочную структуру.
+2. Оптимальный (для совпадения заданного и исходного спектров) порядок модели
+   определяется вручную.
+3. Вместо аппроксимации ПМ в качестве входа модели используются аналитические
+   выражения для АКФ стоячих и прогрессивных волн.
+4. Трехмерная взволнованная поверхность должна быть сопоставима с реальной
+   морской поверхностью не только по спектральным характеристикам, но и по форме
+   волновых профилей, поэтому верификация модели производится и для
+   распределений различных параметров генерируемых волн (длин, высот, периодов и
+   др.).
+Многомерность исследуемой модели не только усложняет задачу, но и позволяет
+провести визуальную проверку генерируемой взволнованной поверхности. Именно
+возможность визуализировать результат работы программы позволила удостовериться,
+что генерируемая поверхность действительно похожа на реальное морское волнение,
+а не является абстрактным многомерным случайным процессом, совпадающим с
+реальным лишь статистически.
+
+В\nbsp{}cite:fusco2010short модель АР используется для прогнозирования волн зыби для
+управления преобразователем энергии волн (ПЭВ) в реальном времени. Для
+эффективной работы ПЭВ необходимо, чтобы частота встроенного осциллятора
+совпадала с частотой морских волн. Авторы статьи представляют подъем волны как
+временной ряд и сравнивают эффективность модели АР, нейронных сетей и циклических
+моделей в прогнозировании будущих значений ряда. Модель АР дает наиболее точный
+прогноз для низкочастотных волн зыби вплоть до двух типовых периодов волн. Это
+пример успешного применения модели процесса АР для моделирования морских волн.
+
+** Известные формулы определения поля давлений
+**** Теория волн малых амплитуд.
+В\nbsp{}cite:stab2012,детярев1998моделирование,degtyarev1997analysis дается решение
+обратной задачи гидродинамики для случая идеальной несжимаемой жидкости в рамках
+теории волн малых амплитуд (в предположении, что длина волны много больше ее
+высоты: \(\lambda \gg h\)). В этом случае обратная задача линейна и сводится к
+уравнению Лапласа со смешанным граничным условием, а уравнение движения
+используется только для нахождения давлений по известным значениям производных
+потенциала скорости. Предположение о малости амплитуд волн означает слабое
+изменение локального волнового числа во времени и пространстве по сравнению с
+подъемом (аппликатой) взволнованной поверхности. Это позволяет вычислить
+производную подъема поверхности по \(z\) как \(\zeta_z=k\zeta\), где \(k\)\nbsp{}---
+волновое число. В двухмерном случае решение записывается явной формулой
+\begin{align}
+    \left.\frac{\partial\phi}{\partial x}\right|_{x,t}= &
+        -\frac{1}{\sqrt{1+\alpha^{2}}}e^{-I(x)}
+            \int\limits_{0}^x\frac{\partial\dot{\zeta}/\partial      
+                z+\alpha\dot{\alpha}}{\sqrt{1+\alpha^{2}}}e^{I(x)}dx,\label{eq-old-sol-2d}\\
+    I(x)= & \int\limits_{0}^x\frac{\partial\alpha/\partial z}{1+\alpha^{2}}dx,\nonumber
+\end{align}
+где \(\alpha\)\nbsp{}--- уклоны волн. В трехмерном случае решение записывается в виде
+эллиптического дифференциального уравнения в частных производных
+\begin{align*}
+    & \frac{\partial^2 \phi}{\partial x^2} \left( 1 + \alpha_x^2 \right) +
+    \frac{\partial^2 \phi}{\partial y^2} \left( 1 + \alpha_y^2 \right) +
+    2\alpha_x\alpha_y \frac{\partial^2 \phi}{\partial x \partial y} + \\
+    & \left(
+        \frac{\partial \alpha_x}{\partial z} +
+        \alpha_x \frac{\partial \alpha_x}{\partial x} +
+        \alpha_y \frac{\partial \alpha_x}{\partial y}
+    \right) \frac{\partial \phi}{\partial x} + \\
+    & \left(
+        \frac{\partial \alpha_y}{\partial z} +
+        \alpha_x \frac{\partial \alpha_y}{\partial x} +
+        \alpha_y \frac{\partial \alpha_y}{\partial y}
+    \right) \frac{\partial \phi}{\partial y} + \\
+    & \frac{\partial \dot{\zeta}}{\partial z} + 
+    \alpha_x \dot{\alpha_x} + \alpha_y \dot{\alpha_y} = 0.
+\end{align*}
+Уравнение предполагается решать численно путем сведения к разностному.
+
+Как будет показано в [[#sec:compare-formulae]], формула eqref:eq-old-sol-2d
+расходится при попытке вычислить поле скоростей для волн больших амплитуд, а
+значит, не может быть использована вместе с моделью ветрового волнения,
+генерирующей волны произвольных амплитуд.
+
+**** Линеаризация граничного условия.
+:PROPERTIES:
+:CUSTOM_ID: linearisation
+:END:
+Модель Лонге---Хиггинса позволяет вывести явную формулу для поля
+скоростей путем линеаризации кинематического граничного условия. Формула для
+потенциала скорости запишется как
+\begin{equation*}
+\phi(x,y,z,t) = \sum_n \frac{c_n g}{\omega_n} 
+     e^{\sqrt{u_n^2+v_n^2} z}
+     \sin(u_n x + v_n y - \omega_n t + \epsilon_n).
+\end{equation*}
+Формула дифференцируется для получения производных потенциала, а полученные
+значения подставляются в динамическое граничное условие для вычисления давлений.
+
+* Модель АРСС в задаче имитационного моделирования морского волнения
+** Основные формулы трехмерного процесса АРСС
+Модель АРСС для морского волнения определяет взволнованную морскую поверхность
+как трехмерный (два пространственных и одно временное измерение) процесс
+авторегрессии скользящего среднего: каждая точка взволнованной поверхности
+представляется в виде взвешенной суммы предыдущих по времени и пространству
+точек и взвешенной суммы предыдущих по времени и пространству нормально
+распределенных случайных импульсов. Основным уравнением для трехмерного процесса
+АРСС является
+\begin{equation}
+    \zeta_{\vec i}
+    =
+    \sum\limits_{\vec j = \vec 0}^{\vec N}
+    \Phi_{\vec j} \zeta_{\vec i - \vec j}
+    +
+    \sum\limits_{\vec j = \vec 0}^{\vec M}
+    \Theta_{\vec j} \epsilon_{\vec i - \vec j}
+    ,
+    \label{eq-arma-process}
+\end{equation}
+где \(\zeta\)\nbsp{}--- подъем (аппликата) взволнованной поверхности,
+\(\Phi\)\nbsp{}--- коэффициенты процесса АР, \(\Theta\)\nbsp{}--- коэффициенты
+процесса СС, \(\epsilon\)\nbsp{}--- белый шум, имеющий гауссово распределение,
+\(\vec{N}\)\nbsp{}--- порядок процесса АР, \(\vec{M}\)\nbsp{}--- порядок
+процесса СС, причем \(\Phi_{\vec{0}}\equiv0\), \(\Theta_{\vec{0}}\equiv0\).
+Здесь стрелки обозначают многокомпонентные индексы, содержащие отдельную
+компоненту для каждого измерения. В общем случае в качестве компонент могут
+выступать любые скалярные величины (температура, соленость, концентрация
+какого-либо раствора в воде и т.п.). Параметрами уравнения служат коэффициенты и
+порядки процессов АР и СС.
+
+Свойства стационарности и обратимости являются основными критериями выбора того
+или иного процесса для моделирования волн разных профилей, которые обсуждаются в
+разд.\nbsp{}[[#sec-process-selection]].
+
+**** Процесс авторегрессии (АР).
+Процесс АР\nbsp{}--- это процесс АРСС только лишь с одним случайным импульсом вместо их
+взвешенной суммы:
+\begin{equation}
+    \zeta_{\vec i}
+    =
+    \sum\limits_{\vec j = \vec 0}^{\vec N}
+    \Phi_{\vec j} \zeta_{\vec i - \vec j}
+    +
+    \epsilon_{\vec i}
+    .
+    \label{eq-ar-process}
+\end{equation}
+Коэффициенты авторегрессии \(\Phi\) определяются из многомерных уравнений
+Юла---Уокера, получаемых после домножения на \(\zeta_{\vec{i}-\vec{k}}\) обеих
+частей уравнения и взятия математического ожидания. В общем виде уравнения
+Юла---Уокера записываются как
+\begin{equation}
+    \label{eq-yule-walker}
+    \gamma_{\vec k}
+    =
+    \sum\limits_{\vec j = \vec 0}^{\vec N}
+    \Phi_{\vec j}
+    \text{ }\gamma_{\vec{k}-\vec{j}}
+    +
+    \Var{\epsilon} \delta_{\vec{k}},
+    \qquad
+    \delta_{\vec{k}} =
+    \begin{cases}
+        1, \quad \text{если } \vec{k}=0 \\
+        0, \quad \text{если } \vec{k}\neq0,
+    \end{cases}
+\end{equation}
+где \(\gamma\)\nbsp{}--- АКФ процесса \(\zeta\), \(\Var{\epsilon}\)\nbsp{}--- дисперсия
+белого шума. Матричная форма трехмерной системы уравнений Юла---Уокера,
+используемой в данной работе, имеет следующий вид.
+\begin{equation*}
+    \Gamma
+    \left[
+        \begin{array}{l}
+            \Phi_{\vec 0}\\
+            \Phi_{0,0,1}\\
+            \vdotswithin{\Phi_{\vec 0}}\\
+            \Phi_{\vec N}
+        \end{array}
+    \right]
+    =
+    \left[
+        \begin{array}{l}
+            \gamma_{0,0,0}-\Var{\epsilon}\\
+            \gamma_{0,0,1}\\
+            \vdotswithin{\gamma_{\vec 0}}\\
+            \gamma_{\vec N}
+        \end{array}
+    \right],
+    \qquad
+    \Gamma=
+    \left[
+        \begin{array}{llll}
+            \Gamma_0 & \Gamma_1 & \cdots & \Gamma_{N_1} \\
+            \Gamma_1 & \Gamma_0 & \ddots & \vdotswithin{\Gamma_0} \\
+            \vdotswithin{\Gamma_0} & \ddots & \ddots & \Gamma_1 \\
+            \Gamma_{N_1} & \cdots & \Gamma_1 & \Gamma_0
+        \end{array}
+    \right],
+\end{equation*}
+где \(\vec N = \left( N_1, N_2, N_3 \right)\) и
+\begin{equation*}
+    \Gamma_i =
+    \left[
+    \begin{array}{llll}
+        \Gamma^0_i & \Gamma^1_i & \cdots & \Gamma^{N_2}_i \\
+        \Gamma^1_i & \Gamma^0_i & \ddots & \vdotswithin{\Gamma^0_i} \\
+        \vdotswithin{\Gamma^0_i} & \ddots & \ddots & \Gamma^1_i \\
+        \Gamma^{N_2}_i & \cdots & \Gamma^1_i & \Gamma^0_i
+    \end{array}
+    \right]
+    \qquad
+    \Gamma_i^j=
+    \left[
+    \begin{array}{llll}
+        \gamma_{i,j,0} & \gamma_{i,j,1} & \cdots & \gamma_{i,j,N_3} \\
+        \gamma_{i,j,1} & \gamma_{i,j,0} & \ddots & \vdotswithin{\gamma_{i,j,0}} \\
+        \vdotswithin{\gamma_{i,j,0}} & \ddots & \ddots & \gamma_{i,j,1} \\
+        \gamma_{i,j,N_3} & \cdots & \gamma_{i,j,1} & \gamma_{i,j,0}
+    \end{array}
+    \right].
+\end{equation*}
+Поскольку по определению \(\Phi_{\vec 0}\equiv0\), то первую строку и столбец
+матрицы \(\Gamma\) можно отбросить. Матрица \(\Gamma\), как и оставшаяся от нее
+матрица, будут блочно-теплицевы, положительно определены и симметричны, поэтому
+систему уравнений Юла---Уокера можно эффективно решить методом Холецкого,
+специально предназначенным для таких матриц.
+
+После нахождения решения системы уравнений дисперсия белого шума определяется из
+уравнения eqref:eq-yule-walker при \(\vec k = \vec 0\) как
+\begin{equation*}
+    \Var{\epsilon} =
+    \Var{\zeta}
+    -
+    \sum\limits_{\vec j = \vec 0}^{\vec N}
+    \Phi_{\vec j}
+    \text{ }\gamma_{\vec{j}}.
+\end{equation*}
+
+**** Процесс скользящего среднего (СС).
+Процесс СС\nbsp{}--- это процесс АРСС, в котором \(\Phi\equiv0\):
+\begin{equation}
+    \zeta_{\vec i}
+    =
+    \sum\limits_{\vec j = \vec 0}^{\vec M}
+    \Theta_{\vec j} \epsilon_{\vec i - \vec j}
+    .
+    \label{eq-ma-process}
+\end{equation}
+Коэффициенты СС \(\Theta\) определяются неявно из системы нелинейных уравнений
+\begin{equation*}
+  \gamma_{\vec i} =
+  \left[
+    \displaystyle
+    \sum\limits_{\vec j = \vec i}^{\vec M}
+    \Theta_{\vec j}\Theta_{\vec j - \vec i}
+  \right]
+  \Var{\epsilon}.
+\end{equation*}
+Система решается численно с помощью метода простой итерации по формуле
+\begin{equation*}
+  \Theta_{\vec i} =
+    -\frac{\gamma_{\vec i}}{\Var{\epsilon}}
+    +
+    \sum\limits_{\vec j = \vec i}^{\vec M}
+    \Theta_{\vec j} \Theta_{\vec j - \vec i}.
+\end{equation*}
+Здесь новые значения коэффициентов \(\Theta\) вычисляются, начиная с последнего:
+от \(\vec{i}=\vec{M}\) до \(\vec{i}=\vec{0}\). Дисперсия белого шума вычисляется из
+\begin{equation*}
+    \Var{\epsilon} = \frac{\gamma_{\vec 0}}{
+    1
+    +
+    \sum\limits_{\vec j = \vec 0}^{\vec M}
+    \Theta_{\vec j}^2
+    }.
+\end{equation*}
+Авторы\nbsp{}cite:box1976time предлагают использовать метод Ньютона---Рафсона для
+решения этого уравнения с большей точностью, однако, этот метод не подходит для
+трех измерений. Использование более медленного метода не оказывает большого
+эффекта на общую производительность программы, потому что количество
+коэффициентов мало, и большую часть времени программа тратит на генерацию
+взволнованной поверхности.
+
+**** Стационарность и обратимость процессов АР и СС.
+Для того чтобы моделируемая взволнованная поверхность представляла собой
+физическое явление, соответствующий процесс должен быть стационарным и
+обратимым. Если процесс обратим, то существует разумная связь текущих событий с
+событиями в прошлом, и, если процесс стационарен, то амплитуда моделируемого
+физического сигнала не увеличивается бесконечно в пространстве и времени.
+
+Процесс АР всегда обратим, а для стационарности необходимо, чтобы корни
+характеристического уравнения
+\begin{equation*}
+1 - \Phi_{0,0,1} z - \Phi_{0,0,2} z^2
+- \cdots
+- \Phi_{\vec N} z^{N_1 N_2 N_3} = 0,
+\end{equation*}
+лежали \emph{вне} единичного круга. Здесь \(\vec{N}\)\nbsp{}--- порядок процесса
+АР, а \(\Phi\)\nbsp{}--- коэффициенты.
+
+Процесс СС всегда стационарен, а для обратимости необходимо, чтобы корни
+характеристического уравнения
+\begin{equation*}
+1 - \Theta_{0,0,1} z - \Theta_{0,0,2} z^2
+- \cdots
+- \Theta_{\vec M} z^{M_1 M_2 M_3} = 0,
+\end{equation*}
+лежали \emph{вне} единичного круга. Здесь \(\vec{M}\)\nbsp{}--- порядок процесса
+СС, а \(\Theta\)\nbsp{}--- коэффициенты.
+
+**** Смешанный процесс авторегрессии скользящего среднего (АРСС).
+:PROPERTIES:
+:CUSTOM_ID: sec:how-to-mix-ARMA
+:END:
+В общем и целом, процесс АРСС получается путем подстановки сгенерированной
+процессом СС взволнованной поверхности в качестве случайного импульса процесса
+АР, однако, для того чтобы АКФ результирующего процесса соответствовала
+заданной, необходимо предварительно скорректировать значения коэффициентов АР.
+Существует несколько способов "смешивания" процессов АР и СС.
+- Подход, предложенный авторами\nbsp{}cite:box1976time, который включает в себя
+  разделение АКФ на часть для процесса АР и часть для процесса СС по каждому из
+  измерений, не подходит в данной ситуации, поскольку в трех измерениях
+  невозможно таким образом разделить АКФ: всегда останутся части, которые не
+  будут учтены ни в процессе АР, ни в процессе СС.
+- Альтернативный подход состоит в использовании одной и той же (неразделенной)
+  АКФ для обоих процессов разных порядков, однако, тогда характеристики
+  реализации (математическое ожидание, дисперсия и др.) будут смещены: они
+  станут характеристиками двух наложенных друг на друга процессов.
+Для первого подхода авторами\nbsp{}cite:box1976time предложена формула корректировки
+коэффициентов процесса АР, для второго же подхода такой формулы нет. Таким
+образом, лучшим решением на данный момент является использование процессов АР и
+СС по отдельности.
+
+**** Критерии выбора процесса для моделирования разных профилей волн.
+:PROPERTIES:
+:CUSTOM_ID: sec-process-selection
+:END:
+
+Одной из проблем в применении модели АРСС для генерации взволнованной морской
+поверхности является то, что для разных профилей волн /необходимо/ использовать
+разные процессы: стоячие волны моделируются только процессом АР, а прогрессивные
+волны\nbsp{}--- только процессом СС. Это утверждение пришло из практики: если
+попытаться использовать процессы наоборот, результирующая реализация либо
+расходится, либо не представляет собой реальные морские волны (такое происходит
+в случае необратимого процесса СС, который всегда стационарен). Таким образом,
+процесс АР может быть использован только для моделирования стоячих волн, а
+процесс СС\nbsp{}--- для прогрессивных волн.
+
+Другой проблемой является невозможность автоматического определения оптимального
+количества коэффициентов для трехмерных процессов АР и СС. Для одномерных
+процессов существуют итеративные методы\nbsp{}cite:box1976time, однако они расходятся
+в трехмерном случае.
+
+Последней проблемой, которая описана в разделе [[#sec:how-to-mix-ARMA]], является
+невозможность "смешать" процесс АР и СС в трех измерениях.
+
+Практика показывает, что некоторые утверждения авторов\nbsp{}cite:box1976time не
+выполняются для трехмерной модели АРСС. Например, авторы утверждают, что АКФ
+процесса СС обрывается на отсчете \(q\), а АКФ процесса АР затухает на
+бесконечности, однако, на практике при использовании слабо затухающей и
+обрывающейся на отсчете \(q\) АКФ для трехмерного процесса СС получается
+необратимый процесс СС и реализация, не соответствующая реальным морским
+волнам, в то время как при использовании той же самой АКФ для трехмерного
+процесса АР получается стационарный обратимый процесс и адекватная реализация.
+Также, авторы утверждают, что первые \(q\) точек АКФ смешанного процесса
+необходимо выделить процессу СС (поскольку он обычно используется для описания
+пиков АКФ) и отдать остальные точки процессу АР, однако, на практике в случае
+АКФ прогрессивной волны процесс АР стационарен только для начального временного
+среза АКФ, а остальные точки отдаются процессу СС.
+
+Суммируя вышесказанное, наиболее разработанным сценарием применения модели АРСС
+для генерации взволнованной морской поверхности является использование процесса
+АР для стоячих волн и процесса СС для прогрессивных волн. Смешанный процесс АРСС
+может сделать модель более точной при условии наличия соответствующих формул
+пересчета коэффициентов, что является целью дальнейших исследований.
+
+** Моделирование нелинейности морских волн
+Модель АРСС позволяет учесть асимметричность распределения волновых аппликат,
+т.е. генерировать морские волны, закон распределения аппликат которых имеет
+ненулевой эксцесс и асимметрию. Такой закон распределения характерен для реальных
+морских волн\nbsp{}cite:longuet1963nonlinear.
+
+Асимметричность волн моделируется с помощью нелинейного безынерционного
+преобразования (НБП) случайного процесса, однако, любое нелинейное
+преобразование случайного процесса приводит к преобразованию его АКФ. Для того
+чтобы подавить этот эффект, необходимо предварительно преобразовать АКФ, как
+показано в\nbsp{}cite:boukhanovsky1997thesis.
+
+**** Преобразование взволнованной поверхности.
+Формула \(z=f(y)\) преобразования взволнованной поверхности к необходимому
+одномерному закону распределения \(F(z)\) получается путем решения нелинейного
+трансцендентного уравнения \(F(z) = \Phi(y)\), где \(\Phi(y)\)\nbsp{}--- функция
+одномерного нормального закона распределения. Поскольку функция распределения
+аппликат морских волн часто задается некоторой аппроксимацией, основанной на
+натурных данных, то это уравнение целесообразно решать численно в каждой точке
+\(y_k|_{k=0}^N\) сетки сгенерированной поверхности относительно \(z_k\). Тогда
+уравнение запишется в виде
+\begin{equation}
+    \label{eq-distribution-transformation}
+    F(z_k)
+    =
+    \frac{1}{\sqrt{2\pi}}
+    \int\limits_{-\infty}^{y_k} \exp\left[ -\frac{t^2}{2} \right] dt
+    .
+\end{equation}
+Поскольку функции распределения монотонны, для решения этого уравнения
+используется простейший численный метод половинного деления (метод бисекции).
+
+**** Предварительное преобразование АКФ.
+Для преобразования АКФ \(\gamma_z\) процесса ее необходимо разложить в ряд по
+полиномам Эрмита (ряд Грама---Шарлье)
+\begin{equation*}
+    \gamma_z \left( \vec u \right)
+    =
+    \sum\limits_{m=0}^{\infty}
+    C_m^2 \frac{\gamma_y^m \left( \vec u \right)}{m!},
+\end{equation*}
+где
+\begin{equation*}
+    C_m = \frac{1}{\sqrt{2\pi}}
+  \int\limits_{-\infty}^\infty
+    f(y) H_m(y) \exp\left[ -\frac{y^2}{2} \right] dy,
+\end{equation*}
+\(H_m\)\nbsp{}--- полином Эрмита, а \(f(y)\)\nbsp{}--- решение уравнения
+eqref:eq-distribution-transformation. Воспользовавшись полиномиальной
+аппроксимацией \(f(y) \approx \sum\limits_i d_i y^i\) и аналитическими выражениями
+для полиномов Эрмита, формулу определения коэффициентов можно упростить,
+используя следующее равенство:
+\begin{equation*}
+    \frac{1}{\sqrt{2\pi}}
+    \int\limits_{-\infty}^\infty
+    y^k \exp\left[ -\frac{y^2}{2} \right] dy
+    =
+    \begin{cases}
+        (k-1)!! & \text{для четных }k,\\
+        0       & \text{для нечетных }k.
+    \end{cases}
+\end{equation*}
+Оптимальное количество коэффициентов \(C_m\) определяется путем вычисления их
+последовательно и критерий прекращения счета определяется совпадением дисперсий
+обоих полей с требуемой точностью \(\epsilon\):
+\begin{equation*}
+    \left| \Var{z} - \sum\limits_{k=0}^m
+    \frac{C_k^2}{k!} \right| \leq \epsilon.
+\end{equation*}
+
+В\nbsp{}cite:boukhanovsky1997thesis автор предлагает использовать полиномиальную
+аппроксимацию для \(f(y)\) также для преобразования поверхности, однако на
+практике в реализации взволнованной поверхности часто находятся точки,
+выпадающие за промежуток, на котором построена аппроксимация, что приводит к
+резкому уменьшению ее точности. В этих точках уравнение
+eqref:eq-distribution-transformation эффективнее решать методом бисекции.
+Использование полиномиальной аппроксимации в формулах для коэффициентов ряда
+Грама---Шарлье не приводит к аналогичным ошибкам.
+
+** Определение поля давлений под дискретно заданной взволнованной поверхностью
+Аналитические решения граничных задач для классических уравнений часто
+используются для исследования различных свойств уравнений, и для таких
+исследований запись формулы общего решения неудобна ввиду своей сложности и
+наличия интегралов от неизвестных функций. Одним из методов нахождения
+аналитических решений ДУЧП является метод Фурье. Основой метода служит
+преобразование Фурье, применение которого к любому ДУЧП позволяет свести его к
+алгебраическому, а его решение записывается как обратное преобразование Фурье от
+некоторой функции (которая может содержать преобразования Фурье от других
+функций). Поскольку эти преобразования не всегда можно записать аналитически, то
+вместо этого ищутся частные решения задачи и анализируется их поведение в
+различных областях. В то же время, вычисление дискретных преобразований Фурье на
+компьютере возможно для любой дискретно заданной функции и эффективно при
+использовании алгоритмов БПФ. Эти алгоритмы используют симметрию комплексных
+экспонент для понижения асимптотической сложности с \(\mathcal{O}(n^2)\) до
+\(\mathcal{O}(n\log_{2}n)\). Таким образом, даже если общее решение содержит
+преобразования Фурье от неизвестных функций, они все равно могут быть взяты
+численно, а использование алгоритмов БПФ делает этот подход эффективным.
+
+Альтернативным подходом является сведение ДУЧП к разностным уравнениям, решаемым с
+помощью построения различных численных схем. При этом решение получается
+приближенным, а асимптотическая сложность соответствующих алгоритмов сопоставима
+со сложностью алгоритма БПФ. Например, стационарное эллиптическое уравнение в
+частных производных преобразуется в неявную разностную схему, решаемую
+итерационным методом, на каждом шаге которого ищется решение трехдиагональной
+или пятидиагональной СЛАУ методом прогонки (алгоритм Томаса). Асимптотическая
+сложность алгоритма составляет \(\mathcal{O}({n}{m})\), где \(n\)\nbsp{}--- количество
+точек на сетке взволнованной поверхности, \(m\)\nbsp{}--- число итераций. Несмотря на
+широкое распространение, итеративные алгоритмы неэффективно отображаются на
+архитектуру параллельных машин; в частности, отображение на сопроцессоры может
+включать в себя копирование данных на сопроцессор и обратно на каждой итерации,
+что отрицательно сказывается на их производительности. В то же время, наличие
+большого количества преобразований Фурье в решении является скорее
+преимуществом, чем недостатком. Во-первых, решения, полученные с помощью метода
+Фурье, явные, а значит хорошо масштабируются на большое количество параллельно
+работающих вычислительных ядер с использованием простейших приемов параллельного
+программирования. Во-вторых, для алгоритмов БПФ существуют готовые
+оптимизированные реализации для различных архитектур процессоров и сопроцессоров
+(GPU, MIC). Эти преимущества обусловили выбор метода Фурье в качестве рабочего
+для получения явного аналитического решения задачи определения давлений под
+взволнованной морской поверхностью.
+
+*** Двухмерное поле скоростей
+:PROPERTIES:
+:CUSTOM_ID: sec:pressure-2d
+:END:
+**** Формула для жидкости бесконечной глубины.
+Задача Робена для уравнения Лапласа в двух измерениях записывается как
+\begin{align}
+    \label{eq-problem-2d}
+    & \phi_{xx}+\phi_{zz}=0,\\
+    & \zeta_t + \zeta_x\phi_x = \frac{\zeta_x}{\sqrt{1 + \zeta_x^2}} \phi_x - \phi_z, & \text{на }z=\zeta(x,t).\nonumber
+\end{align}
+Для ее решения воспользуемся методом Фурье. Возьмем преобразование Фурье от
+обеих частей уравнения Лапласа и получим
+\begin{equation*}
+    -4 \pi^2 \left( u^2 + v^2 \right)
+    \FourierY{\phi(x,z)}{u,v} = 0,
+\end{equation*}
+откуда имеем \(v = \pm i u\). Здесь и далее будет использоваться следующая
+симметричная форма преобразования Фурье:
+\begin{equation*}
+    \FourierY{f(x,y)}{u,v} =
+    \iint\limits_{-\infty}^{\phantom{--}\infty}
+    f(x,y)
+    e^{-2\pi i (x u + y v)}
+    dx dy.
+\end{equation*}
+Решение уравнения будем искать в виде обратного преобразования Фурье
+\(\phi(x,z)=\InverseFourierY{E(u,v)}{x,z}\). Подставляя[fn::Выражение \(v={-i}{u}\)
+не подходит в данной задаче, поскольку потенциал скорости должен стремиться к
+нулю с увеличением глубины до бесконечности.] \(v={i}{u}\) в формулу, решение
+перепишется как
+\begin{equation}
+    \label{eq-guessed-sol-2d}
+    \phi(x,z) = \InverseFourierY{e^{2\pi u z}E(u)}{x}.
+\end{equation}
+Для того чтобы подстановка \(z=\zeta(x,t)\) не помешала использованию
+преобразований Фурье в решении, перепишем eqref:eq-guessed-sol-2d в виде
+свертки:
+\begin{equation*}
+    \phi(x,z)
+    =
+    \Fun{z}
+    \ast
+    \InverseFourierY{E(u)}{x},
+\end{equation*}
+где \(\Fun{z}\)\nbsp{}--- некоторая функция, вид которой будет определен в
+[[#sec:compute-delta]] и для которой выполняется соотношение
+\(\FourierY{\Fun{z}}{u}=e^{2\pi{u}{z}}\). Подставляя выражение для \(\phi\) в
+граничное условие, получим
+\begin{equation*}
+    \zeta_t
+    =
+    \left( i f(x) - 1 \right)
+    \left[
+        \Fun{z}
+        \ast
+        \InverseFourierY{2\pi u E(u)}{x}
+    \right],
+\end{equation*}
+где \(f(x) = {\zeta_x}/{\sqrt{1 + \zeta_x^2}} - \zeta_x\). Применяя преобразование
+Фурье к обеим частям, получаем выражение для коэффициентов \(E\):
+\begin{equation*}
+    E(u) =
+    \frac{1}{2\pi u}
+    \frac{
+    \FourierY{\zeta_t / \left(i f(x) - 1\right)}{u}
+    }{
+    \FourierY{\Fun{z}}{u}
+    }
+\end{equation*}
+Выполняя подстановку \(z=\zeta(x,t)\) и подставляя полученное выражение в
+eqref:eq-guessed-sol-2d, получаем окончательное выражение для \(\phi(x,z)\):
+\begin{equation}
+    \label{eq-solution-2d}
+    \boxed{
+        \phi(x,z)
+        =
+        \InverseFourierY{
+            \frac{e^{2\pi u z}}{2\pi u}
+            \frac{
+            \FourierY{ \zeta_t / \left(i f(x) - 1\right) }{u}
+            }{
+            \FourierY{ \Fun{\zeta(x,t)} }{u}
+            }
+        }{x}.
+    }
+\end{equation}
+
+Множитель \(e^{2\pi u z}/(2\pi u)\) делает график функции, от которой берется
+обратное преобразование Фурье, несимметричным относительно оси \(OY\). Это
+затрудняет применение БПФ, поскольку оно требует периодичную функцию, которая на
+концах промежутка принимает нулевое значение. Использование численного
+интегрирования вместо БПФ не позволит получить преимущество над решением всей
+системы уравнений с помощью разностных схем. Эту проблему можно обойти,
+используя формулу eqref:eq-solution-2d-full для жидкости конечной глубины с
+заведомо большим значением глубины водоема \(h\). Вывод формулы дан в следующем
+разделе.
+
+**** Формула для жидкости конечной глубины.
+На дне водоема вертикальная составляющая скорости перемещения жидкости должна
+равняться нулю, т.е. \(\phi_z=0\) на \(z=-h\), где \(h\)\nbsp{}--- глубина водоема. В этом
+случае пренебречь равенством \(v = -i u\), полученным из уравнения Лапласа,
+нельзя, и решение ищется в виде
+\begin{equation}
+    \phi(x,z)
+    =
+    \InverseFourierY{
+        \left( C_1 e^{2\pi u z} + C_2 e^{-2\pi u z} \right)
+        E(u)
+    }{x}.
+    \label{eq-guessed-sol-2d-full}
+\end{equation}
+Подставляя \(\phi\) в условие на дне водоема, получим
+\begin{equation*}
+    C_1 e^{-2\pi u h} - C_2 e^{2\pi u h} = 0,
+\end{equation*}
+откуда имеем \(C_1=\frac{1}{2}C{e}^{2\pi{u}{h}}\) и
+\(C_2=-\frac{1}{2}C{e}^{-2\pi{u}{h}}\). Константа \(C\) здесь произвольна, поскольку
+при подстановке станет частью неизвестных коэффициентов \(E(u)\). Подставляя
+полученные выражения для \(C_1\) и \(C_2\) в eqref:eq-guessed-sol-2d-full, получаем
+выражение
+\begin{equation*}
+    \phi(x,z) = \InverseFourierY{ \Sinh{2\pi u (z+h)} E(u) }{x}.
+\end{equation*}
+Подставляя \(\phi\) в граничное условие на свободной поверхности, получаем
+\begin{equation*}
+    \zeta_t = f(x) \InverseFourierY{ 2\pi i u \Sinh{2\pi u (z+h)} E(u) }{x}
+            - \InverseFourierY{ 2\pi u \SinhX{2\pi u (z+h)} E(u) }{x}.
+\end{equation*}
+Здесь \(\sinh\) и \(\cosh\) дают схожие результаты вблизи свободной поверхности, и,
+поскольку эта область является наиболее интересной с точки зрения практического
+применения, положим \(\Sinh{2\pi{u}(z+h)}\approx\SinhX{2\pi{u}(z+h)}\). Выполняя
+аналогичные предыдущему разделу операции, получаем окончательное выражение для
+\(\phi(x,z)\):
+\begin{equation}
+\boxed{
+    \phi(x,z,t)
+    =
+  \InverseFourierY{
+        \frac{\Sinh{2\pi u (z+h)}}{2\pi u}
+        \frac{
+            \FourierY{ \zeta_t / \left(i f(x) - 1\right) }{u}
+        }{
+            \FourierY{ \FunSecond{\zeta(x,t)} }{u}
+        }
+    }{x},
+}
+    \label{eq-solution-2d-full}
+\end{equation}
+где \(\FunSecond{z}\)\nbsp{}--- некоторая функция, вид которой будет определен в
+[[#sec:compute-delta]] и для которой выполняется соотношение
+\(\FourierY{\FunSecond{z}}{u}=\Sinh{2\pi{u}{z}}\).
+
+**** Сведение к формулам линейной теории волн.
+Справедливость полученных формул проверим, подставив в качестве \(\zeta(x,t)\)
+известные аналитические выражения для плоских волн. Символьные вычисления
+преобразований Фурье в этом разделе производились с помощью пакета Mathematica\nbsp{}cite:mathematica10. В линейной теории широко используется предположение о
+малости амплитуд волн, что позволяет упростить исходную систему уравнений
+eqref:eq-problem-2d до
+\begin{align*}
+    & \phi_{xx}+\phi_{zz}=0,\\
+    & \zeta_t = -\phi_z & \text{на }z=\zeta(x,t),
+\end{align*}
+решение которой запишется как
+\begin{equation*}
+    \phi(x,z,t)
+    =
+    -\InverseFourierY{
+        \frac{e^{2\pi u z}}{2\pi u}
+        \FourierY{\zeta_t}{u}
+    }{x}
+    .
+\end{equation*}
+Профиль прогрессивной волны описывается формулой \(\zeta(x,t)=A\cos(2\pi(kx-t))\).
+Подстановка этого выражения в eqref:eq-solution-2d дает равенство
+\(\phi(x,z,t)=-\frac{A}{k}\sin(2\pi(kx-t))\Sinh{2\pi{k}{z}}\). Чтобы свести его к
+формуле линейной теории волн, представим гиперболический синус в
+экспоненциальной форме и отбросим член, содержащий \(e^{-2\pi{k}{z}}\), как
+противоречащий условию \(\phi\underset{z\rightarrow-\infty}{\longrightarrow}0\).
+После взятия действительной части выражения получится известная формула линейной
+теории \(\phi(x,z,t)=\frac{A}{k}e^{2\pi{k}{z}}\sin(2\pi(kx-t))\). Аналогично,
+предположение о малости амплитуд волн позволяет упростить формулу
+eqref:eq-solution-2d-full до
+\begin{equation*}
+    \phi(x,z,t)
+    =
+    -\InverseFourierY{
+        \frac{\Sinh{2\pi u (z+h)}}{2\pi u \Sinh{2\pi u h}}
+        \FourierY{\zeta_t}{u}
+    }{x}.
+\end{equation*}
+Подстановка формулы для прогрессивной плоской волны вместо \(\zeta(x,t)\) дает
+равенство
+\begin{equation}
+    \label{eq-solution-2d-linear}
+    \phi(x,z,t)=\frac{A}{k}
+    \frac{\Sinh{2 \pi k (z+h)}}{ \Sinh{2 \pi k h} }
+    \sin(2 \pi (k x-t)),
+\end{equation}
+что соответствует формуле линейной теории для конечной глубины.
+
+Различные записи решения уравнения Лапласа, в которых затухающая экспонента
+может встречаться как со знаком "+", так и со знаком "-", могут стать причиной
+разницы между формулами линейной теории и формулами, выведенными в данной работе,
+где вместо \(\sinh\) используется \(\cosh\). Выражение
+\(\frac{\Sinh{2\pi{k}(z+h)}}{\Sinh{2\pi{k}{h}}}\approx\frac{\sinh(2\pi{k}(z+h))}{\sinh(2\pi{k}{h})}\)
+превращается в строгое равенство на поверхности, и разница между правой и левой
+частями увеличивается при приближении к дну водоема (для достаточно большой
+глубины ошибка вблизи поверхности жидкости незначительна). Поэтому для
+достаточно большой глубины можно использовать любую из функций (\(\cosh\) или
+\(\sinh\)) для вычисления потенциала скорости вблизи взволнованной поверхности.
+
+Сведение формул eqref:eq-solution-2d и eqref:eq-solution-2d-full к формулам
+линейной теории волн показывает, что формула eqref:eq-solution-2d для жидкости
+бесконечной глубины не подходит для вычисления потенциала скорости с
+использованием метода Фурье, т.к. не обладает необходимой для преобразования
+Фурье симметрией. Однако, для такого случая можно использовать формулу для
+конечной глубины, полагая \(h\) равным характерному значению глубины исследуемого
+водоема. Для стоячих волн сведение к формулам линейной теории происходит с
+аналогичными предположениями.
+
+*** Трехмерное поле скоростей
+В трех измерениях исходная система уравнений eqref:eq-problem переписывается как
+\begin{align}
+    \label{eq-problem-3d}
+    & \phi_{xx} + \phi_{yy} + \phi_{zz} = 0,\\
+    & \zeta_t + \zeta_x\phi_x + \zeta_y\phi_y
+    =
+    \frac{\zeta_x}{\sqrt{1 + \zeta_x^2}} \phi_x
+    +\frac{\zeta_y}{\sqrt{\vphantom{\zeta_x^2}\smash[b]{1 + \zeta_y^2}}} \phi_y
+    - \phi_z, & \text{на }z=\zeta(x,y,t).\nonumber
+\end{align}
+Для ее решения также воспользуемся методом Фурье. Возьмем преобразование Фурье
+от обеих частей уравнения Лапласа и получим
+\begin{equation*}
+    -4 \pi^2 \left( u^2 + v^2 + w^2 \right)
+    \FourierY{\phi(x,y,z)}{u,v,w} = 0,
+\end{equation*}
+откуда имеем \(w=\pm{i}\sqrt{u^2+v^2}\). Решение уравнения будем искать в виде
+обратного преобразования Фурье \(\phi(x,y,z)=\InverseFourierY{E(u,v,w)}{x,y,z}\).
+Применяя полученное равенство, получаем
+\begin{equation*}
+    \phi(x,y,z) = \InverseFourierY{
+        \left(
+            C_1 e^{2\pi \sqrt{u^2+v^2} z}
+            -C_2 e^{-2\pi \sqrt{u^2+v^2} z}
+        \right)
+        E(u,v)
+    }{x,y}.
+\end{equation*}
+Подставляя \(\phi\) в условие на дне водоема аналогично двухмерному случаю,
+получаем
+\begin{equation}
+    \label{eq-guessed-sol-3d}
+    \phi(x,y,z) = \InverseFourierY{
+        \Sinh{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)
+    }{x,y}.
+\end{equation}
+Подставляя выражение для \(\phi\) в граничное условие, получим
+\begin{equation*}
+    \arraycolsep=1.4pt
+    \begin{array}{rl}
+        \zeta_t = & i f_1(x,y) \InverseFourierY{2 \pi u \Sinh{2\pi \sqrt{u^2+v^2} (z+h)}E(u,v)}{x,y} \\
+        + & i f_2(x,y) \InverseFourierY{2 \pi v \Sinh{2\pi \sqrt{u^2+v^2} (z+h)}E(u,v)}{x,y} \\
+        - & \InverseFourierY{2 \pi \sqrt{u^2+v^2} \Sinh{2\pi \sqrt{u^2+v^2} (z+h)}E(u,v)}{x,y}
+    \end{array}
+\end{equation*}
+где \(f_1(x,y)={\zeta_x}/{\sqrt{1+\zeta_x^2}}-\zeta_x\) и
+\(f_2(x,y)={\zeta_y}/{\sqrt{\vphantom{\zeta_x^2}\smash[b]{1+\zeta_y^2}}}-\zeta_y\).
+Применяя преобразование Фурье к обеим частям, получаем выражение для
+коэффициентов \(E\):
+\begin{equation*}
+    \arraycolsep=1.4pt
+    \begin{array}{rl}
+        \FourierY{\zeta_t}{u,v} = &
+        \FourierY{i f_1(x,y) \InverseFourierY{2 \pi u \Sinh{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)}{x,y}}{u,v}  \\
+        + & \FourierY{i f_2(x,y) \InverseFourierY{2 \pi v \Sinh{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)}{x,y}}{u,v}  \\
+        - & 2 \pi \sqrt{u^2+v^2} \Sinh{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)
+    \end{array}
+\end{equation*}
+Окончательное решение получается при подстановке выражения для \(E(u,v)\)
+в eqref:eq-guessed-sol-3d.
+
+* Численные методы и результаты экспериментов
+** Форма АКФ для разных волновых профилей
+**** Аналитический метод.
+Прямой способ нахождения АКФ, соответствующей заданному профилю морской волны,
+состоит в применении теоремы Винера---Хинчина. Согласно этой теореме
+автокорреляционная функция \(K\) функции \(\zeta\) равна преобразованию Фурье от
+квадрата модуля этой функции:
+\begin{equation}
+  K(t) = \Fourier{\left| \zeta(t) \right|^2}.
+  \label{eq-wiener-khinchin}
+\end{equation}
+Если заменить \(\zeta\) на формулу для волнового профиля, то это выражение даст
+аналитическую формулу для соответствующей АКФ.
+
+Для трехмерного волнового профиля (два пространственных и одно временное
+измерение) аналитическая формула представляет собой многочлен высокой степени, и
+ее лучше всего вычислять с помощью программы для символьных вычислений. Затем,
+для практического применения она может быть аппроксимирована суперпозицией
+экспоненциально затухающих косинусов (именно так выглядит АКФ стационарного
+процесса АРСС\nbsp{}cite:box1976time).
+
+**** Эмпирический метод.
+Впрочем, для трехмерного случая существует более простой эмпирический метод
+нахождения формы АКФ, не требующий использования сложного программного
+обеспечения. Известно, что АКФ, представляющая собой суперпозицию
+экспоненциально затухающих косинусов, является решением уравнения Стокса для
+гравитационных волн\nbsp{}cite:boccotti1983wind. Значит, если в моделируемом морском
+волнении важна только форма волны, а не точные ее характеристики, то заданный
+волновой профиль можно просто домножить на затухающую экспоненту, чтобы получить
+подходящую АКФ. Эта АКФ не отражает параметры волн, такие как высота и период,
+зато это открывает возможность моделировать волны определенных неаналитических
+форм, "рисуя" профиль волны, домножая его на экспоненту и используя
+результирующую функцию в качестве АКФ. Таким образом, эмпирический метод
+неточен, но более простой по сравнению с применением теоремы Винера---Хинчина;
+он, в основном, полезен для тестирования модели АРСС.
+
+**** АКФ стоячей волны.
+Профиль трехмерной плоской стоячей волны задается как
+\begin{equation}
+  \zeta(t, x, y) = A \sin (k_x x + k_y y) \sin (\sigma t).
+  \label{eq-standing-wave}
+\end{equation}
+Найдем АКФ с помощью аналитического метода. Домножив формулу на затухающую
+экспоненту (поскольку преобразование Фурье определено для функции \(f\), для
+которой справедливо \(f\underset{x\rightarrow\pm\infty}{\longrightarrow}0\)),
+получим
+\begin{equation}
+  \zeta(t, x, y) =
+  A
+  \exp\left[-\alpha (|t|+|x|+|y|) \right]
+  \sin (k_x x + k_y y) \sin (\sigma t).
+  \label{eq-decaying-standing-wave}
+\end{equation}
+Затем, применяя трехмерное преобразование Фурье к обеим частям уравнения с
+помощью программы для символьных вычислений, получим многочлен высокой степени,
+который аппроксимируем выражением
+\begin{equation}
+  K(t,x,y) =
+  \gamma
+  \exp\left[-\alpha (|t|+|x|+|y|) \right]
+  \cos \beta t
+  \cos \left[ \beta x + \beta y \right].
+  \label{eq-standing-wave-acf}
+\end{equation}
+Таким образом, после применения теоремы Винера---Хинчина получаем исходную
+формулу, но с косинусами вместо синусов. Это различие важно, поскольку значение
+АКФ в точке \((0,0,0)\) равно дисперсии процесса АРСС, которое при использовании
+синусов было бы неверным.
+
+Если попытаться получить ту же самую формулу с помощью эмпирического метода, то
+выражение eqref:eq-decaying-standing-wave необходимо адаптировать для
+соответствия eqref:eq-standing-wave-acf. Это можно осуществить либо изменением
+фазы синуса, либо заменой синуса на косинус, чтобы сдвинуть максимум функции в
+начало координат.
+
+**** АКФ прогрессивной волны.
+Профиль трехмерной плоской прогрессивной волны задается как
+\begin{equation}
+  \zeta(t, x, y) = A \cos (\sigma t + k_x x + k_y y).
+  \label{eq-propagating-wave}
+\end{equation}
+Для аналитического метода повторение шагов из предыдущих двух параграфов дает
+\begin{equation}
+  K(t,x,y) =
+  \gamma
+  \exp\left[-\alpha (|t|+|x|+|y|) \right]
+  \cos\left[\beta (t+x+y) \right].
+  \label{eq-propagating-wave-acf}
+\end{equation}
+Для эмпирического метода профиль волны можно просто домножить на затухающую
+экспоненту, не изменяя положение максимума АКФ (как это требовалось для стоячей
+волны).
+
+**** Сравнение изученных методов.
+Итого, аналитический метод нахождения АКФ морских волн сводится к следующим
+шагам.
+- Обеспечить затухание выражения для профиля волны на \(\pm\infty\), домножив его
+  на затухающую экспоненту.
+- Взять преобразование Фурье от квадрата модуля получившегося профиля,
+  воспользовавшись программой для символьных вычислений.
+- Аппроксимировать получившийся многочлен подходящим выражением для АКФ.
+
+Два примера этого раздела показывают, что затухающие профили стоячих и
+прогрессивных волн схожи по форме с соответствующими АКФ с тем лишь различием,
+что максимум АКФ должен быть перенесен в начало координат, чтобы сохранить
+дисперсию моделируемого процесса. Применение эмпирического метода нахождения АКФ
+сводится к следующим шагам.
+- Обеспечить затухание выражения для профиля волны на \(\pm\infty\), домножив его
+  на затухающую экспоненту.
+- Перенести максимум получившейся функции в начало координат, используя свойства
+  тригонометрических функций для сдвига фазы.
+
+** Дополнительные формулы, методы и алгоритмы для модели АРСС
+:PROPERTIES:
+:CUSTOM_ID: sec:arma-algorithms
+:END:
+*** Аппроксимация распределения аппликат
+Одним из параметров генератора взволнованной морской поверхности служит функция
+плотности распределения (ФПР) аппликат этой поверхности. Она задается либо
+полиномиальной аппроксимацией натурных данных, либо аналитически.
+
+**** Разложение в ряд Грама---Шарлье.
+В\nbsp{}cite:huang1980experimental было экспериментально показано, что распределение
+аппликат морской поверхности отличается от нормального ненулевым эксцессом и
+асимметрией. В\nbsp{}cite:рожков1996теория показано, что такое распределение
+раскладывается в ряд Грама---Шарлье:
+\begin{align}
+    \label{eq-skew-normal-1}
+    F(z; \gamma_1, \gamma_2) & = \phi(z)
+        - \gamma_1 \frac{\phi'''(z)}{3!}
+        + \gamma_2 \frac{\phi''''(z)}{4!} \nonumber \\
+    & =
+    \frac{1}{2} \text{erf}\left[\frac{z}{\sqrt{2}}\right]
+    -
+    \frac{e^{-\frac{z^2}{2}}}{\sqrt{2\pi}}
+    \left[
+        \frac{1}{6} \gamma_1 \left(z^2-1\right)
+        + \frac{1}{24} \gamma_2 z \left(z^2-3\right)
+    \right]
+    ,\nonumber \\
+    f(z; \gamma_1, \gamma_2) & =
+    \frac{e^{-\frac{z^2}{2}}}{\sqrt{2 \pi }}
+    \left[
+        \frac{1}{6} \gamma_1 z \left(z^2-3\right)
+        + \frac{1}{24} \gamma_2 \left(z^4-6z^2+3\right)
+        +1
+    \right],
+\end{align}
+где \(\phi(z)=\frac{1}{2}\mathrm{erf}(z/\sqrt{2})\), \(\gamma_1\)\nbsp{}--- асимметрия,
+\(\gamma_2\)\nbsp{}--- эксцесс, \(f\)\nbsp{}--- ФПР, \(F\)\nbsp{}--- функция распределения (ФР).
+Согласно\nbsp{}cite:рожков1990вероятностные для аппликат морских волн значение
+асимметрии выбирается на интервале \(0,1\leq\gamma_1\leq{0,52}\), а значение
+эксцесса на интервале \(0,1\leq\gamma_2\leq{0,7}\). Семейство плотностей
+распределения при различных параметрах показано на рис.\nbsp{}[[fig-skew-normal-1]].
+
+#+name: fig-skew-normal-1
+#+begin_src R :file build/skew-normal-1-ru.pdf
+source(file.path("R", "common.R"))
+# Points at which the density curves are evaluated.
+z_values <- seq(-3, 3, length.out=100)
+# One row per curve: a skewness/kurtosis pair plus the line type shown in the
+# legend for that curve.
+curves <- data.frame(
+  skewness = c(0.00, 0.52, 0.00, 0.52),
+  kurtosis = c(0.00, 0.00, 0.70, 0.70),
+  linetypes = c("solid", "dashed", "dotdash", "dotted")
+)
+arma.skew_normal_1_plot(z_values, curves)
+# Build one plotmath legend entry per curve, with the numeric values
+# substituted in through bquote's .() operator.
+legend_labels <- mapply(
+  function (skew, kurt) {
+    as.expression(bquote(list(
+      gamma[1] == .(arma.fmt(skew, 2)),
+      gamma[2] == .(arma.fmt(kurt, 2))
+    )))
+  },
+  curves$skewness,
+  curves$kurtosis
+)
+legend("topleft", legend_labels, lty = paste(curves$linetypes))
+#+end_src
+
+#+caption: Вид плотности распределения eqref:eq-skew-normal-1 аппликат взволнованной морской поверхности при различных значениях асимметрии \(\gamma_1\) и эксцесса \(\gamma_2\).
+#+label: fig-skew-normal-1
+#+RESULTS: fig-skew-normal-1
+[[file:build/skew-normal-1-ru.pdf]]
+
+**** Асимметричное нормальное распределение.
+Альтернативной аппроксимацией распределения волновых аппликат служит формула
+асимметричного нормального распределения:
+\begin{align}
+    \label{eq-skew-normal-2}
+    F(z; \alpha) & = \frac{1}{2}
+   \mathrm{erfc}\left[-\frac{z}{\sqrt{2}}\right]-2 T(z,\alpha ), \nonumber \\
+    f(z; \alpha) & = \frac{e^{-\frac{z^2}{2}}}{\sqrt{2 \pi }}
+   \mathrm{erfc}\left[-\frac{\alpha z}{\sqrt{2}}\right],
+\end{align}
+где \(T\)\nbsp{}--- функция Оуэна\nbsp{}cite:owen1956tables. Эта формула не позволяет задать
+значения асимметрии и эксцесса по отдельности\nbsp{}--- оба значения регулируются
+параметром \(\alpha\). Преимущество данной формулы лишь в относительной простоте
+вычисления: эта функция встроена в некоторые программы и библиотеки
+математических функций. График функции для разных значений \(\alpha\) представлен
+на рис.\nbsp{}[[fig-skew-normal-2]].
+
+#+name: fig-skew-normal-2
+#+begin_src R :file build/skew-normal-2-ru.pdf
+source(file.path("R", "common.R"))
+# Points at which the density curves are evaluated.
+z_values <- seq(-3, 3, length.out=100)
+# Values of the alpha parameter, one per curve.
+shape <- c(0.00, 0.87, 2.25, 4.90)
+# One row per curve: alpha, the skewness and kurtosis returned for it by the
+# arma.bits helpers, and the line type shown in the legend.
+curves <- data.frame(
+  alpha = shape,
+  skewness = arma.bits.skewness_2(shape),
+  kurtosis = arma.bits.kurtosis_2(shape),
+  linetypes = c("solid", "dashed", "dotdash", "dotted")
+)
+arma.skew_normal_2_plot(z_values, curves)
+# Build one plotmath legend entry per curve, with the numeric values
+# substituted in through bquote's .() operator.
+legend_labels <- mapply(
+  function (a, skew, kurt) {
+    as.expression(bquote(list(
+      alpha == .(arma.fmt(a, 2)),
+      gamma[1] == .(arma.fmt(skew, 2)),
+      gamma[2] == .(arma.fmt(kurt, 2))
+    )))
+  },
+  curves$alpha,
+  curves$skewness,
+  curves$kurtosis
+)
+legend("topleft", legend_labels, lty = paste(curves$linetypes))
+#+end_src
+
+#+caption: Вид плотности распределения eqref:eq-skew-normal-2 волновых аппликат при различных значениях коэффициента асимметрии \(\alpha\).
+#+label: fig-skew-normal-2
+#+RESULTS: fig-skew-normal-2
+[[file:build/skew-normal-2-ru.pdf]]
+
+**** Тестирование.
+Решение уравнения eqref:eq-distribution-transformation с выбранной функцией
+распределения можно произвести либо в каждой точке генерируемой поверхности, что
+даст наиболее точные результаты, либо в каждой точке фиксированной сетки,
+интерполировав решение методом наименьших квадратов (МНК). Во втором случае
+точность будет меньше. Например, интерполяция многочленом 12-го порядка на сетке
+из 500 узлов, построенной на промежутке \(-5\sigma_z\leq{z}\leq{5}\sigma_z\), дает
+погрешность \(\approx{0,43}\cdot10^{-3}\). Увеличение порядка многочлена приводит
+либо к переполнениям при интерполяции МНК, либо к дополнительным коэффициентам
+близким к нулю; увеличение размера сетки влияет на результат незначительно. В
+большинстве случаев трех коэффициентов ряда Грама---Шарлье было достаточно для
+преобразования АКФ; относительная погрешность без интерполяции составляет
+\(10^{-5}\).
+
+*** Алгоритм генерации белого шума
+Чтобы исключить периодичность из сгенерированной моделью ветрового волнения
+реализации взволнованной поверхности, для генерации белого шума нужно
+использовать ГПСЧ с достаточно большим периодом. В качестве такого генератора в
+работе используется параллельная реализация вихря Мерсенна\nbsp{}cite:matsumoto1998mersenne с периодом \(2^{19937}-1\). Это позволяет создавать
+апериодичные реализации взволнованной морской поверхности для любых сценариев
+применения, встречаемых на практике.
+
+Запуск нескольких ГПСЧ с разными начальными состояниями в параллельных потоках
+не гарантирует некоррелированность генерируемых последовательностей
+псевдослучайных чисел, однако, можно воспользоваться алгоритмом динамического
+создания вихрей Мерсенна\nbsp{}cite:matsumoto1998dynamic, чтобы дать такую гарантию.
+Суть алгоритма заключается в поиске таких матриц начальных состояний
+генераторов, которые бы дали максимально некоррелированные последовательности
+псевдослучайных чисел при параллельном запуске нескольких вихрей Мерсенна с
+этими начальными состояниями. Поскольку на поиск начальных состояний можно
+потратить значительное количество процессорного времени, то вектор состояний
+создается предварительно для заведомо большего количества параллельных потоков и
+сохраняется в файл, который впоследствии считывается основной программой перед
+началом генерации белого шума.
+
+*** Алгоритм генерации взволнованной поверхности
+В модели АРСС значение подъема взволнованной поверхности в каждой точке зависит
+от предыдущих по пространству и времени значений, из-за чего в начале реализации
+образуется так называемый /интервал разгона/ (см.\nbsp{}рис.\nbsp{}[[fig-ramp-up-interval]])\nbsp{}---
+промежуток, на котором реализация не соответствует заданной АКФ. Способ решения
+этой проблемы зависит от контекста, в котором происходит моделирование.
+
+Если реализация используется в контексте расчета остойчивости судна без учета
+маневрирования, то интервал никак не повлияет на результаты эксперимента, поскольку
+находится на границе (далеко от исследуемого морского объекта). Если изучается
+остойчивость судна в условиях маневрирования, то интервал проще всего исключить
+из реализации (размер интервала примерно равен числу коэффициентов АР по каждому
+из измерений). Однако, это приводит к потере большого числа точек, поскольку
+исключение происходит по каждому из трех измерений. Альтернативным подходом
+является генерация взволнованной поверхности на интервале разгона моделью ЛХ и
+генерация остальной реализации с помощью модели АРСС.
+
+В алгоритме генерации взволнованной поверхности используется параллелизм по
+данным: реализация делится на равные части, каждая из которых генерируется
+независимо,\nbsp{}--- однако, в начале каждой из частей также присутствует
+интервал разгона. Для его исключения используется метод /сшивания/, часто
+применяемый в обработке цифровых
+сигналов\nbsp{}cite:oppenheim1989discrete,svoboda2011efficient,pavel2013algorithms.
+Суть метода заключается в добавлении интервала равного по размеру интервалу
+разгона в конец каждой из частей. Затем взволнованная поверхность генерируется в
+каждой точке каждой из частей (включая добавленный интервал), интервал в конце
+части \(N\) накладывается на интервал разгона в начале части \(N+1\), и значения
+в соответствующих точках складываются.
+
+#+name: fig-ramp-up-interval
+#+begin_src R :file build/ramp-up-interval-ru.pdf
+source(file.path("R", "common.R"))
+# Text passed to the plotting helper as its `label` argument.
+ramp_up_label <- "Интервал разгона"
+arma.plot_ramp_up_interval(label=ramp_up_label)
+#+end_src
+
+#+caption: Интервал разгона в начале оси \(OX\) реализации.
+#+label: fig-ramp-up-interval
+#+RESULTS: fig-ramp-up-interval
+[[file:build/ramp-up-interval-ru.pdf]]
+
+*** Формулы нормировки для потенциалов скоростей
+:PROPERTIES:
+:CUSTOM_ID: sec:compute-delta
+:END:
+
+В решениях eqref:eq-solution-2d и eqref:eq-solution-2d-full двухмерной задачи
+определения поля давлений присутствуют функции
+\(\Fun{z}=\InverseFourierY{e^{2\pi{u}{z}}}{x}\) и
+\(\FunSecond{z}=\InverseFourierY{\Sinh{2\pi{u}{z}}}{x}\), которые могут быть
+записаны аналитически различными выражениями и представляют сложность при
+вычислении на компьютере. Каждая функция\nbsp{}--- это преобразование Фурье от
+линейной комбинации экспонент, которое сводится к плохо определенной дельта
+функции комплексного аргумента (см.\nbsp{}табл.\nbsp{}[[tab-delta-functions]]).
+Обычно такого типа функции записывают как произведение дельта функций от
+действительной и мнимой части, однако, такой подход не работает здесь, поскольку
+взятие обратного преобразования Фурье не даст экспоненту, что сильно исказит
+результирующее поле скоростей. Для получения однозначного аналитического
+выражения можно воспользоваться нормировкой \(1/\Sinh{2\pi{u}{h}}\) (которая
+также включается в выражение для коэффициентов \(E(u)\)). Численные эксперименты
+показывают, что нормировка хоть и позволяет получить адекватное поле скоростей,
+но оно мало отличается от выражений из линейной теории волн, в которых члены с
+\(\zeta\) опускаются.
+
+#+name: tab-delta-functions
+#+caption: Формулы для вычисления \(\Fun{z}\) и \(\FunSecond{z}\) из [[#sec:pressure-2d]], использующие нормировку для исключения неоднозначности определения дельта функции комплексного аргумента.
+#+attr_latex: :booktabs t
+| Функция           | Без нормировки                                               | С нормировкой                                                                                                                          |
+|-------------------+--------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------|
+| \(\Fun{z}\)       | \(\delta (x+i z)\)                                           | \(\frac{1}{2 h}\mathrm{sech}\left(\frac{\pi  (x-i (h+z))}{2 h}\right)\)                                                                |
+| \(\FunSecond{z}\) | \(\frac{1}{2}\left[\delta (x-i z) + \delta (x+i z) \right]\) | \(\frac{1}{4 h}\left[\text{sech}\left(\frac{\pi  (x-i (h+z))}{2 h}\right)+\text{sech}\left(\frac{\pi  (x+i(h+z))}{2 h}\right)\right]\) |
+
+** Верификация модели АРСС
+:PROPERTIES:
+:CUSTOM_ID: sec:verification
+:END:
+
+Для модели АР в
+работах\nbsp{}cite:degtyarev2011modelling,degtyarev2013synoptic,boukhanovsky1997thesis
+экспериментальным путем были верифицированы
+- распределения различных характеристик волн (высоты волн, длины волн, длины
+  гребней, период волн, уклон волн, показатель трехмерности),
+- дисперсионное соотношение,
+- сохранение интегральных характеристик для случая смешанного волнения.
+В данной работе верифицируются как модель АР, так и СС путем сравнения
+распределений различных характеристик волн.
+
+*** Верификация интегральных характеристик взволнованной поверхности
+В\nbsp{}cite:рожков1990вероятностные авторы показывают, что некоторые характеристики
+морских волн (перечисленные в табл.\nbsp{}[[tab-weibull-shape]]) имеют распределение
+Вейбулла, а подъем взволнованной поверхности\nbsp{}--- нормальное распределение. Для
+верификации генерируемых моделями АР и СС реализаций используются спрямленные
+диаграммы (графики, в которых по оси \(OX\) откладываются квантили функции
+распределения, вычисленные аналитически, а по оси \(OY\)\nbsp{}--- вычисленные
+экспериментально). Если экспериментально полученное распределение соответствует
+аналитическому, то график представляет собой прямую линию. Концы графика могут
+отклоняться от прямой линии, поскольку не могут быть надежно получены из
+реализации конечной длины. Различные методы извлечения волн из реализации также
+могут привести к вариациям на концах графиков: извлечь каждую волну из
+реализации практически невозможно, поскольку волны могут накладываться (и часто
+накладываются) друг на друга.
+
+#+name: tab-weibull-shape
+#+caption: Значение коэффициента формы \(k\) распределения Вейбулла для различных характеристик волн.
+#+attr_latex: :booktabs t
+| Характеристика          | Коэффициент формы \(k\) |
+|-------------------------+-----------------------|
+| Высота волны            |                     2 |
+| Длина волны             |                   2,3 |
+| Длина гребня волны      |                   2,3 |
+| Период волны            |                     3 |
+| Уклон волны             |                   2,5 |
+| Показатель трехмерности |                   2,5 |
+
+Верификация производится для стоячих и прогрессивных волн. Соответствующие АКФ и
+спрямленные диаграммы распределений характеристик волн представлены на рис.
+[[acf-slices]], [[standing-wave-distributions]], [[propagating-wave-distributions]].
+
+#+name: propagating-wave-distributions
+#+begin_src R :file build/propagating-wave-qqplots-ru.pdf
+source(file.path("R", "common.R"))
+# 2x2 grid of panels, each with a square plotting region.
+par(mfrow=c(2, 2), pty="s")
+# Characteristic identifiers and the matching Russian labels, in the same order.
+characteristics <- c("elevation", "heights_y", "lengths_y", "periods")
+panel_titles <- c("подъем", "высота по Y", "длина по Y", "период")
+arma.qqplot_grid(
+  file.path("build", "propagating_wave"),
+  characteristics,
+  panel_titles,
+  xlab="x",
+  ylab="y"
+)
+#+end_src
+
+#+caption: Спрямленные диаграммы для прогрессивных волн.
+#+label: propagating-wave-distributions
+#+RESULTS: propagating-wave-distributions
+[[file:build/propagating-wave-qqplots-ru.pdf]]
+
+#+name: standing-wave-distributions
+#+begin_src R :file build/standing-wave-qqplots-ru.pdf
+source(file.path("R", "common.R"))
+# 2x2 grid of panels, each with a square plotting region.
+par(mfrow=c(2, 2), pty="s")
+# Characteristic identifiers and the matching Russian labels, in the same order.
+characteristics <- c("elevation", "heights_y", "lengths_y", "periods")
+panel_titles <- c("подъем", "высота по Y", "длина по Y", "период")
+arma.qqplot_grid(
+  file.path("build", "standing_wave"),
+  characteristics,
+  panel_titles,
+  xlab="x",
+  ylab="y"
+)
+#+end_src
+
+#+caption: Спрямленные диаграммы для стоячих волн.
+#+label: standing-wave-distributions
+#+RESULTS: standing-wave-distributions
+[[file:build/standing-wave-qqplots-ru.pdf]]
+
+#+name: acf-slices
+#+header: :width 6 :height 9
+#+begin_src R :file build/acf-slices-ru.pdf
+source(file.path("R", "common.R"))
+# ACF tables read from the build directory, one per wave type.
+standing_acf <- read.csv(file.path("build", "standing_wave", "acf.csv"))
+propagating_acf <- read.csv(file.path("build", "propagating_wave", "acf.csv"))
+# 5x2 grid of panels with zero margins; mfrow fills row by row, so each
+# iteration below draws one row: standing wave first, propagating wave second.
+par(mar=c(0,0,0,0), mfrow=c(5, 2))
+for (slice in 0:4) {
+  arma.wavy_plot(standing_acf, slice, zlim=c(-5,5))
+  arma.wavy_plot(propagating_acf, slice, zlim=c(-5,5))
+}
+#+end_src
+
+#+caption: Временные срезы АКФ для стоячих (слева) и прогрессивных (справа) волн.
+#+label: acf-slices
+#+RESULTS: acf-slices
+[[file:build/acf-slices-ru.pdf]]
+
+Хвосты распределений на рис.\nbsp{}[[propagating-wave-distributions]] отклоняются от
+оригинального распределения для характеристик отдельных волн, поскольку каждую
+волну необходимо извлечь из полученной взволнованной поверхности, чтобы измерить
+ее длину, период и высоту. Алгоритм, который бы гарантировал безошибочное
+извлечение всех волн, неизвестен, поскольку волны могут накладываться (и часто накладываются)
+друг на друга. Правый хвост распределения Вейбулла отклоняется больше, поскольку
+он представляет редко возникающие волны.
+
+Степень соответствия для стоячих волн (рис.\nbsp{}[[standing-wave-distributions]])
+ниже для высот и длин, примерно одинакова для подъема поверхности и выше для
+периодов волн. Более низкая степень соответствия длин и высот может быть
+результатом того, что распределения были получены эмпирически для морских волн,
+которые, в основном, являются прогрессивными, и аналогичные распределения для
+стоячих волн могут отличаться. Более высокая степень соответствия периодов волн
+является следствием того, что периоды стоячих волн извлекаются более точно,
+поскольку волны не перемещаются вне моделируемой области взволнованной
+поверхности. Одинаковая степень соответствия для подъема поверхности получается
+из-за того, что это характеристика поверхности (и соответствующего процесса АР
+или СС), и она не зависит от типа волн.
+
+*** Верификация полей потенциалов скоростей
+:PROPERTIES:
+:CUSTOM_ID: sec:compare-formulae
+:END:
+
+Сравнение полученных общих формул eqref:eq-solution-2d и
+eqref:eq-solution-2d-full с известными формулами линейной теории волн позволяет
+оценить различие между полями скоростей для волн как больших, так и малых
+амплитуд. В общем случае аналитическое выражение для потенциала скорости
+неизвестно даже для плоских волн, поэтому сравнение производится численно. Имея
+в виду выводы раздела [[#sec:pressure-2d]], сравниваются только формулы для случая
+конечной глубины.
+
+**** Отличие от формул линейной теории волн.
+Для того чтобы получить поля потенциалов скоростей, взволнованная морская
+поверхность генерировалась с помощью модели АР с варьированием амплитуды волн. В
+численной реализации волновые числа в преобразованиях Фурье выбирались на
+интервале от \(0\) до максимального волнового числа, определяемого численно из
+полученной взволнованной поверхности. Эксперименты проводились для волн малых и
+больших амплитуд.
+
+Эксперимент показал, что поля потенциалов скоростей, полученные по формуле
+eqref:eq-solution-2d-full для конечной глубины и по формуле
+eqref:eq-solution-2d-linear линейной теории, качественно отличаются
+(см.\nbsp{}рис.\nbsp{}[[fig-potential-field-nonlinear]]). Во-первых, контуры
+потенциала скорости имеют вид затухающей синусоиды, что отличается от овальной
+формы, описываемой линейной теорией волн. Во-вторых, по мере приближения к дну
+водоема потенциал скорости затухает гораздо быстрее, чем в линейной теории, а
+область, где сконцентрирована большая часть энергии волны, еще больше приближена
+к ее гребню. Аналогичный численный эксперимент, в котором из формулы
+eqref:eq-solution-2d-full были исключены члены, которыми пренебрегают в рамках
+линейной теории волн, показал полное соответствие получившихся полей
+потенциалов скоростей (насколько это позволяет сделать машинная точность).
+
+#+name: fig-potential-field-nonlinear
+#+caption: Поле потенциала скорости прогрессивной волны \(\zeta(x,y,t) = \cos(2\pi x - t/2)\). Поле, полученное по формуле eqref:eq-solution-2d-full (сверху) и по формуле линейной теории волн (снизу).
+#+begin_figure
+#+attr_latex: :width 0.47\textwidth
+[[file:graphics/pressure/potential-5.eps]]
+#+attr_latex: :width 0.47\textwidth
+[[file:graphics/pressure/potential-6.eps]]
+#+end_figure
+
+**** Отличие от формул теории волн малой амплитуды.
+Эксперимент, в котором сравнивались поля потенциалов скоростей, полученные
+численно различными формулами, показал, что поля скоростей, полученные по формуле
+eqref:eq-solution-2d-full и формуле для волн малой амплитуды
+eqref:eq-old-sol-2d, сопоставимы для волн малых амплитуд. В этом эксперименте
+использовались две реализации взволнованной морской поверхности, полученные по
+модели АР: одна содержала волны малой амплитуды, другая\nbsp{}--- большой.
+Интегрирование в формуле eqref:eq-solution-2d-full велось по диапазону волновых
+чисел, полученному из морской поверхности. Для волн малой амплитуды обе формулы
+показали сопоставимые результаты (разница в значениях скорости приписывается
+стохастической природе модели АР), в то время как для волн больших амплитуд
+устойчивое поле скоростей дала только формула eqref:eq-solution-2d-full
+(рис.\nbsp{}[[fig-velocity-field-2d]]). Таким образом, общая формула
+eqref:eq-solution-2d-full показывает удовлетворительные результаты, не вводя
+ограничения на амплитуду волн.
+
+#+name: fig-velocity-field-2d
+#+caption: Сравнение полей скоростей на поверхности моря, полученных по общей формуле (\(u_1\)) и формуле для волн малой амплитуды (\(u_2\)). Поле скоростей для поверхности волн малой амплитуды (сверху) и большой амплитуды (снизу).
+#+begin_figure
+[[file:build/low-amp-nocolor.eps]]
+[[file:build/high-amp-nocolor.eps]]
+#+end_figure
+
+*** Нефизическая природа модели
+Благодаря своей нефизической природе модель АРСС не включает в себя понятие
+морской волны; вместо этого она моделирует взволнованную поверхность как единое
+целое. Движения отдельных волн и их форма часто получаются грубыми, а точное
+количество генерируемых волн неизвестно. Несмотря на это, интегральные
+характеристики взволнованной поверхности соответствуют реальным морским волнам.
+
+Теоретически, профили самих морских волн могут быть использованы в качестве АКФ,
+если предварительно обеспечить их экспоненциальное затухание. Это может
+позволить генерировать волны произвольных профилей и является одной из тем
+дальнейших исследований.
+
+* Высокопроизводительный программный комплекс для моделирования морского волнения
+** Модель вычислений
+**** Отображение алгоритма генерации взволнованной поверхности на вычислительную модель.
+Модель АРСС реализована в программном комплексе, работающем по принципу
+вычислительного конвейера, в котором каждое звено применяет некоторую функцию к
+выходным данным предыдущего звена. Звенья конвейера распределяются по узлам
+вычислительного кластера, чтобы сделать возможным параллелизм по операциям, а
+затем данные, перемещающиеся между звеньями конвейера, распределяются между
+ядрами процессора, чтобы сделать возможным параллелизм по данным. На рис.\nbsp{}[[fig-pipeline]] представлена схема конвейера обработки данных, в которой
+прямоугольниками со скругленными углами обозначены звенья конвейера, обычными
+прямоугольниками\nbsp{}--- массивы объектов из предметной области задачи, передаваемые
+от одного звена к другому, а стрелками\nbsp{}--- направление передачи данных.
+Некоторые звенья разделены на /секции/, каждая из которых обрабатывает отдельную
+часть массива. Если звенья соединены без использования /барьера/ (горизонтальная
+или вертикальная полоса), то передача отдельных объектов между такими звеньями
+происходит параллельно с вычислениями, по мере их готовности. Секции работают
+параллельно на нескольких ядрах процессора (нескольких узлах кластера). Таким
+образом, между множеством ядер процессора, секций звеньев конвейера и объектами
+устанавливается сюръективное отображение, т.е. на одном ядре процессора может
+работать несколько секций звеньев конвейера, каждая из которых может
+обрабатывать несколько объектов последовательно, но одна секция не может
+работать сразу на нескольких ядрах, а объект не может обрабатываться сразу
+несколькими секциями конвейера.
+
+#+name: fig-pipeline
+#+begin_src dot :exports results :file build/pipeline-ru.pdf
+digraph {
+  # Data-processing pipeline diagram. Plain boxes/records are arrays of
+  # objects, rounded boxes/records are pipeline stages, thin filled black
+  # boxes are barriers, and edges show the direction of data transfer.
+
+  node [fontsize=14,margin="0.055,0"]
+  graph [nodesep="0.25",ranksep="0.25",rankdir="TB"]
+  edge [arrowsize=0.66]
+
+  # data
+  # NOTE: the subgraph name does not begin with "cluster", so dot treats it
+  # as a plain grouping: no frame is drawn around it. Presumably the "x"
+  # prefix was added to switch the frame off -- confirm before renaming.
+  subgraph xcluster_linear {
+    label="Линейная модель"
+
+    # initial node (small filled black circle) and the data arrays
+    start [label="",shape=circle,style=filled,fillcolor=black,width=0.23]
+    spectrum [label="S(ω,θ)",shape=box]
+    acf [label="K(i,j,k)",shape=box]
+    phi [label="Φ(i,j,k)",shape=box]
+
+    # transformations
+    fourier_transform [label="Преобразование Фурье",shape=box,style=rounded]
+    solve_yule_walker [label="Решение уравнений\nЮла—Уокера",shape=box,style=rounded]
+
+    # grey-filled cluster grouping the transformed ACF and the stage that
+    # produces it
+    subgraph cluster_nonlinear_1 {
+      label="Моделир. нелинейности\l"
+      labeljust=left
+      style=filled
+      color=lightgrey
+      acf2 [label="K*(i,j,k)",shape=box]
+      transform_acf [label="Преобразование АКФ",shape=box,style=rounded]
+    }
+  }
+
+  # Second non-cluster grouping (same "x" prefix as above). The records below
+  # have four per-section fields (ports g1..g4 or e1..e4) followed by a
+  # caption field; edges attach to the per-section ports.
+  subgraph xcluster_linear2 {
+
+    eps_parts [label="<e1> ε₁|<e2> ε₂|<e3> …|<e4> εₙ|<e> ε(t,x,y)",shape=record]
+    # final node (filled black double circle)
+    end [label="",shape=doublecircle,style=filled,fillcolor=black,width=0.23]
+
+    generate_white_noise [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Генерация\lбелого шума",shape=record,style=rounded]
+    generate_zeta [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Генерация частей\lвзволнованной мор-\lской поверхности\l",shape=record,style=rounded]
+
+    zeta_parts [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> Несшитые части реализации",shape=record]
+    overlap_add [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> Сшивание час-\lтей реализации\l",shape=record,style=rounded]
+
+    # every unstitched part goes to the section with the same index ...
+    zeta_parts:g1->overlap_add:g1
+    zeta_parts:g2->overlap_add:g2
+    zeta_parts:g3->overlap_add:g3
+    zeta_parts:g4->overlap_add:g4
+
+    # ... and every part except the first also goes to the preceding section;
+    # constraint=false keeps these extra edges out of rank assignment
+    zeta_parts:g2->overlap_add:g1 [constraint=false]
+    zeta_parts:g3->overlap_add:g2 [constraint=false]
+    zeta_parts:g4->overlap_add:g3 [constraint=false]
+
+    overlap_add:g1->zeta2_parts:g1
+    overlap_add:g2->zeta2_parts:g2
+    overlap_add:g3->zeta2_parts:g3
+    overlap_add:g4->zeta2_parts:g4
+
+    # per-section chains through the distribution transform and the file
+    # writer; all of them end at the eps_end barrier
+    zeta2_parts:g1->transform_zeta:g1->zeta3_parts:g1->write_zeta:g1->eps_end
+    zeta2_parts:g2->transform_zeta:g2->zeta3_parts:g2->write_zeta:g2->eps_end
+    zeta2_parts:g3->transform_zeta:g3->zeta3_parts:g3->write_zeta:g3->eps_end
+    zeta2_parts:g4->transform_zeta:g4->zeta3_parts:g4->write_zeta:g4->eps_end
+
+  }
+
+  # Declarations of the nodes used by the chains above (non-cluster grouping).
+  subgraph part3 {
+
+    zeta2_parts [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> Поверхность с нормаль-\lным законом распреде-\lления\l",shape=record]
+
+    # grey-filled cluster with a right-justified label
+    subgraph cluster_nonlinear_2 {
+      label="Моделир. нелинейности\r"
+      labeljust=right
+      style=filled
+      color=lightgrey
+      zeta3_parts [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> ζ(t,x,y)",shape=record]
+      transform_zeta [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Преобразование за-\lкона распределения\lвзволнованной мор-\lской поверхности\l",shape=record,style=rounded]
+    }
+
+    # barriers
+    eps_start [label="",shape=box,style=filled,fillcolor=black,height=0.05]
+    eps_end [label="",shape=box,style=filled,fillcolor=black,height=0.05]
+
+    write_zeta [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Запись готовых\lчастей в файл\l",shape=record,style=rounded]
+  }
+
+  # edges
+  # sequential chain from the initial node to the coefficients phi
+  start->spectrum->fourier_transform->acf->transform_acf
+  transform_acf->acf2
+  acf2->solve_yule_walker
+  solve_yule_walker->phi
+  # phi feeds the eps_start barrier; constraint=false keeps this edge out of
+  # rank assignment
+  phi->eps_start [constraint=false]
+  # the barrier fans out to every section of the white-noise generator
+  eps_start->generate_white_noise:g1
+  eps_start->generate_white_noise:g2
+  eps_start->generate_white_noise:g3
+  eps_start->generate_white_noise:g4
+  generate_white_noise:g1->eps_parts:e1->generate_zeta:g1->zeta_parts:g1
+  generate_white_noise:g2->eps_parts:e2->generate_zeta:g2->zeta_parts:g2
+  generate_white_noise:g3->eps_parts:e3->generate_zeta:g3->zeta_parts:g3
+  generate_white_noise:g4->eps_parts:e4->generate_zeta:g4->zeta_parts:g4
+
+  eps_end->end
+}
+#+end_src
+
+#+caption: Схема конвейера обработки данных, реализующего генерацию взволнованной морской поверхности по АР модели.
+#+label: fig-pipeline
+#+RESULTS: fig-pipeline
+[[file:build/pipeline-ru.pdf]]
+
+Конвейер объектов можно считать развитием модели BSP (Bulk Synchronous
+Parallel)\nbsp{}cite:valiant1990bridging, применяемой в системах обработки
+графов\nbsp{}cite:malewicz2010pregel,seo2010hama. Конвейер позволяет исключить
+глобальную синхронизацию (где это возможно) между последовательно идущими этапами
+вычислений путем передачи данных между звеньями параллельно с вычислениями, в то
+время как в модели BSP глобальная синхронизация происходит после каждого шага.
+
+Конвейер объектов ускоряет программу путем параллельного выполнения блоков кода,
+работающих с разными вычислительными устройствами: в то время как текущая часть
+взволнованной поверхности генерируется на процессоре, предыдущая часть
+записывается на диск. Такой подход позволяет получить ускорение, потому что
+различные вычислительные устройства работают асинхронно, и их параллельное
+использование увеличивает производительность программы.
+
+Поскольку передача данных между звеньями конвейера происходит параллельно с
+вычислениями, то на одном и том же конвейере можно запустить сразу несколько
+копий приложения с разными параметрами (генерировать сразу несколько
+взволнованных морских поверхностей с разными характеристиками). На практике
+оказывается, что высокопроизводительные приложения не всегда загружают
+процессор на 100%, тратя время на синхронизацию параллельных процессов и
+запись данных на диск. Использование конвейера в таком случае позволит на одном
+и том же множестве процессов запустить сразу несколько расчетов и максимально
+эффективно использовать все устройства компьютера. Например, во время записи в
+файл одной задачей может производиться расчет на процессоре другой задачей. Это
+минимизирует время простоя процессора и других устройств компьютера и повышает
+общую пропускную способность кластера.
+
+Конвейеризация шагов программы, которые в противном случае последовательны,
+выгодна не только для кода, работающего с различными устройствами, но и для
+кода, различные ветки которого могут быть запущены на нескольких аппаратных
+потоках одного процессорного ядра, т.е. ветки, осуществляющие доступ к различным
+блокам памяти или использующие смешанную арифметику (целочисленную и с плавающей
+точкой). Ветки кода, которые используют различные модули процессора, являются
+хорошими кандидатами для параллельного запуска на процессорном ядре с
+несколькими аппаратными потоками.
+
+Таким образом, вычислительную модель на основе конвейера можно рассматривать как
+/массивно асинхронную модель/ (bulk-asynchronous model) из-за параллельной
+природы шагов программы. Эта модель является основой модели отказоустойчивости,
+которая будет описана далее.
+
+**** Программная реализация.
+Из соображений эффективности конвейер объектов и методы обеспечения
+отказоустойчивости (которые будут описаны далее) были реализованы во фреймворке
+на языке C++: с точки зрения автора язык C слишком низкоуровневый для написания
+распределенных программ, а использование языка Java влечет за собой накладные
+расходы и непопулярно в высокопроизводительных вычислениях. На данный момент
+фреймворк запускает сервис и приложение в одном процессе. Фреймворк называется
+"Фабрика" и находится на этапе проверки концепции.
+
+**** Обзор вычислительной модели.
+Ключевой особенностью, которая отсутствует в текущих технологиях параллельного
+программирования, является возможность указать иерархические зависимости между
+параллельными задачами. Когда такая зависимость есть, определить, какая из задач
+должна быть ответственна за повторное выполнение неудавшейся задачи на одном из
+выживших узлов, тривиально. Чтобы повторно выполнить задачу на вершине иерархии,
+создается резервная задача, выполняющаяся на другом узле. Существует ряд систем,
+которые способны выполнять направленные ациклические графы задач параллельно\nbsp{}cite:acun2014charmpp,islam2012oozie, но графы не подходят для определения
+отношений руководитель-подчиненный между задачами, поскольку узел графа может
+иметь несколько родительских узлов.
+
+Основное назначение модели состоит в упрощении разработки распределенных
+приложений для пакетной обработки данных и промежуточного программного
+обеспечения. Основное внимание направлено на обеспечение устойчивости приложений
+к поломкам оборудования, т.е. обеспечение отказоустойчивости и высокой
+доступности, которое прозрачно для программиста. Реализация модели состоит из
+двух слоев: на нижнем слое находятся подпрограммы и классы для приложений,
+работающих на одном узле (без сетевых взаимодействий), на верхнем слое\nbsp{}--- для
+приложений, работающих на произвольном количестве узлов. Модель включает в себя
+два вида сильно связанных друг с другом сущностей\nbsp{}--- /управляющие объекты/ (или
+/ядра/) и /конвейеры/,\nbsp{}--- которые используются совместно для написания
+программы.
+
+Управляющие объекты реализуют логику (порядок выполнения) программы в методах
+~act~ и ~react~ и хранят состояние текущей ветки исполнения. Как логика, так и
+состояние задаются программистом. В методе ~act~ какая-либо функция либо
+вычисляется непосредственно, либо разлагается на вложенные функции
+(представляемые подчиненными управляющими объектами), которые впоследствии
+отправляются на конвейер. В методе ~react~ подчиненные управляющие объекты,
+вернувшиеся с конвейера, обрабатываются их родительским объектом. Вызовы методов
+~act~ и ~react~ производятся асинхронно внутри потоков, присоединенных к
+конвейеру. Для каждого управляющего объекта метод ~act~ вызывается только один
+раз, и для нескольких объектов вызовы происходят параллельно друг другу, в то
+время как метод ~react~ вызывается один раз для каждого подчиненного объекта, и
+все вызовы происходят в одном потоке для предотвращения одновременного изменения
+состояния несколькими потоками (для разных родительских объектов могут
+использоваться разные потоки).
+
+Конвейеры осуществляют асинхронные вызовы методов ~act~ и ~react~, стараясь
+сделать как можно больше вызовов параллельно, учитывая предоставляемый
+платформой параллелизм (количество процессорных ядер на узле и количество узлов
+в кластере). Конвейер включает в себя пул управляющих объектов, содержащий все
+подчиненные объекты, отправленные в него родителями, и пул потоков,
+обрабатывающий эти объекты в соответствии с правилами, описанными в предыдущем
+параграфе. Для каждого устройства используется отдельный конвейер. Существуют
+конвейеры для параллельной обработки, обработки по расписанию (периодические и
+отложенные задачи) и промежуточный конвейер для обработки управляющих объектов
+на узлах кластера (см.\nbsp{}рис.\nbsp{}[[fig-subord-ppl]]).
+
+По принципу работы механизм управляющих объектов и конвейеров напоминает
+механизм работы процедур и стеков вызовов, с тем лишь преимуществом, что методы
+объектов вызываются асинхронно и параллельно друг другу (насколько это позволяет
+логика программы). Поля управляющего объекта\nbsp{}--- это локальные переменные стека,
+метод ~act~\nbsp{}--- это последовательность процессорных инструкций перед вложенным
+вызовом процедуры, а метод ~react~\nbsp{}--- это последовательность инструкций после
+вложенного вызова. Создание и отправка на конвейер подчиненного объекта\nbsp{}--- это
+вложенный вызов процедуры. Наличие двух методов обуславливается асинхронностью
+вложенных вызовов и помогает заменить активное ожидание завершения подчиненных
+объектов пассивным при помощи конвейеров. Конвейеры, в свою очередь, позволяют
+реализовать пассивное ожидание и вызывают правильные методы, анализируя
+внутреннее состояние объектов.
+
+#+name: fig-subord-ppl
+#+begin_src dot :exports results :file build/subord-ppl-ru.pdf
+graph G {
+  # Mapping of parent- and child-process pipelines onto compute devices.
+  # Solid edges denote aggregation; dashed edges denote the mapping between
+  # logical and physical entities.
+
+  node [fontname="Old Standard",fontsize=14,margin="0.055,0",shape=box]
+  graph [nodesep="0.25",ranksep="0.25",rankdir="LR"]
+  edge [arrowsize=0.66]
+
+  # parent process: the factory, its pipelines and the two thread pools
+  subgraph cluster_daemon {
+    label="Родительский процесс"
+    style=filled
+    color=lightgrey
+
+    factory [label="Фабрика"]
+    parallel_ppl [label="Параллельный\nконвейер"]
+    io_ppl [label="Конвейер\nввода/вывода"]
+    sched_ppl [label="Конвейер\nдля таймера"]
+    net_ppl [label="Конвейер для\nсетевых устройств"]
+    proc_ppl [label="Конвейер\nдля процессов"]
+
+    upstream [label="Пул потоков\nupstream"]
+    downstream [label="Пул потоков\ndownstream"]
+  }
+
+  # aggregation: the factory is linked to every pipeline
+  factory--parallel_ppl
+  factory--io_ppl
+  factory--sched_ppl
+  factory--net_ppl
+  factory--proc_ppl
+
+  # physical devices: CPU with four cores, storage, network cards, timer
+  subgraph cluster_hardware {
+    label="Вычислительные устройства"
+    style=filled
+    color=lightgrey
+
+    cpu [label="CPU"]
+    core0 [label="Ядро 0"]
+    core1 [label="Ядро 1"]
+    core2 [label="Ядро 2"]
+    core3 [label="Ядро 3"]
+
+    storage [label="Устройства\nхранения"]
+    disk0 [label="Диск 0"]
+
+    network [label="Сетевые\nкарты"]
+    nic0 [label="СК 0"]
+
+    timer [label="Таймер"]
+
+  }
+
+  # aggregation inside the hardware cluster
+  core0--cpu
+  core1--cpu
+  core2--cpu
+  core3--cpu
+
+  disk0--storage
+  nic0--network
+
+  # the parallel pipeline is linked to both thread pools
+  parallel_ppl--upstream
+  parallel_ppl--downstream
+
+  # dashed mapping edges: the upstream pool is mapped to all four cores,
+  # the downstream pool to core 0 only
+  upstream--{core0,core1,core2,core3} [style="dashed"]
+  downstream--core0 [style="dashed"]
+
+  # every other pipeline is mapped to core 0 and, where one is listed, to
+  # its device
+  io_ppl--core0 [style="dashed"]
+  io_ppl--disk0 [style="dashed"]
+  sched_ppl--core0 [style="dashed"]
+  sched_ppl--timer [style="dashed"]
+  net_ppl--core0 [style="dashed"]
+  net_ppl--nic0 [style="dashed"]
+  proc_ppl--core0 [style="dashed"]
+
+  # white wrapper cluster around the child-process clusters
+  subgraph cluster_children {
+    style=filled
+    color=white
+
+    subgraph cluster_child0 {
+      label="Дочерний процесс 0"
+      style=filled
+      color=lightgrey
+
+      app0_factory [label="Фабрика"]
+      app0 [label="Конвейер\nдочернего\nпроцесса"]
+    }
+
+    # a second child-process cluster and its edges are commented out below
+#    subgraph cluster_child1 {
+#      label="Дочерний процесс 1"
+#      style=filled
+#      color=lightgrey
+#
+#      app1_factory [label="Фабрика"]
+#      app1 [label="Конвейер\nдочернего процесса"]
+#    }
+  }
+
+  # the process pipeline of the parent is linked to the child's pipeline
+  proc_ppl--app0
+#  proc_ppl--app1
+
+  # constraint=false keeps this edge out of rank assignment
+  app0_factory--app0 [constraint=false]
+#  app1_factory--app1 [constraint=false]
+
+}
+#+end_src
+
+#+caption: Отображение конвейеров родительского и дочернего процессов на вычислительные устройства. Сплошные линии обозначают агрегацию, пунктирные линии обозначают отображение между логическими и физическими сущностями.
+#+attr_latex: :width \textwidth
+#+label: fig-subord-ppl
+#+RESULTS: fig-subord-ppl
+[[file:build/subord-ppl-ru.pdf]]
+
+**** Основополагающие принципы модели.
+Модель конвейера обработки данных строится по следующим принципам, следование
+которым обеспечивает максимальную эффективность программы.
+- В модели отсутствует понятие сообщения, роль сообщения выполняет сам
+  управляющий объект: он может быть передан по сети на другой узел и получить
+  доступ к полям любого другого управляющего объекта на этом узле. Гарантировать
+  существование такого объекта может только логика программы.
+- Управляющий объект представляет собой /сопрограмму/, которая при вызове
+  отправляется в пул управляющих объектов и затем выполняется планировщиком
+  асинхронно. Тело сопрограммы может содержать произвольное количество вызовов
+  других сопрограмм. Каждый вызов отправляет соответствующую сопрограмму в пул и
+  сразу завершается. Управляющие объекты, находящиеся в пуле, могут быть
+  обработаны в любом порядке; это используется планировщиком для извлечения
+  максимального параллелизма из вычислительной системы путем распределения
+  объектов из пула между доступными узлами кластера и ядрами процессора.
+- Асинхронное выполнение управляющих объектов позволяет избежать явной
+  синхронизации после вызова сопрограммы (отправки объекта в очередь);
+  планировщик возвращает поток управления в родительский управляющий объект
+  каждый раз, когда какой-либо его дочерний объект завершает выполнение. Такое
+  взаимодействие превращает сопрограмму в некоторого рода обработчик событий, в
+  котором событием является дочерний объект, а обработчиком\nbsp{}--- родительский.
+- Сопрограмма может взаимодействовать с произвольным количеством управляющих
+  объектов, адреса которых известны; взаимодействие с объектами, осуществляемое
+  вразрез с иерархией, сильно усложняет поток управления, и стек вызовов
+  сопрограмм теряет древовидную структуру. Только логика программы может
+  гарантировать существование в памяти машины двух взаимодействующих объектов.
+  Один из способов обеспечения такой гарантии\nbsp{}--- взаимодействие между
+  вложенными сопрограммами, вызванными из одной родительской сопрограммы.
+  Поскольку такого рода взаимодействие можно осуществить в рамках иерархии через
+  родительскую сопрограмму, его можно считать оптимизацией, позволяющей
+  избавиться от накладных расходов при передаче данных через промежуточный узел.
+  Для программ, логика которых полностью основана на событиях (например, для
+  серверов и программ с графическим интерфейсом), ситуация иная, и такого рода
+  взаимодействия являются основными.
+- Также, взаимодействия, идущие вразрез с иерархией и поверх сети кластера,
+  усложняют разработку алгоритмов обеспечения отказоустойчивости. Гарантировать
+  нахождение определенного управляющего объекта в памяти соседнего узла
+  невозможно, поскольку узел может выйти из строя прямо во время выполнения
+  соответствующей сопрограммы. В результате, при аварийном завершении
+  сопрограммы, все ее вложенные сопрограммы должны быть выполнены заново. Это
+  подталкивает программиста к созданию
+  - глубоких древовидных иерархий сильно связанных управляющих объектов (которые
+    взаимодействуют между собой на одном уровне иерархии), уменьшающих накладные
+    расходы на повторное выполнение сопрограмм;
+  - толстых древовидных иерархий слабо связанных управляющих объектов,
+    обеспечивающих максимальную степень параллелизма.
+  Глубокие иерархии\nbsp{}--- это не только требование технологии, они помогают
+  оптимизировать сетевое взаимодействие большого количества узлов кластера,
+  сводя его к взаимодействию соседних узлов.
+
+Таким образом, управляющие объекты обладают свойствами как сопрограмм, так и
+обработчиков событий одновременно.
+
+** Реализация для систем с общей памятью (SMP)
+**** Алгоритм распределения нагрузки.
+Наиболее простым и широко применяемым подходом к распределению нагрузки на
+вычислительную систему является разбиение данных на равные части (или разбиение
+задачи на однородные подзадачи) с последующим их равномерным распределением
+между отдельными ядрами процессора и узлами кластера, однако такой подход не
+всегда работает эффективно. Во-первых, часто общее количество частей, на которые
+разбиваются входные данные, диктуется не архитектурой и конфигурацией
+вычислительной системы, а самой задачей, и такое распределение не всегда
+эффективно с точки зрения вычислительной машины: количество частей оказывается
+либо слишком большим по сравнению с количеством процессоров, работающих
+параллельно, что ведет к увеличению накладных расходов на обмен данными, либо
+слишком маленьким, что не позволяет использовать все доступные вычислительные
+ядра. Во-вторых, накладываемые решаемой задачей ограничения могут не позволить
+разделить входные данные на равные части, что может стать причиной дисбаланса в
+загрузке ядер процессора. В-третьих, поскольку в вычислительной системе в вычислениях
+участвуют помимо процессора сразу несколько компонент (таких как векторные
+сопроцессоры и устройства хранения), то время решения конкретной задачи зависит
+от производительности всех задействованных устройств. Каким же образом сделать
+алгоритм распределения нагрузки более эффективным, принимая во внимание разный
+размер частей, на которые разделяются входные данные, и учитывая все устройства,
+задействованные в вычислениях?
+
+Алгоритм распределения нагрузки состоит из двух этапов. На первом этапе алгоритм
+размещает часть входных данных (или подзадачу), обернутую в управляющий объект,
+в соответствующем пуле управляющих объектов: для каждого устройства используется
+отдельный пул управляющих объектов и сопряженный с ним пул потоков. На втором
+этапе, управляющий объект извлекается из пула одним из потоков и обрабатывается.
+Благодаря отдельным пулам потоков все устройства работают параллельно, уменьшая
+тем самым время простоя оборудования по сравнению с использованием всех
+устройств из одного потока.
+
+Для того чтобы учесть неоднородность частей, на которые разбиваются входные
+данные, и неоднородность выполняемых задач, необходимо предсказать время
+выполнения каждой из задач. Соответствующее исследование сделано в\nbsp{}cite:degtyarev2016balance, поскольку реализация модели АРСС включает в себя, в
+основном, однородные задачи.
+
+Таким образом, распределение нагрузки осуществляется в два этапа: на первом
+этапе задача в форме управляющего объекта направляется на подходящее устройство,
+а на втором этапе она направляется в один из потоков из соответствующего
+устройству пула. Неоднородность управляющих объектов может быть учтена путем
+предсказания времени их выполнения, однако такие объекты не встречаются в
+реализации модели АРСС.
+
+**** Производительность реализаций на MPI, OpenMP и OpenCL.
+Программная реализация состояла в создании и отладке прототипа программы и в
+последующем написании компоненты виртуального полигона на языке более низкого
+уровня. При этом тесты показали, что одной высокопроизводительной
+многопроцессорной машины достаточно для создания типовых реализаций морского
+волнения. Также использование видеокарт в качестве векторных ускорителей
+эффективно только в случае расчета давлений, в то время как генерация волновой
+поверхности выполняется быстрее на скалярном процессоре.
+
+Создание программной реализации происходило в два этапа: на первом этапе был
+создан и отлажен прототип в программной среде
+Mathematica\nbsp{}cite:mathematica10, а на втором этапе логика программы была
+переписана на более низкоуровневом языке C++, и для получения эффективно
+работающего параллельного кода были проведены эксперименты с рядом библиотек. С
+помощью этих библиотек были реализованы функции генерации взволнованной морской
+поверхности, а также процедура расчета гидродинамических давлений под
+сгенерированной поверхностью. Тестирование производилось на вычислительных
+машинах кластера РЦ ВЦ СПбГУ (см.\nbsp{}табл.\nbsp{}[[tab-autoreg-testbed]]) и
+позволило получить два основных результата. Во-первых, использование видеокарт
+неэффективно при генерации волновой поверхности
+(см.\nbsp{}табл.\nbsp{}[[tab-autoreg-performance]]), что обусловлено сравнительно
+небольшим количеством арифметических операций по отношению к количеству операций
+с памятью устройства, а также отсутствием трансцендентных функций в реализации
+алгоритма. Во-вторых, для генерации одной реализации взволнованной морской
+поверхности одной многопроцессорной машины достаточно для эффективного и
+быстрого решения задачи (см.\nbsp{}рис.\nbsp{}[[fig-autoreg-performance]]). По
+результатам тестирования стандарт OpenMP был выбран в качестве основного, как
+наиболее эффективный и наиболее подходящий для расчетов на многопроцессорной
+системе.
+
+#+name: fig-autoreg-performance
+#+caption: Скорость генерации взволнованной поверхности на многопроцессорной системе для типовых размеров реализации (сверху). Масштабируемость (относительное ускорение при увеличении количества процессоров) программной реализации на многопроцессорной системе для типовых размеров реализации (снизу). Временная протяженность 512 с.
+#+begin_figure
+[[file:graphics/speed.eps]]
+[[file:graphics/speedup.eps]]
+#+end_figure
+
+#+name: tab-autoreg-testbed
+#+caption: Конфигурация оборудования.
+#+attr_latex: :booktabs t
+| Вычислительная машина | HP SL390s G7                           |
+| Процессор             | 2\(\times\)Intel X5650 (всего 12 ядер) |
+| Оперативная память    | 96ГБ RAM                               |
+| Операционная система  | CentOS 5.6 (Linux)                     |
+
+#+name: tab-autoreg-performance
+#+caption: Время (с.) генерации взволнованной морской поверхности различными программными реализациями авторегрессионной модели.
+#+attr_latex: :booktabs t :align cllllll
+|        |     ЛХ |     ЛХ |    ЛХ |     АР |     АР |    АР |
+| Размер | OpenCL | OpenMP |   MPI | OpenCL | OpenMP |   MPI |
+|--------+--------+--------+-------+--------+--------+-------|
+| 400000 |   0.82 |  40.44 | 32.60 |   1.80 |  0.800 | 0.750 |
+| 440000 |   0.90 |  44.59 | 35.78 |   1.92 |  0.100 | 0.930 |
+| 480000 |   0.99 |  48.49 | 38.93 |   2.29 |  0.970 | 0.126 |
+| 520000 |   1.07 |  52.65 | 41.92 |   2.43 |  0.118 | 0.117 |
+| 560000 |   1.15 |  56.45 | 45.00 |   2.51 |  0.117 | 0.161 |
+| 600000 |   1.23 |  60.85 | 48.80 |   2.54 |  0.123 | 0.132 |
+| 640000 |   1.31 |  65.07 | 53.02 |   2.73 |  0.123 | 0.160 |
+| 680000 |   1.40 |  68.90 | 54.92 |   2.80 |  0.138 | 0.136 |
+| 720000 |   1.48 |  72.49 | 58.42 |   2.88 |  0.144 | 0.173 |
+| 760000 |   1.56 |  76.86 | 61.41 |   3.47 |  0.156 | 0.155 |
+| 800000 |   1.64 |  81.03 | 66.42 |   3.25 |  0.166 | 0.174 |
+
+Кроме выбора стандарта параллельных вычислений на время работы программы влияет
+выбор библиотек типовых вычислительных методов, и эффективность этих библиотек
+была показана тестированием их разработчиками. В качестве библиотеки для
+матричных операций (расчета коэффициентов авторегрессионной модели) была выбрана
+GotoBLAS и основанная на ней LAPACK, для непрерывной аппроксимации поля волновых
+чисел использовалась библиотека CGAL\nbsp{}cite:fabri2009cgal и для статистической
+проверки интегральных характеристик реализации взволнованной поверхности
+использовалась библиотека GSL\nbsp{}cite:gsl2008scientific. В случае GotoBLAS
+эффективность библиотеки показана в работах\nbsp{}cite:goto2008high,goto2008anatomy,
+для других библиотек эффективность не является важной, и они были выбраны,
+исходя из удобства их использования.
+
+#+name: tab-arma-libs
+#+caption: Список библиотек, используемых в реализации модели АРСС.
+#+attr_latex: :booktabs t :align lp{0.6\linewidth}
+| Библиотека                                             | Назначение                       |
+|--------------------------------------------------------+----------------------------------|
+| DCMT\nbsp{}cite:matsumoto1998dynamic                         | параллельный ГПСЧ                |
+| Blitz\nbsp{}cite:veldhuizen1997will,veldhuizen2000techniques | многомерные массивы              |
+| GSL\nbsp{}cite:gsl2008scientific                             | вычисление ФПР, ФР, БПФ          |
+|                                                        | проверка стационарности процесса |
+| LAPACK, GotoBLAS\nbsp{}cite:goto2008high,goto2008anatomy     | определение коэффициентов АР     |
+| GL, GLUT\nbsp{}cite:kilgard1996opengl                        | трехмерная визуализация          |
+
+**** Производительность алгоритма распределения нагрузки.
+Программная реализация генерации взволнованной поверхности сбалансирована с
+точки зрения нагрузки на процессорные ядра, однако, как показывают тесты,
+характеризуется высокой нагрузкой на устройства хранения. До проведения
+тестирования генерация взволнованной поверхности была реализована с
+использованием OpenMP для параллельных вычислений, и была переписана с
+использованием POSIX потоков для того чтобы реализовать алгоритм распределения
+нагрузки. Производительность двух реализаций сравнивалась на платформе,
+конфигурация которой приведена в табл.\nbsp{}[[tab-multicore-specs]].
+
+#+name: tab-multicore-specs
+#+caption: Конфигурация многоядерной системы.
+#+attr_latex: :booktabs t
+| Компонента                    | Подробности                      |
+|-------------------------------+----------------------------------|
+| Язык программирования         | C++11                            |
+| Библиотека потоков            | C++11 STL threads                |
+| Библиотека атомарных операций | C++11 STL atomic                 |
+| Подпрограммы замера времени   | ~clock_gettime(CLOCK_MONOTONIC)~ |
+|                               | ~/usr/bin/time -f \%e~           |
+| Компилятор                    | GCC 4.8.2                        |
+| Опции компиляции              | ~-std=c++11 -O2 -march=native~   |
+| Операционная система          | Debian 3.2.51-1 x86_64           |
+| Файловая система              | ext4                             |
+| Процессор                     | Intel Core 2 Quad Q9650          |
+| Частота процессора (ГГц)      | 3.00                             |
+| Количество ядер               | 4                                |
+| Объем оперативной памяти (ГБ) | 8                                |
+| Диск                          | Seagate ST3250318AS              |
+| Скорость диска (об./мин.)     | 7200                             |
+
+Эксперимент состоял в запуске двух программных реализаций на многоядерной
+машине, изменяя размер поверхности. Размер пула потоков процессора и пула
+потоков ввода/вывода оставался неизменным во время эксперимента. Пул потоков
+ввода/вывода состоял из одного потока, а количество потоков процессора равнялось
+количеству физических ядер процессора.
+
+В эксперименте алгоритм распределения нагрузки показал большую эффективность по
+сравнению с реализацией без него. Чем больше размер генерируемой поверхности,
+тем больше разрыв в производительности (рис.\nbsp{}[[fig-factory-performance]]), что
+является следствием наложения вычислительной фазы и фазы вывода данных друг на
+друга (рис.\nbsp{}[[fig-factory-overlap]]). В реализации OpenMP фаза вывода данных
+начинается только тогда, когда заканчивается вычислительная фаза, в то время как
+использование алгоритма распределения нагрузки приводит почти к одновременному
+завершению обеих фаз. Таким образом, /выполнение параллельных изнутри,
+последовательных фаз в режиме конвейера более эффективно, чем их
+последовательное выполнение/, и это позволяет сбалансировать нагрузку на
+различные устройства, задействованные в вычислениях.
+
+#+name: fig-factory-performance
+#+header: :width 5 :height 4
+#+begin_src R :file build/factory-vs-openmp-ru.pdf
+# Load the shared plotting code (presumably defines the arma.* helpers),
+# then draw the Factory vs. OpenMP performance comparison with Russian
+# axis titles.
+source(file.path("R", "common.R"))
+arma.plot_factory_vs_openmp(power=6, xlab="Размер реализации", ylab="Время, с.")
+#+end_src
+
+#+caption: Сравнение производительности реализаций программы на OpenMP и Factory.
+#+label: fig-factory-performance
+#+RESULTS: fig-factory-performance
+[[file:build/factory-vs-openmp-ru.pdf]]
+
+#+name: fig-factory-overlap
+#+header: :width 7 :height 4
+#+begin_src R :file build/factory-vs-openmp-overlap-ru.pdf
+source(file.path("R", "common.R"))
+par(mar=c(5, 6, 0, 1), pty="m")
+arma.plot_factory_vs_openmp_overlap(
+  xlab="Время, с.",
+  labels=c("Factory", "OpenMP"),
+  scale=10**9
+)
+#+end_src
+
+#+caption: Наложение параллельных вычислений на \([G_0,G_1]\) и записи данных на диск на \([W_0,W_1]\). В реализации OpenMP наложение отсутствует.
+#+label: fig-factory-overlap
+#+RESULTS: fig-factory-overlap
+[[file:build/factory-vs-openmp-overlap-ru.pdf]]
+
+Предложенный алгоритм распределения нагрузки на многоядерную систему позволяет
+получить прирост производительности для приложений, считывающих и записывающих
+большой объем данных на диск, но может быть использован также и в других
+случаях. Основная идея алгоритма состоит в определении типа нагрузки и поиске
+подходящего устройства для перенаправления нагрузки на него. Таким образом любое
+устройство помимо дисков может быть использовано.
+
+** Реализация для систем с распределенной памятью (MPP)
+*** Алгоритм обнаружения узлов кластера
+:PROPERTIES:
+:CUSTOM_ID: sec:node-discovery
+:END:
+
+Многие распределенные системы построены по принципу /субординации/: в каждом
+кластере выбирается главный (руководящий) узел, который управляет очередью
+задач, планирует их запуск на подчиненных узлах и следит за их состоянием. Роль
+главного узла задается либо /статически/, путем выделения конкретного
+физического узла под нее, либо /динамически/, путем избрания какого-либо из
+узлов кластера главным. В первом случае отказоустойчивость обеспечивается
+посредством резервирования дополнительного свободного узла, который выполнит
+роль главного в случае отказа текущего. Во втором случае отказоустойчивость
+обеспечивается выбором нового главного узла из оставшихся. Несмотря на то что
+динамическое задание ролей требует наличия специализированного распределенного
+алгоритма, этот подход становится все более и более популярным, поскольку не
+требует наличия простаивающих резервных узлов на случай отказа главного узла.
+
+Алгоритмы выбора лидера (которые иногда называют алгоритмами /распределенного
+консенсуса/) являются частными случаями волновых алгоритмов. В\nbsp{}cite:tel2000introduction Тель определяет их как алгоритмы, в которых событие
+завершения программы предваряется хотя бы одним каким-либо другим событием,
+происходящим в /каждом/ параллельном процессе. Волновые алгоритмы не определены
+для анонимных сетей, т.е. они работают только с теми параллельными процессами,
+которые могут себя уникально идентифицировать. Однако, количество процессов,
+которых затрагивает "волна", может быть определено по мере выполнения алгоритма.
+В рамках распределенных систем это означает, что волновые алгоритмы подходят для
+вычислительных кластеров с динамически меняющимся количеством узлов, так что
+включение и выключение отдельных узлов не влияет на работу алгоритма.
+
+Подход к динамическому выбору главного узла, исследованный в данной работе, не
+использует волновые алгоритмы, а значит не требует опроса всех узлов кластера
+для выбора лидера. Вместо этого каждый узел кластера нумерует все узлы подсети,
+в которой он находится, и преобразует список в /древовидную иерархию/ с заданным
+максимальным значением ветвления (максимальным количеством подчиненных вершин).
+Затем узел определяет свой уровень иерархии и пытается соединиться с
+вышестоящими узлами, чтобы стать их подчиненным. Сначала он проверяет близко
+расположенные к нему узлы, а потом все остальные узлы вплоть до вершины
+иерархии. Если вышестоящих узлов нет или с ними невозможно соединиться, то узел
+сам становится главой иерархии.
+
+Древовидная иерархия узлов подсети определяет отношение строгого порядка на
+множестве всех узлов кластера. Несмотря на то что с технической точки зрения
+любая функция может быть выбрана для присвоения узлу подсети номера в списке, на
+практике эта функция должна быть достаточно гладкой вдоль временной оси и иметь
+лишь редкие скачки: быстрые изменения в структуре иерархии узлов (которые часто
+являются следствием погрешности измерений) могут привести к постоянной передаче
+роли главного узла от одного узла к другому, что сделает кластер неуправляемым.
+Простейшей такой функцией является позиция IP-адреса узла в диапазоне всех
+IP-адресов подсети.
+
+Следующие ключевые особенности отличают наш подход от некоторых предложенных
+ранее подходов\nbsp{}cite:brunekreef1996design,aguilera2001stable,romano2014design.
+- *Многоуровневая иерархия.* Количество руководящих узлов в сети зависит от
+  значения ветвления. Если оно меньше количества IP-адресов в подсети, то в
+  кластере будет несколько руководящих узлов. Если оно больше или равно
+  количеству IP-адресов в подсети, то в кластере будет только один руководящий
+  узел. Когда какой-либо узел выходит из строя, многоуровневая иерархия
+  изменяется локально: только узлы, примыкающие к вышедшему из строя,
+  взаимодействуют друг с другом.
+- *Отображение IP-адресов.* Поскольку структура иерархии зависит только от
+  IP-адресов узлов, то в алгоритме отсутствует фаза выбора лидера. Чтобы сменить
+  руководителя, каждый узел отправляет сообщение только прежнему и новому
+  руководителю.
+- *Полностью основан на событиях.* Сообщения отправляются только при выходе из
+  строя узла, поэтому постоянной нагрузки на сеть нет. Поскольку алгоритм
+  допускает ошибку при отправке любого сообщения, то нет необходимости в
+  heartbeat-пакетах, являющихся индикацией нахождения узла в сети; вместо этого
+  все сообщения выполняют роль heartbeat-пакетов и настраивается время ожидания
+  отправки пакета.
+- *Отсутствие ручной конфигурации.* Узлу не требуется никаких предварительных
+  знаний, чтобы найти руководителя: он определяет сеть, узлом которой он
+  является, вычисляет IP-адрес потенциального руководителя и отправляет ему
+  сообщение. Если это не срабатывает, то процесс повторяется для следующего
+  потенциального руководителя. Таким образом, алгоритм подходит для начальной
+  загрузки кластера без ручной настройки, для этого требуется только запустить
+  соответствующий сервис на каждом узле.
+Суммируя вышесказанное, достоинством алгоритма является то, что он
+- масштабируется на большое количество узлов посредством иерархии с несколькими
+  руководящими узлами,
+- не нагружает сеть отправкой сообщений с текущим состоянием узлов и
+  heartbeat-пакетами,
+- не требует ручной настройки для первичной загрузки кластера.
+
+Недостатком алгоритма является то, что он требует редкого изменения IP-адресов.
+Он не подходит для облачной среды, в которой только DNS имя узла сохраняется, а
+IP-адрес может меняться со временем. Когда IP-адрес меняется, текущие соединения
+могут закрыться, сигнализируя о "выходе из строя" узла и перестраивая иерархию
+узлов. Таким образом, окружения, в которых узлы не идентифицируются IP-адресами,
+не подходят для алгоритма.
+
+Другим недостатком алгоритма является искусственная зависимость ранга узла от
+IP-адреса: замена отображения IP-адресов на что-то более совершенное (например,
+на отображение, которое использует загрузку текущего узла и сети для
+ранжирования узлов) представляет сложность, поскольку погрешность измерений
+может стать причиной неустойчивой иерархии, а полная событийность алгоритма
+будет нарушена.
+
+Алгоритм обнаружения узлов спроектирован для балансировки нагрузки на кластер
+вычислительных узлов, и его применение в других приложениях не рассматривается в
+данной работе. Когда распределенная или параллельная программа запускается на
+одном из узлов кластера, ее подзадачи распределяются между всеми примыкающими
+узлами иерархии (включая главный узел, если есть). Для того чтобы равномерно
+распределить нагрузку, когда программа запускается на подчиненном узле, каждый
+узел хранит вес каждого из примыкающих узлов иерархии. Вес равен количеству
+узлов дерева, находящегося "за" примыкающим узлом. Например, если вес первого
+примыкающего узла равен 2, то циклический алгоритм балансировки нагрузки
+распределит две подзадачи на первый узел перед тем как перейти к следующему
+узлу.
+
+Суммируя вышесказанное, алгоритм обнаружения узлов
+- спроектирован для облегчения распределения нагрузки на кластер,
+- полностью отказоустойчивый, состояние каждого узла можно вычислить заново в
+  любой момент времени,
+- полностью основан на событиях, а значит не нагружает сеть периодической
+  отправкой сообщений.
+
+**** Построение древовидной иерархии.
+Отношение строгого порядка на множестве \(\mathcal{N}\) узлов одной подсети
+определяется как
+\begin{equation*}
+  \forall n_1 \forall n_2 \in \mathcal{N},
+  \forall f \colon \mathcal{N} \rightarrow \mathcal{R}^n
+  \Rightarrow (f(n_1) < f(n_2) \Leftrightarrow \neg (f(n_1) \geq f(n_2))),
+\end{equation*}
+где \(f\)\nbsp{}--- отображение узла на его ранг, а \(<\)\nbsp{}--- оператор, определяющий
+отношение строгого порядка на множестве \(\mathcal{R}^n\). Функция \(f\) присваивает
+узлу порядковый номер, а оператор \(<\) делает этот номер уникальным.
+
+Простейшее отображение \(f\) ставит в соответствие каждому узлу подсети позицию
+его IP-адреса в диапазоне всех адресов подсети. Без преобразования к древовидной
+иерархии (когда в подсети выбирается только один лидер) рабочий узел, адрес
+которого занимает наименьшую позицию в диапазоне, становится руководящим. Если
+адрес узла занимает первую позицию в диапазоне, то для него невозможно выбрать
+лидера, и он будет находиться на вершине иерархии вплоть до выхода из строя.
+Несмотря на то что идентификацию узлов на основе их IP-адресов легко реализовать
+в программе, такой подход устанавливает искусственную зависимость роли
+руководителя от IP-адреса узла. Тем не менее, этот подход полезен для первичного
+объединения узлов в кластер, когда более сложные отображения неприменимы.
+
+Для того чтобы алгоритм обнаружения масштабировался на большое количество узлов,
+диапазон IP-адресов подсети отображается на древовидную иерархию. В такой
+иерархии каждый узел определяется уровнем \(l\) иерархии, на котором он
+находится, и отступом \(o\), который равен порядковому номеру узла на его
+уровне. Значения уровня и отступа определяются из следующей задачи оптимизации.
+\begin{equation*}
+    n = \sum\limits_{i=0}^{l(n)} p^i + o(n), \quad
+    l \rightarrow \min, \quad
+    o \rightarrow \min, \quad
+    l \geq 0, \quad
+    o \geq 0
+\end{equation*}
+где \(n\)\nbsp{}--- позиция IP-адреса узла в диапазоне IP-адресов подсети и \(p\)\nbsp{}---
+значение ветвления (максимальное количество подчиненных, которых может иметь
+узел). Руководитель узла на уровне \(l\) с отступом \(o\) имеет уровень \(l-1\) и
+отступ \(\lfloor{o/p}\rfloor\). Расстояние между любыми двумя узлами в иерархии,
+адреса которых занимают позиции \(i\) и \(j\) в диапазоне, определяется как
+\begin{align*}
+    & \langle
+        \text{lsub}(l(j), l(i)), \quad
+        \left| o(j) - o(i)/p \right|
+    \rangle,\\
+    & \text{lsub}(l_1, l_2) =
+    \begin{cases}
+        \infty & \quad \text{if } l_1 \geq l_2, \\
+        l_1 - l_2 & \quad \text{if } l_1 < l_2.
+    \end{cases}
+\end{align*}
+Расстояние является составным, чтобы уровень иерархии учитывался в первую
+очередь.
+
+Для выбора руководителя каждый узел ранжирует все узлы подсети в соответствии с
+их позицией \(\langle{l(n),o(n)}\rangle\) и, используя формулу для определения
+расстояния, выбирает ближайший к потенциальному руководителю узел, имеющий
+наименьший ранг. Это позволяет пропустить IP-адреса выключенных узлов, однако,
+для разреженных сетей (в которых узлы занимают непоследовательные IP-адреса)
+сбалансированность дерева не гарантируется.
+
+Поскольку узлу для выбора руководителя нужно соединиться с узлом, адрес которого
+известен заранее, то алгоритм обнаружения масштабируется на большое количество
+узлов. Соединение с другими узлами из ранжированного списка происходит только в
+том случае, если текущий узел-руководитель выходит из строя. Таким образом, если
+адреса узлов кластера расположены плотно в диапазоне адресов подсети, каждый
+узел устанавливает соединение только со своим руководителем, и
+неэффективного сканирования всей сети каждым узлом не происходит.
+
+**** Результаты тестирования.
+Платформа, на которой осуществлялось тестирование, представляла собой несколько
+многоядерных узлов, поверх которых с помощью пространств имен Linux
+развертывался виртуальный кластер из заданного количества узлов. Похожий подход
+используется в\nbsp{}cite:lantz2010network,handigol2012reproducible,heller2013reproducible, где
+авторы воспроизводят разнообразные практические эксперименты на виртуальных
+кластерах и сопоставляют результаты с физическими. Преимущество данного подхода
+заключается в возможности проведения экспериментов на больших виртуальных
+кластерах, используя сравнительно небольшое количество физических узлов. Данный
+подход использовался для тестирования алгоритма обнаружения узлов кластера,
+потому что этот алгоритм обладает низкими требованиями к ресурсам системы
+(процессорному времени и пропускной способности сети).
+
+Производительность алгоритма была протестирована путем измерения времени,
+необходимого для обнаружения всеми узлами кластера друг друга. Каждое изменение
+иерархии (с точки зрения каждого из узлов) записывалось в файл, и по прошествии
+30 секунд все процессы (каждый из которых моделирует один узел кластера) были
+вынужденно завершены. Пробные запуски показали, что одновременный запуск более
+100 виртуальных узлов искажал результаты, поэтому для этого эксперимента были
+использованы дополнительные физические узлы, на каждом из которых запускалось по
+100 виртуальных. Эксперимент показал, что обнаружение 100--400 узлами друг друга
+занимает в среднем 1,5 секунды, и это значение ненамного увеличивается с ростом
+количества узлов (см.\nbsp{}рис.\nbsp{}[[fig-bootstrap-local]]). Пример древовидной
+иерархии для 11 узлов с ветвлением равным 2 представлен на
+рис.\nbsp{}[[fig-tree-hierarchy-11]].
+
+#+name: fig-bootstrap-local
+#+caption: Зависимость времени объединения узлов в кластер от их количества.
+[[file:graphics/discovery.eps]]
+
+#+name: fig-tree-hierarchy-11
+#+begin_src dot :exports results :file build/tree-hierarchy-11-ru.pdf
+digraph {
+
+  node [fontsize=14,margin="0.055,0",shape=box,style=rounded]
+  graph [nodesep="0.15",ranksep="0.20",rankdir="BT"]
+  edge [arrowsize=0.66]
+
+  m1 [label="127.0.0.1"]
+  m2 [label="127.0.0.2"]
+  m3 [label="127.0.0.3"]
+  m4 [label="127.0.0.4"]
+  m5 [label="127.0.0.5"]
+  m6 [label="127.0.0.6"]
+  m7 [label="127.0.0.7"]
+  m8 [label="127.0.0.8"]
+  m9 [label="127.0.0.9"]
+  m10 [label="127.0.0.10"]
+  m11 [label="127.0.0.11"]
+
+  m2->m1
+  m3->m1
+  m4->m2
+  m5->m2
+  m6->m3
+  m7->m3
+  m8->m4
+  m9->m4
+  m10->m5
+  m11->m5
+}
+#+end_src
+
+#+caption: Древовидная иерархия для 11 узлов для ветвления равного 2.
+#+label: fig-tree-hierarchy-11
+#+RESULTS: fig-tree-hierarchy-11
+[[file:build/tree-hierarchy-11-ru.pdf]]
+
+*** Алгоритм восстановления после сбоев
+**** Контрольные точки восстановления.
+Сбои узлов распределенной системы можно разделить на два типа: сбой подчиненного
+узла и сбой руководящего узла. Для того чтобы запущенная на кластере задача
+могла пережить сбой подчиненного узла, планировщик задач периодически создает
+для нее контрольные точки восстановления и записывает их в надежное хранилище.
+Для того чтобы создать контрольную точку, планировщик временно останавливает все
+параллельные процессы задачи, копирует все страницы памяти и все структуры ядра
+операционной системы, выделенные для этих процессов, на диск, и продолжает
+выполнение задачи. Для того чтобы пережить сбой руководящего узла, серверный
+процесс планировщика задач непрерывно копирует свое внутреннее состояние на
+резервный узел, который становится руководящим после сбоя.
+
+Оптимизации работы контрольных точек восстановления посвящено большое количество
+работ\nbsp{}cite:egwutuoha2013survey, а альтернативным подходам уделяется меньше
+внимания. Обычно высокопроизводительные приложения используют передачу сообщений
+для обмена данными между параллельными процессами и хранят свое текущее
+состояние в глобальной памяти, поэтому не существует способа перезапустить
+завершившийся процесс, не записав образ всей выделенной для него памяти на диск.
+Обычно общее число процессов фиксировано и задается планировщиком, и в случае
+отказа перезапускаются сразу все процессы. Существуют некоторые обходные
+решения, которые позволяют перезапустить только часть
+процессов\nbsp{}cite:meyer2012radic, восстановив их из контрольной точки на
+выживших узлах, однако это может привести к перегрузке, если на этих узлах уже
+запущены другие задачи. Теоретически, перезапуск процесса необязателен если
+задача может быть продолжена на выживших узлах, но библиотека передачи сообщений
+не позволяет изменять количество параллельных процессов во время работы
+программы, и большинство программ все равно предполагают, что это значение
+является константой. Таким образом, не существует надежного способа обеспечения
+отказоустойчивости на уровне библиотеки передачи сообщений кроме как путем
+перезапуска всех параллельных процессов из контрольной точки восстановления.
+
+Однако, существует возможность продолжить выполнение задачи на меньшем
+количестве узлов, чем было выделено изначально, реализовав
+отказоустойчивость на уровне приложения. В этом случае роли руководителя и
+подчиненного динамически распределяются между сервисами планировщика задач,
+работающими на каждом узле кластера, образуя древовидную иерархию узлов
+кластера, а параллельная программа состоит из управляющих объектов, использующих
+иерархию узлов для динамического распределения нагрузки и свою собственную
+иерархию для перезапуска управляющих объектов в случае сбоя узла.
+**** Динамическое распределение ролей.
+Отказоустойчивость параллельной программы\nbsp{}--- это одна из проблем, которая
+должна решаться планировщиком задач обработки больших данных или
+высокопроизводительных вычислений, однако, большинство планировщиков
+обеспечивают только отказоустойчивость подчиненных узлов. Такого рода сбои
+обычно обрабатываются путем перезапуска затронутой задачи (из контрольной точки
+восстановления) или ее части на оставшихся узлах, а выход из строя руководящего
+узла считается либо маловероятным, либо слишком сложным для обработки и
+настройки на целевой платформе. Системные администраторы обычно находят
+альтернативы отказоустойчивости на уровне приложения: они изолируют руководящий
+процесс планировщика от остальных узлов кластера, размещая его на специально
+выделенной машине, или, вместо этого, используют технологии виртуализации. Все
+эти альтернативы усложняют конфигурацию и обслуживание, и, уменьшая вероятность
+выхода из строя машины, приводящей к выходу из строя всей системы, увеличивают
+вероятность ошибки оператора.
+
+С этой точки зрения практичнее реализовать отказоустойчивость руководящего
+узла на уровне приложения, но не существует общего зарекомендовавшего себя
+решения. Большинство реализаций слишком привязаны к конкретному приложению,
+чтобы стать повсеместно применяемыми. Автор считает, что это происходит из-за
+привычки людей думать о кластере, как о совокупности отдельных машин, каждая из
+которых может быть руководителем или подчиненным, вместо того чтобы думать о
+кластере, как о едином целом, в котором роли руководителя и подчиненного
+динамически распределяются между запущенными на разных узлах процессами.
+
+Понимание того, что кластер тоже является вычислительной машиной, позволяет
+реализовать промежуточное программное обеспечение, которое автоматически
+распределяет роли руководителя и подчиненного и общим способом обрабатывает сбои
+узлов. Это программное обеспечение предоставляет программный интерфейс и
+распределяет управляющие объекты между доступными на данный момент узлами.
+Используя этот интерфейс, можно написать программу, которая запускается на
+кластере, не зная точного количества работающих узлов. Это промежуточное
+программное обеспечение работает как кластерная операционная система в
+пользовательском пространстве, позволяющая писать и запускать распределенные
+приложения прозрачно.
+
+**** Симметричная архитектура.
+Многие распределенные хранилища типа "ключ-значение" и параллельные файловые
+системы имеют симметричную архитектуру, в которой роли руководителя и
+подчиненного распределяются динамически, так что любой узел может выступать в
+роли руководителя, если текущий руководящий узел выходит из строя, однако, такая
+архитектура до сих пор не используется в планировщиках задач обработки больших
+данных и высокопроизводительных вычислений. Например, в планировщике задач
+обработки больших данных YARN, роли руководителя и подчиненного являются
+статическими. Восстановление после сбоя подчиненного узла осуществляется путем
+перезапуска работавшей на нем части задачи на одном из выживших узлов, а
+восстановление после сбоя руководящего узла осуществляется путем установки
+резервного руководящего узла\nbsp{}cite:murthy2011architecture. Оба руководящих
+узла управляются сервисом Zookeeper, который использует динамическое
+распределение ролей для обеспечения своей
+отказоустойчивости\nbsp{}cite:okorafor2012zookeeper. Таким образом, отсутствие
+динамического распределения ролей у планировщика YARN усложняет конфигурацию
+всего кластера: если бы динамические роли были доступны, Zookeeper был бы лишним
+в данной конфигурации.
+
+Такая же проблема возникает в планировщиках задач для высокопроизводительных
+вычислений, руководящий узел (на котором запущен главный процесс планировщика
+задач) является единой точкой сбоя.
+В\nbsp{}cite:uhlemann2006joshua,engelmann2006symmetric авторы реплицируют
+состояние планировщика задач на резервный узел, чтобы обеспечить высокую
+доступность руководящего узла, но роль резервного узла задается статически.
+Такое решение близко к симметричной архитектуре, поскольку не использует внешний
+сервис для обеспечения высокой доступности, но далеко от идеала, в котором
+резервный узел выбирается динамически.
+
+Наконец, наиболее простой вариант высокой доступности руководящего узла
+реализован в протоколе VRRP (Virtual Router Redundancy
+Protocol)\nbsp{}cite:knight1998rfc2338,hinden2004virtual,nadas2010rfc5798.
+Несмотря на то что протокол VRRP предоставляет динамическое распределение ролей,
+он не может быть использован в планировщиках задач, поскольку спроектирован для
+маршрутизаторов, за которыми стоят реверс прокси серверы. В таких серверах
+отсутствует состояние (очередь задач), которое необходимо восстановить после
+выхода из строя узла, поэтому их высокую доступность обеспечить проще. Это может
+быть реализовано даже без маршрутизаторов, используя вместо этого сервис
+Keepalived\nbsp{}cite:cassen2002keepalived.
+
+Симметричная архитектура выгодна для планировщиков задач, поскольку позволяет
+- сделать физические узлы взаимозаменяемыми,
+- реализовать динамическое распределение ролей руководителя и подчиненного и
+- реализовать автоматическое восстановление после сбоя любого из узлов.
+В последующих разделах будут описаны компоненты необходимые для написания
+параллельной программы и планировщика, которые устойчивы к сбоям узлов кластера.
+
+**** Иерархия управляющих объектов.
+Для распределения нагрузки узлы кластера объединяются в древовидную иерархию
+(см.\nbsp{}раздел [[#sec:node-discovery]]), и нагрузка распределяется между
+непосредственными соседями узла, так что при запуске управляющего объекта на
+подчиненном узле главный узел также получает часть его подчиненных объектов. Это
+делает систему симметричной и легкой в обслуживании: на каждом узле установлен
+один и тот же набор программного обеспечения, что позволяет заменить один узел
+другим при выходе из строя первого. Похожее архитектурное решение используется в
+хранилищах типа ключ-значение\nbsp{}cite:anderson2010couchdb,lakshman2010cassandra для
+обеспечения отказоустойчивости, однако автору неизвестны планировщики задач,
+которые используют данный подход.
+
+В отличие от функции ~main~ в программах на основе библиотеки передачи
+сообщений, первый (главный) управляющий объект выполняется только на одном узле,
+а дополнительные узлы используются по необходимости. Такое решение позволяет
+использовать произвольное количество узлов для запуска задачи и динамически
+менять это количество во время ее выполнения. Похожее решение используется в
+системах обработки больших объемов
+данных\nbsp{}cite:dean2008mapreduce,vavilapalli2013yarn \nbsp{}--- пользователь,
+запускающий задачу на кластере, не указывает количество узлов, фактические
+узлы\nbsp{}--- это узлы, на которых расположены входные файлы.
+
+С математической точки зрения управляющий объект \(K\) может быть определен как
+векторнозначный функционал, отображающий один управляющий объект на
+\(n\)-компонентный вектор управляющих объектов:
+\begin{equation*}
+    K(f): \mathbb{K} \rightarrow \mathbb{K}^n
+    \qquad
+    \mathbb{K}^n = \left\{ f: \mathbb{K} \rightarrow \mathbb{K}^n \right\}.
+\end{equation*}
+Специальный объект \(\mathbb{O}: \mathbb{K} \rightarrow \mathbb{K}^0\)
+используется для остановки рекурсии, и передается в качестве аргумента главному
+управляющему объекту программы. Аргумент управляющего объекта интерпретируется
+следующим образом.
+- Если объект является только что созданным объектом, то аргумент\nbsp{}--- это его
+  родительский объект.
+- В остальных случаях аргументом может являться любой объект (чаще всего
+  дочерний по отношению к текущему).
+
+Объекты обрабатываются в цикле, который начинается с выполнения главного
+объекта, затем внутри главного объекта создаются и асинхронно выполняются другие
+объекты. Цикл продолжается до тех пор пока какой-либо объект не вернет
+\(\mathbb{O}\). Поскольку вызов функции может породить сразу несколько объектов,
+они выполняются параллельно, что приводит к быстрому заполнению пула объектов.
+Поскольку объекты из пула могут выполняться в произвольном порядке, несколько
+потоков одновременно выбирают объекты для обработки, и при переполнении
+пула объекты могут быть переданы на другие узлы кластера без явного указания в
+исходном коде программы.
+
+Вычислительные объекты реализованы в виде замыканий (функторы в C++)\nbsp{}---
+объектов-функций, которые сохраняют в себе аргументы, ссылку на породивший их
+объект и данные из предметной области задачи. Данные обрабатываются либо при
+выполнении объекта, либо для параллельной обработки создаются подчиненные
+объекты. Когда обработка завершена, родительский объект вызывается с дочерним
+объектом в качестве аргумента для сбора результатов обработки.
+
+**** Обработка выхода узлов из строя.
+Наиболее распространенной стратегией при выходе из строя подчиненного узла
+является перезапуск выполнявшихся на нем объектов на рабочих узлах\nbsp{}---
+стратегия, которой следует язык Erlang при перезапуске подчиненных процессов\nbsp{}cite:armstrong2003thesis. Для того чтобы реализовать этот метод в рамках иерархии
+управляющих объектов, узел-отправитель сохраняет каждый объект, передаваемый на
+другие узлы кластера, и в случае отказа произвольного количества узлов, на
+которые были переданы объекты, их копии перераспределяются между оставшимися
+узлами без индивидуальной обработки программистом. Если больше не осталось
+узлов, на которые можно отправить объекты, то они выполняются локально. В
+отличие от "тяжеловесного" метода контрольных точек восстановления,
+используемого планировщиками задач HPC кластеров, древовидная иерархия узлов в
+паре с иерархией объектов позволяет автоматически продолжить выполнение
+программы при выходе из строя произвольного количества подчиненных узлов без
+перезапуска каких-либо процессов параллельной программы.
+
+Возможный подход к обработке выхода из строя главного узла (узла, на котором
+запускается главный управляющий объект) заключается в копировании этого главного
+объекта на резервный узел и синхронизации любых изменений между двумя копиями
+объекта посредством распределенных транзакций, однако, этот подход не
+соотносится с асинхронностью вычислительных ядер и слишком сложен в реализации.
+На практике, оказывается, что главный управляющий объект обычно не выполняет
+операции параллельно, а последовательно переходит от вычисления одного шага
+программы к вычислению другого, и, значит, имеет не больше одного подчиненного в
+каждый момент времени. (Каждый подчиненный объект представляет собой
+последовательный шаг вычислений, который может быть, а может не быть
+параллельным внутри.) Имея это в виду, можно упростить синхронизацию состояния
+главного объекта программы: отправить главный объект на подчиненный узел вместе
+с его подчиненным объектом. Тогда при выходе из строя главного узла, копия
+главного объекта принимает подчиненный объект (поскольку оба объекта находятся
+на одном и том же узле), и время на восстановление не тратится. Если же выходит
+из строя подчиненный узел, на который был отправлен подчиненный объект вместе с
+копией главного объекта, то подчиненный объект отправляется на оставшиеся узлы,
+и в худшем случае текущий шаг вычислений выполняется заново.
+
+Описанный выше подход предназначен для объектов, у которых нет объекта-родителя
+и которые имеют только один подчиненный объект в каждый момент времени, и
+повторяет механизм работы контрольных точек восстановления. Преимуществом
+данного подхода является то, что он
+- сохраняет состояние только между последовательными шагами вычислений (когда
+  оно занимает минимальный объем памяти),
+- сохраняет только актуальные данные и
+- использует для сохранения состояния оперативную память другого узла кластера,
+  а не дисковое хранилище.
+Этот подход позволяет выдержать выход из строя не более одного /любого/ узла
+кластера за один шаг вычислений или произвольного количества подчиненных узлов в
+любой момент работы программы.
+
+Далее следует пример работы алгоритма восстановления после сбоев
+(рис.\nbsp{}[[fig-fail-over-example]]).
+1. Исходное состояние. На начальном этапе вычислительный кластер не требует
+   никакой настройки за исключением настройки сети. Алгоритм предполагает полную
+   связность узлов кластера и лучше всего работает с древовидными топологиями, в
+   которых все узлы кластера соединены несколькими коммутаторами.
+2. Построение иерархии узлов. При первичной загрузке на всех узлах кластера
+   запускаются процессы-сервисы, которые совместно строят иерархию таких же
+   процессов поверх топологии сети кластера. Положение процесса-сервиса в
+   иерархии определяется позицией IP-адреса его узла в диапазоне IP-адресов
+   сети. Для установления связи каждый из процессов соединяется только с
+   предполагаемым руководящим процессом. В данном случае процесс на узле \(A\)
+   становится руководящим процессом для всех остальных. Иерархия может
+   измениться, только если новый узел присоединяется к кластеру или какой-либо из
+   узлов выходит из строя.
+3. Запуск главного управляющего объекта. Первый управляющий объект запускается
+   на одном из подчиненных узлов (узел \(B\)). Главный объект может иметь только
+   один подчиненный объект в каждый момент времени, и резервная копия главного
+   объекта посылается вместе с этим подчиненным объектом \(T_1\) на руководящий узел
+   \(A\). \(T_1\) представляет собой последовательный шаг программы. В программе
+   может быть произвольное количество последовательных шагов, и, когда узел
+   \(A\) выходит из строя, текущий шаг перезапускается с начала.
+4. Запуск подчиненных управляющих объектов. Управляющие объекты \(S_1\), \(S_2\),
+   \(S_3\) запускаются на подчиненных узлах кластера. Когда узел \(B\), \(C\)
+   или \(D\) выходит из строя, соответствующий руководящий управляющий объект перезапускает
+   завершившиеся некорректно подчиненные объекты (\(T_1\) перезапускает \(S_1\),
+   главный объект перезапускает \(T_1\) и т.д.). Когда выходит из строя узел
+   \(B\), главный объект восстанавливается из резервной копии.
+
+#+name: fig-fail-over-example
+#+header: :headers '("\\input{preamble}\\setdefaultlanguage{russian}")
+#+begin_src latex :file build/fail-over-example-ru.pdf :exports results :results raw
+\input{tex/preamble}
+\newcommand*{\spbuInsertFigure}[1]{%
+\vspace{2\baselineskip}%
+\begin{minipage}[b]{0.5\linewidth}%
+    \Large%
+    \input{#1}%
+\end{minipage}%
+}%
+\noindent%
+\spbuInsertFigure{tex/cluster-0}~\spbuInsertFigure{tex/frame-0}\newline
+\spbuInsertFigure{tex/frame-3-ru}~\spbuInsertFigure{tex/frame-4-ru}\newline
+\spbuInsertFigure{tex/legend-ru}
+#+end_src
+
+#+caption: Пример работы алгоритма восстановления после сбоев.
+#+label: fig-fail-over-example
+#+attr_latex: :width \textwidth
+#+RESULTS: fig-fail-over-example
+[[file:build/fail-over-example-ru.pdf]]
+
+**** Результаты тестирования.
+Методы отказоустойчивости были протестированы на физическом кластере
+(см.\nbsp{}табл.\nbsp{}[[tab-cluster]]) на примере программы, генерирующей
+взволнованную морскую поверхность, подробно описанной в разделе
+[[#sec:arma-algorithms]]. Программа состоит из серии фильтров, каждый из которых
+применяется к результату работы предыдущего. Некоторые из фильтров вычисляются
+параллельно, так что вся программа состоит из последовательно выполняющихся
+шагов, некоторые из которых внутри реализованы параллельно из соображений
+эффективности. Только наиболее ресурсоемкий этап программы (генерация
+взволнованной морской поверхности) выполняется параллельно на всех узлах, другие
+этапы выполняются параллельно на всех процессорных ядрах главного узла.
+
+#+name: tab-cluster
+#+caption: Конфигурация кластера, на котором проводились эксперименты.
+#+attr_latex: :booktabs t
+| CPU                 | Intel Xeon E5440, 2.83GHz |
+| RAM                 | 4Gb                       |
+| HDD                 | ST3250310NS, 7200rpm      |
+| Кол-во узлов        | 12                        |
+| Кол-во ядер на узел | 8                         |
+
+Программа была переписана под отказоустойчивую версию фреймворка, что
+потребовало лишь небольших изменений исходного кода для корректной обработки
+выхода из строя узла с главным объектом. Главный объект был помечен, чтобы
+фреймворк смог передать его на подчиненный узел вместе с подчиненным ему
+объектом. Другие изменения исходного кода были связаны с изменением программного
+интерфейса фреймворка. Таким образом, обеспечение отказоустойчивости посредством
+иерархии управляющих объектов, в основном, прозрачно для программиста и требует
+лишь маркировки главного объекта для его репликации на резервный узел.
+
+В ряде экспериментов была измерена производительность новой версии программы при
+выходе из строя различных типов узлов во время выполнения программы (номера
+пунктов соответствуют номерам графиков рис.\nbsp{}[[fig-benchmark]]):
+1) без выхода из строя узлов,
+2) выход из строя подчиненного узла (на котором генерируется часть взволнованной
+   поверхности),
+3) выход из строя главного узла (на котором запускается программа),
+4) выход из строя резервного узла (на который копируется главный объект
+   программы).
+Древовидная иерархия узлов со значением ветвления, равным 64, использовалась в
+экспериментах, для того чтобы удостовериться, что все подчиненные узлы кластера
+соединены с узлом, имеющим первый IP-адрес в диапазоне адресов подсети.
+Узел-жертва выводился из строя по прошествии фиксированного временного интервала
+после запуска программы равного примерно \(1/3\) времени работы программы на
+одном узле. Приложение мгновенно узнавало о выходе из строя узла, поскольку
+закрывалось соответствующее соединение; при реалистичном развитии событий,
+однако, выход из строя узла обнаружится по прошествии настраиваемого тайм-аута.
+Способ запуска для каждого эксперимента представлен в табл.\nbsp{}[[tab-benchmark]].
+Результаты экспериментов приведены на рис.\nbsp{}[[fig-benchmark]]
+и\nbsp{}[[fig-slowdown]].
+
+Эксперименты показали большую разницу в общей производительности приложения при
+выходе из строя различных типов узлов. Графики\nbsp{}2 и\nbsp{}3 на
+рис.\nbsp{}[[fig-benchmark]] показывают, что производительность в случае выхода из
+строя руководящего и подчиненного узлов одинакова. В случае отказа руководящего
+узла резервный узел сохраняет копию главного объекта и восстанавливает главный
+объект из нее, когда обнаруживает, что главный узел вышел из строя. В случае
+отказа подчиненного узла, главный узел перераспределяет невернувшиеся объекты
+между оставшимися подчиненными узлами. В обоих случаях состояние главного объекта
+программы не теряется, а значит не тратится время на его восстановление, что
+объясняет схожую производительность.
+
+График\nbsp{}4 на рис.\nbsp{}[[fig-benchmark]] показывает, что производительность в
+случае выхода из строя резервного узла гораздо ниже, чем в других случаях. Это
+происходит, потому что руководящий узел сохраняет состояние только текущего
+последовательного шага программы, в то время как резервный узел не только хранит
+копию этого состояния, но и выполняет этот шаг параллельно с другими
+подчиненными узлами. Так что, когда резервный узел выходит из строя, главный
+узел начинает выполнение текущего этапа с самого начала на произвольно выбранном
+выжившем узле.
+
+#+caption: Параметры экспериментов с алгоритмом восстановления после сбоев.
+#+name: tab-benchmark
+#+attr_latex: :booktabs t
+| Номер эксп. | Время до выхода из строя, сек. |
+|           1 |                                |
+|           2 |                             10 |
+|           3 |                             10 |
+|           4 |                             10 |
+
+Для оценки количества времени, которое теряется при выходе узла из строя, можно
+поделить общее время работы программы со сбоем на время работы программы без
+сбоев, но с количеством узлов минус один. Это отношение получается из того же
+самого эксперимента и представлено на рис.\nbsp{}[[fig-slowdown]]. Разница в
+производительности в случае выхода из строя руководящего и подчиненного узлов
+находится в пределах 5%, а в случае выхода из строя резервного узла\nbsp{}--- в
+пределах 50% для количества узлов меньше 6[fn::Измерение разницы для большего
+количества узлов не имеет смысла, поскольку программа завершается еще до
+наступления сбоя.]. Увеличение времени выполнения на 50%\nbsp{}--- это больше,
+чем \(1/3\) времени работы программы, после которого происходит сбой, однако
+отказ резервного узла требует некоторого времени, чтобы быть обнаруженным
+другими узлами: сбой узла обнаруживается только тогда, когда подчиненный объект,
+имеющий копию главного объекта, завершает свое выполнение и пытается вернуться
+на исходный узел к родителю. Мгновенное обнаружение сбоя узла требует внезапной
+остановки выполнения объектов, что может быть неприменимо для программ со
+сложной логикой.
+
+#+name: fig-benchmark
+#+begin_src R :file build/benchmark-xxx-ru.pdf
+# TODO
+#+end_src
+
+#+caption: Производительность программы генерации взволнованной морской поверхности при различных типах сбоев узлов.
+#+label: fig-benchmark
+#+RESULTS: fig-benchmark
+[[file:build/benchmark-xxx-ru.pdf]]
+
+Результаты экспериментов позволяют сделать вывод о том, что не важно, вышел ли
+из строя руководящий узел или подчиненный, общее время работы параллельной
+программы примерно равно времени ее работы без сбоев, но с уменьшенным на
+единицу количеством узлов, однако, в случае выхода из строя резервного узла
+потери в производительности гораздо больше.
+
+#+name: fig-slowdown
+#+begin_src R :file build/slowdown-xxx-ru.pdf
+# TODO
+#+end_src
+
+#+caption: Замедление программы генерации взволнованной морской поверхности при различных типах сбоев по сравнению с запуском без сбоев но с уменьшенным на единицу количеством узлов.
+#+label: fig-slowdown
+#+RESULTS: fig-slowdown
+[[file:build/slowdown-xxx-ru.pdf]]
+
+**** Обсуждение результатов тестирования.
+Алгоритм восстановления после сбоев гарантирует обработку выхода из строя одного
+узла на один последовательный шаг программы; больше сбоев может быть выдержано,
+если они не затрагивают руководящий узел. Алгоритм обрабатывает одновременный
+выход из строя всех подчиненных узлов, однако, если руководящий и резервный узлы
+вместе выходят из строя, у программы нет ни единого шанса продолжить работу. В
+этом случае состояние текущего шага вычислений теряется полностью, и его можно
+восстановить только перезапуском программы с начала.
+
+Управляющие объекты являются абстракциями, отделяющими распределенное приложение
+от физических устройств: для непрерывной работы программы не важно, сколько
+узлов кластера в данный момент работают. Управляющие объекты позволяют
+отказаться от выделения физического резервного узла для обеспечения устойчивости
+к выходу из строя руководящего узла: в рамках иерархии управляющих объектов
+любой физический узел (кроме руководящего) может выполнять роль резервного.
+Наконец, иерархия управляющих объектов позволяет обрабатывать сбои прозрачно для
+программиста, определяя порядок действий из внутреннего состояния объекта.
+
+Проведенные эксперименты показывают, что параллельной программе необходимо иметь
+несколько последовательных этапов выполнения, чтобы сделать ее устойчивой к
+сбоям узлов, иначе выход из строя резервного узла фактически вызывает
+восстановление исходного состояния программы. Несмотря на то что вероятность
+сбоя резервного узла меньше вероятности сбоя одного из подчиненных узлов, это не
+повод потерять все данные, когда выполнявшаяся продолжительное время программа
+почти завершилась. В общем случае, чем больше последовательных этапов вычислений
+содержит параллельная программа, тем меньше времени потеряется в случае сбоя
+резервного узла, и, аналогично, чем больше параллельных частей содержит каждый
+последовательный этап, тем меньше времени потеряется при сбое руководящего или
+подчиненного узла. Другими словами, чем больше количество узлов, на которое
+масштабируется программа, тем она становится более устойчива к сбою узлов
+кластера.
+
+Хотя это не было показано в экспериментах, Фабрика не только обеспечивает
+устойчивость к выходу из строя узлов кластера, но и позволяет автоматически
+вводить новые узлы в кластер и распределять на них часть управляющих объектов из
+уже запущенных программ. В контексте фреймворка этот процесс тривиален,
+поскольку не требует перезапуска незавершившихся управляющих объектов и
+копирования их состояния, и не изучался экспериментально в данной работе.
+
+Теоретически, отказоустойчивость, основанная на иерархии узлов и управляющих
+объектов, может быть реализована поверх библиотеки передачи сообщений без потери
+общности. Хотя использование незагруженных узлов вместо вышедших из строя в
+рамках такой библиотеки представляет определенную сложность, поскольку
+количество узлов, на которых запущена программа, в таких библиотеках
+фиксировано, однако, выделение достаточно большого количества узлов для
+программы будет достаточно для обеспечения ее отказоустойчивости. В то же время,
+реализация отказоустойчивости, основанной на иерархии, внутри самой библиотеки
+передачи сообщений непрактична, поскольку это потребует сохранения текущего
+состояния параллельной программы, объем которого эквивалентен всей занимаемой ею
+памяти на каждом узле кластера, что, в свою очередь, не позволит сделать такой
+подход эффективнее контрольных точек восстановления.
+
+Слабым местом описанных методов является период времени, начиная с отказа
+руководящего узла и заканчивая обнаружением сбоя подчиненным узлом,
+восстановлением главного объекта из копии и получением нового подчиненного
+объекта вместе с копией его родителя подчиненным узлом. Если в любой момент
+времени из этого периода резервный узел выходит из строя, то состояние
+выполнения программы полностью теряется без возможности его восстановить, кроме
+как перезапуском с самого начала. Протяженность этого опасного промежутка
+времени может быть минимизирована, но полностью исключить вероятность внезапного
+завершения программы невозможно. Этот результат согласуется с исследованиями
+/теории невыполнимости/, в рамках которой доказывается невозможность
+распределенного консенсуса с хотя бы одним процессом, дающим
+сбой\nbsp{}cite:fischer1985impossibility и невозможность надежной передачи
+данных в случае сбоя одного из узлов\nbsp{}cite:fekete1993impossibility.
+
+** Сравнение предложенного подхода с современными подходами
+Современный подход к разработке и запуску параллельных программ на кластере
+заключается в использовании библиотеки передачи сообщений MPI и планировщика
+задач, и, несмотря на то что этот подход имеет высокую эффективность с точки
+зрения параллельных вычислений, он недостаточно гибок, чтобы вместить в себя
+динамическую балансировку нагрузки и автоматическое обеспечение
+отказоустойчивости. Программы, написанные с помощью MPI, обычно предполагают
+- равномерную загрузку каждого процессора,
+- бесперебойное и надежное выполнение пакетных задач, и
+- постоянное число параллельных процессов/потоков во время выполнения, равное
+  общему количеству процессоров.
+Первое предположение несправедливо для программы моделирования морского
+волнения, поскольку модель АР требует динамической балансировки нагрузки между
+процессорами для генерации каждой части поверхности только когда генерация всех
+зависимых частей уже закончена. Последнее предположение также несправедливо,
+поскольку в угоду эффективности каждая часть записывается в файл отдельным
+потоком асинхронно. Оставшееся предположение относится не к самой программе, а к
+планировщику задач, и несправедливо для больших вычислительных кластеров, в
+которых узлы часто выходят из строя, а планировщик перезапускает задачу из
+контрольной точки восстановления, серьезно замедляя ее. Таким образом, идея
+предлагаемого подхода\nbsp{}--- дать параллельным программам больше гибкости:
+- предоставить динамическую балансировку нагрузки путем выполнения
+  последовательных, параллельных изнутри шагов программы в режиме конвейера,
+- перезапускать только затронутые выходом из строя узла процессы, и
+- выполнять программу на как можно большем количестве узлов, которое доступно в
+  кластере.
+В данном разделе обсуждаются преимущества и недостатки этого подхода.
+
+В сравнении с портируемыми системами пакетных заданий (PBS) для распределения
+нагрузки на узлы кластера предлагаемый подход использует легковесные управляющие
+объекты вместо тяжеловесных параллельных задач. Во-первых, это позволяет иметь
+очереди объектов на каждом узле, вместо того чтобы иметь одну очередь задач на
+кластер. Зернистость управляющих объектов гораздо выше, чем у пакетных задач, и,
+несмотря на то что время их выполнения не может быть надежно спрогнозировано
+(также как и время выполнения пакетных задач), объекты из нескольких
+параллельных программ могут быть динамически распределены между одним и тем же
+множеством узлов кластера, делая нагрузку более равномерной. Недостатком
+является необходимость в большем количестве оперативной памяти для выполнения
+нескольких задач на одних и тех же узлах, а также то, что выполнение каждой
+программы может занять больше времени из-за общих очередей управляющих объектов.
+Во-вторых, предлагаемый подход использует динамическое распределение ролей
+руководителя и подчиненного среди узлов кластера вместо их статического
+присвоения конкретным физическим узлам. Это позволяет сделать узлы
+взаимозаменяемыми, что необходимо для обеспечения отказоустойчивости. Таким
+образом, одновременное выполнение нескольких параллельных программ на одном и
+том же множестве узлов может увеличить пропускную способность кластера, но также
+может уменьшить их производительность, взятую по отдельности, а динамическое
+распределение ролей является основанием, на котором строится устойчивость к
+сбоям.
+
+В сравнении с MPI для разбиения программы на отдельные сущности предлагаемый
+подход использует легковесные управляющие объекты вместо тяжеловесных процессов.
+Во-первых, это позволяет определить число обрабатываемых параллельно сущностей,
+исходя из задачи, а не архитектуры компьютера или кластера. Это поощряет
+программиста создавать столько объектов, сколько необходимо, руководствуясь
+алгоритмом или ограничениями на размер структур данных из предметной области
+задачи. В программе моделирования морского волнения минимальный размер каждой
+части поверхности зависит от числа коэффициентов вдоль каждой из осей, и, в то
+же время, количество частей должно быть больше, чем количество процессоров, для
+того чтобы сделать нагрузку на процессоры более равномерной. Учитывая эти
+ограничения оптимальный размер части определяется во время выполнения, и, в
+общем случае, не совпадает с количеством параллельных процессов. Недостатком
+является то, что, чем больше управляющих объектов в программе, тем больше общих
+структур данных копируется на один и тот же узел вместе с подчиненными
+объектами; проблема решается введением промежуточного слоя объектов, что в свою
+очередь увеличивает сложность программы. Во-вторых, иерархия управляющих
+объектов совместно с иерархией узлов позволяет автоматически пересчитывать
+завершившиеся некорректно подчиненные объекты на выживших узлах кластера в случае
+выхода из строя оборудования. Это возможно, поскольку ход выполнения программы
+сохраняется в каждом объекте, а не в глобальных переменных, как это делается в
+программах MPI. Дублируя состояние на подчиненные узлы, система пересчитывает
+только объекты из поврежденных процессов, а не программу целиком. Таким образом,
+переход от процессов к управляющим объектам может увеличить производительность
+параллельной программы путем динамической балансировки нагрузки, но также может
+повлиять на ее масштабируемость на большое количество узлов из-за дублирования
+состояния хода выполнения.
+
+Может показаться, что три составляющих предлагаемого подхода\nbsp{}---
+управляющие объекты, конвейеры и иерархии\nbsp{}--- ортогональны, но, на самом
+деле, они дополняют друг друга. Если бы управляющие объекты не содержали в себе
+состояние хода выполнения программы, то было бы невозможно пересчитать
+завершившиеся некорректно подчиненные объекты и обеспечить отказоустойчивость.
+Если бы иерархии узлов не было, то было бы невозможно распределить нагрузку
+между узлами кластера, поскольку все узлы одинаковы без иерархии. Если бы для
+каждого устройства не было конвейера, то было бы невозможно обрабатывать
+управляющие объекты асинхронно и реализовать динамическую балансировку нагрузки.
+Эти три сущности образуют замкнутую систему, в которую нечего добавить и из
+которой нечего удалить\nbsp{}--- надежную основу для любой распределенной
+программы.
+
+Подводя итог, можно сказать, что управляющие объекты придают гибкости
+параллельным программам: они балансируют снижение производительности за счет
+использования общих очередей ее увеличением за счет динамической балансировки
+нагрузки. Требуя больше оперативной памяти для работы, они позволяют выполнять
+сразу несколько параллельных программ одновременно на всех узлах кластера без
+простаивания в очереди задач, и превращают кластер в единую вычислительную
+систему, которая делает все возможное для непрерывной работы распределенных
+приложений.
+
+* Заключение
+**** Итоги исследования.
+В изучении возможностей математического аппарата для имитационного моделирования
+морского волнения, выходящего за рамки линейной теории волн, были достигнуты
+следующие основные результаты.
+- Процесс АРСС был использован для моделирования морских волн произвольных
+  амплитуд. Интегральные характеристики генерируемой взволнованной поверхности
+  были верифицированы путем сопоставления с характеристиками реальной морской
+  поверхности.
+- Аналитическая формула для определения давлений была использована для
+  вычисления поля потенциала скорости под генерируемой поверхностью. Получившееся
+  поле потенциалов скоростей было верифицировано путем сравнения с полем,
+  вычисляемым по формулам из линейной теории волн. Аналитическая формула
+  эффективна с вычислительной точки зрения, поскольку все интегралы в ней
+  записываются как преобразования Фурье, для которых существуют
+  высокопроизводительные реализации.
+
+**** Перспективы дальнейших исследований.
+Одной из тем дальнейших исследований является изучение возможности генерации
+волн произвольных профилей на базе смешанного процесса АРСС. Другим направлением
+является интеграция разработанной модели и формулы расчета давлений в
+существующие пакеты прикладного программного обеспечения.
+
+* Выводы
+Результаты исследования позволяют сделать вывод о том, что задача вычисления
+давлений под реальной морской поверхностью может быть решена аналитически,
+минуя предположения линейной теории волн и теории волн малой амплитуды. Это
+решение в паре с моделью АРСС морского волнения, способной
+генерировать волны произвольных амплитуд, может быть использовано для расчета
+влияния колебаний волн на поведение динамического объекта в открытом море, и
+дает более точные результаты чем аналогичное решение для волн малых амплитуд.
+
+Результаты проведенных численных экспериментов позволяют сделать вывод о том,
+что как генерация взволнованной поверхности так и расчет гидродинамических
+давлений могут быть реализованы эффективно с использованием алгоритмов быстрого
+преобразования Фурье, и длительные сессии имитационного моделирования могут
+проводиться.
+
+Разработанный в работе математический аппарат и его численная реализация могут
+стать основой виртуального полигона, предназначенного для расчетов динамики
+морских объектов.
+
+* Благодарности
+Графики в этой работе были подготовлены с помощью языка для статистических
+вычислений R\nbsp{}cite:rlang2016,Sarkar2008lattice и программного обеспечения
+Graphviz\nbsp{}cite:Gansner00anopen. Документ был подготовлен с использованием
+Org-mode\nbsp{}cite:Schulte2011org2,Schulte2011org1,Dominik2010org для GNU
+Emacs, предоставляющего вычислительное окружение для воспроизводимых
+исследований. Это означает, что все графики можно воспроизвести и
+соответствующие утверждения проверить, скопировав репозиторий
+диссертации[fn:repo], установив Emacs и экспортировав документ.
+
+Исследования были проведены на вычислительных ресурсах ресурсного центра
+"Вычислительный центр СПбГУ" (\mbox{T-EDGE96} \mbox{HPC-0011828-001}) в рамках
+грантов РФФИ (проекты\nbsp{}\mbox{16-07-01111}, \mbox{16-07-00886},
+\mbox{16-07-01113}).
+
+[fn:repo] [[https://github.com/igankevich/arma-thesis]]
+
+* Список сокращений и условных обозначений
+- <<<MPP>>> :: Massively Parallel Processing, класс вычислительных систем с разделенной памятью.
+- <<<SMP>>> :: Symmetric Multi-Processing, класс вычислительных систем с общей памятью.
+- <<<АКФ>>> :: автоковариационная функция.
+- <<<БПФ>>> :: быстрое преобразование Фурье.
+- <<<ГПСЧ>>> :: генератор псевдослучайных чисел.
+- <<<ГУ>>> :: граничное условие.
+- <<<ДУЧП>>> :: дифференциальное уравнение в частных производных.
+- <<<НБП>>> :: нелинейное безынерционное преобразование.
+- <<<АР>>> :: процесс авторегрессии.
+- <<<АРСС>>> :: процесс авторегрессии скользящего среднего.
+- <<<СС>>> :: процесс скользящего среднего.
+- <<<ЛХ>>> :: модель Лонге---Хиггинса.
+- <<<LAMP>>> :: Large Amplitude Motion Programme, программа для моделирования качки судна на морских волнах.
+- <<<ЦПТ>>> :: центральная предельная теорема.
+- <<<ПМ>>> :: аппроксимация Пирсона---Московица для спектра морского волнения.
+- <<<ЮУ>>> :: система уравнений Юла---Уокера.
+- <<<МНК>>> :: метод наименьших квадратов.
+- <<<ФПР>>> :: функция плотности распределения.
+- <<<ФР>>> :: функция распределения.
+- <<<BSP>>> :: Bulk Synchronous Parallel.
+- <<<OpenCL>>> :: Open Computing Language.
+- <<<OpenMP>>> :: Open Multi-Processing.
+- <<<MPI>>> :: Message Passing Interface.
+- <<<POSIX>>> :: Portable Operating System Interface.
+- <<<FMA>>> :: Fused multiply-add.
+- <<<DCMT>>> :: Dynamic creation of Mersenne Twisters.
+- <<<GSL>>> :: GNU Scientific Library.
+- <<<BLAS>>> :: Basic Linear Algebra Sub-programmes.
+- <<<LAPACK>>> :: Linear Algebra Package.
+- <<<DNS>>> :: Domain Name System.
+- <<<HPC>>> ::  High-performance computing.
+
+#+begin_export latex
+\input{postamble}
+#+end_export
+
+bibliographystyle:ugost2008
+bibliography:bib/refs.bib
+
+* Приложение
+** Вывод формулы модели Лонге---Хиггинса
+:PROPERTIES:
+:CUSTOM_ID: longuet-higgins-derivation
+:END:
+
+Двухмерная система уравнений\nbsp{}eqref:eq-problem в рамках линейной теории
+волн записывается как
+\begin{align*}
+    & \phi_{xx} + \phi_{zz} = 0,\\
+    & \zeta(x,t) = -\frac{1}{g} \phi_t, & \text{на }z=\zeta(x,t),
+\end{align*}
+где \(\frac{p}{\rho}\) включено в \(\phi_t\). Решение уравнения Лапласа ищется в
+виде ряда Фурье cite:kochin1966theoretical:
+\begin{equation*}
+    \phi(x,z,t) = \int\limits_{0}^{\infty} e^{k z}
+    \left[ A(k, t) \cos(k x) + B(k, t) \sin(k x) \right] dk.
+\end{equation*}
+Подставляя его в граничное условие, получаем
+\begin{align*}
+    \zeta(x,t) &= -\frac{1}{g} \int\limits_{0}^{\infty}
+    \left[ A_t(k, t) \cos(k x) + B_t(k, t) \sin(k x) \right] dk \\
+    &= -\frac{1}{g} \int\limits_{0}^{\infty} C_t(k, t) \cos(kx + \epsilon(k, t)).
+\end{align*}
+Здесь \(\epsilon\)\nbsp{}--- белый шум, а \(C_t\) включает в себя значение \(dk\).
+Подставляя бесконечную сумму вместо интеграла, получаем двухмерную форму
+ур.\nbsp{}[[eq-longuet-higgins]].
diff --git a/arma-thesis.org b/arma-thesis.org
@@ -0,0 +1,3226 @@
+# Local Variables:
+# org-ref-default-bibliography ("bib/refs.bib")
+# org-latex-image-default-width nil
+# org-latex-caption-above nil
+# org-latex-hyperref-template "\\hypersetup{\n pdfauthor={%a},\n pdftitle={%t},\n pdfkeywords={%k},\n pdfsubject={%d},\n pdfcreator={%c},\n pdflang={%L},\n unicode={true}\n}\n\\setdefaultlanguage{%l}\n"
+# org-export-latex-tables-hline "\\midrule"
+# org-export-latex-tables-tstart "\\toprule"
+# org-export-latex-tables-tend "\\bottomrule"
+# eval: (add-to-list 'org-latex-classes '("gost" "\\documentclass{gost} [DEFAULT-PACKAGES] [PACKAGES] [EXTRA]" ("\\section{%s}" . "\\section*{%s}") ("\\subsection{%s}" . "\\subsection*{%s}") ("\\subsubsection{%s}" . "\\subsubsection*{%s}") ("\\paragraph{%s}" . "\\paragraph*{%s}") ("\\subparagraph{%s}" . "\\subparagraph*{%s}")))
+# End:
+
+#+TITLE: High-performance ocean wave simulation model for studying marine object behaviour
+#+AUTHOR: Ivan Gankevich
+#+DATE: St. Petersburg, 2017
+#+LANGUAGE: en
+#+LATEX_CLASS: gost
+#+LATEX_CLASS_OPTIONS: [hidelinks,fontsize=14pt,paper=a4,pagesize,DIV=calc,noenddot]
+#+LATEX_HEADER_EXTRA: \input{preamble}
+#+LATEX_HEADER_EXTRA: \organization{Saint Petersburg State University}
+#+LATEX_HEADER_EXTRA: \manuscript{}
+#+LATEX_HEADER_EXTRA: \degree{thesis for candidate of sciences degree}
+#+LATEX_HEADER_EXTRA: \speciality{Speciality 05.13.18\\Mathematical modeling, numerical methods and programme complexes}
+#+LATEX_HEADER_EXTRA: \supervisor{Supervisor\\Alexander Degtyarev}
+#+LATEX_HEADER_EXTRA: \newcites{published}{Publications on the subject of thesis}
+#+OPTIONS: todo:nil title:nil ':t H:5
+#+STARTUP: indent
+#+PROPERTY: header-args:R :results graphics :exports results
+
+* Config                                                           :noexport:
+** Produce data for Q-Q and ACF plots
+#+begin_src sh :exports none :results verbatim
+root=$(pwd)
+for testname in propagating_wave standing_wave
+do
+    wd=$root/build/$testname
+    rm -rf $wd
+    mkdir -p $wd
+    cd $wd
+    arma -c $root/config/$testname.arma 2>&1
+done
+#+end_src
+
+#+RESULTS:
+#+begin_example
+Input file                     = /home/igankevich/workspace/phd-diss/config/propagating_wave.arma
+ACF grid size                  = (20,10,10)
+ACF grid patch size            = (0.526316,0.555556,0.555556)
+Output grid size               = (200,40,40)
+Output grid patch size         = (1,1,1)
+AR order                       = (10,10,10)
+Do least squares               = 0
+ACF function                   = propagating_wave
+Model                          = MA
+MA algorithm                   = fixed_point_iteration
+Verification scheme            = manual
+ACF variance = 5
+fixed_point_iteration:Iteration=0, var_wn=2.70831
+fixed_point_iteration:Iteration=1, var_wn=1.93791
+fixed_point_iteration:Iteration=2, var_wn=1.54801
+fixed_point_iteration:Iteration=3, var_wn=1.31202
+fixed_point_iteration:Iteration=4, var_wn=1.15328
+fixed_point_iteration:Iteration=5, var_wn=1.0386
+fixed_point_iteration:Iteration=6, var_wn=0.951442
+fixed_point_iteration:Iteration=7, var_wn=0.882674
+fixed_point_iteration:Iteration=8, var_wn=0.82688
+fixed_point_iteration:Iteration=9, var_wn=0.780623
+fixed_point_iteration:Iteration=10, var_wn=0.74161
+fixed_point_iteration:Iteration=11, var_wn=0.708244
+fixed_point_iteration:Iteration=12, var_wn=0.679374
+fixed_point_iteration:Iteration=13, var_wn=0.654145
+fixed_point_iteration:Iteration=14, var_wn=0.63191
+fixed_point_iteration:Iteration=15, var_wn=0.612168
+fixed_point_iteration:Iteration=16, var_wn=0.594523
+fixed_point_iteration:Iteration=17, var_wn=0.578663
+fixed_point_iteration:Iteration=18, var_wn=0.564333
+fixed_point_iteration:Iteration=19, var_wn=0.551325
+fixed_point_iteration:Iteration=20, var_wn=0.539469
+fixed_point_iteration:Iteration=21, var_wn=0.528623
+fixed_point_iteration:Iteration=22, var_wn=0.518666
+fixed_point_iteration:Iteration=23, var_wn=0.509497
+fixed_point_iteration:Iteration=24, var_wn=0.50103
+fixed_point_iteration:Iteration=25, var_wn=0.493191
+fixed_point_iteration:Iteration=26, var_wn=0.485916
+fixed_point_iteration:Iteration=27, var_wn=0.479148
+fixed_point_iteration:Iteration=28, var_wn=0.472841
+fixed_point_iteration:Iteration=29, var_wn=0.466951
+fixed_point_iteration:Iteration=30, var_wn=0.461442
+fixed_point_iteration:Iteration=31, var_wn=0.456279
+fixed_point_iteration:Iteration=32, var_wn=0.451435
+fixed_point_iteration:Iteration=33, var_wn=0.446882
+fixed_point_iteration:Iteration=34, var_wn=0.442597
+fixed_point_iteration:Iteration=35, var_wn=0.43856
+fixed_point_iteration:Iteration=36, var_wn=0.434752
+fixed_point_iteration:Iteration=37, var_wn=0.431155
+fixed_point_iteration:Iteration=38, var_wn=0.427755
+fixed_point_iteration:Iteration=39, var_wn=0.424538
+fixed_point_iteration:Iteration=40, var_wn=0.42149
+fixed_point_iteration:Iteration=41, var_wn=0.418601
+fixed_point_iteration:Iteration=42, var_wn=0.415859
+fixed_point_iteration:Iteration=43, var_wn=0.413256
+fixed_point_iteration:Iteration=44, var_wn=0.410782
+fixed_point_iteration:Iteration=45, var_wn=0.40843
+fixed_point_iteration:Iteration=46, var_wn=0.406191
+fixed_point_iteration:Iteration=47, var_wn=0.404059
+fixed_point_iteration:Iteration=48, var_wn=0.402029
+fixed_point_iteration:Iteration=49, var_wn=0.400092
+fixed_point_iteration:Iteration=50, var_wn=0.398246
+fixed_point_iteration:Iteration=51, var_wn=0.396483
+fixed_point_iteration:Iteration=52, var_wn=0.3948
+fixed_point_iteration:Iteration=53, var_wn=0.393193
+fixed_point_iteration:Iteration=54, var_wn=0.391656
+fixed_point_iteration:Iteration=55, var_wn=0.390188
+fixed_point_iteration:Iteration=56, var_wn=0.388782
+fixed_point_iteration:Iteration=57, var_wn=0.387438
+fixed_point_iteration:Iteration=58, var_wn=0.386151
+fixed_point_iteration:Iteration=59, var_wn=0.384918
+fixed_point_iteration:Iteration=60, var_wn=0.383738
+fixed_point_iteration:Iteration=61, var_wn=0.382606
+fixed_point_iteration:Iteration=62, var_wn=0.381522
+fixed_point_iteration:Iteration=63, var_wn=0.380482
+fixed_point_iteration:Iteration=64, var_wn=0.379485
+fixed_point_iteration:Iteration=65, var_wn=0.378528
+fixed_point_iteration:Iteration=66, var_wn=0.37761
+fixed_point_iteration:Iteration=67, var_wn=0.376729
+fixed_point_iteration:Iteration=68, var_wn=0.375882
+fixed_point_iteration:Iteration=69, var_wn=0.37507
+fixed_point_iteration:Iteration=70, var_wn=0.374289
+fixed_point_iteration:Iteration=71, var_wn=0.373539
+fixed_point_iteration:Iteration=72, var_wn=0.372818
+fixed_point_iteration:Iteration=73, var_wn=0.372126
+fixed_point_iteration:Iteration=74, var_wn=0.37146
+fixed_point_iteration:Iteration=75, var_wn=0.37082
+fixed_point_iteration:Iteration=76, var_wn=0.370204
+fixed_point_iteration:Iteration=77, var_wn=0.369612
+fixed_point_iteration:Iteration=78, var_wn=0.369042
+fixed_point_iteration:Iteration=79, var_wn=0.368494
+fixed_point_iteration:Iteration=80, var_wn=0.367966
+fixed_point_iteration:Iteration=81, var_wn=0.367458
+fixed_point_iteration:Iteration=82, var_wn=0.366969
+fixed_point_iteration:Iteration=83, var_wn=0.366499
+fixed_point_iteration:Iteration=84, var_wn=0.366046
+fixed_point_iteration:Iteration=85, var_wn=0.36561
+fixed_point_iteration:Iteration=86, var_wn=0.365189
+fixed_point_iteration:Iteration=87, var_wn=0.364785
+fixed_point_iteration:Iteration=88, var_wn=0.364395
+fixed_point_iteration:Iteration=89, var_wn=0.364019
+fixed_point_iteration:Iteration=90, var_wn=0.363657
+fixed_point_iteration:Iteration=91, var_wn=0.363309
+fixed_point_iteration:Iteration=92, var_wn=0.362973
+fixed_point_iteration:Iteration=93, var_wn=0.362649
+fixed_point_iteration:Iteration=94, var_wn=0.362337
+fixed_point_iteration:Iteration=95, var_wn=0.362036
+fixed_point_iteration:Iteration=96, var_wn=0.361746
+fixed_point_iteration:Iteration=97, var_wn=0.361466
+fixed_point_iteration:Iteration=98, var_wn=0.361197
+fixed_point_iteration:Iteration=99, var_wn=0.360937
+fixed_point_iteration:Iteration=100, var_wn=0.360686
+fixed_point_iteration:Iteration=101, var_wn=0.360444
+fixed_point_iteration:Iteration=102, var_wn=0.360211
+fixed_point_iteration:Iteration=103, var_wn=0.359986
+fixed_point_iteration:Iteration=104, var_wn=0.359769
+fixed_point_iteration:Iteration=105, var_wn=0.35956
+fixed_point_iteration:Iteration=106, var_wn=0.359358
+fixed_point_iteration:Iteration=107, var_wn=0.359163
+fixed_point_iteration:Iteration=108, var_wn=0.358975
+fixed_point_iteration:Iteration=109, var_wn=0.358794
+fixed_point_iteration:Iteration=110, var_wn=0.358619
+fixed_point_iteration:Iteration=111, var_wn=0.35845
+fixed_point_iteration:Iteration=112, var_wn=0.358288
+fixed_point_iteration:Iteration=113, var_wn=0.35813
+fixed_point_iteration:Iteration=114, var_wn=0.357979
+fixed_point_iteration:Iteration=115, var_wn=0.357832
+fixed_point_iteration:Iteration=116, var_wn=0.357691
+fixed_point_iteration:Iteration=117, var_wn=0.357555
+fixed_point_iteration:Iteration=118, var_wn=0.357423
+fixed_point_iteration:Iteration=119, var_wn=0.357296
+fixed_point_iteration:Iteration=120, var_wn=0.357173
+fixed_point_iteration:Iteration=121, var_wn=0.357055
+fixed_point_iteration:Iteration=122, var_wn=0.356941
+fixed_point_iteration:Iteration=123, var_wn=0.356831
+fixed_point_iteration:Iteration=124, var_wn=0.356724
+fixed_point_iteration:Iteration=125, var_wn=0.356621
+fixed_point_iteration:Iteration=126, var_wn=0.356522
+fixed_point_iteration:Iteration=127, var_wn=0.356426
+fixed_point_iteration:Iteration=128, var_wn=0.356334
+fixed_point_iteration:Iteration=129, var_wn=0.356244
+fixed_point_iteration:Iteration=130, var_wn=0.356158
+fixed_point_iteration:Iteration=131, var_wn=0.356075
+fixed_point_iteration:Iteration=132, var_wn=0.355994
+fixed_point_iteration:Iteration=133, var_wn=0.355917
+fixed_point_iteration:Iteration=134, var_wn=0.355842
+fixed_point_iteration:Iteration=135, var_wn=0.355769
+fixed_point_iteration:Iteration=136, var_wn=0.355699
+fixed_point_iteration:Iteration=137, var_wn=0.355632
+fixed_point_iteration:Iteration=138, var_wn=0.355567
+fixed_point_iteration:Iteration=139, var_wn=0.355504
+fixed_point_iteration:Iteration=140, var_wn=0.355443
+fixed_point_iteration:Iteration=141, var_wn=0.355384
+fixed_point_iteration:Iteration=142, var_wn=0.355327
+fixed_point_iteration:Iteration=143, var_wn=0.355273
+fixed_point_iteration:Iteration=144, var_wn=0.35522
+fixed_point_iteration:Iteration=145, var_wn=0.355169
+fixed_point_iteration:Iteration=146, var_wn=0.355119
+fixed_point_iteration:Iteration=147, var_wn=0.355072
+fixed_point_iteration:Iteration=148, var_wn=0.355026
+fixed_point_iteration:Iteration=149, var_wn=0.354981
+fixed_point_iteration:Iteration=150, var_wn=0.354938
+fixed_point_iteration:Iteration=151, var_wn=0.354897
+fixed_point_iteration:Iteration=152, var_wn=0.354856
+fixed_point_iteration:Iteration=153, var_wn=0.354818
+fixed_point_iteration:Iteration=154, var_wn=0.35478
+fixed_point_iteration:Iteration=155, var_wn=0.354744
+fixed_point_iteration:Iteration=156, var_wn=0.354709
+fixed_point_iteration:Iteration=157, var_wn=0.354676
+fixed_point_iteration:Iteration=158, var_wn=0.354643
+fixed_point_iteration:Iteration=159, var_wn=0.354612
+fixed_point_iteration:Iteration=160, var_wn=0.354581
+fixed_point_iteration:Iteration=161, var_wn=0.354552
+fixed_point_iteration:Iteration=162, var_wn=0.354524
+fixed_point_iteration:Iteration=163, var_wn=0.354496
+fixed_point_iteration:Iteration=164, var_wn=0.35447
+fixed_point_iteration:Iteration=165, var_wn=0.354444
+fixed_point_iteration:Iteration=166, var_wn=0.35442
+fixed_point_iteration:Iteration=167, var_wn=0.354396
+fixed_point_iteration:Iteration=168, var_wn=0.354373
+fixed_point_iteration:Iteration=169, var_wn=0.35435
+fixed_point_iteration:Iteration=170, var_wn=0.354329
+fixed_point_iteration:Iteration=171, var_wn=0.354308
+fixed_point_iteration:Iteration=172, var_wn=0.354288
+fixed_point_iteration:Iteration=173, var_wn=0.354269
+fixed_point_iteration:Iteration=174, var_wn=0.35425
+fixed_point_iteration:Iteration=175, var_wn=0.354232
+fixed_point_iteration:Iteration=176, var_wn=0.354214
+fixed_point_iteration:Iteration=177, var_wn=0.354198
+fixed_point_iteration:Iteration=178, var_wn=0.354181
+fixed_point_iteration:Iteration=179, var_wn=0.354165
+fixed_point_iteration:Iteration=180, var_wn=0.35415
+fixed_point_iteration:Iteration=181, var_wn=0.354136
+fixed_point_iteration:Iteration=182, var_wn=0.354121
+fixed_point_iteration:Iteration=183, var_wn=0.354108
+fixed_point_iteration:Iteration=184, var_wn=0.354094
+fixed_point_iteration:Iteration=185, var_wn=0.354082
+fixed_point_iteration:Iteration=186, var_wn=0.354069
+fixed_point_iteration:Iteration=187, var_wn=0.354057
+fixed_point_iteration:Iteration=188, var_wn=0.354046
+fixed_point_iteration:Iteration=189, var_wn=0.354034
+fixed_point_iteration:Iteration=190, var_wn=0.354024
+fixed_point_iteration:Iteration=191, var_wn=0.354013
+fixed_point_iteration:Iteration=192, var_wn=0.354003
+fixed_point_iteration:Iteration=193, var_wn=0.353994
+WN variance = 0.353994
+Input file                     = /home/igankevich/workspace/phd-diss/config/standing_wave.arma
+ACF grid size                  = (10,10,10)
+ACF grid patch size            = (0.277778,0.555556,0.555556)
+Output grid size               = (200,40,40)
+Output grid patch size         = (1,1,1)
+AR order                       = (7,7,7)
+Do least squares               = 0
+ACF function                   = standing_wave
+Model                          = AR
+MA algorithm                   = fixed_point_iteration
+Verification scheme            = manual
+ACF variance = 5
+WN variance = 0.00261323
+Zeta size = (193,33,33)
+NaN: 29, -nan, 1.798e+36, -1.04284e+38, inf, -1.798e+36, -1.798e+36
+#+end_example
+
+* Introduction
+**** Topic relevance.
+Software programmes, which simulate vessel behaviour in sea waves, are widely
+used to model ship motion, estimate impact of external forces on floating
+platform or other marine object, and estimate capsize probability under given
+weather conditions; however, to model ocean waves most of the simulation codes
+use linear wave theory\nbsp{}cite:shin2003nonlinear,van2007forensic,kat2001prediction,van2002development, in
+the framework of which it is difficult to reproduce certain peculiarities of
+wind wave climate. Among them are transition between normal and storm weather,
+and sea composed of multiple wave systems\nbsp{}--- both wind waves and swell\nbsp{}---
+heading from multiple directions. Another shortcoming of linear wave theory is
+an assumption, that wave amplitude is small compared to wave length. This makes
+calculations imprecise when modelling ship motion in irregular waves, for which
+the assumption does not hold. So, studying new and more advanced models and
+methods for ocean simulation software may increase number of its application
+scenarios and foster a study of ship motion in extreme conditions in particular.
+
+**** State-of-the-art.
+Autoregressive moving average (ARMA) model emerged in response to difficulties
+encountered by practitioners who used wave simulation models developed in the
+framework of linear wave theory. The problems they have encountered with
+Longuet---Higgins model (a model which is entirely based on linear wave theory)
+can be summarised as the following.
+1. /Periodicity/. Linear wave theory approximates waves by a sum of harmonics,
+   so period of the whole wavy surface realisation depends on the number of
+   harmonics in the model. The larger the realisation size, the more coefficients
+   are required to eliminate periodicity, therefore, generation time grows
+   non-linearly with realisation size. This in turn results in overall low
+   efficiency of any model based on this theory, no matter how optimised the
+   software implementation is.
+2. /Linearity/. Linear wave theory gives mathematical definition for ocean waves
+   which have small amplitudes compared to their lengths. Waves of this type
+   occur mostly in the ocean, so near-shore waves as well as storm waves, for
+   which this assumption does not hold, are not perfectly captured by linear
+   theory.
+3. /Probabilistic convergence/. Phase of a wave, which is often generated by
+   pseudo random number generator (PRNG), has uniform distribution, and this
+   makes wavy surface characteristics (average wave height, wave period, wave
+   length etc.) sometimes converge slowly to the desired values. Convergence
+   rate depends on the values generated by PRNG, so high convergence rate is not
+   guaranteed.
+
+These difficulties became a starting point in search for a new model which is
+not based on linear wave theory. ARMA process studies were found to have all the
+required mathematical apparatus.
+1. ARMA process takes autocovariance function (ACF) as an input parameter, and
+   this function can be directly obtained from wave energy or
+   frequency-directional spectrum (which is the input for Longuet---Higgins
+   model). So, inputs of one model can easily be converted to inputs of the other.
+2. There is no small-amplitude waves assumption. Wave may have any amplitude,
+   and can be generated as steep as it is possible with real ocean wave ACF.
+3. Period of the realisation equals the period of PRNG, so generation time grows
+   linearly with the realisation size.
+4. White noise\nbsp{}--- the only probabilistic term in ARMA process\nbsp{}--- has
+   Gaussian distribution; so, convergence rate is not probabilistic.
+
+**** Goals and objectives.
+ARMA process became the basis for ARMA ocean simulation model, however, there
+was still much work to be done to make it useful in practice.
+1. One has to investigate how different ACF shapes affect the choice of ARMA
+   parameters (the number of moving average and autoregressive processes
+   coefficients).
+2. Then, investigate a possibility to generate waves of arbitrary profile, not
+   only cosines (which means taking into account asymmetric distribution of wavy
+   surface elevation).
+3. Then, derive formulae to determine pressure field under wavy surface.
+   Usually, such formulae are derived for a particular model by substituting
+   wave profile into the eq. eqref:eq-problem, however, ARMA process does not
+   provide explicit wave profile formula, so this problem had to be solved for
+   general wavy surface (which is not defined by an analytic formula),
+   without linearisation of boundaries and assumption of small-amplitude waves.
+4. Finally, verify wavy surface integral characteristics to match the ones of
+   real ocean waves.
+5. In the final stage, develop software programme that implements ARMA model and
+   pressure calculation method, and allows to run simulations on both shared
+   memory (SMP) and distributed memory (MPP) computer systems.
+
+**** Scientific novelty.
+ARMA model, as opposed to other ocean simulation models, does not use linear
+wave theory. This makes it capable of
+- generating waves with arbitrary amplitudes by adjusting wave steepness via
+  ACF;
+- generating waves with arbitrary profiles by adjusting asymmetry of wave
+  elevation distribution via non-linear inertia-less transform (NIT).
+This makes it possible to use ARMA process to model transition between normal
+and storm weather taking into account climate spectra and assimilation data of a
+particular ocean region, which is not possible with models based on linear wave
+theory.
+
+**** Theoretical and practical significance.
+Implementing ARMA model, that does not use assumptions of linear wave theory,
+will increase quality of ship motion and marine object behaviour simulation
+software.
+
+1. Since pressure field formula is derived for discrete wavy surface and without
+   assumptions about wave amplitudes, it is applicable to any wavy surface of
+   incompressible inviscid fluid (in particular, it is applicable to wavy
+   surface generated by LH model). This allows to use pressure field formula
+   without being tied to ARMA model.
+2. From computational point of view this formula is more efficient than the
+   corresponding formula for LH model, because integrals in it are reduced to
+   Fourier transforms, for which there is fast Fourier transform (FFT) family of
+   algorithms, optimised for different processor architectures.
+3. Since the formula is explicit, there is no need for data exchange between
+   parallel processes, which allows to achieve high scalability on computer
+   clusters.
+4. Finally, ARMA model is itself more efficient than LH model due to absence of
+   trigonometric functions in its formula: In fact, wavy surface is computed as
+   a sum of large number of polynomials, for which there is low-level assembly
+   instruction (Fused Multiply-Add) giving native performance on CPUs.
+
+**** Methodology and research methods.
+Software implementation of ARMA model and pressure field formula was created
+incrementally: a prototype written in high-level engineering language\nbsp{}cite:mathematica10,octave2015 was rewritten in lower level language (C++).
+Implementation of the same algorithm and formulae in languages of varying
+levels (which involves usage of different abstractions and language primitives)
+allows to correct errors, which would be left unnoticed otherwise. Wavy surface,
+generated by ARMA model, as well as all input parameters (ACF, distribution of
+wave elevation etc.) were inspected via graphical means built into the
+programming language allowing visual control of programme correctness.
+
+**** Theses for the defence.
+- Wind wave model which allows to generate wavy surface realisations with large
+  period and consisting of waves of arbitrary amplitudes;
+- Pressure field formulae derived for this model without assumptions of linear
+  wave theory;
+- Software implementation of the model and the formula for shared memory (SMP)
+  and distributed memory (MPP) systems.
+
+**** Results verification and approbation.
+ARMA model is verified by comparing generated wavy surface integral
+characteristics (distribution of wave elevation, wave heights and lengths etc.)
+to the ones of real ocean waves. Pressure field formula is derived in
+Mathematica language, where resulting formulae are verified by built-in
+graphical means.
+
+ARMA model and pressure field formula were incorporated into Large Amplitude
+Motion Programme (LAMP)\nbsp{}--- a ship motion simulation software programme\nbsp{}---
+where they were compared to previously used LH model. Preliminary numerical
+experiments showed higher computational efficiency of ARMA model.
+
+* Problem statement
+The aim of the study reported here is to investigate possibilities of applying
+ARMA process mathematical apparatus to ocean wave modelling and to derive formula
+for pressure field under generated wavy surface without assumptions of linear
+wave theory.
+- In case of small-amplitude waves resulting formula must correspond to the
+  one from linear wave theory; in all other cases the formula must not diverge.
+- Integral characteristics of generated wavy surface must match the ones of real
+  ocean waves.
+- Software implementation of ARMA model and pressure field formula must work on
+  shared memory (SMP) and distributed memory (MPP) systems.
+
+**** Pressure field formula.
+The problem of finding pressure field under wavy sea surface represents inverse
+problem of hydrodynamics for incompressible inviscid fluid. System of equations
+for it in general case is written as\nbsp{}cite:kochin1966theoretical
+\begin{align}
+    & \nabla^2\phi = 0,\nonumber\\
+    & \phi_t+\frac{1}{2} |\vec{\upsilon}|^2 + g\zeta=-\frac{p}{\rho}, & \text{on }z=\zeta(x,y,t),\label{eq-problem}\\
+    & D\zeta = \nabla \phi \cdot \vec{n}, & \text{on }z=\zeta(x,y,t),\nonumber
+\end{align}
+where \(\phi\)\nbsp{}--- velocity potential, \(\zeta\)\nbsp{}--- elevation (\(z\) coordinate)
+of wavy surface, \(p\)\nbsp{}--- wave pressure, \(\rho\)\nbsp{}--- fluid density,
+\(\vec{\upsilon}=(\phi_x,\phi_y,\phi_z)\)\nbsp{}--- velocity vector, \(g\)\nbsp{}---
+acceleration of gravity, and \(D\)\nbsp{}--- substantial (Lagrange) derivative. The
+first equation is called continuity (Laplace) equation, the second one is the
+conservation of momentum law (the so called dynamic boundary condition); the
+third one is kinematic boundary condition for free wavy surface, which states
+that rate of change of wavy surface elevation (\(D\zeta\)) equals the change of
+velocity potential derivative along the wavy surface normal
+(\(\nabla\phi\cdot\vec{n}\)).
+
+Inverse problem of hydrodynamics consists in solving this system of equations
+for \(\phi\). In this formulation dynamic boundary condition becomes explicit
+formula to determine pressure field using velocity potential derivatives
+obtained from the remaining equations. So, from mathematical point of view
+inverse problem of hydrodynamics reduces to Laplace equation with mixed boundary
+condition\nbsp{}--- Robin problem.
+
+* Related work
+** Ocean wave models analysis
+Pressure computation is only possible when the shape of wavy surface is known.
+It is defined either at discrete grid points, or continuously via some analytic
+formula. As will be shown in section [[#linearisation]], such formula may simplify
+pressure computation by effectively reducing the task to pressure field
+generation, instead of wavy surface generation.
+
+**** Longuet---Higgins model.
+The simplest model, formula of which is derived in the framework of linear wave
+theory (see\nbsp{}section\nbsp{}[[#longuet-higgins-derivation]]), is
+Longuet---Higgins (LH) model\nbsp{}cite:longuet1957statistical. In-depth
+comparative analysis of this model and ARMA model is done
+in\nbsp{}cite:degtyarev2011modelling,boukhanovsky1997thesis.
+
+LH model represents ocean wavy surface as a superposition of
+sine waves with random amplitudes \(c_n\) and phases \(\epsilon_n\), continuously
+distributed on interval \([0,2\pi]\). Wavy surface elevation (\(z\) coordinate) is
+defined by
+#+name: eq-longuet-higgins
+\begin{equation}
+    \zeta(x,y,t) = \sum\limits_n c_n \cos(u_n x + v_n y - \omega_n t + \epsilon_n).
+\end{equation}
+Here wave numbers \((u_n,v_n)\) are continuously distributed on plane \((u,v)\),
+i.e. area \(du \times dv\) contains infinite quantity of wave numbers. Frequency
+is related to wave numbers via dispersion relation \(\omega_n=\omega(u_n,v_n)\).
+Function \(\zeta(x,y,t)\) is a three-dimensional ergodic stationary homogeneous
+Gaussian process defined by
+\begin{equation*}
+    2E_\zeta(u,v)\, du\,  dv = \sum\limits_n c_n^2,
+\end{equation*}
+where \(E_\zeta(u,v)\)\nbsp{}--- two-dimensional wave energy spectral density.
+Coefficients \(c_n\) are derived from wave energy spectrum \(S(\omega)\) via
+\begin{equation*}
+    c_n = \sqrt{ \textstyle\int\limits_{\omega_n}^{\omega_{n+1}} S(\omega) d\omega}.
+\end{equation*}
+
+**** Disadvantages of Longuet-Higgins model.
+Although LH model is simple and easy to understand, there are shortcomings that
+appear in practice.
+
+1. The model simulates only stationary Gaussian process. This is consequence of
+   central limit theorem (CLT): sum of large number of sines with random
+   amplitudes and phases has normal distribution, no matter what spectrum is
+   used as the model input. Using lower number of coefficients may solve the
+   problem, but also make realisation period smaller. So, using LH model to
+   simulate waves with non-Gaussian distribution of elevation\nbsp{}--- a
+   distribution which real ocean waves
+   have\nbsp{}cite:huang1980experimental,рожков1996теория \nbsp{}--- is
+   impractical.
+2. From computational point of view, the deficiency of the model is non-linear
+   increase of wavy surface generation time with the increase of realisation
+   size. The larger the size of the realisation, the higher number of
+   coefficients (discrete points of frequency-directional spectrum) is needed to
+   eliminate periodicity. This makes LH model inefficient for long-time
+   simulations.
+3. Finally, there are peculiarities which make LH model unsuitable base for
+   building more advanced simulation models.
+   - In software implementation convergence rate of eq.\nbsp{}[[eq-longuet-higgins]]
+     may be low due to randomness of phases \(\epsilon_n\).
+   - It is difficult to generalise LH model for non-Gaussian processes as it
+     involves incorporating non-linear terms in eq.\nbsp{}[[eq-longuet-higgins]] for
+     which there is no known formula to determine
+     coefficients\nbsp{}cite:рожков1990вероятностные.
+
+To summarise, LH model is applicable to generating ocean wavy surface in the
+framework of linear wave theory, inefficient for long-time simulations, and
+difficult to use as a base for more advanced models.
+
+**** ARMA model
+In\nbsp{}cite:spanos1982arma ARMA model is used to generate time series, the spectrum of
+which is compatible with Pierson---Moskowitz (PM) approximation of ocean wave
+spectrum. The authors carry out experiments for one-dimensional AR, MA and ARMA
+models. They mention excellent agreement between target and initial spectra and
+higher performance of ARMA model compared to models based on summing large
+number of harmonic components with random phases. They also mention that in order
+to reach agreement between target and initial spectrum MA model requires a smaller
+number of coefficients than AR model. In\nbsp{}cite:spanos1996efficient the authors
+generalise ARMA model coefficients determination formulae for multi-variate
+(vector) case.
+
+One thing that distinguishes present work with respect to afore-mentioned ones
+is the study of three-dimensional (2D in space and 1D in time) ARMA model, which
+is mostly a different problem.
+1. Yule---Walker system of equations, which are used to determine AR
+   coefficients, has complex block-block structure.
+2. Optimal model order (in a sense that target spectrum agrees with initial) is
+   determined manually.
+3. Instead of PM spectrum, analytic formulae for standing and propagating
+   waves ACF are used as the model input.
+4. Three-dimensional wavy surface should be compatible with real ocean surface
+   not only in terms of spectral characteristics, but also in the shape of wave
+   profiles. So, model verification includes distributions of various parameters
+   of generated waves (lengths, heights, periods etc.).
+Multi-dimensionality of investigated model not only complexifies the task, but
+also allows to carry out visual validation of generated wavy surface. It is the
+opportunity to visualise output of the programme that allowed to ensure that
+generated surface is compatible with real ocean surface, and is not abstract
+multi-dimensional stochastic process that is real only statistically.
+
+In\nbsp{}cite:fusco2010short AR model is used to predict swell waves to control
+wave-energy converters (WEC) in real-time. In order to make WEC more efficient
+its internal oscillator frequency should match the one of ocean waves. The
+authors treat wave elevation as time series and compare performance of AR model,
+neural networks and cyclical models in forecasting time series future values. AR
+model gives the most accurate prediction of low-frequency swell waves for up to
+two typical wave periods. It is an example of successful application of AR
+process to ocean wave modelling.
+
+** Pressure field determination formulae
+**** Small amplitude waves theory.
+In\nbsp{}cite:stab2012,детярев1998моделирование,degtyarev1997analysis the authors
+propose a solution for inverse problem of hydrodynamics of potential flow in the
+framework of small-amplitude wave theory (under assumption that wave length is
+much larger than height: \(\lambda \gg h\)). In that case inverse problem is
+linear and reduces to Laplace equation with mixed boundary conditions, and
+equation of motion is solely used to determine pressures for calculated velocity
+potential derivatives. The assumption of small amplitudes means the slow decay
+of wind wave coherence function, i.e. small change of local wave number in time
+and space compared to the wavy surface elevation (\(z\) coordinate). This
+assumption allows to calculate elevation \(z\) derivative as \(\zeta_z=k\zeta\),
+where \(k\) is wave number. In two-dimensional case the solution is written
+explicitly as
+\begin{align}
+    \left.\frac{\partial\phi}{\partial x}\right|_{x,t}= &
+        -\frac{1}{\sqrt{1+\alpha^{2}}}e^{-I(x)}
+            \int\limits_{0}^x\frac{\partial\dot{\zeta}/\partial
+                z+\alpha\dot{\alpha}}{\sqrt{1+\alpha^{2}}}e^{I(x)}dx,\label{eq-old-sol-2d}\\
+    I(x)= & \int\limits_{0}^x\frac{\partial\alpha/\partial z}{1+\alpha^{2}}dx,\nonumber
+\end{align}
+
+where \(\alpha\) is wave slope. In three-dimensional case solution is written in
+the form of elliptic partial differential equation (PDE):
+\begin{align*}
+    & \frac{\partial^2 \phi}{\partial x^2} \left( 1 + \alpha_x^2 \right) +
+    \frac{\partial^2 \phi}{\partial y^2} \left( 1 + \alpha_y^2 \right) +
+    2\alpha_x\alpha_y \frac{\partial^2 \phi}{\partial x \partial y} + \\
+    & \left(
+        \frac{\partial \alpha_x}{\partial z} +
+        \alpha_x \frac{\partial \alpha_x}{\partial x} +
+        \alpha_y \frac{\partial \alpha_x}{\partial y}
+    \right) \frac{\partial \phi}{\partial x} + \\
+    & \left(
+        \frac{\partial \alpha_y}{\partial z} +
+        \alpha_x \frac{\partial \alpha_y}{\partial x} +
+        \alpha_y \frac{\partial \alpha_y}{\partial y}
+    \right) \frac{\partial \phi}{\partial y} + \\
+    & \frac{\partial \dot{\zeta}}{\partial z} +
+    \alpha_x \dot{\alpha_x} + \alpha_y \dot{\alpha_y} = 0.
+\end{align*}
+The authors suggest transforming this equation to finite differences and solve
+it numerically.
+
+As will be shown in [[#sec:compare-formulae]], eqref:eq-old-sol-2d diverges when
+used to calculate velocity field for large-amplitude waves, and this is the
+reason that it cannot be used together with ARMA model, which generates
+arbitrary-amplitude waves.
+
+**** Linearisation of boundary condition.
+:PROPERTIES:
+:CUSTOM_ID: linearisation
+:END:
+
+LH model allows to derive an explicit formula for velocity field by linearising
+kinematic boundary condition. Velocity potential formula is written as
+\begin{equation*}
+\phi(x,y,z,t) = \sum_n \frac{c_n g}{\omega_n}
+     e^{\sqrt{u_n^2+v_n^2} z}
+     \sin(u_n x + v_n y - \omega_n t + \epsilon_n).
+\end{equation*}
+This formula is differentiated to obtain velocity potential derivatives, which
+are plugged into dynamic boundary condition to obtain pressures.
+
+* ARMA model for ocean wave simulation
+** Governing equations for 3-dimensional ARMA process
+ARMA ocean simulation model defines ocean wavy surface as three-dimensional (two
+dimensions in space and one in time) autoregressive moving average process:
+every surface point is represented as a weighted sum of previous in time and
+space points plus weighted sum of previous in time and space normally
+distributed random impulses. The governing equation for 3-D ARMA process is
+\begin{equation}
+    \zeta_{\vec i}
+    =
+    \sum\limits_{\vec j = \vec 0}^{\vec N}
+    \Phi_{\vec j} \zeta_{\vec i - \vec j}
+    +
+    \sum\limits_{\vec j = \vec 0}^{\vec M}
+    \Theta_{\vec j} \epsilon_{\vec i - \vec j}
+    ,
+    \label{eq-arma-process}
+\end{equation}
+where \(\zeta\)\nbsp{}--- wave elevation, \(\Phi\)\nbsp{}--- AR process
+coefficients, \(\Theta\)\nbsp{}--- MA process coefficients,
+\(\epsilon\)\nbsp{}--- white noise with Gaussian distribution,
+\(\vec{N}\)\nbsp{}--- AR process order, \(\vec{M}\)\nbsp{}--- MA process order,
+and \(\Phi_{\vec{0}}\equiv{0}\), \(\Theta_{\vec{0}}\equiv{0}\). Here arrows
+denote multi-component indices with a component for each dimension. In general,
+any scalar quantity can be a component (temperature, salinity, concentration of
+some substance in water etc.). Equation parameters are AR and MA process
+coefficients and order.
+
+**** Autoregressive (AR) process.
+AR process is ARMA process with only one random impulse instead of their
+weighted sum:
+\begin{equation}
+    \zeta_{\vec i}
+    =
+    \sum\limits_{\vec j = \vec 0}^{\vec N}
+    \Phi_{\vec j} \zeta_{\vec i - \vec j}
+    +
+    \epsilon_{\vec i}
+    .
+    \label{eq-ar-process}
+\end{equation}
+The coefficients \(\Phi\) are calculated from ACF via three-dimensional
+Yule---Walker equations, which are obtained after multiplying both parts of the
+previous equation by \(\zeta_{\vec{i}-\vec{k}}\) and computing the expected value.
+Generic form of YW equations is
+\begin{equation}
+    \label{eq-yule-walker}
+    \gamma_{\vec k}
+    =
+    \sum\limits_{\vec j = \vec 0}^{\vec N}
+    \Phi_{\vec j}
+    \text{ }\gamma_{\vec{k}-\vec{j}}
+    +
+    \Var{\epsilon} \delta_{\vec{k}},
+    \qquad
+    \delta_{\vec{k}} =
+    \begin{cases}
+        1, \quad \text{if } \vec{k}=0 \\
+        0, \quad \text{if } \vec{k}\neq0,
+    \end{cases}
+\end{equation}
+where \(\gamma\)\nbsp{}--- ACF of process \(\zeta\), \(\Var{\epsilon}\)\nbsp{}--- white noise
+variance. Matrix form of three-dimensional YW equations, which is used in the
+present work, is
+\begin{equation*}
+    \Gamma
+    \left[
+        \begin{array}{l}
+            \Phi_{\vec 0}\\
+            \Phi_{0,0,1}\\
+            \vdotswithin{\Phi_{\vec 0}}\\
+            \Phi_{\vec N}
+        \end{array}
+    \right]
+    =
+    \left[
+        \begin{array}{l}
+            \gamma_{0,0,0}-\Var{\epsilon}\\
+            \gamma_{0,0,1}\\
+            \vdotswithin{\gamma_{\vec 0}}\\
+            \gamma_{\vec N}
+        \end{array}
+    \right],
+    \qquad
+    \Gamma=
+    \left[
+        \begin{array}{llll}
+            \Gamma_0 & \Gamma_1 & \cdots & \Gamma_{N_1} \\
+            \Gamma_1 & \Gamma_0 & \ddots & \vdotswithin{\Gamma_0} \\
+            \vdotswithin{\Gamma_0} & \ddots & \ddots & \Gamma_1 \\
+            \Gamma_{N_1} & \cdots & \Gamma_1 & \Gamma_0
+        \end{array}
+    \right],
+\end{equation*}
+where \(\vec N = \left( N_1, N_2, N_3 \right)\) and
+\begin{equation*}
+    \Gamma_i =
+    \left[
+    \begin{array}{llll}
+        \Gamma^0_i & \Gamma^1_i & \cdots & \Gamma^{N_2}_i \\
+        \Gamma^1_i & \Gamma^0_i & \ddots & \vdotswithin{\Gamma^0_i} \\
+        \vdotswithin{\Gamma^0_i} & \ddots & \ddots & \Gamma^1_i \\
+        \Gamma^{N_2}_i & \cdots & \Gamma^1_i & \Gamma^0_i
+    \end{array}
+    \right]
+    \qquad
+    \Gamma_i^j=
+    \left[
+    \begin{array}{llll}
+        \gamma_{i,j,0} & \gamma_{i,j,1} & \cdots & \gamma_{i,j,N_3} \\
+        \gamma_{i,j,1} & \gamma_{i,j,0} & \ddots & \vdotswithin{\gamma_{i,j,0}} \\
+        \vdotswithin{\gamma_{i,j,0}} & \ddots & \ddots & \gamma_{i,j,1} \\
+        \gamma_{i,j,N_3} & \cdots & \gamma_{i,j,1} & \gamma_{i,j,0}
+    \end{array}
+    \right],
+\end{equation*}
+Since \(\Phi_{\vec 0}\equiv0\), the first row and column of \(\Gamma\) can be
+eliminated. Matrix \(\Gamma\) is block-Toeplitz, positive definite and symmetric,
+hence the system is efficiently solved by Cholesky decomposition, which is
+particularly suitable for these types of matrices.
+
+After solving this system of equations white noise variance is estimated from
+eqref:eq-yule-walker by plugging \(\vec k = \vec 0\):
+\begin{equation*}
+    \Var{\epsilon} =
+    \Var{\zeta}
+    -
+    \sum\limits_{\vec j = \vec 0}^{\vec N}
+    \Phi_{\vec j}
+    \text{ }\gamma_{\vec{j}}.
+\end{equation*}
+
+**** Moving average (MA) process.
+MA process is ARMA process with \(\Phi\equiv0\):
+\begin{equation}
+    \zeta_{\vec i}
+    =
+    \sum\limits_{\vec j = \vec 0}^{\vec M}
+    \Theta_{\vec j} \epsilon_{\vec i - \vec j}
+    .
+    \label{eq-ma-process}
+\end{equation}
+MA coefficients \(\Theta\) are defined implicitly via the following non-linear
+system of equations:
+\begin{equation*}
+  \gamma_{\vec i} =
+	\left[
+		\displaystyle
+    \sum\limits_{\vec j = \vec i}^{\vec M}
+    \Theta_{\vec j}\Theta_{\vec j - \vec i}
+	\right]
+  \Var{\epsilon}.
+\end{equation*}
+The system is solved numerically by fixed-point iteration method via the
+following formulae
+\begin{equation*}
+  \Theta_{\vec i} =
+    -\frac{\gamma_{\vec i}}{\Var{\epsilon}}
+		+
+    \sum\limits_{\vec j = \vec i}^{\vec M}
+    \Theta_{\vec j} \Theta_{\vec j - \vec i}.
+\end{equation*}
+Here coefficients \(\Theta\) are calculated from back to front: from
+\(\vec{i}=\vec{M}\) to \(\vec{i}=\vec{0}\). White noise variance is estimated by
+\begin{equation*}
+    \Var{\epsilon} = \frac{\gamma_{\vec 0}}{
+		1
+		+
+    \sum\limits_{\vec j = \vec 0}^{\vec M}
+    \Theta_{\vec j}^2
+    }.
+\end{equation*}
+Authors of\nbsp{}cite:box1976time suggest using Newton---Raphson method to solve this
+equation with higher precision, however, this method does not work in three
+dimensions. Using slower method does not have dramatic effect on the overall
+programme performance, because the number of coefficients is small and most of
+the time is spent generating wavy surface.
+
+**** Stationarity and invertibility of AR and MA processes
+In order for modelled wavy surface to represent physical phenomena, the
+corresponding process must be stationary and invertible. If the process is
+invertible, then there is a reasonable connection of current events with the
+events in the past, and if the process is stationary, the modelled physical
+signal amplitude does not increase infinitely in time and space.
+
+AR process is always invertible, and for stationarity it is necessary for roots
+of characteristic equation
+\begin{equation*}
+1 - \Phi_{0,0,1} z - \Phi_{0,0,2} z^2
+- \cdots
+- \Phi_{\vec N} z^{N_0 N_1 N_2} = 0,
+\end{equation*}
+to lie \emph{outside} the unit circle. Here \(\vec{N}\) is AR process order
+and \(\Phi\) are coefficients.
+
+MA process is always stationary, and for invertibility it is necessary for roots
+of characteristic equation
+\begin{equation*}
+1 - \Theta_{0,0,1} z - \Theta_{0,0,2} z^2
+- \cdots
+- \Theta_{\vec M} z^{M_0 M_1 M_2} = 0,
+\end{equation*}
+to lie \emph{outside} the unit circle. Here \(\vec{M}\) is
+three-dimensional MA process order and \(\Theta\) are coefficients.
+
+Stationarity and invertibility properties are the main criteria in selection of
+the process to model different wave profiles, which are discussed in
+section\nbsp{}[[#sec-process-selection]].
+
+**** Mixed autoregressive moving average (ARMA) process.
+:PROPERTIES:
+:CUSTOM_ID: sec:how-to-mix-ARMA
+:END:
+Generally speaking, ARMA process is obtained by plugging MA generated wavy
+surface as random impulse to AR process, however, in order to get the process
+with desired ACF one should re-compute AR coefficients before plugging. There
+are several approaches to "mix" AR and MA processes.
+- The approach proposed in\nbsp{}cite:box1976time which involves dividing ACF into MA
+  and AR part along each dimension is not applicable here, because in three
+  dimensions such division is not possible: there will always be parts of the ACF
+  that are not taken into account by AR and MA process.
+- The alternative approach is to use the same (undivided) ACF for both AR and MA
+  processes but use different process order, however, then realisation
+  characteristics (mean, variance etc.) become skewed: these are characteristics
+  of the two overlapped processes.
+For the first approach there is a formula to re-compute ACF for AR process, but
+there is no such formula for the second approach. So, the best solution for now
+is to simply use AR and MA process exclusively.
+
+**** Process selection criteria for different wave profiles.
+:PROPERTIES:
+:CUSTOM_ID: sec-process-selection
+:END:
+
+One problem of ARMA model application to ocean wave generation is that for
+different types of wave profiles different processes /must/ be used: standing
+waves are modelled by AR process, and propagating waves by MA process. This
+statement comes from practice: if one tries to use the processes the other way
+round, the resulting realisation either diverges or does not correspond to real
+ocean waves. (The latter happens for non-invertible MA process, as it is always
+stationary.) So, the best way to apply ARMA model to ocean wave generation is to
+use AR process for standing waves and MA process for progressive waves.
+
+The other problem is inability to automatically determine optimal number of
+coefficients for three-dimensional AR and MA processes. For one-dimensional
+processes this can be achieved via iterative methods\nbsp{}cite:box1976time, but they
+diverge in three-dimensional case.
+
+The final problem, which is discussed in [[#sec:how-to-mix-ARMA]], is inability to
+"mix" AR and MA process in three dimensions.
+
+In practice some statements made for AR and MA processes in\nbsp{}cite:box1976time
+should be flipped for three-dimensional case. For example, the authors say that
+ACF of MA process cuts at \(q\) and ACF of AR process decays to nought infinitely,
+but in practice making ACF of 3-dimensional MA process not decay results in it
+being non-invertible and producing realisation that does not look like real
+ocean waves, whereas doing the same for ACF of AR process results in stationary
+process and adequate realisation. Also, the authors say that one
+should allocate the first \(q\) points of ACF to MA process (as it is often needed to
+describe the peaks in ACF) and leave the remaining points to AR process, but in
+practice in case of ACF of a propagating wave AR process is stationary only for
+the first time slice of the ACF, and the rest is left to MA process.
+
+To summarise, the only established scenario of applying ARMA model to ocean wave
+generation is to use AR process for standing waves and MA process for
+propagating waves. With new formulae for 3 dimensions a single mixed ARMA
+process might increase model precision, which is one of the objectives of the
+future research.
+
+** Modelling non-linearity of ocean waves
+ARMA model allows modelling asymmetry of wave elevation distribution, i.e.
+generating ocean waves whose z-coordinate distribution has non-nought
+kurtosis and asymmetry. Such distribution is inherent to real ocean waves\nbsp{}cite:longuet1963nonlinear.
+
+Wave asymmetry is modelled by non-linear inertia-less transform (NIT) of
+stochastic process, however, transforming resulting wavy surface means
+transforming initial ACF. In order to alleviate this, ACF must be preliminary
+transformed as shown in\nbsp{}cite:boukhanovsky1997thesis.
+
+**** Wavy surface transformation.
+Explicit formula \(z=f(y)\) that transforms wavy surface to desired
+one-dimensional distribution \(F(z)\) is the solution of non-linear transcendental
+equation \(F(z)=\Phi(y)\), where \(\Phi(y)\)\nbsp{}--- one-dimensional Gaussian
+distribution. Since distribution of wave elevation is often given by some
+approximation based on field data, this equation is solved numerically with
+respect to \(z_k\) in each grid point \(y_k|_{k=0}^N\) of generated wavy surface. In
+this case equation is rewritten as
+\begin{equation}
+    \label{eq-distribution-transformation}
+    F(z_k)
+    =
+    \frac{1}{\sqrt{2\pi}}
+    \int\limits_0^{y_k} \exp\left[ -\frac{t^2}{2} \right] dt
+    .
+\end{equation}
+Since distribution functions are monotonic, the simplest interval halving
+(bisection) numerical method is used to solve this equation.
+
+**** Preliminary ACF transformation.
+In order to transform ACF \(\gamma_z\) of the process, it should be expanded in
+series of Hermite polynomials (Gram---Charlier series)
+\begin{equation*}
+    \gamma_z \left( \vec u \right)
+    =
+    \sum\limits_{m=0}^{\infty}
+    C_m^2 \frac{\gamma_y^m \left( \vec u \right)}{m!},
+\end{equation*}
+where
+\begin{equation*}
+    C_m = \frac{1}{\sqrt{2\pi}}
+  \int\limits_{-\infty}^\infty
+    f(y) H_m(y) \exp\left[ -\frac{y^2}{2} \right] dy,
+\end{equation*}
+\(H_m\)\nbsp{}--- Hermite polynomial, and \(f(y)\)\nbsp{}--- solution to equation
+eqref:eq-distribution-transformation. Plugging polynomial approximation
+\(f(y)\approx\sum\limits_{i}d_{i}y^i\) and analytic formulae for Hermite
+polynomial yields
+\begin{equation*}
+    \frac{1}{\sqrt{2\pi}}
+    \int\limits_{-\infty}^\infty
+    y^k \exp\left[ -\frac{y^2}{2} \right] dy
+    =
+    \begin{cases}
+        (k-1)!! & \text{if }k\text{ is even},\\
+        0       & \text{if }k\text{ is odd},
+    \end{cases}
+\end{equation*}
+which simplifies the former equation. Optimal number of coefficients \(C_m\) is
+determined by computing them sequentially and stopping when variances of both
+fields become equal with desired accuracy \(\epsilon\):
+\begin{equation*}
+    \left| \Var{z} - \sum\limits_{k=0}^m
+    \frac{C_k^2}{k!} \right| \leq \epsilon.
+\end{equation*}
+
+In\nbsp{}cite:boukhanovsky1997thesis the author suggests using polynomial
+approximation \(f(y)\) also for wavy surface transformation, however, in practice
+ocean surface realisation often contains points, where z-coordinate is beyond
+the limits of the approximation, which makes solution wrong. In these points it
+is more efficient to solve equation eqref:eq-distribution-transformation by
+bisection method. Using the same approximation in Gram---Charlier series does
+not lead to such errors.
+
+** Determining wave pressures for discretely given wavy surface
+Analytic solutions to boundary problems in classical equations are often used to
+study different properties of the solution, and for that purpose general
+solution formula is too difficult to study, as it contains integrals of unknown
+functions. Fourier method is one of the methods to find analytic solutions to
+PDE. It is based on application of Fourier transform to each part of PDE, which
+reduces the equation to algebraic, and the solution is written as inverse
+Fourier transform of some function (which may contain Fourier transforms of
+other functions). Since it is not possible to write analytic forms of these
+Fourier transforms in all cases, unique solutions are found and their behaviour
+is studied in different domains instead. At the same time, computing discrete
+Fourier transforms on the computer is possible for any discretely defined
+function and efficient when using FFT algorithms. These algorithms use symmetry
+of complex exponentials to decrease asymptotic complexity from
+\(\mathcal{O}(n^2)\) to \(\mathcal{O}(n\log_{2}n)\). So, even if general solution
+contains Fourier transforms of unknown functions, they still can be computed
+numerically, and FFT family of algorithms makes this approach efficient.
+
+Alternative approach to solve PDE is to reduce it to difference equations, which
+are solved by constructing various numerical schemes. This approach leads to
+approximate solution, and asymptotic complexity of corresponding algorithms is
+comparable to that of FFT. For example, stationary elliptic PDE transforms to
+implicit numerical scheme which is solved by iterative method on each step of
+which a tridiagonal or five-diagonal system of algebraic equations is solved by
+Thomas algorithm. Asymptotic complexity of this approach is
+\(\mathcal{O}({n}{m})\), where \(n\)\nbsp{}--- number of wavy surface grid points, \(m\)\nbsp{}---
+number of iterations. Despite their wide spread, iterative algorithms are
+inefficient on parallel computer architectures; in particular, their mapping to
+co-processors may involve copying data in and out of the co-processor in each
+iteration, which negatively affects their performance. At the same time, high
+number of Fourier transforms in the solution is an advantage, rather than a
+disadvantage. First, solutions obtained by Fourier method are explicit, hence
+their implementations scale with the large number of parallel computer cores.
+Second, there are implementations of FFT optimised for different processor
+architectures as well as co-processors (GPU, MIC) which makes it easy to get
+high performance on any computing platform. These advantages substantiate the
+choice of Fourier method to obtain explicit analytic solution to the problem of
+determining pressures under wavy ocean surface.
+
+*** Two-dimensional velocity field
+:PROPERTIES:
+:CUSTOM_ID: sec:pressure-2d
+:END:
+**** Formula for infinite depth fluid.
+Two-dimensional Laplace equation with Robin boundary condition is written as
+\begin{align}
+    \label{eq-problem-2d}
+    & \phi_{xx}+\phi_{zz}=0,\\
+    & \zeta_t + \zeta_x\phi_x = \frac{\zeta_x}{\sqrt{1 + \zeta_x^2}} \phi_x - \phi_z, & \text{on }z=\zeta(x,t).\nonumber
+\end{align}
+Use Fourier method to solve this problem. Applying Fourier transform to both
+sides of the equation yields
+\begin{equation*}
+    -4 \pi^2 \left( u^2 + v^2 \right)
+    \FourierY{\phi(x,z)}{u,v} = 0,
+\end{equation*}
+hence \(v = \pm i u\). Hereinafter we use the following symmetric form of Fourier
+transform:
+\begin{equation*}
+    \FourierY{f(x,y)}{u,v} =
+    \iint\limits_{-\infty}^{\phantom{--}\infty}
+    f(x,y)
+    e^{-2\pi i (x u + y v)}
+    dx dy.
+\end{equation*}
+We seek solution in the form of inverse Fourier transform
+\(\phi(x,z)=\InverseFourierY{E(u,v)}{x,z}\). Plugging[fn::\(v={-i}{u}\) is not
+applicable because velocity potential must go to nought when depth goes to
+infinity.] \(v={i}{u}\) into the formula yields
+\begin{equation}
+    \label{eq-guessed-sol-2d}
+    \phi(x,z) = \InverseFourierY{e^{2\pi u z}E(u)}{x}.
+\end{equation}
+In order to make substitution \(z=\zeta(x,t)\) not interfere with Fourier
+transforms, we rewrite eqref:eq-guessed-sol-2d as a convolution:
+\begin{equation*}
+    \phi(x,z)
+    =
+    \Fun{z}
+    \ast
+    \InverseFourierY{E(u)}{x},
+\end{equation*}
+where \(\Fun{z}\)\nbsp{}--- a function, form of which is defined in section
+[[#sec:compute-delta]] and which satisfies equation
+\(\FourierY{\Fun{z}}{u}=e^{2\pi{u}{z}}\). Plugging formula \(\phi\) into the boundary
+condition yields
+\begin{equation*}
+    \zeta_t
+    =
+    \left( i f(x) - 1 \right)
+    \left[
+        \Fun{z}
+        \ast
+        \InverseFourierY{2\pi u E(u)}{x}
+    \right],
+\end{equation*}
+where \(f(x)={\zeta_x}/{\sqrt{1+\zeta_x^2}}-\zeta_x\). Applying Fourier transform
+to both sides of this equation yields formula for coefficients \(E\):
+\begin{equation*}
+    E(u) =
+    \frac{1}{2\pi u}
+    \frac{
+    \FourierY{\zeta_t / \left(i f(x) - 1\right)}{u}
+    }{
+    \FourierY{\Fun{z}}{u}
+    }
+\end{equation*}
+Finally, substituting \(z\) for \(\zeta(x,t)\) and plugging resulting equation into
+eqref:eq-guessed-sol-2d yields formula for \(\phi(x,z)\):
+\begin{equation}
+    \label{eq-solution-2d}
+    \boxed{
+        \phi(x,z)
+        =
+        \InverseFourierY{
+            \frac{e^{2\pi u z}}{2\pi u}
+            \frac{
+            \FourierY{ \zeta_t / \left(i f(x) - 1\right) }{u}
+            }{
+            \FourierY{ \Fun{\zeta(x,t)} }{u}
+            }
+        }{x}.
+    }
+\end{equation}
+
+Multiplier \(e^{2\pi{u}{z}}/(2\pi{u})\) makes the graph of the function to which
+Fourier transform is applied asymmetric with respect to \(OY\) axis. This makes
+it difficult to apply FFT which expects periodic function with nought on both
+ends of the interval. Using numerical integration instead of FFT is not faster
+than solving the initial system of equations with numerical schemes. This
+problem is alleviated by using formula eqref:eq-solution-2d-full for finite
+depth fluid with deliberately large depth \(h\). This formula is derived in the
+following section.
+
+**** Formula for finite depth fluid.
+On the sea bottom vertical fluid velocity component equals nought: \(\phi_z=0\) on
+\(z=-h\), where \(h\)\nbsp{}--- water depth. In this case equation \(v=-{i}{u}\), which came
+from Laplace equation, can not be neglected, hence the solution is sought in the
+following form:
+\begin{equation}
+    \phi(x,z)
+    =
+    \InverseFourierY{
+        \left( C_1 e^{2\pi u z} + C_2 e^{-2\pi u z} \right)
+        E(u)
+    }{x}.
+    \label{eq-guessed-sol-2d-full}
+\end{equation}
+Plugging \(\phi\) into the boundary condition on the sea bottom yields
+\begin{equation*}
+    C_1 e^{-2\pi u h} - C_2 e^{2\pi u h} = 0,
+\end{equation*}
+hence \(C_1=\frac{1}{2}C{e}^{2\pi{u}{h}}\) and
+\(C_2=-\frac{1}{2}C{e}^{-2\pi{u}{h}}\). Constant \(C\) may take arbitrary value
+here, because after plugging it becomes part of unknown coefficients \(E(u)\).
+Plugging formulae for \(C_1\) and \(C_2\) into eqref:eq-guessed-sol-2d-full yields
+\begin{equation*}
+    \phi(x,z) = \InverseFourierY{ \Sinh{2\pi u (z+h)} E(u) }{x}.
+\end{equation*}
+Plugging \(\phi\) into the boundary condition on the free surface yields
+\begin{equation*}
+    \zeta_t = f(x) \InverseFourierY{ 2\pi i u \Sinh{2\pi u (z+h)} E(u) }{x}
+            - \InverseFourierY{ 2\pi u \SinhX{2\pi u (z+h)} E(u) }{x}.
+\end{equation*}
+Here \(\sinh\) and \(\cosh\) give similar results near free surface, and since this
+is the main area of interest in practical applications, we assume that
+\(\Sinh{2\pi{u}(z+h)}\approx\SinhX{2\pi{u}(z+h)}\). Performing analogous to the
+previous section transformations yields final formula for \(\phi(x,z)\):
+\begin{equation}
+\boxed{
+    \phi(x,z,t)
+    =
+  \InverseFourierY{
+        \frac{\Sinh{2\pi u (z+h)}}{2\pi u}
+        \frac{
+            \FourierY{ \zeta_t / \left(i f(x) - 1\right) }{u}
+        }{
+            \FourierY{ \FunSecond{\zeta(x,t)} }{u}
+        }
+    }{x},
+}
+    \label{eq-solution-2d-full}
+\end{equation}
+where \(\FunSecond{z}\)\nbsp{}--- a function, form of which is defined in section
+[[#sec:compute-delta]] and which satisfies equation
+\(\FourierY{\FunSecond{z}}{u}=\Sinh{2\pi{u}{z}}\).
+
+**** Reducing to the formulae from linear wave theory.
+Check the validity of derived formulae by substituting \(\zeta(x,t)\) with known
+analytic formula for plane waves. Symbolic computations of Fourier transforms in
+this section were performed in Mathematica\nbsp{}cite:mathematica10. In the framework
+of linear wave theory assume that waves have small amplitude compared to their
+lengths, which allows us to simplify initial system of equations
+eqref:eq-problem-2d to
+\begin{align*}
+    & \phi_{xx}+\phi_{zz}=0,\\
+    & \zeta_t = -\phi_z & \text{on }z=\zeta(x,t),
+\end{align*}
+solution to which is written as
+\begin{equation*}
+    \phi(x,z,t)
+    =
+    -\InverseFourierY{
+        \frac{e^{2\pi u z}}{2\pi u}
+        \FourierY{\zeta_t}{u}
+    }{x}
+    .
+\end{equation*}
+Propagating wave profile is defined as \(\zeta(x,t)=A\cos(2\pi(kx-t))\). Plugging
+this formula into eqref:eq-solution-2d yields
+\(\phi(x,z,t)=-\frac{A}{k}\sin(2\pi(kx-t))\Sinh{2\pi{k}{z}}\). In order to reduce
+it to the formula from linear wave theory, rewrite hyperbolic sine in
+exponential form, discard the term containing \(e^{-2\pi{k}{z}}\) as contradicting
+condition \(\phi\underset{z\rightarrow-\infty}{\longrightarrow}0\). Taking real
+part of the resulting formula yields
+\(\phi(x,z,t)=\frac{A}{k}e^{2\pi{k}{z}}\sin(2\pi(kx-t))\), which corresponds to
+the known formula from linear wave theory. Similarly, under small-amplitude
+waves assumption the formula for finite depth fluid eqref:eq-solution-2d-full is
+reduced to
+\begin{equation*}
+    \phi(x,z,t)
+    =
+    -\InverseFourierY{
+        \frac{\Sinh{2\pi u (z+h)}}{2\pi u \Sinh{2\pi u h}}
+        \FourierY{\zeta_t}{u}
+    }{x}.
+\end{equation*}
+Substituting \(\zeta(x,t)\) with propagating plane wave profile formula yields
+\begin{equation}
+    \label{eq-solution-2d-linear}
+    \phi(x,z,t)=\frac{A}{k}
+    \frac{\Sinh{2 \pi k (z+h)}}{ \Sinh{2 \pi k h} }
+    \sin(2 \pi (k x-t)),
+\end{equation}
+which corresponds to the formula from linear wave theory for finite depth fluid.
+
+Different forms of Laplace equation solutions, in which decaying exponent is
+written with either "+" or "-" signs, may cause incompatibilities between
+formulae from linear wave theory and formulae derived in this work, where
+\(\sinh\) is used instead of \(\cosh\). Equality
+\(\frac{\Sinh{2\pi{k}(z+h)}}{\Sinh{2\pi{k}{h}}}\approx\frac{\sinh(2\pi{k}(z+h))}{\sinh(2\pi{k}{h})}\)
+becomes strict on the free surface, and difference between left-hand and
+right-hand sides increases when approaching sea bottom (for sufficiently large
+depth difference near free surface is negligible). So, for sufficiently large
+depth any function (\(\cosh\) or \(\sinh\)) may be used for velocity potential
+computation near free surface.
+
+Reducing eqref:eq-solution-2d and eqref:eq-solution-2d-full to the known formulae
+from linear wave theory shows that formula for infinite depth
+eqref:eq-solution-2d is not suitable to compute velocity potentials with Fourier
+method, because it does not have symmetry, which is required for Fourier
+transform. However, formula for finite depth can be used instead by setting \(h\)
+to some characteristic water depth. For standing wave reducing to linear wave
+theory formulae is made under the same assumptions.
+
+*** Three-dimensional velocity field
+Three-dimensional version of eqref:eq-problem is written as
+\begin{align}
+    \label{eq-problem-3d}
+    & \phi_{xx} + \phi_{yy} + \phi_{zz} = 0,\\
+    & \zeta_t + \zeta_x\phi_x + \zeta_y\phi_y
+    =
+    \frac{\zeta_x}{\sqrt{1 + \zeta_x^2}} \phi_x
+    +\frac{\zeta_y}{\sqrt{\vphantom{\zeta_x^2}\smash[b]{1 + \zeta_y^2}}} \phi_y
+    - \phi_z, & \text{on }z=\zeta(x,y,t).\nonumber
+\end{align}
+Again, use Fourier method to solve it. Applying Fourier transform to both sides
+of Laplace equation yields
+\begin{equation*}
+    -4 \pi^2 \left( u^2 + v^2 + w^2 \right)
+    \FourierY{\phi(x,y,z)}{u,v,w} = 0,
+\end{equation*}
+hence \(w=\pm{i}\sqrt{u^2+v^2}\). We seek solution in the form of inverse Fourier
+transform \(\phi(x,y,z)=\InverseFourierY{E(u,v,w)}{x,y,z}\). Plugging
+\(w=i\sqrt{u^2+v^2}\) into the formula yields
+\begin{equation*}
+    \phi(x,y,z) = \InverseFourierY{
+        \left(
+            C_1 e^{2\pi \sqrt{u^2+v^2} z}
+            -C_2 e^{-2\pi \sqrt{u^2+v^2} z}
+        \right)
+        E(u,v)
+    }{x,y}.
+\end{equation*}
+Plugging \(\phi\) into the boundary condition on the sea bottom (analogous to
+two-dimensional case) yields
+\begin{equation}
+    \label{eq-guessed-sol-3d}
+    \phi(x,y,z) = \InverseFourierY{
+        \Sinh{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)
+    }{x,y}.
+\end{equation}
+Plugging \(\phi\) into the boundary condition on the free surface yields
+\begin{equation*}
+    \arraycolsep=1.4pt
+    \begin{array}{rl}
+        \zeta_t = & i f_1(x,y) \InverseFourierY{2 \pi u \Sinh{2\pi \sqrt{u^2+v^2} (z+h)}E(u,v)}{x,y} \\
+        + & i f_2(x,y) \InverseFourierY{2 \pi v \Sinh{2\pi \sqrt{u^2+v^2} (z+h)}E(u,v)}{x,y} \\
+        - & \InverseFourierY{2 \pi \sqrt{u^2+v^2} \SinhX{2\pi \sqrt{u^2+v^2} (z+h)}E(u,v)}{x,y}
+    \end{array}
+\end{equation*}
+where \(f_1(x,y)={\zeta_x}/{\sqrt{1+\zeta_x^2}}-\zeta_x\) and
+\(f_2(x,y)={\zeta_y}/{\sqrt{\vphantom{\zeta_x^2}\smash[b]{1+\zeta_y^2}}}-\zeta_y\).
+Applying Fourier transform to both sides of the equation yields formula for
+coefficients \(E\):
+\begin{equation*}
+    \arraycolsep=1.4pt
+    \begin{array}{rl}
+        \FourierY{\zeta_t}{u,v} = &
+        \FourierY{i f_1(x,y) \InverseFourierY{2 \pi u \Sinh{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)}{x,y}}{u,v}  \\
+        + & \FourierY{i f_2(x,y) \InverseFourierY{2 \pi v \Sinh{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)}{x,y}}{u,v}  \\
+        - & 2 \pi \sqrt{u^2+v^2} \SinhX{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)
+    \end{array}
+\end{equation*}
+Final solution is obtained after plugging \(E(u,v)\) into eqref:eq-guessed-sol-3d.
+
+* Numerical methods and experimental results
+** The shape of ACF for different types of waves
+**** Analytic method of finding the ACF.
+The straightforward way to find ACF for a given ocean wave profile is to apply
+Wiener---Khinchin theorem. According to this theorem the autocorrelation \(K\) of
+a function \(\zeta\) is given by the Fourier transform of the absolute square of
+the function:
+\begin{equation}
+  K(t) = \Fourier{\left| \zeta(t) \right|^2}.
+  \label{eq-wiener-khinchin}
+\end{equation}
+When \(\zeta\) is replaced with actual wave profile, this formula gives you
+analytic formula for the corresponding ACF.
+
+For three-dimensional wave profile (2D in space and 1D in time) analytic formula
+is a polynomial of high order and is best obtained via symbolic computation
+programme. Then for practical usage it can be approximated by superposition of
+exponentially decaying cosines (which is how ACF of a stationary ARMA process
+looks like\nbsp{}cite:box1976time).
+
+**** Empirical method of finding the ACF.
+However, for three-dimensional case there exists simpler empirical method which
+does not require sophisticated software to determine shape of the ACF. It is
+known that ACF represented by exponentially decaying cosines satisfies first
+order Stokes' equations for gravity waves\nbsp{}cite:boccotti1983wind. So, if the
+shape of the wave profile is the only concern in the simulation, then one can
+simply multiply it by a decaying exponent to get appropriate ACF. This ACF does
+not reflect other wave profile parameters, such as wave height and period, but
+opens possibility to simulate waves of a particular non-analytic shape by
+"drawing" their profile, then multiplying it by an exponent and using the
+resulting function as ACF. So, this empirical method is imprecise but offers
+simpler alternative to Wiener---Khinchin theorem approach; it is mainly useful
+to test ARMA model.
+
+**** Standing wave ACF.
+For three-dimensional plane standing wave the profile is given by
+\begin{equation}
+  \zeta(t, x, y) = A \sin (k_x x + k_y y) \sin (\sigma t).
+  \label{eq-standing-wave}
+\end{equation}
+Find ACF via analytic method. Multiplying the formula by a decaying exponent
+(because Fourier transform is defined for a function \(f\) such that
+\(f\underset{x\rightarrow\pm\infty}{\longrightarrow}0\)) yields
+\begin{equation}
+  \zeta(t, x, y) =
+  A
+  \exp\left[-\alpha (|t|+|x|+|y|) \right]
+  \sin (k_x x + k_y y) \sin (\sigma t).
+  \label{eq-decaying-standing-wave}
+\end{equation}
+Then, apply 3D Fourier transform to both sides of the equation via symbolic
+computation programme, fit the resulting polynomial to the following
+approximation:
+\begin{equation}
+  K(t,x,y) =
+  \gamma
+  \exp\left[-\alpha (|t|+|x|+|y|) \right]
+  \cos \beta t
+  \cos \left[ \beta x + \beta y \right].
+  \label{eq-standing-wave-acf}
+\end{equation}
+So, after applying Wiener---Khinchin theorem we get initial formula but with
+cosines instead of sines. This difference is important because the value of ACF
+at \((0,0,0)\) equals the ARMA process variance, and if one used sines the
+value would be wrong.
+
+If one tries to replicate the same formula via empirical method, the usual way
+is to adapt eqref:eq-decaying-standing-wave to match eqref:eq-standing-wave-acf.
+This can be done either by changing the phase of the sine, or by substituting
+sine with cosine to move the maximum of the function to the origin of
+coordinates.
+
+**** Propagating wave ACF.
+Three-dimensional profile of plane propagating wave is given by
+\begin{equation}
+  \zeta(t, x, y) = A \cos (\sigma t + k_x x + k_y y).
+  \label{eq-propagating-wave}
+\end{equation}
+For the analytic method repeating steps from the previous two paragraphs yields
+\begin{equation}
+  K(t,x,y) =
+  \gamma
+  \exp\left[-\alpha (|t|+|x|+|y|) \right]
+  \cos\left[\beta (t+x+y) \right].
+  \label{eq-propagating-wave-acf}
+\end{equation}
+For the empirical method the wave profile is simply multiplied by a decaying
+exponent without need to adapt the maximum value of ACF (as it is required for
+standing wave).
+
+**** Comparison of studied methods.
+To summarise, the analytic method of finding ocean wave's ACF reduces to the
+following steps.
+- Make wave profile decay when approaching \(\pm\infty\) by multiplying it by
+  a decaying exponent.
+- Apply Fourier transform to the absolute square of the resulting equation using
+  symbolic computation programme.
+- Fit the resulting polynomial to the appropriate ACF approximation.
+
+Two examples in this section showed that in case of standing and propagating
+waves their decaying profiles resemble the corresponding ACFs with the exception
+that the ACF's maximum should be moved to the origin to preserve simulated
+process variance. Empirical method of finding ACF reduces to the following
+steps.
+- Make wave profile decay when approaching \(\pm\infty\) by multiplying it by
+  a decaying exponent.
+- Move maximum value of the resulting function to the origin by using
+  trigonometric identities to shift the phase.
+
+** Additional formulae, methods and algorithms for ARMA model
+:PROPERTIES:
+:CUSTOM_ID: sec:arma-algorithms
+:END:
+*** Wave elevation distribution approximation
+One of the parameters of ocean wavy surface generator is probability density
+function (PDF) of the surface elevation. This distribution is given by either
+polynomial approximation of /in situ/ data or analytic formula.
+
+**** Gram---Charlier series expansion.
+In\nbsp{}cite:huang1980experimental the authors experimentally show, that PDF of sea
+surface elevation is distinguished from normal distribution by non-nought
+kurtosis and skewness. In\nbsp{}cite:рожков1996теория the authors show, that this type
+of PDF expands in Gram---Charlier series:
+\begin{align}
+    \label{eq-skew-normal-1}
+    F(z; \gamma_1, \gamma_2) & = \phi(z)
+        - \gamma_1 \frac{\phi'''(z)}{3!}
+        + \gamma_2 \frac{\phi''''(z)}{4!} \nonumber \\
+    & =
+    \frac{1}{2} \text{erf}\left[\frac{z}{\sqrt{2}}\right]
+    -
+    \frac{e^{-\frac{z^2}{2}}}{\sqrt{2\pi}}
+    \left[
+        \frac{1}{6} \gamma_1 \left(z^2-1\right)
+        + \frac{1}{24} \gamma_2 z \left(z^2-3\right)
+    \right]
+    ,\nonumber \\
+    f(z; \gamma_1, \gamma_2) & =
+    \frac{e^{-\frac{z^2}{2}}}{\sqrt{2 \pi }}
+    \left[
+        \frac{1}{6} \gamma_1 z \left(z^2-3\right)
+        + \frac{1}{24} \gamma_2 \left(z^4-6z^2+3\right)
+        +1
+    \right],
+\end{align}
+where \(\phi(z)=\frac{1}{2}\mathrm{erf}(z/\sqrt{2})\), \(\gamma_1\)\nbsp{}--- skewness,
+\(\gamma_2\)\nbsp{}--- kurtosis, \(f\)\nbsp{}--- PDF, \(F\)\nbsp{}--- cumulative distribution function
+(CDF). According to\nbsp{}cite:рожков1990вероятностные for ocean waves skewness is
+selected from interval \(0.1\leq\gamma_1\leq{0.52}\) and kurtosis from interval
+\(0.1\leq\gamma_2\leq{0.7}\). Family of probability density functions for
+different parameters is shown in fig.\nbsp{}[[fig-skew-normal-1]].
+
+#+NAME: fig-skew-normal-1
+#+begin_src R :file build/skew-normal-1.pdf
+source(file.path("R", "common.R"))  # load R/common.R; presumably defines the arma.* helpers used below (verify)
+x <- seq(-3, 3, length.out=100)  # 100 evenly spaced abscissae on [-3, 3]
+params <- data.frame(  # one row per plotted curve
+  skewness = c(0.00, 0.52, 0.00, 0.52),
+  kurtosis = c(0.00, 0.00, 0.70, 0.70),
+  linetypes = c("solid", "dashed", "dotdash", "dotted")
+)
+arma.skew_normal_1_plot(x, params)  # project helper (not visible here); expected to draw one curve per row
+legend(
+  "topleft",
+  mapply(  # build one legend label per (skewness, kurtosis) pair
+    function (s, k) {
+      as.expression(bquote(list(  # plotmath label: gamma[1] == s, gamma[2] == k
+        gamma[1] == .(arma.fmt(s, 2)),
+        gamma[2] == .(arma.fmt(k, 2))
+      )))
+    },
+    params$skewness,
+    params$kurtosis
+  ),
+  lty = paste(params$linetypes)  # paste() yields character line-type names for legend()
+)
+#+end_src
+
+#+caption: Probability density function eqref:eq-skew-normal-1 of ocean wavy surface elevation for different values of skewness \(\gamma_1\) and kurtosis \(\gamma_2\).
+#+label: fig-skew-normal-1
+#+RESULTS: fig-skew-normal-1
+[[file:build/skew-normal-1.pdf]]
+
+**** Skew-normal distribution.
+Alternative approach is to approximate distribution of ocean wavy surface
+elevation by skew-normal distribution:
+\begin{align}
+    \label{eq-skew-normal-2}
+    F(z; \alpha) & = \frac{1}{2}
+   \mathrm{erfc}\left[-\frac{z}{\sqrt{2}}\right]-2 T(z,\alpha ), \nonumber \\
+    f(z; \alpha) & = \frac{e^{-\frac{z^2}{2}}}{\sqrt{2 \pi }}
+   \mathrm{erfc}\left[-\frac{\alpha z}{\sqrt{2}}\right],
+\end{align}
+where \(T\)\nbsp{}--- Owen \(T\)-function\nbsp{}cite:owen1956tables. Using this formula it is
+impossible to specify skewness and kurtosis separately\nbsp{}--- both values are
+adjusted via \(\alpha\) parameter. The only advantage of the formula is its
+relative computational simplicity: this function is available in some programmes
+and mathematical libraries. Its graph for different values of \(\alpha\) is shown
+in fig.\nbsp{}[[fig-skew-normal-2]].
+
+#+name: fig-skew-normal-2
+#+begin_src R :file build/skew-normal-2.pdf
+# Plot one curve per alpha value and label each curve in the legend.
+source(file.path("R", "common.R"))
+x <- seq(-3, 3, length.out=100)
+alpha <- c(0.00, 0.87, 2.25, 4.90)
+params <- data.frame(
+  alpha = alpha,
+  skewness = arma.bits.skewness_2(alpha),
+  kurtosis = arma.bits.kurtosis_2(alpha),
+  linetypes = c("solid", "dashed", "dotdash", "dotted")
+)
+arma.skew_normal_2_plot(x, params)
+# Build the plotmath legend entry for a single (alpha, skewness, kurtosis) triple.
+curve_label <- function (shape, skew, kurt) {
+  as.expression(bquote(list(
+    alpha == .(arma.fmt(shape, 2)),
+    gamma[1] == .(arma.fmt(skew, 2)),
+    gamma[2] == .(arma.fmt(kurt, 2))
+  )))
+}
+curve_labels <- mapply(
+  curve_label,
+  params$alpha,
+  params$skewness,
+  params$kurtosis
+)
+legend("topleft", curve_labels, lty = paste(params$linetypes))
+#+end_src
+
+#+caption: Probability density function eqref:eq-skew-normal-2 of ocean wavy surface for different values of skewness coefficient \(\alpha\).
+#+label: fig-skew-normal-2
+#+RESULTS: fig-skew-normal-2
+[[file:build/skew-normal-2.pdf]]
+
+**** Evaluation.
+Equation eqref:eq-distribution-transformation with selected wave elevation
+distribution may be solved either in every point of generated wavy surface,
+which gives the most accurate results, or in every fixed grid point
+interpolating result via least-squares (LS) polynomial. In the second case
+precision is lower. For example, interpolating 12^th order polynomial on a fixed
+grid of 500 points on interval \(-5\sigma_z\leq{z}\leq{5}\sigma_z\) gives error of
+\(\approx{0.43}\cdot10^{-3}\). Increasing polynomial order leads to either numeric
+overflows during LS interpolation, or more coefficients close to nought;
+increasing the size of the grid has insignificant effect on the result. In the
+majority of cases three Gram---Charlier series coefficients are enough to
+transform ACF; relative error without interpolation is \(10^{-5}\).
+
+*** White noise generation algorithm
+In order to eliminate periodicity from generated wavy surface, it is imperative
+to use PRNG with sufficiently large period to generate white noise. Parallel
+Mersenne Twister\nbsp{}cite:matsumoto1998mersenne with a period of \(2^{19937}-1\) is
+used as a generator in this work. It allows to produce aperiodic ocean wavy
+surface realisations in any practical usage scenarios.
+
+There is no guarantee that multiple Mersenne Twisters executed in parallel
+threads with distinct initial states produce uncorrelated pseudo-random number
+sequences, however, algorithm of dynamic creation of Mersenne Twisters\nbsp{}cite:matsumoto1998dynamic may be used to provide such guarantee. The essence of
+the algorithm is to find matrices of initial generator states, that give
+maximally uncorrelated pseudo-random number sequences when Mersenne Twisters are
+executed in parallel with these initial states. Since finding such initial
+states consumes considerable amount of processor time, vector of initial states
+is created in advance with a deliberately larger number of parallel threads and
+saved to a file, which is then read before starting white noise generation.
+
+*** Wavy surface generation algorithm
+In ARMA model value of wavy surface elevation at a particular point depends on
+previous in space and time points, as a result the so called /ramp-up interval/
+(see fig.\nbsp{}[[fig-ramp-up-interval]]), in which realisation does not correspond to
+specified ACF, forms in the beginning of the realisation. There are several
+solutions to this problem which depend on the simulation context.
+
+If realisation is used in the context of ship stability simulation without
+manoeuvring, ramp-up interval will not affect results of the simulation, because
+it is located on the border (too far away from the studied marine object). If
+ship stability with manoeuvring is studied, then the interval may be simply
+discarded from the realisation (the size of the interval approximately equals
+the number of AR coefficients in each dimension). However, this may lead to loss
+of a very large number of points, because discarding occurs for each dimension.
+Alternative approach is to generate ocean wavy surface on ramp-up interval with
+LH model and generate the rest of the realisation with ARMA model.
+
+Algorithm of wavy surface generation is data-parallel: realisation is divided
+into equal parts each of which is generated independently, however, in the
+beginning of each realisation there is ramp-up interval. To eliminate it
+/overlap-add/ method\nbsp{}cite:oppenheim1989discrete,svoboda2011efficient,pavel2013algorithms (a popular
+method in signal processing) is used. The essence of the method is to add
+another interval, size of which is equal to the ramp-up interval size, to the
+end of each part. Then wavy surface is generated in each point of each part
+(including points from the added interval), the interval at the end of part \(N\)
+is superimposed on the ramp-up interval at the beginning of the part \(N+1\), and
+values in corresponding points are added.
+
+#+name: fig-ramp-up-interval
+#+begin_src R :file build/ramp-up-interval.pdf
+source(file.path("R", "common.R"))  # load R/common.R; presumably it defines the arma.* helpers
+arma.plot_ramp_up_interval()  # draw the figure; the block header stores it in build/ramp-up-interval.pdf
+#+end_src
+
+#+caption: Ramp-up interval at the beginning of the \(OX\) axis of the realisation.
+#+label: fig-ramp-up-interval
+#+RESULTS: fig-ramp-up-interval
+[[file:build/ramp-up-interval.pdf]]
+
+*** Velocity potential normalisation formulae
+:PROPERTIES:
+:CUSTOM_ID: sec:compute-delta
+:END:
+
+In solutions eqref:eq-solution-2d and eqref:eq-solution-2d-full to
+two-dimensional pressure determination problem there are functions
+\(\Fun{z}=\InverseFourierY{e^{2\pi{u}{z}}}{x}\) and
+\(\FunSecond{z}=\InverseFourierY{\Sinh{2\pi{u}{z}}}{x}\) which have multiple
+analytic representations and are difficult to compute. Each function is a
+Fourier transform of linear combination of exponents which reduces to poorly
+defined Dirac delta function of a complex argument (see table\nbsp{}[[tab-delta-functions]]).
+The usual way of handling this type of functions is to write them as
+multiplication of Dirac delta functions of real and imaginary part, however,
+this approach does not work here, because applying inverse Fourier transform to
+this representation does not produce an exponent, which severely warps the resulting
+velocity field. In order to get unique analytic definition normalisation factor
+\(1/\Sinh{2\pi{u}{h}}\) (which is also included in formula for \(E(u)\)) may be
+used. Despite the fact that normalisation allows to obtain adequate velocity
+potential field, numerical experiments show that there is little difference
+between this field and the one produced by formulae from linear wave theory, in
+which terms with \(\zeta\) are omitted.
+
+#+name: tab-delta-functions
+#+caption: Formulae for computing \(\Fun{z}\) and \(\FunSecond{z}\) from [[#sec:pressure-2d]], that use normalisation to eliminate uncertainty from definition of Dirac delta function of complex argument.
+#+attr_latex: :booktabs t
+| Function          | Without normalisation                                        | Normalised                                                                                                                             |
+|-------------------+--------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------|
+| \(\Fun{z}\)       | \(\delta (x+i z)\)                                           | \(\frac{1}{2 h}\mathrm{sech}\left(\frac{\pi  (x-i (h+z))}{2 h}\right)\)                                                                |
+| \(\FunSecond{z}\) | \(\frac{1}{2}\left[\delta (x-i z) + \delta (x+i z) \right]\) | \(\frac{1}{4 h}\left[\text{sech}\left(\frac{\pi  (x-i (h+z))}{2 h}\right)+\text{sech}\left(\frac{\pi  (x+i(h+z))}{2 h}\right)\right]\) |
+
+** ARMA model verification
+:PROPERTIES:
+:CUSTOM_ID: sec:verification
+:END:
+
+In\nbsp{}cite:degtyarev2011modelling,degtyarev2013synoptic,boukhanovsky1997thesis AR
+model the following items are verified experimentally:
+- probability distributions of different wave characteristics (wave heights,
+  lengths, crests, periods, slopes, three-dimensionality),
+- dispersion relation,
+- retention of integral characteristics for mixed wave sea state.
+In this work both AR and MA models are verified by comparing probability
+distributions of different wave characteristics.
+
+*** Verification of wavy surface integral characteristics
+In\nbsp{}cite:рожков1990вероятностные the authors show that several ocean wave
+characteristics (listed in table\nbsp{}[[tab-weibull-shape]]) have Weibull distribution,
+and wavy surface elevation has Gaussian distribution. In order to verify that
+distributions corresponding to generated realisation are correct,
+quantile-quantile plots are used (plots where analytic quantile values are used
+for \(OX\) axis and estimated quantile values for \(OY\) axis). If the estimated
+distribution matches analytic then the graph has the form of the straight line.
+Tails of the graph may diverge from the straight line, because they can not be
+reliably estimated from the finite-size realisation. Different methods of
+extracting waves from realisation produce variations in quantile function tails,
+it is probably impractical to extract every possible wave from realisation since
+they may (and often do) overlap.
+
+#+name: tab-weibull-shape
+#+caption: Values of Weibull shape parameter for different wave characteristics.
+#+attr_latex: :booktabs t
+| Characteristic       | Weibull shape (\(k\)) |
+|----------------------+---------------------|
+| Wave height          |                   2 |
+| Wave length          |                 2.3 |
+| Crest length         |                 2.3 |
+| Wave period          |                   3 |
+| Wave slope           |                 2.5 |
+| Three-dimensionality |                 2.5 |
+
+Verification was performed for standing and propagating waves. The corresponding
+ACFs and quantile-quantile plots of wave characteristics distributions are shown
+in
+fig.\nbsp{}[[propagating-wave-distributions]],\nbsp{}[[standing-wave-distributions]],\nbsp{}[[acf-slices]].
+
+#+name: propagating-wave-distributions
+#+begin_src R :file build/propagating-wave-qqplots.pdf
+source(file.path("R", "common.R"))  # load R/common.R; presumably it defines arma.qqplot_grid
+par(pty="s", mfrow=c(2, 2))  # square plotting regions in a 2x2 grid, filled row by row
+arma.qqplot_grid(
+  file.path("build", "propagating_wave"),  # directory handed to the helper; presumably holds the simulation output
+  c("elevation", "heights_y", "lengths_y", "periods"),  # identifiers of the four wave characteristics
+  c("elevation", "height Y", "length Y", "period"),  # human-readable names in the same order (presumably panel titles)
+  xlab="x",
+  ylab="y"
+)
+#+end_src
+
+#+caption: Quantile-quantile plots for propagating waves.
+#+label: propagating-wave-distributions
+#+RESULTS: propagating-wave-distributions
+[[file:build/propagating-wave-qqplots.pdf]]
+
+#+name: standing-wave-distributions
+#+begin_src R :file build/standing-wave-qqplots.pdf
+source(file.path("R", "common.R"))  # load R/common.R; presumably it defines arma.qqplot_grid
+par(pty="s", mfrow=c(2, 2))  # square plotting regions in a 2x2 grid, filled row by row
+arma.qqplot_grid(
+  file.path("build", "standing_wave"),  # directory handed to the helper; presumably holds the simulation output
+  c("elevation", "heights_y", "lengths_y", "periods"),  # identifiers of the four wave characteristics
+  c("elevation", "height Y", "length Y", "period"),  # human-readable names in the same order (presumably panel titles)
+  xlab="x",
+  ylab="y"
+)
+#+end_src
+
+#+caption: Quantile-quantile plots for standing waves.
+#+label: standing-wave-distributions
+#+RESULTS: standing-wave-distributions
+[[file:build/standing-wave-qqplots.pdf]]
+
+#+name: acf-slices
+#+header: :width 6 :height 9
+#+begin_src R :file build/acf-slices.pdf
+source(file.path("R", "common.R"))  # load R/common.R; presumably it defines arma.wavy_plot
+propagating_acf <- read.csv(file.path("build", "propagating_wave", "acf.csv"))  # ACF data of the propagating wave
+standing_acf <- read.csv(file.path("build", "standing_wave", "acf.csv"))  # ACF data of the standing wave
+par(mfrow=c(5, 2), mar=c(0,0,0,0))  # 5x2 grid of panels with zero margins, filled row by row
+for (i in seq(0, 4)) {  # i = 0..4, one grid row per iteration; NOTE(review): i is presumably the time-slice index
+  arma.wavy_plot(standing_acf, i, zlim=c(-5,5))  # drawn first, so it lands in the left column
+  arma.wavy_plot(propagating_acf, i, zlim=c(-5,5))  # drawn second, so it lands in the right column
+}
+#+end_src
+
+#+caption: Time slices of ACF for standing (left column) and propagating waves (right column).
+#+label: acf-slices
+#+RESULTS: acf-slices
+[[file:build/acf-slices.pdf]]
+
+Graph tails in fig.\nbsp{}[[propagating-wave-distributions]] deviate from original
+distribution for individual wave characteristics, because every wave has to be
+extracted from the resulting wavy surface to measure its length, period and
+height. There is no algorithm that guarantees correct extraction of all waves,
+because they may and often do overlap each other. Weibull distribution right tail
+represents infrequently occurring waves, so it deviates more than left tail.
+
+Correspondence rate for standing waves (fig.\nbsp{}[[standing-wave-distributions]])
+is lower for height and length, roughly the same for surface
+elevation and higher for wave period distribution tails. Lower correspondence
+degree for length and height may be attributed to the fact that Weibull
+distributions were obtained empirically for ocean waves which are typically
+propagating, and distributions may be different for standing waves. Higher
+correspondence degree for wave periods is attributed to the fact that wave
+periods of standing waves are extracted more precisely as the waves do not move
+outside simulated wavy surface region. The same correspondence degree for wave elevation
+is obtained, because this is the characteristic of the wavy surface (and
+corresponding AR or MA process) and is not affected by the type of waves.
+
+*** Verification of velocity potential fields
+:PROPERTIES:
+:CUSTOM_ID: sec:compare-formulae
+:END:
+
+Comparing obtained generic formulae eqref:eq-solution-2d and
+eqref:eq-solution-2d-full to the known formulae from linear wave theory allows
+to see the difference between velocity fields for both large and small amplitude
+waves. In general analytic formula for velocity potential is not known, even for
+plain waves, so comparison is done numerically. Taking into account conclusions
+of [[#sec:pressure-2d]], only finite depth formulae are compared.
+
+**** The difference with linear wave theory formulae.
+In order to obtain velocity potential fields, ocean wavy surface was generated
+by AR model with varying wave amplitude. In numerical implementation wave
+numbers in Fourier transforms were chosen on the interval from \(0\) to the
+maximal wave number determined numerically from the obtained wavy surface.
+Experiments were conducted for waves of both small and large amplitudes.
+
+The experiment showed that velocity potential fields produced by formula
+eqref:eq-solution-2d-full for finite depth fluid and formula
+eqref:eq-solution-2d-linear from linear wave theory are qualitatively different
+(fig.\nbsp{}[[fig-potential-field-nonlinear]]). First, velocity potential contours
+have sinusoidal shape, which is different from oval shape described by linear
+wave theory. Second, velocity potential decays more rapidly than in linear wave
+theory as getting closer to the bottom, and the region where the majority of
+wave energy is concentrated is closer to the wave crest. Similar numerical
+experiment, in which all terms of eqref:eq-solution-2d-full that are neglected
+in the framework of linear wave theory are eliminated, shows no difference (as
+much as machine precision allows) in resulting velocity potential fields.
+
+#+name: fig-potential-field-nonlinear
+#+caption: Velocity potential field of propagating wave \(\zeta(x,y,t) = \cos(2\pi x - t/2)\). Field produced by formula eqref:eq-solution-2d-full (top) and linear wave theory formula (bottom).
+#+begin_figure
+#+attr_latex: :width 0.47\textwidth
+[[file:graphics/pressure/potential-5.eps]]
+#+attr_latex: :width 0.47\textwidth
+[[file:graphics/pressure/potential-6.eps]]
+#+end_figure
+
+**** The difference with small-amplitude wave theory.
+The experiment, in which velocity fields produced numerically by different
+formulae were compared, shows that velocity fields produced by formula
+eqref:eq-solution-2d-full and eqref:eq-old-sol-2d correspond to each other for
+small-amplitude waves. Two ocean wavy surface realisations were made by AR
+model: one containing small-amplitude waves, other containing large-amplitude
+waves. Integration in formula eqref:eq-solution-2d-full was done over wave
+numbers range extracted from the generated wavy surface. For small-amplitude
+waves both formulae showed comparable results (the difference in the velocity is
+attributed to the stochastic nature of AR model), whereas for large-amplitude
+waves stable velocity field was produced only by formula
+eqref:eq-solution-2d-full (fig.\nbsp{}[[fig-velocity-field-2d]]). So, generic
+formula eqref:eq-solution-2d-full gives satisfactory results without restriction
+on wave amplitudes.
+
+#+name: fig-velocity-field-2d
+#+caption: Comparison of velocity field on the ocean wavy surface obtained by generic formula (\(u_1\)) and formula for small-amplitude waves (\(u_2\)). Velocity field for realisations containing small-amplitude (top) and large-amplitude (bottom) waves.
+#+begin_figure
+[[file:build/low-amp-nocolor.eps]]
+[[file:build/high-amp-nocolor.eps]]
+#+end_figure
+*** Non-physical nature of ARMA model
+ARMA model, owing to its non-physical nature, does not have the notion of ocean
+wave; it simulates wavy surface as a whole instead. Motions of individual waves
+and their shape are often rough, and the total number of waves can not be
+determined precisely. However, integral characteristics of wavy surface match
+the ones of real ocean waves.
+
+Theoretically, ocean waves themselves can be chosen as ACFs, the only
+pre-processing step is to make them decay exponentially. This may allow
+to generate waves of arbitrary profiles, and is one of the directions of future
+work.
+
+* High-performance software implementation of ocean wave simulation
+** Computational model
+**** Mapping wavy surface generation algorithm on computational model.
+Software implementation of ARMA model works as a computational pipeline, in
+which each joint applies some function to the output coming from the pipe of the
+previous joint. Joints are distributed across computer cluster nodes to enable
+function parallelism, and then data flowing through the joints is distributed
+across processor cores to enable data parallelism. Figure\nbsp{}[[fig-pipeline]] shows a
+diagram of data processing pipeline in which rectangles with rounded corners
+denote joints, regular rectangles denote arrays of problem domain objects
+flowing from one joint to another, and arrows show flow direction. Some joints
+are divided into /sections/ each of which processes a separate part of the array.
+If joints are connected without a /barrier/ (horizontal or vertical bar), then
+transfer of separate objects between them is done in parallel to computations,
+as they become available. Sections work in parallel on each processor core (or
+node of the cluster). There is surjective mapping between a set of processor
+cores, a set of pipeline joint sections and objects, i.e. each processor core
+may run several sections, each of which may sequentially process several
+objects, but a section can not work simultaneously on several processor cores,
+and an object can not be processed simultaneously by several sections.
+
+#+name: fig-pipeline
+#+begin_src dot :exports results :file build/pipeline.pdf
+digraph {
+  # Data processing pipeline: rounded boxes are joints, plain boxes and records are data.
+  node [fontsize=14,margin="0.055,0"]
+  graph [nodesep="0.25",ranksep="0.25",rankdir="TB"]
+  edge [arrowsize=0.66]
+
+  # data nodes (plain boxes) of the sequential part of the pipeline
+  subgraph xcluster_linear {
+    label="Linear model"
+
+    start [label="",shape=circle,style=filled,fillcolor=black,width=0.23]
+    spectrum [label="S(ω,θ)",shape=box]
+    acf [label="K(i,j,k)",shape=box]
+    phi [label="Φ(i,j,k)",shape=box]
+
+    # joints (rounded boxes) that transform the data nodes above
+    fourier_transform [label="Fourier transform",shape=box,style=rounded]
+    solve_yule_walker [label="Solve Yule—Walker\nequations",shape=box,style=rounded]
+
+    subgraph cluster_nonlinear_1 {
+      label="Simulate non-linearity\l"
+      labeljust=left
+      style=filled
+      color=lightgrey
+      acf2 [label="K*(i,j,k)",shape=box]
+      transform_acf [label="Transform ACF",shape=box,style=rounded]
+    }
+  }
+
+  subgraph xcluster_linear2 {
+    # the name does not start with "cluster", so dot does not draw this subgraph as a box
+    eps_parts [label="<e1> ε₁|<e2> ε₂|<e3> …|<e4> εₙ|<e> ε(t,x,y)",shape=record]
+    end [label="",shape=doublecircle,style=filled,fillcolor=black,width=0.23]
+
+    generate_white_noise [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Generate\lwhite noise",shape=record,style=rounded]
+    generate_zeta [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Generate ocean\lwavy surface parts\l",shape=record,style=rounded]
+
+    zeta_parts [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> Non-crosslinked\lrealisation parts",shape=record]
+    overlap_add [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> Crosslink realisation\lparts\l",shape=record,style=rounded]
+
+    zeta_parts:g1->overlap_add:g1
+    zeta_parts:g2->overlap_add:g2
+    zeta_parts:g3->overlap_add:g3
+    zeta_parts:g4->overlap_add:g4
+
+    zeta_parts:g2->overlap_add:g1 [constraint=false]
+    zeta_parts:g3->overlap_add:g2 [constraint=false]
+    zeta_parts:g4->overlap_add:g3 [constraint=false]
+
+    overlap_add:g1->zeta2_parts:g1
+    overlap_add:g2->zeta2_parts:g2
+    overlap_add:g3->zeta2_parts:g3
+    overlap_add:g4->zeta2_parts:g4
+
+    zeta2_parts:g1->transform_zeta:g1->zeta3_parts:g1->write_zeta:g1->eps_end
+    zeta2_parts:g2->transform_zeta:g2->zeta3_parts:g2->write_zeta:g2->eps_end
+    zeta2_parts:g3->transform_zeta:g3->zeta3_parts:g3->write_zeta:g3->eps_end
+    zeta2_parts:g4->transform_zeta:g4->zeta3_parts:g4->write_zeta:g4->eps_end
+
+  }
+
+  subgraph part3 {
+
+    zeta2_parts [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> Wavy surface with\lGaussian distribution\l",shape=record]
+
+    subgraph cluster_nonlinear_2 {
+      label="Simulate non-linearity\r"
+      labeljust=right
+      style=filled
+      color=lightgrey
+      zeta3_parts [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> ζ(t,x,y)",shape=record]
+      transform_zeta [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Transform wavy\lsurface elevation\lprobability distribution\l",shape=record,style=rounded]
+    }
+
+    # barriers: thin filled black boxes (height 0.05) drawn as bars
+    eps_start [label="",shape=box,style=filled,fillcolor=black,height=0.05]
+    eps_end [label="",shape=box,style=filled,fillcolor=black,height=0.05]
+
+    write_zeta [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Write finished\lparts to a file\l",shape=record,style=rounded]
+  }
+
+  # edges: the sequential chain, then the fan-out from the start barrier into sections
+  start->spectrum->fourier_transform->acf->transform_acf
+  transform_acf->acf2
+  acf2->solve_yule_walker
+  solve_yule_walker->phi
+  phi->eps_start [constraint=false]
+  eps_start->generate_white_noise:g1
+  eps_start->generate_white_noise:g2
+  eps_start->generate_white_noise:g3
+  eps_start->generate_white_noise:g4
+  generate_white_noise:g1->eps_parts:e1->generate_zeta:g1->zeta_parts:g1
+  generate_white_noise:g2->eps_parts:e2->generate_zeta:g2->zeta_parts:g2
+  generate_white_noise:g3->eps_parts:e3->generate_zeta:g3->zeta_parts:g3
+  generate_white_noise:g4->eps_parts:e4->generate_zeta:g4->zeta_parts:g4
+
+  eps_end->end
+}
+#+end_src
+
+#+caption: Diagram of data processing pipeline, that implements ocean wavy surface generation via AR model.
+#+label: fig-pipeline
+#+RESULTS: fig-pipeline
+[[file:build/pipeline.pdf]]
+
+Object pipeline may be seen as an improvement of BSP (Bulk Synchronous Parallel)
+model\nbsp{}cite:valiant1990bridging, which is used in graph processing\nbsp{}cite:malewicz2010pregel,seo2010hama. Pipeline eliminates global synchronisation
+(where it is possible) after each sequential computation step by doing data
+transfer between joints in parallel to computations, whereas in BSP model global
+synchronisation occurs after each step.
+
+Object pipeline speeds up the programme by parallel execution of code blocks
+that work with different compute devices: while the current part of wavy surface
+is generated by a processor, the previous part is written to a disk. This
+approach allows to get speed-up because compute devices operate asynchronously,
+and their parallel usage increases the whole programme performance.
+
+Since data transfer between pipeline joints is done in parallel to computations,
+the same pipeline may be used to run several copies of the application but with
+different parameters (generate several ocean wavy surfaces having different
+characteristics). In practice, high-performance applications do not always
+consume 100% of processor time spending a portion of time on synchronisation of
+parallel processes and writing data to disk. Using pipeline in this case allows
+to run several computations on the same set of processes, and use all of the
+computer devices at maximal efficiency. For example, when one object writes data
+to a file, the other does computations on the processor in parallel. This
+minimises downtime of the processor and other computer devices and increases
+throughput of the computer cluster.
+
+Pipelining of otherwise sequential steps is beneficial not only for code that
+works with different devices, but also for code whose different branches are suitable
+for execution by multiple hardware threads of the same processor core, i.e.
+branches accessing different memory blocks or performing mixed arithmetic
+(integer and floating point). Code branches which use different modules of
+processor are good candidates to run in parallel on a processor core with
+multiple hardware threads.
+
+So, computational model with a pipeline can be seen as /bulk-asynchronous
+model/, because of the parallel nature of programme steps. This model is the
+basis of the fault-tolerance model which will be described later.
+
+**** Software implementation.
+For efficiency reasons object pipeline and fault tolerance techniques (which
+will be described later) are implemented in the C++ framework: From the author's
+perspective C language is deemed low-level for distributed programmes, and Java
+incurs too much overhead and is not popular in HPC community. As of now, the
+framework runs in the same process as a parallel application that uses it. The
+framework is called Factory, it is now in proof-of-concept development stage.
+
+**** Computational model overview.
+The key feature that is missing in the current parallel programming technologies
+is a possibility to specify hierarchical dependencies between parallel tasks.
+When one has such dependency, it is trivial to determine which task should be
+responsible for re-executing a failed task on one of the survived nodes. To
+re-execute the task on the top of the hierarchy, a backup task is created and
+executed on a different node. There exists a number of systems that are capable
+of executing directed acyclic graphs of tasks in parallel\nbsp{}cite:acun2014charmpp,islam2012oozie, but graphs are not suitable to infer
+principal-subordinate relationship between tasks, because a node in the graph
+may have multiple parent nodes.
+
+The main purpose of the model is to simplify development of distributed batch
+processing applications and middleware. The main focus is to make application
+resilient to failures, i.e. make it fault tolerant and highly available, and do
+it transparently to a programmer. The implementation is divided into two layers:
+the lower layer consists of routines and classes for single node applications
+(with no network interactions), and the upper layer for applications that run on
+an arbitrary number of nodes. There are two kinds of tightly coupled entities in
+the model\nbsp{}--- /control flow objects/ (or /kernels/ for short) and
+/pipelines/\nbsp{}--- which are used together to compose a programme.
+
+Kernels implement control flow logic in their ~act~ and ~react~ methods and
+store the state of the current control flow branch. Both logic and state are
+implemented by a programmer. In ~act~ method some function is either directly
+computed or decomposed into nested functions (represented by a set of
+subordinate kernels) which are subsequently sent to a pipeline. In ~react~
+method subordinate kernels that returned from the pipeline are processed by
+their parent. Calls to ~act~ and ~react~ methods are asynchronous and are made
+within threads attached to a pipeline. For each kernel ~act~ is called only
+once, and for multiple kernels the calls are done in parallel to each other,
+whereas ~react~ method is called once for each subordinate kernel, and all the
+calls are made in the same thread to prevent race conditions (for different
+parent kernels different threads may be used).
+
+Pipelines implement asynchronous calls to ~act~ and ~react~, and try to make as
+many parallel calls as possible considering concurrency of the platform (no. of
+cores per node and no. of nodes in a cluster). A pipeline consists of a kernel
+pool, which contains all the subordinate kernels sent by their parents, and a
+thread pool that processes kernels in accordance with rules outlined in the
+previous paragraph. A separate pipeline is used for each device: There are
+pipelines for parallel processing, schedule-based processing (periodic and
+delayed tasks), and a proxy pipeline for processing of kernels on other cluster
+nodes (see fig.\nbsp{}[[fig-subord-ppl]]).
+
+In principle, kernels and pipelines machinery reflect the one of procedures and
+call stacks, with the advantage that kernel methods are called asynchronously
+and in parallel to each other (as much as programme logic allows). Kernel field
+is the stack, ~act~ method is a sequence of processor instructions before nested
+procedure call, and ~react~ method is a sequence of processor instructions after
+the call. Constructing and sending subordinate kernels to the pipeline is nested
+procedure call. Two methods are necessary to make calls asynchronous, and
+replace active wait for completion of subordinate kernels with passive one.
+Pipelines, in turn, allow to implement passive wait, and call correct kernel
+methods by analysing their internal state.
+
+#+name: fig-subord-ppl
+#+begin_src dot :exports results :file build/subord-ppl.pdf
+graph G {
+  # Mapping of daemon and child process pipelines to compute devices.
+  node [fontname="Old Standard",fontsize=14,margin="0.055,0",shape=box]
+  graph [nodesep="0.25",ranksep="0.25",rankdir="LR"]
+  edge [arrowsize=0.66]
+
+  subgraph cluster_daemon {
+    label="Daemon process"
+    style=filled
+    color=lightgrey
+
+    factory [label="Factory"]
+    parallel_ppl [label="Parallel\npipeline"]
+    io_ppl [label="I/O\npipeline"]
+    sched_ppl [label="Schedule-based\npipeline"]
+    net_ppl [label="Network\npipeline"]
+    proc_ppl [label="Process\npipeline"]
+
+    upstream [label="Upstream\nthread pool"]
+    downstream [label="Downstream\nthread pool"]
+  }
+  # solid edges: the factory is linked to each of its pipelines
+  factory--parallel_ppl
+  factory--io_ppl
+  factory--sched_ppl
+  factory--net_ppl
+  factory--proc_ppl
+
+  subgraph cluster_hardware {
+    label="Compute devices"
+    style=filled
+    color=lightgrey
+
+    cpu [label="CPU"]
+    core0 [label="Core 0"]
+    core1 [label="Core 1"]
+    core2 [label="Core 2"]
+    core3 [label="Core 3"]
+
+    storage [label="Storage"]
+    disk0 [label="Disk 0"]
+
+    network [label="Network"]
+    nic0 [label="NIC 0"]
+
+    timer [label="Timer"]
+
+  }
+
+  core0--cpu
+  core1--cpu
+  core2--cpu
+  core3--cpu
+
+  disk0--storage
+  nic0--network
+
+  parallel_ppl--upstream
+  parallel_ppl--downstream
+  # dashed edges: thread pools and pipelines linked to the devices they use
+  upstream--{core0,core1,core2,core3} [style="dashed"]
+  downstream--core0 [style="dashed"]
+
+  io_ppl--core0 [style="dashed"]
+  io_ppl--disk0 [style="dashed"]
+  sched_ppl--core0 [style="dashed"]
+  sched_ppl--timer [style="dashed"]
+  net_ppl--core0 [style="dashed"]
+  net_ppl--nic0 [style="dashed"]
+  proc_ppl--core0 [style="dashed"]
+
+  subgraph cluster_children {
+    style=filled
+    color=white
+
+    subgraph cluster_child0 {
+      label="Child process 0"
+      style=filled
+      color=lightgrey
+      labeljust=right
+
+      app0_factory [label="Factory"]
+      app0 [label="Child process\rpipeline"]
+    }
+
+#    subgraph cluster_child1 {
+#      label="Child process 1"
+#      style=filled
+#      color=lightgrey
+#      labeljust=right
+#
+#      app1_factory [label="Factory"]
+#      app1 [label="Child process\rpipeline"]
+#    }
+  }
+
+  proc_ppl--app0
+#  proc_ppl--app1
+
+  app0_factory--app0 [constraint=false]
+#  app1_factory--app1 [constraint=false]
+
+}
+#+end_src
+
+#+caption: Mapping of parent and child process pipelines to compute devices. Solid lines denote aggregation, dashed lines denote mapping between logical and physical entities.
+#+attr_latex: :width \textwidth
+#+label: fig-subord-ppl
+#+RESULTS: fig-subord-ppl
+[[file:build/subord-ppl.pdf]]
+
+**** Governing principles.
+Data processing pipeline model is based on the following principles, following
+which maximises efficiency of a programme.
+- There is no notion of a message in the model, a kernel is itself a message
+  that can be sent over network to another node and directly access any kernel
+  on the local node. Only programme logic may guarantee the existence of the
+  kernel.
+- A kernel is a /cooperative routine/, which is submitted to kernel pool upon the
+  call and is executed asynchronously by a scheduler. There can be any number of
+  calls to other subroutines inside routine body. Every call submits
+  corresponding subroutine to kernel pool and returns immediately. Kernels in the
+  pool can be executed in any order; this fact is used by a scheduler to exploit
+  parallelism offered by the computer by distributing kernels from the pool
+  across available cluster nodes and processor cores.
+- Asynchronous execution prevents the use of explicit synchronisation after the
+  call to subroutine is made; system scheduler returns control flow to the
+  routine each time one of its subroutines returns. Such cooperation transforms
+  each routine which calls subroutines into event handler, where each event is a
+  subroutine and the handler is the routine that called them.
+- The routine may communicate with any number of local kernels, addresses of
+  which it knows; communication with kernels which are not adjacent in the call
+  stack complexifies control flow and call stack loses its tree shape. Only
+  programme logic may guarantee presence of communicating kernels in memory. One
+  way to ensure this is to perform communication between subroutines which are
+  called from the same routine. Since such communication is possible within
+  hierarchy through parent routine, it may be treated as an optimisation that
+  eliminates overhead of transferring data over intermediate node. The situation
+  is different for interactive or event-based programmes (e.g. servers and
+  programmes with graphical interface) in which this is primary type of
+  communication.
+- In addition to this, communication which does not occur along hierarchical
+  links and is executed over cluster network complicates design of resiliency
+  algorithms, since it is impossible to ensure that a kernel resides in memory
+  of a neighbour node: a node may fail in the middle of its execution of
+  the corresponding routine. As a result, upon failure of a routine all of its
+  subroutines must be restarted. This encourages a programmer to construct
+  - deep tree hierarchies of tightly-coupled kernels (which communicate on the
+    same level of hierarchy) to reduce overhead of recomputation;
+  - fat tree hierarchies of loosely-coupled kernels, providing maximal degree of
+    parallelism.
+  Deep hierarchy is not only a requirement of technology, it also helps optimise
+  communication of large number of cluster nodes reducing it to communication of
+  adjacent nodes.
+
+So, control flow objects (or kernels) possess properties of both cooperative
+routines and event handlers.
+
+** SMP implementation
+**** Load balancing algorithm.
+The simplest approach to balance the load on a multi-processor system is to
+split data into equal parts (or a task into homogeneous subtasks) and to
+distribute them evenly between processor cores and cluster nodes, however, this
+approach does not work efficiently in all cases. First, the total number of
+parts, into which input data is split, is often dictated by the problem being
+solved, rather than computer system architecture. Such load balancing may not
+be efficient from the computer system point of view: the number of parts is either
+too large compared to the number of processors working in parallel, which
+increases data transfer overhead, or too small, which prevents using all
+available processor cores. Second, restrictions of problem being solved may not
+allow to split input data into even parts which may result in load imbalance
+across processor cores. Third, there are multiple components in the system aside
+from the processor that take part in the computation (such as vector
+co-processors and storage devices), and the problem solution time depends on the
+performance of all the components involved. So, how to make load balancing
+algorithm more efficient in the presence of non-homogeneous input data parts and
+take into account all the devices involved in the computation?
+
+The load balancing algorithm consists of two stages. In the first stage, the
+algorithm places input data part (or a subtask) wrapped in a kernel into an
+appropriate kernel pool: there is a separate pool for each device and an
+associated thread pool. In the second stage, a kernel is retrieved from the pool
+by one of the threads and processed. Due to separate thread pools all devices
+work in parallel to each other, lowering overall system resources downtime
+compared to using all devices from a single thread.
+
+In order to take into account non-homogeneous input data parts or tasks, one may
+predict execution time of each task. Relevant study is done
+in\nbsp{}cite:degtyarev2016balance and is not repeated here, since ARMA model
+implementation includes mostly homogeneous tasks.
+
+So, load balancing is done in two stages: in the first stage the task wrapped in
+the kernel is routed to the appropriate device and in the second stage the
+kernel is routed to one of the threads from the device thread pool.
+Non-homogeneous kernels may be handled by predicting their execution time, but
+such kernels are not present in ARMA model implementation.
+
+**** Performance of MPI, OpenMP, OpenCL implementations.
+ARMA model does not require highly optimised software implementation to be
+efficient, its performance is high even without use of co-processors; there are
+two main causes of that. First, ARMA model itself does not use transcendental
+functions (sines, cosines and exponents) as opposed to LH model. All
+calculations (except model coefficients) are done via polynomials, which can be
+efficiently computed on modern processors using a series of FMA instructions.
+Second, pressure computation is done via explicit analytic formula using nested
+FFTs. Since two-dimensional FFT of the same size is repeatedly applied to every
+time slice, its coefficients (complex exponents) are pre-computed for all
+slices, and computations are performed with only a few transcendental functions.
+In case of MA model, performance is also increased by doing convolution with FFT
+transforms. So, high performance of ARMA model is due to scarce use of
+transcendental functions and heavy use of FFT, not to mention that high
+convergence rate and non-existence of periodicity allow using far fewer
+coefficients compared to LH model.
+
+ARMA implementation uses several libraries of reusable mathematical functions
+and numerical algorithms (listed in table\nbsp{}[[tab-arma-libs]]), and was implemented using
+several parallel programming technologies (MPI, OpenMP, OpenCL) to find the most
+efficient one.
+
+#+name: tab-arma-libs
+#+caption: A list of mathematical libraries used in ARMA model implementation.
+#+attr_latex: :booktabs t :align lp{0.6\linewidth}
+| Library                                                | What it is used for             |
+|--------------------------------------------------------+---------------------------------|
+| DCMT\nbsp{}cite:matsumoto1998dynamic                         | parallel PRNG                   |
+| Blitz\nbsp{}cite:veldhuizen1997will,veldhuizen2000techniques | multidimensional arrays         |
+| GSL\nbsp{}cite:gsl2008scientific                             | PDF, CDF, FFT computation       |
+|                                                        | checking process stationarity   |
+| LAPACK, GotoBLAS\nbsp{}cite:goto2008high,goto2008anatomy     | finding AR coefficients         |
+| GL, GLUT\nbsp{}cite:kilgard1996opengl                        | three-dimensional visualisation |
+
+**** Performance of load balancing algorithm.
+Software implementation of wavy surface generation is balanced in terms of the
+load on processor cores, however, as shown by tests, has high load on storage
+device. Before testing, wavy surface generation was implemented using OpenMP for
+parallel computations, and in order to implement load balancing algorithm it was
+rewritten using POSIX threads. Performance of the two implementations was
+compared on the platform with the configuration listed in table\nbsp{}[[tab-multicore-specs]].
+
+#+name: tab-multicore-specs
+#+caption: Multi-core system configuration.
+#+attr_latex: :booktabs t
+| Component                 | Details                          |
+|---------------------------+----------------------------------|
+| Programming language      | C++11                            |
+| Threading library         | C++11 STL threads                |
+| Atomic operations library | C++11 STL atomic                 |
+| Routines to measure time  | ~clock_gettime(CLOCK_MONOTONIC)~ |
+|                           | ~/usr/bin/time -f \%e~           |
+| Compiler                  | GCC 4.8.2                        |
+| Compiler flags            | ~-std=c++11 -O2 -march=native~   |
+| Operating system          | Debian 3.2.51-1 x86_64           |
+| File system               | ext4                             |
+| Processor                 | Intel Core 2 Quad Q9650          |
+| Core frequency (GHz)      | 3.00                             |
+| No. of cores              | 4                                |
+| Amount of RAM (GB)        | 8                                |
+| Disk                      | Seagate ST3250318AS              |
+| Disk speed (rpm)          | 7200                             |
+
+The experiment consisted of running both implementations on a multi-core machine
+varying the size of the surface; the size of CPU thread pool and I/O thread pool
+was not changed during the experiment. I/O thread pool consisted of one thread,
+and CPU thread pool size was equal to the number of physical processor cores.
+
+In the experiment load balancing algorithm showed higher performance than
+implementation without it. The larger the generated surface is, the larger
+the gap in performance is (fig.\nbsp{}[[fig-factory-performance]]), which is a
+result of overlap of computation phase and data output phase
+(fig.\nbsp{}[[fig-factory-overlap]]). In OpenMP implementation data output phase
+begins only when computation is over, whereas load balancing algorithm makes
+both phases end almost simultaneously. So, /pipelined execution of internally
+parallel sequential phases is more efficient than their sequential execution/,
+and this allows to balance the load across different devices involved in
+computation.
+
+#+name: fig-factory-performance
+#+header: :width 5 :height 4
+#+begin_src R :file build/factory-vs-openmp.pdf
+source(file.path("R", "common.R"))
+arma.plot_factory_vs_openmp(
+  xlab="Realisation size",
+  ylab="Time, s",
+  power=6
+)
+#+end_src
+
+#+caption: Performance comparison of OpenMP and Factory implementations.
+#+label: fig-factory-performance
+#+RESULTS: fig-factory-performance
+[[file:build/factory-vs-openmp.pdf]]
+
+#+name: fig-factory-overlap
+#+header: :width 7 :height 4
+#+begin_src R :file build/factory-vs-openmp-overlap.pdf
+source(file.path("R", "common.R"))
+par(mar=c(5, 6, 0, 1), pty="m")
+arma.plot_factory_vs_openmp_overlap(
+  xlab="Time, s",
+  labels=c("Factory", "OpenMP"),
+  scale=10**9
+)
+#+end_src
+
+#+caption: Overlap of parallel computations on \([G_0,G_1]\) and data output to disk on \([W_0,W_1]\). In OpenMP implementation there is no overlap.
+#+label: fig-factory-overlap
+#+RESULTS: fig-factory-overlap
+[[file:build/factory-vs-openmp-overlap.pdf]]
+
+Proposed load balancing method for multi-core systems allows to increase
+performance of applications that read or write large volumes of data to disk,
+but may be used in other cases too. The main idea of the algorithm is to
+classify the load and find the suitable device to route the load to. So, any
+devices other than disks may be used as well.
+** MPP implementation
+*** Cluster node discovery algorithm
+:PROPERTIES:
+:CUSTOM_ID: sec:node-discovery
+:END:
+
+Many distributed systems are built on the principle of /subordination/: there is
+principal node in each cluster which manages job queue, schedules their
+execution on subordinate nodes and monitors their state. Principal role is
+assigned either /statically/ by an administrator to a particular physical node,
+or /dynamically/ by electing one of the cluster nodes as principal. In the
+former case fault tolerance is provided by reserving additional spare node which
+takes principal role when current principal fails. In the latter case fault
+tolerance is provided by electing new principal node from survived nodes.
+Despite the fact that dynamic role assignment requires specialised distributed
+algorithm, this approach becomes more and more popular as it does not require
+spare reserved nodes to recover from principal node failure.
+
+Leader election algorithms (which are sometimes referred to as /distributed
+consensus/ algorithms) are special cases of wave algorithms. In\nbsp{}cite:tel2000introduction Tel defines them as algorithms in which termination
+event is preceded by at least one event occurring in /each/ parallel process.
+Wave algorithms are not defined for anonymous networks, that is they apply only
+to processes that can uniquely define themselves. However, the number of
+processes affected by the "wave" can be determined in the course of an
+algorithm. For a distributed system this means that wave algorithms work for
+computer clusters with dynamically changing number of nodes, and the algorithm
+is unaffected by some nodes going on-line and off-line.
+
+The approach in the following work does not use wave algorithms, and hence does
+not require communicating with each node of the cluster to determine a leader.
+Instead, each node enumerates all nodes in the network it is part of, and
+converts this list to a /tree hierarchy/ with a user-defined maximal fan-out
+value (maximal number of subordinate nodes). Then the node determines its
+hierarchy level and tries to communicate with nodes from higher levels to become
+their subordinate. First, it checks the closest ones and then goes all the way
+to the top. If there are no top-level nodes or the node cannot connect to them,
+then the node itself becomes the principal of the hierarchy.
+
+Tree hierarchy of all hosts in a network defines strict total order on a set of
+cluster nodes. Although, technically any function can be chosen to map a node to
+a number, in practice this function should be sufficiently smooth along the time
+axis and may have infrequent jumps: high-frequency oscillations (which are often
+caused by measurement errors) may result in constant passing of principal role
+from one node to another, which makes the cluster unmanageable. The simplest
+such function is the position of an IP address in network IP address range.
+
+The following key features distinguish this approach with respect to some
+existing proposals\nbsp{}cite:brunekreef1996design,aguilera2001stable,romano2014design.
+- *Multi-level hierarchy.* The number of principal nodes in a network depends on
+  the fan-out value. If it is less than the number of IP-addresses in the
+  network, then there are multiple principal nodes in the cluster. If it is
+  greater than or equal to the number of IP-addresses in the network, then there is
+  only one principal node. When some node fails, multi-level hierarchy changes
+  locally, only nodes adjacent to the failed one communicate.
+- *IP-address mapping.* Since hierarchy structure solely depends on the nodes'
+  IP addresses, there is no election phase in the algorithm. To change the
+  principal each node sends a message to the old principal and to the new one.
+- *Completely event-based.* The messages are sent only when some node fails, so
+  there is no constant load on the network. Since the algorithm allows
+  to tolerate failure of sending any message, there is no need in heartbeat
+  packets indicating presence of a node in the network; instead, all messages
+  play role of heartbeats and packet send time-out is adjusted.
+- *No manual configuration.* A node does not require any prior knowledge to find
+  the principal: it determines the network it is part of, calculates potential
+  principal IP-address and sends the message. If it fails, the process is
+  repeated for the next potential principal node. So the algorithm is suitable
+  to bootstrap a cluster without manual configuration, the only requirement is
+  to start the corresponding service on each node.
+To summarise, the advantage of the algorithm is that it
+- scales to a large number of nodes by means of hierarchy with multiple
+  principals,
+- does not constantly load the network with node state updates and heartbeat
+  packets,
+- does not require manual configuration to bootstrap a cluster.
+
+The disadvantage of the algorithm is that it requires IP-address to change
+infrequently. It is not suitable for cloud environments in which node DNS name
+is preserved, but IP-address may change over time. When IP-address changes,
+current connections may close, thus triggering node "failure" and rebuilding
+node hierarchy. So, environments where nodes are not identified by IP-addresses,
+are not suitable for the algorithm.
+
+The other disadvantage is that the algorithm creates artificial dependence of
+node rank on IP-address: it is difficult to substitute IP-address mapping with a
+more sophisticated one (e.g. a mapping which uses current node and network load
+to infer node ranks) because measurement errors may result in unstable
+hierarchy, and the algorithm ceases to be fully event-based.
+
+Node discovery algorithm is designed to balance the load on a cluster of compute
+nodes, its use in other applications is not studied here. When distributed or
+parallel programme starts on any of cluster nodes, its subtasks are distributed
+to all adjacent nodes in the hierarchy (including principal node if applicable).
+To distribute the load evenly when the application is run on a subordinate node,
+each node maintains weight of each adjacent node in the hierarchy. The weight
+equals to the number of nodes in the tree "behind" the adjacent node. For
+example, if the weight of the first adjacent node is 2, then round-robin load
+balancing algorithm distributes two subtasks to the first node before moving to
+the next one.
+
+To summarise, node discovery algorithm is
+- designed to ease load balancing on the cluster,
+- fully fault-tolerant, because the state of every node can be recomputed at any time,
+- fully event-based which means it does not load the network by periodically
+  sending messages.
+
+**** Building a tree hierarchy.
+Strict total order on the set \(\mathcal{N}\) of cluster nodes connected to a
+network is defined as
+\begin{equation*}
+  \forall n_1 \forall n_2 \in \mathcal{N},
+  \forall f \colon \mathcal{N} \rightarrow \mathcal{R}^n
+  \Rightarrow (f(n_1) < f(n_2) \Leftrightarrow \neg (f(n_1) \geq f(n_2))),
+\end{equation*}
+where \(f\) maps a node to its rank and operator \(<\) defines strict total order on
+\(\mathcal{R}^n\). Function \(f\) defines node's sequential number, and \(<\) makes
+this number unique.
+
+The simplest function \(f\) maps each node to its Internet address position in
+network IP address range. Without conversion to a tree (when only /one/
+leader is allowed in the network) a node with the lowest position in this range
+becomes the principal. If IP-address of a node occupies the first position in
+the range, then there is no principal for it, and it continues to be at the top
+of the hierarchy until it fails. Although IP address mapping is simple to
+implement, it introduces artificial dependence of the principal role on the
+address of a node. Still, it is useful for initial configuration of a cluster
+when more complex mappings are not applicable.
+
+To make discovery algorithm scale to a large number of nodes, IP address range
+is mapped to a tree hierarchy. In this hierarchy each node is uniquely
+identified by its hierarchy level \(l\), which it occupies, and offset \(o\),
+which equals to the sequential number of node on its level. Values of level and
+offset are computed from the following optimisation problem.
+\begin{align*}
+    n = \sum\limits_{i=0}^{l(n)} p^i + o(n), \quad
+    l \rightarrow \min, \quad
+    o \rightarrow \min, \quad
+    l \geq 0, \quad
+    o \geq 0
+\end{align*}
+where \(n\) is the position of node's IP address in network IP address range and
+\(p\) is fan-out value (the maximal number of subordinates, a node can have). The
+principal of a node with level \(l\) and offset \(o\) has level \(l-1\) and offset
+\(\lfloor{o/p}\rfloor\). The distance between any two nodes in the tree with
+network positions \(i\) and \(j\) is computed as
+\begin{align*}
+    & \langle
+        \text{lsub}(l(j), l(i)), \quad
+        \left| o(j) - o(i)/p \right|
+    \rangle,\\
+    & \text{lsub}(l_1, l_2) =
+    \begin{cases}
+        \infty & \quad \text{if } l_1 \geq l_2, \\
+        l_1 - l_2 & \quad \text{if } l_1 < l_2.
+    \end{cases}
+\end{align*}
+The distance is compound to account for level in the first place.
+
+To determine its principal each node ranks all nodes in the network according to
+their position \(\langle{l(n),o(n)}\rangle\), and using distance formula chooses
+the node which is closest to potential principal position and has lower rank.
+That way IP addresses of offline nodes are skipped, however, for sparse networks
+(in which nodes occupy non-contiguous IP addresses) perfect tree is not
+guaranteed.
+
+In order to determine its principal a node is required to communicate with a node
+the address of which it knows beforehand, so discovery algorithm scales to a large
+number of nodes. Communication with other nodes in ranked list occurs only when
+the current principal node fails. So, if cluster nodes occupy
+contiguous addresses in network IP address range, each node connects to its
+principal only, and inefficient scan of all network by each node does not occur.
+
+**** Evaluation results.
+Test platform consisted of several multi-core nodes, on top of which virtual
+clusters with varying number of nodes were deployed using Linux network
+namespaces. Similar approach is used
+in\nbsp{}cite:lantz2010network,handigol2012reproducible,heller2013reproducible
+where the authors reproduce various real-world experiments using virtual
+clusters and compare results to physical ones. The advantage of it is that the
+tests can be performed on a large virtual cluster using relatively small number
+of physical nodes. This approach was used to evaluate node discovery algorithm,
+because the algorithm has low requirement for system resources (processor time
+and network throughput).
+
+Performance of the algorithm was evaluated by measuring time needed for all nodes
+of the cluster to discover each other. Each change of the hierarchy (as seen by
+each node) was written to a file and after 30 seconds all the processes (each of
+which models cluster node) were forcibly terminated. Test runs showed that
+running more than 100 virtual nodes on one physical node simultaneously warps the
+results, thus additional physical nodes, each of which ran 100 virtual nodes,
+were used for the experiment. The experiment showed that it takes 100--400
+nodes 1.5 seconds on average to discover each other, and the value increases only
+slightly with increase in the number of nodes (see
+fig.\nbsp{}[[fig-bootstrap-local]]). An example of tree hierarchy for 11 nodes with
+fan-out 2 is shown in fig.\nbsp{}[[fig-tree-hierarchy-11]].
+
+#+name: fig-bootstrap-local
+#+caption: Time to discover all nodes of the cluster depending on the number of nodes.
+[[file:graphics/discovery.eps]]
+
+#+name: fig-tree-hierarchy-11
+#+begin_src dot :exports results :file build/tree-hierarchy-11.pdf
+digraph {
+
+  node [fontname="Old Standard",fontsize=14,margin="0.055,0",shape=box,style=rounded]
+  graph [nodesep="0.15",ranksep="0.20",rankdir="BT"]
+  edge [arrowsize=0.66]
+
+  m1 [label="127.0.0.1"]
+  m2 [label="127.0.0.2"]
+  m3 [label="127.0.0.3"]
+  m4 [label="127.0.0.4"]
+  m5 [label="127.0.0.5"]
+  m6 [label="127.0.0.6"]
+  m7 [label="127.0.0.7"]
+  m8 [label="127.0.0.8"]
+  m9 [label="127.0.0.9"]
+  m10 [label="127.0.0.10"]
+  m11 [label="127.0.0.11"]
+
+  m2->m1
+  m3->m1
+  m4->m2
+  m5->m2
+  m6->m3
+  m7->m3
+  m8->m4
+  m9->m4
+  m10->m5
+  m11->m5
+}
+#+end_src
+
+#+caption: Tree hierarchy for 11 nodes with fan-out equal to 2.
+#+label: fig-tree-hierarchy-11
+#+RESULTS: fig-tree-hierarchy-11
+[[file:build/tree-hierarchy-11.pdf]]
+
+*** Fail over algorithm
+**** Checkpoints.
+Node failures in a distributed system are divided into two types: failure of a
+subordinate node and failure of a principal node. In order for a job running on
+the cluster to survive subordinate node failure, job scheduler periodically
+creates checkpoints and writes them to a stable storage. In order to create the
+checkpoint, the scheduler temporarily suspends all parallel processes of the
+job, copies all memory pages and all internal operating system kernel structures
+allocated for these processes to disk, and resumes execution of the job. In
+order to survive principal node failure, job scheduler server process continuously
+copies its internal state to a backup node, which becomes the principal after
+the failure.
+
+There are many works dedicated to improving performance of
+checkpoints\nbsp{}cite:egwutuoha2013survey, and alternative approaches do not
+receive much attention. Usually HPC applications use message passing for
+parallel processes communication and store their state in global memory space,
+hence there is no way one can restart a failed process from its current state
+without writing the whole memory image to disk. Usually the total number of
+processes is fixed by the job scheduler, and all parallel processes restart upon
+a failure. There is ongoing effort to make it possible to restart only the
+failed process\nbsp{}cite:meyer2012radic by restoring it from a checkpoint on
+the surviving nodes, but this may lead to overload if there are other processes
+on these nodes. Theoretically, process restart is not needed, if the job can
+proceed on the surviving nodes, however, message passing library does not allow
+to change the number of processes at runtime, and most programmes assume this
+number to be constant. So, there is no reliable way to provide fault tolerance
+in message passing library other than restarting all parallel processes from a
+checkpoint.
+
+There is, however, a possibility to continue execution of a job on a smaller number
+of nodes than was initially requested by implementing fault tolerance on
+application level. In this case principal and subordinate roles are dynamically
+distributed between job scheduler daemons running on each cluster node, forming
+a tree hierarchy of cluster nodes, and parallel programme consists of kernels
+which use node hierarchy to dynamically distribute the load and use their own
+hierarchy to restart kernels upon node failure.
+
+**** Dynamic role distribution.
+Fault tolerance of a parallel programme is one of the problems which should be
+solved by big data and HPC job schedulers, however, most schedulers provide
+fault tolerance for subordinate nodes only. These types of failures are
+routinely handled by restarting the affected job (from a checkpoint) or its part
+on the remaining nodes, and failure of a principal node is often considered
+either improbable, or too complicated to handle and configure on the target
+platform. System administrators often find alternatives to application level
+fault tolerance: they isolate principal process of the scheduler from the rest
+of the cluster nodes by placing it on a dedicated machine, or use virtualisation
+technologies instead. All these alternatives complexify configuration and
+maintenance, and by decreasing probability of a machine failure resulting in a
+whole system failure, they increase probability of a human error.
+
+From such point of view it seems more practical to implement principal node
+fault tolerance at application level, but there is no proven generic solution.
+Most implementations are too tied to a particular application to become
+universally applicable. The author believes that this happens due to people's
+habit to think of a cluster as a collection of individual machines each of which
+can be either principal or subordinate, rather than to think of a cluster as a
+whole with principal and subordinate roles being dynamically distributed between
+processes running on different nodes.
+
+Realisation of the fact that a cluster is also a computer allows to implement
+middleware that distributes principal and subordinate roles automatically and
+handles node failures in a generic way. This software provides an API to
+distribute kernels between currently available nodes. Using this API one can
+write a programme that runs on a cluster without knowing the exact number of
+working nodes. The middleware works as a cluster operating system in user space,
+allowing to write and execute distributed applications transparently.
+
+**** Symmetric architecture.
+Many distributed key-value stores and parallel file systems have symmetric
+architecture, in which principal and subordinate roles are dynamically
+distributed, so that any node can act as a principal when the current principal
+node
+fails\nbsp{}cite:ostrovsky2015couchbase,divya2013elasticsearch,boyer2012glusterfs,anderson2010couchdb,lakshman2010cassandra.
+However, this architecture is still not used in big data and HPC job schedulers.
+For example, in YARN big data job scheduler\nbsp{}cite:vavilapalli2013yarn
+principal and subordinate roles are static. Failure of a subordinate node is
+tolerated by restarting a part of a job, that worked on it, on one of the
+surviving nodes, and failure of a principal node is tolerated by setting up
+standby principal node\nbsp{}cite:murthy2011architecture. Both principal nodes
+are coordinated by Zookeeper service which uses dynamic role assignment to
+ensure its own fault-tolerance\nbsp{}cite:okorafor2012zookeeper. So, the lack of
+dynamic role distribution in YARN scheduler complicates the whole cluster
+configuration: if dynamic roles were available, Zookeeper would be redundant in
+this configuration.
+
+The same problem occurs in HPC job schedulers where principal node (where the
+main job scheduler process is run) is the single point of failure.
+In\nbsp{}cite:uhlemann2006joshua,engelmann2006symmetric the authors replicate
+job scheduler state to a backup node to make the principal node highly
+available, but backup node role is assigned statically. This solution is close
+to symmetric architecture, because it does not involve external service to
+provide high availability, but far from ideal where backup node is dynamically
+chosen.
+
+Finally, the simplest principal node high availability is implemented in VRRP
+protocol (Virtual Router Redundancy
+Protocol)\nbsp{}cite:knight1998rfc2338,hinden2004virtual,nadas2010rfc5798.
+Although VRRP protocol does provide dynamic role distribution, it is designed
+to be used by routers and reverse proxy servers behind them. Such servers lack
+the state (a job queue) that needs to be restored upon node failure, so it is
+easier for them to provide high availability. It can be implemented even without
+routers using Keepalived daemon\nbsp{}cite:cassen2002keepalived instead.
+
+Symmetric architecture is beneficial for job schedulers because it
+allows to
+- make physical nodes interchangeable,
+- implement dynamic distribution of principal and subordinate roles, and
+- implement automatic recovery after failure of any node.
+The following sections will describe the components that are required to write
+parallel programme and job scheduler, that can tolerate failure of cluster
+nodes.
+
+**** Hierarchy of control flow objects
+For load balancing purposes cluster nodes are combined into tree hierarchy (see
+section [[#sec:node-discovery]]), and the load is distributed between direct
+neighbours: when one runs the kernel on the subordinate node, the principal node
+also receives some of its subordinate kernels. This makes the system symmetrical
+and easy to maintain: each node has the same set of software that allows
+to replace one node with another in case of failure of the former. Similar
+architectural solution is used in key-value stores\nbsp{}cite:anderson2010couchdb,lakshman2010cassandra to provide fault tolerance, but
+the author does not know any task schedulers that use this approach.
+
+Unlike ~main~ function in programmes based on message passing library, the first
+(the main) kernel is initially run only on one node, and remote nodes are used
+on-demand. This design choice allows to have arbitrary number of nodes throughout
+execution of a programme, and use more nodes for highly parallel parts of the
+code. Similar choice is made in the design of big data
+frameworks\nbsp{}cite:dean2008mapreduce,vavilapalli2013yarn \nbsp{}--- a user
+submitting a job does not specify the number of hosts to run its job on, and
+actual hosts are the hosts where input files are located.
+
+From mathematical point of view kernel \(K\) can be described as a vector-valued
+functional which recursively maps a kernel to \(n\)-component vector of kernels:
+\begin{equation*}
+    K(f): \mathbb{K} \rightarrow \mathbb{K}^n
+    \qquad
+    \mathbb{K}^n = \left\{ f: \mathbb{K} \rightarrow \mathbb{K}^n \right\}.
+\end{equation*}
+Special kernel \(\mathbb{O}: \mathbb{K} \rightarrow \mathbb{K}^0\) is used to stop
+the recursion and is passed as an argument to the main kernel. An argument to a
+kernel is interpreted as follows.
+- If a kernel is a newly created kernel, then its argument is its parent kernel.
+- In other cases the argument is an arbitrary kernel (often a child of the
+  current kernel).
+
+Kernels are processed in a loop which starts with executing the main kernel,
+then inside the main kernel other kernels are created and executed
+asynchronously. The loop continues until some kernel returns \(\mathbb{O}\).
+Since kernel may return multiple kernels they are executed in parallel, which
+quickly fills kernel pool. Since kernels from the pool may be executed in
+unspecified order, several concurrent threads retrieve kernels from the pool and
+may send the remaining kernels to neighbouring cluster nodes if the pool
+overflows.
+
+Kernels are implemented as closures (functors in C++)\nbsp{}--- function objects
+containing all their arguments, a reference to parent kernel and application
+domain data. The data is either processed upon kernel call, or subordinate
+kernels are created to process it in parallel. When the processing is complete a
+parent kernel closure with its subordinate kernel as an argument is called to
+collect the resulting data from it.
+
+**** Handling nodes failures.
+Basic strategy to overcome a failure of a subordinate node is to restart
+corresponding kernels on a healthy node\nbsp{}--- a strategy employed by Erlang
+language to restart failed subordinate processes\nbsp{}cite:armstrong2003thesis.
+In order to implement this method in the framework of kernel hierarchy, sender
+node saves every kernel that is sent to remote cluster nodes, and in an event of
+a failure of any number of nodes, where kernels were sent, their copies are
+redistributed between the remaining nodes without custom handling by a
+programmer. If there are no nodes to send kernels to, they are executed locally.
+So, in contrast to "heavy-weight" checkpoint/restart machinery employed by HPC
+cluster job schedulers, tree hierarchy of nodes coupled with hierarchy of
+kernels allow to automatically and transparently handle any number of
+subordinate node failures without restarting any processes of a parallel
+programme.
+
+A possible way of handling failure of the main node (a node where the main
+kernel is executed) is to replicate the main kernel to a backup node, and make
+all updates to its state propagate to the backup node by means of a distributed
+transaction, but this approach does not correlate with asynchronous nature of
+kernels and is too complex to implement. In practice, however, the main kernel
+usually does not perform operations in parallel, it rather sequentially
+executes steps one by one, so it has only one subordinate at a time. (Each
+subordinate kernel represents sequential computational step which may or may not
+be internally parallel.) Keeping this in mind, one can simplify synchronisation
+of the main kernel state: send the main kernel along with its subordinate to the
+subordinate node. Then if the main node fails, the copy of the main kernel
+receives its subordinate (because both of them are on the same node) and no time
+is spent on recovery. When the subordinate node, to which subordinate kernel
+together with the copy of the main kernel was sent, fails, the subordinate
+kernel is sent to some other node, and in the worst case the current
+computational step is executed again.
+
+The approach described above is designed for kernels that do not have a parent
+and have only one subordinate at a time, which means that it functions as
+checkpoint mechanism. The advantage of this approach is that it
+- saves results after each sequential step, when memory footprint of a programme
+  is low,
+- saves only relevant data, and
+- uses memory of a subordinate node rather than disk storage.
+This simple approach allows to tolerate at most one failure of /any/ cluster node
+per computational step or failure of an arbitrary number of subordinate nodes at
+any time during programme execution.
+
+An example of fail over algorithm follows (fig.\nbsp{}[[fig-fail-over-example]]).
+1. Initial state. Initially, computer cluster does not need to be configured
+   except setting up local network. The algorithm assumes full connectivity of
+   cluster nodes, and works best with tree topologies in which several network
+   switches connect all cluster nodes.
+2. Build node hierarchy. When the cluster is bootstrapped, daemon processes
+   start on all cluster nodes and collectively build hierarchy of such processes
+   superimposed on the topology of cluster network. Position of a daemon process
+   in the hierarchy is defined by the position of its node IP address in the
+   network IP address range. To establish hierarchical link each process
+   connects to its assumed principal process. The hierarchy is changed only when
+   a new node joins the cluster or a node fails.
+3. Launch main kernel. The first kernel launches on one of the subordinate nodes
+   (node \(B\)). Main kernel may have only one subordinate at a time, and backup
+   copy of the main kernel is sent along with the subordinate kernel \(T_1\) to
+   the principal node \(A\). \(T_1\) represents one sequential step of a
+   programme. There can be any number of sequential steps in a programme, and
+   when node \(A\) fails, the current step is restarted from the beginning.
+4. Launch subordinate kernels. Kernels \(S_1\), \(S_2\), \(S_3\) are launched on
+   subordinate cluster nodes. When node \(B\), \(C\) or \(D\) fails,
+   corresponding main kernel restarts failed subordinates (\(T_1\) restarts
+   \(S_1\), master kernel restarts \(T_1\) etc.). When node \(B\) fails, master
+   kernel is recovered from backup.
+
+#+name: fig-fail-over-example
+#+header: :headers '("\\input{preamble}")
+#+begin_src latex :file build/fail-over-example.pdf :exports results :results raw
+\input{tex/preamble}
+\newcommand*{\spbuInsertFigure}[1]{%
+\vspace{2\baselineskip}%
+\begin{minipage}{0.5\textwidth}%
+    \Large%
+    \input{#1}%
+\end{minipage}%
+}%
+\noindent%
+\spbuInsertFigure{tex/cluster-0}~\spbuInsertFigure{tex/frame-0}\newline
+\spbuInsertFigure{tex/frame-3}~\spbuInsertFigure{tex/frame-4}\newline
+\spbuInsertFigure{tex/legend}
+#+end_src
+
+#+caption: An example of fail over algorithm in action.
+#+label: fig-fail-over-example
+#+attr_latex: :width \textwidth
+#+RESULTS: fig-fail-over-example
+[[file:build/fail-over-example.pdf]]
+
+**** Evaluation results.
+Factory framework is evaluated on physical cluster (table\nbsp{}[[tab-cluster]]) on the
+example of HPC application, that generates ocean wavy surface, which is
+described in detail in section [[#sec:arma-algorithms]]. The application consists of
+a series of filters, each of which is applied to the result of the previous one.
+Some of the filters are computed in parallel, so the programme is written as a
+sequence of steps, some of which are made internally parallel to get better
+performance. In the programme only the most compute-intensive step (the surface
+generation) is executed in parallel across all cluster nodes, and other steps
+are executed in parallel across all cores of the principal node.
+
+#+name: tab-cluster
+#+caption: Test platform configuration.
+#+attr_latex: :booktabs t
+| CPU                       | Intel Xeon E5440, 2.83GHz |
+| RAM                       | 4Gb                       |
+| HDD                       | ST3250310NS, 7200rpm      |
+| No. of nodes              | 12                        |
+| No. of CPU cores per node | 8                         |
+
+The application was rewritten for the fault-tolerant version of the framework
+which required only slight modifications to handle failure of a node with the
+main kernel. The kernel was marked so that the framework makes a replica and
+sends it to some subordinate node along with its subordinate kernel. Other code
+changes involved modifying some parts to match the new API. So, providing fault
+tolerance by means of kernel hierarchy is mostly transparent to the programmer
+which only demands explicit marking of replicated kernels.
+
+In a series of experiments performance of the new version of the application in
+the presence of different types of failures was benchmarked (numbers correspond
+to the graphs in fig.\nbsp{}[[fig-benchmark]]):
+1) no failures,
+2) failure of a subordinate node (a node where a part of wavy surface is
+   generated),
+3) failure of a principal node (a node where the main kernel is run),
+4) failure of a backup node (a node where a copy of the main kernel is stored).
+A tree hierarchy with fan-out value of 64 was chosen to make all subordinate
+cluster nodes connect directly to the one having the first IP-address in the
+network IP address range. A victim node was made offline after a fixed amount of
+time after the programme start which is equivalent approximately to \(1/3\) of
+the total run time without failures on a single node. The application
+immediately recognised node as offline, because the corresponding connection was
+closed; in real-world scenario, however, the failure is detected after a
+configurable time-out. All relevant parameters are summarised in table\nbsp{}[[tab-benchmark]]. The results of these runs were compared to the run without node
+failures (fig.\nbsp{}[[fig-benchmark]] and\nbsp{}[[fig-slowdown]]).
+
+There is considerable difference in overall application performance for
+different types of failures. Graphs\nbsp{}2 and\nbsp{}3 in
+fig.\nbsp{}[[fig-benchmark]] show that performance in case of principal and
+subordinate node failure is the same. In case of principal node failure a backup
+node stores a copy of the main kernel and uses this copy when it detects failure
+of the principal node. In case of subordinate node failure, the principal node
+redistributes the non-returning kernels between remaining subordinate nodes. In
+both cases the state of the main kernel is not lost and no time is spent to
+restore it, which explains similar performance.
+
+Graph\nbsp{}4 in fig.\nbsp{}[[fig-benchmark]] shows that performance in case of a
+backup node failure is much lower than in other cases. It happens because
+principal node stores only the state of the current step of the computation plus
+some additional fixed amount of data, whereas a backup node not only stores the
+copy of this data, but executes the step in parallel with other subordinate
+nodes. So, when a backup node fails, the principal node executes the whole step
+once again on an arbitrarily chosen surviving node.
+
+#+name: tab-benchmark
+#+caption: Benchmark parameters for experiments with fail over algorithm.
+#+attr_latex: :booktabs t
+| Experiment no. | Time to offline, s |
+|              1 |                    |
+|              2 |                 10 |
+|              3 |                 10 |
+|              4 |                 10 |
+
+To measure how much time is lost due to a node failure the total execution time
+with a failure was divided by the total execution time without the failure but
+with the number of nodes minus one. This relation is obtained from the same
+benchmark and presented in fig.\nbsp{}[[fig-slowdown]]. The difference in
+performance in case of principal and subordinate node failures lies within 5%
+margin, and in case of backup node failure within 50% margin for the number of
+nodes less than 6[fn::Measuring this margin for higher number of nodes does not
+make sense since time before failure is greater than total execution time with
+these numbers of nodes, and programme's execution finishes before a failure
+occurs.]. Increase in execution time of 50% is more than \(1/3\) of execution
+time after which a failure occurs, but backup node failure needs some time to be
+discovered: it is detected only when subordinate kernel carrying the copy of the
+main kernel finishes its execution and tries to reach its parent. Instant
+detection requires abrupt stopping of the subordinate kernel which may be
+inapplicable for programmes with complicated logic.
+
+#+name: fig-benchmark
+#+begin_src R :file build/benchmark-xxx.pdf
+# TODO
+#+end_src
+
+#+caption: Performance of hydrodynamics HPC application in the presence of node failures.
+#+label: fig-benchmark
+#+RESULTS: fig-benchmark
+
+The results of the benchmark allow to conclude that no matter whether a principal
+or a subordinate node fails, the overall performance of a parallel programme roughly
+equals to the one without failures with the number of nodes minus one, however,
+when a backup node fails performance penalty is much higher.
+
+#+name: fig-slowdown
+#+begin_src R :file build/slowdown-xxx.pdf
+# TODO
+#+end_src
+
+#+caption: Slowdown of the hydrodynamics HPC application in the presence of different types of node failures compared to execution without failures but with the number of nodes minus one.
+#+label: fig-slowdown
+#+RESULTS: fig-slowdown
+
+**** Discussion of test results.
+Fail over algorithm guarantees to handle one failure per sequential programme
+step, more failures can be tolerated if they do not affect the principal node.
+The algorithm handles simultaneous failure of all subordinate nodes, however, if
+both principal and backup nodes fail, there is no chance for a programme to
+continue the work. In this case the state of the current computation step is
+lost, and the only way to restore it is to restart the application from the
+beginning.
+
+Kernels are means of abstraction that decouple distributed application from
+physical hardware: it does not matter how many cluster nodes are currently
+available for a programme to run without interruption. Kernels eliminate the
+need to allocate a physical backup node to tolerate principal node failures: in
+the framework of kernel hierarchy any physical node (except the principal one)
+can act as a backup one. Finally, kernels allow to handle failures in a way that
+is transparent to a programmer, deriving the order of actions from the internal
+state of a kernel.
+
+The experiments show that it is essential for a parallel programme to have
+multiple sequential steps to make it resilient to cluster node failures,
+otherwise failure of a backup node in fact triggers recovery of the initial
+state of the programme. Although, the probability of a principal node failure is
+lower than the probability of a failure of any of the subordinate nodes, it does
+not justify losing all the data when the long programme run is near completion.
+In general, the more sequential steps one has in a parallel programme the less
+time is lost in an event of a backup node failure, and the more parallel parts
+each sequential step has the less time is lost in case of a principal or
+subordinate node failure. In other words, the more scalable a programme is the
+more resilient to cluster node failures it becomes.
+
+Although it is not shown in the experiments, Factory does not only provide
+tolerance to cluster node failures, but allows for new nodes to automatically
+join the cluster and receive their portion of kernels from the already running
+programmes. This is trivial process as it does not involve restarting failed
+kernels or copying their state, so it has not been studied experimentally in
+this work.
+
+Theoretically, fault tolerance based on a hierarchy of nodes and kernels can be
+implemented on top of the message-passing library without loss of generality.
+Although it would be complicated to reuse free nodes instead of failed ones in
+the framework of this library, as the number of nodes is often fixed in such
+libraries, allocating reasonably large number of nodes for the programme would
+be enough to make it fault-tolerant. At the same time, implementing
+hierarchy-based fault tolerance inside message-passing library itself is not
+practical, because it would require saving the state of a parallel programme
+which equals to the total amount of memory it occupies on each cluster node,
+which in turn would not make it more efficient than checkpoints.
+
+The weak point of the proposed algorithm is the period of time starting from a
+failure of principal node up to the moment when the failure is detected, the
+main kernel is restored and new subordinate kernel with the parent's copy is
+received by a subordinate node. If at any time during this period backup node
+fails, execution state of a programme is completely lost, and there is no way to
+recover it other than restarting the programme from the beginning. The duration
+of the dangerous period can be minimised, but the probability of an abrupt
+programme termination can not be fully eliminated. This result is consistent
+with the scrutiny of /impossibility theory/, in the framework of which the
+impossibility of the distributed consensus with one faulty
+process\nbsp{}cite:fischer1985impossibility and the impossibility of reliable
+communication in the presence of node
+failures\nbsp{}cite:fekete1993impossibility are proved.
+** Comparison of the proposed approach to the current approaches
+Current state-of-the-art approach to developing and running parallel programmes
+on the cluster is the use of MPI message passing library and job scheduler, and
+despite the fact that this approach is highly efficient in terms of parallel
+computing, it is not flexible enough to accommodate dynamic load balancing and
+automatic fault-tolerance. Programmes written with MPI typically assume
+- equal load on each processor,
+- non-interruptible and reliable execution of batch jobs, and
+- constant number of parallel processes/threads throughout the execution which
+  is equal to the total number of processors.
+The first assumption does not hold for ocean wave simulation programme because
+AR model requires dynamic load balancing between processors to generate each
+part of the surface only when all dependent parts have already been generated.
+The last assumption also does not hold, because for the sake of efficiency each
+part is written to a file asynchronously by a separate thread. The remaining
+assumption is not related to the programme itself, but to the job scheduler, and
+does not generally hold for very large computer clusters in which node failures
+occur regularly, and job scheduler slowly restores the failed job from the
+checkpoint severely hindering its performance. So, the idea of the proposed
+approach is to give parallel programmes more flexibility:
+- provide dynamic load balancing via pipelined execution of sequential,
+  internally parallel programme steps,
+- restart only processes that were affected by node failure, and
+- execute the programme on as many compute nodes as are available in the
+  cluster.
+In this section advantages and disadvantages of this approach are discussed.
+
+In comparison to portable batch systems (PBS) the proposed approach uses
+lightweight control flow objects instead of heavy-weight parallel jobs to
+distribute the load on cluster nodes. First, this allows to have node object
+queues instead of several cluster-wide job queues. The granularity of control
+flow objects is much higher than the batch jobs, and despite the fact that their
+execution time cannot be reliably predicted (as is execution time of batch
+jobs), objects from multiple parallel programmes can be dynamically distributed
+between the same set of cluster nodes, thus making the load more even. The
+disadvantage is that this requires more RAM to execute many programmes on the
+same set of nodes, and execution of each programme may be longer because of the
+shared control flow object queues. Second, the proposed approach uses dynamic
+distribution of principal and subordinate roles between cluster nodes instead of
+their static assignment to the particular physical nodes. This makes nodes
+interchangeable, which is required to provide fault tolerance. So, simultaneous
+execution of multiple parallel programmes on the same set of nodes may increase
+throughput of the cluster, but may also decrease their performance taken
+separately, and dynamic role distribution is the base on which resilience to
+failures builds.
+
+In comparison to MPI the proposed approach uses lightweight control flow objects
+instead of heavy-weight processes to decompose the programme into individual
+entities. First, this allows to determine the number of entities computed in
+parallel by the problem being solved, not the computer or cluster architecture.
+A programmer is encouraged to create as many objects as needed, guided by the
+algorithm or restrictions on the size of data structures from the problem
+domain. In ocean wave simulation programme the minimal size of each wavy surface
+part depends on the number of coefficients along each dimension, and at the same
+time the number of parts should be larger than the number of processors to make
+the load on each processor more even. Considering these limits the optimal part
+size is determined at runtime, and, in general, is not equal to the number of
+parallel processes. The disadvantage is that the more control flow objects there
+are in the programme, the more shared data structures are copied to the same
+node with subordinate objects; this problem is solved by introducing another
+intermediate layer of objects, which in turn adds more complexity to the
+programme. Second, hierarchy of control flow objects together with hierarchy of
+nodes allows for automatic recomputation of failed objects on surviving nodes in
+an event of hardware failures. It is possible because the state of the programme
+execution is stored in each object and not in global variables like in MPI
+programmes. By duplicating the state to subordinate nodes, the system
+recomputes only objects from affected processes instead of the whole programme.
+So, transition from processes to control flow objects may increase performance
+of a parallel programme via dynamic load balancing, but inhibit its scalability
+for a large number of nodes due to duplication of execution state.
+
+It may seem as if three building blocks of the proposed approach\nbsp{}---
+control flow objects, pipelines and hierarchies\nbsp{}--- are orthogonal, but,
+in fact, they complement each other. Without control flow objects carrying
+programme state it is impossible to recompute failed subordinate objects and
+provide fault tolerance. Without node hierarchy it is impossible to distribute
+the load between cluster nodes, because all nodes are equal without the
+hierarchy. Without pipelines for each device it is impossible to execute control
+flow objects asynchronously and implement dynamic load balancing. These three
+entities form a closed system with nothing to add and nothing to
+remove\nbsp{}--- a solid foundation for any distributed programme.
+
+To summarise, one can say that the control flow objects make parallel programmes
+more flexible: they balance the decrease in the performance due to shared object
+queues with the increase due to dynamic load balancing. Requiring more RAM, they
+allow to simultaneously run multiple parallel programmes on all cluster nodes
+without idling in the job queue, and transform the cluster into a unified
+computer system which makes best effort to execute distributed applications
+without interruption.
+
+* Conclusion
+**** Research results.
+In the study of mathematical apparatus for ocean wave simulations which goes
+beyond linear wave theory the following main results were achieved.
+- ARMA model was applied to simulation of ocean waves of arbitrary amplitudes.
+  Integral characteristics of generated wavy surface were verified by comparing
+  to the ones of a real ocean surface.
+- Analytic formula for determining wave pressures was applied to compute
+  velocity potentials under generated surface. The resulting velocity potential
+  field was verified by comparing it to the one given by formulae from linear
+  wave theory for small-amplitude waves. For large amplitude waves the new
+  formula gives a reasonably different field. Analytic formula is computationally
+  efficient because all the integrals are written as Fourier transforms, for
+  which there are high-performance implementations.
+
+**** Further research directions.
+One of the topics of future research is studying generation of waves of arbitrary
+profiles on the basis of mixed ARMA process. Another direction is integration of
+the developed model and pressure determination formula into existing application
+software packages.
+
+* Summary
+Research results allow to conclude that a problem of determining pressures under
+sea surface can be solved analytically without assumptions of linear and
+small-amplitude wave theories. This solution coupled with ARMA ocean wave
+simulation model, capable of generating waves of arbitrary amplitudes, can be
+used to determine the impact of wave oscillations on the dynamic marine object
+in a sea way, and give more precise results than analogous solution for
+small-amplitude waves.
+
+Results of the numerical experiments allow to conclude that wavy surface
+generation as well as pressure computation can be efficiently implemented via
+fast Fourier transforms, and long simulation session can be conducted.
+
+The developed mathematical apparatus and its numerical implementation can become
+a base of virtual testbed for marine objects dynamics studies.
+
+* Acknowledgements
+The graphs in this work were prepared using R language for statistical
+computing\nbsp{}cite:rlang2016,Sarkar2008lattice and Graphviz
+software\nbsp{}cite:Gansner00anopen. The manuscript was prepared using
+Org-mode\nbsp{}cite:Schulte2011org2,Schulte2011org1,Dominik2010org for GNU Emacs
+which provides computing environment for reproducible research. This means that
+all graphs can be reproduced and corresponding statements verified by cloning
+thesis repository[fn:repo], installing Emacs and exporting the document.
+
+The research was carried out using computational resources of Resource Centre
+"Computational Centre of Saint Petersburg State University" (\mbox{T-EDGE96}
+\mbox{HPC-0011828-001}) within frameworks of grants of Russian Foundation for
+Basic Research (projects no.\nbsp{}\mbox{16-07-01111}, \mbox{16-07-00886},
+\mbox{16-07-01113}).
+
+[fn:repo] [[https://github.com/igankevich/arma-thesis]]
+
+* List of acronyms and symbols
+- <<<MPP>>> :: Massively Parallel Processing, computers with distributed memory.
+- <<<SMP>>> :: Symmetric Multi-Processing, computers with shared memory.
+- <<<ACF>>> :: auto-covariance function.
+- <<<FFT>>> :: fast Fourier transform.
+- <<<PRNG>>> :: pseudo-random number generator.
+- <<<BC>>> :: boundary condition.
+- <<<PDE>>> :: partial differential equation.
+- <<<NIT>>> :: non-linear inertia-less transform.
+- <<<AR>>> :: auto-regressive process.
+- <<<ARMA>>> :: auto-regressive moving-average process.
+- <<<MA>>> :: moving average process.
+- <<<LH>>> :: Longuet---Higgins model.
+- <<<LAMP>>> :: Large Amplitude Motion Programme, a programme that simulates
+                ship behaviour in ocean waves.
+- <<<CLT>>> :: central limit theorem.
+- <<<PM>>> :: Pierson---Moskowitz ocean wave spectrum approximation.
+- <<<YW>>> :: Yule---Walker equations.
+- <<<LS>>> :: least squares.
+- <<<PDF>>> :: probability density function.
+- <<<CDF>>> :: cumulative distribution function.
+- <<<BSP>>> :: Bulk Synchronous Parallel.
+- <<<OpenCL>>> :: Open Computing Language.
+- <<<OpenMP>>> :: Open Multi-Processing.
+- <<<MPI>>> :: Message Passing Interface.
+- <<<POSIX>>> :: Portable Operating System Interface.
+- <<<FMA>>> :: Fused multiply-add.
+- <<<DCMT>>> :: Dynamic creation of Mersenne Twisters.
+- <<<GSL>>> :: GNU Scientific Library.
+- <<<BLAS>>> :: Basic Linear Algebra Sub-programmes.
+- <<<LAPACK>>> :: Linear Algebra Package.
+- <<<DNS>>> :: Domain Name System.
+- <<<HPC>>> :: High-performance computing.
+- Master/slave node ::
+- Principal/subordinate kernel ::
+
+#+begin_export latex
+\input{postamble}
+#+end_export
+
+bibliographystyle:ugost2008
+bibliography:bib/refs.bib
+
+* Appendix
+** Longuet---Higgins model formula derivation
+:PROPERTIES:
+:CUSTOM_ID: longuet-higgins-derivation
+:END:
+
+In the framework of linear wave theory two-dimensional system of
+equations\nbsp{}eqref:eq-problem is written as
+\begin{align*}
+    & \phi_{xx} + \phi_{zz} = 0,\\
+    & \zeta(x,t) = -\frac{1}{g} \phi_t, & \text{on }z=\zeta(x,t),
+\end{align*}
+where \(\frac{p}{\rho}\) includes \(\phi_t\). The solution to the Laplace
+equation is sought in a form of Fourier series cite:kochin1966theoretical:
+\begin{equation*}
+    \phi(x,z,t) = \int\limits_{0}^{\infty} e^{k z}
+    \left[ A(k, t) \cos(k x) + B(k, t) \sin(k x) \right] dk.
+\end{equation*}
+Plugging it in the boundary condition yields
+\begin{align*}
+    \zeta(x,t) &= -\frac{1}{g} \int\limits_{0}^{\infty}
+    \left[ A_t(k, t) \cos(k x) + B_t(k, t) \sin(k x) \right] dk \\
+    &= -\frac{1}{g} \int\limits_{0}^{\infty} C_t(k, t) \cos(kx + \epsilon(k, t)).
+\end{align*}
+Here \(\epsilon\) is white noise and \(C_t\) includes \(dk\). Substituting
+integral with infinite sum yields two-dimensional form of
+eq.\nbsp{}[[eq-longuet-higgins]].
diff --git a/phd-diss-ru.org b/phd-diss-ru.org
@@ -1,3434 +0,0 @@
-# Local Variables:
-# org-ref-default-bibliography ("bib/refs.bib")
-# org-latex-image-default-width nil
-# org-latex-caption-above nil
-# org-latex-hyperref-template "\\hypersetup{\n pdfauthor={%a},\n pdftitle={%t},\n pdfkeywords={%k},\n pdfsubject={%d},\n pdfcreator={%c},\n pdflang={%L},\n unicode={true}\n}\n\\setdefaultlanguage{%l}\n"
-# org-export-latex-tables-hline "\\midrule"
-# org-export-latex-tables-tstart "\\toprule"
-# org-export-latex-tables-tend "\\bottomrule"
-# eval: (add-to-list 'org-latex-classes '("gost" "\\documentclass{gost} [DEFAULT-PACKAGES] [PACKAGES] [EXTRA]" ("\\section{%s}" . "\\section*{%s}") ("\\subsection{%s}" . "\\subsection*{%s}") ("\\subsubsection{%s}" . "\\subsubsection*{%s}") ("\\paragraph{%s}" . "\\paragraph*{%s}") ("\\subparagraph{%s}" . "\\subparagraph*{%s}")))
-# End:
-
-#+TITLE: Высокопроизводительная модель морского волнения для программ динамики морских объектов
-#+AUTHOR: Иван Ганкевич
-#+DATE: Санкт-Петербург, 2017
-#+LANGUAGE: ru
-#+LATEX_CLASS: gost
-#+LATEX_CLASS_OPTIONS: [hidelinks,fontsize=14pt,paper=a4,pagesize,DIV=calc,noenddot]
-#+LATEX_HEADER_EXTRA: \input{preamble}
-#+LATEX_HEADER_EXTRA: \organization{Санкт-Петербургский государственный университет}
-#+LATEX_HEADER_EXTRA: \manuscript{на правах рукописи}
-#+LATEX_HEADER_EXTRA: \degree{Диссертация на соискание ученой степени\\кандидата физико-математических наук}
-#+LATEX_HEADER_EXTRA: \speciality{Специальность 05.13.18\\Математическое моделирование, численные методы и комплексы программ}
-#+LATEX_HEADER_EXTRA: \supervisor{Научный руководитель\\д.т.н Дегтярев Александр Борисович}
-#+LATEX_HEADER_EXTRA: \newcites{published}{Список опубликованных по теме диссертации работ}
-#+OPTIONS: todo:nil title:nil ':t H:5
-#+STARTUP: indent
-#+PROPERTY: header-args:R :results graphics :exports results
-
-* Config                                                           :noexport:
-** Produce data for Q-Q and ACF plots
-#+begin_src sh :exports none :results verbatim
-root=$(pwd)
-for testname in propagating_wave standing_wave
-do
-    wd=$root/build/$testname
-    rm -rf $wd
-    mkdir -p $wd
-    cd $wd
-    arma -c $root/config/$testname.arma 2>&1
-done
-#+end_src
-
-#+RESULTS:
-#+begin_example
-Input file                     = /home/igankevich/workspace/phd-diss/config/propagating_wave.arma
-ACF grid size                  = (20,10,10)
-ACF grid patch size            = (0.526316,0.555556,0.555556)
-Output grid size               = (200,40,40)
-Output grid patch size         = (1,1,1)
-AR order                       = (10,10,10)
-Do least squares               = 0
-ACF function                   = propagating_wave
-Model                          = MA
-MA algorithm                   = fixed_point_iteration
-Verification scheme            = manual
-ACF variance = 5
-fixed_point_iteration:Iteration=0, var_wn=2.70831
-fixed_point_iteration:Iteration=1, var_wn=1.93791
-fixed_point_iteration:Iteration=2, var_wn=1.54801
-fixed_point_iteration:Iteration=3, var_wn=1.31202
-fixed_point_iteration:Iteration=4, var_wn=1.15328
-fixed_point_iteration:Iteration=5, var_wn=1.0386
-fixed_point_iteration:Iteration=6, var_wn=0.951442
-fixed_point_iteration:Iteration=7, var_wn=0.882674
-fixed_point_iteration:Iteration=8, var_wn=0.82688
-fixed_point_iteration:Iteration=9, var_wn=0.780623
-fixed_point_iteration:Iteration=10, var_wn=0.74161
-fixed_point_iteration:Iteration=11, var_wn=0.708244
-fixed_point_iteration:Iteration=12, var_wn=0.679374
-fixed_point_iteration:Iteration=13, var_wn=0.654145
-fixed_point_iteration:Iteration=14, var_wn=0.63191
-fixed_point_iteration:Iteration=15, var_wn=0.612168
-fixed_point_iteration:Iteration=16, var_wn=0.594523
-fixed_point_iteration:Iteration=17, var_wn=0.578663
-fixed_point_iteration:Iteration=18, var_wn=0.564333
-fixed_point_iteration:Iteration=19, var_wn=0.551325
-fixed_point_iteration:Iteration=20, var_wn=0.539469
-fixed_point_iteration:Iteration=21, var_wn=0.528623
-fixed_point_iteration:Iteration=22, var_wn=0.518666
-fixed_point_iteration:Iteration=23, var_wn=0.509497
-fixed_point_iteration:Iteration=24, var_wn=0.50103
-fixed_point_iteration:Iteration=25, var_wn=0.493191
-fixed_point_iteration:Iteration=26, var_wn=0.485916
-fixed_point_iteration:Iteration=27, var_wn=0.479148
-fixed_point_iteration:Iteration=28, var_wn=0.472841
-fixed_point_iteration:Iteration=29, var_wn=0.466951
-fixed_point_iteration:Iteration=30, var_wn=0.461442
-fixed_point_iteration:Iteration=31, var_wn=0.456279
-fixed_point_iteration:Iteration=32, var_wn=0.451435
-fixed_point_iteration:Iteration=33, var_wn=0.446882
-fixed_point_iteration:Iteration=34, var_wn=0.442597
-fixed_point_iteration:Iteration=35, var_wn=0.43856
-fixed_point_iteration:Iteration=36, var_wn=0.434752
-fixed_point_iteration:Iteration=37, var_wn=0.431155
-fixed_point_iteration:Iteration=38, var_wn=0.427755
-fixed_point_iteration:Iteration=39, var_wn=0.424538
-fixed_point_iteration:Iteration=40, var_wn=0.42149
-fixed_point_iteration:Iteration=41, var_wn=0.418601
-fixed_point_iteration:Iteration=42, var_wn=0.415859
-fixed_point_iteration:Iteration=43, var_wn=0.413256
-fixed_point_iteration:Iteration=44, var_wn=0.410782
-fixed_point_iteration:Iteration=45, var_wn=0.40843
-fixed_point_iteration:Iteration=46, var_wn=0.406191
-fixed_point_iteration:Iteration=47, var_wn=0.404059
-fixed_point_iteration:Iteration=48, var_wn=0.402029
-fixed_point_iteration:Iteration=49, var_wn=0.400092
-fixed_point_iteration:Iteration=50, var_wn=0.398246
-fixed_point_iteration:Iteration=51, var_wn=0.396483
-fixed_point_iteration:Iteration=52, var_wn=0.3948
-fixed_point_iteration:Iteration=53, var_wn=0.393193
-fixed_point_iteration:Iteration=54, var_wn=0.391656
-fixed_point_iteration:Iteration=55, var_wn=0.390188
-fixed_point_iteration:Iteration=56, var_wn=0.388782
-fixed_point_iteration:Iteration=57, var_wn=0.387438
-fixed_point_iteration:Iteration=58, var_wn=0.386151
-fixed_point_iteration:Iteration=59, var_wn=0.384918
-fixed_point_iteration:Iteration=60, var_wn=0.383738
-fixed_point_iteration:Iteration=61, var_wn=0.382606
-fixed_point_iteration:Iteration=62, var_wn=0.381522
-fixed_point_iteration:Iteration=63, var_wn=0.380482
-fixed_point_iteration:Iteration=64, var_wn=0.379485
-fixed_point_iteration:Iteration=65, var_wn=0.378528
-fixed_point_iteration:Iteration=66, var_wn=0.37761
-fixed_point_iteration:Iteration=67, var_wn=0.376729
-fixed_point_iteration:Iteration=68, var_wn=0.375882
-fixed_point_iteration:Iteration=69, var_wn=0.37507
-fixed_point_iteration:Iteration=70, var_wn=0.374289
-fixed_point_iteration:Iteration=71, var_wn=0.373539
-fixed_point_iteration:Iteration=72, var_wn=0.372818
-fixed_point_iteration:Iteration=73, var_wn=0.372126
-fixed_point_iteration:Iteration=74, var_wn=0.37146
-fixed_point_iteration:Iteration=75, var_wn=0.37082
-fixed_point_iteration:Iteration=76, var_wn=0.370204
-fixed_point_iteration:Iteration=77, var_wn=0.369612
-fixed_point_iteration:Iteration=78, var_wn=0.369042
-fixed_point_iteration:Iteration=79, var_wn=0.368494
-fixed_point_iteration:Iteration=80, var_wn=0.367966
-fixed_point_iteration:Iteration=81, var_wn=0.367458
-fixed_point_iteration:Iteration=82, var_wn=0.366969
-fixed_point_iteration:Iteration=83, var_wn=0.366499
-fixed_point_iteration:Iteration=84, var_wn=0.366046
-fixed_point_iteration:Iteration=85, var_wn=0.36561
-fixed_point_iteration:Iteration=86, var_wn=0.365189
-fixed_point_iteration:Iteration=87, var_wn=0.364785
-fixed_point_iteration:Iteration=88, var_wn=0.364395
-fixed_point_iteration:Iteration=89, var_wn=0.364019
-fixed_point_iteration:Iteration=90, var_wn=0.363657
-fixed_point_iteration:Iteration=91, var_wn=0.363309
-fixed_point_iteration:Iteration=92, var_wn=0.362973
-fixed_point_iteration:Iteration=93, var_wn=0.362649
-fixed_point_iteration:Iteration=94, var_wn=0.362337
-fixed_point_iteration:Iteration=95, var_wn=0.362036
-fixed_point_iteration:Iteration=96, var_wn=0.361746
-fixed_point_iteration:Iteration=97, var_wn=0.361466
-fixed_point_iteration:Iteration=98, var_wn=0.361197
-fixed_point_iteration:Iteration=99, var_wn=0.360937
-fixed_point_iteration:Iteration=100, var_wn=0.360686
-fixed_point_iteration:Iteration=101, var_wn=0.360444
-fixed_point_iteration:Iteration=102, var_wn=0.360211
-fixed_point_iteration:Iteration=103, var_wn=0.359986
-fixed_point_iteration:Iteration=104, var_wn=0.359769
-fixed_point_iteration:Iteration=105, var_wn=0.35956
-fixed_point_iteration:Iteration=106, var_wn=0.359358
-fixed_point_iteration:Iteration=107, var_wn=0.359163
-fixed_point_iteration:Iteration=108, var_wn=0.358975
-fixed_point_iteration:Iteration=109, var_wn=0.358794
-fixed_point_iteration:Iteration=110, var_wn=0.358619
-fixed_point_iteration:Iteration=111, var_wn=0.35845
-fixed_point_iteration:Iteration=112, var_wn=0.358288
-fixed_point_iteration:Iteration=113, var_wn=0.35813
-fixed_point_iteration:Iteration=114, var_wn=0.357979
-fixed_point_iteration:Iteration=115, var_wn=0.357832
-fixed_point_iteration:Iteration=116, var_wn=0.357691
-fixed_point_iteration:Iteration=117, var_wn=0.357555
-fixed_point_iteration:Iteration=118, var_wn=0.357423
-fixed_point_iteration:Iteration=119, var_wn=0.357296
-fixed_point_iteration:Iteration=120, var_wn=0.357173
-fixed_point_iteration:Iteration=121, var_wn=0.357055
-fixed_point_iteration:Iteration=122, var_wn=0.356941
-fixed_point_iteration:Iteration=123, var_wn=0.356831
-fixed_point_iteration:Iteration=124, var_wn=0.356724
-fixed_point_iteration:Iteration=125, var_wn=0.356621
-fixed_point_iteration:Iteration=126, var_wn=0.356522
-fixed_point_iteration:Iteration=127, var_wn=0.356426
-fixed_point_iteration:Iteration=128, var_wn=0.356334
-fixed_point_iteration:Iteration=129, var_wn=0.356244
-fixed_point_iteration:Iteration=130, var_wn=0.356158
-fixed_point_iteration:Iteration=131, var_wn=0.356075
-fixed_point_iteration:Iteration=132, var_wn=0.355994
-fixed_point_iteration:Iteration=133, var_wn=0.355917
-fixed_point_iteration:Iteration=134, var_wn=0.355842
-fixed_point_iteration:Iteration=135, var_wn=0.355769
-fixed_point_iteration:Iteration=136, var_wn=0.355699
-fixed_point_iteration:Iteration=137, var_wn=0.355632
-fixed_point_iteration:Iteration=138, var_wn=0.355567
-fixed_point_iteration:Iteration=139, var_wn=0.355504
-fixed_point_iteration:Iteration=140, var_wn=0.355443
-fixed_point_iteration:Iteration=141, var_wn=0.355384
-fixed_point_iteration:Iteration=142, var_wn=0.355327
-fixed_point_iteration:Iteration=143, var_wn=0.355273
-fixed_point_iteration:Iteration=144, var_wn=0.35522
-fixed_point_iteration:Iteration=145, var_wn=0.355169
-fixed_point_iteration:Iteration=146, var_wn=0.355119
-fixed_point_iteration:Iteration=147, var_wn=0.355072
-fixed_point_iteration:Iteration=148, var_wn=0.355026
-fixed_point_iteration:Iteration=149, var_wn=0.354981
-fixed_point_iteration:Iteration=150, var_wn=0.354938
-fixed_point_iteration:Iteration=151, var_wn=0.354897
-fixed_point_iteration:Iteration=152, var_wn=0.354856
-fixed_point_iteration:Iteration=153, var_wn=0.354818
-fixed_point_iteration:Iteration=154, var_wn=0.35478
-fixed_point_iteration:Iteration=155, var_wn=0.354744
-fixed_point_iteration:Iteration=156, var_wn=0.354709
-fixed_point_iteration:Iteration=157, var_wn=0.354676
-fixed_point_iteration:Iteration=158, var_wn=0.354643
-fixed_point_iteration:Iteration=159, var_wn=0.354612
-fixed_point_iteration:Iteration=160, var_wn=0.354581
-fixed_point_iteration:Iteration=161, var_wn=0.354552
-fixed_point_iteration:Iteration=162, var_wn=0.354524
-fixed_point_iteration:Iteration=163, var_wn=0.354496
-fixed_point_iteration:Iteration=164, var_wn=0.35447
-fixed_point_iteration:Iteration=165, var_wn=0.354444
-fixed_point_iteration:Iteration=166, var_wn=0.35442
-fixed_point_iteration:Iteration=167, var_wn=0.354396
-fixed_point_iteration:Iteration=168, var_wn=0.354373
-fixed_point_iteration:Iteration=169, var_wn=0.35435
-fixed_point_iteration:Iteration=170, var_wn=0.354329
-fixed_point_iteration:Iteration=171, var_wn=0.354308
-fixed_point_iteration:Iteration=172, var_wn=0.354288
-fixed_point_iteration:Iteration=173, var_wn=0.354269
-fixed_point_iteration:Iteration=174, var_wn=0.35425
-fixed_point_iteration:Iteration=175, var_wn=0.354232
-fixed_point_iteration:Iteration=176, var_wn=0.354214
-fixed_point_iteration:Iteration=177, var_wn=0.354198
-fixed_point_iteration:Iteration=178, var_wn=0.354181
-fixed_point_iteration:Iteration=179, var_wn=0.354165
-fixed_point_iteration:Iteration=180, var_wn=0.35415
-fixed_point_iteration:Iteration=181, var_wn=0.354136
-fixed_point_iteration:Iteration=182, var_wn=0.354121
-fixed_point_iteration:Iteration=183, var_wn=0.354108
-fixed_point_iteration:Iteration=184, var_wn=0.354094
-fixed_point_iteration:Iteration=185, var_wn=0.354082
-fixed_point_iteration:Iteration=186, var_wn=0.354069
-fixed_point_iteration:Iteration=187, var_wn=0.354057
-fixed_point_iteration:Iteration=188, var_wn=0.354046
-fixed_point_iteration:Iteration=189, var_wn=0.354034
-fixed_point_iteration:Iteration=190, var_wn=0.354024
-fixed_point_iteration:Iteration=191, var_wn=0.354013
-fixed_point_iteration:Iteration=192, var_wn=0.354003
-fixed_point_iteration:Iteration=193, var_wn=0.353994
-WN variance = 0.353994
-Input file                     = /home/igankevich/workspace/phd-diss/config/standing_wave.arma
-ACF grid size                  = (10,10,10)
-ACF grid patch size            = (0.277778,0.555556,0.555556)
-Output grid size               = (200,40,40)
-Output grid patch size         = (1,1,1)
-AR order                       = (7,7,7)
-Do least squares               = 0
-ACF function                   = standing_wave
-Model                          = AR
-MA algorithm                   = fixed_point_iteration
-Verification scheme            = manual
-ACF variance = 5
-WN variance = 0.00261323
-Zeta size = (193,33,33)
-NaN: 29, -nan, 1.798e+36, -1.04284e+38, inf, -1.798e+36, -1.798e+36
-#+end_example
-
-* Введение
-**** Актуальность темы.
-Программы, моделирующие поведение судна на морских волнах, широко применяются
-для расчета качки судна, оценки величины воздействия внешних сил на плавучую
-платформу или другой морской объект, а также для оценки вероятности
-опрокидывания судна при заданных погодных условиях; однако, большинство из них
-используют линейную теорию для моделирования морского волнения\nbsp{}cite:shin2003nonlinear,van2007forensic,kat2001prediction,van2002development, в
-рамках которой сложно воспроизвести определенные особенности ветроволнового
-климата. Среди них можно выделить переход от нормальных погодных условий к
-шторму и волнение, вызванное наложением множества систем ветровых волн и волн
-зыби, распространяющихся в нескольких направлениях. Другой недостаток линейной
-теории волн заключается в предположении, что высота волн много меньше их длины.
-Это делает расчеты грубыми при моделировании качки судна в условиях
-нерегулярного волнения, когда такое предположение несправедливо. Разработка
-новых и более совершенных моделей и методов, используемых при расчете динамики
-судна, может увеличить количество сценариев ее применения и, в частности,
-способствовать исследованию поведения судна в экстремальных условиях.
-
-**** Степень разработанности.
-Модель авторегрессии скользящего среднего (АРСС) возникла как ответ на
-сложности, с которыми на практике сталкиваются ученые, использующие в своей
-работе модели морского волнения, разработанные в рамках линейной теории волн.
-Проблемы, с которыми они сталкиваются при использовании модели Лонге---Хиггинса
-(которая полностью основана на линейной теории волн) перечислены ниже.
-1. /Периодичность/. В рамках линейной теории волны аппроксимируются суммой
-   гармоник, а период реализации взволнованной поверхности зависит от их
-   количества. Чем больше размер реализации, тем больше коэффициентов требуется
-   для исключения периодичности, поэтому с увеличением размера реализации время
-   ее генерации растет нелинейно. Это приводит к тому, что любая модель,
-   основанная на линейной теории, неэффективна при генерации больших реализаций
-   взволнованной поверхности, независимо от того, насколько оптимизирован
-   исходный код программы.
-2. /Линейность/. В рамках линейной теории волн дается математическое определение
-   морским волнам в предположении малости их амплитуд по сравнению с длинами.
-   Такие волны, в основном, характерны для открытого моря и океана, а волны в
-   прибрежных районах и штормовые волны, для которых это предположение
-   несправедливо, грубо описываются в рамках линейной теории.
-3. /Вероятностная сходимость/. Фаза волны, значение которой обычно получается с
-   помощью генератора псевдослучайных чисел (ГПСЧ), имеет равномерное
-   распределение, что иногда приводит к медленной сходимости интегральных
-   характеристик взволнованной поверхности (таких как распределение высот волн,
-   их периодов, длин и т.п.). Скорость сходимости зависит от значений,
-   полученных от ГПСЧ, поэтому быстрая сходимость не гарантируется.
-
-Эти сложности стали отправной точкой в поиске модели, не основанной на линейной
-теории волн, и в исследованиях процесса АРСС был найден необходимый
-математический аппарат.
-1. Параметром процесса АРСС является автоковариационная функция (АКФ), которая
-   может быть напрямую получена из энергетического или частотно-направленного
-   спектра морского волнения (который, в свою очередь является входным
-   параметром для модели Лонге---Хиггинса). Так что входные параметры одной
-   модели могут быть легко преобразованы во входные параметры другой.
-2. Процесс АРСС не имеет ограничений на амплитуду генерируемых волн: их крутизна
-   может быть увеличена настолько, насколько это позволяет АКФ реальных
-   морских волн.
-3. Период реализации равен периоду ГПСЧ, поэтому время генерации растет линейно
-   с увеличением размера реализации.
-4. Белый шум, который является единственным вероятностным членом формулы
-   процесса АРСС, имеет нормальное распределение; так что скорость сходимости не
-   носит вероятностный характер.
-
-**** Цели и задачи.
-Процесс АРСС стал основой модели ветрового волнения АРСС, однако он нуждался в
-доработке перед тем, как его можно было бы использовать на практике.
-1. Необходимо было исследовать, как различные формы АКФ влияют на выбор
-   параметров процесса АРСС (количество коэффициентов процесса скользящего
-   среднего и процесса авторегрессии).
-2. Затем исследовать возможность генерации волн с произвольным профилем, а не
-   только профиль синусоиды (учесть асимметричность распределения волновых
-   аппликат взволнованной поверхности).
-3. Затем вывести формулы для определения поля давлений под взволнованной
-   поверхностью. Такие формулы обычно выводятся для конкретной модели путем
-   подстановки формулы профиля волны в eqref:eq-problem, однако процесс АРСС не
-   содержит в себе формулу профиля волны в явном виде, поэтому для него
-   необходимо было получить решение для взволнованной поверхности общего вида
-   (для которой не существует аналитического выражения) без линеаризации
-   граничных условий (ГУ) и предположении о малости амплитуд волн.
-4. Наконец, верифицировать интегральные характеристики взволнованной поверхности
-   на соответствие реальным морским волнам.
-5. Заключительный этап состоял в разработке комплекса программ, реализующего
-   созданную модель и метод расчета давлений и позволяющего проводить расчеты
-   как на многопроцессорной машине с общей памятью (SMP), так и на компьютерном
-   кластере (MPP).
-
-**** Научная новизна.
-Модель АРСС в отличие от других моделей ветрового волнения не основана на
-линейной теории волн, что позволяет
-- генерировать волны произвольной амплитуды, регулируя крутизну посредством АКФ;
-- генерировать волны произвольной формы, регулируя асимметричность распределения
-  волновых аппликат посредством нелинейного безынерционного преобразования
-  (НБП).
-В то же время математический аппарат процесса АРСС хорошо изучен в
-других научных областях, что позволяет его обобщить для моделирования развития
-морского волнения в условиях шторма с учетом климатических спектров и данных
-ассимиляции определенных районов мирового океана, что невозможно сделать с
-помощью модели, основанной на линейной теории волн.
-
-**** Теоретическая и практическая значимость работы.
-Применение модели АРСС и формулы поля давлений, не использующей предположения
-линейной теории волн, качественно повысит работу комплексов программ для расчета
-воздействия океанских волн на морские объекты.
-
-1. Поскольку формула для поля давлений выводится для дискретно заданной
-   взволнованной поверхности и без каких-либо предположений об амплитудах волн,
-   то она применима для любой взволнованной поверхности невязкой несжимаемой
-   жидкости (в частности она применима для поверхности, генерируемой моделью
-   Лонге---Хиггинса). Это позволяет использовать формулу поля давлений без
-   привязки к модели АРСС.
-2. С вычислительной точки зрения эта формула более эффективна, чем
-   соответствующая формула для модели ЛХ, поскольку интегралы в формуле сводятся
-   к преобразованиям Фурье, для которых существует семейство алгоритмов быстрого
-   преобразования Фурье (БПФ), оптимизированных под разные архитектуры
-   процессоров.
-3. Поскольку формула явная, то обмена данными между параллельными процессами
-   можно избежать, что позволяет достичь высокой масштабируемости на
-   компьютерном кластере.
-4. Наконец, сама модель АРСС более эффективна, чем модель ЛХ, ввиду отсутствия
-   тригонометрических функций в ее формуле. Взволнованная поверхность
-   вычисляется как сумма большого числа многочленов, для которых существует
-   низкоуровневая ассемблерная инструкция (Fused Multiply-Add), показывающая
-   высокую производительность на процессорах.
-
-**** Методология и методы исследования.
-Программная реализация модели АРСС и формулы вычисления давлений создавалась
-поэтапно: прототип, написанный на высокоуровневом инженерном языке\nbsp{}cite:mathematica10,octave2015, был преобразован в программу на языке более
-низкого уровня (C++). Реализация одних и тех же формул и алгоритмов на языках
-разного уровня (ввиду использования различных абстракций и языковых примитивов)
-позволяет выявить и исправить ошибки, которые остались бы незамеченными в случае
-одного языка. Генерируемая моделью АРСС взволнованная поверхность, а также все
-входные параметры (АКФ, формула распределения волновых аппликат и т.п.) были
-проверены с помощью встроенных в язык программирования графических средств для
-визуального контроля корректности работы программы.
-
-**** Положения, выносимые на защиту.
-- Модель ветрового волнения, способная генерировать реализации взволнованной
-  морской поверхности, имеющие большой период и состоящие из волн произвольной
-  амплитуды;
-- Формула для поля давлений, выведенная для этой модели без предположений
-  линейной теории волн;
-- Программная реализация созданной модели и формулы для вычислительных систем с
-  общей (SMP) и с распределенной памятью (MPP).
-
-**** Степень достоверности и апробация результатов.
-Верификация модели АРСС проводится путем сравнения интегральных характеристик
-(распределений волновых аппликат, высот и длин волн и т.п.) генерируемой
-взволнованной поверхности с характеристиками реальных морских волн. Формула для
-поля давлений выводится с помощью языка Mathematica, в котором полученные
-выражения проверяются с помощью встроенных в язык графических средств.
-
-Модель АРСС и формула для поля давлений были реализованы в Large Amplitude
-Motion Programme (LAMP), программе для моделирования качки судна, и сопоставлены
-с используемой ранее моделью ЛХ. Предварительные численные эксперименты показали
-более высокую вычислительную эффективность модели АРСС.
-
-* Постановка задачи
-Задача состоит в исследовании возможности применения математического аппарата
-процесса АРСС для моделирования морских волн и в выводе формулы для поля
-давлений под генерируемой взволнованной морской поверхностью для случая
-идеальной несжимаемой жидкости без предположений линейной теории волн.
-- Для случая волн малых амплитуд полученная формула должна быть сопоставима с
-  соответствующей формулой линейной теории волн; для остальных случаев формула
-  не должна расходиться.
-- Интегральные характеристики генерируемой взволнованной поверхности должны
-  совпадать с характеристиками реальных морских волн.
-- Программная реализация модели АРСС и формулы вычисления давлений должна
-  работать на системах с общей (SMP) и распределенной памятью (MPP).
-
-**** Формула для поля давлений.
-Задача определения поля давлений под взволнованной морской поверхностью
-представляет собой обратную задачу гидродинамики для несжимаемой невязкой
-жидкости. Система уравнений для нее в общем виде записывается как\nbsp{}cite:kochin1966theoretical
-\begin{align}
-    & \nabla^2\phi = 0,\nonumber\\
-    & \phi_t+\frac{1}{2} |\vec{\upsilon}|^2 + g\zeta=-\frac{p}{\rho}, & \text{на }z=\zeta(x,y,t),\label{eq-problem}\\
-    & D\zeta = \nabla \phi \cdot \vec{n}, & \text{на }z=\zeta(x,y,t),\nonumber
-\end{align}
-где \(\phi\)\nbsp{}--- потенциал скорости, \(\zeta\)\nbsp{}--- подъем (аппликата)
-взволнованной поверхности, \(p\)\nbsp{}--- давление жидкости, \(\rho\)\nbsp{}--- плотность
-жидкости, \(\vec{\upsilon}=(\phi_x,\phi_y,\phi_z)\)\nbsp{}--- вектор скорости, \(g\)\nbsp{}--- ускорение свободного падения и \(D\)\nbsp{}--- субстанциональная производная
-(производная Лагранжа). Первое уравнение является уравнением неразрывности
-(уравнение Лапласа), второе\nbsp{}--- законом сохранения импульса (которое иногда
-называют динамическим граничным условием); третье уравнение\nbsp{}--- кинематическое
-граничное условие, которое сводится к равенству скорости перемещения этой
-поверхности (\(D\zeta\)) нормальной составляющей скорости жидкости
-(\(\nabla\phi\cdot\vec{n}\)).
-
-Обратная задача гидродинамики заключается в решении этой системы уравнений
-относительно \(\phi\). В такой постановке динамическое ГУ становится явной
-формулой для определения поля давлений по значениям производных потенциалов
-скорости, полученных из оставшихся уравнений. Таким образом, с математической
-точки зрения обратная задача гидродинамики сводится к решению уравнения Лапласа
-со смешанным ГУ\nbsp{}--- задаче Робена.
-
-* Обзор литературы
-** Анализ моделей морского волнения
-Вычисление давлений возможно только при условии знания формы взволнованной
-поверхности, которая задается либо дискретно в каждой точке пространственной
-сетки, либо непрерывно с помощью аналитической формулы. Как будет показано в
-разделе [[#linearisation]], знание такой формулы может упростить вычисление
-давлений, фактически сведя задачу к генерации поля давлений, а не самой
-взволнованной поверхности.
-
-**** Модель Лонге---Хиггинса.
-Наиболее простой моделью, формула которой выводится в рамках линейной теории
-волн (см.\nbsp{}разд.\nbsp{}[[#longuet-higgins-derivation]]), является модель
-Лонге---Хиггинса (ЛХ)\nbsp{}cite:longuet1957statistical. Подробный сравнительный
-анализ этой модели и модели АРСС проведен в
-работах\nbsp{}cite:degtyarev2011modelling,boukhanovsky1997thesis.
-
-Модель ЛХ представляет взволнованную морскую поверхность в виде суперпозиции
-элементарных гармонических волн случайных амплитуд \(c_n\) и фаз \(\epsilon_n\),
-непрерывно распределенных на интервале \([0,2\pi]\). Подъем (координата \(z\))
-поверхности определяется формулой
-#+name: eq-longuet-higgins
-\begin{equation}
-    \zeta(x,y,t) = \sum\limits_n c_n \cos(u_n x + v_n y - \omega_n t + \epsilon_n).
-\end{equation}
-Здесь волновые числа \((u_n,v_n)\) непрерывно распределены на плоскости \((u,v)\),
-т.е. площадка \(du \times dv\) содержит бесконечно большое количество волновых
-чисел. Частота связана с волновыми числами дисперсионным соотношением
-\(\omega_n=\omega(u_n,v_n)\). Функция \(\zeta(x,y,t)\) является трехмерным
-эргодическим стационарным однородным гауссовым процессом, определяемым
-соотношением
-\begin{equation*}
-    2E_\zeta(u,v)\, du\,  dv = \sum\limits_n c_n^2,
-\end{equation*}
-где \(E_\zeta(u,v)\)\nbsp{}--- двумерная спектральная плотность энергии волн.
-Коэффициенты \(c_n\) определяются из энергетического спектра волнения \(S(\omega)\)
-по формуле
-\begin{equation*}
-    c_n = \sqrt{ \textstyle\int\limits_{\omega_n}^{\omega_{n+1}} S(\omega) d\omega}.
-\end{equation*}
-
-**** Основные недостатки модели Лонге---Хиггинса.
-Модель Лонге---Хиггинса отличается простотой численного алгоритма и
-наглядностью, однако, на практике она обладает рядом недостатков.
-
-1. Модель рассчитана на представление стационарного гауссова поля. Это является
-   следствием центральной предельной теоремы (ЦПТ): сумма большого числа
-   гармоник со случайными амплитудами и фазами имеет нормальное распределение вне
-   зависимости от спектра, подаваемого на вход модели. Использование меньшего
-   количества коэффициентов может решить проблему, но также уменьшит период
-   реализации. Таким образом, использование модели ЛХ для генерации волн с
-   негауссовым распределением аппликат (которое имеют реальные морские волны\nbsp{}cite:huang1980experimental,рожков1996теория) не реализуемо на практике.
-2. С вычислительной точки зрения, недостатком модели является нелинейный рост
-   времени генерации поверхности с увеличением размера реализации. Чем больше
-   размер реализации, тем больше коэффициентов (дискретных точек
-   частотно-направленного спектра) требуется для исключения периодичности. Это
-   делает модель неэффективной для проведения длительных численных
-   экспериментов.
-3. Наконец, с инженерной точки зрения, модель обладает рядом особенностей,
-   которые не позволяют использовать ее в качестве фундамента для построения
-   более совершенных моделей.
-   - В программной реализации скорость сходимости выражения
-     ур.\nbsp{}[[eq-longuet-higgins]] может быть низкой, т.к. фазы \(\epsilon_n\)
-     имеют вероятностный характер.
-   - Обобщение модели для негауссовых и нелинейных процессов возможно при
-     включении нелинейных членов в ур.\nbsp{}[[eq-longuet-higgins]], для которого не
-     известна формула вычисления
-     коэффициентов\nbsp{}cite:рожков1990вероятностные.
- 
-Таким образом, модель ЛХ применима для решения задачи генерации взволнованной
-морской поверхности только в рамках линейной теории волн, неэффективна для
-длительных экспериментов и имеет ряд недостатков, не позволяющих использовать ее
-в качестве основы для построения более совершенных моделей.
-
-**** Модель АРСС.
-В\nbsp{}cite:spanos1982arma модель АРСС используется для генерации временного ряда,
-спектр которого совпадает с аппроксимацией Пирсона---Московица для спектров
-морского волнения. Авторы проводят эксперименты для одномерных моделей АР, СС и
-АРСС. Они отмечают превосходное совпадение полученного и исходного спектров и
-более высокую вычислительную эффективность модели АРСС по сравнению с
-моделями, основанными на суммировании большого числа гармоник со случайными
-фазами. Также отмечается, что для того чтобы спектр полученного временного ряда
-совпадал с заданным, модели СС требуется меньшее количество коэффициентов, чем
-модели АР. В\nbsp{}cite:spanos1996efficient автор обобщает формулы для нахождения
-коэффициентов модели АРСС для случая нескольких (векторов) переменных.
-
-Отличие данной работы от вышеперечисленных заключается в исследовании трехмерной
-модели АРСС (два пространственных и одно временное измерение), что во многом
-является другой задачей.
-1. Система уравнений Юла---Уокера, используемая для определения коэффициентов
-   АР, имеет более сложную блочно-блочную структуру.
-2. Оптимальный (для совпадения заданного и исходного спектров) порядок модели
-   определяется вручную.
-3. Вместо аппроксимации ПМ в качестве входа модели используются аналитические
-   выражения для АКФ стоячих и прогрессивных волн.
-4. Трехмерная взволнованная поверхность должна быть сопоставима с реальной
-   морской поверхностью не только по спектральным характеристикам, но и по форме
-   волновых профилей, поэтому верификация модели производится и для
-   распределений различных параметров генерируемых волн (длин, высот, периодов и
-   др.).
-Многомерность исследуемой модели не только усложняет задачу, но и позволяет
-провести визуальную проверку генерируемой взволнованной поверхности. Именно
-возможность визуализировать результат работы программы позволила удостовериться,
-что генерируемая поверхность действительно похожа на реальное морское волнение,
-а не является абстрактным многомерным случайным процессом, совпадающим с
-реальным лишь статистически.
-
-В\nbsp{}cite:fusco2010short модель АР используется для прогнозирования волн зыби для
-управления преобразователем энергии волн (ПЭВ) в реальном времени. Для
-эффективной работы ПЭВ необходимо, чтобы частота встроенного осциллятора
-совпадала с частотой морских волн. Авторы статьи представляют подъем волны как
-временной ряд и сравнивают эффективность модели АР, нейронных сетей и циклических
-моделей в прогнозировании будущих значений ряда. Модель АР дает наиболее точный
-прогноз для низкочастотных волн зыби вплоть до двух типовых периодов волн. Это
-пример успешного применения модели процесса АР для моделирования морских волн.
-
-** Известные формулы определения поля давлений
-**** Теория волн малых амплитуд.
-В\nbsp{}cite:stab2012,детярев1998моделирование,degtyarev1997analysis дается решение
-обратной задачи гидродинамики для случая идеальной несжимаемой жидкости в рамках
-теории волн малых амплитуд (в предположении, что длина волны много больше ее
-высоты: \(\lambda \gg h\)). В этом случае обратная задача линейна и сводится к
-уравнению Лапласа со смешанным граничным условием, а уравнение движения
-используется только для нахождения давлений по известным значениям производных
-потенциала скорости. Предположение о малости амплитуд волн означает слабое
-изменение локального волнового числа во времени и пространстве по сравнению с
-подъемом (аппликатой) взволнованной поверхности. Это позволяет вычислить
-производную подъема поверхности по \(z\) как \(\zeta_z=k\zeta\), где \(k\)\nbsp{}---
-волновое число. В двухмерном случае решение записывается явной формулой
-\begin{align}
-    \left.\frac{\partial\phi}{\partial x}\right|_{x,t}= &
-        -\frac{1}{\sqrt{1+\alpha^{2}}}e^{-I(x)}
-            \int\limits_{0}^x\frac{\partial\dot{\zeta}/\partial      
-                z+\alpha\dot{\alpha}}{\sqrt{1+\alpha^{2}}}e^{I(x)}dx,\label{eq-old-sol-2d}\\
-    I(x)= & \int\limits_{0}^x\frac{\partial\alpha/\partial z}{1+\alpha^{2}}dx,\nonumber
-\end{align}
-где \(\alpha\)\nbsp{}--- уклоны волн. В трехмерном случае решение записывается в виде
-эллиптического дифференциального уравнения в частных производных
-\begin{align*}
-    & \frac{\partial^2 \phi}{\partial x^2} \left( 1 + \alpha_x^2 \right) +
-    \frac{\partial^2 \phi}{\partial y^2} \left( 1 + \alpha_y^2 \right) +
-    2\alpha_x\alpha_y \frac{\partial^2 \phi}{\partial x \partial y} + \\
-    & \left(
-        \frac{\partial \alpha_x}{\partial z} +
-        \alpha_x \frac{\partial \alpha_x}{\partial x} +
-        \alpha_y \frac{\partial \alpha_x}{\partial y}
-    \right) \frac{\partial \phi}{\partial x} + \\
-    & \left(
-        \frac{\partial \alpha_y}{\partial z} +
-        \alpha_x \frac{\partial \alpha_y}{\partial x} +
-        \alpha_y \frac{\partial \alpha_y}{\partial y}
-    \right) \frac{\partial \phi}{\partial y} + \\
-    & \frac{\partial \dot{\zeta}}{\partial z} + 
-    \alpha_x \dot{\alpha_x} + \alpha_y \dot{\alpha_y} = 0.
-\end{align*}
-Уравнение предполагается решать численно путем сведения к разностному.
-
-Как будет показано в [[#sec:compare-formulae]], формула eqref:eq-old-sol-2d
-расходится при попытке вычислить поле скоростей для волн больших амплитуд, а
-значит не может быть использована вместе с моделью ветрового волнения,
-генерирующей волны произвольных амплитуд.
-
-**** Линеаризация граничного условия.
-:PROPERTIES:
-:CUSTOM_ID: linearisation
-:END:
-Модель Лонге---Хиггинса позволяет вывести явную формулу для поля
-скоростей путем линеаризации кинематического граничного условия. Формула для
-потенциала скорости запишется как
-\begin{equation*}
-\phi(x,y,z,t) = \sum_n \frac{c_n g}{\omega_n} 
-     e^{\sqrt{u_n^2+v_n^2} z}
-     \sin(u_n x + v_n y - \omega_n t + \epsilon_n).
-\end{equation*}
-Формула дифференцируется для получения производных потенциала, а полученные
-значения подставляются в динамическое граничное условие для вычисления давлений.
-
-* Модель АРСС в задаче имитационного моделирования морского волнения
-** Основные формулы трехмерного процесса AРСС
-Модель АРСС для морского волнения определяет взволнованную морскую поверхность
-как трехмерный (два пространственных и одно временное измерение) процесс
-авторегрессии скользящего среднего: каждая точка взволнованной поверхности
-представляется в виде взвешенной суммы предыдущих по времени и пространству
-точек и взвешенной суммы предыдущих по времени и пространству нормально
-распределенных случайных импульсов. Основным уравнением для трехмерного процесса
-АРСС является
-\begin{equation}
-    \zeta_{\vec i}
-    =
-    \sum\limits_{\vec j = \vec 0}^{\vec N}
-    \Phi_{\vec j} \zeta_{\vec i - \vec j}
-    +
-    \sum\limits_{\vec j = \vec 0}^{\vec M}
-    \Theta_{\vec j} \epsilon_{\vec i - \vec j}
-    ,
-    \label{eq-arma-process}
-\end{equation}
-где \(\zeta\)\nbsp{}--- подъем (аппликата) взволнованной поверхности,
-\(\Phi\)\nbsp{}--- коэффициенты процесса АР, \(\Theta\)\nbsp{}--- коэффициенты
-процесса СС, \(\epsilon\)\nbsp{}--- белый шум, имеющий Гауссово распределение,
-\(\vec{N}\)\nbsp{}--- порядок процесса АР, \(\vec{M}\)\nbsp{}--- порядок
-процесса СС, причем \(\Phi_{\vec{0}}\equiv0\), \(\Theta_{\vec{0}}\equiv0\).
-Здесь стрелки обозначают многокомпонентные индексы, содержащие отдельную
-компоненту для каждого измерения. В общем случае в качестве компонент могут
-выступать любые скалярные величины (температура, соленость, концентрация
-какого-либо раствора в воде и т.п.). Параметрами уравнения служат коэффициенты и
-порядки процессов АР и СС.
-
-Свойства стационарности и обратимости являются основными критериями выбора того
-или иного процесса для моделирования волн разных профилей, которые обсуждаются в
-разд.\nbsp{}[[#sec-process-selection]].
-
-**** Процесс авторегрессии (АР).
-Процесс АР\nbsp{}--- это процесс АРСС только лишь с одним случайным импульсом вместо их
-взвешенной суммы:
-\begin{equation}
-    \zeta_{\vec i}
-    =
-    \sum\limits_{\vec j = \vec 0}^{\vec N}
-    \Phi_{\vec j} \zeta_{\vec i - \vec j}
-    +
-    \epsilon_{\vec i}
-    .
-    \label{eq-ar-process}
-\end{equation}
-Коэффициенты авторегрессии \(\Phi\) определяются из многомерных уравнений
-Юла---Уокера, получаемых после домножения на \(\zeta_{\vec{i}-\vec{k}}\) обеих
-частей уравнения и взятия математического ожидания. В общем виде уравнения
-Юла---Уокера записываются как
-\begin{equation}
-    \label{eq-yule-walker}
-    \gamma_{\vec k}
-    =
-    \sum\limits_{\vec j = \vec 0}^{\vec N}
-    \Phi_{\vec j}
-    \text{ }\gamma_{\vec{k}-\vec{j}}
-    +
-    \Var{\epsilon} \delta_{\vec{k}},
-    \qquad
-    \delta_{\vec{k}} =
-    \begin{cases}
-        1, \quad \text{if } \vec{k}=0 \\
-        0, \quad \text{if } \vec{k}\neq0,
-    \end{cases}
-\end{equation}
-где \(\gamma\)\nbsp{}--- АКФ процесса \(\zeta\), \(\Var{\epsilon}\)\nbsp{}--- дисперсия
-белого шума. Матричная форма трехмерной системы уравнений Юла---Уокера,
-используемой в данной работе, имеет следующий вид.
-\begin{equation*}
-    \Gamma
-    \left[
-        \begin{array}{l}
-            \Phi_{\vec 0}\\
-            \Phi_{0,0,1}\\
-            \vdotswithin{\Phi_{\vec 0}}\\
-            \Phi_{\vec N}
-        \end{array}
-    \right]
-    =
-    \left[
-        \begin{array}{l}
-            \gamma_{0,0,0}-\Var{\epsilon}\\
-            \gamma_{0,0,1}\\
-            \vdotswithin{\gamma_{\vec 0}}\\
-            \gamma_{\vec N}
-        \end{array}
-    \right],
-    \qquad
-    \Gamma=
-    \left[
-        \begin{array}{llll}
-            \Gamma_0 & \Gamma_1 & \cdots & \Gamma_{N_1} \\
-            \Gamma_1 & \Gamma_0 & \ddots & \vdotswithin{\Gamma_0} \\
-            \vdotswithin{\Gamma_0} & \ddots & \ddots & \Gamma_1 \\
-            \Gamma_{N_1} & \cdots & \Gamma_1 & \Gamma_0
-        \end{array}
-    \right],
-\end{equation*}
-где \(\vec N = \left( N_1, N_2, N_3 \right)\) и
-\begin{equation*}
-    \Gamma_i =
-    \left[
-    \begin{array}{llll}
-        \Gamma^0_i & \Gamma^1_i & \cdots & \Gamma^{N_2}_i \\
-        \Gamma^1_i & \Gamma^0_i & \ddots & \vdotswithin{\Gamma^0_i} \\
-        \vdotswithin{\Gamma^0_i} & \ddots & \ddots & \Gamma^1_i \\
-        \Gamma^{N_2}_i & \cdots & \Gamma^1_i & \Gamma^0_i
-    \end{array}
-    \right]
-    \qquad
-    \Gamma_i^j=
-    \left[
-    \begin{array}{llll}
-        \gamma_{i,j,0} & \gamma_{i,j,1} & \cdots & \gamma_{i,j,N_3} \\
-        \gamma_{i,j,1} & \gamma_{i,j,0} & \ddots & \vdotswithin{\gamma_{i,j,0}} \\
-        \vdotswithin{\gamma_{i,j,0}} & \ddots & \ddots & \gamma_{i,j,1} \\
-        \gamma_{i,j,N_3} & \cdots & \gamma_{i,j,1} & \gamma_{i,j,0}
-    \end{array}
-    \right],
-\end{equation*}
-Поскольку по определению \(\Phi_{\vec 0}\equiv0\), то первую строку и столбец
-матрицы \(\Gamma\) можно отбросить. Матрица \(\Gamma\), как и оставшаяся от нее
-матрица, будут блочно-теплицевы, положительно определены и симметричны, поэтому
-систему уравнений Юла---Уокера можно эффективно решить методом Холецкого,
-специально предназначенным для таких матриц.
-
-После нахождения решения системы уравнений дисперсия белого шума определяется из
-уравнения eqref:eq-yule-walker при \(\vec k = \vec 0\) как
-\begin{equation*}
-    \Var{\epsilon} =
-    \Var{\zeta}
-    -
-    \sum\limits_{\vec j = \vec 0}^{\vec N}
-    \Phi_{\vec j}
-    \text{ }\gamma_{\vec{j}}.
-\end{equation*}
-
-**** Процесс скользящего среднего (СС).
-Процесс СС\nbsp{}--- это процесс АРСС, в котором \(\Phi\equiv0\):
-\begin{equation}
-    \zeta_{\vec i}
-    =
-    \sum\limits_{\vec j = \vec 0}^{\vec M}
-    \Theta_{\vec j} \epsilon_{\vec i - \vec j}
-    .
-    \label{eq-ma-process}
-\end{equation}
-Коэффициенты СС \(\Theta\) определяются неявно из системы нелинейных уравнений
-\begin{equation*}
-  \gamma_{\vec i} =
-  \left[
-    \displaystyle
-    \sum\limits_{\vec j = \vec i}^{\vec M}
-    \Theta_{\vec j}\Theta_{\vec j - \vec i}
-  \right]
-  \Var{\epsilon}.
-\end{equation*}
-Система решается численно с помощью метода простой итерации по формуле
-\begin{equation*}
-  \Theta_{\vec i} =
-    -\frac{\gamma_{\vec i}}{\Var{\epsilon}}
-    +
-    \sum\limits_{\vec j = \vec i}^{\vec M}
-    \Theta_{\vec j} \Theta_{\vec j - \vec i}.
-\end{equation*}
-Здесь новые значения коэффициентов \(\Theta\) вычисляются, начиная с последнего:
-от \(\vec{i}=\vec{M}\) до \(\vec{i}=\vec{0}\). Дисперсия белого шума вычисляется из
-\begin{equation*}
-    \Var{\epsilon} = \frac{\gamma_{\vec 0}}{
-    1
-    +
-    \sum\limits_{\vec j = \vec 0}^{\vec M}
-    \Theta_{\vec j}^2
-    }.
-\end{equation*}
-Авторы\nbsp{}cite:box1976time предлагают использовать метод Ньютона---Рафсона для
-решения этого уравнения с большей точностью, однако, этот метод не подходит для
-трех измерений. Использование более медленного метода не оказывает большого
-эффекта на общую производительность программы, потому что количество
-коэффициентов мало, и большую часть времени программа тратит на генерацию
-взволнованной поверхности.
-
-**** Стационарность и обратимость процессов АР и СС
-Для того чтобы моделируемая взволнованная поверхность представляла собой
-физическое явление, соответствующий процесс должен быть стационарным и
-обратимым. Если процесс обратим, то существует разумная связь текущих событий с
-событиями в прошлом, и, если процесс стационарен, то амплитуда моделируемого
-физического сигнала не увеличивается бесконечно в пространстве и времени.
-
-Процесс АР всегда обратим, а для стационарности необходимо, чтобы корни
-характеристического уравнения
-\begin{equation*}
-1 - \Phi_{0,0,1} z - \Phi_{0,0,2} z^2
-- \cdots
-- \Phi_{\vec N} z^{N_0 N_1 N_2} = 0,
-\end{equation*}
-лежали \emph{вне} единичного круга. Здесь \(\vec{N}\)\nbsp{}--- порядок процесса
-АР, а \(\Phi\)\nbsp{}--- коэффициенты.
-
-Процесс СС всегда стационарен, а для обратимости необходимо, чтобы корни
-характеристического уравнения
-\begin{equation*}
-1 - \Theta_{0,0,1} z - \Theta_{0,0,2} z^2
-- \cdots
-- \Theta_{\vec M} z^{M_0 M_1 M_2} = 0,
-\end{equation*}
-лежали \emph{вне} единичного круга. Здесь \(\vec{M}\)\nbsp{}--- порядок процесса
-СС, а \(\Theta\)\nbsp{}--- коэффициенты.
-
-**** Смешанный процесс авторегрессии скользящего среднего (АРСС).
-:PROPERTIES:
-:CUSTOM_ID: sec:how-to-mix-ARMA
-:END:
-В общем и целом, процесс АРСС получается путем подстановки сгенерированной
-процессом СС взволнованной поверхности в качестве случайного импульса процесса
-АР, однако, для того чтобы АКФ результирующего процесса соответствовала
-заданной, необходимо предварительно скорректировать значения коэффициентов АР.
-Существует несколько способов "смешивания" процессов АР и СС.
-- Подход, предложенный авторами\nbsp{}cite:box1976time, который включает в себя
-  разделение АКФ на часть для процесса АР и часть для процесса СС по каждому из
-  измерений, не подходит в данной ситуации, поскольку в трех измерениях
-  невозможно таким образом разделить АКФ: всегда останутся части, которые не
-  будут учтены ни в процессе АР, ни в процессе СС.
-- Альтернативный подход состоит в использовании одной и той же (неразделенной)
-  АКФ для обоих процессов разных порядков, однако, тогда характеристики
-  реализации (математическое ожидание, дисперсия и др.) будут смещены: они
-  станут характеристиками двух наложенных друг на друга процессов.
-Для первого подхода авторами\nbsp{}cite:box1976time предложена формула корректировки
-коэффициентов процесса АР, для второго же подхода такой формулы нет. Таким
-образом, лучшим решением на данный момент является использование процессов АР и
-СС по отдельности.
-
-**** Критерии выбора процесса для моделирования разных профилей волн.
-:PROPERTIES:
-:CUSTOM_ID: sec-process-selection
-:END:
-
-Одной из проблем в применении модели АРСС для генерации взволнованной морской
-поверхности является то, что для разных профилей волн /необходимо/ использовать
-разные процессы: стоячие волны моделируются только процессом АР, а прогрессивные
-волны\nbsp{}--- только процессом СС. Это утверждение пришло из практики: если
-попытаться использовать процессы наоборот, результирующая реализация либо
-расходится, либо не представляет собой реальные морские волны (такое происходит
-в случае необратимого процесса СС, который всегда стационарен). Таким образом,
-процесс АР может быть использован только для моделирования стоячих волн, а
-процесс СС\nbsp{}--- для прогрессивных волн.
-
-Другой проблемой является невозможность автоматического определения оптимального
-количества коэффициентов для трехмерных процессов АР и СС. Для одномерных
-процессов существуют итеративные методы\nbsp{}cite:box1976time, однако они расходятся
-в трехмерном случае.
-
-Последней проблемой, которая описана в разделе [[#sec:how-to-mix-ARMA]], является
-невозможность "смешать" процесс АР и СС в трех измерениях.
-
-Практика показывает, что некоторые утверждения авторов\nbsp{}cite:box1976time не
-выполняются для трехмерной модели АРСС. Например, авторы утверждают, что АКФ
-процесса СС обрывается на отсчете \(q\), а АКФ процесса АР затухает на
-бесконечности, однако, на практике при использовании слабо затухающей и
-обрывающейся на отсчете \(q\) АКФ для трехмерного процесса СС получается
-необратимый процесс СС и реализация, не соответствующая реальным морским
-волнам, в то время как при использовании той же самой АКФ для трехмерного
-процесса АР получается стационарный обратимый процесс и адекватная реализация.
-Также, авторы утверждают, что первые \(q\) точек АКФ смешанного процесса
-необходимо выделить процессу СС (поскольку он обычно используется для описания
-пиков АКФ) и отдать остальные точки процессу АР, однако, на практике в случае
-АКФ прогрессивной волны процесс АР стационарен только для начального временного
-среза АКФ, а остальные точки отдаются процессу СС.
-
-Суммируя вышесказанное, наиболее разработанным сценарием применения модели АРСС
-для генерации взволнованной морской поверхности является использование процесса
-АР для стоячих волн и процесса СС для прогрессивных волн. Смешанный процесс АРСС
-может сделать модель более точной при условии наличия соответствующих формул
-пересчета коэффициентов, что является целью дальнейших исследований.
-
-** Моделирование нелинейности морских волн
-Модель АРСС позволяет учесть асимметричность распределения волновых аппликат,
-т.е. генерировать морские волны, закон распределения аппликат которых имеет
-ненулевой эксцесс и асимметрию. Такой закон распределения характерен для реальных
-морских волн\nbsp{}cite:longuet1963nonlinear.
-
-Асимметричность волн моделируется с помощью нелинейного безынерционного
-преобразования (НБП) случайного процесса, однако, любое нелинейное
-преобразование случайного процесса приводит к преобразованию его АКФ. Для того
-чтобы подавить этот эффект, необходимо предварительно преобразовать АКФ, как
-показано в\nbsp{}cite:boukhanovsky1997thesis.
-
-**** Преобразование взволнованной поверхности.
-Формула \(z=f(y)\) преобразования взволнованной поверхности к необходимому
-одномерному закону распределения \(F(z)\) получается путем решения нелинейного
-трансцендентного уравнения \(F(z) = \Phi(y)\), где \(\Phi(y)\)\nbsp{}--- функция
-одномерного нормального закона распределения. Поскольку функция распределения
-аппликат морских волн часто задается некоторой аппроксимацией, основанной на
-натурных данных, то это уравнение целесообразно решать численно в каждой точке
-\(y_k|_{k=0}^N\) сетки сгенерированной поверхности относительно \(z_k\). Тогда
-уравнение запишется в виде
-\begin{equation}
-    \label{eq-distribution-transformation}
-    F(z_k)
-    =
-    \frac{1}{\sqrt{2\pi}}
-    \int\limits_0^{y_k} \exp\left[ -\frac{t^2}{2} \right] dt
-    .
-\end{equation}
-Поскольку функции распределения монотонны, для решения этого уравнения
-используется простейший численный метод половинного деления (метод бисекции).
-
-**** Предварительное преобразование АКФ.
-Для преобразования АКФ \(\gamma_z\) процесса ее необходимо разложить в ряд по
-полиномам Эрмита (ряд Грама---Шарлье)
-\begin{equation*}
-    \gamma_z \left( \vec u \right)
-    =
-    \sum\limits_{m=0}^{\infty}
-    C_m^2 \frac{\gamma_y^m \left( \vec u \right)}{m!},
-\end{equation*}
-где
-\begin{equation*}
-    C_m = \frac{1}{\sqrt{2\pi}}
-  \int\limits_{-\infty}^\infty
-    f(y) H_m(y) \exp\left[ -\frac{y^2}{2} \right] dy,
-\end{equation*}
-\(H_m\)\nbsp{}--- полином Эрмита, а \(f(y)\)\nbsp{}--- решение уравнения
-eqref:eq-distribution-transformation. Воспользовавшись полиномиальной
-аппроксимацией \(f(y) \approx \sum\limits_i d_i y^i\) и аналитическими выражениями
-для полиномов Эрмита, формулу определения коэффициентов можно упростить,
-используя следующее равенство:
-\begin{equation*}
-    \frac{1}{\sqrt{2\pi}}
-    \int\limits_{-\infty}^\infty
-    y^k \exp\left[ -\frac{y^2}{2} \right] dy
-    =
-    \begin{cases}
-        (k-1)!! & \text{для четных }k,\\
-        0       & \text{для нечетных }k.
-    \end{cases}
-\end{equation*}
-Оптимальное количество коэффициентов \(C_m\) определяется путем вычисления их
-последовательно и критерий прекращения счета определяется совпадением дисперсий
-обоих полей с требуемой точностью \(\epsilon\):
-\begin{equation*}
-    \left| \Var{z} - \sum\limits_{k=0}^m
-    \frac{C_k^2}{k!} \right| \leq \epsilon.
-\end{equation*}
-
-В\nbsp{}cite:boukhanovsky1997thesis автор предлагает использовать полиномиальную
-аппроксимацию для \(f(y)\) также для преобразования поверхности, однако на
-практике в реализации взволнованной поверхности часто находятся точки,
-выпадающие за промежуток, на котором построена аппроксимация, что приводит к
-резкому уменьшению ее точности. В этих точках уравнение
-eqref:eq-distribution-transformation эффективнее решать методом бисекции.
-Использование полиномиальной аппроксимации в формулах для коэффициентов ряда
-Грама---Шарлье не приводит к аналогичным ошибкам.
-
-** Определение поля давлений под дискретно заданной взволнованной поверхностью
-Аналитические решения граничных задач для классических уравнений часто
-используются для исследования различных свойств уравнений, и для таких
-исследований запись формулы общего решения неудобна ввиду своей сложности и
-наличия интегралов от неизвестных функций. Одним из методов нахождения
-аналитических решений ДУЧП является метод Фурье. Основой метода служит
-преобразование Фурье, применение которого к любому ДУЧП позволяет свести его к
-алгебраическому, а его решение записывается как обратное преобразование Фурье от
-некоторой функции (которая может содержать преобразования Фурье от других
-функций). Поскольку эти преобразования не всегда можно записать аналитически, то
-вместо этого ищутся частные решения задачи и анализируется их поведение в
-различных областях. В то же время, вычисление дискретных преобразований Фурье на
-компьютере возможно для любой дискретно заданной функции и эффективно при
-использовании алгоритмов БПФ. Эти алгоритмы используют симметрию комплексных
-экспонент для понижения асимптотической сложности с \(\mathcal{O}(n^2)\) до
-\(\mathcal{O}(n\log_{2}n)\). Таким образом, даже если общее решение содержит
-преобразования Фурье от неизвестных функций, они все равно могут быть взяты
-численно, а использование алгоритмов БПФ делает этот подход эффективным.
-
-Альтернативным подходом является сведение их к разностным уравнениям, решаемым с
-помощью построения различных численных схем. При этом решение получается
-приближенным, а асимптотическая сложность соответствующих алгоритмов сопоставима
-со сложностью алгоритма БПФ. Например, стационарное эллиптическое уравнение в
-частных производных преобразуется в неявную разностную схему, решаемую
-итерационным методом, на каждом шаге которого ищется решение трехдиагональной
-или пятидиагональной СЛАУ методом прогонки (алгоритм Томаса). Асимптотическая
-сложность алгоритма составляет \(\mathcal{O}({n}{m})\), где \(n\)\nbsp{}--- количество
-точек на сетке взволнованной поверхности, \(m\)\nbsp{}--- число итераций. Несмотря на
-широкое распространение, итеративные алгоритмы неэффективно отображаются на
-архитектуру параллельных машин; в частности, отображение на сопроцессоры может
-включать в себя копирование данных на сопроцессор и обратно на каждой итерации,
-что отрицательно сказывается на их производительности. В то же время, наличие
-большого количества преобразований Фурье в решении является скорее
-преимуществом, чем недостатком. Во-первых, решения, полученные с помощью метода
-Фурье, явные, а значит хорошо масштабируются на большое количество параллельно
-работающих вычислительных ядер с использованием простейших приемов параллельного
-программирования. Во-вторых, для алгоритмов БПФ существуют готовые
-оптимизированные реализации для различных архитектур процессоров и сопроцессоров
-(GPU, MIC). Эти преимущества обусловили выбор метода Фурье в качестве рабочего
-для получения явного аналитического решения задачи определения давлений под
-взволнованной морской поверхностью.
-
-*** Двухмерное поле скоростей
-:PROPERTIES:
-:CUSTOM_ID: sec:pressure-2d
-:END:
-**** Формула для жидкости бесконечной глубины.
-Задача Робена для уравнения Лапласа в двух измерениях записывается как
-\begin{align}
-    \label{eq-problem-2d}
-    & \phi_{xx}+\phi_{zz}=0,\\
-    & \zeta_t + \zeta_x\phi_x = \frac{\zeta_x}{\sqrt{1 + \zeta_x^2}} \phi_x - \phi_z, & \text{на }z=\zeta(x,t).\nonumber
-\end{align}
-Для ее решения воспользуемся методом Фурье. Возьмем преобразование Фурье от
-обеих частей уравнения Лапласа и получим
-\begin{equation*}
-    -4 \pi^2 \left( u^2 + v^2 \right)
-    \FourierY{\phi(x,z)}{u,v} = 0,
-\end{equation*}
-откуда имеем \(v = \pm i u\). Здесь и далее будет использоваться следующая
-симметричная форма преобразования Фурье:
-\begin{equation*}
-    \FourierY{f(x,y)}{u,v} =
-    \iint\limits_{-\infty}^{\phantom{--}\infty}
-    f(x,y)
-    e^{-2\pi i (x u + y v)}
-    dx dy.
-\end{equation*}
-Решение уравнения будем искать в виде обратного преобразования Фурье
-\(\phi(x,z)=\InverseFourierY{E(u,v)}{x,z}\). Подставляя[fn::Выражение \(v={-i}{u}\)
-не подходит в данной задаче, поскольку потенциал скорости должен стремиться к
-нулю с увеличением глубины до бесконечности.] \(v={i}{u}\) в формулу, решение
-перепишется как
-\begin{equation}
-    \label{eq-guessed-sol-2d}
-    \phi(x,z) = \InverseFourierY{e^{2\pi u z}E(u)}{x}.
-\end{equation}
-Для того чтобы подстановка \(z=\zeta(x,t)\) не помешала использованию
-преобразований Фурье в решении, перепишем eqref:eq-guessed-sol-2d в виде
-свертки:
-\begin{equation*}
-    \phi(x,z)
-    =
-    \Fun{z}
-    \ast
-    \InverseFourierY{E(u)}{x},
-\end{equation*}
-где \(\Fun{z}\)\nbsp{}--- некоторая функция, вид которой будет определен в
-[[#sec:compute-delta]] и для которой выполняется соотношение
-\(\FourierY{\Fun{z}}{u}=e^{2\pi{u}{z}}\). Подставляя выражение для \(\phi\) в
-граничное условие, получим
-\begin{equation*}
-    \zeta_t
-    =
-    \left( i f(x) - 1 \right)
-    \left[
-        \Fun{z}
-        \ast
-        \InverseFourierY{2\pi u E(u)}{x}
-    \right],
-\end{equation*}
-где \(f(x) = {\zeta_x}/{\sqrt{1 + \zeta_x^2}} - \zeta_x\). Применяя преобразование
-Фурье к обеим частям, получаем выражение для коэффициентов \(E\):
-\begin{equation*}
-    E(u) =
-    \frac{1}{2\pi u}
-    \frac{
-    \FourierY{\zeta_t / \left(i f(x) - 1\right)}{u}
-    }{
-    \FourierY{\Fun{z}}{u}
-    }
-\end{equation*}
-Выполняя подстановку \(z=\zeta(x,t)\) и подставляя полученное выражение в
-eqref:eq-guessed-sol-2d, получаем окончательное выражение для \(\phi(x,z)\):
-\begin{equation}
-    \label{eq-solution-2d}
-    \boxed{
-        \phi(x,z)
-        =
-        \InverseFourierY{
-            \frac{e^{2\pi u z}}{2\pi u}
-            \frac{
-            \FourierY{ \zeta_t / \left(i f(x) - 1\right) }{u}
-            }{
-            \FourierY{ \Fun{\zeta(x,t)} }{u}
-            }
-        }{x}.
-    }
-\end{equation}
-
-Множитель \(e^{2\pi u z}/(2\pi u)\) делает график функции, от которой берется
-обратное преобразование Фурье, несимметричным относительно оси \(OY\). Это
-затрудняет применение БПФ, поскольку оно требует периодичную функцию, которая на
-концах промежутка принимает нулевое значение. Использование численного
-интегрирования вместо БПФ не позволит получить преимущество над решением всей
-системы уравнений с помощью разностных схем. Эту проблему можно обойти,
-используя формулу eqref:eq-solution-2d-full для жидкости конечной глубины с
-заведомо большим значением глубины водоема \(h\). Вывод формулы дан в следующем
-разделе.
-
-**** Формула для жидкости конечной глубины.
-На дне водоема вертикальная составляющая скорости перемещения жидкости должна
-равняться нулю, т.е. \(\phi_z=0\) на \(z=-h\), где \(h\)\nbsp{}--- глубина водоема. В этом
-случае пренебречь равенством \(v = -i u\), полученным из уравнения Лапласа,
-нельзя, и решение ищется в виде
-\begin{equation}
-    \phi(x,z)
-    =
-    \InverseFourierY{
-        \left( C_1 e^{2\pi u z} + C_2 e^{-2\pi u z} \right)
-        E(u)
-    }{x}.
-    \label{eq-guessed-sol-2d-full}
-\end{equation}
-Подставляя \(\phi\) в условие на дне водоема, получим
-\begin{equation*}
-    C_1 e^{-2\pi u h} - C_2 e^{2\pi u h} = 0,
-\end{equation*}
-откуда имеем \(C_1=\frac{1}{2}C{e}^{2\pi{u}{h}}\) и
-\(C_2=\frac{1}{2}C{e}^{-2\pi{u}{h}}\). Константа \(C\) здесь произвольна, поскольку
-при подстановке станет частью неизвестных коэффициентов \(E(u)\). Подставляя
-полученные выражения для \(C_1\) и \(C_2\) в eqref:eq-guessed-sol-2d-full, получаем
-выражение
-\begin{equation*}
-    \phi(x,z) = \InverseFourierY{ \Sinh{2\pi u (z+h)} E(u) }{x}.
-\end{equation*}
-Подставляя \(\phi\) в граничное условие на свободной поверхности, получаем
-\begin{equation*}
-    \zeta_t = f(x) \InverseFourierY{ 2\pi i u \Sinh{2\pi u (z+h)} E(u) }{x}
-            - \InverseFourierY{ 2\pi u \SinhX{2\pi u (z+h)} E(u) }{x}.
-\end{equation*}
-Здесь \(\sinh\) и \(\cosh\) дают схожие результаты вблизи свободной поверхности, и,
-поскольку эта область является наиболее интересной с точки зрения практического
-применения, положим \(\Sinh{2\pi{u}(z+h)}\approx\SinhX{2\pi{u}(z+h)}\). Выполняя
-аналогичные предыдущему разделу операции, получаем окончательное выражение для
-\(\phi(x,z)\):
-\begin{equation}
-\boxed{
-    \phi(x,z,t)
-    =
-  \InverseFourierY{
-        \frac{\Sinh{2\pi u (z+h)}}{2\pi u}
-        \frac{
-            \FourierY{ \zeta_t / \left(i f(x) - 1\right) }{u}
-        }{
-            \FourierY{ \FunSecond{\zeta(x,t)} }{u}
-        }
-    }{x},
-}
-    \label{eq-solution-2d-full}
-\end{equation}
-где \(\FunSecond{z}\)\nbsp{}--- некоторая функция, вид которой будет определен в
-[[#sec:compute-delta]] и для которой выполняется соотношение
-\(\FourierY{\FunSecond{z}}{u}=\Sinh{2\pi{u}{z}}\).
-
-**** Сведение к формулам линейной теории волн.
-Справедливость полученных формул проверим, подставив в качестве \(\zeta(x,t)\)
-известные аналитические выражения для плоских волн. Символьные вычисления
-преобразований Фурье в этом разделе производились с помощью пакета Mathematica\nbsp{}cite:mathematica10. В линейной теории широко используется предположение о
-малости амплитуд волн, что позволяет упростить исходную систему уравнений
-eqref:eq-problem-2d до
-\begin{align*}
-    & \phi_{xx}+\phi_{zz}=0,\\
-    & \zeta_t = -\phi_z & \text{на }z=\zeta(x,t),
-\end{align*}
-решение которой запишется как
-\begin{equation*}
-    \phi(x,z,t)
-    =
-    -\InverseFourierY{
-        \frac{e^{2\pi u z}}{2\pi u}
-        \FourierY{\zeta_t}{u}
-    }{x}
-    .
-\end{equation*}
-Профиль прогрессивной волны описывается формулой \(\zeta(x,t)=A\cos(2\pi(kx-t))\).
-Подстановка этого выражения в eqref:eq-solution-2d дает равенство
-\(\phi(x,z,t)=-\frac{A}{k}\sin(2\pi(kx-t))\Sinh{2\pi{k}{z}}\). Чтобы свести его к
-формуле линейной теории волн, представим гиперболический синус в
-экспоненциальной форме и отбросим член, содержащий \(e^{-2\pi{k}{z}}\), как
-противоречащий условию \(\phi\underset{z\rightarrow-\infty}{\longrightarrow}0\).
-После взятия действительной части выражения получится известная формула линейной
-теории \(\phi(x,z,t)=\frac{A}{k}e^{2\pi{k}{z}}\sin(2\pi(kx-t))\). Аналогично,
-предположение о малости амплитуд волн позволяет упростить формулу
-eqref:eq-solution-2d-full до
-\begin{equation*}
-    \phi(x,z,t)
-    =
-    -\InverseFourierY{
-        \frac{\Sinh{2\pi u (z+h)}}{2\pi u \Sinh{2\pi u h}}
-        \FourierY{\zeta_t}{u}
-    }{x}.
-\end{equation*}
-Подстановка формулы для прогрессивной плоской волны вместо \(\zeta(x,t)\) дает
-равенство
-\begin{equation}
-    \label{eq-solution-2d-linear}
-    \phi(x,z,t)=\frac{A}{k}
-    \frac{\Sinh{2 \pi k (z+h)}}{ \Sinh{2 \pi k h} }
-    \sin(2 \pi (k x-t)),
-\end{equation}
-что соответствует формуле линейной теории для конечной глубины.
-
-Различные записи решения уравнения Лапласа, в которых затухающая экспонента
-может встречаться как со знаком "+", так и со знаком "-", могут стать причиной
-разницы между формулами линейной теории и формулами, выведенными в данной работе,
-где вместо \(\sinh\) используется \(\cosh\). Выражение
-\(\frac{\Sinh{2\pi{k}(z+h)}}{\Sinh{2\pi{k}{h}}}\approx\frac{\sinh(2\pi{k}(z+h))}{\sinh(2\pi{k}{h})}\)
-превращается в строгое равенство на поверхности, и разница между правой и левой
-частями увеличивается при приближении к дну водоема (для достаточно большой
-глубины ошибка вблизи поверхности жидкости незначительна). Поэтому для
-достаточно большой глубины можно использовать любую из функций (\(\cosh\) или
-\(\sinh\)) для вычисления потенциала скорости вблизи взволнованной поверхности.
-
-Сведение формул eqref:eq-solution-2d и eqref:eq-solution-2d-full к формулам
-линейной теории волн показывает, что формула eqref:eq-solution-2d для жидкости
-бесконечной глубины не подходит для вычисления потенциала скорости с
-использованием метода Фурье, т.к. не обладает необходимой для преобразования
-Фурье симметрией. Однако, для такого случая можно использовать формулу для
-конечной глубины, полагая \(h\) равным характерному значению глубины исследуемого
-водоема. Для стоячих волн сведение к формулам линейной теории происходит с
-аналогичными предположениями.
-
-*** Трехмерное поле скоростей
-В трех измерениях исходная система уравнений eqref:eq-problem переписывается как
-\begin{align}
-    \label{eq-problem-3d}
-    & \phi_{xx} + \phi_{yy} + \phi_{zz} = 0,\\
-    & \zeta_t + \zeta_x\phi_x + \zeta_y\phi_y
-    =
-    \frac{\zeta_x}{\sqrt{1 + \zeta_x^2}} \phi_x
-    +\frac{\zeta_y}{\sqrt{\vphantom{\zeta_x^2}\smash[b]{1 + \zeta_y^2}}} \phi_y
-    - \phi_z, & \text{на }z=\zeta(x,y,t).\nonumber
-\end{align}
-Для ее решения также воспользуемся методом Фурье. Возьмем преобразование Фурье
-от обеих частей уравнения Лапласа и получим
-\begin{equation*}
-    -4 \pi^2 \left( u^2 + v^2 + w^2 \right)
-    \FourierY{\phi(x,y,z)}{u,v,w} = 0,
-\end{equation*}
-откуда имеем \(w=\pm{i}\sqrt{u^2+v^2}\). Решение уравнения будем искать в виде
-обратного преобразования Фурье \(\phi(x,y,z)=\InverseFourierY{E(u,v,w)}{x,y,z}\).
-Применяя полученное равенство, получаем
-\begin{equation*}
-    \phi(x,y,z) = \InverseFourierY{
-        \left(
-            C_1 e^{2\pi \sqrt{u^2+v^2} z}
-            -C_2 e^{-2\pi \sqrt{u^2+v^2} z}
-        \right)
-        E(u,v)
-    }{x,y}.
-\end{equation*}
-Подставляя \(\phi\) в условие на дне водоема аналогично двухмерному случаю,
-получаем
-\begin{equation}
-    \label{eq-guessed-sol-3d}
-    \phi(x,y,z) = \InverseFourierY{
-        \Sinh{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)
-    }{x,y}.
-\end{equation}
-Подставляя выражение для \(\phi\) в граничное условие, получим
-\begin{equation*}
-    \arraycolsep=1.4pt
-    \begin{array}{rl}
-        \zeta_t = & i f_1(x,y) \InverseFourierY{2 \pi u \Sinh{2\pi \sqrt{u^2+v^2} (z+h)}E(u,v)}{x,y} \\
-        + & i f_2(x,y) \InverseFourierY{2 \pi v \Sinh{2\pi \sqrt{u^2+v^2} (z+h)}E(u,v)}{x,y} \\
-        - & \InverseFourierY{2 \pi \sqrt{u^2+v^2} \Sinh{2\pi \sqrt{u^2+v^2} (z+h)}E(u,v)}{x,y}
-    \end{array}
-\end{equation*}
-где \(f_1(x,y)={\zeta_x}/{\sqrt{1+\zeta_x^2}}-\zeta_x\) и
-\(f_2(x,y)={\zeta_y}/{\sqrt{\vphantom{\zeta_x^2}\smash[b]{1+\zeta_y^2}}}-\zeta_y\).
-Применяя преобразование Фурье к обеим частям, получаем выражение для
-коэффициентов \(E\):
-\begin{equation*}
-    \arraycolsep=1.4pt
-    \begin{array}{rl}
-        \FourierY{\zeta_t}{u,v} = &
-        \FourierY{i f_1(x,y) \InverseFourierY{2 \pi u \Sinh{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)}{x,y}}{u,v}  \\
-        + & \FourierY{i f_2(x,y) \InverseFourierY{2 \pi v \Sinh{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)}{x,y}}{u,v}  \\
-        - & 2 \pi \sqrt{u^2+v^2} \Sinh{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)
-    \end{array}
-\end{equation*}
-Окончательное решение получается при подстановке выражения для \(E(u,v)\)
-в eqref:eq-guessed-sol-3d.
-
-* Численные методы и результаты экспериментов
-** Форма АКФ для разных волновых профилей
-**** Аналитический метод.
-Прямой способ нахождения АКФ, соответствующей заданному профилю морской волны,
-состоит в применении теоремы Винера---Хинчина. Согласно этой теореме
-автокорреляционная функция \(K\) функции \(\zeta\) равна преобразованию Фурье от
-квадрата модуля этой функции:
-\begin{equation}
-  K(t) = \Fourier{\left| \zeta(t) \right|^2}.
-  \label{eq-wiener-khinchin}
-\end{equation}
-Если заменить \(\zeta\) на формулу для волнового профиля, то это выражение даст
-аналитическую формулу для соответствующей АКФ.
-
-Для трехмерного волнового профиля (два пространственных и одно временное
-измерение) аналитическая формула представляет собой многочлен высокой степени, и
-ее лучше всего вычислять с помощью программы для символьных вычислений. Затем,
-для практического применения она может быть аппроксимирована суперпозицией
-экспоненциально затухающих косинусов (именно так выглядит АКФ стационарного
-процесса АРСС\nbsp{}cite:box1976time).
-
-**** Эмпирический метод.
-Впрочем, для трехмерного случая существует более простой эмпирический метод
-нахождения формы АКФ, не требующий использования сложного программного
-обеспечения. Известно, что АКФ, представляющая собой суперпозицию
-экспоненциально затухающих косинусов, является решением уравнения Стокса для
-гравитационных волн\nbsp{}cite:boccotti1983wind. Значит, если в моделируемом морском
-волнении важна только форма волны, а не точные ее характеристики, то заданный
-волновой профиль можно просто домножить на затухающую экспоненту, чтобы получить
-подходящую АКФ. Эта АКФ не отражает параметры волн, такие как высота и период,
-зато это открывает возможность моделировать волны определенных неаналитических
-форм, "рисуя" профиль волны, домножая его на экспоненту и используя
-результирующую функцию в качестве АКФ. Таким образом, эмпирический метод
-неточен, но более простой по сравнению с применением теоремы Винера---Хинчина;
-он, в основном, полезен для тестирования модели АРСС.
-
-**** АКФ стоячей волны.
-Профиль трехмерной плоской стоячей волны задается как
-\begin{equation}
-  \zeta(t, x, y) = A \sin (k_x x + k_y y) \sin (\sigma t).
-  \label{eq-standing-wave}
-\end{equation}
-Найдем АКФ с помощью аналитического метода. Домножив формулу на затухающую
-экспоненту (поскольку преобразование Фурье определено для функции \(f\), для
-которой справедливо \(f\underset{x\rightarrow\pm\infty}{\longrightarrow}0\)),
-получим
-\begin{equation}
-  \zeta(t, x, y) =
-  A
-  \exp\left[-\alpha (|t|+|x|+|y|) \right]
-  \sin (k_x x + k_y y) \sin (\sigma t).
-  \label{eq-decaying-standing-wave}
-\end{equation}
-Затем, применяя трехмерное преобразование Фурье к обеим частям уравнения с
-помощью программы для символьных вычислений, получим многочлен высокой степени,
-который аппроксимируем выражением
-\begin{equation}
-  K(t,x,y) =
-  \gamma
-  \exp\left[-\alpha (|t|+|x|+|y|) \right]
-  \cos \beta t
-  \cos \left[ \beta x + \beta y \right].
-  \label{eq-standing-wave-acf}
-\end{equation}
-Таким образом, после применения теоремы Винера---Хинчина получаем исходную
-формулу, но с косинусами вместо синусов. Это различие важно, поскольку значение
-АКФ в точке \((0,0,0)\) равно дисперсии процесса АРСС, которое при использовании
-синусов было бы неверным.
-
-Если попытаться получить ту же самую формулу с помощью эмпирического метода, то
-выражение eqref:eq-decaying-standing-wave необходимо адаптировать для
-соответствия eqref:eq-standing-wave-acf. Это можно осуществить либо изменением
-фазы синуса, либо заменой синуса на косинус, чтобы сдвинуть максимум функции в
-начало координат.
-
-**** АКФ прогрессивной волны.
-Профиль трехмерной плоской прогрессивной волны задается как
-\begin{equation}
-  \zeta(t, x, y) = A \cos (\sigma t + k_x x + k_y y).
-  \label{eq-propagating-wave}
-\end{equation}
-Для аналитического метода повторение шагов из предыдущих двух параграфов дает
-\begin{equation}
-  K(t,x,y) =
-  \gamma
-  \exp\left[-\alpha (|t|+|x|+|y|) \right]
-  \cos\left[\beta (t+x+y) \right].
-  \label{eq-propagating-wave-acf}
-\end{equation}
-Для эмпирического метода профиль волны можно просто домножить на затухающую
-экспоненту, не изменяя положение максимума АКФ (как это требовалось для стоячей
-волны).
-
-**** Сравнение изученных методов.
-Итого, аналитический метод нахождения АКФ морских волн сводится к следующим
-шагам.
-- Обеспечить затухание выражения для профиля волны на \(\pm\infty\), домножив его
-  на затухающую экспоненту.
-- Взять преобразование Фурье от квадрата модуля получившегося профиля,
-  воспользовавшись программой для символьных вычислений.
-- Аппроксимировать получившийся многочлен подходящим выражением для АКФ.
-
-Два примера этого раздела показывают, что затухающие профили стоячих и
-прогрессивных волн схожи по форме с соответствующими АКФ с тем лишь различием,
-что максимум АКФ должен быть перенесен в начало координат, чтобы сохранить
-дисперсию моделируемого процесса. Применение эмпирического метода нахождения АКФ
-сводится к следующим шагам.
-- Обеспечить затухание выражения для профиля волны на \(\pm\infty\), домножив его
-  на затухающую экспоненту.
-- Перенести максимум получившейся функции в начало координат, используя свойства
-  тригонометрических функций для сдвига фазы.
-
-** Дополнительные формулы, методы и алгоритмы для модели АРСС
-:PROPERTIES:
-:CUSTOM_ID: sec:arma-algorithms
-:END:
-*** Аппроксимация распределения аппликат
-Одним из параметров генератора взволнованной морской поверхности служит функция
-плотности распределения (ФПР) аппликат этой поверхности. Она задается либо
-полиномиальной аппроксимацией натурных данных, либо аналитически.
-
-**** Разложение в ряд Грама---Шарлье.
-В\nbsp{}cite:huang1980experimental было экспериментально показано, что распределение
-аппликат морской поверхности отличается от нормального ненулевым эксцессом и
-асимметрией. В\nbsp{}cite:рожков1996теория показано, что такое распределение
-раскладывается в ряд Грама---Шарлье:
-\begin{align}
-    \label{eq-skew-normal-1}
-    F(z; \gamma_1, \gamma_2) & = \phi(z)
-        - \gamma_1 \frac{\phi'''(z)}{3!}
-        + \gamma_2 \frac{\phi''''(z)}{4!} \nonumber \\
-    & =
-    \frac{1}{2} \text{erf}\left[\frac{z}{\sqrt{2}}\right]
-    -
-    \frac{e^{-\frac{z^2}{2}}}{\sqrt{2\pi}}
-    \left[
-        \frac{1}{6} \gamma_1 \left(z^2-1\right)
-        + \frac{1}{24} \gamma_2 z \left(z^2-3\right)
-    \right]
-    ,\nonumber \\
-    f(z; \gamma_1, \gamma_2) & =
-    \frac{e^{-\frac{z^2}{2}}}{\sqrt{2 \pi }}
-    \left[
-        \frac{1}{6} \gamma_1 z \left(z^2-3\right)
-        + \frac{1}{24} \gamma_2 \left(z^4-6z^2+3\right)
-        +1
-    \right],
-\end{align}
-где \(\phi(z)=\frac{1}{2}\mathrm{erf}(z/\sqrt{2})\), \(\gamma_1\)\nbsp{}--- асимметрия,
-\(\gamma_2\)\nbsp{}--- эксцесс, \(f\)\nbsp{}--- ФПР, \(F\)\nbsp{}--- функция распределения (ФР).
-Согласно\nbsp{}cite:рожков1990вероятностные для аппликат морских волн значение
-асимметрии выбирается на интервале \(0,1\leq\gamma_1\leq{0,52}\), а значение
-эксцесса на интервале \(0,1\leq\gamma_2\leq{0,7}\). Семейство плотностей
-распределения при различных параметрах показано на рис.\nbsp{}[[fig-skew-normal-1]].
-
-#+name: fig-skew-normal-1
-#+begin_src R :file build/skew-normal-1-ru.pdf
-source(file.path("R", "common.R"))
-x <- seq(-3, 3, length.out=100)
-params <- data.frame(
-  skewness = c(0.00, 0.52, 0.00, 0.52),
-  kurtosis = c(0.00, 0.00, 0.70, 0.70),
-  linetypes = c("solid", "dashed", "dotdash", "dotted")
-)
-arma.skew_normal_1_plot(x, params)
-legend(
-  "topleft",
-  mapply(
-    function (s, k) {
-      as.expression(bquote(list(
-        gamma[1] == .(arma.fmt(s, 2)),
-        gamma[2] == .(arma.fmt(k, 2))
-      )))
-    },
-    params$skewness,
-    params$kurtosis
-  ),
-  lty = paste(params$linetypes)
-)
-#+end_src
-
-#+caption: Вид плотности распределения eqref:eq-skew-normal-1 аппликат взволнованной морской поверхности при различных значениях асимметрии \(\gamma_1\) и эксцесса \(\gamma_2\).
-#+label: fig-skew-normal-1
-#+RESULTS: fig-skew-normal-1
-[[file:build/skew-normal-1-ru.pdf]]
-
-**** Асимметричное нормальное распределение.
-Альтернативной аппроксимацией распределения волновых аппликат служит формула
-асимметричного нормального распределения:
-\begin{align}
-    \label{eq-skew-normal-2}
-    F(z; \alpha) & = \frac{1}{2}
-   \mathrm{erfc}\left[-\frac{z}{\sqrt{2}}\right]-2 T(z,\alpha ), \nonumber \\
-    f(z; \alpha) & = \frac{e^{-\frac{z^2}{2}}}{\sqrt{2 \pi }}
-   \mathrm{erfc}\left[-\frac{\alpha z}{\sqrt{2}}\right],
-\end{align}
-где \(T\)\nbsp{}--- функция Оуэна\nbsp{}cite:owen1956tables. Эта формула не позволяет задать
-значения асимметрии и эксцесса по отдельности\nbsp{}--- оба значения регулируются
-параметром \(\alpha\). Преимущество данной формулы лишь в относительной простоте
-вычисления: эта функция встроена в некоторые программы и библиотеки
-математических функций. График функции для разных значений \(\alpha\) представлен
-на рис.\nbsp{}[[fig-skew-normal-2]].
-
-#+name: fig-skew-normal-2
-#+begin_src R :file build/skew-normal-2-ru.pdf
-source(file.path("R", "common.R"))
-x <- seq(-3, 3, length.out=100)
-alpha <- c(0.00, 0.87, 2.25, 4.90)
-params <- data.frame(
-  alpha = alpha,
-  skewness = arma.bits.skewness_2(alpha),
-  kurtosis = arma.bits.kurtosis_2(alpha),
-  linetypes = c("solid", "dashed", "dotdash", "dotted")
-)
-arma.skew_normal_2_plot(x, params)
-legend(
-  "topleft",
-  mapply(
-    function (a, s, k) {
-      as.expression(bquote(list(
-        alpha == .(arma.fmt(a, 2)),
-        gamma[1] == .(arma.fmt(s, 2)),
-        gamma[2] == .(arma.fmt(k, 2))
-      )))
-    },
-    params$alpha,
-    params$skewness,
-    params$kurtosis
-  ),
-  lty = paste(params$linetypes)
-)
-#+end_src
-
-#+caption: Вид плотности распределения eqref:eq-skew-normal-2 волновых аппликат при различных значениях коэффициента асимметрии \(\alpha\).
-#+label: fig-skew-normal-2
-#+RESULTS: fig-skew-normal-2
-[[file:build/skew-normal-2-ru.pdf]]
-
-**** Тестирование.
-Решение уравнения eqref:eq-distribution-transformation с выбранной функцией
-распределения можно произвести либо в каждой точке генерируемой поверхности, что
-даст наиболее точные результаты, либо в каждой точке фиксированной сетки,
-интерполировав решение методом наименьших квадратов (МНК). Во втором случае
-точность будет меньше. Например, интерполяция многочленом 12-го порядка на сетке
-из 500 узлов, построенной на промежутке \(-5\sigma_z\leq{z}\leq{5}\sigma_z\), дает
-погрешность \(\approx{0,43}\cdot10^{-3}\). Увеличение порядка многочлена приводит
-либо к переполнениям при интерполяции МНК, либо к дополнительным коэффициентам
-близким к нулю; увеличение размера сетки влияет на результат незначительно. В
-большинстве случаев трех коэффициентов ряда Грама---Шарлье было достаточно для
-преобразования АКФ; относительная погрешность без интерполяции составляет
-\(10^{-5}\).
-
-*** Алгоритм генерации белого шума
-Чтобы исключить периодичность из сгенерированной моделью ветрового волнения
-реализации взволнованной поверхности, для генерации белого шума нужно
-использовать ГПСЧ с достаточно большим периодом. В качестве такого генератора в
-работе используется параллельная реализация вихря Мерсенна\nbsp{}cite:matsumoto1998mersenne с периодом \(2^{19937}-1\). Это позволяет создавать
-апериодичные реализации взволнованной морской поверхности для любых сценариев
-применения, встречаемых на практике.
-
-Запуск нескольких ГПСЧ с разными начальными состояниями в параллельных потоках
-не гарантирует некоррелированность генерируемых последовательностей
-псевдослучайных чисел, однако, можно воспользоваться алгоритмом динамического
-создания вихрей Мерсенна\nbsp{}cite:matsumoto1998dynamic, чтобы дать такую гарантию.
-Суть алгоритма заключается в поиске таких матриц начальных состояний
-генераторов, которые бы дали максимально некоррелированные последовательности
-псевдослучайных чисел при параллельном запуске нескольких вихрей Мерсенна с
-этими начальными состояниями. Поскольку на поиск начальных состояний можно
-потратить значительное количество процессорного времени, то вектор состояний
-создается предварительно для заведомо большего количества параллельных потоков и
-сохраняется в файл, который впоследствии считывается основной программой перед
-началом генерации белого шума.
-
-*** Алгоритм генерации взволнованной поверхности
-В модели АРСС значение подъема взволнованной поверхности в каждой точке зависит
-от предыдущих по пространству и времени значений, из-за чего в начале реализации
-образуется так называемый /интервал разгона/ (см.\nbsp{}рис.\nbsp{}[[fig-ramp-up-interval]])\nbsp{}---
-промежуток, на котором реализация не соответствует заданной АКФ. Способ решения
-этой проблемы зависит от контекста, в котором происходит моделирование.
-
-Если реализация используется в контексте расчета остойчивости судна без учета
-маневрирования, то интервал никак не повлияет на результаты эксперимента, поскольку
-находится на границе (далеко от исследуемого морского объекта). Если изучается
-остойчивость судна в условиях маневрирования, то интервал проще всего исключить
-из реализации (размер интервала примерно равен числу коэффициентов АР по каждому
-из измерений). Однако, это приводит к потере большого числа точек, поскольку
-исключение происходит по каждому из трех измерений. Альтернативным подходом
-является генерация взволнованной поверхности на интервале разгона моделью ЛХ и
-генерация остальной реализации с помощью модели АРСС.
-
-В алгоритме генерации взволнованной поверхности используется параллелизм по
-данным: реализация делится на равные части, каждая из которых генерируется
-независимо,\nbsp{}--- однако, в начале каждой из частей также присутствует
-интервал разгона. Для его исключения используется метод /сшивания/, часто
-применяемый в обработке цифровых
-сигналов\nbsp{}cite:oppenheim1989discrete,svoboda2011efficient,pavel2013algorithms.
-Суть метода заключается в добавлении интервала, равного по размеру интервалу
-разгона, в конец каждой из частей. Затем взволнованная поверхность генерируется в
-каждой точке каждой из частей (включая добавленный интервал), интервал в конце
-части \(N\) накладывается на интервал разгона в начале части \(N+1\), и значения
-в соответствующих точках складываются.
-
-#+name: fig-ramp-up-interval
-#+begin_src R :file build/ramp-up-interval-ru.pdf
-source(file.path("R", "common.R"))
-arma.plot_ramp_up_interval(label="Интервал разгона")
-#+end_src
-
-#+caption: Интервал разгона в начале оси \(OX\) реализации.
-#+label: fig-ramp-up-interval
-#+RESULTS: fig-ramp-up-interval
-[[file:build/ramp-up-interval-ru.pdf]]
-
-*** Формулы нормировки для потенциалов скоростей
-:PROPERTIES:
-:CUSTOM_ID: sec:compute-delta
-:END:
-
-В решениях eqref:eq-solution-2d и eqref:eq-solution-2d-full двухмерной задачи
-определения поля давлений присутствуют функции
-\(\Fun{z}=\InverseFourierY{e^{2\pi{u}{z}}}{x}\) и
-\(\FunSecond{z}=\InverseFourierY{\Sinh{2\pi{u}{z}}}{x}\), которые могут быть
-записаны аналитически различными выражениями и представляют сложность при
-вычислении на компьютере. Каждая функция\nbsp{}--- это преобразование Фурье от
-линейной комбинации экспонент, которое сводится к плохо определенной дельта
-функции комплексного аргумента (см.\nbsp{}табл.\nbsp{}[[tab-delta-functions]]).
-Обычно такого типа функции записывают как произведение дельта функций от
-действительной и мнимой части, однако, такой подход не работает здесь, поскольку
-взятие обратного преобразования Фурье не даст экспоненту, что сильно исказит
-результирующее поле скоростей. Для получения однозначного аналитического
-выражения можно воспользоваться нормировкой \(1/\Sinh{2\pi{u}{h}}\) (которая
-также включается в выражение для коэффициентов \(E(u)\)). Численные эксперименты
-показывают, что нормировка хоть и позволяет получить адекватное поле скоростей,
-но оно мало отличается от выражений из линейной теории волн, в которых члены с
-\(\zeta\) опускаются.
-
-#+name: tab-delta-functions
-#+caption: Формулы для вычисления \(\Fun{z}\) и \(\FunSecond{z}\) из [[#sec:pressure-2d]], использующие нормировку для исключения неоднозначности определения дельта функции комплексного аргумента.
-#+attr_latex: :booktabs t
-| Функция           | Без нормировки                                               | С нормировкой                                                                                                                          |
-|-------------------+--------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------|
-| \(\Fun{z}\)       | \(\delta (x+i z)\)                                           | \(\frac{1}{2 h}\mathrm{sech}\left(\frac{\pi  (x-i (h+z))}{2 h}\right)\)                                                                |
-| \(\FunSecond{z}\) | \(\frac{1}{2}\left[\delta (x-i z) + \delta (x+i z) \right]\) | \(\frac{1}{4 h}\left[\text{sech}\left(\frac{\pi  (x-i (h+z))}{2 h}\right)+\text{sech}\left(\frac{\pi  (x+i(h+z))}{2 h}\right)\right]\) |
-
-** Верификация модели АРСС
-:PROPERTIES:
-:CUSTOM_ID: sec:verification
-:END:
-
-Для модели АР в
-работах\nbsp{}cite:degtyarev2011modelling,degtyarev2013synoptic,boukhanovsky1997thesis
-экспериментальным путем были верифицированы
-- распределения различных характеристик волн (высоты волн, длины волн, длины
-  гребней, период волн, уклон волн, показатель трехмерности),
-- дисперсионное соотношение,
-- сохранение интегральных характеристик для случая смешанного волнения.
-В данной работе верифицируются как модель АР, так и СС путем сравнения
-распределений различных характеристик волн.
-
-*** Верификация интегральных характеристик взволнованной поверхности
-В\nbsp{}cite:рожков1990вероятностные авторы показывают, что некоторые характеристики
-морских волн (перечисленные в табл.\nbsp{}[[tab-weibull-shape]]) имеют распределение
-Вейбулла, а подъем взволнованной поверхности\nbsp{}--- нормальное распределение. Для
-верификации генерируемых моделями АР и СС реализаций используются спрямленные
-диаграммы (графики, в которых по оси \(OX\) откладываются квантили функции
-распределения, вычисленные аналитически, а по оси \(OY\)\nbsp{}--- вычисленные
-экспериментально). Если экспериментально полученное распределение соответствует
-аналитическому, то график представляет собой прямую линию. Концы графика могут
-отклоняться от прямой линии, поскольку не могут быть надежно получены из
-реализации конечной длины. Различные методы извлечения волн из реализации также
-могут привести к вариациям на концах графиков: извлечь каждую волну из
-реализации практически невозможно, поскольку волны могут накладываться (и часто
-накладываются) друг на друга.
-
-#+name: tab-weibull-shape
-#+caption: Значение коэффициента формы \(k\) распределения Вейбулла для различных характеристик волн.
-#+attr_latex: :booktabs t
-| Характеристика          | Коэффициент формы \(k\) |
-|-------------------------+-----------------------|
-| Высота волны            |                     2 |
-| Длина волны             |                   2,3 |
-| Длина гребня волны      |                   2,3 |
-| Период волны            |                     3 |
-| Уклон волны             |                   2,5 |
-| Показатель трехмерности |                   2,5 |
-
-Верификация производится для стоячих и прогрессивных волн. Соответствующие АКФ и
-спрямленные диаграммы распределений характеристик волн представлены на рис.
-[[acf-slices]], [[standing-wave-distributions]], [[propagating-wave-distributions]].
-
-#+name: propagating-wave-distributions
-#+begin_src R :file build/propagating-wave-qqplots-ru.pdf
-source(file.path("R", "common.R"))
-par(pty="s", mfrow=c(2, 2))
-arma.qqplot_grid(
-  file.path("build", "propagating_wave"),
-  c("elevation", "heights_y", "lengths_y", "periods"),
-  c("подъем", "высота по Y", "длина по Y", "период"),
-  xlab="x",
-  ylab="y"
-)
-#+end_src
-
-#+caption: Спрямленные диаграммы для прогрессивных волн.
-#+label: propagating-wave-distributions
-#+RESULTS: propagating-wave-distributions
-[[file:build/propagating-wave-qqplots-ru.pdf]]
-
-#+name: standing-wave-distributions
-#+begin_src R :file build/standing-wave-qqplots-ru.pdf
-source(file.path("R", "common.R"))
-par(pty="s", mfrow=c(2, 2))
-arma.qqplot_grid(
-  file.path("build", "standing_wave"),
-  c("elevation", "heights_y", "lengths_y", "periods"),
-  c("подъем", "высота по Y", "длина по Y", "период"),
-  xlab="x",
-  ylab="y"
-)
-#+end_src
-
-#+caption: Спрямленные диаграммы для стоячих волн.
-#+label: standing-wave-distributions
-#+RESULTS: standing-wave-distributions
-[[file:build/standing-wave-qqplots-ru.pdf]]
-
-#+name: acf-slices
-#+header: :width 6 :height 9
-#+begin_src R :file build/acf-slices-ru.pdf
-source(file.path("R", "common.R"))
-propagating_acf <- read.csv(file.path("build", "propagating_wave", "acf.csv"))
-standing_acf <- read.csv(file.path("build", "standing_wave", "acf.csv"))
-par(mfrow=c(5, 2), mar=c(0,0,0,0))
-for (i in seq(0, 4)) {
-  arma.wavy_plot(standing_acf, i, zlim=c(-5,5))
-  arma.wavy_plot(propagating_acf, i, zlim=c(-5,5))
-}
-#+end_src
-
-#+caption: Временные срезы АКФ для стоячих (слева) и прогрессивных (справа) волн.
-#+label: acf-slices
-#+RESULTS: acf-slices
-[[file:build/acf-slices-ru.pdf]]
-
-Хвосты распределений на рис.\nbsp{}[[propagating-wave-distributions]] отклоняются от
-оригинального распределения для характеристик отдельных волн, поскольку каждую
-волну необходимо извлечь из полученной взволнованной поверхности, чтобы измерить
-ее длину, период и высоту. Алгоритм, который бы гарантировал безошибочное
-извлечение всех волн, неизвестен, поскольку волны могут накладываться (и часто накладываются)
-друг на друга. Правый хвост распределения Вейбулла отклоняется больше, поскольку
-он представляет редко возникающие волны.
-
-Степень соответствия для стоячих волн (рис.\nbsp{}[[standing-wave-distributions]])
-ниже для высот и длин, примерно одинакова для подъема поверхности и выше для
-периодов волн. Более низкая степень соответствия длин и высот может быть
-результатом того, что распределения были получены эмпирически для морских волн,
-которые, в основном, являются прогрессивными, и аналогичные распределения для
-стоячих волн могут отличаться. Более высокая степень соответствия периодов волн
-является следствием того, что периоды стоячих волн извлекаются более точно,
-поскольку волны не перемещаются вне моделируемой области взволнованной
-поверхности. Одинаковая степень соответствия для подъема поверхности получается
-из-за того, что это характеристика поверхности (и соответствующего процесса АР
-или СС), и она не зависит от типа волн.
-
-*** Верификация полей потенциалов скоростей
-:PROPERTIES:
-:CUSTOM_ID: sec:compare-formulae
-:END:
-
-Сравнение полученных общих формул eqref:eq-solution-2d и
-eqref:eq-solution-2d-full с известными формулами линейной теории волн позволяет
-оценить различие между полями скоростей для волн как больших, так и малых
-амплитуд. В общем случае аналитическое выражение для потенциала скорости
-неизвестно даже для плоских волн, поэтому сравнение производится численно. Имея
-в виду выводы раздела [[#sec:pressure-2d]], сравниваются только формулы для случая
-конечной глубины.
-
-**** Отличие от формул линейной теории волн.
-Для того чтобы получить поля потенциалов скоростей, взволнованная морская
-поверхность генерировалась с помощью модели АР с варьированием амплитуды волн. В
-численной реализации волновые числа в преобразованиях Фурье выбирались на
-интервале от \(0\) до максимального волнового числа, определяемого численно из
-полученной взволнованной поверхности. Эксперименты проводились для волн малых и
-больших амплитуд.
-
-Эксперимент показал, что поля потенциалов скоростей, полученные по формуле
-eqref:eq-solution-2d-full для конечной глубины и по формуле
-eqref:eq-solution-2d-linear линейной теории, качественно отличаются
-(см.\nbsp{}рис.\nbsp{}[[fig-potential-field-nonlinear]]). Во-первых, контуры
-потенциала скорости имеют вид затухающей синусоиды, что отличается от овальной
-формы, описываемой линейной теорией волн. Во-вторых, по мере приближения к дну
-водоема потенциал скорости затухает гораздо быстрее, чем в линейной теории, а
-область, где сконцентрирована большая часть энергии волны, еще больше приближена
-к ее гребню. Аналогичный численный эксперимент, в котором из формулы
-eqref:eq-solution-2d-full были исключены члены, которыми пренебрегают в рамках
-линейной теории волн, показал полное соответствие получившихся полей
-потенциалов скоростей (насколько это позволяет сделать машинная точность).
-
-#+name: fig-potential-field-nonlinear
-#+caption: Поле потенциала скорости прогрессивной волны \(\zeta(x,y,t) = \cos(2\pi x - t/2)\). Поле, полученное по формуле eqref:eq-solution-2d-full (сверху) и по формуле линейной теории волн (снизу).
-#+begin_figure
-#+attr_latex: :width 0.47\textwidth
-[[file:graphics/pressure/potential-5.eps]]
-#+attr_latex: :width 0.47\textwidth
-[[file:graphics/pressure/potential-6.eps]]
-#+end_figure
-
-**** Отличие от формул теории волн малой амплитуды.
-Эксперимент, в котором сравнивались поля потенциалов скоростей, полученные
-численно различными формулами, показал, что поля скоростей, полученные по формуле
-eqref:eq-solution-2d-full и формуле для волн малой амплитуды
-eqref:eq-old-sol-2d, сопоставимы для волн малых амплитуд. В этом эксперименте
-использовались две реализации взволнованной морской поверхности, полученные по
-модели АР: одна содержала волны малой амплитуды, другая\nbsp{}--- большой.
-Интегрирование в формуле eqref:eq-solution-2d-full велось по диапазону волновых
-чисел, полученному из морской поверхности. Для волн малой амплитуды обе формулы
-показали сопоставимые результаты (разница в значениях скорости приписывается
-стохастической природе модели АР), в то время как для волн больших амплитуд
-устойчивое поле скоростей дала только формула eqref:eq-solution-2d-full
-(рис.\nbsp{}[[fig-velocity-field-2d]]). Таким образом, общая формула
-eqref:eq-solution-2d-full показывает удовлетворительные результаты, не вводя
-ограничения на амплитуду волн.
-
-#+name: fig-velocity-field-2d
-#+caption: Сравнение полей скоростей на поверхности моря, полученных по общей формуле (\(u_1\)) и формуле для волн малой амплитуды (\(u_2\)). Поле скоростей для поверхности волн малой амплитуды (сверху) и большой амплитуды (снизу).
-#+begin_figure
-[[file:build/low-amp-nocolor.eps]]
-[[file:build/high-amp-nocolor.eps]]
-#+end_figure
-
-*** Нефизическая природа модели
-Благодаря своей нефизической природе модель АРСС не включает в себя понятие
-морской волны; вместо этого она моделирует взволнованную поверхность как единое
-целое. Движения отдельных волн и их форма часто получаются грубыми, а точное
-количество генерируемых волн неизвестно. Несмотря на это, интегральные
-характеристики взволнованной поверхности соответствуют реальным морским волнам.
-
-Теоретически, профили самих морских волн могут быть использованы в качестве АКФ,
-если предварительно обеспечить их экспоненциальное затухание. Это может
-позволить генерировать волны произвольных профилей и является одной из тем
-дальнейших исследований.
-
-* Высокопроизводительный программный комплекс для моделирования морского волнения
-** Модель вычислений
-**** Отображение алгоритма генерации взволнованной поверхности на вычислительную модель.
-Модель АРСС реализована в программном комплексе, работающем по принципу
-вычислительного конвейера, в котором каждое звено применяет некоторую функцию к
-выходным данным предыдущего звена. Звенья конвейера распределяются по узлам
-вычислительного кластера, чтобы сделать возможным параллелизм по операциям, а
-затем данные, перемещающиеся между звеньями конвейера, распределяются между
-ядрами процессора, чтобы сделать возможным параллелизм по данным. На рис.\nbsp{}[[fig-pipeline]] представлена схема конвейера обработки данных, в которой
-прямоугольниками со скругленными углами обозначены звенья конвейера, обычными
-прямоугольниками\nbsp{}--- массивы объектов из предметной области задачи, передаваемые
-от одного звена к другому, а стрелками\nbsp{}--- направление передачи данных.
-Некоторые звенья разделены на /секции/, каждая из которых обрабатывает отдельную
-часть массива. Если звенья соединены без использования /барьера/ (горизонтальная
-или вертикальная полоса), то передача отдельных объектов между такими звеньями
-происходит параллельно с вычислениями, по мере их готовности. Секции работают
-параллельно на нескольких ядрах процессора (нескольких узлах кластера). Таким
-образом, между множеством ядер процессора, секций звеньев конвейера и объектами
-устанавливается сюръективное отображение, т.е. на одном ядре процессора может
-работать несколько секций звеньев конвейера, каждая из которых может
-обрабатывать несколько объектов последовательно, но одна секция не может
-работать сразу на нескольких ядрах, а объект не может обрабатываться сразу
-несколькими секциями конвейера.
-
-#+name: fig-pipeline
-#+begin_src dot :exports results :file build/pipeline-ru.pdf
-digraph {
-
-  node [fontsize=14,margin="0.055,0"]
-  graph [nodesep="0.25",ranksep="0.25",rankdir="TB"]
-  edge [arrowsize=0.66]
-
-  # data
-  subgraph xcluster_linear {
-    label="Линейная модель"
-
-    start [label="",shape=circle,style=filled,fillcolor=black,width=0.23]
-    spectrum [label="S(ω,θ)",shape=box]
-    acf [label="K(i,j,k)",shape=box]
-    phi [label="Φ(i,j,k)",shape=box]
-
-    # transformations
-    fourier_transform [label="Преобразование Фурье",shape=box,style=rounded]
-    solve_yule_walker [label="Решение уравнений\nЮла—Уокера",shape=box,style=rounded]
-
-    subgraph cluster_nonlinear_1 {
-      label="Моделир. нелинейности\l"
-      labeljust=left
-      style=filled
-      color=lightgrey
-      acf2 [label="K*(i,j,k)",shape=box]
-      transform_acf [label="Преобразование АКФ",shape=box,style=rounded]
-    }
-  }
-
-  subgraph xcluster_linear2 {
-
-    eps_parts [label="<e1> ε₁|<e2> ε₂|<e3> …|<e4> εₙ|<e> ε(t,x,y)",shape=record]
-    end [label="",shape=doublecircle,style=filled,fillcolor=black,width=0.23]
-
-    generate_white_noise [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Генерация\lбелого шума",shape=record,style=rounded]
-    generate_zeta [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Генерация частей\lвзволнованной мор-\lской поверхности\l",shape=record,style=rounded]
-
-    zeta_parts [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> Несшитые части реализации",shape=record]
-    overlap_add [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> Сшивание час-\lтей реализации\l",shape=record,style=rounded]
-
-    zeta_parts:g1->overlap_add:g1
-    zeta_parts:g2->overlap_add:g2
-    zeta_parts:g3->overlap_add:g3
-    zeta_parts:g4->overlap_add:g4
-
-    zeta_parts:g2->overlap_add:g1 [constraint=false]
-    zeta_parts:g3->overlap_add:g2 [constraint=false]
-    zeta_parts:g4->overlap_add:g3 [constraint=false]
-
-    overlap_add:g1->zeta2_parts:g1
-    overlap_add:g2->zeta2_parts:g2
-    overlap_add:g3->zeta2_parts:g3
-    overlap_add:g4->zeta2_parts:g4
-
-    zeta2_parts:g1->transform_zeta:g1->zeta3_parts:g1->write_zeta:g1->eps_end
-    zeta2_parts:g2->transform_zeta:g2->zeta3_parts:g2->write_zeta:g2->eps_end
-    zeta2_parts:g3->transform_zeta:g3->zeta3_parts:g3->write_zeta:g3->eps_end
-    zeta2_parts:g4->transform_zeta:g4->zeta3_parts:g4->write_zeta:g4->eps_end
-
-  }
-
-  subgraph part3 {
-
-    zeta2_parts [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> Поверхность с нормаль-\lным законом распреде-\lления\l",shape=record]
-
-    subgraph cluster_nonlinear_2 {
-      label="Моделир. нелинейности\r"
-      labeljust=right
-      style=filled
-      color=lightgrey
-      zeta3_parts [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> ζ(t,x,y)",shape=record]
-      transform_zeta [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Преобразование за-\lкона распределения\lвзволнованной мор-\lской поверхности\l",shape=record,style=rounded]
-    }
-
-    # barriers
-    eps_start [label="",shape=box,style=filled,fillcolor=black,height=0.05]
-    eps_end [label="",shape=box,style=filled,fillcolor=black,height=0.05]
-
-    write_zeta [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Запись готовых\lчастей в файл\l",shape=record,style=rounded]
-  }
-
-  # edges
-  start->spectrum->fourier_transform->acf->transform_acf
-  transform_acf->acf2
-  acf2->solve_yule_walker
-  solve_yule_walker->phi
-  phi->eps_start [constraint=false]
-  eps_start->generate_white_noise:g1
-  eps_start->generate_white_noise:g2
-  eps_start->generate_white_noise:g3
-  eps_start->generate_white_noise:g4
-  generate_white_noise:g1->eps_parts:e1->generate_zeta:g1->zeta_parts:g1
-  generate_white_noise:g2->eps_parts:e2->generate_zeta:g2->zeta_parts:g2
-  generate_white_noise:g3->eps_parts:e3->generate_zeta:g3->zeta_parts:g3
-  generate_white_noise:g4->eps_parts:e4->generate_zeta:g4->zeta_parts:g4
-
-  eps_end->end
-}
-#+end_src
-
-#+caption: Схема конвейера обработки данных, реализующего генерацию взволнованной морской поверхности по АР модели.
-#+label: fig-pipeline
-#+RESULTS: fig-pipeline
-[[file:build/pipeline-ru.pdf]]
-
-Конвейер объектов можно считать развитием модели BSP (Bulk Synchronous
-Parallel)\nbsp{}cite:valiant1990bridging, применяемой в системах обработки
-графов\nbsp{}cite:malewicz2010pregel,seo2010hama. Конвейер позволяет исключить
-глобальную синхронизацию (где это возможно) между последовательно идущими этапами
-вычислений путем передачи данных между звеньями параллельно с вычислениями, в то
-время как в модели BSP глобальная синхронизация происходит после каждого шага.
-
-Конвейер объектов ускоряет программу путем параллельного выполнения блоков кода,
-работающих с разными вычислительными устройствами: в то время как текущая часть
-взволнованной поверхности генерируется на процессоре, предыдущая часть
-записывается на диск. Такой подход позволяет получить ускорение, потому что
-различные вычислительные устройства работают асинхронно, и их параллельное
-использование увеличивает производительность программы.
-
-Поскольку передача данных между звеньями конвейера происходит параллельно с
-вычислениями, то на одном и том же конвейере можно запустить сразу несколько
-копий приложения с разными параметрами (генерировать сразу несколько
-взволнованных морских поверхностей с разными характеристиками). На практике
-оказывается, что высокопроизводительные приложения не всегда загружают
-процессор на 100%, тратя время на синхронизацию параллельных процессов и
-запись данных на диск. Использование конвейера в таком случае позволит на одном
-и том же множестве процессов запустить сразу несколько расчетов и максимально
-эффективно использовать все устройства компьютера. Например, во время записи в
-файл одной задачей может производиться расчет на процессоре другой задачей. Это
-минимизирует время простоя процессора и других устройств компьютера и повышает
-общую пропускную способность кластера.
-
-Конвейеризация шагов программы, которые в противном случае последовательны,
-выгодна не только для кода, работающего с различными устройствами, но и для
-кода, различные ветки которого могут быть запущены на нескольких аппаратных
-потоках одного процессорного ядра, т.е. ветки, осуществляющие доступ к различным
-блокам памяти или использующие смешанную арифметику (целочисленную и с плавающей
-точкой). Ветки кода, которые используют различные модули процессора, являются
-хорошими кандидатами для параллельного запуска на процессорном ядре с
-несколькими аппаратными потоками.
-
-Таким образом, вычислительную модель на основе конвейера можно рассматривать как
-/массивно асинхронную модель/ (bulk-asynchronous model) из-за параллельной
-природы шагов программы. Эта модель является основой модели отказоустойчивости,
-которая будет описана далее.
-
-**** Программная реализация.
-Из соображений эффективности конвейер объектов и методы обеспечения
-отказоустойчивости (которые будут описаны далее) были реализованы во фреймворке
-на языке C++: с точки зрения автора язык C слишком низкоуровневый для написания
-распределенных программ, а использование языка Java влечет за собой накладные
-расходы, и не популярно в высокопроизводительных вычислениях. На данный момент
-фреймворк запускает сервис и приложение в одном процессе. Фреймворк называется
-"Фабрика" и находится на этапе проверки концепции.
-
-**** Обзор вычислительной модели.
-Ключевой особенностью, которая отсутствует в текущих технологиях параллельного
-программирования, является возможность указания иерархических зависимостей между
-параллельными задачами. Когда такая зависимость есть, определить, какая из задач
-должна быть ответственна за повторное выполнение неудавшейся задачи на одном из
-выживших узлов, тривиально. Чтобы повторно выполнить задачу на вершине иерархии,
-создается резервная задача, выполняющаяся на другом узле. Существует ряд систем,
-которые способны выполнять направленные ациклические графы задач параллельно\nbsp{}cite:acun2014charmpp,islam2012oozie, но графы не подходят для определения
-отношений руководитель-подчиненный между задачами, поскольку узел графа может
-иметь несколько родительских узлов.
-
-Основное назначение модели состоит в упрощении разработки распределенных
-приложений для пакетной обработки данных и промежуточного программного
-обеспечения. Основное внимание направлено на обеспечение устойчивости приложений
-к поломкам оборудования, т.е. обеспечение отказоустойчивости и высокой
-доступности, которое прозрачно для программиста. Реализация модели состоит из
-двух слоев: на нижнем слое находятся подпрограммы и классы для приложений,
-работающих на одном узле (без сетевых взаимодействий), на верхнем слое\nbsp{}--- для
-приложений, работающих на произвольном количестве узлов. Модель включает в себя
-два вида сильно связанных друг с другом сущностей\nbsp{}--- /управляющие объекты/ (или
-/ядра/) и /конвейеры/,\nbsp{}--- которые используются совместно для написания
-программы.
-
-Управляющие объекты реализуют логику (порядок выполнения) программы в методах
-~act~ и ~react~ и хранят состояние текущей ветки исполнения. Как логика, так и
-состояние задаются программистом. В методе ~act~ какая-либо функция либо
-вычисляется непосредственно, либо разлагается на вложенные функции
-(представляемые подчиненными управляющими объектами), которые впоследствии
-отправляются на конвейер. В методе ~react~ подчиненные управляющие объекты,
-вернувшиеся с конвейера, обрабатываются их родительским объектом. Вызовы методов
-~act~ и ~react~ производятся асинхронно внутри потоков, присоединенных к
-конвейеру. Для каждого управляющего объекта метод ~act~ вызывается только один
-раз, и для нескольких объектов вызовы происходят параллельно друг другу, в то
-время как метод ~react~ вызывается один раз для каждого подчиненного объекта, и
-все вызовы происходят в одном потоке для предотвращения одновременного изменения
-состояния несколькими потоками (для разных родительских объектов могут
-использоваться разные потоки).
-
-Конвейеры осуществляют асинхронные вызовы методов ~act~ и ~react~, стараясь
-сделать как можно больше вызовов параллельно, учитывая предоставляемый
-платформой параллелизм (количество процессорных ядер на узле и количество узлов
-в кластере). Конвейер включает в себя пул управляющих объектов, содержащий все
-подчиненные объекты, отправленные в него родителями, и пул потоков,
-обрабатывающий эти объекты в соответствии с правилами, описанными в предыдущем
-параграфе. Для каждого устройства используется отдельный конвейер. Существуют
-конвейеры для параллельной обработки, обработки по расписанию (периодические и
-отложенные задачи) и промежуточный конвейер для обработки управляющих объектов
-на узлах кластера (см.\nbsp{}рис.\nbsp{}[[fig-subord-ppl]]).
-
-По принципу работы механизм управляющих объектов и конвейеров напоминает
-механизм работы процедур и стеков вызовов, с тем лишь преимуществом, что методы
-объектов вызываются асинхронно и параллельно друг другу (насколько это позволяет
-логика программы). Поля управляющего объекта\nbsp{}--- это локальные переменные стека,
-метод ~act~\nbsp{}--- это последовательность процессорных инструкций перед вложенным
-вызовом процедуры, а метод ~react~\nbsp{}--- это последовательность инструкций после
-вложенного вызова. Создание и отправка на конвейер подчиненного объекта\nbsp{}--- это
-вложенный вызов процедуры. Наличие двух методов обуславливается асинхронностью
-вложенных вызовов и помогает заменить активное ожидание завершения подчиненных
-объектов пассивным при помощи конвейеров. Конвейеры, в свою очередь, позволяют
-реализовать пассивное ожидание и вызывают правильные методы, анализируя
-внутреннее состояние объектов.
-
-#+name: fig-subord-ppl
-#+begin_src dot :exports results :file build/subord-ppl-ru.pdf
-graph G {
-
-  node [fontname="Old Standard",fontsize=14,margin="0.055,0",shape=box]
-  graph [nodesep="0.25",ranksep="0.25",rankdir="LR"]
-  edge [arrowsize=0.66]
-
-  subgraph cluster_daemon {
-    label="Родительский процесс"
-    style=filled
-    color=lightgrey
-
-    factory [label="Фабрика"]
-    parallel_ppl [label="Параллельный\nконвейер"]
-    io_ppl [label="Конвейер\nввода/вывода"]
-    sched_ppl [label="Конвейер\nдля таймера"]
-    net_ppl [label="Конвейер для\nсетевых устройств"]
-    proc_ppl [label="Конвейер\nдля процессов"]
-
-    upstream [label="Пул потоков\nupstream"]
-    downstream [label="Пул потоков\ndownstream"]
-  }
-
-  factory--parallel_ppl
-  factory--io_ppl
-  factory--sched_ppl
-  factory--net_ppl
-  factory--proc_ppl
-
-  subgraph cluster_hardware {
-    label="Вычислительные устройства"
-    style=filled
-    color=lightgrey
-
-    cpu [label="CPU"]
-    core0 [label="Ядро 0"]
-    core1 [label="Ядро 1"]
-    core2 [label="Ядро 2"]
-    core3 [label="Ядро 3"]
-
-    storage [label="Устройства\nхранения"]
-    disk0 [label="Диск 0"]
-
-    network [label="Сетевые\nкарты"]
-    nic0 [label="СК 0"]
-
-    timer [label="Таймер"]
-
-  }
-
-  core0--cpu
-  core1--cpu
-  core2--cpu
-  core3--cpu
-
-  disk0--storage
-  nic0--network
-
-  parallel_ppl--upstream
-  parallel_ppl--downstream
-
-  upstream--{core0,core1,core2,core3} [style="dashed"]
-  downstream--core0 [style="dashed"]
-
-  io_ppl--core0 [style="dashed"]
-  io_ppl--disk0 [style="dashed"]
-  sched_ppl--core0 [style="dashed"]
-  sched_ppl--timer [style="dashed"]
-  net_ppl--core0 [style="dashed"]
-  net_ppl--nic0 [style="dashed"]
-  proc_ppl--core0 [style="dashed"]
-
-  subgraph cluster_children {
-    style=filled
-    color=white
-
-    subgraph cluster_child0 {
-      label="Дочерний процесс 0"
-      style=filled
-      color=lightgrey
-
-      app0_factory [label="Фабрика"]
-      app0 [label="Конвейер\nдочернего\nпроцесса"]
-    }
-
-#    subgraph cluster_child1 {
-#      label="Дочерний процесс 1"
-#      style=filled
-#      color=lightgrey
-#
-#      app1_factory [label="Фабрика"]
-#      app1 [label="Конвейер\nдочернего процесса"]
-#    }
-  }
-
-  proc_ppl--app0
-#  proc_ppl--app1
-
-  app0_factory--app0 [constraint=false]
-#  app1_factory--app1 [constraint=false]
-
-}
-#+end_src
-
-#+caption: Отображение конвейеров родительского и дочернего процессов на вычислительные устройства. Сплошные линии обозначают агрегацию, пунктирные линии обозначают отображение между логическими и физическими сущностями.
-#+attr_latex: :width \textwidth
-#+label: fig-subord-ppl
-#+RESULTS: fig-subord-ppl
-[[file:build/subord-ppl-ru.pdf]]
-
-**** Основополагающие принципы модели.
-Модель конвейера обработки данных строится по следующим принципам, следование
-которым обеспечивает максимальную эффективность программы.
-- В модели отсутствует понятие сообщения, роль сообщения выполняет сам
-  управляющий объект: он может быть передан по сети на другой узел и получить
-  доступ к полям любого другого управляющего объекта на этом узле. Гарантировать
-  существование такого объекта может только логика программы.
-- Управляющий объект представляет собой /сопрограмму/, которая при вызове
-  отправляется в пул управляющих объектов и затем выполняется планировщиком
-  асинхронно. Тело сопрограммы может содержать произвольное количество вызовов
-  других сопрограмм. Каждый вызов отправляет соответствующую сопрограмму в пул и
-  сразу завершается. Управляющие объекты, находящиеся в пуле, могут быть
-  обработаны в любом порядке; это используется планировщиком для извлечения
-  максимального параллелизма из вычислительной системы путем распределения
-  объектов из пула между доступными узлами кластера и ядрами процессора.
-- Асинхронное выполнение управляющих объектов позволяет избежать явной
-  синхронизации после вызова сопрограммы (отправки объекта в очередь);
-  планировщик возвращает поток управления в родительский управляющий объект
-  каждый раз когда какой-либо его дочерний объект завершает выполнение. Такое
-  взаимодействие превращает сопрограмму в некоторого рода обработчик событий, в
-  котором событием является дочерний объект, а обработчиком\nbsp{}--- родительский.
-- Сопрограмма может взаимодействовать с произвольным количеством управляющих
-  объектов, адреса которых известны; взаимодействие с объектами, осуществляемое
-  вразрез с иерархией, сильно усложняет поток управления, и стек вызовов
-  сопрограмм теряет древовидную структуру. Только логика программы может
-  гарантировать существование в памяти машины двух взаимодействующих объектов.
-  Один из способов обеспечения такой гарантии\nbsp{}--- взаимодействие между
-  вложенными сопрограммами, вызванными из одной родительской сопрограммы.
-  Поскольку такого рода взаимодействие можно осуществить в рамках иерархии через
-  родительскую сопрограмму, его можно считать оптимизацией, позволяющей
-  избавиться от накладных расходов при передаче данных через промежуточный узел.
-  Для программ, логика которых полностью основана на событиях (например, для
-  серверов и программ с графическим интерфейсом), ситуация иная, и такого рода
-  взаимодействия являются основными.
-- Также, взаимодействия, идущие вразрез с иерархией и поверх сети кластера,
-  усложняют разработку алгоритмов обеспечения отказоустойчивости. Гарантировать
-  нахождение определенного управляющего объекта в памяти соседнего узла
-  невозможно, поскольку узел может выйти из строя прямо во время выполнения
-  соответствующей сопрограммы. В результате, при аварийном завершении
-  сопрограммы, все ее вложенные сопрограммы должны быть выполнены заново. Это
-  подталкивает программиста к созданию
-  - глубоких древовидных иерархий сильно связанных управляющих объектов (которые
-    взаимодействуют между собой на одном уровне иерархии), уменьшающих накладные
-    расходы на повторное выполнение сопрограмм;
-  - толстых древовидных иерархий слабо связанных управляющих объектов,
-    обеспечивающих максимальную степень параллелизма.
-  Глубокие иерархии\nbsp{}--- это не только требование технологии, они помогают
-  оптимизировать сетевое взаимодействие большого количества узлов кластера,
-  сводя его к взаимодействию соседних узлов.
-
-Таким образом, управляющие объекты обладают свойствами как сопрограмм, так и
-обработчиков событий одновременно.
-
-** Реализация для систем с общей памятью (SMP)
-**** Алгоритм распределения нагрузки.
-Наиболее простым и широко применяемым подходом к распределению нагрузки на
-вычислительную систему является разбиение данных на равные части (или разбиение
-задачи на однородные подзадачи) с последующим их равномерным распределением
-между отдельными ядрами процессора и узлами кластера, однако такой подход не
-всегда работает эффективно. Во-первых, часто общее количество частей, на которые
-разбиваются входные данные, диктуется не архитектурой и конфигурацией
-вычислительной системы, а самой задачей, и такое распределение не всегда
-эффективно с точки зрения вычислительной машины: количество частей оказывается
-либо слишком большим по сравнению с количеством процессоров, работающих
-параллельно, что ведет к увеличению накладных расходов на обмен данными, либо
-слишком маленьким, что не позволяет использовать все доступные вычислительные
-ядра. Во-вторых, накладываемые решаемой задачей ограничения могут не позволить
-разделить входные данные на равные части, что может стать причиной дисбаланса в
-загрузке ядер процессора. В-третьих, поскольку в вычислительной системе в вычислениях
-участвуют помимо процессора сразу несколько компонент (таких как векторные
-сопроцессоры и устройства хранения), то время решения конкретной задачи зависит
-от производительности всех задействованных устройств. Каким же образом сделать
-алгоритм распределения нагрузки более эффективным, принимая во внимание разный
-размер частей, на которые разделяются входные данные, и учитывая все устройства,
-задействованные в вычислениях?
-
-Алгоритм распределения нагрузки состоит из двух этапов. На первом этапе алгоритм
-размещает часть входных данных (или подзадачу), обернутую в управляющий объект,
-в соответствующем пуле управляющих объектов: для каждого устройства используется
-отдельный пул управляющих объектов и сопряженный с ним пул потоков. На втором
-этапе, управляющий объект извлекается из пула одним из потоков и обрабатывается.
-Благодаря отдельным пулам потоков все устройства работают параллельно, уменьшая
-тем самым время простоя оборудования по сравнению с использованием всех
-устройств из одного потока.
-
-Для того чтобы учесть неоднородность частей, на которые разбиваются входные
-данные, и неоднородность выполняемых задач, необходимо предсказать время
-выполнения каждой из задач. Соответствующее исследование сделано в\nbsp{}cite:degtyarev2016balance, поскольку реализация модели АРСС включает в себя, в
-основном, однородные задачи.
-
-Таким образом, распределение нагрузки осуществляется в два этапа: на первом
-этапе задача в форме управляющего объекта направляется на подходящее устройство,
-а на втором этапе она направляется в один из потоков из соответствующего
-устройству пула. Неоднородность управляющих объектов может быть учтена путем
-предсказания времени их выполнения, однако такие объекты не встречаются в
-реализации модели АРСС.
-
-**** Производительность реализаций на MPI, OpenMP и OpenCL.
-Программная реализация состояла в создании и отладке прототипа программы и в
-последующем написании компоненты виртуального полигона на языке более низкого
-уровня. При этом тесты показали, что одной высокопроизводительной
-многопроцессорной машины достаточно для создания типовых реализаций морского
-волнения. Также использование видеокарт в качестве векторных ускорителей
-эффективно только в случае расчета давлений, в то время как генерация волновой
-поверхности выполняется быстрее на скалярном процессоре.
-
-Создание программной реализации происходило в два этапа: на первом этапе был
-создан и отлажен прототип в программной среде
-Mathematica\nbsp{}cite:mathematica10, а на втором этапе логика программы была
-переписана на более низкоуровневом языке C++, и для получения эффективно
-работающего параллельного кода были проведены эксперименты с рядом библиотек. С
-помощью этих библиотек были реализованы функции генерации взволнованной морской
-поверхности, а также процедура расчета гидродинамических давлений под
-сгенерированной поверхностью. Тестирование производилось на вычислительных
-машинах кластера РЦ ВЦ СПбГУ (см.\nbsp{}табл.\nbsp{}[[tab-autoreg-testbed]]) и
-позволило получить два основных результата. Во-первых, использование видеокарт
-неэффективно при генерации волновой поверхности
-(см.\nbsp{}табл.\nbsp{}[[tab-autoreg-performance]]), что обусловлено сравнительно
-небольшим количеством арифметических операций по отношению к количеству операций
-с памятью устройства, а также отсутствием трансцендентных функций в реализации
-алгоритма. Во-вторых, для генерации одной реализации взволнованной морской
-поверхности одной многопроцессорной машины достаточно для эффективного и
-быстрого решения задачи (см.\nbsp{}рис.\nbsp{}[[fig-autoreg-performance]]). По
-результатам тестирования стандарт OpenMP был выбран в качестве основного, как
-наиболее эффективный и наиболее подходящий для расчетов на многопроцессорной
-системе.
-
-#+name: fig-autoreg-performance
-#+caption: Скорость генерации взволнованной поверхности на многопроцессорной системе для типовых размеров реализации (сверху). Масштабируемость (относительное ускорение при увеличении количества процессоров) программной реализации на многопроцессорной системе для типовых размеров реализации (снизу). Временная протяженность 512 с.
-#+begin_figure
-[[file:graphics/speed.eps]]
-[[file:graphics/speedup.eps]]
-#+end_figure
-
-#+name: tab-autoreg-testbed
-#+caption: Конфигурация оборудования.
-#+attr_latex: :booktabs t
-| Вычислительная машина | HP SL390s G7                           |
-| Процессор             | 2\(\times\)Intel X5650 (всего 12 ядер) |
-| Оперативная память    | 96ГБ RAM                               |
-| Операционная система  | CentOS 5.6 (Linux)                     |
-
-#+name: tab-autoreg-performance
-#+caption: Время (с.) генерации взволнованной морской поверхности различными программными реализациями авторегрессионной модели.
-#+attr_latex: :booktabs t :align cllllll
-|        |     ЛХ |     ЛХ |    ЛХ |     АР |     АР |    АР |
-| Размер | OpenCL | OpenMP |   MPI | OpenCL | OpenMP |   MPI |
-|--------+--------+--------+-------+--------+--------+-------|
-| 400000 |   0.82 |  40.44 | 32.60 |   1.80 |  0.800 | 0.750 |
-| 440000 |   0.90 |  44.59 | 35.78 |   1.92 |  0.100 | 0.930 |
-| 480000 |   0.99 |  48.49 | 38.93 |   2.29 |  0.970 | 0.126 |
-| 520000 |   1.07 |  52.65 | 41.92 |   2.43 |  0.118 | 0.117 |
-| 560000 |   1.15 |  56.45 | 45.00 |   2.51 |  0.117 | 0.161 |
-| 600000 |   1.23 |  60.85 | 48.80 |   2.54 |  0.123 | 0.132 |
-| 640000 |   1.31 |  65.07 | 53.02 |   2.73 |  0.123 | 0.160 |
-| 680000 |   1.40 |  68.90 | 54.92 |   2.80 |  0.138 | 0.136 |
-| 720000 |   1.48 |  72.49 | 58.42 |   2.88 |  0.144 | 0.173 |
-| 760000 |   1.56 |  76.86 | 61.41 |   3.47 |  0.156 | 0.155 |
-| 800000 |   1.64 |  81.03 | 66.42 |   3.25 |  0.166 | 0.174 |
-
-Кроме выбора стандарта параллельных вычислений на время работы программы влияет
-выбор библиотек типовых вычислительных методов, и эффективность этих библиотек
-была показана тестированием их разработчиками. В качестве библиотеки для
-матричных операций (расчета коэффициентов авторегрессионной модели) была выбрана
-GotoBLAS и основанная на ней LAPACK, для непрерывной аппроксимации поля волновых
-чисел использовалась библиотека CGAL\nbsp{}cite:fabri2009cgal и для статистической
-проверки интегральных характеристик реализации взволнованной поверхности
-использовалась библиотека GSL\nbsp{}cite:gsl2008scientific. В случае GotoBLAS
-эффективность библиотеки показана в работах\nbsp{}cite:goto2008high,goto2008anatomy,
-для других библиотек эффективность не является важной, и они были выбраны,
-исходя из удобства их использования.
-
-#+name: tab-arma-libs
-#+caption: Список библиотек, используемых в реализации модели АРСС.
-#+attr_latex: :booktabs t :align lp{0.6\linewidth}
-| Библиотека                                             | Назначение                       |
-|--------------------------------------------------------+----------------------------------|
-| DCMT\nbsp{}cite:matsumoto1998dynamic                         | параллельный ГПСЧ                |
-| Blitz\nbsp{}cite:veldhuizen1997will,veldhuizen2000techniques | многомерные массивы              |
-| GSL\nbsp{}cite:gsl2008scientific                             | вычисление ФПР, ФР, БПФ          |
-|                                                        | проверка стационарности процесса |
-| LAPACK, GotoBLAS\nbsp{}cite:goto2008high,goto2008anatomy     | определение коэффициентов АР     |
-| GL, GLUT\nbsp{}cite:kilgard1996opengl                        | трехмерная визуализация          |
-
-**** Производительность алгоритма распределения нагрузки.
-Программная реализация генерации взволнованной поверхности сбалансирована с
-точки зрения нагрузки на процессорные ядра, однако, как показывают тесты,
-характеризуется высокой нагрузкой на устройства хранения. До проведения
-тестирования генерация взволнованной поверхности была реализована с
-использованием OpenMP для параллельных вычислений, и была переписана с
-использованием POSIX потоков для того чтобы реализовать алгоритм распределения
-нагрузки. Производительность двух реализаций сравнивалась на платформе,
-конфигурация которой приведена в табл.\nbsp{}[[tab-multicore-specs]].
-
-#+name: tab-multicore-specs
-#+caption: Конфигурация многоядерной системы.
-#+attr_latex: :booktabs t
-| Компонента                    | Подробности                      |
-|-------------------------------+----------------------------------|
-| Язык программирования         | C++11                            |
-| Библиотека потоков            | C++11 STL threads                |
-| Библиотека атомарных операций | C++11 STL atomic                 |
-| Подпрограммы замера времени   | ~clock_gettime(CLOCK_MONOTONIC)~ |
-|                               | ~/usr/bin/time -f \%e~           |
-| Компилятор                    | GCC 4.8.2                        |
-| Опции компиляции              | ~-std=c++11 -O2 -march=native~   |
-| Операционная система          | Debian 3.2.51-1 x86_64           |
-| Файловая система              | ext4                             |
-| Процессор                     | Intel Core 2 Quad Q9650          |
-| Частота процессора (ГГц)      | 3.00                             |
-| Количество ядер               | 4                                |
-| Объем оперативной памяти (ГБ) | 8                                |
-| Диск                          | Seagate ST3250318AS              |
-| Скорость диска (об./мин.)     | 7200                             |
-
-Эксперимент состоял в запуске двух программных реализаций на многоядерной
-машине, изменяя размер поверхности. Размер пула потоков процессора и пула
-потоков ввода/вывода оставался неизменным во время эксперимента. Пул потоков
-ввода/вывода состоял из одного потока, а количество потоков процессора равнялось
-количеству физических ядер процессора.
-
-В эксперименте алгоритм распределения нагрузки показал большую эффективность по
-сравнению с реализацией без него. Чем больше размер генерируемой поверхности,
-тем больше разрыв в производительности (рис.\nbsp{}[[fig-factory-performance]]), что
-является следствием наложения вычислительной фазы и фазы вывода данных друг на
-друга (рис.\nbsp{}[[fig-factory-overlap]]). В реализации OpenMP фаза вывода данных
-начинается только тогда, когда заканчивается вычислительная фаза, в то время как
-использование алгоритма распределения нагрузки приводит почти к одновременному
-завершению обеих фаз. Таким образом, /выполнение параллельных изнутри,
-последовательных фаз в режиме конвейера более эффективно, чем их
-последовательное выполнение/, и это позволяет сбалансировать нагрузку на
-различные устройства, задействованные в вычислениях.
-
-#+name: fig-factory-performance
-#+header: :width 5 :height 4
-#+begin_src R :file build/factory-vs-openmp-ru.pdf
-source(file.path("R", "common.R"))
-arma.plot_factory_vs_openmp(
-  xlab="Размер реализации",
-  ylab="Время, с.",
-  power=6
-)
-#+end_src
-
-#+caption: Сравнение производительности реализаций программы на OpenMP и Factory.
-#+label: fig-factory-performance
-#+RESULTS: fig-factory-performance
-[[file:build/factory-vs-openmp-ru.pdf]]
-
-#+name: fig-factory-overlap
-#+header: :width 7 :height 4
-#+begin_src R :file build/factory-vs-openmp-overlap-ru.pdf
-source(file.path("R", "common.R"))
-par(mar=c(5, 6, 0, 1), pty="m")
-arma.plot_factory_vs_openmp_overlap(
-  xlab="Время, с.",
-  labels=c("Factory", "OpenMP"),
-  scale=10**9
-)
-#+end_src
-
-#+caption: Наложение параллельных вычислений на \([G_0,G_1]\) и записи данных на диск на \([W_0,W_1]\). В реализации OpenMP наложение отсутствует.
-#+label: fig-factory-overlap
-#+RESULTS: fig-factory-overlap
-[[file:build/factory-vs-openmp-overlap-ru.pdf]]
-
-Предложенный алгоритм распределения нагрузки на многоядерную систему позволяет
-получить прирост производительности для приложений, считывающих и записывающих
-большой объем данных на диск, но может быть использован также и в других
-случаях. Основная идея алгоритма состоит в определении типа нагрузки и поиске
-подходящего устройства для перенаправления нагрузки на него. Таким образом, любое
-устройство помимо дисков может быть использовано.
-
-** Реализация для систем с распределенной памятью (MPP)
-*** Алгоритм обнаружения узлов кластера
-:PROPERTIES:
-:CUSTOM_ID: sec:node-discovery
-:END:
-
-Многие распределенные системы построены по принципу /субординации/: в каждом
-кластере выбирается главный (руководящий) узел, который управляет очередью
-задач, планирует их запуск на подчиненных узлах и следит за их состоянием. Роль
-главного узла задается либо /статически/, путем выделения конкретного
-физического узла под нее, либо /динамически/, путем избрания какого-либо из
-узлов кластера главным. В первом случае отказоустойчивость обеспечивается
-посредством резервирования дополнительного свободного узла, который выполнит
-роль главного в случае отказа текущего. Во втором случае отказоустойчивость
-обеспечивается выбором нового главного узла из оставшихся. Несмотря на то что
-динамическое задание ролей требует наличия специализированного распределенного
-алгоритма, этот подход становится все более и более популярным, поскольку не
-требует наличия простаивающих резервных узлов на случай отказа главного узла.
-
-Алгоритмы выбора лидера (которые иногда называют алгоритмами /распределенного
-консенсуса/) являются частными случаями волновых алгоритмов. В\nbsp{}cite:tel2000introduction Тель определяет их как алгоритмы, в которых событие
-завершения программы предваряется хотя бы одним каким-либо другим событием,
-происходящим в /каждом/ параллельном процессе. Волновые алгоритмы не определены
-для анонимных сетей, т.е. они работают только с теми параллельными процессами,
-которые могут себя уникально идентифицировать. Однако количество процессов,
-которые затрагивает "волна", может быть определено по мере выполнения алгоритма.
-В рамках распределенных систем это означает, что волновые алгоритмы подходят для
-вычислительных кластеров с динамически меняющимся количеством узлов, так что
-включение и выключение отдельных узлов не влияет на работу алгоритма.
-
-Подход к динамическому выбору главного узла, исследованный в данной работе, не
-использует волновые алгоритмы, а значит не требует опроса всех узлов кластера
-для выбора лидера. Вместо этого каждый узел кластера нумерует все узлы подсети,
-в которой он находится, и преобразует список в /древовидную иерархию/ с заданным
-максимальным значением ветвления (максимальным количеством подчиненных вершин).
-Затем узел определяет свой уровень иерархии и пытается соединиться с
-вышестоящими узлами, чтобы стать их подчиненным. Сначала он проверяет близко
-расположенные к нему узлы, а потом все остальные узлы вплоть до вершины
-иерархии. Если вышестоящих узлов нет или с ними невозможно соединиться, то узел
-сам становится главой иерархии.
-
-Древовидная иерархия узлов подсети определяет отношение строгого порядка на
-множестве всех узлов кластера. Несмотря на то что с технической точки зрения
-любая функция может быть выбрана для присвоения узлу подсети номера в списке, на
-практике эта функция должна быть достаточно гладкой вдоль временной оси и иметь
-лишь редкие скачки: быстрые изменения в структуре иерархии узлов (которые часто
-являются следствием погрешности измерений) могут привести к постоянной передаче
-роли главного узла от одного узла к другому, что сделает кластер неуправляемым.
-Простейшей такой функцией является позиция IP-адреса узла в диапазоне всех
-IP-адресов подсети.
-
-Следующие ключевые особенности отличают наш подход от некоторых предложенных
-ранее подходов\nbsp{}cite:brunekreef1996design,aguilera2001stable,romano2014design.
-- *Многоуровневая иерархия.* Количество руководящих узлов в сети зависит от
-  значения ветвления. Если оно меньше количества IP-адресов в подсети, то в
-  кластере будет несколько руководящих узлов. Если оно больше или равно
-  количеству IP-адресов в подсети, то в кластере будет только один руководящий
-  узел. Когда какой-либо узел выходит из строя, многоуровневая иерархия
-  изменяется локально, только узлы, примыкающие к вышедшему из строя,
-  взаимодействуют друг с другом.
-- *Отображение IP-адресов.* Поскольку структура иерархии зависит только от
-  IP-адресов узлов, то в алгоритме отсутствует фаза выбора лидера. Чтобы сменить
-  руководителя, каждый узел отправляет сообщение только прежнему и новому
-  руководителю.
-- *Полностью основан на событиях.* Сообщения отправляются только при выходе из
-  строя узла, поэтому постоянной нагрузки на сеть нет. Поскольку алгоритм
-  допускает ошибку при отправке любого сообщения, то нет необходимости в
-  heartbeat-пакетах, являющихся индикацией нахождения узла в сети; вместо этого
-  все сообщения выполняют роль heartbeat-пакетов и настраивается время ожидания
-  отправки пакета.
-- *Отсутствие ручной конфигурации.* Узлу не требуется никаких предварительных
-  знаний, чтобы найти руководителя: он определяет сеть, узлом которой он
-  является, вычисляет IP-адрес потенциального руководителя и отправляет ему
-  сообщение. Если это не срабатывает, то процесс повторяется для следующего
-  потенциального руководителя. Таким образом, алгоритм подходит для начальной
-  загрузки кластера без ручной настройки, для этого требуется только запустить
-  соответствующий сервис на каждом узле.
-Суммируя вышесказанное, достоинством алгоритма является то, что он
-- масштабируется на большое количество узлов посредством иерархии с несколькими
-  руководящими узлами,
-- не нагружает сеть отправкой сообщений с текущим состоянием узлов и
-  heartbeat-пакетами,
-- не требует ручной настройки для первичной загрузки кластера.
-
-Недостатком алгоритма является то, что он требует редкого изменения IP-адресов.
-Он не подходит для облачной среды, в которой только DNS имя узла сохраняется, а
-IP-адрес может меняться со временем. Когда IP-адрес меняется, текущие соединения
-могут закрыться, сигнализируя о "выходе из строя" узла и перестраивая иерархию
-узлов. Таким образом, окружения, в которых узлы не идентифицируются IP-адресами,
-не подходят для алгоритма.
-
-Другим недостатком алгоритма является искусственная зависимость ранга узла от
-IP-адреса: замена отображения IP-адресов на что-то более совершенное (например,
-на отображение, которое использует загрузку текущего узла и сети для
-ранжирования узлов) представляет сложность, поскольку погрешность измерений
-может стать причиной неустойчивой иерархии, а полная событийность алгоритма
-будет нарушена.
-
-Алгоритм обнаружения узлов спроектирован для балансировки нагрузки на кластер
-вычислительных узлов, и его применение в других приложениях не рассматривается в
-данной работе. Когда распределенная или параллельная программа запускается на
-одном из узлов кластера, ее подзадачи распределяются между всеми примыкающими
-узлами иерархии (включая главный узел, если есть). Для того чтобы равномерно
-распределить нагрузку, когда программа запускается на подчиненном узле, каждый
-узел хранит вес каждого из примыкающих узлов иерархии. Вес равен количеству
-узлов дерева, находящегося "за" примыкающим узлом. Например, если вес первого
-примыкающего узла равен 2, то циклический алгоритм балансировки нагрузки
-распределит две подзадачи на первый узел перед тем как перейти к следующему
-узлу.
-
-Суммируя вышесказанное, алгоритм обнаружения узлов
-- спроектирован для облегчения распределения нагрузки на кластер,
-- полностью отказоустойчивый, состояние каждого узла можно вычислить заново в
-  любой момент времени,
-- полностью основан на событиях, а значит не нагружает сеть периодической
-  отправкой сообщений.
-
-**** Построение древовидной иерархии.
-Отношение строгого порядка на множестве \(\mathcal{N}\) узлов одной подсети
-определяется как
-\begin{equation*}
-  \forall n_1 \forall n_2 \in \mathcal{N},
-  \forall f \colon \mathcal{N} \rightarrow \mathcal{R}^n
-  \Rightarrow (f(n_1) < f(n_2) \Leftrightarrow \neg (f(n_1) \geq f(n_2))),
-\end{equation*}
-где \(f\)\nbsp{}--- отображение узла на его ранг, а \(<\)\nbsp{}--- оператор, определяющий
-отношение строгого порядка на множестве \(\mathcal{R}^n\). Функция \(f\) присваивает
-узлу порядковый номер, а оператор \(<\) делает этот номер уникальным.
-
-Простейшее отображение \(f\) ставит в соответствие каждому узлу подсети позицию
-его IP-адреса в диапазоне всех адресов подсети. Без преобразования к древовидной
-иерархии (когда в подсети выбирается только один лидер) рабочий узел, адрес
-которого занимает наименьшую позицию в диапазоне, становится руководящим. Если
-адрес узла занимает первую позицию в диапазоне, то для него невозможно выбрать
-лидера, и он будет находиться на вершине иерархии вплоть до выхода из строя.
-Несмотря на то что идентификацию узлов на основе их IP-адресов легко реализовать
-в программе, такой подход устанавливает искусственную зависимость роли
-руководителя от IP-адреса узла. Тем не менее, этот подход полезен для первичного
-объединения узлов в кластер, когда более сложные отображения неприменимы.
-
-Для того чтобы алгоритм обнаружения масштабировался на большое количество узлов,
-диапазон IP-адресов подсети отображается на древовидную иерархию. В такой
-иерархии каждый узел определяется уровнем \(l\) иерархии, на котором он
-находится, и отступом \(o\), который равен порядковому номеру узла на его
-уровне. Значения уровня и отступа определяются из следующей задачи оптимизации.
-\begin{equation*}
-    n = \sum\limits_{i=0}^{l(n)} p^i + o(n), \quad
-    l \rightarrow \min, \quad
-    o \rightarrow \min, \quad
-    l \geq 0, \quad
-    o \geq 0
-\end{equation*}
-где \(n\)\nbsp{}--- позиция IP-адреса узла в диапазоне IP-адресов подсети и \(p\)\nbsp{}---
-значение ветвления (максимальное количество подчиненных, которых может иметь
-узел). Руководитель узла на уровне \(l\) с отступом \(o\) имеет уровень \(l-1\) и
-отступ \(\lfloor{o/p}\rfloor\). Расстояние между любыми двумя узлами в иерархии,
-адреса которых занимают позиции \(i\) и \(j\) в диапазоне, определяется как
-\begin{align*}
-    & \langle
-        \text{lsub}(l(j), l(i)), \quad
-        \left| o(j) - o(i)/p \right|
-    \rangle,\\
-    & \text{lsub}(l_1, l_2) =
-    \begin{cases}
-        \infty & \quad \text{if } l_1 \geq l_2, \\
-        l_1 - l_2 & \quad \text{if } l_1 < l_2.
-    \end{cases}
-\end{align*}
-Расстояние является составным, чтобы уровень иерархии учитывался в первую
-очередь.
-
-Для выбора руководителя каждый узел ранжирует все узлы подсети в соответствии с
-их позицией \(\langle{l(n),o(n)}\rangle\) и, используя формулу для определения
-расстояния, выбирает ближайший к потенциальному руководителю узел, имеющий
-наименьший ранг. Это позволяет пропустить IP-адреса выключенных узлов, однако,
-для разреженных сетей (в которых узлы занимают непоследовательные IP-адреса)
-сбалансированность дерева не гарантируется.
-
-Поскольку узлу для выбора руководителя нужно соединиться с узлом, адрес которого
-известен заранее, то алгоритм обнаружения масштабируется на большое количество
-узлов. Соединение с другими узлами из ранжированного списка происходит только в
-том случае, если текущий узел-руководитель выходит из строя. Таким образом, если
-адреса узлов кластера расположены плотно в диапазоне адресов подсети, каждый
-узел устанавливает соединение только со своим руководителем, и
-неэффективного сканирования всей сети каждым узлом не происходит.
-
-**** Результаты тестирования.
-Платформа, на которой осуществлялось тестирование, представляла собой несколько
-многоядерных узлов, поверх которых с помощью пространств имен Linux
-развертывался виртуальный кластер из заданного количества узлов. Похожий подход
-используется в\nbsp{}cite:lantz2010network,handigol2012reproducible,heller2013reproducible, где
-авторы воспроизводят разнообразные практические эксперименты на виртуальных
-кластерах и сопоставляют результаты с физическими. Преимущество данного подхода
-заключается в возможности проведения экспериментов на больших виртуальных
-кластерах, используя сравнительно небольшое количество физических узлов. Данный
-подход использовался для тестирования алгоритма обнаружения узлов кластера,
-потому что этот алгоритм обладает низкими требованиями к ресурсам системы
-(процессорному времени и пропускной способности сети).
-
-Производительность алгоритма была протестирована путем измерения времени,
-необходимого для обнаружения всеми узлами кластера друг друга. Каждое изменение
-иерархии (с точки зрения каждого из узлов) записывалось в файл, и по прошествии
-30 секунд все процессы (каждый из которых моделирует один узел кластера) были
-принудительно завершены. Пробные запуски показали, что одновременный запуск более
-100 виртуальных узлов искажал результаты, поэтому для этого эксперимента были
-использованы дополнительные физические узлы, на каждом из которых запускалось по
-100 виртуальных. Эксперимент показал, что обнаружение 100--400 узлами друг друга
-занимает в среднем 1,5 секунды, и это значение ненамного увеличивается с ростом
-количества узлов (см.\nbsp{}рис.\nbsp{}[[fig-bootstrap-local]]). Пример древовидной
-иерархии для 11 узлов с ветвлением равным 2 представлен на
-рис.\nbsp{}[[fig-tree-hierarchy-11]].
-
-#+name: fig-bootstrap-local
-#+caption: Зависимость времени объединения узлов в кластер от их количества.
-[[file:graphics/discovery.eps]]
-
-#+name: fig-tree-hierarchy-11
-#+begin_src dot :exports results :file build/tree-hierarchy-11-ru.pdf
-digraph {
-
-  node [fontsize=14,margin="0.055,0",shape=box,style=rounded]
-  graph [nodesep="0.15",ranksep="0.20",rankdir="BT"]
-  edge [arrowsize=0.66]
-
-  m1 [label="127.0.0.1"]
-  m2 [label="127.0.0.2"]
-  m3 [label="127.0.0.3"]
-  m4 [label="127.0.0.4"]
-  m5 [label="127.0.0.5"]
-  m6 [label="127.0.0.6"]
-  m7 [label="127.0.0.7"]
-  m8 [label="127.0.0.8"]
-  m9 [label="127.0.0.9"]
-  m10 [label="127.0.0.10"]
-  m11 [label="127.0.0.11"]
-
-  m2->m1
-  m3->m1
-  m4->m2
-  m5->m2
-  m6->m3
-  m7->m3
-  m8->m4
-  m9->m4
-  m10->m5
-  m11->m5
-}
-#+end_src
-
-#+caption: Древовидная иерархия для 11 узлов для ветвления равного 2.
-#+label: fig-tree-hierarchy-11
-#+RESULTS: fig-tree-hierarchy-11
-[[file:build/tree-hierarchy-11-ru.pdf]]
-
-*** Алгоритм восстановления после сбоев
-**** Контрольные точки восстановления.
-Сбои узлов распределенной системы можно разделить на два типа: сбой подчиненного
-узла и сбой руководящего узла. Для того чтобы запущенная на кластере задача
-могла пережить сбой подчиненного узла, планировщик задач периодически создает
-для нее контрольные точки восстановления и записывает их в надежное хранилище.
-Для того чтобы создать контрольную точку, планировщик временно останавливает все
-параллельные процессы задачи, копирует все страницы памяти и все структуры ядра
-операционной системы, выделенные для этих процессов, на диск, и продолжает
-выполнение задачи. Для того чтобы пережить сбой руководящего узла, серверный
-процесс планировщика задач непрерывно копирует свое внутреннее состояние на
-резервный узел, который становится руководящим после сбоя.
-
-Оптимизации работы контрольных точек восстановления посвящено большое количество
-работ\nbsp{}cite:egwutuoha2013survey, а альтернативным подходам уделяется меньше
-внимания. Обычно высокопроизводительные приложения используют передачу сообщений
-для обмена данными между параллельными процессами и хранят свое текущее
-состояние в глобальной памяти, поэтому не существует способа перезапустить
-завершившийся процесс, не записав образ всей выделенной для него памяти на диск.
-Обычно общее число процессов фиксировано и задается планировщиком, и в случае
-отказа перезапускаются сразу все процессы. Существуют некоторые обходные
-решения, которые позволяют перезапустить только часть
-процессов\nbsp{}cite:meyer2012radic, восстановив их из контрольной точки на
-выживших узлах, однако это может привести к перегрузке, если на этих узлах уже
-запущены другие задачи. Теоретически, перезапуск процесса необязателен, если
-задача может быть продолжена на выживших узлах, но библиотека передачи сообщений
-не позволяет изменять количество параллельных процессов во время работы
-программы, и большинство программ все равно предполагают, что это значение
-является константой. Таким образом, не существует надежного способа обеспечения
-отказоустойчивости на уровне библиотеки передачи сообщений кроме как путем
-перезапуска всех параллельных процессов из контрольной точки восстановления.
-
-Однако, существует возможность продолжить выполнение задачи на меньшем
-количестве узлов, чем было выделено изначально, реализовав
-отказоустойчивость на уровне приложения. В этом случае роли руководителя и
-подчиненного динамически распределяются между сервисами планировщика задач,
-работающими на каждом узле кластера, образуя древовидную иерархию узлов
-кластера, а параллельная программа состоит из управляющих объектов, использующих
-иерархию узлов для динамического распределения нагрузки и свою собственную
-иерархию для перезапуска управляющих объектов в случае сбоя узла.
-**** Динамическое распределение ролей.
-Отказоустойчивость параллельной программы\nbsp{}--- это одна из проблем, которая
-должна решаться планировщиком задач обработки больших данных или
-высокопроизводительных вычислений, однако, большинство планировщиков
-обеспечивают только отказоустойчивость подчиненных узлов. Такого рода сбои
-обычно обрабатываются путем перезапуска затронутой задачи (из контрольной точки
-восстановления) или ее части на оставшихся узлах, а выход из строя руководящего
-узла считается либо маловероятным, либо слишком сложным для обработки и
-настройки на целевой платформе. Системные администраторы обычно находят
-альтернативы отказоустойчивости на уровне приложения: они изолируют руководящий
-процесс планировщика от остальных узлов кластера, размещая его на специально
-выделенной машине, или, вместо этого, используют технологии виртуализации. Все
-эти альтернативы усложняют конфигурацию и обслуживание, и, уменьшая вероятность
-выхода из строя машины, приводящей к выходу из строя всей системы, увеличивают
-вероятность ошибки оператора.
-
-С этой точки зрения более практично реализовать отказоустойчивость руководящего
-узла на уровне приложения, но не существует общего зарекомендовавшего себя
-решения. Большинство реализаций слишком привязаны к конкретному приложению,
-чтобы стать повсеместно применяемыми. Автор считает, что это происходит из-за
-привычки людей думать о кластере, как о совокупности отдельных машин, каждая из
-которых может быть руководителем или подчиненным, вместо того чтобы думать о
-кластере, как о едином целом, в котором роли руководителя и подчиненного
-динамически распределяются между запущенными на разных узлах процессами.
-
-Понимание того, что кластер тоже является вычислительной машиной, позволяет
-реализовать промежуточное программное обеспечение, которое автоматически
-распределяет роли руководителя и подчиненного и общим способом обрабатывает сбои
-узлов. Это программное обеспечение предоставляет программный интерфейс и
-распределяет управляющие объекты между доступными на данный момент узлами.
-Используя этот интерфейс, можно написать программу, которая запускается на
-кластере, не зная точного количества работающих узлов. Это промежуточное
-программное обеспечение работает как кластерная операционная система в
-пользовательском пространстве, позволяющая писать и запускать распределенные
-приложения прозрачно.
-
-**** Симметричная архитектура.
-Многие распределенные хранилища типа "ключ-значение" и параллельные файловые
-системы имеют симметричную архитектуру, в которой роли руководителя и
-подчиненного распределяются динамически, так что любой узел может выступать в
-роли руководителя, если текущий руководящий узел выходит из строя, однако, такая
-архитектура до сих пор не используется в планировщиках задач обработки больших
-данных и высокопроизводительных вычислений. Например, в планировщике задач
-обработки больших данных YARN, роли руководителя и подчиненного являются
-статическими. Восстановление после сбоя подчиненного узла осуществляется путем
-перезапуска работавшей на нем части задачи на одном из выживших узлов, а
-восстановление после сбоя руководящего узла осуществляется путем установки
-резервного руководящего узла\nbsp{}cite:murthy2011architecture. Оба руководящих
-узла управляются сервисом Zookeeper, который использует динамическое
-распределение ролей для обеспечения своей
-отказоустойчивости\nbsp{}cite:okorafor2012zookeeper. Таким образом, отсутствие
-динамического распределения ролей у планировщика YARN усложняет конфигурацию
-всего кластера: если бы динамические роли были доступны, Zookeeper был бы лишним
-в данной конфигурации.
-
-Такая же проблема возникает в планировщиках задач для высокопроизводительных
-вычислений: руководящий узел (на котором запущен главный процесс планировщика
-задач) является единой точкой сбоя.
-В\nbsp{}cite:uhlemann2006joshua,engelmann2006symmetric авторы реплицируют
-состояние планировщика задач на резервный узел, чтобы обеспечить высокую
-доступность руководящего узла, но роль резервного узла задается статически.
-Такое решение близко к симметричной архитектуре, поскольку не использует внешний
-сервис для обеспечения высокой доступности, но далеко от идеала, в котором
-резервный узел выбирается динамически.
-
-Наконец, наиболее простой вариант высокой доступности руководящего узла
-реализован в протоколе VRRP (Virtual Router Redundancy
-Protocol)\nbsp{}cite:knight1998rfc2338,hinden2004virtual,nadas2010rfc5798.
-Несмотря на то что протокол VRRP предоставляет динамическое распределение ролей,
-он не может быть использован в планировщиках задач, поскольку спроектирован для
-маршрутизаторов, за которыми стоят реверс прокси серверы. В таких серверах
-отсутствует состояние (очередь задач), которое необходимо восстановить после
-выхода из строя узла, поэтому их высокую доступность обеспечить проще. Это может
-быть реализовано даже без маршрутизаторов, используя вместо этого сервис
-Keepalived\nbsp{}cite:cassen2002keepalived.
-
-Симметричная архитектура выгодна для планировщиков задач, поскольку позволяет
-- сделать физические узлы взаимозаменяемыми,
-- реализовать динамическое распределение ролей руководителя и подчиненного и
-- реализовать автоматическое восстановление после сбоя любого из узлов.
-В последующих разделах будут описаны компоненты, необходимые для написания
-параллельной программы и планировщика, которые устойчивы к сбоям узлов кластера.
-
-**** Иерархия управляющих объектов.
-Для распределения нагрузки узлы кластера объединяются в древовидную иерархию
-(см.\nbsp{}раздел [[#sec:node-discovery]]), и нагрузка распределяется между
-непосредственными соседями узла, так что при запуске управляющего объекта на
-подчиненном узле главный узел также получает часть его подчиненных объектов. Это
-делает систему симметричной и легкой в обслуживании: на каждом узле установлен
-один и тот же набор программного обеспечения, что позволяет заменить один узел
-другим при выходе из строя первого. Похожее архитектурное решение используется в
-хранилищах типа ключ-значение\nbsp{}cite:anderson2010couchdb,lakshman2010cassandra для
-обеспечения отказоустойчивости, однако автору неизвестны планировщики задач,
-которые используют данный подход.
-
-В отличие от функции ~main~ в программах на основе библиотеки передачи
-сообщений, первый (главный) управляющий объект выполняется только на одном узле,
-а дополнительные узлы используются по необходимости. Такое решение позволяет
-использовать произвольное количество узлов для запуска задачи и динамически
-менять это количество во время ее выполнения. Похожее решение используется в
-системах обработки больших объемов
-данных\nbsp{}cite:dean2008mapreduce,vavilapalli2013yarn \nbsp{}--- пользователь,
-запускающий задачу на кластере, не указывает количество узлов, фактические
-узлы\nbsp{}--- это узлы, на которых расположены входные файлы.
-
-С математической точки зрения управляющий объект \(K\) может быть определен как
-векторнозначный функционал, отображающий один управляющий объект на
-\(n\)-компонентный вектор управляющих объектов:
-\begin{equation*}
-    K(f): \mathbb{K} \rightarrow \mathbb{K}^n
-    \qquad
-    \mathbb{K}^n = \left\{ f: \mathbb{K} \rightarrow \mathbb{K}^n \right\}.
-\end{equation*}
-Специальный объект \(\mathbb{O}: \mathbb{K} \rightarrow \mathbb{K}^0\)
-используется для остановки рекурсии, и передается в качестве аргумента главному
-управляющему объекту программы. Аргумент управляющего объекта интерпретируется
-следующим образом.
-- Если объект является только что созданным объектом, то аргумент\nbsp{}--- это его
-  родительский объект.
-- В остальных случаях аргументом может являться любой объект (чаще всего
-  дочерний по отношению к текущему).
-
-Объекты обрабатываются в цикле, который начинается с выполнения главного
-объекта, затем внутри главного объекта создаются и асинхронно выполняются другие
-объекты. Цикл продолжается до тех пор, пока какой-либо объект не вернет
-\(\mathbb{O}\). Поскольку вызов функции может породить сразу несколько объектов,
-они выполняются параллельно, что приводит к быстрому заполнению пула объектов.
-Поскольку объекты из пула могут выполняться в произвольном порядке, несколько
-потоков одновременно выбирают объекты для обработки, и при переполнении
-пула объекты могут быть переданы на другие узлы кластера без явного указания в
-исходном коде программы.
-
-Вычислительные объекты реализованы в виде замыканий (функторы в C++)\nbsp{}---
-объектов-функций, которые сохраняют в себе аргументы, ссылку на породивший их
-объект и данные из предметной области задачи. Данные обрабатываются либо при
-выполнении объекта, либо для параллельной обработки создаются подчиненные
-объекты. Когда обработка завершена, родительский объект вызывается с дочерним
-объектом в качестве аргумента для сбора результатов обработки.
-
-**** Обработка выхода узлов из строя.
-Наиболее распространенной стратегией при выходе из строя подчиненного узла
-является перезапуск выполнявшихся на нем объектов на рабочих узлах\nbsp{}---
-стратегия, которой следует язык Erlang при перезапуске подчиненных процессов\nbsp{}cite:armstrong2003thesis. Для того чтобы реализовать этот метод в рамках иерархии
-управляющих объектов, узел-отправитель сохраняет каждый объект, передаваемый на
-другие узлы кластера, и в случае отказа произвольного количества узлов, на
-которые были переданы объекты, их копии перераспределяются между оставшимися
-узлами без индивидуальной обработки программистом. Если больше не осталось
-узлов, на которые можно отправить объекты, то они выполняются локально. В
-отличие от "тяжеловесного" метода контрольных точек восстановления,
-используемого планировщиками задач HPC кластеров, древовидная иерархия узлов в
-паре с иерархией объектов позволяет автоматически продолжить выполнение
-программы при выходе из строя произвольного количества подчиненных узлов без
-перезапуска каких-либо процессов параллельной программы.
-
-Возможный подход к обработке выхода из строя главного узла (узла, на котором
-запускается главный управляющий объект) заключается в копировании этого главного
-объекта на резервный узел и синхронизации любых изменений между двумя копиями
-объекта посредством распределенных транзакций, однако, этот подход не
-соотносится с асинхронностью вычислительных ядер и слишком сложен в реализации.
-На практике оказывается, что главный управляющий объект обычно не выполняет
-операции параллельно, а последовательно переходит от вычисления одного шага
-программы к вычислению другого, и, значит, имеет не больше одного подчиненного в
-каждый момент времени. (Каждый подчиненный объект представляет собой
-последовательный шаг вычислений, который может быть, а может не быть
-параллельным внутри.) Имея это в виду, можно упростить синхронизацию состояния
-главного объекта программы: отправить главный объект на подчиненный узел вместе
-с его подчиненным объектом. Тогда при выходе из строя главного узла, копия
-главного объекта принимает подчиненный объект (поскольку оба объекта находятся
-на одном и том же узле), и время на восстановление не тратится. Если же выходит
-из строя подчиненный узел, на который был отправлен подчиненный объект вместе с
-копией главного объекта, то подчиненный объект отправляется на оставшиеся узлы,
-и в худшем случае текущий шаг вычислений выполняется заново.
-
-Описанный выше подход предназначен для объектов, у которых нет объекта-родителя
-и которые имеют только один подчиненный объект в каждый момент времени, и
-повторяет механизм работы контрольных точек восстановления. Преимуществом
-данного подхода является то, что он
-- сохраняет состояние только между последовательными шагами вычислений (когда оно
-  занимает минимальный объем памяти),
-- сохраняет только актуальные данные и
-- использует для сохранения состояния оперативную память другого узла кластера,
-  а не дисковое хранилище.
-Этот подход позволяет выдержать выход из строя не более одного /любого/ узла
-кластера за один шаг вычислений или произвольного количества подчиненных узлов в
-любой момент работы программы.
-
-Далее следует пример работы алгоритма восстановления после сбоев
-(рис.\nbsp{}[[fig-fail-over-example]]).
-1. Исходное состояние. На начальном этапе вычислительный кластер не требует
-   никакой настройки за исключением настройки сети. Алгоритм предполагает полную
-   связность узлов кластера и лучше всего работает с древовидными топологиями, в
-   которых все узлы кластера соединены несколькими коммутаторами.
-2. Построение иерархии узлов. При первичной загрузке на всех узлах кластера
-   запускаются процессы-сервисы, которые совместно строят иерархию таких же
-   процессов поверх топологии сети кластера. Положение процесса-сервиса в
-   иерархии определяется позицией IP-адреса его узла в диапазоне IP-адресов
-   сети. Для установления связи каждый из процессов соединяется только с
-   предполагаемым руководящим процессом. В данном случае процесс на узле \(A\)
-   становится руководящим процессом для всех остальных. Иерархия может
-   измениться, только если новый узел присоединяется к кластеру или какой-либо из
-   узлов выходит из строя.
-3. Запуск главного управляющего объекта. Первый управляющий объект запускается
-   на одном из подчиненных узлов (узел \(B\)). Главный объект может иметь только
-   один подчиненный объект в каждый момент времени, и резервная копия главного
-   объекта посылается вместе с этим подчиненным объектом \(T_1\) на руководящий узел
-   \(A\). \(T_1\) представляет собой последовательный шаг программы. В программе
-   может быть произвольное количество последовательных шагов, и, когда узел
-   \(A\) выходит из строя, текущий шаг перезапускается с начала.
-4. Запуск подчиненных управляющих объектов. Управляющие объекты \(S_1\), \(S_2\),
-   \(S_3\) запускаются на подчиненных узлах кластера. Когда узел \(B\), \(C\)
-   или \(D\) выходит из строя, соответствующий руководящий управляющий объект перезапускает
-   завершившиеся некорректно подчиненные объекты (\(T_1\) перезапускает \(S_1\),
-   главный объект перезапускает \(T_1\) и т.д.). Когда выходит из строя узел
-   \(B\), главный объект восстанавливается из резервной копии.
-
-#+name: fig-fail-over-example
-#+header: :headers '("\\input{preamble}\\setdefaultlanguage{russian}")
-#+begin_src latex :file build/fail-over-example-ru.pdf :exports results :results raw
-\input{tex/preamble}
-\newcommand*{\spbuInsertFigure}[1]{%
-\vspace{2\baselineskip}%
-\begin{minipage}[b]{0.5\linewidth}%
-    \Large%
-    \input{#1}%
-\end{minipage}%
-}%
-\noindent%
-\spbuInsertFigure{tex/cluster-0}~\spbuInsertFigure{tex/frame-0}\newline
-\spbuInsertFigure{tex/frame-3-ru}~\spbuInsertFigure{tex/frame-4-ru}\newline
-\spbuInsertFigure{tex/legend-ru}
-#+end_src
-
-#+caption: Пример работы алгоритма восстановления после сбоев.
-#+label: fig-fail-over-example
-#+attr_latex: :width \textwidth
-#+RESULTS: fig-fail-over-example
-[[file:build/fail-over-example-ru.pdf]]
-
-**** Результаты тестирования.
-Методы отказоустойчивости были протестированы на физическом кластере
-(см.\nbsp{}табл.\nbsp{}[[tab-cluster]]) на примере программы, генерирующей
-взволнованную морскую поверхность, подробно описанной в разделе
-[[#sec:arma-algorithms]]. Программа состоит из серии фильтров, каждый из которых
-применяется к результату работы предыдущего. Некоторые из фильтров вычисляются
-параллельно, так что вся программа состоит из последовательно выполняющихся
-шагов, некоторые из которых внутри реализованы параллельно из соображений
-эффективности. Только наиболее ресурсоемкий этап программы (генерация
-взволнованной морской поверхности) выполняется параллельно на всех узлах, другие
-этапы выполняются параллельно на всех процессорных ядрах главного узла.
-
-#+name: tab-cluster
-#+caption: Конфигурация кластера, на котором проводились эксперименты.
-#+attr_latex: :booktabs t
-| CPU                 | Intel Xeon E5440, 2.83GHz |
-| RAM                 | 4Gb                       |
-| HDD                 | ST3250310NS, 7200rpm      |
-| Кол-во узлов        | 12                        |
-| Кол-во ядер на узел | 8                         |
-
-Программа была переписана под отказоустойчивую версию фреймворка, что
-потребовало лишь небольших изменений исходного кода для корректной обработки
-выхода из строя узла с главным объектом. Главный объект был помечен, чтобы
-фреймворк смог передать его на подчиненный узел вместе с подчиненным ему
-объектом. Другие изменения исходного кода были связаны с изменением программного
-интерфейса фреймворка. Таким образом, обеспечение отказоустойчивости посредством
-иерархии управляющих объектов, в основном, прозрачно для программиста и требует
-лишь маркировки главного объекта для его репликации на резервный узел.
-
-В ряде экспериментов была измерена производительность новой версии программы при
-выходе из строя различных типов узлов во время выполнения программы (номера
-пунктов соответствуют номерам графиков рис.\nbsp{}[[fig-benchmark]]):
-1) без выхода из строя узлов,
-2) выход из строя подчиненного узла (на котором генерируется часть взволнованной
-   поверхности),
-3) выход из строя главного узла (на котором запускается программа),
-4) выход из строя резервного узла (на который копируется главный объект
-   программы).
-Древовидная иерархия узлов со значением ветвления, равным 64, использовалась в
-экспериментах, для того чтобы удостовериться, что все подчиненные узлы кластера
-соединены с узлом, имеющим первый IP-адрес в диапазоне адресов подсети.
-Узел-жертва выводился из строя по прошествии фиксированного временного интервала
-после запуска программы, равного примерно \(1/3\) времени работы программы на
-одном узле. Приложение мгновенно узнавало о выходе из строя узла, поскольку
-закрывалось соответствующее соединение; при реалистичном развитии событий,
-однако, выход из строя узла обнаружится по прошествии настраиваемого тайм-аута.
-Способ запуска для каждого эксперимента представлен в табл.\nbsp{}[[tab-benchmark]].
-Результаты экспериментов приведены на рис.\nbsp{}[[fig-benchmark]]
-и\nbsp{}[[fig-slowdown]].
-
-Эксперименты показали большую разницу в общей производительности приложения при
-выходе из строя различных типов узлов. Графики\nbsp{}2 и\nbsp{}3 на
-рис.\nbsp{}[[fig-benchmark]] показывают, что производительность в случае выхода из
-строя руководящего и подчиненного узлов одинакова. В случае отказа руководящего
-узла резервный узел сохраняет копию главного объекта и восстанавливает главный
-объект из нее, когда обнаруживает, что главный узел вышел из строя. В случае
-отказа подчиненного узла, главный узел перераспределяет невернувшиеся объекты
-между оставшимися подчиненными узлами. В обоих случаях состояние главного объекта
-программы не теряется, а значит не тратится время на его восстановление, что
-объясняет схожую производительность.
-
-График\nbsp{}4 на рис.\nbsp{}[[fig-benchmark]] показывает, что производительность в
-случае выхода из строя резервного узла гораздо ниже, чем в других случаях. Это
-происходит, потому что руководящий узел сохраняет состояние только текущего
-последовательного шага программы, в то время как резервный узел не только хранит
-копию этого состояния, но и выполняет этот шаг параллельно с другими
-подчиненными узлами. Так что, когда резервный узел выходит из строя, главный
-узел начинает выполнение текущего этапа с самого начала на произвольно выбранном
-выжившем узле.
-
-#+caption: Параметры экспериментов с алгоритмом восстановления после сбоев.
-#+name: tab-benchmark
-#+attr_latex: :booktabs t
-| Номер эксп. | Время до выхода из строя, сек. |
-|           1 |                                |
-|           2 |                             10 |
-|           3 |                             10 |
-|           4 |                             10 |
-
-Для оценки количества времени, которое теряется при выходе узла из строя, можно
-поделить общее время работы программы со сбоем на время работы программы без
-сбоев, но с количеством узлов минус один. Это отношение получается из того же
-самого эксперимента и представлено на рис.\nbsp{}[[fig-slowdown]]. Разница в
-производительности в случае выхода из строя руководящего и подчиненного узлов
-находится в пределах 5%, а в случае выхода из строя резервного узла\nbsp{}--- в
-пределах 50% для количества узлов меньше 6[fn::Измерение разницы для большего
-количества узлов не имеет смысла, поскольку программа завершается еще до
-наступления сбоя.]. Увеличение времени выполнения на 50%\nbsp{}--- это больше,
-чем \(1/3\) времени работы программы, после которого происходит сбой, однако
-отказ резервного узла требует некоторого времени, чтобы быть обнаруженным
-другими узлами: сбой узла обнаруживается только тогда, когда подчиненный объект,
-имеющий копию главного объекта, завершает свое выполнение и пытается вернуться
-на исходный узел к родителю. Мгновенное обнаружение сбоя узла требует внезапной
-остановки выполнения объектов, что может быть неприменимо для программ со
-сложной логикой.
-
-#+name: fig-benchmark
-#+begin_src R :file build/benchmark-xxx-ru.pdf
-# TODO
-#+end_src
-
-#+caption: Производительность программы генерации взволнованной морской поверхности при различных типах сбоев узлов.
-#+label: fig-benchmark
-#+RESULTS: fig-benchmark
-[[file:build/benchmark-xxx-ru.pdf]]
-
-Результаты экспериментов позволяют сделать вывод о том, что не важно, вышел ли
-из строя руководящий узел или подчиненный, общее время работы параллельной
-программы примерно равно времени ее работы без сбоев, но с уменьшенным на
-единицу количеством узлов, однако, в случае выхода из строя резервного узла
-потери в производительности гораздо больше.
-
-#+name: fig-slowdown
-#+begin_src R :file build/slowdown-xxx-ru.pdf
-# TODO
-#+end_src
-
-#+caption: Замедление программы генерации взволнованной морской поверхности при различных типах сбоев по сравнению с запуском без сбоев, но с уменьшенным на единицу количеством узлов.
-#+label: fig-slowdown
-#+RESULTS: fig-slowdown
-[[file:build/slowdown-xxx-ru.pdf]]
-
-**** Обсуждение результатов тестирования.
-Алгоритм восстановления после сбоев гарантирует обработку выхода из строя одного
-узла на один последовательный шаг программы; больше сбоев может быть выдержано,
-если они не затрагивают руководящий узел. Алгоритм обрабатывает одновременный
-выход из строя всех подчиненных узлов, однако, если руководящий и резервный узлы
-вместе выходят из строя, у программы нет ни единого шанса продолжить работу. В
-этом случае состояние текущего шага вычислений теряется полностью, и его можно
-восстановить только перезапуском программы с начала.
-
-Управляющие объекты являются абстракциями, отделяющими распределенное приложение
-от физических устройств: для непрерывной работы программы не важно, сколько
-узлов кластера в данный момент работают. Управляющие объекты позволяют
-отказаться от выделения физического резервного узла для обеспечения устойчивости
-к выходу из строя руководящего узла: в рамках иерархии управляющих объектов
-любой физический узел (кроме руководящего) может выполнять роль резервного.
-Наконец, иерархия управляющих объектов позволяет обрабатывать сбои прозрачно для
-программиста, определяя порядок действий из внутреннего состояния объекта.
-
-Проведенные эксперименты показывают, что параллельной программе необходимо иметь
-несколько последовательных этапов выполнения, чтобы сделать ее устойчивой к
-сбоям узлов, иначе выход из строя резервного узла фактически вызывает
-восстановление исходного состояния программы. Несмотря на то что вероятность
-сбоя резервного узла меньше вероятности сбоя одного из подчиненных узлов, это не
-повод потерять все данные, когда выполнявшаяся продолжительное время программа
-почти завершилась. В общем случае, чем больше последовательных этапов вычислений
-содержит параллельная программа, тем меньше времени потеряется в случае сбоя
-резервного узла, и, аналогично, чем больше параллельных частей содержит каждый
-последовательный этап, тем меньше времени потеряется при сбое руководящего или
-подчиненного узла. Другими словами, чем больше количество узлов, на которое
-масштабируется программа, тем она становится более устойчива к сбою узлов
-кластера.
-
-Хотя это не было показано в экспериментах, Фабрика не только обеспечивает
-устойчивость к выходу из строя узлов кластера, но и позволяет автоматически
-вводить новые узлы в кластер и распределять на них часть управляющих объектов из
-уже запущенных программ. В контексте фреймворка этот процесс тривиален,
-поскольку не требует перезапуска незавершившихся управляющих объектов и
-копирования их состояния, и не изучался экспериментально в данной работе.
-
-Теоретически, отказоустойчивость, основанная на иерархии узлов и управляющих
-объектов, может быть реализована поверх библиотеки передачи сообщений без потери
-общности. Хотя использование незагруженных узлов вместо вышедших из строя в
-рамках такой библиотеки представляет определенную сложность, поскольку
-количество узлов, на которых запущена программа, в таких библиотеках
-фиксировано, однако, выделение достаточно большого количества узлов для
-программы будет достаточно для обеспечения ее отказоустойчивости. В то же время,
-реализация отказоустойчивости, основанной на иерархии, внутри самой библиотеки
-передачи сообщений непрактична, поскольку это потребует сохранения текущего
-состояния параллельной программы, объем которого эквивалентен всей занимаемой ею
-памяти на каждом узле кластера, что, в свою очередь, не позволит сделать такой
-подход эффективнее контрольных точек восстановления.
-
-Слабым местом описанных методов является период времени, начиная с отказа
-руководящего узла и заканчивая обнаружением сбоя подчиненным узлом,
-восстановлением главного объекта из копии и получением нового подчиненного
-объекта вместе с копией его родителя подчиненным узлом. Если в любой момент
-времени из этого периода резервный узел выходит из строя, то состояние
-выполнения программы полностью теряется без возможности его восстановить, кроме
-как перезапуском с самого начала. Протяженность этого опасного промежутка
-времени может быть минимизирована, но полностью исключить вероятность внезапного
-завершения программы невозможно. Этот результат согласуется с исследованиями
-/теории невыполнимости/, в рамках которой доказывается невозможность
-распределенного консенсуса с хотя бы одним процессом, дающим
-сбой\nbsp{}cite:fischer1985impossibility и невозможность надежной передачи
-данных в случае сбоя одного из узлов\nbsp{}cite:fekete1993impossibility.
-
-** Сравнение предложенного подхода с современными подходами
-Современный подход к разработке и запуску параллельных программ на кластере
-заключается в использовании библиотеки передачи сообщений MPI и планировщика
-задач, и, несмотря на то что этот подход имеет высокую эффективность с точки
-зрения параллельных вычислений, он недостаточно гибок, чтобы вместить в себя
-динамическую балансировку нагрузки и автоматическое обеспечение
-отказоустойчивости. Программы, написанные с помощью MPI, обычно предполагают
-- равномерную загрузку каждого процессора,
-- бесперебойное и надежное выполнение пакетных задач, и
-- постоянное число параллельных процессов/потоков во время выполнения, равное
-  общему количеству процессоров.
-Первое предположение несправедливо для программы моделирования морского
-волнения, поскольку модель АР требует динамической балансировки нагрузки между
-процессорами для генерации каждой части поверхности только когда генерация всех
-зависимых частей уже закончена. Последнее предположение также несправедливо,
-поскольку в угоду эффективности каждая часть записывается в файл отдельным
-потоком асинхронно. Оставшееся предположение относится не к самой программе, а к
-планировщику задач, и несправедливо для больших вычислительных кластеров, в
-которых узлы часто выходят из строя, а планировщик перезапускает задачу из
-контрольной точки восстановления, серьезно замедляя ее. Таким образом, идея
-предлагаемого подхода\nbsp{}--- дать параллельным программам больше гибкости:
-- предоставить динамическую балансировку нагрузки путем выполнения
-  последовательных, параллельных изнутри шагов программы в режиме конвейера,
-- перезапускать только затронутые выходом из строя узла процессы, и
-- выполнять программу на как можно большем количестве узлов, которое доступно в
-  кластере.
-В данном разделе обсуждаются преимущества и недостатки этого подхода.
-
-В сравнении с портируемыми системами пакетных заданий (PBS) для распределения
-нагрузки на узлы кластера предлагаемый подход использует легковесные управляющие
-объекты вместо тяжеловесных параллельных задач. Во-первых, это позволяет иметь
-очереди объектов на каждом узле, вместо того чтобы иметь одну очередь задач на
-кластер. Зернистость управляющих объектов гораздо выше, чем у пакетных задач, и,
-несмотря на то что время их выполнения не может быть надежно спрогнозировано
-(также как и время выполнения пакетных задач), объекты из нескольких
-параллельных программ могут быть динамически распределены между одним и тем же
-множеством узлов кластера, делая нагрузку более равномерной. Недостатком
-является необходимость в большем количестве оперативной памяти для выполнения
-нескольких задач на одних и тех же узлах, а также в том что выполнение каждой
-программы может занять больше времени из-за общих очередей управляющих объектов.
-Во-вторых, предлагаемый подход использует динамическое распределение ролей
-руководителя и подчиненного среди узлов кластера вместо их статического
-присвоения конкретным физическим узлам. Это позволяет сделать узлы
-взаимозаменяемыми, что необходимо для обеспечения отказоустойчивости. Таким
-образом, одновременное выполнение нескольких параллельных программ на одном и
-том же множестве узлов может увеличить пропускную способность кластера, но также
-может уменьшить их производительность, взятую по отдельности, а динамическое
-распределение ролей является основанием, на котором строится устойчивость к
-сбоям.
-
-В сравнении с MPI для разбиения программы на отдельные сущности предлагаемый
-подход использует легковесные управляющие объекты вместо тяжеловесных процессов.
-Во-первых, это позволяет определить число обрабатываемых параллельно сущностей,
-исходя из задачи, а не архитектуры компьютера или кластера. Это поощряет
-программиста создавать столько объектов, сколько необходимо, руководствуясь
-алгоритмом или ограничениями на размер структур данных из предметной области
-задачи. В программе моделирования морского волнения минимальный размер каждой
-части поверхности зависит от числа коэффициентов вдоль каждой из осей, и, в то
-же время, количество частей должно быть больше, чем количество процессоров, для
-того чтобы сделать нагрузку на процессоры более равномерной. Учитывая эти
-ограничения, оптимальный размер части определяется во время выполнения, и, в
-общем случае, не совпадает с количеством параллельных процессов. Недостатком
-является то, что, чем больше управляющих объектов в программе, тем больше общих
-структур данных копируется на один и тот же узел вместе с подчиненными
-объектами; проблема решается введением промежуточного слоя объектов, что, в свою
-очередь, увеличивает сложность программы. Во-вторых, иерархия управляющих
-объектов совместно с иерархией узлов позволяет автоматически пересчитывать
-завершившиеся некорректно подчиненные объекты на выживших узлах кластера в случае
-выхода из строя оборудования. Это возможно, поскольку ход выполнения программы
-сохраняется в каждом объекте, а не в глобальных переменных, как это делается в
-программах MPI. Дублируя состояние на подчиненные узлы, система пересчитывает
-только объекты из поврежденных процессов, а не программу целиком. Таким образом,
-переход от процессов к управляющим объектам может увеличить производительность
-параллельной программы путем динамической балансировки нагрузки, но также может
-повлиять на ее масштабируемость на большое количество узлов из-за дублирования
-состояния хода выполнения.
-
-Может показаться, что три составляющих предлагаемого подхода\nbsp{}---
-управляющие объекты, конвейеры и иерархии\nbsp{}--- ортогональны, но, на самом
-деле, они дополняют друг друга. Если бы управляющие объекты не содержали в себе
-состояние хода выполнения программы, то было бы невозможно пересчитать
-завершившиеся некорректно подчиненные объекты и обеспечить отказоустойчивость.
-Если бы иерархии узлов не было, то было бы невозможно распределить нагрузку
-между узлами кластера, поскольку все узлы одинаковы без иерархии. Если бы для
-каждого устройства не было конвейера, то было бы невозможно обрабатывать
-управляющие объекты асинхронно и реализовать динамическую балансировку нагрузки.
-Эти три сущности образуют замкнутую систему, в которую нечего добавить и из
-которой нечего удалить\nbsp{}--- надежную основу для любой распределенной
-программы.
-
-Подводя итог, можно сказать, что управляющие объекты придают гибкости
-параллельным программам: они балансируют снижение производительности за счет
-использования общих очередей ее увеличением за счет динамической балансировки
-нагрузки. Требуя больше оперативной памяти для работы, они позволяют выполнять
-сразу несколько параллельных программ одновременно на всех узлах кластера без
-простаивания в очереди задач, и превращают кластер в единую вычислительную
-систему, которая делает все возможное для непрерывной работы распределенных
-приложений.
-
-* Заключение
-**** Итоги исследования.
-В изучении возможностей математического аппарата для имитационного моделирования
-морского волнения, выходящего за рамки линейной теории волн, были достигнуты
-следующие основные результаты.
-- Процесс АРСС был использован для моделирования морских волн произвольных
-  амплитуд. Интегральные характеристики генерируемой взволнованной поверхности
-  были верифицированы путем сопоставления с характеристиками реальной морской
-  поверхности.
-- Аналитическая формула для определения давлений была использована для
-  вычисления поля потенциала скорости под генерируемой поверхностью. Получившееся
-  поле потенциалов скоростей было верифицировано путем сравнения с полем,
-  вычисляемым по формулам из линейной теории волн. Аналитическая формула
-  эффективна с вычислительной точки зрения, поскольку все интегралы в ней
-  записываются как преобразования Фурье, для которых существуют
-  высокопроизводительные реализации.
-
-**** Перспективы дальнейших исследований.
-Одной из тем дальнейших исследований является изучение возможности генерации
-волн произвольных профилей на базе смешанного процесса АРСС. Другим направлением
-является интеграция разработанной модели и формулы расчета давлений в
-существующие пакеты прикладного программного обеспечения.
-
-* Выводы
-Результаты исследования позволяют сделать вывод о том, что задача вычисления
-давлений под реальной морской поверхностью может быть решена аналитически,
-минуя предположения линейной теории волн и теории волн малой амплитуды. Это
-решение в паре с моделью АРСС морского волнения, способной
-генерировать волны произвольных амплитуд, может быть использовано для расчета
-влияния колебаний волн на поведение динамического объекта в открытом море, и
-дает более точные результаты, чем аналогичное решение для волн малых амплитуд.
-
-Результаты проведенных численных экспериментов позволяют сделать вывод о том,
-что как генерация взволнованной поверхности, так и расчет гидродинамических
-давлений могут быть реализованы эффективно с использованием алгоритмов быстрого
-преобразования Фурье, и длительные сессии имитационного моделирования могут
-проводиться.
-
-Разработанный в работе математический аппарат и его численная реализация могут
-стать основой виртуального полигона, предназначенного для расчетов динамики
-морских объектов.
-
-* Благодарности
-Графики в этой работе были подготовлены с помощью языка для статистических
-вычислений R\nbsp{}cite:rlang2016,Sarkar2008lattice и программного обеспечения
-Graphviz\nbsp{}cite:Gansner00anopen. Документ был подготовлен с использованием
-Org-mode\nbsp{}cite:Schulte2011org2,Schulte2011org1,Dominik2010org для GNU
-Emacs, предоставляющего вычислительное окружение для воспроизводимых
-исследований. Это означает, что все графики можно воспроизвести и
-соответствующие утверждения проверить, скопировав репозиторий
-диссертации[fn:repo], установив Emacs и экспортировав документ.
-
-Исследования были проведены на вычислительных ресурсах ресурсного центра
-"Вычислительный центр СПбГУ" (\mbox{T-EDGE96} \mbox{HPC-0011828-001}) в рамках
-грантов РФФИ (проекты\nbsp{}\mbox{16-07-01111}, \mbox{16-07-00886},
-\mbox{16-07-01113}).
-
-[fn:repo] [[https://github.com/igankevich/arma-thesis]]
-
-* Список сокращений и условных обозначений
-- <<<MPP>>> :: Massively Parallel Processing, класс вычислительных систем с разделенной памятью.
-- <<<SMP>>> :: Symmetric Multi-Processing, класс вычислительных систем с общей памятью.
-- <<<АКФ>>> :: автоковариационная функция.
-- <<<БПФ>>> :: быстрое преобразование Фурье.
-- <<<ГПСЧ>>> :: генератор псевдослучайных чисел.
-- <<<ГУ>>> :: граничное условие.
-- <<<ДУЧП>>> :: дифференциальное уравнение в частных производных.
-- <<<НБП>>> :: нелинейное безынерционное преобразование.
-- <<<АР>>> :: процесс авторегрессии.
-- <<<АРСС>>> :: процесс авторегрессии скользящего среднего.
-- <<<СС>>> :: процесс скользящего среднего.
-- <<<ЛХ>>> :: модель Лонге---Хиггинса.
-- <<<LAMP>>> :: Large Amplitude Motion Programme, программа для моделирования качки судна на морских волнах.
-- <<<ЦПТ>>> :: центральная предельная теорема.
-- <<<ПМ>>> :: аппроксимация Пирсона---Московица для спектра морского волнения.
-- <<<ЮУ>>> :: система уравнений Юла---Уокера.
-- <<<МНК>>> :: метод наименьших квадратов.
-- <<<ФПР>>> :: функция плотности распределения.
-- <<<ФР>>> :: функция распределения.
-- <<<BSP>>> :: Bulk Synchronous Parallel.
-- <<<OpenCL>>> :: Open Computing Language.
-- <<<OpenMP>>> :: Open Multi-Processing.
-- <<<MPI>>> :: Message Passing Interface.
-- <<<POSIX>>> :: Portable Operating System Interface.
-- <<<FMA>>> :: Fused multiply-add.
-- <<<DCMT>>> :: Dynamic creation of Mersenne Twisters.
-- <<<GSL>>> :: GNU Scientific Library.
-- <<<BLAS>>> :: Basic Linear Algebra Sub-programmes.
-- <<<LAPACK>>> :: Linear Algebra Package.
-- <<<DNS>>> :: Domain Name System.
-- <<<HPC>>> :: High-performance computing.
-
-#+begin_export latex
-\input{postamble}
-#+end_export
-
-bibliographystyle:ugost2008
-bibliography:bib/refs.bib
-
-* Приложение
-** Вывод формулы модели Лонге---Хиггинса
-:PROPERTIES:
-:CUSTOM_ID: longuet-higgins-derivation
-:END:
-
-Двухмерная система уравнений\nbsp{}eqref:eq-problem в рамках линейной теории
-волн записывается как
-\begin{align*}
-    & \phi_{xx} + \phi_{zz} = 0,\\
-    & \zeta(x,t) = -\frac{1}{g} \phi_t, & \text{на }z=\zeta(x,t),
-\end{align*}
-где \(\frac{p}{\rho}\) включено в \(\phi_t\). Решение уравнения Лапласа ищется в
-виде ряда Фурье cite:kochin1966theoretical:
-\begin{equation*}
-    \phi(x,z,t) = \int\limits_{0}^{\infty} e^{k z}
-    \left[ A(k, t) \cos(k x) + B(k, t) \sin(k x) \right] dk.
-\end{equation*}
-Подставляя его в граничное условие, получаем
-\begin{align*}
-    \zeta(x,t) &= -\frac{1}{g} \int\limits_{0}^{\infty}
-    \left[ A_t(k, t) \cos(k x) + B_t(k, t) \sin(k x) \right] dk \\
-    &= -\frac{1}{g} \int\limits_{0}^{\infty} C_t(k, t) \cos(kx + \epsilon(k, t)).
-\end{align*}
-Здесь \(\epsilon\)\nbsp{}--- белый шум, а \(C_t\) включает в себя значение \(dk\).
-Подставляя бесконечную сумму вместо интеграла, получаем двухмерную форму
-ур.\nbsp{}[[eq-longuet-higgins]].
diff --git a/phd-diss.org b/phd-diss.org
@@ -1,3226 +0,0 @@
-# Local Variables:
-# org-ref-default-bibliography ("bib/refs.bib")
-# org-latex-image-default-width nil
-# org-latex-caption-above nil
-# org-latex-hyperref-template "\\hypersetup{\n pdfauthor={%a},\n pdftitle={%t},\n pdfkeywords={%k},\n pdfsubject={%d},\n pdfcreator={%c},\n pdflang={%L},\n unicode={true}\n}\n\\setdefaultlanguage{%l}\n"
-# org-export-latex-tables-hline "\\midrule"
-# org-export-latex-tables-tstart "\\toprule"
-# org-export-latex-tables-tend "\\bottomrule"
-# eval: (add-to-list 'org-latex-classes '("gost" "\\documentclass{gost} [DEFAULT-PACKAGES] [PACKAGES] [EXTRA]" ("\\section{%s}" . "\\section*{%s}") ("\\subsection{%s}" . "\\subsection*{%s}") ("\\subsubsection{%s}" . "\\subsubsection*{%s}") ("\\paragraph{%s}" . "\\paragraph*{%s}") ("\\subparagraph{%s}" . "\\subparagraph*{%s}")))
-# End:
-
-#+TITLE: High-performance ocean wave simulation model for studying marine object behaviour
-#+AUTHOR: Ivan Gankevich
-#+DATE: St. Petersburg, 2017
-#+LANGUAGE: en
-#+LATEX_CLASS: gost
-#+LATEX_CLASS_OPTIONS: [hidelinks,fontsize=14pt,paper=a4,pagesize,DIV=calc,noenddot]
-#+LATEX_HEADER_EXTRA: \input{preamble}
-#+LATEX_HEADER_EXTRA: \organization{Saint Petersburg State University}
-#+LATEX_HEADER_EXTRA: \manuscript{}
-#+LATEX_HEADER_EXTRA: \degree{thesis for candidate of sciences degree}
-#+LATEX_HEADER_EXTRA: \speciality{Speciality 05.13.18\\Mathematical modeling, numerical methods and programme complexes}
-#+LATEX_HEADER_EXTRA: \supervisor{Supervisor\\Alexander Degtyarev}
-#+LATEX_HEADER_EXTRA: \newcites{published}{Publications on the subject of thesis}
-#+OPTIONS: todo:nil title:nil ':t H:5
-#+STARTUP: indent
-#+PROPERTY: header-args:R :results graphics :exports results
-
-* Config                                                           :noexport:
-** Produce data for Q-Q and ACF plots
-#+begin_src sh :exports none :results verbatim
-root=$(pwd)
-for testname in propagating_wave standing_wave
-do
-    wd=$root/build/$testname
-    rm -rf $wd
-    mkdir -p $wd
-    cd $wd
-    arma -c $root/config/$testname.arma 2>&1
-done
-#+end_src
-
-#+RESULTS:
-#+begin_example
-Input file                     = /home/igankevich/workspace/phd-diss/config/propagating_wave.arma
-ACF grid size                  = (20,10,10)
-ACF grid patch size            = (0.526316,0.555556,0.555556)
-Output grid size               = (200,40,40)
-Output grid patch size         = (1,1,1)
-AR order                       = (10,10,10)
-Do least squares               = 0
-ACF function                   = propagating_wave
-Model                          = MA
-MA algorithm                   = fixed_point_iteration
-Verification scheme            = manual
-ACF variance = 5
-fixed_point_iteration:Iteration=0, var_wn=2.70831
-fixed_point_iteration:Iteration=1, var_wn=1.93791
-fixed_point_iteration:Iteration=2, var_wn=1.54801
-fixed_point_iteration:Iteration=3, var_wn=1.31202
-fixed_point_iteration:Iteration=4, var_wn=1.15328
-fixed_point_iteration:Iteration=5, var_wn=1.0386
-fixed_point_iteration:Iteration=6, var_wn=0.951442
-fixed_point_iteration:Iteration=7, var_wn=0.882674
-fixed_point_iteration:Iteration=8, var_wn=0.82688
-fixed_point_iteration:Iteration=9, var_wn=0.780623
-fixed_point_iteration:Iteration=10, var_wn=0.74161
-fixed_point_iteration:Iteration=11, var_wn=0.708244
-fixed_point_iteration:Iteration=12, var_wn=0.679374
-fixed_point_iteration:Iteration=13, var_wn=0.654145
-fixed_point_iteration:Iteration=14, var_wn=0.63191
-fixed_point_iteration:Iteration=15, var_wn=0.612168
-fixed_point_iteration:Iteration=16, var_wn=0.594523
-fixed_point_iteration:Iteration=17, var_wn=0.578663
-fixed_point_iteration:Iteration=18, var_wn=0.564333
-fixed_point_iteration:Iteration=19, var_wn=0.551325
-fixed_point_iteration:Iteration=20, var_wn=0.539469
-fixed_point_iteration:Iteration=21, var_wn=0.528623
-fixed_point_iteration:Iteration=22, var_wn=0.518666
-fixed_point_iteration:Iteration=23, var_wn=0.509497
-fixed_point_iteration:Iteration=24, var_wn=0.50103
-fixed_point_iteration:Iteration=25, var_wn=0.493191
-fixed_point_iteration:Iteration=26, var_wn=0.485916
-fixed_point_iteration:Iteration=27, var_wn=0.479148
-fixed_point_iteration:Iteration=28, var_wn=0.472841
-fixed_point_iteration:Iteration=29, var_wn=0.466951
-fixed_point_iteration:Iteration=30, var_wn=0.461442
-fixed_point_iteration:Iteration=31, var_wn=0.456279
-fixed_point_iteration:Iteration=32, var_wn=0.451435
-fixed_point_iteration:Iteration=33, var_wn=0.446882
-fixed_point_iteration:Iteration=34, var_wn=0.442597
-fixed_point_iteration:Iteration=35, var_wn=0.43856
-fixed_point_iteration:Iteration=36, var_wn=0.434752
-fixed_point_iteration:Iteration=37, var_wn=0.431155
-fixed_point_iteration:Iteration=38, var_wn=0.427755
-fixed_point_iteration:Iteration=39, var_wn=0.424538
-fixed_point_iteration:Iteration=40, var_wn=0.42149
-fixed_point_iteration:Iteration=41, var_wn=0.418601
-fixed_point_iteration:Iteration=42, var_wn=0.415859
-fixed_point_iteration:Iteration=43, var_wn=0.413256
-fixed_point_iteration:Iteration=44, var_wn=0.410782
-fixed_point_iteration:Iteration=45, var_wn=0.40843
-fixed_point_iteration:Iteration=46, var_wn=0.406191
-fixed_point_iteration:Iteration=47, var_wn=0.404059
-fixed_point_iteration:Iteration=48, var_wn=0.402029
-fixed_point_iteration:Iteration=49, var_wn=0.400092
-fixed_point_iteration:Iteration=50, var_wn=0.398246
-fixed_point_iteration:Iteration=51, var_wn=0.396483
-fixed_point_iteration:Iteration=52, var_wn=0.3948
-fixed_point_iteration:Iteration=53, var_wn=0.393193
-fixed_point_iteration:Iteration=54, var_wn=0.391656
-fixed_point_iteration:Iteration=55, var_wn=0.390188
-fixed_point_iteration:Iteration=56, var_wn=0.388782
-fixed_point_iteration:Iteration=57, var_wn=0.387438
-fixed_point_iteration:Iteration=58, var_wn=0.386151
-fixed_point_iteration:Iteration=59, var_wn=0.384918
-fixed_point_iteration:Iteration=60, var_wn=0.383738
-fixed_point_iteration:Iteration=61, var_wn=0.382606
-fixed_point_iteration:Iteration=62, var_wn=0.381522
-fixed_point_iteration:Iteration=63, var_wn=0.380482
-fixed_point_iteration:Iteration=64, var_wn=0.379485
-fixed_point_iteration:Iteration=65, var_wn=0.378528
-fixed_point_iteration:Iteration=66, var_wn=0.37761
-fixed_point_iteration:Iteration=67, var_wn=0.376729
-fixed_point_iteration:Iteration=68, var_wn=0.375882
-fixed_point_iteration:Iteration=69, var_wn=0.37507
-fixed_point_iteration:Iteration=70, var_wn=0.374289
-fixed_point_iteration:Iteration=71, var_wn=0.373539
-fixed_point_iteration:Iteration=72, var_wn=0.372818
-fixed_point_iteration:Iteration=73, var_wn=0.372126
-fixed_point_iteration:Iteration=74, var_wn=0.37146
-fixed_point_iteration:Iteration=75, var_wn=0.37082
-fixed_point_iteration:Iteration=76, var_wn=0.370204
-fixed_point_iteration:Iteration=77, var_wn=0.369612
-fixed_point_iteration:Iteration=78, var_wn=0.369042
-fixed_point_iteration:Iteration=79, var_wn=0.368494
-fixed_point_iteration:Iteration=80, var_wn=0.367966
-fixed_point_iteration:Iteration=81, var_wn=0.367458
-fixed_point_iteration:Iteration=82, var_wn=0.366969
-fixed_point_iteration:Iteration=83, var_wn=0.366499
-fixed_point_iteration:Iteration=84, var_wn=0.366046
-fixed_point_iteration:Iteration=85, var_wn=0.36561
-fixed_point_iteration:Iteration=86, var_wn=0.365189
-fixed_point_iteration:Iteration=87, var_wn=0.364785
-fixed_point_iteration:Iteration=88, var_wn=0.364395
-fixed_point_iteration:Iteration=89, var_wn=0.364019
-fixed_point_iteration:Iteration=90, var_wn=0.363657
-fixed_point_iteration:Iteration=91, var_wn=0.363309
-fixed_point_iteration:Iteration=92, var_wn=0.362973
-fixed_point_iteration:Iteration=93, var_wn=0.362649
-fixed_point_iteration:Iteration=94, var_wn=0.362337
-fixed_point_iteration:Iteration=95, var_wn=0.362036
-fixed_point_iteration:Iteration=96, var_wn=0.361746
-fixed_point_iteration:Iteration=97, var_wn=0.361466
-fixed_point_iteration:Iteration=98, var_wn=0.361197
-fixed_point_iteration:Iteration=99, var_wn=0.360937
-fixed_point_iteration:Iteration=100, var_wn=0.360686
-fixed_point_iteration:Iteration=101, var_wn=0.360444
-fixed_point_iteration:Iteration=102, var_wn=0.360211
-fixed_point_iteration:Iteration=103, var_wn=0.359986
-fixed_point_iteration:Iteration=104, var_wn=0.359769
-fixed_point_iteration:Iteration=105, var_wn=0.35956
-fixed_point_iteration:Iteration=106, var_wn=0.359358
-fixed_point_iteration:Iteration=107, var_wn=0.359163
-fixed_point_iteration:Iteration=108, var_wn=0.358975
-fixed_point_iteration:Iteration=109, var_wn=0.358794
-fixed_point_iteration:Iteration=110, var_wn=0.358619
-fixed_point_iteration:Iteration=111, var_wn=0.35845
-fixed_point_iteration:Iteration=112, var_wn=0.358288
-fixed_point_iteration:Iteration=113, var_wn=0.35813
-fixed_point_iteration:Iteration=114, var_wn=0.357979
-fixed_point_iteration:Iteration=115, var_wn=0.357832
-fixed_point_iteration:Iteration=116, var_wn=0.357691
-fixed_point_iteration:Iteration=117, var_wn=0.357555
-fixed_point_iteration:Iteration=118, var_wn=0.357423
-fixed_point_iteration:Iteration=119, var_wn=0.357296
-fixed_point_iteration:Iteration=120, var_wn=0.357173
-fixed_point_iteration:Iteration=121, var_wn=0.357055
-fixed_point_iteration:Iteration=122, var_wn=0.356941
-fixed_point_iteration:Iteration=123, var_wn=0.356831
-fixed_point_iteration:Iteration=124, var_wn=0.356724
-fixed_point_iteration:Iteration=125, var_wn=0.356621
-fixed_point_iteration:Iteration=126, var_wn=0.356522
-fixed_point_iteration:Iteration=127, var_wn=0.356426
-fixed_point_iteration:Iteration=128, var_wn=0.356334
-fixed_point_iteration:Iteration=129, var_wn=0.356244
-fixed_point_iteration:Iteration=130, var_wn=0.356158
-fixed_point_iteration:Iteration=131, var_wn=0.356075
-fixed_point_iteration:Iteration=132, var_wn=0.355994
-fixed_point_iteration:Iteration=133, var_wn=0.355917
-fixed_point_iteration:Iteration=134, var_wn=0.355842
-fixed_point_iteration:Iteration=135, var_wn=0.355769
-fixed_point_iteration:Iteration=136, var_wn=0.355699
-fixed_point_iteration:Iteration=137, var_wn=0.355632
-fixed_point_iteration:Iteration=138, var_wn=0.355567
-fixed_point_iteration:Iteration=139, var_wn=0.355504
-fixed_point_iteration:Iteration=140, var_wn=0.355443
-fixed_point_iteration:Iteration=141, var_wn=0.355384
-fixed_point_iteration:Iteration=142, var_wn=0.355327
-fixed_point_iteration:Iteration=143, var_wn=0.355273
-fixed_point_iteration:Iteration=144, var_wn=0.35522
-fixed_point_iteration:Iteration=145, var_wn=0.355169
-fixed_point_iteration:Iteration=146, var_wn=0.355119
-fixed_point_iteration:Iteration=147, var_wn=0.355072
-fixed_point_iteration:Iteration=148, var_wn=0.355026
-fixed_point_iteration:Iteration=149, var_wn=0.354981
-fixed_point_iteration:Iteration=150, var_wn=0.354938
-fixed_point_iteration:Iteration=151, var_wn=0.354897
-fixed_point_iteration:Iteration=152, var_wn=0.354856
-fixed_point_iteration:Iteration=153, var_wn=0.354818
-fixed_point_iteration:Iteration=154, var_wn=0.35478
-fixed_point_iteration:Iteration=155, var_wn=0.354744
-fixed_point_iteration:Iteration=156, var_wn=0.354709
-fixed_point_iteration:Iteration=157, var_wn=0.354676
-fixed_point_iteration:Iteration=158, var_wn=0.354643
-fixed_point_iteration:Iteration=159, var_wn=0.354612
-fixed_point_iteration:Iteration=160, var_wn=0.354581
-fixed_point_iteration:Iteration=161, var_wn=0.354552
-fixed_point_iteration:Iteration=162, var_wn=0.354524
-fixed_point_iteration:Iteration=163, var_wn=0.354496
-fixed_point_iteration:Iteration=164, var_wn=0.35447
-fixed_point_iteration:Iteration=165, var_wn=0.354444
-fixed_point_iteration:Iteration=166, var_wn=0.35442
-fixed_point_iteration:Iteration=167, var_wn=0.354396
-fixed_point_iteration:Iteration=168, var_wn=0.354373
-fixed_point_iteration:Iteration=169, var_wn=0.35435
-fixed_point_iteration:Iteration=170, var_wn=0.354329
-fixed_point_iteration:Iteration=171, var_wn=0.354308
-fixed_point_iteration:Iteration=172, var_wn=0.354288
-fixed_point_iteration:Iteration=173, var_wn=0.354269
-fixed_point_iteration:Iteration=174, var_wn=0.35425
-fixed_point_iteration:Iteration=175, var_wn=0.354232
-fixed_point_iteration:Iteration=176, var_wn=0.354214
-fixed_point_iteration:Iteration=177, var_wn=0.354198
-fixed_point_iteration:Iteration=178, var_wn=0.354181
-fixed_point_iteration:Iteration=179, var_wn=0.354165
-fixed_point_iteration:Iteration=180, var_wn=0.35415
-fixed_point_iteration:Iteration=181, var_wn=0.354136
-fixed_point_iteration:Iteration=182, var_wn=0.354121
-fixed_point_iteration:Iteration=183, var_wn=0.354108
-fixed_point_iteration:Iteration=184, var_wn=0.354094
-fixed_point_iteration:Iteration=185, var_wn=0.354082
-fixed_point_iteration:Iteration=186, var_wn=0.354069
-fixed_point_iteration:Iteration=187, var_wn=0.354057
-fixed_point_iteration:Iteration=188, var_wn=0.354046
-fixed_point_iteration:Iteration=189, var_wn=0.354034
-fixed_point_iteration:Iteration=190, var_wn=0.354024
-fixed_point_iteration:Iteration=191, var_wn=0.354013
-fixed_point_iteration:Iteration=192, var_wn=0.354003
-fixed_point_iteration:Iteration=193, var_wn=0.353994
-WN variance = 0.353994
-Input file                     = /home/igankevich/workspace/phd-diss/config/standing_wave.arma
-ACF grid size                  = (10,10,10)
-ACF grid patch size            = (0.277778,0.555556,0.555556)
-Output grid size               = (200,40,40)
-Output grid patch size         = (1,1,1)
-AR order                       = (7,7,7)
-Do least squares               = 0
-ACF function                   = standing_wave
-Model                          = AR
-MA algorithm                   = fixed_point_iteration
-Verification scheme            = manual
-ACF variance = 5
-WN variance = 0.00261323
-Zeta size = (193,33,33)
-NaN: 29, -nan, 1.798e+36, -1.04284e+38, inf, -1.798e+36, -1.798e+36
-#+end_example
-
-* Introduction
-**** Topic relevance.
-Software programmes, which simulate vessel behaviour in sea waves, are widely
-used to model ship motion, estimate impact of external forces on floating
-platform or other marine object, and estimate capsize probability under given
-weather conditions; however, to model ocean waves most of the simulation codes
-use linear wave theory\nbsp{}cite:shin2003nonlinear,van2007forensic,kat2001prediction,van2002development, in
-the framework of which it is difficult to reproduce certain peculiarities of
-wind wave climate. Among them are transition between normal and storm weather,
-and sea composed of multiple wave systems\nbsp{}--- both wind waves and swell\nbsp{}---
-heading from multiple directions. Another shortcoming of linear wave theory is
-an assumption, that wave amplitude is small compared to wave length. This makes
-calculations imprecise when modelling ship motion in irregular waves, for which
-the assumption does not hold. So, studying new and more advanced models and
-methods for ocean simulation software may increase number of its application
-scenarios and foster a study of ship motion in extreme conditions in particular.
-
-**** State-of-the-art.
-Autoregressive moving average (ARMA) model emerged in response to difficulties
-encountered by practitioners who used wave simulation models developed in the
-framework of linear wave theory. The problems they have encountered with
-Longuet---Higgins model (a model which is entirely based on linear wave theory)
-can be summarised as the following.
-1. /Periodicity/. Linear wave theory approximates waves by a sum of harmonics,
-   so period of the whole wavy surface realisation depends on the number of
-   harmonics in the model. The more realisation size is, the more coefficients
-   are required to eliminate periodicity, therefore, generation time grows
-   non-linearly with realisation size. This in turn results in overall low
-   efficiency of any model based on this theory, no matter how optimised the
-   software implementation is.
-2. /Linearity/. Linear wave theory gives mathematical definition for ocean waves
-   which have small amplitudes compared to their lengths. Waves of this type
-   occur mostly in the open ocean, so near-shore waves as well as storm waves, for
-   which this assumption does not hold, are not perfectly captured by linear
-   theory.
-3. /Probabilistic convergence/. Phase of a wave, which is often generated by
-   pseudo random number generator (PRNG), has uniform distribution, and this
-   makes wavy surface characteristics (average wave height, wave period, wave
-   length etc.) sometimes converge slowly to the desired values. Convergence
-   rate depends on the values generated by PRNG, so high convergence rate is not
-   guaranteed.
-
-These difficulties became a starting point in search for a new model which is
-not based on linear wave theory. ARMA process studies were found to have all the
-required mathematical apparatus.
-1. ARMA process takes auto-covariance function (ACF) as an input parameter, and
-   this function can be directly obtained from wave energy or
-   frequency-directional spectrum (which is the input for Longuet---Higgins
-   model). So, inputs for one model can easily be converted to inputs for the other.
-2. There is no small-amplitude waves assumption. Wave may have any amplitude,
-   and can be generated as steep as it is possible with real ocean wave ACF.
-3. Period of the realisation equals the period of PRNG, so generation time grows
-   linearly with the realisation size.
-4. White noise\nbsp{}--- the only probabilistic term in ARMA process\nbsp{}--- has
-   Gaussian distribution; so, convergence rate is not probabilistic.
-
-**** Goals and objectives.
-ARMA process became the basis for ARMA ocean simulation model, however, there
-was still much work to be done to make it useful in practice.
-1. One has to investigate how different ACF shapes affect the choice of ARMA
-   parameters (the number of moving average and autoregressive processes
-   coefficients).
-2. Then, investigate a possibility to generate waves of arbitrary profile, not
-   only cosines (which means taking into account asymmetric distribution of wavy
-   surface elevation).
-3. Then, derive formulae to determine pressure field under wavy surface.
-   Usually, such formulae are derived for a particular model by substituting
-   wave profile into the eq. eqref:eq-problem, however, ARMA process does not
-   provide explicit wave profile formula, so this problem had to be solved for
-   general wavy surface (which is not defined by an analytic formula),
-   without linearisation of boundaries and assumption of small-amplitude waves.
-4. Finally, verify wavy surface integral characteristics to match the ones of
-   real ocean waves.
-5. In the final stage, develop software programme that implements ARMA model and
-   pressure calculation method, and allows to run simulations on both shared
-   memory (SMP) and distributed memory (MPP) computer systems.
-
-**** Scientific novelty.
-ARMA model, as opposed to other ocean simulation models, does not use linear
-wave theory. This makes it capable of
-- generating waves with arbitrary amplitudes by adjusting wave steepness via
-  ACF;
-- generating waves with arbitrary profiles by adjusting asymmetry of wave
-  elevation distribution via non-linear inertia-less transform (NIT).
-This makes it possible to use ARMA process to model transition between normal
-and storm weather taking into account climate spectra and assimilation data of a
-particular ocean region, which is not possible with models based on linear wave
-theory.
-
-**** Theoretical and practical significance.
-Implementing ARMA model, that does not use assumptions of linear wave theory,
-will increase quality of ship motion and marine object behaviour simulation
-software.
-
-1. Since pressure field formula is derived for discrete wavy surface and without
-   assumptions about wave amplitudes, it is applicable to any wavy surface of
-   incompressible inviscid fluid (in particular, it is applicable to wavy
-   surface generated by LH model). This allows to use pressure field formula
-   without being tied to ARMA model.
-2. From computational point of view this formula is more efficient than the
-   corresponding formula for LH model, because integrals in it are reduced to
-   Fourier transforms, for which there is fast Fourier transform (FFT) family of
-   algorithms, optimised for different processor architectures.
-3. Since the formula is explicit, there is no need in data exchange between
-   parallel processes, which allows to achieve high scalability on computer
-   clusters.
-4. Finally, ARMA model is itself more efficient than LH model due to absence of
-   trigonometric functions in its formula: In fact, wavy surface is computed as
-   a sum of large number of polynomials, for which there is low-level assembly
-   instruction (Fused Multiply-Add) giving native performance on CPUs.
-
-**** Methodology and research methods.
-Software implementation of ARMA model and pressure field formula was created
-incrementally: a prototype written in high-level engineering language\nbsp{}cite:mathematica10,octave2015 was rewritten in lower level language (C++).
-Implementation of the same algorithm and formulae in languages of varying
-levels (which involves usage of different abstractions and language primitives)
-allows to correct errors, which would be left unnoticed otherwise. Wavy surface,
-generated by ARMA model, as well as all input parameters (ACF, distribution of
-wave elevation etc.) were inspected via graphical means built into the
-programming language allowing visual control of programme correctness.
-
-**** Theses for the defence.
-- Wind wave model which allows to generate wavy surface realisations with large
-  period and consisting of waves of arbitrary amplitudes;
-- Pressure field formulae derived for this model without assumptions of linear
-  wave theory;
-- Software implementation of the model and the formula for shared memory (SMP)
-  and distributed memory (MPP) systems.
-
-**** Results verification and approbation.
-ARMA model is verified by comparing generated wavy surface integral
-characteristics (distribution of wave elevation, wave heights and lengths etc.)
-to the ones of real ocean waves. Pressure field formula is derived in
-Mathematica language, where resulting formulae are verified by built-in
-graphical means.
-
-ARMA model and pressure field formula were incorporated into Large Amplitude
-Motion Programme (LAMP)\nbsp{}--- a ship motion simulation software programme\nbsp{}---
-where they were compared to previously used LH model. Preliminary numerical
-experiments showed higher computational efficiency of ARMA model.
-
-* Problem statement
-The aim of the study reported here is to investigate possibilities of applying
-ARMA process mathematical apparatus to ocean wave modelling and to derive formula
-for pressure field under generated wavy surface without assumptions of linear
-wave theory.
-- In case of small-amplitude waves resulting formula must correspond to the
-  one from linear wave theory; in all other cases the formula must not diverge.
-- Integral characteristics of generated wavy surface must match the ones of real
-  ocean waves.
-- Software implementation of ARMA model and pressure field formula must work on
-  shared memory (SMP) and distributed memory (MPP) systems.
-
-**** Pressure field formula.
-The problem of finding pressure field under wavy sea surface represents inverse
-problem of hydrodynamics for incompressible inviscid fluid. System of equations
-for it in general case is written as\nbsp{}cite:kochin1966theoretical
-\begin{align}
-    & \nabla^2\phi = 0,\nonumber\\
-    & \phi_t+\frac{1}{2} |\vec{\upsilon}|^2 + g\zeta=-\frac{p}{\rho}, & \text{on }z=\zeta(x,y,t),\label{eq-problem}\\
-    & D\zeta = \nabla \phi \cdot \vec{n}, & \text{on }z=\zeta(x,y,t),\nonumber
-\end{align}
-where \(\phi\)\nbsp{}--- velocity potential, \(\zeta\)\nbsp{}--- elevation (\(z\) coordinate)
-of wavy surface, \(p\)\nbsp{}--- wave pressure, \(\rho\)\nbsp{}--- fluid density,
-\(\vec{\upsilon}=(\phi_x,\phi_y,\phi_z)\)\nbsp{}--- velocity vector, \(g\)\nbsp{}---
-acceleration of gravity, and \(D\)\nbsp{}--- substantial (Lagrange) derivative. The
-first equation is called continuity (Laplace) equation, the second one is the
-conservation of momentum law (the so called dynamic boundary condition); the
-third one is kinematic boundary condition for free wavy surface, which states
-that rate of change of wavy surface elevation (\(D\zeta\)) equals to the change of
-velocity potential derivative along the wavy surface normal
-(\(\nabla\phi\cdot\vec{n}\)).
-
-Inverse problem of hydrodynamics consists in solving this system of equations
-for \(\phi\). In this formulation dynamic boundary condition becomes explicit
-formula to determine pressure field using velocity potential derivatives
-obtained from the remaining equations. So, from mathematical point of view
-inverse problem of hydrodynamics reduces to Laplace equation with mixed boundary
-condition\nbsp{}--- Robin problem.
-
-* Related work
-** Ocean wave models analysis
-Pressure computation is only possible when the shape of wavy surface is known.
-It is defined either at discrete grid points, or continuously via some analytic
-formula. As will be shown in section [[#linearisation]], such formula may simplify
-pressure computation by effectively reducing the task to pressure field
-generation, instead of wavy surface generation.
-
-**** Longuet---Higgins model.
-The simplest model, formula of which is derived in the framework of linear wave
-theory (see\nbsp{}section\nbsp{}[[#longuet-higgins-derivation]]), is
-Longuet---Higgins (LH) model\nbsp{}cite:longuet1957statistical. In-depth
-comparative analysis of this model and ARMA model is done
-in\nbsp{}cite:degtyarev2011modelling,boukhanovsky1997thesis.
-
-LH model represents ocean wavy surface as a superposition of
-sine waves with random amplitudes \(c_n\) and phases \(\epsilon_n\), continuously
-distributed on interval \([0,2\pi]\). Wavy surface elevation (\(z\) coordinate) is
-defined by
-#+name: eq-longuet-higgins
-\begin{equation}
-    \zeta(x,y,t) = \sum\limits_n c_n \cos(u_n x + v_n y - \omega_n t + \epsilon_n).
-\end{equation}
-Here wave numbers \((u_n,v_n)\) are continuously distributed on plane \((u,v)\),
-i.e. area \(du \times dv\) contains infinite quantity of wave numbers. Frequency
-is related to wave numbers via dispersion relation \(\omega_n=\omega(u_n,v_n)\).
-Function \(\zeta(x,y,t)\) is a three-dimensional ergodic stationary homogeneous
-Gaussian process defined by
-\begin{equation*}
-    2E_\zeta(u,v)\, du\,  dv = \sum\limits_n c_n^2,
-\end{equation*}
-where \(E_\zeta(u,v)\)\nbsp{}--- two-dimensional wave energy spectral density.
-Coefficients \(c_n\) are derived from wave energy spectrum \(S(\omega)\) via
-\begin{equation*}
-    c_n = \sqrt{ \textstyle\int\limits_{\omega_n}^{\omega_{n+1}} S(\omega) d\omega}.
-\end{equation*}
-
-**** Disadvantages of Longuet---Higgins model.
-Although LH model is simple and easy to understand, there are shortcomings that
-appear in practice.
-
-1. The model simulates only stationary Gaussian process. This is consequence of
-   central limit theorem (CLT): sum of large number of sines with random
-   amplitudes and phases has normal distribution, no matter what spectrum is
-   used as the model input. Using lower number of coefficients may solve the
-   problem, but also make realisation period smaller. So, using LH model to
-   simulate waves with non-Gaussian distribution of elevation\nbsp{}--- a
-   distribution which real ocean waves
-   have\nbsp{}cite:huang1980experimental,рожков1996теория \nbsp{}--- is
-   impractical.
-2. From computational point of view, the deficiency of the model is non-linear
-   increase of wavy surface generation time with the increase of realisation
-   size. The larger the size of the realisation, the higher number of
-   coefficients (discrete points of frequency-directional spectrum) is needed to
-   eliminate periodicity. This makes LH model inefficient for long-time
-   simulations.
-3. Finally, there are peculiarities which make LH model unsuitable base for
-   building more advanced simulation models.
-   - In software implementation convergence rate of eq.\nbsp{}[[eq-longuet-higgins]]
-     may be low due to randomness of phases \(\epsilon_n\).
-   - It is difficult to generalise LH model for non-Gaussian processes as it
-     involves incorporating non-linear terms in eq.\nbsp{}[[eq-longuet-higgins]] for
-     which there is no known formula to determine
-     coefficients\nbsp{}cite:рожков1990вероятностные.
-
-To summarise, LH model is applicable to generating ocean wavy surface in the
-framework of linear wave theory, inefficient for long-time simulations, and
-difficult to use as a base for more advanced models.
-
-**** ARMA model
-In\nbsp{}cite:spanos1982arma ARMA model is used to generate time series spectrum of
-which is compatible with Pierson---Moskowitz (PM) approximation of ocean wave
-spectrum. The authors carry out experiments for one-dimensional AR, MA and ARMA
-models. They mention excellent agreement between target and initial spectra and
-higher performance of ARMA model compared to models based on summing large
-number of harmonic components with random phases. They also mention that in order
-to reach agreement between target and initial spectrum MA model requires fewer
-coefficients than AR model. In\nbsp{}cite:spanos1996efficient the authors
-generalise ARMA model coefficients determination formulae for multi-variate
-(vector) case.
-
-One thing that distinguishes present work with respect to afore-mentioned ones
-is the study of three-dimensional (2D in space and 1D in time) ARMA model, which
-is mostly a different problem.
-1. Yule---Walker system of equations, which are used to determine AR
-   coefficients, has complex block-block structure.
-2. Optimal model order (in a sense that target spectrum agrees with initial) is
-   determined manually.
-3. Instead of PM spectrum, analytic formulae for standing and propagating
-   waves ACF are used as the model input.
-4. Three-dimensional wavy surface should be compatible with real ocean surface
-   not only in terms of spectral characteristics, but also in the shape of wave
-   profiles. So, model verification includes distributions of various parameters
-   of generated waves (lengths, heights, periods etc.).
-Multi-dimensionality of investigated model not only complexifies the task, but
-also allows to carry out visual validation of generated wavy surface. It is the
-opportunity to visualise output of the programme that allowed to ensure that
-generated surface is compatible with real ocean surface, and is not abstract
-multi-dimensional stochastic process that is real only statistically.
-
-In\nbsp{}cite:fusco2010short AR model is used to predict swell waves to control
-wave-energy converters (WEC) in real-time. In order to make WEC more efficient
-its internal oscillator frequency should match the one of ocean waves. The
-authors treat wave elevation as time series and compare performance of AR model,
-neural networks and cyclical models in forecasting time series future values. AR
-model gives the most accurate prediction of low-frequency swell waves for up to
-two typical wave periods. It is an example of successful application of AR
-process to ocean wave modelling.
-
-** Pressure field determination formulae
-**** Small amplitude waves theory.
-In\nbsp{}cite:stab2012,детярев1998моделирование,degtyarev1997analysis the authors
-propose a solution for inverse problem of hydrodynamics of potential flow in the
-framework of small-amplitude wave theory (under assumption that wave length is
-much larger than height: \(\lambda \gg h\)). In that case inverse problem is
-linear and reduces to Laplace equation with mixed boundary conditions, and
-equation of motion is solely used to determine pressures for calculated velocity
-potential derivatives. The assumption of small amplitudes means the slow decay
-of wind wave coherence function, i.e. small change of local wave number in time
-and space compared to the wavy surface elevation (\(z\) coordinate). This
-assumption allows to calculate elevation \(z\) derivative as \(\zeta_z=k\zeta\),
-where \(k\) is wave number. In two-dimensional case the solution is written
-explicitly as
-\begin{align}
-    \left.\frac{\partial\phi}{\partial x}\right|_{x,t}= &
-        -\frac{1}{\sqrt{1+\alpha^{2}}}e^{-I(x)}
-            \int\limits_{0}^x\frac{\partial\dot{\zeta}/\partial
-                z+\alpha\dot{\alpha}}{\sqrt{1+\alpha^{2}}}e^{I(x)}dx,\label{eq-old-sol-2d}\\
-    I(x)= & \int\limits_{0}^x\frac{\partial\alpha/\partial z}{1+\alpha^{2}}dx,\nonumber
-\end{align}
-
-where \(\alpha\) is wave slope. In three-dimensional case solution is written in
-the form of elliptic partial differential equation (PDE):
-\begin{align*}
-    & \frac{\partial^2 \phi}{\partial x^2} \left( 1 + \alpha_x^2 \right) +
-    \frac{\partial^2 \phi}{\partial y^2} \left( 1 + \alpha_y^2 \right) +
-    2\alpha_x\alpha_y \frac{\partial^2 \phi}{\partial x \partial y} + \\
-    & \left(
-        \frac{\partial \alpha_x}{\partial z} +
-        \alpha_x \frac{\partial \alpha_x}{\partial x} +
-        \alpha_y \frac{\partial \alpha_x}{\partial y}
-    \right) \frac{\partial \phi}{\partial x} + \\
-    & \left(
-        \frac{\partial \alpha_y}{\partial z} +
-        \alpha_x \frac{\partial \alpha_y}{\partial x} +
-        \alpha_y \frac{\partial \alpha_y}{\partial y}
-    \right) \frac{\partial \phi}{\partial y} + \\
-    & \frac{\partial \dot{\zeta}}{\partial z} +
-    \alpha_x \dot{\alpha_x} + \alpha_y \dot{\alpha_y} = 0.
-\end{align*}
-The authors suggest transforming this equation to finite differences and solve
-it numerically.
-
-As will be shown in [[#sec:compare-formulae]], eqref:eq-old-sol-2d diverges when
-used to calculate velocity field for large-amplitude waves, and this is the
-reason that it cannot be used together with ARMA model, which generates
-arbitrary-amplitude waves.
-
-**** Linearisation of boundary condition.
-:PROPERTIES:
-:CUSTOM_ID: linearisation
-:END:
-
-LH model allows to derive an explicit formula for velocity field by linearising
-kinematic boundary condition. Velocity potential formula is written as
-\begin{equation*}
-\phi(x,y,z,t) = \sum_n \frac{c_n g}{\omega_n}
-     e^{\sqrt{u_n^2+v_n^2} z}
-     \sin(u_n x + v_n y - \omega_n t + \epsilon_n).
-\end{equation*}
-This formula is differentiated to obtain velocity potential derivatives, which
-are plugged into the dynamic boundary condition to obtain pressures.
-
-* ARMA model for ocean wave simulation
-** Governing equations for 3-dimensional ARMA process
-ARMA ocean simulation model defines ocean wavy surface as three-dimensional (two
-dimensions in space and one in time) autoregressive moving average process:
-every surface point is represented as a weighted sum of previous in time and
-space points plus weighted sum of previous in time and space normally
-distributed random impulses. The governing equation for 3-D ARMA process is
-\begin{equation}
-    \zeta_{\vec i}
-    =
-    \sum\limits_{\vec j = \vec 0}^{\vec N}
-    \Phi_{\vec j} \zeta_{\vec i - \vec j}
-    +
-    \sum\limits_{\vec j = \vec 0}^{\vec M}
-    \Theta_{\vec j} \epsilon_{\vec i - \vec j}
-    ,
-    \label{eq-arma-process}
-\end{equation}
-where \(\zeta\)\nbsp{}--- wave elevation, \(\Phi\)\nbsp{}--- AR process
-coefficients, \(\Theta\)\nbsp{}--- MA process coefficients,
-\(\epsilon\)\nbsp{}--- white noise with Gaussian distribution,
-\(\vec{N}\)\nbsp{}--- AR process order, \(\vec{M}\)\nbsp{}--- MA process order,
-and \(\Phi_{\vec{0}}\equiv{0}\), \(\Theta_{\vec{0}}\equiv{0}\). Here arrows
-denote multi-component indices with a component for each dimension. In general,
-any scalar quantity can be a component (temperature, salinity, concentration of
-some substance in water etc.). Equation parameters are AR and MA process
-coefficients and order.
-
-**** Autoregressive (AR) process.
-AR process is ARMA process with only one random impulse instead of their
-weighted sum:
-\begin{equation}
-    \zeta_{\vec i}
-    =
-    \sum\limits_{\vec j = \vec 0}^{\vec N}
-    \Phi_{\vec j} \zeta_{\vec i - \vec j}
-    +
-    \epsilon_{\vec i}
-    .
-    \label{eq-ar-process}
-\end{equation}
-The coefficients \(\Phi\) are calculated from ACF via three-dimensional
-Yule---Walker equations, which are obtained after multiplying both parts of the
-previous equation by \(\zeta_{\vec{i}-\vec{k}}\) and computing the expected value.
-Generic form of YW equations is
-\begin{equation}
-    \label{eq-yule-walker}
-    \gamma_{\vec k}
-    =
-    \sum\limits_{\vec j = \vec 0}^{\vec N}
-    \Phi_{\vec j}
-    \text{ }\gamma_{\vec{k}-\vec{j}}
-    +
-    \Var{\epsilon} \delta_{\vec{k}},
-    \qquad
-    \delta_{\vec{k}} =
-    \begin{cases}
-        1, \quad \text{if } \vec{k}=0 \\
-        0, \quad \text{if } \vec{k}\neq0,
-    \end{cases}
-\end{equation}
-where \(\gamma\)\nbsp{}--- ACF of process \(\zeta\), \(\Var{\epsilon}\)\nbsp{}--- white noise
-variance. Matrix form of three-dimensional YW equations, which is used in the
-present work, is
-\begin{equation*}
-    \Gamma
-    \left[
-        \begin{array}{l}
-            \Phi_{\vec 0}\\
-            \Phi_{0,0,1}\\
-            \vdotswithin{\Phi_{\vec 0}}\\
-            \Phi_{\vec N}
-        \end{array}
-    \right]
-    =
-    \left[
-        \begin{array}{l}
-            \gamma_{0,0,0}-\Var{\epsilon}\\
-            \gamma_{0,0,1}\\
-            \vdotswithin{\gamma_{\vec 0}}\\
-            \gamma_{\vec N}
-        \end{array}
-    \right],
-    \qquad
-    \Gamma=
-    \left[
-        \begin{array}{llll}
-            \Gamma_0 & \Gamma_1 & \cdots & \Gamma_{N_1} \\
-            \Gamma_1 & \Gamma_0 & \ddots & \vdotswithin{\Gamma_0} \\
-            \vdotswithin{\Gamma_0} & \ddots & \ddots & \Gamma_1 \\
-            \Gamma_{N_1} & \cdots & \Gamma_1 & \Gamma_0
-        \end{array}
-    \right],
-\end{equation*}
-where \(\vec N = \left( N_1, N_2, N_3 \right)\) and
-\begin{equation*}
-    \Gamma_i =
-    \left[
-    \begin{array}{llll}
-        \Gamma^0_i & \Gamma^1_i & \cdots & \Gamma^{N_2}_i \\
-        \Gamma^1_i & \Gamma^0_i & \ddots & \vdotswithin{\Gamma^0_i} \\
-        \vdotswithin{\Gamma^0_i} & \ddots & \ddots & \Gamma^1_i \\
-        \Gamma^{N_2}_i & \cdots & \Gamma^1_i & \Gamma^0_i
-    \end{array}
-    \right]
-    \qquad
-    \Gamma_i^j=
-    \left[
-    \begin{array}{llll}
-        \gamma_{i,j,0} & \gamma_{i,j,1} & \cdots & \gamma_{i,j,N_3} \\
-        \gamma_{i,j,1} & \gamma_{i,j,0} & \ddots & \vdotswithin{\gamma_{i,j,0}} \\
-        \vdotswithin{\gamma_{i,j,0}} & \ddots & \ddots & \gamma_{i,j,1} \\
-        \gamma_{i,j,N_3} & \cdots & \gamma_{i,j,1} & \gamma_{i,j,0}
-    \end{array}
-    \right],
-\end{equation*}
-Since \(\Phi_{\vec 0}\equiv0\), the first row and column of \(\Gamma\) can be
-eliminated. Matrix \(\Gamma\) is block-Toeplitz, positive definite and symmetric,
-hence the system is efficiently solved by Cholesky decomposition, which is
-particularly suitable for these types of matrices.
-
-After solving this system of equations white noise variance is estimated from
-eqref:eq-yule-walker by plugging \(\vec k = \vec 0\):
-\begin{equation*}
-    \Var{\epsilon} =
-    \Var{\zeta}
-    -
-    \sum\limits_{\vec j = \vec 0}^{\vec N}
-    \Phi_{\vec j}
-    \text{ }\gamma_{\vec{j}}.
-\end{equation*}
-
-**** Moving average (MA) process.
-MA process is ARMA process with \(\Phi\equiv0\):
-\begin{equation}
-    \zeta_{\vec i}
-    =
-    \sum\limits_{\vec j = \vec 0}^{\vec M}
-    \Theta_{\vec j} \epsilon_{\vec i - \vec j}
-    .
-    \label{eq-ma-process}
-\end{equation}
-MA coefficients \(\Theta\) are defined implicitly via the following non-linear
-system of equations:
-\begin{equation*}
-  \gamma_{\vec i} =
-	\left[
-		\displaystyle
-    \sum\limits_{\vec j = \vec i}^{\vec M}
-    \Theta_{\vec j}\Theta_{\vec j - \vec i}
-	\right]
-  \Var{\epsilon}.
-\end{equation*}
-The system is solved numerically by fixed-point iteration method via the
-following formulae
-\begin{equation*}
-  \Theta_{\vec i} =
-    -\frac{\gamma_{\vec i}}{\Var{\epsilon}}
-		+
-    \sum\limits_{\vec j = \vec i}^{\vec M}
-    \Theta_{\vec j} \Theta_{\vec j - \vec i}.
-\end{equation*}
-Here coefficients \(\Theta\) are calculated from back to front: from
-\(\vec{i}=\vec{M}\) to \(\vec{i}=\vec{0}\). White noise variance is estimated by
-\begin{equation*}
-    \Var{\epsilon} = \frac{\gamma_{\vec 0}}{
-		1
-		+
-    \sum\limits_{\vec j = \vec 0}^{\vec M}
-    \Theta_{\vec j}^2
-    }.
-\end{equation*}
-Authors of\nbsp{}cite:box1976time suggest using Newton---Raphson method to solve this
-equation with higher precision, however, this method does not work in three
-dimensions. Using slower method does not have dramatic effect on the overall
-programme performance, because the number of coefficients is small and most of
-the time is spent generating wavy surface.
-
-**** Stationarity and invertibility of AR and MA processes
-In order for modelled wavy surface to represent physical phenomena, the
-corresponding process must be stationary and invertible. If the process is
-invertible, then there is a reasonable connection of current events with the
-events in the past, and if the process is stationary, the modelled physical
-signal amplitude does not increase infinitely in time and space.
-
-AR process is always invertible, and for stationarity it is necessary for roots
-of characteristic equation
-\begin{equation*}
-1 - \Phi_{0,0,1} z - \Phi_{0,0,2} z^2
-- \cdots
-- \Phi_{\vec N} z^{N_0 N_1 N_2} = 0,
-\end{equation*}
-to lie \emph{outside} the unit circle. Here \(\vec{N}\) is AR process order
-and \(\Phi\) are coefficients.
-
-MA process is always stationary, and for invertibility it is necessary for roots
-of characteristic equation
-\begin{equation*}
-1 - \Theta_{0,0,1} z - \Theta_{0,0,2} z^2
-- \cdots
-- \Theta_{\vec M} z^{M_0 M_1 M_2} = 0,
-\end{equation*}
-to lie \emph{outside} the unit circle. Here \(\vec{M}\) is
-three-dimensional MA process order and \(\Theta\) are coefficients.
-
-Stationarity and invertibility properties are the main criteria in selection of
-the process to model different wave profiles, which are discussed in
-section\nbsp{}[[#sec-process-selection]].
-
-**** Mixed autoregressive moving average (ARMA) process.
-:PROPERTIES:
-:CUSTOM_ID: sec:how-to-mix-ARMA
-:END:
-Generally speaking, ARMA process is obtained by plugging MA generated wavy
-surface as random impulse to AR process, however, in order to get the process
-with desired ACF one should re-compute AR coefficients before plugging. There
-are several approaches to "mix" AR and MA processes.
-- The approach proposed in\nbsp{}cite:box1976time which involves dividing ACF into MA
-  and AR part along each dimension is not applicable here, because in three
-  dimensions such division is not possible: there will always be parts of the ACF
-  that are not taken into account by AR and MA process.
-- The alternative approach is to use the same (undivided) ACF for both AR and MA
-  processes but use different process order, however, then realisation
-  characteristics (mean, variance etc.) become skewed: these are characteristics
-  of the two overlapped processes.
-For the first approach there is a formula to re-compute ACF for AR process, but
-there is no such formula for the second approach. So, the best solution for now
-is to simply use AR and MA process exclusively.
-
-**** Process selection criteria for different wave profiles.
-:PROPERTIES:
-:CUSTOM_ID: sec-process-selection
-:END:
-
-One problem of ARMA model application to ocean wave generation is that for
-different types of wave profiles different processes /must/ be used: standing
-waves are modelled by AR process, and propagating waves by MA process. This
-statement comes from practice: if one tries to use the processes the other way
-round, the resulting realisation either diverges or does not correspond to real
-ocean waves. (The latter happens for non-invertible MA process, as it is always
-stationary.) So, the best way to apply ARMA model to ocean wave generation is to
-use AR process for standing waves and MA process for progressive waves.
-
-The other problem is inability to automatically determine optimal number of
-coefficients for three-dimensional AR and MA processes. For one-dimensional
-processes this can be achieved via iterative methods\nbsp{}cite:box1976time, but they
-diverge in three-dimensional case.
-
-The final problem, which is discussed in [[#sec:how-to-mix-ARMA]], is inability to
-"mix" AR and MA process in three dimensions.
-
-In practice some statements made for AR and MA processes in\nbsp{}cite:box1976time
-should be flipped for three-dimensional case. For example, the authors say that
-ACF of MA process cuts at \(q\) and ACF of AR process decays to nought infinitely,
-but in practice making ACF of 3-dimensional MA process not decay results in it
-being non-invertible and producing realisation that does not look like real
-ocean waves, whereas doing the same for ACF of AR process results in stationary
-process and adequate realisation. Also, the authors say that one
-should allocate the first \(q\) points of ACF to MA process (as it is often needed to
-describe the peaks in ACF) and leave the remaining points to AR process, but in
-practice in case of ACF of a propagating wave AR process is stationary only for
-the first time slice of the ACF, and the rest is left to MA process.
-
-To summarise, the only established scenario of applying ARMA model to ocean wave
-generation is to use AR process for standing waves and MA process for
-propagating waves. With new formulae for 3 dimensions a single mixed ARMA
-process might increase model precision, which is one of the objectives of the
-future research.
-
-** Modelling non-linearity of ocean waves
-ARMA model allows to model asymmetry of wave elevation distribution, i.e.
-generate ocean waves, distribution of z-coordinate of which has non-nought
-kurtosis and asymmetry. Such distribution is inherent to real ocean waves\nbsp{}cite:longuet1963nonlinear.
-
-Wave asymmetry is modelled by non-linear inertia-less transform (NIT) of
-stochastic process, however, transforming resulting wavy surface means
-transforming initial ACF. In order to alleviate this, ACF must be preliminary
-transformed as shown in\nbsp{}cite:boukhanovsky1997thesis.
-
-**** Wavy surface transformation.
-Explicit formula \(z=f(y)\) that transforms wavy surface to desired
-one-dimensional distribution \(F(z)\) is the solution of non-linear transcendental
-equation \(F(z)=\Phi(y)\), where \(\Phi(y)\)\nbsp{}--- one-dimensional Gaussian
-distribution. Since distribution of wave elevation is often given by some
-approximation based on field data, this equation is solved numerically with
-respect to \(z_k\) in each grid point \(y_k|_{k=0}^N\) of generated wavy surface. In
-this case equation is rewritten as
-\begin{equation}
-    \label{eq-distribution-transformation}
-    F(z_k)
-    =
-    \frac{1}{\sqrt{2\pi}}
-    \int\limits_0^{y_k} \exp\left[ -\frac{t^2}{2} \right] dt
-    .
-\end{equation}
-Since distribution functions are monotonic, the simplest interval halving
-(bisection) numerical method is used to solve this equation.
-
-**** Preliminary ACF transformation.
-In order to transform ACF \(\gamma_z\) of the process, it should be expanded in
-series of Hermite polynomials (Gram---Charlier series)
-\begin{equation*}
-    \gamma_z \left( \vec u \right)
-    =
-    \sum\limits_{m=0}^{\infty}
-    C_m^2 \frac{\gamma_y^m \left( \vec u \right)}{m!},
-\end{equation*}
-where
-\begin{equation*}
-    C_m = \frac{1}{\sqrt{2\pi}}
-  \int\limits_{-\infty}^\infty
-    f(y) H_m(y) \exp\left[ -\frac{y^2}{2} \right] dy,
-\end{equation*}
-\(H_m\)\nbsp{}--- Hermite polynomial, and \(f(y)\)\nbsp{}--- solution to equation
-eqref:eq-distribution-transformation. Plugging polynomial approximation
-\(f(y)\approx\sum\limits_{i}d_{i}y^i\) and analytic formulae for Hermite
-polynomial yields
-\begin{equation*}
-    \frac{1}{\sqrt{2\pi}}
-    \int\limits_{-\infty}^\infty
-    y^k \exp\left[ -\frac{y^2}{2} \right] dy
-    =
-    \begin{cases}
-        (k-1)!! & \text{if }k\text{ is even},\\
-        0       & \text{if }k\text{ is odd},
-    \end{cases}
-\end{equation*}
-which simplifies the former equation. Optimal number of coefficients \(C_m\) is
-determined by computing them sequentially and stopping when variances of both
-fields become equal with desired accuracy \(\epsilon\):
-\begin{equation*}
-    \left| \Var{z} - \sum\limits_{k=0}^m
-    \frac{C_k^2}{k!} \right| \leq \epsilon.
-\end{equation*}
-
-In\nbsp{}cite:boukhanovsky1997thesis the author suggests using polynomial
-approximation \(f(y)\) also for wavy surface transformation, however, in practice
-ocean surface realisation often contains points, where z-coordinate is beyond
-the limits of the approximation, which makes solution wrong. In these points it
-is more efficient to solve equation eqref:eq-distribution-transformation by
-bisection method. Using the same approximation in Gram---Charlier series does
-not lead to such errors.
-
-** Determining wave pressures for discretely given wavy surface
-Analytic solutions to boundary problems in classical equations are often used to
-study different properties of the solution, and for that purpose general
-solution formula is too difficult to study, as it contains integrals of unknown
-functions. Fourier method is one of the methods to find analytic solutions to
-PDE. It is based on application of Fourier transform to each part of PDE, which
-reduces the equation to algebraic, and the solution is written as inverse
-Fourier transform of some function (which may contain Fourier transforms of
-other functions). Since it is not possible to write analytic forms of these
-Fourier transforms in all cases, unique solutions are found and their behaviour
-is studied in different domains instead. At the same time, computing discrete
-Fourier transforms on the computer is possible for any discretely defined
-function and efficient when using FFT algorithms. These algorithms use symmetry
-of complex exponentials to decrease asymptotic complexity from
-\(\mathcal{O}(n^2)\) to \(\mathcal{O}(n\log_{2}n)\). So, even if general solution
-contains Fourier transforms of unknown functions, they still can be computed
-numerically, and FFT family of algorithms makes this approach efficient.
-
-Alternative approach to solve PDE is to reduce it to difference equations, which
-are solved by constructing various numerical schemes. This approach leads to
-approximate solution, and asymptotic complexity of corresponding algorithms is
-comparable to that of FFT. For example, stationary elliptic PDE transforms to
-implicit numerical scheme which is solved by iterative method on each step of
-which a tridiagonal or five-diagonal system of algebraic equations is solved by
-Thomas algorithm. Asymptotic complexity of this approach is
-\(\mathcal{O}({n}{m})\), where \(n\)\nbsp{}--- number of wavy surface grid points, \(m\)\nbsp{}---
-number of iterations. Despite their wide spread, iterative algorithms are
-inefficient on parallel computer architectures; in particular, their mapping to
-co-processors may involve copying data in and out of the co-processor in each
-iteration, which negatively affects their performance. At the same time, high
-number of Fourier transforms in the solution is an advantage, rather than a
-disadvantage. First, solutions obtained by Fourier method are explicit, hence
-their implementations scale with the large number of parallel computer cores.
-Second, there are implementations of FFT optimised for different processor
-architectures as well as co-processors (GPU, MIC) which makes it easy to get
-high performance on any computing platform. These advantages substantiate the
-choice of Fourier method to obtain explicit analytic solution to the problem of
-determining pressures under wavy ocean surface.
-
-*** Two-dimensional velocity field
-:PROPERTIES:
-:CUSTOM_ID: sec:pressure-2d
-:END:
-**** Formula for infinite depth fluid.
-Two-dimensional Laplace equation with Robin boundary condition is written as
-\begin{align}
-    \label{eq-problem-2d}
-    & \phi_{xx}+\phi_{zz}=0,\\
-    & \zeta_t + \zeta_x\phi_x = \frac{\zeta_x}{\sqrt{1 + \zeta_x^2}} \phi_x - \phi_z, & \text{on }z=\zeta(x,t).\nonumber
-\end{align}
-Use Fourier method to solve this problem. Applying Fourier transform to both
-sides of the equation yields
-\begin{equation*}
-    -4 \pi^2 \left( u^2 + v^2 \right)
-    \FourierY{\phi(x,z)}{u,v} = 0,
-\end{equation*}
-hence \(v = \pm i u\). Hereinafter we use the following symmetric form of Fourier
-transform:
-\begin{equation*}
-    \FourierY{f(x,y)}{u,v} =
-    \iint\limits_{-\infty}^{\phantom{--}\infty}
-    f(x,y)
-    e^{-2\pi i (x u + y v)}
-    dx dy.
-\end{equation*}
-We seek solution in the form of inverse Fourier transform
-\(\phi(x,z)=\InverseFourierY{E(u,v)}{x,z}\). Plugging[fn::\(v={-i}{u}\) is not
-applicable because velocity potential must go to nought when depth goes to
-infinity.] \(v={i}{u}\) into the formula yields
-\begin{equation}
-    \label{eq-guessed-sol-2d}
-    \phi(x,z) = \InverseFourierY{e^{2\pi u z}E(u)}{x}.
-\end{equation}
-In order to make substitution \(z=\zeta(x,t)\) not interfere with Fourier
-transforms, we rewrite eqref:eq-guessed-sol-2d as a convolution:
-\begin{equation*}
-    \phi(x,z)
-    =
-    \Fun{z}
-    \ast
-    \InverseFourierY{E(u)}{x},
-\end{equation*}
-where \(\Fun{z}\)\nbsp{}--- a function, form of which is defined in section
-[[#sec:compute-delta]] and which satisfies equation
-\(\FourierY{\Fun{z}}{u}=e^{2\pi{u}{z}}\). Plugging formula \(\phi\) into the boundary
-condition yields
-\begin{equation*}
-    \zeta_t
-    =
-    \left( i f(x) - 1 \right)
-    \left[
-        \Fun{z}
-        \ast
-        \InverseFourierY{2\pi u E(u)}{x}
-    \right],
-\end{equation*}
-where \(f(x)={\zeta_x}/{\sqrt{1+\zeta_x^2}}-\zeta_x\). Applying Fourier transform
-to both sides of this equation yields formula for coefficients \(E\):
-\begin{equation*}
-    E(u) =
-    \frac{1}{2\pi u}
-    \frac{
-    \FourierY{\zeta_t / \left(i f(x) - 1\right)}{u}
-    }{
-    \FourierY{\Fun{z}}{u}
-    }
-\end{equation*}
-Finally, substituting \(z\) for \(\zeta(x,t)\) and plugging resulting equation into
-eqref:eq-guessed-sol-2d yields formula for \(\phi(x,z)\):
-\begin{equation}
-    \label{eq-solution-2d}
-    \boxed{
-        \phi(x,z)
-        =
-        \InverseFourierY{
-            \frac{e^{2\pi u z}}{2\pi u}
-            \frac{
-            \FourierY{ \zeta_t / \left(i f(x) - 1\right) }{u}
-            }{
-            \FourierY{ \Fun{\zeta(x,t)} }{u}
-            }
-        }{x}.
-    }
-\end{equation}
-
-Multiplier \(e^{2\pi{u}{z}}/(2\pi{u})\) makes the graph of the function to which
-Fourier transform is applied asymmetric with respect to \(OY\) axis. This makes
-it difficult to apply FFT which expects periodic function with nought on both
-ends of the interval. Using numerical integration instead of FFT is not faster
-than solving the initial system of equations with numerical schemes. This
-problem is alleviated by using formula eqref:eq-solution-2d-full for finite
-depth fluid with deliberately large depth \(h\). This formula is derived in the
-following section.
-
-**** Formula for finite depth fluid.
-On the sea bottom vertical fluid velocity component equals nought: \(\phi_z=0\) on
-\(z=-h\), where \(h\)\nbsp{}--- water depth. In this case equation \(v=-{i}{u}\), which came
-from Laplace equation, can not be neglected, hence the solution is sought in the
-following form:
-\begin{equation}
-    \phi(x,z)
-    =
-    \InverseFourierY{
-        \left( C_1 e^{2\pi u z} + C_2 e^{-2\pi u z} \right)
-        E(u)
-    }{x}.
-    \label{eq-guessed-sol-2d-full}
-\end{equation}
-Plugging \(\phi\) into the boundary condition on the sea bottom yields
-\begin{equation*}
-    C_1 e^{-2\pi u h} - C_2 e^{2\pi u h} = 0,
-\end{equation*}
-hence \(C_1=\frac{1}{2}C{e}^{2\pi{u}{h}}\) and
-\(C_2=-\frac{1}{2}C{e}^{-2\pi{u}{h}}\). Constant \(C\) may take arbitrary value
-here, because after plugging it becomes part of unknown coefficients \(E(u)\).
-Plugging formulae for \(C_1\) and \(C_2\) into eqref:eq-guessed-sol-2d-full yields
-\begin{equation*}
-    \phi(x,z) = \InverseFourierY{ \Sinh{2\pi u (z+h)} E(u) }{x}.
-\end{equation*}
-Plugging \(\phi\) into the boundary condition on the free surface yields
-\begin{equation*}
-    \zeta_t = f(x) \InverseFourierY{ 2\pi i u \Sinh{2\pi u (z+h)} E(u) }{x}
-            - \InverseFourierY{ 2\pi u \SinhX{2\pi u (z+h)} E(u) }{x}.
-\end{equation*}
-Here \(\sinh\) and \(\cosh\) give similar results near free surface, and since this
-is the main area of interest in practical applications, we assume that
-\(\Sinh{2\pi{u}(z+h)}\approx\SinhX{2\pi{u}(z+h)}\). Performing analogous to the
-previous section transformations yields final formula for \(\phi(x,z)\):
-\begin{equation}
-\boxed{
-    \phi(x,z,t)
-    =
-  \InverseFourierY{
-        \frac{\Sinh{2\pi u (z+h)}}{2\pi u}
-        \frac{
-            \FourierY{ \zeta_t / \left(i f(x) - 1\right) }{u}
-        }{
-            \FourierY{ \FunSecond{\zeta(x,t)} }{u}
-        }
-    }{x},
-}
-    \label{eq-solution-2d-full}
-\end{equation}
-where \(\FunSecond{z}\)\nbsp{}--- a function, form of which is defined in section
-[[#sec:compute-delta]] and which satisfies equation
-\(\FourierY{\FunSecond{z}}{u}=\Sinh{2\pi{u}{z}}\).
-
-**** Reducing to the formulae from linear wave theory.
-Check the validity of derived formulae by substituting \(\zeta(x,t)\) with known
-analytic formula for plane waves. Symbolic computations of Fourier transforms in
-this section were performed in Mathematica\nbsp{}cite:mathematica10. In the framework
-of linear wave theory assume that waves have small amplitude compared to their
-lengths, which allows us to simplify initial system of equations
-eqref:eq-problem-2d to
-\begin{align*}
-    & \phi_{xx}+\phi_{zz}=0,\\
-    & \zeta_t = -\phi_z & \text{on }z=\zeta(x,t),
-\end{align*}
-solution to which is written as
-\begin{equation*}
-    \phi(x,z,t)
-    =
-    -\InverseFourierY{
-        \frac{e^{2\pi u z}}{2\pi u}
-        \FourierY{\zeta_t}{u}
-    }{x}
-    .
-\end{equation*}
-Propagating wave profile is defined as \(\zeta(x,t)=A\cos(2\pi(kx-t))\). Plugging
-this formula into eqref:eq-solution-2d yields
-\(\phi(x,z,t)=-\frac{A}{k}\sin(2\pi(kx-t))\Sinh{2\pi{k}{z}}\). In order to reduce
-it to the formula from linear wave theory, rewrite hyperbolic sine in
-exponential form, discard the term containing \(e^{-2\pi{k}{z}}\) as contradicting
-condition \(\phi\underset{z\rightarrow-\infty}{\longrightarrow}0\). Taking real
-part of the resulting formula yields
-\(\phi(x,z,t)=\frac{A}{k}e^{2\pi{k}{z}}\sin(2\pi(kx-t))\), which corresponds to
-the known formula from linear wave theory. Similarly, under small-amplitude
-waves assumption the formula for finite depth fluid eqref:eq-solution-2d-full is
-reduced to
-\begin{equation*}
-    \phi(x,z,t)
-    =
-    -\InverseFourierY{
-        \frac{\Sinh{2\pi u (z+h)}}{2\pi u \Sinh{2\pi u h}}
-        \FourierY{\zeta_t}{u}
-    }{x}.
-\end{equation*}
-Substituting \(\zeta(x,t)\) with propagating plane wave profile formula yields
-\begin{equation}
-    \label{eq-solution-2d-linear}
-    \phi(x,z,t)=\frac{A}{k}
-    \frac{\Sinh{2 \pi k (z+h)}}{ \Sinh{2 \pi k h} }
-    \sin(2 \pi (k x-t)),
-\end{equation}
-which corresponds to the formula from linear wave theory for finite depth fluid.
-
-Different forms of Laplace equation solutions, in which decaying exponent is
-written with either "+" or "-" signs, may cause incompatibilities between
-formulae from linear wave theory and formulae derived in this work, where
-\(\sinh\) is used instead of \(\cosh\). Equality
-\(\frac{\Sinh{2\pi{k}(z+h)}}{\Sinh{2\pi{k}{h}}}\approx\frac{\sinh(2\pi{k}(z+h))}{\sinh(2\pi{k}{h})}\)
-becomes strict on the free surface, and difference between left-hand and
-right-hand sides increases when approaching sea bottom (for sufficiently large
-depth difference near free surface is negligible). So, for sufficiently large
-depth any function (\(\cosh\) or \(\sinh\)) may be used for velocity potential
-computation near free surface.
-
-Reducing eqref:eq-solution-2d and eqref:eq-solution-2d-full to the known formulae
-from linear wave theory shows that formula for infinite depth
-eqref:eq-solution-2d is not suitable to compute velocity potentials with Fourier
-method, because it does not have symmetry, which is required for Fourier
-transform. However, formula for finite depth can be used instead by setting \(h\)
-to some characteristic water depth. For standing wave reducing to linear wave
-theory formulae is made under the same assumptions.
-
-*** Three-dimensional velocity field
-Three-dimensional version of eqref:eq-problem is written as
-\begin{align}
-    \label{eq-problem-3d}
-    & \phi_{xx} + \phi_{yy} + \phi_{zz} = 0,\\
-    & \zeta_t + \zeta_x\phi_x + \zeta_y\phi_y
-    =
-    \frac{\zeta_x}{\sqrt{1 + \zeta_x^2}} \phi_x
-    +\frac{\zeta_y}{\sqrt{\vphantom{\zeta_x^2}\smash[b]{1 + \zeta_y^2}}} \phi_y
-    - \phi_z, & \text{on }z=\zeta(x,y,t).\nonumber
-\end{align}
-Again, use Fourier method to solve it. Applying Fourier transform to both sides
-of Laplace equation yields
-\begin{equation*}
-    -4 \pi^2 \left( u^2 + v^2 + w^2 \right)
-    \FourierY{\phi(x,y,z)}{u,v,w} = 0,
-\end{equation*}
-hence \(w=\pm{i}\sqrt{u^2+v^2}\). We seek solution in the form of inverse Fourier
-transform \(\phi(x,y,z)=\InverseFourierY{E(u,v,w)}{x,y,z}\). Plugging
-\(w=i\sqrt{u^2+v^2}\) into the formula yields
-\begin{equation*}
-    \phi(x,y,z) = \InverseFourierY{
-        \left(
-            C_1 e^{2\pi \sqrt{u^2+v^2} z}
-            -C_2 e^{-2\pi \sqrt{u^2+v^2} z}
-        \right)
-        E(u,v)
-    }{x,y}.
-\end{equation*}
-Plugging \(\phi\) into the boundary condition on the sea bottom (analogous to
-two-dimensional case) yields
-\begin{equation}
-    \label{eq-guessed-sol-3d}
-    \phi(x,y,z) = \InverseFourierY{
-        \Sinh{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)
-    }{x,y}.
-\end{equation}
-Plugging \(\phi\) into the boundary condition on the free surface yields
-\begin{equation*}
-    \arraycolsep=1.4pt
-    \begin{array}{rl}
-        \zeta_t = & i f_1(x,y) \InverseFourierY{2 \pi u \Sinh{2\pi \sqrt{u^2+v^2} (z+h)}E(u,v)}{x,y} \\
-        + & i f_2(x,y) \InverseFourierY{2 \pi v \Sinh{2\pi \sqrt{u^2+v^2} (z+h)}E(u,v)}{x,y} \\
-        - & \InverseFourierY{2 \pi \sqrt{u^2+v^2} \SinhX{2\pi \sqrt{u^2+v^2} (z+h)}E(u,v)}{x,y}
-    \end{array}
-\end{equation*}
-where \(f_1(x,y)={\zeta_x}/{\sqrt{1+\zeta_x^2}}-\zeta_x\) and
-\(f_2(x,y)={\zeta_y}/{\sqrt{\vphantom{\zeta_x^2}\smash[b]{1+\zeta_y^2}}}-\zeta_y\).
-Applying Fourier transform to both sides of the equation yields formula for
-coefficients \(E\):
-\begin{equation*}
-    \arraycolsep=1.4pt
-    \begin{array}{rl}
-        \FourierY{\zeta_t}{u,v} = &
-        \FourierY{i f_1(x,y) \InverseFourierY{2 \pi u \Sinh{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)}{x,y}}{u,v}  \\
-        + & \FourierY{i f_2(x,y) \InverseFourierY{2 \pi v \Sinh{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)}{x,y}}{u,v}  \\
-        - & 2 \pi \sqrt{u^2+v^2} \SinhX{2\pi \sqrt{u^2+v^2} (z+h)} E(u,v)
-    \end{array}
-\end{equation*}
-Final solution is obtained after plugging \(E(u,v)\) into eqref:eq-guessed-sol-3d.
-
-* Numerical methods and experimental results
-** The shape of ACF for different types of waves
-**** Analytic method of finding the ACF.
-The straightforward way to find ACF for a given ocean wave profile is to apply
-Wiener---Khinchin theorem. According to this theorem the autocorrelation \(K\) of
-a function \(\zeta\) is given by the Fourier transform of the absolute square of
-the function:
-\begin{equation}
-  K(t) = \Fourier{\left| \zeta(t) \right|^2}.
-  \label{eq-wiener-khinchin}
-\end{equation}
-When \(\zeta\) is replaced with actual wave profile, this formula gives you
-analytic formula for the corresponding ACF.
-
-For three-dimensional wave profile (2D in space and 1D in time) analytic formula
-is a polynomial of high order and is best obtained via symbolic computation
-programme. Then for practical usage it can be approximated by superposition of
-exponentially decaying cosines (which is how ACF of a stationary ARMA process
-looks like\nbsp{}cite:box1976time).
-
-**** Empirical method of finding the ACF.
-However, for three-dimensional case there exists simpler empirical method which
-does not require sophisticated software to determine shape of the ACF. It is
-known that ACF represented by exponentially decaying cosines satisfies first
-order Stokes' equations for gravity waves\nbsp{}cite:boccotti1983wind. So, if the
-shape of the wave profile is the only concern in the simulation, then one can
-simply multiply it by a decaying exponent to get appropriate ACF. This ACF does
-not reflect other wave profile parameters, such as wave height and period, but
-opens possibility to simulate waves of a particular non-analytic shape by
-"drawing" their profile, then multiplying it by an exponent and using the
-resulting function as ACF. So, this empirical method is imprecise but offers
-simpler alternative to Wiener---Khinchin theorem approach; it is mainly useful
-to test ARMA model.
-
-**** Standing wave ACF.
-For three-dimensional plane standing wave the profile is given by
-\begin{equation}
-  \zeta(t, x, y) = A \sin (k_x x + k_y y) \sin (\sigma t).
-  \label{eq-standing-wave}
-\end{equation}
-Find ACF via analytic method. Multiplying the formula by a decaying exponent
-(because Fourier transform is defined for a function \(f\) such that
-\(f\underset{x\rightarrow\pm\infty}{\longrightarrow}0\)) yields
-\begin{equation}
-  \zeta(t, x, y) =
-  A
-  \exp\left[-\alpha (|t|+|x|+|y|) \right]
-  \sin (k_x x + k_y y) \sin (\sigma t).
-  \label{eq-decaying-standing-wave}
-\end{equation}
-Then, apply 3D Fourier transform to both sides of the equation via symbolic
-computation programme, fit the resulting polynomial to the following
-approximation:
-\begin{equation}
-  K(t,x,y) =
-  \gamma
-  \exp\left[-\alpha (|t|+|x|+|y|) \right]
-  \cos \beta t
-  \cos \left[ \beta x + \beta y \right].
-  \label{eq-standing-wave-acf}
-\end{equation}
-So, after applying Wiener---Khinchin theorem we get initial formula but with
-cosines instead of sines. This difference is important because the value of ACF
-at \((0,0,0)\) equals to the ARMA process variance, and if one used sines the
-value would be wrong.
-
-If one tries to replicate the same formula via empirical method, the usual way
-is to adapt eqref:eq-decaying-standing-wave to match eqref:eq-standing-wave-acf.
-This can be done either by changing the phase of the sine, or by substituting
-sine with cosine to move the maximum of the function to the origin of
-coordinates.
-
-**** Propagating wave ACF.
-Three-dimensional profile of plane propagating wave is given by
-\begin{equation}
-  \zeta(t, x, y) = A \cos (\sigma t + k_x x + k_y y).
-  \label{eq-propagating-wave}
-\end{equation}
-For the analytic method repeating steps from the previous two paragraphs yields
-\begin{equation}
-  K(t,x,y) =
-  \gamma
-  \exp\left[-\alpha (|t|+|x|+|y|) \right]
-  \cos\left[\beta (t+x+y) \right].
-  \label{eq-propagating-wave-acf}
-\end{equation}
-For the empirical method the wave profile is simply multiplied by a decaying
-exponent without need to adapt the maximum value of ACF (as it is required for
-standing wave).
-
-**** Comparison of studied methods.
-To summarise, the analytic method of finding ocean wave's ACF reduces to the
-following steps.
-- Make wave profile decay when approaching \(\pm\infty\) by multiplying it by
-  a decaying exponent.
-- Apply Fourier transform to the absolute square of the resulting equation using
-  symbolic computation programme.
-- Fit the resulting polynomial to the appropriate ACF approximation.
-
-Two examples in this section showed that in case of standing and propagating
-waves their decaying profiles resemble the corresponding ACFs with the exception
-that the ACF's maximum should be moved to the origin to preserve simulated
-process variance. Empirical method of finding ACF reduces to the following
-steps.
-- Make wave profile decay when approaching \(\pm\infty\) by multiplying it by
-  a decaying exponent.
-- Move maximum value of the resulting function to the origin by using
-  trigonometric identities to shift the phase.
-
-** Additional formulae, methods and algorithms for ARMA model
-:PROPERTIES:
-:CUSTOM_ID: sec:arma-algorithms
-:END:
-*** Wave elevation distribution approximation
-One of the parameters of ocean wavy surface generator is probability density
-function (PDF) of the surface elevation. This distribution is given by either
-polynomial approximation of /in situ/ data or analytic formula.
-
-**** Gram---Charlier series expansion.
-In\nbsp{}cite:huang1980experimental the authors experimentally show, that PDF of sea
-surface elevation is distinguished from normal distribution by non-nought
-kurtosis and skewness. In\nbsp{}cite:рожков1996теория the authors show, that this type
-of PDF expands in Gram---Charlier series:
-\begin{align}
-    \label{eq-skew-normal-1}
-    F(z; \gamma_1, \gamma_2) & = \phi(z)
-        - \gamma_1 \frac{\phi'''(z)}{3!}
-        + \gamma_2 \frac{\phi''''(z)}{4!} \nonumber \\
-    & =
-    \frac{1}{2} \text{erf}\left[\frac{z}{\sqrt{2}}\right]
-    -
-    \frac{e^{-\frac{z^2}{2}}}{\sqrt{2\pi}}
-    \left[
-        \frac{1}{6} \gamma_1 \left(z^2-1\right)
-        + \frac{1}{24} \gamma_2 z \left(z^2-3\right)
-    \right]
-    ,\nonumber \\
-    f(z; \gamma_1, \gamma_2) & =
-    \frac{e^{-\frac{z^2}{2}}}{\sqrt{2 \pi }}
-    \left[
-        \frac{1}{6} \gamma_1 z \left(z^2-3\right)
-        + \frac{1}{24} \gamma_2 \left(z^4-6z^2+3\right)
-        +1
-    \right],
-\end{align}
-where \(\phi(z)=\frac{1}{2}\mathrm{erf}(z/\sqrt{2})\), \(\gamma_1\)\nbsp{}--- skewness,
-\(\gamma_2\)\nbsp{}--- kurtosis, \(f\)\nbsp{}--- PDF, \(F\)\nbsp{}--- cumulative distribution function
-(CDF). According to\nbsp{}cite:рожков1990вероятностные for ocean waves skewness is
-selected from interval \(0.1\leq\gamma_1\leq{0.52}\) and kurtosis from interval
-\(0.1\leq\gamma_2\leq{0.7}\). Family of probability density functions for
-different parameters is shown in fig.\nbsp{}[[fig-skew-normal-1]].
-
-#+NAME: fig-skew-normal-1
-#+begin_src R :file build/skew-normal-1.pdf
-source(file.path("R", "common.R"))
-x <- seq(-3, 3, length.out=100)
-params <- data.frame(
-  skewness = c(0.00, 0.52, 0.00, 0.52),
-  kurtosis = c(0.00, 0.00, 0.70, 0.70),
-  linetypes = c("solid", "dashed", "dotdash", "dotted")
-)
-arma.skew_normal_1_plot(x, params)
-legend(
-  "topleft",
-  mapply(
-    function (s, k) {
-      as.expression(bquote(list(
-        gamma[1] == .(arma.fmt(s, 2)),
-        gamma[2] == .(arma.fmt(k, 2))
-      )))
-    },
-    params$skewness,
-    params$kurtosis
-  ),
-  lty = paste(params$linetypes)
-)
-#+end_src
-
-#+caption: Probability density function eqref:eq-skew-normal-1 of ocean wavy surface elevation for different values of skewness \(\gamma_1\) and kurtosis \(\gamma_2\).
-#+label: fig-skew-normal-1
-#+RESULTS: fig-skew-normal-1
-[[file:build/skew-normal-1.pdf]]
-
-**** Skew-normal distribution.
-Alternative approach is to approximate distribution of ocean wavy surface
-elevation by skew-normal distribution:
-\begin{align}
-    \label{eq-skew-normal-2}
-    F(z; \alpha) & = \frac{1}{2}
-   \mathrm{erfc}\left[-\frac{z}{\sqrt{2}}\right]-2 T(z,\alpha ), \nonumber \\
-    f(z; \alpha) & = \frac{e^{-\frac{z^2}{2}}}{\sqrt{2 \pi }}
-   \mathrm{erfc}\left[-\frac{\alpha z}{\sqrt{2}}\right],
-\end{align}
-where \(T\)\nbsp{}--- Owen \(T\)-function\nbsp{}cite:owen1956tables. Using this formula it is
-impossible to specify skewness and kurtosis separately\nbsp{}--- both values are
-adjusted via \(\alpha\) parameter. The only advantage of the formula is its
-relative computational simplicity: this function is available in some programmes
-and mathematical libraries. Its graph for different values of \(\alpha\) is shown
-in fig.\nbsp{}[[fig-skew-normal-2]].
-
-#+name: fig-skew-normal-2
-#+begin_src R :file build/skew-normal-2.pdf
-source(file.path("R", "common.R"))
-x <- seq(-3, 3, length.out=100)
-alpha <- c(0.00, 0.87, 2.25, 4.90)
-params <- data.frame(
-  alpha = alpha,
-  skewness = arma.bits.skewness_2(alpha),
-  kurtosis = arma.bits.kurtosis_2(alpha),
-  linetypes = c("solid", "dashed", "dotdash", "dotted")
-)
-arma.skew_normal_2_plot(x, params)
-legend(
-  "topleft",
-  mapply(
-    function (a, s, k) {
-      as.expression(bquote(list(
-        alpha == .(arma.fmt(a, 2)),
-        gamma[1] == .(arma.fmt(s, 2)),
-        gamma[2] == .(arma.fmt(k, 2))
-      )))
-    },
-    params$alpha,
-    params$skewness,
-    params$kurtosis
-  ),
-  lty = paste(params$linetypes)
-)
-#+end_src
-
-#+caption: Probability density function eqref:eq-skew-normal-2 of ocean wavy surface for different values of skewness coefficient \(\alpha\).
-#+label: fig-skew-normal-2
-#+RESULTS: fig-skew-normal-2
-[[file:build/skew-normal-2.pdf]]
-
-**** Evaluation.
-Equation eqref:eq-distribution-transformation with selected wave elevation
-distribution may be solved either in every point of generated wavy surface,
-which gives the most accurate results, or in every fixed grid point
-interpolating result via least-squares (LS) polynomial. In the second case
-precision is lower. For example, interpolating 12^th order polynomial on a fixed
-grid of 500 points on interval \(-5\sigma_z\leq{z}\leq{5}\sigma_z\) gives error of
-\(\approx{0.43}\cdot10^{-3}\). Increasing polynomial order leads to either numeric
-overflows during LS interpolation, or more coefficients close to nought;
-increasing the size of the grid has an insignificant effect on the result. In the
-majority of cases three Gram---Charlier series coefficients are enough to
-transform ACF; relative error without interpolation is \(10^{-5}\).
-
-*** White noise generation algorithm
-In order to eliminate periodicity from generated wavy surface, it is imperative
-to use PRNG with sufficiently large period to generate white noise. Parallel
-Mersenne Twister\nbsp{}cite:matsumoto1998mersenne with a period of \(2^{19937}-1\) is
-used as a generator in this work. It allows to produce aperiodic ocean wavy
-surface realisations in any practical usage scenarios.
-
-There is no guarantee that multiple Mersenne Twisters executed in parallel
-threads with distinct initial states produce uncorrelated pseudo-random number
-sequences, however, algorithm of dynamic creation of Mersenne Twisters\nbsp{}cite:matsumoto1998dynamic may be used to provide such guarantee. The essence of
-the algorithm is to find matrices of initial generator states, that give
-maximally uncorrelated pseudo-random number sequences when Mersenne Twisters are
-executed in parallel with these initial states. Since finding such initial
-states consumes considerable amount of processor time, vector of initial states
-is created in advance with a knowingly larger number of parallel threads and
-saved to a file, which is then read before starting white noise generation.
-
-*** Wavy surface generation algorithm
-In ARMA model value of wavy surface elevation at a particular point depends on
-previous in space and time points, as a result the so called /ramp-up interval/
-(see fig.\nbsp{}[[fig-ramp-up-interval]]), in which realisation does not correspond to
-specified ACF, forms in the beginning of the realisation. There are several
-solutions to this problem which depend on the simulation context.
-
-If realisation is used in the context of ship stability simulation without
-manoeuvring, ramp-up interval will not affect results of the simulation, because
-it is located on the border (too far away from the studied marine object). If
-ship stability with manoeuvring is studied, then the interval may be simply
-discarded from the realisation (the size of the interval approximately equals
-the number of AR coefficients in each dimension). However, this may lead to loss
-of a very large number of points, because discarding occurs for each dimension.
-Alternative approach is to generate ocean wavy surface on ramp-up interval with
-LH model and generate the rest of the realisation with ARMA model.
-
-Algorithm of wavy surface generation is data-parallel: realisation is divided
-into equal parts each of which is generated independently, however, in the
-beginning of each realisation there is ramp-up interval. To eliminate it
-/overlap-add/ method\nbsp{}cite:oppenheim1989discrete,svoboda2011efficient,pavel2013algorithms (a popular
-method in signal processing) is used. The essence of the method is to add
-another interval, size of which is equal to the ramp-up interval size, to the
-end of each part. Then wavy surface is generated in each point of each part
-(including points from the added interval), the interval at the end of part \(N\)
-is superimposed on the ramp-up interval at the beginning of the part \(N+1\), and
-values in corresponding points are added.
-
-#+name: fig-ramp-up-interval
-#+begin_src R :file build/ramp-up-interval.pdf
-source(file.path("R", "common.R"))
-arma.plot_ramp_up_interval()
-#+end_src
-
-#+caption: Ramp-up interval at the beginning of the \(OX\) axis of the realisation.
-#+label: fig-ramp-up-interval
-#+RESULTS: fig-ramp-up-interval
-[[file:build/ramp-up-interval.pdf]]
-
-*** Velocity potential normalisation formulae
-:PROPERTIES:
-:CUSTOM_ID: sec:compute-delta
-:END:
-
-In solutions eqref:eq-solution-2d and eqref:eq-solution-2d-full to
-two-dimensional pressure determination problem there are functions
-\(\Fun{z}=\InverseFourierY{e^{2\pi{u}{z}}}{x}\) and
-\(\FunSecond{z}=\InverseFourierY{\Sinh{2\pi{u}{z}}}{x}\) which have multiple
-analytic representations and are difficult to compute. Each function is a
-Fourier transform of linear combination of exponents which reduces to poorly
-defined Dirac delta function of a complex argument (see table\nbsp{}[[tab-delta-functions]]).
-The usual way of handling this type of functions is to write them as
-multiplication of Dirac delta functions of real and imaginary part, however,
-this approach does not work here, because applying inverse Fourier transform to
-this representation does not produce exponent, which severely warps the resulting
-velocity field. In order to get unique analytic definition normalisation factor
-\(1/\Sinh{2\pi{u}{h}}\) (which is also included in formula for \(E(u)\)) may be
-used. Despite the fact that normalisation allows to obtain adequate velocity
-potential field, numerical experiments show that there is little difference
-between this field and the one produced by formulae from linear wave theory, in
-which terms with \(\zeta\) are omitted.
-
-#+name: tab-delta-functions
-#+caption: Formulae for computing \(\Fun{z}\) and \(\FunSecond{z}\) from [[#sec:pressure-2d]], that use normalisation to eliminate uncertainty from definition of Dirac delta function of complex argument.
-#+attr_latex: :booktabs t
-| Function          | Without normalisation                                        | Normalised                                                                                                                             |
-|-------------------+--------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------|
-| \(\Fun{z}\)       | \(\delta (x+i z)\)                                           | \(\frac{1}{2 h}\mathrm{sech}\left(\frac{\pi  (x-i (h+z))}{2 h}\right)\)                                                                |
-| \(\FunSecond{z}\) | \(\frac{1}{2}\left[\delta (x-i z) + \delta (x+i z) \right]\) | \(\frac{1}{4 h}\left[\text{sech}\left(\frac{\pi  (x-i (h+z))}{2 h}\right)+\text{sech}\left(\frac{\pi  (x+i(h+z))}{2 h}\right)\right]\) |
-
-** ARMA model verification
-:PROPERTIES:
-:CUSTOM_ID: sec:verification
-:END:
-
-In\nbsp{}cite:degtyarev2011modelling,degtyarev2013synoptic,boukhanovsky1997thesis AR
-model the following items are verified experimentally:
-- probability distributions of different wave characteristics (wave heights,
-  lengths, crests, periods, slopes, three-dimensionality),
-- dispersion relation,
-- retention of integral characteristics for mixed wave sea state.
-In this work both AR and MA model are verified by comparing probability
-distributions of different wave characteristics.
-
-*** Verification of wavy surface integral characteristics
-In\nbsp{}cite:рожков1990вероятностные the authors show that several ocean wave
-characteristics (listed in table\nbsp{}[[tab-weibull-shape]]) have Weibull distribution,
-and wavy surface elevation has Gaussian distribution. In order to verify that
-distributions corresponding to generated realisation are correct,
-quantile-quantile plots are used (plots where analytic quantile values are used
-for \(OX\) axis and estimated quantile values for \(OY\) axis). If the estimated
-distribution matches analytic then the graph has the form of the straight line.
-Tails of the graph may diverge from the straight line, because they can not be
-reliably estimated from the finite-size realisation. Different methods of
-extracting waves from realisation produce variations in quantile function tails,
-it is probably impractical to extract every possible wave from realisation since
-they may (and often do) overlap.
-
-#+name: tab-weibull-shape
-#+caption: Values of Weibull shape parameter for different wave characteristics.
-#+attr_latex: :booktabs t
-| Characteristic       | Weibull shape (\(k\)) |
-|----------------------+---------------------|
-| Wave height          |                   2 |
-| Wave length          |                 2.3 |
-| Crest length         |                 2.3 |
-| Wave period          |                   3 |
-| Wave slope           |                 2.5 |
-| Three-dimensionality |                 2.5 |
-
-Verification was performed for standing and propagating waves. The corresponding
-ACFs and quantile-quantile plots of wave characteristics distributions are shown
-in
-fig.\nbsp{}[[propagating-wave-distributions]],\nbsp{}[[standing-wave-distributions]],\nbsp{}[[acf-slices]].
-
-#+name: propagating-wave-distributions
-#+begin_src R :file build/propagating-wave-qqplots.pdf
-source(file.path("R", "common.R"))
-par(pty="s", mfrow=c(2, 2))
-arma.qqplot_grid(
-  file.path("build", "propagating_wave"),
-  c("elevation", "heights_y", "lengths_y", "periods"),
-  c("elevation", "height Y", "length Y", "period"),
-  xlab="x",
-  ylab="y"
-)
-#+end_src
-
-#+caption: Quantile-quantile plots for propagating waves.
-#+label: propagating-wave-distributions
-#+RESULTS: propagating-wave-distributions
-[[file:build/propagating-wave-qqplots.pdf]]
-
-#+name: standing-wave-distributions
-#+begin_src R :file build/standing-wave-qqplots.pdf
-source(file.path("R", "common.R"))
-par(pty="s", mfrow=c(2, 2))
-arma.qqplot_grid(
-  file.path("build", "standing_wave"),
-  c("elevation", "heights_y", "lengths_y", "periods"),
-  c("elevation", "height Y", "length Y", "period"),
-  xlab="x",
-  ylab="y"
-)
-#+end_src
-
-#+caption: Quantile-quantile plots for standing waves.
-#+label: standing-wave-distributions
-#+RESULTS: standing-wave-distributions
-[[file:build/standing-wave-qqplots.pdf]]
-
-#+name: acf-slices
-#+header: :width 6 :height 9
-#+begin_src R :file build/acf-slices.pdf
-source(file.path("R", "common.R"))
-propagating_acf <- read.csv(file.path("build", "propagating_wave", "acf.csv"))
-standing_acf <- read.csv(file.path("build", "standing_wave", "acf.csv"))
-par(mfrow=c(5, 2), mar=c(0,0,0,0))
-for (i in seq(0, 4)) {
-  arma.wavy_plot(standing_acf, i, zlim=c(-5,5))
-  arma.wavy_plot(propagating_acf, i, zlim=c(-5,5))
-}
-#+end_src
-
-#+caption: Time slices of ACF for standing (left column) and propagating waves (right column).
-#+label: acf-slices
-#+RESULTS: acf-slices
-[[file:build/acf-slices.pdf]]
-
-Graph tails in fig.\nbsp{}[[propagating-wave-distributions]] deviate from original
-distribution for individual wave characteristics, because every wave has to be
-extracted from the resulting wavy surface to measure its length, period and
-height. There is no algorithm that guarantees correct extraction of all waves,
-because they may and often do overlap each other. Weibull distribution right tail
-represents infrequently occurring waves, so it deviates more than left tail.
-
-Correspondence rate for standing waves (fig.\nbsp{}[[standing-wave-distributions]])
-is lower for height and length, roughly the same for surface
-elevation and higher for wave period distribution tails. Lower correspondence
-degree for length and height may be attributed to the fact that Weibull
-distributions were obtained empirically for ocean waves which are typically
-propagating, and distributions may be different for standing waves. Higher
-correspondence degree for wave periods is attributed to the fact that wave
-periods of standing waves are extracted more precisely as the waves do not move
-outside simulated wavy surface region. The same correspondence degree for wave elevation
-is obtained, because this is the characteristic of the wavy surface (and
-corresponding AR or MA process) and is not affected by the type of waves.
-
-*** Verification of velocity potential fields
-:PROPERTIES:
-:CUSTOM_ID: sec:compare-formulae
-:END:
-
-Comparing obtained generic formulae eqref:eq-solution-2d and
-eqref:eq-solution-2d-full to the known formulae from linear wave theory allows
-to see the difference between velocity fields for both large and small amplitude
-waves. In general analytic formula for velocity potential is not known, even for
-plain waves, so comparison is done numerically. Taking into account conclusions
-of [[#sec:pressure-2d]], only finite depth formulae are compared.
-
-**** The difference with linear wave theory formulae.
-In order to obtain velocity potential fields, ocean wavy surface was generated
-by AR model with varying wave amplitude. In numerical implementation wave
-numbers in Fourier transforms were chosen on the interval from \(0\) to the
-maximal wave number determined numerically from the obtained wavy surface.
-Experiments were conducted for waves of both small and large amplitudes.
-
-The experiment showed that velocity potential fields produced by formula
-eqref:eq-solution-2d-full for finite depth fluid and formula
-eqref:eq-solution-2d-linear from linear wave theory are qualitatively different
-(fig.\nbsp{}[[fig-potential-field-nonlinear]]). First, velocity potential contours
-have sinusoidal shape, which is different from oval shape described by linear
-wave theory. Second, velocity potential decays more rapidly than in linear wave
-theory as getting closer to the bottom, and the region where the majority of
-wave energy is concentrated is closer to the wave crest. Similar numerical
-experiment, in which all terms of eqref:eq-solution-2d-full that are neglected
-in the framework of linear wave theory are eliminated, shows no difference (as
-much as machine precision allows) in resulting velocity potential fields.
-
-#+name: fig-potential-field-nonlinear
-#+caption: Velocity potential field of propagating wave \(\zeta(x,y,t) = \cos(2\pi x - t/2)\). Field produced by formula eqref:eq-solution-2d-full (top) and linear wave theory formula (bottom).
-#+begin_figure
-#+attr_latex: :width 0.47\textwidth
-[[file:graphics/pressure/potential-5.eps]]
-#+attr_latex: :width 0.47\textwidth
-[[file:graphics/pressure/potential-6.eps]]
-#+end_figure
-
-**** The difference with small-amplitude wave theory.
-The experiment, in which velocity fields produced numerically by different
-formulae were compared, shows that velocity fields produced by formula
-eqref:eq-solution-2d-full and eqref:eq-old-sol-2d correspond to each other for
-small-amplitude waves. Two ocean wavy surface realisations were made by AR
-model: one containing small-amplitude waves, other containing large-amplitude
-waves. Integration in formula eqref:eq-solution-2d-full was done over wave
-numbers range extracted from the generated wavy surface. For small-amplitude
-waves both formulae showed comparable results (the difference in the velocity is
-attributed to the stochastic nature of AR model), whereas for large-amplitude
-waves stable velocity field was produced only by formula
-eqref:eq-solution-2d-full (fig.\nbsp{}[[fig-velocity-field-2d]]). So, generic
-formula eqref:eq-solution-2d-full gives satisfactory results without restriction
-on wave amplitudes.
-
-#+name: fig-velocity-field-2d
-#+caption: Comparison of velocity field on the ocean wavy surface obtained by generic formula (\(u_1\)) and formula for small-amplitude waves (\(u_2\)). Velocity field for realisations containing small-amplitude (top) and large-amplitude (bottom) waves.
-#+begin_figure
-[[file:build/low-amp-nocolor.eps]]
-[[file:build/high-amp-nocolor.eps]]
-#+end_figure
-*** Non-physical nature of ARMA model
-ARMA model, owing to its non-physical nature, does not have the notion of ocean
-wave; it simulates wavy surface as a whole instead. Motions of individual waves
-and their shape are often rough, and the total number of waves can not be
-determined precisely. However, integral characteristics of wavy surface match
-the ones of real ocean waves.
-
-Theoretically, ocean waves themselves can be chosen as ACFs, the only
-pre-processing step is to make them decay exponentially. This may allow
-to generate waves of arbitrary profiles, and is one of the directions of future
-work.
-
-* High-performance software implementation of ocean wave simulation
-** Computational model
-**** Mapping wavy surface generation algorithm on computational model.
-Software implementation of ARMA model works as a computational pipeline, in
-which each joint applies some function to the output coming from the pipe of the
-previous joint. Joints are distributed across computer cluster nodes to enable
-function parallelism, and then data flowing through the joints is distributed
-across processor cores to enable data parallelism. Figure\nbsp{}[[fig-pipeline]] shows a
-diagram of data processing pipeline in which rectangles with rounded corners
-denote joints, regular rectangles denote arrays of problem domain objects
-flowing from one joint to another, and arrows show flow direction. Some joints
-are divided into /sections/ each of which processes a separate part of the array.
-If joints are connected without a /barrier/ (horizontal or vertical bar), then
-transfer of separate objects between them is done in parallel to computations,
-as they become available. Sections work in parallel on each processor core (or
-node of the cluster). There is surjective mapping between a set of processor
-cores, a set of pipeline joint sections and objects, i.e. each processor core
-may run several sections, each of which may sequentially process several
-objects, but a section can not work simultaneously on several processor cores,
-and an object can not be processed simultaneously by several sections.
-
-#+name: fig-pipeline
-#+begin_src dot :exports results :file build/pipeline.pdf
-digraph {
-
-  node [fontsize=14,margin="0.055,0"]
-  graph [nodesep="0.25",ranksep="0.25",rankdir="TB"]
-  edge [arrowsize=0.66]
-
-  # data
-  subgraph xcluster_linear {
-    label="Linear model"
-
-    start [label="",shape=circle,style=filled,fillcolor=black,width=0.23]
-    spectrum [label="S(ω,θ)",shape=box]
-    acf [label="K(i,j,k)",shape=box]
-    phi [label="Φ(i,j,k)",shape=box]
-
-    # transformations
-    fourier_transform [label="Fourier transform",shape=box,style=rounded]
-    solve_yule_walker [label="Solve Yule—Walker\nequations",shape=box,style=rounded]
-
-    subgraph cluster_nonlinear_1 {
-      label="Simulate non-linearity\l"
-      labeljust=left
-      style=filled
-      color=lightgrey
-      acf2 [label="K*(i,j,k)",shape=box]
-      transform_acf [label="Transform ACF",shape=box,style=rounded]
-    }
-  }
-
-  subgraph xcluster_linear2 {
-
-    eps_parts [label="<e1> ε₁|<e2> ε₂|<e3> …|<e4> εₙ|<e> ε(t,x,y)",shape=record]
-    end [label="",shape=doublecircle,style=filled,fillcolor=black,width=0.23]
-
-    generate_white_noise [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Generate\lwhite noise",shape=record,style=rounded]
-    generate_zeta [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Generate ocean\lwavy surface parts\l",shape=record,style=rounded]
-
-    zeta_parts [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> Non-crosslinked\lrealisation parts",shape=record]
-    overlap_add [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> Crosslink realisation\lparts\l",shape=record,style=rounded]
-
-    zeta_parts:g1->overlap_add:g1
-    zeta_parts:g2->overlap_add:g2
-    zeta_parts:g3->overlap_add:g3
-    zeta_parts:g4->overlap_add:g4
-
-    zeta_parts:g2->overlap_add:g1 [constraint=false]
-    zeta_parts:g3->overlap_add:g2 [constraint=false]
-    zeta_parts:g4->overlap_add:g3 [constraint=false]
-
-    overlap_add:g1->zeta2_parts:g1
-    overlap_add:g2->zeta2_parts:g2
-    overlap_add:g3->zeta2_parts:g3
-    overlap_add:g4->zeta2_parts:g4
-
-    zeta2_parts:g1->transform_zeta:g1->zeta3_parts:g1->write_zeta:g1->eps_end
-    zeta2_parts:g2->transform_zeta:g2->zeta3_parts:g2->write_zeta:g2->eps_end
-    zeta2_parts:g3->transform_zeta:g3->zeta3_parts:g3->write_zeta:g3->eps_end
-    zeta2_parts:g4->transform_zeta:g4->zeta3_parts:g4->write_zeta:g4->eps_end
-
-  }
-
-  subgraph part3 {
-
-    zeta2_parts [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> Wavy surface with\lGaussian distribution\l",shape=record]
-
-    subgraph cluster_nonlinear_2 {
-      label="Simulate non-linearity\r"
-      labeljust=right
-      style=filled
-      color=lightgrey
-      zeta3_parts [label="<g1> ζ₁|<g2> ζ₂|<g3> …|<g4> ζₙ|<gen> ζ(t,x,y)",shape=record]
-      transform_zeta [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Transform wavy\lsurface elevation\lprobability distribution\l",shape=record,style=rounded]
-    }
-
-    # barriers
-    eps_start [label="",shape=box,style=filled,fillcolor=black,height=0.05]
-    eps_end [label="",shape=box,style=filled,fillcolor=black,height=0.05]
-
-    write_zeta [label="<g1> g₁|<g2> g₂|<g3> …|<g4> gₙ|<gen> Write finished\lparts to a file\l",shape=record,style=rounded]
-  }
-
-  # edges
-  start->spectrum->fourier_transform->acf->transform_acf
-  transform_acf->acf2
-  acf2->solve_yule_walker
-  solve_yule_walker->phi
-  phi->eps_start [constraint=false]
-  eps_start->generate_white_noise:g1
-  eps_start->generate_white_noise:g2
-  eps_start->generate_white_noise:g3
-  eps_start->generate_white_noise:g4
-  generate_white_noise:g1->eps_parts:e1->generate_zeta:g1->zeta_parts:g1
-  generate_white_noise:g2->eps_parts:e2->generate_zeta:g2->zeta_parts:g2
-  generate_white_noise:g3->eps_parts:e3->generate_zeta:g3->zeta_parts:g3
-  generate_white_noise:g4->eps_parts:e4->generate_zeta:g4->zeta_parts:g4
-
-  eps_end->end
-}
-#+end_src
-
-#+caption: Diagram of data processing pipeline, that implements ocean wavy surface generation via AR model.
-#+label: fig-pipeline
-#+RESULTS: fig-pipeline
-[[file:build/pipeline.pdf]]
-
-Object pipeline may be seen as an improvement of BSP (Bulk Synchronous Parallel)
-model\nbsp{}cite:valiant1990bridging, which is used in graph processing\nbsp{}cite:malewicz2010pregel,seo2010hama. Pipeline eliminates global synchronisation
-(where it is possible) after each sequential computation step by doing data
-transfer between joints in parallel to computations, whereas in BSP model global
-synchronisation occurs after each step.
-
-Object pipeline speeds up the programme by parallel execution of code blocks
-that work with different compute devices: while the current part of wavy surface
-is generated by a processor, the previous part is written to a disk. This
-approach allows to get speed-up because compute devices operate asynchronously,
-and their parallel usage increases the whole programme performance.
-
-Since data transfer between pipeline joints is done in parallel to computations,
-the same pipeline may be used to run several copies of the application but with
-different parameters (generate several ocean wavy surfaces having different
-characteristics). In practice, high-performance applications do not always
-consume 100% of processor time spending a portion of time on synchronisation of
-parallel processes and writing data to disk. Using pipeline in this case allows
-to run several computations on the same set of processes, and use all of the
-computer devices at maximal efficiency. For example, when one object writes data
-to a file, the others do computations on the processor in parallel. This
-minimises downtime of the processor and other computer devices and increases
-throughput of the computer cluster.
-
-Pipelining of otherwise sequential steps is beneficial not only for code that works
-with different devices, but also for code different branches of which are suitable
-for execution by multiple hardware threads of the same processor core, i.e.
-branches accessing different memory blocks or performing mixed arithmetic
-(integer and floating point). Code branches which use different modules of
-processor are good candidates to run in parallel on a processor core with
-multiple hardware threads.
-
-So, computational model with a pipeline can be seen as /bulk-asynchronous
-model/, because of the parallel nature of programme steps. This model is the
-basis of the fault-tolerance model which will be described later.
-
-**** Software implementation.
-For efficiency reasons object pipeline and fault tolerance techniques (which
-will be described later) are implemented in the C++ framework: From the author's
-perspective C language is deemed low-level for distributed programmes, and Java
-incurs too much overhead and is not popular in HPC community. As of now, the
-framework runs in the same process as a parallel application that uses it. The
-framework is called Factory, it is now in proof-of-concept development stage.
-
-**** Computational model overview.
-The key feature that is missing in the current parallel programming technologies
-is a possibility to specify hierarchical dependencies between parallel tasks.
-When one has such dependency, it is trivial to determine which task should be
-responsible for re-executing a failed task on one of the survived nodes. To
-re-execute the task on the top of the hierarchy, a backup task is created and
-executed on a different node. There exists a number of systems that are capable
-of executing directed acyclic graphs of tasks in parallel\nbsp{}cite:acun2014charmpp,islam2012oozie, but graphs are not suitable to infer
-principal-subordinate relationship between tasks, because a node in the graph
-may have multiple parent nodes.
-
-The main purpose of the model is to simplify development of distributed batch
-processing applications and middleware. The main focus is to make application
-resilient to failures, i.e. make it fault tolerant and highly available, and do
-it transparently to a programmer. The implementation is divided into two layers:
-the lower layer consists of routines and classes for single node applications
-(with no network interactions), and the upper layer for applications that run on
-an arbitrary number of nodes. There are two kinds of tightly coupled entities in
-the model\nbsp{}--- /control flow objects/ (or /kernels/ for short) and
-/pipelines/\nbsp{}--- which are used together to compose a programme.
-
-Kernels implement control flow logic in their ~act~ and ~react~ methods and
-store the state of the current control flow branch. Both logic and state are
-implemented by a programmer. In ~act~ method some function is either directly
-computed or decomposed into nested functions (represented by a set of
-subordinate kernels) which are subsequently sent to a pipeline. In ~react~
-method subordinate kernels that returned from the pipeline are processed by
-their parent. Calls to ~act~ and ~react~ methods are asynchronous and are made
-within threads attached to a pipeline. For each kernel ~act~ is called only
-once, and for multiple kernels the calls are done in parallel to each other,
-whereas ~react~ method is called once for each subordinate kernel, and all the
-calls are made in the same thread to prevent race conditions (for different
-parent kernels different threads may be used).
-
-Pipelines implement asynchronous calls to ~act~ and ~react~, and try to make as
-many parallel calls as possible considering concurrency of the platform (no. of
-cores per node and no. of nodes in a cluster). A pipeline consists of a kernel
-pool, which contains all the subordinate kernels sent by their parents, and a
-thread pool that processes kernels in accordance with rules outlined in the
-previous paragraph. A separate pipeline is used for each device: There are
-pipelines for parallel processing, schedule-based processing (periodic and
-delayed tasks), and a proxy pipeline for processing of kernels on other cluster
-nodes (see fig.\nbsp{}[[fig-subord-ppl]]).
-
-In principle, kernels and pipelines machinery reflect the one of procedures and
-call stacks, with the advantage that kernel methods are called asynchronously
-and in parallel to each other (as much as programme logic allows). Kernel field
-is the stack, ~act~ method is a sequence of processor instructions before nested
-procedure call, and ~react~ method is a sequence of processor instructions after
-the call. Constructing and sending subordinate kernels to the pipeline is nested
-procedure call. Two methods are necessary to make calls asynchronous, and
-replace active wait for completion of subordinate kernels with passive one.
-Pipelines, in turn, allow to implement passive wait, and call correct kernel
-methods by analysing their internal state.
-
-#+name: fig-subord-ppl
-#+begin_src dot :exports results :file build/subord-ppl.pdf
-graph G {
-
-  node [fontname="Old Standard",fontsize=14,margin="0.055,0",shape=box]
-  graph [nodesep="0.25",ranksep="0.25",rankdir="LR"]
-  edge [arrowsize=0.66]
-
-  subgraph cluster_daemon {
-    label="Daemon process"
-    style=filled
-    color=lightgrey
-
-    factory [label="Factory"]
-    parallel_ppl [label="Parallel\npipeline"]
-    io_ppl [label="I/O\npipeline"]
-    sched_ppl [label="Schedule-based\npipeline"]
-    net_ppl [label="Network\npipeline"]
-    proc_ppl [label="Process\npipeline"]
-
-    upstream [label="Upstream\nthread pool"]
-    downstream [label="Downstream\nthread pool"]
-  }
-
-  factory--parallel_ppl
-  factory--io_ppl
-  factory--sched_ppl
-  factory--net_ppl
-  factory--proc_ppl
-
-  subgraph cluster_hardware {
-    label="Compute devices"
-    style=filled
-    color=lightgrey
-
-    cpu [label="CPU"]
-    core0 [label="Core 0"]
-    core1 [label="Core 1"]
-    core2 [label="Core 2"]
-    core3 [label="Core 3"]
-
-    storage [label="Storage"]
-    disk0 [label="Disk 0"]
-
-    network [label="Network"]
-    nic0 [label="NIC 0"]
-
-    timer [label="Timer"]
-
-  }
-
-  core0--cpu
-  core1--cpu
-  core2--cpu
-  core3--cpu
-
-  disk0--storage
-  nic0--network
-
-  parallel_ppl--upstream
-  parallel_ppl--downstream
-
-  upstream--{core0,core1,core2,core3} [style="dashed"]
-  downstream--core0 [style="dashed"]
-
-  io_ppl--core0 [style="dashed"]
-  io_ppl--disk0 [style="dashed"]
-  sched_ppl--core0 [style="dashed"]
-  sched_ppl--timer [style="dashed"]
-  net_ppl--core0 [style="dashed"]
-  net_ppl--nic0 [style="dashed"]
-  proc_ppl--core0 [style="dashed"]
-
-  subgraph cluster_children {
-    style=filled
-    color=white
-
-    subgraph cluster_child0 {
-      label="Child process 0"
-      style=filled
-      color=lightgrey
-      labeljust=right
-
-      app0_factory [label="Factory"]
-      app0 [label="Child process\rpipeline"]
-    }
-
-#    subgraph cluster_child1 {
-#      label="Child process 1"
-#      style=filled
-#      color=lightgrey
-#      labeljust=right
-#
-#      app1_factory [label="Factory"]
-#      app1 [label="Child process\rpipeline"]
-#    }
-  }
-
-  proc_ppl--app0
-#  proc_ppl--app1
-
-  app0_factory--app0 [constraint=false]
-#  app1_factory--app1 [constraint=false]
-
-}
-#+end_src
-
-#+caption: Mapping of parent and child process pipelines to compute devices. Solid lines denote aggregation, dashed lines denote mapping between logical and physical entities.
-#+attr_latex: :width \textwidth
-#+label: fig-subord-ppl
-#+RESULTS: fig-subord-ppl
-[[file:build/subord-ppl.pdf]]
-
-**** Governing principles.
-Data processing pipeline model is based on the following principles, following
-which maximises efficiency of a programme.
-- There is no notion of a message in the model, a kernel is itself a message
-  that can be sent over network to another node and directly access any kernel
-  on the local node. Only programme logic may guarantee the existence of the
-  kernel.
-- A kernel is a /cooperative routine/, which is submitted to kernel pool upon the
-  call and is executed asynchronously by a scheduler. There can be any number of
-  calls to other subroutines inside routine body. Every call submits
-  corresponding subroutine to kernel pool and returns immediately. Kernels in the
-  pool can be executed in any order; this fact is used by a scheduler to exploit
-  parallelism offered by the computer by distributing kernels from the pool
-  across available cluster nodes and processor cores.
-- Asynchronous execution prevents the use of explicit synchronisation after the
-  call to subroutine is made; system scheduler returns control flow to the
-  routine each time one of its subroutines returns. Such cooperation transforms
-  each routine which calls subroutines into event handler, where each event is a
-  subroutine and the handler is the routine that called them.
-- The routine may communicate with any number of local kernels, addresses of
-  which it knows; communication with kernels which are not adjacent in the call
-  stack complexifies control flow and call stack loses its tree shape. Only
-  programme logic may guarantee presence of communicating kernels in memory. One
-  way to ensure this is to perform communication between subroutines which are
-  called from the same routine. Since such communication is possible within
-  hierarchy through parent routine, it may be treated as an optimisation that
-  eliminates overhead of transferring data over intermediate node. The situation
-  is different for interactive or event-based programmes (e.g. servers and
-  programmes with graphical interface) in which this is primary type of
-  communication.
-- In addition to this, communication which does not occur along hierarchical
-  links and is executed over cluster network complexifies design of resiliency
-  algorithms, since it is impossible to ensure that a kernel resides in memory
-  of a neighbour node: a node may fail in the middle of its execution of
-  the corresponding routine. As a result, upon failure of a routine all of its
-  subroutines must be restarted. This encourages a programmer to construct
-  - deep tree hierarchies of tightly-coupled kernels (which communicate on the
-    same level of hierarchy) to reduce overhead of recomputation;
-  - fat tree hierarchies of loosely-coupled kernels, providing maximal degree of
-    parallelism.
-  Deep hierarchy is not only a requirement of technology, it also helps optimise
-  communication of large number of cluster nodes reducing it to communication of
-  adjacent nodes.
-
-So, control flow objects (or kernels) possess properties of both cooperative
-routines and event handlers.
-
-** SMP implementation
-**** Load balancing algorithm.
-The simplest approach to balance the load on a multi-processor system is to
-split data into equal parts (or a task into homogeneous subtasks) and to
-distribute them evenly between processor cores and cluster nodes, however, this
-approach does not work efficiently in all cases. First, the total number of
-parts, into which input data is split, is often dictated by the problem being
-solved, rather than computer system architecture. Such load balancing may not
-be efficient from the computer system point of view: the number of parts is either
-too large compared to the number of processors working in parallel, which
-increases data transfer overhead, or too small, which prevents using all
-available processor cores. Second, restrictions of problem being solved may not
-allow to split input data into even parts which may result in load imbalance
-across processor cores. Third, there are multiple components in the system aside
-from the processor that take part in the computation (such as vector
-co-processors and storage devices), and the problem solution time depends on the
-performance of all the components involved. So, how to make load balancing
-algorithm more efficient in the presence of non-homogeneous input data parts and
-take into account all the devices involved in the computation?
-
-The load balancing algorithm consists of two stages. In the first stage, the
-algorithm places input data part (or a subtask) wrapped in a kernel into an
-appropriate kernel pool: there is a separate pool for each device and an
-associated thread pool. In the second stage, a kernel is retrieved from the pool
-by one of the threads and processed. Due to separate thread pools all devices
-work in parallel to each other, lowering overall system resources downtime
-compared to using all devices from a single thread.
-
-In order to take into account non-homogeneous input data parts or tasks, one may
-predict execution time of each task. Relevant study is done
-in\nbsp{}cite:degtyarev2016balance and is not repeated here, since ARMA model
-implementation includes mostly homogeneous tasks.
-
-So, load balancing is done in two stages: in the first stage the task wrapped in
-the kernel is routed to the appropriate device and in the second stage the
-kernel is routed to one of the threads from the device thread pool.
-Non-homogeneous kernels may be handled by predicting their execution time, but
-such kernels are not present in ARMA model implementation.
-
-**** Performance of MPI, OpenMP, OpenCL implementations.
-ARMA model does not require highly optimised software implementation to be
-efficient, its performance is high even without use of co-processors; there are
-two main causes of that. First, ARMA model itself does not use transcendental
-functions (sines, cosines and exponents) as opposed to LH model. All
-calculations (except model coefficients) are done via polynomials, which can be
-efficiently computed on modern processors using a series of FMA instructions.
-Second, pressure computation is done via explicit analytic formula using nested
-FFTs. Since two-dimensional FFT of the same size is repeatedly applied to every
-time slice, its coefficients (complex exponents) are pre-computed for all
-slices, and computations are performed with only a few transcendental functions.
-In case of MA model, performance is also increased by doing convolution with FFT
-transforms. So, high performance of ARMA model is due to scarce use of
-transcendental functions and heavy use of FFT, not to mention that high
-convergence rate and non-existence of periodicity allows to use far fewer
-coefficients compared to LH model.
-
-ARMA implementation uses several libraries of reusable mathematical functions
-and numerical algorithms (listed in table\nbsp{}[[tab-arma-libs]]), and was implemented using
-several parallel programming technologies (MPI, OpenMP, OpenCL) to find the most
-efficient one.
-
-#+name: tab-arma-libs
-#+caption: A list of mathematical libraries used in ARMA model implementation.
-#+attr_latex: :booktabs t :align lp{0.6\linewidth}
-| Library                                                | What it is used for             |
-|--------------------------------------------------------+---------------------------------|
-| DCMT\nbsp{}cite:matsumoto1998dynamic                         | parallel PRNG                   |
-| Blitz\nbsp{}cite:veldhuizen1997will,veldhuizen2000techniques | multidimensional arrays         |
-| GSL\nbsp{}cite:gsl2008scientific                             | PDF, CDF, FFT computation       |
-|                                                        | checking process stationarity   |
-| LAPACK, GotoBLAS\nbsp{}cite:goto2008high,goto2008anatomy     | finding AR coefficients         |
-| GL, GLUT\nbsp{}cite:kilgard1996opengl                        | three-dimensional visualisation |
-
-**** Performance of load balancing algorithm.
-Software implementation of wavy surface generation is balanced in terms of the
-load on processor cores, however, as shown by tests, has high load on storage
-device. Before testing wavy surface generation was implemented using OpenMP for
-parallel computations and in order to implement load balancing algorithm was
-rewritten using POSIX threads. Performance of the two implementations was
-compared on the platform with the configuration listed in table\nbsp{}[[tab-multicore-specs]].
-
-#+name: tab-multicore-specs
-#+caption: Multi-core system configuration.
-#+attr_latex: :booktabs t
-| Component                 | Details                          |
-|---------------------------+----------------------------------|
-| Programming language      | C++11                            |
-| Threading library         | C++11 STL threads                |
-| Atomic operations library | C++11 STL atomic                 |
-| Routines to measure time  | ~clock_gettime(CLOCK_MONOTONIC)~ |
-|                           | ~/usr/bin/time -f \%e~           |
-| Compiler                  | GCC 4.8.2                        |
-| Compiler flags            | ~-std=c++11 -O2 -march=native~   |
-| Operating system          | Debian 3.2.51-1 x86_64           |
-| File system               | ext4                             |
-| Processor                 | Intel Core 2 Quad Q9650          |
-| Core frequency (GHz)      | 3.00                             |
-| No. of cores              | 4                                |
-| Amount of RAM (GB)        | 8                                |
-| Disk                      | Seagate ST3250318AS              |
-| Disk speed (rpm)          | 7200                             |
-
-The experiment consisted of running both implementations on a multi-core machine
-varying the size of the surface; the size of CPU thread pool and I/O thread pool
-was not changed during the experiment. I/O thread pool consisted of one thread,
-and CPU thread pool size was equal to the number of physical processor cores.
-
-In the experiment load balancing algorithm showed higher performance than
-implementation without it. The larger the size of the generated surface, the
-larger the gap in performance (fig.\nbsp{}[[fig-factory-performance]]), which is a
-result of overlap of computation phase and data output phase
-(fig.\nbsp{}[[fig-factory-overlap]]). In OpenMP implementation data output phase
-begins only when computation is over, whereas load balancing algorithm makes
-both phases end almost simultaneously. So, /pipelined execution of internally
-parallel sequential phases is more efficient than their sequential execution/,
-and this allows to balance the load across different devices involved in
-computation.
-
-#+name: fig-factory-performance
-#+header: :width 5 :height 4
-#+begin_src R :file build/factory-vs-openmp.pdf
-source(file.path("R", "common.R"))
-arma.plot_factory_vs_openmp(
-  xlab="Realisation size",
-  ylab="Time, s",
-  power=6
-)
-#+end_src
-
-#+caption: Performance comparison of OpenMP and Factory implementations.
-#+label: fig-factory-performance
-#+RESULTS: fig-factory-performance
-[[file:build/factory-vs-openmp.pdf]]
-
-#+name: fig-factory-overlap
-#+header: :width 7 :height 4
-#+begin_src R :file build/factory-vs-openmp-overlap.pdf
-source(file.path("R", "common.R"))
-par(mar=c(5, 6, 0, 1), pty="m")
-arma.plot_factory_vs_openmp_overlap(
-  xlab="Time, s",
-  labels=c("Factory", "OpenMP"),
-  scale=10**9
-)
-#+end_src
-
-#+caption: Overlap of parallel computations on \([G_0,G_1]\) and data output to disk on \([W_0,W_1]\). In OpenMP implementation there is no overlap.
-#+label: fig-factory-overlap
-#+RESULTS: fig-factory-overlap
-[[file:build/factory-vs-openmp-overlap.pdf]]
-
-Proposed load balancing method for multi-core systems allows to increase
-performance of applications that read or write large volumes of data to disk,
-but may be used in other cases too. The main idea of the algorithm is to
-classify the load and find the suitable device to route the load to. So, any
-devices other than disks may be used as well.
-** MPP implementation
-*** Cluster node discovery algorithm
-:PROPERTIES:
-:CUSTOM_ID: sec:node-discovery
-:END:
-
-Many distributed systems are built on the principle of /subordination/: there is
-principal node in each cluster which manages job queue, schedules their
-execution on subordinate nodes and monitors their state. Principal role is
-assigned either /statically/ by an administrator to a particular physical node,
-or /dynamically/ by electing one of the cluster nodes as principal. In the
-former case fault tolerance is provided by reserving additional spare node which
-takes principal role when current principal fails. In the latter case fault
-tolerance is provided by electing new principal node from surviving nodes.
-Despite the fact that dynamic role assignment requires specialised distributed
-algorithm, this approach becomes more and more popular as it does not require
-spare reserved nodes to recover from principal node failure.
-
-Leader election algorithms (which are sometimes referred to as /distributed
-consensus/ algorithms) are special cases of wave algorithms. In\nbsp{}cite:tel2000introduction Tel defines them as algorithms in which termination
-event is preceded by at least one event occurring in /each/ parallel process.
-Wave algorithms are not defined for anonymous networks, that is they apply only
-to processes that can uniquely define themselves. However, the number of
-processes affected by the "wave" can be determined in the course of an
-algorithm. For a distributed system this means that wave algorithms work for
-computer clusters with dynamically changing number of nodes, and the algorithm
-is unaffected by some nodes going on-line and off-line.
-
-The approach in the following work does not use wave algorithms, and hence does
-not require communicating with each node of the cluster to determine a leader.
-Instead, each node enumerates all nodes in the network it is part of, and
-converts this list to a /tree hierarchy/ with a user-defined maximal fan-out
-value (maximal number of subordinate nodes). Then the node determines its
-hierarchy level and tries to communicate with nodes from higher levels to become
-their subordinate. First, it checks the closest ones and then goes all the way
-to the top. If there are no top-level nodes or the node cannot connect to them,
-then the node itself becomes the principal of the hierarchy.
-
-Tree hierarchy of all hosts in a network defines strict total order on a set of
-cluster nodes. Although, technically any function can be chosen to map a node to
-a number, in practice this function should be sufficiently smooth along the time
-axis and may have infrequent jumps: high-frequency oscillations (which are often
-caused by measurement errors) may result in constant passing of principal role
-from one node to another, which makes the cluster unmanageable. The simplest
-such function is the position of an IP address in network IP address range.
-
-The following key features distinguish this approach with respect to some
-existing proposals\nbsp{}cite:brunekreef1996design,aguilera2001stable,romano2014design.
-- *Multi-level hierarchy.* The number of principal nodes in a network depends on
-  the fan-out value. If it is less than the number of IP-addresses in the
-  network, then there are multiple principal nodes in the cluster. If it is
-  greater than or equal to the number of IP-addresses in the network, then there is
-  only one principal node. When some node fails, multi-level hierarchy changes
-  locally, only nodes adjacent to the failed one communicate.
-- *IP-address mapping.* Since hierarchy structure solely depends on the nodes'
-  IP addresses, there is no election phase in the algorithm. To change the
-  principal each node sends a message to the old principal and to the new one.
-- *Completely event-based.* The messages are sent only when some node fails, so
-  there is no constant load on the network. Since the algorithm allows
-  to tolerate failure of sending any message, there is no need for heartbeat
-  packets indicating presence of a node in the network; instead, all messages
-  play role of heartbeats and packet send time-out is adjusted.
-- *No manual configuration.* A node does not require any prior knowledge to find
-  the principal: it determines the network it is part of, calculates potential
-  principal IP-address and sends the message. If it fails, the process is
-  repeated for the next potential principal node. So the algorithm is suitable
-  to bootstrap a cluster without manual configuration, the only requirement is
-  to start the corresponding service on each node.
-To summarise, the advantage of the algorithm is that it
-- scales to a large number of nodes by means of hierarchy with multiple
-  principals,
-- does not constantly load the network with node state updates and heartbeat
-  packets,
-- does not require manual configuration to bootstrap a cluster.
-
-The disadvantage of the algorithm is that it requires IP-address to change
-infrequently. It is not suitable for cloud environments in which node DNS name
-is preserved, but IP-address may change over time. When IP-address changes,
-current connections may close, thus triggering node "failure" and rebuilding
-node hierarchy. So, environments where nodes are not identified by IP-addresses,
-are not suitable for the algorithm.
-
-The other disadvantage is that the algorithm creates artificial dependence of
-node rank on IP-address: it is difficult to substitute IP-address mapping with a
-more sophisticated one (e.g. a mapping which uses current node and network load
-to infer node ranks) because measurement errors may result in unstable
-hierarchy, and the algorithm ceases to be fully event-based.
-
-Node discovery algorithm is designed to balance the load on a cluster of compute
-nodes, its use in other applications is not studied here. When distributed or
-parallel programme starts on any of cluster nodes, its subtasks are distributed
-to all adjacent nodes in the hierarchy (including principal node if applicable).
-To distribute the load evenly when the application is run on a subordinate node,
-each node maintains weight of each adjacent node in the hierarchy. The weight
-equals the number of nodes in the tree "behind" the adjacent node. For
-example, if the weight of the first adjacent node is 2, then round-robin load
-balancing algorithm distributes two subtasks to the first node before moving to
-the next one.
-
-To summarise, node discovery algorithm is
-- designed to ease load balancing on the cluster,
-- fully fault-tolerant, because the state of every node can be recomputed at any time,
-- fully event-based which means it does not load the network by periodically
-  sending messages.
-
-**** Building a tree hierarchy.
-Strict total order on the set \(\mathcal{N}\) of cluster nodes connected to a
-network is defined as
-\begin{equation*}
-  \forall n_1 \forall n_2 \in \mathcal{N},
-  \forall f \colon \mathcal{N} \rightarrow \mathcal{R}^n
-  \Rightarrow (f(n_1) < f(n_2) \Leftrightarrow \neg (f(n_1) \geq f(n_2))),
-\end{equation*}
-where \(f\) maps a node to its rank and operator \(<\) defines strict total order on
-\(\mathcal{R}^n\). Function \(f\) defines node's sequential number, and \(<\) makes
-this number unique.
-
-The simplest function \(f\) maps each node to its Internet address position in
-network IP address range. Without conversion to a tree (when only /one/
-leader is allowed in the network) a node with the lowest position in this range
-becomes the principal. If IP-address of a node occupies the first position in
-the range, then there is no principal for it, and it continues to be at the top
-of the hierarchy until it fails. Although, IP address mapping is simple to
-implement, it introduces artificial dependence of the principal role on the
-address of a node. Still, it is useful for initial configuration of a cluster
-when more complex mappings are not applicable.
-
-To make discovery algorithm scale to a large number of nodes, IP address range
-is mapped to a tree hierarchy. In this hierarchy each node is uniquely
-identified by its hierarchy level \(l\), which it occupies, and offset \(o\),
-which equals the sequential number of node on its level. Values of level and
-offset are computed from the following optimisation problem.
-\begin{align*}
-    n = \sum\limits_{i=0}^{l(n)} p^i + o(n), \quad
-    l \rightarrow \min, \quad
-    o \rightarrow \min, \quad
-    l \geq 0, \quad
-    o \geq 0
-\end{align*}
-where \(n\) is the position of node's IP address in network IP address range and
-\(p\) is fan-out value (the maximal number of subordinates a node can have). The
-principal of a node with level \(l\) and offset \(o\) has level \(l-1\) and offset
-\(\lfloor{o/p}\rfloor\). The distance between any two nodes in the tree with
-network positions \(i\) and \(j\) is computed as
-\begin{align*}
-    & \langle
-        \text{lsub}(l(j), l(i)), \quad
-        \left| o(j) - o(i)/p \right|
-    \rangle,\\
-    & \text{lsub}(l_1, l_2) =
-    \begin{cases}
-        \infty & \quad \text{if } l_1 \geq l_2, \\
-        l_1 - l_2 & \quad \text{if } l_1 < l_2.
-    \end{cases}
-\end{align*}
-The distance is compound to account for level in the first place.
-
-To determine its principal each node ranks all nodes in the network according to
-their position \(\langle{l(n),o(n)}\rangle\), and using distance formula chooses
-the node which is closest to potential principal position and has lower rank.
-That way IP addresses of offline nodes are skipped, however, for sparse networks
-(in which nodes occupy non-contiguous IP addresses) perfect tree is not
-guaranteed.
-
-In order to determine its principal a node is required to communicate with a node
-the address of which it knows beforehand, so discovery algorithm scales to a large
-number of nodes. Communication with other nodes in ranked list occurs only when
-the current principal node fails. So, if addresses of cluster nodes occupy
-a contiguous part of network IP address range, each node connects to its
-principal only, and inefficient scan of all network by each node does not occur.
-
-**** Evaluation results.
-Test platform consisted of several multi-core nodes, on top of which virtual
-clusters with varying number of nodes were deployed using Linux network
-namespaces. Similar approach is used
-in\nbsp{}cite:lantz2010network,handigol2012reproducible,heller2013reproducible
-where the authors reproduce various real-world experiments using virtual
-clusters and compare results to physical ones. The advantage of it is that the
-tests can be performed on a large virtual cluster using relatively small number
-of physical nodes. This approach was used to evaluate node discovery algorithm,
-because the algorithm has low requirement for system resources (processor time
-and network throughput).
-
-Performance of the algorithm was evaluated by measuring time needed for all nodes
-of the cluster to discover each other. Each change of the hierarchy (as seen by
-each node) was written to a file and after 30 seconds all the processes (each of
-which models cluster node) were forcibly terminated. Test runs showed that
-running more than 100 virtual nodes on one physical node simultaneously warps the
-results, thus additional physical nodes, each of which runs 100 virtual nodes,
-were used for the experiment. The experiment showed that mutual discovery of 100--400
-nodes takes 1.5 seconds on average, and the value increases only
-slightly with increase in the number of nodes (see
-fig.\nbsp{}[[fig-bootstrap-local]]). An example of tree hierarchy for 11 nodes with
-fan-out 2 is shown in fig.\nbsp{}[[fig-tree-hierarchy-11]].
-
-#+name: fig-bootstrap-local
-#+caption: Time to discover all nodes of the cluster depending on the number of nodes.
-[[file:graphics/discovery.eps]]
-
-#+name: fig-tree-hierarchy-11
-#+begin_src dot :exports results :file build/tree-hierarchy-11.pdf
-digraph {
-
-  node [fontname="Old Standard",fontsize=14,margin="0.055,0",shape=box,style=rounded]
-  graph [nodesep="0.15",ranksep="0.20",rankdir="BT"]
-  edge [arrowsize=0.66]
-
-  m1 [label="127.0.0.1"]
-  m2 [label="127.0.0.2"]
-  m3 [label="127.0.0.3"]
-  m4 [label="127.0.0.4"]
-  m5 [label="127.0.0.5"]
-  m6 [label="127.0.0.6"]
-  m7 [label="127.0.0.7"]
-  m8 [label="127.0.0.8"]
-  m9 [label="127.0.0.9"]
-  m10 [label="127.0.0.10"]
-  m11 [label="127.0.0.11"]
-
-  m2->m1
-  m3->m1
-  m4->m2
-  m5->m2
-  m6->m3
-  m7->m3
-  m8->m4
-  m9->m4
-  m10->m5
-  m11->m5
-}
-#+end_src
-
-#+caption: Tree hierarchy for 11 nodes with fan-out equal to 2.
-#+label: fig-tree-hierarchy-11
-#+RESULTS: fig-tree-hierarchy-11
-[[file:build/tree-hierarchy-11.pdf]]
-
-*** Fail over algorithm
-**** Checkpoints.
-Node failures in a distributed system are divided into two types: failure of a
-subordinate node and failure of a principal node. In order for a job running on
-the cluster to survive subordinate node failure, job scheduler periodically
-creates checkpoints and writes them to a stable storage. In order to create the
-checkpoint, the scheduler temporarily suspends all parallel processes of the
-job, copies all memory pages and all internal operating system kernel structures
-allocated for these processes to disk, and resumes execution of the job. In
-order to survive principal node failure, job scheduler server process continuously
-copies its internal state to a backup node, which becomes the principal after
-the failure.
-
-There are many works dedicated to improving performance of
-checkpoints\nbsp{}cite:egwutuoha2013survey, and alternative approaches do not
-receive much attention. Usually HPC applications use message passing for
-parallel processes communication and store their state in global memory space,
-hence there is no way one can restart a failed process from its current state
-without writing the whole memory image to disk. Usually the total number of
-processes is fixed by the job scheduler, and all parallel processes restart upon
-a failure. There is ongoing effort to make it possible to restart only the
-failed process\nbsp{}cite:meyer2012radic by restoring them from a checkpoint on
-the surviving nodes, but this may lead to overload if there are other processes
-on these nodes. Theoretically, process restart is not needed, if the job can
-proceed on the surviving nodes, however, message passing library does not allow
-to change the number of processes at runtime, and most programmes assume this
-number to be constant. So, there is no reliable way to provide fault tolerance
-in message passing library other than restarting all parallel processes from a
-checkpoint.
-
-There is, however, a possibility to continue execution of a job on a smaller number
-of nodes than was initially requested by implementing fault tolerance on
-application level. In this case principal and subordinate roles are dynamically
-distributed between job scheduler daemons running on each cluster node, forming
-a tree hierarchy of cluster nodes, and parallel programme consists of kernels
-which use node hierarchy to dynamically distribute the load and use their own
-hierarchy to restart kernels upon node failure.
-
-**** Dynamic role distribution.
-Fault tolerance of a parallel programme is one of the problems which should be
-solved by big data and HPC job schedulers, however, most schedulers provide
-fault tolerance for subordinate nodes only. These types of failures are
-routinely handled by restarting the affected job (from a checkpoint) or its part
-on the remaining nodes, and failure of a principal node is often considered
-either improbable, or too complicated to handle and configure on the target
-platform. System administrators often find alternatives to application level
-fault tolerance: they isolate principal process of the scheduler from the rest
-of the cluster nodes by placing it on a dedicated machine, or use virtualisation
-technologies instead. All these alternatives complexify configuration and
-maintenance, and by decreasing probability of a machine failure resulting in a
-whole system failure, they increase probability of a human error.
-
-From such point of view it seems more practical to implement principal node
-fault tolerance at application level, but there is no proven generic solution.
-Most implementations are too tied to a particular application to become
-universally applicable. The author believes that this happens due to people's
-habit to think of a cluster as a collection of individual machines each of which
-can be either principal or subordinate, rather than to think of a cluster as a
-whole with principal and subordinate roles being dynamically distributed between
-processes running on different nodes.
-
-Realisation of the fact that a cluster is also a computer allows to implement
-middleware that distributes principal and subordinate roles automatically and
-handles node failures in a generic way. This software provides an API to
-distribute kernels between currently available nodes. Using this API one can
-write a programme that runs on a cluster without knowing the exact number of
-working nodes. The middleware works as a cluster operating system in user space,
-allowing to write and execute distributed applications transparently.
-
-**** Symmetric architecture.
-Many distributed key-value stores and parallel file systems have symmetric
-architecture, in which principal and subordinate roles are dynamically
-distributed, so that any node can act as a principal when the current principal
-node
-fails\nbsp{}cite:ostrovsky2015couchbase,divya2013elasticsearch,boyer2012glusterfs,anderson2010couchdb,lakshman2010cassandra.
-However, this architecture is still not used in big data and HPC job schedulers.
-For example, in YARN big data job scheduler\nbsp{}cite:vavilapalli2013yarn
-principal and subordinate roles are static. Failure of a subordinate node is
-tolerated by restarting a part of a job, that worked on it, on one of the
-surviving nodes, and failure of a principal node is tolerated by setting up
-standby principal node\nbsp{}cite:murthy2011architecture. Both principal nodes
-are coordinated by Zookeeper service which uses dynamic role assignment to
-ensure its own fault-tolerance\nbsp{}cite:okorafor2012zookeeper. So, the lack of
-dynamic role distribution in YARN scheduler complicates the whole cluster
-configuration: if dynamic roles were available, Zookeeper would be redundant in
-this configuration.
-
-The same problem occurs in HPC job schedulers where principal node (where the
-main job scheduler process is run) is the single point of failure.
-In\nbsp{}cite:uhlemann2006joshua,engelmann2006symmetric the authors replicate
-job scheduler state to a backup node to make the principal node highly
-available, but backup node role is assigned statically. This solution is close
-to symmetric architecture, because it does not involve external service to
-provide high availability, but far from ideal where backup node is dynamically
-chosen.
-
-Finally, the simplest principal node high availability is implemented in VRRP
-protocol (Virtual Router Redundancy
-Protocol)\nbsp{}cite:knight1998rfc2338,hinden2004virtual,nadas2010rfc5798.
-Although VRRP protocol does provide dynamic role distribution, it is designed
-to be used by routers and reverse proxy servers behind them. Such servers lack
-the state (a job queue) that needs to be restored upon node failure, so it is
-easier for them to provide high availability. It can be implemented even without
-routers using Keepalived daemon\nbsp{}cite:cassen2002keepalived instead.
-
-Symmetric architecture is beneficial for job schedulers because it
-allows to
-- make physical nodes interchangeable,
-- implement dynamic distribution of principal and subordinate roles, and
-- implement automatic recovery after failure of any node.
-The following sections will describe the components that are required to write
-parallel programme and job scheduler, that can tolerate failure of cluster
-nodes.
-
-**** Hierarchy of control flow objects
-For load balancing purposes cluster nodes are combined into tree hierarchy (see
-section [[#sec:node-discovery]]), and the load is distributed between direct
-neighbours: when one runs the kernel on the subordinate node, the principal node
-also receives some of its subordinate kernels. This makes the system symmetrical
-and easy to maintain: each node has the same set of software that allows
-to replace one node with another in case of failure of the former. Similar
-architectural solution is used in key-value stores\nbsp{}cite:anderson2010couchdb,lakshman2010cassandra to provide fault tolerance, but
-author does not know any task schedulers that use this approach.
-
-Unlike ~main~ function in programmes based on message passing library, the first
-(the main) kernel is initially run only on one node, and remote nodes are used
-on-demand. This design choice allows to have arbitrary number of nodes throughout
-execution of a programme, and use more nodes for highly parallel parts of the
-code. Similar choice is made in the design of big data
-frameworks\nbsp{}cite:dean2008mapreduce,vavilapalli2013yarn \nbsp{}--- a user
-submitting a job does not specify the number of hosts to run its job on, and
-actual hosts are the hosts where input files are located.
-
-From mathematical point of view kernel \(K\) can be described as a vector-valued
-functional which recursively maps a kernel to \(n\)-component vector of kernels:
-\begin{equation*}
-    K(f): \mathbb{K} \rightarrow \mathbb{K}^n
-    \qquad
-    \mathbb{K}^n = \left\{ f: \mathbb{K} \rightarrow \mathbb{K}^n \right\}.
-\end{equation*}
-Special kernel \(\mathbb{O}: \mathbb{K} \rightarrow \mathbb{K}^0\) is used to stop
-the recursion and is passed as an argument to the main kernel. An argument to a
-kernel is interpreted as follows.
-- If a kernel is a newly created kernel, then its argument is its parent kernel.
-- In other cases the argument is an arbitrary kernel (often a child of the
-  current kernel).
-
-Kernels are processed in a loop which starts with executing the main kernel,
-then inside the main kernel other kernels are created and executed
-asynchronously. The loop continues until some kernel returns \(\mathbb{O}\).
-Since kernel may return multiple kernels they are executed in parallel, which
-quickly fills kernel pool. Since kernels from the pool may be executed in
-unspecified order, several concurrent threads retrieve kernels from the pool and
-may send the remaining kernels to neighbouring cluster nodes if the pool
-overflows.
-
-Kernels are implemented as closures (functors in C++)\nbsp{}--- function objects
-containing all their arguments, a reference to parent kernel and application
-domain data. The data is either processed upon kernel call, or subordinate
-kernels are created to process it in parallel. When the processing is complete a
-parent kernel closure with its subordinate kernel as an argument is called to
-collect the resulting data from it.
-
-**** Handling nodes failures.
-Basic strategy to overcome a failure of a subordinate node is to restart
-corresponding kernels on a healthy node\nbsp{}--- a strategy employed by Erlang
-language to restart failed subordinate processes\nbsp{}cite:armstrong2003thesis.
-In order to implement this method in the framework of kernel hierarchy, sender
-node saves every kernel that is sent to remote cluster nodes, and in an event of
-a failure of any number of nodes, where kernels were sent, their copies are
-redistributed between the remaining nodes without custom handling by a
-programmer. If there are no nodes to send kernels to, they are executed locally.
-So, in contrast to "heavy-weight" checkpoint/restart machinery employed by HPC
-cluster job schedulers, tree hierarchy of nodes coupled with hierarchy of
-kernels allow to automatically and transparently handle any number of
-subordinate node failures without restarting any processes of a parallel
-programme.
-
-A possible way of handling failure of the main node (a node where the main
-kernel is executed) is to replicate the main kernel to a backup node, and make
-all updates to its state propagate to the backup node by means of a distributed
-transaction, but this approach does not correlate with asynchronous nature of
-kernels and is too complex to implement. In practice, however, the main kernel
-usually does not perform operations in parallel, it rather sequentially
-executes steps one by one, so it has only one subordinate at a time. (Each
-subordinate kernel represents sequential computational step which may or may not
-be internally parallel.) Keeping this in mind, one can simplify synchronisation
-of the main kernel state: send the main kernel along with its subordinate to the
-subordinate node. Then if the main node fails, the copy of the main kernel
-receives its subordinate (because both of them are on the same node) and no time
-is spent on recovery. When the subordinate node, to which subordinate kernel
-together with the copy of the main kernel was sent, fails, the subordinate
-kernel is sent to some other node, and in the worst case the current
-computational step is executed again.
-
-The approach described above is designed for kernels that do not have a parent
-and have only one subordinate at a time, which means that it functions as
-checkpoint mechanism. The advantage of this approach is that it
-- saves results after each sequential step, when memory footprint of a programme
-  is low,
-- saves only relevant data, and
-- uses memory of a subordinate node rather than disk storage.
-This simple approach allows to tolerate at most one failure of /any/ cluster node
-per computational step or arbitrary number of subordinate nodes at any time
-during programme execution.
-
-An example of fail over algorithm follows (fig.\nbsp{}[[fig-fail-over-example]]).
-1. Initial state. Initially, computer cluster does not need to be configured
-   except setting up local network. The algorithm assumes full connectivity of
-   cluster nodes, and works best with tree topologies in which several network
-   switches connect all cluster nodes.
-2. Build node hierarchy. When the cluster is bootstrapped, daemon processes
-   start on all cluster nodes and collectively build hierarchy of such processes
-   superimposed on the topology of cluster network. Position of a daemon process
-   in the hierarchy is defined by the position of its node IP address in the
-   network IP address range. To establish hierarchical link each process
-   connects to its assumed principal process. The hierarchy is changed only when
-   a new node joins the cluster or a node fails.
-3. Launch main kernel. The first kernel launches on one of the subordinate nodes
-   (node \(B\)). Main kernel may have only one subordinate at a time, and backup
-   copy of the main kernel is sent along with the subordinate kernel \(T_1\) to
-   the principal node \(A\). \(T_1\) represents one sequential step of a
-   programme. There can be any number of sequential steps in a programme, and
-   when node \(A\) fails, the current step is restarted from the beginning.
-4. Launch subordinate kernels. Kernels \(S_1\), \(S_2\), \(S_3\) are launched on
-   subordinate cluster nodes. When node \(B\), \(C\) or \(D\) fails,
-   corresponding main kernel restarts failed subordinates (\(T_1\) restarts
-   \(S_1\), master kernel restarts \(T_1\) etc.). When node \(B\) fails, master
-   kernel is recovered from backup.
-
-#+name: fig-fail-over-example
-#+header: :headers '("\\input{preamble}")
-#+begin_src latex :file build/fail-over-example.pdf :exports results :results raw
-\input{tex/preamble}
-\newcommand*{\spbuInsertFigure}[1]{%
-\vspace{2\baselineskip}%
-\begin{minipage}{0.5\textwidth}%
-    \Large%
-    \input{#1}%
-\end{minipage}%
-}%
-\noindent%
-\spbuInsertFigure{tex/cluster-0}~\spbuInsertFigure{tex/frame-0}\newline
-\spbuInsertFigure{tex/frame-3}~\spbuInsertFigure{tex/frame-4}\newline
-\spbuInsertFigure{tex/legend}
-#+end_src
-
-#+caption: An example of fail over algorithm in action.
-#+label: fig-fail-over-example
-#+attr_latex: :width \textwidth
-#+RESULTS: fig-fail-over-example
-[[file:build/fail-over-example.pdf]]
-
-**** Evaluation results.
-Factory framework is evaluated on physical cluster (table\nbsp{}[[tab-cluster]]) on the
-example of HPC application, that generates ocean wavy surface, which is
-described in detail in section [[#sec:arma-algorithms]]. The application consists of
-a series of filters, each of which is applied to the result of the previous one.
-Some of the filters are computed in parallel, so the programme is written as a
-sequence of steps, some of which are made internally parallel to get better
-performance. In the programme only the most compute-intensive step (the surface
-generation) is executed in parallel across all cluster nodes, and other steps
-are executed in parallel across all cores of the principal node.
-
-#+name: tab-cluster
-#+caption: Test platform configuration.
-#+attr_latex: :booktabs t
-| CPU                       | Intel Xeon E5440, 2.83GHz |
-| RAM                       | 4Gb                       |
-| HDD                       | ST3250310NS, 7200rpm      |
-| No. of nodes              | 12                        |
-| No. of CPU cores per node | 8                         |
-
-The application was rewritten for the fault-tolerant version of the framework
-which required only slight modifications to handle failure of a node with the
-main kernel. The kernel was marked so that the framework makes a replica and
-sends it to some subordinate node along with its subordinate kernel. Other code
-changes involved modifying some parts to match the new API. So, providing fault
-tolerance by means of kernel hierarchy is mostly transparent to the programmer
-which only demands explicit marking of replicated kernels.
-
-In a series of experiments performance of the new version of the application in
-the presence of different types of failures was benchmarked (numbers correspond
-to the graphs in fig.\nbsp{}[[fig-benchmark]]):
-1) no failures,
-2) failure of a subordinate node (a node where a part of wavy surface is
-   generated),
-3) failure of a principal node (a node where the main kernel is run),
-4) failure of a backup node (a node where a copy of the main kernel is stored).
-A tree hierarchy with fan-out value of 64 was chosen to make all subordinate
-cluster nodes connect directly to the one having the first IP-address in the
-network IP address range. A victim node was made offline after a fixed amount of
-time after the programme start which is equivalent approximately to \(1/3\) of
-the total run time without failures on a single node. The application
-immediately recognised node as offline, because the corresponding connection was
-closed; in real-world scenario, however, the failure is detected after a
-configurable time-out. All relevant parameters are summarised in table\nbsp{}[[tab-benchmark]]. The results of these runs were compared to the run without node
-failures (fig.\nbsp{}[[fig-benchmark]] and\nbsp{}[[fig-slowdown]]).
-
-There is considerable difference in overall application performance for
-different types of failures. Graphs\nbsp{}2 and\nbsp{}3 in
-fig.\nbsp{}[[fig-benchmark]] show that performance in case of principal and
-subordinate node failure is the same. In case of principal node failure a backup
-node stores a copy of the main kernel and uses this copy when it detects failure
-of the principal node. In case of subordinate node failure, the principal node
-redistributes the non-returning kernels between remaining subordinate nodes. In
-both cases the state of the main kernel is not lost and no time is spent to
-restore it, which explains similar performance.
-
-Graph\nbsp{}4 in fig.\nbsp{}[[fig-benchmark]] shows that performance in case of a
-backup node failure is much lower than in other cases. It happens because
-principal node stores only the state of the current step of the computation plus
-some additional fixed amount of data, whereas a backup node not only stores the
-copy of this data, but executes the step in parallel with other subordinate
-nodes. So, when a backup node fails, the principal node executes the whole step
-once again on arbitrarily chosen surviving node.
-
-#+name: tab-benchmark
-#+caption: Benchmark parameters for experiments with fail over algorithm.
-#+attr_latex: :booktabs t
-| Experiment no. | Time to offline, s |
-|              1 |                    |
-|              2 |                 10 |
-|              3 |                 10 |
-|              4 |                 10 |
-
-To measure how much time is lost due to a node failure the total execution time
-with a failure was divided by the total execution time without the failure but
-with the number of nodes minus one. This relation is obtained from the same
-benchmark and presented in fig.\nbsp{}[[fig-slowdown]]. The difference in
-performance in case of principal and subordinate node failures lies within 5%
-margin, and in case of backup node failure within 50% margin for the number of
-nodes less than 6[fn::Measuring this margin for higher number of nodes does not
-make sense since time before failure is greater than total execution time with
-these numbers of nodes, and programme's execution finishes before a failure
-occurs.]. Increase in execution time of 50% is more than \(1/3\) of execution
-time after which a failure occurs, but backup node failure needs some time to be
-discovered: it is detected only when subordinate kernel carrying the copy of the
-main kernel finishes its execution and tries to reach its parent. Instant
-detection requires abrupt stopping of the subordinate kernel which may be
-inapplicable for programmes with complicated logic.
-
-#+name: fig-benchmark
-#+begin_src R :file build/benchmark-xxx.pdf
-# TODO
-#+end_src
-
-#+caption: Performance of hydrodynamics HPC application in the presence of node failures.
-#+label: fig-benchmark
-#+RESULTS: fig-benchmark
-
-The results of the benchmark allow to conclude that no matter whether a principal or a
-subordinate node fails, the overall performance of a parallel programme roughly
-equals to the one without failures with the number of nodes minus one, however,
-when a backup node fails performance penalty is much higher.
-
-#+name: fig-slowdown
-#+begin_src R :file build/slowdown-xxx.pdf
-# TODO
-#+end_src
-
-#+caption: Slowdown of the hydrodynamics HPC application in the presence of different types of node failures compared to execution without failures but with the number of nodes minus one.
-#+label: fig-slowdown
-#+RESULTS: fig-slowdown
-
-**** Discussion of test results.
-Fail over algorithm guarantees to handle one failure per sequential programme
-step, more failures can be tolerated if they do not affect the principal node.
-The algorithm handles simultaneous failure of all subordinate nodes, however, if
-both principal and backup nodes fail, there is no chance for a programme to
-continue the work. In this case the state of the current computation step is
-lost, and the only way to restore it is to restart the application from the
-beginning.
-
-Kernels are means of abstraction that decouple distributed application from
-physical hardware: it does not matter how many cluster nodes are currently
-available for a programme to run without interruption. Kernels eliminate the
-need to allocate a physical backup node to tolerate principal node failures: in
-the framework of kernel hierarchy any physical node (except the principal one)
-can act as a backup one. Finally, kernels allow to handle failures in a way that
-is transparent to a programmer, deriving the order of actions from the internal
-state of a kernel.
-
-The experiments show that it is essential for a parallel programme to have
-multiple sequential steps to make it resilient to cluster node failures,
-otherwise failure of a backup node in fact triggers recovery of the initial
-state of the programme. Although, the probability of a principal node failure is
-lower than the probability of a failure of any of the subordinate nodes, it does
-not justify losing all the data when the long programme run is near completion.
-In general, the more sequential steps one has in a parallel programme the less
-time is lost in an event of a backup node failure, and the more parallel parts
-each sequential step has the less time is lost in case of a principal or
-subordinate node failure. In other words, the more scalable a programme is the
-more resilient to cluster node failures it becomes.
-
-Although it is not shown in the experiments, Factory does not only provide
-tolerance to cluster node failures, but allows for new nodes to automatically
-join the cluster and receive their portion of kernels from the already running
-programmes. This is trivial process as it does not involve restarting failed
-kernels or copying their state, so it has not been studied experimentally in
-this work.
-
-Theoretically, fault tolerance based on a hierarchy of nodes and kernels can be
-implemented on top of the message-passing library without loss of generality.
-Although it would be complicated to reuse free nodes instead of failed ones in
-the framework of this library, as the number of nodes is often fixed in such
-libraries, allocating reasonably large number of nodes for the programme would
-be enough to make it fault-tolerant. At the same time, implementing
-hierarchy-based fault tolerance inside message-passing library itself is not
-practical, because it would require saving the state of a parallel programme
-which equals to the total amount of memory it occupies on each cluster node,
-which in turn would not make it more efficient than checkpoints.
-
-The weak point of the proposed algorithm is the period of time starting from a
-failure of principal node up to the moment when the failure is detected, the
-main kernel is restored and new subordinate kernel with the parent's copy is
-received by a subordinate node. If at any time during this period backup node
-fails, execution state of a programme is completely lost, and there is no way to
-recover it other than restarting the programme from the beginning. The duration
-of the dangerous period can be minimised, but the probability of an abrupt
-programme termination can not be fully eliminated. This result is consistent
-with the scrutiny of /impossibility theory/, in the framework of which
-the impossibility of the distributed consensus with one faulty
-process\nbsp{}cite:fischer1985impossibility and the impossibility of reliable
-communication in the presence of node
-failures\nbsp{}cite:fekete1993impossibility are proved.
-** Comparison of the proposed approach to the current approaches
-Current state-of-the-art approach to developing and running parallel programmes
-on the cluster is the use of MPI message passing library and job scheduler, and
-despite the fact that this approach is highly efficient in terms of parallel
-computing, it is not flexible enough to accommodate dynamic load balancing and
-automatic fault-tolerance. Programmes written with MPI typically assume
-- equal load on each processor,
-- non-interruptible and reliable execution of batch jobs, and
-- constant number of parallel processes/threads throughout the execution which
-  is equal to the total number of processors.
-The first assumption does not hold for ocean wave simulation programme because
-AR model requires dynamic load balancing between processors to generate each
-part of the surface only when all dependent parts have already been generated.
-The last assumption also does not hold, because for the sake of efficiency each
-part is written to a file asynchronously by a separate thread. The remaining
-assumption is not related to the programme itself, but to the job scheduler, and
-does not generally hold for very large computer clusters in which node failures
-occur regularly, and job scheduler slowly restores the failed job from the
-checkpoint severely hindering its performance. So, the idea of the proposed
-approach is to give parallel programmes more flexibility:
-- provide dynamic load balancing via pipelined execution of sequential,
-  internally parallel programme steps,
-- restart only processes that were affected by node failure, and
-- execute the programme on as many compute nodes as are available in the
-  cluster.
-In this section advantages and disadvantages of this approach are discussed.
-
-In comparison to portable batch systems (PBS) the proposed approach uses
-lightweight control flow objects instead of heavy-weight parallel jobs to
-distribute the load on cluster nodes. First, this allows to have node object
-queues instead of several cluster-wide job queues. The granularity of control
-flow objects is much higher than the batch jobs, and despite the fact that their
-execution time cannot be reliably predicted (as is execution time of batch
-jobs), objects from multiple parallel programmes can be dynamically distributed
-between the same set of cluster nodes, thus making the load more even. The
-disadvantage is that this requires more RAM to execute many programmes on the
-same set of nodes, and execution of each programme may be longer because of the
-shared control flow object queues. Second, the proposed approach uses dynamic
-distribution of principal and subordinate roles between cluster nodes instead of
-their static assignment to the particular physical nodes. This makes nodes
-interchangeable, which is required to provide fault tolerance. So, simultaneous
-execution of multiple parallel programmes on the same set of nodes may increase
-throughput of the cluster, but may also decrease their performance taken
-separately, and dynamic role distribution is the base on which resilience to
-failures builds.
-
-In comparison to MPI the proposed approach uses lightweight control flow objects
-instead of heavy-weight processes to decompose the programme into individual
-entities. First, this allows to determine the number of entities computed in
-parallel by the problem being solved, not the computer or cluster architecture.
-A programmer is encouraged to create as many objects as needed, guided by the
-algorithm or restrictions on the size of data structures from the problem
-domain. In ocean wave simulation programme the minimal size of each wavy surface
-part depends on the number of coefficients along each dimension, and at the same
-time the number of parts should be larger than the number of processors to make
-the load on each processor more even. Considering these limits the optimal part
-size is determined at runtime, and, in general, is not equal to the number of
-parallel processes. The disadvantage is that the more control flow objects there
-are in the programme, the more shared data structures are copied to the same
-node with subordinate objects; this problem is solved by introducing another
-intermediate layer of objects, which in turn adds more complexity to the
-programme. Second, hierarchy of control flow objects together with hierarchy of
-nodes allows for automatic recomputation of failed objects on surviving nodes in
-an event of hardware failures. It is possible because the state of the programme
-execution is stored in each object and not in global variables like in MPI
-programmes. By duplicating the state to subordinate nodes, the system
-recomputes only objects from affected processes instead of the whole programme.
-So, transition from processes to control flow objects may increase performance
-of a parallel programme via dynamic load balancing, but inhibit its scalability
-for a large number of nodes due to duplication of execution state.
-
-It may seem as if three building blocks of the proposed approach\nbsp{}---
-control flow objects, pipelines and hierarchies\nbsp{}--- are orthogonal, but,
-in fact, they complement each other. Without control flow objects carrying
-programme state it is impossible to recompute failed subordinate objects and
-provide fault tolerance. Without node hierarchy it is impossible to distribute
-the load between cluster nodes, because all nodes are equal without the
-hierarchy. Without pipelines for each device it is impossible to execute control
-flow objects asynchronously and implement dynamic load balancing. These three
-entities form a closed system with nothing to add and nothing to
-remove\nbsp{}--- a solid foundation for any distributed programme.
-
-To summarise, one can say that the control flow objects make parallel programmes
-more flexible: they balance the decrease in the performance due to shared object
-queues with the increase due to dynamic load balancing. Requiring more RAM, they
-allow to simultaneously run multiple parallel programmes on all cluster nodes
-without idling in the job queue, and transform the cluster into a unified
-computer system which makes best effort to execute distributed applications
-without interruption.
-
-* Conclusion
-**** Research results.
-In the study of mathematical apparatus for ocean wave simulations which goes
-beyond linear wave theory the following main results were achieved.
-- ARMA model was applied to simulation of ocean waves of arbitrary amplitudes.
-  Integral characteristics of generated wavy surface were verified by comparing
-  to the ones of a real ocean surface.
-- Analytic formula for determining wave pressures was applied to compute
-  velocity potentials under generated surface. The resulting velocity potential
-  field was verified by comparing it to the one given by formulae from linear
-  wave theory for small-amplitude waves. For large amplitude waves the new
-  formula gives a reasonably different field. Analytic formula is computationally
-  efficient because all the integrals are written as Fourier transforms, for
-  which there are high-performance implementations.
-
-**** Further research directions.
-One of the topics of future research is studying generation of waves of arbitrary
-profiles on the basis of mixed ARMA process. Another direction is integration of
-the developed model and pressure determination formula into existing application
-software packages.
-
-* Summary
-Research results allow to conclude that a problem of determining pressures under
-sea surface can be solved analytically without assumptions of linear and
-small-amplitude wave theories. This solution coupled with ARMA ocean wave
-simulation model, capable of generating waves of arbitrary amplitudes, can be
-used to determine the impact of wave oscillations on the dynamic marine object
-in a sea way, and give more precise results than analogous solution for
-small-amplitude waves.
-
-Results of the numerical experiments allow to conclude that wavy surface
-generation as well as pressure computation can be efficiently implemented via
-fast Fourier transforms, and long simulation session can be conducted.
-
-The developed mathematical apparatus and its numerical implementation can become
-a base of virtual testbed for marine objects dynamics studies.
-
-* Acknowledgements
-The graphs in this work were prepared using R language for statistical
-computing\nbsp{}cite:rlang2016,Sarkar2008lattice and Graphviz
-software\nbsp{}cite:Gansner00anopen. The manuscript was prepared using
-Org-mode\nbsp{}cite:Schulte2011org2,Schulte2011org1,Dominik2010org for GNU Emacs
-which provides computing environment for reproducible research. This means that
-all graphs can be reproduced and corresponding statements verified by cloning
-thesis repository[fn:repo], installing Emacs and exporting the document.
-
-The research was carried out using computational resources of Resource Centre
-"Computational Centre of Saint Petersburg State University" (\mbox{T-EDGE96}
-\mbox{HPC-0011828-001}) within frameworks of grants of Russian Foundation for
-Basic Research (projects no.\nbsp{}\mbox{16-07-01111}, \mbox{16-07-00886},
-\mbox{16-07-01113}).
-
-[fn:repo] [[https://github.com/igankevich/arma-thesis]]
-
-* List of acronyms and symbols
-- <<<MPP>>> :: Massively Parallel Processing, computers with distributed memory.
-- <<<SMP>>> :: Symmetric Multi-Processing, computers with shared memory.
-- <<<ACF>>> :: auto-covariance function.
-- <<<FFT>>> :: fast Fourier transform.
-- <<<PRNG>>> :: pseudo-random number generator.
-- <<<BC>>> :: boundary condition.
-- <<<PDE>>> :: partial differential equation.
-- <<<NIT>>> :: non-linear inertia-less transform.
-- <<<AR>>> :: auto-regressive process.
-- <<<ARMA>>> :: auto-regressive moving-average process.
-- <<<MA>>> :: moving average process.
-- <<<LH>>> :: Longuet---Higgins model.
-- <<<LAMP>>> :: Large Amplitude Motion Programme, a programme that simulates
-                ship behaviour in ocean waves.
-- <<<CLT>>> :: central limit theorem.
-- <<<PM>>> :: Pierson---Moskowitz ocean wave spectrum approximation.
-- <<<YW>>> :: Yule---Walker equations.
-- <<<LS>>> :: least squares.
-- <<<PDF>>> :: probability density function.
-- <<<CDF>>> :: cumulative distribution function.
-- <<<BSP>>> :: Bulk Synchronous Parallel.
-- <<<OpenCL>>> :: Open Computing Language.
-- <<<OpenMP>>> :: Open Multi-Processing.
-- <<<MPI>>> :: Message Passing Interface.
-- <<<POSIX>>> :: Portable Operating System Interface.
-- <<<FMA>>> :: Fused multiply-add.
-- <<<DCMT>>> :: Dynamic creation of Mersenne Twisters.
-- <<<GSL>>> :: GNU Scientific Library.
-- <<<BLAS>>> :: Basic Linear Algebra Sub-programmes.
-- <<<LAPACK>>> :: Linear Algebra Package.
-- <<<DNS>>> :: Domain Name System.
-- <<<HPC>>> ::  High-performance computing.
-- Master/slave node ::
-- Principal/subordinate kernel ::
-
-#+begin_export latex
-\input{postamble}
-#+end_export
-
-bibliographystyle:ugost2008
-bibliography:bib/refs.bib
-
-* Appendix
-** Longuet---Higgins model formula derivation
-:PROPERTIES:
-:CUSTOM_ID: longuet-higgins-derivation
-:END:
-
-In the framework of linear wave theory two-dimensional system of
-equations\nbsp{}eqref:eq-problem is written as
-\begin{align*}
-    & \phi_{xx} + \phi_{zz} = 0,\\
-    & \zeta(x,t) = -\frac{1}{g} \phi_t, & \text{on }z=\zeta(x,t),
-\end{align*}
-where \(\frac{p}{\rho}\) includes \(\phi_t\). The solution to the Laplace
-equation is sought in a form of Fourier series cite:kochin1966theoretical:
-\begin{equation*}
-    \phi(x,z,t) = \int\limits_{0}^{\infty} e^{k z}
-    \left[ A(k, t) \cos(k x) + B(k, t) \sin(k x) \right] dk.
-\end{equation*}
-Plugging it in the boundary condition yields
-\begin{align*}
-    \zeta(x,t) &= -\frac{1}{g} \int\limits_{0}^{\infty}
-    \left[ A_t(k, t) \cos(k x) + B_t(k, t) \sin(k x) \right] dk \\
-    &= -\frac{1}{g} \int\limits_{0}^{\infty} C_t(k, t) \cos(kx + \epsilon(k, t)).
-\end{align*}
-Here \(\epsilon\) is white noise and \(C_t\) includes \(dk\). Substituting
-integral with infinite sum yields two-dimensional form of
-eq.\nbsp{}[[eq-longuet-higgins]].

	arma-thesis
	git clone https://git.igankevich.com/arma-thesis.git
	Log \| Files \| Refs \| LICENSE

.gitignore	\|	8	++++----
Makefile	\|	22	++++++++++++++++++++++
arma-thesis-ru.org	\|	3434	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
arma-thesis.org	\|	3226	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
phd-diss-ru.org	\|	3434	-------------------------------------------------------------------------------
phd-diss.org	\|	3226	-------------------------------------------------------------------------------