commit 160fa5db54c2d3f7d9fee7941fcd42375a981037
parent a92f0288aafa6c7fef1ff8537b431f3e1eda82c7
Author: Ivan Gankevich <i.gankevich@spbu.ru>
Date: Thu, 15 Apr 2021 13:27:39 +0300
kernels
Diffstat:
 main.bib |  67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 main.tex | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 168 insertions(+), 17 deletions(-)
diff --git a/main.bib b/main.bib
@@ -41,8 +41,9 @@
@Article{ fetterly2009dryadlinq,
title = {{DryadLINQ}: A system for general-purpose distributed
data-parallel computing using a high-level language},
- author = {Yuan Yu and Michael Isard and Dennis Fetterly and Mihai Budiu and
- Erlingsson, {\'U}lfar and Gunda, Pradeep Kumar and Jon Currey },
+ author = {Yuan Yu and Michael Isard and Dennis Fetterly and Mihai Budiu
+ and Erlingsson, {\'U}lfar and Gunda, Pradeep Kumar and Jon
+ Currey },
journal = {Proc. LSDS-IR},
volume = {8},
year = {2009},
@@ -100,7 +101,7 @@
doi = {10.1145/2934664},
journal = {Commun. ACM},
month = oct,
- pages = {56–65}
+ pages = {56--65}
}
@Article{ fault-tolerant-distributed-haskell,
@@ -116,3 +117,63 @@
comment = {"Computations in HdpH-RS are always as reliable as the root
node".}
}
+
+@InProceedings{ vavilapalli2013yarn,
+ author = {Vavilapalli, Vinod Kumar and Murthy, Arun C. and Douglas,
+ Chris and Agarwal, Sharad and Konar, Mahadev and Evans, Robert
+ and Graves, Thomas and Lowe, Jason and Shah, Hitesh and Seth,
+ Siddharth and Saha, Bikas and Curino, Carlo and O'Malley, Owen
+ and Radia, Sanjay and Reed, Benjamin and Baldeschwieler,
+ Eric},
+ title = {{Apache Hadoop YARN: Yet Another Resource Negotiator}},
+ booktitle = {Proceedings of the 4\textsuperscript{th} Annual Symposium on
+ Cloud Computing},
+ series = {SOCC'13},
+ year = {2013},
+ isbn = {978-1-4503-2428-1},
+ location = {Santa Clara, California},
+ pages = {1--16},
+ articleno = {5},
+ numpages = {16},
+ url = {http://doi.acm.org/10.1145/2523616.2523633},
+ doi = {10.1145/2523616.2523633},
+ acmid = {2523633},
+ publisher = {ACM},
+ address = {New York, NY, USA}
+}
+
+@Misc{ hadoop,
+ author = {{Apache Software Foundation}},
+ title = {Hadoop},
+ url = {https://hadoop.apache.org},
+ version = {3.2.1},
+ date = {2020-10-18}
+}
+
+@InProceedings{ oozie,
+ author = {Islam, Mohammad and Huang, Angelo K. and Battisha, Mohamed
+ and Chiang, Michelle and Srinivasan, Santhosh and Peters,
+ Craig and Neumann, Andreas and Abdelnur, Alejandro},
+ title = {Oozie: Towards a Scalable Workflow Management System for
+ Hadoop},
+ year = {2012},
+ isbn = {9781450318761},
+ publisher = {Association for Computing Machinery},
+ address = {New York, NY, USA},
+ url = {https://doi.org/10.1145/2443416.2443420},
+ doi = {10.1145/2443416.2443420},
+ booktitle = {Proceedings of the 1st ACM SIGMOD Workshop on Scalable
+ Workflow Execution Engines and Technologies},
+ articleno = {4},
+ numpages = {10},
+ location = {Scottsdale, Arizona, USA},
+ series = {SWEET'12}
+}
+
+@Misc{ storm,
+ author = {{Apache Software Foundation}},
+ title = {Storm},
+ url = {https://storm.apache.org},
+ version = {2.2.0},
+ date = {2021-04-15}
+}
diff --git a/main.tex b/main.tex
@@ -98,7 +98,7 @@ TODO \cite{lang-virt}
\section{Methods}
\subsection{Parallel and distributed computing technologies as components of
-cluster operating system}
+unified system}
In order to write parallel and distributed programmes the same way as we write
sequential programmes, we need the following components.
@@ -121,8 +121,8 @@ sequential programmes, we need the following components.
calls of the operating system kernel.
\end{itemize}
These three components are built on top of each other as in classical object-oriented
-programming arroach, and all the complexity is pushed to the lowest layer.
-Low-level language is responsible for providing parallelism and fault tolerance to
+programming approach, and all the complexity is pushed down to the lowest layer:
+low-level language is responsible for providing parallelism and fault tolerance to
the applications, cluster scheduler uses these facilities to provide transparent execution of
the applications on multiple cluster nodes, and high-level interface maps
the underlying system to the target language to simplify the work for
@@ -141,7 +141,8 @@ by batch job scheduler. Batch jobs schedulers provide means to allocate
resources (cluster nodes, processor cores, memory etc.) and launch parallel MPI
processes, but do not have control over messages that are sent between these
processes and do not control the actual number of resources used by the
-programme (all resources are exclusively owned by the programme), i.e.~cluster
+programme (all resources are owned exclusively by the programme, and the programme decides
+how to use them), i.e.~cluster
schedulers and MPI programmes do not talk to each other after the parallel
processes were launched. Consequently, high-level interface is also separated
from the scheduler. Although, high-level interface is built on top of the
@@ -151,15 +152,17 @@ control over communication of the applications that are run on the cluster, but
is used as resource allocator.
The situation in newer big data technologies is different: there are the same
-three components with hierarchical structure, but lack low-level language.
-There are many high-level libraries that are integrated with YARN cluster
-scheduler (TODO cite). The scheduler has more control over job execution as
-jobs are decomposed into tasks and execution of tasks is controled by the
-scheduler. Unfortunately, the lack of common low-level language made all
-high-level frameworks that are built on top of YARN API use their own custom
-protocol for communication, shift responsibility of providing fault tolerance
-to the scheduler and shift responsibility of data decomposition to higher level
-frameworks.
+three components with hierarchical structure, but the low-level language is
+integrated into the scheduler. There is the YARN cluster
+scheduler~\cite{vavilapalli2013yarn}, whose API is used as a low-level
+language for parallel and distributed computing, and there are many
+high-level libraries that are built on top of
+YARN~\cite{hadoop,oozie,spark2016,storm}. The scheduler has more control over
+job execution, as jobs are decomposed into tasks and the execution of tasks
+is controlled by the scheduler. Unfortunately, the lack of a common low-level
+language made all high-level frameworks that are built on top of the YARN API
+use their own custom protocols for communication, shift the responsibility of
+providing fault tolerance to the scheduler and shift the responsibility of
+data decomposition to higher-level frameworks.
To summarise, the current state-of-the-art technologies for parallel and
distributed computing can be divided into three classes: low-level languages,
@@ -177,15 +180,102 @@ of communication for parallel and distributed applications. Having such a
language at your disposal makes it easy to build higher level components,
because the complexity of cluster systems is hidden from the programmer, the
duplicated effort of implementing the same facilities in higher level
-interfaces is reduced, and cluster scheduler has full control of the programmes
+interfaces is reduced, and cluster scheduler has full control of the programme
execution as it speaks the same protocol and uses the same low-level language
internally: the language is general enough to write any distributed programme
including the scheduler itself.
\subsection{Kernels as objects that control the programme flow}
+In order to create a low-level language for parallel and distributed
+computing we borrow familiar features from sequential low-level languages and
+augment them with asynchronous function calls and the ability to read and
+write call stack frames.
+
+In assembler and LLVM the programme is written in an imperative style as a
+series of processor instructions. The variables are stored either on the
+stack (a special area of the computer's main memory) or in processor
+registers. Logical parts of the programme are represented by functions. A
+call to a function places all function arguments on the stack and then jumps
+to the address of the function. When the function returns, the result of the
+computation is written to a processor register and control flow returns to
+the calling function. When the main function returns, the programme
+terminates.
+
+There are two problems with assembler that need to be solved in order for it
+to be useful in parallel and distributed computing. First, the contents of
+the stack cannot be copied between cluster nodes or saved to and read from a
+file, because they often contain pointers to memory blocks that may be
+invalid on another cluster node or in the process that reads the stack from
+the file. Second, there is no natural way to express parallelism in this
+language: all function calls are synchronous and all instructions are
+executed in the specified order. In order to solve these problems we use
+object-oriented techniques.
+
+We represent each stack frame with an object: local variables become object
+fields, and each function call is decomposed into the code that goes before
+the call, the code that performs the call, and the code that goes after it.
+The code that goes before the call is placed into the \texttt{act} method of
+the object, and after this code a new object is created to call the function
+asynchronously. The code that goes after the call is placed into the
+\texttt{react} method of the object, and this code is called asynchronously
+when the function call returns (the method takes the corresponding object as
+an argument). The object also has \texttt{read} and \texttt{write} methods
+that are used to read and write its fields from and to a file or to copy the
+object to another cluster node. In this model each object contains enough
+information to perform the corresponding function call, and we can make these
+calls in any order we like. Also, the object is self-contained, and we can
+ask another cluster node to perform the function call or save the object to
+disk to perform the call when the user wants to resume the computation
+(e.g.~after the computer is upgraded and rebooted).
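+
+To make the decomposition concrete, the following sketch shows one possible
+C++ rendering of such an object (the class and method names are purely
+illustrative and are not taken from an actual implementation): the parent
+kernel's \texttt{act} creates a child kernel, and its \texttt{react} consumes
+the child's result. For brevity the child is executed inline; a runtime would
+execute it asynchronously, possibly on another node.
+\begin{verbatim}
+#include <iostream>
+
+struct Kernel {
+    Kernel* parent = nullptr;
+    virtual ~Kernel() = default;
+    virtual void act() {}                  // code before the function call
+    virtual void react(Kernel*) {}         // code after the call returns
+    virtual void write(std::ostream&) {}   // save fields to file/network
+    virtual void read(std::istream&) {}    // restore fields
+};
+
+// The callee: input in 'argument', output in 'result'.
+struct Square : Kernel {
+    int argument = 0, result = 0;
+    explicit Square(int a) : argument(a) {}
+    void act() override { result = argument * argument; }
+};
+
+// The caller, decomposed around the call site.
+struct Main : Kernel {
+    int sum = 0;
+    void act() override {
+        // Instead of calling a function synchronously, create a child
+        // kernel; the runtime may run it on any node, in any order.
+        auto* child = new Square(21);
+        child->parent = this;
+        child->act();      // done inline here; normally asynchronous
+        react(child);      // done inline here; normally asynchronous
+        delete child;
+    }
+    void react(Kernel* child) override {
+        sum += static_cast<Square*>(child)->result;
+        std::cout << "sum = " << sum << '\n';
+    }
+};
+
+int main() { Main m; m.act(); }
+\end{verbatim}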
+
+The function calls are made asynchronous with the help of a thread pool. Each
+thread pool consists of an object queue and an array of threads. When an
+object is placed in the queue, one of the threads extracts it and calls its
+\texttt{act} or \texttt{react} method depending on the state of the object.
+There are two states that are controlled by the programmer: when the state is
+\textit{upstream}, the \texttt{act} method is called; when the state is
+\textit{downstream}, the \texttt{react} method of the parent object is called
+with the current object as the argument. When the state is
+\textit{downstream} and there is no parent, the programme terminates.
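+
+A minimal single-threaded sketch of this dispatch loop is shown below (the
+names are again illustrative; a real pool would run the same loop in several
+worker threads with synchronised access to the queue, and the kernel type is
+repeated here only to keep the example self-contained):
+\begin{verbatim}
+#include <iostream>
+#include <queue>
+
+enum class State { upstream, downstream };
+
+struct Kernel {
+    Kernel* parent = nullptr;
+    State state = State::upstream;
+    virtual ~Kernel() = default;
+    virtual void act() {}
+    virtual void react(Kernel*) {}
+};
+
+std::queue<Kernel*> pool;   // the object queue of the thread pool
+
+// Dispatch loop executed by each thread of the pool.
+void run() {
+    while (!pool.empty()) {
+        Kernel* k = pool.front(); pool.pop();
+        if (k->state == State::upstream) {
+            k->act();               // act() pushes children or itself back
+        } else if (k->parent != nullptr) {
+            k->parent->react(k);    // hand the result back to the caller
+            delete k;
+        } else {
+            delete k;               // downstream without a parent:
+            return;                 // the programme terminates
+        }
+    }
+}
+
+struct Hello : Kernel {
+    void act() override {
+        std::cout << "hello from a kernel\n";
+        state = State::downstream;  // signal completion
+        pool.push(this);            // go back through the queue
+    }
+};
+
+int main() { pool.push(new Hello); run(); }
+\end{verbatim}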
+
+We call these objects \emph{control flow objects} or \emph{kernels} for
+short. These objects contain the input data in object fields, the code that
+processes this data in object methods, and the output data (the result of the
+computation) also in object fields. The programmer decides which data is
+input and which is output. To reduce network usage the programmer may delete
+the input data when the kernel enters the \textit{downstream} state: that way
+only the output data is copied back to the parent kernel over the network.
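+
+For example, a kernel that carries a large input buffer may clear it before
+going downstream, so that only the result travels back over the network (the
+field names below are, again, invented for the illustration):
+\begin{verbatim}
+#include <vector>
+
+struct Sum /* : Kernel */ {
+    std::vector<double> data;   // input: potentially large
+    double result = 0;          // output: small
+    void act() {
+        for (double x : data) result += x;
+        data.clear();           // the input is no longer needed;
+        data.shrink_to_fit();   // only 'result' goes back to the parent
+    }
+};
+\end{verbatim}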
+
+This low-level language can be seen as an adaptation of the classic function
+call stack, but with asynchronous function calls and the ability to read and
+write stack frames. These differences constitute the main advantages of
+kernels.
+\begin{itemize}
+  \item Kernels define dependencies between function calls, but do not
+    define the order of computation. This gives a natural way of expressing
+    parallel computations on the lowest possible level.
+  \item Kernels can be written to and read from any medium: files, network
+    connections, serial connections etc. This makes it possible to implement
+    fault tolerance efficiently using any of the existing methods: in order
+    to implement checkpoints, a programmer no longer needs to save the memory
+    contents of each parallel process; only the fields of the main kernel are
+    needed to restart the programme from the last sequential step. However,
+    with kernels checkpoints can be replaced with a simple restart: when the
+    node to which a child kernel was sent fails, a copy of this kernel can be
+    sent to another node without stopping the programme and without any
+    additional configuration from the programmer (see the sketch after this
+    list).
+  \item Finally, kernels are simple enough to be used as an intermediate
+    representation for high-level languages: either via a compiler
+    modification, or via a wrapper library that calls the low-level
+    implementation directly.
+\end{itemize}
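+
+The sketch below illustrates the second point: the fields of a kernel are
+written to a stream (a file, a socket, a serial line) and read back to
+recreate the kernel on another node or after a restart. The textual format
+and the member names are invented for the example:
+\begin{verbatim}
+#include <fstream>
+#include <iostream>
+
+struct Sum /* : Kernel */ {
+    double result = 0;
+    long processed = 0;
+    // Only the fields are saved, not the whole process memory.
+    void write(std::ostream& out) const { out << result << ' ' << processed; }
+    void read(std::istream& in) { in >> result >> processed; }
+};
+
+int main() {
+    Sum s; s.result = 42.0; s.processed = 1000;
+    { std::ofstream out("checkpoint.txt"); s.write(out); }    // checkpoint
+    Sum restored;
+    { std::ifstream in("checkpoint.txt"); restored.read(in); } // restart
+    std::cout << restored.result << ' ' << restored.processed << '\n';
+}
+\end{verbatim}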
+
\subsection{Reference cluster scheduler based on kernels}
+Kernels are general enough to write any programme, and the first programme
+that we wrote using them was a cluster scheduler that uses kernels to
+implement its internal logic and to run applications that span multiple
+cluster nodes.
+
TODO
\subsection{Kernels as intermediate representation for Guile language}