diff --git a/src/config.yaml b/src/config.yaml index 3ce788de4e85d5482fdf07a6296c77b9921ce0ec..834079e010a29ed6764e0bdee258466e7e8820bf 100644 --- a/src/config.yaml +++ b/src/config.yaml @@ -1,27 +1,27 @@ --- author: - Baptiste Coudray -title: "FUTHARK-MPI : LE CALCUL HAUTE PERFORMANCE DISTRIBUÉ POUR LES GENS" # Le titre de votre travail -smallTitle: "LE CALCUL HAUTE PERFORMANCE DISTRIBUÉ POUR LES GENS" # Le "petit" titre de votre travail, càd le titre qui sera affiché en haut de chaque page -institute: Haute École du Paysage, d'Ingénierie et d'Architecture de Genève +title: "Futhark-MPI: Distributed High-Performance Computing For People" +smallTitle: "Distributed High-Performance Computing For People" +institute: University of Applied Sciences and Arts Western Switzerland name: Baptiste surname: Coudray -keywords: [ HEPIA, Rapport, bachelor, Futhark, MPI, Programmation parallèle ] -orientation: logicielle -projectMonth: Mars +keywords: [Report, bachelor, HEPIA, HESSO] +orientation: Software and Complex Systems +projectMonth: August year: 2021 sensei: "Dr. Orestis Malaspinas" frontLogoLegend: "Jeu de la vie" # La légende de l'image de couverture frontLogoSourceURL: "https://upload.wikimedia.org/wikipedia/commons/e/e5/Gospers_glider_gun.gif" # La source de l'image de couverture -workTitle: "Travail de bachelor" +workTitle: "Bachelor thesis" workFor: "..." bibliography: my.bib csl: iso690.csl -biblio-title: Références documentaires +biblio-title: Bibliographical references link-citations: true autoSectionLabels: false autoEqnLabels: true -lang: fr +lang: en documentclass: report papersize: A4 cref: false diff --git a/src/figs/simple_automate.png b/src/figs/simple_automate.png index 26a180559c12381a574ab5de24e929c7765b6aa6..4dbb2ece7c6e98fb84dea290376fdd25ed3f39d2 100644 Binary files a/src/figs/simple_automate.png and b/src/figs/simple_automate.png differ diff --git a/src/my.bib b/src/my.bib index 4d6eefdd64e5456855f3768a5df84e657ea02713..e63c6fae5db1cab8fab88268939eedbb1842a8f3 100644 --- a/src/my.bib +++ b/src/my.bib @@ -12,7 +12,6 @@ Depuis 2020, une nouvelle version de {MPI} est disponible, {MPI}-4, qui apporte urldate = {2021-07-22}, date = {2021-03-17}, langid = {french}, - note = {Page Version {ID}: 180967726}, file = {Snapshot:/Users/baptistecdr/Zotero/storage/GLQPSHE4/index.html:text/html}, } @@ -30,54 +29,26 @@ Depuis 2020, une nouvelle version de {MPI} est disponible, {MPI}-4, qui apporte @inreference{noauthor_amdahls_2021, title = {Amdahl's law}, rights = {Creative Commons Attribution-{ShareAlike} License}, - url = {https://en.wikipedia.org/w/index.php?title=Amdahl%27s_law&oldid=1034193438}, + url = {https://en.wikipedia.org/w/index.php?title=Amdahl%27s_law}, abstract = {In computer architecture, Amdahl's law (or Amdahl's argument) is a formula which gives the theoretical speedup in latency of the execution of a task at fixed workload that can be expected of a system whose resources are improved. It is named after computer scientist Gene Amdahl, and was presented at the {AFIPS} Spring Joint Computer Conference in 1967. -Amdahl's law is often used in parallel computing to predict the theoretical speedup when using multiple processors. 
For example, if a program needs 20 hours to complete using a single thread, but a one-hour portion of the program cannot be parallelized, therefore only the remaining 19 hours (p = 0.95) of execution time can be parallelized, then regardless of how many threads are devoted to a parallelized execution of this program, the minimum execution time cannot be less than one hour. Hence, the theoretical speedup is limited to at most 20 times the single thread performance, $\left(\dfrac{1}{1-p}=20\right)$.}, +Amdahl's law is often used in parallel computing to predict the theoretical speedup when using multiple processors. For example, if a program needs 20 hours to complete using a single thread, but a one-hour portion of the program cannot be parallelized, therefore only the remaining 19 hours (p = 0.95) of execution time can be parallelized, then regardless of how many threads are devoted to a parallelized execution of this program, the minimum execution time cannot be less than one hour. Hence, the theoretical speedup is limited to at most 20 times the single thread performance, +}, booktitle = {Wikipedia}, urldate = {2021-07-22}, date = {2021-07-18}, langid = {english}, - note = {Page Version {ID}: 1034193438}, file = {Snapshot:/Users/baptistecdr/Zotero/storage/4KJVD4JN/index.html:text/html}, } @inreference{noauthor_gustafsons_2021, title = {Gustafson's law}, rights = {Creative Commons Attribution-{ShareAlike} License}, - url = {https://en.wikipedia.org/w/index.php?title=Gustafson%27s_law&oldid=1031307338}, + url = {https://en.wikipedia.org/w/index.php?title=Gustafson%27s_law}, abstract = {In computer architecture, Gustafson's law (or Gustafson–Barsis's law) gives the theoretical speedup in latency of the execution of a task at fixed execution time that can be expected of a system whose resources are improved. It is named after computer scientist John L. Gustafson and his colleague Edwin H. Barsis, and was presented in the article Reevaluating Amdahl's Law in 1988.}, booktitle = {Wikipedia}, urldate = {2021-07-22}, date = {2021-06-30}, langid = {english}, - note = {Page Version {ID}: 1031307338}, file = {Snapshot:/Users/baptistecdr/Zotero/storage/FGKVKJQD/index.html:text/html}, } @@ -124,35 +95,33 @@ @inreference{noauthor_jeu_2021, title = {Jeu de la vie}, rights = {Creative Commons Attribution-{ShareAlike} License}, - url = {https://fr.wikipedia.org/w/index.php?title=Jeu_de_la_vie&oldid=183190635}, + url = {https://fr.wikipedia.org/w/index.php?title=Jeu_de_la_vie}, abstract = {Le jeu de la vie est un automate cellulaire imaginé par John Horton Conway en 1970 et qui est probablement le plus connu de tous les automates cellulaires. Malgré des règles très simples, le jeu de la vie est Turing-complet. Le jeu de la vie est un jeu de simulation au sens mathématique plutôt que ludique. 
Bien que n'étant pas décrit par la théorie des jeux, certains le décrivent comme un « jeu à zéro joueur ».}, booktitle = {Wikipédia}, urldate = {2021-07-22}, date = {2021-05-23}, langid = {french}, - note = {Page Version {ID}: 183190635}, file = {Snapshot:/Users/baptistecdr/Zotero/storage/HDKB5PPW/index.html:text/html}, } @inreference{noauthor_automate_2021, title = {Automate cellulaire}, rights = {Creative Commons Attribution-{ShareAlike} License}, - url = {https://fr.wikipedia.org/w/index.php?title=Automate_cellulaire&oldid=183026782}, + url = {https://fr.wikipedia.org/w/index.php?title=Automate_cellulaire}, abstract = {Un automate cellulaire consiste en une grille régulière de « cellules » contenant chacune un « état » choisi parmi un ensemble fini et qui peut évoluer au cours du temps. L'état d'une cellule au temps t+1 est fonction de l'état au temps t d'un nombre fini de cellules appelé son « voisinage ». À chaque nouvelle unité de temps, les mêmes règles sont appliquées simultanément à toutes les cellules de la grille, produisant une nouvelle « génération » de cellules dépendant entièrement de la génération précédente. Étudiés en mathématiques et en informatique théorique, les automates cellulaires sont à la fois un modèle de système dynamique discret et un modèle de calcul. Le modèle des automates cellulaires est remarquable par l'écart entre la simplicité de sa définition et la complexité que peuvent atteindre certains comportements macroscopiques : l'évolution dans le temps de l'ensemble des cellules ne se réduit pas (simplement) à la règle locale qui définit le système. À ce titre il constitue un des modèles standards dans l'étude des systèmes complexes.}, booktitle = {Wikipédia}, urldate = {2021-07-22}, date = {2021-05-18}, langid = {french}, - note = {Page Version {ID}: 183026782}, file = {Snapshot:/Users/baptistecdr/Zotero/storage/L5L9W28B/index.html:text/html}, } @inreference{noauthor_programmation_2021, title = {Programmation fonctionnelle}, rights = {Creative Commons Attribution-{ShareAlike} License}, - url = {https://fr.wikipedia.org/w/index.php?title=Programmation_fonctionnelle&oldid=183271341}, + url = {https://fr.wikipedia.org/w/index.php?title=Programmation_fonctionnelle}, abstract = {La programmation fonctionnelle est un paradigme de programmation de type déclaratif qui considère le calcul en tant qu'évaluation de fonctions mathématiques. Comme le changement d'état et la mutation des données ne peuvent pas être représentés par des évaluations de fonctions la programmation fonctionnelle ne les admet pas, au contraire elle met en avant l'application des fonctions, contrairement au modèle de programmation impérative qui met en avant les changements d'état. Un langage fonctionnel est donc un langage de programmation dont la syntaxe et les caractéristiques encouragent la programmation fonctionnelle. Alors que l'origine de la programmation fonctionnelle peut être trouvée dans le lambda-calcul, le langage fonctionnel le plus ancien est Lisp, créé en 1958 par {McCarthy}. Lisp a donné naissance à des variantes telles que Scheme (1975) et Common Lisp (1984) qui, comme Lisp, ne sont pas ou peu typées. 
Des langages fonctionnels plus récents tels {ML} (1973), Haskell (1987), {OCaml}, Erlang, Clean et Oz, {CDuce}, Scala (2003), F\# ou {PureScript} (2013), Agda (en) sont fortement typés.}, @@ -160,77 +129,17 @@ Un langage fonctionnel est donc un langage de programmation dont la syntaxe et l urldate = {2021-07-22}, date = {2021-05-26}, langid = {french}, - note = {Page Version {ID}: 183271341}, file = {Snapshot:/Users/baptistecdr/Zotero/storage/Z4UFD79Y/index.html:text/html}, } @inreference{noauthor_maximum_2021, title = {Maximum subarray problem}, rights = {Creative Commons Attribution-{ShareAlike} License}, - url = {https://en.wikipedia.org/w/index.php?title=Maximum_subarray_problem&oldid=1030176929}, - abstract = {In computer science, the maximum sum subarray problem is the task of finding a contiguous subarray with the largest sum, within a given one-dimensional array A[1...n] of numbers. Formally, the task is to find indices $i$ and $j$ with $1 \leq i \leq j \leq n$, such that the sum $\sum_{x=i}^{j} A[x]$ is as large as possible. (Some formulations of the problem also allow the empty subarray to be considered; by convention, the sum of all values of the empty subarray is zero.) Each number in the input array A could be positive, negative, or zero. For example, for the array of values [−2, 1, −3, 4, −1, 2, 1, −5, 4], the contiguous subarray with the largest sum is [4, −1, 2, 1], with sum 6. -Some properties of this problem are: - -If the array contains all non-negative numbers, then the problem is trivial; a maximum subarray is the entire array. -If the array contains all non-positive numbers, then a solution is any subarray of size 1 containing the maximal value of the array (or the empty subarray, if it is permitted). -Several different sub-arrays may have the same maximum sum.This problem can be solved using several different algorithmic techniques, including brute force, divide and conquer, dynamic programming, and reduction to shortest paths.}, + url = {https://en.wikipedia.org/w/index.php?title=Maximum_subarray_problem}, + abstract = {In computer science, the maximum sum subarray problem is the task of finding a contiguous subarray with the largest sum, within a given one-dimensional array A[1...n] of numbers. 
Formally, the task is to find indices}, booktitle = {Wikipedia}, urldate = {2021-07-22}, date = {2021-06-24}, langid = {english}, - note = {Page Version {ID}: 1030176929}, file = {Snapshot:/Users/baptistecdr/Zotero/storage/LL8NK2KY/index.html:text/html}, -} \ No newline at end of file +} diff --git a/src/templates/default.latex b/src/templates/default.latex index 0678481aa0bef59bbafbe591c11ef6002ccbfa2e..a1924a1ce30dd6ea3b4f65706376ea452a9f2486 100644 --- a/src/templates/default.latex +++ b/src/templates/default.latex @@ -248,7 +248,7 @@ $endif$ contents={% \small{$name$, $surname$ - $smallTitle$ - $workTitle$ - $projectMonth$ $year$} }, - position={3.9, 0.5} + position={8, 1} } % END OF CUSTOM PACKAGE ROUTINES @@ -304,8 +304,8 @@ $endif$ \begin{document} % Nom conformes des tables -\renewcommand*\listfigurename{Liste des illustrations} -\renewcommand*\listtablename{Liste des tableaux} +\renewcommand*\listfigurename{List of illustrations} +\renewcommand*\listtablename{List of tables} % Sets the page numbering style to roman %\pagestyle{headings} \setcounter{page}{1} @@ -339,19 +339,19 @@ $endif$ } \vspace{1mm} - \Large{$workTitle$ présenté par}\\ + \Large{$workTitle$ defended by}\\ \vspace{1mm} \textbf{\Large{$author$}}\\ \vspace{2mm} \vspace{3mm} - \textbf{\Large{Ingénierie des technologies de l’information avec orientation en $orientation$}} + \textbf{\Large{Information technologies engineering with a specialisation in $orientation$}} \vspace{3mm} \large{\textbf{$projectMonth$ $year$}}\\ \vspace{5mm} \begin{tabular}{ p{6cm} } - \multicolumn{1}{c}{Professeur HES responsable}\\ + \multicolumn{1}{c}{Referent HES teacher}\\ \multicolumn{1}{c}{\textbf{$sensei$}}\\ \end{tabular} \end{center} diff --git a/src/text/00-preface.md b/src/text/00-preface.md index de32563f1cbc08a155e47ae0845065d81b202414..3c0516f6d3d3ba456e14f35b98f5ddeb29aa152b 100644 --- a/src/text/00-preface.md +++ b/src/text/00-preface.md @@ -14,7 +14,7 @@ I would like to thank the people who helped me during this project: \begin{figure} \vspace{.1cm} \begin{center} \includegraphics[width=3.72cm,height=2.4cm]{figs/front-logo.png} \end{center} \end{figure} \begin{tabular}{ p{3cm} p{1cm} p{1cm} p{6cm} } \multicolumn{1}{l}{Candidate:}& & & \multicolumn{1}{l}{Referent teacher:}\\ \multicolumn{1}{l}{\textbf{Baptiste Coudray}} & & & -\multicolumn{1}{l}{\textbf{Dr. Orestis Malaspinas}} \\ \multicolumn{1}{l}{Field of study: Information technologies engineering} & & & +\multicolumn{1}{l}{\textbf{Dr. Orestis Malaspinas}} \\ \multicolumn{1}{l}{Field of study: Information Technologies Engineering} & & & \multicolumn{1}{l}{} \\ \end{tabular} \pagebreak diff --git a/src/text/01-references.md b/src/text/01-references.md index dbe3e2ed1beaea2b078ceebfc1872b9be7b86a55..e09af7692dea5378bcf4f759434df9817fe44d3e 100644 --- a/src/text/01-references.md +++ b/src/text/01-references.md @@ -4,6 +4,8 @@ \listoffigures +\listoftables + #### Reference of the URLs {-} \begin{tabular}{ p{3cm} p{9cm} } \multicolumn{1}{l}{URL01} & diff --git a/src/text/02-introduction.md b/src/text/02-introduction.md index d3aeb1454e3e526b3b552704efa3ef16be43805c..f8b8713aaeb6ec9295d380b28934bc76d9d80065 100644 --- a/src/text/02-introduction.md +++ b/src/text/02-introduction.md @@ -5,6 +5,7 @@ Today, most computers are equipped with GPUs. They provide more and more computing cores and have become fundamental embedded high-performance computing tools. In this context, the number of applications taking advantage of these tools seems low at first glance. 
The problem is that the development tools are heterogeneous, complex, and strongly dependent on the GPU running the code. Futhark is an experimental, functional, and architecture-agnostic language; that is why it seems relevant to study it. It can generate code for standard sequential execution (on a single-core processor), for GPUs (with the CUDA and OpenCL backends), and for several cores of the same processor (shared memory). To become a tool usable on all high-performance platforms, it only lacks support for distributed computing. This work aims to develop a library that can port any Futhark code to MPI with as little effort as possible. To achieve that, we first introduce the benefits of parallelization, then present MPI and Futhark. We decide to implement a library that can parallelize cellular automata in one, two, or three dimensions. By adding Futhark on top of MPI, the programmer has the possibility to compile his code in: + * parallelized-sequential mode, * parallelized-multicore mode, * parallelized-OpenCL mode, diff --git a/src/text/03-programmation-parallele.md b/src/text/03-programmation-parallele.md index d7c1e26541d466319acf010c29761a3e5ff6c1b3..41d241a0f08635f3850576bc8ace5bcdaf8d10db 100644 --- a/src/text/03-programmation-parallele.md +++ b/src/text/03-programmation-parallele.md @@ -8,7 +8,7 @@ In parallel computing, two important laws give the theoretical speedup that can \cimg{figs/amdahls-law.png}{scale=0.6}{Amdahl's law}{Source: Taken from https://commons.wikimedia.org/, ref. URL02} -Amdahl's law states that the program's overall speed is limited by the code that cannot be parallelized. Indeed, there will almost always be a sequential part in a code that cannot be parallelized. There is, therefore, a relationship between the ratio of parallelizable code and the overall execution speed of the program. [@noauthor_amdahls_2021] +Amdahl's law states that the program's overall speed is limited by the code that cannot be parallelized. Indeed, there will almost always be a sequential part in a code that cannot be parallelized. There is, therefore, a relationship between the ratio of parallelizable code and the overall execution speed of the program [@noauthor_amdahls_2021]. In the graph above, we notice that if: @@ -21,7 +21,7 @@ In the graph above, we notice that if: \cimg{figs/gustafson-law.png}{scale=0.75}{Gustafson–Barsis's law}{Source: Taken from https://commons.wikimedia.org/, ref. URL03} -Gustafson's law says that the more significant the amount of data to be processed, the more advantageous it is to use many processors. Thus, the acceleration is linear, as can be seen on the graph.[@noauthor_gustafsons_2021] +Gustafson's law says that the larger the amount of data to be processed, the more advantageous it is to use many processors. Thus, the acceleration is linear, as can be seen on the graph [@noauthor_gustafsons_2021]. On the graph, we notice, for example, that with a code that is 90% parallelized, we have a speedup of at least x100 with 120 processors, where Amdahl's law estimated a maximum speedup of x10 with 512 processors. Gustafson's law is therefore much more optimistic in terms of performance gain. 
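To make the two laws above concrete, here is a short worked example (added purely as an illustration; $p$ denotes the parallelizable fraction of the code and $N$ the number of processors, symbols not used elsewhere in this chapter). For the 90% parallelizable code mentioned above, Amdahl's law bounds the speedup by

$$ S_{\text{Amdahl}}(N) = \frac{1}{(1-p) + \frac{p}{N}} \le \frac{1}{1-p} = \frac{1}{1-0.9} = 10, $$

which is why even 512 processors cannot exceed roughly x10, whereas Gustafson's law, which lets the problem size grow with the number of processors, gives

$$ S_{\text{Gustafson}}(N) = N - (1-p)(N-1) = 120 - 0.1 \times 119 \approx 108 $$

for $N = 120$, in line with the "at least x100" read off the graph.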
\pagebreak diff --git a/src/text/04-mpi.md b/src/text/04-mpi.md index 0d4c46b6b664ad976a8a5e86edc0788b440aefc4..daa45fecd39228975850c36ba462a3287f7009e4 100644 --- a/src/text/04-mpi.md +++ b/src/text/04-mpi.md @@ -79,3 +79,5 @@ Process 4 received token -1 from process 3 Process 0 received token -1 from process 4 ``` Thus, we can see that the processes exchange the token each in turn until node zero receives the token again. + +\pagebreak diff --git a/src/text/05-futhark.md b/src/text/05-futhark.md index 6d345fd5875d22863c066277c0b55268f7ebd853..910c0227d2adfcbc134879d87ce394c3a794cefc 100644 --- a/src/text/05-futhark.md +++ b/src/text/05-futhark.md @@ -13,17 +13,19 @@ The main goal of Futhark is to write generic code that can compile into either: Although Futhark code can compile into an executable, this feature is reserved for testing purposes because there is no (+^IO). Thus, the main interest is to write the particular functions that you would like to speed up with parallel programming and to compile them in library mode for use in a C program. -To see the performance of Futhark, Here is an example from the Futhark site that compares the resolution time of the (+^MSS) problem. The (+^MSS) problem is the task of finding a contiguous subarray with the largest sum, within a given one-dimensional array A[1...n] of numbers.[@noauthor_maximum_2021] +\pagebreak + +To see the performance of Futhark, here is an example from the Futhark site that compares the resolution time of the (+^MSS) problem. The (+^MSS) problem is the task of finding a contiguous subarray with the largest sum, within a given one-dimensional array A[1...n] of numbers [@noauthor_maximum_2021]. -\cimg{figs/mss_bench.png}{scale=0.60}{MSS runtime (lower is better)}{Source: Taken from https://futhark-lang.org/performance.html, ref. URL04} +\cimg{figs/mss_bench.png}{scale=0.35}{MSS runtime (lower is better)}{Source: Taken from https://futhark-lang.org/performance.html, ref. URL04} -This graph shows performance of a maximum segment sum implementation in Futhark and Thrust (a C++ library developed by NVIDIA for (+^GPU) programming). The sequential runtime is for Futhark code compiled to sequential (+^CPU) code and the Futhark runtime is for code compiled to (+^CUDA).[@henriksen_gotta_2021] As we can see, the Futhark version is much faster than the sequential and Thrust versions, which justify using this language in this project. +This graph shows the performance of a maximum segment sum implementation in Futhark and Thrust (a C++ library developed by NVIDIA for (+^GPU) programming). The sequential runtime is for Futhark code compiled to sequential (+^CPU) code and the Futhark runtime is for code compiled to (+^CUDA) [@henriksen_gotta_2021]. As we can see, the Futhark version is much faster than the sequential and Thrust versions, which justifies using this language in this project. \pagebreak ## Example 1 -To better understand Futhark, here is a simple example: calculating the factorial of a number. [@henriksen_basic_2021]. +To better understand Futhark, here is a simple example: calculating the factorial of a number [@henriksen_basic_2021]. ``` let fact (n: i32): i32 = reduce (*) 1 (1...n) @@ -113,7 +115,7 @@ The program's execution with the factorial of 12 returns the correct value, i.e. Functional programming is a programming paradigm that considers computation as an evaluation of mathematical functions. The origin of functional programming comes from lambda-calculus, a formal system -invented by Alonzo Church where everything is a function. 
[@noauthor_programmation_2021] +invented by Alonzo Church where everything is a function [@noauthor_programmation_2021]. This paradigm avoids side effects by prohibiting the change of the value of a variable that is not defined in the current scope. Thus, it facilitates concurrent programming because shared variables cannot be modified, which reduces concurrency bugs such as data race conditions. diff --git a/src/text/06-mpi-x-futhark.md b/src/text/06-mpi-x-futhark.md index a5a8afd3e22176f8b1ae04f9641488701a731a24..fc795ddee42326d56dc4ecc9a20f6b4f7da4d845 100644 --- a/src/text/06-mpi-x-futhark.md +++ b/src/text/06-mpi-x-futhark.md @@ -1,10 +1,10 @@ # Cellular Automaton -A cellular automaton consists of a regular grid of cells, each in one of a finite number of states. The grid can be in any finite number of dimensions. For each cell, a set of cells called its neighborhood is defined relative to the specified cell. An initial state (time $t = 0$) is selected by assigning a state for each cell. A new generation is created (advancing t by 1), according to some fixed rule (generally, a mathematical function) that determines the new state of each cell in terms of the current state of the cell and the states of the cells in its neighborhood. Typically, the rule for updating the state of cells is the same for each cell and does not change over time. [@noauthor_automate_2021] +A cellular automaton consists of a regular grid of cells, each in one of a finite number of states. The grid can be in any finite number of dimensions. For each cell, a set of cells called its neighborhood is defined relative to the specified cell. An initial state (time $t = 0$) is selected by assigning a state for each cell. A new generation is created (advancing t by 1), according to some fixed rule (generally, a mathematical function) that determines the new state of each cell in terms of the current state of the cell and the states of the cells in its neighborhood. Typically, the rule for updating the state of cells is the same for each cell and does not change over time [@noauthor_automate_2021]. The neighborhood of a cell is defined either by the Moore neighborhood or by the Von Neumann neighborhood. In a two-dimensional cellular automaton, the former gives a cell eight neighbors, while the latter gives it four. -\cimg{figs/neighbours.png}{scale=0.60}{Comparison between Von Neumann (left) and Moore (right) neighborhoods}{Source: Created by Baptiste Coudray} +\cimg{figs/neighbours.png}{scale=0.5}{Comparison between Von Neumann (left) and Moore (right) neighborhoods}{Source: Created by Baptiste Coudray} The grid on the left represents the Von Neumann neighborhood, i.e., the four neighbors of a cell. These are denoted by the four cardinal points (north, west, south, east). The grid on the right represents Moore's neighborhood, i.e., the eight neighbors of a cell. These are denoted by the four cardinal points and the four inter-cardinal points (northwest, southwest, southeast, northeast). @@ -19,7 +19,8 @@ These values are valid for a cellular automaton of dimension two and a Chebyshev ## MPI x Futhark -Our library allows parallelizing cellular automata automatically so that the programmer only has to write the Futhark function to update his cellular automaton. Our library supports cellular automata of one, two, and three dimensions. The use of the Futhark language allows to quickly update the state of the cellular automaton thanks to the different backend available. 
Therefore, several modes are available: +Our library allows parallelizing cellular automata automatically so that the programmer only has to write the Futhark function to update his cellular automaton. Our library supports cellular automata of one, two, and three dimensions, with any type of data. The use of the Futhark language makes it possible to quickly update the state of the cellular automaton thanks to the different backends available. Therefore, several modes are available: + * parallelized-sequential, the Futhark code executes sequentially, * parallelized-multicore, the Futhark code executes concurrently with POSIX threads, * parallelized-OpenCL/CUDA, the Futhark code executes on the graphics card. @@ -29,6 +30,8 @@ Our library allows parallelizing cellular automata automatically so that the pro Communication between the different MPI tasks is necessary to recover the missing neighbors and recreate the complete cellular automaton. Therefore, we create a virtual Cartesian topology. "*A virtual topology is a mechanism for naming the processes in a communicator in a way that fits the communication pattern better. The main aim of this is to make subsequent code simpler. It may also provide hints to the run-time system which allow it to optimise the communication or even hint to the loader how to configure the processes. The virtual topology might also gain us some performance benefit.*" [@macdonald_writing_nodate] +\pagebreak + #### One dimension \cimg{figs/communication_1d.png}{scale=0.60}{Example of Cartesian virtual topology in one dimension}{Source: Created by Baptiste Coudray} @@ -72,9 +75,9 @@ For example, a cellular automaton of size $4 \times 4 \times 4$, each process ha The envelope of a chunk represents the missing neighbors of the cells at the extremities of the chunk. These missing cells are needed to compute the next iteration of the chunk of the cellular automaton that the process has. #### One dimension -\cimg{figs/envelope_1d.png}{scale=0.60}{Example of the envelope of a chunk in one dimension}{Source: Created by Baptiste Coudray} +\cimg{figs/dispatch_1d.png}{scale=0.60}{Example of the envelope of a chunk in one dimension}{Source: Created by Baptiste Coudray} -In one dimension, the Moore neighborhood of a cell includes the west-neighbor and the east-neighbor. Using the previously described one-dimensional cellular automaton, we notice that the envelope of $R_n$ includes the last cell of $R_{(n-1) % N}$ and the first cell of $R_{(n+1) % N}$. Thus, the ranks exchange data via MPI using the Cartesian virtual topology. +In one dimension, the Moore neighborhood of a cell includes the west-neighbor and the east-neighbor. Using the previously described one-dimensional cellular automaton, we notice that the envelope of $R_{n}$ includes the last cell of $R_{(n-1) \% N}$ and the first cell of $R_{(n+1) \% N}$. Thus, the ranks exchange data via MPI using the Cartesian virtual topology. #### Two dimensions diff --git a/src/text/07-automate-elementaire.md b/src/text/07-automate-elementaire.md index 058769417ce154cb05075005fe90a14289eb7f44..0ab8a4e8b4c2dc1d91b9e9aa2aa852aa6f2d96f4 100644 --- a/src/text/07-automate-elementaire.md +++ b/src/text/07-automate-elementaire.md @@ -1,6 +1,6 @@ # Simple Cellular Automaton -The simplest non-trivial cellular automaton that can be conceived consists of a one-dimensional grid of cells that can take only two states ("0" or "1"), with a neighborhood consisting, for each cell, of itself and the two cells adjacent to it. 
[@noauthor_automate_2021] +The simplest non-trivial cellular automaton that can be conceived consists of a one-dimensional grid of cells that can take only two states ("0" or "1"), with a neighborhood consisting, for each cell, of itself and the two cells adjacent to it [@noauthor_automate_2021]. There are $2^3 = 8$ possible configurations (or patterns, rules) of such a neighborhood. In order for the cellular automaton to work, it is necessary to define what the state must be at the next generation of a cell for each of these patterns. The 8 rules/configurations defined is as follows: @@ -14,7 +14,9 @@ There are $2^3 = 8$ possible configurations (or patterns, rules) of such a neigh | 6 | 1 | 0 | 1 | 0 | 7 | 1 | 1 | 0 | 0 | 8 | 1 | 1 | 1 | 0 -Table: <Evolution rules for a cellule in a one dimensional cellular-automaton> +Table: Evolution rules for a cellule in a one dimensional cellular-automaton + +\pagebreak ## Example @@ -29,7 +31,7 @@ Iteration 0 is the initial state and only cell two is alive. To perform the next ## Parallelized version -With the created library, we implement the cellular automaton previously described. To do this, we create a Futhark `elementary.fut` file, which is used to calculate the next state of a part of the cellular automaton. +With the created library, we implement this (+^SCA) previously described. To do this, we create a Futhark `elementary.fut` file, which is used to calculate the next state of a part of the cellular automaton. ``` let compute_next_elems [n] (chunk_elems :[n]i8) :[]i8 = ... @@ -42,11 +44,14 @@ entry next_chunk_elems [n] (chunk_elems :[n]i8) :[]i8 = Therefore, the `elementary.fut` file contains only a function that applies the rules on the cellular automaton. Note that the function returns the cellular automaton without the envelope. ```c -void compute_next_chunk_board(struct dispatch_context *dc, struct futhark_context *fc, chunk_info_t *ci) { - struct futhark_i8_1d *fut_chunk_with_envelope = get_chunk_with_envelope(dc, fc, 1, futhark_new_i8_1d); +void compute_next_chunk_board(struct dispatch_context *dc, + struct futhark_context *fc, chunk_info_t *ci) { + struct futhark_i8_1d *fut_chunk_with_envelope = + get_chunk_with_envelope(dc, fc, 1, futhark_new_i8_1d); struct futhark_i8_1d *fut_next_chunk_elems; - futhark_entry_next_chunk_elems(fc, &fut_next_chunk_elems, fut_chunk_with_envelope); + futhark_entry_next_chunk_elems(fc, &fut_next_chunk_elems, + fut_chunk_with_envelope); futhark_context_sync(fc); futhark_values_i8_1d(fc, fut_next_chunk_elems, ci->data); @@ -60,7 +65,8 @@ int main(int argc, char *argv[]) { const int N_ITERATIONS = 100; int elems_dimensions[1] = {600}; - struct dispatch_context *disp_context = dispatch_context_new(elems_dimensions, MPI_INT8_T, 1); + struct dispatch_context *disp_context = + dispatch_context_new(elems_dimensions, MPI_INT8_T, 1); chunk_info_t ci = get_chunk_info(disp_context); init_chunk_elems(&ci); @@ -78,6 +84,7 @@ Finally, a C file `main.c` is needed to create the program's entry point. We ini We perform benchmarks to validate the scalability of our one-dimensional parallelization when compiling in sequential, multicore, (+^OpenCL), or (+^CUDA) mode. The benchmarks are performed on the HES-GE cluster (Baobab/Yggdrasil). 
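The body of `compute_next_elems` is elided above ("..."), so before turning to the benchmark figures, here is a purely illustrative sketch of what such a rule-update function could look like in Futhark. It is not the project's actual code: the choice of rule (Wolfram's rule 30) and the assumption that the chunk arrives with exactly one ghost cell at each end are made only for this example.

```
-- Illustrative sketch only, not the thesis's implementation.
-- Assumes the chunk carries a one-cell envelope at each end and
-- applies Wolfram's rule 30 to every interior cell.
let rule30 (west: i8) (centre: i8) (east: i8): i8 =
  let pattern = west * 4 + centre * 2 + east
  in if pattern == 0 || pattern >= 5 then 0 else 1

let compute_next_elems [n] (chunk_elems: [n]i8): []i8 =
  -- interior cells are at indices 1 .. n-2; the envelope is dropped
  tabulate (n - 2) (\i -> rule30 chunk_elems[i] chunk_elems[i + 1] chunk_elems[i + 2])

entry next_chunk_elems [n] (chunk_elems: [n]i8): []i8 =
  compute_next_elems chunk_elems
```

Such an entry point is what the C host code calls through the generated `futhark_entry_next_chunk_elems` wrapper, regardless of the backend selected at compile time.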
The sequential and multicore benchmarks are performed as follows: + * the cellular automaton is $300,000,000$ cells in size, * the number of tasks varies between $2^0$ and $2^7$, * 15 measurements are performed, one measurement corresponds to one iteration, @@ -93,7 +100,7 @@ The sequential and multicore benchmarks are performed as follows: | 32 | 20.938 [s] | ± 0.007 [s] | x31.4 | 15 | | 64 | 11.071 [s] | ± 0.024 [s] | x59.4 | 15 | | 128 | 5.316 [s] | ± 0.191 [s] | x123.7 | 15 | -Table: <Results for the parallelized-sequential version of Simple Cellular Automaton> +Table: Results for the parallelized-sequential version of SCA | Number of tasks | Average [s] | Standard Derivation [s] | Speedup | Number of measures | |:---:|:---:|:---:|:---:|:---:| @@ -105,9 +112,11 @@ Table: <Results for the parallelized-sequential version of Simple Cellular Autom | 32 | 25.776 [s] | ± 0.725 [s] | x27.5 | 15 | | 64 | 12.506 [s] | ± 0.554 [s] | x56.7 | 15 | | 128 | 5.816 [s] | ± 0.045 [s] | x121.8 | 15 | -Table: <Results for the parallelized-multicore version of Simple Cellular Automaton> +Table: Results for the parallelized-multicore version of SCA + +\pagebreak -\cimg{figs/elem_result_and_speedup_cpu.png}{width=\linewidth}{Benchmarks of the simple cellular automaton in parallelized-sequential/multicore}{Source: Realized by Baptiste Coudray} +\cimg{figs/elem_result_and_speedup_cpu.png}{width=\linewidth}{Benchmarks of the SCA in parallelized-sequential/multicore}{Source: Realized by Baptiste Coudray} Sur le graphique de gauche, nous comparons le temps d'exécution moyen pour chaque tâche et pour chaque version (sequential et multicore). Sur le graphique de droite, nous comparons le speedup idéal avec le speedup de la version parallelized-sequential et multicore. @@ -116,12 +125,15 @@ The more we increase the number of tasks, the more the execution time is reduced ## GPU Benchmark The (+^OpenCL) and (+^CUDA) benchmarks are performed as follows: + * the cellular automaton has $300'000'000$ cells, * the number of tasks varies between $2^0$ and $2^6$. * 15 measurements are performed, one measurement corresponds to one iteration, * the iteration is computed $50'000$ times. * From $2^0$ to $2^3$ tasks, an NVIDIA GeForce RTX 3090 is allocated for each task; beyond that, the eight graphics cards are shared equally among the ranks. 
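A quick arithmetic note on this setup (an observation added here for clarity, not taken from the thesis): with $T$ tasks and eight cards, each GPU serves $T/8$ ranks once $T > 2^3$, so at $T = 2^4$, $2^5$ and $2^6$ a card is shared by 2, 4 and 8 ranks respectively, and each rank gets a correspondingly smaller share of GPU throughput. This is worth keeping in mind when reading the speedup columns of the tables below.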
+\pagebreak + | Number of tasks | Number of GPUs | Average [s] | Standard Derivation [s] | Speedup | Number of measures | |:---:|:---:|:---:|:---:|:---:|:---:| | 1 | 1 | 166.086 [s] | ± 0.096 [s] | x1.0 | 15 | @@ -131,7 +143,7 @@ The (+^OpenCL) and (+^CUDA) benchmarks are performed as follows: | 16 | 8 | 31.675 [s] | ± 0.056 [s] | x5.2 | 15 | | 32 | 8 | 43.65 [s] | ± 0.102 [s] | x3.8 | 15 | | 64 | 8 | 67.096 [s] | ± 0.118 [s] | x2.5 | 15 | -Table: <Results for the parallelized-OpenCL version of Simple Cellular Automaton> +Table: Results for the parallelized-OpenCL version of SCA | Number of tasks | Number of GPUs | Average [s] | Standard Derivation [s] | Speedup | Number of measures | |:---:|:---:|:---:|:---:|:---:|:---:| @@ -142,9 +154,11 @@ Table: <Results for the parallelized-OpenCL version of Simple Cellular Automaton | 16 | 8 | 30.749 [s] | ± 0.069 [s] | x5.2 | 15 | | 32 | 8 | 42.352 [s] | ± 0.117 [s] | x3.8 | 15 | | 64 | 8 | 65.228 [s] | ± 0.042 [s] | x2.5 | 15 | -Table: <Results for the parallelized-CUDA version of Simple Cellular Automaton> +Table: Results for the parallelized-CUDA version of SCA + +\pagebreak -\cimg{figs/elem_result_and_speedup_gpu.png}{width=\linewidth}{Benchmarks of the simple cellular automaton in parallelized-OpenCL/CUDA}{Source: Realized by Baptiste Coudray} +\cimg{figs/elem_result_and_speedup_gpu.png}{width=\linewidth}{Benchmarks of the SCA in parallelized-OpenCL/CUDA}{Source: Realized by Baptiste Coudray} With this performance test, we notice that the computation time is essentially the same in OpenCL as in CUDA. Moreover, the parallelization follows the ideal speedup curve when the number of processes equals the number of graphics cards. However, when the eight graphics cards are shared, the speedup in OpenCL/CUDA crashes, and the computation time increases. diff --git a/src/text/08-jeu-de-la-vie.md b/src/text/08-jeu-de-la-vie.md index 5bb0a5f5137d6ed1cc46f7cc695ceb2105ac8d0a..7b41a769644f5cddaab222f67a3311fd8d0338f0 100644 --- a/src/text/08-jeu-de-la-vie.md +++ b/src/text/08-jeu-de-la-vie.md @@ -5,15 +5,15 @@ The Game of Life is a zero-player game designed by John Horton Conway in 1970. I 1. a cell has eight neighbors, 2. a cell can be either alive or dead, 3. a dead cell with exactly three living neighbors becomes alive, -4. a living cell with two or three living neighbors stays alive; otherwise, it dies. [@noauthor_jeu_2021] - -\pagebreak +4. a living cell with two or three living neighbors stays alive; otherwise, it dies [@noauthor_jeu_2021]. ## Example \cimg{figs/gol_blinker1.png}{scale=0.40}{First state of blinker}{Source: Taken from \url{https://commons.wikimedia.org/}, ref. URL05. Re-created by Baptiste Coudray} +\pagebreak + A basic example is a blinker: * the cell (one, one) and (one, three) die because they have seven dead neighbors and one living neighbor (rule n°4), @@ -24,8 +24,6 @@ A basic example is a blinker: Thus, after the application of the rules, the horizontal line becomes a vertical line. Then, at the next iteration, the vertical line becomes a horizontal line again. -\pagebreak - ## Parallelized version We create the game of life with our library to test it with a two-dimensional cellular automaton. @@ -36,6 +34,7 @@ The code is relatively the same as the previous example; therefore, it is not ex We perform benchmarks to validate the scalability of our two-dimensional parallelization when compiling in sequential, multicore, (+^OpenCL), or (+^CUDA) mode. The benchmarks are performed on the (+^HES-GE) cluster (Baobab/Yggdrasil). 
The sequential and multicore benchmarks are performed as follows: + * the cellular automaton is $900,000,000$ cells in size, * the number of tasks varies between $2^0$ and $2^7$, * 15 measurements are performed, one measurement corresponds to one iteration, @@ -51,7 +50,7 @@ The sequential and multicore benchmarks are performed as follows: | 32 | 100.422 [s] | ± 0.068 [s] | x34.6 | 15 | | 64 | 55.986 [s] | ± 1.587 [s] | x62.0 | 15 | | 128 | 28.111 [s] | ± 0.263 [s] | x123.5 | 15 | -*Array of results for the parallelized-sequential version* +Table: Results for the parallelized-sequential version of Game of Life | Number of tasks | Average [s] | Standard Derivation [s] | Speedup | Number of measures | |:---:|:---:|:---:|:---:|:---:| @@ -63,23 +62,26 @@ The sequential and multicore benchmarks are performed as follows: | 32 | 71.463 [s] | ± 0.485 [s] | x30.2 | 15 | | 64 | 39.116 [s] | ± 0.489 [s] | x55.1 | 15 | | 128 | 14.008 [s] | ± 0.335 [s] | x153.8 | 15 | -*Array of results for the parallelized-multicore version* +Table: Results for the parallelized-multicore version of Game of Life + +\pagebreak \cimg{figs/gol_result_and_speedup_cpu.png}{width=\linewidth}{Benchmarks of the game of life in parallelized-sequential/multicore}{Source: Realized by Baptiste Coudray} We notice an apparent difference between the parallelized-sequential and multicore version when there is only one task. The multicore version is $1.6$ times faster than the sequential version. Nevertheless, both versions have a perfect speedup. The multicore version even gets a maximum speedup of x154 with 128 tasks. This performance can be explained by the caching of data in the processor and the use of threads. -\pagebreak - ## GPU Benchmarks The (+^OpenCL) and (+^CUDA) benchmarks are performed as follows: + * the cellular automaton has $900'000'000$ cells, * the number of tasks varies between $2^0$ and $2^6$. * 15 measurements are performed, one measurement corresponds to one iteration, * the iteration is computed $8'000$ times. * From $2^0$ to $2^3$ tasks, an NVIDIA GeForce RTX 3090 is allocated for each task; beyond that, the eight graphics cards are shared equally among the ranks. 
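As with the simple cellular automaton, the Game of Life kernel itself is not reproduced in the thesis. Purely as an illustration of the rules listed at the start of this chapter (and not the project's actual code), a Futhark update function might look as follows, again assuming a chunk that carries a one-cell envelope on every side:

```
-- Illustrative sketch only. The chunk is assumed to carry a one-cell
-- envelope on every side; the result drops that envelope.
let next_cell (alive: i8) (neighbours: i8): i8 =
  if neighbours == 3 || (alive == 1 && neighbours == 2) then 1 else 0

let compute_next_cells [n][m] (chunk: [n][m]i8): [][]i8 =
  tabulate_2d (n - 2) (m - 2) (\i j ->
    let neighbours = chunk[i, j]   + chunk[i, j+1]   + chunk[i, j+2] +
                     chunk[i+1, j]                   + chunk[i+1, j+2] +
                     chunk[i+2, j] + chunk[i+2, j+1] + chunk[i+2, j+2]
    in next_cell chunk[i+1, j+1] neighbours)
```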
+\pagebreak + | Number of tasks | Number of GPUs | Average [s] | Standard Derivation [s] | Speedup | Number of measures | |:---:|:---:|:---:|:---:|:---:|:---:| | 1 | 1 | 230.144 [s] | ± 0.225 [s] | x1.0 | 15 | @@ -89,7 +91,7 @@ The (+^OpenCL) and (+^CUDA) benchmarks are performed as follows: | 16 | 8 | 30.579 [s] | ± 0.085 [s] | x7.5 | 15 | | 32 | 8 | 32.323 [s] | ± 0.045 [s] | x7.1 | 15 | | 64 | 8 | 35.551 [s] | ± 0.133 [s] | x6.5 | 15 | -Table: <Results for the parallelized-OpenCL version of Game of Life> +Table: Results for the parallelized-OpenCL version of Game of Life | Number of tasks | Number of GPUs | Average [s] | Standard Derivation [s] | Speedup | Number of measures | |:---:|:---:|:---:|:---:|:---:|:---:| @@ -100,7 +102,9 @@ Table: <Results for the parallelized-OpenCL version of Game of Life> | 16 | 8 | 29.174 [s] | ± 0.079 [s] | x7.5 | 15 | | 32 | 8 | 30.844 [s] | ± 0.051 [s] | x7.1 | 15 | | 64 | 8 | 34.192 [s] | ± 0.12 [s] | x6.4 | 15 | -Table: <Results for the parallelized-CUDA version of Game of Life> +Table: Results for the parallelized-CUDA version of Game of Life + +\pagebreak \cimg{figs/gol_result_and_speedup_gpu.png}{width=\linewidth}{Benchmarks of the game of life in parallelized-OpenCL/CUDA}{Source: Realized by Baptiste Coudray} diff --git a/src/text/09-lattice-boltzmann.md b/src/text/09-lattice-boltzmann.md index 252b4e14a27d0bdb7edace4d46628e9c31d87ce8..053d173e53eebe72f7a3308ad112b38cd45f9fde 100644 --- a/src/text/09-lattice-boltzmann.md +++ b/src/text/09-lattice-boltzmann.md @@ -16,6 +16,7 @@ We create the lattice-Boltzmann method with our library to test it with a three- We perform benchmarks to validate the scalability of our three-dimensional parallelization when compiling in sequential, multicore, (+^OpenCL), or (+^CUDA) mode. The benchmarks are performed on the (+^HES-GE) cluster (Baobab/Yggdrasil). The sequential and multicore benchmarks are performed as follows: + * the cellular automaton is $27'000'000$ cells in size, * the number of tasks varies between $2^0$ and $2^7$, * 15 measurements are performed, one measurement corresponds to one iteration, @@ -31,7 +32,7 @@ The sequential and multicore benchmarks are performed as follows: | 32 | 41.04 [s] | ± 1.59 [s] | x17.4 | 15 | | 64 | 22.188 [s] | ± 0.321 [s] | x32.3 | 15 | | 128 | 17.415 [s] | ± 4.956 [s] | x41.1 | 15 | -Table: <Results for the parallelized-sequential version of Lattice-Boltzmann> +Table: Results for the parallelized-sequential version of Lattice-Boltzmann | Number of tasks | Average [s] | Standard Derivation [s] | Speedup | Number of measures | |:---:|:---:|:---:|:---:|:---:| @@ -43,7 +44,9 @@ Table: <Results for the parallelized-sequential version of Lattice-Boltzmann> | 32 | 46.285 [s] | ± 0.138 [s] | x15.0 | 15 | | 64 | 24.059 [s] | ± 0.061 [s] | x28.9 | 15 | | 128 | 16.614 [s] | ± 1.088 [s] | x41.9 | 15 | -Table: <Results for the parallelized-multicore version of Lattice-Boltzmann> +Table: Results for the parallelized-multicore version of Lattice-Boltzmann + +\pagebreak \cimg{figs/lbm_result_and_speedup_cpu.png}{width=\linewidth}{Benchmarks of the lattice-Boltzmann method in parallelized-sequential/multicore}{Source: Realized by Baptiste Coudray} @@ -52,6 +55,7 @@ Contrairement aux benchmarks précédents, les speedups ne suivent pas la courbe ## GPU Benchmark The (+^OpenCL) and (+^CUDA) benchmarks are performed as follows: + * the cellular automaton has $27'000'000$ cells, * the number of tasks varies between $2^0$ and $2^6$. 
* 15 measurements are performed, one measurement corresponds to one iteration, diff --git a/src/text/ZZ-glossaire.tex b/src/text/ZZ-glossaire.tex index 2537582d2dacaa309010b0ab3de3b391544ba18f..5e1e61b12cb0e657ad295d26cb25d844c22f9559 100644 --- a/src/text/ZZ-glossaire.tex +++ b/src/text/ZZ-glossaire.tex @@ -16,3 +16,4 @@ \newacronym{HES-GE}{HES-GE}{Haute École Spécialisée de GEnève} \newacronym{IO}{I/O}{Input/Output} \newacronym{MSS}{MSS}{Maximum Segment Sum} +\newacronym{SCA}{SCA}{Simple Cellular Automaton}