diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..b13c5322ee0e46416bd402f8349b145fc58c4365
Binary files /dev/null and b/.DS_Store differ
diff --git a/src/config.yaml b/src/config.yaml
index 834079e010a29ed6764e0bdee258466e7e8820bf..3674eea16057e9109a5009f984bec81e0ff0b54f 100644
--- a/src/config.yaml
+++ b/src/config.yaml
@@ -1,7 +1,7 @@
 ---
 author:
   - Baptiste Coudray
-title: "Futhark-MPI: Distributed High-Performance Computing For People"
+title: "MPI-Futhark: Distributed High-Performance Computing For People"
 smallTitle: "Distributed High-Performance Computing For People"
 institute: University of Applied Sciences and Arts Western Switzerland
 name: Baptiste
@@ -11,8 +11,8 @@ orientation: Software and Complex Systems
 projectMonth: August
 year: 2021
 sensei: "Dr. Orestis Malaspinas"
-frontLogoLegend: "Jeu de la vie" # La légende de l'image de couverture
-frontLogoSourceURL: "https://upload.wikimedia.org/wikipedia/commons/e/e5/Gospers_glider_gun.gif" # La source de l'image de couverture
+frontLogoLegend: "MPI x Futhark with examples" # La légende de l'image de couverture
+frontLogoSourceURL: "Realized by Baptiste Coudray" # La source de l'image de couverture
 workTitle: "Bachelor thesis"
 workFor: "..."
 bibliography: my.bib
diff --git a/src/figs/dispatch_1d.png b/src/figs/dispatch_1d.png
index dbd4f7054c3ad3a6377f8fb76b4e0f897d31a48e..f22af69b233fca6fbff4f59e31b697aa92a882fe 100644
Binary files a/src/figs/dispatch_1d.png and b/src/figs/dispatch_1d.png differ
diff --git a/src/figs/dispatch_2d.png b/src/figs/dispatch_2d.png
new file mode 100644
index 0000000000000000000000000000000000000000..c0770dda14b998d4bf4762fa1335a496537356fa
Binary files /dev/null and b/src/figs/dispatch_2d.png differ
diff --git a/src/figs/envelope_1d.png b/src/figs/envelope_1d.png
new file mode 100644
index 0000000000000000000000000000000000000000..290688859b1712dd8c4ec502544489620179b5c2
Binary files /dev/null and b/src/figs/envelope_1d.png differ
diff --git a/src/figs/envelope_2d.png b/src/figs/envelope_2d.png
new file mode 100644
index 0000000000000000000000000000000000000000..8533bc2c188a69c1b2d4770e64d4906d1f5b3588
Binary files /dev/null and b/src/figs/envelope_2d.png differ
diff --git a/src/figs/front-logo.graffle b/src/figs/front-logo.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..ae6d14c46ccf1e9d49563fb4ea9d02f165dc483c
Binary files /dev/null and b/src/figs/front-logo.graffle differ
diff --git a/src/figs/front-logo.png b/src/figs/front-logo.png
index a6678ffa104d45cfcecbf3d0577b49cb827b322e..a625ab431ee9f17eb7bf2b7dc32d52b127e15ce3 100644
Binary files a/src/figs/front-logo.png and b/src/figs/front-logo.png differ
diff --git a/src/figs/lbm_result_and_speedup_gpu.png b/src/figs/lbm_result_and_speedup_gpu.png
new file mode 100644
index 0000000000000000000000000000000000000000..a57419bece1e820afd7c67ae0fe72b3c013828ee
Binary files /dev/null and b/src/figs/lbm_result_and_speedup_gpu.png differ
diff --git a/src/figs/ring.png b/src/figs/ring.png
index 7cafe8f3339fc656c5a5f432679789c3403206b9..9794aa599240d99519d9fa508ae257f92a16fbad 100644
Binary files a/src/figs/ring.png and b/src/figs/ring.png differ
diff --git a/src/iso690.csl b/src/iso690.csl
index 93cca47ed37c6963ddc6ecb0966e8393d2b9e870..16de27a4292af756455a7a91aae7a70cfd58b6b9 100644
--- a/src/iso690.csl
+++ b/src/iso690.csl
@@ -25,12 +25,12 @@
   </info>
   <locale>
     <terms>
-      <term name="no date">[sans date]</term>
+      <term name="no date">[no date]</term>
       <term name="in">in</term>
-      <term name="online">en&#160;ligne</term>
-      <term name="accessed">consulté&#160;le</term>
-      <term name="retrieved">disponible</term>
-      <term name="from">à l'adresse</term>
+      <term name="online">online</term>
+      <term name="accessed">accessed&#160;on</term>
+      <term name="retrieved">retrieved</term>
+      <term name="from">from</term>
     </terms>
   </locale>
   <macro name="author">
diff --git a/src/my.bib b/src/my.bib
index 9a9adbe4ef4dc74d7cda0aeb619fbadb72fc06a6..6ced7824db2cd1dd1980b9d16ad7cf0001a5a6a0 100644
--- a/src/my.bib
+++ b/src/my.bib
@@ -7,7 +7,6 @@
 	urldate = {2021-07-22},
 	date = {2021-03-17},
 	langid = {french},
-	note = {Page Version {ID}: 180967726},
 	file = {Snapshot:/Users/baptistecdr/Zotero/storage/GLQPSHE4/index.html:text/html},
 }
 
@@ -25,24 +24,22 @@
 @inreference{noauthor_amdahls_2021,
 	title = {Amdahl's law},
 	rights = {Creative Commons Attribution-{ShareAlike} License},
-	url = {https://en.wikipedia.org/w/index.php?title=Amdahl%27s_law&oldid=1034193438},
+	url = {https://en.wikipedia.org/w/index.php?title=Amdahl%27s_law},
 	booktitle = {Wikipedia},
 	urldate = {2021-07-22},
 	date = {2021-07-18},
 	langid = {english},
-	note = {Page Version {ID}: 1034193438},
 	file = {Snapshot:/Users/baptistecdr/Zotero/storage/4KJVD4JN/index.html:text/html},
 }
 
 @inreference{noauthor_gustafsons_2021,
 	title = {Gustafson's law},
 	rights = {Creative Commons Attribution-{ShareAlike} License},
-	url = {https://en.wikipedia.org/w/index.php?title=Gustafson%27s_law&oldid=1031307338},
+	url = {https://en.wikipedia.org/w/index.php?title=Gustafson%27s_law},
 	booktitle = {Wikipedia},
 	urldate = {2021-07-22},
 	date = {2021-06-30},
 	langid = {english},
-	note = {Page Version {ID}: 1031307338},
 	file = {Snapshot:/Users/baptistecdr/Zotero/storage/FGKVKJQD/index.html:text/html},
 }
 
@@ -89,60 +86,55 @@
 @inreference{noauthor_jeu_2021,
 	title = {Jeu de la vie},
 	rights = {Creative Commons Attribution-{ShareAlike} License},
-	url = {https://fr.wikipedia.org/w/index.php?title=Jeu_de_la_vie&oldid=183190635},
+	url = {https://fr.wikipedia.org/w/index.php?title=Jeu_de_la_vie},
 	booktitle = {Wikipédia},
 	urldate = {2021-07-22},
 	date = {2021-05-23},
 	langid = {french},
-	note = {Page Version {ID}: 183190635},
 	file = {Snapshot:/Users/baptistecdr/Zotero/storage/HDKB5PPW/index.html:text/html},
 }
 
 @inreference{noauthor_automate_2021,
 	title = {Automate cellulaire},
 	rights = {Creative Commons Attribution-{ShareAlike} License},
-	url = {https://fr.wikipedia.org/w/index.php?title=Automate_cellulaire&oldid=183026782},
+	url = {https://fr.wikipedia.org/w/index.php?title=Automate_cellulaire},
 	booktitle = {Wikipédia},
 	urldate = {2021-07-22},
 	date = {2021-05-18},
 	langid = {french},
-	note = {Page Version {ID}: 183026782},
 	file = {Snapshot:/Users/baptistecdr/Zotero/storage/L5L9W28B/index.html:text/html},
 }
 
 @inreference{noauthor_programmation_2021,
 	title = {Programmation fonctionnelle},
 	rights = {Creative Commons Attribution-{ShareAlike} License},
-	url = {https://fr.wikipedia.org/w/index.php?title=Programmation_fonctionnelle&oldid=183271341},
+	url = {https://fr.wikipedia.org/w/index.php?title=Programmation_fonctionnelle},
 	booktitle = {Wikipédia},
 	urldate = {2021-07-22},
 	date = {2021-05-26},
 	langid = {french},
-	note = {Page Version {ID}: 183271341},
 	file = {Snapshot:/Users/baptistecdr/Zotero/storage/Z4UFD79Y/index.html:text/html},
 }
 
 @inreference{noauthor_maximum_2021,
 	title = {Maximum subarray problem},
 	rights = {Creative Commons Attribution-{ShareAlike} License},
-	url = {https://en.wikipedia.org/w/index.php?title=Maximum_subarray_problem&oldid=1030176929},
+	url = {https://en.wikipedia.org/w/index.php?title=Maximum_subarray_problem},
 	booktitle = {Wikipedia},
 	urldate = {2021-07-22},
 	date = {2021-06-24},
 	langid = {english},
-	note = {Page Version {ID}: 1030176929},
 	file = {Snapshot:/Users/baptistecdr/Zotero/storage/LL8NK2KY/index.html:text/html},
 }
 
 @inreference{noauthor_distributed_2021,
 	title = {Distributed computing},
 	rights = {Creative Commons Attribution-{ShareAlike} License},
-	url = {https://en.wikipedia.org/w/index.php?title=Distributed_computing&oldid=1033553148},
+	url = {https://en.wikipedia.org/w/index.php?title=Distributed_computing},
 	booktitle = {Wikipedia},
 	urldate = {2021-07-23},
 	date = {2021-07-14},
 	langid = {english},
-	note = {Page Version {ID}: 1033553148},
 	file = {Snapshot:/Users/baptistecdr/Zotero/storage/ZF8EB2I9/index.html:text/html},
 }
 
@@ -154,4 +146,4 @@
 	date = {2010},
 	langid = {english},
 	file = {Chapter 1 Covering Multithreading Basics (Multithreaded Programming Guide):/Users/baptistecdr/Zotero/storage/ASQQ8TRR/index.html:text/html},
-}
\ No newline at end of file
+}
diff --git a/src/templates/default.latex b/src/templates/default.latex
index a1924a1ce30dd6ea3b4f65706376ea452a9f2486..e27f675eb050afeea860a07b10ccf1f2e2df743f 100644
--- a/src/templates/default.latex
+++ b/src/templates/default.latex
@@ -333,7 +333,7 @@ $endif$
     \huge{$title$}\\
     \vspace{.5cm}
     \IfFileExists{figs/front-logo.png}{
-      \includegraphics[scale=1.2]{figs/front-logo.png}\\
+      \includegraphics[scale=0.6]{figs/front-logo.png}\\
     }{
       \vspace{8cm}
     }
@@ -363,7 +363,7 @@ $endif$
 % Illustration URL page
 \vspace*{\fill}
 \IfFileExists{figs/front-logo.png}{
-  $frontLogoLegend$ \url{$frontLogoSourceURL$}
+  Legend and source of the cover picture: $frontLogoLegend$ $frontLogoSourceURL$
 }{
   Cette page a été laissée blanche intentionnellement.
 }
diff --git a/src/text/00-preface.md b/src/text/00-preface.md
index 3c0516f6d3d3ba456e14f35b98f5ddeb29aa152b..22c765dea2766990ca8ac275ad54184d13bde4b3 100644
--- a/src/text/00-preface.md
+++ b/src/text/00-preface.md
@@ -10,8 +10,9 @@ I would like to thank the people who helped me during this project:
 
 # Abstract {-}
 
+Today, most computers are equipped with (+^GPU). They provide more and more computing cores and have become fundamental embedded high-performance computing tools. In this context, the number of applications taking advantage of these tools seems low at first glance. The problem is that the development tools are heterogeneous, complex, and strongly dependent on the (+GPU) running the code. Futhark is an experimental, functional, and architecture agnostic language; that is why it seems relevant to study it.  It allows generating code allowing a standard sequential execution (on a single-core processor), on (+GPU) (with (+CUDA) and (+OpenCL) backends), on several cores of the same processor (shared memory). To make it a tool that could be used on all high-performance platforms, it lacks support for distributed computing with (+MPI). We create a library that distributes a cellular automaton over multiple compute nodes via MPI. The update of the cellular automaton is computed via the Futhark language using one of the four available backends (sequential, multicore, OpenCL, and CUDA). To validate our library, we implement a cellular automaton in one dimension ((+SCA)), in two dimensions (Game of Life) and three dimensions ((+LBM)). Finally, with the performance tests performed, we obtain an ideal speedup in one and two dimensions with the sequential and multicore backend. With the GPU backend, we obtain an ideal speedup only when the number of tasks equals the number of GPUs.
 
-\begin{figure} \vspace{.1cm} \begin{center} \includegraphics[width=3.72cm,height=2.4cm]{figs/front-logo.png}
+\begin{figure} \vspace{.1cm} \begin{center} \includegraphics[scale=0.4]{figs/front-logo.png}
 \end{center} \end{figure} \begin{tabular}{ p{3cm} p{1cm} p{1cm} p{6cm} } \multicolumn{1}{l}{Candidate:}& & &
 \multicolumn{1}{l}{Referent teacher:}\\ \multicolumn{1}{l}{\textbf{Baptiste Coudray}} & & &
 \multicolumn{1}{l}{\textbf{Dr. Orestis Malaspinas}} \\ \multicolumn{1}{l}{Field of study: Information Technologies Engineering} & & &
diff --git a/src/text/01-references.md b/src/text/01-references.md
index e09af7692dea5378bcf4f759434df9817fe44d3e..c2f36142a5d36af45cc62d24266fa76b22f82545 100644
--- a/src/text/01-references.md
+++ b/src/text/01-references.md
@@ -8,11 +8,11 @@
 
 #### Reference of the URLs {-}
 
-\begin{tabular}{ p{3cm} p{9cm} } \multicolumn{1}{l}{URL01} &
-\multicolumn{1}{l}{\url{https://upload.wikimedia.org/wikipedia/commons/e/e5/Gospers_glider_gun.gif}}\\
-\multicolumn{1}{l}{URL02} & \multicolumn{1}{l}{\url{https://commons.wikimedia.org/wiki/File:AmdahlsLaw.svg}} \\
-\multicolumn{1}{l}{URL03} & \multicolumn{1}{l}{\url{https://commons.wikimedia.org/wiki/File:Gustafson.png}} \\
-\multicolumn{1}{l}{URL04} & \multicolumn{1}{l}{\url{https://commons.wikimedia.org/wiki/File:Elder_futhark.png}} \\
+\begin{tabular}{ p{3cm} p{9cm} } 
+\multicolumn{1}{l}{URL01} & \multicolumn{1}{l}{\url{https://commons.wikimedia.org/wiki/File:AmdahlsLaw.svg}} \\
+\multicolumn{1}{l}{URL02} & \multicolumn{1}{l}{\url{https://commons.wikimedia.org/wiki/File:Gustafson.png}} \\
+\multicolumn{1}{l}{URL03} & \multicolumn{1}{l}{\url{https://commons.wikimedia.org/wiki/File:Elder_futhark.png}} \\
+\multicolumn{1}{l}{URL04} & \multicolumn{1}{l}{\url{https://futhark-lang.org/images/mss.svg}} \\
 \multicolumn{1}{l}{URL05} & \multicolumn{1}{l}{\url{https://commons.wikimedia.org/wiki/File:Gol-blinker1.png}} \\
 \multicolumn{1}{l}{URL06} & \multicolumn{1}{l}{\url{https://commons.wikimedia.org/wiki/File:Gol-blinker2.png}} \\
 \end{tabular}
diff --git a/src/text/02-introduction.md b/src/text/02-introduction.md
index f8b8713aaeb6ec9295d380b28934bc76d9d80065..1900e9f19e40a51a69170b42539bed08b16109b0 100644
--- a/src/text/02-introduction.md
+++ b/src/text/02-introduction.md
@@ -2,26 +2,32 @@
 
 # Introduction {-}
 
-Today, most computers are equipped with GPUs. They provide more and more computing cores and have become fundamental embedded high-performance computing tools. In this context, the number of applications taking advantage of these tools seems low at first glance. The problem is that the development tools are heterogeneous, complex, and strongly dependent on the GPU running the code. Futhark is an experimental, functional, and architecture agnostic language; that is why it seems relevant to study it.  It allows generating code allowing a standard sequential execution (on a single-core processor), on GPU (with CUDA and OpenCL backends), on several cores of the same processor (shared memory). To make it a tool that could be used on all high-performance platforms, it lacks support for distributed computing. This work aims to develop a library that can port any Futhark code to an MPI library with as little effort as possible.
+Today, most computers are equipped with (+^GPU). They provide more and more computing cores and have become fundamental embedded high-performance computing tools. In this context, the number of applications taking advantage of these tools seems low at first glance. The problem is that the development tools are heterogeneous, complex, and strongly dependent on the (+GPU) running the code. Futhark is an experimental, functional, and architecture agnostic language; that is why it seems relevant to study it.  It allows generating code allowing a standard sequential execution (on a single-core processor), on (+GPU) (with (+CUDA) and (+OpenCL) backends), on several cores of the same processor (shared memory). To make it a tool that could be used on all high-performance platforms, it lacks support for distributed computing. This work aims to develop a library that can port any Futhark code to an (+MPI) library with as little effort as possible.
 
-To achieve that, we introduce the interest of parallelization --, then what is MPI and Futhark. We decide to implement a library that can parallelize cellular automaton in, one, two or three dimensions. By adding Futhark on top of MPI, the programmer will have the possibilities to compile his code in :
+To achieve that, we introduce the meaning of distributed high-performance computing, then what (+MPI) and Futhark are. We decide to implement a library that can parallelize a cellular automaton in one, two, or three dimensions. By adding Futhark on top of (+MPI), the programmer will have the possibility to compile his code in:
 
 * parallelized-sequential mode,
 * parallelized-multicore mode,
 * parallelized-OpenCL mode,
 * parallelized-CUDA mode.
 
-Finally, we used this library by implementing a cellular automata in each dimension, and we perform a benchmark to ensure that each cellular automata scales correctly in these four modes.
+Finally, we use this library by implementing a cellular automaton in each dimension:
 
-The leading resources we used to carry out this project were Futhark and MPI user guide. We also exchanged with Futhark creator Troels Henriksen.
+* a (+SCA) in one dimension,
+* the Game of Life in two dimensions,
+* the (+LBM) in three dimensions.
+
+We perform a benchmark to ensure that each cellular automaton scales correctly in the four modes.
+
+The leading resources we used to carry out this project were the Futhark and (+MPI) user guides. We also corresponded with the creator of Futhark, Troels Henriksen.
 
 ## Working method {-}
 
 During this project, we use Git and put the source code on the Gitlab platform of HEPIA:
 
-* https://gitedu.hesge.ch/baptiste.coudray/projet-de-bachelor
-  * Source code of the library with usage examples.
-* https://gitedu.hesge.ch/baptiste.coudray/projet-de-semestre/-/tree/report
-  * Source code of this report
+* Source code of the library with usage examples
+  * https://gitedu.hesge.ch/baptiste.coudray/projet-de-bachelor
+* Source code of this report
+  * https://gitedu.hesge.ch/baptiste.coudray/projet-de-semestre/-/tree/report
 
 \pagebreak
diff --git a/src/text/03-programmation-parallele.md b/src/text/03-programmation-parallele.md
index 62cc67c25a4eb6e3c3f60d3bfb09873d23c25c41..7638906c591f67baa7f2c00ae160c24de2a76465 100644
--- a/src/text/03-programmation-parallele.md
+++ b/src/text/03-programmation-parallele.md
@@ -14,7 +14,7 @@ In parallel computing, two important laws give the theoretical speedup that can
 
 \pagebreak
 
-\cimg{figs/amdahls-law.png}{scale=0.6}{Amdahl's law}{Source: Taken from https://commons.wikimedia.org/, ref. URL02}
+\cimg{figs/amdahls-law.png}{scale=0.6}{Amdahl's law}{Source: Taken from https://commons.wikimedia.org/, ref. URL01}
 
 Amdahl's law states that the program's overall speed is limited by the code that cannot be parallelized. Indeed, there will almost always be a sequential part in a code that cannot be parallelized. There is, therefore, a relationship between the ratio of parallelizable code and the overall execution speed of the program [@noauthor_amdahls_2021].
 
@@ -27,7 +27,7 @@ In the graph above, we notice that if:
 
 \pagebreak
 
-\cimg{figs/gustafson-law.png}{scale=0.75}{Gustafson–Barsis's law}{Source: Taken from https://commons.wikimedia.org/, ref. URL03}
+\cimg{figs/gustafson-law.png}{scale=0.75}{Gustafson–Barsis's law}{Source: Taken from https://commons.wikimedia.org/, ref. URL02}
 
 Gustafson's law says that the more significant the amount of data to be processed, the more advantageous it is to use many processors. Thus, the acceleration is linear, as can be seen on the graph [@noauthor_gustafsons_2021].
 On the graph, we notice, for example, that with a code that is 90% parallelized, we have a speedup of at least x100 with 120 processors, where Amdahl's law estimated a maximum speedup of x10 with 512 processors. Gustafson's law is therefore much more optimistic in terms of performance gain.
diff --git a/src/text/04-mpi.md b/src/text/04-mpi.md
index daa45fecd39228975850c36ba462a3287f7009e4..1fb0aaf71c2efe888b01733323bcdcae8fb30783 100644
--- a/src/text/04-mpi.md
+++ b/src/text/04-mpi.md
@@ -1,17 +1,17 @@
 # Message Passing Interface
 
-In order to realize parallel programming, the standard (+^MPI) was created in 1993-1994 to standardize the passage of messages between several computers or in a computer with several processors/cores [@noauthor_message_2021]. (+^MPI) is, therefore, a communication protocol and not a programming language. Currently, the latest version of (+^MPI) is 4.0 which approved in 2021. There are several implementations of the standard:
+In order to realize parallel programming, the standard (+MPI) was created in 1993-1994 to standardize the passage of messages between several computers or in a computer with several processors/cores [@noauthor_message_2021]. (+MPI) is, therefore, a communication protocol and not a programming language. Currently, the latest version of (+MPI) is 4.0, which was approved in 2021. There are several implementations of the standard:
 
 * MPICH, which support for the moment, MPI 3.1,
 * Open MPI, which support, for the moment, MPI 3.1
 
-We use Open MPI throughout this project on the cluster of the (+^HES-GE).
+We use Open MPI throughout this project on the cluster of the (+HES-GE).
 
 \pagebreak
 
 ## Example
 
-To understand the basis of (+^MPI), let us look at an example mimicking a *token ring* network [@kendall_mpi_2018]. This type of network forces a process to send a message to the message in the console, for example, only if it has the token in its possession. Moreover, once it has emitted its message, the process must transmit the token to its neighbor.
+To understand the basis of (+MPI), let us look at an example mimicking a *token ring* network [@kendall_mpi_2018]. This type of network forces a process to emit a message (in the console, for example) only if it has the token in its possession. Moreover, once it has emitted its message, the process must transmit the token to its neighbor.
 
 \cimg{figs/ring.png}{scale=0.4}{Imitation of a network in \textit{token ring}}{Source: Created by Baptiste Coudray}
 
@@ -67,8 +67,8 @@ mpicc ring.c -o ring
 mpirun -n 5 ./ring
 ```
 
-To compile a (+^MPI) program, you have to go through the `mpicc` program, which is a wrapper
-around (+^GCC). Indeed, `mpicc` automatically adds the correct compilation parameters to the (+^GCC) program.
+To compile a (+MPI) program, you have to go through the `mpicc` program, which is a wrapper
+around (+GCC). Indeed, `mpicc` automatically adds the correct compilation parameters to the (+GCC) program.
 Next, our compiled program must be run through `mpirun` to distribute our program to compute nodes. Finally, the `-n` parameter is used to specify the number of processes to run.
 
 ```
diff --git a/src/text/05-futhark.md b/src/text/05-futhark.md
index 910c0227d2adfcbc134879d87ce394c3a794cefc..a16b1dbd9bc3b8da6ff8f833c28670081f11edff 100644
--- a/src/text/05-futhark.md
+++ b/src/text/05-futhark.md
@@ -1,25 +1,25 @@
 # Introduction to the language Futhark
 
-\cimg{figs/futhark.png}{scale=0.60}{Futhark}{Source: Taken from https://commons.wikimedia.org/, ref. URL04}
+\cimg{figs/futhark.png}{scale=0.60}{Futhark}{Source: Taken from https://commons.wikimedia.org/, ref. URL03}
 
-Futhark is a purely functional programming language for producing parallelizable code on (+^CPU) or (+^GPU). It was designed by Troels Henriksen, Cosmin Oancea and Martin Elsman at the University of Copenhagen.
+Futhark is a purely functional programming language for producing parallelizable code on (+CPU) or (+GPU). It was designed by Troels Henriksen, Cosmin Oancea and Martin Elsman at the University of Copenhagen.
 The main goal of Futhark is to write generic code that can compile into either:
 
-* (+^OpenCL),
-* (+^CUDA),
-* multi-threaded (+^POSIX) C,
+* (+OpenCL),
+* (+CUDA),
+* multi-threaded (+POSIX) C,
 * sequential C,
 * sequential Python.
 
-Although a Futhark code can compile into an executable, this feature reserves for testing purposes because there is no (+^IO). Thus, the main interest is to write particular functions that you would like to speed up thanks to parallel programming and compile in library mode to use in a C program.
+Although a Futhark code can compile into an executable, this feature reserves for testing purposes because there is no (+IO). Thus, the main interest is to write particular functions that you would like to speed up thanks to parallel programming and compile in library mode to use in a C program.
 
 \pagebreak
 
-To see the performance of Futhark, Here is an example from the Futhark site that compares the resolution time of the (+^MSS) problem. The (+^MSS) problem is the task of finding a contiguous subarray with the largest sum, within a given one-dimensional array A[1...n] of numbers [@noauthor_maximum_2021].
+To see the performance of Futhark, here is an example from the Futhark site that compares the resolution time of the (+MSS) problem. The (+MSS) problem is the task of finding a contiguous subarray with the largest sum, within a given one-dimensional array A[1...n] of numbers [@noauthor_maximum_2021].
 
 \cimg{figs/mss_bench.png}{scale=0.35}{MSS runtime (lower is better)}{Source: Taken from https://futhark-lang.org/performance.html, ref. URL04}
 
-This graph shows performance of a maximum segment sum implementation in Futhark and Thrust (a C++ library developed by NVIDIA for (+^GPU) programming). The sequential runtime is for Futhark code compiled to sequential (+^CPU) code and the Futhark runtime is for code compiled to (+^CUDA) [@henriksen_gotta_2021]. As we can see, the Futhark version is much faster than the sequential and Thrust versions, which justify using this language in this project.
+This graph shows performance of a maximum segment sum implementation in Futhark and Thrust (a C++ library developed by NVIDIA for (+GPU) programming). The sequential runtime is for Futhark code compiled to sequential (+CPU) code and the Futhark runtime is for code compiled to (+CUDA) [@henriksen_gotta_2021]. As we can see, the Futhark version is much faster than the sequential and Thrust versions, which justify using this language in this project.
 
 \pagebreak
 
@@ -41,13 +41,13 @@ echo 12 | ./fact
 
 To compile the Futhark code, we have to specify a backend; this one allows us to compile our code in:
 
-* (+^OpenCL) (opencl, pyopencl),
-* (+^CUDA) (cuda),
-* multi-thread (+^POSIX) C (multicore),
+* (+OpenCL) (opencl, pyopencl),
+* (+CUDA) (cuda),
+* multi-thread (+POSIX) C (multicore),
 * sequential C (c),
 * Python sequential (python).
 
-Here we compile in (+^OpenCL) to run the program on the graphics card, and we run the program with the number 12 as the parameter.
+Here we compile in (+OpenCL) to run the program on the graphics card, and we run the program with the number 12 as the parameter.
 
 ```
 479001600i32
@@ -69,7 +69,7 @@ Functions that can be used in C code must be defined with the `entry` keyword. T
 futhark opencl --lib fact.fut
 ```
 
-Then you have to compile the Futhark code in library mode and specify the backend. Here, the factorial program is compiled in (+^OpenCL). Finally, it generates a `fact.h` and `fact.c` file, which can be included in a C program.
+Then you have to compile the Futhark code in library mode and specify the backend. Here, the factorial program is compiled in (+OpenCL). Finally, it generates a `fact.h` and `fact.c` file, which can be included in a C program.
 
 ```c
 #include <stdio.h>
diff --git a/src/text/06-mpi-x-futhark.md b/src/text/06-mpi-x-futhark.md
index fc795ddee42326d56dc4ecc9a20f6b4f7da4d845..aad73245c572c6a48613ddb2e1bacfa2fdcf2460 100644
--- a/src/text/06-mpi-x-futhark.md
+++ b/src/text/06-mpi-x-futhark.md
@@ -1,4 +1,4 @@
-# Automate cellulaire
+# Cellular Automaton
 
 A cellular automaton consists of a regular grid of cells, each in one of a finite number of states. The grid can be in any finite number of dimensions. For each cell, a set of cells called its neighborhood is defined relative to the specified cell. An initial state (time $t = 0$) is selected by assigning a state for each cell. A new generation is created (advancing t by 1), according to some fixed rule (generally, a mathematical function) that determines the new state of each cell in terms of the current state of the cell and the states of the cells in its neighborhood. Typically, the rule for updating the state of cells is the same for each cell and does not change over time [@noauthor_automate_2021].
 
@@ -44,6 +44,8 @@ In a one-dimensional Cartesian topology, we notice that the rows can communicate
 
 In a two-dimensional Cartesian topology, we notice that rows can communicate directly with their left, right, top, and bottom neighbors. When a row needs to communicate with its diagonal neighbor, we use the default communicator (`MPI_COMM_WORLD`) to communicate directly with each other without going through a neighbor.
 
+\pagebreak
+
 ### Three dimensions
 
 \cimg{figs/communication_3d.png}{scale=0.60}{Example of Cartesian virtual topology in three dimensions}{Source: Created by Baptiste Coudray}
@@ -56,35 +58,50 @@ The cellular automaton is shared as equally possible among the available tasks t
 
 #### One dimension
 
-\cimg{figs/futhark.png}{scale=0.60}{Example of sharing a cellular automaton in one dimension}{Source: Created by Baptiste Coudray}
+\cimg{figs/dispatch_1d.png}{scale=0.60}{Example of sharing a cellular automaton in one dimension}{Source: Created by Baptiste Coudray}
 
 In this example, a cell automaton of dimension one, size 8, is split between three processes. As the division of the cellular automaton is not an integer, rank two have only two cells, unlike the others, which have three.
 
 #### Two dimensions
-\cimg{figs/dispatch_1d.png}{scale=0.60}{Example of sharing a cellular automaton in two dimensions}{Source: Created by Baptiste Coudray}
+\cimg{figs/dispatch_2d.png}{scale=0.60}{Example of sharing a cellular automaton in two dimensions}{Source: Created by Baptiste Coudray}
 
 In this example, the cellular automaton is in two dimensions and of size $9 \times 9$. With four tasks available, it can be separated into four sub-matrices of $3 \times 3$.
 
 #### Three dimensions
 
 In three dimensions, the cellular automaton partitioning representation is challenging to make understandable. Thus, based on the two-dimensional partitioning, each task divides the third dimension.
-For example, a cellular automaton of size $4 \times 4 \times 4$, each process has a piece of size $2 \times 2 \times 2$.
+For example, a cellular automaton of size $4 \times{} 4 \times{} 4$, each process has a piece of size $2 \times{} 2 \times{} 2$.
 
 ### Envelope
 
 The envelope of a chunk represents the missing neighbours of the cells at the extremities of the chunk. These missing cells are needed to compute the next iteration of the chunk of the cellular automaton that the process has.
 
+\pagebreak
+
 #### One dimension
-\cimg{figs/dispatch_1d.png}{scale=0.60}{Example of the envelope of a chunk in one dimension}{Source: Created by Baptiste Coudray}
+\cimg{figs/envelope_1d.png}{scale=0.60}{Example of the envelope of a chunk in one dimension}{Source: Created by Baptiste Coudray}
 
-In one dimension, the Moore neighborhood of a cell includes the west-neighbor and the east-neighbor. Using the previously described one-dimensional cellular automaton, we notice that the envelope of $R_{n}$ includes the last cell of $R_{(n-1) \% N}$ and the first cell of $R_{(n+1) \% N}$. Thus, the ranks exchange data via MPI using the Cartesian virtual topology.
+In one dimension, the Moore neighborhood of a cell includes the west-neighbor and the east-neighbor. We notice that the envelope of $R_{n}$ includes the last cell of $R_{(n-1)\:\%\:N}$ and the first cell of $R_{(n+1)\:\%\:N}$. For example, the envelope for R1 is the west-neighbor (three) and the east-neighbor (seven). Thus, the ranks exchange data via MPI using the Cartesian virtual topology.
+
+\pagebreak
 
 #### Two dimensions
 
+\cimg{figs/envelope_2d.png}{scale=0.60}{Example of the envelope of a chunk in two dimensions}{Source: Created by Baptiste Coudray}
 
-#### Three dimensions
+Using the two-dimensional cellular automaton described above, the chunk envelope of R0 requires eight communications. This example uses a Cartesian topology of size $2 \times 2$ ($m \times n$); the neighbors are recovered as follows:
 
-En troisième dimension, on rajoute la profondeur de devant et la profondeur de derrière 
+1. `North West Neighbors` are sent by $R_{ ((y - 1)\:\%\:m,\:(x - 1)\:\%\:n) }$,
+2. `North Neighbors` are sent by $R_{ ((y - 1)\:\%\:m,\:x) }$,
+3. `North East Neighbors` are sent by $R_{ ((y - 1)\:\%\:m,\:(x + 1)\:\%\:n) }$,
+4. `East Neighbors` are sent by $R_{ (y,\:(x + 1)\:\%\:n) }$,
+5. `South East Neighbors` are sent by $R_{ ((y + 1)\:\%\:m,\:(x + 1)\:\%\:n) }$,
+6. `South Neighbors` are sent by $R_{ ((y + 1)\:\%\:m,\:x) }$,
+7. `South West Neighbors` are sent by $R_{ ((y + 1)\:\%\:m,\:(x - 1)\:\%\:n) }$,
+8. `West Neighbors` are sent by $R_{ (y,\:(x - 1)\:\%\:n) }$.
+
+#### Three dimensions
 
+With a three-dimensional cellular automaton, the envelope of a chunk requires 26 MPI communications.
 
 \pagebreak
diff --git a/src/text/07-automate-elementaire.md b/src/text/07-automate-elementaire.md
index be174ebf0ab15b81972a21a6204ad316fd2a5aaf..cccbdcafa1006ba6a81c94917735e2af4d8a92a3 100644
--- a/src/text/07-automate-elementaire.md
+++ b/src/text/07-automate-elementaire.md
@@ -2,7 +2,7 @@
 
 The simplest non-trivial cellular automaton that can be conceived consists of a one-dimensional grid of cells that can take only two states ("0" or "1"), with a neighborhood consisting, for each cell, of itself and the two cells adjacent to it [@noauthor_automate_2021].
 
-There are $2^3 = 8$ possible configurations (or patterns, rules) of such a neighborhood. In order for the cellular automaton to work, it is necessary to define what the state must be at the next generation of a cell for each of these patterns. The 8 rules/configurations defined is as follows:
+There are $2^3 = 8$ possible configurations (or patterns, rules) of such a neighborhood. In order for the cellular automaton to work, it is necessary to define what the state must be at the next generation of a cell for each of these patterns. The eight rules/configurations defined are as follows:
 
 | Rule n° | East neighbour state | Cell state | West neighbour state | Cell next state |
 |:---:|:---:|:---:|:---:|:---:|
@@ -20,8 +20,7 @@ Table: Evolution rules for a cellule in a one dimensional cellular-automaton
 
 ## Example
 
-\cimg{figs/simple_automate.png}{scale=0.5}{First state of blinker}{Source: Taken from
-\url{https://commons.wikimedia.org/}, ref. URL05. Re-created by Baptiste Coudray}
+\cimg{figs/simple_automate.png}{scale=0.5}{First and second state of a SCA}{Source: Created by Baptiste Coudray}
 
 Iteration 0 is the initial state and only cell two is alive. To perform the next iteration:
 
@@ -31,7 +30,7 @@ Iteration 0 is the initial state and only cell two is alive. To perform the next
 
 ## Parallelized version
 
-With the created library, we implement this (+^SCA) previously described. To do this, we create a Futhark `elementary.fut` file, which is used to calculate the next state of a part of the cellular automaton.
+With the created library, we implement this (+SCA) previously described. To do this, we create a Futhark `elementary.fut` file, which is used to calculate the next state of a part of the cellular automaton.
 
 ```
 let compute_next_elems [n] (chunk_elems :[n]i8) :[]i8 = ...
@@ -82,7 +81,7 @@ Finally, a C file `main.c` is needed to create the program's entry point. We ini
 
 ## CPU Benchmark
 
-We perform benchmarks to validate the scalability of our one-dimensional parallelization when compiling in sequential, multicore, (+^OpenCL), or (+^CUDA) mode. The benchmarks are performed on the HES-GE cluster (Baobab/Yggdrasil).
+We perform benchmarks to validate the scalability of our one-dimensional parallelization when compiling in sequential, multicore, (+OpenCL), or (+CUDA) mode. The benchmarks are performed on the (+HES-GE) cluster (Baobab/Yggdrasil).
 The sequential and multicore benchmarks are performed as follows:
 
 * the cellular automaton is $300,000,000$ cells in size,
@@ -100,7 +99,7 @@ The sequential and multicore benchmarks are performed as follows:
 | 32 | 20.938 [s] | ± 0.007 [s] | x31.4 | 15 |
 | 64 | 11.071 [s] | ± 0.024 [s] | x59.4 | 15 |
 | 128 | 5.316 [s] | ± 0.191 [s] | x123.7 | 15 |
-Table: Results for the parallelized-sequential version of SCA
+Table: Results for the parallelized-sequential version of (+SCA)
 
 | Number of tasks | Average [s] | Standard Derivation [s] | Speedup | Number of measures |
 |:---:|:---:|:---:|:---:|:---:|
@@ -112,7 +111,7 @@ Table: Results for the parallelized-sequential version of SCA
 | 32 | 25.776 [s] | ± 0.725 [s] | x27.5 | 15 |
 | 64 | 12.506 [s] | ± 0.554 [s] | x56.7 | 15 |
 | 128 | 5.816 [s] | ± 0.045 [s] | x121.8 | 15 |
-Table: Results for the parallelized-multicore version of SCA
+Table: Results for the parallelized-multicore version of (+SCA)
 
 \pagebreak
 
@@ -120,11 +119,11 @@ Table: Results for the parallelized-multicore version of SCA
 
 We compare the average computation time for each task and each version (sequential and multicore) on the left graph. On the right graph, we compare the ideal speedup with the parallelized-sequential and multicore version speedup.
 
-The more we increase the number of tasks, the more the execution time is reduced. Thus, the parallelized-sequential or multicore version speedup follows the curve of the ideal speedup.
+The more we increase the number of tasks, the more the execution time is reduced. Thus, the parallelized-sequential or multicore version speedup follows the curve of the ideal speedup. We can see that concurrent computing does not provide a significant performance gain over sequential computing.
 
 ## GPU Benchmark
 
-The (+^OpenCL) and (+^CUDA) benchmarks are performed as follows:
+The (+OpenCL) and (+CUDA) benchmarks are performed as follows:
 
 * the cellular automaton has $300'000'000$ cells,
 * the number of tasks varies between $2^0$ and $2^6$.
@@ -143,7 +142,7 @@ The (+^OpenCL) and (+^CUDA) benchmarks are performed as follows:
 | 16 | 8 | 31.675 [s] | ± 0.056 [s] | x5.2 | 15 |
 | 32 | 8 | 43.65 [s] | ± 0.102 [s] | x3.8 | 15 |
 | 64 | 8 | 67.096 [s] | ± 0.118 [s] | x2.5 | 15 |
-Table: Results for the parallelized-OpenCL version of SCA
+Table: Results for the parallelized-OpenCL version of (+SCA)
 
 | Number of tasks | Number of GPUs | Average [s] | Standard Derivation [s] | Speedup | Number of measures |
 |:---:|:---:|:---:|:---:|:---:|:---:|
@@ -154,12 +153,12 @@ Table: Results for the parallelized-OpenCL version of SCA
 | 16 | 8 | 30.749 [s] | ± 0.069 [s] | x5.2 | 15 |
 | 32 | 8 | 42.352 [s] | ± 0.117 [s] | x3.8 | 15 |
 | 64 | 8 | 65.228 [s] | ± 0.042 [s] | x2.5 | 15 |
-Table: Results for the parallelized-CUDA version of SCA
+Table: Results for the parallelized-CUDA version of (+SCA)
 
 \pagebreak
 
 \cimg{figs/elem_result_and_speedup_gpu.png}{width=\linewidth}{Benchmarks of the SCA in parallelized-OpenCL/CUDA}{Source: Realized by Baptiste Coudray}
 
-With this performance test, we notice that the computation time is essentially the same in OpenCL as in CUDA. Moreover, the parallelization follows the ideal speedup curve when the number of processes equals the number of graphics cards. However, when the eight graphics cards are shared, the speedup in OpenCL/CUDA crashes, and the computation time increases.
+With this performance test, we notice that the computation time is essentially the same in (+OpenCL) as in (+CUDA). Moreover, the parallelization follows the ideal speedup curve when the number of processes equals the number of graphics cards. However, when the eight graphics cards are shared, the speedup in (+OpenCL)/(+CUDA) crashes, and the computation time increases. Moreover, we notice that parallel computation is up to four times faster than sequential/concurrent computation when executing with a single task/graphics card.
 
 \pagebreak
diff --git a/src/text/08-jeu-de-la-vie.md b/src/text/08-jeu-de-la-vie.md
index 7b41a769644f5cddaab222f67a3311fd8d0338f0..a90e597db266a400814b37c2b5b2d60f165c9372 100644
--- a/src/text/08-jeu-de-la-vie.md
+++ b/src/text/08-jeu-de-la-vie.md
@@ -9,8 +9,7 @@ The Game of Life is a zero-player game designed by John Horton Conway in 1970. I
 
 ## Example
 
-\cimg{figs/gol_blinker1.png}{scale=0.40}{First state of blinker}{Source: Taken from
-\url{https://commons.wikimedia.org/}, ref. URL05. Re-created by Baptiste Coudray}
+\cimg{figs/gol_blinker1.png}{scale=0.40}{First state of blinker}{Source: Taken from \url{https://commons.wikimedia.org/}, ref. URL05. Re-created by Baptiste Coudray}
 
 \pagebreak
 
@@ -108,6 +107,6 @@ Table: Results for the parallelized-CUDA version of Game of Life
 
 \cimg{figs/gol_result_and_speedup_gpu.png}{width=\linewidth}{Benchmarks of the game of life in parallelized-OpenCL/CUDA}{Source: Realized by Baptiste Coudray}
 
-With this performance test, we notice that the computation time is essentially the same in OpenCL as in CUDA. Moreover, the parallelization follows the ideal speedup curve when the number of processes equals the number of graphics cards. However, when the eight graphics cards are shared, the speedup in OpenCL/CUDA stabilize, and the computation time increases ($+7 [s]$ between eight tasks and 64 tasks).
+With this performance test, we notice that the computation time is essentially the same in OpenCL as in CUDA. Moreover, the parallelization follows the ideal speedup curve when the number of processes equals the number of graphics cards. However, when the eight graphics cards are shared, the speedup in OpenCL/CUDA stabilize, and the computation time increases ($+7\:[s]$ between eight tasks and 64 tasks).
 
 \pagebreak
diff --git a/src/text/09-lattice-boltzmann.md b/src/text/09-lattice-boltzmann.md
index 053d173e53eebe72f7a3308ad112b38cd45f9fde..0c33cbc4813e4054e60e378345feb3e92f3c8859 100644
--- a/src/text/09-lattice-boltzmann.md
+++ b/src/text/09-lattice-boltzmann.md
@@ -1,4 +1,4 @@
-# Lattice-Boltzmann
+# Lattice-Boltzmann Method
 
 "_The lattice Boltzmann method (LBM) has established itself in the past decades as a valuable approach to Computational
 Fluid Dynamics (CFD). It is commonly used to model time-dependent, incompressible or compressible flows in a
@@ -10,11 +10,11 @@ fundamental research, as it keeps the cycle between the elaboration of a theory
 
 ## Parallelized version
 
-We create the lattice-Boltzmann method with our library to test it with a three-dimensional cellular automaton.
+We implement the Lattice-Boltzmann Method with our library to test it with a three-dimensional cellular automaton.
 
 ## CPU Benchmark
 
-We perform benchmarks to validate the scalability of our three-dimensional parallelization when compiling in sequential, multicore, (+^OpenCL), or (+^CUDA) mode. The benchmarks are performed on the (+^HES-GE) cluster (Baobab/Yggdrasil).
+We perform benchmarks to validate the scalability of our three-dimensional parallelization when compiling in sequential, multicore, (+OpenCL), or (+CUDA) mode. The benchmarks are performed on the (+HES-GE) cluster (Baobab/Yggdrasil).
 The sequential and multicore benchmarks are performed as follows:
 
 * the cellular automaton is $27'000'000$ cells in size,
@@ -32,7 +32,7 @@ The sequential and multicore benchmarks are performed as follows:
 | 32 | 41.04 [s] | ± 1.59 [s] | x17.4 | 15 |
 | 64 | 22.188 [s] | ± 0.321 [s] | x32.3 | 15 |
 | 128 | 17.415 [s] | ± 4.956 [s] | x41.1 | 15 |
-Table: Results for the parallelized-sequential version of Lattice-Boltzmann
+Table: Results for the parallelized-sequential version of (+LBM)
 
 | Number of tasks | Average [s] | Standard Derivation [s] | Speedup | Number of measures |
 |:---:|:---:|:---:|:---:|:---:|
@@ -44,17 +44,17 @@ Table: Results for the parallelized-sequential version of Lattice-Boltzmann
 | 32 | 46.285 [s] | ± 0.138 [s] | x15.0 | 15 |
 | 64 | 24.059 [s] | ± 0.061 [s] | x28.9 | 15 |
 | 128 | 16.614 [s] | ± 1.088 [s] | x41.9 | 15 |
-Table: Results for the parallelized-multicore version of Lattice-Boltzmann
+Table: Results for the parallelized-multicore version of (+LBM)
 
 \pagebreak
 
-\cimg{figs/lbm_result_and_speedup_cpu.png}{width=\linewidth}{Benchmarks of the lattice-Boltzmann method in parallelized-sequential/multicore}{Source: Realized by Baptiste Coudray}
+\cimg{figs/lbm_result_and_speedup_cpu.png}{width=\linewidth}{Benchmarks of the LBM in parallelized-sequential/multicore}{Source: Realized by Baptiste Coudray}
 
-Contrairement aux benchmarks précédents, les speedups ne suivent pas la courbe du speedup idéal. En effet, que ce soit en sequential ou en multicore, nous obtenons un speedup maximal avec 128 tâches de x41 alors qu'on espérait avoir un speedup de x128.
+Contrary to the previous benchmarks, the speedups do not follow the ideal speedup curve. Indeed, whether in sequential or multicore mode, we obtain a maximum speedup of x41 with 128 tasks, while we were hoping for a speedup of x128.
 
 ## GPU Benchmark
 
-The (+^OpenCL) and (+^CUDA) benchmarks are performed as follows:
+The (+OpenCL) and (+CUDA) benchmarks are performed as follows:
 
 * the cellular automaton has $27'000'000$ cells,
 * the number of tasks varies between $2^0$ and $2^6$.
@@ -62,4 +62,30 @@ The (+^OpenCL) and (+^CUDA) benchmarks are performed as follows:
 * the iteration is computed $3'000$ times.
 * From $2^0$ to $2^3$ tasks, an NVIDIA GeForce RTX 3090 is allocated for each task; beyond that, the eight graphics cards are shared equally among the ranks.
 
+| Number of tasks | Number of GPUs | Average [s] | Standard Deviation [s] | Speedup | Number of measures |
+|:---:|:---:|:---:|:---:|:---:|:---:|
+| 1 | 1 | 210.347 [s] | ± 0.096 [s] | x1.0 | 15 |
+| 2 | 2 | 99.677 [s] | ± 0.038 [s] | x2.1 | 15 |
+| 4 | 4 | 40.71 [s] | ± 0.076 [s] | x5.2 | 15 |
+| 8 | 8 | 20.8 [s] | ± 0.031 [s] | x10.1 | 15 |
+| 16 | 8 | 22.88 [s] | ± 0.064 [s] | x9.2 | 15 |
+| 32 | 8 | 22.47 [s] | ± 0.036 [s] | x9.4 | 15 |
+| 64 | 8 | 23.848 [s] | ± 0.035 [s] | x8.8 | 15 |
+Table: Results for the parallelized-OpenCL version of (+LBM)
+
+| Number of tasks | Number of GPUs | Average [s] | Standard Deviation [s] | Speedup | Number of measures |
+|:---:|:---:|:---:|:---:|:---:|:---:|
+| 1 | 1 | 207.683 [s] | ± 0.249 [s] | x1.0 | 15 |
+| 2 | 2 | 99.177 [s] | ± 0.056 [s] | x2.1 | 15 |
+| 4 | 4 | 40.24 [s] | ± 0.074 [s] | x5.2 | 15 |
+| 8 | 8 | 20.459 [s] | ± 0.037 [s] | x10.2 | 15 |
+| 16 | 8 | 22.837 [s] | ± 0.037 [s] | x9.1 | 15 |
+| 32 | 8 | 22.361 [s] | ± 0.024 [s] | x9.3 | 15 |
+| 64 | 8 | 23.688 [s] | ± 0.051 [s] | x8.8 | 15 |
+Table: Results for the parallelized-CUDA version of (+LBM)
+
+\cimg{figs/lbm_result_and_speedup_gpu.png}{width=\linewidth}{Benchmarks of the LBM in parallelized-OpenCL/CUDA}{Source: Realized by Baptiste Coudray}
+
+Like the other benchmarks, there is very little difference between the OpenCL and CUDA versions (computation time and speedup). We get a more than ideal speedup with 2, 4, and 8 tasks/GPUs (x2.1, x5.2, and x10.2, respectively). When several tasks use the same graphics card, the computation time stabilizes at 22 seconds, and the speedup stops increasing.
+
 \pagebreak
diff --git a/src/text/10-conclusion.md b/src/text/10-conclusion.md
index e2c5c985e276ce5927d10ed8b06006b64ba23bfc..26013bbc4770391a0aebe5300fedf3fc767c8697 100644
--- a/src/text/10-conclusion.md
+++ b/src/text/10-conclusion.md
@@ -1,3 +1,6 @@
 # Conclusion
 
+In this project, we created a library that distributes a one-, two-, or three-dimensional cellular automaton over several computation nodes via MPI. Thanks to the different Futhark backends, the update of the cellular automaton can be done with sequential, concurrent, or parallel computation. Thus, we compared these different modes by implementing a cellular automaton in one dimension ((+SCA)), in two dimensions (Game of Life), and in three dimensions ((+LBM)). Benchmarks for each backend were performed to verify the scalability of the library. We obtained ideal speedups with the cellular automata in one and two dimensions when using the sequential and multicore Futhark backends. With these two backends and a three-dimensional cellular automaton, we had a maximum speedup of x41 with 128 tasks. Concerning the OpenCL and CUDA backends, they show no difference in performance between them. For the three cellular automata, the speedup is ideal only when the number of tasks is equal to the number of GPUs.
+Finally, the library could be improved to obtain an ideal speedup in three dimensions with the CPU backends. Moreover, adding load balancing of the graphics cards would yield better performance when there are more tasks than GPUs, and supporting the Von Neumann neighborhood would allow other cellular automata to be handled.
+
 \pagebreak
diff --git a/src/text/ZZ-glossaire.tex b/src/text/ZZ-glossaire.tex
index 2e0da749a9eeba72fac56fe1d0dc0be830fc99c7..4de72d916ce2a3870e66323d521682365ce7898f 100644
--- a/src/text/ZZ-glossaire.tex
+++ b/src/text/ZZ-glossaire.tex
@@ -16,3 +16,4 @@
 \newacronym{IO}{I/O}{Input/Output}
 \newacronym{MSS}{MSS}{Maximum Segment Sum}
 \newacronym{SCA}{SCA}{Simple Cellular Automaton}
+\newacronym{LBM}{LBM}{Lattice-Boltzmann Method}