diff --git a/src/figs/amdahls-law.png b/src/figs/amdahls-law.png new file mode 100644 index 0000000000000000000000000000000000000000..60fdf744578d7fe94a950d3dab2efb696db5f618 Binary files /dev/null and b/src/figs/amdahls-law.png differ diff --git a/src/figs/elem_result_and_speedup_cpu.png b/src/figs/elem_result_and_speedup_cpu.png new file mode 100644 index 0000000000000000000000000000000000000000..090d06e5d817386f131efbbc467ab37db43743d1 Binary files /dev/null and b/src/figs/elem_result_and_speedup_cpu.png differ diff --git a/src/figs/elem_result_and_speedup_gpu.png b/src/figs/elem_result_and_speedup_gpu.png new file mode 100644 index 0000000000000000000000000000000000000000..027a6e57e43bee9f855e9a4f0b12daa84be2c8f6 Binary files /dev/null and b/src/figs/elem_result_and_speedup_gpu.png differ diff --git a/src/figs/gol_result_and_speedup_cpu.png b/src/figs/gol_result_and_speedup_cpu.png index 0ef2be6adf8e25cd97a4fdadee00bffb5c3fad71..78d09dc22e0845af445c4502c4a3d1f9b2dd9887 100644 Binary files a/src/figs/gol_result_and_speedup_cpu.png and b/src/figs/gol_result_and_speedup_cpu.png differ diff --git a/src/figs/gol_result_and_speedup_gpu.png b/src/figs/gol_result_and_speedup_gpu.png index f2ea7ad06c0200a4914fbfa786855b77f48a5dc1..af4c6eaea4f9507a32ad7b5bc24cfb95e288d6ea 100644 Binary files a/src/figs/gol_result_and_speedup_gpu.png and b/src/figs/gol_result_and_speedup_gpu.png differ diff --git a/src/figs/gustafson-law.png b/src/figs/gustafson-law.png new file mode 100644 index 0000000000000000000000000000000000000000..14841a52957c10fd7a13d177751c8b5f085d63db Binary files /dev/null and b/src/figs/gustafson-law.png differ diff --git a/src/figs/lbm_result_and_speedup_cpu.png b/src/figs/lbm_result_and_speedup_cpu.png new file mode 100644 index 0000000000000000000000000000000000000000..2f00c2604779d2addf407ce00a8459ac5e242882 Binary files /dev/null and b/src/figs/lbm_result_and_speedup_cpu.png differ diff --git a/src/figs/mss_bench.png b/src/figs/mss_bench.png new file mode 100644 index 0000000000000000000000000000000000000000..46792ed82e71d8b46902401a43c8e55f5d72caf4 Binary files /dev/null and b/src/figs/mss_bench.png differ diff --git a/src/figs/sca_result_and_speedup.png b/src/figs/sca_result_and_speedup.png deleted file mode 100644 index a76765eb03b6adafd4d7ad7c11c20475efe94d96..0000000000000000000000000000000000000000 Binary files a/src/figs/sca_result_and_speedup.png and /dev/null differ diff --git a/src/my.bib b/src/my.bib index 8369d955857fde9f2322eaf06cda09abc8b83721..4d6eefdd64e5456855f3768a5df84e657ea02713 100644 --- a/src/my.bib +++ b/src/my.bib @@ -1,125 +1,236 @@ -@online{kendall_mpi_nodate, - title = {{MPI} Send and Receive · {MPI} Tutorial}, - url = {https://mpitutorial.com/tutorials/mpi-send-and-receive/}, - author = {Kendall, Wes}, - urldate = {2021-03-03}, - file = {MPI Send and Receive · MPI Tutorial:/Users/baptistecdr/Zotero/storage/Y5W9BQBP/mpi-send-and-receive.html:text/html}, + +@inreference{noauthor_message_2021, + title = {Message Passing Interface}, + rights = {Creative Commons Attribution-{ShareAlike} License}, + url = {https://fr.wikipedia.org/w/index.php?title=Message_Passing_Interface}, + abstract = {Message Passing Interface ({MPI}), est une norme conçue en 1993-94 pour le passage de messages entre ordinateurs distants ou dans un ordinateur multiprocesseur. Elle est devenue de facto un standard de communication pour des nœuds exécutant des programmes parallèles sur des systèmes à mémoire distribuée. 
Elle définit une bibliothèque de fonctions, utilisable avec les langages C, C++ et Fortran.
+{MPI} a été écrite pour obtenir de bonnes performances aussi bien sur des machines massivement parallèles à mémoire partagée que sur des clusters d'ordinateurs hétérogènes à mémoire distribuée. Elle est disponible sur de très nombreux matériels et systèmes d'exploitation. Ainsi, {MPI} possède l'avantage par rapport aux plus vieilles bibliothèques de passage de messages d'être grandement portable (car {MPI} a été implémentée sur presque toutes les architectures de mémoires) et rapide (car chaque implémentation a été optimisée pour le matériel sur lequel il s'exécute).
+Depuis 1997, une nouvelle version de {MPI} est disponible, {MPI}-2, qui apporte quelques puissantes[évasif] fonctionnalités supplémentaires[Lesquels ?].
+Depuis 2015, une nouvelle version de {MPI} est disponible, {MPI}-3, qui apporte des écritures parallèles dans les fichiers.
+Depuis 2020, une nouvelle version de {MPI} est disponible, {MPI}-4, qui apporte le {RDMA} et la prévision de détection automatique en cas de panne. ({MPI} Forum)},
+	booktitle = {Wikipédia},
+	urldate = {2021-07-22},
+	date = {2021-03-17},
+	langid = {french},
+	note = {Page Version {ID}: 180967726},
+	file = {Snapshot:/Users/baptistecdr/Zotero/storage/GLQPSHE4/index.html:text/html},
 }
 
-@inreference{noauthor_jeu_2020,
-	title = {Jeu de la vie},
-	rights = {Creative Commons Attribution-{ShareAlike} License},
-	url = {https://fr.wikipedia.org/w/index.php?title=Jeu_de_la_vie},
-	abstract = {Le jeu de la vie est un automate cellulaire imaginé par John Horton Conway en 1970 et qui est probablement le plus connu de tous les automates cellulaires. Malgré des règles très simples, le jeu de la vie est Turing-complet.
-Le jeu de la vie est un jeu de simulation au sens mathématique plutôt que ludique. Bien que n'étant pas décrit par la théorie des jeux, certains le décrivent comme un « jeu à zéro joueur ».},
-	booktitle = {Wikipédia},
-	urldate = {2021-03-03},
-	date = {2020-11-24},
-	langid = {french},
-	file = {Snapshot:/Users/baptistecdr/Zotero/storage/ECF2P336/index.html:text/html},
+@online{kendall_mpi_2018,
+	title = {{MPI} Send and Receive · {MPI} Tutorial},
+	url = {https://mpitutorial.com/tutorials/mpi-send-and-receive/},
+	titleaddon = {{MPI} Tutorial},
+	author = {Kendall, Wes},
+	urldate = {2021-07-22},
+	date = {2018-05-18},
+	langid = {english},
+	file = {MPI Send and Receive · MPI Tutorial:/Users/baptistecdr/Zotero/storage/29HNMMMK/mpi-send-and-receive.html:text/html},
 }
 
-@online{noauthor_basic_nodate,
-	title = {Basic usage with the factorial function},
-	url = {https://futhark-lang.org/examples/fact.html},
-	urldate = {2021-03-03},
-	file = {Basic usage with the factorial function:/Users/baptistecdr/Zotero/storage/B6VQZVH5/fact.html:text/html},
+@inreference{noauthor_amdahls_2021,
+	title = {Amdahl's law},
+	rights = {Creative Commons Attribution-{ShareAlike} License},
+	url = {https://en.wikipedia.org/w/index.php?title=Amdahl%27s_law&oldid=1034193438},
+	abstract = {In computer architecture, Amdahl's law (or Amdahl's argument) is a formula which gives the theoretical speedup in latency of the execution of a task at fixed workload that can be expected of a system whose resources are improved. It is named after computer scientist Gene Amdahl, and was presented at the {AFIPS} Spring Joint Computer Conference in 1967.
+Amdahl's law is often used in parallel computing to predict the theoretical speedup when using multiple processors. 
For example, if a program needs 20 hours to complete using a single thread, but a one-hour portion of the program cannot be parallelized, therefore only the remaining 19 hours (p = 0.95) of execution time can be parallelized, then regardless of how many threads are devoted to a parallelized execution of this program, the minimum execution time cannot be less than one hour. Hence, the theoretical speedup is limited to at most 20 times the single thread performance, $\left(\frac{1}{1-p}=20\right)$.},
+	booktitle = {Wikipedia},
+	urldate = {2021-07-22},
+	date = {2021-07-18},
+	langid = {english},
+	note = {Page Version {ID}: 1034193438},
+	file = {Snapshot:/Users/baptistecdr/Zotero/storage/4KJVD4JN/index.html:text/html},
 }
 
-@inreference{noauthor_message_2020,
-	title = {Message Passing Interface},
-	rights = {Creative Commons Attribution-{ShareAlike} License},
-	url = {https://fr.wikipedia.org/w/index.php?title=Message_Passing_Interface},
-	abstract = {Message Passing Interface ({MPI}), est une norme conçue en 1993-94 pour le passage de messages entre ordinateurs distants ou dans un ordinateur multiprocesseur. Elle est devenue de facto un standard de communication pour des nœuds exécutant des programmes parallèles sur des systèmes à mémoire distribuée. Elle définit une bibliothèque de fonctions, utilisable avec les langages C, C++ et Fortran.
-{MPI} a été écrite pour obtenir de bonnes performances aussi bien sur des machines massivement parallèles à mémoire partagée que sur des clusters d'ordinateurs hétérogènes à mémoire distribuée. Elle est disponible sur de très nombreux matériels et systèmes d'exploitation. Ainsi, {MPI} possède l'avantage par rapport aux plus vieilles bibliothèques de passage de messages d'être grandement portable (car {MPI} a été implémentée sur presque toutes les architectures de mémoires) et rapide (car chaque implémentation a été optimisée pour le matériel sur lequel il s'exécute).
-Depuis 1997, une nouvelle version de {MPI} est disponible, {MPI}-2, qui apporte quelques puissantes[évasif] fonctionnalités supplémentaires[Lesquels ?].},
-	booktitle = {Wikipédia},
-	urldate = {2021-03-03},
-	date = {2020-08-18},
-	langid = {french},
-	file = {Snapshot:/Users/baptistecdr/Zotero/storage/LYSKJH5C/index.html:text/html},
 }
 
+@inreference{noauthor_gustafsons_2021,
+	title = {Gustafson's law},
+	rights = {Creative Commons Attribution-{ShareAlike} License},
+	url = {https://en.wikipedia.org/w/index.php?title=Gustafson%27s_law&oldid=1031307338},
+	abstract = {In computer architecture, Gustafson's law (or Gustafson–Barsis's law) gives the theoretical speedup in latency of the execution of a task at fixed execution time that can be expected of a system whose resources are improved. It is named after computer scientist John L. Gustafson and his colleague Edwin H. 
Barsis, and was presented in the article Reevaluating Amdahl's Law in 1988.}, + booktitle = {Wikipedia}, + urldate = {2021-07-22}, + date = {2021-06-30}, + langid = {english}, + note = {Page Version {ID}: 1031307338}, + file = {Snapshot:/Users/baptistecdr/Zotero/storage/FGKVKJQD/index.html:text/html}, } -@online{noauthor_mpi_nodate, - title = {{MPI} Forum}, - url = {https://www.mpi-forum.org/}, - urldate = {2021-03-03}, - langid = {english}, - file = {Snapshot:/Users/baptistecdr/Zotero/storage/FZMLQQ7X/www.mpi-forum.org.html:text/html}, +@online{henriksen_gotta_2021, + title = {Gotta Go Fast!}, + url = {https://futhark-lang.org/performance.html}, + titleaddon = {The Futhark Programming Language}, + author = {Henriksen, Troels}, + urldate = {2021-07-22}, + date = {2021-03-20}, + langid = {english}, + file = {Gotta Go Fast!:/Users/baptistecdr/Zotero/storage/ZCTX7QC3/performance.html:text/html}, } -@online{noauthor_filegol-blinker1png_nodate, - title = {File:Gol-blinker1.png}, - url = {https://commons.wikimedia.org/wiki/File:Gol-blinker1.png}, - shorttitle = {File}, - urldate = {2021-03-03}, - langid = {english}, - file = {Snapshot:/Users/baptistecdr/Zotero/storage/9KAMIL8Z/FileGol-blinker1.html:text/html}, +@online{henriksen_basic_2021, + title = {Basic usage with the factorial function}, + url = {https://futhark-lang.org/examples/fact.html}, + titleaddon = {The Futhark Programming Language}, + author = {Henriksen, Troels}, + urldate = {2021-07-22}, + date = {2021-04-11}, + file = {Basic usage with the factorial function:/Users/baptistecdr/Zotero/storage/YYAGN352/fact.html:text/html}, } -@online{noauthor_filegol-blinker2png_nodate, - title = {File:Gol-blinker2.png}, - url = {https://commons.wikimedia.org/wiki/File:Gol-blinker2.png}, - shorttitle = {File}, - urldate = {2021-03-03}, - langid = {english}, - file = {Snapshot:/Users/baptistecdr/Zotero/storage/NJBL98JF/FileGol-blinker2.html:text/html}, +@article{latt_palabos_2020, + title = {Palabos: Parallel Lattice Boltzmann Solver}, + volume = {81}, + doi = {10.1016/j.camwa.2020.03.022}, + abstract = {We present the scope, concepts, data structures and application programming models of the open-source Lattice Boltzmann library Palabos. Palabos is a C++ software platform developed since 2010 for Computational Fluid Dynamics simulations and Lattice Boltzmann modeling, which specifically targets applications with complex, coupled physics. The software proposes a very broad modeling framework, capable of addressing a large number of applications of interest in the Lattice Boltzmann community, yet exhibits solid computational performance. The article describes the philosophy of this programming framework and lists the models already implemented. 
Finally, benchmark simulations are provided which serve as a proof of quality of the implemented core functionalities.}, + journaltitle = {Computers \& Mathematics with Applications}, + shortjournal = {Computers \& Mathematics with Applications}, + author = {Latt, Jonas and Malaspinas, Orestis and Kontaxakis, Dimitrios and Parmigiani, Andrea and Lagrava, Daniel and Brogi, Federico and Ben Belgacem, Mohamed and Thorimbert, Yann and Leclaire, Sébastien and Li, Sha and Marson, Francesco and Lemus, Jonathan and Kotsalos, Christos and Conradin, Raphaël and Coreixas, Christophe and Petkantchin, Rémy and Raynaud, Franck and Beny, Joel and Chopard, Bastien}, + date = {2020-04-01}, } -@artwork{wikipedia_speed-up_2008, - title = {The speed-up of a program from parallelization is limited by how much of the program can be parallelized.}, - url = {https://commons.wikimedia.org/wiki/File:AmdahlsLaw.svg}, - shorttitle = {Amdahl's law}, - author = {Wikipedia, Daniels220}, - urldate = {2021-03-03}, - date = {2008-04-13}, - file = {Wikimedia Snapshot:/Users/baptistecdr/Zotero/storage/IQD37NXE/FileAmdahlsLaw.html:text/html}, +@article{macdonald_writing_nodate, + title = {Writing Message-Passing Parallel Programs with {MPI}}, + pages = {92}, + author = {{MacDonald}, Neil and Minty, Elspeth and Harding, Tim and Brown, Simon}, + langid = {english}, + file = {mpi_course.pdf:/Users/baptistecdr/Zotero/storage/HYDWR2SY/mpi_course.pdf:application/pdf}, } -@artwork{peahihawaii_speedup_nodate, - title = {Speedup according to Gustafson's Law.}, - rights = {Permission is granted to copy, distribute and/or modify this document under the terms of the {GNU} Free Documentation License, Version 1.2 or any later version published by the Free Software Foundation; with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. A copy of the license is included in the section entitled {GNU} Free Documentation License.http://www.gnu.org/copyleft/fdl.{htmlGFDLGNU} Free Documentation Licensetruetrue}, - url = {https://commons.wikimedia.org/wiki/File:Gustafson.png}, - shorttitle = {Gustafson's law}, - author = {{Peahihawaii}}, - urldate = {2021-03-03}, - file = {Wikimedia Snapshot:/Users/baptistecdr/Zotero/storage/FPBB7RUJ/FileGustafson.html:text/html}, +@inreference{noauthor_jeu_2021, + title = {Jeu de la vie}, + rights = {Creative Commons Attribution-{ShareAlike} License}, + url = {https://fr.wikipedia.org/w/index.php?title=Jeu_de_la_vie&oldid=183190635}, + abstract = {Le jeu de la vie est un automate cellulaire imaginé par John Horton Conway en 1970 et qui est probablement le plus connu de tous les automates cellulaires. Malgré des règles très simples, le jeu de la vie est Turing-complet. +Le jeu de la vie est un jeu de simulation au sens mathématique plutôt que ludique. 
Bien que n'étant pas décrit par la théorie des jeux, certains le décrivent comme un « jeu à zéro joueur ».}, + booktitle = {Wikipédia}, + urldate = {2021-07-22}, + date = {2021-05-23}, + langid = {french}, + note = {Page Version {ID}: 183190635}, + file = {Snapshot:/Users/baptistecdr/Zotero/storage/HDKB5PPW/index.html:text/html}, } -@inreference{noauthor_parallelisme_2021, - title = {Parallélisme (informatique)}, - rights = {Creative Commons Attribution-{ShareAlike} License}, - url = {https://fr.wikipedia.org/w/index.php?title=Parall%C3%A9lisme_(informatique)}, - abstract = {En informatique, le parallélisme consiste à mettre en œuvre des architectures d'électronique numérique permettant de traiter des informations de manière simultanée, ainsi que les algorithmes spécialisés pour celles-ci. Ces techniques ont pour but de réaliser le plus grand nombre d'opérations en un temps le plus petit possible. -Les architectures parallèles sont devenues le paradigme dominant pour tous les ordinateurs depuis les années 2000. En effet, la vitesse de traitement qui est liée à l'augmentation de la fréquence des processeurs connait des limites. La création de processeurs multi-cœurs, traitant plusieurs instructions en même temps au sein du même composant, résout ce dilemme pour les machines de bureau depuis le milieu des années 2000. -Pour être efficaces, les méthodes utilisées pour la programmation des différentes tâches qui constituent un programme sont spécifiques à ce mode de calcul, c'est-à-dire que les programmes doivent être réalisés avec cette optique. Ces méthodes ont initialement été développées de manière théorique et sur des superordinateurs, qui étaient à une période les seuls à compter de nombreux processeurs, mais sont de plus en plus volontiers utilisées par les développeurs de logiciel du fait de l'omniprésence de telles architectures. -Certains types de calculs se prêtent particulièrement bien à la parallélisation : la dynamique des fluides, les prédictions météorologiques, la modélisation et simulation de problèmes de dimensions plus grandes, le traitement de l'information et l'exploration de données, le décryptage de messages, la recherche de mots de passe, le traitement d'images ou la fabrication d'images de synthèse, tels que le lancer de rayon, l'intelligence artificielle et la fabrication automatisée. Initialement, c'est dans le domaine des supercalculateurs que le parallélisme a été utilisé, à des fins scientifiques.}, - booktitle = {Wikipédia}, - urldate = {2021-03-03}, - date = {2021-02-07}, - langid = {french}, - file = {Snapshot:/Users/baptistecdr/Zotero/storage/92MTSL2B/index.html:text/html}, +@inreference{noauthor_automate_2021, + title = {Automate cellulaire}, + rights = {Creative Commons Attribution-{ShareAlike} License}, + url = {https://fr.wikipedia.org/w/index.php?title=Automate_cellulaire&oldid=183026782}, + abstract = {Un automate cellulaire consiste en une grille régulière de « cellules » contenant chacune un « état » choisi parmi un ensemble fini et qui peut évoluer au cours du temps. L'état d'une cellule au temps t+1 est fonction de l'état au temps t d'un nombre fini de cellules appelé son « voisinage ». À chaque nouvelle unité de temps, les mêmes règles sont appliquées simultanément à toutes les cellules de la grille, produisant une nouvelle « génération » de cellules dépendant entièrement de la génération précédente. 
+Étudiés en mathématiques et en informatique théorique, les automates cellulaires sont à la fois un modèle de système dynamique discret et un modèle de calcul. Le modèle des automates cellulaires est remarquable par l'écart entre la simplicité de sa définition et la complexité que peuvent atteindre certains comportements macroscopiques : l'évolution dans le temps de l'ensemble des cellules ne se réduit pas (simplement) à la règle locale qui définit le système. À ce titre il constitue un des modèles standards dans l'étude des systèmes complexes.}, + booktitle = {Wikipédia}, + urldate = {2021-07-22}, + date = {2021-05-18}, + langid = {french}, + note = {Page Version {ID}: 183026782}, + file = {Snapshot:/Users/baptistecdr/Zotero/storage/L5L9W28B/index.html:text/html}, } @inreference{noauthor_programmation_2021, - title = {Programmation fonctionnelle}, - rights = {Creative Commons Attribution-{ShareAlike} License}, - url = {https://fr.wikipedia.org/w/index.php?title=Programmation_fonctionnelle}, - abstract = {La programmation fonctionnelle est un paradigme de programmation de type déclaratif qui considère le calcul en tant qu'évaluation de fonctions mathématiques. + title = {Programmation fonctionnelle}, + rights = {Creative Commons Attribution-{ShareAlike} License}, + url = {https://fr.wikipedia.org/w/index.php?title=Programmation_fonctionnelle&oldid=183271341}, + abstract = {La programmation fonctionnelle est un paradigme de programmation de type déclaratif qui considère le calcul en tant qu'évaluation de fonctions mathématiques. Comme le changement d'état et la mutation des données ne peuvent pas être représentés par des évaluations de fonctions la programmation fonctionnelle ne les admet pas, au contraire elle met en avant l'application des fonctions, contrairement au modèle de programmation impérative qui met en avant les changements d'état. -Un langage fonctionnel est donc un langage de programmation dont la syntaxe et les caractéristiques encouragent la programmation fonctionnelle. Alors que l'origine de la programmation fonctionnelle peut être trouvée dans le lambda-calcul, le langage fonctionnel le plus ancien est Lisp, créé en 1958 par {McCarthy}. Lisp a donné naissance à des variantes telles que Scheme (1975) et Common Lisp (1984) qui, comme Lisp, ne sont pas ou peu typées. Des langages fonctionnels plus récents tels {ML} (1973), Haskell (1987), {OCaml}, Erlang, Clean et Oz, {CDuce}, Scala (2003), F\# ou {PureScript} (2013), Agda (en) sont fortement typés.}, - booktitle = {Wikipédia}, - urldate = {2021-03-10}, - date = {2021-01-27}, - langid = {french}, - file = {Snapshot:/Users/baptistecdr/Zotero/storage/3ZY5HPXJ/index.html:text/html}, +Un langage fonctionnel est donc un langage de programmation dont la syntaxe et les caractéristiques encouragent la programmation fonctionnelle. Alors que l'origine de la programmation fonctionnelle peut être trouvée dans le lambda-calcul, le langage fonctionnel le plus ancien est Lisp, créé en 1958 par {McCarthy}. Lisp a donné naissance à des variantes telles que Scheme (1975) et Common Lisp (1984) qui, comme Lisp, ne sont pas ou peu typées. 
Des langages fonctionnels plus récents tels {ML} (1973), Haskell (1987), {OCaml}, Erlang, Clean et Oz, {CDuce}, Scala (2003), F\# ou {PureScript} (2013), Agda (en) sont fortement typés.},
+	booktitle = {Wikipédia},
+	urldate = {2021-07-22},
+	date = {2021-05-26},
+	langid = {french},
+	note = {Page Version {ID}: 183271341},
+	file = {Snapshot:/Users/baptistecdr/Zotero/storage/Z4UFD79Y/index.html:text/html},
 }
 
-@artwork{muidark_elder_nodate,
-	title = {Elder futhark},
-	url = {https://commons.wikimedia.org/wiki/File:Elder_futhark.png},
-	shorttitle = {Elder futhark},
-	author = {{MuiDark}},
-	urldate = {2021-03-13},
-	file = {Wikimedia Snapshot:/Users/baptistecdr/Zotero/storage/YG6H6Z6R/FileElder_futhark.html:text/html},
-}
+@inreference{noauthor_maximum_2021,
+	title = {Maximum subarray problem},
+	rights = {Creative Commons Attribution-{ShareAlike} License},
+	url = {https://en.wikipedia.org/w/index.php?title=Maximum_subarray_problem&oldid=1030176929},
+	abstract = {In computer science, the maximum sum subarray problem is the task of finding a contiguous subarray with the largest sum, within a given one-dimensional array A[1...n] of numbers. Formally, the task is to find indices $i$ and $j$ with $1\leq i\leq j\leq n$, such that the sum $\sum_{x=i}^{j}A[x]$ is as large as possible. (Some formulations of the problem also allow the empty subarray to be considered; by convention, the sum of all values of the empty subarray is zero.) Each number in the input array A could be positive, negative, or zero. For example, for the array of values [−2, 1, −3, 4, −1, 2, 1, −5, 4], the contiguous subarray with the largest sum is [4, −1, 2, 1], with sum 6.
+Some properties of this problem are:
+
+If the array contains all non-negative numbers, then the problem is trivial; a maximum subarray is the entire array.
+If the array contains all non-positive numbers, then a solution is any subarray of size 1 containing the maximal value of the array (or the empty subarray, if it is permitted).
+Several different sub-arrays may have the same maximum sum. This problem can be solved using several different algorithmic techniques, including brute force, divide and conquer, dynamic programming, and reduction to shortest paths.},
+	booktitle = {Wikipedia},
+	urldate = {2021-07-22},
+	date = {2021-06-24},
+	langid = {english},
+	note = {Page Version {ID}: 1030176929},
+	file = {Snapshot:/Users/baptistecdr/Zotero/storage/LL8NK2KY/index.html:text/html},
+}
\ No newline at end of file
diff --git a/src/text/00-preface.md b/src/text/00-preface.md
index 96ae62dc71b232f667e3fcce4211ebafa4c0ed55..de32563f1cbc08a155e47ae0845065d81b202414 100644
--- a/src/text/00-preface.md
+++ b/src/text/00-preface.md
@@ -4,7 +4,8 @@ I would like to thank the people who helped me during this project:
 
 * Dr. Orestis Malaspinas, for his supervision and help on the project,
 * Michaël El Kharroubi, for his help on the project,
-* Dr. Troels Henriksen, for his answers to my questions,
+* Dr. Troels Henriksen, for his answers to my questions about Futhark,
+* Yann Sagon, for his answers to my questions about Baobab/Yggdrasil,
 * Theo Pirkl, for the model of the bachelor thesis. 
# Abstract {-}
diff --git a/src/text/03-programmation-parallele.md b/src/text/03-programmation-parallele.md
index 13801dcc50005b9736225e0a0f0a1729fb98ed24..d7c1e26541d466319acf010c29761a3e5ff6c1b3 100644
--- a/src/text/03-programmation-parallele.md
+++ b/src/text/03-programmation-parallele.md
@@ -1,31 +1,27 @@
-# Distributed computing vs parallel computing vs concurrent computing
+# Distributed and parallel computing
 
-Parallel computing refers to the process of breaking down larger problems into smaller, independent, often similar parts that can be executed simultaneously by multiple processors communicating via shared memory, the results of which are combined upon completion as part of an overall algorithm. The primary goal of parallel computing is to increase available computation power for faster application processing and problem solving.
+"Parallel computing refers to the process of breaking down larger problems into smaller, independent, often similar parts that can be executed simultaneously by multiple processors communicating via shared memory, the results of which are combined upon completion as part of an overall algorithm. The primary goal of parallel computing is to increase available computation power for faster application processing and problem-solving."
 
+## Amdahl's law vs Gustafson–Barsis's law
 
-Dans le parallélisme, il existe deux lois importantes :
+In parallel computing, two important laws give the theoretical speedup that can be expected of a system whose resources are improved: Amdahl's law and Gustafson–Barsis's law.
 
-1. la loi d'Amdahl
-2. la loi de Gustafon-Barsis
+\cimg{figs/amdahls-law.png}{scale=0.6}{Amdahl's law}{Source: Taken from https://commons.wikimedia.org/, ref. URL02}
 
-\cimg{figs/amdahls-law.png}{scale=0.6}{Loi d'Amdahl}{Source : Tiré de https://commons.wikimedia.org/, ref. URL02}
+Amdahl's law states that the program's overall speed is limited by the code that cannot be parallelized. Indeed, there will almost always be a sequential part in a code that cannot be parallelized. There is, therefore, a relationship between the ratio of parallelizable code and the overall execution speed of the program. [@noauthor_amdahls_2021]
 
-La loi d'Amdahls affirme que la vitesse globale du programme est limitée par le code qui ne peut être parallélisée. En
-effet, dans un code il y aura presque toujours une partie séquentielle non parallélisable. Il y a donc une relation
-entre le ratio de code parallélisable et la vitesse globale d'exécution du programme.
+In the graph above, we notice that if:
 
-Dans le graphique ci-dessus, on remarque que si :
-
-* 50 % du code est parallélisé, alors, on obtient une accélération théorique maximale de x2 à partir de 16 processeurs.
-* 75 % du code est parallélisé, alors, on obtient une accélération théorique maximale de x4 à partir de 128 processeurs.
-* 90 % du code est parallélisé, alors, on obtient une accélération théorique maximale de x10 à partir de 512 processeurs.
-* 95 % du code est parallélisé, alors, on obtient une accélération théorique maximale de x20 à partir de 4096 processeurs.
+* 50% of the code is parallelized, we obtain a maximum theoretical speedup of x2 from 16 processors onward,
+* 75% of the code is parallelized, we obtain a maximum theoretical speedup of x4 from 128 processors onward,
+* 90% of the code is parallelized, we obtain a maximum theoretical speedup of x10 from 512 processors onward,
+* 95% of the code is parallelized, we obtain a maximum theoretical speedup of x20 from 4096 processors onward.
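+
+These figures follow directly from Amdahl's formula: if $p$ is the fraction of the code that can be parallelized and $n$ the number of processors, the theoretical speedup is $S(n) = \frac{1}{(1 - p) + \frac{p}{n}}$, which tends to $\frac{1}{1 - p}$ as $n$ grows. For example, with $p = 0.95$, the speedup can never exceed $\frac{1}{1 - 0.95} = 20$, no matter how many processors are used.
+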
\pagebreak
 
-\cimg{figs/gustafson-law.png}{scale=0.75}{Loi de Gustafon-Barsis}{Source : Tiré de https://commons.wikimedia.org/, ref. URL03}
+\cimg{figs/gustafson-law.png}{scale=0.75}{Gustafson–Barsis's law}{Source: Taken from https://commons.wikimedia.org/, ref. URL03}
 
-La loi de Gustafson dit que plus le nombre de données à traiter est grand, plus l'utilisation d'un grand nombre de processeurs sera avantageux. Ainsi, l'accélération est linéaire comme on peut le voir sur le graphique.
-Sur le graphique, on remarque par exemple qu'avec un code qui est 90 % parallélisé, on a un *speedup* d'au moins x100 avec 120 processeurs là où la loi d'Amdahl estimait un *speedup* maximal de x10 avec 512 processeurs. La loi de Gustafson est donc beaucoup plus optimiste en termes de gain de performance.
+Gustafson's law says that the larger the amount of data to be processed, the more advantageous it is to use a large number of processors. Thus, the speedup is linear, as can be seen on the graph. [@noauthor_gustafsons_2021]
+On the graph, we notice, for example, that with a code that is 90% parallelized, we have a speedup of at least x100 with 120 processors, whereas Amdahl's law estimated a maximum speedup of x10 with 512 processors. Gustafson's law is therefore much more optimistic in terms of performance gain.
 
\pagebreak
diff --git a/src/text/04-mpi.md b/src/text/04-mpi.md
index 991ceb7a4a0bb4b63f1a04acae0fe771bb27d1d1..0d4c46b6b664ad976a8a5e86edc0788b440aefc4 100644
--- a/src/text/04-mpi.md
+++ b/src/text/04-mpi.md
@@ -1,6 +1,6 @@
 # Message Passing Interface
 
-In order to realize parallel programming, the standard (+^MPI) was created in 1993-1994 to standardize the passage of messages between several computers or in a computer with several processors/cores [@noauthor_message_2020]. (+^MPI) is, therefore, a communication protocol and not a programming language. Currently, the latest version of (+^MPI) is 4.0 which approved in 2021. There are several implementations of the standard:
+In order to realize parallel programming, the standard (+^MPI) was created in 1993-1994 to standardize the passage of messages between several computers or in a computer with several processors/cores [@noauthor_message_2021]. (+^MPI) is, therefore, a communication protocol and not a programming language. Currently, the latest version of (+^MPI) is 4.0, which was approved in 2021. There are several implementations of the standard:
 
* MPICH, which support for the moment, MPI 3.1,
* Open MPI, which support, for the moment, MPI 3.1
@@ -11,12 +11,11 @@ We use Open MPI throughout this project on the cluster of the (+^HES-GE).
 
 ## Example
 
-To understand the basis of (+^MPI), let us look at an example mimicking a *token ring* network [@kendall_mpi_nodate]. This type of network forces a process to send a message to the message in the console, for example, only if it has the token in its possession. Moreover, once it has emitted its message, the process must transmit the token to its neighbor.
+To understand the basis of (+^MPI), let us look at an example mimicking a *token ring* network [@kendall_mpi_2018]. This type of network allows a process to write a message to the console, for example, only if it has the token in its possession. Moreover, once it has emitted its message, the process must transmit the token to its neighbor.
-
-\cimg{figs/ring.png}{scale=0.4}{Imitation of a network in \textit{token ring}}{Source : Baptiste Coudray}
+\cimg{figs/ring.png}{scale=0.4}{Imitation of a network in \textit{token ring}}{Source: Created by Baptiste Coudray}
 
-In this example, the node with the identifier zero has first the token that it will pass to node one, then it will give it to node two, and so on. The program ends when the token is back in possession of the process
-zero: node four sends the token to node zero.
+In this example, the node with the identifier zero initially has the token; it passes it to node one, which gives it to node two, and so on. The program ends when the token is back in the possession of process zero: node four sends the token to node zero.
 
 ```c
 int main(int argc, char** argv) {
@@ -61,8 +60,7 @@ Thanks to the node number, the node with the identifier zero, sends the token to
 So, once sent, it waits for node four to send the token via the function `MPI_Recv`. Then, the other nodes are waiting to receive the token from their neighbor to pass the token in turn. The nodes communicate through the communicator `MPI_COMM_WORLD`, a macro-constant designating all nodes associated with the current program.
 
-Finally, every program must terminate with the `MPI_Finalize()` function; otherwise, the execution ends with an
-error message.
+Finally, every program must terminate with the `MPI_Finalize()` function; otherwise, the execution ends with an error message.
 
 ```bash
 mpicc ring.c -o ring
@@ -71,7 +69,7 @@ mpirun -n 5 ./ring
 
 To compile a (+^MPI) program, you have to go through the `mpicc` program, which is a wrapper around (+^GCC). Indeed, `mpicc` automatically adds the correct compilation parameters to the (+^GCC) program.
 
-Next, our compiled program must be run through `mpirun` to distribute our program to the compute nodes. Finally, the `-n` parameter is used to specify the number of processes to run.
+Next, our compiled program must be run through `mpirun` to distribute our program to compute nodes. Finally, the `-n` parameter is used to specify the number of processes to run.
 
 ```
 Process 1 received token -1 from process 0
diff --git a/src/text/05-futhark.md b/src/text/05-futhark.md
index ef75e4269c0350bbe36018311e4929e870701101..6d345fd5875d22863c066277c0b55268f7ebd853 100644
--- a/src/text/05-futhark.md
+++ b/src/text/05-futhark.md
@@ -1,9 +1,9 @@
 # Introduction to the language Futhark
 
-\cimg{figs/futhark.png}{scale=0.60}{Futhark}{Source : Taken from https://commons.wikimedia.org/, ref. URL04}
+\cimg{figs/futhark.png}{scale=0.60}{Futhark}{Source: Taken from https://commons.wikimedia.org/, ref. URL04}
 
 Futhark is a purely functional programming language for producing parallelizable code on (+^CPU) or (+^GPU). It was designed by Troels Henriksen, Cosmin Oancea and Martin Elsman at the University of Copenhagen.
-The main goal of Futhark is to write generic code that can compile into either :
+The main goal of Futhark is to write generic code that can compile into either:
 
 * (+^OpenCL),
 * (+^CUDA),
@@ -13,11 +13,17 @@ The main goal of Futhark is to write generic code that can compile into either :
 
 Although a Futhark code can compile into an executable, this feature reserves for testing purposes because there is no (+^IO). Thus, the main interest is to write particular functions that you would like to speed up thanks to parallel programming and compile in library mode to use in a C program.
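+
+As an illustration, here is a minimal sketch of what calling such a library from C could look like, assuming a program `fact.fut` whose entry point `main` takes an `i32` and returns an `i32` (as in the example of the next section) and which was compiled in library mode with `futhark opencl --library fact.fut`; the file names are assumptions for this example:
+
+```c
+#include <stdio.h>
+#include "fact.h"  // header generated by `futhark opencl --library fact.fut`
+
+int main(void) {
+    // A configuration and a context must be created first; the context
+    // owns the GPU state used by all subsequent calls.
+    struct futhark_context_config *cfg = futhark_context_config_new();
+    struct futhark_context *ctx = futhark_context_new(cfg);
+
+    int32_t result;
+    futhark_entry_main(ctx, &result, 12);  // call the Futhark entry point
+    futhark_context_sync(ctx);             // wait for the computation to finish
+    printf("%d\n", result);                // prints 479001600
+
+    futhark_context_free(ctx);
+    futhark_context_config_free(cfg);
+    return 0;
+}
+```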
+To see the performance of Futhark, here is an example from the Futhark site that compares the resolution time of the (+^MSS) problem. The (+^MSS) problem is the task of finding a contiguous subarray with the largest sum, within a given one-dimensional array A[1...n] of numbers. [@noauthor_maximum_2021]
+
+\cimg{figs/mss_bench.png}{scale=0.60}{MSS runtime (lower is better)}{Source: Taken from https://futhark-lang.org/performance.html, ref. URL04}
+
+This graph shows the performance of a maximum segment sum implementation in Futhark and Thrust (a C++ library developed by NVIDIA for (+^GPU) programming). The sequential runtime is for Futhark code compiled to sequential (+^CPU) code and the Futhark runtime is for code compiled to (+^CUDA). [@henriksen_gotta_2021] As we can see, the Futhark version is much faster than the sequential and Thrust versions, which justifies using this language in this project.
+
 \pagebreak
 
 ## Example 1
 
-To better understand Futhark, here is a simple example: calculating the factorial of a number. @noauthor_basic_nodate].
+To better understand Futhark, here is a simple example: calculating the factorial of a number. [@henriksen_basic_2021].
 
 ```
 let fact (n: i32): i32 = reduce (*) 1 (1...n)
@@ -26,14 +32,12 @@ let main (n: i32): i32 = fact n
 
 Futhark does not handle recursion, so the factorial of a number is defined as the successive multiplication of numbers from one to `n`. In Futhark, this operation defines the reduction of an array with the multiplication of each value as the operation. The program's entry point, `main`, takes as parameter a number and calls the function `fact`.
 
-Futhark ne gère pas la récursion, de ce fait, la factorielle d'un nombre est défini comme la multiplication successive des nombres allant de un à `n`. En Futhark, on définit cette opération par la réduction d'un tableau avec comme opération la multiplication de chaque valeur. Le point d'entrée du programme, `main`, prends en paramètre un nombre et appelle la fonction `fact`.
-
 ```bash
 futhark opencl fact.fut
 echo 12 | ./fact
 ```
 
-To compile the Futhark code, we have to specify a backend; this one allows us to compile our code in :
+To compile the Futhark code, we have to specify a backend; this one allows us to compile our code in:
 
 * (+^OpenCL) (opencl, pyopencl),
 * (+^CUDA) (cuda),
@@ -41,7 +45,7 @@ To compile the Futhark code, we have to specify a backend; this one allows us to
 * sequential C (c),
 * Python sequential (python).
 
-Here I compile in (+^OpenCL) to run the program on the graphics card, and I run the program with the number 12 as the parameter.
+Here we compile in (+^OpenCL) to run the program on the graphics card, and we run the program with the number 12 as the parameter.
 
 ```
 479001600i32
@@ -123,7 +127,7 @@ In functional programming, a higher-order function is a function with at least one of the following properties:
 
 * it takes one or more functions as parameters,
 * it returns a function as a result.
 
-Thus, the best known higher-order functions are :
+Thus, the best known higher-order functions are:
 
 * `map([A], A -> B) -> [B]`, takes as a parameter an array of data of type A and a function that applies to each element of the array to create a new array of data of type B,
 * `filter([A], A -> Boolean) -> [A]`, takes as a parameter an array of type A and a predicate (a function returning a boolean) which applies on each element of the array. Thus, the function returns only those elements that answer true to the predicate.
diff --git a/src/text/06-mpi-x-futhark.md b/src/text/06-mpi-x-futhark.md
index 20fcacbb6e4d0f8637105b81526a81ad8e488cde..a5a8afd3e22176f8b1ae04f9641488701a731a24 100644
--- a/src/text/06-mpi-x-futhark.md
+++ b/src/text/06-mpi-x-futhark.md
@@ -1,89 +1,83 @@
-# Automate cellulaire
+# Cellular automaton
 
-A cellular automaton consists of a regular grid of cells, each in one of a finite number of states. The grid can be in any finite number of dimensions. For each cell, a set of cells called its neighborhood is defined relative to the specified cell. An initial state (time $t = 0$) is selected by assigning a state for each cell. A new generation is created (advancing t by 1), according to some fixed rule (generally, a mathematical function) that determines the new state of each cell in terms of the current state of the cell and the states of the cells in its neighborhood. Typically, the rule for updating the state of cells is the same for each cell and does not change over time. // wiki
+A cellular automaton consists of a regular grid of cells, each in one of a finite number of states. The grid can be in any finite number of dimensions. For each cell, a set of cells called its neighborhood is defined relative to the specified cell. An initial state (time $t = 0$) is selected by assigning a state for each cell. A new generation is created (advancing t by 1), according to some fixed rule (generally, a mathematical function) that determines the new state of each cell in terms of the current state of the cell and the states of the cells in its neighborhood. Typically, the rule for updating the state of cells is the same for each cell and does not change over time. [@noauthor_automate_2021]
 
-Le voisinage d'une cellule est défini soit par le voisinage de Moore, soit par le voisinage de Von Neumann. Le premier, défini qu'une cellule dispose dans un automate cellulaire à deux dimensions, huit voisines alors que le deuxième, quatre.
+The neighborhood of a cell is defined either by the Moore neighborhood or by the Von Neumann neighborhood. In a two-dimensional cellular automaton, the former gives a cell eight neighbors, while the latter gives it four.
 
-\cimg{figs/neighbours.png}{scale=0.60}{Comparaison entre le voisinage de Von Neumann (à gauche) et de Moore (à droite)}{Source : Created by Baptiste Coudray, ref. URL04}
+\cimg{figs/neighbours.png}{scale=0.60}{Comparison between Von Neumann (left) and Moore (right) neighborhoods}{Source: Created by Baptiste Coudray}
 
-La grille de gauche de gauche représente le voisinage de Von Neumann, à savoir, les quatre voisines d'une cellule. Celles-ci sont dénotées par les quatre points cardinaux (nord, ouest, sud, est).
-La grille de droite représente le voisinage de Moore, à savoir, les huit voisines d'une cellule. Celles-ci sont dénotées par les quatre points cardinaux et les quatre points inter-cardinaux (nord-ouest, sud-ouest, sud-est, nord-est).
+The grid on the left represents the Von Neumann neighborhood, i.e., the four neighbors of a cell. These are denoted by the four cardinal points (north, west, south, east).
+The grid on the right represents Moore's neighborhood, i.e., the eight neighbors of a cell. These are denoted by the four cardinal points and the four inter-cardinal points (northwest, southwest, southeast, northeast).
+
+The cellular automaton will have to use the Moore neighborhood, which means that each cell has:
+
+* two neighbors in one dimension,
+* eight neighbors in two dimensions,
+* 26 neighbors in three dimensions.
+
+These values are valid for a Chebyshev distance of one. We can generalize the number of neighbors that a cell has via the formula $(2r + 1)^d - 1$, where $r$ is the Chebyshev distance and $d$ is the dimension; for example, $r = 1$ and $d = 2$ give $(2 \cdot 1 + 1)^2 - 1 = 8$ neighbors.
 
 ## MPI x Futhark
 
-Notre librairie permet de paralléliser des automates cellulaires automatiquement de sorte que le programmeur n'ai plus qu'à écrire la fonction Futhark permettant de mettre à jour son automate cellulaire. Notre librairie prends en charge les automates cellulaires de, une, deux et trois dimensions. L'utilisation du langage Futhark permet de mettre à jour rapidement l'état de l'automate cellulaire grâce aux différents backend disponibles. De ce fait, plusieurs modes sont disponibles :
-* parallalized-sequential, le code Futhark est exécuté de manière séquentielle,
-* parallalized-multicore, le code Futhark est exécuté de façon concurrents grâce aux threads POSIX,
-* parallalized-OpenCL/CUDA, le code Futhark est exécuté sur la carte graphique.
+Our library parallelizes cellular automata automatically, so that the programmer only has to write the Futhark function that updates the cellular automaton. Our library supports cellular automata of one, two, and three dimensions. The use of the Futhark language makes it possible to quickly update the state of the cellular automaton thanks to the different backends available. Therefore, several modes are available:
+
+* parallelized-sequential, the Futhark code executes sequentially,
+* parallelized-multicore, the Futhark code executes concurrently with POSIX threads,
+* parallelized-OpenCL/CUDA, the Futhark code executes on the graphics card.
 
 ### Communication
 
-Une communication entre les différentes tâches MPI est nécessaire pour récupérer les voisines manquantes ainsi que recréer l'automate cellulaire complet. De ce fait, nous créons une topologie virtuelle cartésienne.
-
-A virtual topology is a mechanism for naming the processes in a communicator in away that fits the communication pattern better. The main aim of this is to makes sub-sequent code simpler. It may also provide hints to the run-time system which allow it to optimise the communication or even hint to the loader how to configure the processes. The virtual topology might also gain us some performance benefit.
+Communication between the different MPI tasks is necessary to recover the missing neighbors and recreate the complete cellular automaton. Therefore, we create a virtual Cartesian topology.
+"*A virtual topology is a mechanism for naming the processes in a communicator in a way that fits the communication pattern better. The main aim of this is to make subsequent code simpler. It may also provide hints to the run-time system which allow it to optimise the communication or even hint to the loader how to configure the processes. The virtual topology might also gain us some performance benefit.*" [@macdonald_writing_nodate]
 
 #### One dimension
 
-\cimg{figs/futhark.png}{scale=0.60}{Futhark}{Source : Taken from https://commons.wikimedia.org/, ref. URL04}
+\cimg{figs/communication_1d.png}{scale=0.60}{Example of Cartesian virtual topology in one dimension}{Source: Created by Baptiste Coudray}
 
-Dans une topologie cartésienne à une dimension, on remarque que les rangs peuvent communiquer directement avec leur voisin de gauche et de droite même s'ils sont aux extrémités du réseau. En effet, le communicateur MPI est défini pour être cyclique ce qui évite de devoir parcourir les $N - 2$ voisins qui les séparent.
+In a one-dimensional Cartesian topology, we notice that the ranks can communicate directly with their left and right neighbors even if they are at the ends of the network. Indeed, the MPI communicator is defined as cyclic, which avoids having to traverse the $N - 2$ neighbors that separate them.
 
 #### Two dimensions
 
-\cimg{figs/futhark.png}{scale=0.60}{Futhark}{Source : Taken from https://commons.wikimedia.org/, ref. URL04}
-
-Dans une topologie cartésienne à deux dimensions, on remarque que les rangs peuvent communiquer directement avec leur voisin de gauche, de droite, du haut et du bas. Quand un rang doit communiquer avec leur voisin de diagonale, nous utilisons le communicateur par défaut (`MPI_COMM_WORLD`) pour qu'il communique directement entre eux sans passer par un voisin.
+\cimg{figs/communication_2d.png}{scale=0.60}{Example of Cartesian virtual topology in two dimensions}{Source: Created by Baptiste Coudray}
 
+In a two-dimensional Cartesian topology, we notice that ranks can communicate directly with their left, right, top, and bottom neighbors. When a rank needs to communicate with its diagonal neighbor, we use the default communicator (`MPI_COMM_WORLD`) so that they communicate directly with each other without going through a neighbor.
 
-### Three dimensions
+#### Three dimensions
 
-\cimg{figs/futhark.png}{scale=0.60}{Futhark}{Source : Taken from https://commons.wikimedia.org/, ref. URL04}
+\cimg{figs/communication_3d.png}{scale=0.60}{Example of Cartesian virtual topology in three dimensions}{Source: Created by Baptiste Coudray}
 
-Dans une topologie cartésienne à trois dimensions, on remarque que les rangs ont les mêmes capacités de communication qu'une topologie en deux dimensions, mais, ils peuvent en plus communiquer avec leur voisin de devant et de derrière ($z-1$ et $z+1$). Comme pour les autres dimensions, le communicateur de cet dimension est cyclique.
+In a three-dimensional Cartesian topology, we notice that the ranks have the same communication capabilities as in a two-dimensional topology, but, in addition, they can communicate with their front and back neighbors ($z-1$ and $z+1$). As for the other dimensions, the communicator of this dimension is cyclic.
 
 ### Data dispatching
 
-L'automate cellulaire est partagé de façon la plus équitable possible entre les rangs disponibles de sorte que chaque rang effectue plus ou moins le même temps de travail. Ainsi chaque rang à un chunk qui est une partie de l'automate celullaire. Ce chunk peut être de dimension un, deux ou trois.
+The cellular automaton is shared as equally as possible among the available tasks, so that each task performs more or less the same amount of work. Thus, each task has a chunk, which is a part of the cellular automaton. This chunk can be of dimension one, two, or three.
 
 #### One dimension
 
-\cimg{figs/futhark.png}{scale=0.60}{Futhark}{Source : Taken from https://commons.wikimedia.org/, ref. URL04}
+\cimg{figs/futhark.png}{scale=0.60}{Example of sharing a cellular automaton in one dimension}{Source: Created by Baptiste Coudray}
 
-Sur cet exemple, un automate cellulaire de dimension un, de taille 8, est partagé entre trois processus. Comme la division de l'automate cellulaire n'est pas entière, le rang deux se voit attribuer seulement deux cellules contrairement aux autres qui en ont trois.
+In this example, a cellular automaton of dimension one and size 8 is split between three processes. As the division of the cellular automaton is not an integer, rank two has only two cells, unlike the others, which have three.
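+
+A common way to achieve such a split is to give every rank $\lfloor n / p \rfloor$ cells and to distribute the remainder over the first ranks. The helper below is a hypothetical sketch of this computation, not necessarily the library's exact code:
+
+```c
+// Number of cells assigned to `rank` when n cells are split among p ranks.
+int chunk_size(int n, int p, int rank) {
+    int base = n / p;       // every rank gets at least this many cells
+    int remainder = n % p;  // the first `remainder` ranks get one extra cell
+    return base + (rank < remainder ? 1 : 0);
+}
+```
+
+With $n = 8$ and $p = 3$, ranks zero and one receive three cells each and rank two receives two, which matches the figure above.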
#### Two dimensions
 
-\cimg{figs/futhark.png}{scale=0.60}{Futhark}{Source : Taken from https://commons.wikimedia.org/, ref. URL04}
+\cimg{figs/dispatch_1d.png}{scale=0.60}{Example of sharing a cellular automaton in two dimensions}{Source: Created by Baptiste Coudray}
 
-Dans cet exemple, l'automate cellulaire est en deux dimensions et de taille $9 \times 9$. Avec quatre rangs à disposition, il peut être séparé en 4 sous-matrices de $3 \times 3$
+In this example, the cellular automaton is in two dimensions and of size $9 \times 9$. With four tasks available, it can be separated into four sub-matrices of $3 \times 3$.
 
 #### Three dimensions
 
-Le principe reste le même que pour la deuxième dimension, à l'exception que chaque processus se voit attribuer $x$
+In three dimensions, the partitioning is difficult to represent graphically. Based on the two-dimensional partitioning, each task also divides the third dimension.
+For example, with a cellular automaton of size $4 \times 4 \times 4$ and eight tasks, each process has a piece of size $2 \times 2 \times 2$.
 
 ### Envelope
 
-L'automate cellulaire devra utiliser le voisinage de Moore ce qui veut dire que chaque cellule dispose de :
-
-* deux voisines en une dimension,
-* huit voisines en deux dimensions,
-* 26 voisines en trois dimensions.
-
-Ces valeurs sont valides pour un automate cellulaire de dimension deux et d'une distance de Tchebychev de un. On peut généraliser le nombre de voisines qu'une cellule dispose via la formule $(2r + 1)^d - 1$, où $r$ est la distance de Tchebychev et $d$ la dimension.
-
-Ainsi l'enveloppe contient le voisinage de Moore manquant d'une distance de Tchebychev de $r$ des cellules situées aux extrémités du chunk.
+The envelope of a chunk represents the missing neighbors of the cells at the extremities of the chunk. These missing cells are needed to compute the next iteration of the chunk of the cellular automaton that the process holds.
 
 #### One dimension
 
-\cimg{figs/futhark.png}{scale=0.60}{Futhark}{Source : Taken from https://commons.wikimedia.org/, ref. URL04}
+\cimg{figs/envelope_1d.png}{scale=0.60}{Example of the envelope of a chunk in one dimension}{Source: Created by Baptiste Coudray}
 
-Le voisinage de Moore en une dimension d'une cellule comprends la voisine de gauche (west-neighbor) et la voisine de droite (east-neighbor).
-
-En reprenant l'automate cellulaire décrit précédemment en une dimension, on remarque que l'enveloppe de $R_n$ comprends la dernière cellule de $R_{(n-1) % N}$ et la première cellule de $R_{(n+1) % N}$. Ainsi, les rangs s'échangent les données via MPI en utilisant la topologie virtuelle cartésienne.
+In one dimension, the Moore neighborhood of a cell includes the west-neighbor and the east-neighbor. Using the previously described one-dimensional cellular automaton, we notice that the envelope of $R_n$ includes the last cell of $R_{(n-1) \bmod N}$ and the first cell of $R_{(n+1) \bmod N}$. Thus, the ranks exchange data via MPI using the Cartesian virtual topology.
 
 #### Two dimensions
 
-\cimg{figs/futhark.png}{scale=0.60}{Futhark}{Source : Taken from https://commons.wikimedia.org/, ref. 
URL04}
 
-Dans cet exemple, l'enveloppe comprends les voisines
+In two dimensions, the envelope of a chunk includes the border cells of its four cardinal neighbors as well as the corner cells of its four diagonal neighbors, since the Moore neighborhood of the cells located on the edges of the chunk extends into them.
 
 #### Three dimensions
 
+In three dimensions, the same idea extends to the faces, edges, and corners of the neighboring chunks, covering the 26 directions of the Moore neighborhood.
+
diff --git a/src/text/07-automate-elementaire.md b/src/text/07-automate-elementaire.md
index 043c05f4955f2468910aa26145afa66c4b24cfb5..058769417ce154cb05075005fe90a14289eb7f44 100644
--- a/src/text/07-automate-elementaire.md
+++ b/src/text/07-automate-elementaire.md
@@ -1,8 +1,8 @@
 # Simple Cellular Automaton
 
-The simplest non-trivial cellular automaton that can be conceived consists of a one-dimensional grid of cells that can take only two states ("0" or "1"), with a neighborhood consisting, for each cell, of itself and the two cells adjacent to it.
+The simplest non-trivial cellular automaton that can be conceived consists of a one-dimensional grid of cells that can take only two states ("0" or "1"), with a neighborhood consisting, for each cell, of itself and the two cells adjacent to it. [@noauthor_automate_2021]
 
-There are $2^3 = 8$ possible configurations (or patterns, rules) of such a neighborhood. In order for the cellular automaton to work, it is necessary to define what the state must be, at the next generation, of a cell for each of these patterns. The 8 rules/configurations defined is as follows:
+There are $2^3 = 8$ possible configurations (or patterns, rules) of such a neighborhood. In order for the cellular automaton to work, it is necessary to define what the state must be at the next generation of a cell for each of these patterns. The 8 rules/configurations are defined as follows:
 
| Rule n° | East neighbour state | Cell state | West neighbour state | Cell next state |
|:---:|:---:|:---:|:---:|:---:|
@@ -14,10 +14,11 @@ There are $2^3 = 8$ possible configurations (or patterns, rules) of such a neigh
| 6 | 1 | 0 | 1 | 0
| 7 | 1 | 1 | 0 | 0
| 8 | 1 | 1 | 1 | 0
+Table: <Evolution rules for a cell in a one-dimensional cellular automaton>
 
 ## Example
 
-\cimg{figs/simple_automate.png}{scale=0.5}{First state of blinker}{Source : Taken from
+\cimg{figs/simple_automate.png}{scale=0.5}{First state of blinker}{Source: Taken from
\url{https://commons.wikimedia.org/}, ref. URL05. Re-created by Baptiste Coudray}
 
Iteration 0 is the initial state and only cell two is alive. To perform the next iteration:
@@ -28,7 +29,7 @@ Iteration 0 is the initial state and only cell two is alive. To perform the next
 
 ## Parallelized version
 
-Avec la librairie que nous avons créée, nous avons implémenté l'automate cellulaire précédemment décris. Pour ce faire, nous créons un fichier Futhark `elementary.fut` qui sert à calculer le prochain état d'une partie de l'automate cellulaire.
+With the created library, we implement the cellular automaton previously described. To do this, we create a Futhark `elementary.fut` file, which is used to calculate the next state of a part of the cellular automaton.
 
 ```
 let compute_next_elems [n] (chunk_elems :[n]i8) :[]i8 = ...
@@ -38,7 +39,7 @@ entry next_chunk_elems [n] (chunk_elems :[n]i8) :[]i8 =
 in next_elems[1:n-1]
 ```
 
-De ce fait, le fichier `elementary.fut` contient seulement une fonction qui applique les règles sur une partie de l'automate cellulaire. A noter que la fonction retourne l'automate cellulaire sans l'enveloppe.
+Therefore, the `elementary.fut` file contains only a function that applies the rules on a chunk of the cellular automaton. Note that the function returns the chunk without the envelope.
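+
+The body of `compute_next_elems` is elided above. As a purely illustrative sketch (an assumption, not the thesis code), the rules can be encoded as an 8-entry lookup table indexed by the east, cell, and west states in the order of the table's columns; the hidden rows of the table are assumed here to follow Wolfram's rule 30, which agrees with the three rows shown:
+
+```
+let rule: [8]i8 = [0, 1, 1, 1, 1, 0, 0, 0]
+
+let compute_next_elems [n] (chunk_elems :[n]i8) :[n]i8 =
+  tabulate n (\i ->
+    -- The first and last cells belong to the envelope; they are not updated.
+    if i == 0 || i == n - 1 then chunk_elems[i]
+    else let east = chunk_elems[i - 1] -- orientation convention is an assumption
+         let cell = chunk_elems[i]
+         let west = chunk_elems[i + 1]
+         in rule[i64.i8 (east * 4 + cell * 2 + west)])
+```
+
+The entry point `next_chunk_elems` shown above then drops the two envelope cells with the slice `next_elems[1:n-1]`.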
```c
void compute_next_chunk_board(struct dispatch_context *dc, struct futhark_context *fc, chunk_info_t *ci) {
@@ -71,38 +72,80 @@ int main(int argc, char *argv[]) {
 }
 ```
 
-Finalement, un fichier C `main.c` est nécessaire pour créer le point d'entrée du programme. Le code est relativement simple, le programmeur doit initialiser environnement MPI et Futhark. Ensuite, il doit initialiser notre librairie via la fonction `dispatch_context_new` en spécifiant la taille de l'automate cellulaire, son type de données et le nombre de dimensions (un en l'occurrence). Finalement, via la fonction `compute_next_chunk_board` qu'il crée, le programmeur récupère son morceau d'automate cellulaire avec l'envelope et il appelle la fonction Futhark qu'il a créé.
+Finally, a C file `main.c` is needed to create the program's entry point. We initialize the MPI and Futhark environments, and then our library via the function `dispatch_context_new`, specifying the size of the cellular automaton, its data type, and the number of dimensions (one in this case). Via the `compute_next_chunk_board` function, we then retrieve the chunk of the cellular automaton with its envelope and call the previously created Futhark function to obtain the chunk of the cellular automaton at time $t+1$.
 
 ## CPU Benchmark
 
-Nous effectuons des benchmarks pour valider la scalabilité de notre parallélisation en une dimension quand on compile en mode séquentiel, multicœurs, OpenCL ou CUDA. Les benchmarks sont effectués sur le cluster HES-GE (Baobab2/Yggdrasil).
-Le benchmark séquentiel et multicœurs sont effectués comme suit :
-* l'automate cellulaire est de taille $900 000 000$ cellules,
-* le nombre de tâches varie entre $2^0$ et $2^7$,
-* 15 mesures sont effectuées,
-* et une mesure correspond à 100 générations.
+We perform benchmarks to validate the scalability of our one-dimensional parallelization when compiling in sequential, multicore, (+^OpenCL), or (+^CUDA) mode. The benchmarks are performed on the HES-GE cluster (Baobab/Yggdrasil).
+The sequential and multicore benchmarks are performed as follows:
+
+* the cellular automaton is $300,000,000$ cells in size,
+* the number of tasks varies between $2^0$ and $2^7$,
+* 15 measurements are performed; one measurement corresponds to one iteration,
+* the iteration is computed 100 times.
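+
+In the tables below, the speedup is computed as $S(n) = t_1 / t_n$, the ratio between the average time measured with one task and the average time measured with $n$ tasks (for example, $657.866 / 332.771 \approx 2.0$ for two tasks).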
+| Number of tasks | Average [s] | Standard Deviation [s] | Speedup | Number of measurements |
+|:---:|:---:|:---:|:---:|:---:|
+| 1 | 657.866 [s] | ± 14.977 [s] | x1.0 | 15 |
+| 2 | 332.771 [s] | ± 2.814 [s] | x2.0 | 15 |
+| 4 | 161.963 [s] | ± 7.309 [s] | x4.1 | 15 |
+| 8 | 87.602 [s] | ± 2.918 [s] | x7.5 | 15 |
+| 16 | 42.743 [s] | ± 0.039 [s] | x15.4 | 15 |
+| 32 | 20.938 [s] | ± 0.007 [s] | x31.4 | 15 |
+| 64 | 11.071 [s] | ± 0.024 [s] | x59.4 | 15 |
+| 128 | 5.316 [s] | ± 0.191 [s] | x123.7 | 15 |
+Table: <Results for the parallelized-sequential version of Simple Cellular Automaton>
+
+| Number of tasks | Average [s] | Standard Deviation [s] | Speedup | Number of measurements |
+|:---:|:---:|:---:|:---:|:---:|
+| 1 | 708.689 [s] | ± 16.036 [s] | x1.0 | 15 |
+| 2 | 358.007 [s] | ± 4.037 [s] | x2.0 | 15 |
+| 4 | 138.523 [s] | ± 3.773 [s] | x5.1 | 15 |
+| 8 | 71.077 [s] | ± 1.28 [s] | x10.0 | 15 |
+| 16 | 34.697 [s] | ± 0.834 [s] | x20.4 | 15 |
+| 32 | 25.776 [s] | ± 0.725 [s] | x27.5 | 15 |
+| 64 | 12.506 [s] | ± 0.554 [s] | x56.7 | 15 |
+| 128 | 5.816 [s] | ± 0.045 [s] | x121.8 | 15 |
+Table: <Results for the parallelized-multicore version of Simple Cellular Automaton>

-*Array of results for the parallelized-sequential version*
-
-
-*Array of results for the parallelized-multicore version*
-
-On remarque que la version multicœurs est plus lente que la version séquentielle, cela est dû à la non-optimisation des tableaux à une dimension de la part du backend multicore de Futhark. En effet, cette fonctionnalité n'est pas encore implémentée dans le compilateur Futhark.
-
+\cimg{figs/elem_result_and_speedup_cpu.png}{width=\linewidth}{Benchmarks of the simple cellular automaton in parallelized-sequential/multicore}{Source: Realized by Baptiste Coudray}

-A gauche, le graphique montre le temps de calculs de 100 générations du jeu de la vie pour un automate cellulaire de $30 000^2 = 900 000 000$ cellules de la version sequentielle et multicœurs. A droite, nous avons le speedup idéal ainsi que le speedup obtenu de la version sequentielle et multicœurs.
+On the left graph, we compare the average execution time for each number of tasks and each version (sequential and multicore). On the right graph, we compare the ideal speedup with the speedup obtained by the parallelized-sequential and multicore versions.

-Sur le graphique du temps d'exécution, on remarque que celui-ci diminue de l'ordre $\frac{1}{x}$ pour l'exécution séquentielle et multicore. A noté que la version séquentielle est plus rapide que la version multicœurs.
+As the number of tasks increases, the execution time decreases accordingly: the speedup of both the parallelized-sequential and multicore versions follows the ideal speedup curve.

 ## GPU Benchmark

-Les benchmarks OpenCL et CUDA sont effectués comme suit :
-* l'automate cellulaire est de taille $900 000 000$ cellules,
-* le nombre de tâches varie entre $2^0$ et $2^7$
-* 15 mesures sont effectuées
-* une mesure correspond à 300 générations,
-* de $2^0$ à $2^3$ tâches, une NVIDIA GeForce RTX 3090 est attribuée pour chaque tâche, au-delà, les tâches se partagent
- de manière équitable les cartes graphiques.
+The (+^OpenCL) and (+^CUDA) benchmarks are performed as follows:
+* the cellular automaton has $300'000'000$ cells,
+* the number of tasks varies between $2^0$ and $2^6$,
+* 15 measurements are performed,
+* one measurement corresponds to computing $50'000$ iterations,
+* From $2^0$ to $2^3$ tasks, an NVIDIA GeForce RTX 3090 is allocated for each task; beyond that, the eight graphics cards are shared equally among the ranks.
+
+| Number of tasks | Number of GPUs | Average [s] | Standard Deviation [s] | Speedup | Number of measurements |
+|:---:|:---:|:---:|:---:|:---:|:---:|
+| 1 | 1 | 166.086 [s] | ± 0.096 [s] | x1.0 | 15 |
+| 2 | 2 | 83.339 [s] | ± 0.099 [s] | x2.0 | 15 |
+| 4 | 4 | 42.122 [s] | ± 0.078 [s] | x3.9 | 15 |
+| 8 | 8 | 21.447 [s] | ± 0.031 [s] | x7.7 | 15 |
+| 16 | 8 | 31.675 [s] | ± 0.056 [s] | x5.2 | 15 |
+| 32 | 8 | 43.65 [s] | ± 0.102 [s] | x3.8 | 15 |
+| 64 | 8 | 67.096 [s] | ± 0.118 [s] | x2.5 | 15 |
+Table: <Results for the parallelized-OpenCL version of Simple Cellular Automaton>
+
+| Number of tasks | Number of GPUs | Average [s] | Standard Deviation [s] | Speedup | Number of measurements |
+|:---:|:---:|:---:|:---:|:---:|:---:|
+| 1 | 1 | 160.291 [s] | ± 0.062 [s] | x1.0 | 15 |
+| 2 | 2 | 80.434 [s] | ± 0.094 [s] | x2.0 | 15 |
+| 4 | 4 | 40.64 [s] | ± 0.073 [s] | x3.9 | 15 |
+| 8 | 8 | 20.657 [s] | ± 0.046 [s] | x7.8 | 15 |
+| 16 | 8 | 30.749 [s] | ± 0.069 [s] | x5.2 | 15 |
+| 32 | 8 | 42.352 [s] | ± 0.117 [s] | x3.8 | 15 |
+| 64 | 8 | 65.228 [s] | ± 0.042 [s] | x2.5 | 15 |
+Table: <Results for the parallelized-CUDA version of Simple Cellular Automaton>
+
+\cimg{figs/elem_result_and_speedup_gpu.png}{width=\linewidth}{Benchmarks of the simple cellular automaton in parallelized-OpenCL/CUDA}{Source: Realized by Baptiste Coudray}
+
+With this performance test, we notice that the computation time is essentially the same in OpenCL as in CUDA. Moreover, the parallelization follows the ideal speedup curve as long as the number of processes equals the number of graphics cards. However, once the eight graphics cards have to be shared, the speedup in OpenCL/CUDA collapses, and the computation time increases.

\pagebreak

diff --git a/src/text/08-jeu-de-la-vie.md b/src/text/08-jeu-de-la-vie.md
index abc36738bd7b89731f9d336d83ca1bf2f0f566df..5bb0a5f5137d6ed1cc46f7cc695ceb2105ac8d0a 100644
--- a/src/text/08-jeu-de-la-vie.md
+++ b/src/text/08-jeu-de-la-vie.md
@@ -1,17 +1,17 @@
 # Game of Life

-The Game of Life is a zero-player game designed by John Horton Conway in 1970. It is also one of the best-known cellular automata. A cellular automaton consists of a regular grid of cells each containing a state chosen among a finite set and which can evolve in the course of time. The game does not require the interaction of a player for it to evolve, it evolves thanks to these extremely simple rules:
+The Game of Life is a zero-player game designed by John Horton Conway in 1970. It is also one of the best-known cellular automata. The game does not require any player interaction to evolve; it evolves according to these extremely simple rules:

 1. a cell has eight neighbors,
 2. a cell can be either alive or dead,
 3. a dead cell with exactly three living neighbors becomes alive,
-4. a living cell with two or three living neighbors stays alive; otherwise, it dies. [@noauthor_jeu_2020]
+4. a living cell with two or three living neighbors stays alive; otherwise, it dies. [@noauthor_jeu_2021]

 \pagebreak

 ## Example

-\cimg{figs/gol_blinker1.png}{scale=0.40}{First state of blinker}{Source : Taken from
+\cimg{figs/gol_blinker1.png}{scale=0.40}{First state of blinker}{Source: Taken from
 \url{https://commons.wikimedia.org/}, ref. URL05.
Re-created by Baptiste Coudray}

 A basic example is a blinker:
@@ -20,8 +20,7 @@ A basic example is a blinker:
 * the cell (zero, two) and (two, two) are born because they have three living neighbors (rule n°3),
 * the cell (one, two) stays alive because it has two living neighbors (rule n°4).

-\cimg{figs/gol_blinker2.png}{scale=0.40}{Second state of blinker}{Source : Taken from
-\url{https://commons.wikimedia.org/}, ref. URL06. Re-created by Baptiste Coudray}
+\cimg{figs/gol_blinker2.png}{scale=0.40}{Second state of blinker}{Source: Taken from \url{https://commons.wikimedia.org/}, ref. URL06. Re-created by Baptiste Coudray}

 Thus, after the application of the rules, the horizontal line becomes a vertical line. Then, at the next iteration, the
 vertical line becomes a horizontal line again.
@@ -29,52 +28,82 @@ Thus, after the application of the rules, the horizontal line becomes a vertical

 ## Parallelized version

-Avec la librairie que nous avons créée, nous avons implémenté le jeu de la vie. Le code est relativement le même que l'exemple précédemment, par conséquent, il n'est pas expliqué.
+We implement the Game of Life with our library to test it on a two-dimensional cellular automaton.
+The code is much the same as in the previous example; therefore, it is not detailed here, but it is available in the Git repository. A simplified sketch of the update rule is given after the benchmark protocol below.

 ## CPU Benchmarks

-Nous effectuons un benchmark pour valider la scalabilité de notre parallélisation en deux dimensions quand on compile en mode séquentiel, multicoeurs, OpenCL ou CUDA. Les benchmarks sont effectués sur le cluster HES-GE (Baobab2).
-Le benchmark séquentiel et multicœurs sont effectués comme suit :
-* l'automate cellulaire est de taille $30000^2 = 900 000 000$ cellules,
-* le nombre de tâches varie entre $2^0$ et $2^7$
-* 15 mesures sont effectuées
-* une mesure correspond à 100 générations,
-
-
+We perform benchmarks to validate the scalability of our two-dimensional parallelization when compiling in sequential, multicore, (+^OpenCL), or (+^CUDA) mode. The benchmarks are performed on the (+^HES-GE) cluster (Baobab/Yggdrasil).
+
+The sequential and multicore benchmarks are performed as follows:
+* the cellular automaton is $900'000'000$ cells in size,
+* the number of tasks varies between $2^0$ and $2^7$,
+* 15 measurements are performed,
+* one measurement corresponds to computing 100 iterations.
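+
+The sketch announced above: a minimal, hypothetical C version of the per-cell rule (the actual implementation is the Futhark kernel in the repository), applied to one interior cell of a chunk whose border cells hold the envelope:
+
+```c
+#include <stdint.h>
+
+/* Next state of the interior cell (r, c) of a rows-by-cols chunk stored row-major.
+   Rules: a dead cell with exactly three living neighbours is born; a living cell
+   with two or three living neighbours survives; every other cell dies. */
+int8_t next_cell_state(const int8_t *chunk, int64_t cols, int64_t r, int64_t c) {
+    int alive = 0;
+    for (int dr = -1; dr <= 1; dr++)
+        for (int dc = -1; dc <= 1; dc++)
+            if (dr != 0 || dc != 0)
+                alive += chunk[(r + dr) * cols + (c + dc)];  /* count the 8 neighbours */
+    if (chunk[r * cols + c] == 1)
+        return (alive == 2 || alive == 3) ? 1 : 0;
+    return alive == 3 ? 1 : 0;
+}
+```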
+
+| Number of tasks | Average [s] | Standard Deviation [s] | Speedup | Number of measurements |
+|:---:|:---:|:---:|:---:|:---:|
+| 1 | 3471.723 [s] | ± 47.092 [s] | x1.0 | 15 |
+| 2 | 1140.064 [s] | ± 56.78 [s] | x3.0 | 15 |
+| 4 | 790.365 [s] | ± 10.501 [s] | x4.4 | 15 |
+| 8 | 398.093 [s] | ± 13.438 [s] | x8.7 | 15 |
+| 16 | 221.687 [s] | ± 4.152 [s] | x15.7 | 15 |
+| 32 | 100.422 [s] | ± 0.068 [s] | x34.6 | 15 |
+| 64 | 55.986 [s] | ± 1.587 [s] | x62.0 | 15 |
+| 128 | 28.111 [s] | ± 0.263 [s] | x123.5 | 15 |
-*Array of results for the parallelized-sequential version*
+Table: <Results for the parallelized-sequential version of Game of Life>
-
+
+| Number of tasks | Average [s] | Standard Deviation [s] | Speedup | Number of measurements |
+|:---:|:---:|:---:|:---:|:---:|
+| 1 | 2154.686 [s] | ± 198.122 [s] | x1.0 | 15 |
+| 2 | 1160.921 [s] | ± 77.23 [s] | x1.9 | 15 |
+| 4 | 502.86 [s] | ± 3.465 [s] | x4.3 | 15 |
+| 8 | 206.818 [s] | ± 4.179 [s] | x10.4 | 15 |
+| 16 | 106.103 [s] | ± 0.45 [s] | x20.3 | 15 |
+| 32 | 71.463 [s] | ± 0.485 [s] | x30.2 | 15 |
+| 64 | 39.116 [s] | ± 0.489 [s] | x55.1 | 15 |
+| 128 | 14.008 [s] | ± 0.335 [s] | x153.8 | 15 |
-*Array of results for the parallelized-multicore version*
+Table: <Results for the parallelized-multicore version of Game of Life>

-\cimg{figs/gol_result_and_speedup_cpu.png}{width=\linewidth}{First state of blinker}{Source : Taken from
-\url{https://commons.wikimedia.org/}, ref. URL05. Re-created by Baptiste Coudray}
+\cimg{figs/gol_result_and_speedup_cpu.png}{width=\linewidth}{Benchmarks of the game of life in parallelized-sequential/multicore}{Source: Realized by Baptiste Coudray}

-A gauche, le graphique montre le temps de calculs de 100 générations du jeu de la vie pour un automate cellulaire de 30'000x30'000 cellules de la version sequentielle et multicoeurs. A droite, nous avons le speedup idéal ainsi que le speedup obtenu de la version sequentielle et multicoeurs.
-
-Sur le graphique du temps d'exécution, on remarque que celui-ci diminue de l'ordre $\frac{1}{x}$ pour l'exécution séquentielle et multicore. A noté que la version multicore est plus rapide que la version séquentielle.
-Sur le graphique des speedups, on remarque que la parallélisation de l'automate cellulaire est idéal même meilleur. Cette performance peut s'expliquer grâce à la présence des caches dans le CPU ce qui permet de récupérer les données plus rapidement comparé à la RAM.
+We notice a clear difference between the parallelized-sequential and multicore versions when there is only one task: the multicore version is $1.6$ times faster than the sequential version. Nevertheless, both versions show a near-perfect speedup, and the multicore version even reaches a super-linear maximum speedup of x154 with 128 tasks. This performance can be explained by the caching of data in the processor and the use of threads.

\pagebreak

 ## GPU Benchmarks

-Le benchmark OpenCL et CUDA sont effectués comme suit :
-* l'automate cellulaire est de taille $60000^2 = 3 600 000 000$ cellules,
-* le nombre de tâches varie entre $2^0$ et $2^7$
-* 15 mesures sont effectuées
-* une mesure correspond à 100 générations,
-* de $2^0$ à $2^3$ tâches, une NVIDIA GeForce RTX 3090 est attribuée pour chaque tâche, au delà, les tâches se partagent
- de manière équitable les cartes graphiques.
-
-
-*Array of results for the parallelized-OpenCL version*
-
-
-*Array of results for the parallelized-CUDA version*
-
-\cimg{figs/gol_result_and_speedup_gpu.png}{width=\linewidth}{First state of blinker}{Source : Taken from
-\url{https://commons.wikimedia.org/}, ref. URL05.
Re-created by Baptiste Coudray}
-
+The (+^OpenCL) and (+^CUDA) benchmarks are performed as follows:
+* the cellular automaton has $900'000'000$ cells,
+* the number of tasks varies between $2^0$ and $2^6$,
+* 15 measurements are performed,
+* one measurement corresponds to computing $8'000$ iterations,
+* From $2^0$ to $2^3$ tasks, an NVIDIA GeForce RTX 3090 is allocated for each task; beyond that, the eight graphics cards are shared equally among the ranks.
+
+| Number of tasks | Number of GPUs | Average [s] | Standard Deviation [s] | Speedup | Number of measurements |
+|:---:|:---:|:---:|:---:|:---:|:---:|
+| 1 | 1 | 230.144 [s] | ± 0.225 [s] | x1.0 | 15 |
+| 2 | 2 | 115.4 [s] | ± 0.07 [s] | x2.0 | 15 |
+| 4 | 4 | 58.019 [s] | ± 0.104 [s] | x4.0 | 15 |
+| 8 | 8 | 29.157 [s] | ± 0.061 [s] | x7.9 | 15 |
+| 16 | 8 | 30.579 [s] | ± 0.085 [s] | x7.5 | 15 |
+| 32 | 8 | 32.323 [s] | ± 0.045 [s] | x7.1 | 15 |
+| 64 | 8 | 35.551 [s] | ± 0.133 [s] | x6.5 | 15 |
+Table: <Results for the parallelized-OpenCL version of Game of Life>
+
+| Number of tasks | Number of GPUs | Average [s] | Standard Deviation [s] | Speedup | Number of measurements |
+|:---:|:---:|:---:|:---:|:---:|:---:|
+| 1 | 1 | 218.807 [s] | ± 0.057 [s] | x1.0 | 15 |
+| 2 | 2 | 109.598 [s] | ± 0.109 [s] | x2.0 | 15 |
+| 4 | 4 | 55.039 [s] | ± 0.1 [s] | x4.0 | 15 |
+| 8 | 8 | 27.737 [s] | ± 0.05 [s] | x7.9 | 15 |
+| 16 | 8 | 29.174 [s] | ± 0.079 [s] | x7.5 | 15 |
+| 32 | 8 | 30.844 [s] | ± 0.051 [s] | x7.1 | 15 |
+| 64 | 8 | 34.192 [s] | ± 0.12 [s] | x6.4 | 15 |
+Table: <Results for the parallelized-CUDA version of Game of Life>
+
+\cimg{figs/gol_result_and_speedup_gpu.png}{width=\linewidth}{Benchmarks of the game of life in parallelized-OpenCL/CUDA}{Source: Realized by Baptiste Coudray}
+
+With this performance test, we notice that the computation time is essentially the same in OpenCL as in CUDA. Moreover, the parallelization follows the ideal speedup curve as long as the number of processes equals the number of graphics cards. However, once the eight graphics cards have to be shared, the speedup in OpenCL/CUDA stabilizes, and the computation time increases ($+7 [s]$ between eight and 64 tasks).

\pagebreak

diff --git a/src/text/09-lattice-boltzmann.md b/src/text/09-lattice-boltzmann.md
index fc6d1b10c297ef9feba1f96edad77a30d471e31d..252b4e14a27d0bdb7edace4d46628e9c31d87ce8 100644
--- a/src/text/09-lattice-boltzmann.md
+++ b/src/text/09-lattice-boltzmann.md
@@ -1,19 +1,61 @@
 # Lattice-Boltzmann

-The lattice Boltzmann method (LBM) has established itself in the past decades as a valuable approach to Computational
+"_The lattice Boltzmann method (LBM) has established itself in the past decades as a valuable approach to Computational
 Fluid Dynamics (CFD). It is commonly used to model time-dependent, incompressible or compressible flows in a regime of
 Direct Numerical Simulation (DNS) or Large Eddy Simulation (LES). One of its strengths lies in the ability to easily
 represent complex physical phenomena, ranging from multi-phase flows to reactive and suspension flows. The method
 originates in a molecular description of a fluid, based on the Boltzmann equation, and can directly incorporate
 physical terms stemming from a knowledge of the interaction between molecules. It is therefore an invaluable tool in
-fundamental research, as it keeps the cycle between the elaboration of a theory and the formulation of a corresponding numerical model short.
- At the same time, it has proven to be computationally very efficient and is applied to a large
-variety of academic or industrial problems.
+fundamental research, as it keeps the cycle between the elaboration of a theory and the formulation of a corresponding numerical model short._" [@latt_palabos_2020]

 ## Parallelized version

+We implement the lattice-Boltzmann method with our library to test it on a three-dimensional cellular automaton.
+
 ## CPU Benchmark

+We perform benchmarks to validate the scalability of our three-dimensional parallelization when compiling in sequential, multicore, (+^OpenCL), or (+^CUDA) mode. The benchmarks are performed on the (+^HES-GE) cluster (Baobab/Yggdrasil).
+The sequential and multicore benchmarks are performed as follows:
+* the cellular automaton is $27'000'000$ cells in size,
+* the number of tasks varies between $2^0$ and $2^7$,
+* 15 measurements are performed,
+* one measurement corresponds to computing 100 iterations.
+
+| Number of tasks | Average [s] | Standard Deviation [s] | Speedup | Number of measurements |
+|:---:|:---:|:---:|:---:|:---:|
+| 1 | 716.133 [s] | ± 5.309 [s] | x1.0 | 15 |
+| 2 | 363.166 [s] | ± 3.482 [s] | x2.0 | 15 |
+| 4 | 185.43 [s] | ± 0.847 [s] | x3.9 | 15 |
+| 8 | 93.994 [s] | ± 0.566 [s] | x7.6 | 15 |
+| 16 | 81.266 [s] | ± 8.947 [s] | x8.8 | 15 |
+| 32 | 41.04 [s] | ± 1.59 [s] | x17.4 | 15 |
+| 64 | 22.188 [s] | ± 0.321 [s] | x32.3 | 15 |
+| 128 | 17.415 [s] | ± 4.956 [s] | x41.1 | 15 |
+Table: <Results for the parallelized-sequential version of Lattice-Boltzmann>
+
+| Number of tasks | Average [s] | Standard Deviation [s] | Speedup | Number of measurements |
+|:---:|:---:|:---:|:---:|:---:|
+| 1 | 695.675 [s] | ± 8.867 [s] | x1.0 | 15 |
+| 2 | 352.925 [s] | ± 4.293 [s] | x2.0 | 15 |
+| 4 | 181.736 [s] | ± 0.695 [s] | x3.8 | 15 |
+| 8 | 237.983 [s] | ± 0.271 [s] | x2.9 | 15 |
+| 16 | 79.36 [s] | ± 2.185 [s] | x8.8 | 15 |
+| 32 | 46.285 [s] | ± 0.138 [s] | x15.0 | 15 |
+| 64 | 24.059 [s] | ± 0.061 [s] | x28.9 | 15 |
+| 128 | 16.614 [s] | ± 1.088 [s] | x41.9 | 15 |
+Table: <Results for the parallelized-multicore version of Lattice-Boltzmann>
+
+\cimg{figs/lbm_result_and_speedup_cpu.png}{width=\linewidth}{Benchmarks of the lattice-Boltzmann method in parallelized-sequential/multicore}{Source: Realized by Baptiste Coudray}
+
+Unlike the previous benchmarks, the speedups do not follow the ideal speedup curve. Indeed, whether in sequential or multicore mode, we obtain a maximum speedup of x41 with 128 tasks, whereas we hoped for a speedup close to x128 (a rough estimate of what this plateau implies is given after the GPU protocol below).
+
 ## GPU Benchmark

+The (+^OpenCL) and (+^CUDA) benchmarks are performed as follows:
+* the cellular automaton has $27'000'000$ cells,
+* the number of tasks varies between $2^0$ and $2^6$,
+* 15 measurements are performed,
+* one measurement corresponds to computing $3'000$ iterations,
+* From $2^0$ to $2^3$ tasks, an NVIDIA GeForce RTX 3090 is allocated for each task; beyond that, the eight graphics cards are shared equally among the ranks.
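+
+As a rough sanity check, and assuming Amdahl's law is the right lens here (it ignores communication costs, which our envelope exchanges certainly incur), a plateau at $S(128) \approx 41$ corresponds to a serial fraction $s$ of roughly:
+
+$$ S(p) = \frac{1}{s + \frac{1-s}{p}}, \qquad S(128) = 41 \;\Rightarrow\; s = \frac{\frac{1}{41} - \frac{1}{128}}{1 - \frac{1}{128}} \approx 0.017 $$
+
+In other words, the observed parallel efficiency of $41 / 128 \approx 32\%$ is what about $1.7\%$ of serial work would produce; in practice, the much larger three-dimensional envelopes to exchange are a plausible contributor as well.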
+
 \pagebreak

diff --git a/src/text/ZZ-glossaire.tex b/src/text/ZZ-glossaire.tex
index 65132214662af4d4ff9bbd5883b48bb2563ef8df..2537582d2dacaa309010b0ab3de3b391544ba18f 100644
--- a/src/text/ZZ-glossaire.tex
+++ b/src/text/ZZ-glossaire.tex
@@ -14,4 +14,5 @@
 \newacronym{OpenCL}{OpenCL}{Open Computing Language}
 \newacronym{SDL2}{SDL2}{Simple Directmedia Layer 2}
 \newacronym{HES-GE}{HES-GE}{Haute École Spécialisée de GEnève}
-\newacronym{ES}{E/S}{Entrée/Sortie}
+\newacronym{IO}{I/O}{Input/Output}
+\newacronym{MSS}{MSS}{Maximum Segment Sum}