From 6e0e1f6d7c77ad049aca60d4a53ccd9f125fb2a0 Mon Sep 17 00:00:00 2001 From: Orestis <orestis.malaspinas@pm.me> Date: Sun, 4 Jul 2021 22:26:24 +0200 Subject: [PATCH] udpated pres --- presentations/pasc/Makefile | 4 +- presentations/pasc/pres.md | 291 +++++++++++++++++++++++++++---- presentations/unige2020/Makefile | 4 +- presentations/unige2020/pres.md | 291 +++++++++++++++++++++++++++---- 4 files changed, 514 insertions(+), 76 deletions(-) diff --git a/presentations/pasc/Makefile b/presentations/pasc/Makefile index 575514e..30d624b 100644 --- a/presentations/pasc/Makefile +++ b/presentations/pasc/Makefile @@ -3,13 +3,13 @@ FILTERDIR = $(DATADIR)/filters RESOURCEDIR = $(DATADIR)/resources PDFOPTIONS = -t beamer -PDFOPTIONS += --highlight-style my_highlight.theme +# PDFOPTIONS += --highlight-style my_highlight.theme PDFOPTIONS += --pdf-engine pdflatex PDFOPTIONS += --template=./default.latex PDFOPTIONS += -V theme:metropolis PDFOPTIONS += -V themeoptions:numbering=none -V themeoptions:progressbar=foot PDFOPTIONS += -V fontsize=smaller -# PDFOPTIONS += --filter pandoc-beamer-block +# PDFOPTIONS += --filter pandoc-beamer-block # PDFOPTIONS += --lua-filter=${FILTERDIR}/tex.lua # PDFOPTIONS += --include-in-header=${RESOURCEDIR}/definitions.tex # PDFOPTIONS += --include-in-header=${RESOURCEDIR}/beamer.tex diff --git a/presentations/pasc/pres.md b/presentations/pasc/pres.md index 64aaa6e..064a7b1 100644 --- a/presentations/pasc/pres.md +++ b/presentations/pasc/pres.md @@ -4,40 +4,113 @@ # What is Futhark -* Statically typed, data-parallel, purely functional, array language ... -* with limited functionalities (no I/O) ... -* that compiles to C, OpenCL, and Cuda backends ... -* efficiently ... -* without the pain of actually writing GPU code. +- Statically typed, data-parallel, purely functional, array language ... +- with limited functionalities (no I/O for example) ... +- that compiles to C, OpenCL, and Cuda backends... +- very efficiently ... 
+- without the pain of actually writing GPU code. -Spoiler: a toy d3q27 recursive regularized model does 1.5 GLUPS (single precision). +. . . + +- Developed in Copenhagen by Troels Henriksen ... +- Very friendly and eager to help newcomers ... +- Still a very experimental project (high truck factor). + +. . . + +**Spoiler:** a toy d3q27 recursive regularized model does 1.5 GLUPS (single precision). # Why use Futhark? -<!-- TODO: add opencl example --> +\tiny +```CPP +#include<CL/cl.hpp> +#include<iostream> +#include <fstream> +int main() +{ + std::vector<cl::Platform> platforms; + cl::Platform::get(&platforms); + + auto platform = platforms.front(); + std::vector<cl::Device> devices; + platform.getDevices(CL_DEVICE_TYPE_CPU, &devices); + + auto device = devices.front(); + + std::ifstream helloWorldFile("hello.cl"); + std::string src(std::istreambuf_iterator<char>(helloWorldFile), (std::istreambuf_iterator<char>())); + + cl::Program::Sources sources( 1, std::make_pair(src.c_str(), src.length() + 1)); + + cl::Context context(device); + cl::Program program(context, sources); + + auto err = program.build("-cl-std=CL1.2"); + + char buf[16]; + cl::Buffer memBuf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(buf)); + cl::Kernel kernel(program, "HelloWorld", &err); + kernel.setArg(0, memBuf); + + cl::CommandQueue queue(context, device); + queue.enqueueTask(kernel); + queue.enqueueReadBuffer(memBuf, CL_TRUE, 0, sizeof(buf), buf); + + std::cout << "hello"; + std::cin.get(); + +} +``` # How to use Futhark -* Not intended to replace existing generic-purpose languages. -* But aims at being easily integrated into non Futhark code: +- Not intended to replace existing general-purpose languages. +- But aims at being easily integrated into non Futhark code: - * Used into python code: `PyOpenCL`, - * Conventional `C` code, - * Several others (`C#`, `Haskell`, `F#`, ..., and soon `Rust`).
+ - Used into python code: `PyOpenCL`, + - Conventional `C` code, + - Several others (`C#`, `Haskell`, `F#`, ..., and soon `Rust`). -* Futhark produces `C` code so it's accessible from any language through a FFI. +- Futhark produces `C` code so it's accessible from any language through a FFI. # An example: the dot product +::: dotprod + ## `dotprod.fut` ``` entry dotprod (xs: []i32) (ys: []i32): i32 = - reduce (+) 0 (map2 (*) xs ys) + reduce (+) 0 (map (\(x, y) -> x * y) (zip xs ys)) ``` +::: + +Intrinsics: SOAC (Second order array combinators) + +- `reduce 'a -> (a -> a -> a) -> a -> []a` +- `map 'a 'b -> (a -> b) -> []a -> []b` +- `zip 'a 'b -> []a -> []b -> [](a, b)` Futhark is very efficient at parallelizing array operations. +. . . + +::: cdotprod + +## `dotprod.c` + +```C +int dot_prod(int xs[], int ys[], int size) { + int dot = 0; + for (int i = 0; i < size; ++i) { + dot += xs[i] * ys[i]; + } + return dot; +} +``` +::: + # An example: the dot product from `C` ```C @@ -62,50 +135,145 @@ int main() { # And now -Let's implement some lattice Boltzmann +## Let the lattice Boltzmann method raid the Futhark. -# Computation of macroscopic moments +# The lattice Boltzmann method -## LBM equation +::: Simulation + +## Simulation + +0. Initialization (not Futhark here). +1. Collision. + - Compute $\rho(f_i)$, + - Compute $\bm{j}(f_i)$, + - Compute $f_i^\mathrm{eq}(\rho,\bm{j})$, + - Collide $f_i^\mathrm{out}(f_i,f_i^\mathrm{eq},\omega)$. +2. Propagation. + +Repeat 1-2 a certain amount of times +::: + +# Computation of macroscopic moments (1/2) + +::: Simulation + +## LBM equations \begin{equation} -\rho=\sum_{i=0}^{q-1}f_i,\quad \rho\bm{u}=\sum_{i=0}^{q-1}f_i \bm{c}_i, \forall\ \bm{x}. +\rho=\sum_{i=0}^{q-1}f_i, \forall\ \bm{x}. 
\end{equation} +::: + +::: Futhark ## Futhark code +```ocaml +map (\fx -> + map (\fxy -> + map (\fxyz -> + reduce (+) 0 fxyz + ) fxy + ) fx +) f ``` -let compute_rho (f: [nx][ny][nz][27]f32): [nx][ny][nz]f32 = - map ( map ( map ( reduce (+) 0.0 ) ) ) f -``` +::: + +# Computation of macroscopic moments (2/2) + +::: Simulation + +## LBM equation + +\begin{equation} +\rho\bm{u}=\sum_{i=0}^{q-1}f_i \bm{c}_i, \forall\ \bm{x}. +\end{equation} +::: + +::: Futhark -<!-- TODO add j --> +## Futhark code + +```ocaml +map (\fx -> + map (\fxy -> + map (\fxyz -> + map(\ci -> + dotprod ci fxyz + ) (transpose c) -- intrinsic + ) fxy + ) fx +) f +``` +::: # Computation of the equilibrium distribution +::: Simulation + ## LBM equation \begin{equation} f_i^\mathrm{eq}=w_i\rho\left(1+\frac{\bm{c}_i\cdot \bm{u}}{c_s^2}+\frac{1}{2c_s^4}(\bm{c}_i\cdot \bm{u})^2-\frac{1}{2c_s^2}\bm{u}^2\right),\ \forall \bm{x},i \end{equation} +::: + +::: Futhark ## Futhark code -<!-- TODO add feq --> +```ocaml +map2(\rho_x j_x -> + map2(\rho_xy j_xy -> + map2(\rho_xyz j_xyz -> + let u = map(\j_xyzi -> j_xyzi / rho_xyz ) j_xyz + let u_sqr = dotprod u u + + in map2(\wi ci -> + let c_u = dotprod ci u + in rho_xyz * wi * + (1 + 3 * c_u + 4.5 * c_u * c_u - 1.5 * u_sqr) + ) w c + ) rho_xy j_xy + ) rho_x j_x +) rho j +``` +::: # Collision +::: Simulation + ## LBM equation \begin{equation} -f^\mathrm{out}_i=f_i\left(1-\frac{1}{\tau}\right)+\frac{1}{\tau}f_i^\mathrm{eq}. +f^\mathrm{out}_i=f_i\left(1-\omega\right)+\omega f_i^\mathrm{eq}. \end{equation} +::: + +::: Futhark + ## Futhark code -<!-- TODO add collision --> +```ocaml +map2(\f_x feq_x -> + map2(\f_xy feq_xy -> + map2(\f_xyz feq_xyz -> + map2(\f_i feq_i-> + f_i * (1.0 - omega) + feq_i * omega + ) f_xyz feq_xyz + ) f_xy feq_xy + ) f_x feq_x +) f feq +``` -# Propagation +::: + +# Streaming + +::: Simulation ## LBM equation @@ -113,22 +281,73 @@ f^\mathrm{out}_i=f_i\left(1-\frac{1}{\tau}\right)+\frac{1}{\tau}f_i^\mathrm{eq}. 
f_i(\bm{x}+\bm{c}_i,t+1)=f^\mathrm{out}_i(\bm{x},t). \end{equation} +::: + +::: Futhark + ## Futhark code -<!-- TODO add collision --> +```ocaml +unsafe +tabulate_4d nx ny nz q (\x y z ipop -> + let next_x = (x-(i32.f32 c[ipop,0]) + nx) % nx + let next_y = (y-(i32.f32 c[ipop,1]) + ny) % ny + let next_z = (z-(i32.f32 c[ipop,2]) + nz) % nz -# And now everything together + in f[next_x, next_y, next_z, ipop] +) +``` +::: -1. Collision (completely local). - * Compute $\rho$, - * Compute $\bm{u}$, - * Compute $f_i^\mathrm{eq}$, - * Compute $f_i^\mathrm{out}$. -2. Propagation. +# Summary + +* D3Q27 fully periodic lattice Boltzmann library. + +* Lines of readable and easy to debug Futhark code: 110. + +* Single precision, periodic, D3Q27 only arrays: 250 MLUPS. + +*Not bad: but we can do better.* + +# How can we go faster? + +* Arrays are aggressively parallelized: each dimension is flattened. +* For small dimensions it is usually not worth it. +* Replace length 3, or length 27 arrays by tuples: better use of GPU architecture. +* `[](a, b, c, ...) -> ([]a, []b, []c, ...)`{.ocaml} automatically by the compiler. +* Result: with a code of 150 lines, we go to 1.5 GLUPS. +* All results are the same with CUDA and OpenCL backends (room for improvement?). + +# Conclusion + +## What is great with Futhark? (IMHO) + +* Efficient without all the OpenCL syntax. +* Rapid prototyping. +* Only a limited set of functionalities but they work great. +* Possibility to have genericity (generics, modules). +* Very difficult to make memory errors. +* Functional programming is fun (although I'm pretty bad at it). + +# Current and Future Futhark planned developments + +## From Troels Henriksen himself + +* Incremental flattening (experimental). +* Multi-GPU: only on a single motherboard (very experimental). + +## What is missing? (IMHO) + +* Good compiler errors. +* A way to profile the code. +* Distributed GPU (MPI). +* Bonus: A cool rendering tool directly from the GPU. + +# Questions?
+ +## Thank you for your attention -## Futhark code -<!-- TODO add collide and stream --> diff --git a/presentations/unige2020/Makefile b/presentations/unige2020/Makefile index 575514e..30d624b 100644 --- a/presentations/unige2020/Makefile +++ b/presentations/unige2020/Makefile @@ -3,13 +3,13 @@ FILTERDIR = $(DATADIR)/filters RESOURCEDIR = $(DATADIR)/resources PDFOPTIONS = -t beamer -PDFOPTIONS += --highlight-style my_highlight.theme +# PDFOPTIONS += --highlight-style my_highlight.theme PDFOPTIONS += --pdf-engine pdflatex PDFOPTIONS += --template=./default.latex PDFOPTIONS += -V theme:metropolis PDFOPTIONS += -V themeoptions:numbering=none -V themeoptions:progressbar=foot PDFOPTIONS += -V fontsize=smaller -# PDFOPTIONS += --filter pandoc-beamer-block +# PDFOPTIONS += --filter pandoc-beamer-block # PDFOPTIONS += --lua-filter=${FILTERDIR}/tex.lua # PDFOPTIONS += --include-in-header=${RESOURCEDIR}/definitions.tex # PDFOPTIONS += --include-in-header=${RESOURCEDIR}/beamer.tex diff --git a/presentations/unige2020/pres.md b/presentations/unige2020/pres.md index 64aaa6e..064a7b1 100644 --- a/presentations/unige2020/pres.md +++ b/presentations/unige2020/pres.md @@ -4,40 +4,113 @@ # What is Futhark -* Statically typed, data-parallel, purely functional, array language ... -* with limited functionalities (no I/O) ... -* that compiles to C, OpenCL, and Cuda backends ... -* efficiently ... -* without the pain of actually writing GPU code. +- Statically typed, data-parallel, purely functional, array language ... +- with limited functionalities (no I/O for example) ... +- that compiles to C, OpenCL, and Cuda backends... +- very efficiently ... +- without the pain of actually writing GPU code. -Spoiler: a toy d3q27 recursive regularized model does 1.5 GLUPS (single precision). +. . . + +- Developed in Copenhagen by Troels Henriksen ... +- Very friendly and eager to help newcomers ... +- Still a very experimental project (high truck factor). + +. . . 
+ +**Spoiler:** a toy d3q27 recursive regularized model does 1.5 GLUPS (single precision). # Why use Futhark? -<!-- TODO: add opencl example --> +\tiny +```CPP +#include<CL/cl.hpp> +#include<iostream> +#include <fstream> +int main() +{ + std::vector<cl::Platform> platforms; + cl::Platform::get(&platforms); + + auto platform = platforms.front(); + std::vector<cl::Device> devices; + platform.getDevices(CL_DEVICE_TYPE_CPU, &devices); + + auto device = devices.front(); + + std::ifstream helloWorldFile("hello.cl"); + std::string src(std::istreambuf_iterator<char>(helloWorldFile), (std::istreambuf_iterator<char>())); + + cl::Program::Sources sources( 1, std::make_pair(src.c_str(), src.length() + 1)); + + cl::Context context(device); + cl::Program program(context, sources); + + auto err = program.build("-cl-std=CL1.2"); + + char buf[16]; + cl::Buffer memBuf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(buf)); + cl::Kernel kernel(program, "HelloWorld", &err); + kernel.setArg(0, memBuf); + + cl::CommandQueue queue(context, device); + queue.enqueueTask(kernel); + queue.enqueueReadBuffer(memBuf, CL_TRUE, 0, sizeof(buf), buf); + + std::cout << "hello"; + std::cin.get(); + +} +``` # How to use Futhark -* Not intended to replace existing generic-purpose languages. -* But aims at being easily integrated into non Futhark code: +- Not intended to replace existing general-purpose languages. +- But aims at being easily integrated into non Futhark code: - * Used into python code: `PyOpenCL`, - * Conventional `C` code, - * Several others (`C#`, `Haskell`, `F#`, ..., and soon `Rust`). + - Used into python code: `PyOpenCL`, + - Conventional `C` code, + - Several others (`C#`, `Haskell`, `F#`, ..., and soon `Rust`). -* Futhark produces `C` code so it's accessible from any language through a FFI. +- Futhark produces `C` code so it's accessible from any language through a FFI.
# An example: the dot product +::: dotprod + ## `dotprod.fut` ``` entry dotprod (xs: []i32) (ys: []i32): i32 = - reduce (+) 0 (map2 (*) xs ys) + reduce (+) 0 (map (\(x, y) -> x * y) (zip xs ys)) ``` +::: + +Intrinsics: SOAC (Second order array combinators) + +- `reduce 'a -> (a -> a -> a) -> a -> []a` +- `map 'a 'b -> (a -> b) -> []a -> []b` +- `zip 'a 'b -> []a -> []b -> [](a, b)` Futhark is very efficient at parallelizing array operations. +. . . + +::: cdotprod + +## `dotprod.c` + +```C +int dot_prod(int xs[], int ys[], int size) { + int dot = 0; + for (int i = 0; i < size; ++i) { + dot += xs[i] * ys[i]; + } + return dot; +} +``` +::: + # An example: the dot product from `C` ```C @@ -62,50 +135,145 @@ int main() { # And now -Let's implement some lattice Boltzmann +## Let the lattice Boltzmann method raid the Futhark. -# Computation of macroscopic moments +# The lattice Boltzmann method -## LBM equation +::: Simulation + +## Simulation + +0. Initialization (not Futhark here). +1. Collision. + - Compute $\rho(f_i)$, + - Compute $\bm{j}(f_i)$, + - Compute $f_i^\mathrm{eq}(\rho,\bm{j})$, + - Collide $f_i^\mathrm{out}(f_i,f_i^\mathrm{eq},\omega)$. +2. Propagation. + +Repeat 1-2 a certain amount of times +::: + +# Computation of macroscopic moments (1/2) + +::: Simulation + +## LBM equations \begin{equation} -\rho=\sum_{i=0}^{q-1}f_i,\quad \rho\bm{u}=\sum_{i=0}^{q-1}f_i \bm{c}_i, \forall\ \bm{x}. +\rho=\sum_{i=0}^{q-1}f_i, \forall\ \bm{x}. \end{equation} +::: + +::: Futhark ## Futhark code +```ocaml +map (\fx -> + map (\fxy -> + map (\fxyz -> + reduce (+) 0 fxyz + ) fxy + ) fx +) f ``` -let compute_rho (f: [nx][ny][nz][27]f32): [nx][ny][nz]f32 = - map ( map ( map ( reduce (+) 0.0 ) ) ) f -``` +::: + +# Computation of macroscopic moments (2/2) + +::: Simulation + +## LBM equation + +\begin{equation} +\rho\bm{u}=\sum_{i=0}^{q-1}f_i \bm{c}_i, \forall\ \bm{x}. 
+\end{equation} +::: + +::: Futhark -<!-- TODO add j --> +## Futhark code + +```ocaml +map (\fx -> + map (\fxy -> + map (\fxyz -> + map(\ci -> + dotprod ci fxyz + ) (transpose c) -- intrinsic + ) fxy + ) fx +) f +``` +::: # Computation of the equilibrium distribution +::: Simulation + ## LBM equation \begin{equation} f_i^\mathrm{eq}=w_i\rho\left(1+\frac{\bm{c}_i\cdot \bm{u}}{c_s^2}+\frac{1}{2c_s^4}(\bm{c}_i\cdot \bm{u})^2-\frac{1}{2c_s^2}\bm{u}^2\right),\ \forall \bm{x},i \end{equation} +::: + +::: Futhark ## Futhark code -<!-- TODO add feq --> +```ocaml +map2(\rho_x j_x -> + map2(\rho_xy j_xy -> + map2(\rho_xyz j_xyz -> + let u = map(\j_xyzi -> j_xyzi / rho_xyz ) j_xyz + let u_sqr = dotprod u u + + in map2(\wi ci -> + let c_u = dotprod ci u + in rho_xyz * wi * + (1 + 3 * c_u + 4.5 * c_u * c_u - 1.5 * u_sqr) + ) w c + ) rho_xy j_xy + ) rho_x j_x +) rho j +``` +::: # Collision +::: Simulation + ## LBM equation \begin{equation} -f^\mathrm{out}_i=f_i\left(1-\frac{1}{\tau}\right)+\frac{1}{\tau}f_i^\mathrm{eq}. +f^\mathrm{out}_i=f_i\left(1-\omega\right)+\omega f_i^\mathrm{eq}. \end{equation} +::: + +::: Futhark + ## Futhark code -<!-- TODO add collision --> +```ocaml +map2(\f_x feq_x -> + map2(\f_xy feq_xy -> + map2(\f_xyz feq_xyz -> + map2(\f_i feq_i-> + f_i * (1.0 - omega) + feq_i * omega + ) f_xyz feq_xyz + ) f_xy feq_xy + ) f_x feq_x +) f feq +``` -# Propagation +::: + +# Streaming + +::: Simulation ## LBM equation @@ -113,22 +281,73 @@ f^\mathrm{out}_i=f_i\left(1-\frac{1}{\tau}\right)+\frac{1}{\tau}f_i^\mathrm{eq}. f_i(\bm{x}+\bm{c}_i,t+1)=f^\mathrm{out}_i(\bm{x},t). \end{equation} +::: + +::: Futhark + ## Futhark code -<!-- TODO add collision --> +```ocaml +unsafe +tabulate_4d nx ny nz q (\x y z ipop -> + let next_x = (x-(i32.f32 c[ipop,0]) + nx) % nx + let next_y = (y-(i32.f32 c[ipop,1]) + ny) % ny + let next_z = (z-(i32.f32 c[ipop,2]) + nz) % nz -# And now everything together + in f[next_x, next_y, next_z, ipop] +) +``` +::: -1. Collision (completely local). 
- * Compute $\rho$, - * Compute $\bm{u}$, - * Compute $f_i^\mathrm{eq}$, - * Compute $f_i^\mathrm{out}$. -2. Propagation. +# Summary + +* D3Q27 fully periodic lattice Boltzmann library. + +* Lines of readable and easy to debug Futhark code: 110. + +* Single precision, periodic, D3Q27 only arrays: 250 MLUPS. + +*Not bad: but we can do better.* + +# How can we go faster? + +* Arrays are aggressively parallelized: each dimension is flattened. +* For small dimensions it is usually not worth it. +* Replace length 3, or length 27 arrays by tuples: better use of GPU architecture. +* `[](a, b, c, ...) -> ([]a, []b, []c, ...)`{.ocaml} automatically by the compiler. +* Result: with a code of 150 lines, we go to 1.5 GLUPS. +* All results are the same with CUDA and OpenCL backends (room for improvement?). + +# Conclusion + +## What is great with Futhark? (IMHO) + +* Efficient without all the OpenCL syntax. +* Rapid prototyping. +* Only a limited set of functionalities but they work great. +* Possibility to have genericity (generics, modules). +* Very difficult to make memory errors. +* Functional programming is fun (although I'm pretty bad at it). + +# Current and Future Futhark planned developments + +## From Troels Henriksen himself + +* Incremental flattening (experimental). +* Multi-GPU: only on a single motherboard (very experimental). + +## What is missing? (IMHO) + +* Good compiler errors. +* A way to profile the code. +* Distributed GPU (MPI). +* Bonus: A cool rendering tool directly from the GPU. + +# Questions? + +## Thank you for your attention -## Futhark code -<!-- TODO add collide and stream --> -- GitLab