From 6e0e1f6d7c77ad049aca60d4a53ccd9f125fb2a0 Mon Sep 17 00:00:00 2001
From: Orestis <orestis.malaspinas@pm.me>
Date: Sun, 4 Jul 2021 22:26:24 +0200
Subject: [PATCH] udpated pres

---
 presentations/pasc/Makefile      |   4 +-
 presentations/pasc/pres.md       | 291 +++++++++++++++++++++++++++----
 presentations/unige2020/Makefile |   4 +-
 presentations/unige2020/pres.md  | 291 +++++++++++++++++++++++++++----
 4 files changed, 514 insertions(+), 76 deletions(-)

diff --git a/presentations/pasc/Makefile b/presentations/pasc/Makefile
index 575514e..30d624b 100644
--- a/presentations/pasc/Makefile
+++ b/presentations/pasc/Makefile
@@ -3,13 +3,13 @@ FILTERDIR    = $(DATADIR)/filters
 RESOURCEDIR  = $(DATADIR)/resources
 
 PDFOPTIONS = -t beamer
-PDFOPTIONS += --highlight-style my_highlight.theme
+# PDFOPTIONS += --highlight-style my_highlight.theme
 PDFOPTIONS += --pdf-engine pdflatex
 PDFOPTIONS += --template=./default.latex
 PDFOPTIONS += -V theme:metropolis
 PDFOPTIONS += -V themeoptions:numbering=none -V themeoptions:progressbar=foot
 PDFOPTIONS += -V fontsize=smaller
-# PDFOPTIONS += --filter  pandoc-beamer-block
+# PDFOPTIONS += --filter pandoc-beamer-block
 # PDFOPTIONS += --lua-filter=${FILTERDIR}/tex.lua
 # PDFOPTIONS += --include-in-header=${RESOURCEDIR}/definitions.tex
 # PDFOPTIONS += --include-in-header=${RESOURCEDIR}/beamer.tex
diff --git a/presentations/pasc/pres.md b/presentations/pasc/pres.md
index 64aaa6e..064a7b1 100644
--- a/presentations/pasc/pres.md
+++ b/presentations/pasc/pres.md
@@ -4,40 +4,113 @@
 
 # What is Futhark
 
-* Statically typed, data-parallel, purely functional, array language ...
-* with limited functionalities (no I/O) ...
-* that compiles to C, OpenCL, and Cuda backends ...
-* efficiently ...
-* without the pain of actually writing GPU code.
+- Statically typed, data-parallel, purely functional, array language ...
+- with limited functionalities (no I/O for example) ...
+- that compiles to C, OpenCL, and Cuda backends...
+- very efficiently ...
+- without the pain of actually writing GPU code.
 
-Spoiler: a toy d3q27 recursive regularized model does 1.5 GLUPS (single precision).
+. . .
+
+- Developed in Copenhagen by Troels Henriksen ...
+- Very friendly and eager to help newcomers ...
+- Still a very experimental project (low truck factor).
+
+. . .
+
+**Spoiler:** a toy d3q27 recursive regularized model does 1.5 GLUPS (single precision).
 
 # Why use Futhark?
 
-<!-- TODO: add opencl example -->
+\tiny
+```CPP
+#include<CL/cl.hpp>
+#include<iostream>
+#include <fstream>
+int main() 
+{
+    std::vector<cl::Platform> platforms;
+    cl::Platform::get(&platforms);
+
+    auto platform = platforms.front();
+    std::vector<cl::Device> devices;
+    platform.getDevices(CL_DEVICE_TYPE_CPU, &devices);
+
+    auto device = devices.front();
+
+    std::ifstream helloWorldFile("hello.cl");
+    std::string src(std::istreambuf_iterator<char>(helloWorldFile), (std::istreambuf_iterator<char>()));
+
+    cl::Program::Sources sources( 1, std::make_pair(src.c_str(), src.length() + 1));
+
+    cl::Context context(device);
+    cl::Program program(context, sources);
+
+    auto err = program.build("-cl-std=CL1.2");
+
+    char buf[16];
+    cl::Buffer memBuf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(buf));
+    cl::Kernel kernel(program, "HelloWorld", &err);
+    kernel.setArg(0, memBuf);
+
+    cl::CommandQueue queue(context, device);
+    queue.enqueueTask(kernel);
+    queue.enqueueReadBuffer(memBuf, CL_TRUE, 0, sizeof(buf), buf);
+
+    std::cout << "hello";
+    std::cin.get();
+
+}
+```
 
 # How to use Futhark
 
-* Not intended to replace existing generic-purpose languages.
-* But aims at being easily integrated into non Futhark code:
+- Not intended to replace existing general-purpose languages.
+- But aims at being easily integrated into non-Futhark code:
 
-    * Used into python code: `PyOpenCL`,
-    * Conventional `C` code,
-    * Several others (`C#`, `Haskell`, `F#`, ..., and soon `Rust`).
+    - Used from Python code: `PyOpenCL`,
+    - Conventional `C` code,
+    - Several others (`C#`, `Haskell`, `F#`, ..., and soon `Rust`).
 
-* Futhark produces `C` code so it's accessible from any language through a FFI.
+- Futhark produces `C` code so it's accessible from any language through an FFI.
 
 # An example: the dot product
 
+::: dotprod
+
 ## `dotprod.fut`
 
 ```
 entry dotprod (xs: []i32) (ys: []i32): i32 =
-  reduce (+) 0 (map2 (*) xs ys)
+  reduce (+) 0 (map (\(x, y) -> x * y) (zip xs ys))
 ```
+:::
+
+Intrinsics: SOAC (Second order array combinators)
+
+- `reduce 'a -> (a -> a -> a) -> a -> []a -> a`
+- `map 'a 'b -> (a -> b) -> []a -> []b`
+- `zip 'a 'b -> []a -> []b -> [](a, b)`
 
 Futhark is very efficient at parallelizing array operations.
 
+. . .
+
+::: cdotprod
+
+## `dotprod.c`
+
+```C
+int dot_prod(int xs[], int ys[], int size) {
+    int dot = 0;
+    for (int i = 0; i < size; ++i) {
+        dot += xs[i] * ys[i];
+    }
+    return dot;
+}
+```
+:::
+
 # An example: the dot product from `C`
 
 ```C
@@ -62,50 +135,145 @@ int main() {
 
 # And now
 
-Let's implement some lattice Boltzmann
+## Let the lattice Boltzmann method raid the Futhark.
 
-# Computation of macroscopic moments
+# The lattice Boltzmann method
 
-## LBM equation
+::: Simulation
+
+## Simulation
+
+0. Initialization (not Futhark here).
+1. Collision.
+    - Compute $\rho(f_i)$,
+    - Compute $\bm{j}(f_i)$,
+    - Compute $f_i^\mathrm{eq}(\rho,\bm{j})$,
+    - Collide $f_i^\mathrm{out}(f_i,f_i^\mathrm{eq},\omega)$.
+2. Propagation.
+
+Repeat 1-2 a certain number of times.
+:::
+
+# Computation of macroscopic moments (1/2)
+
+::: Simulation
+
+## LBM equations
 
 \begin{equation}
-\rho=\sum_{i=0}^{q-1}f_i,\quad \rho\bm{u}=\sum_{i=0}^{q-1}f_i \bm{c}_i, \forall\ \bm{x}.
+\rho=\sum_{i=0}^{q-1}f_i, \forall\ \bm{x}.
 \end{equation}
+:::
+
+::: Futhark
 
 ## Futhark code
 
+```ocaml
+map (\fx ->
+    map (\fxy ->
+        map (\fxyz ->
+            reduce (+) 0 fxyz
+        ) fxy
+    ) fx
+) f
 ```
-let compute_rho (f: [nx][ny][nz][27]f32): [nx][ny][nz]f32 = 
-    map ( map ( map ( reduce (+) 0.0 ) ) ) f
-```
+:::
+
+# Computation of macroscopic moments (2/2)
+
+::: Simulation
+
+## LBM equation
+
+\begin{equation}
+\rho\bm{u}=\sum_{i=0}^{q-1}f_i \bm{c}_i, \forall\ \bm{x}.
+\end{equation}
+:::
+
+::: Futhark
 
-<!-- TODO add j -->
+## Futhark code
+
+```ocaml
+map (\fx ->
+    map (\fxy ->
+        map (\fxyz ->
+            map(\ci ->
+                dotprod ci fxyz
+            ) (transpose c) -- intrinsic
+        ) fxy
+    ) fx
+) f
+```
+:::
 
 # Computation of the equilibrium distribution
 
+::: Simulation
+
 ## LBM equation
 
 \begin{equation}
 f_i^\mathrm{eq}=w_i\rho\left(1+\frac{\bm{c}_i\cdot \bm{u}}{c_s^2}+\frac{1}{2c_s^4}(\bm{c}_i\cdot \bm{u})^2-\frac{1}{2c_s^2}\bm{u}^2\right),\ \forall \bm{x},i
 \end{equation}
+:::
+
+::: Futhark
 
 ## Futhark code
 
-<!-- TODO add feq -->
+```ocaml
+map2(\rho_x j_x -> 
+  map2(\rho_xy j_xy ->
+    map2(\rho_xyz j_xyz ->
+      let u = map(\j_xyzi -> j_xyzi / rho_xyz ) j_xyz
+      let u_sqr = dotprod u u
+
+      in map2(\wi ci ->
+        let c_u = dotprod ci u
+        in rho_xyz * wi * 
+          (1 + 3 * c_u + 4.5 * c_u * c_u - 1.5 * u_sqr)
+      ) w c
+    ) rho_xy j_xy 
+  ) rho_x j_x 
+) rho j
+```
+:::
 
 # Collision
 
+::: Simulation
+
 ## LBM equation
 
 \begin{equation}
-f^\mathrm{out}_i=f_i\left(1-\frac{1}{\tau}\right)+\frac{1}{\tau}f_i^\mathrm{eq}.
+f^\mathrm{out}_i=f_i\left(1-\omega\right)+\omega f_i^\mathrm{eq}.
 \end{equation}
 
+:::
+
+::: Futhark
+
 ## Futhark code
 
-<!-- TODO add collision -->
+```ocaml
+map2(\f_x feq_x ->
+  map2(\f_xy feq_xy ->
+      map2(\f_xyz feq_xyz ->
+          map2(\f_i feq_i-> 
+              f_i * (1.0 - omega) + feq_i * omega
+          ) f_xyz feq_xyz
+      ) f_xy feq_xy
+  ) f_x feq_x
+) f feq
+```
 
-# Propagation
+:::
+
+# Streaming
+
+::: Simulation
 
 ## LBM equation
 
@@ -113,22 +281,73 @@ f^\mathrm{out}_i=f_i\left(1-\frac{1}{\tau}\right)+\frac{1}{\tau}f_i^\mathrm{eq}.
 f_i(\bm{x}+\bm{c}_i,t+1)=f^\mathrm{out}_i(\bm{x},t).
 \end{equation}
 
+:::
+
+::: Futhark
+
 ## Futhark code
 
-<!-- TODO add collision -->
+```ocaml
+unsafe
+tabulate_4d nx ny nz q (\x y z ipop ->
+    let next_x = (x-(i32.f32 c[ipop,0]) + nx) % nx 
+    let next_y = (y-(i32.f32 c[ipop,1]) + ny) % ny 
+    let next_z = (z-(i32.f32 c[ipop,2]) + nz) % nz 
 
-# And now everything together
+    in f[next_x, next_y, next_z, ipop]
+)
+```
+:::
 
-1. Collision (completely local).
-    * Compute $\rho$,
-    * Compute $\bm{u}$,
-    * Compute $f_i^\mathrm{eq}$,
-    * Compute $f_i^\mathrm{out}$.
-2. Propagation.
+# Summary
+
+* D3Q27 fully periodic lattice Boltzmann library.
+
+* Lines of readable and easy to debug Futhark code: 110.
+
+* Single precision, periodic, D3Q27 only arrays: 250 MLUPS.
+
+*Not bad: but we can do better.*
+
+# How can we go faster?
+
+* Arrays are aggressively parallelized: each dimension is flattened.
+* For small dimensions it is usually not worth it.
+* Replace length 3, or length 27 arrays by tuples: better use of GPU architecture.
+* `[](a, b, c, ...) -> ([]a, []b, []c, ...)`{.ocaml} is done automatically by the compiler.
+* Result: with a code of 150 lines, we go to 1.5 GLUPS.
+* All results are the same with CUDA and OpenCL backends (room for improvement?).
+
+# Conclusion
+
+## What is great with Futhark? (IMHO)
+
+* Efficient without all the OpenCL syntax.
+* Rapid prototyping.
+* Only a limited set of functionalities, but they work great.
+* Possibility to have genericity (generics, modules).
+* Very difficult to make memory errors.
+* Functional programming is fun (although I'm pretty bad at it).
+
+# Current and Future Futhark planned developments
+
+## From Troels Henriksen himself
+
+* Incremental flattening (experimental).
+* Multi-GPU: only on a single motherboard (very experimental).
+
+## What is missing? (IMHO)
+
+* Good compiler errors.
+* A way to profile the code.
+* Distributed GPU (MPI).
+* Bonus: A cool rendering tool directly from the GPU.
+
+# Questions?
+
+## Thank you for your attention
 
-## Futhark code
 
-<!-- TODO add collide and stream -->
 
 
 
diff --git a/presentations/unige2020/Makefile b/presentations/unige2020/Makefile
index 575514e..30d624b 100644
--- a/presentations/unige2020/Makefile
+++ b/presentations/unige2020/Makefile
@@ -3,13 +3,13 @@ FILTERDIR    = $(DATADIR)/filters
 RESOURCEDIR  = $(DATADIR)/resources
 
 PDFOPTIONS = -t beamer
-PDFOPTIONS += --highlight-style my_highlight.theme
+# PDFOPTIONS += --highlight-style my_highlight.theme
 PDFOPTIONS += --pdf-engine pdflatex
 PDFOPTIONS += --template=./default.latex
 PDFOPTIONS += -V theme:metropolis
 PDFOPTIONS += -V themeoptions:numbering=none -V themeoptions:progressbar=foot
 PDFOPTIONS += -V fontsize=smaller
-# PDFOPTIONS += --filter  pandoc-beamer-block
+# PDFOPTIONS += --filter pandoc-beamer-block
 # PDFOPTIONS += --lua-filter=${FILTERDIR}/tex.lua
 # PDFOPTIONS += --include-in-header=${RESOURCEDIR}/definitions.tex
 # PDFOPTIONS += --include-in-header=${RESOURCEDIR}/beamer.tex
diff --git a/presentations/unige2020/pres.md b/presentations/unige2020/pres.md
index 64aaa6e..064a7b1 100644
--- a/presentations/unige2020/pres.md
+++ b/presentations/unige2020/pres.md
@@ -4,40 +4,113 @@
 
 # What is Futhark
 
-* Statically typed, data-parallel, purely functional, array language ...
-* with limited functionalities (no I/O) ...
-* that compiles to C, OpenCL, and Cuda backends ...
-* efficiently ...
-* without the pain of actually writing GPU code.
+- Statically typed, data-parallel, purely functional, array language ...
+- with limited functionalities (no I/O for example) ...
+- that compiles to C, OpenCL, and Cuda backends...
+- very efficiently ...
+- without the pain of actually writing GPU code.
 
-Spoiler: a toy d3q27 recursive regularized model does 1.5 GLUPS (single precision).
+. . .
+
+- Developed in Copenhagen by Troels Henriksen ...
+- Very friendly and eager to help newcomers ...
+- Still a very experimental project (low truck factor).
+
+. . .
+
+**Spoiler:** a toy d3q27 recursive regularized model does 1.5 GLUPS (single precision).
 
 # Why use Futhark?
 
-<!-- TODO: add opencl example -->
+\tiny
+```CPP
+#include<CL/cl.hpp>
+#include<iostream>
+#include <fstream>
+int main() 
+{
+    std::vector<cl::Platform> platforms;
+    cl::Platform::get(&platforms);
+
+    auto platform = platforms.front();
+    std::vector<cl::Device> devices;
+    platform.getDevices(CL_DEVICE_TYPE_CPU, &devices);
+
+    auto device = devices.front();
+
+    std::ifstream helloWorldFile("hello.cl");
+    std::string src(std::istreambuf_iterator<char>(helloWorldFile), (std::istreambuf_iterator<char>()));
+
+    cl::Program::Sources sources( 1, std::make_pair(src.c_str(), src.length() + 1));
+
+    cl::Context context(device);
+    cl::Program program(context, sources);
+
+    auto err = program.build("-cl-std=CL1.2");
+
+    char buf[16];
+    cl::Buffer memBuf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(buf));
+    cl::Kernel kernel(program, "HelloWorld", &err);
+    kernel.setArg(0, memBuf);
+
+    cl::CommandQueue queue(context, device);
+    queue.enqueueTask(kernel);
+    queue.enqueueReadBuffer(memBuf, CL_TRUE, 0, sizeof(buf), buf);
+
+    std::cout << "hello";
+    std::cin.get();
+
+}
+```
 
 # How to use Futhark
 
-* Not intended to replace existing generic-purpose languages.
-* But aims at being easily integrated into non Futhark code:
+- Not intended to replace existing general-purpose languages.
+- But aims at being easily integrated into non-Futhark code:
 
-    * Used into python code: `PyOpenCL`,
-    * Conventional `C` code,
-    * Several others (`C#`, `Haskell`, `F#`, ..., and soon `Rust`).
+    - Used from Python code: `PyOpenCL`,
+    - Conventional `C` code,
+    - Several others (`C#`, `Haskell`, `F#`, ..., and soon `Rust`).
 
-* Futhark produces `C` code so it's accessible from any language through a FFI.
+- Futhark produces `C` code so it's accessible from any language through an FFI.
 
 # An example: the dot product
 
+::: dotprod
+
 ## `dotprod.fut`
 
 ```
 entry dotprod (xs: []i32) (ys: []i32): i32 =
-  reduce (+) 0 (map2 (*) xs ys)
+  reduce (+) 0 (map (\(x, y) -> x * y) (zip xs ys))
 ```
+:::
+
+Intrinsics: SOAC (Second order array combinators)
+
+- `reduce 'a -> (a -> a -> a) -> a -> []a -> a`
+- `map 'a 'b -> (a -> b) -> []a -> []b`
+- `zip 'a 'b -> []a -> []b -> [](a, b)`
 
 Futhark is very efficient at parallelizing array operations.
 
+. . .
+
+::: cdotprod
+
+## `dotprod.c`
+
+```C
+int dot_prod(int xs[], int ys[], int size) {
+    int dot = 0;
+    for (int i = 0; i < size; ++i) {
+        dot += xs[i] * ys[i];
+    }
+    return dot;
+}
+```
+:::
+
 # An example: the dot product from `C`
 
 ```C
@@ -62,50 +135,145 @@ int main() {
 
 # And now
 
-Let's implement some lattice Boltzmann
+## Let the lattice Boltzmann method raid the Futhark.
 
-# Computation of macroscopic moments
+# The lattice Boltzmann method
 
-## LBM equation
+::: Simulation
+
+## Simulation
+
+0. Initialization (not Futhark here).
+1. Collision.
+    - Compute $\rho(f_i)$,
+    - Compute $\bm{j}(f_i)$,
+    - Compute $f_i^\mathrm{eq}(\rho,\bm{j})$,
+    - Collide $f_i^\mathrm{out}(f_i,f_i^\mathrm{eq},\omega)$.
+2. Propagation.
+
+Repeat 1-2 a certain number of times.
+:::
+
+# Computation of macroscopic moments (1/2)
+
+::: Simulation
+
+## LBM equations
 
 \begin{equation}
-\rho=\sum_{i=0}^{q-1}f_i,\quad \rho\bm{u}=\sum_{i=0}^{q-1}f_i \bm{c}_i, \forall\ \bm{x}.
+\rho=\sum_{i=0}^{q-1}f_i, \forall\ \bm{x}.
 \end{equation}
+:::
+
+::: Futhark
 
 ## Futhark code
 
+```ocaml
+map (\fx ->
+    map (\fxy ->
+        map (\fxyz ->
+            reduce (+) 0 fxyz
+        ) fxy
+    ) fx
+) f
 ```
-let compute_rho (f: [nx][ny][nz][27]f32): [nx][ny][nz]f32 = 
-    map ( map ( map ( reduce (+) 0.0 ) ) ) f
-```
+:::
+
+# Computation of macroscopic moments (2/2)
+
+::: Simulation
+
+## LBM equation
+
+\begin{equation}
+\rho\bm{u}=\sum_{i=0}^{q-1}f_i \bm{c}_i, \forall\ \bm{x}.
+\end{equation}
+:::
+
+::: Futhark
 
-<!-- TODO add j -->
+## Futhark code
+
+```ocaml
+map (\fx ->
+    map (\fxy ->
+        map (\fxyz ->
+            map(\ci ->
+                dotprod ci fxyz
+            ) (transpose c) -- intrinsic
+        ) fxy
+    ) fx
+) f
+```
+:::
 
 # Computation of the equilibrium distribution
 
+::: Simulation
+
 ## LBM equation
 
 \begin{equation}
 f_i^\mathrm{eq}=w_i\rho\left(1+\frac{\bm{c}_i\cdot \bm{u}}{c_s^2}+\frac{1}{2c_s^4}(\bm{c}_i\cdot \bm{u})^2-\frac{1}{2c_s^2}\bm{u}^2\right),\ \forall \bm{x},i
 \end{equation}
+:::
+
+::: Futhark
 
 ## Futhark code
 
-<!-- TODO add feq -->
+```ocaml
+map2(\rho_x j_x -> 
+  map2(\rho_xy j_xy ->
+    map2(\rho_xyz j_xyz ->
+      let u = map(\j_xyzi -> j_xyzi / rho_xyz ) j_xyz
+      let u_sqr = dotprod u u
+
+      in map2(\wi ci ->
+        let c_u = dotprod ci u
+        in rho_xyz * wi * 
+          (1 + 3 * c_u + 4.5 * c_u * c_u - 1.5 * u_sqr)
+      ) w c
+    ) rho_xy j_xy 
+  ) rho_x j_x 
+) rho j
+```
+:::
 
 # Collision
 
+::: Simulation
+
 ## LBM equation
 
 \begin{equation}
-f^\mathrm{out}_i=f_i\left(1-\frac{1}{\tau}\right)+\frac{1}{\tau}f_i^\mathrm{eq}.
+f^\mathrm{out}_i=f_i\left(1-\omega\right)+\omega f_i^\mathrm{eq}.
 \end{equation}
 
+:::
+
+::: Futhark
+
 ## Futhark code
 
-<!-- TODO add collision -->
+```ocaml
+map2(\f_x feq_x ->
+  map2(\f_xy feq_xy ->
+      map2(\f_xyz feq_xyz ->
+          map2(\f_i feq_i-> 
+              f_i * (1.0 - omega) + feq_i * omega
+          ) f_xyz feq_xyz
+      ) f_xy feq_xy
+  ) f_x feq_x
+) f feq
+```
 
-# Propagation
+:::
+
+# Streaming
+
+::: Simulation
 
 ## LBM equation
 
@@ -113,22 +281,73 @@ f^\mathrm{out}_i=f_i\left(1-\frac{1}{\tau}\right)+\frac{1}{\tau}f_i^\mathrm{eq}.
 f_i(\bm{x}+\bm{c}_i,t+1)=f^\mathrm{out}_i(\bm{x},t).
 \end{equation}
 
+:::
+
+::: Futhark
+
 ## Futhark code
 
-<!-- TODO add collision -->
+```ocaml
+unsafe
+tabulate_4d nx ny nz q (\x y z ipop ->
+    let next_x = (x-(i32.f32 c[ipop,0]) + nx) % nx 
+    let next_y = (y-(i32.f32 c[ipop,1]) + ny) % ny 
+    let next_z = (z-(i32.f32 c[ipop,2]) + nz) % nz 
 
-# And now everything together
+    in f[next_x, next_y, next_z, ipop]
+)
+```
+:::
 
-1. Collision (completely local).
-    * Compute $\rho$,
-    * Compute $\bm{u}$,
-    * Compute $f_i^\mathrm{eq}$,
-    * Compute $f_i^\mathrm{out}$.
-2. Propagation.
+# Summary
+
+* D3Q27 fully periodic lattice Boltzmann library.
+
+* Lines of readable and easy to debug Futhark code: 110.
+
+* Single precision, periodic, D3Q27 only arrays: 250 MLUPS.
+
+*Not bad: but we can do better.*
+
+# How can we go faster?
+
+* Arrays are aggressively parallelized: each dimension is flattened.
+* For small dimensions it is usually not worth it.
+* Replace length 3, or length 27 arrays by tuples: better use of GPU architecture.
+* `[](a, b, c, ...) -> ([]a, []b, []c, ...)`{.ocaml} is done automatically by the compiler.
+* Result: with a code of 150 lines, we go to 1.5 GLUPS.
+* All results are the same with CUDA and OpenCL backends (room for improvement?).
+
+# Conclusion
+
+## What is great with Futhark? (IMHO)
+
+* Efficient without all the OpenCL syntax.
+* Rapid prototyping.
+* Only a limited set of functionalities, but they work great.
+* Possibility to have genericity (generics, modules).
+* Very difficult to make memory errors.
+* Functional programming is fun (although I'm pretty bad at it).
+
+# Current and Future Futhark planned developments
+
+## From Troels Henriksen himself
+
+* Incremental flattening (experimental).
+* Multi-GPU: only on a single motherboard (very experimental).
+
+## What is missing? (IMHO)
+
+* Good compiler errors.
+* A way to profile the code.
+* Distributed GPU (MPI).
+* Bonus: A cool rendering tool directly from the GPU.
+
+# Questions?
+
+## Thank you for your attention
 
-## Futhark code
 
-<!-- TODO add collide and stream -->
 
 
 
-- 
GitLab