diff --git a/src/liblbm.fut b/src/liblbm.fut
index b9daf9e53fc0e3c480ba43444374e9e35e2114d9..74bbc24cf68ae01516026084bf4bb1732621694c 100644
--- a/src/liblbm.fut
+++ b/src/liblbm.fut
@@ -142,17 +142,14 @@ let collision [n] (f_in: [9][n]f64)(tau: f64): [9][n]f64 =
 -- 	)
 
 -- propagation phase
-let propagation [n] (f_out: [9][n]f64)(nx: i32)(ny: i32): [9][n]f64 =
+let propagation [n] (f_out: [9][n]f64)(nx: i32)(ny: i32): *[9][n]f64 = -- stream each population i one cell along c[i] with periodic wrap; needs n == nx*ny (unchecked under unsafe)
     unsafe
-    tabulate_2d 9 n (\i ind-> 
-        let y = ind / nx
-        let x = ind - nx * y
-
+    unflatten(9)(n) (flatten_3d (tabulate_3d 9 ny nx (\i y x -> -- y outer, x inner, so the flat cell index stays x + nx*y
         let next_x = (x-(i32.f64 c[i].1) + nx) % nx 
         let next_y = (y-(i32.f64 c[i].2) + ny) % ny 
 
-        in f_out[i, (next_x + nx * next_y)]
-    )
+        in f_out[i, (next_x + nx * next_y)] -- read the upstream cell using that same x + nx*y addressing
+    )))
 
 -- executes one iteration of Lattice-Boltzmann
 -- which means it takes an f_in and updates it
diff --git a/src/makefile b/src/makefile
index 86d0ed9a592e68b08327d51e140f31a0a9c26d9c..d0064222103cbbbee5463f6de8da831b0b4efde7 100644
--- a/src/makefile
+++ b/src/makefile
@@ -1,17 +1,10 @@
-CUDA=false
-OPENCL=true
-SDL=false
-
 TARGET=c
+SDL=true
 
-ifeq ($(CUDA),true)
+ifeq ($(TARGET),cuda) # CUDA backend: link the CUDA runtime, driver and NVRTC libraries
 	CUDALIBS=-lcudart -lcuda -lnvrtc
-	TARGET=cuda
-endif
-
-ifeq ($(OPENCL),true)
+else ifeq ($(TARGET),opencl) # OpenCL backend: link libOpenCL; any other TARGET (default: c) sets neither variable
 	OPENCLIBS=-lOpenCL
-	TARGET=opencl
 endif
 
 ifeq ($(SDL),true)