diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..e955634b7224617f43b80693fd2eeb8e533d1660
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,183 @@
+---
+Language:        Cpp
+# BasedOnStyle:  Google
+AccessModifierOffset: -4
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveAssignments: None
+AlignConsecutiveBitFields: None
+AlignConsecutiveDeclarations: None
+AlignConsecutiveMacros: true
+AlignEscapedNewlines: Left
+AlignOperands:   Align
+AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortEnumsOnASingleLine: true
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortLambdasOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: true
+BinPackParameters: true
+BitFieldColonSpacing: Both
+BraceWrapping:
+  AfterCaseLabel:  false
+  AfterClass:      false
+  AfterControlStatement: MultiLine
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  BeforeLambdaBody: false
+  BeforeWhile:     false
+  IndentBraces:    false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: NonAssignment
+BreakBeforeBraces: Custom
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: AfterColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: AfterColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:    100
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DeriveLineEnding: false
+DerivePointerAlignment: false
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks:   Regroup
+IncludeCategories:
+  - Regex:           '^<ext/.*\.h>'
+    Priority:        2
+    SortPriority:    0
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+    SortPriority:    0
+  - Regex:           '^<.*'
+    Priority:        2
+    SortPriority:    0
+  - Regex:           '.*'
+    Priority:        3
+    SortPriority:    0
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IncludeIsMainSourceRegex: ''
+IndentCaseLabels: false
+IndentCaseBlocks: false
+IndentGotoLabels: false
+IndentPPDirectives: None
+IndentExternBlock: AfterExternBlock
+IndentWidth:     4
+IndentWrappedFunctionNames: true
+InsertTrailingCommas: None
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Never
+ObjCBlockIndentWidth: 2
+ObjCBreakBeforeNestedBlockParam: true
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Right
+RawStringFormats:
+  - Language:        Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
+  - Language:        TextProto
+    Delimiters:
+      - pb
+      - PB
+      - proto
+      - PROTO
+    EnclosingFunctions:
+      - EqualsProto
+      - EquivToProto
+      - PARSE_PARTIAL_TEXT_PROTO
+      - PARSE_TEST_PROTO
+      - PARSE_TEXT_PROTO
+      - ParseTextOrDie
+      - ParseTextProtoOrDie
+      - ParseTestProto
+      - ParsePartialTestProto
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
+ReflowComments:  true
+SortIncludes:    false
+SortUsingDeclarations: false
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: true
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  false
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+Standard:        Auto
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth:        4
+UseCRLF:         false
+UseTab:          Never
+WhitespaceSensitiveMacros:
+  - STRINGIZE
+  - PP_STRINGIZE
+  - BOOST_PP_STRINGIZE
+...
+
diff --git a/meson.build b/meson.build
index ffa47711acf593deaf71140f7190cbe7e256e303..be1ed52d0df4322d6dc487e7d86dc3e3925625e9 100644
--- a/meson.build
+++ b/meson.build
@@ -14,6 +14,8 @@ build_args = [
 liblbm = ''
 if precision == 'FLT'
     liblbm = 'liblbm_f32.fut'
+elif precision == SHORT_FLT
+    liblbm = 'liblbm_f16.fut'
 else 
     liblbm = 'liblbm_f64.fut'
 endif
diff --git a/meson_options.txt b/meson_options.txt
index 69d03b9bb7051c605640f3a404088a1b4d5c0983..971c4f2b6509181ccac49cfad360cb552dcfe010 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -1,6 +1,6 @@
 option('futhark-backend', type: 'combo', choices: ['c', 'multicore', 'opencl', 'cuda'], value: 'c', description: 'Select the backend that Futhark code compiles to')
 option('lattice', type: 'combo', choices: ['D3Q27', 'D3Q19'], value: 'D3Q19', description: 'Select the lattice that the model uses')
-option('precision', type: 'combo', choices: ['DBL', 'FLT'], value: 'FLT', description: 'Select the numerical precision offloating point numbers that the model uses')
+option('precision', type: 'combo', choices: ['DBL', 'FLT', 'SHORT'], value: 'FLT', description: 'Select the numerical precision of floating point numbers that the model uses')
 option('cavity3d', type: 'feature', value: 'enabled', description: 'Enable the feature to compile the cavity3d case')
 option('kida', type: 'feature', value: 'enabled', description: 'Enable the feature to compile the Kida case')
 
diff --git a/src/c_interface/lbm_utils.c b/src/c_interface/lbm_utils.c
index 6c6f7030d8d02fec1c685e1db03cb10ddbb5d397..9ccaa26c4948c85ec27933079be69ae033ccc215 100644
--- a/src/c_interface/lbm_utils.c
+++ b/src/c_interface/lbm_utils.c
@@ -11,9 +11,9 @@ tensor_field create_distances(fint nx, fint ny, fint nz, fint q, triangle *t, fi
     }
     tensor_field distances = tensor_field_allocate_and_init(nx, ny, nz, q, ini);
     // TODO should go from 0 to nx/ny/nz but for debugging purposes we keep them
-    for (fint ix = 1; ix < nx-1; ++ix) {
-        for (fint iy = 1; iy < ny-1; ++iy) {
-            for (fint iz = 1; iz < nz-1; ++iz) {
+    for (fint ix = 1; ix < nx - 1; ++ix) {
+        for (fint iy = 1; iy < ny - 1; ++iy) {
+            for (fint iz = 1; iz < nz - 1; ++iz) {
                 for (fint iq = 0; iq < q; ++iq) {
                     T distance = (T)1;
                     point loc_point = point_create(ix, iy, iz);
@@ -26,7 +26,8 @@ tensor_field create_distances(fint nx, fint ny, fint nz, fint q, triangle *t, fi
                         if (point_dot(next_vec, triangle_compute_normal(t[it])) > 0.0) {
                             T dist_tmp = (T)0;
 
-                            bool intersection_tmp = triangle_compute_intersection_with_segment(t[it], loc_point, next_vec, &dist_tmp);
+                            bool intersection_tmp = triangle_compute_intersection_with_segment(
+                                t[it], loc_point, next_vec, &dist_tmp);
                             if (dist_tmp < distance && dist_tmp >= 0.0 && intersection_tmp) {
                                 distance = dist_tmp;
                                 intersection = intersection_tmp;
@@ -52,13 +53,14 @@ static T compute_feq(T w, const int c[], T rho, T u[]) {
     T c_u = c[0] * u[0];
     T u_sqr = u[0] * u[0];
     for (int id = 1; id < D3; ++id) {
-        c_u   += c[id] * u[id];
+        c_u += c[id] * u[id];
         u_sqr += u[id] * u[id];
     }
     return w * rho * (1.0 + 3.0 * c_u + 4.5 * c_u * c_u - 1.5 * u_sqr);
 }
 
-void lbm_allocate_and_ini_at_equilibrium_inplace(T rho, T u[], int nx, int ny, int nz, int q, tensor_field field) {
+void lbm_allocate_and_ini_at_equilibrium_inplace(
+    T rho, T u[], int nx, int ny, int nz, int q, tensor_field field) {
     assert(nx == field.nx && "Inconsistent nx size.");
     assert(ny == field.ny && "Inconsistent ny size.");
     assert(nz == field.nz && "Inconsistent nz size.");
@@ -83,27 +85,45 @@ void lbm_allocate_and_ini_at_equilibrium_kida_inplace(T rho0, T u_max, tensor_fi
         for (int iy = 0; iy < ny; ++iy) {
             for (int iz = 0; iz < nz; ++iz) {
                 T u[3];
-                    
-                T x = (T)2*pi*(T)ix / (T)(nx);
-                T y = (T)2*pi*(T)iy / (T)(ny);
-                T z = (T)2*pi*(T)iz / (T)(nz);
-                
-                u[0] = sin(x)*(cos((T)3*y) * cos(z) - cos(y)*cos((T)3*z));
-                u[1] = sin(y)*(cos((T)3*z) * cos(x) - cos(z)*cos((T)3*x));
-                u[2] = sin(z)*(cos((T)3*x) * cos(y) - cos(x)*cos((T)3*y));
+
+                T x = (T)2 * pi * (T)ix / (T)(nx);
+                T y = (T)2 * pi * (T)iy / (T)(ny);
+                T z = (T)2 * pi * (T)iz / (T)(nz);
+
+                u[0] = sin(x) * (cos((T)3 * y) * cos(z) - cos(y) * cos((T)3 * z));
+                u[1] = sin(y) * (cos((T)3 * z) * cos(x) - cos(z) * cos((T)3 * x));
+                u[2] = sin(z) * (cos((T)3 * x) * cos(y) - cos(x) * cos((T)3 * y));
 
                 for (fint id = 0; id < 3; ++id) {
                     u[id] *= u_max;
                 }
 
-                T rho = -(1.0/6.0)*cos(4*z)*cos(2*x)*cos(2*y)+(1.0/36.0)*cos(4*z)*cos(4*x)*cos(2*y)+(1.0/36.0)*cos(4*z)*cos(2*x)*cos(4*y);
-                rho += +cos(2*z)*cos(2*x)*cos(2*y)-(1.0/6.0)*cos(2*z)*cos(4*x)*cos(2*y)-(1.0/6.0)*cos(2*z)*cos(2*x)*cos(4*y);
-                rho += +(1.0/36.0)*cos(4*x)*cos(4*y)*cos(2*z)-(1.0/80.0)*cos(6*y)*cos(2*z)-(1.0/10.0)*cos(4*z)*cos(2*y)+(1.0/8.0)*cos(4*z)*cos(4*y);
-                rho += -(1.0/10.0)*cos(4*z)*cos(2*x)+(1.0/8.0)*cos(4*z)*cos(4*x)+(1.0/8.0)*cos(2*z)*cos(2*y)-(1.0/10.0)*cos(2*z)*cos(4*y);
-                rho += +(1.0/8.0)*cos(2*z)*cos(2*x)-(1.0/10.0)*cos(2*z)*cos(4*x)-(1.0/80.0)*cos(6*x)*cos(2*z)-(1.0/4.0)*cos(2*x);
-                rho += -(1.0/80.0)*cos(6*z)*cos(2*x)-(1.0/80.0)*cos(6*z)*cos(2*y)-(1.0/80.0)*cos(6*x)*cos(2*y)-(1.0/10.0)*cos(2*x)*cos(4*y);
-                rho += +(1.0/8.0)*cos(2*x)*cos(2*y)+(1.0/8.0)*cos(4*x)*cos(4*y)-(1.0/80.0)*cos(2*x)*cos(6*y)-(1.0/10.0)*cos(4*x)*cos(2*y);
-                rho += -(1.0/4.0)*cos(2*z)-(1.0/4.0)*cos(2*y);
+                T rho = -(1.0 / 6.0) * cos(4 * z) * cos(2 * x) * cos(2 * y)
+                        + (1.0 / 36.0) * cos(4 * z) * cos(4 * x) * cos(2 * y)
+                        + (1.0 / 36.0) * cos(4 * z) * cos(2 * x) * cos(4 * y);
+                rho += +cos(2 * z) * cos(2 * x) * cos(2 * y)
+                       - (1.0 / 6.0) * cos(2 * z) * cos(4 * x) * cos(2 * y)
+                       - (1.0 / 6.0) * cos(2 * z) * cos(2 * x) * cos(4 * y);
+                rho += +(1.0 / 36.0) * cos(4 * x) * cos(4 * y) * cos(2 * z)
+                       - (1.0 / 80.0) * cos(6 * y) * cos(2 * z)
+                       - (1.0 / 10.0) * cos(4 * z) * cos(2 * y)
+                       + (1.0 / 8.0) * cos(4 * z) * cos(4 * y);
+                rho += -(1.0 / 10.0) * cos(4 * z) * cos(2 * x)
+                       + (1.0 / 8.0) * cos(4 * z) * cos(4 * x)
+                       + (1.0 / 8.0) * cos(2 * z) * cos(2 * y)
+                       - (1.0 / 10.0) * cos(2 * z) * cos(4 * y);
+                rho += +(1.0 / 8.0) * cos(2 * z) * cos(2 * x)
+                       - (1.0 / 10.0) * cos(2 * z) * cos(4 * x)
+                       - (1.0 / 80.0) * cos(6 * x) * cos(2 * z) - (1.0 / 4.0) * cos(2 * x);
+                rho += -(1.0 / 80.0) * cos(6 * z) * cos(2 * x)
+                       - (1.0 / 80.0) * cos(6 * z) * cos(2 * y)
+                       - (1.0 / 80.0) * cos(6 * x) * cos(2 * y)
+                       - (1.0 / 10.0) * cos(2 * x) * cos(4 * y);
+                rho += +(1.0 / 8.0) * cos(2 * x) * cos(2 * y)
+                       + (1.0 / 8.0) * cos(4 * x) * cos(4 * y)
+                       - (1.0 / 80.0) * cos(2 * x) * cos(6 * y)
+                       - (1.0 / 10.0) * cos(4 * x) * cos(2 * y);
+                rho += -(1.0 / 4.0) * cos(2 * z) - (1.0 / 4.0) * cos(2 * y);
 
                 rho *= -u_max * u_max;
                 rho *= (T)3.0;
@@ -131,12 +151,11 @@ void lbm_allocate_and_ini_at_equilibrium_cavity_inplace(T rho0, T u_max, tensor_
             for (int iz = 0; iz < nz; ++iz) {
                 T u[3] = {0.0, 0.0, 0.0};
                 if (iz == nz - 1) {
-                    T x_vel = 1-pow(x/h, 18);
-                    T y_vel = 1-pow(y/h, 18);
+                    T x_vel = 1 - pow(x / h, 18);
+                    T y_vel = 1 - pow(y / h, 18);
                     u[0] = u_max * x_vel * x_vel * y_vel * y_vel;
                 }
 
-
                 for (int ipop = 0; ipop < field.q; ++ipop) {
                     field.grid[ix][iy][iz][ipop] = compute_feq(w[ipop], c_t[ipop], rho0, u);
                 }
@@ -145,28 +164,31 @@ void lbm_allocate_and_ini_at_equilibrium_cavity_inplace(T rho0, T u_max, tensor_
     }
 }
 
-futhark_4d *lbm_collide_and_stream(struct futhark_context *ctx,  futhark_4d **f, futhark_4d *vel, futhark_4d *d, T omega, fint iter) {
+futhark_4d *lbm_collide_and_stream(
+    struct futhark_context *ctx, futhark_4d **f, futhark_4d *vel, futhark_4d *d, T omega,
+    fint iter) {
     futhark_4d *fout = NULL;
     // futhark_entry_main(ctx, &fout, *f, vel, d, omega, iter);
 #ifdef D3Q19
-    futhark_entry_main_d3q19_reg(ctx, &fout, *f, vel, d, omega, iter);
+    futhark_entry_main_d3q19_bgk(ctx, &fout, *f, vel, d, omega, iter);
 #endif
 #ifdef D3Q27
-    futhark_entry_main_d3q27_reg(ctx, &fout, *f, vel, d, omega, iter);
+    futhark_entry_main_d3q27_bgk(ctx, &fout, *f, vel, d, omega, iter);
 #endif
 
-
     assert(NULL != fout);
     futhark_free_4d(ctx, *f);
     *f = NULL;
-    
+
     return fout;
 }
 
 // TODO: add possibility to have rho/u form futhark directly
-// futhark_4d *lbm_collide_and_stream_rho_u(struct futhark_context *ctx,  futhark_4d **f, struct futhark_f32_3d **rho, futhark_4d **vel, T omega, fint iter) {
+// futhark_4d *lbm_collide_and_stream_rho_u(struct futhark_context *ctx,  futhark_4d **f, struct
+// futhark_f32_3d **rho, futhark_4d **vel, T omega, fint iter) {
 //     futhark_4d *fout = NULL;
-//     int ret = futhark_entry_rho_u_main(ctx, &fout, rho, vel, *f, omega, iter); // this produces a waring (should ask what to do with that)
+//     int ret = futhark_entry_rho_u_main(ctx, &fout, rho, vel, *f, omega, iter); // this produces a
+//     waring (should ask what to do with that)
 
 //     futhark_free_4d(ctx, *f);
 //     *f = NULL;
@@ -174,10 +196,11 @@ futhark_4d *lbm_collide_and_stream(struct futhark_context *ctx,  futhark_4d **f,
 //     return fout;
 // }
 
-
 void lbm_compute_density_inplace(tensor_field lattice, scalar_field density) {
-    assert(tensor_field_consistant_size_scalar_field(lattice, density) && "Sizes of rho and lattice are inconsistent.");
-    
+    assert(
+        tensor_field_consistant_size_scalar_field(lattice, density)
+        && "Sizes of rho and lattice are inconsistent.");
+
     for (int ix = 0; ix < lattice.nx; ++ix) {
         for (int iy = 0; iy < lattice.ny; ++iy) {
             for (int iz = 0; iz < lattice.nz; ++iz) {
@@ -192,7 +215,9 @@ void lbm_compute_density_inplace(tensor_field lattice, scalar_field density) {
 }
 
 void lbm_compute_velocity_inplace(tensor_field lattice, tensor_field velocity) {
-    assert(tensor_field_consistant_spatial_size_tensor_field(lattice, velocity) && "Sizes of velocity and lattice are inconsistent.");
+    assert(
+        tensor_field_consistant_spatial_size_tensor_field(lattice, velocity)
+        && "Sizes of velocity and lattice are inconsistent.");
 
     for (int ix = 0; ix < lattice.nx; ++ix) {
         for (int iy = 0; iy < lattice.ny; ++iy) {
@@ -204,7 +229,8 @@ void lbm_compute_velocity_inplace(tensor_field lattice, tensor_field velocity) {
                 for (int id = 0; id < lattice.dim; ++id) {
                     velocity.grid[ix][iy][iz][id] = (T)0;
                     for (int ipop = 0; ipop < lattice.q; ++ipop) {
-                        velocity.grid[ix][iy][iz][id] += lattice.grid[ix][iy][iz][ipop] * c_t[ipop][id];
+                        velocity.grid[ix][iy][iz][id] +=
+                            lattice.grid[ix][iy][iz][ipop] * c_t[ipop][id];
                     }
                     velocity.grid[ix][iy][iz][id] /= rho;
                 }
@@ -221,19 +247,21 @@ tensor_field lbm_allocate_and_ini_at_equilibrium(T rho, T u[], int nx, int ny, i
     return lattice;
 }
 
-tensor_field lbm_allocate_and_ini_at_equilibrium_kida(T rho0, T u_max, int nx, int ny, int nz, int q) {
+tensor_field lbm_allocate_and_ini_at_equilibrium_kida(
+    T rho0, T u_max, int nx, int ny, int nz, int q) {
     tensor_field lattice = tensor_field_allocate(nx, ny, nz, q);
 
     lbm_allocate_and_ini_at_equilibrium_kida_inplace(rho0, u_max, lattice);
-    
+
     return lattice;
 }
 
-tensor_field lbm_allocate_and_ini_at_equilibrium_cavity(T rho0, T u_max, int nx, int ny, int nz, int q) {
+tensor_field lbm_allocate_and_ini_at_equilibrium_cavity(
+    T rho0, T u_max, int nx, int ny, int nz, int q) {
     tensor_field lattice = tensor_field_allocate(nx, ny, nz, q);
 
     lbm_allocate_and_ini_at_equilibrium_cavity_inplace(rho0, u_max, lattice);
-    
+
     return lattice;
 }
 
@@ -242,7 +270,7 @@ scalar_field lbm_compute_density(tensor_field lattice) {
 
     lbm_compute_density_inplace(lattice, density);
 
-    return density; 
+    return density;
 }
 
 tensor_field lbm_compute_velocity(tensor_field lattice) {
@@ -250,5 +278,5 @@ tensor_field lbm_compute_velocity(tensor_field lattice) {
 
     lbm_compute_velocity_inplace(lattice, vel);
 
-    return vel;   
+    return vel;
 }
diff --git a/src/globals.fut b/src/globals.fut
index 72e6fadf9f8355eb33999c3739a2b7d2475c46fa..7e9bc82a165ac94480a15d4dd125e8116a3bb8ec 100644
--- a/src/globals.fut
+++ b/src/globals.fut
@@ -1,3 +1,4 @@
+import "globals_f16"
 import "globals_f32"
 import "globals_f64"
 
diff --git a/src/globals_f16.fut b/src/globals_f16.fut
new file mode 100644
index 0000000000000000000000000000000000000000..152cfe75ed75d977088cd086433c5ac95a2e96ca
--- /dev/null
+++ b/src/globals_f16.fut
@@ -0,0 +1,62 @@
+import "lib/github.com/athas/vector/vector"
+
+import "vectors"
+import "lattice"
+import "descriptor"
+import "dynamics"
+import "bc"
+import "lbm"
+
+-- d3q19
+
+module d3q19_f16 = mk_desc vec3' vec19' d3q19_lattice f16
+
+-- dynamics
+
+module bgk_d3q19_f16 = mk_bgk f16 d3q19_f16
+
+module reg_d3q19_f16 = mk_reg f16 d3q19_f16
+
+-- boundary conditions
+
+module bfl_d3q19_f16 = mk_bfl_bc f16 d3q19_f16 --bfl et al.
+
+module periodic_d3q19_f16 = mk_no_bc f16 d3q19_f16
+
+-- model
+
+module d3q19_bgk_bfl_f16 = mk_lbm f16 d3q19_f16 bgk_d3q19_f16 bfl_d3q19_f16
+
+module d3q19_bgk_periodic_f16 = mk_lbm f16 d3q19_f16 bgk_d3q19_f16 periodic_d3q19_f16
+
+module d3q19_reg_bfl_f16 = mk_lbm f16 d3q19_f16 reg_d3q19_f16 bfl_d3q19_f16
+
+module d3q19_reg_periodic_f16 = mk_lbm f16 d3q19_f16 reg_d3q19_f16 periodic_d3q19_f16
+
+
+-- d3q27
+
+module d3q27_f16 = mk_desc vec3' vec27' d3q27_lattice f16
+
+-- dynamics
+
+module bgk_d3q27_f16 = mk_bgk f16 d3q27_f16
+
+module reg_d3q27_f16 = mk_reg f16 d3q27_f16
+
+-- boundary conditions
+
+module bfl_d3q27_f16 = mk_bfl_bc f16 d3q27_f16 --bfl et al.
+
+module periodic_d3q27_f16 = mk_no_bc f16 d3q27_f16
+
+-- model
+
+module d3q27_bgk_bfl_f16 = mk_lbm f16 d3q27_f16 bgk_d3q27_f16 bfl_d3q27_f16
+
+module d3q27_bgk_periodic_f16 = mk_lbm f16 d3q27_f16 bgk_d3q27_f16 periodic_d3q27_f16
+
+module d3q27_reg_bfl_f16 = mk_lbm f16 d3q27_f16 reg_d3q27_f16 bfl_d3q27_f16
+
+module d3q27_reg_periodic_f16 = mk_lbm f16 d3q27_f16 reg_d3q27_f16 periodic_d3q27_f16
+
diff --git a/src/lbm.fut b/src/lbm.fut
index 43c54271b34b0f47fd04d6ad851bb6d0a9a8b37d..684a38058496b58b263bb4fbb456c5d205a8028e 100644
--- a/src/lbm.fut
+++ b/src/lbm.fut
@@ -10,6 +10,9 @@ module type lbm = {
     type vec_q 'a
     type vec_d 'a
 
+    val q: i64
+    val dim: i64
+
     val get_q: i64 -> vec_q real -> real
     val get_d: i64 -> vec_d real -> real
     val to_ary_q [q] : vec_q real -> [q]real
@@ -18,10 +21,10 @@ module type lbm = {
     val stream [nx][ny][nz]: [nx][ny][nz](vec_q real) -> [nx][ny][nz](vec_q real)
     val collide [nx][ny][nz]: [nx][ny][nz](vec_q real) -> real -> [nx][ny][nz](vec_q real)
     val stream_and_collide [nx][ny][nz]: [nx][ny][nz](vec_d real) -> [nx][ny][nz](vec_q real) -> [nx][ny][nz](vec_q real) -> real -> [nx][ny][nz](vec_q real)
-    val from_array_q [nx][ny][nz][q]: [nx][ny][nz][q]real -> [nx][ny][nz](vec_q real)
-    val to_array_q [nx][ny][nz][q]: [nx][ny][nz](vec_q real) -> [nx][ny][nz][q]real
-    val from_array_d [nx][ny][nz][d]: [nx][ny][nz][d]real -> [nx][ny][nz](vec_d real)
-    val to_array_d [nx][ny][nz][d]: [nx][ny][nz](vec_d real) -> [nx][ny][nz][d]real
+    val from_array_q [nx][ny][nz]: [nx][ny][nz][q]real -> [nx][ny][nz](vec_q real)
+    val to_array_q [nx][ny][nz]: [nx][ny][nz](vec_q real) -> [nx][ny][nz][q]real
+    val from_array_d [nx][ny][nz]: [nx][ny][nz][dim]real -> [nx][ny][nz](vec_d real)
+    val to_array_d [nx][ny][nz]: [nx][ny][nz](vec_d real) -> [nx][ny][nz][dim]real
 }
 
 
@@ -39,6 +42,9 @@ module mk_lbm (real: real) (d: descriptor with real = real.t) (dyn: dynamics wit
     type vec_q 'a = d.vec_q a
     type vec_d 'a = d.vec_d a
 
+    let q = d.q
+    let dim = d.d
+
     def get_q (i: i64) (v: d.vec_q real) = d.get_q i v
     def get_d (i: i64) (v: d.vec_d real) = d.get_d i v
 
@@ -76,23 +82,20 @@ module mk_lbm (real: real) (d: descriptor with real = real.t) (dyn: dynamics wit
 
     def stream [nx][ny][nz] (f: [nx][ny][nz](vec_q real)) : [nx][ny][nz](vec_q real) =
         -- f
-        let n = nx * ny * nz
-        let op (i: i64) : [n]real =
+        let op (i: i64) : []real =
             let f' = (map_3d (d.get_q i) f) :> [nx][ny][nz]real
             in ((flatten_3d ( 
                 if i == 0 then f'
                 else stream_ci f' (d.get_q i d.c)
-            )) :> [n]real)
+            )) :> [nx*ny*nz]real)
 
         in (
-                (
-                    unflatten_3d nx ny nz (
-                        d.vzip_q 
-                            (
-                                d.map_q (\i -> op i) (d.iota_q :> (d.vec_q i64))
-                            ) 
-                    ) 
-                ) :> [nx][ny][nz](vec_q real)
+                unflatten_3d (
+                    d.vzip_q 
+                        (
+                            d.map_q (\i -> op i) (d.iota_q :> (d.vec_q i64))
+                        ) 
+                ) 
             )
 
     def collide [nx][ny][nz] (f: [nx][ny][nz](d.vec_q real)) (omega: real): [nx][ny][nz](vec_q real) = 
@@ -102,16 +105,16 @@ module mk_lbm (real: real) (d: descriptor with real = real.t) (dyn: dynamics wit
         -- stream (bc.apply distances (collide f omega))
         collide (bc.apply velocity distances (stream f)) omega
 
-    def from_array_q [nx][ny][nz][q] (f: [nx][ny][nz][q]real): [nx][ny][nz](vec_q real) =
+    def from_array_q [nx][ny][nz] (f: [nx][ny][nz][q]real): [nx][ny][nz](vec_q real) =
         map_3d (\fi -> d.from_array_q (fi :> [q]real)) f
         
-    def to_array_q [nx][ny][nz][q] (f: [nx][ny][nz](d.vec_q real)): [nx][ny][nz][q]real =
+    def to_array_q [nx][ny][nz] (f: [nx][ny][nz](d.vec_q real)): [nx][ny][nz][q]real =
         map_3d (\fi -> ((to_ary_q (fi) :[q]real ) :> [q]real)) f
 
-    def from_array_d [nx][ny][nz][dim] (f: [nx][ny][nz][dim]real): [nx][ny][nz](vec_d real) =
+    def from_array_d [nx][ny][nz] (f: [nx][ny][nz][dim]real): [nx][ny][nz](vec_d real) =
         map_3d (\fi -> d.from_array_d (fi : [dim]real)) f
 
-    def to_array_d [nx][ny][nz][dim] (f: [nx][ny][nz](vec_d real)): [nx][ny][nz][dim]real =
+    def to_array_d [nx][ny][nz] (f: [nx][ny][nz](vec_d real)): [nx][ny][nz][dim]real =
         map_3d (\fi -> ((d.to_array_d fi) : [dim]real)) f
 
 }
diff --git a/src/liblbm_f16.fut b/src/liblbm_f16.fut
new file mode 100644
index 0000000000000000000000000000000000000000..102a98e60eddaccae013b9f202e5c093ae347938
--- /dev/null
+++ b/src/liblbm_f16.fut
@@ -0,0 +1,29 @@
+-- Program for Lattice-Boltzmann in Futhark
+import "globals_f16"
+import "sim"
+import "polygonise"
+
+module sim_d3q27_reg_bfl_f16 = mk_sim d3q27_reg_bfl_f16
+
+entry main_d3q27_reg = sim_d3q27_reg_bfl_f16.run
+
+module sim_d3q19_reg_bfl_f16 = mk_sim d3q19_reg_bfl_f16
+
+entry main_d3q19_reg = sim_d3q19_reg_bfl_f16.run
+
+entry main_mcubes_polygonise = mcubes_polygonise_f16
+
+entry main_mtetras_polygonise = mtetras_polygonise_f16
+
+-- Test the sub1 function.
+-- ==
+-- input { 21i64 21i64 21i64 27i64 3i64 1.5f16 1000i64 } 
+-- auto output
+let main (nx: i64) (ny: i64) (nz: i64) (q: i64) (d: i64) (omega: f16) (max_t: i64): [nx][ny][nz][q]f16 = 
+    let fin = replicate nx (replicate ny (replicate nz (replicate q 0f16)))
+    let dist = replicate nx (replicate ny (replicate nz (replicate q (-1.0f16))))
+    let u = replicate nx (replicate ny (replicate nz (replicate d 0f16)))
+
+    in sim_d3q27_reg_bfl_f16.run fin u dist omega max_t
+
+
diff --git a/src/liblbm_f32.fut b/src/liblbm_f32.fut
index fbefef4ac283aa8a577ac458ff1a3d8afffc2348..54e46606c694d9e5e907afb90ba2c91773e9a0e9 100644
--- a/src/liblbm_f32.fut
+++ b/src/liblbm_f32.fut
@@ -3,59 +3,51 @@ import "globals_f32"
 import "sim"
 import "polygonise"
 
+-- D3Q27 - REG
+
 module sim_d3q27_reg_bfl_f32 = mk_sim d3q27_reg_bfl_f32
 
 entry main_d3q27_reg = sim_d3q27_reg_bfl_f32.run
 
-module sim_d3q19_reg_periodic_f32 = mk_sim d3q19_reg_periodic_f32
+module sim_d3q27_reg_periodic_f32 = mk_sim d3q27_reg_periodic_f32
+
+entry main_d3q27_reg_periodic = sim_d3q27_reg_periodic_f32.run
+
+-- D3Q19 - REG
+
+module sim_d3q19_reg_bfl_f32 = mk_sim d3q19_reg_bfl_f32
+
+entry main_d3q19_reg = sim_d3q19_reg_bfl_f32.run
+
+module sim_d3q19_reg_periodic_f32 = mk_sim d3q19_reg_periodic_f32 
+
+entry main_d3q19_reg_periodic_f32 = sim_d3q19_reg_periodic_f32.run
+
+-- D3Q27 - BGK
+
+module sim_d3q27_bgk_bfl_f32 = mk_sim d3q27_bgk_bfl_f32
+
+entry main_d3q27_bgk = sim_d3q27_bgk_bfl_f32.run
+
+module sim_d3q27_bgk_periodic_f32 = mk_sim d3q27_bgk_periodic_f32
+
+entry main_d3q27_bgk_periodic = sim_d3q27_bgk_periodic_f32.run
+
+-- D3Q19 - BGK
+
+module sim_d3q19_bgk_bfl_f32 = mk_sim d3q19_bgk_bfl_f32
+
+entry main_d3q19_bgk = sim_d3q19_bgk_bfl_f32.run
+
+module sim_d3q19_bgk_periodic_f32 = mk_sim d3q19_bgk_periodic_f32 
+
+entry main_d3q19_bgk_periodic_f32 = sim_d3q19_bgk_periodic_f32.run
 
-entry main_d3q19_reg = sim_d3q19_reg_periodic_f32.run
+-- Polygonise
 
 entry main_mcubes_polygonise = mcubes_polygonise_f32
 
 entry main_mtetras_polygonise = mtetras_polygonise_f32
 
--- Test the sub1 function.
--- ==
--- input { 21i64 21i64 21i64 27i64 3i64 1.5f32 1000i64 } 
--- auto output
-let main (nx: i64) (ny: i64) (nz: i64) (q: i64) (d: i64) (omega: f32) (max_t: i64): [nx][ny][nz][q]f32 = 
-    let fin = replicate nx (replicate ny (replicate nz (replicate q 0f32)))
-    let dist = replicate nx (replicate ny (replicate nz (replicate q (-1.0f32))))
-    let u = replicate nx (replicate ny (replicate nz (replicate d 0f32)))
-
-    in sim_d3q27_reg_bfl_f32.run fin u dist omega max_t
-
-
-
-
--- -- executes Lattice-Boltzmann it times and
--- let lattice_boltzmann [nx][ny][nz] (fin: [nx][ny][nz](lbm_model.vec_q real)) (vel: [nx][ny][nz](lbm_model.vec_d real)) (dists: [nx][ny][nz](lbm_model.vec_q real)) (omega: real) (max_t: i64):  [nx][ny][nz](lbm_model.vec_q real) =
---     let fini = lbm_model.collide fin omega
--- 	let f = loop f = fini for _i < (max_t-1) do
---         lbm_model.stream_and_collide vel dists f omega
---     let fout = lbm_model.bc_apply vel dists (lbm_model.stream f)
--- 	in fout
-
--- entry main [nx][ny][nz][d][q] (fin: [nx][ny][nz][q]real) (vel: [nx][ny][nz][d]real) (dists: [nx][ny][nz][q]real) (omega: real) (max_t: i64): [nx][ny][nz][q]real =
---     let fin_f = lbm_model.from_array_q fin
---     let vel = lbm_model.from_array_d vel
---     let dist = lbm_model.from_array_q dists
---     let f = lattice_boltzmann fin_f vel dist omega max_t
---     in lbm_model.to_array_q f
-
--- TODO add version where rho/u a outputs
--- entry rho_main [nx][ny][nz] (fin: [nx][ny][nz][19]real) (omega: real) (max_t: i64): ([nx][ny][nz][19]f32, [nx][ny][nz]f32) = 
---     let fin_f = lbm_model.from_array_q fin
---     let f = lattice_boltzmann fin_f omega max_t
---     let rho = compute_moment f bgk_d3q19.compute_rho
---     in (lbm_model.to_array_q f, rho)
-
--- entry rho_u_main [nx][ny][nz] (fin: [nx][ny][nz][19]f32) (omega: real) (max_t: i64): ([nx][ny][nz][19]f32, [nx][ny][nz]f32, [nx][ny][nz][3]f32) = 
---     let fin_f = lbm_model.from_array_q fin
---     let f = lattice_boltzmann fin_f omega max_t
---     let rho = compute_moment f bgk_d3q19.compute_rho
---     let u   = compute_moment f bgk_d3q19.compute_vel
---     in (lbm_model.to_array_q f, rho, (lbm_model.to_array_d u))
 
 
diff --git a/src/liblbm_f64.fut b/src/liblbm_f64.fut
index 44067505766feee6dd270a116a4acdd5b135f618..3ddf182cb347a53940291e57077533d0c8b462d0 100644
--- a/src/liblbm_f64.fut
+++ b/src/liblbm_f64.fut
@@ -16,43 +16,3 @@ entry main_mcubes_polygonise = mcubes_polygonise_f64
 entry main_mtetras_polygonise = mtetras_polygonise_f64
 
 
--- -- executes Lattice-Boltzmann it times and
--- let lattice_boltzmann [nx][ny][nz] (fin: [nx][ny][nz](lbm_model.vec_q real)) (vel: [nx][ny][nz](lbm_model.vec_d real)) (dists: [nx][ny][nz](lbm_model.vec_q real)) (omega: real) (max_t: i64):  [nx][ny][nz](lbm_model.vec_q real) =
---     let fini = lbm_model.collide fin omega
--- 	let f = loop f = fini for _i < (max_t-1) do
---         lbm_model.stream_and_collide vel dists f omega
---     let fout = lbm_model.bc_apply vel dists (lbm_model.stream f)
--- 	in fout
-
--- entry main [nx][ny][nz][d][q] (fin: [nx][ny][nz][q]real) (vel: [nx][ny][nz][d]real) (dists: [nx][ny][nz][q]real) (omega: real) (max_t: i64): [nx][ny][nz][q]real =
---     let fin_f = lbm_model.from_array_q fin
---     let vel = lbm_model.from_array_d vel
---     let dist = lbm_model.from_array_q dists
---     let f = lattice_boltzmann fin_f vel dist omega max_t
---     in lbm_model.to_array_q f
-
--- TODO add version where rho/u a outputs
--- entry rho_main [nx][ny][nz] (fin: [nx][ny][nz][19]real) (omega: real) (max_t: i64): ([nx][ny][nz][19]f32, [nx][ny][nz]f32) = 
---     let fin_f = lbm_model.from_array_q fin
---     let f = lattice_boltzmann fin_f omega max_t
---     let rho = compute_moment f bgk_d3q19.compute_rho
---     in (lbm_model.to_array_q f, rho)
-
--- entry rho_u_main [nx][ny][nz] (fin: [nx][ny][nz][19]f32) (omega: real) (max_t: i64): ([nx][ny][nz][19]f32, [nx][ny][nz]f32, [nx][ny][nz][3]f32) = 
---     let fin_f = lbm_model.from_array_q fin
---     let f = lattice_boltzmann fin_f omega max_t
---     let rho = compute_moment f bgk_d3q19.compute_rho
---     let u   = compute_moment f bgk_d3q19.compute_vel
---     in (lbm_model.to_array_q f, rho, (lbm_model.to_array_d u))
-
-
--- Testing main
--- ==
--- entry: test_main
--- compiled input { 213 123 0.8 1 }
--- auto output
-
--- entry test_main (nx: i64)(ny: i64)(tau: real)(it: i64): [9][]real =
--- 	let f_ini = init_f_in(nx)(ny)
--- 	let f = lattice_boltzmann(nx)(ny)(f_ini)(tau)(it)
---     in from_tuple_to_array(f)
diff --git a/src/polygonise.fut b/src/polygonise.fut
index f3416aae4bbbc8eafe3ba314adb2bfdc433ca4f1..cc9237881c0657a7d7f443f58f9ebd39fdf58705 100644
--- a/src/polygonise.fut
+++ b/src/polygonise.fut
@@ -20,6 +20,23 @@ let mcubes_polygonise_f32 [nx][ny][nz] (rho: [nx][ny][nz]f32) (isolevel: f32) =
             map(\txi -> f32.f64 txi) tx
         ) tri ) 
 
+
+let mcubes_polygonise_f16 [nx][ny][nz] (rho: [nx][ny][nz]f16) (isolevel: f16) =
+    let rho' = 
+        map (\rx -> 
+            map (\rxy -> 
+                map (\rxyz -> 
+                    f64.f16 rxyz
+                ) rxy
+            ) rx 
+        ) rho
+    let tri = map (mcubes.triangle_to_array) (mcubes.polygonise_field rho' (f64.f16 isolevel))
+    let si = length tri
+    in 
+        (si , map (\tx -> 
+            map(\txi -> f16.f64 txi) tx
+        ) tri ) 
+
 let mtetras_polygonise_f64 [nx][ny][nz] (rho: [nx][ny][nz]f64) (isolevel: f64) =
     map (mtetras.triangle_to_array) (mtetras.polygonise_field rho isolevel)
 
@@ -37,4 +54,20 @@ let mtetras_polygonise_f32 [nx][ny][nz] (rho: [nx][ny][nz]f32) (isolevel: f32) =
     in 
         (si , map (\tx -> 
             map(\txi -> f32.f64 txi) tx
+        ) tri ) 
+
+let mtetras_polygonise_f16 [nx][ny][nz] (rho: [nx][ny][nz]f16) (isolevel: f16) =
+    let rho' = 
+        map (\rx -> 
+            map (\rxy -> 
+                map (\rxyz -> 
+                    f64.f16 rxyz
+                ) rxy
+            ) rx 
+        ) rho
+    let tri = map (mtetras.triangle_to_array) (mtetras.polygonise_field rho' (f64.f16 isolevel))
+    let si = length tri
+    in 
+        (si , map (\tx -> 
+            map(\txi -> f16.f64 txi) tx
         ) tri ) 
\ No newline at end of file
diff --git a/src/sim.fut b/src/sim.fut
index 400970191d5b9bfa12f83e7d703ec259833978e0..0e20f509a18adf05869b1d78522e1e8c74cf6946 100644
--- a/src/sim.fut
+++ b/src/sim.fut
@@ -7,24 +7,32 @@ module type sim = {
     type vec_q 'a
     type vec_d 'a
 
-    val from_array_q [nx][ny][nz][q]: [nx][ny][nz][q]real -> [nx][ny][nz](vec_q real)
-    val to_array_q [nx][ny][nz][q]: [nx][ny][nz](vec_q real) -> [nx][ny][nz][q]real
-    val from_array_d [nx][ny][nz][d]: [nx][ny][nz][d]real -> [nx][ny][nz](vec_d real)
-    val to_array_d [nx][ny][nz][d]: [nx][ny][nz](vec_d real) -> [nx][ny][nz][d]real
+    type ary_q 'a
+    type ary_d 'a
+
+    val from_array_q [nx][ny][nz]: [nx][ny][nz](ary_q real) -> [nx][ny][nz](vec_q real)
+    val to_array_q [nx][ny][nz]: [nx][ny][nz](vec_q real) -> [nx][ny][nz](ary_q real)
+    val from_array_d [nx][ny][nz]: [nx][ny][nz](ary_d real) -> [nx][ny][nz](vec_d real)
+    val to_array_d [nx][ny][nz]: [nx][ny][nz](vec_d real) -> [nx][ny][nz](ary_d real)
 
     val timesteps [nx][ny][nz]: [nx][ny][nz](vec_q real) -> ([nx][ny][nz](vec_d real)) -> ([nx][ny][nz](vec_q real)) -> real -> i64 -> ([nx][ny][nz](vec_q real))
-    val run [nx][ny][nz][d][q]: [nx][ny][nz][q]real -> [nx][ny][nz][d]real -> [nx][ny][nz][q]real -> real -> i64 -> [nx][ny][nz][q]real
+    val run [nx][ny][nz]: [nx][ny][nz](ary_q real) -> [nx][ny][nz](ary_d real) -> [nx][ny][nz](ary_q real) -> real -> i64 -> [nx][ny][nz](ary_q real)
 }
 
 module mk_sim (l: lbm) : (sim with real = l.real
                               with vec_q 'a = l.vec_q a
-                              with vec_d 'a = l.vec_d a) = 
+                              with vec_d 'a = l.vec_d a
+                              with ary_q 'a = [l.q]a
+                              with ary_d 'a = [l.dim]a) = 
 {
     type real = l.real
 
     type vec_q 'a = l.vec_q a
     type vec_d 'a = l.vec_d a
 
+    type ary_q 'a = [l.q]a
+    type ary_d 'a = [l.dim]a
+
     let to_array_q = l.to_array_q
     let from_array_q = l.from_array_q
 
@@ -38,11 +46,11 @@ module mk_sim (l: lbm) : (sim with real = l.real
         let fout = l.bc_apply vel dists (l.stream f)
         in fout
 
-    let run [nx][ny][nz][d][q] (fin: [nx][ny][nz][q]real) (vel: [nx][ny][nz][d]real) (dists: [nx][ny][nz][q]real) (omega: real) (max_t: i64): [nx][ny][nz][q]real =
-        let fin_f = l.from_array_q fin
-        let vel = l.from_array_d vel
-        let dist = l.from_array_q dists
+    let run [nx][ny][nz] (fin: [nx][ny][nz](ary_q real)) (vel: [nx][ny][nz][l.dim]real) (dists: [nx][ny][nz](ary_q real)) (omega: real) (max_t: i64): [nx][ny][nz](ary_q real) =
+        let fin_f = (l.from_array_q (fin :> [nx][ny][nz](ary_q real)) ) :> [nx][ny][nz](vec_q real)
+        let vel = l.from_array_d (vel :> [nx][ny][nz][l.dim]real) 
+        let dist = l.from_array_q (dists :> [nx][ny][nz](ary_q real))
         let f = timesteps fin_f vel dist omega max_t
-        in l.to_array_q f
+        in (l.to_array_q f) :> [nx][ny][nz][l.q]real
 
 }
\ No newline at end of file
diff --git a/src/tests/run_test.sh b/src/tests/run_test.sh
index fc9d745a49a5a1a917b43a7dade6015bf453dc79..4818ec3131bf816720141257f13f7d4f70073ba0 100755
--- a/src/tests/run_test.sh
+++ b/src/tests/run_test.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 
+echo "running tests in ${MESON_BUILD_ROOT}src/tests"
 cd "${MESON_BUILD_ROOT}src/tests"
 pwd
 echo "" | ./liblbm_tests
diff --git a/src/tests/tests.fut b/src/tests/tests.fut
index b66713d391e2d94b1ebd00e47ac888c3cf2e1721..23b8e1339cf4f33ea1bf3b049cc8fb40fd99466b 100644
--- a/src/tests/tests.fut
+++ b/src/tests/tests.fut
@@ -1,3 +1,4 @@
+import "../globals_f16"
 import "../globals_f32"
 import "../globals_f64"
 
@@ -21,9 +22,11 @@ local module testing (real: real)= {
 local let all_true (a: []bool) : bool = 
     all (\ai -> ai) a
 
+local let e16 = 1.0e-3f16
 local let e32 = 1.0e-5f32
 local let e64 = 1.0e-10f64
 
+local module t16 = testing f16
 local module t32 = testing f32
 local module t64 = testing f64
 
@@ -36,18 +39,31 @@ local module t64 = testing f64
 -- entry: test_d3q19_w
 -- compiled nobench input { } 
 -- output { true true }
-
 let test_d3q19_w : []bool =
-    [(d3q19_f32.reduce_q (+) 0f32 d3q19_f32.w) == 1f32,
-     (d3q19_f64.reduce_q (+) 0f64 d3q19_f64.w) == 1f64]
+    [
+        t16.approx_eq [d3q19_f16.reduce_q (+) 0f16 d3q19_f16.w] [1f16] e16,
+        (d3q19_f32.reduce_q (+) 0f32 d3q19_f32.w) == 1f32,
+        (d3q19_f64.reduce_q (+) 0f64 d3q19_f64.w) == 1f64
+    ]
 
 -- Test the descriptors ci's.
 -- ==
 -- entry: test_d3q19_c
 -- compiled nobench input { } 
--- output { true true }
-
+-- output { true true true }
 let test_d3q19_c : []bool =
+    let zero_f16  = replicate (d3q19_f16.d) 0f16
+    let sum_c_f16 =
+        d3q19_f16.to_array_d (
+            d3q19_f16.map_d (\ca -> 
+                d3q19_f16.reduce_q (+) 0f16 (
+                    d3q19_f16.map2_q(\wi cai ->
+                        wi * cai
+                    ) d3q19_f16.w ca
+                )
+            ) d3q19_f16.c_t_num
+        )
+
     let zero_f32  = replicate (d3q19_f32.d) 0f32
     let sum_c_f32 =
         d3q19_f32.to_array_d (
@@ -60,6 +76,7 @@ let test_d3q19_c : []bool =
             ) d3q19_f32.c_t_num
         )
 
+
     let zero_f64  = replicate (d3q19_f64.d) 0f64
     let sum_c_f64 =
         d3q19_f64.to_array_d (
@@ -71,7 +88,7 @@ let test_d3q19_c : []bool =
                 )
             ) d3q19_f64.c_t_num
         )
-    in [t32.approx_eq sum_c_f32 zero_f32 e32, t64.approx_eq sum_c_f64 zero_f64 e64]
+    in [t16.approx_eq sum_c_f16 zero_f16 e16, t32.approx_eq sum_c_f32 zero_f32 e32, t64.approx_eq sum_c_f64 zero_f64 e64]
     
 
 -- Test the descriptors equilibrium's.