diff --git a/futmpi/.gitignore b/futmpi/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..3c9d1274d19e37270fa4a2fbf476cec51486b874
--- /dev/null
+++ b/futmpi/.gitignore
@@ -0,0 +1,128 @@
+### C template
+# Prerequisites
+*.d
+
+# Object files
+*.o
+*.ko
+*.obj
+*.elf
+
+# Linker output
+*.ilk
+*.map
+*.exp
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Libraries
+*.lib
+*.a
+*.la
+*.lo
+
+# Shared objects (inc. Windows DLLs)
+*.dll
+*.so
+*.so.*
+*.dylib
+
+# Executables
+*.exe
+*.out
+*.app
+*.i*86
+*.x86_64
+*.hex
+
+# Debug files
+*.dSYM/
+*.su
+*.idb
+*.pdb
+
+# Kernel Module Compile Results
+*.mod*
+*.cmd
+.tmp_versions/
+modules.order
+Module.symvers
+Mkfile.old
+dkms.conf
+
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+.idea
diff --git a/futmpi/CMakeLists.txt b/futmpi/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..56ff99060bf09698ddc9654930499a43e8e16e2e
--- /dev/null
+++ b/futmpi/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.19)
+project(futmpi C)
+
+set(CMAKE_C_STANDARD 11)
+
+if (CMAKE_BUILD_TYPE MATCHES Debug)
+    set(GCC_COMPILE_FLAGS "-DDEBUG -Wall -Wextra -Wconversion -pedantic -fsanitize=undefined -fsanitize=address")
+    if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+        set(GCC_COMPILE_FLAGS "${GCC_COMPILE_FLAGS} -fsanitize=leak")
+    endif ()
+elseif (CMAKE_BUILD_TYPE MATCHES Release)
+    set(GCC_COMPILE_FLAGS "-O3")
+endif ()
+
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GCC_COMPILE_FLAGS}")
+
+if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
+    include_directories(/usr/local/include)
+endif ()
+
+# gol.c is built for Futhark's OpenCL backend. Locate OpenCL portably instead of hard-coding the
+# macOS-only "-framework OpenCL" linker flag, which cannot be resolved on Linux.
+find_package(OpenCL REQUIRED)
+
+find_package(MPI REQUIRED)
+include_directories(${MPI_C_INCLUDE_PATH} ${OpenCL_INCLUDE_DIRS})
+
+add_executable(futmpi main.c gol.h gol.c gfx.c gfx.h)
+target_link_libraries(futmpi ${MPI_C_LIBRARIES} SDL2 m ${OpenCL_LIBRARIES})
diff --git a/futmpi/gfx.c b/futmpi/gfx.c
new file mode 100644
index 0000000000000000000000000000000000000000..bc1ae9d8f1da68180d286fb6213761ff32511de4
--- /dev/null
+++ b/futmpi/gfx.c
@@ -0,0 +1,95 @@
+/// @file gfx.c
+/// @author Florent Gluck
+/// @date November 6, 2016
+/// Helper routines to render pixels into a resizable window via a CPU-side pixel buffer.
+/// Uses the SDL2 library.
+
+#include "gfx.h"
+
+/// Create a centered, resizable graphic window backed by a streaming ARGB8888 texture.
+/// @param title Title of the window.
+/// @param width Width of the window in pixels.
+/// @param height Height of the window in pixels.
+/// @return a pointer to the graphic context, or NULL if it failed (nothing stays allocated).
+struct gfx_context_t *gfx_create(char *title, uint width, uint height) {
+    if (SDL_Init(SDL_INIT_VIDEO) != 0) return NULL; // nothing acquired yet, nothing to undo
+    SDL_Window *window = SDL_CreateWindow(title, SDL_WINDOWPOS_CENTERED,
+                                          SDL_WINDOWPOS_CENTERED, width, height, SDL_WINDOW_RESIZABLE);
+    SDL_Renderer *renderer = window ? SDL_CreateRenderer(window, -1, 0) : NULL; // needs the window
+    SDL_Texture *texture = !renderer ? NULL : SDL_CreateTexture(renderer, SDL_PIXELFORMAT_ARGB8888,
+                                             SDL_TEXTUREACCESS_STREAMING, width, height);
+    uint32_t *pixels = malloc((size_t) width * height * sizeof *pixels); // widen before multiplying
+    struct gfx_context_t *ctxt = malloc(sizeof *ctxt);
+    if (!window || !renderer || !texture || !pixels || !ctxt) goto error;
+
+    *ctxt = (struct gfx_context_t) {.window = window, .renderer = renderer, .texture = texture,
+                                    .pixels = pixels, .width = (int) width, .height = (int) height};
+    SDL_ShowCursor(SDL_DISABLE);
+    gfx_clear(ctxt, COLOR_BLACK);
+    return ctxt;
+
+    error: // undo the partial construction; free(NULL) is a no-op, SDL handles are guarded
+    free(ctxt);
+    free(pixels);
+    if (texture) SDL_DestroyTexture(texture);
+    if (renderer) SDL_DestroyRenderer(renderer);
+    if (window) SDL_DestroyWindow(window);
+    SDL_Quit();
+    return NULL;
+}
+
+/// Draw a pixel in the specified graphic context; coordinates outside the buffer are ignored.
+/// @param ctxt Graphic context where the pixel is to be drawn.
+/// @param x X coordinate of the pixel.
+/// @param y Y coordinate of the pixel.
+/// @param color Color of the pixel.
+void gfx_putpixel(struct gfx_context_t *ctxt, int x, int y, uint32_t color) {
+    if (x >= 0 && y >= 0 && x < ctxt->width && y < ctxt->height) // negatives would index before the buffer
+        ctxt->pixels[ctxt->width * y + x] = color;
+}
+
+/// Fill the whole pixel buffer of a graphic context with a single color.
+/// @param ctxt Graphic context to clear.
+/// @param color Color to use.
+void gfx_clear(struct gfx_context_t *ctxt, uint32_t color) {
+    // Walk the buffer from its last pixel down to index 0.
+    for (int remaining = ctxt->width * ctxt->height; remaining != 0;)
+        ctxt->pixels[--remaining] = color;
+}
+
+/// Display the graphic context: upload the pixel buffer to the texture and present it.
+/// @param ctxt Graphic context to display.
+void gfx_present(struct gfx_context_t *ctxt) {
+    SDL_UpdateTexture(ctxt->texture, NULL, ctxt->pixels, ctxt->width * sizeof(uint32_t)); // last argument: bytes per row
+    SDL_RenderCopy(ctxt->renderer, ctxt->texture, NULL, NULL); // NULL rectangles: whole texture onto the whole target
+    SDL_RenderPresent(ctxt->renderer);
+}
+
+/// Destroy a graphic window and release everything owned by its context.
+/// @param ctxt Graphic context of the window to close; NULL is accepted and ignored.
+/// @return always NULL.
+void *gfx_destroy(struct gfx_context_t *ctxt) {
+    if (!ctxt) return NULL; // e.g. the result of a failed gfx_create()
+    SDL_ShowCursor(SDL_ENABLE);
+    // Tear down in reverse order of creation: texture, renderer, then window.
+    SDL_DestroyTexture(ctxt->texture);
+    SDL_DestroyRenderer(ctxt->renderer);
+    SDL_DestroyWindow(ctxt->window);
+    free(ctxt->pixels);
+    SDL_Quit();
+    // The context is released as a whole, so its fields are not reset first.
+    free(ctxt);
+    return NULL;
+}
+
+/// Report whether the escape key is currently held down (non blocking call).
+/// List of key codes: https://wiki.libsdl.org/SDL_Keycode
+/// SDL_PumpEvents() must be called before.
+/// @return SDLK_ESCAPE if escape is pressed, 0 otherwise.
+SDL_Keycode gfx_keypressed() {
+    const Uint8 *keys = SDL_GetKeyboardState(NULL);
+    // A missing state table is treated the same as "escape not pressed".
+    if (keys == NULL || !keys[SDL_SCANCODE_ESCAPE])
+        return 0;
+    return SDLK_ESCAPE;
+}
diff --git a/futmpi/gfx.h b/futmpi/gfx.h
new file mode 100644
index 0000000000000000000000000000000000000000..d6604aeab83d63749ab6ab6424557e282b50e511
--- /dev/null
+++ b/futmpi/gfx.h
@@ -0,0 +1,43 @@
+#ifndef _GFX_H_
+#define _GFX_H_
+
+#include <stdint.h>
+#include <SDL2/SDL.h>
+
+#define MAKE_COLOR(r, g, b) ((uint32_t)(b)|((uint32_t)(g)<<8)|((uint32_t)(r)<<16)) // 0x00RRGGBB; arguments parenthesized so expressions expand safely
+
+#define COLOR_BLACK  0x00000000
+#define COLOR_RED    0x00FF0000
+#define COLOR_GREEN  0x0000FF00
+#define COLOR_BLUE   0x000000FF
+#define COLOR_WHITE  0x00FFFFFF
+#define COLOR_YELLOW 0x00FFFF00
+
+typedef unsigned int uint;
+typedef unsigned long ulong;
+typedef unsigned char uchar;
+
+struct gfx_context_t { // Everything gfx.c needs to draw into one SDL window.
+    SDL_Window *window;
+    SDL_Renderer *renderer;
+    SDL_Texture *texture; // refreshed from `pixels` by gfx_present()
+    uint32_t *pixels; // CPU-side buffer of width * height pixels, addressed as pixels[width * y + x]
+    int width; // in pixels
+    int height; // in pixels
+};
+
+extern void gfx_putpixel(struct gfx_context_t *ctxt, int x, int y, uint32_t color);
+
+extern void gfx_clear(struct gfx_context_t *ctxt, uint32_t color);
+
+extern struct gfx_context_t *gfx_create(char *text, uint width, uint height); // `text` is the window title; NULL on failure
+
+extern void *gfx_destroy(struct gfx_context_t *ctxt); // always returns NULL
+
+extern void gfx_present(struct gfx_context_t *ctxt);
+
+extern SDL_Keycode gfx_keypressed(); // SDLK_ESCAPE while escape is held, otherwise 0
+
+extern SDL_EventType poll_event(); // NOTE(review): not defined in gfx.c — confirm a definition exists elsewhere
+
+#endif
diff --git a/futmpi/gol.c b/futmpi/gol.c
new file mode 100644
index 0000000000000000000000000000000000000000..cc1a72e8f68e773aeaebb18d63c30f26f3f69767
--- /dev/null
+++ b/futmpi/gol.c
@@ -0,0 +1,4707 @@
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#ifdef __GNUC__
+#pragma GCC diagnostic ignored "-Wunused-function"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wparentheses"
+#pragma GCC diagnostic ignored "-Wunused-label"
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#endif
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wparentheses"
+#pragma clang diagnostic ignored "-Wunused-label"
+#endif
+// Headers
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <float.h>
+#define CL_TARGET_OPENCL_VERSION 120
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#ifdef __APPLE__
+#define CL_SILENCE_DEPRECATION
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Initialisation
+
+struct futhark_context_config ;
+struct futhark_context_config *futhark_context_config_new(void);
+void futhark_context_config_free(struct futhark_context_config *cfg);
+void futhark_context_config_add_build_option(struct futhark_context_config *cfg,
+                                             const char *opt);
+void futhark_context_config_set_debugging(struct futhark_context_config *cfg,
+                                          int flag);
+void futhark_context_config_set_profiling(struct futhark_context_config *cfg,
+                                          int flag);
+void futhark_context_config_set_logging(struct futhark_context_config *cfg,
+                                        int flag);
+void futhark_context_config_set_device(struct futhark_context_config *cfg, const
+                                       char *s);
+void futhark_context_config_set_platform(struct futhark_context_config *cfg,
+                                         const char *s);
+void
+futhark_context_config_select_device_interactively(struct futhark_context_config *cfg);
+void futhark_context_config_list_devices(struct futhark_context_config *cfg);
+void futhark_context_config_dump_program_to(struct futhark_context_config *cfg,
+                                            const char *path);
+void
+futhark_context_config_load_program_from(struct futhark_context_config *cfg,
+                                         const char *path);
+void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg,
+                                           const char *path);
+void futhark_context_config_load_binary_from(struct futhark_context_config *cfg,
+                                             const char *path);
+void
+futhark_context_config_set_default_group_size(struct futhark_context_config *cfg,
+                                              int size);
+void
+futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg,
+                                              int num);
+void
+futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg,
+                                             int num);
+void
+futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg,
+                                                 int num);
+void
+futhark_context_config_set_default_threshold(struct futhark_context_config *cfg,
+                                             int num);
+int futhark_context_config_set_size(struct futhark_context_config *cfg, const
+                                    char *size_name, size_t size_value);
+struct futhark_context ;
+struct futhark_context *futhark_context_new(struct futhark_context_config *cfg);
+struct futhark_context
+*futhark_context_new_with_command_queue(struct futhark_context_config *cfg,
+                                        cl_command_queue queue);
+void futhark_context_free(struct futhark_context *ctx);
+cl_command_queue futhark_context_get_command_queue(struct futhark_context *ctx);
+int futhark_get_num_sizes(void);
+const char *futhark_get_size_name(int);
+const char *futhark_get_size_class(int);
+
+// Arrays
+
+struct futhark_i8_2d ;
+struct futhark_i8_2d *futhark_new_i8_2d(struct futhark_context *ctx, const
+                                        int8_t *data, int64_t dim0,
+                                        int64_t dim1);
+struct futhark_i8_2d *futhark_new_raw_i8_2d(struct futhark_context *ctx, const
+                                            cl_mem data, int offset,
+                                            int64_t dim0, int64_t dim1);
+int futhark_free_i8_2d(struct futhark_context *ctx, struct futhark_i8_2d *arr);
+int futhark_values_i8_2d(struct futhark_context *ctx, struct futhark_i8_2d *arr,
+                         int8_t *data);
+cl_mem futhark_values_raw_i8_2d(struct futhark_context *ctx,
+                                struct futhark_i8_2d *arr);
+const int64_t *futhark_shape_i8_2d(struct futhark_context *ctx,
+                                   struct futhark_i8_2d *arr);
+
+// Opaque values
+
+
+// Entry points
+
+int futhark_entry_get_envelope(struct futhark_context *ctx,
+                               struct futhark_i8_2d **out0, const
+                               struct futhark_i8_2d *in0);
+int futhark_entry_next_chunk_board(struct futhark_context *ctx,
+                                   struct futhark_i8_2d **out0, const
+                                   struct futhark_i8_2d *in0, const
+                                   struct futhark_i8_2d *in1);
+
+// Miscellaneous
+
+int futhark_context_sync(struct futhark_context *ctx);
+char *futhark_context_report(struct futhark_context *ctx);
+char *futhark_context_get_error(struct futhark_context *ctx);
+void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f);
+void futhark_context_pause_profiling(struct futhark_context *ctx);
+void futhark_context_unpause_profiling(struct futhark_context *ctx);
+int futhark_context_clear_caches(struct futhark_context *ctx);
+#define FUTHARK_BACKEND_opencl
+#ifdef __cplusplus
+}
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <math.h>
+#include <stdint.h>
+#undef NDEBUG
+#include <assert.h>
+#include <stdarg.h>
+// Start of util.h.
+//
+// Various helper functions that are useful in all generated C code.
+
+#include <errno.h>
+#include <string.h>
+
+static const char *fut_progname = "(embedded Futhark)";
+
+static void futhark_panic(int eval, const char *fmt, ...) { // Print "<fut_progname>: <message>" to stderr, then exit.
+  va_list ap;
+  va_start(ap, fmt);
+  fprintf(stderr, "%s: ", fut_progname);
+  vfprintf(stderr, fmt, ap); // fmt is a printf-style format consuming the variadic arguments
+  va_end(ap);
+  exit(eval); // never returns; eval becomes the process exit status
+}
+
+// Format into a freshly malloc'ed string the caller must free; NULL on failure.
+static char* msgprintf(const char *s, ...) {
+  va_list vl;
+  va_start(vl, s);
+  int len = vsnprintf(NULL, 0, s, vl); // Measuring pass: length without the NUL.
+  va_end(vl); // Every va_start needs its va_end before the list is restarted.
+  char *buffer = len < 0 ? NULL : (char*) malloc((size_t)len + 1);
+  va_start(vl, s); // Must re-init.
+  if (buffer != NULL) vsnprintf(buffer, (size_t)len + 1, s, vl);
+  va_end(vl);
+  return buffer;
+}
+
+static inline void check_err(int errval, int sets_errno, const char *fun, int line,
+                            const char *msg, ...) { // Report and exit(errval) when errval is non-zero; otherwise do nothing.
+  if (errval) {
+    char errnum[12]; // Fits any 32-bit int in decimal, sign and NUL included.
+    snprintf(errnum, sizeof errnum, "%d", errval); // Text form of errval for the non-errno case.
+    va_list vl;
+    va_start(vl, msg);
+    fprintf(stderr, "ERROR: ");
+    vfprintf(stderr, msg, vl);
+    va_end(vl);
+    fprintf(stderr, " in %s() at line %d with error code %s\n",
+            fun, line,
+            sets_errno ? strerror(errno) : errnum);
+    exit(errval);
+  }
+}
+
+#define CHECK_ERR(err, msg...) check_err(err, 0, __func__, __LINE__, msg) // sets_errno = 0; `msg...` is the GNU named-variadic macro form
+#define CHECK_ERRNO(err, msg...) check_err(err, 1, __func__, __LINE__, msg) // sets_errno = 1: the report uses strerror(errno)
+
+// Read the rest of an open file into a NUL-terminated string; returns
+// NULL on error (unseekable stream, allocation failure or short read).
+static void* fslurp_file(FILE *f, size_t *size) {
+  long start = ftell(f); // ftell reports failure as -1L, so stay signed until checked.
+  if (start < 0 || fseek(f, 0, SEEK_END) != 0) return NULL;
+  long end = ftell(f);
+  if (end < start || fseek(f, start, SEEK_SET) != 0) return NULL;
+  size_t src_size = (size_t)(end - start);
+  unsigned char *s = (unsigned char*) malloc(src_size + 1);
+  if (s != NULL && fread(s, 1, src_size, f) != src_size) {
+    free(s);
+    s = NULL;
+  } else if (s != NULL) {
+    s[src_size] = '\0';
+  }
+  if (size) { // Only written once the length has been measured successfully.
+    *size = src_size;
+  }
+
+  return s;
+}
+
+// Read a file into a NUL-terminated string; returns NULL on error.
+static void* slurp_file(const char *filename, size_t *size) { // size is passed straight through to fslurp_file
+  FILE *f = fopen(filename, "rb"); // To avoid Windows messing with linebreaks.
+  if (f == NULL) return NULL;
+  unsigned char *s = fslurp_file(f, size);
+  fclose(f); // The stream is closed whether or not the read succeeded.
+  return s; // Heap buffer from fslurp_file; presumably the caller frees it — confirm at call sites.
+}
+
+// Dump 'n' bytes from 'buf' into the file at the designated location.
+// Returns 0 on success and 1 if the file cannot be opened, written in
+// full, or closed.
+static int dump_file(const char *file, const void *buf, size_t n) {
+  // Binary mode, mirroring the "rb" used by slurp_file, so the bytes are
+  // written untranslated on platforms that distinguish text streams.
+  FILE *f = fopen(file, "wb");
+
+  if (f == NULL) {
+    return 1;
+  }
+
+  // Close the stream on every path so a short write does not leak it;
+  // fclose also reports errors from flushing buffered data.
+  int write_failed = fwrite(buf, sizeof(char), n, f) != n;
+  int close_failed = fclose(f) != 0;
+
+  return (write_failed || close_failed) ? 1 : 0;
+}
+
+struct str_builder { // Growable text buffer filled by str_builder().
+  char *str; // Heap buffer holding the text built so far.
+  size_t capacity; // Size of buffer.
+  size_t used; // Bytes used, *not* including final zero.
+};
+
+static void str_builder_init(struct str_builder *b) { // Start with an empty string in a 10-byte buffer.
+  b->capacity = 10;
+  b->used = 0;
+  b->str = malloc(b->capacity); // NOTE(review): the result is not checked before the write below.
+  b->str[0] = 0;
+}
+
+static void str_builder(struct str_builder *b, const char *s, ...) { // Append printf-style formatted text to b.
+  va_list vl;
+  va_start(vl, s);
+  size_t needed = (size_t)vsnprintf(NULL, 0, s, vl);
+  va_end(vl); // End the measuring pass before the list is started again.
+  while (b->capacity < b->used + needed + 1) { // Double until text plus NUL fits.
+    b->capacity *= 2;
+    b->str = realloc(b->str, b->capacity); // NOTE(review): a failed realloc is not handled here.
+  }
+  va_start(vl, s); // Must re-init.
+  vsnprintf(b->str+b->used, b->capacity-b->used, s, vl);
+  va_end(vl);
+  b->used += needed;
+}
+
+// End of util.h.
+
+// Start of timing.h.
+
+// The function get_wall_time() returns the wall time in microseconds
+// (with an unspecified offset).
+
+#ifdef _WIN32
+
+#include <windows.h>
+
+static int64_t get_wall_time(void) { // Win32 variant, built on the performance counter.
+  LARGE_INTEGER time,freq;
+  assert(QueryPerformanceFrequency(&freq)); // The call lives inside assert(); this file #undefs NDEBUG before including <assert.h>.
+  assert(QueryPerformanceCounter(&time));
+  return ((double)time.QuadPart / freq.QuadPart) * 1000000; // counter / frequency, scaled by 1e6
+}
+
+#else
+// Assuming POSIX
+
+#include <time.h>
+#include <sys/time.h>
+
+static int64_t get_wall_time(void) { // POSIX variant: wall-clock time in microseconds.
+  struct timeval time;
+  assert(gettimeofday(&time,NULL) == 0);
+  return (int64_t)time.tv_sec * 1000000 + time.tv_usec; // widen first: time_t may be only 32 bits wide
+}
+
+static int64_t get_wall_time_ns(void) { // Wall-clock time in nanoseconds, read from CLOCK_REALTIME.
+  struct timespec time;
+  assert(clock_gettime(CLOCK_REALTIME, &time) == 0);
+  return (int64_t)time.tv_sec * 1000000000 + time.tv_nsec; // widen first: the product does not fit in 32 bits
+}
+
+#endif
+
+// End of timing.h.
+
+#ifdef _MSC_VER
+#define inline __inline
+#endif
+#include <string.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <ctype.h>
+#define CL_TARGET_OPENCL_VERSION 120
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#ifdef __APPLE__
+#define CL_SILENCE_DEPRECATION
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+// Start of lock.h.
+
+// A very simple cross-platform implementation of locks.  Uses
+// pthreads on Unix and a Win32 mutex handle on Windows.  Futhark's
+// host-level code is not multithreaded, but user code may be, so we
+// need some mechanism for ensuring atomic access to API functions.
+// This is that mechanism.  It is not exposed to user code at all, so
+// we do not have to worry about name collisions.
+
+#ifdef _WIN32
+
+typedef HANDLE lock_t;
+
+static void create_lock(lock_t *lock) { // Win32 variant: lock_t is a HANDLE to a mutex object.
+  *lock = CreateMutex(NULL,  // Default security attributes.
+                      FALSE, // Initially unlocked.
+                      NULL); // Unnamed.
+}
+
+static void lock_lock(lock_t *lock) { // Wait without timeout until the mutex is acquired.
+  assert(WaitForSingleObject(*lock, INFINITE) == WAIT_OBJECT_0); // The wait itself sits inside assert(); NDEBUG is #undef'd above.
+}
+
+static void lock_unlock(lock_t *lock) { // Release the mutex; a failed release aborts through assert().
+  assert(ReleaseMutex(*lock));
+}
+
+static void free_lock(lock_t *lock) { // Close the mutex handle; the result of CloseHandle is not checked.
+  CloseHandle(*lock);
+}
+
+#else
+// Assuming POSIX
+
+#include <pthread.h>
+
+typedef pthread_mutex_t lock_t;
+
+static void create_lock(lock_t *lock) { // POSIX variant: lock_t is a pthread_mutex_t.
+  int r = pthread_mutex_init(lock, NULL); // NULL attributes: default mutex
+  assert(r == 0);
+}
+
+static void lock_lock(lock_t *lock) { // Block until the mutex is held; any error aborts through assert().
+  int r = pthread_mutex_lock(lock);
+  assert(r == 0);
+}
+
+static void lock_unlock(lock_t *lock) { // Release the mutex; any error aborts through assert().
+  int r = pthread_mutex_unlock(lock);
+  assert(r == 0);
+}
+
+static void free_lock(lock_t *lock) { // NOTE(review): pthread_mutex_destroy() is never called — confirm this is intentional.
+  // Nothing to do for pthreads.
+  (void)lock; // Marks the parameter as deliberately unused.
+}
+
+#endif
+
+// End of lock.h.
+
+static inline uint8_t add8(uint8_t x, uint8_t y) // x + y, reduced modulo 2^8 by the conversion to the return type
+{
+    return x + y;
+}
+static inline uint16_t add16(uint16_t x, uint16_t y) // x + y modulo 2^16
+{
+    return x + y;
+}
+static inline uint32_t add32(uint32_t x, uint32_t y) // x + y modulo 2^32
+{
+    return x + y;
+}
+static inline uint64_t add64(uint64_t x, uint64_t y) // x + y modulo 2^64
+{
+    return x + y;
+}
+static inline uint8_t sub8(uint8_t x, uint8_t y) // x - y, reduced modulo 2^8 by the conversion to the return type
+{
+    return x - y;
+}
+static inline uint16_t sub16(uint16_t x, uint16_t y) // x - y modulo 2^16
+{
+    return x - y;
+}
+static inline uint32_t sub32(uint32_t x, uint32_t y) // x - y modulo 2^32
+{
+    return x - y;
+}
+static inline uint64_t sub64(uint64_t x, uint64_t y) // x - y modulo 2^64
+{
+    return x - y;
+}
+static inline uint8_t mul8(uint8_t x, uint8_t y) // x * y modulo 2^8
+{
+    return x * y;
+}
+static inline uint16_t mul16(uint16_t x, uint16_t y) // x * y modulo 2^16
+{
+    return (uint16_t) ((unsigned int) x * y); // multiply as unsigned: promoted to int, 0xFFFF * 0xFFFF would overflow
+}
+static inline uint32_t mul32(uint32_t x, uint32_t y) // x * y modulo 2^32
+{
+    return x * y;
+}
+static inline uint64_t mul64(uint64_t x, uint64_t y) // x * y modulo 2^64
+{
+    return x * y;
+}
+static inline uint8_t udiv8(uint8_t x, uint8_t y) // unsigned quotient; y == 0 is not checked here (see udiv_safe8)
+{
+    return x / y;
+}
+static inline uint16_t udiv16(uint16_t x, uint16_t y) // unsigned quotient; y == 0 is not checked here
+{
+    return x / y;
+}
+static inline uint32_t udiv32(uint32_t x, uint32_t y) // unsigned quotient; y == 0 is not checked here
+{
+    return x / y;
+}
+static inline uint64_t udiv64(uint64_t x, uint64_t y) // unsigned quotient; y == 0 is not checked here
+{
+    return x / y;
+}
+static inline uint8_t udiv_up8(uint8_t x, uint8_t y) // x / y rounded up; y == 0 is not checked here
+{
+    return (x + y - 1) / y;
+}
+static inline uint16_t udiv_up16(uint16_t x, uint16_t y) // x / y rounded up; y == 0 is not checked here
+{
+    return (x + y - 1) / y;
+}
+static inline uint32_t udiv_up32(uint32_t x, uint32_t y) // x / y rounded up; x + y - 1 is formed in uint32_t and can wrap
+{
+    return (x + y - 1) / y;
+}
+static inline uint64_t udiv_up64(uint64_t x, uint64_t y) // x / y rounded up; x + y - 1 is formed in uint64_t and can wrap
+{
+    return (x + y - 1) / y;
+}
+static inline uint8_t umod8(uint8_t x, uint8_t y) // unsigned remainder; y == 0 is not checked here (see umod_safe8)
+{
+    return x % y;
+}
+static inline uint16_t umod16(uint16_t x, uint16_t y) // unsigned remainder; y == 0 is not checked here
+{
+    return x % y;
+}
+static inline uint32_t umod32(uint32_t x, uint32_t y) // unsigned remainder; y == 0 is not checked here
+{
+    return x % y;
+}
+static inline uint64_t umod64(uint64_t x, uint64_t y) // unsigned remainder; y == 0 is not checked here
+{
+    return x % y;
+}
+static inline uint8_t udiv_safe8(uint8_t x, uint8_t y) // unsigned quotient, except that a zero divisor yields 0
+{
+    return y == 0 ? 0 : x / y;
+}
+static inline uint16_t udiv_safe16(uint16_t x, uint16_t y) // unsigned quotient; zero divisor yields 0
+{
+    return y == 0 ? 0 : x / y;
+}
+static inline uint32_t udiv_safe32(uint32_t x, uint32_t y) // unsigned quotient; zero divisor yields 0
+{
+    return y == 0 ? 0 : x / y;
+}
+static inline uint64_t udiv_safe64(uint64_t x, uint64_t y) // unsigned quotient; zero divisor yields 0
+{
+    return y == 0 ? 0 : x / y;
+}
+static inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) // x / y rounded up, except that a zero divisor yields 0
+{
+    return y == 0 ? 0 : (x + y - 1) / y;
+}
+static inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) // x / y rounded up; zero divisor yields 0
+{
+    return y == 0 ? 0 : (x + y - 1) / y;
+}
+static inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) // as above; x + y - 1 is formed in uint32_t and can wrap
+{
+    return y == 0 ? 0 : (x + y - 1) / y;
+}
+static inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) // as above; x + y - 1 is formed in uint64_t and can wrap
+{
+    return y == 0 ? 0 : (x + y - 1) / y;
+}
+static inline uint8_t umod_safe8(uint8_t x, uint8_t y) // unsigned remainder, except that a zero divisor yields 0
+{
+    return y == 0 ? 0 : x % y;
+}
+static inline uint16_t umod_safe16(uint16_t x, uint16_t y) // unsigned remainder; zero divisor yields 0
+{
+    return y == 0 ? 0 : x % y;
+}
+static inline uint32_t umod_safe32(uint32_t x, uint32_t y) // unsigned remainder; zero divisor yields 0
+{
+    return y == 0 ? 0 : x % y;
+}
+static inline uint64_t umod_safe64(uint64_t x, uint64_t y) // unsigned remainder; zero divisor yields 0
+{
+    return y == 0 ? 0 : x % y;
+}
+static inline int8_t sdiv8(int8_t x, int8_t y) // signed quotient rounded toward negative infinity; y == 0 is not checked
+{
+    int8_t q = x / y; // C's '/' truncates toward zero
+    int8_t r = x % y;
+    
+    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); // step down by one when the division is inexact and r, y differ in sign
+}
+static inline int16_t sdiv16(int16_t x, int16_t y) // signed quotient rounded toward negative infinity
+{
+    int16_t q = x / y;
+    int16_t r = x % y;
+    
+    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); // parsed as (r < 0) != (y < 0)
+}
+static inline int32_t sdiv32(int32_t x, int32_t y) // signed quotient rounded toward negative infinity
+{
+    int32_t q = x / y;
+    int32_t r = x % y;
+    
+    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
+}
+static inline int64_t sdiv64(int64_t x, int64_t y) // signed quotient rounded toward negative infinity
+{
+    int64_t q = x / y;
+    int64_t r = x % y;
+    
+    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
+}
+static inline int8_t sdiv_up8(int8_t x, int8_t y) // sdiv8 applied to x + y - 1, i.e. x / y rounded up when y > 0
+{
+    return sdiv8(x + y - 1, y);
+}
+static inline int16_t sdiv_up16(int16_t x, int16_t y) // sdiv16 applied to x + y - 1
+{
+    return sdiv16(x + y - 1, y);
+}
+static inline int32_t sdiv_up32(int32_t x, int32_t y) // sdiv32 applied to x + y - 1; the sum is formed in int32_t
+{
+    return sdiv32(x + y - 1, y);
+}
+static inline int64_t sdiv_up64(int64_t x, int64_t y) // sdiv64 applied to x + y - 1; the sum is formed in int64_t
+{
+    return sdiv64(x + y - 1, y);
+}
+static inline int8_t smod8(int8_t x, int8_t y) // remainder whose non-zero results take the sign of y; y == 0 is not checked
+{
+    int8_t r = x % y; // C's '%' result takes the sign of x
+    
+    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); // add y only when inexact and x, y differ in sign
+}
+static inline int16_t smod16(int16_t x, int16_t y) // remainder whose non-zero results take the sign of y
+{
+    int16_t r = x % y;
+    
+    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
+}
+static inline int32_t smod32(int32_t x, int32_t y) // remainder whose non-zero results take the sign of y
+{
+    int32_t r = x % y;
+    
+    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
+}
+static inline int64_t smod64(int64_t x, int64_t y) // remainder whose non-zero results take the sign of y
+{
+    int64_t r = x % y;
+    
+    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
+}
+static inline int8_t sdiv_safe8(int8_t x, int8_t y) // sdiv8, except that a zero divisor yields 0
+{
+    return y == 0 ? 0 : sdiv8(x, y);
+}
+static inline int16_t sdiv_safe16(int16_t x, int16_t y) // sdiv16; zero divisor yields 0
+{
+    return y == 0 ? 0 : sdiv16(x, y);
+}
+static inline int32_t sdiv_safe32(int32_t x, int32_t y) // sdiv32; zero divisor yields 0
+{
+    return y == 0 ? 0 : sdiv32(x, y);
+}
+static inline int64_t sdiv_safe64(int64_t x, int64_t y) // sdiv64; zero divisor yields 0
+{
+    return y == 0 ? 0 : sdiv64(x, y);
+}
+static inline int8_t sdiv_up_safe8(int8_t x, int8_t y) // sdiv_safe8 applied to x + y - 1; a zero divisor yields 0
+{
+    return sdiv_safe8(x + y - 1, y);
+}
+static inline int16_t sdiv_up_safe16(int16_t x, int16_t y) // sdiv_safe16 applied to x + y - 1
+{
+    return sdiv_safe16(x + y - 1, y);
+}
+static inline int32_t sdiv_up_safe32(int32_t x, int32_t y) // sdiv_safe32 applied to x + y - 1; the sum is formed in int32_t
+{
+    return sdiv_safe32(x + y - 1, y);
+}
+static inline int64_t sdiv_up_safe64(int64_t x, int64_t y) // sdiv_safe64 applied to x + y - 1; the sum is formed in int64_t
+{
+    return sdiv_safe64(x + y - 1, y);
+}
+static inline int8_t smod_safe8(int8_t x, int8_t y) // smod8, except that a zero divisor yields 0
+{
+    return y == 0 ? 0 : smod8(x, y);
+}
+static inline int16_t smod_safe16(int16_t x, int16_t y) // smod16; zero divisor yields 0
+{
+    return y == 0 ? 0 : smod16(x, y);
+}
+static inline int32_t smod_safe32(int32_t x, int32_t y) // smod32; zero divisor yields 0
+{
+    return y == 0 ? 0 : smod32(x, y);
+}
+static inline int64_t smod_safe64(int64_t x, int64_t y) // smod64; zero divisor yields 0
+{
+    return y == 0 ? 0 : smod64(x, y);
+}
+static inline int8_t squot8(int8_t x, int8_t y) // C's native '/': quotient truncated toward zero; y == 0 is not checked
+{
+    return x / y;
+}
+static inline int16_t squot16(int16_t x, int16_t y) // quotient truncated toward zero
+{
+    return x / y;
+}
+static inline int32_t squot32(int32_t x, int32_t y) // quotient truncated toward zero
+{
+    return x / y;
+}
+static inline int64_t squot64(int64_t x, int64_t y) // quotient truncated toward zero
+{
+    return x / y;
+}
+static inline int8_t srem8(int8_t x, int8_t y) // C's native '%': the remainder takes the sign of x; y == 0 is not checked
+{
+    return x % y;
+}
+static inline int16_t srem16(int16_t x, int16_t y) // remainder with the sign of x
+{
+    return x % y;
+}
+static inline int32_t srem32(int32_t x, int32_t y) // remainder with the sign of x
+{
+    return x % y;
+}
+static inline int64_t srem64(int64_t x, int64_t y) // remainder with the sign of x
+{
+    return x % y;
+}
+static inline int8_t squot_safe8(int8_t x, int8_t y) // truncating quotient, except that a zero divisor yields 0
+{
+    return y == 0 ? 0 : x / y;
+}
+static inline int16_t squot_safe16(int16_t x, int16_t y) // truncating quotient; zero divisor yields 0
+{
+    return y == 0 ? 0 : x / y;
+}
+static inline int32_t squot_safe32(int32_t x, int32_t y) // truncating quotient; zero divisor yields 0
+{
+    return y == 0 ? 0 : x / y;
+}
+static inline int64_t squot_safe64(int64_t x, int64_t y) // truncating quotient; zero divisor yields 0
+{
+    return y == 0 ? 0 : x / y;
+}
+static inline int8_t srem_safe8(int8_t x, int8_t y) // C remainder, except that a zero divisor yields 0
+{
+    return y == 0 ? 0 : x % y;
+}
+static inline int16_t srem_safe16(int16_t x, int16_t y) // C remainder; zero divisor yields 0
+{
+    return y == 0 ? 0 : x % y;
+}
+static inline int32_t srem_safe32(int32_t x, int32_t y) // C remainder; zero divisor yields 0
+{
+    return y == 0 ? 0 : x % y;
+}
+static inline int64_t srem_safe64(int64_t x, int64_t y) // C remainder; zero divisor yields 0
+{
+    return y == 0 ? 0 : x % y;
+}
+static inline int8_t smin8(int8_t x, int8_t y) // smaller of two signed values (y when they are equal)
+{
+    return x < y ? x : y;
+}
+static inline int16_t smin16(int16_t x, int16_t y) // smaller of two signed values
+{
+    return x < y ? x : y;
+}
+static inline int32_t smin32(int32_t x, int32_t y) // smaller of two signed values
+{
+    return x < y ? x : y;
+}
+static inline int64_t smin64(int64_t x, int64_t y) // smaller of two signed values
+{
+    return x < y ? x : y;
+}
+static inline uint8_t umin8(uint8_t x, uint8_t y) // smaller of two unsigned values
+{
+    return x < y ? x : y;
+}
+static inline uint16_t umin16(uint16_t x, uint16_t y) // smaller of two unsigned values
+{
+    return x < y ? x : y;
+}
+static inline uint32_t umin32(uint32_t x, uint32_t y) // smaller of two unsigned values
+{
+    return x < y ? x : y;
+}
+static inline uint64_t umin64(uint64_t x, uint64_t y) // smaller of two unsigned values
+{
+    return x < y ? x : y;
+}
+static inline int8_t smax8(int8_t x, int8_t y) // larger of two signed values (x when they are equal)
+{
+    return x < y ? y : x;
+}
+static inline int16_t smax16(int16_t x, int16_t y) // larger of two signed values
+{
+    return x < y ? y : x;
+}
+static inline int32_t smax32(int32_t x, int32_t y) // larger of two signed values
+{
+    return x < y ? y : x;
+}
+static inline int64_t smax64(int64_t x, int64_t y) // larger of two signed values
+{
+    return x < y ? y : x;
+}
+static inline uint8_t umax8(uint8_t x, uint8_t y) // larger of two unsigned values
+{
+    return x < y ? y : x;
+}
+static inline uint16_t umax16(uint16_t x, uint16_t y) // larger of two unsigned values
+{
+    return x < y ? y : x;
+}
+static inline uint32_t umax32(uint32_t x, uint32_t y) // larger of two unsigned values
+{
+    return x < y ? y : x;
+}
+static inline uint64_t umax64(uint64_t x, uint64_t y) // larger of two unsigned values
+{
+    return x < y ? y : x;
+}
+static inline uint8_t shl8(uint8_t x, uint8_t y) // left shift, truncated to 8 bits by the return type; the count y is not range-checked
+{
+    return x << y;
+}
+static inline uint16_t shl16(uint16_t x, uint16_t y) // left shift, truncated to 16 bits; the count y is not range-checked
+{
+    return x << y;
+}
+static inline uint32_t shl32(uint32_t x, uint32_t y) // left shift; NOTE(review): presumably callers keep y below 32 — confirm
+{
+    return x << y;
+}
+static inline uint64_t shl64(uint64_t x, uint64_t y) // left shift; NOTE(review): presumably callers keep y below 64 — confirm
+{
+    return x << y;
+}
+static inline uint8_t lshr8(uint8_t x, uint8_t y) // right shift of an unsigned value: vacated high bits are zero
+{
+    return x >> y;
+}
+static inline uint16_t lshr16(uint16_t x, uint16_t y) // right shift of an unsigned value; the count y is not range-checked
+{
+    return x >> y;
+}
+static inline uint32_t lshr32(uint32_t x, uint32_t y) // right shift of an unsigned value; the count y is not range-checked
+{
+    return x >> y;
+}
+static inline uint64_t lshr64(uint64_t x, uint64_t y) // right shift of an unsigned value; the count y is not range-checked
+{
+    return x >> y;
+}
+static inline int8_t ashr8(int8_t x, int8_t y) // right shift of a signed value; for negative x the result is implementation-defined in C
+{
+    return x >> y;
+}
+static inline int16_t ashr16(int16_t x, int16_t y) // right shift of a signed value; the count y is not range-checked
+{
+    return x >> y;
+}
+static inline int32_t ashr32(int32_t x, int32_t y) // right shift of a signed value; the count y is not range-checked
+{
+    return x >> y;
+}
+static inline int64_t ashr64(int64_t x, int64_t y) // right shift of a signed value; the count y is not range-checked
+{
+    return x >> y;
+}
+static inline uint8_t and8(uint8_t x, uint8_t y) // bitwise AND
+{
+    return x & y;
+}
+static inline uint16_t and16(uint16_t x, uint16_t y) // bitwise AND
+{
+    return x & y;
+}
+static inline uint32_t and32(uint32_t x, uint32_t y) // bitwise AND
+{
+    return x & y;
+}
+static inline uint64_t and64(uint64_t x, uint64_t y) // bitwise AND
+{
+    return x & y;
+}
+static inline uint8_t or8(uint8_t x, uint8_t y) // bitwise OR
+{
+    return x | y;
+}
+static inline uint16_t or16(uint16_t x, uint16_t y) // bitwise OR
+{
+    return x | y;
+}
+static inline uint32_t or32(uint32_t x, uint32_t y) // bitwise OR
+{
+    return x | y;
+}
+static inline uint64_t or64(uint64_t x, uint64_t y) // bitwise OR
+{
+    return x | y;
+}
+static inline uint8_t xor8(uint8_t x, uint8_t y) // bitwise exclusive OR
+{
+    return x ^ y;
+}
+static inline uint16_t xor16(uint16_t x, uint16_t y) // bitwise exclusive OR
+{
+    return x ^ y;
+}
+static inline uint32_t xor32(uint32_t x, uint32_t y) // bitwise exclusive OR
+{
+    return x ^ y;
+}
+static inline uint64_t xor64(uint64_t x, uint64_t y) // bitwise exclusive OR
+{
+    return x ^ y;
+}
+static inline bool ult8(uint8_t x, uint8_t y) // unsigned "less than"
+{
+    return x < y;
+}
+static inline bool ult16(uint16_t x, uint16_t y) // unsigned "less than"
+{
+    return x < y;
+}
+static inline bool ult32(uint32_t x, uint32_t y) // unsigned "less than"
+{
+    return x < y;
+}
+static inline bool ult64(uint64_t x, uint64_t y) // unsigned "less than"
+{
+    return x < y;
+}
+static inline bool ule8(uint8_t x, uint8_t y) // unsigned "less than or equal"
+{
+    return x <= y;
+}
+static inline bool ule16(uint16_t x, uint16_t y) // unsigned "less than or equal"
+{
+    return x <= y;
+}
+static inline bool ule32(uint32_t x, uint32_t y) // unsigned "less than or equal"
+{
+    return x <= y;
+}
+static inline bool ule64(uint64_t x, uint64_t y) // unsigned "less than or equal"
+{
+    return x <= y;
+}
+// Signed "strictly less than" for each integer width.
+static inline bool slt8(int8_t x, int8_t y) { return y > x; }
+static inline bool slt16(int16_t x, int16_t y) { return y > x; }
+static inline bool slt32(int32_t x, int32_t y) { return y > x; }
+static inline bool slt64(int64_t x, int64_t y) { return y > x; }
+// Signed "less than or equal" for each integer width.
+static inline bool sle8(int8_t x, int8_t y) { return y >= x; }
+static inline bool sle16(int16_t x, int16_t y) { return y >= x; }
+static inline bool sle32(int32_t x, int32_t y) { return y >= x; }
+static inline bool sle64(int64_t x, int64_t y) { return y >= x; }
+// Integer exponentiation by repeated squaring: x raised to the power y,
+// truncated to the width of the type.
+//
+// The arithmetic is done on the unsigned bit patterns.  That way an
+// overflowing product wraps around instead of being undefined, and the
+// exponent shrinks towards zero with a logical shift, so the loop runs at
+// most once per bit.  (With a signed exponent a negative y never reaches
+// zero under an arithmetic right shift and the loop would not terminate.)
+// A negative y is consequently treated as the exponent y modulo 2^N, where
+// N is the width of the type.
+static inline int8_t pow8(int8_t x, int8_t y)
+{
+    uint8_t base = (uint8_t) x, acc = 1, bits = (uint8_t) y;
+
+    for (; bits != 0; bits >>= 1) {
+        if (bits & 1)
+            acc = (uint8_t) (acc * base);
+        base = (uint8_t) (base * base);
+    }
+    return (int8_t) acc;
+}
+static inline int16_t pow16(int16_t x, int16_t y)
+{
+    uint16_t base = (uint16_t) x, acc = 1, bits = (uint16_t) y;
+
+    for (; bits != 0; bits >>= 1) {
+        // Widen to uint32_t first: two uint16_t operands would be promoted
+        // to (signed) int, where 0xFFFF * 0xFFFF overflows.
+        if (bits & 1)
+            acc = (uint16_t) ((uint32_t) acc * base);
+        base = (uint16_t) ((uint32_t) base * base);
+    }
+    return (int16_t) acc;
+}
+static inline int32_t pow32(int32_t x, int32_t y)
+{
+    uint32_t base = (uint32_t) x, acc = 1, bits = (uint32_t) y;
+
+    for (; bits != 0; bits >>= 1) {
+        if (bits & 1)
+            acc *= base;
+        base *= base;
+    }
+    return (int32_t) acc;
+}
+static inline int64_t pow64(int64_t x, int64_t y)
+{
+    uint64_t base = (uint64_t) x, acc = 1, bits = (uint64_t) y;
+
+    for (; bits != 0; bits >>= 1) {
+        if (bits & 1)
+            acc *= base;
+        base *= base;
+    }
+    return (int64_t) acc;
+}
+// Integer to boolean: true exactly when the argument is non-zero.
+static inline bool itob_i8_bool(int8_t x) { return x != 0; }
+static inline bool itob_i16_bool(int16_t x) { return x != 0; }
+static inline bool itob_i32_bool(int32_t x) { return x != 0; }
+static inline bool itob_i64_bool(int64_t x) { return x != 0; }
+// Boolean to integer: 1 for true, 0 for false.
+static inline int8_t btoi_bool_i8(bool x) { return x ? 1 : 0; }
+static inline int16_t btoi_bool_i16(bool x) { return x ? 1 : 0; }
+static inline int32_t btoi_bool_i32(bool x) { return x ? 1 : 0; }
+static inline int64_t btoi_bool_i64(bool x) { return x ? 1 : 0; }
+// Integer width conversions.  sext_iA_iB reads the argument as a signed
+// A-bit value and zext_iA_iB as an unsigned A-bit value; the result is then
+// converted to a signed B-bit integer.  The macro argument is parenthesised
+// so that a compound expression such as a + b is converted as a whole
+// instead of the cast binding only to its first operand.
+#define sext_i8_i8(x) ((int8_t) (int8_t) (x))
+#define sext_i8_i16(x) ((int16_t) (int8_t) (x))
+#define sext_i8_i32(x) ((int32_t) (int8_t) (x))
+#define sext_i8_i64(x) ((int64_t) (int8_t) (x))
+#define sext_i16_i8(x) ((int8_t) (int16_t) (x))
+#define sext_i16_i16(x) ((int16_t) (int16_t) (x))
+#define sext_i16_i32(x) ((int32_t) (int16_t) (x))
+#define sext_i16_i64(x) ((int64_t) (int16_t) (x))
+#define sext_i32_i8(x) ((int8_t) (int32_t) (x))
+#define sext_i32_i16(x) ((int16_t) (int32_t) (x))
+#define sext_i32_i32(x) ((int32_t) (int32_t) (x))
+#define sext_i32_i64(x) ((int64_t) (int32_t) (x))
+#define sext_i64_i8(x) ((int8_t) (int64_t) (x))
+#define sext_i64_i16(x) ((int16_t) (int64_t) (x))
+#define sext_i64_i32(x) ((int32_t) (int64_t) (x))
+#define sext_i64_i64(x) ((int64_t) (int64_t) (x))
+#define zext_i8_i8(x) ((int8_t) (uint8_t) (x))
+#define zext_i8_i16(x) ((int16_t) (uint8_t) (x))
+#define zext_i8_i32(x) ((int32_t) (uint8_t) (x))
+#define zext_i8_i64(x) ((int64_t) (uint8_t) (x))
+#define zext_i16_i8(x) ((int8_t) (uint16_t) (x))
+#define zext_i16_i16(x) ((int16_t) (uint16_t) (x))
+#define zext_i16_i32(x) ((int32_t) (uint16_t) (x))
+#define zext_i16_i64(x) ((int64_t) (uint16_t) (x))
+#define zext_i32_i8(x) ((int8_t) (uint32_t) (x))
+#define zext_i32_i16(x) ((int16_t) (uint32_t) (x))
+#define zext_i32_i32(x) ((int32_t) (uint32_t) (x))
+#define zext_i32_i64(x) ((int64_t) (uint32_t) (x))
+#define zext_i64_i8(x) ((int8_t) (uint64_t) (x))
+#define zext_i64_i16(x) ((int16_t) (uint64_t) (x))
+#define zext_i64_i32(x) ((int32_t) (uint64_t) (x))
+#define zext_i64_i64(x) ((int64_t) (uint64_t) (x))
+#if defined(__OPENCL_VERSION__)
+// OpenCL build: population count via the popcount() built-in.
+static int32_t futrts_popc8(int8_t x) { return (int32_t) popcount(x); }
+static int32_t futrts_popc16(int16_t x) { return (int32_t) popcount(x); }
+static int32_t futrts_popc32(int32_t x) { return (int32_t) popcount(x); }
+static int32_t futrts_popc64(int64_t x) { return (int32_t) popcount(x); }
+#elif defined(__CUDA_ARCH__)
+// CUDA build: population count via __popc()/__popcll().  The 8- and 16-bit
+// arguments are zero-extended to 32 bits before the call.
+static int32_t futrts_popc8(int8_t x) { return (int32_t) __popc(zext_i8_i32(x)); }
+static int32_t futrts_popc16(int16_t x) { return (int32_t) __popc(zext_i16_i32(x)); }
+static int32_t futrts_popc32(int32_t x) { return (int32_t) __popc(x); }
+static int32_t futrts_popc64(int64_t x) { return (int32_t) __popcll(x); }
+#else
+// Fallback build: population count (number of set bits) of x.  Each loop
+// iteration clears the lowest set bit.  The work is done on the unsigned
+// bit pattern so that `bits - 1` cannot overflow; on the signed type the
+// 32- and 64-bit variants overflowed for the most negative value.
+static int32_t futrts_popc8(int8_t x)
+{
+    uint8_t bits = (uint8_t) x;
+    int32_t count = 0;
+
+    while (bits != 0) {
+        bits &= (uint8_t) (bits - 1);
+        count++;
+    }
+    return count;
+}
+static int32_t futrts_popc16(int16_t x)
+{
+    uint16_t bits = (uint16_t) x;
+    int32_t count = 0;
+
+    while (bits != 0) {
+        bits &= (uint16_t) (bits - 1);
+        count++;
+    }
+    return count;
+}
+static int32_t futrts_popc32(int32_t x)
+{
+    uint32_t bits = (uint32_t) x;
+    int32_t count = 0;
+
+    while (bits != 0) {
+        bits &= bits - 1;
+        count++;
+    }
+    return count;
+}
+static int32_t futrts_popc64(int64_t x)
+{
+    uint64_t bits = (uint64_t) x;
+    int32_t count = 0;
+
+    while (bits != 0) {
+        bits &= bits - 1;
+        count++;
+    }
+    return count;
+}
+#endif
+#if defined(__OPENCL_VERSION__)
+// OpenCL build: forward to the mul_hi() built-in for every width.
+static uint8_t futrts_mul_hi8(uint8_t a, uint8_t b) { return (uint8_t) mul_hi(a, b); }
+static uint16_t futrts_mul_hi16(uint16_t a, uint16_t b) { return (uint16_t) mul_hi(a, b); }
+static uint32_t futrts_mul_hi32(uint32_t a, uint32_t b) { return (uint32_t) mul_hi(a, b); }
+static uint64_t futrts_mul_hi64(uint64_t a, uint64_t b) { return (uint64_t) mul_hi(a, b); }
+#elif defined(__CUDA_ARCH__)
+// CUDA build: the 8- and 16-bit variants form the product in a type twice
+// as wide and keep its upper half; the 32- and 64-bit variants call
+// mulhi() and mul64hi().
+static uint8_t futrts_mul_hi8(uint8_t a, uint8_t b)
+{
+    return (uint8_t) (((uint16_t) a * (uint16_t) b) >> 8);
+}
+static uint16_t futrts_mul_hi16(uint16_t a, uint16_t b)
+{
+    return (uint16_t) (((uint32_t) a * (uint32_t) b) >> 16);
+}
+static uint32_t futrts_mul_hi32(uint32_t a, uint32_t b) { return mulhi(a, b); }
+static uint64_t futrts_mul_hi64(uint64_t a, uint64_t b) { return mul64hi(a, b); }
+#else
+// Fallback build: upper half of the full-width unsigned product.  Each
+// variant multiplies in a type twice as wide as its operands and shifts the
+// low half away; the 64-bit variant uses the compiler's __uint128_t.
+static uint8_t futrts_mul_hi8(uint8_t a, uint8_t b)
+{
+    return (uint8_t) (((uint16_t) a * (uint16_t) b) >> 8);
+}
+static uint16_t futrts_mul_hi16(uint16_t a, uint16_t b)
+{
+    return (uint16_t) (((uint32_t) a * (uint32_t) b) >> 16);
+}
+static uint32_t futrts_mul_hi32(uint32_t a, uint32_t b)
+{
+    return (uint32_t) (((uint64_t) a * (uint64_t) b) >> 32);
+}
+static uint64_t futrts_mul_hi64(uint64_t a, uint64_t b)
+{
+    return (uint64_t) (((__uint128_t) a * (__uint128_t) b) >> 64);
+}
+#endif
+#if defined(__OPENCL_VERSION__)
+// OpenCL build: forward to the mad_hi() built-in for every width.
+static uint8_t futrts_mad_hi8(uint8_t a, uint8_t b, uint8_t c) { return (uint8_t) mad_hi(a, b, c); }
+static uint16_t futrts_mad_hi16(uint16_t a, uint16_t b, uint16_t c) { return (uint16_t) mad_hi(a, b, c); }
+static uint32_t futrts_mad_hi32(uint32_t a, uint32_t b, uint32_t c) { return (uint32_t) mad_hi(a, b, c); }
+static uint64_t futrts_mad_hi64(uint64_t a, uint64_t b, uint64_t c) { return (uint64_t) mad_hi(a, b, c); }
+#else
+// Non-OpenCL build: high half of a * b (via futrts_mul_hi*) plus c, wrapped
+// to the operand width.
+static uint8_t futrts_mad_hi8(uint8_t a, uint8_t b, uint8_t c)
+{
+    const uint8_t high = futrts_mul_hi8(a, b);
+    return (uint8_t) (high + c);
+}
+static uint16_t futrts_mad_hi16(uint16_t a, uint16_t b, uint16_t c)
+{
+    const uint16_t high = futrts_mul_hi16(a, b);
+    return (uint16_t) (high + c);
+}
+static uint32_t futrts_mad_hi32(uint32_t a, uint32_t b, uint32_t c)
+{
+    const uint32_t high = futrts_mul_hi32(a, b);
+    return high + c;
+}
+static uint64_t futrts_mad_hi64(uint64_t a, uint64_t b, uint64_t c)
+{
+    const uint64_t high = futrts_mul_hi64(a, b);
+    return high + c;
+}
+#endif
+#if defined(__OPENCL_VERSION__)
+// OpenCL build: count leading zero bits via the clz() built-in.
+static int32_t futrts_clzz8(int8_t x) { return (int32_t) clz(x); }
+static int32_t futrts_clzz16(int16_t x) { return (int32_t) clz(x); }
+static int32_t futrts_clzz32(int32_t x) { return (int32_t) clz(x); }
+static int32_t futrts_clzz64(int64_t x) { return (int32_t) clz(x); }
+#elif defined(__CUDA_ARCH__)
+// CUDA build: count leading zero bits via __clz()/__clzll().  The 8- and
+// 16-bit arguments are zero-extended to 32 bits, so the 24 (resp. 16) extra
+// leading zeros are subtracted again.
+static int32_t futrts_clzz8(int8_t x)
+{
+    const int32_t wide = __clz(zext_i8_i32(x));
+    return wide - 24;
+}
+static int32_t futrts_clzz16(int16_t x)
+{
+    const int32_t wide = __clz(zext_i16_i32(x));
+    return wide - 16;
+}
+static int32_t futrts_clzz32(int32_t x) { return __clz(x); }
+static int32_t futrts_clzz64(int64_t x) { return __clzll(x); }
+#else
+// Fallback build: number of leading zero bits of x, which is the full bit
+// width when x is 0.  A single-bit probe walks down from the top bit of the
+// unsigned bit pattern until it meets a set bit.  Left-shifting the signed
+// argument itself, as done previously, moved a 1 into the sign bit, which
+// is undefined for the 32- and 64-bit types.
+static int32_t futrts_clzz8(int8_t x)
+{
+    const uint8_t bits = (uint8_t) x;
+    int32_t zeros = 0;
+
+    for (uint8_t probe = 0x80; probe != 0 && (bits & probe) == 0; probe >>= 1)
+        zeros++;
+    return zeros;
+}
+static int32_t futrts_clzz16(int16_t x)
+{
+    const uint16_t bits = (uint16_t) x;
+    int32_t zeros = 0;
+
+    for (uint16_t probe = 0x8000; probe != 0 && (bits & probe) == 0; probe >>= 1)
+        zeros++;
+    return zeros;
+}
+static int32_t futrts_clzz32(int32_t x)
+{
+    const uint32_t bits = (uint32_t) x;
+    int32_t zeros = 0;
+
+    for (uint32_t probe = UINT32_C(1) << 31; probe != 0 && (bits & probe) == 0; probe >>= 1)
+        zeros++;
+    return zeros;
+}
+static int32_t futrts_clzz64(int64_t x)
+{
+    const uint64_t bits = (uint64_t) x;
+    int32_t zeros = 0;
+
+    for (uint64_t probe = UINT64_C(1) << 63; probe != 0 && (bits & probe) == 0; probe >>= 1)
+        zeros++;
+    return zeros;
+}
+#endif
+#if defined(__OPENCL_VERSION__)
+// OpenCL build: count trailing zero bits with a plain loop that shifts x
+// right until its lowest bit is set or every bit has been inspected.  The
+// result is the full bit width when x is 0.
+static int32_t futrts_ctzz8(int8_t x)
+{
+    int32_t zeros = 0;
+
+    while (zeros < 8 && !(x & 1)) {
+        x >>= 1;
+        zeros++;
+    }
+    return zeros;
+}
+static int32_t futrts_ctzz16(int16_t x)
+{
+    int32_t zeros = 0;
+
+    while (zeros < 16 && !(x & 1)) {
+        x >>= 1;
+        zeros++;
+    }
+    return zeros;
+}
+static int32_t futrts_ctzz32(int32_t x)
+{
+    int32_t zeros = 0;
+
+    while (zeros < 32 && !(x & 1)) {
+        x >>= 1;
+        zeros++;
+    }
+    return zeros;
+}
+static int32_t futrts_ctzz64(int64_t x)
+{
+    int32_t zeros = 0;
+
+    while (zeros < 64 && !(x & 1)) {
+        x >>= 1;
+        zeros++;
+    }
+    return zeros;
+}
+#elif defined(__CUDA_ARCH__)
+// CUDA build: count trailing zero bits from __ffs()/__ffsll().  A result of
+// 0 from those calls is mapped to the full bit width; any other result is
+// reduced by one.
+static int32_t futrts_ctzz8(int8_t x)
+{
+    const int pos = __ffs(x);
+
+    if (pos == 0)
+        return 8;
+    return pos - 1;
+}
+static int32_t futrts_ctzz16(int16_t x)
+{
+    const int pos = __ffs(x);
+
+    if (pos == 0)
+        return 16;
+    return pos - 1;
+}
+static int32_t futrts_ctzz32(int32_t x)
+{
+    const int pos = __ffs(x);
+
+    if (pos == 0)
+        return 32;
+    return pos - 1;
+}
+static int32_t futrts_ctzz64(int64_t x)
+{
+    const int pos = __ffsll(x);
+
+    if (pos == 0)
+        return 64;
+    return pos - 1;
+}
+#else
+// Fallback build: count trailing zero bits with the compiler built-ins
+// __builtin_ctz()/__builtin_ctzll().  Zero is answered separately with the
+// full bit width and never reaches the built-in.
+static int32_t futrts_ctzz8(int8_t x)
+{
+    if (x == 0)
+        return 8;
+    return __builtin_ctz((uint32_t) x);
+}
+static int32_t futrts_ctzz16(int16_t x)
+{
+    if (x == 0)
+        return 16;
+    return __builtin_ctz((uint32_t) x);
+}
+static int32_t futrts_ctzz32(int32_t x)
+{
+    if (x == 0)
+        return 32;
+    return __builtin_ctz(x);
+}
+static int32_t futrts_ctzz64(int64_t x)
+{
+    if (x == 0)
+        return 64;
+    return __builtin_ctzll(x);
+}
+#endif
+// Single-precision division, addition, subtraction and multiplication.
+static inline float fdiv32(float x, float y) { const float quotient = x / y; return quotient; }
+static inline float fadd32(float x, float y) { const float sum = x + y; return sum; }
+static inline float fsub32(float x, float y) { const float difference = x - y; return difference; }
+static inline float fmul32(float x, float y) { const float product = x * y; return product; }
+// Single-precision minimum, maximum and power, forwarding to fmin(), fmax()
+// and pow() and converting the result to float.
+static inline float fmin32(float x, float y) { return (float) fmin(x, y); }
+static inline float fmax32(float x, float y) { return (float) fmax(x, y); }
+static inline float fpow32(float x, float y) { return (float) pow(x, y); }
+// Single-precision "less than" and "less than or equal".
+static inline bool cmplt32(float x, float y) { return y > x; }
+static inline bool cmple32(float x, float y) { return y >= x; }
+// Signed integer to float.
+static inline float sitofp_i8_f32(int8_t x) { const float f = (float) x; return f; }
+static inline float sitofp_i16_f32(int16_t x) { const float f = (float) x; return f; }
+static inline float sitofp_i32_f32(int32_t x) { const float f = (float) x; return f; }
+static inline float sitofp_i64_f32(int64_t x) { const float f = (float) x; return f; }
+// Unsigned integer to float.
+static inline float uitofp_i8_f32(uint8_t x) { const float f = (float) x; return f; }
+static inline float uitofp_i16_f32(uint16_t x) { const float f = (float) x; return f; }
+static inline float uitofp_i32_f32(uint32_t x) { const float f = (float) x; return f; }
+static inline float uitofp_i64_f32(uint64_t x) { const float f = (float) x; return f; }
+// Float to signed integer, using a plain C cast.
+static inline int8_t fptosi_f32_i8(float x) { const int8_t i = (int8_t) x; return i; }
+static inline int16_t fptosi_f32_i16(float x) { const int16_t i = (int16_t) x; return i; }
+static inline int32_t fptosi_f32_i32(float x) { const int32_t i = (int32_t) x; return i; }
+static inline int64_t fptosi_f32_i64(float x) { const int64_t i = (int64_t) x; return i; }
+// Float to unsigned integer, using a plain C cast.
+static inline uint8_t fptoui_f32_i8(float x) { const uint8_t u = (uint8_t) x; return u; }
+static inline uint16_t fptoui_f32_i16(float x) { const uint16_t u = (uint16_t) x; return u; }
+static inline uint32_t fptoui_f32_i32(float x) { const uint32_t u = (uint32_t) x; return u; }
+static inline uint64_t fptoui_f32_i64(float x) { const uint64_t u = (uint64_t) x; return u; }
+// Double-precision division, addition, subtraction and multiplication.
+static inline double fdiv64(double x, double y) { const double quotient = x / y; return quotient; }
+static inline double fadd64(double x, double y) { const double sum = x + y; return sum; }
+static inline double fsub64(double x, double y) { const double difference = x - y; return difference; }
+static inline double fmul64(double x, double y) { const double product = x * y; return product; }
+// Double-precision minimum, maximum and power via fmin(), fmax() and pow().
+static inline double fmin64(double x, double y) { const double lo = fmin(x, y); return lo; }
+static inline double fmax64(double x, double y) { const double hi = fmax(x, y); return hi; }
+static inline double fpow64(double x, double y) { const double raised = pow(x, y); return raised; }
+// Double-precision "less than" and "less than or equal".
+static inline bool cmplt64(double x, double y) { return y > x; }
+static inline bool cmple64(double x, double y) { return y >= x; }
+// Signed integer to double.
+static inline double sitofp_i8_f64(int8_t x) { const double d = (double) x; return d; }
+static inline double sitofp_i16_f64(int16_t x) { const double d = (double) x; return d; }
+static inline double sitofp_i32_f64(int32_t x) { const double d = (double) x; return d; }
+static inline double sitofp_i64_f64(int64_t x) { const double d = (double) x; return d; }
+// Unsigned integer to double.
+static inline double uitofp_i8_f64(uint8_t x) { const double d = (double) x; return d; }
+static inline double uitofp_i16_f64(uint16_t x) { const double d = (double) x; return d; }
+static inline double uitofp_i32_f64(uint32_t x) { const double d = (double) x; return d; }
+static inline double uitofp_i64_f64(uint64_t x) { const double d = (double) x; return d; }
+// Double to signed integer, using a plain C cast.
+static inline int8_t fptosi_f64_i8(double x) { const int8_t i = (int8_t) x; return i; }
+static inline int16_t fptosi_f64_i16(double x) { const int16_t i = (int16_t) x; return i; }
+static inline int32_t fptosi_f64_i32(double x) { const int32_t i = (int32_t) x; return i; }
+static inline int64_t fptosi_f64_i64(double x) { const int64_t i = (int64_t) x; return i; }
+// Double to unsigned integer, using a plain C cast.
+static inline uint8_t fptoui_f64_i8(double x) { const uint8_t u = (uint8_t) x; return u; }
+static inline uint16_t fptoui_f64_i16(double x) { const uint16_t u = (uint16_t) x; return u; }
+static inline uint32_t fptoui_f64_i32(double x) { const uint32_t u = (uint32_t) x; return u; }
+static inline uint64_t fptoui_f64_i64(double x) { const uint64_t u = (uint64_t) x; return u; }
+// Conversions between the two floating-point widths.  The same-width
+// variants hand the value back after a cast to its own type.
+static inline float fpconv_f32_f32(float x) { const float same = (float) x; return same; }
+static inline double fpconv_f32_f64(float x) { const double widened = (double) x; return widened; }
+static inline float fpconv_f64_f32(double x) { const float narrowed = (float) x; return narrowed; }
+static inline double fpconv_f64_f64(double x) { const double same = (double) x; return same; }
+// Classification of a float via isnan() and isinf(), reported as bool.
+static inline bool futrts_isnan32(float x) { return isnan(x) != 0; }
+static inline bool futrts_isinf32(float x) { return isinf(x) != 0; }
+#ifdef __OPENCL_VERSION__
+// OpenCL build: single-precision math, each function forwarding to the
+// unsuffixed function of the same purpose.
+
+// Logarithms, square root, exponential.
+static inline float futrts_log32(float x) { return log(x); }
+static inline float futrts_log2_32(float x) { return log2(x); }
+static inline float futrts_log10_32(float x) { return log10(x); }
+static inline float futrts_sqrt32(float x) { return sqrt(x); }
+static inline float futrts_exp32(float x) { return exp(x); }
+// Trigonometric functions and their inverses.
+static inline float futrts_cos32(float x) { return cos(x); }
+static inline float futrts_sin32(float x) { return sin(x); }
+static inline float futrts_tan32(float x) { return tan(x); }
+static inline float futrts_acos32(float x) { return acos(x); }
+static inline float futrts_asin32(float x) { return asin(x); }
+static inline float futrts_atan32(float x) { return atan(x); }
+// Hyperbolic functions and their inverses.
+static inline float futrts_cosh32(float x) { return cosh(x); }
+static inline float futrts_sinh32(float x) { return sinh(x); }
+static inline float futrts_tanh32(float x) { return tanh(x); }
+static inline float futrts_acosh32(float x) { return acosh(x); }
+static inline float futrts_asinh32(float x) { return asinh(x); }
+static inline float futrts_atanh32(float x) { return atanh(x); }
+// Two-argument functions; the arguments are passed on in the given order.
+static inline float futrts_atan2_32(float x, float y) { return atan2(x, y); }
+static inline float futrts_hypot32(float x, float y) { return hypot(x, y); }
+// Gamma function and the logarithm of its absolute value.
+static inline float futrts_gamma32(float x) { return tgamma(x); }
+static inline float futrts_lgamma32(float x) { return lgamma(x); }
+// Remainder and rounding; futrts_round32 is implemented with rint().
+static inline float fmod32(float x, float y) { return fmod(x, y); }
+static inline float futrts_round32(float x) { return rint(x); }
+static inline float futrts_floor32(float x) { return floor(x); }
+static inline float futrts_ceil32(float x) { return ceil(x); }
+// Interpolation and multiply-add via the mix(), mad() and fma() built-ins.
+static inline float futrts_lerp32(float v0, float v1, float t) { return mix(v0, v1, t); }
+static inline float futrts_mad32(float a, float b, float c) { return mad(a, b, c); }
+static inline float futrts_fma32(float a, float b, float c) { return fma(a, b, c); }
+#else
+// Non-OpenCL build: single-precision math, each function forwarding to the
+// f-suffixed <math.h> function of the same purpose.
+
+// Logarithms, square root, exponential.
+static inline float futrts_log32(float x) { return logf(x); }
+static inline float futrts_log2_32(float x) { return log2f(x); }
+static inline float futrts_log10_32(float x) { return log10f(x); }
+static inline float futrts_sqrt32(float x) { return sqrtf(x); }
+static inline float futrts_exp32(float x) { return expf(x); }
+// Trigonometric functions and their inverses.
+static inline float futrts_cos32(float x) { return cosf(x); }
+static inline float futrts_sin32(float x) { return sinf(x); }
+static inline float futrts_tan32(float x) { return tanf(x); }
+static inline float futrts_acos32(float x) { return acosf(x); }
+static inline float futrts_asin32(float x) { return asinf(x); }
+static inline float futrts_atan32(float x) { return atanf(x); }
+// Hyperbolic functions and their inverses.
+static inline float futrts_cosh32(float x) { return coshf(x); }
+static inline float futrts_sinh32(float x) { return sinhf(x); }
+static inline float futrts_tanh32(float x) { return tanhf(x); }
+static inline float futrts_acosh32(float x) { return acoshf(x); }
+static inline float futrts_asinh32(float x) { return asinhf(x); }
+static inline float futrts_atanh32(float x) { return atanhf(x); }
+// Two-argument functions; the arguments are passed on in the given order.
+static inline float futrts_atan2_32(float x, float y) { return atan2f(x, y); }
+static inline float futrts_hypot32(float x, float y) { return hypotf(x, y); }
+// Gamma function and the logarithm of its absolute value.
+static inline float futrts_gamma32(float x) { return tgammaf(x); }
+static inline float futrts_lgamma32(float x) { return lgammaf(x); }
+// Remainder and rounding; futrts_round32 is implemented with rintf().
+static inline float fmod32(float x, float y) { return fmodf(x, y); }
+static inline float futrts_round32(float x) { return rintf(x); }
+static inline float futrts_floor32(float x) { return floorf(x); }
+static inline float futrts_ceil32(float x) { return ceilf(x); }
+// Linear interpolation from v0 (t = 0) towards v1 (t = 1).
+static inline float futrts_lerp32(float v0, float v1, float t) { return v0 + (v1 - v0) * t; }
+// Multiply-add written as an ordinary expression.
+static inline float futrts_mad32(float a, float b, float c) { return a * b + c; }
+// Multiply-add through fmaf().
+static inline float futrts_fma32(float a, float b, float c) { return fmaf(a, b, c); }
+#endif
+// Reinterpret the object representation of a float as a 32-bit integer by
+// writing one member of a union and reading the other.
+static inline int32_t futrts_to_bits32(float x)
+{
+    union {
+        float as_float;
+        int32_t as_bits;
+    } pun;
+
+    pun.as_float = x;
+    return pun.as_bits;
+}
+// Inverse of the above: reinterpret a 32-bit integer as a float.
+static inline float futrts_from_bits32(int32_t x)
+{
+    union {
+        int32_t as_bits;
+        float as_float;
+    } pun;
+
+    pun.as_bits = x;
+    return pun.as_float;
+}
+// Sign of x as a float: 1 for a positive value, -1 for a negative value and
+// 0 for zero.  A NaN argument is handed back unchanged.
+static inline float fsignum32(float x)
+{
+    if (futrts_isnan32(x))
+        return x;
+    return (float) ((x > 0) - (x < 0));
+}
+// Double-precision math, each function forwarding to the function of the
+// same purpose.
+
+// Logarithms, square root, exponential.
+static inline double futrts_log64(double x) { return log(x); }
+static inline double futrts_log2_64(double x) { return log2(x); }
+static inline double futrts_log10_64(double x) { return log10(x); }
+static inline double futrts_sqrt64(double x) { return sqrt(x); }
+static inline double futrts_exp64(double x) { return exp(x); }
+// Trigonometric functions and their inverses.
+static inline double futrts_cos64(double x) { return cos(x); }
+static inline double futrts_sin64(double x) { return sin(x); }
+static inline double futrts_tan64(double x) { return tan(x); }
+static inline double futrts_acos64(double x) { return acos(x); }
+static inline double futrts_asin64(double x) { return asin(x); }
+static inline double futrts_atan64(double x) { return atan(x); }
+// Hyperbolic functions and their inverses.
+static inline double futrts_cosh64(double x) { return cosh(x); }
+static inline double futrts_sinh64(double x) { return sinh(x); }
+static inline double futrts_tanh64(double x) { return tanh(x); }
+static inline double futrts_acosh64(double x) { return acosh(x); }
+static inline double futrts_asinh64(double x) { return asinh(x); }
+static inline double futrts_atanh64(double x) { return atanh(x); }
+// Two-argument functions; the arguments are passed on in the given order.
+static inline double futrts_atan2_64(double x, double y) { return atan2(x, y); }
+static inline double futrts_hypot64(double x, double y) { return hypot(x, y); }
+// Gamma function and the logarithm of its absolute value.
+static inline double futrts_gamma64(double x) { return tgamma(x); }
+static inline double futrts_lgamma64(double x) { return lgamma(x); }
+// Multiply-add through fma().
+static inline double futrts_fma64(double a, double b, double c) { return fma(a, b, c); }
+// Rounding; futrts_round64 is implemented with rint().
+static inline double futrts_round64(double x) { return rint(x); }
+static inline double futrts_ceil64(double x) { return ceil(x); }
+static inline double futrts_floor64(double x) { return floor(x); }
+// Classification via isnan() and isinf(), reported as bool.
+static inline bool futrts_isnan64(double x) { return isnan(x) != 0; }
+static inline bool futrts_isinf64(double x) { return isinf(x) != 0; }
+// Reinterpret the object representation of a double as a 64-bit integer by
+// writing one member of a union and reading the other.
+static inline int64_t futrts_to_bits64(double x)
+{
+    union {
+        double as_double;
+        int64_t as_bits;
+    } pun;
+
+    pun.as_double = x;
+    return pun.as_bits;
+}
+// Inverse of the above: reinterpret a 64-bit integer as a double.
+static inline double futrts_from_bits64(int64_t x)
+{
+    union {
+        int64_t as_bits;
+        double as_double;
+    } pun;
+
+    pun.as_bits = x;
+    return pun.as_double;
+}
+// Double-precision remainder via fmod().
+static inline double fmod64(double x, double y) { return fmod(x, y); }
+// Sign of x as a double: 1 for a positive value, -1 for a negative value
+// and 0 for zero.  A NaN argument is handed back unchanged.
+static inline double fsignum64(double x)
+{
+    if (futrts_isnan64(x))
+        return x;
+    return (double) ((x > 0) - (x < 0));
+}
+#ifdef __OPENCL_VERSION__
+// OpenCL build: interpolation and multiply-add via mix() and mad().
+static inline double futrts_lerp64(double v0, double v1, double t) { return mix(v0, v1, t); }
+static inline double futrts_mad64(double a, double b, double c) { return mad(a, b, c); }
+#else
+// Linear interpolation from v0 (t = 0) towards v1 (t = 1).
+static inline double futrts_lerp64(double v0, double v1, double t) { return v0 + (v1 - v0) * t; }
+// Multiply-add written as an ordinary expression.
+static inline double futrts_mad64(double a, double b, double c) { return a * b + c; }
+#endif
+static int init_constants(struct futhark_context *); // forward declaration; the body is not in this part of the file
+static int free_constants(struct futhark_context *); // forward declaration; the body is not in this part of the file
+struct memblock_device { // a memory block whose storage is an OpenCL memory object
+    int *references; // presumably a shared reference count -- confirm at the use sites
+    cl_mem mem; // handle of the OpenCL memory object
+    int64_t size; // presumably the size of the block in bytes -- confirm
+    const char *desc; // presumably a human-readable label for the block -- confirm
+} ;
+struct memblock { // same layout as memblock_device, but the storage is a plain host pointer
+    int *references; // presumably a shared reference count -- confirm at the use sites
+    char *mem; // host pointer to the storage
+    int64_t size; // presumably the size of the block in bytes -- confirm
+    const char *desc; // presumably a human-readable label for the block -- confirm
+} ;
+typedef cl_mem fl_mem_t; // handle type stored by the free list defined next
+// Start of free_list.h.
+
+// An entry in the free list.  May be invalid, to avoid having to
+// deallocate entries as soon as they are removed.  There is also a
+// tag, to help with memory reuse.
+struct free_list_entry {
+  size_t size; // size recorded for the block; free_list_find compares it with the requested size
+  fl_mem_t mem; // handle of the cached memory block
+  const char *tag; // stored by free_list_insert; the pointer is kept, the string is not copied
+  unsigned char valid; // 1 while the slot holds a block, 0 once it is empty or the block was removed
+};
+
+struct free_list {
+  struct free_list_entry *entries;        // Pointer to entries.
+  int capacity;                           // Number of entries.
+  int used;                               // Number of valid entries.
+};
+
+// Set up an empty free list with room for 30 entries, every one of them
+// marked invalid.
+static void free_list_init(struct free_list *l) {
+  const int initial_capacity = 30; // Picked arbitrarily.
+  struct free_list_entry *slots =
+    (struct free_list_entry*) malloc(sizeof(struct free_list_entry) * initial_capacity);
+
+  for (int k = 0; k < initial_capacity; k++) {
+    slots[k].valid = 0;
+  }
+
+  l->entries = slots;
+  l->capacity = initial_capacity;
+  l->used = 0;
+}
+
+// Remove invalid entries from the free list.  Valid entries are moved to
+// the front, keeping their relative order, and the entry array is then
+// shrunk to the number of valid entries (but never below 30 slots).
+static void free_list_pack(struct free_list *l) {
+  int p = 0;
+  for (int i = 0; i < l->capacity; i++) {
+    if (l->entries[i].valid) {
+      l->entries[p] = l->entries[i];
+      if (i > p) {
+        // The entry now lives in slot p, so its old slot becomes free.
+        l->entries[i].valid = 0;
+      }
+      p++;
+    }
+  }
+
+  // Now p is the number of used elements.  We don't want it to go
+  // less than the default capacity (although in practice it's OK as
+  // long as it doesn't become 1).
+  if (p < 30) {
+    p = 30;
+  }
+
+  // realloc() returns NULL on failure and leaves the old array allocated,
+  // so its result must not be stored in l->entries unchecked.  The old
+  // array already holds the packed entries; if shrinking fails we simply
+  // keep it together with its old capacity.
+  struct free_list_entry *shrunk =
+    (struct free_list_entry*) realloc(l->entries, p * sizeof(struct free_list_entry));
+  if (shrunk != NULL) {
+    l->entries = shrunk;
+    l->capacity = p;
+  }
+}
+
+static void free_list_destroy(struct free_list *l) { // releases the entry array only, not the blocks stored in it
+  assert(l->used == 0); // the list must already have been emptied by the caller
+  free(l->entries);
+}
+
+// Index of the first slot whose valid flag is clear, or l->capacity when
+// every slot is occupied.
+static int free_list_find_invalid(struct free_list *l) {
+  for (int slot = 0; slot < l->capacity; slot++) {
+    if (l->entries[slot].valid == 0) {
+      return slot;
+    }
+  }
+  return l->capacity;
+}
+
+// Store a memory block in the first free slot of the list, doubling the
+// entry array first when no slot is free.  The 'tag' pointer is stored as
+// is; the string it points to is not copied.
+static void free_list_insert(struct free_list *l, size_t size, fl_mem_t mem, const char *tag) {
+  int i = free_list_find_invalid(l);
+
+  if (i == l->capacity) {
+    // List is full; so we have to grow it.  The byte count is computed in
+    // size_t rather than int so that it cannot be truncated.
+    size_t new_bytes = (size_t) l->capacity * 2 * sizeof(struct free_list_entry);
+    struct free_list_entry *grown =
+      (struct free_list_entry*) realloc(l->entries, new_bytes);
+    if (grown == NULL) {
+      // This function has no way to report failure, and carrying on would
+      // write through a null pointer, so stop in a well-defined way.
+      abort();
+    }
+    l->entries = grown;
+    for (int j = 0; j < l->capacity; j++) {
+      l->entries[j+l->capacity].valid = 0;
+    }
+    l->capacity *= 2;
+  }
+
+  // Now 'i' points to the first invalid entry.
+  l->entries[i].valid = 1;
+  l->entries[i].size = size;
+  l->entries[i].mem = mem;
+  l->entries[i].tag = tag;
+
+  l->used++;
+}
+
+// Find and remove the smallest block in the free list whose recorded size
+// is at least 'size'.  On success the block's size and handle are written
+// to *size_out and *mem_out and 0 is returned; if no block is large enough
+// the list is left untouched and 1 is returned.
+static int free_list_find(struct free_list *l, size_t size,
+                          size_t *size_out, fl_mem_t *mem_out) {
+  struct free_list_entry *best = NULL;
+
+  for (int i = 0; i < l->capacity; i++) {
+    struct free_list_entry *candidate = &l->entries[i];
+
+    // Skip empty slots and blocks that are too small.
+    if (!candidate->valid || candidate->size < size) {
+      continue;
+    }
+    // Keep the candidate only if it is strictly smaller than the best one
+    // so far; among equally sized blocks the earliest slot wins.
+    if (best == NULL || candidate->size < best->size) {
+      best = candidate;
+    }
+  }
+
+  if (best == NULL) {
+    return 1;
+  }
+
+  best->valid = 0;
+  *size_out = best->size;
+  *mem_out = best->mem;
+  l->used--;
+  return 0;
+}
+
+// Take the block in the lowest-numbered occupied slot out of the free list
+// and store its handle in *mem_out.  Returns 0 if a block was removed and 1
+// if the list held no block.
+static int free_list_first(struct free_list *l, fl_mem_t *mem_out) {
+  int slot = 0;
+
+  // Advance past the empty slots.
+  while (slot < l->capacity && !l->entries[slot].valid) {
+    slot++;
+  }
+  if (slot == l->capacity) {
+    return 1;
+  }
+
+  l->entries[slot].valid = 0;
+  *mem_out = l->entries[slot].mem;
+  l->used--;
+  return 0;
+}
+
+// End of free_list.h.
+
+// Start of opencl.h.
+
+#define OPENCL_SUCCEED_FATAL(e) opencl_succeed_fatal(e, #e, __FILE__, __LINE__) // passes on the text of e and the source location
+#define OPENCL_SUCCEED_NONFATAL(e) opencl_succeed_nonfatal(e, #e, __FILE__, __LINE__) // the result is used as a char* error string below
+// Take care not to override an existing error: the new message is stored in ctx->error and 'bad' is returned only when no error is recorded yet; otherwise the new message is freed and execution continues past the macro.  'ctx' and 'bad' must be in scope at the point of use.
+#define OPENCL_SUCCEED_OR_RETURN(e) {             \
+    char *serror = OPENCL_SUCCEED_NONFATAL(e);    \
+    if (serror) {                                 \
+      if (!ctx->error) {                          \
+        ctx->error = serror;                      \
+        return bad;                               \
+      } else {                                    \
+        free(serror);                             \
+      }                                           \
+    }                                             \
+  }
+
+// OPENCL_SUCCEED_OR_RETURN returns the value of the variable 'bad' in
+// scope.  By default, it will be this one.  Create a local variable
+// of some other type if needed.  This is a bit of a hack, but it
+// saves effort in the code generator.
+static const int bad = 1;
+
+struct opencl_config { // user-adjustable settings; defaults are assigned by opencl_config_init
+  int debugging; // flag, 0 by default
+  int profiling; // flag, 0 by default
+  int logging; // flag, 0 by default
+  int preferred_device_num; // 0 by default
+  const char *preferred_platform; // "" by default; presumably matched against platform names -- confirm
+  const char *preferred_device; // "" by default; presumably matched against device names -- confirm
+  int ignore_blacklist; // flag, 0 by default
+
+  const char* dump_program_to; // NULL by default; presumably a file path -- confirm
+  const char* load_program_from; // NULL by default; presumably a file path -- confirm
+  const char* dump_binary_to; // NULL by default; presumably a file path -- confirm
+  const char* load_binary_from; // NULL by default; presumably a file path -- confirm
+
+  size_t default_group_size; // 0 is a placeholder: the concrete default is chosen during initialisation
+  size_t default_num_groups; // 0 is a placeholder, as above
+  size_t default_tile_size; // 0 is a placeholder, as above
+  size_t default_reg_tile_size; // 0 is a placeholder, as above
+  size_t default_threshold; // 0 is a placeholder, as above
+
+  int default_group_size_changed; // 0 by default; presumably set when the user overrides the group size -- confirm
+  int default_tile_size_changed; // 0 by default; presumably set when the user overrides the tile size -- confirm
+
+  int num_sizes; // presumably the length of each of the four arrays below -- confirm
+  const char **size_names; // supplied by the caller of opencl_config_init, not copied
+  const char **size_vars; // supplied by the caller of opencl_config_init, not copied
+  int64_t *size_values; // supplied by the caller of opencl_config_init, not copied
+  const char **size_classes; // supplied by the caller of opencl_config_init, not copied
+};
+
+// Fill in *cfg with the default settings and attach the caller's size
+// tables.  The four arrays are stored by pointer, not copied.
+static void opencl_config_init(struct opencl_config *cfg,
+                               int num_sizes,
+                               const char *size_names[],
+                               const char *size_vars[],
+                               int64_t *size_values,
+                               const char *size_classes[]) {
+  // Size tables handed in by the caller.
+  cfg->num_sizes = num_sizes;
+  cfg->size_names = size_names;
+  cfg->size_vars = size_vars;
+  cfg->size_values = size_values;
+  cfg->size_classes = size_classes;
+
+  // Diagnostics are all switched off.
+  cfg->debugging = 0;
+  cfg->profiling = 0;
+  cfg->logging = 0;
+
+  // No preference regarding platform or device.
+  cfg->preferred_device_num = 0;
+  cfg->preferred_platform = "";
+  cfg->preferred_device = "";
+  cfg->ignore_blacklist = 0;
+
+  // Nothing is dumped or loaded.
+  cfg->dump_program_to = NULL;
+  cfg->load_program_from = NULL;
+  cfg->dump_binary_to = NULL;
+  cfg->load_binary_from = NULL;
+
+  // Zero is a placeholder here: the concrete defaults are determined
+  // during initialisation by heuristics that inspect the hardware.
+  cfg->default_group_size = 0;
+  cfg->default_num_groups = 0;
+  cfg->default_tile_size = 0;
+  cfg->default_reg_tile_size = 0;
+  cfg->default_threshold = 0;
+
+  cfg->default_group_size_changed = 0;
+  cfg->default_tile_size_changed = 0;
+}
+
+// A record of something that happened.
+struct profiling_record {
+  cl_event *event; // OpenCL event associated with the record
+  int *runs; // presumably a counter updated when the event is processed -- confirm
+  int64_t *runtime; // presumably an accumulated duration updated likewise -- confirm
+};
+
+struct opencl_context {
+  cl_device_id device; // OpenCL device handle
+  cl_context ctx; // OpenCL context handle
+  cl_command_queue queue; // OpenCL command queue handle
+
+  struct opencl_config cfg; // settings, stored by value
+
+  struct free_list free_list; // cache of memory blocks, see free_list_insert/free_list_find
+
+  size_t max_group_size; // presumably limits of the selected device -- confirm where they are assigned
+  size_t max_num_groups;
+  size_t max_tile_size;
+  size_t max_threshold;
+  size_t max_local_memory;
+
+  size_t lockstep_width;
+
+  struct profiling_record *profiling_records; // array of records
+  int profiling_records_capacity; // presumably the number of allocated records -- confirm
+  int profiling_records_used; // presumably the number of records in use -- confirm
+};
+
+struct opencl_device_option { // one platform/device pair together with its names
+  cl_platform_id platform; // OpenCL platform handle
+  cl_device_id device; // OpenCL device handle
+  cl_device_type device_type; // OpenCL device type value
+  char *platform_name; // name string; ownership is not visible here -- confirm who frees it
+  char *device_name; // name string; ownership is not visible here -- confirm who frees it
+};
+
+// This function must be defined by the user.  It is invoked by
+// setup_opencl() after the platform and device have been found, but
+// before the program is loaded.  Its intended use is to tune
+// constants based on the selected platform and device.
+static void post_opencl_setup(struct opencl_context*, struct opencl_device_option*);
+
+// Return a freshly malloc()ed copy of the NUL-terminated string 'str', or
+// NULL if the allocation fails.  The caller is responsible for freeing it.
+static char *strclone(const char *str) {
+  const size_t bytes = strlen(str) + 1; // include the terminator
+  char *duplicate = (char*) malloc(bytes);
+
+  if (duplicate != NULL) {
+    memcpy(duplicate, str, bytes);
+  }
+  return duplicate;
+}
+
+// Translate an OpenCL status code into a short human-readable
+// description.  The returned pointer refers to a string literal and
+// must not be freed.  Codes that are not listed map to "Unknown".
+static const char* opencl_error_string(cl_int err)
+{
+  static const struct {
+    cl_int code;
+    const char *text;
+  } descriptions[] = {
+    { CL_SUCCESS,                         "Success!" },
+    { CL_DEVICE_NOT_FOUND,                "Device not found." },
+    { CL_DEVICE_NOT_AVAILABLE,            "Device not available" },
+    { CL_COMPILER_NOT_AVAILABLE,          "Compiler not available" },
+    { CL_MEM_OBJECT_ALLOCATION_FAILURE,   "Memory object allocation failure" },
+    { CL_OUT_OF_RESOURCES,                "Out of resources" },
+    { CL_OUT_OF_HOST_MEMORY,              "Out of host memory" },
+    { CL_PROFILING_INFO_NOT_AVAILABLE,    "Profiling information not available" },
+    { CL_MEM_COPY_OVERLAP,                "Memory copy overlap" },
+    { CL_IMAGE_FORMAT_MISMATCH,           "Image format mismatch" },
+    { CL_IMAGE_FORMAT_NOT_SUPPORTED,      "Image format not supported" },
+    { CL_BUILD_PROGRAM_FAILURE,           "Program build failure" },
+    { CL_MAP_FAILURE,                     "Map failure" },
+    { CL_INVALID_VALUE,                   "Invalid value" },
+    { CL_INVALID_DEVICE_TYPE,             "Invalid device type" },
+    { CL_INVALID_PLATFORM,                "Invalid platform" },
+    { CL_INVALID_DEVICE,                  "Invalid device" },
+    { CL_INVALID_CONTEXT,                 "Invalid context" },
+    { CL_INVALID_QUEUE_PROPERTIES,        "Invalid queue properties" },
+    { CL_INVALID_COMMAND_QUEUE,           "Invalid command queue" },
+    { CL_INVALID_HOST_PTR,                "Invalid host pointer" },
+    { CL_INVALID_MEM_OBJECT,              "Invalid memory object" },
+    { CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, "Invalid image format descriptor" },
+    { CL_INVALID_IMAGE_SIZE,              "Invalid image size" },
+    { CL_INVALID_SAMPLER,                 "Invalid sampler" },
+    { CL_INVALID_BINARY,                  "Invalid binary" },
+    { CL_INVALID_BUILD_OPTIONS,           "Invalid build options" },
+    { CL_INVALID_PROGRAM,                 "Invalid program" },
+    { CL_INVALID_PROGRAM_EXECUTABLE,      "Invalid program executable" },
+    { CL_INVALID_KERNEL_NAME,             "Invalid kernel name" },
+    { CL_INVALID_KERNEL_DEFINITION,       "Invalid kernel definition" },
+    { CL_INVALID_KERNEL,                  "Invalid kernel" },
+    { CL_INVALID_ARG_INDEX,               "Invalid argument index" },
+    { CL_INVALID_ARG_VALUE,               "Invalid argument value" },
+    { CL_INVALID_ARG_SIZE,                "Invalid argument size" },
+    { CL_INVALID_KERNEL_ARGS,             "Invalid kernel arguments" },
+    { CL_INVALID_WORK_DIMENSION,          "Invalid work dimension" },
+    { CL_INVALID_WORK_GROUP_SIZE,         "Invalid work group size" },
+    { CL_INVALID_WORK_ITEM_SIZE,          "Invalid work item size" },
+    { CL_INVALID_GLOBAL_OFFSET,           "Invalid global offset" },
+    { CL_INVALID_EVENT_WAIT_LIST,         "Invalid event wait list" },
+    { CL_INVALID_EVENT,                   "Invalid event" },
+    { CL_INVALID_OPERATION,               "Invalid operation" },
+    { CL_INVALID_GL_OBJECT,               "Invalid OpenGL object" },
+    { CL_INVALID_BUFFER_SIZE,             "Invalid buffer size" },
+    { CL_INVALID_MIP_LEVEL,               "Invalid mip-map level" },
+  };
+  const size_t count = sizeof(descriptions) / sizeof(descriptions[0]);
+
+  // The table is small, so a linear scan is sufficient.
+  for (size_t k = 0; k < count; k++) {
+    if (descriptions[k].code == err) {
+      return descriptions[k].text;
+    }
+  }
+  return "Unknown";
+}
+
+// Check the status 'ret' of an OpenCL call and, if it is not
+// CL_SUCCESS, report it through futhark_panic().  'call' is the text
+// describing the failing call, and 'file'/'line' say where it was
+// made; they are only used in the diagnostic.  Does nothing on success.
+static void opencl_succeed_fatal(unsigned int ret,
+                                 const char *call,
+                                 const char *file,
+                                 int line) {
+  // OpenCL error codes are negative cl_int values.  Convert back to
+  // the signed type so that the argument really matches the %d
+  // conversion and the parameter type of opencl_error_string().
+  const cl_int err = (cl_int)ret;
+
+  if (err != CL_SUCCESS) {
+    futhark_panic(-1, "%s:%d: OpenCL call\n  %s\nfailed with error code %d (%s)\n",
+          file, line, call, (int)err, opencl_error_string(err));
+  }
+}
+
+// Check the status 'ret' of an OpenCL call.  Returns NULL when it is
+// CL_SUCCESS; otherwise returns the diagnostic produced by msgprintf()
+// (presumably heap-allocated and owned by the caller - confirm against
+// msgprintf).  'call' is the text describing the failing call, and
+// 'file'/'line' say where it was made.
+static char* opencl_succeed_nonfatal(unsigned int ret,
+                                     const char *call,
+                                     const char *file,
+                                     int line) {
+  // OpenCL error codes are negative cl_int values.  Convert back to
+  // the signed type so that the argument really matches the %d
+  // conversion and the parameter type of opencl_error_string().
+  const cl_int err = (cl_int)ret;
+
+  if (err == CL_SUCCESS) {
+    return NULL;
+  }
+
+  return msgprintf("%s:%d: OpenCL call\n  %s\nfailed with error code %d (%s)\n",
+                   file, line, call, (int)err, opencl_error_string(err));
+}
+
+// Record 's' as the platform-name substring that device selection
+// must match.  The pointer is stored as-is (the string is not
+// copied).  Making an explicit choice also disables the blacklist.
+static void set_preferred_platform(struct opencl_config *cfg, const char *s) {
+  cfg->ignore_blacklist = 1;
+  cfg->preferred_platform = s;
+}
+
+// Record the device preference described by 's'.
+//
+// 's' is either a plain device-name substring, or has the form
+// "#N rest": a '#', a decimal index N, optional whitespace, and then a
+// (possibly empty) device-name substring.  N selects the N'th matching
+// device and defaults to 0 when no '#' prefix is present.  The stored
+// name pointer points into 's'; the string is not copied.  Making an
+// explicit choice also disables the blacklist.
+static void set_preferred_device(struct opencl_config *cfg, const char *s) {
+  int device_num = 0;
+  if (*s == '#') {
+    s++;
+    // The <ctype.h> classifiers are only defined for values
+    // representable as unsigned char (or EOF), so plain char must be
+    // converted before the call.
+    while (isdigit((unsigned char)*s)) {
+      device_num = device_num * 10 + (*s++ - '0');
+    }
+    // Skip the whitespace separating the index from the name.
+    while (isspace((unsigned char)*s)) {
+      s++;
+    }
+  }
+  cfg->preferred_device = s;
+  cfg->preferred_device_num = device_num;
+  cfg->ignore_blacklist = 1;
+}
+
+// Fetch the platform attribute 'param' of 'platform' as a string.
+// Returns a malloc()ed, NUL-terminated buffer that the caller must
+// free().  A failing OpenCL call or allocation is fatal.
+static char* opencl_platform_info(cl_platform_id platform,
+                                  cl_platform_info param) {
+  size_t req_bytes;
+  char *info;
+
+  // First ask how many bytes the value occupies.
+  OPENCL_SUCCEED_FATAL(clGetPlatformInfo(platform, param, 0, NULL, &req_bytes));
+
+  // One extra byte so that the result is NUL-terminated no matter
+  // what the driver wrote (and so that we never request zero bytes).
+  info = (char*) malloc(req_bytes + 1);
+  if (info == NULL) {
+    futhark_panic(1, "Failed to allocate %zu bytes for OpenCL platform info.\n",
+                  req_bytes + 1);
+  }
+
+  OPENCL_SUCCEED_FATAL(clGetPlatformInfo(platform, param, req_bytes, info, NULL));
+  info[req_bytes] = '\0';
+
+  return info;
+}
+
+// Fetch the device attribute 'param' of 'device' as a string.
+// Returns a malloc()ed, NUL-terminated buffer that the caller must
+// free().  A failing OpenCL call or allocation is fatal.
+static char* opencl_device_info(cl_device_id device,
+                                cl_device_info param) {
+  size_t req_bytes;
+  char *info;
+
+  // First ask how many bytes the value occupies.
+  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device, param, 0, NULL, &req_bytes));
+
+  // One extra byte so that the result is NUL-terminated no matter
+  // what the driver wrote (and so that we never request zero bytes).
+  info = (char*) malloc(req_bytes + 1);
+  if (info == NULL) {
+    futhark_panic(1, "Failed to allocate %zu bytes for OpenCL device info.\n",
+                  req_bytes + 1);
+  }
+
+  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device, param, req_bytes, info, NULL));
+  info[req_bytes] = '\0';
+
+  return info;
+}
+
+// Enumerate the devices of every OpenCL platform.
+//
+// On return, *devices_out points to a freshly allocated array holding
+// *num_devices_out entries.  The caller owns the array as well as the
+// platform_name and device_name strings of every entry, and must
+// free() all of them.  A platform whose device count cannot be
+// queried is treated as having no devices; any other failing OpenCL
+// call, and any failing allocation, is fatal.
+static void opencl_all_device_options(struct opencl_device_option **devices_out,
+                                      size_t *num_devices_out) {
+  size_t num_devices = 0, num_devices_added = 0;
+
+  cl_platform_id *all_platforms;
+  cl_uint *platform_num_devices;
+
+  cl_uint num_platforms;
+
+  // Find the number of platforms.
+  OPENCL_SUCCEED_FATAL(clGetPlatformIDs(0, NULL, &num_platforms));
+
+  // Make room for them.  calloc() may legitimately return NULL for a
+  // zero-sized request, so only a NULL for a non-empty request is an
+  // error.
+  all_platforms = calloc(num_platforms, sizeof(cl_platform_id));
+  platform_num_devices = calloc(num_platforms, sizeof(cl_uint));
+  if (num_platforms > 0 &&
+      (all_platforms == NULL || platform_num_devices == NULL)) {
+    futhark_panic(1, "Out of memory while enumerating OpenCL platforms.\n");
+  }
+
+  // Fetch all the platforms.
+  OPENCL_SUCCEED_FATAL(clGetPlatformIDs(num_platforms, all_platforms, NULL));
+
+  // Count the number of devices for each platform, as well as the
+  // total number of devices.
+  for (cl_uint i = 0; i < num_platforms; i++) {
+    if (clGetDeviceIDs(all_platforms[i], CL_DEVICE_TYPE_ALL,
+                       0, NULL, &platform_num_devices[i]) == CL_SUCCESS) {
+      num_devices += platform_num_devices[i];
+    } else {
+      platform_num_devices[i] = 0;
+    }
+  }
+
+  // Make room for all the device options.
+  struct opencl_device_option *devices =
+    calloc(num_devices, sizeof(struct opencl_device_option));
+  if (num_devices > 0 && devices == NULL) {
+    futhark_panic(1, "Out of memory while enumerating OpenCL devices.\n");
+  }
+
+  // Loop through the platforms, getting information about their devices.
+  for (cl_uint i = 0; i < num_platforms; i++) {
+    cl_platform_id platform = all_platforms[i];
+    cl_uint num_platform_devices = platform_num_devices[i];
+
+    if (num_platform_devices == 0) {
+      continue;
+    }
+
+    char *platform_name = opencl_platform_info(platform, CL_PLATFORM_NAME);
+    cl_device_id *platform_devices =
+      calloc(num_platform_devices, sizeof(cl_device_id));
+    if (platform_devices == NULL) {
+      futhark_panic(1, "Out of memory while enumerating OpenCL devices.\n");
+    }
+
+    // Fetch all the devices.
+    OPENCL_SUCCEED_FATAL(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL,
+                                  num_platform_devices, platform_devices, NULL));
+
+    // Loop through the devices, adding them to the devices array.
+    // (A distinct index name avoids shadowing the platform index.)
+    for (cl_uint j = 0; j < num_platform_devices; j++) {
+      struct opencl_device_option *slot = &devices[num_devices_added];
+
+      slot->platform = platform;
+      slot->device = platform_devices[j];
+      OPENCL_SUCCEED_FATAL(clGetDeviceInfo(platform_devices[j], CL_DEVICE_TYPE,
+                                     sizeof(cl_device_type),
+                                     &slot->device_type,
+                                     NULL));
+      // We don't want the structs to share memory, so copy the platform name.
+      // Each device name is already unique.
+      slot->platform_name = strclone(platform_name);
+      if (slot->platform_name == NULL) {
+        futhark_panic(1, "Out of memory while enumerating OpenCL devices.\n");
+      }
+      slot->device_name = opencl_device_info(platform_devices[j], CL_DEVICE_NAME);
+      num_devices_added++;
+    }
+    free(platform_devices);
+    free(platform_name);
+  }
+  free(all_platforms);
+  free(platform_num_devices);
+
+  // Report the number of entries that were actually filled in.
+  *devices_out = devices;
+  *num_devices_out = num_devices_added;
+}
+
+// Print every available OpenCL device on stdout, with a "Platform:"
+// heading each time the platform name changes from the previous
+// entry.  Always returns 0.
+static int list_devices(void) {
+  struct opencl_device_option *all;
+  size_t count;
+
+  opencl_all_device_options(&all, &count);
+
+  const char *last_platform = "";
+  for (size_t k = 0; k < count; k++) {
+    const struct opencl_device_option *opt = &all[k];
+
+    // Emit the heading only when we move on to a different platform.
+    if (strcmp(last_platform, opt->platform_name) != 0) {
+      printf("Platform: %s\n", opt->platform_name);
+      last_platform = opt->platform_name;
+    }
+    printf("[%d]: %s\n", (int)k, opt->device_name);
+  }
+
+  // We own the name strings of every entry as well as the array.
+  for (size_t k = 0; k < count; k++) {
+    free(all[k].platform_name);
+    free(all[k].device_name);
+  }
+  free(all);
+
+  return 0;
+}
+
+// List the available OpenCL devices on stdout, read a device index
+// from stdin, and store the choice in 'cfg'.
+//
+// Returns 0 on success.  Returns 1, leaving 'cfg' untouched, if no
+// integer could be read or if it does not name one of the listed
+// devices.
+static int select_device_interactively(struct opencl_config *cfg) {
+  struct opencl_device_option *devices;
+  size_t num_devices;
+  int ret = 1;
+
+  opencl_all_device_options(&devices, &num_devices);
+
+  printf("Choose OpenCL device:\n");
+  const char *cur_platform = "";
+  for (size_t i = 0; i < num_devices; i++) {
+    struct opencl_device_option device = devices[i];
+    if (strcmp(cur_platform, device.platform_name) != 0) {
+      printf("Platform: %s\n", device.platform_name);
+      cur_platform = device.platform_name;
+    }
+    printf("[%d] %s\n", (int)i, device.device_name);
+  }
+
+  int selection;
+  printf("Choice: ");
+  // The prompt has no trailing newline, so make sure it is visible
+  // before we block on input.
+  fflush(stdout);
+
+  // With empty platform/device filters and the blacklist disabled,
+  // every listed device matches, so the selection is simply an index
+  // into the list printed above.  Reject anything outside it.
+  if (scanf("%d", &selection) == 1 &&
+      selection >= 0 && (size_t)selection < num_devices) {
+    ret = 0;
+    cfg->preferred_platform = "";
+    cfg->preferred_device = "";
+    cfg->preferred_device_num = selection;
+    cfg->ignore_blacklist = 1;
+  }
+
+  // Free all the platform and device names.
+  for (size_t j = 0; j < num_devices; j++) {
+    free(devices[j].platform_name);
+    free(devices[j].device_name);
+  }
+  free(devices);
+
+  return ret;
+}
+
+// Decide whether the given platform/device pair should be skipped
+// during automatic device selection.  Returns 1 if so, else 0.
+//
+// Nothing is blacklisted once the user has named a platform or a
+// device.  Otherwise the only blacklisted combination is a platform
+// whose name contains "Apple" together with a device whose name
+// contains "Intel(R) Core(TM)".
+static int is_blacklisted(const char *platform_name, const char *device_name,
+                          const struct opencl_config *cfg) {
+  // A non-empty preference string means an explicit user choice.
+  const int user_has_preference =
+    cfg->preferred_platform[0] != '\0' || cfg->preferred_device[0] != '\0';
+
+  if (user_has_preference) {
+    return 0;
+  }
+
+  const int on_apple_platform = strstr(platform_name, "Apple") != NULL;
+  const int is_intel_core = strstr(device_name, "Intel(R) Core(TM)") != NULL;
+
+  return (on_apple_platform && is_intel_core) ? 1 : 0;
+}
+
+// Select the device described by 'cfg'.
+//
+// A device qualifies when its platform name contains
+// cfg->preferred_platform, its device name contains
+// cfg->preferred_device, and it is not blacklisted (unless
+// cfg->ignore_blacklist is set).  Among the qualifying devices, the
+// one with (zero-based) position cfg->preferred_device_num is
+// returned; its platform_name and device_name strings become the
+// property of the caller.  If no such device exists, futhark_panic()
+// is called.
+static struct opencl_device_option get_preferred_device(const struct opencl_config *cfg) {
+  struct opencl_device_option *candidates;
+  size_t num_candidates;
+
+  opencl_all_device_options(&candidates, &num_candidates);
+
+  int matches_seen = 0;
+
+  for (size_t i = 0; i < num_candidates; i++) {
+    struct opencl_device_option chosen = candidates[i];
+
+    if (strstr(chosen.platform_name, cfg->preferred_platform) == NULL) {
+      continue;
+    }
+    if (strstr(chosen.device_name, cfg->preferred_device) == NULL) {
+      continue;
+    }
+    if (!cfg->ignore_blacklist &&
+        is_blacklisted(chosen.platform_name, chosen.device_name, cfg)) {
+      continue;
+    }
+    // Only qualifying devices are counted, and only the one at the
+    // requested position is accepted.
+    if (matches_seen++ != cfg->preferred_device_num) {
+      continue;
+    }
+
+    // Free all the platform and device names, except the ones we have chosen.
+    for (size_t j = 0; j < num_candidates; j++) {
+      if (j == i) {
+        continue;
+      }
+      free(candidates[j].platform_name);
+      free(candidates[j].device_name);
+    }
+    free(candidates);
+    return chosen;
+  }
+
+  futhark_panic(1, "Could not find acceptable OpenCL device.\n");
+  exit(1); // Never reached
+}
+
+// Report the platform and device names of 'device' on stderr, one
+// per line.
+static void describe_device_option(struct opencl_device_option device) {
+  const char *platform_name = device.platform_name;
+  const char *device_name = device.device_name;
+
+  fprintf(stderr, "Using platform: %s\n", platform_name);
+  fprintf(stderr, "Using device: %s\n", device_name);
+}
+
+// Build 'program' for 'device' with the compiler options 'options'.
+//
+// Returns the build status reported by the driver.  If the build did
+// not succeed, the build log is printed on stderr first.  A compile
+// error in the program (CL_BUILD_PROGRAM_FAILURE) is therefore not
+// fatal here, whereas any other failing OpenCL call is.
+static cl_build_status build_opencl_program(cl_program program, cl_device_id device, const char* options) {
+  cl_int clBuildProgram_error = clBuildProgram(program, 1, &device, options, NULL, NULL);
+
+  // Avoid termination due to CL_BUILD_PROGRAM_FAILURE
+  if (clBuildProgram_error != CL_SUCCESS &&
+      clBuildProgram_error != CL_BUILD_PROGRAM_FAILURE) {
+    OPENCL_SUCCEED_FATAL(clBuildProgram_error);
+  }
+
+  cl_build_status build_status;
+  OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program,
+                                             device,
+                                             CL_PROGRAM_BUILD_STATUS,
+                                             sizeof(cl_build_status),
+                                             &build_status,
+                                             NULL));
+
+  // build_status is a cl_build_status, so compare it against the
+  // matching constant rather than against the error code CL_SUCCESS.
+  if (build_status != CL_BUILD_SUCCESS) {
+    size_t ret_val_size;
+    OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size));
+
+    char *build_log = (char*) malloc(ret_val_size+1);
+    if (build_log == NULL) {
+      // Do not let an allocation failure hide the build failure;
+      // just report that the log cannot be shown.
+      fprintf(stderr, "Build log unavailable: out of memory.\n");
+      return build_status;
+    }
+    OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL));
+
+    // The spec technically does not say whether the build log is zero-terminated, so let's be careful.
+    build_log[ret_val_size] = '\0';
+
+    fprintf(stderr, "Build log:\n%s\n", build_log);
+
+    free(build_log);
+  }
+
+  return build_status;
+}
+
+// Bit flags, to be OR'ed together into a mask, naming the types that
+// the chosen device must support.
+enum opencl_required_type {
+  OPENCL_F64 = 1
+};
+
+// Initialise 'ctx' for an existing command queue and build the
+// OpenCL program.
+//
+// The OpenCL context and the device are derived from 'queue'.  Device
+// limits are queried and used to clamp the configured sizes in
+// ctx->cfg, after which the program is created (from 'srcs', from the
+// file ctx->cfg.load_program_from, or from the binary file
+// ctx->cfg.load_binary_from) and built.
+//
+// We take as input several strings representing the program, because
+// C does not guarantee that the compiler supports particularly large
+// literals.  Notably, Visual C has a limit of 2048 characters.  The
+// array must be NULL-terminated.
+//
+// 'required_types' is a mask of enum opencl_required_type flags.
+// 'extra_build_opts' is a NULL-terminated array of additional compiler
+// options.  Returns the built program.  Failing OpenCL calls and
+// allocations are fatal.
+static cl_program setup_opencl_with_command_queue(struct opencl_context *ctx,
+                                                  cl_command_queue queue,
+                                                  const char *srcs[],
+                                                  int required_types,
+                                                  const char *extra_build_opts[]) {
+  cl_int error;
+
+  free_list_init(&ctx->free_list);
+  ctx->queue = queue;
+
+  OPENCL_SUCCEED_FATAL(clGetCommandQueueInfo(ctx->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx->ctx, NULL));
+
+  // Fill out the device info.  This is redundant work if we are
+  // called from setup_opencl() (which is the common case), but I
+  // doubt it matters much.
+  //
+  // NOTE(review): platform_name and device_name are never freed in
+  // this function.  post_opencl_setup() receives &device_option and
+  // may retain them - confirm before adding a free().
+  struct opencl_device_option device_option;
+  OPENCL_SUCCEED_FATAL(clGetCommandQueueInfo(ctx->queue, CL_QUEUE_DEVICE,
+                                       sizeof(cl_device_id),
+                                       &device_option.device,
+                                       NULL));
+  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_PLATFORM,
+                                 sizeof(cl_platform_id),
+                                 &device_option.platform,
+                                 NULL));
+  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_TYPE,
+                                 sizeof(cl_device_type),
+                                 &device_option.device_type,
+                                 NULL));
+  device_option.platform_name = opencl_platform_info(device_option.platform, CL_PLATFORM_NAME);
+  device_option.device_name = opencl_device_info(device_option.device, CL_DEVICE_NAME);
+
+  ctx->device = device_option.device;
+
+  if (required_types & OPENCL_F64) {
+    // A preferred vector width of zero is taken to mean that doubles
+    // are not supported.
+    cl_uint supported;
+    OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
+                                   sizeof(cl_uint), &supported, NULL));
+    if (!supported) {
+      futhark_panic(1, "Program uses double-precision floats, but this is not supported on the chosen device: %s\n",
+            device_option.device_name);
+    }
+  }
+
+  size_t max_group_size;
+  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                                 sizeof(size_t), &max_group_size, NULL));
+
+  size_t max_tile_size = sqrt(max_group_size);
+
+  // CL_DEVICE_LOCAL_MEM_SIZE is a cl_ulong, so pass the size of the
+  // variable itself; sizeof(size_t) is too small wherever size_t is
+  // narrower than 64 bits.
+  cl_ulong max_local_memory;
+  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_LOCAL_MEM_SIZE,
+                                       sizeof(max_local_memory), &max_local_memory, NULL));
+
+  // Futhark reserves 4 bytes for bookkeeping information.
+  max_local_memory -= 4;
+
+  // The OpenCL implementation may reserve some local memory bytes for
+  // various purposes.  In principle, we should use
+  // clGetKernelWorkGroupInfo() to figure out for each kernel how much
+  // is actually available, but our current code generator design
+  // makes this infeasible.  Instead, we have this nasty hack where we
+  // arbitrarily subtract some bytes, based on empirical measurements
+  // (but which might be arbitrarily wrong).  Fortunately, we rarely
+  // try to really push the local memory usage.
+  if (strstr(device_option.platform_name, "NVIDIA CUDA") != NULL) {
+    max_local_memory -= 12;
+  } else if (strstr(device_option.platform_name, "AMD") != NULL) {
+    max_local_memory -= 16;
+  }
+
+  // Make sure this function is defined.
+  post_opencl_setup(ctx, &device_option);
+
+  if (max_group_size < ctx->cfg.default_group_size) {
+    if (ctx->cfg.default_group_size_changed) {
+      fprintf(stderr, "Note: Device limits default group size to %zu (down from %zu).\n",
+              max_group_size, ctx->cfg.default_group_size);
+    }
+    ctx->cfg.default_group_size = max_group_size;
+  }
+
+  if (max_tile_size < ctx->cfg.default_tile_size) {
+    if (ctx->cfg.default_tile_size_changed) {
+      fprintf(stderr, "Note: Device limits default tile size to %zu (down from %zu).\n",
+              max_tile_size, ctx->cfg.default_tile_size);
+    }
+    ctx->cfg.default_tile_size = max_tile_size;
+  }
+
+  ctx->max_group_size = max_group_size;
+  ctx->max_tile_size = max_tile_size; // Limited by the group size.
+  ctx->max_threshold = ctx->max_num_groups = 0; // No limit.
+  ctx->max_local_memory = max_local_memory;
+
+  // Now we go through all the sizes, clamp them to the valid range,
+  // or set them to the default.
+  for (int i = 0; i < ctx->cfg.num_sizes; i++) {
+    const char *size_class = ctx->cfg.size_classes[i];
+    int64_t *size_value = &ctx->cfg.size_values[i];
+    const char* size_name = ctx->cfg.size_names[i];
+    int64_t max_value = 0, default_value = 0;
+
+    // Each test checks whether size_class *starts with* the given
+    // class name.
+    if (strstr(size_class, "group_size") == size_class) {
+      max_value = max_group_size;
+      default_value = ctx->cfg.default_group_size;
+    } else if (strstr(size_class, "num_groups") == size_class) {
+      max_value = max_group_size; // Futhark assumes this constraint.
+      default_value = ctx->cfg.default_num_groups;
+      // XXX: as a quick and dirty hack, use twice as many threads for
+      // histograms by default.  We really should just be smarter
+      // about sizes somehow.
+      if (strstr(size_name, ".seghist_") != NULL) {
+        default_value *= 2;
+      }
+    } else if (strstr(size_class, "tile_size") == size_class) {
+      max_value = max_tile_size;
+      default_value = ctx->cfg.default_tile_size;
+    } else if (strstr(size_class, "reg_tile_size") == size_class) {
+      max_value = 0; // No limit.
+      default_value = ctx->cfg.default_reg_tile_size;
+    } else if (strstr(size_class, "threshold") == size_class) {
+      // Threshold can be as large as it takes.
+      default_value = ctx->cfg.default_threshold;
+    } else {
+      // Bespoke sizes have no limit or default.
+    }
+    if (*size_value == 0) {
+      *size_value = default_value;
+    } else if (max_value > 0 && *size_value > max_value) {
+      fprintf(stderr, "Note: Device limits %s to %d (down from %d)\n",
+              size_name, (int)max_value, (int)*size_value);
+      *size_value = max_value;
+    }
+  }
+
+  if (ctx->lockstep_width == 0) {
+    ctx->lockstep_width = 1;
+  }
+
+  if (ctx->cfg.logging) {
+    fprintf(stderr, "Lockstep width: %d\n", (int)ctx->lockstep_width);
+    fprintf(stderr, "Default group size: %d\n", (int)ctx->cfg.default_group_size);
+    fprintf(stderr, "Default number of groups: %d\n", (int)ctx->cfg.default_num_groups);
+  }
+
+  // At most one of these two is set, depending on whether the
+  // program comes from source or from a binary.  Both are released
+  // once the program has been built.
+  char *fut_opencl_src = NULL;
+  unsigned char *fut_opencl_bin = NULL;
+  cl_program prog;
+  error = CL_SUCCESS;
+
+  if (ctx->cfg.load_binary_from == NULL) {
+    // Stays zero when the source is read from a file; a zero length
+    // tells clCreateProgramWithSource() that the string is
+    // NUL-terminated.
+    size_t src_size = 0;
+
+    // Maybe we have to read OpenCL source from somewhere else (used for debugging).
+    if (ctx->cfg.load_program_from != NULL) {
+      fut_opencl_src = slurp_file(ctx->cfg.load_program_from, NULL);
+      assert(fut_opencl_src != NULL);
+    } else {
+      // Construct the OpenCL source concatenating all the fragments.
+      for (const char **src = srcs; src && *src; src++) {
+        src_size += strlen(*src);
+      }
+
+      fut_opencl_src = (char*) malloc(src_size + 1);
+      if (fut_opencl_src == NULL) {
+        futhark_panic(1, "Failed to allocate %zu bytes for the OpenCL program source.\n",
+                      src_size + 1);
+      }
+
+      size_t n = 0;
+      for (size_t i = 0; srcs && srcs[i]; i++) {
+        const size_t fragment_len = strlen(srcs[i]);
+        memcpy(fut_opencl_src+n, srcs[i], fragment_len);
+        n += fragment_len;
+      }
+      fut_opencl_src[n] = 0;
+    }
+
+    if (ctx->cfg.dump_program_to != NULL) {
+      if (ctx->cfg.debugging) {
+        fprintf(stderr, "Dumping OpenCL source to %s...\n", ctx->cfg.dump_program_to);
+      }
+
+      dump_file(ctx->cfg.dump_program_to, fut_opencl_src, strlen(fut_opencl_src));
+    }
+
+    if (ctx->cfg.debugging) {
+      fprintf(stderr, "Creating OpenCL program...\n");
+    }
+
+    const char* src_ptr[] = {fut_opencl_src};
+    prog = clCreateProgramWithSource(ctx->ctx, 1, src_ptr, &src_size, &error);
+    OPENCL_SUCCEED_FATAL(error);
+  } else {
+    if (ctx->cfg.debugging) {
+      fprintf(stderr, "Loading OpenCL binary from %s...\n", ctx->cfg.load_binary_from);
+    }
+    size_t binary_size;
+    fut_opencl_bin =
+      (unsigned char*) slurp_file(ctx->cfg.load_binary_from, &binary_size);
+    assert(fut_opencl_bin != NULL);
+    const unsigned char *binaries[1] = { fut_opencl_bin };
+    cl_int status = 0;
+
+    prog = clCreateProgramWithBinary(ctx->ctx, 1, &device_option.device,
+                                     &binary_size, binaries,
+                                     &status, &error);
+
+    OPENCL_SUCCEED_FATAL(status);
+    OPENCL_SUCCEED_FATAL(error);
+  }
+
+  // Work out an upper bound on the length of the option string.  The
+  // fixed part leaves ample room for the lockstep option and the
+  // terminator.
+  int compile_opts_size = 1024;
+
+  // "-D<var>=<value> ": the name that is printed is size_vars[i], and
+  // 20 bytes cover the punctuation and any 32-bit value.
+  for (int i = 0; i < ctx->cfg.num_sizes; i++) {
+    compile_opts_size += strlen(ctx->cfg.size_vars[i]) + 20;
+  }
+
+  // "<option> ": the option itself plus the separating space.
+  for (int i = 0; extra_build_opts[i] != NULL; i++) {
+    compile_opts_size += strlen(extra_build_opts[i]) + 1;
+  }
+
+  char *compile_opts = (char*) malloc(compile_opts_size);
+  if (compile_opts == NULL) {
+    futhark_panic(1, "Failed to allocate %d bytes for the OpenCL compiler options.\n",
+                  compile_opts_size);
+  }
+
+  int w = snprintf(compile_opts, compile_opts_size,
+                   "-DLOCKSTEP_WIDTH=%d ",
+                   (int)ctx->lockstep_width);
+
+  for (int i = 0; i < ctx->cfg.num_sizes; i++) {
+    w += snprintf(compile_opts+w, compile_opts_size-w,
+                  "-D%s=%d ",
+                  ctx->cfg.size_vars[i],
+                  (int)ctx->cfg.size_values[i]);
+  }
+
+  for (int i = 0; extra_build_opts[i] != NULL; i++) {
+    w += snprintf(compile_opts+w, compile_opts_size-w,
+                  "%s ", extra_build_opts[i]);
+  }
+
+  if (ctx->cfg.debugging) {
+    fprintf(stderr, "OpenCL compiler options: %s\n", compile_opts);
+    fprintf(stderr, "Building OpenCL program...\n");
+  }
+  OPENCL_SUCCEED_FATAL(build_opencl_program(prog, device_option.device, compile_opts));
+
+  free(compile_opts);
+  free(fut_opencl_src);
+  free(fut_opencl_bin);
+
+  if (ctx->cfg.dump_binary_to != NULL) {
+    if (ctx->cfg.debugging) {
+      fprintf(stderr, "Dumping OpenCL binary to %s...\n", ctx->cfg.dump_binary_to);
+    }
+
+    // The program was built for exactly one device, so there is
+    // exactly one binary.
+    size_t binary_size;
+    OPENCL_SUCCEED_FATAL(clGetProgramInfo(prog, CL_PROGRAM_BINARY_SIZES,
+                                          sizeof(size_t), &binary_size, NULL));
+    unsigned char *binary = (unsigned char*) malloc(binary_size);
+    if (binary == NULL && binary_size > 0) {
+      futhark_panic(1, "Failed to allocate %zu bytes for the OpenCL program binary.\n",
+                    binary_size);
+    }
+    unsigned char *binaries[1] = { binary };
+    OPENCL_SUCCEED_FATAL(clGetProgramInfo(prog, CL_PROGRAM_BINARIES,
+                                          sizeof(unsigned char*), binaries, NULL));
+
+    dump_file(ctx->cfg.dump_binary_to, binary, binary_size);
+    free(binary);
+  }
+
+  return prog;
+}
+
+// Select a device according to ctx->cfg, create an OpenCL context and
+// a command queue for it, and let setup_opencl_with_command_queue()
+// do the rest.  The arguments 'srcs', 'required_types' and
+// 'extra_build_opts' are passed on unchanged.  Returns the program
+// produced by that function.  Failing OpenCL calls are fatal.
+static cl_program setup_opencl(struct opencl_context *ctx,
+                               const char *srcs[],
+                               int required_types,
+                               const char *extra_build_opts[]) {
+
+  ctx->lockstep_width = 0; // Real value set later.
+
+  struct opencl_device_option chosen = get_preferred_device(&ctx->cfg);
+  if (ctx->cfg.logging) {
+    describe_device_option(chosen);
+  }
+
+  // Note that NVIDIA's OpenCL requires the platform property
+  cl_context_properties context_props[] = {
+    CL_CONTEXT_PLATFORM,
+    (cl_context_properties)chosen.platform,
+    0
+  };
+
+  // The status variables keep their descriptive names on purpose:
+  // they are the expressions handed to OPENCL_SUCCEED_FATAL, which
+  // presumably quotes its argument in the diagnostic.
+  cl_int clCreateContext_error;
+  ctx->ctx = clCreateContext(context_props, 1, &chosen.device,
+                             NULL, NULL, &clCreateContext_error);
+  OPENCL_SUCCEED_FATAL(clCreateContext_error);
+
+  // Profiling has to be requested when the queue is created.
+  const cl_command_queue_properties queue_props =
+    ctx->cfg.profiling ? CL_QUEUE_PROFILING_ENABLE : 0;
+
+  cl_int clCreateCommandQueue_error;
+  cl_command_queue new_queue =
+    clCreateCommandQueue(ctx->ctx, chosen.device, queue_props,
+                         &clCreateCommandQueue_error);
+  OPENCL_SUCCEED_FATAL(clCreateCommandQueue_error);
+
+  return setup_opencl_with_command_queue(ctx, new_queue, srcs, required_types, extra_build_opts);
+}
+
+// Count up the runtime of all the profiling_records that occurred
+// during execution.  Also clears the buffer of profiling_records.
+//
+// For every pending record, the run counter is incremented, the
+// elapsed time of the event is added to the runtime counter, and the
+// event is released and freed.  Returns CL_SUCCESS, or the status of
+// the first failing OpenCL call, in which case processing stops and
+// the buffer is not cleared.
+static cl_int opencl_tally_profiling_records(struct opencl_context *ctx) {
+  for (int i = 0; i < ctx->profiling_records_used; i++) {
+    const struct profiling_record *rec = &ctx->profiling_records[i];
+    cl_ulong start_t, end_t;
+    cl_int err;
+
+    err = clGetEventProfilingInfo(*rec->event,
+                                  CL_PROFILING_COMMAND_START,
+                                  sizeof(start_t),
+                                  &start_t,
+                                  NULL);
+    if (err != CL_SUCCESS) {
+      return err;
+    }
+
+    err = clGetEventProfilingInfo(*rec->event,
+                                  CL_PROFILING_COMMAND_END,
+                                  sizeof(end_t),
+                                  &end_t,
+                                  NULL);
+    if (err != CL_SUCCESS) {
+      return err;
+    }
+
+    // OpenCL provides nanosecond resolution, but we want
+    // microseconds.
+    *rec->runs += 1;
+    *rec->runtime += (end_t - start_t)/1000;
+
+    err = clReleaseEvent(*rec->event);
+    if (err != CL_SUCCESS) {
+      return err;
+    }
+    free(rec->event);
+  }
+
+  ctx->profiling_records_used = 0;
+
+  return CL_SUCCESS;
+}
+
+// Produce an event associated with a new profiling record (intended
+// for use when profiling).
+//
+// A cl_event is allocated and appended to ctx->profiling_records
+// together with the counters 'runs' and 'runtime', which
+// opencl_tally_profiling_records() updates later on.  The record
+// buffer grows as needed.  Returns the new event, which remains owned
+// by the record.  Allocation failure is fatal, because a record
+// without an event could not be tallied.
+static cl_event* opencl_get_event(struct opencl_context *ctx, int *runs, int64_t *runtime) {
+  if (ctx->profiling_records_used == ctx->profiling_records_capacity) {
+    // Doubling a capacity of zero would never make any room, so
+    // start from one in that case.
+    ctx->profiling_records_capacity =
+      ctx->profiling_records_capacity == 0
+      ? 1
+      : ctx->profiling_records_capacity * 2;
+
+    // Keep the old pointer until we know that realloc() succeeded.
+    struct profiling_record *grown =
+      realloc(ctx->profiling_records,
+              ctx->profiling_records_capacity *
+              sizeof(struct profiling_record));
+    if (grown == NULL) {
+      futhark_panic(1, "Out of memory while growing the profiling record buffer.\n");
+    }
+    ctx->profiling_records = grown;
+  }
+
+  cl_event *event = malloc(sizeof(cl_event));
+  if (event == NULL) {
+    futhark_panic(1, "Out of memory while allocating a profiling event.\n");
+  }
+
+  struct profiling_record *slot =
+    &ctx->profiling_records[ctx->profiling_records_used];
+  slot->event = event;
+  slot->runs = runs;
+  slot->runtime = runtime;
+  ctx->profiling_records_used++;
+  return event;
+}
+
+// Allocate memory from driver. The problem is that OpenCL may perform
+// lazy allocation, so we cannot know whether an allocation succeeded
+// until the first time we try to use it.  Hence we immediately
+// perform a write to see if the allocation succeeded.  This is slow,
+// but the assumption is that this operation will be rare (most things
+// will go through the free list).
+//
+// 'size' must be at least sizeof(int), because that many bytes are
+// written.  On success, returns CL_SUCCESS and stores the buffer in
+// *mem_out.  On failure, returns the OpenCL error code; no buffer is
+// left allocated in that case.
+static int opencl_alloc_actual(struct opencl_context *ctx, size_t size, cl_mem *mem_out) {
+  cl_int error;
+  *mem_out = clCreateBuffer(ctx->ctx, CL_MEM_READ_WRITE, size, NULL, &error);
+
+  if (error != CL_SUCCESS) {
+    return error;
+  }
+
+  int x = 2;
+  error = clEnqueueWriteBuffer(ctx->queue, *mem_out, 1, 0, sizeof(x), &x, 0, NULL, NULL);
+
+  // No need to wait for completion here. clWaitForEvents() cannot
+  // return mem object allocation failures. This implies that the
+  // buffer is faulted onto the device on enqueue. (Observation by
+  // Andreas Kloeckner.)
+
+  if (error != CL_SUCCESS) {
+    // The buffer object exists but is unusable.  Release it here:
+    // opencl_alloc() retries on failure and overwrites *mem_out, so
+    // the handle would otherwise be lost.
+    (void)clReleaseMemObject(*mem_out);
+    *mem_out = NULL;
+  }
+
+  return error;
+}
+
+// Obtain a device buffer of at least 'min_size' bytes in *mem_out.
+//
+// The free list is consulted first; only if it has nothing suitable
+// is a new buffer requested from the driver.  If the driver reports
+// an allocation failure, entries of the free list are released one
+// by one, retrying after each, until the allocation succeeds or the
+// free list is empty.  'tag' is not used.  Returns CL_SUCCESS or an
+// OpenCL error code.
+static int opencl_alloc(struct opencl_context *ctx, size_t min_size, const char *tag, cl_mem *mem_out) {
+  (void)tag;
+
+  // Never allocate less than an int: opencl_alloc_actual() writes
+  // that many bytes into every new buffer.
+  if (min_size < sizeof(int)) {
+    min_size = sizeof(int);
+  }
+
+  size_t found_size;
+
+  if (free_list_find(&ctx->free_list, min_size, &found_size, mem_out) == 0) {
+    // Successfully found a free block.  Is it big enough?
+    //
+    // FIXME: we might also want to check whether the block is *too
+    // big*, to avoid internal fragmentation.  However, this can
+    // sharply impact performance on programs where arrays change size
+    // frequently.  Fortunately, such allocations are usually fairly
+    // short-lived, as they are necessarily within a loop, so the risk
+    // of internal fragmentation resulting in an OOM situation is
+    // limited.  However, it would be preferable if we could go back
+    // and *shrink* oversize allocations when we encounter an OOM
+    // condition.  That is technically feasible, since we do not
+    // expose OpenCL pointer values directly to the application, but
+    // instead rely on a level of indirection.
+    if (found_size >= min_size) {
+      if (ctx->cfg.debugging) {
+        fprintf(stderr, "No need to allocate: Found a block in the free list.\n");
+      }
+      return CL_SUCCESS;
+    }
+
+    if (ctx->cfg.debugging) {
+      fprintf(stderr, "Found a free block, but it was too small.\n");
+    }
+
+    // Not just right - free it.
+    int release_status = clReleaseMemObject(*mem_out);
+    if (release_status != CL_SUCCESS) {
+      return release_status;
+    }
+  }
+
+  // We have to allocate a new block from the driver.  If the
+  // allocation does not succeed, then we might be in an out-of-memory
+  // situation.  We now start freeing things from the free list until
+  // we think we have freed enough that the allocation will succeed.
+  // Since we don't know how far the allocation is from fitting, we
+  // have to check after every deallocation.  This might be pretty
+  // expensive.  Let's hope that this case is hit rarely.
+
+  if (ctx->cfg.debugging) {
+    fprintf(stderr, "Actually allocating the desired block.\n");
+  }
+
+  int status = opencl_alloc_actual(ctx, min_size, mem_out);
+
+  while (status == CL_MEM_OBJECT_ALLOCATION_FAILURE) {
+    if (ctx->cfg.debugging) {
+      fprintf(stderr, "Out of OpenCL memory: releasing entry from the free list...\n");
+    }
+
+    cl_mem victim;
+    if (free_list_first(&ctx->free_list, &victim) != 0) {
+      // Nothing left to release; give up with the allocation failure.
+      break;
+    }
+
+    status = clReleaseMemObject(victim);
+    if (status != CL_SUCCESS) {
+      return status;
+    }
+
+    status = opencl_alloc_actual(ctx, min_size, mem_out);
+  }
+
+  return status;
+}
+
+// Hand the buffer 'mem' back for later reuse: its size is queried
+// from OpenCL and the buffer is inserted into the free list under
+// 'tag'.  Returns CL_SUCCESS or an OpenCL error code; on error the
+// buffer is not inserted.
+static int opencl_free(struct opencl_context *ctx, cl_mem mem, const char *tag) {
+  size_t size;
+  cl_mem existing_mem;
+
+  // If there is already a block with this tag, then remove it.
+  // NOTE(review): the lookup is keyed on -1 rather than on 'tag', in
+  // the position where opencl_alloc() passes a minimum size - confirm
+  // that this lookup still does what the comment above says.
+  if (free_list_find(&ctx->free_list, -1, &size, &existing_mem) == 0) {
+    int release_status = clReleaseMemObject(existing_mem);
+    if (release_status != CL_SUCCESS) {
+      return release_status;
+    }
+  }
+
+  int query_status = clGetMemObjectInfo(mem, CL_MEM_SIZE, sizeof(size_t), &size, NULL);
+  if (query_status != CL_SUCCESS) {
+    return query_status;
+  }
+
+  free_list_insert(&ctx->free_list, size, mem, tag);
+  return query_status;
+}
+
+// Release every buffer held in the free list of 'ctx'.  Returns
+// CL_SUCCESS, or the error code of the first release that fails, in
+// which case the remaining entries are left alone.
+static int opencl_free_all(struct opencl_context *ctx) {
+  free_list_pack(&ctx->free_list);
+
+  for (;;) {
+    cl_mem block;
+
+    // A non-zero result means that the list has no more entries.
+    if (free_list_first(&ctx->free_list, &block) != 0) {
+      break;
+    }
+
+    int status = clReleaseMemObject(block);
+    if (status != CL_SUCCESS) {
+      return status;
+    }
+  }
+
+  return CL_SUCCESS;
+}
+
+// Free everything that belongs to 'ctx', but do not free 'ctx'
+// itself.
+//
+// Teardown is best-effort: the status of every step is deliberately
+// discarded.  Pending profiling records are tallied first, which also
+// releases their events; then the record buffer, the buffers on the
+// free list, the command queue, and finally the OpenCL context are
+// released.
+static void teardown_opencl(struct opencl_context *ctx) {
+  (void)opencl_tally_profiling_records(ctx);
+  free(ctx->profiling_records);
+  (void)opencl_free_all(ctx);
+  (void)clReleaseCommandQueue(ctx->queue);
+  (void)clReleaseContext(ctx->ctx);
+}
+
+// End of opencl.h.
+
+static const char *opencl_program[] =
+                  {"#ifdef cl_clang_storage_class_specifiers\n#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable\n#endif\n#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n__kernel void dummy_kernel(__global unsigned char *dummy, int n)\n{\n    const int thread_gid = get_global_id(0);\n    \n    if (thread_gid >= n)\n        return;\n}\n#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\ntypedef char int8_t;\ntypedef short int16_t;\ntypedef int int32_t;\ntypedef long int64_t;\ntypedef uchar uint8_t;\ntypedef ushort uint16_t;\ntypedef uint uint32_t;\ntypedef ulong uint64_t;\n#ifdef cl_nv_pragma_unroll\nstatic inline void mem_fence_global()\n{\n    asm(\"membar.gl;\");\n}\n#else\nstatic inline void mem_fence_global()\n{\n    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n}\n#endif\nstatic inline void mem_fence_local()\n{\n    mem_fence(CLK_LOCAL_MEM_FENCE);\n}\nstatic inline uint8_t add8(uint8_t x, uint8_t y)\n{\n    return x + y;\n}\nstatic inline uint16_t add16(uint16_t x, uint16_t y)\n{\n    return x + y;\n}\nstatic inline uint32_t add32(uint32_t x, uint32_t y)\n{\n    return x + y;\n}\nstatic inline uint64_t add64(uint64_t x, uint64_t y)\n{\n    return x + y;\n}\nstatic inline uint8_t sub8(uint8_t x, uint8_t y)\n{\n    return x - y;\n}\nstatic inline uint16_t sub16(uint16_t x, uint16_t y)\n{\n    return x - y;\n}\nstatic inline uint32_t sub32(uint32_t x, uint32_t y)\n{\n    return x - y;\n}\nstatic inline uint64_t sub64(uint64_t x, uint64_t y)\n{\n    return x - y;\n}\nstatic inline uint8_t mul8(uint8_t x, uint8_t y)\n{\n    return x * y;\n}\nstatic inline uint16_t mul16(uint16_t x, uint16_t y)\n{\n    return x * y;\n}\nstatic inline uint32_t mul32(uint32_t x, uint32_t y)\n{\n    return x * y;\n}\nstatic inline uint64_t mul64(uint64_t x, uint64_t y)\n{\n    return x * y;\n}\nstatic inline uint8_t udiv8(uint8_t x, uint8_t y)\n{\n    return x / 
y;\n}\nstatic inline uint16_t udiv16(uint16_t x, uint16_t y)\n{\n    return x / y;\n}\nstatic inline uint32_t udiv32(uint3",
+                   "2_t x, uint32_t y)\n{\n    return x / y;\n}\nstatic inline uint64_t udiv64(uint64_t x, uint64_t y)\n{\n    return x / y;\n}\nstatic inline uint8_t udiv_up8(uint8_t x, uint8_t y)\n{\n    return (x + y - 1) / y;\n}\nstatic inline uint16_t udiv_up16(uint16_t x, uint16_t y)\n{\n    return (x + y - 1) / y;\n}\nstatic inline uint32_t udiv_up32(uint32_t x, uint32_t y)\n{\n    return (x + y - 1) / y;\n}\nstatic inline uint64_t udiv_up64(uint64_t x, uint64_t y)\n{\n    return (x + y - 1) / y;\n}\nstatic inline uint8_t umod8(uint8_t x, uint8_t y)\n{\n    return x % y;\n}\nstatic inline uint16_t umod16(uint16_t x, uint16_t y)\n{\n    return x % y;\n}\nstatic inline uint32_t umod32(uint32_t x, uint32_t y)\n{\n    return x % y;\n}\nstatic inline uint64_t umod64(uint64_t x, uint64_t y)\n{\n    return x % y;\n}\nstatic inline uint8_t udiv_safe8(uint8_t x, uint8_t y)\n{\n    return y == 0 ? 0 : x / y;\n}\nstatic inline uint16_t udiv_safe16(uint16_t x, uint16_t y)\n{\n    return y == 0 ? 0 : x / y;\n}\nstatic inline uint32_t udiv_safe32(uint32_t x, uint32_t y)\n{\n    return y == 0 ? 0 : x / y;\n}\nstatic inline uint64_t udiv_safe64(uint64_t x, uint64_t y)\n{\n    return y == 0 ? 0 : x / y;\n}\nstatic inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y)\n{\n    return y == 0 ? 0 : (x + y - 1) / y;\n}\nstatic inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y)\n{\n    return y == 0 ? 0 : (x + y - 1) / y;\n}\nstatic inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y)\n{\n    return y == 0 ? 0 : (x + y - 1) / y;\n}\nstatic inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y)\n{\n    return y == 0 ? 0 : (x + y - 1) / y;\n}\nstatic inline uint8_t umod_safe8(uint8_t x, uint8_t y)\n{\n    return y == 0 ? 0 : x % y;\n}\nstatic inline uint16_t umod_safe16(uint16_t x, uint16_t y)\n{\n    return y == 0 ? 0 : x % y;\n}\nstatic inline uint32_t umod_safe32(uint32_t x, uint32_t y)\n{\n    return y == 0 ? 
0 : x % y;\n}\nstatic inline uint64_t umod_safe64(uint64_t x, uint64_t y)\n{\n    return y == 0 ? 0 : x % y;\n}\nstatic inline int8_t sdiv8(int8_t x, int8_t y)\n{\n    int8_t q =",
+                   " x / y;\n    int8_t r = x % y;\n    \n    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\nstatic inline int16_t sdiv16(int16_t x, int16_t y)\n{\n    int16_t q = x / y;\n    int16_t r = x % y;\n    \n    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\nstatic inline int32_t sdiv32(int32_t x, int32_t y)\n{\n    int32_t q = x / y;\n    int32_t r = x % y;\n    \n    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\nstatic inline int64_t sdiv64(int64_t x, int64_t y)\n{\n    int64_t q = x / y;\n    int64_t r = x % y;\n    \n    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\nstatic inline int8_t sdiv_up8(int8_t x, int8_t y)\n{\n    return sdiv8(x + y - 1, y);\n}\nstatic inline int16_t sdiv_up16(int16_t x, int16_t y)\n{\n    return sdiv16(x + y - 1, y);\n}\nstatic inline int32_t sdiv_up32(int32_t x, int32_t y)\n{\n    return sdiv32(x + y - 1, y);\n}\nstatic inline int64_t sdiv_up64(int64_t x, int64_t y)\n{\n    return sdiv64(x + y - 1, y);\n}\nstatic inline int8_t smod8(int8_t x, int8_t y)\n{\n    int8_t r = x % y;\n    \n    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\nstatic inline int16_t smod16(int16_t x, int16_t y)\n{\n    int16_t r = x % y;\n    \n    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\nstatic inline int32_t smod32(int32_t x, int32_t y)\n{\n    int32_t r = x % y;\n    \n    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\nstatic inline int64_t smod64(int64_t x, int64_t y)\n{\n    int64_t r = x % y;\n    \n    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\nstatic inline int8_t sdiv_safe8(int8_t x, int8_t y)\n{\n    return y == 0 ? 0 : sdiv8(x, y);\n}\nstatic inline int16_t sdiv_safe16(int16_t x, int16_t y)\n{\n    return y == 0 ? 0 : sdiv16(x, y);\n}\nstatic inline int32_t sdiv_safe32(int32_t x, int32_t y)\n{\n    return y == 0 ? 
0 : sdiv32(x, y);\n}\nstatic inline int64_t sdiv_safe64(int64_t x, int64_t y)\n{\n    return y == 0 ? 0 : sdiv64(x, y);\n}\nstatic inline int8_t sdiv_up_safe8(int8_t x, int8_t y)\n{\n    return ",
+                   "sdiv_safe8(x + y - 1, y);\n}\nstatic inline int16_t sdiv_up_safe16(int16_t x, int16_t y)\n{\n    return sdiv_safe16(x + y - 1, y);\n}\nstatic inline int32_t sdiv_up_safe32(int32_t x, int32_t y)\n{\n    return sdiv_safe32(x + y - 1, y);\n}\nstatic inline int64_t sdiv_up_safe64(int64_t x, int64_t y)\n{\n    return sdiv_safe64(x + y - 1, y);\n}\nstatic inline int8_t smod_safe8(int8_t x, int8_t y)\n{\n    return y == 0 ? 0 : smod8(x, y);\n}\nstatic inline int16_t smod_safe16(int16_t x, int16_t y)\n{\n    return y == 0 ? 0 : smod16(x, y);\n}\nstatic inline int32_t smod_safe32(int32_t x, int32_t y)\n{\n    return y == 0 ? 0 : smod32(x, y);\n}\nstatic inline int64_t smod_safe64(int64_t x, int64_t y)\n{\n    return y == 0 ? 0 : smod64(x, y);\n}\nstatic inline int8_t squot8(int8_t x, int8_t y)\n{\n    return x / y;\n}\nstatic inline int16_t squot16(int16_t x, int16_t y)\n{\n    return x / y;\n}\nstatic inline int32_t squot32(int32_t x, int32_t y)\n{\n    return x / y;\n}\nstatic inline int64_t squot64(int64_t x, int64_t y)\n{\n    return x / y;\n}\nstatic inline int8_t srem8(int8_t x, int8_t y)\n{\n    return x % y;\n}\nstatic inline int16_t srem16(int16_t x, int16_t y)\n{\n    return x % y;\n}\nstatic inline int32_t srem32(int32_t x, int32_t y)\n{\n    return x % y;\n}\nstatic inline int64_t srem64(int64_t x, int64_t y)\n{\n    return x % y;\n}\nstatic inline int8_t squot_safe8(int8_t x, int8_t y)\n{\n    return y == 0 ? 0 : x / y;\n}\nstatic inline int16_t squot_safe16(int16_t x, int16_t y)\n{\n    return y == 0 ? 0 : x / y;\n}\nstatic inline int32_t squot_safe32(int32_t x, int32_t y)\n{\n    return y == 0 ? 0 : x / y;\n}\nstatic inline int64_t squot_safe64(int64_t x, int64_t y)\n{\n    return y == 0 ? 0 : x / y;\n}\nstatic inline int8_t srem_safe8(int8_t x, int8_t y)\n{\n    return y == 0 ? 0 : x % y;\n}\nstatic inline int16_t srem_safe16(int16_t x, int16_t y)\n{\n    return y == 0 ? 
0 : x % y;\n}\nstatic inline int32_t srem_safe32(int32_t x, int32_t y)\n{\n    return y == 0 ? 0 : x % y;\n}\nstatic inline int64_t srem_safe64(int64_t x, int64_t y)\n{\n    return ",
+                   "y == 0 ? 0 : x % y;\n}\nstatic inline int8_t smin8(int8_t x, int8_t y)\n{\n    return x < y ? x : y;\n}\nstatic inline int16_t smin16(int16_t x, int16_t y)\n{\n    return x < y ? x : y;\n}\nstatic inline int32_t smin32(int32_t x, int32_t y)\n{\n    return x < y ? x : y;\n}\nstatic inline int64_t smin64(int64_t x, int64_t y)\n{\n    return x < y ? x : y;\n}\nstatic inline uint8_t umin8(uint8_t x, uint8_t y)\n{\n    return x < y ? x : y;\n}\nstatic inline uint16_t umin16(uint16_t x, uint16_t y)\n{\n    return x < y ? x : y;\n}\nstatic inline uint32_t umin32(uint32_t x, uint32_t y)\n{\n    return x < y ? x : y;\n}\nstatic inline uint64_t umin64(uint64_t x, uint64_t y)\n{\n    return x < y ? x : y;\n}\nstatic inline int8_t smax8(int8_t x, int8_t y)\n{\n    return x < y ? y : x;\n}\nstatic inline int16_t smax16(int16_t x, int16_t y)\n{\n    return x < y ? y : x;\n}\nstatic inline int32_t smax32(int32_t x, int32_t y)\n{\n    return x < y ? y : x;\n}\nstatic inline int64_t smax64(int64_t x, int64_t y)\n{\n    return x < y ? y : x;\n}\nstatic inline uint8_t umax8(uint8_t x, uint8_t y)\n{\n    return x < y ? y : x;\n}\nstatic inline uint16_t umax16(uint16_t x, uint16_t y)\n{\n    return x < y ? y : x;\n}\nstatic inline uint32_t umax32(uint32_t x, uint32_t y)\n{\n    return x < y ? y : x;\n}\nstatic inline uint64_t umax64(uint64_t x, uint64_t y)\n{\n    return x < y ? 
y : x;\n}\nstatic inline uint8_t shl8(uint8_t x, uint8_t y)\n{\n    return x << y;\n}\nstatic inline uint16_t shl16(uint16_t x, uint16_t y)\n{\n    return x << y;\n}\nstatic inline uint32_t shl32(uint32_t x, uint32_t y)\n{\n    return x << y;\n}\nstatic inline uint64_t shl64(uint64_t x, uint64_t y)\n{\n    return x << y;\n}\nstatic inline uint8_t lshr8(uint8_t x, uint8_t y)\n{\n    return x >> y;\n}\nstatic inline uint16_t lshr16(uint16_t x, uint16_t y)\n{\n    return x >> y;\n}\nstatic inline uint32_t lshr32(uint32_t x, uint32_t y)\n{\n    return x >> y;\n}\nstatic inline uint64_t lshr64(uint64_t x, uint64_t y)\n{\n    return x >> y;\n}\nstatic inline int8_t ashr8(int8_t x, int8_t y)\n{\n    return x >> y;\n}\n",
+                   "static inline int16_t ashr16(int16_t x, int16_t y)\n{\n    return x >> y;\n}\nstatic inline int32_t ashr32(int32_t x, int32_t y)\n{\n    return x >> y;\n}\nstatic inline int64_t ashr64(int64_t x, int64_t y)\n{\n    return x >> y;\n}\nstatic inline uint8_t and8(uint8_t x, uint8_t y)\n{\n    return x & y;\n}\nstatic inline uint16_t and16(uint16_t x, uint16_t y)\n{\n    return x & y;\n}\nstatic inline uint32_t and32(uint32_t x, uint32_t y)\n{\n    return x & y;\n}\nstatic inline uint64_t and64(uint64_t x, uint64_t y)\n{\n    return x & y;\n}\nstatic inline uint8_t or8(uint8_t x, uint8_t y)\n{\n    return x | y;\n}\nstatic inline uint16_t or16(uint16_t x, uint16_t y)\n{\n    return x | y;\n}\nstatic inline uint32_t or32(uint32_t x, uint32_t y)\n{\n    return x | y;\n}\nstatic inline uint64_t or64(uint64_t x, uint64_t y)\n{\n    return x | y;\n}\nstatic inline uint8_t xor8(uint8_t x, uint8_t y)\n{\n    return x ^ y;\n}\nstatic inline uint16_t xor16(uint16_t x, uint16_t y)\n{\n    return x ^ y;\n}\nstatic inline uint32_t xor32(uint32_t x, uint32_t y)\n{\n    return x ^ y;\n}\nstatic inline uint64_t xor64(uint64_t x, uint64_t y)\n{\n    return x ^ y;\n}\nstatic inline bool ult8(uint8_t x, uint8_t y)\n{\n    return x < y;\n}\nstatic inline bool ult16(uint16_t x, uint16_t y)\n{\n    return x < y;\n}\nstatic inline bool ult32(uint32_t x, uint32_t y)\n{\n    return x < y;\n}\nstatic inline bool ult64(uint64_t x, uint64_t y)\n{\n    return x < y;\n}\nstatic inline bool ule8(uint8_t x, uint8_t y)\n{\n    return x <= y;\n}\nstatic inline bool ule16(uint16_t x, uint16_t y)\n{\n    return x <= y;\n}\nstatic inline bool ule32(uint32_t x, uint32_t y)\n{\n    return x <= y;\n}\nstatic inline bool ule64(uint64_t x, uint64_t y)\n{\n    return x <= y;\n}\nstatic inline bool slt8(int8_t x, int8_t y)\n{\n    return x < y;\n}\nstatic inline bool slt16(int16_t x, int16_t y)\n{\n    return x < y;\n}\nstatic inline bool slt32(int32_t x, int32_t y)\n{\n    return x < 
y;\n}\nstatic inline bool slt64(int64_t x, int64_t y)\n{\n    return x < y;\n}\nstatic inline bool sle8(int8_t x, int8_t y)\n{\n    retur",
+                   "n x <= y;\n}\nstatic inline bool sle16(int16_t x, int16_t y)\n{\n    return x <= y;\n}\nstatic inline bool sle32(int32_t x, int32_t y)\n{\n    return x <= y;\n}\nstatic inline bool sle64(int64_t x, int64_t y)\n{\n    return x <= y;\n}\nstatic inline int8_t pow8(int8_t x, int8_t y)\n{\n    int8_t res = 1, rem = y;\n    \n    while (rem != 0) {\n        if (rem & 1)\n            res *= x;\n        rem >>= 1;\n        x *= x;\n    }\n    return res;\n}\nstatic inline int16_t pow16(int16_t x, int16_t y)\n{\n    int16_t res = 1, rem = y;\n    \n    while (rem != 0) {\n        if (rem & 1)\n            res *= x;\n        rem >>= 1;\n        x *= x;\n    }\n    return res;\n}\nstatic inline int32_t pow32(int32_t x, int32_t y)\n{\n    int32_t res = 1, rem = y;\n    \n    while (rem != 0) {\n        if (rem & 1)\n            res *= x;\n        rem >>= 1;\n        x *= x;\n    }\n    return res;\n}\nstatic inline int64_t pow64(int64_t x, int64_t y)\n{\n    int64_t res = 1, rem = y;\n    \n    while (rem != 0) {\n        if (rem & 1)\n            res *= x;\n        rem >>= 1;\n        x *= x;\n    }\n    return res;\n}\nstatic inline bool itob_i8_bool(int8_t x)\n{\n    return x;\n}\nstatic inline bool itob_i16_bool(int16_t x)\n{\n    return x;\n}\nstatic inline bool itob_i32_bool(int32_t x)\n{\n    return x;\n}\nstatic inline bool itob_i64_bool(int64_t x)\n{\n    return x;\n}\nstatic inline int8_t btoi_bool_i8(bool x)\n{\n    return x;\n}\nstatic inline int16_t btoi_bool_i16(bool x)\n{\n    return x;\n}\nstatic inline int32_t btoi_bool_i32(bool x)\n{\n    return x;\n}\nstatic inline int64_t btoi_bool_i64(bool x)\n{\n    return x;\n}\n#define sext_i8_i8(x) ((int8_t) (int8_t) x)\n#define sext_i8_i16(x) ((int16_t) (int8_t) x)\n#define sext_i8_i32(x) ((int32_t) (int8_t) x)\n#define sext_i8_i64(x) ((int64_t) (int8_t) x)\n#define sext_i16_i8(x) ((int8_t) (int16_t) x)\n#define sext_i16_i16(x) ((int16_t) (int16_t) x)\n#define sext_i16_i32(x) ((int32_t) (int16_t) 
x)\n#define sext_i16_i64(x) ((int64_t) (int16_t) x)\n#define sext_i32_i8(x) ((int8_t) (int32_t) x)\n#define sext_i32_i16(x) (",
+                   "(int16_t) (int32_t) x)\n#define sext_i32_i32(x) ((int32_t) (int32_t) x)\n#define sext_i32_i64(x) ((int64_t) (int32_t) x)\n#define sext_i64_i8(x) ((int8_t) (int64_t) x)\n#define sext_i64_i16(x) ((int16_t) (int64_t) x)\n#define sext_i64_i32(x) ((int32_t) (int64_t) x)\n#define sext_i64_i64(x) ((int64_t) (int64_t) x)\n#define zext_i8_i8(x) ((int8_t) (uint8_t) x)\n#define zext_i8_i16(x) ((int16_t) (uint8_t) x)\n#define zext_i8_i32(x) ((int32_t) (uint8_t) x)\n#define zext_i8_i64(x) ((int64_t) (uint8_t) x)\n#define zext_i16_i8(x) ((int8_t) (uint16_t) x)\n#define zext_i16_i16(x) ((int16_t) (uint16_t) x)\n#define zext_i16_i32(x) ((int32_t) (uint16_t) x)\n#define zext_i16_i64(x) ((int64_t) (uint16_t) x)\n#define zext_i32_i8(x) ((int8_t) (uint32_t) x)\n#define zext_i32_i16(x) ((int16_t) (uint32_t) x)\n#define zext_i32_i32(x) ((int32_t) (uint32_t) x)\n#define zext_i32_i64(x) ((int64_t) (uint32_t) x)\n#define zext_i64_i8(x) ((int8_t) (uint64_t) x)\n#define zext_i64_i16(x) ((int16_t) (uint64_t) x)\n#define zext_i64_i32(x) ((int32_t) (uint64_t) x)\n#define zext_i64_i64(x) ((int64_t) (uint64_t) x)\n#if defined(__OPENCL_VERSION__)\nstatic int32_t futrts_popc8(int8_t x)\n{\n    return popcount(x);\n}\nstatic int32_t futrts_popc16(int16_t x)\n{\n    return popcount(x);\n}\nstatic int32_t futrts_popc32(int32_t x)\n{\n    return popcount(x);\n}\nstatic int32_t futrts_popc64(int64_t x)\n{\n    return popcount(x);\n}\n#elif defined(__CUDA_ARCH__)\nstatic int32_t futrts_popc8(int8_t x)\n{\n    return __popc(zext_i8_i32(x));\n}\nstatic int32_t futrts_popc16(int16_t x)\n{\n    return __popc(zext_i16_i32(x));\n}\nstatic int32_t futrts_popc32(int32_t x)\n{\n    return __popc(x);\n}\nstatic int32_t futrts_popc64(int64_t x)\n{\n    return __popcll(x);\n}\n#else\nstatic int32_t futrts_popc8(int8_t x)\n{\n    int c = 0;\n    \n    for (; x; ++c)\n        x &= x - 1;\n    return c;\n}\nstatic int32_t futrts_popc16(int16_t x)\n{\n    int c = 0;\n    \n    for (; x; ++c)\n       
 x &= x - 1;\n    return c;\n}\nstatic int32_t futrts_popc32(int32_t x)\n{\n    int c = 0;\n    \n ",
+                   "   for (; x; ++c)\n        x &= x - 1;\n    return c;\n}\nstatic int32_t futrts_popc64(int64_t x)\n{\n    int c = 0;\n    \n    for (; x; ++c)\n        x &= x - 1;\n    return c;\n}\n#endif\n#if defined(__OPENCL_VERSION__)\nstatic uint8_t futrts_mul_hi8(uint8_t a, uint8_t b)\n{\n    return mul_hi(a, b);\n}\nstatic uint16_t futrts_mul_hi16(uint16_t a, uint16_t b)\n{\n    return mul_hi(a, b);\n}\nstatic uint32_t futrts_mul_hi32(uint32_t a, uint32_t b)\n{\n    return mul_hi(a, b);\n}\nstatic uint64_t futrts_mul_hi64(uint64_t a, uint64_t b)\n{\n    return mul_hi(a, b);\n}\n#elif defined(__CUDA_ARCH__)\nstatic uint8_t futrts_mul_hi8(uint8_t a, uint8_t b)\n{\n    uint16_t aa = a;\n    uint16_t bb = b;\n    \n    return aa * bb >> 8;\n}\nstatic uint16_t futrts_mul_hi16(uint16_t a, uint16_t b)\n{\n    uint32_t aa = a;\n    uint32_t bb = b;\n    \n    return aa * bb >> 16;\n}\nstatic uint32_t futrts_mul_hi32(uint32_t a, uint32_t b)\n{\n    return mulhi(a, b);\n}\nstatic uint64_t futrts_mul_hi64(uint64_t a, uint64_t b)\n{\n    return mul64hi(a, b);\n}\n#else\nstatic uint8_t futrts_mul_hi8(uint8_t a, uint8_t b)\n{\n    uint16_t aa = a;\n    uint16_t bb = b;\n    \n    return aa * bb >> 8;\n}\nstatic uint16_t futrts_mul_hi16(uint16_t a, uint16_t b)\n{\n    uint32_t aa = a;\n    uint32_t bb = b;\n    \n    return aa * bb >> 16;\n}\nstatic uint32_t futrts_mul_hi32(uint32_t a, uint32_t b)\n{\n    uint64_t aa = a;\n    uint64_t bb = b;\n    \n    return aa * bb >> 32;\n}\nstatic uint64_t futrts_mul_hi64(uint64_t a, uint64_t b)\n{\n    __uint128_t aa = a;\n    __uint128_t bb = b;\n    \n    return aa * bb >> 64;\n}\n#endif\n#if defined(__OPENCL_VERSION__)\nstatic uint8_t futrts_mad_hi8(uint8_t a, uint8_t b, uint8_t c)\n{\n    return mad_hi(a, b, c);\n}\nstatic uint16_t futrts_mad_hi16(uint16_t a, uint16_t b, uint16_t c)\n{\n    return mad_hi(a, b, c);\n}\nstatic uint32_t futrts_mad_hi32(uint32_t a, uint32_t b, uint32_t c)\n{\n    return mad_hi(a, b, 
c);\n}\nstatic uint64_t futrts_mad_hi64(uint64_t a, uint64_t b, uint64_t c)\n{\n    return mad_hi(a, b, c);\n}\n#else\nstatic uint8_t ",
+                   "futrts_mad_hi8(uint8_t a, uint8_t b, uint8_t c)\n{\n    return futrts_mul_hi8(a, b) + c;\n}\nstatic uint16_t futrts_mad_hi16(uint16_t a, uint16_t b, uint16_t c)\n{\n    return futrts_mul_hi16(a, b) + c;\n}\nstatic uint32_t futrts_mad_hi32(uint32_t a, uint32_t b, uint32_t c)\n{\n    return futrts_mul_hi32(a, b) + c;\n}\nstatic uint64_t futrts_mad_hi64(uint64_t a, uint64_t b, uint64_t c)\n{\n    return futrts_mul_hi64(a, b) + c;\n}\n#endif\n#if defined(__OPENCL_VERSION__)\nstatic int32_t futrts_clzz8(int8_t x)\n{\n    return clz(x);\n}\nstatic int32_t futrts_clzz16(int16_t x)\n{\n    return clz(x);\n}\nstatic int32_t futrts_clzz32(int32_t x)\n{\n    return clz(x);\n}\nstatic int32_t futrts_clzz64(int64_t x)\n{\n    return clz(x);\n}\n#elif defined(__CUDA_ARCH__)\nstatic int32_t futrts_clzz8(int8_t x)\n{\n    return __clz(zext_i8_i32(x)) - 24;\n}\nstatic int32_t futrts_clzz16(int16_t x)\n{\n    return __clz(zext_i16_i32(x)) - 16;\n}\nstatic int32_t futrts_clzz32(int32_t x)\n{\n    return __clz(x);\n}\nstatic int32_t futrts_clzz64(int64_t x)\n{\n    return __clzll(x);\n}\n#else\nstatic int32_t futrts_clzz8(int8_t x)\n{\n    int n = 0;\n    int bits = sizeof(x) * 8;\n    \n    for (int i = 0; i < bits; i++) {\n        if (x < 0)\n            break;\n        n++;\n        x <<= 1;\n    }\n    return n;\n}\nstatic int32_t futrts_clzz16(int16_t x)\n{\n    int n = 0;\n    int bits = sizeof(x) * 8;\n    \n    for (int i = 0; i < bits; i++) {\n        if (x < 0)\n            break;\n        n++;\n        x <<= 1;\n    }\n    return n;\n}\nstatic int32_t futrts_clzz32(int32_t x)\n{\n    int n = 0;\n    int bits = sizeof(x) * 8;\n    \n    for (int i = 0; i < bits; i++) {\n        if (x < 0)\n            break;\n        n++;\n        x <<= 1;\n    }\n    return n;\n}\nstatic int32_t futrts_clzz64(int64_t x)\n{\n    int n = 0;\n    int bits = sizeof(x) * 8;\n    \n    for (int i = 0; i < bits; i++) {\n        if (x < 0)\n            break;\n        n++;\n    
    x <<= 1;\n    }\n    return n;\n}\n#endif\n#if defined(__OPENCL_VERSION__)\nstatic int32_t futrts_ctzz8(int8_t x)\n{\n    int",
+                   " i = 0;\n    \n    for (; i < 8 && (x & 1) == 0; i++, x >>= 1)\n        ;\n    return i;\n}\nstatic int32_t futrts_ctzz16(int16_t x)\n{\n    int i = 0;\n    \n    for (; i < 16 && (x & 1) == 0; i++, x >>= 1)\n        ;\n    return i;\n}\nstatic int32_t futrts_ctzz32(int32_t x)\n{\n    int i = 0;\n    \n    for (; i < 32 && (x & 1) == 0; i++, x >>= 1)\n        ;\n    return i;\n}\nstatic int32_t futrts_ctzz64(int64_t x)\n{\n    int i = 0;\n    \n    for (; i < 64 && (x & 1) == 0; i++, x >>= 1)\n        ;\n    return i;\n}\n#elif defined(__CUDA_ARCH__)\nstatic int32_t futrts_ctzz8(int8_t x)\n{\n    int y = __ffs(x);\n    \n    return y == 0 ? 8 : y - 1;\n}\nstatic int32_t futrts_ctzz16(int16_t x)\n{\n    int y = __ffs(x);\n    \n    return y == 0 ? 16 : y - 1;\n}\nstatic int32_t futrts_ctzz32(int32_t x)\n{\n    int y = __ffs(x);\n    \n    return y == 0 ? 32 : y - 1;\n}\nstatic int32_t futrts_ctzz64(int64_t x)\n{\n    int y = __ffsll(x);\n    \n    return y == 0 ? 64 : y - 1;\n}\n#else\nstatic int32_t futrts_ctzz8(int8_t x)\n{\n    return x == 0 ? 8 : __builtin_ctz((uint32_t) x);\n}\nstatic int32_t futrts_ctzz16(int16_t x)\n{\n    return x == 0 ? 16 : __builtin_ctz((uint32_t) x);\n}\nstatic int32_t futrts_ctzz32(int32_t x)\n{\n    return x == 0 ? 32 : __builtin_ctz(x);\n}\nstatic int32_t futrts_ctzz64(int64_t x)\n{\n    return x == 0 ? 
64 : __builtin_ctzll(x);\n}\n#endif\nstatic inline float fdiv32(float x, float y)\n{\n    return x / y;\n}\nstatic inline float fadd32(float x, float y)\n{\n    return x + y;\n}\nstatic inline float fsub32(float x, float y)\n{\n    return x - y;\n}\nstatic inline float fmul32(float x, float y)\n{\n    return x * y;\n}\nstatic inline float fmin32(float x, float y)\n{\n    return fmin(x, y);\n}\nstatic inline float fmax32(float x, float y)\n{\n    return fmax(x, y);\n}\nstatic inline float fpow32(float x, float y)\n{\n    return pow(x, y);\n}\nstatic inline bool cmplt32(float x, float y)\n{\n    return x < y;\n}\nstatic inline bool cmple32(float x, float y)\n{\n    return x <= y;\n}\nstatic inline float sitofp_i8_f32(int8_t x)\n{\n    return (floa",
+                   "t) x;\n}\nstatic inline float sitofp_i16_f32(int16_t x)\n{\n    return (float) x;\n}\nstatic inline float sitofp_i32_f32(int32_t x)\n{\n    return (float) x;\n}\nstatic inline float sitofp_i64_f32(int64_t x)\n{\n    return (float) x;\n}\nstatic inline float uitofp_i8_f32(uint8_t x)\n{\n    return (float) x;\n}\nstatic inline float uitofp_i16_f32(uint16_t x)\n{\n    return (float) x;\n}\nstatic inline float uitofp_i32_f32(uint32_t x)\n{\n    return (float) x;\n}\nstatic inline float uitofp_i64_f32(uint64_t x)\n{\n    return (float) x;\n}\nstatic inline int8_t fptosi_f32_i8(float x)\n{\n    return (int8_t) x;\n}\nstatic inline int16_t fptosi_f32_i16(float x)\n{\n    return (int16_t) x;\n}\nstatic inline int32_t fptosi_f32_i32(float x)\n{\n    return (int32_t) x;\n}\nstatic inline int64_t fptosi_f32_i64(float x)\n{\n    return (int64_t) x;\n}\nstatic inline uint8_t fptoui_f32_i8(float x)\n{\n    return (uint8_t) x;\n}\nstatic inline uint16_t fptoui_f32_i16(float x)\n{\n    return (uint16_t) x;\n}\nstatic inline uint32_t fptoui_f32_i32(float x)\n{\n    return (uint32_t) x;\n}\nstatic inline uint64_t fptoui_f32_i64(float x)\n{\n    return (uint64_t) x;\n}\nstatic inline bool futrts_isnan32(float x)\n{\n    return isnan(x);\n}\nstatic inline bool futrts_isinf32(float x)\n{\n    return isinf(x);\n}\n#ifdef __OPENCL_VERSION__\nstatic inline float futrts_log32(float x)\n{\n    return log(x);\n}\nstatic inline float futrts_log2_32(float x)\n{\n    return log2(x);\n}\nstatic inline float futrts_log10_32(float x)\n{\n    return log10(x);\n}\nstatic inline float futrts_sqrt32(float x)\n{\n    return sqrt(x);\n}\nstatic inline float futrts_exp32(float x)\n{\n    return exp(x);\n}\nstatic inline float futrts_cos32(float x)\n{\n    return cos(x);\n}\nstatic inline float futrts_sin32(float x)\n{\n    return sin(x);\n}\nstatic inline float futrts_tan32(float x)\n{\n    return tan(x);\n}\nstatic inline float futrts_acos32(float x)\n{\n    return 
acos(x);\n}\nstatic inline float futrts_asin32(float x)\n{\n    return asin(x);\n}\nstatic inline float futrts_atan32(float x)\n{\n    return atan(x);\n}",
+                   "\nstatic inline float futrts_cosh32(float x)\n{\n    return cosh(x);\n}\nstatic inline float futrts_sinh32(float x)\n{\n    return sinh(x);\n}\nstatic inline float futrts_tanh32(float x)\n{\n    return tanh(x);\n}\nstatic inline float futrts_acosh32(float x)\n{\n    return acosh(x);\n}\nstatic inline float futrts_asinh32(float x)\n{\n    return asinh(x);\n}\nstatic inline float futrts_atanh32(float x)\n{\n    return atanh(x);\n}\nstatic inline float futrts_atan2_32(float x, float y)\n{\n    return atan2(x, y);\n}\nstatic inline float futrts_hypot32(float x, float y)\n{\n    return hypot(x, y);\n}\nstatic inline float futrts_gamma32(float x)\n{\n    return tgamma(x);\n}\nstatic inline float futrts_lgamma32(float x)\n{\n    return lgamma(x);\n}\nstatic inline float fmod32(float x, float y)\n{\n    return fmod(x, y);\n}\nstatic inline float futrts_round32(float x)\n{\n    return rint(x);\n}\nstatic inline float futrts_floor32(float x)\n{\n    return floor(x);\n}\nstatic inline float futrts_ceil32(float x)\n{\n    return ceil(x);\n}\nstatic inline float futrts_lerp32(float v0, float v1, float t)\n{\n    return mix(v0, v1, t);\n}\nstatic inline float futrts_mad32(float a, float b, float c)\n{\n    return mad(a, b, c);\n}\nstatic inline float futrts_fma32(float a, float b, float c)\n{\n    return fma(a, b, c);\n}\n#else\nstatic inline float futrts_log32(float x)\n{\n    return logf(x);\n}\nstatic inline float futrts_log2_32(float x)\n{\n    return log2f(x);\n}\nstatic inline float futrts_log10_32(float x)\n{\n    return log10f(x);\n}\nstatic inline float futrts_sqrt32(float x)\n{\n    return sqrtf(x);\n}\nstatic inline float futrts_exp32(float x)\n{\n    return expf(x);\n}\nstatic inline float futrts_cos32(float x)\n{\n    return cosf(x);\n}\nstatic inline float futrts_sin32(float x)\n{\n    return sinf(x);\n}\nstatic inline float futrts_tan32(float x)\n{\n    return tanf(x);\n}\nstatic inline float futrts_acos32(float x)\n{\n    return 
acosf(x);\n}\nstatic inline float futrts_asin32(float x)\n{\n    return asinf(x);\n}\nstatic inline float futrts_atan32(float x)\n{\n    return ata",
+                   "nf(x);\n}\nstatic inline float futrts_cosh32(float x)\n{\n    return coshf(x);\n}\nstatic inline float futrts_sinh32(float x)\n{\n    return sinhf(x);\n}\nstatic inline float futrts_tanh32(float x)\n{\n    return tanhf(x);\n}\nstatic inline float futrts_acosh32(float x)\n{\n    return acoshf(x);\n}\nstatic inline float futrts_asinh32(float x)\n{\n    return asinhf(x);\n}\nstatic inline float futrts_atanh32(float x)\n{\n    return atanhf(x);\n}\nstatic inline float futrts_atan2_32(float x, float y)\n{\n    return atan2f(x, y);\n}\nstatic inline float futrts_hypot32(float x, float y)\n{\n    return hypotf(x, y);\n}\nstatic inline float futrts_gamma32(float x)\n{\n    return tgammaf(x);\n}\nstatic inline float futrts_lgamma32(float x)\n{\n    return lgammaf(x);\n}\nstatic inline float fmod32(float x, float y)\n{\n    return fmodf(x, y);\n}\nstatic inline float futrts_round32(float x)\n{\n    return rintf(x);\n}\nstatic inline float futrts_floor32(float x)\n{\n    return floorf(x);\n}\nstatic inline float futrts_ceil32(float x)\n{\n    return ceilf(x);\n}\nstatic inline float futrts_lerp32(float v0, float v1, float t)\n{\n    return v0 + (v1 - v0) * t;\n}\nstatic inline float futrts_mad32(float a, float b, float c)\n{\n    return a * b + c;\n}\nstatic inline float futrts_fma32(float a, float b, float c)\n{\n    return fmaf(a, b, c);\n}\n#endif\nstatic inline int32_t futrts_to_bits32(float x)\n{\n    union {\n        float f;\n        int32_t t;\n    } p;\n    \n    p.f = x;\n    return p.t;\n}\nstatic inline float futrts_from_bits32(int32_t x)\n{\n    union {\n        int32_t f;\n        float t;\n    } p;\n    \n    p.f = x;\n    return p.t;\n}\nstatic inline float fsignum32(float x)\n{\n    return futrts_isnan32(x) ? 
x : (x > 0) - (x < 0);\n}\n// Start of atomics.h\n\ninline int32_t atomic_xchg_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicExch((int32_t*)p, x);\n#else\n  return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_xchg_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicExch(",
+                   "(int32_t*)p, x);\n#else\n  return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_cmpxchg_i32_global(volatile __global int32_t *p,\n                                         int32_t cmp, int32_t val) {\n#ifdef FUTHARK_CUDA\n  return atomicCAS((int32_t*)p, cmp, val);\n#else\n  return atomic_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int32_t atomic_cmpxchg_i32_local(volatile __local int32_t *p,\n                                        int32_t cmp, int32_t val) {\n#ifdef FUTHARK_CUDA\n  return atomicCAS((int32_t*)p, cmp, val);\n#else\n  return atomic_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int32_t atomic_add_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicAdd((int32_t*)p, x);\n#else\n  return atomic_add(p, x);\n#endif\n}\n\ninline int32_t atomic_add_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicAdd((int32_t*)p, x);\n#else\n  return atomic_add(p, x);\n#endif\n}\n\ninline float atomic_fadd_f32_global(volatile __global float *p, float x) {\n#ifdef FUTHARK_CUDA\n  return atomicAdd((float*)p, x);\n#else\n  union { int32_t i; float f; } old;\n  union { int32_t i; float f; } assumed;\n  old.f = *p;\n  do {\n    assumed.f = old.f;\n    old.f = old.f + x;\n    old.i = atomic_cmpxchg_i32_global((volatile __global int32_t*)p, assumed.i, old.i);\n  } while (assumed.i != old.i);\n  return old.f;\n#endif\n}\n\ninline float atomic_fadd_f32_local(volatile __local float *p, float x) {\n#ifdef FUTHARK_CUDA\n  return atomicAdd((float*)p, x);\n#else\n  union { int32_t i; float f; } old;\n  union { int32_t i; float f; } assumed;\n  old.f = *p;\n  do {\n    assumed.f = old.f;\n    old.f = old.f + x;\n    old.i = atomic_cmpxchg_i32_local((volatile __local int32_t*)p, assumed.i, old.i);\n  } while (assumed.i != old.i);\n  return old.f;\n#endif\n}\n\ninline int32_t atomic_smax_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return 
atomicMax((int32_t*)p, x);\n#else\n  return atomic_max(p, x);\n#endif\n}\n\ninline int32_t atomic_smax_i32_local(volatile",
+                   " __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((int32_t*)p, x);\n#else\n  return atomic_max(p, x);\n#endif\n}\n\ninline int32_t atomic_smin_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((int32_t*)p, x);\n#else\n  return atomic_min(p, x);\n#endif\n}\n\ninline int32_t atomic_smin_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((int32_t*)p, x);\n#else\n  return atomic_min(p, x);\n#endif\n}\n\ninline uint32_t atomic_umax_i32_global(volatile __global uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((uint32_t*)p, x);\n#else\n  return atomic_max(p, x);\n#endif\n}\n\ninline uint32_t atomic_umax_i32_local(volatile __local uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((uint32_t*)p, x);\n#else\n  return atomic_max(p, x);\n#endif\n}\n\ninline uint32_t atomic_umin_i32_global(volatile __global uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((uint32_t*)p, x);\n#else\n  return atomic_min(p, x);\n#endif\n}\n\ninline uint32_t atomic_umin_i32_local(volatile __local uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((uint32_t*)p, x);\n#else\n  return atomic_min(p, x);\n#endif\n}\n\ninline int32_t atomic_and_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicAnd((int32_t*)p, x);\n#else\n  return atomic_and(p, x);\n#endif\n}\n\ninline int32_t atomic_and_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicAnd((int32_t*)p, x);\n#else\n  return atomic_and(p, x);\n#endif\n}\n\ninline int32_t atomic_or_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicOr((int32_t*)p, x);\n#else\n  return atomic_or(p, x);\n#endif\n}\n\ninline int32_t atomic_or_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicOr((int32_t*)p, x);\n#else\n  
return atomic_or(p, x);\n#endif\n}\n\ninline int32_t atomic_xor_i32_global(volatile __global int32_t *p, int3",
+                   "2_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicXor((int32_t*)p, x);\n#else\n  return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_xor_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicXor((int32_t*)p, x);\n#else\n  return atomic_xor(p, x);\n#endif\n}\n\n// Start of 64 bit atomics\n\ninline int64_t atomic_xchg_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicExch((uint64_t*)p, x);\n#else\n  return atom_xor(p, x);\n#endif\n}\n\ninline int64_t atomic_xchg_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicExch((uint64_t*)p, x);\n#else\n  return atom_xor(p, x);\n#endif\n}\n\ninline int64_t atomic_cmpxchg_i64_global(volatile __global int64_t *p,\n                                         int64_t cmp, int64_t val) {\n#ifdef FUTHARK_CUDA\n  return atomicCAS((uint64_t*)p, cmp, val);\n#else\n  return atom_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int64_t atomic_cmpxchg_i64_local(volatile __local int64_t *p,\n                                        int64_t cmp, int64_t val) {\n#ifdef FUTHARK_CUDA\n  return atomicCAS((uint64_t*)p, cmp, val);\n#else\n  return atom_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int64_t atomic_add_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicAdd((uint64_t*)p, x);\n#else\n  return atom_add(p, x);\n#endif\n}\n\ninline int64_t atomic_add_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicAdd((uint64_t*)p, x);\n#else\n  return atom_add(p, x);\n#endif\n}\n\n#ifdef FUTHARK_F64_ENABLED\n\ninline double atomic_fadd_f64_global(volatile __global double *p, double x) {\n#if defined(FUTHARK_CUDA) && __CUDA_ARCH__ >= 600\n  return atomicAdd((double*)p, x);\n#else\n  union { int64_t i; double f; } old;\n  union { int64_t i; double f; } assumed;\n  old.f = *p;\n  do {\n    assumed.f = old.f;\n    old.f = old.f + x;\n    old.i = 
atomic_cmpxchg_i64_global((volatile __global int64_t*)p, assumed.i, old.i);\n  } while (assumed.i != old.i);\n  ",
+                   "return old.f;\n#endif\n}\n\ninline double atomic_fadd_f64_local(volatile __local double *p, double x) {\n#if defined(FUTHARK_CUDA) && __CUDA_ARCH__ >= 600\n  return atomicAdd((double*)p, x);\n#else\n  union { int64_t i; double f; } old;\n  union { int64_t i; double f; } assumed;\n  old.f = *p;\n  do {\n    assumed.f = old.f;\n    old.f = old.f + x;\n    old.i = atomic_cmpxchg_i64_local((volatile __local int64_t*)p, assumed.i, old.i);\n  } while (assumed.i != old.i);\n  return old.f;\n#endif\n}\n\n#endif\n\ninline int64_t atomic_smax_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((int64_t*)p, x);\n#else\n  return atom_max(p, x);\n#endif\n}\n\ninline int64_t atomic_smax_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((int64_t*)p, x);\n#else\n  return atom_max(p, x);\n#endif\n}\n\ninline int64_t atomic_smin_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((int64_t*)p, x);\n#else\n  return atom_min(p, x);\n#endif\n}\n\ninline int64_t atomic_smin_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((int64_t*)p, x);\n#else\n  return atom_min(p, x);\n#endif\n}\n\ninline uint64_t atomic_umax_i64_global(volatile __global uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((uint64_t*)p, x);\n#else\n  return atom_max(p, x);\n#endif\n}\n\ninline uint64_t atomic_umax_i64_local(volatile __local uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((uint64_t*)p, x);\n#else\n  return atom_max(p, x);\n#endif\n}\n\ninline uint64_t atomic_umin_i64_global(volatile __global uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((uint64_t*)p, x);\n#else\n  return atom_min(p, x);\n#endif\n}\n\ninline uint64_t atomic_umin_i64_local(volatile __local uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((uint64_t*)p, x);\n#else\n  return 
atom_min(p, x);\n#endif\n}\n\ninline int64_t atomic_and_i64_global(volatile __global int64_t *p, int64_t x) ",
+                   "{\n#ifdef FUTHARK_CUDA\n  return atomicAnd((int64_t*)p, x);\n#else\n  return atom_and(p, x);\n#endif\n}\n\ninline int64_t atomic_and_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicAnd((int64_t*)p, x);\n#else\n  return atom_and(p, x);\n#endif\n}\n\ninline int64_t atomic_or_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicOr((int64_t*)p, x);\n#else\n  return atom_or(p, x);\n#endif\n}\n\ninline int64_t atomic_or_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicOr((int64_t*)p, x);\n#else\n  return atom_or(p, x);\n#endif\n}\n\ninline int64_t atomic_xor_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicXor((int64_t*)p, x);\n#else\n  return atom_xor(p, x);\n#endif\n}\n\ninline int64_t atomic_xor_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicXor((int64_t*)p, x);\n#else\n  return atom_xor(p, x);\n#endif\n}\n\n// End of atomics.h\n\n\n\n\n__kernel void get_envelopezicopy_9955(int64_t n_9485, int64_t i_9490, __global\n                                      unsigned char *chunk_board_mem_9941,\n                                      __global unsigned char *mem_9943)\n{\n    const int block_dim0 = 0;\n    const int block_dim1 = 1;\n    const int block_dim2 = 2;\n    int32_t copy_gtid_9955;\n    int32_t copy_ltid_9956;\n    int32_t copy_gid_9957;\n    \n    copy_gtid_9955 = get_global_id(0);\n    copy_ltid_9956 = get_local_id(0);\n    copy_gid_9957 = get_group_id(0);\n    if (slt64(sext_i32_i64(copy_gtid_9955), n_9485)) {\n        ((__global int8_t *) mem_9943)[n_9485 + sext_i32_i64(copy_gtid_9955)] =\n            ((__global int8_t *) chunk_board_mem_9941)[i_9490 +\n                                                       sext_i32_i64(copy_gtid_9955) *\n                                                       n_9485];\n    }\n    \n  error_0:\n    
return;\n}\n__kernel void get_envelopezicopy_9960(int64_t n_9485, __global\n                                    ",
+                   "  unsigned char *chunk_board_mem_9941,\n                                      __global unsigned char *mem_9943)\n{\n    const int block_dim0 = 0;\n    const int block_dim1 = 1;\n    const int block_dim2 = 2;\n    int32_t copy_gtid_9960;\n    int32_t copy_ltid_9961;\n    int32_t copy_gid_9962;\n    \n    copy_gtid_9960 = get_global_id(0);\n    copy_ltid_9961 = get_local_id(0);\n    copy_gid_9962 = get_group_id(0);\n    if (slt64(sext_i32_i64(copy_gtid_9960), n_9485)) {\n        ((__global int8_t *) mem_9943)[(int64_t) 3 * n_9485 +\n                                       sext_i32_i64(copy_gtid_9960)] =\n            ((__global\n              int8_t *) chunk_board_mem_9941)[sext_i32_i64(copy_gtid_9960) *\n                                              n_9485];\n    }\n    \n  error_0:\n    return;\n}\n__kernel void next_chunk_boardzicopy_9965(int64_t n_9500, int64_t m_9501,\n                                          __global unsigned char *mem_9948,\n                                          __global unsigned char *mem_9950)\n{\n    const int block_dim0 = 0;\n    const int block_dim1 = 1;\n    const int block_dim2 = 2;\n    int32_t copy_gtid_9965;\n    int32_t copy_ltid_9966;\n    int32_t copy_gid_9967;\n    \n    copy_gtid_9965 = get_global_id(0);\n    copy_ltid_9966 = get_local_id(0);\n    copy_gid_9967 = get_group_id(0);\n    if (slt64(sext_i32_i64(copy_gtid_9965), n_9500 * n_9500)) {\n        ((__global int8_t *) mem_9950)[squot64(sext_i32_i64(copy_gtid_9965),\n                                               n_9500) * n_9500 +\n                                       (sext_i32_i64(copy_gtid_9965) -\n                                        squot64(sext_i32_i64(copy_gtid_9965),\n                                                n_9500) * n_9500)] = ((__global\n                                                                       int8_t *) mem_9948)[m_9501 +\n                                                                                           
(int64_t) 1 +\n                                                       ",
+                   "                                    (squot64(sext_i32_i64(copy_gtid_9965),\n                                                                                                    n_9500) *\n                                                                                            m_9501 +\n                                                                                            (sext_i32_i64(copy_gtid_9965) -\n                                                                                             squot64(sext_i32_i64(copy_gtid_9965),\n                                                                                                     n_9500) *\n                                                                                             n_9500))];\n    }\n    \n  error_0:\n    return;\n}\n__kernel void next_chunk_boardzisegmap_9619(__global int *global_failure,\n                                            int failure_is_an_option, __global\n                                            int64_t *global_failure_args,\n                                            int64_t n_9500, int64_t m_9501,\n                                            __global\n                                            unsigned char *chunk_board_mem_9941,\n                                            __global\n                                            unsigned char *envelope_board_mem_9942,\n                                            __global unsigned char *mem_9945)\n{\n    #define segmap_group_sizze_9734 (next_chunk_boardzisegmap_group_sizze_9622)\n    \n    const int block_dim0 = 0;\n    const int block_dim1 = 1;\n    const int block_dim2 = 2;\n    \n    if (*global_failure >= 0)\n        return;\n    \n    int32_t global_tid_9955;\n    int32_t local_tid_9956;\n    int64_t group_sizze_9959;\n    int32_t wave_sizze_9958;\n    int32_t group_tid_9957;\n    \n    global_tid_9955 = get_global_id(0);\n    local_tid_9956 = get_local_id(0);\n    group_sizze_9959 = 
get_local_size(0);\n    wave_sizze_9958 = LOCKSTEP_WIDTH;\n    group_tid_99",
+                   "57 = get_group_id(0);\n    \n    int32_t phys_tid_9619;\n    \n    phys_tid_9619 = global_tid_9955;\n    \n    int64_t gtid_9617;\n    \n    gtid_9617 = squot64(sext_i32_i64(group_tid_9957) * segmap_group_sizze_9734 +\n                        sext_i32_i64(local_tid_9956), m_9501);\n    \n    int64_t gtid_9618;\n    \n    gtid_9618 = sext_i32_i64(group_tid_9957) * segmap_group_sizze_9734 +\n        sext_i32_i64(local_tid_9956) - squot64(sext_i32_i64(group_tid_9957) *\n                                               segmap_group_sizze_9734 +\n                                               sext_i32_i64(local_tid_9956),\n                                               m_9501) * m_9501;\n    if (slt64(gtid_9617, m_9501) && slt64(gtid_9618, m_9501)) {\n        bool index_primexp_9904 = gtid_9617 == (int64_t) 0;\n        int8_t defunc_0_f_res_9740;\n        \n        if (index_primexp_9904) {\n            int8_t defunc_0_f_res_t_res_9745 = ((__global\n                                                 int8_t *) envelope_board_mem_9942)[gtid_9618];\n            \n            defunc_0_f_res_9740 = defunc_0_f_res_t_res_9745;\n        } else {\n            int64_t y_9746 = sub64(m_9501, (int64_t) 1);\n            bool cond_9747 = gtid_9618 == y_9746;\n            int8_t defunc_0_f_res_f_res_9748;\n            \n            if (cond_9747) {\n                int8_t defunc_0_f_res_f_res_t_res_9753 = ((__global\n                                                           int8_t *) envelope_board_mem_9942)[m_9501 +\n                                                                                              gtid_9617];\n                \n                defunc_0_f_res_f_res_9748 = defunc_0_f_res_f_res_t_res_9753;\n            } else {\n                bool cond_9754 = gtid_9617 == y_9746;\n                int8_t defunc_0_f_res_f_res_f_res_9755;\n                \n                if (cond_9754) {\n                    int8_t defunc_0_f_res_f_res_f_res_t_res_9760 = 
((__global\n                                                       ",
+                   "              int8_t *) envelope_board_mem_9942)[(int64_t) 2 *\n                                                                                                        m_9501 +\n                                                                                                        gtid_9618];\n                    \n                    defunc_0_f_res_f_res_f_res_9755 =\n                        defunc_0_f_res_f_res_f_res_t_res_9760;\n                } else {\n                    bool cond_9761 = gtid_9618 == (int64_t) 0;\n                    int8_t defunc_0_f_res_f_res_f_res_f_res_9762;\n                    \n                    if (cond_9761) {\n                        int8_t defunc_0_f_res_f_res_f_res_f_res_t_res_9767 =\n                               ((__global\n                                 int8_t *) envelope_board_mem_9942)[(int64_t) 3 *\n                                                                    m_9501 +\n                                                                    gtid_9617];\n                        \n                        defunc_0_f_res_f_res_f_res_f_res_9762 =\n                            defunc_0_f_res_f_res_f_res_f_res_t_res_9767;\n                    } else {\n                        int64_t i_9768 = sub64(gtid_9617, (int64_t) 1);\n                        bool x_9769 = sle64((int64_t) 0, i_9768);\n                        bool y_9770 = slt64(i_9768, n_9500);\n                        bool bounds_check_9771 = x_9769 && y_9770;\n                        int64_t i_9772 = sub64(gtid_9618, (int64_t) 1);\n                        bool x_9773 = sle64((int64_t) 0, i_9772);\n                        bool y_9774 = slt64(i_9772, n_9500);\n                        bool bounds_check_9775 = x_9773 && y_9774;\n                        bool index_ok_9776 = bounds_check_9771 &&\n                             bounds_check_9775;\n                        bool index_certs_9777;\n                        \n                        if 
(!index_ok_9776) {\n                            {\n               ",
+                   "                 if (atomic_cmpxchg_i32_global(global_failure,\n                                                              -1, 0) == -1) {\n                                    global_failure_args[0] = i_9768;\n                                    global_failure_args[1] = i_9772;\n                                    global_failure_args[2] = n_9500;\n                                    global_failure_args[3] = n_9500;\n                                    ;\n                                }\n                                return;\n                            }\n                        }\n                        \n                        int8_t defunc_0_f_res_f_res_f_res_f_res_f_res_9778 =\n                               ((__global\n                                 int8_t *) chunk_board_mem_9941)[i_9768 *\n                                                                 n_9500 +\n                                                                 i_9772];\n                        \n                        defunc_0_f_res_f_res_f_res_f_res_9762 =\n                            defunc_0_f_res_f_res_f_res_f_res_f_res_9778;\n                    }\n                    defunc_0_f_res_f_res_f_res_9755 =\n                        defunc_0_f_res_f_res_f_res_f_res_9762;\n                }\n                defunc_0_f_res_f_res_9748 = defunc_0_f_res_f_res_f_res_9755;\n            }\n            defunc_0_f_res_9740 = defunc_0_f_res_f_res_9748;\n        }\n        ((__global int8_t *) mem_9945)[gtid_9617 * m_9501 + gtid_9618] =\n            defunc_0_f_res_9740;\n    }\n    \n  error_0:\n    return;\n    #undef segmap_group_sizze_9734\n}\n__kernel void next_chunk_boardzisegmap_9790(__global int *global_failure,\n                                            int64_t m_9501, __global\n                                            unsigned char *mem_9945, __global\n                                            unsigned char *mem_9948)\n{\n    #define segmap_group_sizze_9865 
(next_chunk_boardzisegmap_group_sizze_9793)\n    \n    const int ",
+                   "block_dim0 = 0;\n    const int block_dim1 = 1;\n    const int block_dim2 = 2;\n    \n    if (*global_failure >= 0)\n        return;\n    \n    int32_t global_tid_9960;\n    int32_t local_tid_9961;\n    int64_t group_sizze_9964;\n    int32_t wave_sizze_9963;\n    int32_t group_tid_9962;\n    \n    global_tid_9960 = get_global_id(0);\n    local_tid_9961 = get_local_id(0);\n    group_sizze_9964 = get_local_size(0);\n    wave_sizze_9963 = LOCKSTEP_WIDTH;\n    group_tid_9962 = get_group_id(0);\n    \n    int32_t phys_tid_9790;\n    \n    phys_tid_9790 = global_tid_9960;\n    \n    int64_t gtid_9788;\n    \n    gtid_9788 = squot64(sext_i32_i64(group_tid_9962) * segmap_group_sizze_9865 +\n                        sext_i32_i64(local_tid_9961), m_9501);\n    \n    int64_t gtid_9789;\n    \n    gtid_9789 = sext_i32_i64(group_tid_9962) * segmap_group_sizze_9865 +\n        sext_i32_i64(local_tid_9961) - squot64(sext_i32_i64(group_tid_9962) *\n                                               segmap_group_sizze_9865 +\n                                               sext_i32_i64(local_tid_9961),\n                                               m_9501) * m_9501;\n    if (slt64(gtid_9788, m_9501) && slt64(gtid_9789, m_9501)) {\n        int64_t i_p_o_9935 = add64((int64_t) -1, gtid_9788);\n        int64_t rot_i_9936 = smod64(i_p_o_9935, m_9501);\n        int64_t i_p_o_9937 = add64((int64_t) -1, gtid_9789);\n        int64_t rot_i_9938 = smod64(i_p_o_9937, m_9501);\n        int8_t x_9868 = ((__global int8_t *) mem_9945)[rot_i_9936 * m_9501 +\n                                                       rot_i_9938];\n        int64_t rot_i_9934 = smod64(gtid_9789, m_9501);\n        int8_t x_9869 = ((__global int8_t *) mem_9945)[rot_i_9936 * m_9501 +\n                                                       rot_i_9934];\n        int64_t i_p_o_9929 = add64((int64_t) 1, gtid_9789);\n        int64_t rot_i_9930 = smod64(i_p_o_9929, m_9501);\n        int8_t x_9870 = ((__global int8_t 
*) mem_9945)[rot_i_9936 * m_9501 +\n                                 ",
+                   "                      rot_i_9930];\n        int64_t rot_i_9924 = smod64(gtid_9788, m_9501);\n        int8_t x_9871 = ((__global int8_t *) mem_9945)[rot_i_9924 * m_9501 +\n                                                       rot_i_9938];\n        int8_t x_9872 = ((__global int8_t *) mem_9945)[rot_i_9924 * m_9501 +\n                                                       rot_i_9930];\n        int64_t i_p_o_9915 = add64((int64_t) 1, gtid_9788);\n        int64_t rot_i_9916 = smod64(i_p_o_9915, m_9501);\n        int8_t x_9873 = ((__global int8_t *) mem_9945)[rot_i_9916 * m_9501 +\n                                                       rot_i_9938];\n        int8_t x_9874 = ((__global int8_t *) mem_9945)[rot_i_9916 * m_9501 +\n                                                       rot_i_9934];\n        int8_t x_9875 = ((__global int8_t *) mem_9945)[rot_i_9916 * m_9501 +\n                                                       rot_i_9930];\n        int8_t x_9876 = ((__global int8_t *) mem_9945)[gtid_9788 * m_9501 +\n                                                       gtid_9789];\n        int8_t x_9877 = add8(x_9868, x_9869);\n        int8_t x_9878 = add8(x_9870, x_9877);\n        int8_t x_9879 = add8(x_9871, x_9878);\n        int8_t x_9880 = add8(x_9872, x_9879);\n        int8_t x_9881 = add8(x_9873, x_9880);\n        int8_t x_9882 = add8(x_9874, x_9881);\n        int8_t defunc_2_f_res_9883 = add8(x_9875, x_9882);\n        bool cond_9884 = x_9876 == (int8_t) 1;\n        bool cond_9885 = defunc_2_f_res_9883 == (int8_t) 2;\n        bool cond_t_res_f_res_9886 = defunc_2_f_res_9883 == (int8_t) 3;\n        bool x_9887 = !cond_9885;\n        bool y_9888 = cond_t_res_f_res_9886 && x_9887;\n        bool cond_t_res_9889 = cond_9885 || y_9888;\n        bool x_9890 = cond_9884 && cond_t_res_9889;\n        bool cond_9891 = x_9876 == (int8_t) 0;\n        bool x_9892 = cond_t_res_f_res_9886 && cond_9891;\n        bool x_9893 = !x_9890;\n        bool y_9894 = 
x_9892 && x_9893;\n        bool cond_9895 = x_9890 || y_9894",
+                   ";\n        int8_t defunc_1_f_res_9896 = btoi_bool_i8(cond_9895);\n        \n        ((__global int8_t *) mem_9948)[gtid_9788 * m_9501 + gtid_9789] =\n            defunc_1_f_res_9896;\n    }\n    \n  error_0:\n    return;\n    #undef segmap_group_sizze_9865\n}\n",
+                   NULL};
+static const char *size_names[] = {"get_envelope.group_size_9958",
+                                   "get_envelope.group_size_9963",
+                                   "next_chunk_board.group_size_9968",
+                                   "next_chunk_board.segmap_group_size_9622",
+                                   "next_chunk_board.segmap_group_size_9793"}; /* size_names:
+ * the strings futhark_context_config_set_size() compares its size_name
+ * argument against; a match at index i stores the value in cfg->sizes[i].
+ * The table is also handed to opencl_config_init(). */
+static const char *size_vars[] = {"get_envelopezigroup_sizze_9958",
+                                  "get_envelopezigroup_sizze_9963",
+                                  "next_chunk_boardzigroup_sizze_9968",
+                                  "next_chunk_boardzisegmap_group_sizze_9622",
+                                  "next_chunk_boardzisegmap_group_sizze_9793"}; /* size_vars:
+ * identifier spelling of each tunable size, listed in the same order as
+ * size_names.  The strings are identical to the field names of struct sizes.
+ * The table is handed to opencl_config_init(). */
+static const char *size_classes[] = {"group_size", "group_size", "group_size",
+                                     "group_size", "group_size"}; /* size_classes:
+ * one class string per entry of size_names (five in total); every entry is
+ * "group_size".  The table is handed to opencl_config_init(). */
+struct sizes { /* One int64_t per tunable size; field names equal the strings in size_vars. */
+    int64_t get_envelopezigroup_sizze_9958;
+    int64_t get_envelopezigroup_sizze_9963;
+    int64_t next_chunk_boardzigroup_sizze_9968;
+    int64_t next_chunk_boardzisegmap_group_sizze_9622; /* same identifier the segmap_9619 kernel text uses for its group size */
+    int64_t next_chunk_boardzisegmap_group_sizze_9793; /* same identifier the segmap_9790 kernel text uses for its group size */
+} ;
+struct futhark_context_config { /* User-adjustable settings, filled in before a context is created. */
+    struct opencl_config opencl; /* backend settings; copied into the context by init_context_early() */
+    int64_t sizes[5]; /* values for the five entries of size_names, same index order; zeroed by futhark_context_config_new() */
+    int num_build_opts; /* number of options stored in build_opts, not counting the NULL terminator */
+    const char **build_opts; /* heap array of option strings ending in NULL; the strings themselves are not copied */
+} ;
+/* Allocate a configuration with all tunable sizes set to 0 and an empty
+ * build-option list, then let opencl_config_init() fill in the OpenCL part.
+ *
+ * Returns the new configuration, or NULL if either allocation fails (nothing
+ * is leaked in that case).  Release it with futhark_context_config_free(). */
+struct futhark_context_config *futhark_context_config_new(void)
+{
+    struct futhark_context_config *cfg = malloc(sizeof *cfg);
+
+    if (cfg == NULL) {
+        return NULL;
+    }
+
+    /* The option list always ends in a NULL entry, so even the empty list
+     * needs room for one pointer. */
+    cfg->build_opts = malloc(sizeof *cfg->build_opts);
+    if (cfg->build_opts == NULL) {
+        free(cfg);
+        return NULL;
+    }
+    cfg->build_opts[0] = NULL;
+    cfg->num_build_opts = 0;
+
+    for (int i = 0; i < 5; i++) {
+        cfg->sizes[i] = 0;
+    }
+
+    opencl_config_init(&cfg->opencl, 5, size_names, size_vars, cfg->sizes,
+                       size_classes);
+    return cfg;
+}
+/* Release a configuration obtained from futhark_context_config_new().
+ *
+ * Frees the build-option array and the configuration itself; the option
+ * strings are not freed because the configuration only stores the pointers
+ * it was given.  Passing NULL is a no-op, so the result of a failed
+ * futhark_context_config_new() can be handed straight to this function. */
+void futhark_context_config_free(struct futhark_context_config *cfg)
+{
+    if (cfg == NULL) {
+        return;
+    }
+    free(cfg->build_opts);
+    free(cfg);
+}
+/* Append `opt` to the NULL-terminated list of build options.
+ *
+ * Only the pointer is stored; the caller must keep the string alive for as
+ * long as the configuration is in use.  If the list cannot be grown, the
+ * option is dropped and the existing list is left untouched and valid (the
+ * function has no way to report the failure to its caller). */
+void futhark_context_config_add_build_option(struct futhark_context_config *cfg,
+                                             const char *opt)
+{
+    /* Current storage holds num_build_opts entries plus the terminator; one
+     * more entry is needed.  Grow through a temporary so that a failed
+     * realloc() neither leaks the old array nor leaves a NULL pointer to be
+     * dereferenced. */
+    size_t new_len = (size_t) cfg->num_build_opts + 2;
+    const char **grown = realloc(cfg->build_opts, new_len * sizeof *grown);
+
+    if (grown == NULL) {
+        return;
+    }
+    grown[cfg->num_build_opts] = opt;
+    cfg->num_build_opts++;
+    grown[cfg->num_build_opts] = NULL;
+    cfg->build_opts = grown;
+}
+/* Set the debugging flag and propagate the stored value to the logging and
+ * profiling flags as well, so all three end up in step. */
+void futhark_context_config_set_debugging(struct futhark_context_config *cfg,
+                                          int flag)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    /* Each assignment reads back the field written just before it, which is
+     * exactly the value a chained `a = b = c = flag` would pass along. */
+    ocl->debugging = flag;
+    ocl->logging = ocl->debugging;
+    ocl->profiling = ocl->logging;
+}
+/* Store `flag` as the profiling setting of the OpenCL configuration. */
+void futhark_context_config_set_profiling(struct futhark_context_config *cfg,
+                                          int flag)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    ocl->profiling = flag;
+}
+/* Store `flag` as the logging setting of the OpenCL configuration. */
+void futhark_context_config_set_logging(struct futhark_context_config *cfg,
+                                        int flag)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    ocl->logging = flag;
+}
+/* Forward the device selector string `s` to set_preferred_device() for this
+ * configuration's OpenCL settings. */
+void futhark_context_config_set_device(struct futhark_context_config *cfg, const
+                                       char *s)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    set_preferred_device(ocl, s);
+}
+/* Forward the platform selector string `s` to set_preferred_platform() for
+ * this configuration's OpenCL settings. */
+void futhark_context_config_set_platform(struct futhark_context_config *cfg,
+                                         const char *s)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    set_preferred_platform(ocl, s);
+}
+/* Hand this configuration's OpenCL settings to
+ * select_device_interactively(). */
+void futhark_context_config_select_device_interactively(struct futhark_context_config *cfg)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    select_device_interactively(ocl);
+}
+/* Call list_devices().  The configuration argument is accepted but not
+ * consulted. */
+void futhark_context_config_list_devices(struct futhark_context_config *cfg)
+{
+    list_devices();
+    (void) cfg; /* deliberately unused */
+}
+/* Record `path` in the dump_program_to setting.  Only the pointer is stored,
+ * so the string must outlive its use by the configuration. */
+void futhark_context_config_dump_program_to(struct futhark_context_config *cfg,
+                                            const char *path)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    ocl->dump_program_to = path;
+}
+/* Record `path` in the load_program_from setting.  Only the pointer is
+ * stored, so the string must outlive its use by the configuration. */
+void futhark_context_config_load_program_from(struct futhark_context_config *cfg,
+                                              const char *path)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    ocl->load_program_from = path;
+}
+/* Record `path` in the dump_binary_to setting.  Only the pointer is stored,
+ * so the string must outlive its use by the configuration. */
+void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg,
+                                           const char *path)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    ocl->dump_binary_to = path;
+}
+/* Record `path` in the load_binary_from setting.  Only the pointer is
+ * stored, so the string must outlive its use by the configuration. */
+void futhark_context_config_load_binary_from(struct futhark_context_config *cfg,
+                                             const char *path)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    ocl->load_binary_from = path;
+}
+/* Set the default group size and raise default_group_size_changed to record
+ * that the value was set through this call. */
+void futhark_context_config_set_default_group_size(struct futhark_context_config *cfg,
+                                                   int size)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    ocl->default_group_size_changed = 1;
+    ocl->default_group_size = size;
+}
+/* Set the default number of groups.  A value of 0 leaves the choice to
+ * post_opencl_setup(), which only fills in fields that are still 0. */
+void futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg,
+                                                   int num)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    ocl->default_num_groups = num;
+}
+/* Set the default tile size and raise default_tile_size_changed to record
+ * that the value was set through this call. */
+void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg,
+                                                  int size)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    ocl->default_tile_size_changed = 1;
+    ocl->default_tile_size = size;
+}
+/* Set the default register tile size. */
+void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg,
+                                                      int size)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    ocl->default_reg_tile_size = size;
+}
+/* Set the default threshold. */
+void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg,
+                                                  int size)
+{
+    struct opencl_config *ocl = &cfg->opencl;
+
+    ocl->default_threshold = size;
+}
+/* Set a tunable size by name.
+ *
+ * `size_name` is first looked up in size_names; a hit stores `size_value` in
+ * the matching slot of cfg->sizes.  Otherwise the five generic names
+ * "default_group_size", "default_num_groups", "default_threshold",
+ * "default_tile_size" and "default_reg_tile_size" are tried, in that order.
+ *
+ * Returns 0 if the name was recognised and 1 if it was not.
+ *
+ * NOTE(review): unlike futhark_context_config_set_default_group_size() and
+ * futhark_context_config_set_default_tile_size(), this path does not set the
+ * corresponding *_changed flag -- confirm that this is intended. */
+int futhark_context_config_set_size(struct futhark_context_config *cfg, const
+                                    char *size_name, size_t size_value)
+{
+    const size_t num_sizes = sizeof cfg->sizes / sizeof cfg->sizes[0];
+
+    for (size_t idx = 0; idx < num_sizes; idx++) {
+        if (strcmp(size_name, size_names[idx]) != 0) {
+            continue;
+        }
+        cfg->sizes[idx] = size_value;
+        return 0;
+    }
+
+    struct opencl_config *ocl = &cfg->opencl;
+
+    if (strcmp(size_name, "default_group_size") == 0) {
+        ocl->default_group_size = size_value;
+    } else if (strcmp(size_name, "default_num_groups") == 0) {
+        ocl->default_num_groups = size_value;
+    } else if (strcmp(size_name, "default_threshold") == 0) {
+        ocl->default_threshold = size_value;
+    } else if (strcmp(size_name, "default_tile_size") == 0) {
+        ocl->default_tile_size = size_value;
+    } else if (strcmp(size_name, "default_reg_tile_size") == 0) {
+        ocl->default_reg_tile_size = size_value;
+    } else {
+        return 1;
+    }
+    return 0;
+}
+struct futhark_context { /* Runtime state; the scalar fields are given their starting values by init_context_early(). */
+    int detail_memory; /* starts as cfg->opencl.debugging */
+    int debugging; /* starts as cfg->opencl.debugging */
+    int profiling; /* starts as cfg->opencl.profiling */
+    int profiling_paused; /* starts at 0 */
+    int logging; /* starts as cfg->opencl.logging */
+    lock_t lock; /* set up with create_lock() */
+    char *error; /* starts as NULL */
+    FILE *log; /* starts as stderr */
+    int64_t peak_mem_usage_device; /* the four memory counters below all start at 0 */
+    int64_t cur_mem_usage_device;
+    int64_t peak_mem_usage_default;
+    int64_t cur_mem_usage_default;
+    struct {
+        int dummy;
+    } constants; /* holds only a placeholder member */
+    int total_runs; /* starts at 0 */
+    long total_runtime; /* starts at 0 */
+    cl_kernel get_envelopezicopy_9955; /* the five cl_kernel fields carry the names of the __kernel functions in the embedded program text */
+    cl_kernel get_envelopezicopy_9960;
+    cl_kernel next_chunk_boardzicopy_9965;
+    cl_kernel next_chunk_boardzisegmap_9619;
+    cl_kernel next_chunk_boardzisegmap_9790;
+    int64_t copy_dev_to_dev_total_runtime; /* from here on: <operation>_total_runtime / <operation>_runs pairs, all starting at 0 */
+    int copy_dev_to_dev_runs;
+    int64_t copy_dev_to_host_total_runtime;
+    int copy_dev_to_host_runs;
+    int64_t copy_host_to_dev_total_runtime;
+    int copy_host_to_dev_runs;
+    int64_t copy_scalar_to_dev_total_runtime;
+    int copy_scalar_to_dev_runs;
+    int64_t copy_scalar_from_dev_total_runtime;
+    int copy_scalar_from_dev_runs;
+    int64_t get_envelopezicopy_9955_total_runtime; /* one pair per kernel field above */
+    int get_envelopezicopy_9955_runs;
+    int64_t get_envelopezicopy_9960_total_runtime;
+    int get_envelopezicopy_9960_runs;
+    int64_t next_chunk_boardzicopy_9965_total_runtime;
+    int next_chunk_boardzicopy_9965_runs;
+    int64_t next_chunk_boardzisegmap_9619_total_runtime;
+    int next_chunk_boardzisegmap_9619_runs;
+    int64_t next_chunk_boardzisegmap_9790_total_runtime;
+    int next_chunk_boardzisegmap_9790_runs;
+    cl_mem global_failure; /* buffer created with clCreateBuffer() in init_context_late() */
+    cl_mem global_failure_args; /* presumably backs the kernels' global_failure_args parameter -- TODO confirm */
+    struct opencl_context opencl; /* receives a copy of cfg->opencl and owns the profiling record array */
+    struct sizes sizes;
+    cl_int failure_is_an_option; /* starts at 0 */
+} ;
+/* Query CL_DEVICE_MAX_COMPUTE_UNITS for the device of `ctx`.
+ *
+ * OpenCL defines this property as a cl_uint, so it is read into a cl_uint and
+ * widened afterwards; reading it straight into a size_t would leave the value
+ * in the wrong half of the object on big-endian hosts.  If the query fails
+ * the initial value 0 is returned. */
+static size_t post_opencl_setup_max_compute_units(struct opencl_context *ctx)
+{
+    cl_uint units = 0;
+
+    clGetDeviceInfo(ctx->device, CL_DEVICE_MAX_COMPUTE_UNITS,
+                    sizeof(units), &units, NULL);
+    return (size_t) units;
+}
+
+/* Fill in device-dependent defaults after the OpenCL device has been chosen.
+ *
+ * Only settings that are still 0 are touched, so values supplied by the user
+ * are preserved.  The GPU rules are applied before the CPU rules; for a
+ * device whose type has both bits set, the GPU values therefore win unless
+ * they leave a setting at 0.
+ *
+ * ctx:    OpenCL context whose lockstep_width and cfg.default_* are filled in.
+ * option: the selected device; its platform_name and device_type are read.
+ *
+ * The rules that applied to every platform used to be written as
+ * strstr(platform_name, "") != NULL, which holds for any string, so that test
+ * is omitted here. */
+void post_opencl_setup(struct opencl_context *ctx,
+                       struct opencl_device_option *option)
+{
+    const int is_gpu =
+        (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU;
+    const int is_cpu =
+        (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU;
+
+    if (is_gpu) {
+        /* Platform-specific lockstep widths come first; the generic
+         * fallback of 1 only applies if neither matched. */
+        if (ctx->lockstep_width == 0 &&
+            strstr(option->platform_name, "NVIDIA CUDA") != NULL) {
+            ctx->lockstep_width = 32;
+        }
+        if (ctx->lockstep_width == 0 &&
+            strstr(option->platform_name,
+                   "AMD Accelerated Parallel Processing") != NULL) {
+            ctx->lockstep_width = 32;
+        }
+        if (ctx->lockstep_width == 0) {
+            ctx->lockstep_width = 1;
+        }
+        if (ctx->cfg.default_num_groups == 0) {
+            ctx->cfg.default_num_groups =
+                4 * post_opencl_setup_max_compute_units(ctx);
+        }
+        if (ctx->cfg.default_group_size == 0) {
+            ctx->cfg.default_group_size = 256;
+        }
+        if (ctx->cfg.default_tile_size == 0) {
+            ctx->cfg.default_tile_size = 32;
+        }
+        if (ctx->cfg.default_reg_tile_size == 0) {
+            ctx->cfg.default_reg_tile_size = 2;
+        }
+        if (ctx->cfg.default_threshold == 0) {
+            ctx->cfg.default_threshold = 32768;
+        }
+    }
+
+    if (is_cpu) {
+        if (ctx->lockstep_width == 0) {
+            ctx->lockstep_width = 1;
+        }
+        if (ctx->cfg.default_num_groups == 0) {
+            ctx->cfg.default_num_groups =
+                post_opencl_setup_max_compute_units(ctx);
+        }
+        if (ctx->cfg.default_group_size == 0) {
+            ctx->cfg.default_group_size = 32;
+        }
+        if (ctx->cfg.default_tile_size == 0) {
+            ctx->cfg.default_tile_size = 4;
+        }
+        if (ctx->cfg.default_reg_tile_size == 0) {
+            ctx->cfg.default_reg_tile_size = 1;
+        }
+        if (ctx->cfg.default_threshold == 0) {
+            ctx->cfg.default_threshold =
+                post_opencl_setup_max_compute_units(ctx);
+        }
+    }
+}
+/* Give every field of `ctx` that does not need a live OpenCL device its
+ * starting value: flags are taken from `cfg`, the profiling record array is
+ * allocated, the lock is created and all counters are cleared. */
+static void init_context_early(struct futhark_context_config *cfg,
+                               struct futhark_context *ctx)
+{
+    const struct opencl_config *src = &cfg->opencl;
+
+    /* Settings carried over from the configuration. */
+    ctx->opencl.cfg = *src;
+    ctx->debugging = src->debugging;
+    ctx->detail_memory = src->debugging;
+    ctx->profiling = src->profiling;
+    ctx->logging = src->logging;
+
+    /* Fixed starting state. */
+    ctx->profiling_paused = 0;
+    ctx->failure_is_an_option = 0;
+    ctx->error = NULL;
+    ctx->log = stderr;
+    create_lock(&ctx->lock);
+
+    /* Room for 200 profiling records to begin with.
+     * NOTE(review): the allocation result is not checked here. */
+    ctx->opencl.profiling_records_used = 0;
+    ctx->opencl.profiling_records_capacity = 200;
+    ctx->opencl.profiling_records =
+        malloc(ctx->opencl.profiling_records_capacity *
+               sizeof(struct profiling_record));
+
+    /* Memory and run statistics. */
+    ctx->cur_mem_usage_device = 0;
+    ctx->peak_mem_usage_device = 0;
+    ctx->cur_mem_usage_default = 0;
+    ctx->peak_mem_usage_default = 0;
+    ctx->total_runtime = 0;
+    ctx->total_runs = 0;
+
+    /* Per-operation timing counters: copies first, then kernels. */
+    ctx->copy_dev_to_dev_runs = 0;
+    ctx->copy_dev_to_dev_total_runtime = 0;
+    ctx->copy_dev_to_host_runs = 0;
+    ctx->copy_dev_to_host_total_runtime = 0;
+    ctx->copy_host_to_dev_runs = 0;
+    ctx->copy_host_to_dev_total_runtime = 0;
+    ctx->copy_scalar_to_dev_runs = 0;
+    ctx->copy_scalar_to_dev_total_runtime = 0;
+    ctx->copy_scalar_from_dev_runs = 0;
+    ctx->copy_scalar_from_dev_total_runtime = 0;
+    ctx->get_envelopezicopy_9955_runs = 0;
+    ctx->get_envelopezicopy_9955_total_runtime = 0;
+    ctx->get_envelopezicopy_9960_runs = 0;
+    ctx->get_envelopezicopy_9960_total_runtime = 0;
+    ctx->next_chunk_boardzicopy_9965_runs = 0;
+    ctx->next_chunk_boardzicopy_9965_total_runtime = 0;
+    ctx->next_chunk_boardzisegmap_9619_runs = 0;
+    ctx->next_chunk_boardzisegmap_9619_total_runtime = 0;
+    ctx->next_chunk_boardzisegmap_9790_runs = 0;
+    ctx->next_chunk_boardzisegmap_9790_total_runtime = 0;
+}
+static int init_context_late(struct futhark_context_config *cfg,
+                             struct futhark_context *ctx, cl_program prog)
+{   // Second setup phase: create the failure buffers and the kernels from prog, load sizes, then sync.
+    cl_int error;
+    cl_int no_error = -1;  // Initial contents of global_failure; futhark_context_sync treats values >= 0 as a failure.
+    // global_failure is one cl_int on the device, initialised by copying no_error from the host.
+    ctx->global_failure = clCreateBuffer(ctx->opencl.ctx, CL_MEM_READ_WRITE |
+                                         CL_MEM_COPY_HOST_PTR, sizeof(cl_int),
+                                         &no_error, &error);
+    OPENCL_SUCCEED_OR_RETURN(error);
+    // Buffer for the failure arguments (4 int64_t slots). The +1 is to avoid zero-byte allocations.
+    ctx->global_failure_args = clCreateBuffer(ctx->opencl.ctx,
+                                              CL_MEM_READ_WRITE,
+                                              sizeof(int64_t) * (4 + 1), NULL,
+                                              &error);
+    OPENCL_SUCCEED_OR_RETURN(error);
+    {   // Kernel "get_envelopezicopy_9955"; no arguments are bound at this point.
+        ctx->get_envelopezicopy_9955 = clCreateKernel(prog,
+                                                      "get_envelopezicopy_9955",
+                                                      &error);
+        OPENCL_SUCCEED_FATAL(error);
+        if (ctx->debugging)
+            fprintf(ctx->log, "Created kernel %s.\n", "get_envelope.copy_9955");
+    }
+    {   // Kernel "get_envelopezicopy_9960"; no arguments are bound at this point.
+        ctx->get_envelopezicopy_9960 = clCreateKernel(prog,
+                                                      "get_envelopezicopy_9960",
+                                                      &error);
+        OPENCL_SUCCEED_FATAL(error);
+        if (ctx->debugging)
+            fprintf(ctx->log, "Created kernel %s.\n", "get_envelope.copy_9960");
+    }
+    {   // Kernel "next_chunk_boardzicopy_9965"; no arguments are bound at this point.
+        ctx->next_chunk_boardzicopy_9965 = clCreateKernel(prog,
+                                                          "next_chunk_boardzicopy_9965",
+                                                          &error);
+        OPENCL_SUCCEED_FATAL(error);
+        if (ctx->debugging)
+            fprintf(ctx->log, "Created kernel %s.\n",
+                    "next_chunk_board.copy_9965");
+    }
+    {   // Kernel "next_chunk_boardzisegmap_9619"; bound to both failure buffers (arguments 0 and 2).
+        ctx->next_chunk_boardzisegmap_9619 = clCreateKernel(prog,
+                                                            "next_chunk_boardzisegmap_9619",
+                                                            &error);
+        OPENCL_SUCCEED_FATAL(error);
+        OPENCL_SUCCEED_FATAL(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619,
+                                            0, sizeof(cl_mem),
+                                            &ctx->global_failure));
+        OPENCL_SUCCEED_FATAL(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619,
+                                            2, sizeof(cl_mem),
+                                            &ctx->global_failure_args));
+        if (ctx->debugging)
+            fprintf(ctx->log, "Created kernel %s.\n",
+                    "next_chunk_board.segmap_9619");
+    }
+    {   // Kernel "next_chunk_boardzisegmap_9790"; bound to global_failure only (argument 0).
+        ctx->next_chunk_boardzisegmap_9790 = clCreateKernel(prog,
+                                                            "next_chunk_boardzisegmap_9790",
+                                                            &error);
+        OPENCL_SUCCEED_FATAL(error);
+        OPENCL_SUCCEED_FATAL(clSetKernelArg(ctx->next_chunk_boardzisegmap_9790,
+                                            0, sizeof(cl_mem),
+                                            &ctx->global_failure));
+        if (ctx->debugging)
+            fprintf(ctx->log, "Created kernel %s.\n",
+                    "next_chunk_board.segmap_9790");
+    }
+    ctx->sizes.get_envelopezigroup_sizze_9958 = cfg->sizes[0];  // The five size slots are copied from cfg->sizes by position.
+    ctx->sizes.get_envelopezigroup_sizze_9963 = cfg->sizes[1];
+    ctx->sizes.next_chunk_boardzigroup_sizze_9968 = cfg->sizes[2];
+    ctx->sizes.next_chunk_boardzisegmap_group_sizze_9622 = cfg->sizes[3];
+    ctx->sizes.next_chunk_boardzisegmap_group_sizze_9793 = cfg->sizes[4];
+    init_constants(ctx);  // NOTE(review): the int result is discarded here.
+    // Clear the free list of any deallocations that occurred while initialising constants.
+    OPENCL_SUCCEED_OR_RETURN(opencl_free_all(&ctx->opencl));
+    // The program will be properly freed after all the kernels have also been freed.
+    OPENCL_SUCCEED_OR_RETURN(clReleaseProgram(prog));
+    return futhark_context_sync(ctx);  // On this path the result is the status of the sync.
+}
+struct futhark_context *futhark_context_new(struct futhark_context_config *cfg)
+{
+    // Allocate a context, then run early initialisation, OpenCL setup and
+    // late initialisation, in that order.
+    struct futhark_context *new_ctx = malloc(sizeof *new_ctx);
+    const int no_required_types = 0;
+    cl_program program;
+    // NULL is returned only when the allocation itself fails.
+    if (!new_ctx)
+        return NULL;
+    init_context_early(cfg, new_ctx);
+    program = setup_opencl(&new_ctx->opencl, opencl_program, no_required_types,
+                           cfg->build_opts);
+    // The status of the late phase is not propagated; the context is
+    // returned regardless.
+    init_context_late(cfg, new_ctx, program);
+    return new_ctx;
+}
+struct futhark_context *futhark_context_new_with_command_queue(struct futhark_context_config *cfg,
+                                                               cl_command_queue queue)
+{
+    // Allocate a context and initialise it in three steps (early phase,
+    // OpenCL setup, late phase), setting OpenCL up with the given queue.
+    struct futhark_context *new_ctx = malloc(sizeof *new_ctx);
+    const int no_required_types = 0;
+    cl_program program;
+    // NULL is returned only when the allocation itself fails.
+    if (!new_ctx)
+        return NULL;
+    init_context_early(cfg, new_ctx);
+    program = setup_opencl_with_command_queue(&new_ctx->opencl, queue,
+                                              opencl_program,
+                                              no_required_types,
+                                              cfg->build_opts);
+    // The status of the late phase is not propagated; the context is
+    // returned regardless.
+    init_context_late(cfg, new_ctx, program);
+    return new_ctx;
+}
+void futhark_context_free(struct futhark_context *ctx)
+{   // Tear the context down: constants, lock, the five kernels, the OpenCL state, then ctx itself.
+    free_constants(ctx);
+    free_lock(&ctx->lock);
+    OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->get_envelopezicopy_9955));
+    OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->get_envelopezicopy_9960));
+    OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->next_chunk_boardzicopy_9965));
+    OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->next_chunk_boardzisegmap_9619));
+    OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->next_chunk_boardzisegmap_9790));
+    teardown_opencl(&ctx->opencl);  // NOTE(review): global_failure / global_failure_args are not released in this function; confirm teardown covers them.
+    free(ctx);  // ctx is invalid after this point.
+}
+int futhark_context_sync(struct futhark_context *ctx)
+{
+    // Wait for the command queue to finish. Returns 0 when no failure was read back, and 1 when a
+    // kernel failure index (>= 0) was found; case 0 stores a formatted message in ctx->error.
+    cl_int failure_idx = -1;
+    if (ctx->failure_is_an_option) {
+        OPENCL_SUCCEED_OR_RETURN(clEnqueueReadBuffer(ctx->opencl.queue,
+                                                     ctx->global_failure,
+                                                     CL_FALSE, 0,
+                                                     sizeof(cl_int),
+                                                     &failure_idx, 0, NULL,
+                                                     ctx->profiling_paused ||
+                                                     !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
+                                                                                               &ctx->copy_scalar_from_dev_runs,
+                                                                                               &ctx->copy_scalar_from_dev_total_runtime)));
+        ctx->failure_is_an_option = 0;
+    }
+    OPENCL_SUCCEED_OR_RETURN(clFinish(ctx->opencl.queue));
+    if (failure_idx >= 0) {
+        cl_int no_failure = -1;
+        // Put the device-side flag back to "no failure" before decoding this one.
+        OPENCL_SUCCEED_OR_RETURN(clEnqueueWriteBuffer(ctx->opencl.queue,
+                                                      ctx->global_failure,
+                                                      CL_TRUE, 0,
+                                                      sizeof(cl_int),
+                                                      &no_failure, 0, NULL,
+                                                      NULL));
+        int64_t args[4 + 1];  // Same byte size as the global_failure_args buffer.
+        OPENCL_SUCCEED_OR_RETURN(clEnqueueReadBuffer(ctx->opencl.queue,
+                                                     ctx->global_failure_args,
+                                                     CL_TRUE, 0, sizeof(args),
+                                                     &args, 0, NULL,
+                                                     ctx->profiling_paused ||
+                                                     !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
+                                                                                               &ctx->copy_dev_to_host_runs,
+                                                                                               &ctx->copy_dev_to_host_total_runtime)));
+        switch (failure_idx) {
+            // int64_t need not be long long, so each value is cast to match %lld.
+          case 0:
+            {
+                ctx->error =
+                    msgprintf("Index [%lld, %lld] out of bounds for array of shape [%lld][%lld].\n-> #0  gol.fut:27:36-55\n   #1  /prelude/soacs.fut:59:3-10\n   #2  /prelude/array.fut:195:3-17\n   #3  /prelude/functional.fut:39:59-65\n   #4  /prelude/soacs.fut:59:3-10\n   #5  /prelude/array.fut:203:3-34\n   #6  gol.fut:18:5-27:56\n   #7  gol.fut:31:27-66\n   #8  gol.fut:30:1-40:43\n",
+                              (long long) args[0], (long long) args[1],
+                              (long long) args[2], (long long) args[3]);
+                break;
+            }
+        }
+        return 1;
+    }
+    return 0;
+}
+cl_command_queue futhark_context_get_command_queue(struct futhark_context *ctx)
+{   // Accessor: returns the command queue stored in the context's OpenCL state.
+    return ctx->opencl.queue;
+}
+static int memblock_unref_device(struct futhark_context *ctx,
+                                 struct memblock_device *block, const
+                                 char *desc)
+{
+    // Drop one reference to a device block; the last reference frees it.
+    if (block->references == NULL)
+        return 0;
+    --*block->references;
+    if (ctx->detail_memory)
+        fprintf(ctx->log,
+                "Unreferencing block %s (allocated as %s) in %s: %d references remaining.\n",
+                desc, block->desc, "space 'device'", *block->references);
+    if (*block->references == 0) {
+        ctx->cur_mem_usage_device -= block->size;
+        OPENCL_SUCCEED_OR_RETURN(opencl_free(&ctx->opencl, block->mem, desc));
+        free(block->references);
+        if (ctx->detail_memory)
+            fprintf(ctx->log, "%lld bytes freed (now allocated: %lld bytes)\n",
+                    (long long) block->size,
+                    (long long) ctx->cur_mem_usage_device);
+    }
+    // Whether or not the block was freed, this handle no longer refers to it.
+    block->references = NULL;
+    return 0;
+}
+static int memblock_alloc_device(struct futhark_context *ctx,
+                                 struct memblock_device *block, int64_t size,
+                                 const char *desc)
+{
+    // Point block at a fresh device allocation of size bytes holding one reference; whatever it
+    // referred to before is unreferenced first. Returns the status of that unreference step.
+    if (size < 0)
+        futhark_panic(1,
+                      "Negative allocation of %lld bytes attempted for %s in %s.\n",
+                      (long long) size, desc, "space 'device'");
+    int ret = memblock_unref_device(ctx, block, desc);
+    // Usage (and possibly the peak) is updated before the allocation itself is attempted.
+    ctx->cur_mem_usage_device += size;
+    if (ctx->detail_memory)
+        fprintf(ctx->log,
+                "Allocating %lld bytes for %s in %s (then allocated: %lld bytes)",
+                (long long) size, desc, "space 'device'",
+                (long long) ctx->cur_mem_usage_device);
+    if (ctx->cur_mem_usage_device > ctx->peak_mem_usage_device) {
+        ctx->peak_mem_usage_device = ctx->cur_mem_usage_device;
+        if (ctx->detail_memory)
+            fprintf(ctx->log, " (new peak).\n");
+    } else if (ctx->detail_memory)
+        fprintf(ctx->log, ".\n");
+    OPENCL_SUCCEED_OR_RETURN(opencl_alloc(&ctx->opencl, size, desc,
+                                          &block->mem));
+    block->references = (int *) malloc(sizeof(int));  // NOTE(review): result is not checked before the store below.
+    *block->references = 1;
+    block->size = size;
+    block->desc = desc;
+    return ret;
+}
+static int memblock_set_device(struct futhark_context *ctx,
+                               struct memblock_device *lhs,
+                               struct memblock_device *rhs, const
+                               char *lhs_desc)
+{
+    // Release whatever lhs held, then make lhs share rhs's block (one more reference).
+    const int status = memblock_unref_device(ctx, lhs, lhs_desc);
+    if (rhs->references != NULL)
+        ++*rhs->references;
+    *lhs = *rhs;
+    return status;
+}
+static int memblock_unref(struct futhark_context *ctx, struct memblock *block,
+                          const char *desc)
+{
+    // Drop one reference to a host block; the last reference frees its memory.
+    if (block->references == NULL)
+        return 0;
+    --*block->references;
+    if (ctx->detail_memory)
+        fprintf(ctx->log,
+                "Unreferencing block %s (allocated as %s) in %s: %d references remaining.\n",
+                desc, block->desc, "default space", *block->references);
+    if (*block->references == 0) {
+        ctx->cur_mem_usage_default -= block->size;
+        free(block->mem);
+        free(block->references);
+        if (ctx->detail_memory)
+            fprintf(ctx->log, "%lld bytes freed (now allocated: %lld bytes)\n",
+                    (long long) block->size,
+                    (long long) ctx->cur_mem_usage_default);
+    }
+    block->references = NULL;  // This handle no longer refers to any block.
+    return 0;
+}
+static int memblock_alloc(struct futhark_context *ctx, struct memblock *block,
+                          int64_t size, const char *desc)
+{
+    // Point block at a fresh host allocation of size bytes holding one reference; whatever it
+    // referred to before is unreferenced first. Returns the status of that unreference step.
+    if (size < 0)
+        futhark_panic(1,
+                      "Negative allocation of %lld bytes attempted for %s in %s.\n",
+                      (long long) size, desc, "default space");
+    int ret = memblock_unref(ctx, block, desc);
+    // Usage (and possibly the peak) is updated before the allocation itself is attempted.
+    ctx->cur_mem_usage_default += size;
+    if (ctx->detail_memory)
+        fprintf(ctx->log,
+                "Allocating %lld bytes for %s in %s (then allocated: %lld bytes)",
+                (long long) size, desc, "default space",
+                (long long) ctx->cur_mem_usage_default);
+    if (ctx->cur_mem_usage_default > ctx->peak_mem_usage_default) {
+        ctx->peak_mem_usage_default = ctx->cur_mem_usage_default;
+        if (ctx->detail_memory)
+            fprintf(ctx->log, " (new peak).\n");
+    } else if (ctx->detail_memory)
+        fprintf(ctx->log, ".\n");
+    block->mem = (char *) malloc(size);
+    block->references = (int *) malloc(sizeof(int));  // NOTE(review): neither malloc result is checked.
+    *block->references = 1;
+    block->size = size;
+    block->desc = desc;
+    return ret;
+}
+static int memblock_set(struct futhark_context *ctx, struct memblock *lhs,
+                        struct memblock *rhs, const char *lhs_desc)
+{
+    // Release whatever lhs held, then make lhs share rhs's block (one more reference).
+    const int status = memblock_unref(ctx, lhs, lhs_desc);
+    if (rhs->references != NULL)
+        ++*rhs->references;
+    *lhs = *rhs;
+    return status;
+}
+int futhark_get_num_sizes(void)
+{   // Number of entries in the size_names table.
+    return sizeof size_names / sizeof *size_names;
+}
+const char *futhark_get_size_name(int i)
+{   // Returns entry i of size_names. No bounds check is done, so i must be a valid index.
+    return size_names[i];
+}
+const char *futhark_get_size_class(int i)
+{   // Returns entry i of size_classes. No bounds check is done, so i must be a valid index.
+    return size_classes[i];
+}
+char *futhark_context_report(struct futhark_context *ctx)
+{   // Sync the context, then build a text report: peak device memory plus, when profiling, per-operation timings.
+    if (futhark_context_sync(ctx) != 0)  // No report is produced when the sync reports a problem.
+        return NULL;
+    // The report text is accumulated in a str_builder and its string is returned at the end.
+    struct str_builder builder;
+    // NOTE(review): ownership of the returned string is not visible here; presumably the caller frees it.
+    str_builder_init(&builder);
+    if (ctx->detail_memory || ctx->profiling || ctx->logging) {  // Only the 'device' space peak is reported.
+        str_builder(&builder,
+                    "Peak memory usage for space 'device': %lld bytes.\n",
+                    (long long) ctx->peak_mem_usage_device);
+        { }
+    }
+    if (ctx->profiling) {  // One line per operation: run count, average and total runtime.
+        OPENCL_SUCCEED_FATAL(opencl_tally_profiling_records(&ctx->opencl));
+        str_builder(&builder,
+                    "copy_dev_to_dev              ran %5d times; avg: %8ldus; total: %8ldus\n",
+                    ctx->copy_dev_to_dev_runs,
+                    (long) ctx->copy_dev_to_dev_total_runtime /
+                    (ctx->copy_dev_to_dev_runs !=
+                     0 ? ctx->copy_dev_to_dev_runs : 1),  // Divisor falls back to 1 so a zero run count never divides by zero.
+                    (long) ctx->copy_dev_to_dev_total_runtime);
+        ctx->total_runtime += ctx->copy_dev_to_dev_total_runtime;  // NOTE(review): totals are added on every call and not reset in this function.
+        ctx->total_runs += ctx->copy_dev_to_dev_runs;
+        str_builder(&builder,
+                    "copy_dev_to_host             ran %5d times; avg: %8ldus; total: %8ldus\n",
+                    ctx->copy_dev_to_host_runs,
+                    (long) ctx->copy_dev_to_host_total_runtime /
+                    (ctx->copy_dev_to_host_runs !=
+                     0 ? ctx->copy_dev_to_host_runs : 1),
+                    (long) ctx->copy_dev_to_host_total_runtime);
+        ctx->total_runtime += ctx->copy_dev_to_host_total_runtime;
+        ctx->total_runs += ctx->copy_dev_to_host_runs;
+        str_builder(&builder,
+                    "copy_host_to_dev             ran %5d times; avg: %8ldus; total: %8ldus\n",
+                    ctx->copy_host_to_dev_runs,
+                    (long) ctx->copy_host_to_dev_total_runtime /
+                    (ctx->copy_host_to_dev_runs !=
+                     0 ? ctx->copy_host_to_dev_runs : 1),
+                    (long) ctx->copy_host_to_dev_total_runtime);
+        ctx->total_runtime += ctx->copy_host_to_dev_total_runtime;
+        ctx->total_runs += ctx->copy_host_to_dev_runs;
+        str_builder(&builder,
+                    "copy_scalar_to_dev           ran %5d times; avg: %8ldus; total: %8ldus\n",
+                    ctx->copy_scalar_to_dev_runs,
+                    (long) ctx->copy_scalar_to_dev_total_runtime /
+                    (ctx->copy_scalar_to_dev_runs !=
+                     0 ? ctx->copy_scalar_to_dev_runs : 1),
+                    (long) ctx->copy_scalar_to_dev_total_runtime);
+        ctx->total_runtime += ctx->copy_scalar_to_dev_total_runtime;
+        ctx->total_runs += ctx->copy_scalar_to_dev_runs;
+        str_builder(&builder,
+                    "copy_scalar_from_dev         ran %5d times; avg: %8ldus; total: %8ldus\n",
+                    ctx->copy_scalar_from_dev_runs,
+                    (long) ctx->copy_scalar_from_dev_total_runtime /
+                    (ctx->copy_scalar_from_dev_runs !=
+                     0 ? ctx->copy_scalar_from_dev_runs : 1),
+                    (long) ctx->copy_scalar_from_dev_total_runtime);
+        ctx->total_runtime += ctx->copy_scalar_from_dev_total_runtime;
+        ctx->total_runs += ctx->copy_scalar_from_dev_runs;
+        str_builder(&builder,
+                    "get_envelope.copy_9955       ran %5d times; avg: %8ldus; total: %8ldus\n",
+                    ctx->get_envelopezicopy_9955_runs,
+                    (long) ctx->get_envelopezicopy_9955_total_runtime /
+                    (ctx->get_envelopezicopy_9955_runs !=
+                     0 ? ctx->get_envelopezicopy_9955_runs : 1),
+                    (long) ctx->get_envelopezicopy_9955_total_runtime);
+        ctx->total_runtime += ctx->get_envelopezicopy_9955_total_runtime;
+        ctx->total_runs += ctx->get_envelopezicopy_9955_runs;
+        str_builder(&builder,
+                    "get_envelope.copy_9960       ran %5d times; avg: %8ldus; total: %8ldus\n",
+                    ctx->get_envelopezicopy_9960_runs,
+                    (long) ctx->get_envelopezicopy_9960_total_runtime /
+                    (ctx->get_envelopezicopy_9960_runs !=
+                     0 ? ctx->get_envelopezicopy_9960_runs : 1),
+                    (long) ctx->get_envelopezicopy_9960_total_runtime);
+        ctx->total_runtime += ctx->get_envelopezicopy_9960_total_runtime;
+        ctx->total_runs += ctx->get_envelopezicopy_9960_runs;
+        str_builder(&builder,
+                    "next_chunk_board.copy_9965   ran %5d times; avg: %8ldus; total: %8ldus\n",
+                    ctx->next_chunk_boardzicopy_9965_runs,
+                    (long) ctx->next_chunk_boardzicopy_9965_total_runtime /
+                    (ctx->next_chunk_boardzicopy_9965_runs !=
+                     0 ? ctx->next_chunk_boardzicopy_9965_runs : 1),
+                    (long) ctx->next_chunk_boardzicopy_9965_total_runtime);
+        ctx->total_runtime += ctx->next_chunk_boardzicopy_9965_total_runtime;
+        ctx->total_runs += ctx->next_chunk_boardzicopy_9965_runs;
+        str_builder(&builder,
+                    "next_chunk_board.segmap_9619 ran %5d times; avg: %8ldus; total: %8ldus\n",
+                    ctx->next_chunk_boardzisegmap_9619_runs,
+                    (long) ctx->next_chunk_boardzisegmap_9619_total_runtime /
+                    (ctx->next_chunk_boardzisegmap_9619_runs !=
+                     0 ? ctx->next_chunk_boardzisegmap_9619_runs : 1),
+                    (long) ctx->next_chunk_boardzisegmap_9619_total_runtime);
+        ctx->total_runtime += ctx->next_chunk_boardzisegmap_9619_total_runtime;
+        ctx->total_runs += ctx->next_chunk_boardzisegmap_9619_runs;
+        str_builder(&builder,
+                    "next_chunk_board.segmap_9790 ran %5d times; avg: %8ldus; total: %8ldus\n",
+                    ctx->next_chunk_boardzisegmap_9790_runs,
+                    (long) ctx->next_chunk_boardzisegmap_9790_total_runtime /
+                    (ctx->next_chunk_boardzisegmap_9790_runs !=
+                     0 ? ctx->next_chunk_boardzisegmap_9790_runs : 1),
+                    (long) ctx->next_chunk_boardzisegmap_9790_total_runtime);
+        ctx->total_runtime += ctx->next_chunk_boardzisegmap_9790_total_runtime;
+        ctx->total_runs += ctx->next_chunk_boardzisegmap_9790_runs;
+        str_builder(&builder, "%d operations with cumulative runtime: %6ldus\n",  // Summary line over all operations above.
+                    ctx->total_runs, ctx->total_runtime);
+    }
+    return builder.str;
+}
+char *futhark_context_get_error(struct futhark_context *ctx)
+{
+    // Hand the stored error message (possibly NULL) to the caller and clear the slot.
+    char *pending = ctx->error;
+    ctx->error = NULL;
+    return pending;
+}
+void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f)
+{   // Replace the stream that the context's fprintf(ctx->log, ...) messages are written to.
+    ctx->log = f;
+}
+void futhark_context_pause_profiling(struct futhark_context *ctx)
+{   // Set the paused flag; while it is set, enqueue calls pass NULL instead of a profiling event.
+    ctx->profiling_paused = 1;
+}
+void futhark_context_unpause_profiling(struct futhark_context *ctx)
+{   // Clear the paused flag; profiling events are requested again if ctx->profiling is set.
+    ctx->profiling_paused = 0;
+}
+int futhark_context_clear_caches(struct futhark_context *ctx)
+{   // Under the context lock: reset both peak-usage counters and call opencl_free_all.
+    lock_lock(&ctx->lock);
+    ctx->peak_mem_usage_device = 0;
+    ctx->peak_mem_usage_default = 0;
+    if (ctx->error == NULL)  // Skipped when an earlier error is still stored in the context.
+        ctx->error = OPENCL_SUCCEED_NONFATAL(opencl_free_all(&ctx->opencl));
+    lock_unlock(&ctx->lock);
+    return ctx->error != NULL;  // 1 if an error is stored (new or pre-existing), else 0; read after unlocking.
+}
+static int futrts_get_envelope(struct futhark_context *ctx,
+                               struct memblock_device *out_mem_p_9970,
+                               struct memblock_device chunk_board_mem_9941,
+                               int64_t n_9485);  // Forward declaration; the definition follows later in this file.
+static int futrts_next_chunk_board(struct futhark_context *ctx,
+                                   struct memblock_device *out_mem_p_9981,
+                                   struct memblock_device chunk_board_mem_9941,
+                                   struct memblock_device envelope_board_mem_9942,
+                                   int64_t n_9500, int64_t m_9501);  // Forward declaration only; no body at this point.
+static int init_constants(struct futhark_context *ctx)
+{
+    // No constants are set up in this function, so there is no failure
+    // path: ctx is deliberately unused and the status is always 0.
+    (void) ctx;
+
+    const int status = 0;
+
+    return status;
+}
+static int free_constants(struct futhark_context *ctx)
+{   // Nothing is released in this function; it always returns 0.
+    (void) ctx;  // ctx is intentionally unused.
+    return 0;
+}
+static int futrts_get_envelope(struct futhark_context *ctx,
+                               struct memblock_device *out_mem_p_9970,
+                               struct memblock_device chunk_board_mem_9941,
+                               int64_t n_9485)
+{
+    (void) ctx;
+    
+    int err = 0;
+    struct memblock_device out_mem_9954;
+    
+    out_mem_9954.references = NULL;
+    
+    bool y_9487 = slt64((int64_t) 0, n_9485);
+    bool index_certs_9488;
+    
+    if (!y_9487) {
+        ctx->error = msgprintf("Error: %s%lld%s%lld%s\n\nBacktrace:\n%s",
+                               "Index [", (int64_t) 0,
+                               "] out of bounds for array of shape [", n_9485,
+                               "].",
+                               "-> #0  gol.fut:43:17-30\n   #1  gol.fut:42:1-48:33\n");
+        if (memblock_unref_device(ctx, &out_mem_9954, "out_mem_9954") != 0)
+            return 1;
+        return 1;
+    }
+    
+    int64_t i_9490 = sub64(n_9485, (int64_t) 1);
+    bool x_9491 = sle64((int64_t) 0, i_9490);
+    bool y_9492 = slt64(i_9490, n_9485);
+    bool bounds_check_9493 = x_9491 && y_9492;
+    bool index_certs_9494;
+    
+    if (!bounds_check_9493) {
+        ctx->error = msgprintf("Error: %s%lld%s%lld%s\n\nBacktrace:\n%s",
+                               "Index [", i_9490,
+                               "] out of bounds for array of shape [", n_9485,
+                               "].",
+                               "-> #0  gol.fut:44:17-32\n   #1  gol.fut:42:1-48:33\n");
+        if (memblock_unref_device(ctx, &out_mem_9954, "out_mem_9954") != 0)
+            return 1;
+        return 1;
+    }
+    
+    int64_t bytes_9942 = (int64_t) 4 * n_9485;
+    struct memblock_device mem_9943;
+    
+    mem_9943.references = NULL;
+    if (memblock_alloc_device(ctx, &mem_9943, bytes_9942, "mem_9943")) {
+        err = 1;
+        goto cleanup;
+    }
+    if (n_9485 > 0) {
+        OPENCL_SUCCEED_OR_RETURN(clEnqueueCopyBuffer(ctx->opencl.queue,
+                                                     chunk_board_mem_9941.mem,
+                                                     mem_9943.mem, (int64_t) 0,
+                                                     (int64_t) 0, n_9485, 0,
+                                                     NULL,
+                                                     ctx->profiling_paused ||
+                                                     !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
+                                                                                               &ctx->copy_dev_to_dev_runs,
+                                                                                               &ctx->copy_dev_to_dev_total_runtime)));
+        if (ctx->debugging)
+            OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue));
+    }
+    
+    int64_t group_sizze_9958;
+    
+    group_sizze_9958 = ctx->sizes.get_envelopezigroup_sizze_9958;
+    
+    int64_t num_groups_9959;
+    
+    num_groups_9959 = sdiv_up64(n_9485, group_sizze_9958);
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->get_envelopezicopy_9955, 0,
+                                            sizeof(n_9485), &n_9485));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->get_envelopezicopy_9955, 1,
+                                            sizeof(i_9490), &i_9490));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->get_envelopezicopy_9955, 2,
+                                            sizeof(chunk_board_mem_9941.mem),
+                                            &chunk_board_mem_9941.mem));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->get_envelopezicopy_9955, 3,
+                                            sizeof(mem_9943.mem),
+                                            &mem_9943.mem));
+    if (1 * ((size_t) num_groups_9959 * (size_t) group_sizze_9958) != 0) {
+        const size_t global_work_sizze_9971[1] = {(size_t) num_groups_9959 *
+                     (size_t) group_sizze_9958};
+        const size_t local_work_sizze_9975[1] = {group_sizze_9958};
+        int64_t time_start_9972 = 0, time_end_9973 = 0;
+        
+        if (ctx->debugging) {
+            fprintf(ctx->log, "Launching %s with global work size [",
+                    "get_envelope.copy_9955");
+            fprintf(ctx->log, "%zu", global_work_sizze_9971[0]);
+            fprintf(ctx->log, "] and local work size [");
+            fprintf(ctx->log, "%zu", local_work_sizze_9975[0]);
+            fprintf(ctx->log, "]; local memory parameters sum to %d bytes.\n",
+                    (int) 0);
+            time_start_9972 = get_wall_time();
+        }
+        OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->opencl.queue,
+                                                        ctx->get_envelopezicopy_9955,
+                                                        1, NULL,
+                                                        global_work_sizze_9971,
+                                                        local_work_sizze_9975,
+                                                        0, NULL,
+                                                        ctx->profiling_paused ||
+                                                        !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
+                                                                                                  &ctx->get_envelopezicopy_9955_runs,
+                                                                                                  &ctx->get_envelopezicopy_9955_total_runtime)));
+        if (ctx->debugging) {
+            OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue));
+            time_end_9973 = get_wall_time();
+            
+            long time_diff_9974 = time_end_9973 - time_start_9972;
+            
+            fprintf(ctx->log, "kernel %s runtime: %ldus\n",
+                    "get_envelope.copy_9955", time_diff_9974);
+        }
+    }
+    if (n_9485 > 0) {
+        OPENCL_SUCCEED_OR_RETURN(clEnqueueCopyBuffer(ctx->opencl.queue,
+                                                     chunk_board_mem_9941.mem,
+                                                     mem_9943.mem, i_9490 *
+                                                     n_9485, (int64_t) 2 *
+                                                     n_9485, n_9485, 0, NULL,
+                                                     ctx->profiling_paused ||
+                                                     !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
+                                                                                               &ctx->copy_dev_to_dev_runs,
+                                                                                               &ctx->copy_dev_to_dev_total_runtime)));
+        if (ctx->debugging)
+            OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue));
+    }
+    
+    int64_t group_sizze_9963;
+    
+    group_sizze_9963 = ctx->sizes.get_envelopezigroup_sizze_9963;
+    
+    int64_t num_groups_9964;
+    
+    num_groups_9964 = sdiv_up64(n_9485, group_sizze_9963);
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->get_envelopezicopy_9960, 0,
+                                            sizeof(n_9485), &n_9485));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->get_envelopezicopy_9960, 1,
+                                            sizeof(chunk_board_mem_9941.mem),
+                                            &chunk_board_mem_9941.mem));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->get_envelopezicopy_9960, 2,
+                                            sizeof(mem_9943.mem),
+                                            &mem_9943.mem));
+    if (1 * ((size_t) num_groups_9964 * (size_t) group_sizze_9963) != 0) {
+        const size_t global_work_sizze_9976[1] = {(size_t) num_groups_9964 *
+                     (size_t) group_sizze_9963};
+        const size_t local_work_sizze_9980[1] = {group_sizze_9963};
+        int64_t time_start_9977 = 0, time_end_9978 = 0;
+        
+        if (ctx->debugging) {
+            fprintf(ctx->log, "Launching %s with global work size [",
+                    "get_envelope.copy_9960");
+            fprintf(ctx->log, "%zu", global_work_sizze_9976[0]);
+            fprintf(ctx->log, "] and local work size [");
+            fprintf(ctx->log, "%zu", local_work_sizze_9980[0]);
+            fprintf(ctx->log, "]; local memory parameters sum to %d bytes.\n",
+                    (int) 0);
+            time_start_9977 = get_wall_time();
+        }
+        OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->opencl.queue,
+                                                        ctx->get_envelopezicopy_9960,
+                                                        1, NULL,
+                                                        global_work_sizze_9976,
+                                                        local_work_sizze_9980,
+                                                        0, NULL,
+                                                        ctx->profiling_paused ||
+                                                        !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
+                                                                                                  &ctx->get_envelopezicopy_9960_runs,
+                                                                                                  &ctx->get_envelopezicopy_9960_total_runtime)));
+        if (ctx->debugging) {
+            OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue));
+            time_end_9978 = get_wall_time();
+            
+            long time_diff_9979 = time_end_9978 - time_start_9977;
+            
+            fprintf(ctx->log, "kernel %s runtime: %ldus\n",
+                    "get_envelope.copy_9960", time_diff_9979);
+        }
+    }
+    if (memblock_set_device(ctx, &out_mem_9954, &mem_9943, "mem_9943") != 0)
+        return 1;
+    (*out_mem_p_9970).references = NULL;
+    if (memblock_set_device(ctx, &*out_mem_p_9970, &out_mem_9954,
+                            "out_mem_9954") != 0)
+        return 1;
+    if (memblock_unref_device(ctx, &mem_9943, "mem_9943") != 0)
+        return 1;
+    if (memblock_unref_device(ctx, &out_mem_9954, "out_mem_9954") != 0)
+        return 1;
+    
+  cleanup:
+    { }
+    return err;
+}
+// Device-side implementation behind futhark_entry_next_chunk_board.
+//
+// Enqueues three kernels in order:
+//   1. next_chunk_board.segmap_9619 - given n, m, the chunk block and the
+//      envelope block, writes into a scratch block of size m * m (mem_9945).
+//   2. next_chunk_board.segmap_9790 - given m and mem_9945, writes into a
+//      second block of size m * m (mem_9948).
+//   3. next_chunk_board.copy_9965   - given n, m and mem_9948, writes into a
+//      block of size n * n (mem_9950); presumably this materialises the
+//      [1:n+1, 1:n+1] slice that is bounds-checked just before it - confirm
+//      against the kernel source.
+// mem_9950 is then handed to the caller through out_mem_p_9981.
+//
+// Parameters:
+//   ctx                     - context providing the queue, kernels, tuning
+//                             sizes, log and error slot.
+//   out_mem_p_9981          - receives the result block.
+//   chunk_board_mem_9941    - bound to argument 5 of segmap_9619.
+//   envelope_board_mem_9942 - bound to argument 6 of segmap_9619.
+//   n_9500, m_9501          - sizes; the slice check requires n == 0 or
+//                             0 <= n < m.
+//
+// Returns 0 on success.  Returns 1 when an allocation, a memblock operation or
+// the slice bounds check fails.  OPENCL_SUCCEED_OR_RETURN is defined outside
+// this view; presumably it also returns early on an OpenCL error - confirm.
+static int futrts_next_chunk_board(struct futhark_context *ctx,
+                                   struct memblock_device *out_mem_p_9981,
+                                   struct memblock_device chunk_board_mem_9941,
+                                   struct memblock_device envelope_board_mem_9942,
+                                   int64_t n_9500, int64_t m_9501)
+{
+    (void) ctx;
+    
+    int err = 0;
+    struct memblock_device out_mem_9954;
+    
+    out_mem_9954.references = NULL;
+    
+    // Both intermediate blocks are sized m * m; the work-group size comes from
+    // the context's tuning parameters and the group count is rounded up.
+    int64_t nest_sizze_9733 = m_9501 * m_9501;
+    int64_t segmap_group_sizze_9734;
+    
+    segmap_group_sizze_9734 =
+        ctx->sizes.next_chunk_boardzisegmap_group_sizze_9622;
+    
+    int64_t segmap_usable_groups_9735 = sdiv_up64(nest_sizze_9733,
+                                                  segmap_group_sizze_9734);
+    struct memblock_device mem_9945;
+    
+    mem_9945.references = NULL;
+    if (memblock_alloc_device(ctx, &mem_9945, nest_sizze_9733, "mem_9945")) {
+        err = 1;
+        goto cleanup;
+    }
+    if (ctx->debugging)
+        fprintf(ctx->log, "%s\n", "\n# SegMap");
+    // Kernel 1 arguments.  Arguments 0 and 2 are not set in this function.
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619,
+                                            1,
+                                            sizeof(ctx->failure_is_an_option),
+                                            &ctx->failure_is_an_option));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619,
+                                            3, sizeof(n_9500), &n_9500));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619,
+                                            4, sizeof(m_9501), &m_9501));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619,
+                                            5, sizeof(chunk_board_mem_9941.mem),
+                                            &chunk_board_mem_9941.mem));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619,
+                                            6,
+                                            sizeof(envelope_board_mem_9942.mem),
+                                            &envelope_board_mem_9942.mem));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619,
+                                            7, sizeof(mem_9945.mem),
+                                            &mem_9945.mem));
+    // The launch is skipped entirely when the global work size is zero.
+    if (1 * ((size_t) segmap_usable_groups_9735 *
+             (size_t) segmap_group_sizze_9734) != 0) {
+        const size_t global_work_sizze_9982[1] =
+                     {(size_t) segmap_usable_groups_9735 *
+                     (size_t) segmap_group_sizze_9734};
+        const size_t local_work_sizze_9986[1] = {segmap_group_sizze_9734};
+        int64_t time_start_9983 = 0, time_end_9984 = 0;
+        
+        if (ctx->debugging) {
+            fprintf(ctx->log, "Launching %s with global work size [",
+                    "next_chunk_board.segmap_9619");
+            fprintf(ctx->log, "%zu", global_work_sizze_9982[0]);
+            fprintf(ctx->log, "] and local work size [");
+            fprintf(ctx->log, "%zu", local_work_sizze_9986[0]);
+            fprintf(ctx->log, "]; local memory parameters sum to %d bytes.\n",
+                    (int) 0);
+            time_start_9983 = get_wall_time();
+        }
+        // A profiling event is requested only when profiling is on and not paused.
+        OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->opencl.queue,
+                                                        ctx->next_chunk_boardzisegmap_9619,
+                                                        1, NULL,
+                                                        global_work_sizze_9982,
+                                                        local_work_sizze_9986,
+                                                        0, NULL,
+                                                        ctx->profiling_paused ||
+                                                        !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
+                                                                                                  &ctx->next_chunk_boardzisegmap_9619_runs,
+                                                                                                  &ctx->next_chunk_boardzisegmap_9619_total_runtime)));
+        // In debugging mode the queue is drained so the wall-clock time covers
+        // the kernel's execution.
+        if (ctx->debugging) {
+            OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue));
+            time_end_9984 = get_wall_time();
+            
+            long time_diff_9985 = time_end_9984 - time_start_9983;
+            
+            fprintf(ctx->log, "kernel %s runtime: %ldus\n",
+                    "next_chunk_board.segmap_9619", time_diff_9985);
+        }
+    }
+    // Set whether or not the kernel above was launched.  futhark_values_i8_2d
+    // consults this flag to choose a non-blocking read followed by
+    // futhark_context_sync.
+    ctx->failure_is_an_option = 1;
+    
+    int64_t segmap_group_sizze_9865;
+    
+    segmap_group_sizze_9865 =
+        ctx->sizes.next_chunk_boardzisegmap_group_sizze_9793;
+    
+    int64_t segmap_usable_groups_9866 = sdiv_up64(nest_sizze_9733,
+                                                  segmap_group_sizze_9865);
+    struct memblock_device mem_9948;
+    
+    mem_9948.references = NULL;
+    // Jumping to cleanup here does not release mem_9945: the cleanup label
+    // below is empty.
+    if (memblock_alloc_device(ctx, &mem_9948, nest_sizze_9733, "mem_9948")) {
+        err = 1;
+        goto cleanup;
+    }
+    if (ctx->debugging)
+        fprintf(ctx->log, "%s\n", "\n# SegMap");
+    // Kernel 2 arguments.  Argument 0 is not set in this function.
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9790,
+                                            1, sizeof(m_9501), &m_9501));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9790,
+                                            2, sizeof(mem_9945.mem),
+                                            &mem_9945.mem));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9790,
+                                            3, sizeof(mem_9948.mem),
+                                            &mem_9948.mem));
+    if (1 * ((size_t) segmap_usable_groups_9866 *
+             (size_t) segmap_group_sizze_9865) != 0) {
+        const size_t global_work_sizze_9987[1] =
+                     {(size_t) segmap_usable_groups_9866 *
+                     (size_t) segmap_group_sizze_9865};
+        const size_t local_work_sizze_9991[1] = {segmap_group_sizze_9865};
+        int64_t time_start_9988 = 0, time_end_9989 = 0;
+        
+        if (ctx->debugging) {
+            fprintf(ctx->log, "Launching %s with global work size [",
+                    "next_chunk_board.segmap_9790");
+            fprintf(ctx->log, "%zu", global_work_sizze_9987[0]);
+            fprintf(ctx->log, "] and local work size [");
+            fprintf(ctx->log, "%zu", local_work_sizze_9991[0]);
+            fprintf(ctx->log, "]; local memory parameters sum to %d bytes.\n",
+                    (int) 0);
+            time_start_9988 = get_wall_time();
+        }
+        OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->opencl.queue,
+                                                        ctx->next_chunk_boardzisegmap_9790,
+                                                        1, NULL,
+                                                        global_work_sizze_9987,
+                                                        local_work_sizze_9991,
+                                                        0, NULL,
+                                                        ctx->profiling_paused ||
+                                                        !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
+                                                                                                  &ctx->next_chunk_boardzisegmap_9790_runs,
+                                                                                                  &ctx->next_chunk_boardzisegmap_9790_total_runtime)));
+        if (ctx->debugging) {
+            OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue));
+            time_end_9989 = get_wall_time();
+            
+            long time_diff_9990 = time_end_9989 - time_start_9988;
+            
+            fprintf(ctx->log, "kernel %s runtime: %ldus\n",
+                    "next_chunk_board.segmap_9790", time_diff_9990);
+        }
+    }
+    // The first scratch block is dropped once kernel 2 has been enqueued.
+    if (memblock_unref_device(ctx, &mem_9945, "mem_9945") != 0)
+        return 1;
+    
+    // Bounds check for the slice [1:n+1, 1:n+1] of an m-by-m array (the
+    // expression at gol.fut:40:8-31 named in the error message).  It passes
+    // when n == 0, or when 1 <= n + 1 and 0 <= n < m.  The same condition
+    // covers both dimensions, hence ok_or_empty && ok_or_empty.
+    int64_t j_9597 = add64((int64_t) 1, n_9500);
+    bool empty_slice_9598 = n_9500 == (int64_t) 0;
+    bool zzero_leq_i_p_m_t_s_9599 = sle64((int64_t) 0, n_9500);
+    bool i_p_m_t_s_leq_w_9600 = slt64(n_9500, m_9501);
+    bool i_lte_j_9601 = sle64((int64_t) 1, j_9597);
+    bool y_9602 = zzero_leq_i_p_m_t_s_9599 && i_p_m_t_s_leq_w_9600;
+    bool y_9603 = i_lte_j_9601 && y_9602;
+    bool ok_or_empty_9604 = empty_slice_9598 || y_9603;
+    bool index_ok_9605 = ok_or_empty_9604 && ok_or_empty_9604;
+    // Declared but never read or written in this function.
+    bool index_certs_9606;
+    
+    if (!index_ok_9605) {
+        ctx->error =
+            msgprintf("Error: %s%lld%s%lld%s%lld%s%lld%s%lld%s%lld%s\n\nBacktrace:\n%s",
+                      "Index [", (int64_t) 1, ":", j_9597, ", ", (int64_t) 1,
+                      ":", j_9597, "] out of bounds for array of shape [",
+                      m_9501, "][", m_9501, "].",
+                      "-> #0  gol.fut:40:8-31\n   #1  gol.fut:30:1-40:43\n");
+        if (memblock_unref_device(ctx, &mem_9948, "mem_9948") != 0)
+            return 1;
+        // NOTE(review): mem_9945 was already unreferenced above; presumably
+        // memblock_unref_device tolerates a released block - confirm.
+        if (memblock_unref_device(ctx, &mem_9945, "mem_9945") != 0)
+            return 1;
+        if (memblock_unref_device(ctx, &out_mem_9954, "out_mem_9954") != 0)
+            return 1;
+        return 1;
+    }
+    
+    // Result block of size n * n.
+    int64_t bytes_9949 = n_9500 * n_9500;
+    struct memblock_device mem_9950;
+    
+    mem_9950.references = NULL;
+    // Jumping to cleanup here does not release mem_9948 (empty cleanup label).
+    if (memblock_alloc_device(ctx, &mem_9950, bytes_9949, "mem_9950")) {
+        err = 1;
+        goto cleanup;
+    }
+    
+    int64_t group_sizze_9968;
+    
+    group_sizze_9968 = ctx->sizes.next_chunk_boardzigroup_sizze_9968;
+    
+    int64_t num_groups_9969;
+    
+    num_groups_9969 = sdiv_up64(n_9500 * n_9500, group_sizze_9968);
+    // Kernel 3 arguments: n, m, source block, destination block.
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzicopy_9965, 0,
+                                            sizeof(n_9500), &n_9500));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzicopy_9965, 1,
+                                            sizeof(m_9501), &m_9501));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzicopy_9965, 2,
+                                            sizeof(mem_9948.mem),
+                                            &mem_9948.mem));
+    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzicopy_9965, 3,
+                                            sizeof(mem_9950.mem),
+                                            &mem_9950.mem));
+    if (1 * ((size_t) num_groups_9969 * (size_t) group_sizze_9968) != 0) {
+        const size_t global_work_sizze_9992[1] = {(size_t) num_groups_9969 *
+                     (size_t) group_sizze_9968};
+        const size_t local_work_sizze_9996[1] = {group_sizze_9968};
+        int64_t time_start_9993 = 0, time_end_9994 = 0;
+        
+        if (ctx->debugging) {
+            fprintf(ctx->log, "Launching %s with global work size [",
+                    "next_chunk_board.copy_9965");
+            fprintf(ctx->log, "%zu", global_work_sizze_9992[0]);
+            fprintf(ctx->log, "] and local work size [");
+            fprintf(ctx->log, "%zu", local_work_sizze_9996[0]);
+            fprintf(ctx->log, "]; local memory parameters sum to %d bytes.\n",
+                    (int) 0);
+            time_start_9993 = get_wall_time();
+        }
+        OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->opencl.queue,
+                                                        ctx->next_chunk_boardzicopy_9965,
+                                                        1, NULL,
+                                                        global_work_sizze_9992,
+                                                        local_work_sizze_9996,
+                                                        0, NULL,
+                                                        ctx->profiling_paused ||
+                                                        !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
+                                                                                                  &ctx->next_chunk_boardzicopy_9965_runs,
+                                                                                                  &ctx->next_chunk_boardzicopy_9965_total_runtime)));
+        if (ctx->debugging) {
+            OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue));
+            time_end_9994 = get_wall_time();
+            
+            long time_diff_9995 = time_end_9994 - time_start_9993;
+            
+            fprintf(ctx->log, "kernel %s runtime: %ldus\n",
+                    "next_chunk_board.copy_9965", time_diff_9995);
+        }
+    }
+    if (memblock_unref_device(ctx, &mem_9948, "mem_9948") != 0)
+        return 1;
+    // Hand the result over: mem_9950 -> out_mem_9954 -> *out_mem_p_9981.  The
+    // caller's slot is cleared first, so whatever it held is not released here.
+    if (memblock_set_device(ctx, &out_mem_9954, &mem_9950, "mem_9950") != 0)
+        return 1;
+    (*out_mem_p_9981).references = NULL;
+    if (memblock_set_device(ctx, &*out_mem_p_9981, &out_mem_9954,
+                            "out_mem_9954") != 0)
+        return 1;
+    // Final release of every local block.  NOTE(review): mem_9948 and mem_9945
+    // were already unreferenced earlier; presumably the repeat is a no-op -
+    // confirm against memblock_unref_device.
+    if (memblock_unref_device(ctx, &mem_9950, "mem_9950") != 0)
+        return 1;
+    if (memblock_unref_device(ctx, &mem_9948, "mem_9948") != 0)
+        return 1;
+    if (memblock_unref_device(ctx, &mem_9945, "mem_9945") != 0)
+        return 1;
+    if (memblock_unref_device(ctx, &out_mem_9954, "out_mem_9954") != 0)
+        return 1;
+    
+    // Reached by fall-through on success (err == 0) and by the allocation
+    // failure paths (err == 1); nothing is released here.
+  cleanup:
+    { }
+    return err;
+}
+// Two-dimensional array of i8 values as handled by the C API: a device memory
+// block plus the two extents recorded when the array was created.
+struct futhark_i8_2d {
+    struct memblock_device mem; // device block holding the elements
+    int64_t shape[2];           // shape[0] = dim0, shape[1] = dim1
+} ;
+// Allocate a dim0-by-dim1 array of i8 on the device and upload `data` into it.
+//
+// ctx        - context; its lock is held while the device block is allocated
+//              and written.
+// data       - host buffer of at least dim0 * dim1 bytes; not read when that
+//              product is zero.
+// dim0, dim1 - extents stored in the returned array's shape.
+//
+// Returns the new array, or NULL when the host struct or the device block
+// cannot be allocated.  On a device-allocation failure the lock is released
+// and the host struct is freed before returning.
+struct futhark_i8_2d *futhark_new_i8_2d(struct futhark_context *ctx, const
+                                        int8_t *data, int64_t dim0,
+                                        int64_t dim1)
+{
+    // Kept although it looks redundant: OPENCL_SUCCEED_OR_RETURN is defined
+    // outside this function and presumably returns `bad` - confirm before
+    // removing.
+    struct futhark_i8_2d *bad = NULL;
+    const size_t nbytes = (size_t) (dim0 * dim1) * 1;
+    struct futhark_i8_2d *arr =
+        (struct futhark_i8_2d *) malloc(sizeof *arr);
+
+    if (arr == NULL)
+        return bad;
+    lock_lock(&ctx->lock);
+    arr->mem.references = NULL;
+    if (memblock_alloc_device(ctx, &arr->mem, nbytes, "arr->mem")) {
+        // Do not leave the context locked or leak the host-side struct.
+        lock_unlock(&ctx->lock);
+        free(arr);
+        return bad;
+    }
+    arr->shape[0] = dim0;
+    arr->shape[1] = dim1;
+    // Blocking (CL_TRUE) host-to-device write.
+    // NOTE(review): the event is accounted to the copy_dev_to_host counters
+    // although this is a host-to-device transfer - confirm whether the
+    // counter names are intentionally this way round.
+    // NOTE(review): if OPENCL_SUCCEED_OR_RETURN returns early, the lock and
+    // `arr` are presumably still held; that needs a change to the macro or
+    // an explicit error check here.
+    if (nbytes > 0) {
+        OPENCL_SUCCEED_OR_RETURN(clEnqueueWriteBuffer(ctx->opencl.queue,
+                                                      arr->mem.mem, CL_TRUE, 0,
+                                                      nbytes, data + 0, 0, NULL,
+                                                      ctx->profiling_paused ||
+                                                      !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
+                                                                                                &ctx->copy_dev_to_host_runs,
+                                                                                                &ctx->copy_dev_to_host_total_runtime)));
+    }
+    lock_unlock(&ctx->lock);
+    return arr;
+}
+// Allocate a dim0-by-dim1 array of i8 on the device and fill it by copying
+// from an existing OpenCL buffer.
+//
+// ctx        - context; its lock is held while the device block is allocated
+//              and the copy is enqueued.
+// data       - source buffer.
+// offset     - byte offset into `data` at which the copy starts.
+// dim0, dim1 - extents stored in the returned array's shape.
+//
+// Returns the new array, or NULL when the host struct or the device block
+// cannot be allocated.  On a device-allocation failure the lock is released
+// and the host struct is freed before returning.
+struct futhark_i8_2d *futhark_new_raw_i8_2d(struct futhark_context *ctx, const
+                                            cl_mem data, int offset,
+                                            int64_t dim0, int64_t dim1)
+{
+    // Kept although it looks redundant: OPENCL_SUCCEED_OR_RETURN is defined
+    // outside this function and presumably returns `bad` - confirm before
+    // removing.
+    struct futhark_i8_2d *bad = NULL;
+    const size_t nbytes = (size_t) (dim0 * dim1) * 1;
+    struct futhark_i8_2d *arr =
+        (struct futhark_i8_2d *) malloc(sizeof *arr);
+
+    if (arr == NULL)
+        return bad;
+    lock_lock(&ctx->lock);
+    arr->mem.references = NULL;
+    if (memblock_alloc_device(ctx, &arr->mem, nbytes, "arr->mem")) {
+        // Do not leave the context locked or leak the host-side struct.
+        lock_unlock(&ctx->lock);
+        free(arr);
+        return bad;
+    }
+    arr->shape[0] = dim0;
+    arr->shape[1] = dim1;
+    if (nbytes > 0) {
+        // Device-to-device copy: source offset `offset`, destination offset 0.
+        // NOTE(review): if OPENCL_SUCCEED_OR_RETURN returns early, the lock
+        // and `arr` are presumably still held - confirm against the macro.
+        OPENCL_SUCCEED_OR_RETURN(clEnqueueCopyBuffer(ctx->opencl.queue, data,
+                                                     arr->mem.mem, offset, 0,
+                                                     nbytes,
+                                                     0, NULL,
+                                                     ctx->profiling_paused ||
+                                                     !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
+                                                                                               &ctx->copy_dev_to_dev_runs,
+                                                                                               &ctx->copy_dev_to_dev_total_runtime)));
+        // In debugging mode, wait for the copy to complete before returning.
+        if (ctx->debugging)
+            OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue));
+    }
+    lock_unlock(&ctx->lock);
+    return arr;
+}
+// Release the device block owned by `arr` and free the host-side struct.
+//
+// Returns 0 on success.  Returns 1 when releasing the device block fails; in
+// that case `arr` itself is not freed.  The context lock is released on both
+// paths.
+int futhark_free_i8_2d(struct futhark_context *ctx, struct futhark_i8_2d *arr)
+{
+    lock_lock(&ctx->lock);
+    const int unref_failed =
+        memblock_unref_device(ctx, &arr->mem, "arr->mem") != 0;
+    lock_unlock(&ctx->lock);
+
+    if (unref_failed)
+        return 1;
+    free(arr);
+    return 0;
+}
+// Copy the contents of `arr` from the device into the host buffer `data`.
+//
+// data must have room for shape[0] * shape[1] bytes; nothing is read when
+// that product is zero.  While ctx->failure_is_an_option is set the read is
+// enqueued non-blocking and followed by futhark_context_sync; otherwise the
+// read itself blocks.
+//
+// Returns 0 on success and 1 when futhark_context_sync reports failure.  The
+// context lock is released on both of those paths.
+int futhark_values_i8_2d(struct futhark_context *ctx, struct futhark_i8_2d *arr,
+                         int8_t *data)
+{
+    int ret = 0;
+    size_t nbytes;
+
+    lock_lock(&ctx->lock);
+    nbytes = (size_t) (arr->shape[0] * arr->shape[1]) * 1;
+    if (nbytes > 0) {
+        // NOTE(review): the event is accounted to the copy_host_to_dev
+        // counters although this is a device-to-host read - confirm whether
+        // the counter names are intentionally this way round.
+        // NOTE(review): an early return from OPENCL_SUCCEED_OR_RETURN would
+        // presumably still leave the lock held - confirm against the macro.
+        OPENCL_SUCCEED_OR_RETURN(clEnqueueReadBuffer(ctx->opencl.queue,
+                                                     arr->mem.mem,
+                                                     ctx->failure_is_an_option ? CL_FALSE : CL_TRUE,
+                                                     0,
+                                                     nbytes, data + 0, 0, NULL,
+                                                     ctx->profiling_paused ||
+                                                     !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
+                                                                                               &ctx->copy_host_to_dev_runs,
+                                                                                               &ctx->copy_host_to_dev_total_runtime)));
+        if (ctx->failure_is_an_option && futhark_context_sync(ctx) != 0)
+            ret = 1;
+    }
+    lock_unlock(&ctx->lock);
+    return ret;
+}
+// Return the OpenCL buffer handle that backs `arr`.  No data is copied and the
+// context is not consulted.
+cl_mem futhark_values_raw_i8_2d(struct futhark_context *ctx,
+                                struct futhark_i8_2d *arr)
+{
+    cl_mem device_buffer = arr->mem.mem;
+
+    (void) ctx;
+    return device_buffer;
+}
+// Return a pointer to the two extents stored inside `arr`.  The pointer refers
+// to the array's own storage; the context is not consulted.
+const int64_t *futhark_shape_i8_2d(struct futhark_context *ctx,
+                                   struct futhark_i8_2d *arr)
+{
+    const int64_t *extents = &arr->shape[0];
+
+    (void) ctx;
+    return extents;
+}
+// Entry point get_envelope: run futrts_get_envelope on a square board and wrap
+// the resulting device block in a new [4][n] array.
+//
+// ctx  - context; its lock is held for the whole call.
+// out0 - on success receives a newly malloc'ed array of shape [4][n]; left
+//        untouched on failure.
+// in0  - input board; must satisfy shape[0] == shape[1].
+//
+// Returns 0 on success and non-zero otherwise.  A shape mismatch sets
+// ctx->error (unless one is already pending).  If the result struct cannot be
+// allocated the result block is released and 1 is returned; ctx->error is
+// left as it was so that no further allocation is attempted.
+int futhark_entry_get_envelope(struct futhark_context *ctx,
+                               struct futhark_i8_2d **out0, const
+                               struct futhark_i8_2d *in0)
+{
+    struct memblock_device chunk_board_mem_9941;
+    struct memblock_device out_mem_9954;
+    int64_t n_9485;
+    int ret = 0;
+
+    chunk_board_mem_9941.references = NULL;
+    out_mem_9954.references = NULL;
+
+    lock_lock(&ctx->lock);
+    chunk_board_mem_9941 = in0->mem;
+    n_9485 = in0->shape[1];
+    if (n_9485 != in0->shape[0]) {
+        // Both dimensions are bound to the same size parameter n.
+        ret = 1;
+        if (!ctx->error)
+            ctx->error =
+                msgprintf("Error: entry point arguments have invalid sizes.\n");
+    } else {
+        ret = futrts_get_envelope(ctx, &out_mem_9954, chunk_board_mem_9941,
+                                  n_9485);
+        if (ret == 0) {
+            // Allocate outside of assert(): with NDEBUG an assert argument is
+            // not evaluated, which would leave *out0 unset.
+            struct futhark_i8_2d *res =
+                (struct futhark_i8_2d *) malloc(sizeof *res);
+
+            if (res == NULL) {
+                (void) memblock_unref_device(ctx, &out_mem_9954,
+                                             "out_mem_9954");
+                ret = 1;
+            } else {
+                res->mem = out_mem_9954;
+                res->shape[0] = 4;
+                res->shape[1] = n_9485;
+                *out0 = res;
+            }
+        }
+    }
+    lock_unlock(&ctx->lock);
+    return ret;
+}
+// Entry point next_chunk_board: run futrts_next_chunk_board on a chunk and its
+// envelope and wrap the resulting device block in a new [n][n] array.
+//
+// ctx  - context; its lock is held for the whole call.
+// out0 - on success receives a newly malloc'ed array of shape [n][n]; left
+//        untouched on failure.
+// in0  - chunk board; must satisfy shape[0] == shape[1] (this is n).
+// in1  - envelope board; must satisfy shape[0] == 4, and shape[1] is m.
+//
+// Returns 0 on success and non-zero otherwise.  A shape mismatch sets
+// ctx->error (unless one is already pending).  If the result struct cannot be
+// allocated the result block is released and 1 is returned; ctx->error is
+// left as it was so that no further allocation is attempted.
+int futhark_entry_next_chunk_board(struct futhark_context *ctx,
+                                   struct futhark_i8_2d **out0, const
+                                   struct futhark_i8_2d *in0, const
+                                   struct futhark_i8_2d *in1)
+{
+    struct memblock_device chunk_board_mem_9941;
+    struct memblock_device envelope_board_mem_9942;
+    struct memblock_device out_mem_9954;
+    int64_t n_9500;
+    int64_t m_9501;
+    int ret = 0;
+
+    chunk_board_mem_9941.references = NULL;
+    envelope_board_mem_9942.references = NULL;
+    out_mem_9954.references = NULL;
+
+    lock_lock(&ctx->lock);
+    chunk_board_mem_9941 = in0->mem;
+    n_9500 = in0->shape[1];
+    envelope_board_mem_9942 = in1->mem;
+    m_9501 = in1->shape[1];
+    if (n_9500 != in0->shape[0] || in1->shape[0] != 4) {
+        // in0 must be square and in1 must have exactly four rows.
+        ret = 1;
+        if (!ctx->error)
+            ctx->error =
+                msgprintf("Error: entry point arguments have invalid sizes.\n");
+    } else {
+        ret = futrts_next_chunk_board(ctx, &out_mem_9954, chunk_board_mem_9941,
+                                      envelope_board_mem_9942, n_9500, m_9501);
+        if (ret == 0) {
+            // Allocate outside of assert(): with NDEBUG an assert argument is
+            // not evaluated, which would leave *out0 unset.
+            struct futhark_i8_2d *res =
+                (struct futhark_i8_2d *) malloc(sizeof *res);
+
+            if (res == NULL) {
+                (void) memblock_unref_device(ctx, &out_mem_9954,
+                                             "out_mem_9954");
+                ret = 1;
+            } else {
+                res->mem = out_mem_9954;
+                res->shape[0] = n_9500;
+                res->shape[1] = n_9500;
+                *out0 = res;
+            }
+        }
+    }
+    lock_unlock(&ctx->lock);
+    return ret;
+}
diff --git a/futmpi/gol.fut b/futmpi/gol.fut
new file mode 100644
index 0000000000000000000000000000000000000000..25eae32c3d2eff33780351abcfe311ac6e6b6200
--- /dev/null
+++ b/futmpi/gol.fut
@@ -0,0 +1,48 @@
+-- For every cell of a square board, sum the values of its eight neighbours.
+--
+-- Each neighbour direction is obtained by rotating the board cyclically along
+-- rows and/or columns, so cells on one border see the cells on the opposite
+-- border.  The cell's own value is left out of the sum.
+let count_neighbours [n] (board: [n][n]i8) : [n][n]i8 =
+    -- Whole-row shifts.
+    let up = rotate (-1) board
+    let down = rotate 1 board
+    -- Shifts within each row.
+    let right = map (rotate 1) board
+    let left = map (rotate (-1)) board
+
+    -- Diagonals: shift the row-shifted boards within each row.
+    let up_right = map (rotate 1) up
+    let up_left = map (rotate (-1)) up
+    let down_right = map (rotate 1) down
+    let down_left = map (rotate (-1)) down
+
+    in map3 (\(ul_row, u_row, ur_row) (l_row, c_row, r_row) (dl_row, d_row, dr_row) ->
+                 map3 (\(ul, u, ur) (l, _, r) (dl, d, dr) ->
+                           ul + u + ur + l + r + dl + d + dr)
+                      (zip3 ul_row u_row ur_row)
+                      (zip3 l_row c_row r_row)
+                      (zip3 dl_row d_row dr_row))
+            (zip3 up_left up up_right)
+            (zip3 left board right)
+            (zip3 down_left down down_right)
+
+-- Build an m-by-m board whose outer ring comes from `envelope_board` and whose
+-- interior comes from `chunk_board`.
+--
+-- Envelope row 0 supplies the top edge, row 1 the right edge, row 2 the bottom
+-- edge and row 3 the left edge.  The tests run in that order, so each corner
+-- is taken from the first case that matches it.
+-- NOTE(review): the interior lookup chunk_board[row-1, col-1] only lines up
+-- when m == n + 2; nothing here enforces that.
+let augment_board [n][m] (chunk_board :[n][n]i8) (envelope_board: [4][m]i8): [m][m]i8 =
+    let edge = m - 1
+    in tabulate_2d m m (\row col ->
+           if row == 0 then envelope_board[0, col]
+           else if col == edge then envelope_board[1, row]
+           else if row == edge then envelope_board[2, col]
+           else if col == 0 then envelope_board[3, row]
+           else chunk_board[row - 1, col - 1])
+
+
+-- Advance one chunk of the board by one generation.
+--
+-- The chunk is first surrounded by the envelope (see augment_board), the
+-- neighbour sums are computed on that padded board, and the life rule is
+-- applied to every padded cell: a cell equal to 1 stays 1 with two or three
+-- live neighbours, a cell equal to 0 becomes 1 with exactly three, and
+-- everything else becomes 0.  Only rows and columns 1..n of the result are
+-- returned, which drops the padding ring.
+entry next_chunk_board [n][m] (chunk_board :[n][n]i8) (envelope_board: [4][m]i8) :[n][n]i8 =
+    let padded = augment_board chunk_board envelope_board
+    let counts = count_neighbours padded
+    let stepped =
+        map2 (\cells_r counts_r ->
+                  map2 (\cell alive_nb ->
+                            let survives = cell == 1 && (alive_nb == 2 || alive_nb == 3)
+                            let is_born = cell == 0 && alive_nb == 3
+                            in if survives || is_born then 1 else 0)
+                       cells_r counts_r)
+             padded counts
+    in stepped[1:n+1, 1:n+1] :> [n][n]i8
+
+-- Extract the four edges of a chunk as a [4][n] array, in the order
+-- top row, right column, bottom row, left column.
+entry get_envelope [n] (chunk_board: [n][n]i8): [4][n]i8 =
+    let top = chunk_board[0]
+    let bottom = chunk_board[n-1]
+    -- Rows of the transposed board are the columns of the original.
+    let columns = transpose chunk_board
+    let right = columns[n-1]
+    let left = columns[0]
+    in [top, right, bottom, left]
diff --git a/futmpi/gol.h b/futmpi/gol.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d300c14e4b8472bf7f626c29741a99a92542ed0
--- /dev/null
+++ b/futmpi/gol.h
@@ -0,0 +1,122 @@
+#pragma once
+
+// Headers
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <float.h>
+#define CL_TARGET_OPENCL_VERSION 120
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#ifdef __APPLE__
+#define CL_SILENCE_DEPRECATION
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Initialisation
+
+struct futhark_context_config ;
+struct futhark_context_config *futhark_context_config_new(void);
+void futhark_context_config_free(struct futhark_context_config *cfg);
+void futhark_context_config_add_build_option(struct futhark_context_config *cfg,
+                                             const char *opt);
+void futhark_context_config_set_debugging(struct futhark_context_config *cfg,
+                                          int flag);
+void futhark_context_config_set_profiling(struct futhark_context_config *cfg,
+                                          int flag);
+void futhark_context_config_set_logging(struct futhark_context_config *cfg,
+                                        int flag);
+void futhark_context_config_set_device(struct futhark_context_config *cfg, const
+                                       char *s);
+void futhark_context_config_set_platform(struct futhark_context_config *cfg,
+                                         const char *s);
+void
+futhark_context_config_select_device_interactively(struct futhark_context_config *cfg);
+void futhark_context_config_list_devices(struct futhark_context_config *cfg);
+void futhark_context_config_dump_program_to(struct futhark_context_config *cfg,
+                                            const char *path);
+void
+futhark_context_config_load_program_from(struct futhark_context_config *cfg,
+                                         const char *path);
+void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg,
+                                           const char *path);
+void futhark_context_config_load_binary_from(struct futhark_context_config *cfg,
+                                             const char *path);
+void
+futhark_context_config_set_default_group_size(struct futhark_context_config *cfg,
+                                              int size);
+void
+futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg,
+                                              int num);
+void
+futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg,
+                                             int num);
+void
+futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg,
+                                                 int num);
+void
+futhark_context_config_set_default_threshold(struct futhark_context_config *cfg,
+                                             int num);
+int futhark_context_config_set_size(struct futhark_context_config *cfg, const
+                                    char *size_name, size_t size_value);
+struct futhark_context ;
+struct futhark_context *futhark_context_new(struct futhark_context_config *cfg);
+struct futhark_context
+*futhark_context_new_with_command_queue(struct futhark_context_config *cfg,
+                                        cl_command_queue queue);
+void futhark_context_free(struct futhark_context *ctx);
+cl_command_queue futhark_context_get_command_queue(struct futhark_context *ctx);
+int futhark_get_num_sizes(void);
+const char *futhark_get_size_name(int);
+const char *futhark_get_size_class(int);
+
+// Arrays
+
+// Opaque two-dimensional array of i8 values stored in device memory.
+struct futhark_i8_2d ;
+// Allocate a dim0-by-dim1 device array and upload dim0 * dim1 bytes from the
+// host buffer `data`.  Returns NULL on failure.
+struct futhark_i8_2d *futhark_new_i8_2d(struct futhark_context *ctx, const
+                                        int8_t *data, int64_t dim0,
+                                        int64_t dim1);
+// Allocate a dim0-by-dim1 device array and fill it by copying dim0 * dim1
+// bytes from the OpenCL buffer `data`, starting at byte `offset`.  Returns
+// NULL on failure.
+struct futhark_i8_2d *futhark_new_raw_i8_2d(struct futhark_context *ctx, const
+                                            cl_mem data, int offset,
+                                            int64_t dim0, int64_t dim1);
+// Release the array's device block and free `arr`.  Returns 0 on success and
+// 1 if the device block cannot be released.
+int futhark_free_i8_2d(struct futhark_context *ctx, struct futhark_i8_2d *arr);
+// Copy the array's shape[0] * shape[1] bytes from the device into `data`.
+// Returns 0 on success and non-zero on failure.
+int futhark_values_i8_2d(struct futhark_context *ctx, struct futhark_i8_2d *arr,
+                         int8_t *data);
+// Return the OpenCL buffer backing `arr`; no data is copied.
+cl_mem futhark_values_raw_i8_2d(struct futhark_context *ctx,
+                                struct futhark_i8_2d *arr);
+// Return a pointer to the array's two extents (dim0, dim1).  The pointer
+// refers to storage inside `arr`.
+const int64_t *futhark_shape_i8_2d(struct futhark_context *ctx,
+                                   struct futhark_i8_2d *arr);
+
+// Opaque values
+
+
+// Entry points
+
+// get_envelope: `in0` must be square (shape[0] == shape[1] == n).  On success
+// returns 0 and stores a newly allocated [4][n] array in *out0; otherwise
+// returns non-zero.  NOTE(review): the result is presumably released with
+// futhark_free_i8_2d - confirm.
+int futhark_entry_get_envelope(struct futhark_context *ctx,
+                               struct futhark_i8_2d **out0, const
+                               struct futhark_i8_2d *in0);
+// next_chunk_board: `in0` must be square (n by n) and `in1` must have exactly
+// four rows.  On success returns 0 and stores a newly allocated [n][n] array
+// in *out0; otherwise returns non-zero.
+int futhark_entry_next_chunk_board(struct futhark_context *ctx,
+                                   struct futhark_i8_2d **out0, const
+                                   struct futhark_i8_2d *in0, const
+                                   struct futhark_i8_2d *in1);
+
+// Miscellaneous
+
+int futhark_context_sync(struct futhark_context *ctx);
+char *futhark_context_report(struct futhark_context *ctx);
+char *futhark_context_get_error(struct futhark_context *ctx);
+void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f);
+void futhark_context_pause_profiling(struct futhark_context *ctx);
+void futhark_context_unpause_profiling(struct futhark_context *ctx);
+int futhark_context_clear_caches(struct futhark_context *ctx);
+#define FUTHARK_BACKEND_opencl
+#ifdef __cplusplus
+}
+#endif
diff --git a/futmpi/main.c b/futmpi/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..ab2413acc9b2a7600b88ea85d6324bb1d803ab18
--- /dev/null
+++ b/futmpi/main.c
@@ -0,0 +1,346 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <mpi.h>
+#include <math.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include "gol.h"
+#include "gfx.h"
+
+#define BOARD_N 800
+
+#define INDEX_2D_TO_1D(y, x, nb_columns) ((y) * (nb_columns) + (x)) // row-major offset of cell (y, x)
+
+#define NORTH_INDEX 0
+#define EAST_INDEX 1
+#define SOUTH_INDEX 2
+#define WEST_INDEX 3
+
+#define NORTH_ROW_TAG 0
+#define EAST_COLUMN_TAG 1
+#define SOUTH_ROW_TAG 2
+#define WEST_COLUMN_TAG 3
+
+#define NORTH_EAST_CELL_TAG 4
+#define SOUTH_EAST_CELL_TAG 5
+#define SOUTH_WEST_CELL_TAG 6
+#define NORTH_WEST_CELL_TAG 7
+
+#define CHUNK_BOARD_TAG 8
+
+//void printChunkBoard(int8_t *chunkBoard, int n1, int n2) {
+//    for (int i = 0; i < n1; ++i) {
+//        for (int j = 0; j < n2; ++j) {
+//            printf("%d ", chunkBoard[INDEX_2D_TO_1D(i, j, n2)]);
+//        }
+//        printf("\n");
+//    }
+//}
+
+// Builds a periodic (wrap-around in both directions) gridN x gridN Cartesian communicator over MPI_COMM_WORLD
+// and its row / column sub-communicators. Returns gridN = (int) sqrt(nProc); nProc is expected to be a square.
+int createGridCommunicators(MPI_Comm *cartComm, MPI_Comm *rowComm, MPI_Comm *colComm, int nProc) {
+    int gridN = (int) sqrt(nProc);
+    int dimensions[2] = {gridN, gridN};
+    int periods[2] = {true, true}; // both dimensions wrap around
+
+    // reorder = 0 keeps Cartesian ranks equal to MPI_COMM_WORLD ranks. The rest of this file addresses diagonal
+    // neighbours and board chunks by world rank (row * gridN + column), so MPI must not renumber the processes.
+    MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 0, cartComm);
+
+    int keepColumnAxis[2] = {false, true}; // row communicator: processes sharing a grid row
+    MPI_Cart_sub(*cartComm, keepColumnAxis, rowComm);
+    int keepRowAxis[2] = {true, false}; // column communicator: processes sharing a grid column
+    MPI_Cart_sub(*cartComm, keepRowAxis, colComm);
+    return gridN;
+}
+
+// Returns a calloc'ed array {y0, x0, y1, x1, ...}: the top-left cell of each of the nProc chunks, left to right.
+int *divideBoard(int n, int chunkN, int nProc) {
+    int *origins = calloc((size_t) nProc * 2, sizeof(int));
+    int row = 0, column = 0;
+    for (int rank = 0; rank < nProc; ++rank) {
+        origins[2 * rank] = row;
+        origins[2 * rank + 1] = column;
+        column += chunkN;
+        if (column < n) continue; // still inside the current band of chunks
+        column = 0;
+        row += chunkN;
+    }
+    return origins;
+}
+
+// Seeds every cell of the chunkN x chunkN chunk with rand() % 2, row by row.
+void initChunkBoard(int8_t *chunkBoard, int chunkN) {
+    for (int row = 0; row < chunkN; ++row) {
+        int8_t *cells = &chunkBoard[INDEX_2D_TO_1D(row, 0, chunkN)];
+        for (int column = 0; column < chunkN; ++column) { cells[column] = rand() % 2; }
+    }
+}
+
+// Exchanges this chunk's border cells with its 8 neighbours on the periodic process grid and assembles the
+// received cells into chunkBoardEnvelope.
+//   chunkBoardMyEnvelope: 4 x chunkN input, rows ordered NORTH, EAST, SOUTH, WEST (this chunk's own borders).
+//   chunkBoardEnvelope:   4 x chunkM output in the same row order; edge cells land in columns 1..chunkN and the
+//                         diagonal neighbours' cells in columns 0 and chunkM - 1 (main passes chunkM = chunkN + 2).
+//   coordinates:          {row, column} of this process; neighbour ranks in colComm / rowComm are derived from it.
+// Diagonal neighbours are addressed in MPI_COMM_WORLD as row * gridN + column, which assumes world ranks are laid
+// out row-major on the grid. All 16 requests are completed before the function returns.
+void shareAndBuildEnvelope(int8_t *chunkBoardMyEnvelope, int8_t *chunkBoardEnvelope, MPI_Comm rowComm,
+                           MPI_Comm colComm, int gridN, int coordinates[2], int chunkN, int chunkM) {
+    int coordinateY = coordinates[0];
+    int coordinateX = coordinates[1];
+    MPI_Request requests[16] = {0};
+    int iRequest = 0;
+
+    // North
+    {
+        int8_t *chunkBoardMyEnvelopeNorth = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(NORTH_INDEX, 0, chunkN)];
+        int8_t *chunkBoardEnvelopeNorth = &chunkBoardEnvelope[INDEX_2D_TO_1D(NORTH_INDEX, 1, chunkM)];
+        int destSource = (coordinateY - 1) < 0 ? (gridN - 1) : (coordinateY - 1);
+        MPI_Isend(chunkBoardMyEnvelopeNorth, chunkN, MPI_INT8_T, destSource, NORTH_ROW_TAG, colComm,
+                  &requests[iRequest++]);
+        /* The north neighbour sends its south row, which is our north envelope */
+        MPI_Irecv(chunkBoardEnvelopeNorth, chunkN, MPI_INT8_T, destSource, SOUTH_ROW_TAG, colComm,
+                  &requests[iRequest++]);
+    }
+
+    // East
+    {
+        int8_t *chunkBoardMyEnvelopeEast = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(EAST_INDEX, 0, chunkN)];
+        int8_t *chunkBoardEnvelopeEast = &chunkBoardEnvelope[INDEX_2D_TO_1D(EAST_INDEX, 1, chunkM)];
+        int destSource = (coordinateX + 1) % gridN;
+        MPI_Isend(chunkBoardMyEnvelopeEast, chunkN, MPI_INT8_T, destSource, EAST_COLUMN_TAG, rowComm,
+                  &requests[iRequest++]);
+        /* The east neighbour sends its west column, which is our east envelope */
+        MPI_Irecv(chunkBoardEnvelopeEast, chunkN, MPI_INT8_T, destSource, WEST_COLUMN_TAG, rowComm,
+                  &requests[iRequest++]);
+    }
+
+    // South
+    {
+        int8_t *chunkBoardMyEnvelopeSouth = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(SOUTH_INDEX, 0, chunkN)];
+        int8_t *chunkBoardEnvelopeSouth = &chunkBoardEnvelope[INDEX_2D_TO_1D(SOUTH_INDEX, 1, chunkM)];
+        int destSource = (coordinateY + 1) % gridN;
+        MPI_Isend(chunkBoardMyEnvelopeSouth, chunkN, MPI_INT8_T, destSource, SOUTH_ROW_TAG, colComm,
+                  &requests[iRequest++]);
+        /* The south neighbour sends its north row, which is our south envelope */
+        MPI_Irecv(chunkBoardEnvelopeSouth, chunkN, MPI_INT8_T, destSource, NORTH_ROW_TAG, colComm,
+                  &requests[iRequest++]);
+    }
+
+    // West
+    {
+        int8_t *chunkBoardMyEnvelopeWest = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(WEST_INDEX, 0, chunkN)];
+        int8_t *chunkBoardEnvelopeWest = &chunkBoardEnvelope[INDEX_2D_TO_1D(WEST_INDEX, 1, chunkM)];
+        int destSource = (coordinateX - 1) < 0 ? (gridN - 1) : (coordinateX - 1);
+        MPI_Isend(chunkBoardMyEnvelopeWest, chunkN, MPI_INT8_T, destSource, WEST_COLUMN_TAG, rowComm,
+                  &requests[iRequest++]);
+        /* The west neighbour sends its east column, which is our west envelope */
+        MPI_Irecv(chunkBoardEnvelopeWest, chunkN, MPI_INT8_T, destSource, EAST_COLUMN_TAG, rowComm,
+                  &requests[iRequest++]);
+    }
+
+    int8_t missingCells[4] = {0};
+
+    // North-East
+    {
+        int8_t *chunkBoardMyEnvelopeNorthEast = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(NORTH_INDEX, chunkN - 1, chunkN)];
+        int destSrcY = (coordinateY - 1) < 0 ? gridN - 1 : coordinateY - 1;
+        int destSrcX = (coordinateX + 1) % gridN;
+        int destSource = INDEX_2D_TO_1D(destSrcY, destSrcX, gridN);
+        MPI_Isend(chunkBoardMyEnvelopeNorthEast, 1, MPI_INT8_T, destSource, NORTH_EAST_CELL_TAG, MPI_COMM_WORLD,
+                  &requests[iRequest++]);
+        MPI_Irecv(&missingCells[1], 1, MPI_INT8_T, destSource, SOUTH_WEST_CELL_TAG, MPI_COMM_WORLD,
+                  &requests[iRequest++]);
+    }
+
+    // South-East
+    {
+        int8_t *chunkBoardMyEnvelopeSouthEast = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(SOUTH_INDEX, chunkN - 1, chunkN)];
+        int destSrcY = (coordinateY + 1) % gridN;
+        int destSrcX = (coordinateX + 1) % gridN;
+        int destSource = INDEX_2D_TO_1D(destSrcY, destSrcX, gridN);
+        MPI_Isend(chunkBoardMyEnvelopeSouthEast, 1, MPI_INT8_T, destSource, SOUTH_EAST_CELL_TAG, MPI_COMM_WORLD,
+                  &requests[iRequest++]);
+        MPI_Irecv(&missingCells[2], 1, MPI_INT8_T, destSource, NORTH_WEST_CELL_TAG, MPI_COMM_WORLD,
+                  &requests[iRequest++]);
+    }
+
+    // South-West
+    {
+        int8_t *chunkBoardMyEnvelopeSouthWest = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(SOUTH_INDEX, 0, chunkN)];
+        int destSrcY = (coordinateY + 1) % gridN;
+        int destSrcX = (coordinateX - 1) < 0 ? gridN - 1 : coordinateX - 1;
+        int destSource = INDEX_2D_TO_1D(destSrcY, destSrcX, gridN);
+        MPI_Isend(chunkBoardMyEnvelopeSouthWest, 1, MPI_INT8_T, destSource, SOUTH_WEST_CELL_TAG, MPI_COMM_WORLD,
+                  &requests[iRequest++]);
+        MPI_Irecv(&missingCells[3], 1, MPI_INT8_T, destSource, NORTH_EAST_CELL_TAG, MPI_COMM_WORLD,
+                  &requests[iRequest++]);
+    }
+
+    // North-West
+    {
+        int8_t *chunkBoardMyEnvelopeNorthWest = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(NORTH_INDEX, 0, chunkN)];
+        int destSrcY = (coordinateY - 1) < 0 ? gridN - 1 : coordinateY - 1;
+        int destSrcX = (coordinateX - 1) < 0 ? gridN - 1 : coordinateX - 1;
+        int destSource = INDEX_2D_TO_1D(destSrcY, destSrcX, gridN);
+        MPI_Isend(chunkBoardMyEnvelopeNorthWest, 1, MPI_INT8_T, destSource, NORTH_WEST_CELL_TAG, MPI_COMM_WORLD,
+                  &requests[iRequest++]);
+        MPI_Irecv(&missingCells[0], 1, MPI_INT8_T, destSource, SOUTH_EAST_CELL_TAG, MPI_COMM_WORLD,
+                  &requests[iRequest]);
+    }
+
+    MPI_Waitall(16, requests, MPI_STATUSES_IGNORE);
+
+    // Corner cells: each diagonal neighbour's cell terminates two envelope rows. They go in column 0 and in
+    // column chunkM - 1, i.e. on either side of the chunkN edge cells received into columns 1..chunkN.
+    const int last = chunkM - 1;
+    chunkBoardEnvelope[INDEX_2D_TO_1D(NORTH_INDEX, 0, chunkM)] = missingCells[0];    // north-west
+    chunkBoardEnvelope[INDEX_2D_TO_1D(WEST_INDEX, 0, chunkM)] = missingCells[0];
+    chunkBoardEnvelope[INDEX_2D_TO_1D(NORTH_INDEX, last, chunkM)] = missingCells[1]; // north-east
+    chunkBoardEnvelope[INDEX_2D_TO_1D(EAST_INDEX, 0, chunkM)] = missingCells[1];
+    chunkBoardEnvelope[INDEX_2D_TO_1D(SOUTH_INDEX, last, chunkM)] = missingCells[2]; // south-east
+    chunkBoardEnvelope[INDEX_2D_TO_1D(EAST_INDEX, last, chunkM)] = missingCells[2];
+    chunkBoardEnvelope[INDEX_2D_TO_1D(SOUTH_INDEX, 0, chunkM)] = missingCells[3];    // south-west
+    chunkBoardEnvelope[INDEX_2D_TO_1D(WEST_INDEX, last, chunkM)] = missingCells[3];
+}
+
+// Writes the chunkN x chunkN chunk owned by process `rank` into the n-wide board; the chunk's top-left board
+// cell is the (y, x) pair stored at indexes[2 * rank] and indexes[2 * rank + 1].
+void chunkBoardToBoard(int8_t *board, int n, const int8_t *chunkBoard, int chunkN, const int *indexes, int rank) {
+    const int top = indexes[rank * 2], left = indexes[rank * 2 + 1];
+    for (int row = 0; row < chunkN; ++row) {
+        int8_t *target = &board[INDEX_2D_TO_1D(top + row, left, n)];
+        const int8_t *source = &chunkBoard[INDEX_2D_TO_1D(row, 0, chunkN)];
+        for (int column = 0; column < chunkN; ++column) { target[column] = source[column]; }
+    }
+}
+
+// Entry point. Every rank advances its own chunkN x chunkN chunk with the Futhark kernels after swapping borders
+// with its neighbours; rank 0 also gathers all chunks and draws the board until Escape or a window close.
+int main(int argc, char *argv[]) {
+    int myRank, nProc;
+
+    /* MPI Initialization */
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myRank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nProc);
+    srand((unsigned int) myRank);
+
+    MPI_Comm cartComm, rowComm, colComm;
+    int gridN = createGridCommunicators(&cartComm, &rowComm, &colComm, nProc);
+    int myCartRank;
+    MPI_Comm_rank(cartComm, &myCartRank);
+    int coordinates[2] = {0};
+    MPI_Cart_coords(cartComm, myCartRank, 2, coordinates);
+
+    /* Futhark Initialization */
+    struct futhark_context_config *contextConfig = futhark_context_config_new();
+    futhark_context_config_set_device(contextConfig, "AMD");
+    struct futhark_context *futharkContext = futhark_context_new(contextConfig);
+
+    /* GFX Initialization */
+    struct gfx_context_t *gfxContext = myRank == 0 ? gfx_create("Game of Life", BOARD_N, BOARD_N) : NULL;
+    if (myRank == 0 && !gfxContext) {
+        fprintf(stderr, "Graphic mode initialization failed!\n");
+        // Take every rank down: a plain return here would leave the others blocked in their next MPI call.
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+        return EXIT_FAILURE;
+    }
+    if (myRank == 0) {
+        SDL_ShowCursor(SDL_ENABLE);
+    }
+
+    /* GoL Initialization */
+    int chunkN = (int) (BOARD_N / sqrt(nProc));
+    int chunkNN = chunkN * chunkN;
+    int chunkM = chunkN + 2;
+    int *indexes = divideBoard(BOARD_N, chunkN, nProc);
+
+    int8_t *board = myRank == 0 ? calloc(BOARD_N * BOARD_N, sizeof(int8_t)) : NULL;
+    int8_t *chunkBoard = calloc((size_t) chunkNN, sizeof(int8_t));
+    int8_t *chunkBoardMyEnvelope = calloc(((size_t) (4 * chunkN)), sizeof(int8_t));
+    int8_t *chunkBoardEnvelope = calloc(((size_t) (4 * chunkM)), sizeof(int8_t));
+    initChunkBoard(chunkBoard, chunkN);
+
+    bool quit = false; // named `quit` so the flag does not shadow exit() from <stdlib.h>
+    while (!quit) {
+        struct futhark_i8_2d *futChunkBoard = futhark_new_i8_2d(futharkContext, chunkBoard, chunkN, chunkN);
+        futhark_context_sync(futharkContext);
+        struct futhark_i8_2d *futChunkBoardMyEnvelope;
+        futhark_entry_get_envelope(futharkContext, &futChunkBoardMyEnvelope, futChunkBoard);
+        futhark_context_sync(futharkContext);
+        futhark_values_i8_2d(futharkContext, futChunkBoardMyEnvelope, chunkBoardMyEnvelope);
+        futhark_context_sync(futharkContext);
+
+        shareAndBuildEnvelope(chunkBoardMyEnvelope, chunkBoardEnvelope, rowComm, colComm, gridN, coordinates, chunkN,
+                              chunkM);
+
+        struct futhark_i8_2d *futChunkBoardEnvelope = futhark_new_i8_2d(futharkContext, chunkBoardEnvelope, 4, chunkM);
+        futhark_context_sync(futharkContext);
+        struct futhark_i8_2d *futNextChunkBoard;
+        futhark_entry_next_chunk_board(futharkContext, &futNextChunkBoard, futChunkBoard, futChunkBoardEnvelope);
+        futhark_context_sync(futharkContext);
+        futhark_values_i8_2d(futharkContext, futNextChunkBoard, chunkBoard);
+        futhark_context_sync(futharkContext);
+
+        if (myRank == 0) {
+            chunkBoardToBoard(board, BOARD_N, chunkBoard, chunkN, indexes, myRank);
+            int8_t *tmpChunkBoard = calloc((size_t) chunkNN, sizeof(int8_t));
+            MPI_Status status = {0};
+            for (int i = 0; i < nProc - 1; ++i) {
+                MPI_Recv(tmpChunkBoard, chunkNN, MPI_INT8_T, MPI_ANY_SOURCE, CHUNK_BOARD_TAG, MPI_COMM_WORLD, &status);
+                chunkBoardToBoard(board, BOARD_N, tmpChunkBoard, chunkN, indexes, status.MPI_SOURCE);
+            }
+            free(tmpChunkBoard);
+        } else {
+            MPI_Send(chunkBoard, chunkNN, MPI_INT8_T, 0, CHUNK_BOARD_TAG, MPI_COMM_WORLD);
+        }
+
+        if (myRank == 0) {
+            SDL_PumpEvents();
+            SDL_Event event;
+            // SDL_PollEvent leaves `event` untouched when the queue is empty, so only inspect it on success.
+            const bool hasEvent = SDL_PollEvent(&event) != 0;
+            quit = gfx_keypressed() == SDLK_ESCAPE ||
+                   (hasEvent && event.type == SDL_WINDOWEVENT && event.window.event == SDL_WINDOWEVENT_CLOSE);
+
+            gfx_clear(gfxContext, COLOR_BLACK);
+            for (int y = 0; y < BOARD_N; ++y) {
+                for (int x = 0; x < BOARD_N; ++x) {
+                    int cell = (int) board[INDEX_2D_TO_1D(y, x, BOARD_N)];
+                    gfx_putpixel(gfxContext, x, y, MAKE_COLOR(cell * 255, cell * 255, cell * 255));
+                }
+            }
+            gfx_present(gfxContext);
+        }
+
+        futhark_context_sync(futharkContext);
+        futhark_free_i8_2d(futharkContext, futChunkBoard);
+        futhark_free_i8_2d(futharkContext, futChunkBoardMyEnvelope);
+        futhark_free_i8_2d(futharkContext, futChunkBoardEnvelope);
+        futhark_free_i8_2d(futharkContext, futNextChunkBoard);
+
+        MPI_Bcast(&quit, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
+        usleep(16666);
+    }
+
+    free(chunkBoard);
+    free(chunkBoardEnvelope);
+    free(chunkBoardMyEnvelope);
+    free(indexes);
+
+    if (myRank == 0) {
+        free(board);
+        gfx_destroy(gfxContext);
+    }
+
+    futhark_context_free(futharkContext);
+    futhark_context_config_free(contextConfig);
+    MPI_Comm_free(&cartComm);
+    MPI_Comm_free(&rowComm);
+    MPI_Comm_free(&colComm);
+    MPI_Finalize();
+    return 0;
+}
diff --git a/game_of_life/CMakeLists.txt b/game_of_life/CMakeLists.txt
deleted file mode 100644
index 1722508afa1f0bd1a9b108727b9b01cb6f9bfb21..0000000000000000000000000000000000000000
--- a/game_of_life/CMakeLists.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-cmake_minimum_required(VERSION 3.17)
-project(game_of_life C)
-
-set(CMAKE_C_STANDARD 11)
-
-include_directories(".")
-
-if (CMAKE_BUILD_TYPE MATCHES Debug)
-    set(GCC_COMPILE_FLAGS "-Wall -Wextra -pedantic -fsanitize=address -fsanitize=null")
-    if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-        set(GCC_COMPILE_FLAGS "${GCC_COMPILE_FLAGS} -fsanitize=leak")
-    endif ()
-elseif (CMAKE_BUILD_TYPE MATCHES Release)
-    set(GCC_COMPILE_FLAGS "-g")
-endif ()
-
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    execute_process(COMMAND sdl2-config --cflags OUTPUT_VARIABLE SDL2_C_FLAGS)
-    include_directories(${SDL2_C_FLAGS})
-endif ()
-
-if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
-    include_directories(/usr/local/include)
-endif ()
-
-find_package(MPI REQUIRED)
-include_directories(${MPI_C_INCLUDE_PATH})
-
-set(CMAKE_MACRO_FLAGS -DPROGHEADER='\"${CMAKE_CURRENT_SOURCE_DIR}/gol.h\"')
-
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GCC_COMPILE_FLAGS} ${CMAKE_MACRO_FLAGS}")
-
-add_custom_target(
-        futhark_opencl
-        COMMAND futhark opencl ${CMAKE_CURRENT_SOURCE_DIR}/gol.fut --library
-)
-add_executable(game_of_life_opencl gol.c gol.h main.c lib/github.com/diku-dk/lys/liblys.c lib/github.com/diku-dk/lys/liblys.h lib/github.com/diku-dk/lys/context_setup.c lib/github.com/diku-dk/lys/context_setup.h ../lib/fpmpi.c ../lib/fpmpi.h ../lib/fp.h ../lib/fp.c ../lib/dispatch.c ../lib/dispatch.h)
-
-if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
-    target_link_libraries(game_of_life_opencl "-framework OpenCL" m SDL2 ${MPI_C_LIBRARIES})
-endif ()
-
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    target_link_libraries(game_of_life_opencl OpenCL m SDL2 ${MPI_C_LIBRARIES})
-endif ()
-
-add_dependencies(game_of_life_opencl futhark_opencl)
-
-add_custom_target(
-        futhark_multicore
-        COMMAND futhark multicore ${CMAKE_CURRENT_SOURCE_DIR}/gol.fut --library
-)
-add_executable(game_of_life_multicore gol.c gol.h main.c lib/github.com/diku-dk/lys/liblys.c lib/github.com/diku-dk/lys/liblys.h lib/github.com/diku-dk/lys/context_setup.c lib/github.com/diku-dk/lys/context_setup.h ../lib/fpmpi.c ../lib/fpmpi.h ../lib/fp.h ../lib/fp.c ../lib/dispatch.c ../lib/dispatch.h)
-add_dependencies(game_of_life_multicore futhark_multicore)
-target_link_libraries(game_of_life_multicore m pthread SDL2 ${MPI_C_LIBRARIES})
diff --git a/game_of_life/Makefile b/game_of_life/Makefile
deleted file mode 100644
index 7cc239aa59d9de044fe41a2e9dd8997551f4e5c0..0000000000000000000000000000000000000000
--- a/game_of_life/Makefile
+++ /dev/null
@@ -1,34 +0,0 @@
-all: release debug
-
-release:
-	mkdir -p "cmake-build-release"
-	cmake -DCMAKE_BUILD_TYPE=Release -Bcmake-build-release
-	$(MAKE) -C cmake-build-release all
-
-release/multicore:
-	mkdir -p "cmake-build-release"
-	cmake -DCMAKE_BUILD_TYPE=Release -Bcmake-build-release
-	$(MAKE) -C cmake-build-release game_of_life_multicore
-
-release/opencl:
-	mkdir -p "cmake-build-release"
-	cmake -DCMAKE_BUILD_TYPE=Release -Bcmake-build-release
-	$(MAKE) -C cmake-build-release game_of_life_opencl
-
-debug:
-	mkdir -p "cmake-build-debug"
-	cmake -DCMAKE_BUILD_TYPE=Debug -Bcmake-build-debug
-	$(MAKE) -C cmake-build-release all
-
-
-debug/multicore:
-	mkdir -p "cmake-build-debug"
-	cmake -DCMAKE_BUILD_TYPE=Debug -Bcmake-build-debug
-	$(MAKE) -C cmake-build-debug game_of_life_multicore
-
-debug/opencl:
-	mkdir -p "cmake-build-debug"
-	cmake -DCMAKE_BUILD_TYPE=Debug -Bcmake-build-debug
-	$(MAKE) -C cmake-build-debug game_of_life_opencl
-
-.PHONY: release release/multicore release/opencl debug debug/multicore debug/opencl
diff --git a/game_of_life/README.md b/game_of_life/README.md
deleted file mode 100644
index d31463ccac39ea0bdbc3238426e898f5471f4f9d..0000000000000000000000000000000000000000
--- a/game_of_life/README.md
+++ /dev/null
@@ -1,12 +0,0 @@
-# Jeu de la vie en Futhark/C
-
-Le but de ce projet est de créer le jeu de la vie en Futhark + C avec l'affichage du monde dans une fenêtre SDL gérée par Futhark.
-La contrainte de cette version est que le monde est représenté dans un tableau en une dimension.
-
-## Construire le projet
-
-* Exécuter la commande `futhark pkg sync`
-* Exécuter la commande `make`
-* Les exécutables sont présents dans le dossier `cmake-build-debug` et/ou `cmake-build-release`
-  * `./game_of_life_opencl`
-  * `./game_of_life_multicore`
diff --git a/game_of_life/futhark.pkg b/game_of_life/futhark.pkg
deleted file mode 100644
index 80bc4b6b4457688252b547676b59fe0e1c1e71a7..0000000000000000000000000000000000000000
--- a/game_of_life/futhark.pkg
+++ /dev/null
@@ -1,3 +0,0 @@
-require {
-  github.com/diku-dk/lys 0.1.12 #34e5ff985fefac9a9627d49e26a19ef5352e7019
-}
diff --git a/game_of_life/gol.c b/game_of_life/gol.c
deleted file mode 100644
index 90ca1da44a0de5d1684a998ab3b68f315fea7141..0000000000000000000000000000000000000000
--- a/game_of_life/gol.c
+++ /dev/null
@@ -1,5273 +0,0 @@
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-#ifdef __GNUC__
-#pragma GCC diagnostic ignored "-Wunused-function"
-#pragma GCC diagnostic ignored "-Wunused-variable"
-#pragma GCC diagnostic ignored "-Wparentheses"
-#pragma GCC diagnostic ignored "-Wunused-label"
-#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
-#endif
-#ifdef __clang__
-#pragma clang diagnostic ignored "-Wunused-function"
-#pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wparentheses"
-#pragma clang diagnostic ignored "-Wunused-label"
-#endif
-// Headers
-
-#include <stdint.h>
-#include <stddef.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <float.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Initialisation
-
-struct futhark_context_config ;
-struct futhark_context_config *futhark_context_config_new(void);
-void futhark_context_config_free(struct futhark_context_config *cfg);
-void futhark_context_config_set_debugging(struct futhark_context_config *cfg,
-                                          int flag);
-void futhark_context_config_set_profiling(struct futhark_context_config *cfg,
-                                          int flag);
-void futhark_context_config_set_logging(struct futhark_context_config *cfg,
-                                        int flag);
-void futhark_context_config_set_num_threads(struct futhark_context_config *cfg,
-                                            int n);
-struct futhark_context ;
-struct futhark_context *futhark_context_new(struct futhark_context_config *cfg);
-void futhark_context_free(struct futhark_context *ctx);
-int futhark_context_sync(struct futhark_context *ctx);
-int futhark_context_config_set_size(struct futhark_context_config *cfg, const
-                                    char *size_name, size_t size_value);
-int futhark_get_num_sizes(void);
-const char *futhark_get_size_name(int);
-const char *futhark_get_size_class(int);
-
-// Arrays
-
-struct futhark_i8_1d ;
-struct futhark_i8_1d *futhark_new_i8_1d(struct futhark_context *ctx, const
-                                        int8_t *data, int64_t dim0);
-struct futhark_i8_1d *futhark_new_raw_i8_1d(struct futhark_context *ctx, const
-                                            char *data, int offset,
-                                            int64_t dim0);
-int futhark_free_i8_1d(struct futhark_context *ctx, struct futhark_i8_1d *arr);
-int futhark_values_i8_1d(struct futhark_context *ctx, struct futhark_i8_1d *arr,
-                         int8_t *data);
-char *futhark_values_raw_i8_1d(struct futhark_context *ctx,
-                               struct futhark_i8_1d *arr);
-const int64_t *futhark_shape_i8_1d(struct futhark_context *ctx,
-                                   struct futhark_i8_1d *arr);
-struct futhark_u32_2d ;
-struct futhark_u32_2d *futhark_new_u32_2d(struct futhark_context *ctx, const
-                                          uint32_t *data, int64_t dim0,
-                                          int64_t dim1);
-struct futhark_u32_2d *futhark_new_raw_u32_2d(struct futhark_context *ctx, const
-                                              char *data, int offset,
-                                              int64_t dim0, int64_t dim1);
-int futhark_free_u32_2d(struct futhark_context *ctx,
-                        struct futhark_u32_2d *arr);
-int futhark_values_u32_2d(struct futhark_context *ctx,
-                          struct futhark_u32_2d *arr, uint32_t *data);
-char *futhark_values_raw_u32_2d(struct futhark_context *ctx,
-                                struct futhark_u32_2d *arr);
-const int64_t *futhark_shape_u32_2d(struct futhark_context *ctx,
-                                    struct futhark_u32_2d *arr);
-
-// Opaque values
-
-struct futhark_opaque_state ;
-int futhark_free_opaque_state(struct futhark_context *ctx,
-                              struct futhark_opaque_state *obj);
-int futhark_store_opaque_state(struct futhark_context *ctx, const
-                               struct futhark_opaque_state *obj, void **p,
-                               size_t *n);
-struct futhark_opaque_state
-*futhark_restore_opaque_state(struct futhark_context *ctx, const void *p);
-
-// Entry points
-
-int futhark_entry_init(struct futhark_context *ctx,
-                       struct futhark_opaque_state **out0, const
-                       struct futhark_i8_1d *in0, const int64_t in1, const
-                       int64_t in2, const int64_t in3);
-int futhark_entry_key(struct futhark_context *ctx,
-                      struct futhark_opaque_state **out0, const int32_t in0,
-                      const int32_t in1, const
-                      struct futhark_opaque_state *in2);
-int futhark_entry_mouse(struct futhark_context *ctx,
-                        struct futhark_opaque_state **out0, const int32_t in0,
-                        const int32_t in1, const int32_t in2, const
-                        struct futhark_opaque_state *in3);
-int futhark_entry_render(struct futhark_context *ctx,
-                         struct futhark_u32_2d **out0, const
-                         struct futhark_opaque_state *in0);
-int futhark_entry_resize(struct futhark_context *ctx,
-                         struct futhark_opaque_state **out0, const int64_t in0,
-                         const int64_t in1, const
-                         struct futhark_opaque_state *in2);
-int futhark_entry_step(struct futhark_context *ctx,
-                       struct futhark_opaque_state **out0, const float in0,
-                       const struct futhark_opaque_state *in1);
-int futhark_entry_wheel(struct futhark_context *ctx,
-                        struct futhark_opaque_state **out0, const int32_t in0,
-                        const int32_t in1, const
-                        struct futhark_opaque_state *in2);
-
-// Miscellaneous
-
-char *futhark_context_report(struct futhark_context *ctx);
-char *futhark_context_get_error(struct futhark_context *ctx);
-void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f);
-void futhark_context_pause_profiling(struct futhark_context *ctx);
-void futhark_context_unpause_profiling(struct futhark_context *ctx);
-int futhark_context_clear_caches(struct futhark_context *ctx);
-#define FUTHARK_BACKEND_multicore
-#ifdef __cplusplus
-}
-#endif
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <math.h>
-#include <stdint.h>
-#undef NDEBUG
-#include <assert.h>
-#include <stdarg.h>
-// Start of util.h.
-//
-// Various helper functions that are useful in all generated C code.
-
-#include <errno.h>
-#include <string.h>
-
-static const char *fut_progname = "(embedded Futhark)";
-
-static void futhark_panic(int eval, const char *fmt, ...) {
-  va_list ap;
-  va_start(ap, fmt);
-  fprintf(stderr, "%s: ", fut_progname);
-  vfprintf(stderr, fmt, ap);
-  va_end(ap);
-  exit(eval);
-}
-
-// For generating arbitrary-sized error messages.  It is the callers
-// responsibility to free the buffer at some point.
-static char* msgprintf(const char *s, ...) {
-  va_list vl;
-  va_start(vl, s);
-  size_t needed = 1 + (size_t)vsnprintf(NULL, 0, s, vl);
-  char *buffer = (char*) malloc(needed);
-  va_start(vl, s); // Must re-init.
-  vsnprintf(buffer, needed, s, vl);
-  return buffer;
-}
-
-
-static inline void check_err(int errval, int sets_errno, const char *fun, int line,
-                            const char *msg, ...) {
-  if (errval) {
-    char errnum[10];
-
-    va_list vl;
-    va_start(vl, msg);
-
-    fprintf(stderr, "ERROR: ");
-    vfprintf(stderr, msg, vl);
-    fprintf(stderr, " in %s() at line %d with error code %s\n",
-            fun, line,
-            sets_errno ? strerror(errno) : errnum);
-    exit(errval);
-  }
-}
-
-#define CHECK_ERR(err, msg...) check_err(err, 0, __func__, __LINE__, msg)
-#define CHECK_ERRNO(err, msg...) check_err(err, 1, __func__, __LINE__, msg)
-
-// Read the rest of an open file into a NUL-terminated string; returns
-// NULL on error.
-static void* fslurp_file(FILE *f, size_t *size) {
-  size_t start = ftell(f);
-  fseek(f, 0, SEEK_END);
-  size_t src_size = ftell(f)-start;
-  fseek(f, start, SEEK_SET);
-  unsigned char *s = (unsigned char*) malloc(src_size + 1);
-  if (fread(s, 1, src_size, f) != src_size) {
-    free(s);
-    s = NULL;
-  } else {
-    s[src_size] = '\0';
-  }
-
-  if (size) {
-    *size = src_size;
-  }
-
-  return s;
-}
-
-// Read a file into a NUL-terminated string; returns NULL on error.
-static void* slurp_file(const char *filename, size_t *size) {
-  FILE *f = fopen(filename, "rb"); // To avoid Windows messing with linebreaks.
-  if (f == NULL) return NULL;
-  unsigned char *s = fslurp_file(f, size);
-  fclose(f);
-  return s;
-}
-
-// Dump 'n' bytes from 'buf' into the file at the designated location.
-// Returns 0 on success.
-static int dump_file(const char *file, const void *buf, size_t n) {
-  FILE *f = fopen(file, "w");
-
-  if (f == NULL) {
-    return 1;
-  }
-
-  if (fwrite(buf, sizeof(char), n, f) != n) {
-    return 1;
-  }
-
-  if (fclose(f) != 0) {
-    return 1;
-  }
-
-  return 0;
-}
-
-struct str_builder {
-  char *str;
-  size_t capacity; // Size of buffer.
-  size_t used; // Bytes used, *not* including final zero.
-};
-
-static void str_builder_init(struct str_builder *b) {
-  b->capacity = 10;
-  b->used = 0;
-  b->str = malloc(b->capacity);
-  b->str[0] = 0;
-}
-
-static void str_builder(struct str_builder *b, const char *s, ...) {
-  va_list vl;
-  va_start(vl, s);
-  size_t needed = (size_t)vsnprintf(NULL, 0, s, vl);
-
-  while (b->capacity < b->used + needed + 1) {
-    b->capacity *= 2;
-    b->str = realloc(b->str, b->capacity);
-  }
-
-  va_start(vl, s); // Must re-init.
-  vsnprintf(b->str+b->used, b->capacity-b->used, s, vl);
-  b->used += needed;
-}
-
-// End of util.h.
-
-// Start of timing.h.
-
-// The function get_wall_time() returns the wall time in microseconds
-// (with an unspecified offset).
-
-#ifdef _WIN32
-
-#include <windows.h>
-
-static int64_t get_wall_time(void) {
-  LARGE_INTEGER time,freq;
-  assert(QueryPerformanceFrequency(&freq));
-  assert(QueryPerformanceCounter(&time));
-  return ((double)time.QuadPart / freq.QuadPart) * 1000000;
-}
-
-#else
-// Assuming POSIX
-
-#include <time.h>
-#include <sys/time.h>
-
-static int64_t get_wall_time(void) {
-  struct timeval time;
-  assert(gettimeofday(&time,NULL) == 0);
-  return time.tv_sec * 1000000 + time.tv_usec;
-}
-
-static int64_t get_wall_time_ns(void) {
-  struct timespec time;
-  assert(clock_gettime(CLOCK_REALTIME, &time) == 0);
-  return time.tv_sec * 1000000000 + time.tv_nsec;
-}
-
-#endif
-
-// End of timing.h.
-
-#ifdef _MSC_VER
-#define inline __inline
-#endif
-#include <string.h>
-#include <string.h>
-#include <errno.h>
-#include <assert.h>
-#include <ctype.h>
-
-// Start of lock.h.
-
-// A very simple cross-platform implementation of locks.  Uses
-// pthreads on Unix and some Windows thing there.  Futhark's
-// host-level code is not multithreaded, but user code may be, so we
-// need some mechanism for ensuring atomic access to API functions.
-// This is that mechanism.  It is not exposed to user code at all, so
-// we do not have to worry about name collisions.
-
-#ifdef _WIN32
-
-typedef HANDLE lock_t;
-
-static void create_lock(lock_t *lock) {
-  *lock = CreateMutex(NULL,  // Default security attributes.
-                      FALSE, // Initially unlocked.
-                      NULL); // Unnamed.
-}
-
-static void lock_lock(lock_t *lock) {
-  assert(WaitForSingleObject(*lock, INFINITE) == WAIT_OBJECT_0);
-}
-
-static void lock_unlock(lock_t *lock) {
-  assert(ReleaseMutex(*lock));
-}
-
-static void free_lock(lock_t *lock) {
-  CloseHandle(*lock);
-}
-
-#else
-// Assuming POSIX
-
-#include <pthread.h>
-
-typedef pthread_mutex_t lock_t;
-
-static void create_lock(lock_t *lock) {
-  int r = pthread_mutex_init(lock, NULL);
-  assert(r == 0);
-}
-
-static void lock_lock(lock_t *lock) {
-  int r = pthread_mutex_lock(lock);
-  assert(r == 0);
-}
-
-static void lock_unlock(lock_t *lock) {
-  int r = pthread_mutex_unlock(lock);
-  assert(r == 0);
-}
-
-static void free_lock(lock_t *lock) {
-  // Nothing to do for pthreads.
-  (void)lock;
-}
-
-#endif
-
-// End of lock.h.
-
-static inline uint8_t add8(uint8_t x, uint8_t y)
-{
-    return x + y;
-}
-static inline uint16_t add16(uint16_t x, uint16_t y)
-{
-    return x + y;
-}
-static inline uint32_t add32(uint32_t x, uint32_t y)
-{
-    return x + y;
-}
-static inline uint64_t add64(uint64_t x, uint64_t y)
-{
-    return x + y;
-}
-static inline uint8_t sub8(uint8_t x, uint8_t y)
-{
-    return x - y;
-}
-static inline uint16_t sub16(uint16_t x, uint16_t y)
-{
-    return x - y;
-}
-static inline uint32_t sub32(uint32_t x, uint32_t y)
-{
-    return x - y;
-}
-static inline uint64_t sub64(uint64_t x, uint64_t y)
-{
-    return x - y;
-}
-static inline uint8_t mul8(uint8_t x, uint8_t y)
-{
-    return x * y;
-}
-static inline uint16_t mul16(uint16_t x, uint16_t y)
-{
-    return x * y;
-}
-static inline uint32_t mul32(uint32_t x, uint32_t y)
-{
-    return x * y;
-}
-static inline uint64_t mul64(uint64_t x, uint64_t y)
-{
-    return x * y;
-}
-static inline uint8_t udiv8(uint8_t x, uint8_t y)
-{
-    return x / y;
-}
-static inline uint16_t udiv16(uint16_t x, uint16_t y)
-{
-    return x / y;
-}
-static inline uint32_t udiv32(uint32_t x, uint32_t y)
-{
-    return x / y;
-}
-static inline uint64_t udiv64(uint64_t x, uint64_t y)
-{
-    return x / y;
-}
-static inline uint8_t udiv_up8(uint8_t x, uint8_t y)
-{
-    return (x + y - 1) / y;
-}
-static inline uint16_t udiv_up16(uint16_t x, uint16_t y)
-{
-    return (x + y - 1) / y;
-}
-static inline uint32_t udiv_up32(uint32_t x, uint32_t y)
-{
-    return (x + y - 1) / y;
-}
-static inline uint64_t udiv_up64(uint64_t x, uint64_t y)
-{
-    return (x + y - 1) / y;
-}
-static inline uint8_t umod8(uint8_t x, uint8_t y)
-{
-    return x % y;
-}
-static inline uint16_t umod16(uint16_t x, uint16_t y)
-{
-    return x % y;
-}
-static inline uint32_t umod32(uint32_t x, uint32_t y)
-{
-    return x % y;
-}
-static inline uint64_t umod64(uint64_t x, uint64_t y)
-{
-    return x % y;
-}
-static inline uint8_t udiv_safe8(uint8_t x, uint8_t y)
-{
-    return y == 0 ? 0 : x / y;
-}
-static inline uint16_t udiv_safe16(uint16_t x, uint16_t y)
-{
-    return y == 0 ? 0 : x / y;
-}
-static inline uint32_t udiv_safe32(uint32_t x, uint32_t y)
-{
-    return y == 0 ? 0 : x / y;
-}
-static inline uint64_t udiv_safe64(uint64_t x, uint64_t y)
-{
-    return y == 0 ? 0 : x / y;
-}
-static inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y)
-{
-    return y == 0 ? 0 : (x + y - 1) / y;
-}
-static inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y)
-{
-    return y == 0 ? 0 : (x + y - 1) / y;
-}
-static inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y)
-{
-    return y == 0 ? 0 : (x + y - 1) / y;
-}
-static inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y)
-{
-    return y == 0 ? 0 : (x + y - 1) / y;
-}
-static inline uint8_t umod_safe8(uint8_t x, uint8_t y)
-{
-    return y == 0 ? 0 : x % y;
-}
-static inline uint16_t umod_safe16(uint16_t x, uint16_t y)
-{
-    return y == 0 ? 0 : x % y;
-}
-static inline uint32_t umod_safe32(uint32_t x, uint32_t y)
-{
-    return y == 0 ? 0 : x % y;
-}
-static inline uint64_t umod_safe64(uint64_t x, uint64_t y)
-{
-    return y == 0 ? 0 : x % y;
-}
-static inline int8_t sdiv8(int8_t x, int8_t y)
-{
-    int8_t q = x / y;
-    int8_t r = x % y;
-    
-    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
-}
-static inline int16_t sdiv16(int16_t x, int16_t y)
-{
-    int16_t q = x / y;
-    int16_t r = x % y;
-    
-    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
-}
-static inline int32_t sdiv32(int32_t x, int32_t y)
-{
-    int32_t q = x / y;
-    int32_t r = x % y;
-    
-    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
-}
-static inline int64_t sdiv64(int64_t x, int64_t y)
-{
-    int64_t q = x / y;
-    int64_t r = x % y;
-    
-    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
-}
-static inline int8_t sdiv_up8(int8_t x, int8_t y)
-{
-    return sdiv8(x + y - 1, y);
-}
-static inline int16_t sdiv_up16(int16_t x, int16_t y)
-{
-    return sdiv16(x + y - 1, y);
-}
-static inline int32_t sdiv_up32(int32_t x, int32_t y)
-{
-    return sdiv32(x + y - 1, y);
-}
-static inline int64_t sdiv_up64(int64_t x, int64_t y)
-{
-    return sdiv64(x + y - 1, y);
-}
-static inline int8_t smod8(int8_t x, int8_t y)
-{
-    int8_t r = x % y;
-    
-    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
-}
-static inline int16_t smod16(int16_t x, int16_t y)
-{
-    int16_t r = x % y;
-    
-    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
-}
-static inline int32_t smod32(int32_t x, int32_t y)
-{
-    int32_t r = x % y;
-    
-    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
-}
-static inline int64_t smod64(int64_t x, int64_t y)
-{
-    int64_t r = x % y;
-    
-    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
-}
-static inline int8_t sdiv_safe8(int8_t x, int8_t y)
-{
-    return y == 0 ? 0 : sdiv8(x, y);
-}
-static inline int16_t sdiv_safe16(int16_t x, int16_t y)
-{
-    return y == 0 ? 0 : sdiv16(x, y);
-}
-static inline int32_t sdiv_safe32(int32_t x, int32_t y)
-{
-    return y == 0 ? 0 : sdiv32(x, y);
-}
-static inline int64_t sdiv_safe64(int64_t x, int64_t y)
-{
-    return y == 0 ? 0 : sdiv64(x, y);
-}
-static inline int8_t sdiv_up_safe8(int8_t x, int8_t y)
-{
-    return sdiv_safe8(x + y - 1, y);
-}
-static inline int16_t sdiv_up_safe16(int16_t x, int16_t y)
-{
-    return sdiv_safe16(x + y - 1, y);
-}
-static inline int32_t sdiv_up_safe32(int32_t x, int32_t y)
-{
-    return sdiv_safe32(x + y - 1, y);
-}
-static inline int64_t sdiv_up_safe64(int64_t x, int64_t y)
-{
-    return sdiv_safe64(x + y - 1, y);
-}
-static inline int8_t smod_safe8(int8_t x, int8_t y)
-{
-    return y == 0 ? 0 : smod8(x, y);
-}
-static inline int16_t smod_safe16(int16_t x, int16_t y)
-{
-    return y == 0 ? 0 : smod16(x, y);
-}
-static inline int32_t smod_safe32(int32_t x, int32_t y)
-{
-    return y == 0 ? 0 : smod32(x, y);
-}
-static inline int64_t smod_safe64(int64_t x, int64_t y)
-{
-    return y == 0 ? 0 : smod64(x, y);
-}
-static inline int8_t squot8(int8_t x, int8_t y)
-{
-    return x / y;
-}
-static inline int16_t squot16(int16_t x, int16_t y)
-{
-    return x / y;
-}
-static inline int32_t squot32(int32_t x, int32_t y)
-{
-    return x / y;
-}
-static inline int64_t squot64(int64_t x, int64_t y)
-{
-    return x / y;
-}
-static inline int8_t srem8(int8_t x, int8_t y)
-{
-    return x % y;
-}
-static inline int16_t srem16(int16_t x, int16_t y)
-{
-    return x % y;
-}
-static inline int32_t srem32(int32_t x, int32_t y)
-{
-    return x % y;
-}
-static inline int64_t srem64(int64_t x, int64_t y)
-{
-    return x % y;
-}
-static inline int8_t squot_safe8(int8_t x, int8_t y)
-{
-    return y == 0 ? 0 : x / y;
-}
-static inline int16_t squot_safe16(int16_t x, int16_t y)
-{
-    return y == 0 ? 0 : x / y;
-}
-static inline int32_t squot_safe32(int32_t x, int32_t y)
-{
-    return y == 0 ? 0 : x / y;
-}
-static inline int64_t squot_safe64(int64_t x, int64_t y)
-{
-    return y == 0 ? 0 : x / y;
-}
-static inline int8_t srem_safe8(int8_t x, int8_t y)
-{
-    return y == 0 ? 0 : x % y;
-}
-static inline int16_t srem_safe16(int16_t x, int16_t y)
-{
-    return y == 0 ? 0 : x % y;
-}
-static inline int32_t srem_safe32(int32_t x, int32_t y)
-{
-    return y == 0 ? 0 : x % y;
-}
-static inline int64_t srem_safe64(int64_t x, int64_t y)
-{
-    return y == 0 ? 0 : x % y;
-}
-static inline int8_t smin8(int8_t x, int8_t y)
-{
-    return x < y ? x : y;
-}
-static inline int16_t smin16(int16_t x, int16_t y)
-{
-    return x < y ? x : y;
-}
-static inline int32_t smin32(int32_t x, int32_t y)
-{
-    return x < y ? x : y;
-}
-static inline int64_t smin64(int64_t x, int64_t y)
-{
-    return x < y ? x : y;
-}
-static inline uint8_t umin8(uint8_t x, uint8_t y)
-{
-    return x < y ? x : y;
-}
-static inline uint16_t umin16(uint16_t x, uint16_t y)
-{
-    return x < y ? x : y;
-}
-static inline uint32_t umin32(uint32_t x, uint32_t y)
-{
-    return x < y ? x : y;
-}
-static inline uint64_t umin64(uint64_t x, uint64_t y)
-{
-    return x < y ? x : y;
-}
-static inline int8_t smax8(int8_t x, int8_t y)
-{
-    return x < y ? y : x;
-}
-static inline int16_t smax16(int16_t x, int16_t y)
-{
-    return x < y ? y : x;
-}
-static inline int32_t smax32(int32_t x, int32_t y)
-{
-    return x < y ? y : x;
-}
-static inline int64_t smax64(int64_t x, int64_t y)
-{
-    return x < y ? y : x;
-}
-static inline uint8_t umax8(uint8_t x, uint8_t y)
-{
-    return x < y ? y : x;
-}
-static inline uint16_t umax16(uint16_t x, uint16_t y)
-{
-    return x < y ? y : x;
-}
-static inline uint32_t umax32(uint32_t x, uint32_t y)
-{
-    return x < y ? y : x;
-}
-static inline uint64_t umax64(uint64_t x, uint64_t y)
-{
-    return x < y ? y : x;
-}
-static inline uint8_t shl8(uint8_t x, uint8_t y)
-{
-    return x << y;
-}
-static inline uint16_t shl16(uint16_t x, uint16_t y)
-{
-    return x << y;
-}
-static inline uint32_t shl32(uint32_t x, uint32_t y)
-{
-    return x << y;
-}
-static inline uint64_t shl64(uint64_t x, uint64_t y)
-{
-    return x << y;
-}
-static inline uint8_t lshr8(uint8_t x, uint8_t y)
-{
-    return x >> y;
-}
-static inline uint16_t lshr16(uint16_t x, uint16_t y)
-{
-    return x >> y;
-}
-static inline uint32_t lshr32(uint32_t x, uint32_t y)
-{
-    return x >> y;
-}
-static inline uint64_t lshr64(uint64_t x, uint64_t y)
-{
-    return x >> y;
-}
-static inline int8_t ashr8(int8_t x, int8_t y)
-{
-    return x >> y;
-}
-static inline int16_t ashr16(int16_t x, int16_t y)
-{
-    return x >> y;
-}
-static inline int32_t ashr32(int32_t x, int32_t y)
-{
-    return x >> y;
-}
-static inline int64_t ashr64(int64_t x, int64_t y)
-{
-    return x >> y;
-}
-static inline uint8_t and8(uint8_t x, uint8_t y)
-{
-    return x & y;
-}
-static inline uint16_t and16(uint16_t x, uint16_t y)
-{
-    return x & y;
-}
-static inline uint32_t and32(uint32_t x, uint32_t y)
-{
-    return x & y;
-}
-static inline uint64_t and64(uint64_t x, uint64_t y)
-{
-    return x & y;
-}
-static inline uint8_t or8(uint8_t x, uint8_t y)
-{
-    return x | y;
-}
-static inline uint16_t or16(uint16_t x, uint16_t y)
-{
-    return x | y;
-}
-static inline uint32_t or32(uint32_t x, uint32_t y)
-{
-    return x | y;
-}
-static inline uint64_t or64(uint64_t x, uint64_t y)
-{
-    return x | y;
-}
-static inline uint8_t xor8(uint8_t x, uint8_t y)
-{
-    return x ^ y;
-}
-static inline uint16_t xor16(uint16_t x, uint16_t y)
-{
-    return x ^ y;
-}
-static inline uint32_t xor32(uint32_t x, uint32_t y)
-{
-    return x ^ y;
-}
-static inline uint64_t xor64(uint64_t x, uint64_t y)
-{
-    return x ^ y;
-}
-static inline bool ult8(uint8_t x, uint8_t y)
-{
-    return x < y;
-}
-static inline bool ult16(uint16_t x, uint16_t y)
-{
-    return x < y;
-}
-static inline bool ult32(uint32_t x, uint32_t y)
-{
-    return x < y;
-}
-static inline bool ult64(uint64_t x, uint64_t y)
-{
-    return x < y;
-}
-static inline bool ule8(uint8_t x, uint8_t y)
-{
-    return x <= y;
-}
-static inline bool ule16(uint16_t x, uint16_t y)
-{
-    return x <= y;
-}
-static inline bool ule32(uint32_t x, uint32_t y)
-{
-    return x <= y;
-}
-static inline bool ule64(uint64_t x, uint64_t y)
-{
-    return x <= y;
-}
-static inline bool slt8(int8_t x, int8_t y)
-{
-    return x < y;
-}
-static inline bool slt16(int16_t x, int16_t y)
-{
-    return x < y;
-}
-static inline bool slt32(int32_t x, int32_t y)
-{
-    return x < y;
-}
-static inline bool slt64(int64_t x, int64_t y)
-{
-    return x < y;
-}
-static inline bool sle8(int8_t x, int8_t y)
-{
-    return x <= y;
-}
-static inline bool sle16(int16_t x, int16_t y)
-{
-    return x <= y;
-}
-static inline bool sle32(int32_t x, int32_t y)
-{
-    return x <= y;
-}
-static inline bool sle64(int64_t x, int64_t y)
-{
-    return x <= y;
-}
-static inline int8_t pow8(int8_t x, int8_t y)
-{
-    int8_t res = 1, rem = y;
-    
-    while (rem != 0) {
-        if (rem & 1)
-            res *= x;
-        rem >>= 1;
-        x *= x;
-    }
-    return res;
-}
-static inline int16_t pow16(int16_t x, int16_t y)
-{
-    int16_t res = 1, rem = y;
-    
-    while (rem != 0) {
-        if (rem & 1)
-            res *= x;
-        rem >>= 1;
-        x *= x;
-    }
-    return res;
-}
-static inline int32_t pow32(int32_t x, int32_t y)
-{
-    int32_t res = 1, rem = y;
-    
-    while (rem != 0) {
-        if (rem & 1)
-            res *= x;
-        rem >>= 1;
-        x *= x;
-    }
-    return res;
-}
-static inline int64_t pow64(int64_t x, int64_t y)
-{
-    int64_t res = 1, rem = y;
-    
-    while (rem != 0) {
-        if (rem & 1)
-            res *= x;
-        rem >>= 1;
-        x *= x;
-    }
-    return res;
-}
-static inline bool itob_i8_bool(int8_t x)
-{
-    return x;
-}
-static inline bool itob_i16_bool(int16_t x)
-{
-    return x;
-}
-static inline bool itob_i32_bool(int32_t x)
-{
-    return x;
-}
-static inline bool itob_i64_bool(int64_t x)
-{
-    return x;
-}
-static inline int8_t btoi_bool_i8(bool x)
-{
-    return x;
-}
-static inline int16_t btoi_bool_i16(bool x)
-{
-    return x;
-}
-static inline int32_t btoi_bool_i32(bool x)
-{
-    return x;
-}
-static inline int64_t btoi_bool_i64(bool x)
-{
-    return x;
-}
-#define sext_i8_i8(x) ((int8_t) (int8_t) x)
-#define sext_i8_i16(x) ((int16_t) (int8_t) x)
-#define sext_i8_i32(x) ((int32_t) (int8_t) x)
-#define sext_i8_i64(x) ((int64_t) (int8_t) x)
-#define sext_i16_i8(x) ((int8_t) (int16_t) x)
-#define sext_i16_i16(x) ((int16_t) (int16_t) x)
-#define sext_i16_i32(x) ((int32_t) (int16_t) x)
-#define sext_i16_i64(x) ((int64_t) (int16_t) x)
-#define sext_i32_i8(x) ((int8_t) (int32_t) x)
-#define sext_i32_i16(x) ((int16_t) (int32_t) x)
-#define sext_i32_i32(x) ((int32_t) (int32_t) x)
-#define sext_i32_i64(x) ((int64_t) (int32_t) x)
-#define sext_i64_i8(x) ((int8_t) (int64_t) x)
-#define sext_i64_i16(x) ((int16_t) (int64_t) x)
-#define sext_i64_i32(x) ((int32_t) (int64_t) x)
-#define sext_i64_i64(x) ((int64_t) (int64_t) x)
-#define zext_i8_i8(x) ((int8_t) (uint8_t) x)
-#define zext_i8_i16(x) ((int16_t) (uint8_t) x)
-#define zext_i8_i32(x) ((int32_t) (uint8_t) x)
-#define zext_i8_i64(x) ((int64_t) (uint8_t) x)
-#define zext_i16_i8(x) ((int8_t) (uint16_t) x)
-#define zext_i16_i16(x) ((int16_t) (uint16_t) x)
-#define zext_i16_i32(x) ((int32_t) (uint16_t) x)
-#define zext_i16_i64(x) ((int64_t) (uint16_t) x)
-#define zext_i32_i8(x) ((int8_t) (uint32_t) x)
-#define zext_i32_i16(x) ((int16_t) (uint32_t) x)
-#define zext_i32_i32(x) ((int32_t) (uint32_t) x)
-#define zext_i32_i64(x) ((int64_t) (uint32_t) x)
-#define zext_i64_i8(x) ((int8_t) (uint64_t) x)
-#define zext_i64_i16(x) ((int16_t) (uint64_t) x)
-#define zext_i64_i32(x) ((int32_t) (uint64_t) x)
-#define zext_i64_i64(x) ((int64_t) (uint64_t) x)
-#if defined(__OPENCL_VERSION__)
-static int32_t futrts_popc8(int8_t x)
-{
-    return popcount(x);
-}
-static int32_t futrts_popc16(int16_t x)
-{
-    return popcount(x);
-}
-static int32_t futrts_popc32(int32_t x)
-{
-    return popcount(x);
-}
-static int32_t futrts_popc64(int64_t x)
-{
-    return popcount(x);
-}
-#elif defined(__CUDA_ARCH__)
-static int32_t futrts_popc8(int8_t x)
-{
-    return __popc(zext_i8_i32(x));
-}
-static int32_t futrts_popc16(int16_t x)
-{
-    return __popc(zext_i16_i32(x));
-}
-static int32_t futrts_popc32(int32_t x)
-{
-    return __popc(x);
-}
-static int32_t futrts_popc64(int64_t x)
-{
-    return __popcll(x);
-}
-#else
-static int32_t futrts_popc8(int8_t x)
-{
-    int c = 0;
-    
-    for (; x; ++c)
-        x &= x - 1;
-    return c;
-}
-static int32_t futrts_popc16(int16_t x)
-{
-    int c = 0;
-    
-    for (; x; ++c)
-        x &= x - 1;
-    return c;
-}
-static int32_t futrts_popc32(int32_t x)
-{
-    int c = 0;
-    
-    for (; x; ++c)
-        x &= x - 1;
-    return c;
-}
-static int32_t futrts_popc64(int64_t x)
-{
-    int c = 0;
-    
-    for (; x; ++c)
-        x &= x - 1;
-    return c;
-}
-#endif
-#if defined(__OPENCL_VERSION__)
-static uint8_t futrts_mul_hi8(uint8_t a, uint8_t b)
-{
-    return mul_hi(a, b);
-}
-static uint16_t futrts_mul_hi16(uint16_t a, uint16_t b)
-{
-    return mul_hi(a, b);
-}
-static uint32_t futrts_mul_hi32(uint32_t a, uint32_t b)
-{
-    return mul_hi(a, b);
-}
-static uint64_t futrts_mul_hi64(uint64_t a, uint64_t b)
-{
-    return mul_hi(a, b);
-}
-#elif defined(__CUDA_ARCH__)
-static uint8_t futrts_mul_hi8(uint8_t a, uint8_t b)
-{
-    uint16_t aa = a;
-    uint16_t bb = b;
-    
-    return aa * bb >> 8;
-}
-static uint16_t futrts_mul_hi16(uint16_t a, uint16_t b)
-{
-    uint32_t aa = a;
-    uint32_t bb = b;
-    
-    return aa * bb >> 16;
-}
-static uint32_t futrts_mul_hi32(uint32_t a, uint32_t b)
-{
-    return mulhi(a, b);
-}
-static uint64_t futrts_mul_hi64(uint64_t a, uint64_t b)
-{
-    return mul64hi(a, b);
-}
-#else
-static uint8_t futrts_mul_hi8(uint8_t a, uint8_t b)
-{
-    uint16_t aa = a;
-    uint16_t bb = b;
-    
-    return aa * bb >> 8;
-}
-static uint16_t futrts_mul_hi16(uint16_t a, uint16_t b)
-{
-    uint32_t aa = a;
-    uint32_t bb = b;
-    
-    return aa * bb >> 16;
-}
-static uint32_t futrts_mul_hi32(uint32_t a, uint32_t b)
-{
-    uint64_t aa = a;
-    uint64_t bb = b;
-    
-    return aa * bb >> 32;
-}
-static uint64_t futrts_mul_hi64(uint64_t a, uint64_t b)
-{
-    __uint128_t aa = a;
-    __uint128_t bb = b;
-    
-    return aa * bb >> 64;
-}
-#endif
-#if defined(__OPENCL_VERSION__)
-static uint8_t futrts_mad_hi8(uint8_t a, uint8_t b, uint8_t c)
-{
-    return mad_hi(a, b, c);
-}
-static uint16_t futrts_mad_hi16(uint16_t a, uint16_t b, uint16_t c)
-{
-    return mad_hi(a, b, c);
-}
-static uint32_t futrts_mad_hi32(uint32_t a, uint32_t b, uint32_t c)
-{
-    return mad_hi(a, b, c);
-}
-static uint64_t futrts_mad_hi64(uint64_t a, uint64_t b, uint64_t c)
-{
-    return mad_hi(a, b, c);
-}
-#else
-static uint8_t futrts_mad_hi8(uint8_t a, uint8_t b, uint8_t c)
-{
-    return futrts_mul_hi8(a, b) + c;
-}
-static uint16_t futrts_mad_hi16(uint16_t a, uint16_t b, uint16_t c)
-{
-    return futrts_mul_hi16(a, b) + c;
-}
-static uint32_t futrts_mad_hi32(uint32_t a, uint32_t b, uint32_t c)
-{
-    return futrts_mul_hi32(a, b) + c;
-}
-static uint64_t futrts_mad_hi64(uint64_t a, uint64_t b, uint64_t c)
-{
-    return futrts_mul_hi64(a, b) + c;
-}
-#endif
-#if defined(__OPENCL_VERSION__)
-static int32_t futrts_clzz8(int8_t x)
-{
-    return clz(x);
-}
-static int32_t futrts_clzz16(int16_t x)
-{
-    return clz(x);
-}
-static int32_t futrts_clzz32(int32_t x)
-{
-    return clz(x);
-}
-static int32_t futrts_clzz64(int64_t x)
-{
-    return clz(x);
-}
-#elif defined(__CUDA_ARCH__)
-static int32_t futrts_clzz8(int8_t x)
-{
-    return __clz(zext_i8_i32(x)) - 24;
-}
-static int32_t futrts_clzz16(int16_t x)
-{
-    return __clz(zext_i16_i32(x)) - 16;
-}
-static int32_t futrts_clzz32(int32_t x)
-{
-    return __clz(x);
-}
-static int32_t futrts_clzz64(int64_t x)
-{
-    return __clzll(x);
-}
-#else
-static int32_t futrts_clzz8(int8_t x)
-{
-    int n = 0;
-    int bits = sizeof(x) * 8;
-    
-    for (int i = 0; i < bits; i++) {
-        if (x < 0)
-            break;
-        n++;
-        x <<= 1;
-    }
-    return n;
-}
-static int32_t futrts_clzz16(int16_t x)
-{
-    int n = 0;
-    int bits = sizeof(x) * 8;
-    
-    for (int i = 0; i < bits; i++) {
-        if (x < 0)
-            break;
-        n++;
-        x <<= 1;
-    }
-    return n;
-}
-static int32_t futrts_clzz32(int32_t x)
-{
-    int n = 0;
-    int bits = sizeof(x) * 8;
-    
-    for (int i = 0; i < bits; i++) {
-        if (x < 0)
-            break;
-        n++;
-        x <<= 1;
-    }
-    return n;
-}
-static int32_t futrts_clzz64(int64_t x)
-{
-    int n = 0;
-    int bits = sizeof(x) * 8;
-    
-    for (int i = 0; i < bits; i++) {
-        if (x < 0)
-            break;
-        n++;
-        x <<= 1;
-    }
-    return n;
-}
-#endif
-#if defined(__OPENCL_VERSION__)
-static int32_t futrts_ctzz8(int8_t x)
-{
-    int i = 0;
-    
-    for (; i < 8 && (x & 1) == 0; i++, x >>= 1)
-        ;
-    return i;
-}
-static int32_t futrts_ctzz16(int16_t x)
-{
-    int i = 0;
-    
-    for (; i < 16 && (x & 1) == 0; i++, x >>= 1)
-        ;
-    return i;
-}
-static int32_t futrts_ctzz32(int32_t x)
-{
-    int i = 0;
-    
-    for (; i < 32 && (x & 1) == 0; i++, x >>= 1)
-        ;
-    return i;
-}
-static int32_t futrts_ctzz64(int64_t x)
-{
-    int i = 0;
-    
-    for (; i < 64 && (x & 1) == 0; i++, x >>= 1)
-        ;
-    return i;
-}
-#elif defined(__CUDA_ARCH__)
-static int32_t futrts_ctzz8(int8_t x)
-{
-    int y = __ffs(x);
-    
-    return y == 0 ? 8 : y - 1;
-}
-static int32_t futrts_ctzz16(int16_t x)
-{
-    int y = __ffs(x);
-    
-    return y == 0 ? 16 : y - 1;
-}
-static int32_t futrts_ctzz32(int32_t x)
-{
-    int y = __ffs(x);
-    
-    return y == 0 ? 32 : y - 1;
-}
-static int32_t futrts_ctzz64(int64_t x)
-{
-    int y = __ffsll(x);
-    
-    return y == 0 ? 64 : y - 1;
-}
-#else
-static int32_t futrts_ctzz8(int8_t x)
-{
-    return x == 0 ? 8 : __builtin_ctz((uint32_t) x);
-}
-static int32_t futrts_ctzz16(int16_t x)
-{
-    return x == 0 ? 16 : __builtin_ctz((uint32_t) x);
-}
-static int32_t futrts_ctzz32(int32_t x)
-{
-    return x == 0 ? 32 : __builtin_ctz(x);
-}
-static int32_t futrts_ctzz64(int64_t x)
-{
-    return x == 0 ? 64 : __builtin_ctzll(x);
-}
-#endif
-static inline float fdiv32(float x, float y)
-{
-    return x / y;
-}
-static inline float fadd32(float x, float y)
-{
-    return x + y;
-}
-static inline float fsub32(float x, float y)
-{
-    return x - y;
-}
-static inline float fmul32(float x, float y)
-{
-    return x * y;
-}
-static inline float fmin32(float x, float y)
-{
-    return fmin(x, y);
-}
-static inline float fmax32(float x, float y)
-{
-    return fmax(x, y);
-}
-static inline float fpow32(float x, float y)
-{
-    return pow(x, y);
-}
-static inline bool cmplt32(float x, float y)
-{
-    return x < y;
-}
-static inline bool cmple32(float x, float y)
-{
-    return x <= y;
-}
-static inline float sitofp_i8_f32(int8_t x)
-{
-    return (float) x;
-}
-static inline float sitofp_i16_f32(int16_t x)
-{
-    return (float) x;
-}
-static inline float sitofp_i32_f32(int32_t x)
-{
-    return (float) x;
-}
-static inline float sitofp_i64_f32(int64_t x)
-{
-    return (float) x;
-}
-static inline float uitofp_i8_f32(uint8_t x)
-{
-    return (float) x;
-}
-static inline float uitofp_i16_f32(uint16_t x)
-{
-    return (float) x;
-}
-static inline float uitofp_i32_f32(uint32_t x)
-{
-    return (float) x;
-}
-static inline float uitofp_i64_f32(uint64_t x)
-{
-    return (float) x;
-}
-static inline int8_t fptosi_f32_i8(float x)
-{
-    return (int8_t) x;
-}
-static inline int16_t fptosi_f32_i16(float x)
-{
-    return (int16_t) x;
-}
-static inline int32_t fptosi_f32_i32(float x)
-{
-    return (int32_t) x;
-}
-static inline int64_t fptosi_f32_i64(float x)
-{
-    return (int64_t) x;
-}
-static inline uint8_t fptoui_f32_i8(float x)
-{
-    return (uint8_t) x;
-}
-static inline uint16_t fptoui_f32_i16(float x)
-{
-    return (uint16_t) x;
-}
-static inline uint32_t fptoui_f32_i32(float x)
-{
-    return (uint32_t) x;
-}
-static inline uint64_t fptoui_f32_i64(float x)
-{
-    return (uint64_t) x;
-}
-static inline double fdiv64(double x, double y)
-{
-    return x / y;
-}
-static inline double fadd64(double x, double y)
-{
-    return x + y;
-}
-static inline double fsub64(double x, double y)
-{
-    return x - y;
-}
-static inline double fmul64(double x, double y)
-{
-    return x * y;
-}
-static inline double fmin64(double x, double y)
-{
-    return fmin(x, y);
-}
-static inline double fmax64(double x, double y)
-{
-    return fmax(x, y);
-}
-static inline double fpow64(double x, double y)
-{
-    return pow(x, y);
-}
-static inline bool cmplt64(double x, double y)
-{
-    return x < y;
-}
-static inline bool cmple64(double x, double y)
-{
-    return x <= y;
-}
-static inline double sitofp_i8_f64(int8_t x)
-{
-    return (double) x;
-}
-static inline double sitofp_i16_f64(int16_t x)
-{
-    return (double) x;
-}
-static inline double sitofp_i32_f64(int32_t x)
-{
-    return (double) x;
-}
-static inline double sitofp_i64_f64(int64_t x)
-{
-    return (double) x;
-}
-static inline double uitofp_i8_f64(uint8_t x)
-{
-    return (double) x;
-}
-static inline double uitofp_i16_f64(uint16_t x)
-{
-    return (double) x;
-}
-static inline double uitofp_i32_f64(uint32_t x)
-{
-    return (double) x;
-}
-static inline double uitofp_i64_f64(uint64_t x)
-{
-    return (double) x;
-}
-static inline int8_t fptosi_f64_i8(double x)
-{
-    return (int8_t) x;
-}
-static inline int16_t fptosi_f64_i16(double x)
-{
-    return (int16_t) x;
-}
-static inline int32_t fptosi_f64_i32(double x)
-{
-    return (int32_t) x;
-}
-static inline int64_t fptosi_f64_i64(double x)
-{
-    return (int64_t) x;
-}
-static inline uint8_t fptoui_f64_i8(double x)
-{
-    return (uint8_t) x;
-}
-static inline uint16_t fptoui_f64_i16(double x)
-{
-    return (uint16_t) x;
-}
-static inline uint32_t fptoui_f64_i32(double x)
-{
-    return (uint32_t) x;
-}
-static inline uint64_t fptoui_f64_i64(double x)
-{
-    return (uint64_t) x;
-}
-static inline float fpconv_f32_f32(float x)
-{
-    return (float) x;
-}
-static inline double fpconv_f32_f64(float x)
-{
-    return (double) x;
-}
-static inline float fpconv_f64_f32(double x)
-{
-    return (float) x;
-}
-static inline double fpconv_f64_f64(double x)
-{
-    return (double) x;
-}
-static inline bool futrts_isnan32(float x)
-{
-    return isnan(x);
-}
-static inline bool futrts_isinf32(float x)
-{
-    return isinf(x);
-}
-#ifdef __OPENCL_VERSION__
-static inline float futrts_log32(float x)
-{
-    return log(x);
-}
-static inline float futrts_log2_32(float x)
-{
-    return log2(x);
-}
-static inline float futrts_log10_32(float x)
-{
-    return log10(x);
-}
-static inline float futrts_sqrt32(float x)
-{
-    return sqrt(x);
-}
-static inline float futrts_exp32(float x)
-{
-    return exp(x);
-}
-static inline float futrts_cos32(float x)
-{
-    return cos(x);
-}
-static inline float futrts_sin32(float x)
-{
-    return sin(x);
-}
-static inline float futrts_tan32(float x)
-{
-    return tan(x);
-}
-static inline float futrts_acos32(float x)
-{
-    return acos(x);
-}
-static inline float futrts_asin32(float x)
-{
-    return asin(x);
-}
-static inline float futrts_atan32(float x)
-{
-    return atan(x);
-}
-static inline float futrts_cosh32(float x)
-{
-    return cosh(x);
-}
-static inline float futrts_sinh32(float x)
-{
-    return sinh(x);
-}
-static inline float futrts_tanh32(float x)
-{
-    return tanh(x);
-}
-static inline float futrts_acosh32(float x)
-{
-    return acosh(x);
-}
-static inline float futrts_asinh32(float x)
-{
-    return asinh(x);
-}
-static inline float futrts_atanh32(float x)
-{
-    return atanh(x);
-}
-static inline float futrts_atan2_32(float x, float y)
-{
-    return atan2(x, y);
-}
-static inline float futrts_gamma32(float x)
-{
-    return tgamma(x);
-}
-static inline float futrts_lgamma32(float x)
-{
-    return lgamma(x);
-}
-static inline float fmod32(float x, float y)
-{
-    return fmod(x, y);
-}
-static inline float futrts_round32(float x)
-{
-    return rint(x);
-}
-static inline float futrts_floor32(float x)
-{
-    return floor(x);
-}
-static inline float futrts_ceil32(float x)
-{
-    return ceil(x);
-}
-static inline float futrts_lerp32(float v0, float v1, float t)
-{
-    return mix(v0, v1, t);
-}
-static inline float futrts_mad32(float a, float b, float c)
-{
-    return mad(a, b, c);
-}
-static inline float futrts_fma32(float a, float b, float c)
-{
-    return fma(a, b, c);
-}
-#else
-static inline float futrts_log32(float x)
-{
-    return logf(x);
-}
-static inline float futrts_log2_32(float x)
-{
-    return log2f(x);
-}
-static inline float futrts_log10_32(float x)
-{
-    return log10f(x);
-}
-static inline float futrts_sqrt32(float x)
-{
-    return sqrtf(x);
-}
-static inline float futrts_exp32(float x)
-{
-    return expf(x);
-}
-static inline float futrts_cos32(float x)
-{
-    return cosf(x);
-}
-static inline float futrts_sin32(float x)
-{
-    return sinf(x);
-}
-static inline float futrts_tan32(float x)
-{
-    return tanf(x);
-}
-static inline float futrts_acos32(float x)
-{
-    return acosf(x);
-}
-static inline float futrts_asin32(float x)
-{
-    return asinf(x);
-}
-static inline float futrts_atan32(float x)
-{
-    return atanf(x);
-}
-static inline float futrts_cosh32(float x)
-{
-    return coshf(x);
-}
-static inline float futrts_sinh32(float x)
-{
-    return sinhf(x);
-}
-static inline float futrts_tanh32(float x)
-{
-    return tanhf(x);
-}
-static inline float futrts_acosh32(float x)
-{
-    return acoshf(x);
-}
-static inline float futrts_asinh32(float x)
-{
-    return asinhf(x);
-}
-static inline float futrts_atanh32(float x)
-{
-    return atanhf(x);
-}
-static inline float futrts_atan2_32(float x, float y)
-{
-    return atan2f(x, y);
-}
-static inline float futrts_gamma32(float x)
-{
-    return tgammaf(x);
-}
-static inline float futrts_lgamma32(float x)
-{
-    return lgammaf(x);
-}
-static inline float fmod32(float x, float y)
-{
-    return fmodf(x, y);
-}
-static inline float futrts_round32(float x)
-{
-    return rintf(x);
-}
-static inline float futrts_floor32(float x)
-{
-    return floorf(x);
-}
-static inline float futrts_ceil32(float x)
-{
-    return ceilf(x);
-}
-static inline float futrts_lerp32(float v0, float v1, float t)
-{
-    return v0 + (v1 - v0) * t;
-}
-static inline float futrts_mad32(float a, float b, float c)
-{
-    return a * b + c;
-}
-static inline float futrts_fma32(float a, float b, float c)
-{
-    return fmaf(a, b, c);
-}
-#endif
-static inline int32_t futrts_to_bits32(float x)
-{
-    union {
-        float f;
-        int32_t t;
-    } p;
-    
-    p.f = x;
-    return p.t;
-}
-static inline float futrts_from_bits32(int32_t x)
-{
-    union {
-        int32_t f;
-        float t;
-    } p;
-    
-    p.f = x;
-    return p.t;
-}
-static inline float fsignum32(float x)
-{
-    return futrts_isnan32(x) ? x : (x > 0) - (x < 0);
-}
-static inline double futrts_log64(double x)
-{
-    return log(x);
-}
-static inline double futrts_log2_64(double x)
-{
-    return log2(x);
-}
-static inline double futrts_log10_64(double x)
-{
-    return log10(x);
-}
-static inline double futrts_sqrt64(double x)
-{
-    return sqrt(x);
-}
-static inline double futrts_exp64(double x)
-{
-    return exp(x);
-}
-static inline double futrts_cos64(double x)
-{
-    return cos(x);
-}
-static inline double futrts_sin64(double x)
-{
-    return sin(x);
-}
-static inline double futrts_tan64(double x)
-{
-    return tan(x);
-}
-static inline double futrts_acos64(double x)
-{
-    return acos(x);
-}
-static inline double futrts_asin64(double x)
-{
-    return asin(x);
-}
-static inline double futrts_atan64(double x)
-{
-    return atan(x);
-}
-static inline double futrts_cosh64(double x)
-{
-    return cosh(x);
-}
-static inline double futrts_sinh64(double x)
-{
-    return sinh(x);
-}
-static inline double futrts_tanh64(double x)
-{
-    return tanh(x);
-}
-static inline double futrts_acosh64(double x)
-{
-    return acosh(x);
-}
-static inline double futrts_asinh64(double x)
-{
-    return asinh(x);
-}
-static inline double futrts_atanh64(double x)
-{
-    return atanh(x);
-}
-static inline double futrts_atan2_64(double x, double y)
-{
-    return atan2(x, y);
-}
-static inline double futrts_gamma64(double x)
-{
-    return tgamma(x);
-}
-static inline double futrts_lgamma64(double x)
-{
-    return lgamma(x);
-}
-static inline double futrts_fma64(double a, double b, double c)
-{
-    return fma(a, b, c);
-}
-static inline double futrts_round64(double x)
-{
-    return rint(x);
-}
-static inline double futrts_ceil64(double x)
-{
-    return ceil(x);
-}
-static inline double futrts_floor64(double x)
-{
-    return floor(x);
-}
-static inline bool futrts_isnan64(double x)
-{
-    return isnan(x);
-}
-static inline bool futrts_isinf64(double x)
-{
-    return isinf(x);
-}
-static inline int64_t futrts_to_bits64(double x)
-{
-    union {
-        double f;
-        int64_t t;
-    } p;
-    
-    p.f = x;
-    return p.t;
-}
-static inline double futrts_from_bits64(int64_t x)
-{
-    union {
-        int64_t f;
-        double t;
-    } p;
-    
-    p.f = x;
-    return p.t;
-}
-static inline double fmod64(double x, double y)
-{
-    return fmod(x, y);
-}
-static inline double fsignum64(double x)
-{
-    return futrts_isnan64(x) ? x : (x > 0) - (x < 0);
-}
-#ifdef __OPENCL_VERSION__
-static inline double futrts_lerp64(double v0, double v1, double t)
-{
-    return mix(v0, v1, t);
-}
-static inline double futrts_mad64(double a, double b, double c)
-{
-    return mad(a, b, c);
-}
-#else
-static inline double futrts_lerp64(double v0, double v1, double t)
-{
-    return v0 + (v1 - v0) * t;
-}
-static inline double futrts_mad64(double a, double b, double c)
-{
-    return a * b + c;
-}
-#endif
-static int init_constants(struct futhark_context *);
-static int free_constants(struct futhark_context *);
-struct memblock {
-    int *references;
-    char *mem;
-    int64_t size;
-    const char *desc;
-} ;
-// start of scheduler.h
-
-// First, the API that the generated code will access.  In principle,
-// we could then compile the scheduler separately and link an object
-// file with the generated code.  In practice, we will embed all of
-// this in the generated code.
-
-// Scheduler handle.
-struct scheduler;
-
-// Initialise a scheduler (and start worker threads).
-static int scheduler_init(struct scheduler *scheduler,
-                          int num_workers,
-                          double kappa);
-
-// Shut down a scheduler (and destroy worker threads).
-static int scheduler_destroy(struct scheduler *scheduler);
-
-// Figure out the smallest amount of work that amortises task
-// creation.
-static int determine_kappa(double *kappa);
-
-// How a segop should be scheduled.
-enum scheduling {
-  DYNAMIC,
-  STATIC
-};
-
-// How a given task should be executed.  Filled out by the scheduler
-// and passed to the segop function
-struct scheduler_info {
-  int64_t iter_pr_subtask;
-  int64_t remainder;
-  int nsubtasks;
-  enum scheduling sched;
-  int wake_up_threads;
-
-  int64_t *task_time;
-  int64_t *task_iter;
-};
-
-// A segop function.  This is what you hand the scheduler for
-// execution.
-typedef int (*segop_fn)(void* args,
-                        int64_t iterations,
-                        int tid,
-                        struct scheduler_info info);
-
-// A task for the scheduler to execute.
-struct scheduler_segop {
-  void *args;
-  segop_fn top_level_fn;
-  segop_fn nested_fn;
-  int64_t iterations;
-  enum scheduling sched;
-
-  // Pointers to timer and iter associated with the task
-  int64_t *task_time;
-  int64_t *task_iter;
-
-  // For debugging
-  const char* name;
-};
-
-static inline int scheduler_prepare_task(struct scheduler *scheduler,
-                                         struct scheduler_segop *task);
-
-typedef int (*parloop_fn)(void* args,
-                          int64_t start,
-                          int64_t end,
-                          int subtask_id,
-                          int tid);
-
-// A parallel parloop task.
-struct scheduler_parloop {
-  void* args;
-  parloop_fn fn;
-  int64_t iterations;
-  struct scheduler_info info;
-
-  // For debugging
-  const char* name;
-};
-
-static inline int scheduler_execute_task(struct scheduler *scheduler,
-                                         struct scheduler_parloop *task);
-
-// Then the API implementation.
-
-#include <signal.h>
-
-#if defined(_WIN32)
-#include <windows.h>
-#elif defined(__APPLE__)
-#include <sys/sysctl.h>
-// For getting cpu usage of threads
-#include <mach/mach.h>
-#include <sys/resource.h>
-#elif defined(__linux__)
-#include <sys/sysinfo.h>
-#include <sys/resource.h>
-#include <signal.h>
-#endif
-
-/* Multicore Utility functions */
-
-/* A wrapper for getting rusage on Linux and MacOS */
-/* TODO maybe figure out this for windows */
-static inline int getrusage_thread(struct rusage *rusage)
-{
-  int err = -1;
-#if  defined(__APPLE__)
-    thread_basic_info_data_t info = { 0 };
-    mach_msg_type_number_t info_count = THREAD_BASIC_INFO_COUNT;
-    kern_return_t kern_err;
-
-    kern_err = thread_info(mach_thread_self(),
-                           THREAD_BASIC_INFO,
-                           (thread_info_t)&info,
-                           &info_count);
-    if (kern_err == KERN_SUCCESS) {
-        memset(rusage, 0, sizeof(struct rusage));
-        rusage->ru_utime.tv_sec = info.user_time.seconds;
-        rusage->ru_utime.tv_usec = info.user_time.microseconds;
-        rusage->ru_stime.tv_sec = info.system_time.seconds;
-        rusage->ru_stime.tv_usec = info.system_time.microseconds;
-        err = 0;
-    } else {
-        errno = EINVAL;
-    }
-#elif defined(__linux__)
-    err = getrusage(RUSAGE_THREAD, rusage);
-#endif
-    return err;
-}
-
-/* returns the number of logical cores */
-static int num_processors()
-{
-#if  defined(_WIN32)
-/* https://docs.microsoft.com/en-us/windows/win32/api/sysinfoapi/ns-sysinfoapi-system_info */
-    SYSTEM_INFO sysinfo;
-    GetSystemInfo(&sysinfo);
-    int ncores = sysinfo.dwNumberOfProcessors;
-    fprintf(stderr, "Found %d cores on your Windows machine\n Is that correct?\n", ncores);
-    return ncores;
-#elif defined(__APPLE__)
-    int ncores;
-    size_t ncores_size = sizeof(ncores);
-    CHECK_ERRNO(sysctlbyname("hw.logicalcpu", &ncores, &ncores_size, NULL, 0),
-                "sysctlbyname (hw.logicalcpu)");
-    return ncores;
-#elif defined(__linux__)
-  return get_nprocs();
-#else
-  fprintf(stderr, "operating system not recognised\n");
-  return -1;
-#endif
-}
-
-static unsigned int g_seed;
-
-// Used to seed the generator.
-static inline void fast_srand(unsigned int seed) {
-    g_seed = seed;
-}
-
-// Compute a pseudorandom integer.
-// Output value in range [0, 32767]
-static inline unsigned int fast_rand(void) {
-    g_seed = (214013*g_seed+2531011);
-    return (g_seed>>16)&0x7FFF;
-}
-
-struct subtask_queue {
-  int capacity;             // Size of the buffer.
-  int first;                // Index of the start of the ring buffer.
-  int num_used;             // Number of used elements in the buffer.
-  struct subtask **buffer;
-
-  pthread_mutex_t mutex;    // Mutex used for synchronisation.
-  pthread_cond_t cond;      // Condition variable used for synchronisation.
-  int dead;
-
-#if defined(MCPROFILE)
-  /* Profiling fields */
-  uint64_t time_enqueue;
-  uint64_t time_dequeue;
-  uint64_t n_dequeues;
-  uint64_t n_enqueues;
-#endif
-};
-
-/* A subtask that can be executed by a worker */
-struct subtask {
-  /* The parloop function */
-  parloop_fn fn;
-  /* Execution parameters */
-  void* args;
-  int64_t start, end;
-  int id;
-
-  /* Dynamic scheduling parameters */
-  int chunkable;
-  int64_t chunk_size;
-
-  /* Shared variables across subtasks */
-  volatile int *counter; // Counter for ongoing subtasks
-  // Shared task timers and iterators
-  int64_t *task_time;
-  int64_t *task_iter;
-
-  /* For debugging */
-  const char *name;
-};
-
-
-struct worker {
-  pthread_t thread;
-  struct scheduler *scheduler;  /* Reference to the scheduler struct the worker belongs to*/
-  struct subtask_queue q;
-  int dead;
-  int tid;                      /* Just a thread id */
-
-  /* "thread local" time fields used for online algorithm */
-  uint64_t timer;
-  uint64_t total;
-  int nested; /* How nested the current computation is */
-
-  // Profiling fields
-  int output_usage;            /* Whether to dump thread usage */
-  uint64_t time_spent_working; /* Time spent in parloop functions */
-};
-
-static inline void output_worker_usage(struct worker *worker)
-{
-  struct rusage usage;
-  CHECK_ERRNO(getrusage_thread(&usage), "getrusage_thread");
-  struct timeval user_cpu_time = usage.ru_utime;
-  struct timeval sys_cpu_time = usage.ru_stime;
-  fprintf(stderr, "tid: %2d - work time %10llu us - user time: %10llu us - sys: %10llu us\n",
-          worker->tid,
-          (long long unsigned)worker->time_spent_working / 1000,
-          (long long unsigned)(user_cpu_time.tv_sec * 1000000 + user_cpu_time.tv_usec),
-          (long long unsigned)(sys_cpu_time.tv_sec * 1000000 + sys_cpu_time.tv_usec));
-}
-
-/* Doubles the size of the queue */
-static inline int subtask_queue_grow_queue(struct subtask_queue *subtask_queue) {
-
-  int new_capacity = 2 * subtask_queue->capacity;
-#ifdef MCDEBUG
-  fprintf(stderr, "Growing queue to %d\n", subtask_queue->capacity * 2);
-#endif
-
-  struct subtask **new_buffer = calloc(new_capacity, sizeof(struct subtask*));
-  for (int i = 0; i < subtask_queue->num_used; i++) {
-    new_buffer[i] = subtask_queue->buffer[(subtask_queue->first + i) % subtask_queue->capacity];
-  }
-
-  free(subtask_queue->buffer);
-  subtask_queue->buffer = new_buffer;
-  subtask_queue->capacity = new_capacity;
-  subtask_queue->first = 0;
-
-  return 0;
-}
-
-// Initialise a job queue with the given capacity.  The queue starts out
-// empty.  Returns non-zero on error.
-static inline int subtask_queue_init(struct subtask_queue *subtask_queue, int capacity)
-{
-  assert(subtask_queue != NULL);
-  memset(subtask_queue, 0, sizeof(struct subtask_queue));
-
-  subtask_queue->capacity = capacity;
-  subtask_queue->buffer = calloc(capacity, sizeof(struct subtask*));
-  if (subtask_queue->buffer == NULL) {
-    return -1;
-  }
-
-  CHECK_ERRNO(pthread_mutex_init(&subtask_queue->mutex, NULL), "pthread_mutex_init");
-  CHECK_ERRNO(pthread_cond_init(&subtask_queue->cond, NULL), "pthread_cond_init");
-
-  return 0;
-}
-
-// Destroy the job queue.  Blocks until the queue is empty before it
-// is destroyed.
-static inline int subtask_queue_destroy(struct subtask_queue *subtask_queue)
-{
-  assert(subtask_queue != NULL);
-
-  CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock");
-
-  while (subtask_queue->num_used != 0) {
-    CHECK_ERR(pthread_cond_wait(&subtask_queue->cond, &subtask_queue->mutex), "pthread_cond_wait");
-  }
-
-  // Queue is now empty.  Let's kill it!
-  subtask_queue->dead = 1;
-  free(subtask_queue->buffer);
-  CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast");
-  CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock");
-
-  return 0;
-}
-
-static inline void dump_queue(struct worker *worker)
-{
-  struct subtask_queue *subtask_queue = &worker->q;
-  CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock");
-  for (int i = 0; i < subtask_queue->num_used; i++) {
-    struct subtask * subtask = subtask_queue->buffer[(subtask_queue->first + i) % subtask_queue->capacity];
-    printf("queue tid %d with %d task %s\n", worker->tid, i, subtask->name);
-  }
-  CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast");
-  CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock");
-}
-
-// Push an element onto the end of the job queue.  Blocks if the
-// subtask_queue is full (its size is equal to its capacity).  Returns
-// non-zero on error.  It is an error to push a job onto a queue that
-// has been destroyed.
-static inline int subtask_queue_enqueue(struct worker *worker, struct subtask *subtask )
-{
-  assert(worker != NULL);
-  struct subtask_queue *subtask_queue = &worker->q;
-
-#ifdef MCPROFILE
-  uint64_t start = get_wall_time();
-#endif
-
-  CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock");
-  // Wait until there is room in the subtask_queue.
-  while (subtask_queue->num_used == subtask_queue->capacity && !subtask_queue->dead) {
-    if (subtask_queue->num_used == subtask_queue->capacity) {
-      CHECK_ERR(subtask_queue_grow_queue(subtask_queue), "subtask_queue_grow_queue");
-      continue;
-    }
-    CHECK_ERR(pthread_cond_wait(&subtask_queue->cond, &subtask_queue->mutex), "pthread_cond_wait");
-  }
-
-  if (subtask_queue->dead) {
-    CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock");
-    return -1;
-  }
-
-  // If we made it past the loop, there is room in the subtask_queue.
-  subtask_queue->buffer[(subtask_queue->first + subtask_queue->num_used) % subtask_queue->capacity] = subtask;
-  subtask_queue->num_used++;
-
-#ifdef MCPROFILE
-  uint64_t end = get_wall_time();
-  subtask_queue->time_enqueue += (end - start);
-  subtask_queue->n_enqueues++;
-#endif
-  // Broadcast a reader (if any) that there is now an element.
-  CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast");
-  CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock");
-
-  return 0;
-}
-
-
-/* Like subtask_queue_dequeue, but with two differences:
-   1) the subtask is stolen from the __front__ of the queue
-   2) returns immediately if there is no subtasks queued,
-      as we dont' want to block on another workers queue and
-*/
-static inline int subtask_queue_steal(struct worker *worker,
-                                      struct subtask **subtask)
-{
-  struct subtask_queue *subtask_queue = &worker->q;
-  assert(subtask_queue != NULL);
-
-#ifdef MCPROFILE
-  uint64_t start = get_wall_time();
-#endif
-  CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock");
-
-  if (subtask_queue->num_used == 0) {
-    CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast");
-    CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock");
-    return 1;
-  }
-
-  if (subtask_queue->dead) {
-    CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock");
-    return -1;
-  }
-
-  // Tasks gets stolen from the "front"
-  struct subtask *cur_back = subtask_queue->buffer[subtask_queue->first];
-  struct subtask *new_subtask = NULL;
-  int remaining_iter = cur_back->end - cur_back->start;
-  // If subtask is chunkable, we steal half of the iterations
-  if (cur_back->chunkable && remaining_iter > 1) {
-      int64_t half = remaining_iter / 2;
-      new_subtask = malloc(sizeof(struct subtask));
-      *new_subtask = *cur_back;
-      new_subtask->start = cur_back->end - half;
-      cur_back->end = new_subtask->start;
-      __atomic_fetch_add(cur_back->counter, 1, __ATOMIC_RELAXED);
-  } else {
-    new_subtask = cur_back;
-    subtask_queue->num_used--;
-    subtask_queue->first = (subtask_queue->first + 1) % subtask_queue->capacity;
-  }
-  *subtask = new_subtask;
-
-  if (*subtask == NULL) {
-    CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthred_mutex_unlock");
-    return 1;
-  }
-
-#ifdef MCPROFILE
-  uint64_t end = get_wall_time();
-  subtask_queue->time_dequeue += (end - start);
-  subtask_queue->n_dequeues++;
-#endif
-
-  // Broadcast a writer (if any) that there is now room for more.
-  CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast");
-  CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock");
-
-  return 0;
-}
-
-
-// Pop an element from the back of the job queue.
-// Optional argument can be provided to block or not
-static inline int subtask_queue_dequeue(struct worker *worker,
-                                        struct subtask **subtask, int blocking)
-{
-  assert(worker != NULL);
-  struct subtask_queue *subtask_queue = &worker->q;
-
-#ifdef MCPROFILE
-  uint64_t start = get_wall_time();
-#endif
-
-  CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock");
-  if (subtask_queue->num_used == 0 && !blocking) {
-    CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock");
-    return 1;
-  }
-  // Try to steal some work while the subtask_queue is empty
-  while (subtask_queue->num_used == 0 && !subtask_queue->dead) {
-    pthread_cond_wait(&subtask_queue->cond, &subtask_queue->mutex);
-  }
-
-  if (subtask_queue->dead) {
-    CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock");
-    return -1;
-  }
-
-  // dequeue pops from the back
-  *subtask = subtask_queue->buffer[(subtask_queue->first + subtask_queue->num_used - 1) % subtask_queue->capacity];
-  subtask_queue->num_used--;
-
-  if (*subtask == NULL) {
-    assert(!"got NULL ptr");
-    CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthred_mutex_unlock");
-    return -1;
-  }
-
-#ifdef MCPROFILE
-  uint64_t end = get_wall_time();
-  subtask_queue->time_dequeue += (end - start);
-  subtask_queue->n_dequeues++;
-#endif
-
-  // Broadcast a writer (if any) that there is now room for more.
-  CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast");
-  CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock");
-
-  return 0;
-}
-
-static inline int subtask_queue_is_empty(struct subtask_queue *subtask_queue)
-{
-  return subtask_queue->num_used == 0;
-}
-
-/* Scheduler definitions */
-
-struct scheduler {
-  struct worker *workers;
-  int num_threads;
-
-  // If there is work to steal => active_work > 0
-  volatile int active_work;
-
-  // Only one error can be returned at the time now.  Maybe we can
-  // provide a stack like structure for pushing errors onto if we wish
-  // to backpropagte multiple errors
-  volatile int error;
-
-  // kappa time unit in nanoseconds
-  double kappa;
-};
-
-
-// Thread local variable worker struct
-// Note that, accesses to tls variables are expensive
-// Minimize direct references to this variable
-__thread struct worker* worker_local = NULL;
-
-static int64_t total_now(int64_t total, int64_t time) {
-  return total + (get_wall_time_ns() - time);
-}
-
-static int random_other_worker(struct scheduler *scheduler, int my_id) {
-  int my_num_workers = scheduler->num_threads;
-  assert(my_num_workers != 1);
-  int i = fast_rand() % (my_num_workers - 1);
-  if (i >= my_id) {
-    i++;
-  }
-#ifdef MCDEBUG
-  assert(i >= 0);
-  assert(i < my_num_workers);
-  assert(i != my_id);
-#endif
-
-  return i;
-}
-
-
-static inline int64_t compute_chunk_size(double kappa, struct subtask* subtask)
-{
-  double C = (double)*subtask->task_time / (double)*subtask->task_iter;
-  if (C == 0.0F) C += DBL_EPSILON;
-  return smax64((int64_t)(kappa / C), 1);
-}
-
-/* Takes a chunk from subtask and enqueues the remaining iterations onto the worker's queue */
-/* A no-op if the subtask is not chunkable */
-static inline struct subtask* chunk_subtask(struct worker* worker, struct subtask *subtask)
-{
-  if (subtask->chunkable) {
-    // Do we have information from previous runs avaliable
-    if (*subtask->task_iter > 0) {
-      subtask->chunk_size = compute_chunk_size(worker->scheduler->kappa, subtask);
-      assert(subtask->chunk_size > 0);
-    }
-    int64_t remaining_iter = subtask->end - subtask->start;
-    assert(remaining_iter > 0);
-    if (remaining_iter > subtask->chunk_size) {
-      struct subtask *new_subtask = malloc(sizeof(struct subtask));
-      *new_subtask = *subtask;
-      // increment the subtask join counter to account for new subtask
-      __atomic_fetch_add(subtask->counter, 1, __ATOMIC_RELAXED);
-      // Update range parameters
-      subtask->end = subtask->start + subtask->chunk_size;
-      new_subtask->start = subtask->end;
-      subtask_queue_enqueue(worker, new_subtask);
-    }
-  }
-  return subtask;
-}
-
-static inline int run_subtask(struct worker* worker, struct subtask* subtask)
-{
-  assert(subtask != NULL);
-  assert(worker != NULL);
-
-  subtask = chunk_subtask(worker, subtask);
-  worker->total = 0;
-  worker->timer = get_wall_time_ns();
-#if defined(MCPROFILE)
-  int64_t start = worker->timer;
-#endif
-  worker->nested++;
-  int err = subtask->fn(subtask->args, subtask->start, subtask->end,
-                        subtask->chunkable ? worker->tid : subtask->id,
-                        worker->tid);
-  worker->nested--;
-  // Some error occured during some other subtask
-  // so we just clean-up and return
-  if (worker->scheduler->error != 0) {
-    // Even a failed task counts as finished.
-    __atomic_fetch_sub(subtask->counter, 1, __ATOMIC_RELAXED);
-    free(subtask);
-    return 0;
-  }
-  if (err != 0) {
-    __atomic_store_n(&worker->scheduler->error, err, __ATOMIC_RELAXED);
-  }
-  // Total sequential time spent
-  int64_t time_elapsed = total_now(worker->total, worker->timer);
-#if defined(MCPROFILE)
-  worker->time_spent_working += get_wall_time_ns() - start;
-#endif
-  int64_t iter = subtask->end - subtask->start;
-  // report measurements
-  // These updates should really be done using a single atomic CAS operation
-  __atomic_fetch_add(subtask->task_time, time_elapsed, __ATOMIC_RELAXED);
-  __atomic_fetch_add(subtask->task_iter, iter, __ATOMIC_RELAXED);
-  // We need a fence here, since if the counter is decremented before either
-  // of the two above are updated bad things can happen, e.g. if they are stack-allocated
-  __atomic_thread_fence(__ATOMIC_SEQ_CST);
-  __atomic_fetch_sub(subtask->counter, 1, __ATOMIC_RELAXED);
-  free(subtask);
-  return 0;
-}
-
-
-static inline int is_small(struct scheduler_segop *task, struct scheduler *scheduler, int *nsubtasks)
-{
-  int64_t time = *task->task_time;
-  int64_t iter = *task->task_iter;
-
-  if (task->sched == DYNAMIC || iter == 0) {
-    *nsubtasks = scheduler->num_threads;
-    return 0;
-  }
-
-  // Estimate the constant C
-  double C = (double)time / (double)iter;
-  double cur_task_iter = (double) task->iterations;
-
-  // Returns true if the task is small i.e.
-  // if the number of iterations times C is smaller
-  // than the overhead of subtask creation
-  if (C == 0.0F || C * cur_task_iter < scheduler->kappa) {
-    *nsubtasks = 1;
-    return 1;
-  }
-
-  // Else compute how many subtasks this tasks should create
-  int64_t min_iter_pr_subtask = smax64(scheduler->kappa / C, 1);
-  *nsubtasks = smin64(smax64(task->iterations / min_iter_pr_subtask, 1), scheduler->num_threads);
-
-  return 0;
-}
-
-// TODO make this prettier
-static inline struct subtask* create_subtask(parloop_fn fn,
-                                             void* args,
-                                             const char* name,
-                                             volatile int* counter,
-                                             int64_t *timer,
-                                             int64_t *iter,
-                                             int64_t start, int64_t end,
-                                             int chunkable,
-                                             int64_t chunk_size,
-                                             int id)
-{
-  struct subtask* subtask = malloc(sizeof(struct subtask));
-  if (subtask == NULL) {
-    assert(!"malloc failed in create_subtask");
-    return NULL;
-  }
-  subtask->fn         = fn;
-  subtask->args       = args;
-
-  subtask->counter    = counter;
-  subtask->task_time  = timer;
-  subtask->task_iter  = iter;
-
-  subtask->start      = start;
-  subtask->end        = end;
-  subtask->id         = id;
-  subtask->chunkable  = chunkable;
-  subtask->chunk_size = chunk_size;
-
-  subtask->name       = name;
-  return subtask;
-}
-
-static int dummy_counter = 0;
-static int64_t dummy_timer = 0;
-static int64_t dummy_iter = 0;
-
-static int dummy_fn(void *args, int64_t start, int64_t end, int subtask_id, int tid) {
-  (void)args;
-  (void)start;
-  (void)end;
-  (void)subtask_id;
-  (void)tid;
-  return 0;
-}
-
-// Wake up threads, who are blocking by pushing a dummy task
-// onto their queue
-static inline void wake_up_threads(struct scheduler *scheduler, int start_tid, int end_tid) {
-
-#if defined(MCDEBUG)
-  assert(start_tid >= 1);
-  assert(end_tid <= scheduler->num_threads);
-#endif
-  for (int i = start_tid; i < end_tid; i++) {
-    struct subtask *subtask = create_subtask(dummy_fn, NULL, "dummy_fn",
-                                            &dummy_counter,
-                                            &dummy_timer, &dummy_iter,
-                                            0, 0,
-                                            0, 0,
-                                            0);
-    CHECK_ERR(subtask_queue_enqueue(&scheduler->workers[i], subtask), "subtask_queue_enqueue");
-  }
-}
-
-static inline int is_finished(struct worker *worker) {
-  return worker->dead && subtask_queue_is_empty(&worker->q);
-}
-
-// Try to steal from a random queue
-static inline int steal_from_random_worker(struct worker* worker)
-{
-  int my_id = worker->tid;
-  struct scheduler* scheduler = worker->scheduler;
-  int k = random_other_worker(scheduler, my_id);
-  struct worker *worker_k = &scheduler->workers[k];
-  struct subtask* subtask =  NULL;
-  int retval = subtask_queue_steal(worker_k, &subtask);
-  if (retval == 0) {
-    subtask_queue_enqueue(worker, subtask);
-    return 1;
-  }
-  return 0;
-}
-
-
-static inline void *scheduler_worker(void* args)
-{
-  struct worker *worker = (struct worker*) args;
-  struct scheduler *scheduler = worker->scheduler;
-  worker_local = worker;
-  struct subtask *subtask = NULL;
-
-  while(!is_finished(worker)) {
-    if (!subtask_queue_is_empty(&worker->q)) {
-      int retval = subtask_queue_dequeue(worker, &subtask, 0);
-      if (retval == 0) {
-        assert(subtask != NULL);
-        CHECK_ERR(run_subtask(worker, subtask), "run_subtask");
-      } // else someone stole our work
-
-    } else if (scheduler->active_work) { /* steal */
-      while (!is_finished(worker) && scheduler->active_work) {
-        if (steal_from_random_worker(worker)) {
-          break;
-        }
-      }
-    } else { /* go back to sleep and wait for work */
-      int retval = subtask_queue_dequeue(worker, &subtask, 1);
-      if (retval == 0) {
-        assert(subtask != NULL);
-        CHECK_ERR(run_subtask(worker, subtask), "run_subtask");
-      }
-    }
-  }
-
-  assert(subtask_queue_is_empty(&worker->q));
-#if defined(MCPROFILE)
-  if (worker->output_usage)
-    output_worker_usage(worker);
-#endif
-  return NULL;
-}
-
-
-static inline int scheduler_execute_parloop(struct scheduler *scheduler,
-                                            struct scheduler_parloop *task,
-                                            int64_t *timer)
-{
-
-  struct worker *worker = worker_local;
-
-  struct scheduler_info info = task->info;
-  int64_t iter_pr_subtask = info.iter_pr_subtask;
-  int64_t remainder = info.remainder;
-  int nsubtasks = info.nsubtasks;
-  volatile int join_counter = nsubtasks;
-
-  // Shared timer used to sum up all
-  // sequential work from each subtask
-  int64_t task_timer = 0;
-  int64_t task_iter = 0;
-
-  enum scheduling sched = info.sched;
-  /* If each subtasks should be processed in chunks */
-  int chunkable = sched == STATIC ? 0 : 1;
-  int64_t chunk_size = 1; // The initial chunk size when no info is avaliable
-
-
-  if (info.wake_up_threads || sched == DYNAMIC)
-    __atomic_add_fetch(&scheduler->active_work, nsubtasks, __ATOMIC_RELAXED);
-
-  int64_t start = 0;
-  int64_t end = iter_pr_subtask + (int64_t)(remainder != 0);
-  for (int subtask_id = 0; subtask_id < nsubtasks; subtask_id++) {
-    struct subtask *subtask = create_subtask(task->fn, task->args, task->name,
-                                              &join_counter,
-                                              &task_timer, &task_iter,
-                                              start, end,
-                                              chunkable, chunk_size,
-                                              subtask_id);
-    assert(subtask != NULL);
-    // In most cases we will never have more subtasks than workers,
-    // but there can be exceptions (e.g. the kappa tuning function).
-    struct worker *subtask_worker =
-      worker->nested
-      ? &scheduler->workers[worker->tid]
-      : &scheduler->workers[subtask_id % scheduler->num_threads];
-    CHECK_ERR(subtask_queue_enqueue(subtask_worker, subtask),
-              "subtask_queue_enqueue");
-    // Update range params
-    start = end;
-    end += iter_pr_subtask + ((subtask_id + 1) < remainder);
-  }
-
-  if (info.wake_up_threads) {
-    wake_up_threads(scheduler, nsubtasks, scheduler->num_threads);
-  }
-
-  // Join (wait for subtasks to finish)
-  while(join_counter != 0) {
-    if (!subtask_queue_is_empty(&worker->q)) {
-      struct subtask *subtask = NULL;
-      int err = subtask_queue_dequeue(worker, &subtask, 0);
-      if (err == 0 ) {
-        CHECK_ERR(run_subtask(worker, subtask), "run_subtask");
-      }
-    } else {
-      if (steal_from_random_worker(worker)) {
-        struct subtask *subtask = NULL;
-        int err = subtask_queue_dequeue(worker, &subtask, 0);
-        if (err == 0) {
-          CHECK_ERR(run_subtask(worker, subtask), "run_subtask");
-        }
-      }
-    }
-  }
-
-
-  if (info.wake_up_threads || sched == DYNAMIC) {
-    __atomic_sub_fetch(&scheduler->active_work, nsubtasks, __ATOMIC_RELAXED);
-  }
-
-  // Write back timing results of all sequential work
-  (*timer) += task_timer;
-  return scheduler->error;
-}
-
-
-static inline int scheduler_execute_task(struct scheduler *scheduler,
-                                         struct scheduler_parloop *task)
-{
-
-  struct worker *worker = worker_local;
-
-  int err = 0;
-
-  // How much sequential work was performed by the task
-  int64_t task_timer = 0;
-
-  /* Execute task sequential or parallel based on decision made earlier */
-  if (task->info.nsubtasks == 1) {
-    int64_t start = get_wall_time_ns();
-    err = task->fn(task->args, 0, task->iterations, 0, worker->tid);
-    int64_t end = get_wall_time_ns();
-    task_timer = end - start;
-    worker->time_spent_working += task_timer;
-    // Report time measurements
-    // TODO the update of both of these should really be a single atomic!!
-    __atomic_fetch_add(task->info.task_time, task_timer, __ATOMIC_RELAXED);
-    __atomic_fetch_add(task->info.task_iter, task->iterations, __ATOMIC_RELAXED);
-  } else {
-    // Add "before" time if we already are inside a task
-    int64_t time_before = 0;
-    if (worker->nested > 0) {
-      time_before = total_now(worker->total, worker->timer);
-    }
-
-    err = scheduler_execute_parloop(scheduler, task, &task_timer);
-
-    // Report time measurements
-    // TODO the update of both of these should really be a single atomic!!
-    __atomic_fetch_add(task->info.task_time, task_timer, __ATOMIC_RELAXED);
-    __atomic_fetch_add(task->info.task_iter, task->iterations, __ATOMIC_RELAXED);
-
-    // Update timers to account for new timings
-    worker->total = time_before + task_timer;
-    worker->timer = get_wall_time_ns();
-  }
-
-
-  return err;
-}
-
-/* Decide on how schedule the incoming task i.e. how many subtasks and
-   to run sequential or (potentially nested) parallel code body */
-static inline int scheduler_prepare_task(struct scheduler* scheduler,
-                                         struct scheduler_segop *task)
-{
-  assert(task != NULL);
-
-  struct worker *worker = worker_local;
-  struct scheduler_info info;
-  info.task_time = task->task_time;
-  info.task_iter = task->task_iter;
-
-  int nsubtasks;
-  // Decide if task should be scheduled sequentially
-  if (is_small(task, scheduler, &nsubtasks)) {
-    info.iter_pr_subtask = task->iterations;
-    info.remainder = 0;
-    info.nsubtasks = nsubtasks;
-    return task->top_level_fn(task->args, task->iterations, worker->tid, info);
-  } else {
-    info.iter_pr_subtask = task->iterations / nsubtasks;
-    info.remainder = task->iterations % nsubtasks;
-    info.sched = task->sched;
-    switch (task->sched) {
-    case STATIC:
-      info.nsubtasks = info.iter_pr_subtask == 0 ? info.remainder : ((task->iterations - info.remainder) / info.iter_pr_subtask);
-      break;
-    case DYNAMIC:
-      // As any thread can take any subtasks, we are being safe with using
-      // an upper bound on the number of tasks such that the task allocate enough memory
-      info.nsubtasks = info.iter_pr_subtask == 0 ? info.remainder : nsubtasks;
-      break;
-    default:
-      assert(!"Got unknown scheduling");
-    }
-  }
-
-  info.wake_up_threads = 0;
-  // We only use the nested parallel segop function if we can't exchaust all cores
-  // using the outer most level
-  if (task->nested_fn != NULL && info.nsubtasks < scheduler->num_threads && info.nsubtasks == task->iterations) {
-    if (worker->nested == 0)
-      info.wake_up_threads = 1;
-    return task->nested_fn(task->args, task->iterations, worker->tid, info);
-  }
-
-  return task->top_level_fn(task->args, task->iterations, worker->tid, info);
-}
-
-// Now some code for finding the proper value of kappa on a given
-// machine (the smallest amount of work that amortises the cost of
-// task creation).
-
-struct tuning_struct {
-  int32_t *free_tuning_res;
-  int32_t *array;
-};
-
-// Reduction function over an integer array
-static int tuning_loop(void *args, int64_t start, int64_t end,
-                                     int flat_tid, int tid) {
-  (void)flat_tid;
-  (void)tid;
-
-  int err = 0;
-  struct tuning_struct *tuning_struct = (struct tuning_struct *) args;
-  int32_t *array = tuning_struct->array;
-  int32_t *tuning_res = tuning_struct->free_tuning_res;
-
-  int32_t sum = 0;
-  for (int i = start; i < end; i++) {
-    int32_t y = array[i];
-    sum = add32(sum, y);
-  }
-  *tuning_res = sum;
-  return err;
-}
-
-// The main entry point for the tuning process.  Sets the provided
-// variable ``kappa``.
-static int determine_kappa(double *kappa) {
-  int err = 0;
-
-  int64_t iterations = 100000000;
-  int64_t tuning_time = 0;
-  int64_t tuning_iter = 0;
-
-  int32_t *array = malloc(sizeof(int32_t) * iterations);
-  for (int64_t i = 0; i < iterations; i++) {
-    array[i] = fast_rand();
-  }
-
-  int64_t start_tuning = get_wall_time_ns();
-  /* **************************** */
-  /* Run sequential reduce first' */
-  /* **************************** */
-  int64_t tuning_sequentiual_start = get_wall_time_ns();
-  struct tuning_struct tuning_struct;
-  int32_t tuning_res;
-  tuning_struct.free_tuning_res = &tuning_res;
-  tuning_struct.array = array;
-
-  err = tuning_loop(&tuning_struct, 0, iterations, 0, 0);
-  int64_t tuning_sequentiual_end = get_wall_time_ns();
-  int64_t sequential_elapsed = tuning_sequentiual_end - tuning_sequentiual_start;
-
-  double C = (double)sequential_elapsed / (double)iterations;
-  fprintf(stderr, " Time for sequential run is %lld - Found C %f\n", (long long)sequential_elapsed, C);
-
-  /* ********************** */
-  /* Now run tuning process */
-  /* ********************** */
-  // Setup a scheduler with a single worker
-  struct scheduler scheduler;
-  scheduler.num_threads = 1;
-  scheduler.workers = malloc(sizeof(struct worker));
-  worker_local = &scheduler.workers[0];
-  worker_local->tid = 0;
-  CHECK_ERR(subtask_queue_init(&scheduler.workers[0].q, 1024),
-            "failed to init queue for worker %d\n", 0);
-
-  // Start tuning for kappa
-  double kappa_tune = 1000; // Initial kappa is 1 us
-  double ratio;
-  int64_t time_elapsed;
-  while(1) {
-    int64_t min_iter_pr_subtask = (int64_t) (kappa_tune / C) == 0 ? 1 : (kappa_tune / C);
-    int nsubtasks = iterations / min_iter_pr_subtask;
-    struct scheduler_info info;
-    info.iter_pr_subtask = min_iter_pr_subtask;
-
-    info.nsubtasks = iterations / min_iter_pr_subtask;
-    info.remainder = iterations % min_iter_pr_subtask;
-    info.task_time = &tuning_time;
-    info.task_iter = &tuning_iter;
-    info.sched = STATIC;
-
-    struct scheduler_parloop parloop;
-    parloop.name = "tuning_loop";
-    parloop.fn = tuning_loop;
-    parloop.args = &tuning_struct;
-    parloop.iterations = iterations;
-    parloop.info = info;
-
-    int64_t tuning_chunked_start = get_wall_time_ns();
-    int determine_kappa_err =
-      scheduler_execute_task(&scheduler,
-                             &parloop);
-    assert(determine_kappa_err == 0);
-    int64_t tuning_chunked_end = get_wall_time_ns();
-    time_elapsed =  tuning_chunked_end - tuning_chunked_start;
-
-    ratio = (double)time_elapsed / (double)sequential_elapsed;
-    if (ratio < 1.055) {
-      break;
-    }
-    kappa_tune += 100; // Increase by 100 ns at the time
-    fprintf(stderr, "nsubtask %d - kappa %f - ratio %f\n", nsubtasks, kappa_tune, ratio);
-  }
-
-  int64_t end_tuning = get_wall_time_ns();
-  fprintf(stderr, "tuning took %lld ns and found kappa %f - time %lld - ratio %f\n",
-          (long long)end_tuning - start_tuning,
-          kappa_tune,
-          (long long)time_elapsed,
-          ratio);
-  *kappa = kappa_tune;
-
-  // Clean-up
-  CHECK_ERR(subtask_queue_destroy(&scheduler.workers[0].q), "failed to destroy queue");
-  free(array);
-  free(scheduler.workers);
-  return err;
-}
-
-static int scheduler_init(struct scheduler *scheduler,
-                          int num_workers,
-                          double kappa) {
-  assert(num_workers > 0);
-
-  scheduler->kappa = kappa;
-  scheduler->num_threads = num_workers;
-  scheduler->active_work = 0;
-  scheduler->error = 0;
-
-  scheduler->workers = calloc(num_workers, sizeof(struct worker));
-
-  const int queue_capacity = 1024;
-
-  worker_local = &scheduler->workers[0];
-  worker_local->tid = 0;
-  worker_local->scheduler = scheduler;
-  CHECK_ERR(subtask_queue_init(&worker_local->q, queue_capacity),
-            "failed to init queue for worker %d\n", 0);
-
-  for (int i = 1; i < num_workers; i++) {
-    struct worker *cur_worker = &scheduler->workers[i];
-    memset(cur_worker, 0, sizeof(struct worker));
-    cur_worker->tid = i;
-    cur_worker->output_usage = 0;
-    cur_worker->scheduler = scheduler;
-    CHECK_ERR(subtask_queue_init(&cur_worker->q, queue_capacity),
-              "failed to init queue for worker %d\n", i);
-
-    CHECK_ERR(pthread_create(&cur_worker->thread,
-                             NULL,
-                             &scheduler_worker,
-                             cur_worker),
-              "Failed to create worker %d\n", i);
-  }
-
-  return 0;
-}
-
-static int scheduler_destroy(struct scheduler *scheduler) {
-  // First mark them all as dead.
-  for (int i = 1; i < scheduler->num_threads; i++) {
-    struct worker *cur_worker = &scheduler->workers[i];
-    cur_worker->dead = 1;
-  }
-
-  // Then destroy their task queues (this will wake up the threads and
-  // make them do their shutdown).
-  for (int i = 1; i < scheduler->num_threads; i++) {
-    struct worker *cur_worker = &scheduler->workers[i];
-    subtask_queue_destroy(&cur_worker->q);
-  }
-
-  // Then actually wait for them to stop.
-  for (int i = 1; i < scheduler->num_threads; i++) {
-    struct worker *cur_worker = &scheduler->workers[i];
-    CHECK_ERR(pthread_join(scheduler->workers[i].thread, NULL), "pthread_join");
-  }
-
-  free(scheduler->workers);
-
-  return 0;
-}
-
-// End of scheduler.h
-
-struct futhark_context_config {
-    int debugging;
-    int profiling;
-    int num_threads;
-} ;
-struct futhark_context_config *futhark_context_config_new(void)
-{
-    struct futhark_context_config *cfg =
-                                  (struct futhark_context_config *) malloc(sizeof(struct futhark_context_config));
-    
-    if (cfg == NULL)
-        return NULL;
-    cfg->debugging = 0;
-    cfg->profiling = 0;
-    cfg->num_threads = 0;
-    return cfg;
-}
-void futhark_context_config_free(struct futhark_context_config *cfg)
-{
-    free(cfg);
-}
-void futhark_context_config_set_debugging(struct futhark_context_config *cfg,
-                                          int detail)
-{
-    cfg->debugging = detail;
-}
-void futhark_context_config_set_profiling(struct futhark_context_config *cfg,
-                                          int flag)
-{
-    cfg->profiling = flag;
-}
-void futhark_context_config_set_logging(struct futhark_context_config *cfg,
-                                        int detail)
-{
-    /* Does nothing for this backend. */
-    (void) cfg;
-    (void) detail;
-}
-void futhark_context_config_set_num_threads(struct futhark_context_config *cfg,
-                                            int n)
-{
-    cfg->num_threads = n;
-}
-struct futhark_context {
-    struct scheduler scheduler;
-    int detail_memory;
-    int debugging;
-    int profiling;
-    int profiling_paused;
-    int logging;
-    lock_t lock;
-    char *error;
-    FILE *log;
-    int total_runs;
-    long total_runtime;
-    int64_t peak_mem_usage_default;
-    int64_t cur_mem_usage_default;
-    struct {
-        int dummy;
-    } constants;
-    int64_t *futhark_mc_segmap_parloop_6011_total_runtime;
-    int *futhark_mc_segmap_parloop_6011_runs;
-    int64_t *futhark_mc_segmap_parloop_6011_iter;
-    int64_t futhark_mc_segmap_parloop_6011_total_total_runtime;
-    int futhark_mc_segmap_parloop_6011_total_runs;
-    int64_t futhark_mc_segmap_parloop_6011_total_iter;
-    int64_t *futhark_mc_segmap_task_6009_total_runtime;
-    int *futhark_mc_segmap_task_6009_runs;
-    int64_t *futhark_mc_segmap_task_6009_iter;
-    int64_t futhark_mc_segmap_task_6009_total_time;
-    int64_t futhark_mc_segmap_task_6009_total_iter;
-    int64_t *futhark_mc_segmap_parloop_6020_total_runtime;
-    int *futhark_mc_segmap_parloop_6020_runs;
-    int64_t *futhark_mc_segmap_parloop_6020_iter;
-    int64_t futhark_mc_segmap_parloop_6020_total_total_runtime;
-    int futhark_mc_segmap_parloop_6020_total_runs;
-    int64_t futhark_mc_segmap_parloop_6020_total_iter;
-    int64_t *futhark_mc_segmap_task_6018_total_runtime;
-    int *futhark_mc_segmap_task_6018_runs;
-    int64_t *futhark_mc_segmap_task_6018_iter;
-    int64_t futhark_mc_segmap_task_6018_total_time;
-    int64_t futhark_mc_segmap_task_6018_total_iter;
-    int64_t *futhark_mc_segmap_parloop_6015_total_runtime;
-    int *futhark_mc_segmap_parloop_6015_runs;
-    int64_t *futhark_mc_segmap_parloop_6015_iter;
-    int64_t futhark_mc_segmap_parloop_6015_total_total_runtime;
-    int futhark_mc_segmap_parloop_6015_total_runs;
-    int64_t futhark_mc_segmap_parloop_6015_total_iter;
-    int64_t *futhark_mc_segmap_nested_task_6013_total_runtime;
-    int *futhark_mc_segmap_nested_task_6013_runs;
-    int64_t *futhark_mc_segmap_nested_task_6013_iter;
-    int64_t tuning_timing;
-    int64_t tuning_iter;
-} ;
-struct futhark_context *futhark_context_new(struct futhark_context_config *cfg)
-{
-    struct futhark_context *ctx =
-                           (struct futhark_context *) malloc(sizeof(struct futhark_context));
-    
-    if (ctx == NULL)
-        return NULL;
-    fast_srand(time(0));
-    ctx->detail_memory = cfg->debugging;
-    ctx->debugging = cfg->debugging;
-    ctx->profiling = cfg->profiling;
-    ctx->profiling_paused = 0;
-    ctx->logging = 0;
-    ctx->error = NULL;
-    ctx->log = stderr;
-    create_lock(&ctx->lock);
-    
-    int tune_kappa = 0;
-    double kappa = 5.1f * 1000;
-    
-    if (tune_kappa) {
-        if (determine_kappa(&kappa) != 0)
-            return NULL;
-    }
-    if (scheduler_init(&ctx->scheduler, cfg->num_threads >
-                       0 ? cfg->num_threads : num_processors(), kappa) != 0)
-        return NULL;
-    ctx->peak_mem_usage_default = 0;
-    ctx->cur_mem_usage_default = 0;
-    ctx->futhark_mc_segmap_parloop_6011_total_runtime = calloc(sizeof(int64_t),
-                                                               ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_parloop_6011_runs = calloc(sizeof(int),
-                                                      ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_parloop_6011_iter = calloc(sizeof(sizeof(int64_t)),
-                                                      ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_parloop_6011_total_total_runtime = 0;
-    ctx->futhark_mc_segmap_parloop_6011_total_runs = 0;
-    ctx->futhark_mc_segmap_parloop_6011_total_iter = 0;
-    ctx->futhark_mc_segmap_task_6009_total_runtime = calloc(sizeof(int64_t),
-                                                            ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_task_6009_runs = calloc(sizeof(int),
-                                                   ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_task_6009_iter = calloc(sizeof(sizeof(int64_t)),
-                                                   ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_task_6009_total_time = 0;
-    ctx->futhark_mc_segmap_task_6009_total_iter = 0;
-    ctx->futhark_mc_segmap_parloop_6020_total_runtime = calloc(sizeof(int64_t),
-                                                               ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_parloop_6020_runs = calloc(sizeof(int),
-                                                      ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_parloop_6020_iter = calloc(sizeof(sizeof(int64_t)),
-                                                      ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_parloop_6020_total_total_runtime = 0;
-    ctx->futhark_mc_segmap_parloop_6020_total_runs = 0;
-    ctx->futhark_mc_segmap_parloop_6020_total_iter = 0;
-    ctx->futhark_mc_segmap_task_6018_total_runtime = calloc(sizeof(int64_t),
-                                                            ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_task_6018_runs = calloc(sizeof(int),
-                                                   ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_task_6018_iter = calloc(sizeof(sizeof(int64_t)),
-                                                   ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_task_6018_total_time = 0;
-    ctx->futhark_mc_segmap_task_6018_total_iter = 0;
-    ctx->futhark_mc_segmap_parloop_6015_total_runtime = calloc(sizeof(int64_t),
-                                                               ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_parloop_6015_runs = calloc(sizeof(int),
-                                                      ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_parloop_6015_iter = calloc(sizeof(sizeof(int64_t)),
-                                                      ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_parloop_6015_total_total_runtime = 0;
-    ctx->futhark_mc_segmap_parloop_6015_total_runs = 0;
-    ctx->futhark_mc_segmap_parloop_6015_total_iter = 0;
-    ctx->futhark_mc_segmap_nested_task_6013_total_runtime =
-        calloc(sizeof(int64_t), ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_nested_task_6013_runs = calloc(sizeof(int),
-                                                          ctx->scheduler.num_threads);
-    ctx->futhark_mc_segmap_nested_task_6013_iter =
-        calloc(sizeof(sizeof(int64_t)), ctx->scheduler.num_threads);
-    init_constants(ctx);
-    return ctx;
-}
-void futhark_context_free(struct futhark_context *ctx)
-{
-    free_constants(ctx);
-    (void) scheduler_destroy(&ctx->scheduler);
-    free_lock(&ctx->lock);
-    free(ctx);
-}
-int futhark_context_sync(struct futhark_context *ctx)
-{
-    (void) ctx;
-    return 0;
-}
-static const char *size_names[0];
-static const char *size_vars[0];
-static const char *size_classes[0];
-int futhark_context_config_set_size(struct futhark_context_config *cfg, const
-                                    char *size_name, size_t size_value)
-{
-    (void) cfg;
-    (void) size_name;
-    (void) size_value;
-    return 1;
-}
-static int memblock_unref(struct futhark_context *ctx, struct memblock *block,
-                          const char *desc)
-{
-    if (block->references != NULL) {
-        *block->references -= 1;
-        if (ctx->detail_memory)
-            fprintf(ctx->log,
-                    "Unreferencing block %s (allocated as %s) in %s: %d references remaining.\n",
-                    desc, block->desc, "default space", *block->references);
-        if (*block->references == 0) {
-            ctx->cur_mem_usage_default -= block->size;
-            free(block->mem);
-            free(block->references);
-            if (ctx->detail_memory)
-                fprintf(ctx->log,
-                        "%lld bytes freed (now allocated: %lld bytes)\n",
-                        (long long) block->size,
-                        (long long) ctx->cur_mem_usage_default);
-        }
-        block->references = NULL;
-    }
-    return 0;
-}
-static int memblock_alloc(struct futhark_context *ctx, struct memblock *block,
-                          int64_t size, const char *desc)
-{
-    if (size < 0)
-        futhark_panic(1,
-                      "Negative allocation of %lld bytes attempted for %s in %s.\n",
-                      (long long) size, desc, "default space",
-                      ctx->cur_mem_usage_default);
-    
-    int ret = memblock_unref(ctx, block, desc);
-    
-    ctx->cur_mem_usage_default += size;
-    if (ctx->detail_memory)
-        fprintf(ctx->log,
-                "Allocating %lld bytes for %s in %s (then allocated: %lld bytes)",
-                (long long) size, desc, "default space",
-                (long long) ctx->cur_mem_usage_default);
-    if (ctx->cur_mem_usage_default > ctx->peak_mem_usage_default) {
-        ctx->peak_mem_usage_default = ctx->cur_mem_usage_default;
-        if (ctx->detail_memory)
-            fprintf(ctx->log, " (new peak).\n");
-    } else if (ctx->detail_memory)
-        fprintf(ctx->log, ".\n");
-    block->mem = (char *) malloc(size);
-    block->references = (int *) malloc(sizeof(int));
-    *block->references = 1;
-    block->size = size;
-    block->desc = desc;
-    return ret;
-}
-static int memblock_set(struct futhark_context *ctx, struct memblock *lhs,
-                        struct memblock *rhs, const char *lhs_desc)
-{
-    int ret = memblock_unref(ctx, lhs, lhs_desc);
-    
-    if (rhs->references != NULL)
-        (*rhs->references)++;
-    *lhs = *rhs;
-    return ret;
-}
-int futhark_get_num_sizes(void)
-{
-    return sizeof(size_names) / sizeof(size_names[0]);
-}
-const char *futhark_get_size_name(int i)
-{
-    return size_names[i];
-}
-const char *futhark_get_size_class(int i)
-{
-    return size_classes[i];
-}
-char *futhark_context_report(struct futhark_context *ctx)
-{
-    struct str_builder builder;
-    
-    str_builder_init(&builder);
-    if (ctx->detail_memory || ctx->profiling || ctx->logging) {
-        { }
-    }
-    if (ctx->profiling) {
-        for (int i = 0; i < ctx->scheduler.num_threads; i++)
-            fprintf(ctx->log,
-                    "tid %2d - futhark_mc_segmap_parloop_6011           ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n",
-                    i, ctx->futhark_mc_segmap_parloop_6011_runs[i],
-                    (long) ctx->futhark_mc_segmap_parloop_6011_total_runtime[i] /
-                    (ctx->futhark_mc_segmap_parloop_6011_runs[i] !=
-                     0 ? ctx->futhark_mc_segmap_parloop_6011_runs[i] : 1),
-                    (long) ctx->futhark_mc_segmap_parloop_6011_total_runtime[i],
-                    (double) ctx->futhark_mc_segmap_parloop_6011_total_runtime[i] /
-                    (ctx->futhark_mc_segmap_parloop_6011_iter[i] ==
-                     0 ? 1 : (double) ctx->futhark_mc_segmap_parloop_6011_iter[i]),
-                    (long) ctx->futhark_mc_segmap_parloop_6011_iter[i],
-                    (long) ctx->futhark_mc_segmap_parloop_6011_iter[i] /
-                    (ctx->futhark_mc_segmap_parloop_6011_runs[i] !=
-                     0 ? ctx->futhark_mc_segmap_parloop_6011_runs[i] : 1));
-        fprintf(ctx->log,
-                "         futhark_mc_segmap_parloop_6011_total     ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n",
-                ctx->futhark_mc_segmap_parloop_6011_total_runs,
-                (long) ctx->futhark_mc_segmap_parloop_6011_total_total_runtime /
-                (ctx->futhark_mc_segmap_parloop_6011_total_runs !=
-                 0 ? ctx->futhark_mc_segmap_parloop_6011_total_runs : 1),
-                (long) ctx->futhark_mc_segmap_parloop_6011_total_total_runtime,
-                (double) ctx->futhark_mc_segmap_parloop_6011_total_total_runtime /
-                (ctx->futhark_mc_segmap_parloop_6011_total_iter ==
-                 0 ? 1 : (double) ctx->futhark_mc_segmap_parloop_6011_total_iter),
-                (long) ctx->futhark_mc_segmap_parloop_6011_total_iter,
-                (long) ctx->futhark_mc_segmap_parloop_6011_total_iter /
-                (ctx->futhark_mc_segmap_parloop_6011_total_runs !=
-                 0 ? ctx->futhark_mc_segmap_parloop_6011_total_runs : 1));
-        ctx->total_runtime +=
-            ctx->futhark_mc_segmap_parloop_6011_total_total_runtime;
-        ctx->total_runs += ctx->futhark_mc_segmap_parloop_6011_total_runs;
-        for (int i = 0; i < ctx->scheduler.num_threads; i++)
-            fprintf(ctx->log,
-                    "tid %2d - futhark_mc_segmap_parloop_6020           ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n",
-                    i, ctx->futhark_mc_segmap_parloop_6020_runs[i],
-                    (long) ctx->futhark_mc_segmap_parloop_6020_total_runtime[i] /
-                    (ctx->futhark_mc_segmap_parloop_6020_runs[i] !=
-                     0 ? ctx->futhark_mc_segmap_parloop_6020_runs[i] : 1),
-                    (long) ctx->futhark_mc_segmap_parloop_6020_total_runtime[i],
-                    (double) ctx->futhark_mc_segmap_parloop_6020_total_runtime[i] /
-                    (ctx->futhark_mc_segmap_parloop_6020_iter[i] ==
-                     0 ? 1 : (double) ctx->futhark_mc_segmap_parloop_6020_iter[i]),
-                    (long) ctx->futhark_mc_segmap_parloop_6020_iter[i],
-                    (long) ctx->futhark_mc_segmap_parloop_6020_iter[i] /
-                    (ctx->futhark_mc_segmap_parloop_6020_runs[i] !=
-                     0 ? ctx->futhark_mc_segmap_parloop_6020_runs[i] : 1));
-        fprintf(ctx->log,
-                "         futhark_mc_segmap_parloop_6020_total     ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n",
-                ctx->futhark_mc_segmap_parloop_6020_total_runs,
-                (long) ctx->futhark_mc_segmap_parloop_6020_total_total_runtime /
-                (ctx->futhark_mc_segmap_parloop_6020_total_runs !=
-                 0 ? ctx->futhark_mc_segmap_parloop_6020_total_runs : 1),
-                (long) ctx->futhark_mc_segmap_parloop_6020_total_total_runtime,
-                (double) ctx->futhark_mc_segmap_parloop_6020_total_total_runtime /
-                (ctx->futhark_mc_segmap_parloop_6020_total_iter ==
-                 0 ? 1 : (double) ctx->futhark_mc_segmap_parloop_6020_total_iter),
-                (long) ctx->futhark_mc_segmap_parloop_6020_total_iter,
-                (long) ctx->futhark_mc_segmap_parloop_6020_total_iter /
-                (ctx->futhark_mc_segmap_parloop_6020_total_runs !=
-                 0 ? ctx->futhark_mc_segmap_parloop_6020_total_runs : 1));
-        ctx->total_runtime +=
-            ctx->futhark_mc_segmap_parloop_6020_total_total_runtime;
-        ctx->total_runs += ctx->futhark_mc_segmap_parloop_6020_total_runs;
-        for (int i = 0; i < ctx->scheduler.num_threads; i++)
-            fprintf(ctx->log,
-                    "tid %2d - futhark_mc_segmap_task_6018              ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n",
-                    i, ctx->futhark_mc_segmap_task_6018_runs[i],
-                    (long) ctx->futhark_mc_segmap_task_6018_total_runtime[i] /
-                    (ctx->futhark_mc_segmap_task_6018_runs[i] !=
-                     0 ? ctx->futhark_mc_segmap_task_6018_runs[i] : 1),
-                    (long) ctx->futhark_mc_segmap_task_6018_total_runtime[i],
-                    (double) ctx->futhark_mc_segmap_task_6018_total_runtime[i] /
-                    (ctx->futhark_mc_segmap_task_6018_iter[i] ==
-                     0 ? 1 : (double) ctx->futhark_mc_segmap_task_6018_iter[i]),
-                    (long) ctx->futhark_mc_segmap_task_6018_iter[i],
-                    (long) ctx->futhark_mc_segmap_task_6018_iter[i] /
-                    (ctx->futhark_mc_segmap_task_6018_runs[i] !=
-                     0 ? ctx->futhark_mc_segmap_task_6018_runs[i] : 1));
-        for (int i = 0; i < ctx->scheduler.num_threads; i++)
-            fprintf(ctx->log,
-                    "tid %2d - futhark_mc_segmap_parloop_6015           ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n",
-                    i, ctx->futhark_mc_segmap_parloop_6015_runs[i],
-                    (long) ctx->futhark_mc_segmap_parloop_6015_total_runtime[i] /
-                    (ctx->futhark_mc_segmap_parloop_6015_runs[i] !=
-                     0 ? ctx->futhark_mc_segmap_parloop_6015_runs[i] : 1),
-                    (long) ctx->futhark_mc_segmap_parloop_6015_total_runtime[i],
-                    (double) ctx->futhark_mc_segmap_parloop_6015_total_runtime[i] /
-                    (ctx->futhark_mc_segmap_parloop_6015_iter[i] ==
-                     0 ? 1 : (double) ctx->futhark_mc_segmap_parloop_6015_iter[i]),
-                    (long) ctx->futhark_mc_segmap_parloop_6015_iter[i],
-                    (long) ctx->futhark_mc_segmap_parloop_6015_iter[i] /
-                    (ctx->futhark_mc_segmap_parloop_6015_runs[i] !=
-                     0 ? ctx->futhark_mc_segmap_parloop_6015_runs[i] : 1));
-        fprintf(ctx->log,
-                "         futhark_mc_segmap_parloop_6015_total     ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n",
-                ctx->futhark_mc_segmap_parloop_6015_total_runs,
-                (long) ctx->futhark_mc_segmap_parloop_6015_total_total_runtime /
-                (ctx->futhark_mc_segmap_parloop_6015_total_runs !=
-                 0 ? ctx->futhark_mc_segmap_parloop_6015_total_runs : 1),
-                (long) ctx->futhark_mc_segmap_parloop_6015_total_total_runtime,
-                (double) ctx->futhark_mc_segmap_parloop_6015_total_total_runtime /
-                (ctx->futhark_mc_segmap_parloop_6015_total_iter ==
-                 0 ? 1 : (double) ctx->futhark_mc_segmap_parloop_6015_total_iter),
-                (long) ctx->futhark_mc_segmap_parloop_6015_total_iter,
-                (long) ctx->futhark_mc_segmap_parloop_6015_total_iter /
-                (ctx->futhark_mc_segmap_parloop_6015_total_runs !=
-                 0 ? ctx->futhark_mc_segmap_parloop_6015_total_runs : 1));
-        ctx->total_runtime +=
-            ctx->futhark_mc_segmap_parloop_6015_total_total_runtime;
-        ctx->total_runs += ctx->futhark_mc_segmap_parloop_6015_total_runs;
-        for (int i = 0; i < ctx->scheduler.num_threads; i++)
-            fprintf(ctx->log,
-                    "tid %2d - futhark_mc_segmap_task_6009              ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n",
-                    i, ctx->futhark_mc_segmap_task_6009_runs[i],
-                    (long) ctx->futhark_mc_segmap_task_6009_total_runtime[i] /
-                    (ctx->futhark_mc_segmap_task_6009_runs[i] !=
-                     0 ? ctx->futhark_mc_segmap_task_6009_runs[i] : 1),
-                    (long) ctx->futhark_mc_segmap_task_6009_total_runtime[i],
-                    (double) ctx->futhark_mc_segmap_task_6009_total_runtime[i] /
-                    (ctx->futhark_mc_segmap_task_6009_iter[i] ==
-                     0 ? 1 : (double) ctx->futhark_mc_segmap_task_6009_iter[i]),
-                    (long) ctx->futhark_mc_segmap_task_6009_iter[i],
-                    (long) ctx->futhark_mc_segmap_task_6009_iter[i] /
-                    (ctx->futhark_mc_segmap_task_6009_runs[i] !=
-                     0 ? ctx->futhark_mc_segmap_task_6009_runs[i] : 1));
-        for (int i = 0; i < ctx->scheduler.num_threads; i++)
-            fprintf(ctx->log,
-                    "tid %2d - futhark_mc_segmap_nested_task_6013       ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n",
-                    i, ctx->futhark_mc_segmap_nested_task_6013_runs[i],
-                    (long) ctx->futhark_mc_segmap_nested_task_6013_total_runtime[i] /
-                    (ctx->futhark_mc_segmap_nested_task_6013_runs[i] !=
-                     0 ? ctx->futhark_mc_segmap_nested_task_6013_runs[i] : 1),
-                    (long) ctx->futhark_mc_segmap_nested_task_6013_total_runtime[i],
-                    (double) ctx->futhark_mc_segmap_nested_task_6013_total_runtime[i] /
-                    (ctx->futhark_mc_segmap_nested_task_6013_iter[i] ==
-                     0 ? 1 : (double) ctx->futhark_mc_segmap_nested_task_6013_iter[i]),
-                    (long) ctx->futhark_mc_segmap_nested_task_6013_iter[i],
-                    (long) ctx->futhark_mc_segmap_nested_task_6013_iter[i] /
-                    (ctx->futhark_mc_segmap_nested_task_6013_runs[i] !=
-                     0 ? ctx->futhark_mc_segmap_nested_task_6013_runs[i] : 1));
-    }
-    return builder.str;
-}
-char *futhark_context_get_error(struct futhark_context *ctx)
-{
-    char *error = ctx->error;
-    
-    ctx->error = NULL;
-    return error;
-}
-void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f)
-{
-    ctx->log = f;
-}
-void futhark_context_pause_profiling(struct futhark_context *ctx)
-{
-    ctx->profiling_paused = 1;
-}
-void futhark_context_unpause_profiling(struct futhark_context *ctx)
-{
-    ctx->profiling_paused = 0;
-}
-int futhark_context_clear_caches(struct futhark_context *ctx)
-{
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    ctx->peak_mem_usage_default = 0;
-    lock_unlock(&ctx->lock);
-    return ctx->error != NULL;
-}
-static int futrts_init(struct futhark_context *ctx,
-                       struct memblock *out_mem_p_5993,
-                       int64_t *out_scalar_out_5994,
-                       int64_t *out_scalar_out_5995,
-                       int64_t *out_scalar_out_5996,
-                       struct memblock board_mem_5945, int64_t n_5861,
-                       int64_t nb_rows_5863, int64_t nb_columns_5864,
-                       int64_t sizze_5865);
-static int futrts_key(struct futhark_context *ctx,
-                      struct memblock *out_mem_p_5997,
-                      int64_t *out_scalar_out_5998,
-                      int64_t *out_scalar_out_5999,
-                      int64_t *out_scalar_out_6000,
-                      struct memblock board_mem_5945, int64_t implz2080U_5881,
-                      int32_t e_5882, int32_t key_5883, int64_t nb_columns_5885,
-                      int64_t nb_rows_5886, int64_t sizze_5887);
-static int futrts_mouse(struct futhark_context *ctx,
-                        struct memblock *out_mem_p_6001,
-                        int64_t *out_scalar_out_6002,
-                        int64_t *out_scalar_out_6003,
-                        int64_t *out_scalar_out_6004,
-                        struct memblock board_mem_5945, int64_t implz2080U_5866,
-                        int32_t buttons_5867, int32_t x_5868, int32_t y_5869,
-                        int64_t nb_columns_5871, int64_t nb_rows_5872,
-                        int64_t sizze_5873);
-static int futrts_render(struct futhark_context *ctx,
-                         struct memblock *out_mem_p_6005,
-                         int64_t *out_out_arrsizze_6006,
-                         int64_t *out_out_arrsizze_6007,
-                         struct memblock board_mem_5945,
-                         int64_t implz2080U_5888, int64_t nb_columns_5890,
-                         int64_t nb_rows_5891, int64_t sizze_5892);
-static int futrts_resizze(struct futhark_context *ctx,
-                          struct memblock *out_mem_p_6021,
-                          int64_t *out_scalar_out_6022,
-                          int64_t *out_scalar_out_6023,
-                          int64_t *out_scalar_out_6024,
-                          struct memblock board_mem_5945,
-                          int64_t implz2080U_5848, int64_t h_5849,
-                          int64_t w_5850, int64_t nb_columns_5852,
-                          int64_t nb_rows_5853, int64_t sizze_5854);
-static int futrts_step(struct futhark_context *ctx,
-                       struct memblock *out_mem_p_6025,
-                       int64_t *out_scalar_out_6026,
-                       int64_t *out_scalar_out_6027,
-                       int64_t *out_scalar_out_6028,
-                       struct memblock board_mem_5945, int64_t implz2080U_5855,
-                       float nameless_5856, int64_t nb_columns_5858,
-                       int64_t nb_rows_5859, int64_t sizze_5860);
-static int futrts_wheel(struct futhark_context *ctx,
-                        struct memblock *out_mem_p_6029,
-                        int64_t *out_scalar_out_6030,
-                        int64_t *out_scalar_out_6031,
-                        int64_t *out_scalar_out_6032,
-                        struct memblock board_mem_5945, int64_t implz2080U_5874,
-                        int32_t dx_5875, int32_t dy_5876,
-                        int64_t nb_columns_5878, int64_t nb_rows_5879,
-                        int64_t sizze_5880);
-static int init_constants(struct futhark_context *ctx)
-{
-    (void) ctx;
-    
-    int err = 0;
-    
-    
-  cleanup:
-    return err;
-}
-static int free_constants(struct futhark_context *ctx)
-{
-    (void) ctx;
-    return 0;
-}
-struct futhark_mc_task_6008 {
-    struct futhark_context *ctx;
-    int64_t free_implz2080U_5888;
-    int64_t free_nb_columns_5890;
-    char *free_board_mem_5945;
-    int64_t free_bytes_5946;
-    char *free_mem_5964;
-} ;
-struct futhark_mc_segmap_parloop_struct_6010 {
-    struct futhark_context *ctx;
-    int64_t free_implz2080U_5888;
-    int64_t free_nb_columns_5890;
-    char *free_board_mem_5945;
-    int64_t free_bytes_5946;
-    char *free_mem_5964;
-} ;
-static int futhark_mc_segmap_parloop_6011(void *args, int64_t start,
-                                          int64_t end, int flat_tid_5915,
-                                          int tid)
-{
-    int err = 0;
-    struct futhark_mc_segmap_parloop_struct_6010
-    *futhark_mc_segmap_parloop_struct_6010 =
-    (struct futhark_mc_segmap_parloop_struct_6010 *) args;
-    struct futhark_context *ctx = futhark_mc_segmap_parloop_struct_6010->ctx;
-    uint64_t futhark_mc_segmap_parloop_6011_start = 0;
-    
-    if (ctx->profiling && !ctx->profiling_paused)
-        futhark_mc_segmap_parloop_6011_start = get_wall_time();
-    
-    int64_t implz2080U_5888 =
-            futhark_mc_segmap_parloop_struct_6010->free_implz2080U_5888;
-    int64_t nb_columns_5890 =
-            futhark_mc_segmap_parloop_struct_6010->free_nb_columns_5890;
-    struct memblock board_mem_5945 = {.desc ="board_mem_5945", .mem =
-                                      futhark_mc_segmap_parloop_struct_6010->free_board_mem_5945,
-                                      .size =0, .references =NULL};
-    int64_t bytes_5946 = futhark_mc_segmap_parloop_struct_6010->free_bytes_5946;
-    struct memblock mem_5964 = {.desc ="mem_5964", .mem =
-                                futhark_mc_segmap_parloop_struct_6010->free_mem_5964,
-                                .size =0, .references =NULL};
-    size_t mem_5949_cached_sizze_6012 = 0;
-    char *mem_5949 = NULL;
-    int64_t iterations = end - start;
-    int64_t iter_5969 = start;
-    
-    if (mem_5949_cached_sizze_6012 < (size_t) bytes_5946) {
-        mem_5949 = realloc(mem_5949, bytes_5946);
-        mem_5949_cached_sizze_6012 = bytes_5946;
-    }
-    for (; iter_5969 < end; iter_5969++) {
-        if (ctx->debugging)
-            fprintf(ctx->log, "%s\n", "SegMap fbody");
-        
-        int64_t gtid_5916;
-        
-        gtid_5916 = iter_5969;
-        
-        int64_t x_5903;
-        
-        x_5903 = mul64(nb_columns_5890, gtid_5916);
-        for (int64_t i_5970 = 0; i_5970 < nb_columns_5890; i_5970++) {
-            int64_t get_cell_index_res_5972 = add64(x_5903, i_5970);
-            bool x_5973 = sle64((int64_t) 0, get_cell_index_res_5972);
-            bool y_5974 = slt64(get_cell_index_res_5972, implz2080U_5888);
-            bool bounds_check_5975 = x_5973 && y_5974;
-            bool index_certs_5976;
-            
-            if (!bounds_check_5975) {
-                ctx->error =
-                    msgprintf("Error: %s%lld%s%lld%s\n\nBacktrace:\n%s",
-                              "Index [", get_cell_index_res_5972,
-                              "] out of bounds for array of shape [",
-                              implz2080U_5888, "].",
-                              "-> #0  /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:30:24-63\n   #1  /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:30:10-107\n   #2  /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:29:8-31:12\n   #3  /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:26:1-31:12\n");
-                return 1;
-            }
-            
-            int8_t x_5977 =
-                   ((int8_t *) board_mem_5945.mem)[get_cell_index_res_5972];
-            bool cond_5978 = x_5977 == (int8_t) 1;
-            int32_t defunc_0_f_res_5979;
-            
-            if (cond_5978) {
-                defunc_0_f_res_5979 = -1;
-            } else {
-                defunc_0_f_res_5979 = -16777216;
-            }
-            ((int32_t *) mem_5949)[i_5970] = defunc_0_f_res_5979;
-        }
-        memmove(mem_5964.mem + gtid_5916 * nb_columns_5890 * (int64_t) 4,
-                mem_5949 + (int64_t) 0, nb_columns_5890 *
-                (int64_t) sizeof(int32_t));
-    }
-    
-  cleanup:
-    { }
-    free(mem_5949);
-    if (ctx->profiling && !ctx->profiling_paused) {
-        uint64_t futhark_mc_segmap_parloop_6011_end = get_wall_time();
-        uint64_t elapsed = futhark_mc_segmap_parloop_6011_end -
-                 futhark_mc_segmap_parloop_6011_start;
-        
-        ctx->futhark_mc_segmap_parloop_6011_runs[tid]++;
-        ctx->futhark_mc_segmap_parloop_6011_total_runtime[tid] += elapsed;
-        ctx->futhark_mc_segmap_parloop_6011_iter[tid] += iterations;
-    }
-    return err;
-}
-int futhark_mc_segmap_task_6009(void *args, int64_t iterations, int tid,
-                                struct scheduler_info info)
-{
-    int err = 0;
-    int flat_tid_5915 = tid;
-    int num_tasks_5968 = info.nsubtasks;
-    struct futhark_mc_task_6008 *futhark_mc_task_6008 =
-                                (struct futhark_mc_task_6008 *) args;
-    struct futhark_context *ctx = futhark_mc_task_6008->ctx;
-    uint64_t futhark_mc_segmap_task_6009_start = 0;
-    
-    if (ctx->profiling && !ctx->profiling_paused)
-        futhark_mc_segmap_task_6009_start = get_wall_time();
-    
-    int64_t implz2080U_5888 = futhark_mc_task_6008->free_implz2080U_5888;
-    int64_t nb_columns_5890 = futhark_mc_task_6008->free_nb_columns_5890;
-    struct memblock board_mem_5945 = {.desc ="board_mem_5945", .mem =
-                                      futhark_mc_task_6008->free_board_mem_5945,
-                                      .size =0, .references =NULL};
-    int64_t bytes_5946 = futhark_mc_task_6008->free_bytes_5946;
-    struct memblock mem_5964 = {.desc ="mem_5964", .mem =
-                                futhark_mc_task_6008->free_mem_5964, .size =0,
-                                .references =NULL};
-    int64_t iter_5969;
-    struct futhark_mc_segmap_parloop_struct_6010
-    futhark_mc_segmap_parloop_struct_6010;
-    
-    futhark_mc_segmap_parloop_struct_6010.ctx = ctx;
-    futhark_mc_segmap_parloop_struct_6010.free_implz2080U_5888 =
-        implz2080U_5888;
-    futhark_mc_segmap_parloop_struct_6010.free_nb_columns_5890 =
-        nb_columns_5890;
-    futhark_mc_segmap_parloop_struct_6010.free_board_mem_5945 =
-        board_mem_5945.mem;
-    futhark_mc_segmap_parloop_struct_6010.free_bytes_5946 = bytes_5946;
-    futhark_mc_segmap_parloop_struct_6010.free_mem_5964 = mem_5964.mem;
-    
-    struct scheduler_parloop futhark_mc_segmap_parloop_6011_task;
-    
-    futhark_mc_segmap_parloop_6011_task.name = "futhark_mc_segmap_parloop_6011";
-    futhark_mc_segmap_parloop_6011_task.fn = futhark_mc_segmap_parloop_6011;
-    futhark_mc_segmap_parloop_6011_task.args =
-        &futhark_mc_segmap_parloop_struct_6010;
-    futhark_mc_segmap_parloop_6011_task.iterations = iterations;
-    futhark_mc_segmap_parloop_6011_task.info = info;
-    
-    uint64_t futhark_mc_segmap_parloop_6011_total_start = 0;
-    
-    if (ctx->profiling && !ctx->profiling_paused)
-        futhark_mc_segmap_parloop_6011_total_start = get_wall_time();
-    
-    int futhark_mc_segmap_parloop_6011_err =
-        scheduler_execute_task(&ctx->scheduler,
-                               &futhark_mc_segmap_parloop_6011_task);
-    
-    if (futhark_mc_segmap_parloop_6011_err != 0) {
-        err = 1;
-        goto cleanup;
-    }
-    if (ctx->profiling && !ctx->profiling_paused) {
-        uint64_t futhark_mc_segmap_parloop_6011_total_end = get_wall_time();
-        uint64_t elapsed = futhark_mc_segmap_parloop_6011_total_end -
-                 futhark_mc_segmap_parloop_6011_total_start;
-        
-        __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6011_total_runs, 1,
-                           __ATOMIC_RELAXED);
-        __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6011_total_total_runtime,
-                           elapsed, __ATOMIC_RELAXED);
-        __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6011_total_iter,
-                           iterations, __ATOMIC_RELAXED);
-    }
-    if (ctx->profiling && !ctx->profiling_paused) {
-        uint64_t futhark_mc_segmap_task_6009_end = get_wall_time();
-        uint64_t elapsed = futhark_mc_segmap_task_6009_end -
-                 futhark_mc_segmap_task_6009_start;
-        
-        ctx->futhark_mc_segmap_task_6009_runs[tid]++;
-        ctx->futhark_mc_segmap_task_6009_total_runtime[tid] += elapsed;
-        ctx->futhark_mc_segmap_task_6009_iter[tid] += iterations;
-    }
-    
-  cleanup:
-    { }
-    return err;
-}
-struct futhark_mc_segmap_parloop_struct_6014 {
-    struct futhark_context *ctx;
-    int64_t free_implz2080U_5888;
-    int64_t free_nb_columns_5890;
-    char *free_board_mem_5945;
-    int64_t free_bytes_5946;
-    char *free_mem_5964;
-} ;
-struct futhark_mc_task_6017 {
-    struct futhark_context *ctx;
-    int64_t free_implz2080U_5888;
-    int64_t free_x_5922;
-    char *free_board_mem_5945;
-    char *free_mem_5947;
-} ;
-struct futhark_mc_segmap_parloop_struct_6019 {
-    struct futhark_context *ctx;
-    int64_t free_implz2080U_5888;
-    int64_t free_x_5922;
-    char *free_board_mem_5945;
-    char *free_mem_5947;
-} ;
-static int futhark_mc_segmap_parloop_6020(void *args, int64_t start,
-                                          int64_t end, int flat_tid_5919,
-                                          int tid)
-{
-    int err = 0;
-    struct futhark_mc_segmap_parloop_struct_6019
-    *futhark_mc_segmap_parloop_struct_6019 =
-    (struct futhark_mc_segmap_parloop_struct_6019 *) args;
-    struct futhark_context *ctx = futhark_mc_segmap_parloop_struct_6019->ctx;
-    uint64_t futhark_mc_segmap_parloop_6020_start = 0;
-    
-    if (ctx->profiling && !ctx->profiling_paused)
-        futhark_mc_segmap_parloop_6020_start = get_wall_time();
-    
-    int64_t implz2080U_5888 =
-            futhark_mc_segmap_parloop_struct_6019->free_implz2080U_5888;
-    int64_t x_5922 = futhark_mc_segmap_parloop_struct_6019->free_x_5922;
-    struct memblock board_mem_5945 = {.desc ="board_mem_5945", .mem =
-                                      futhark_mc_segmap_parloop_struct_6019->free_board_mem_5945,
-                                      .size =0, .references =NULL};
-    struct memblock mem_5947 = {.desc ="mem_5947", .mem =
-                                futhark_mc_segmap_parloop_struct_6019->free_mem_5947,
-                                .size =0, .references =NULL};
-    int64_t iterations = end - start;
-    int64_t iter_5992 = start;
-    
-    for (; iter_5992 < end; iter_5992++) {
-        if (ctx->debugging)
-            fprintf(ctx->log, "%s\n", "SegMap fbody");
-        
-        int64_t gtid_5920;
-        
-        gtid_5920 = iter_5992;
-        
-        int64_t get_cell_index_res_5983;
-        
-        get_cell_index_res_5983 = add64(gtid_5920, x_5922);
-        
-        bool x_5984 = sle64((int64_t) 0, get_cell_index_res_5983);
-        bool y_5985 = slt64(get_cell_index_res_5983, implz2080U_5888);
-        bool bounds_check_5986 = x_5984 && y_5985;
-        bool index_certs_5987;
-        
-        if (!bounds_check_5986) {
-            ctx->error = msgprintf("Error: %s%lld%s%lld%s\n\nBacktrace:\n%s",
-                                   "Index [", get_cell_index_res_5983,
-                                   "] out of bounds for array of shape [",
-                                   implz2080U_5888, "].",
-                                   "-> #0  /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:30:24-63\n   #1  /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:30:10-107\n   #2  /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:29:8-31:12\n   #3  /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:26:1-31:12\n");
-            return 1;
-        }
-        
-        int8_t x_5988 =
-               ((int8_t *) board_mem_5945.mem)[get_cell_index_res_5983];
-        bool cond_5989 = x_5988 == (int8_t) 1;
-        int32_t defunc_0_f_res_5990;
-        
-        if (cond_5989) {
-            defunc_0_f_res_5990 = -1;
-        } else {
-            defunc_0_f_res_5990 = -16777216;
-        }
-        ((int32_t *) mem_5947.mem)[gtid_5920] = defunc_0_f_res_5990;
-    }
-    
-  cleanup:
-    { }
-    if (ctx->profiling && !ctx->profiling_paused) {
-        uint64_t futhark_mc_segmap_parloop_6020_end = get_wall_time();
-        uint64_t elapsed = futhark_mc_segmap_parloop_6020_end -
-                 futhark_mc_segmap_parloop_6020_start;
-        
-        ctx->futhark_mc_segmap_parloop_6020_runs[tid]++;
-        ctx->futhark_mc_segmap_parloop_6020_total_runtime[tid] += elapsed;
-        ctx->futhark_mc_segmap_parloop_6020_iter[tid] += iterations;
-    }
-    return err;
-}
-int futhark_mc_segmap_task_6018(void *args, int64_t iterations, int tid,
-                                struct scheduler_info info)
-{
-    int err = 0;
-    int flat_tid_5919 = tid;
-    int num_tasks_5991 = info.nsubtasks;
-    struct futhark_mc_task_6017 *futhark_mc_task_6017 =
-                                (struct futhark_mc_task_6017 *) args;
-    struct futhark_context *ctx = futhark_mc_task_6017->ctx;
-    uint64_t futhark_mc_segmap_task_6018_start = 0;
-    
-    if (ctx->profiling && !ctx->profiling_paused)
-        futhark_mc_segmap_task_6018_start = get_wall_time();
-    
-    int64_t implz2080U_5888 = futhark_mc_task_6017->free_implz2080U_5888;
-    int64_t x_5922 = futhark_mc_task_6017->free_x_5922;
-    struct memblock board_mem_5945 = {.desc ="board_mem_5945", .mem =
-                                      futhark_mc_task_6017->free_board_mem_5945,
-                                      .size =0, .references =NULL};
-    struct memblock mem_5947 = {.desc ="mem_5947", .mem =
-                                futhark_mc_task_6017->free_mem_5947, .size =0,
-                                .references =NULL};
-    int64_t iter_5992;
-    struct futhark_mc_segmap_parloop_struct_6019
-    futhark_mc_segmap_parloop_struct_6019;
-    
-    futhark_mc_segmap_parloop_struct_6019.ctx = ctx;
-    futhark_mc_segmap_parloop_struct_6019.free_implz2080U_5888 =
-        implz2080U_5888;
-    futhark_mc_segmap_parloop_struct_6019.free_x_5922 = x_5922;
-    futhark_mc_segmap_parloop_struct_6019.free_board_mem_5945 =
-        board_mem_5945.mem;
-    futhark_mc_segmap_parloop_struct_6019.free_mem_5947 = mem_5947.mem;
-    
-    struct scheduler_parloop futhark_mc_segmap_parloop_6020_task;
-    
-    futhark_mc_segmap_parloop_6020_task.name = "futhark_mc_segmap_parloop_6020";
-    futhark_mc_segmap_parloop_6020_task.fn = futhark_mc_segmap_parloop_6020;
-    futhark_mc_segmap_parloop_6020_task.args =
-        &futhark_mc_segmap_parloop_struct_6019;
-    futhark_mc_segmap_parloop_6020_task.iterations = iterations;
-    futhark_mc_segmap_parloop_6020_task.info = info;
-    
-    uint64_t futhark_mc_segmap_parloop_6020_total_start = 0;
-    
-    if (ctx->profiling && !ctx->profiling_paused)
-        futhark_mc_segmap_parloop_6020_total_start = get_wall_time();
-    
-    int futhark_mc_segmap_parloop_6020_err =
-        scheduler_execute_task(&ctx->scheduler,
-                               &futhark_mc_segmap_parloop_6020_task);
-    
-    if (futhark_mc_segmap_parloop_6020_err != 0) {
-        err = 1;
-        goto cleanup;
-    }
-    if (ctx->profiling && !ctx->profiling_paused) {
-        uint64_t futhark_mc_segmap_parloop_6020_total_end = get_wall_time();
-        uint64_t elapsed = futhark_mc_segmap_parloop_6020_total_end -
-                 futhark_mc_segmap_parloop_6020_total_start;
-        
-        __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6020_total_runs, 1,
-                           __ATOMIC_RELAXED);
-        __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6020_total_total_runtime,
-                           elapsed, __ATOMIC_RELAXED);
-        __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6020_total_iter,
-                           iterations, __ATOMIC_RELAXED);
-    }
-    if (ctx->profiling && !ctx->profiling_paused) {
-        uint64_t futhark_mc_segmap_task_6018_end = get_wall_time();
-        uint64_t elapsed = futhark_mc_segmap_task_6018_end -
-                 futhark_mc_segmap_task_6018_start;
-        
-        ctx->futhark_mc_segmap_task_6018_runs[tid]++;
-        ctx->futhark_mc_segmap_task_6018_total_runtime[tid] += elapsed;
-        ctx->futhark_mc_segmap_task_6018_iter[tid] += iterations;
-    }
-    
-  cleanup:
-    { }
-    return err;
-}
-static int futhark_mc_segmap_parloop_6015(void *args, int64_t start,
-                                          int64_t end, int flat_tid_5917,
-                                          int tid)
-{
-    int err = 0;
-    struct futhark_mc_segmap_parloop_struct_6014
-    *futhark_mc_segmap_parloop_struct_6014 =
-    (struct futhark_mc_segmap_parloop_struct_6014 *) args;
-    struct futhark_context *ctx = futhark_mc_segmap_parloop_struct_6014->ctx;
-    uint64_t futhark_mc_segmap_parloop_6015_start = 0;
-    
-    if (ctx->profiling && !ctx->profiling_paused)
-        futhark_mc_segmap_parloop_6015_start = get_wall_time();
-    
-    int64_t implz2080U_5888 =
-            futhark_mc_segmap_parloop_struct_6014->free_implz2080U_5888;
-    int64_t nb_columns_5890 =
-            futhark_mc_segmap_parloop_struct_6014->free_nb_columns_5890;
-    struct memblock board_mem_5945 = {.desc ="board_mem_5945", .mem =
-                                      futhark_mc_segmap_parloop_struct_6014->free_board_mem_5945,
-                                      .size =0, .references =NULL};
-    int64_t bytes_5946 = futhark_mc_segmap_parloop_struct_6014->free_bytes_5946;
-    struct memblock mem_5964 = {.desc ="mem_5964", .mem =
-                                futhark_mc_segmap_parloop_struct_6014->free_mem_5964,
-                                .size =0, .references =NULL};
-    size_t mem_5947_cached_sizze_6016 = 0;
-    char *mem_5947 = NULL;
-    int64_t iterations = end - start;
-    int64_t iter_5982 = start;
-    
-    if (mem_5947_cached_sizze_6016 < (size_t) bytes_5946) {
-        mem_5947 = realloc(mem_5947, bytes_5946);
-        mem_5947_cached_sizze_6016 = bytes_5946;
-    }
-    for (; iter_5982 < end; iter_5982++) {
-        if (ctx->debugging)
-            fprintf(ctx->log, "%s\n", "SegMap fbody");
-        
-        int64_t gtid_5918;
-        
-        gtid_5918 = iter_5982;
-        
-        int64_t x_5922;
-        
-        x_5922 = mul64(nb_columns_5890, gtid_5918);
-        
-        int64_t flat_tid_5919 = (int64_t) 0;
-        int32_t num_tasks_5991;
-        struct futhark_mc_task_6017 futhark_mc_task_6017;
-        
-        futhark_mc_task_6017.ctx = ctx;
-        futhark_mc_task_6017.free_implz2080U_5888 = implz2080U_5888;
-        futhark_mc_task_6017.free_x_5922 = x_5922;
-        futhark_mc_task_6017.free_board_mem_5945 = board_mem_5945.mem;
-        futhark_mc_task_6017.free_mem_5947 = mem_5947;
-        
-        struct scheduler_segop futhark_mc_task_6017_task;
-        
-        futhark_mc_task_6017_task.args = &futhark_mc_task_6017;
-        futhark_mc_task_6017_task.top_level_fn = futhark_mc_segmap_task_6018;
-        futhark_mc_task_6017_task.name = "futhark_mc_segmap_task_6018";
-        futhark_mc_task_6017_task.iterations = nb_columns_5890;
-        futhark_mc_task_6017_task.task_time =
-            &ctx->futhark_mc_segmap_task_6018_total_time;
-        futhark_mc_task_6017_task.task_iter =
-            &ctx->futhark_mc_segmap_task_6018_total_iter;
-        futhark_mc_task_6017_task.sched = STATIC;
-        futhark_mc_task_6017_task.nested_fn = NULL;
-        
-        int futhark_mc_segmap_task_6018_err =
-            scheduler_prepare_task(&ctx->scheduler, &futhark_mc_task_6017_task);
-        
-        if (futhark_mc_segmap_task_6018_err != 0) {
-            err = 1;
-            goto cleanup;
-        }
-        memmove(mem_5964.mem + gtid_5918 * nb_columns_5890 * (int64_t) 4,
-                mem_5947 + (int64_t) 0, nb_columns_5890 *
-                (int64_t) sizeof(int32_t));
-    }
-    
-  cleanup:
-    { }
-    free(mem_5947);
-    if (ctx->profiling && !ctx->profiling_paused) {
-        uint64_t futhark_mc_segmap_parloop_6015_end = get_wall_time();
-        uint64_t elapsed = futhark_mc_segmap_parloop_6015_end -
-                 futhark_mc_segmap_parloop_6015_start;
-        
-        ctx->futhark_mc_segmap_parloop_6015_runs[tid]++;
-        ctx->futhark_mc_segmap_parloop_6015_total_runtime[tid] += elapsed;
-        ctx->futhark_mc_segmap_parloop_6015_iter[tid] += iterations;
-    }
-    return err;
-}
-int futhark_mc_segmap_nested_task_6013(void *args, int64_t iterations, int tid,
-                                       struct scheduler_info info)
-{
-    int err = 0;
-    int flat_tid_5917 = tid;
-    int num_tasks_5968 = info.nsubtasks;
-    struct futhark_mc_task_6008 *futhark_mc_task_6008 =
-                                (struct futhark_mc_task_6008 *) args;
-    struct futhark_context *ctx = futhark_mc_task_6008->ctx;
-    uint64_t futhark_mc_segmap_nested_task_6013_start = 0;
-    
-    if (ctx->profiling && !ctx->profiling_paused)
-        futhark_mc_segmap_nested_task_6013_start = get_wall_time();
-    
-    int64_t implz2080U_5888 = futhark_mc_task_6008->free_implz2080U_5888;
-    int64_t nb_columns_5890 = futhark_mc_task_6008->free_nb_columns_5890;
-    struct memblock board_mem_5945 = {.desc ="board_mem_5945", .mem =
-                                      futhark_mc_task_6008->free_board_mem_5945,
-                                      .size =0, .references =NULL};
-    int64_t bytes_5946 = futhark_mc_task_6008->free_bytes_5946;
-    struct memblock mem_5964 = {.desc ="mem_5964", .mem =
-                                futhark_mc_task_6008->free_mem_5964, .size =0,
-                                .references =NULL};
-    int64_t iter_5982;
-    struct futhark_mc_segmap_parloop_struct_6014
-    futhark_mc_segmap_parloop_struct_6014;
-    
-    futhark_mc_segmap_parloop_struct_6014.ctx = ctx;
-    futhark_mc_segmap_parloop_struct_6014.free_implz2080U_5888 =
-        implz2080U_5888;
-    futhark_mc_segmap_parloop_struct_6014.free_nb_columns_5890 =
-        nb_columns_5890;
-    futhark_mc_segmap_parloop_struct_6014.free_board_mem_5945 =
-        board_mem_5945.mem;
-    futhark_mc_segmap_parloop_struct_6014.free_bytes_5946 = bytes_5946;
-    futhark_mc_segmap_parloop_struct_6014.free_mem_5964 = mem_5964.mem;
-    
-    struct scheduler_parloop futhark_mc_segmap_parloop_6015_task;
-    
-    futhark_mc_segmap_parloop_6015_task.name = "futhark_mc_segmap_parloop_6015";
-    futhark_mc_segmap_parloop_6015_task.fn = futhark_mc_segmap_parloop_6015;
-    futhark_mc_segmap_parloop_6015_task.args =
-        &futhark_mc_segmap_parloop_struct_6014;
-    futhark_mc_segmap_parloop_6015_task.iterations = iterations;
-    futhark_mc_segmap_parloop_6015_task.info = info;
-    
-    uint64_t futhark_mc_segmap_parloop_6015_total_start = 0;
-    
-    if (ctx->profiling && !ctx->profiling_paused)
-        futhark_mc_segmap_parloop_6015_total_start = get_wall_time();
-    
-    int futhark_mc_segmap_parloop_6015_err =
-        scheduler_execute_task(&ctx->scheduler,
-                               &futhark_mc_segmap_parloop_6015_task);
-    
-    if (futhark_mc_segmap_parloop_6015_err != 0) {
-        err = 1;
-        goto cleanup;
-    }
-    if (ctx->profiling && !ctx->profiling_paused) {
-        uint64_t futhark_mc_segmap_parloop_6015_total_end = get_wall_time();
-        uint64_t elapsed = futhark_mc_segmap_parloop_6015_total_end -
-                 futhark_mc_segmap_parloop_6015_total_start;
-        
-        __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6015_total_runs, 1,
-                           __ATOMIC_RELAXED);
-        __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6015_total_total_runtime,
-                           elapsed, __ATOMIC_RELAXED);
-        __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6015_total_iter,
-                           iterations, __ATOMIC_RELAXED);
-    }
-    if (ctx->profiling && !ctx->profiling_paused) {
-        uint64_t futhark_mc_segmap_nested_task_6013_end = get_wall_time();
-        uint64_t elapsed = futhark_mc_segmap_nested_task_6013_end -
-                 futhark_mc_segmap_nested_task_6013_start;
-        
-        ctx->futhark_mc_segmap_nested_task_6013_runs[tid]++;
-        ctx->futhark_mc_segmap_nested_task_6013_total_runtime[tid] += elapsed;
-        ctx->futhark_mc_segmap_nested_task_6013_iter[tid] += iterations;
-    }
-    
-  cleanup:
-    { }
-    return err;
-}
-static int futrts_init(struct futhark_context *ctx,
-                       struct memblock *out_mem_p_5993,
-                       int64_t *out_scalar_out_5994,
-                       int64_t *out_scalar_out_5995,
-                       int64_t *out_scalar_out_5996,
-                       struct memblock board_mem_5945, int64_t n_5861,
-                       int64_t nb_rows_5863, int64_t nb_columns_5864,
-                       int64_t sizze_5865)
-{
-    (void) ctx;
-    
-    int err = 0;
-    struct memblock out_mem_5965;
-    
-    out_mem_5965.references = NULL;
-    
-    int64_t scalar_out_5966;
-    int64_t scalar_out_5967;
-    int64_t scalar_out_5968;
-    
-    if (memblock_set(ctx, &out_mem_5965, &board_mem_5945, "board_mem_5945") !=
-        0)
-        return 1;
-    scalar_out_5966 = nb_columns_5864;
-    scalar_out_5967 = nb_rows_5863;
-    scalar_out_5968 = sizze_5865;
-    (*out_mem_p_5993).references = NULL;
-    if (memblock_set(ctx, &*out_mem_p_5993, &out_mem_5965, "out_mem_5965") != 0)
-        return 1;
-    *out_scalar_out_5994 = scalar_out_5966;
-    *out_scalar_out_5995 = scalar_out_5967;
-    *out_scalar_out_5996 = scalar_out_5968;
-    if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0)
-        return 1;
-    
-  cleanup:
-    { }
-    return err;
-}
-static int futrts_key(struct futhark_context *ctx,
-                      struct memblock *out_mem_p_5997,
-                      int64_t *out_scalar_out_5998,
-                      int64_t *out_scalar_out_5999,
-                      int64_t *out_scalar_out_6000,
-                      struct memblock board_mem_5945, int64_t implz2080U_5881,
-                      int32_t e_5882, int32_t key_5883, int64_t nb_columns_5885,
-                      int64_t nb_rows_5886, int64_t sizze_5887)
-{
-    (void) ctx;
-    
-    int err = 0;
-    struct memblock out_mem_5965;
-    
-    out_mem_5965.references = NULL;
-    
-    int64_t scalar_out_5966;
-    int64_t scalar_out_5967;
-    int64_t scalar_out_5968;
-    
-    if (memblock_set(ctx, &out_mem_5965, &board_mem_5945, "board_mem_5945") !=
-        0)
-        return 1;
-    scalar_out_5966 = nb_columns_5885;
-    scalar_out_5967 = nb_rows_5886;
-    scalar_out_5968 = sizze_5887;
-    (*out_mem_p_5997).references = NULL;
-    if (memblock_set(ctx, &*out_mem_p_5997, &out_mem_5965, "out_mem_5965") != 0)
-        return 1;
-    *out_scalar_out_5998 = scalar_out_5966;
-    *out_scalar_out_5999 = scalar_out_5967;
-    *out_scalar_out_6000 = scalar_out_5968;
-    if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0)
-        return 1;
-    
-  cleanup:
-    { }
-    return err;
-}
-static int futrts_mouse(struct futhark_context *ctx,
-                        struct memblock *out_mem_p_6001,
-                        int64_t *out_scalar_out_6002,
-                        int64_t *out_scalar_out_6003,
-                        int64_t *out_scalar_out_6004,
-                        struct memblock board_mem_5945, int64_t implz2080U_5866,
-                        int32_t buttons_5867, int32_t x_5868, int32_t y_5869,
-                        int64_t nb_columns_5871, int64_t nb_rows_5872,
-                        int64_t sizze_5873)
-{
-    (void) ctx;
-    
-    int err = 0;
-    struct memblock out_mem_5965;
-    
-    out_mem_5965.references = NULL;
-    
-    int64_t scalar_out_5966;
-    int64_t scalar_out_5967;
-    int64_t scalar_out_5968;
-    
-    if (memblock_set(ctx, &out_mem_5965, &board_mem_5945, "board_mem_5945") !=
-        0)
-        return 1;
-    scalar_out_5966 = nb_columns_5871;
-    scalar_out_5967 = nb_rows_5872;
-    scalar_out_5968 = sizze_5873;
-    (*out_mem_p_6001).references = NULL;
-    if (memblock_set(ctx, &*out_mem_p_6001, &out_mem_5965, "out_mem_5965") != 0)
-        return 1;
-    *out_scalar_out_6002 = scalar_out_5966;
-    *out_scalar_out_6003 = scalar_out_5967;
-    *out_scalar_out_6004 = scalar_out_5968;
-    if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0)
-        return 1;
-    
-  cleanup:
-    { }
-    return err;
-}
-static int futrts_render(struct futhark_context *ctx,
-                         struct memblock *out_mem_p_6005,
-                         int64_t *out_out_arrsizze_6006,
-                         int64_t *out_out_arrsizze_6007,
-                         struct memblock board_mem_5945,
-                         int64_t implz2080U_5888, int64_t nb_columns_5890,
-                         int64_t nb_rows_5891, int64_t sizze_5892)
-{
-    (void) ctx;
-    
-    int err = 0;
-    struct memblock out_mem_5965;
-    
-    out_mem_5965.references = NULL;
-    
-    int64_t out_arrsizze_5966;
-    int64_t out_arrsizze_5967;
-    bool bounds_invalid_upwards_5893 = slt64(nb_rows_5891, (int64_t) 0);
-    bool valid_5894 = !bounds_invalid_upwards_5893;
-    bool range_valid_c_5895;
-    
-    if (!valid_5894) {
-        ctx->error = msgprintf("Error: %s%lld%s%lld%s%lld%s\n\nBacktrace:\n%s",
-                               "Range ", (int64_t) 0, "..", (int64_t) 1, "..<",
-                               nb_rows_5891, " is invalid.",
-                               "-> #0  /prelude/array.fut:90:3-10\n   #1  /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:27:17-30\n   #2  /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:26:1-31:12\n");
-        if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0)
-            return 1;
-        return 1;
-    }
-    
-    bool bounds_invalid_upwards_5897 = slt64(nb_columns_5890, (int64_t) 0);
-    bool valid_5898 = !bounds_invalid_upwards_5897;
-    bool range_valid_c_5899;
-    
-    if (!valid_5898) {
-        ctx->error = msgprintf("Error: %s%lld%s%lld%s%lld%s\n\nBacktrace:\n%s",
-                               "Range ", (int64_t) 0, "..", (int64_t) 1, "..<",
-                               nb_columns_5890, " is invalid.",
-                               "-> #0  /prelude/array.fut:90:3-10\n   #1  /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:28:17-33\n   #2  /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:26:1-31:12\n");
-        if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0)
-            return 1;
-        return 1;
-    }
-    
-    int64_t binop_x_5963 = nb_columns_5890 * nb_rows_5891;
-    int64_t bytes_5962 = (int64_t) 4 * binop_x_5963;
-    struct memblock mem_5964;
-    
-    mem_5964.references = NULL;
-    if (memblock_alloc(ctx, &mem_5964, bytes_5962, "mem_5964")) {
-        err = 1;
-        goto cleanup;
-    }
-    
-    int64_t bytes_5946 = (int64_t) 4 * nb_columns_5890;
-    int64_t flat_tid_5915 = (int64_t) 0;
-    int32_t num_tasks_5968;
-    int64_t flat_tid_5917;
-    
-    flat_tid_5917 = (int64_t) 0;
-    
-    struct futhark_mc_task_6008 futhark_mc_task_6008;
-    
-    futhark_mc_task_6008.ctx = ctx;
-    futhark_mc_task_6008.free_implz2080U_5888 = implz2080U_5888;
-    futhark_mc_task_6008.free_nb_columns_5890 = nb_columns_5890;
-    futhark_mc_task_6008.free_board_mem_5945 = board_mem_5945.mem;
-    futhark_mc_task_6008.free_bytes_5946 = bytes_5946;
-    futhark_mc_task_6008.free_mem_5964 = mem_5964.mem;
-    
-    struct scheduler_segop futhark_mc_task_6008_task;
-    
-    futhark_mc_task_6008_task.args = &futhark_mc_task_6008;
-    futhark_mc_task_6008_task.top_level_fn = futhark_mc_segmap_task_6009;
-    futhark_mc_task_6008_task.name = "futhark_mc_segmap_task_6009";
-    futhark_mc_task_6008_task.iterations = nb_rows_5891;
-    futhark_mc_task_6008_task.task_time =
-        &ctx->futhark_mc_segmap_task_6009_total_time;
-    futhark_mc_task_6008_task.task_iter =
-        &ctx->futhark_mc_segmap_task_6009_total_iter;
-    futhark_mc_task_6008_task.sched = STATIC;
-    futhark_mc_task_6008_task.nested_fn = futhark_mc_segmap_nested_task_6013;
-    
-    int futhark_mc_segmap_task_6009_err =
-        scheduler_prepare_task(&ctx->scheduler, &futhark_mc_task_6008_task);
-    
-    if (futhark_mc_segmap_task_6009_err != 0) {
-        err = 1;
-        goto cleanup;
-    }
-    out_arrsizze_5966 = nb_rows_5891;
-    out_arrsizze_5967 = nb_columns_5890;
-    if (memblock_set(ctx, &out_mem_5965, &mem_5964, "mem_5964") != 0)
-        return 1;
-    (*out_mem_p_6005).references = NULL;
-    if (memblock_set(ctx, &*out_mem_p_6005, &out_mem_5965, "out_mem_5965") != 0)
-        return 1;
-    *out_out_arrsizze_6006 = out_arrsizze_5966;
-    *out_out_arrsizze_6007 = out_arrsizze_5967;
-    if (memblock_unref(ctx, &mem_5964, "mem_5964") != 0)
-        return 1;
-    if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0)
-        return 1;
-    
-  cleanup:
-    { }
-    return err;
-}
-static int futrts_resizze(struct futhark_context *ctx,
-                          struct memblock *out_mem_p_6021,
-                          int64_t *out_scalar_out_6022,
-                          int64_t *out_scalar_out_6023,
-                          int64_t *out_scalar_out_6024,
-                          struct memblock board_mem_5945,
-                          int64_t implz2080U_5848, int64_t h_5849,
-                          int64_t w_5850, int64_t nb_columns_5852,
-                          int64_t nb_rows_5853, int64_t sizze_5854)
-{
-    (void) ctx;
-    
-    int err = 0;
-    struct memblock out_mem_5965;
-    
-    out_mem_5965.references = NULL;
-    
-    int64_t scalar_out_5966;
-    int64_t scalar_out_5967;
-    int64_t scalar_out_5968;
-    
-    if (memblock_set(ctx, &out_mem_5965, &board_mem_5945, "board_mem_5945") !=
-        0)
-        return 1;
-    scalar_out_5966 = nb_columns_5852;
-    scalar_out_5967 = nb_rows_5853;
-    scalar_out_5968 = sizze_5854;
-    (*out_mem_p_6021).references = NULL;
-    if (memblock_set(ctx, &*out_mem_p_6021, &out_mem_5965, "out_mem_5965") != 0)
-        return 1;
-    *out_scalar_out_6022 = scalar_out_5966;
-    *out_scalar_out_6023 = scalar_out_5967;
-    *out_scalar_out_6024 = scalar_out_5968;
-    if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0)
-        return 1;
-    
-  cleanup:
-    { }
-    return err;
-}
-static int futrts_step(struct futhark_context *ctx,
-                       struct memblock *out_mem_p_6025,
-                       int64_t *out_scalar_out_6026,
-                       int64_t *out_scalar_out_6027,
-                       int64_t *out_scalar_out_6028,
-                       struct memblock board_mem_5945, int64_t implz2080U_5855,
-                       float nameless_5856, int64_t nb_columns_5858,
-                       int64_t nb_rows_5859, int64_t sizze_5860)
-{
-    (void) ctx;
-    
-    int err = 0;
-    struct memblock out_mem_5965;
-    
-    out_mem_5965.references = NULL;
-    
-    int64_t scalar_out_5966;
-    int64_t scalar_out_5967;
-    int64_t scalar_out_5968;
-    
-    if (memblock_set(ctx, &out_mem_5965, &board_mem_5945, "board_mem_5945") !=
-        0)
-        return 1;
-    scalar_out_5966 = nb_columns_5858;
-    scalar_out_5967 = nb_rows_5859;
-    scalar_out_5968 = sizze_5860;
-    (*out_mem_p_6025).references = NULL;
-    if (memblock_set(ctx, &*out_mem_p_6025, &out_mem_5965, "out_mem_5965") != 0)
-        return 1;
-    *out_scalar_out_6026 = scalar_out_5966;
-    *out_scalar_out_6027 = scalar_out_5967;
-    *out_scalar_out_6028 = scalar_out_5968;
-    if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0)
-        return 1;
-    
-  cleanup:
-    { }
-    return err;
-}
-static int futrts_wheel(struct futhark_context *ctx,
-                        struct memblock *out_mem_p_6029,
-                        int64_t *out_scalar_out_6030,
-                        int64_t *out_scalar_out_6031,
-                        int64_t *out_scalar_out_6032,
-                        struct memblock board_mem_5945, int64_t implz2080U_5874,
-                        int32_t dx_5875, int32_t dy_5876,
-                        int64_t nb_columns_5878, int64_t nb_rows_5879,
-                        int64_t sizze_5880)
-{
-    (void) ctx;
-    
-    int err = 0;
-    struct memblock out_mem_5965;
-    
-    out_mem_5965.references = NULL;
-    
-    int64_t scalar_out_5966;
-    int64_t scalar_out_5967;
-    int64_t scalar_out_5968;
-    
-    if (memblock_set(ctx, &out_mem_5965, &board_mem_5945, "board_mem_5945") !=
-        0)
-        return 1;
-    scalar_out_5966 = nb_columns_5878;
-    scalar_out_5967 = nb_rows_5879;
-    scalar_out_5968 = sizze_5880;
-    (*out_mem_p_6029).references = NULL;
-    if (memblock_set(ctx, &*out_mem_p_6029, &out_mem_5965, "out_mem_5965") != 0)
-        return 1;
-    *out_scalar_out_6030 = scalar_out_5966;
-    *out_scalar_out_6031 = scalar_out_5967;
-    *out_scalar_out_6032 = scalar_out_5968;
-    if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0)
-        return 1;
-    
-  cleanup:
-    { }
-    return err;
-}
-struct futhark_u32_2d {
-    struct memblock mem;
-    int64_t shape[2];
-} ;
-struct futhark_u32_2d *futhark_new_u32_2d(struct futhark_context *ctx, const
-                                          uint32_t *data, int64_t dim0,
-                                          int64_t dim1)
-{
-    struct futhark_u32_2d *bad = NULL;
-    struct futhark_u32_2d *arr =
-                          (struct futhark_u32_2d *) malloc(sizeof(struct futhark_u32_2d));
-    
-    if (arr == NULL)
-        return bad;
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    arr->mem.references = NULL;
-    if (memblock_alloc(ctx, &arr->mem, (size_t) (dim0 * dim1) *
-                       sizeof(uint32_t), "arr->mem"))
-        return NULL;
-    arr->shape[0] = dim0;
-    arr->shape[1] = dim1;
-    memmove(arr->mem.mem + 0, data + 0, (size_t) (dim0 * dim1) *
-            sizeof(uint32_t));
-    lock_unlock(&ctx->lock);
-    return arr;
-}
-struct futhark_u32_2d *futhark_new_raw_u32_2d(struct futhark_context *ctx, const
-                                              char *data, int offset,
-                                              int64_t dim0, int64_t dim1)
-{
-    struct futhark_u32_2d *bad = NULL;
-    struct futhark_u32_2d *arr =
-                          (struct futhark_u32_2d *) malloc(sizeof(struct futhark_u32_2d));
-    
-    if (arr == NULL)
-        return bad;
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    arr->mem.references = NULL;
-    if (memblock_alloc(ctx, &arr->mem, (size_t) (dim0 * dim1) *
-                       sizeof(uint32_t), "arr->mem"))
-        return NULL;
-    arr->shape[0] = dim0;
-    arr->shape[1] = dim1;
-    memmove(arr->mem.mem + 0, data + offset, (size_t) (dim0 * dim1) *
-            sizeof(uint32_t));
-    lock_unlock(&ctx->lock);
-    return arr;
-}
-int futhark_free_u32_2d(struct futhark_context *ctx, struct futhark_u32_2d *arr)
-{
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    if (memblock_unref(ctx, &arr->mem, "arr->mem") != 0)
-        return 1;
-    lock_unlock(&ctx->lock);
-    free(arr);
-    return 0;
-}
-int futhark_values_u32_2d(struct futhark_context *ctx,
-                          struct futhark_u32_2d *arr, uint32_t *data)
-{
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    memmove(data + 0, arr->mem.mem + 0, (size_t) (arr->shape[0] *
-                                                  arr->shape[1]) *
-            sizeof(uint32_t));
-    lock_unlock(&ctx->lock);
-    return 0;
-}
-char *futhark_values_raw_u32_2d(struct futhark_context *ctx,
-                                struct futhark_u32_2d *arr)
-{
-    (void) ctx;
-    return arr->mem.mem;
-}
-const int64_t *futhark_shape_u32_2d(struct futhark_context *ctx,
-                                    struct futhark_u32_2d *arr)
-{
-    (void) ctx;
-    return arr->shape;
-}
-struct futhark_i8_1d {
-    struct memblock mem;
-    int64_t shape[1];
-} ;
-struct futhark_i8_1d *futhark_new_i8_1d(struct futhark_context *ctx, const
-                                        int8_t *data, int64_t dim0)
-{
-    struct futhark_i8_1d *bad = NULL;
-    struct futhark_i8_1d *arr =
-                         (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d));
-    
-    if (arr == NULL)
-        return bad;
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    arr->mem.references = NULL;
-    if (memblock_alloc(ctx, &arr->mem, (size_t) dim0 * sizeof(int8_t),
-                       "arr->mem"))
-        return NULL;
-    arr->shape[0] = dim0;
-    memmove(arr->mem.mem + 0, data + 0, (size_t) dim0 * sizeof(int8_t));
-    lock_unlock(&ctx->lock);
-    return arr;
-}
-struct futhark_i8_1d *futhark_new_raw_i8_1d(struct futhark_context *ctx, const
-                                            char *data, int offset,
-                                            int64_t dim0)
-{
-    struct futhark_i8_1d *bad = NULL;
-    struct futhark_i8_1d *arr =
-                         (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d));
-    
-    if (arr == NULL)
-        return bad;
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    arr->mem.references = NULL;
-    if (memblock_alloc(ctx, &arr->mem, (size_t) dim0 * sizeof(int8_t),
-                       "arr->mem"))
-        return NULL;
-    arr->shape[0] = dim0;
-    memmove(arr->mem.mem + 0, data + offset, (size_t) dim0 * sizeof(int8_t));
-    lock_unlock(&ctx->lock);
-    return arr;
-}
-int futhark_free_i8_1d(struct futhark_context *ctx, struct futhark_i8_1d *arr)
-{
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    if (memblock_unref(ctx, &arr->mem, "arr->mem") != 0)
-        return 1;
-    lock_unlock(&ctx->lock);
-    free(arr);
-    return 0;
-}
-int futhark_values_i8_1d(struct futhark_context *ctx, struct futhark_i8_1d *arr,
-                         int8_t *data)
-{
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    memmove(data + 0, arr->mem.mem + 0, (size_t) arr->shape[0] *
-            sizeof(int8_t));
-    lock_unlock(&ctx->lock);
-    return 0;
-}
-char *futhark_values_raw_i8_1d(struct futhark_context *ctx,
-                               struct futhark_i8_1d *arr)
-{
-    (void) ctx;
-    return arr->mem.mem;
-}
-const int64_t *futhark_shape_i8_1d(struct futhark_context *ctx,
-                                   struct futhark_i8_1d *arr)
-{
-    (void) ctx;
-    return arr->shape;
-}
-struct futhark_opaque_state {
-    struct futhark_i8_1d *v0;
-    int64_t v1;
-    int64_t v2;
-    int64_t v3;
-} ;
-int futhark_free_opaque_state(struct futhark_context *ctx,
-                              struct futhark_opaque_state *obj)
-{
-    int ret = 0, tmp;
-    
-    if (obj->v0 != NULL && (tmp = futhark_free_i8_1d(ctx, obj->v0)) != 0)
-        ret = tmp;
-    free(obj);
-    return ret;
-}
-int futhark_store_opaque_state(struct futhark_context *ctx, const
-                               struct futhark_opaque_state *obj, void **p,
-                               size_t *n)
-{
-    int ret = 0;
-    int64_t size_0 = 7 + 1 * sizeof(int64_t) + futhark_shape_i8_1d(ctx,
-                                                                   obj->v0)[0] *
-            1;
-    int64_t size_1 = 7 + 0 * sizeof(int64_t) + 1 * 8;
-    int64_t size_2 = 7 + 0 * sizeof(int64_t) + 1 * 8;
-    int64_t size_3 = 7 + 0 * sizeof(int64_t) + 1 * 8;
-    
-    *n = size_0 + size_1 + size_2 + size_3;
-    if (p != NULL && *p == NULL)
-        *p = malloc(*n);
-    if (p != NULL) {
-        unsigned char *out = *p;
-        
-        *out++ = 'b';
-        *out++ = 2;
-        *out++ = 1;
-        memcpy(out, "  i8", 4);
-        out += 4;
-        memcpy(out, futhark_shape_i8_1d(ctx, obj->v0), 1 * sizeof(int64_t));
-        out += 1 * sizeof(int64_t);
-        ret |= futhark_values_i8_1d(ctx, obj->v0, (void *) out);
-        out += futhark_shape_i8_1d(ctx, obj->v0)[0] * sizeof(int8_t);
-        *out++ = 'b';
-        *out++ = 2;
-        *out++ = 0;
-        memcpy(out, " i64", 4);
-        out += 4;
-        memcpy(out, &obj->v1, sizeof(obj->v1));
-        out += sizeof(obj->v1);
-        *out++ = 'b';
-        *out++ = 2;
-        *out++ = 0;
-        memcpy(out, " i64", 4);
-        out += 4;
-        memcpy(out, &obj->v2, sizeof(obj->v2));
-        out += sizeof(obj->v2);
-        *out++ = 'b';
-        *out++ = 2;
-        *out++ = 0;
-        memcpy(out, " i64", 4);
-        out += 4;
-        memcpy(out, &obj->v3, sizeof(obj->v3));
-        out += sizeof(obj->v3);
-    }
-    return ret;
-}
-struct futhark_opaque_state *futhark_restore_opaque_state(struct futhark_context *ctx,
-                                                          const void *p)
-{
-    int err = 0;
-    const unsigned char *src = p;
-    struct futhark_opaque_state *obj =
-                                malloc(sizeof(struct futhark_opaque_state));
-    int64_t shape_0[1];
-    
-    err |= *src++ != 'b';
-    err |= *src++ != 2;
-    err |= *src++ != 1;
-    err |= memcmp(src, "  i8", 4) != 0;
-    src += 4;
-    if (err == 0) {
-        memcpy(shape_0, src, 1 * sizeof(int64_t));
-        src += 1 * sizeof(int64_t);
-    }
-    
-    const void *data_0 = src;
-    
-    obj->v0 = NULL;
-    src += shape_0[0] * sizeof(int8_t);
-    err |= *src++ != 'b';
-    err |= *src++ != 2;
-    err |= *src++ != 0;
-    err |= memcmp(src, " i64", 4) != 0;
-    src += 4;
-    if (err == 0) {
-        src += 0 * sizeof(int64_t);
-    }
-    
-    const void *data_1 = src;
-    
-    src += sizeof(obj->v1);
-    err |= *src++ != 'b';
-    err |= *src++ != 2;
-    err |= *src++ != 0;
-    err |= memcmp(src, " i64", 4) != 0;
-    src += 4;
-    if (err == 0) {
-        src += 0 * sizeof(int64_t);
-    }
-    
-    const void *data_2 = src;
-    
-    src += sizeof(obj->v2);
-    err |= *src++ != 'b';
-    err |= *src++ != 2;
-    err |= *src++ != 0;
-    err |= memcmp(src, " i64", 4) != 0;
-    src += 4;
-    if (err == 0) {
-        src += 0 * sizeof(int64_t);
-    }
-    
-    const void *data_3 = src;
-    
-    src += sizeof(obj->v3);
-    if (err == 0) {
-        obj->v0 = futhark_new_i8_1d(ctx, data_0, shape_0[0]);
-        if (obj->v0 == NULL)
-            err = 1;
-        memcpy(&obj->v1, data_1, sizeof(obj->v1));
-        memcpy(&obj->v2, data_2, sizeof(obj->v2));
-        memcpy(&obj->v3, data_3, sizeof(obj->v3));
-    }
-    if (err != 0) {
-        int ret = 0, tmp;
-        
-        if (obj->v0 != NULL && (tmp = futhark_free_i8_1d(ctx, obj->v0)) != 0)
-            ret = tmp;
-        free(obj);
-        obj = NULL;
-    }
-    return obj;
-}
-int futhark_entry_init(struct futhark_context *ctx,
-                       struct futhark_opaque_state **out0, const
-                       struct futhark_i8_1d *in0, const int64_t in1, const
-                       int64_t in2, const int64_t in3)
-{
-    struct memblock board_mem_5945;
-    
-    board_mem_5945.references = NULL;
-    
-    int64_t n_5861;
-    int64_t nb_rows_5863;
-    int64_t nb_columns_5864;
-    int64_t sizze_5865;
-    struct memblock out_mem_5965;
-    
-    out_mem_5965.references = NULL;
-    
-    int64_t scalar_out_5966;
-    int64_t scalar_out_5967;
-    int64_t scalar_out_5968;
-    int ret = 0;
-    
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    board_mem_5945 = in0->mem;
-    n_5861 = in0->shape[0];
-    nb_rows_5863 = in1;
-    nb_columns_5864 = in2;
-    sizze_5865 = in3;
-    if (!(n_5861 == in0->shape[0] && (true && (true && true)))) {
-        ret = 1;
-        if (!ctx->error)
-            ctx->error =
-                msgprintf("Error: entry point arguments have invalid sizes.\n");
-    } else {
-        ret = futrts_init(ctx, &out_mem_5965, &scalar_out_5966,
-                          &scalar_out_5967, &scalar_out_5968, board_mem_5945,
-                          n_5861, nb_rows_5863, nb_columns_5864, sizze_5865);
-        if (ret == 0) {
-            assert((*out0 =
-                    (struct futhark_opaque_state *) malloc(sizeof(struct futhark_opaque_state))) !=
-                NULL);
-            assert(((*out0)->v0 =
-                    (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d))) !=
-                NULL);
-            (*out0)->v0->mem = out_mem_5965;
-            (*out0)->v0->shape[0] = n_5861;
-            (*out0)->v1 = scalar_out_5966;
-            (*out0)->v2 = scalar_out_5967;
-            (*out0)->v3 = scalar_out_5968;
-        }
-    }
-    lock_unlock(&ctx->lock);
-    return ret;
-}
-int futhark_entry_key(struct futhark_context *ctx,
-                      struct futhark_opaque_state **out0, const int32_t in0,
-                      const int32_t in1, const struct futhark_opaque_state *in2)
-{
-    struct memblock board_mem_5945;
-    
-    board_mem_5945.references = NULL;
-    
-    int64_t implz2080U_5881;
-    int32_t e_5882;
-    int32_t key_5883;
-    int64_t nb_columns_5885;
-    int64_t nb_rows_5886;
-    int64_t sizze_5887;
-    struct memblock out_mem_5965;
-    
-    out_mem_5965.references = NULL;
-    
-    int64_t scalar_out_5966;
-    int64_t scalar_out_5967;
-    int64_t scalar_out_5968;
-    int ret = 0;
-    
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    e_5882 = in0;
-    key_5883 = in1;
-    board_mem_5945 = in2->v0->mem;
-    implz2080U_5881 = in2->v0->shape[0];
-    nb_columns_5885 = in2->v1;
-    nb_rows_5886 = in2->v2;
-    sizze_5887 = in2->v3;
-    if (!(true && (true && implz2080U_5881 == in2->v0->shape[0]))) {
-        ret = 1;
-        if (!ctx->error)
-            ctx->error =
-                msgprintf("Error: entry point arguments have invalid sizes.\n");
-    } else {
-        ret = futrts_key(ctx, &out_mem_5965, &scalar_out_5966, &scalar_out_5967,
-                         &scalar_out_5968, board_mem_5945, implz2080U_5881,
-                         e_5882, key_5883, nb_columns_5885, nb_rows_5886,
-                         sizze_5887);
-        if (ret == 0) {
-            assert((*out0 =
-                    (struct futhark_opaque_state *) malloc(sizeof(struct futhark_opaque_state))) !=
-                NULL);
-            assert(((*out0)->v0 =
-                    (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d))) !=
-                NULL);
-            (*out0)->v0->mem = out_mem_5965;
-            (*out0)->v0->shape[0] = implz2080U_5881;
-            (*out0)->v1 = scalar_out_5966;
-            (*out0)->v2 = scalar_out_5967;
-            (*out0)->v3 = scalar_out_5968;
-        }
-    }
-    lock_unlock(&ctx->lock);
-    return ret;
-}
-int futhark_entry_mouse(struct futhark_context *ctx,
-                        struct futhark_opaque_state **out0, const int32_t in0,
-                        const int32_t in1, const int32_t in2, const
-                        struct futhark_opaque_state *in3)
-{
-    struct memblock board_mem_5945;
-    
-    board_mem_5945.references = NULL;
-    
-    int64_t implz2080U_5866;
-    int32_t buttons_5867;
-    int32_t x_5868;
-    int32_t y_5869;
-    int64_t nb_columns_5871;
-    int64_t nb_rows_5872;
-    int64_t sizze_5873;
-    struct memblock out_mem_5965;
-    
-    out_mem_5965.references = NULL;
-    
-    int64_t scalar_out_5966;
-    int64_t scalar_out_5967;
-    int64_t scalar_out_5968;
-    int ret = 0;
-    
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    buttons_5867 = in0;
-    x_5868 = in1;
-    y_5869 = in2;
-    board_mem_5945 = in3->v0->mem;
-    implz2080U_5866 = in3->v0->shape[0];
-    nb_columns_5871 = in3->v1;
-    nb_rows_5872 = in3->v2;
-    sizze_5873 = in3->v3;
-    if (!(true && (true && (true && implz2080U_5866 == in3->v0->shape[0])))) {
-        ret = 1;
-        if (!ctx->error)
-            ctx->error =
-                msgprintf("Error: entry point arguments have invalid sizes.\n");
-    } else {
-        ret = futrts_mouse(ctx, &out_mem_5965, &scalar_out_5966,
-                           &scalar_out_5967, &scalar_out_5968, board_mem_5945,
-                           implz2080U_5866, buttons_5867, x_5868, y_5869,
-                           nb_columns_5871, nb_rows_5872, sizze_5873);
-        if (ret == 0) {
-            assert((*out0 =
-                    (struct futhark_opaque_state *) malloc(sizeof(struct futhark_opaque_state))) !=
-                NULL);
-            assert(((*out0)->v0 =
-                    (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d))) !=
-                NULL);
-            (*out0)->v0->mem = out_mem_5965;
-            (*out0)->v0->shape[0] = implz2080U_5866;
-            (*out0)->v1 = scalar_out_5966;
-            (*out0)->v2 = scalar_out_5967;
-            (*out0)->v3 = scalar_out_5968;
-        }
-    }
-    lock_unlock(&ctx->lock);
-    return ret;
-}
-int futhark_entry_render(struct futhark_context *ctx,
-                         struct futhark_u32_2d **out0, const
-                         struct futhark_opaque_state *in0)
-{
-    struct memblock board_mem_5945;
-    
-    board_mem_5945.references = NULL;
-    
-    int64_t implz2080U_5888;
-    int64_t nb_columns_5890;
-    int64_t nb_rows_5891;
-    int64_t sizze_5892;
-    struct memblock out_mem_5965;
-    
-    out_mem_5965.references = NULL;
-    
-    int64_t out_arrsizze_5966;
-    int64_t out_arrsizze_5967;
-    int ret = 0;
-    
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    board_mem_5945 = in0->v0->mem;
-    implz2080U_5888 = in0->v0->shape[0];
-    nb_columns_5890 = in0->v1;
-    nb_rows_5891 = in0->v2;
-    sizze_5892 = in0->v3;
-    if (!(implz2080U_5888 == in0->v0->shape[0])) {
-        ret = 1;
-        if (!ctx->error)
-            ctx->error =
-                msgprintf("Error: entry point arguments have invalid sizes.\n");
-    } else {
-        ret = futrts_render(ctx, &out_mem_5965, &out_arrsizze_5966,
-                            &out_arrsizze_5967, board_mem_5945, implz2080U_5888,
-                            nb_columns_5890, nb_rows_5891, sizze_5892);
-        if (ret == 0) {
-            assert((*out0 =
-                    (struct futhark_u32_2d *) malloc(sizeof(struct futhark_u32_2d))) !=
-                NULL);
-            (*out0)->mem = out_mem_5965;
-            (*out0)->shape[0] = out_arrsizze_5966;
-            (*out0)->shape[1] = out_arrsizze_5967;
-        }
-    }
-    lock_unlock(&ctx->lock);
-    return ret;
-}
-int futhark_entry_resize(struct futhark_context *ctx,
-                         struct futhark_opaque_state **out0, const int64_t in0,
-                         const int64_t in1, const
-                         struct futhark_opaque_state *in2)
-{
-    struct memblock board_mem_5945;
-    
-    board_mem_5945.references = NULL;
-    
-    int64_t implz2080U_5848;
-    int64_t h_5849;
-    int64_t w_5850;
-    int64_t nb_columns_5852;
-    int64_t nb_rows_5853;
-    int64_t sizze_5854;
-    struct memblock out_mem_5965;
-    
-    out_mem_5965.references = NULL;
-    
-    int64_t scalar_out_5966;
-    int64_t scalar_out_5967;
-    int64_t scalar_out_5968;
-    int ret = 0;
-    
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    h_5849 = in0;
-    w_5850 = in1;
-    board_mem_5945 = in2->v0->mem;
-    implz2080U_5848 = in2->v0->shape[0];
-    nb_columns_5852 = in2->v1;
-    nb_rows_5853 = in2->v2;
-    sizze_5854 = in2->v3;
-    if (!(true && (true && implz2080U_5848 == in2->v0->shape[0]))) {
-        ret = 1;
-        if (!ctx->error)
-            ctx->error =
-                msgprintf("Error: entry point arguments have invalid sizes.\n");
-    } else {
-        ret = futrts_resizze(ctx, &out_mem_5965, &scalar_out_5966,
-                             &scalar_out_5967, &scalar_out_5968, board_mem_5945,
-                             implz2080U_5848, h_5849, w_5850, nb_columns_5852,
-                             nb_rows_5853, sizze_5854);
-        if (ret == 0) {
-            assert((*out0 =
-                    (struct futhark_opaque_state *) malloc(sizeof(struct futhark_opaque_state))) !=
-                NULL);
-            assert(((*out0)->v0 =
-                    (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d))) !=
-                NULL);
-            (*out0)->v0->mem = out_mem_5965;
-            (*out0)->v0->shape[0] = implz2080U_5848;
-            (*out0)->v1 = scalar_out_5966;
-            (*out0)->v2 = scalar_out_5967;
-            (*out0)->v3 = scalar_out_5968;
-        }
-    }
-    lock_unlock(&ctx->lock);
-    return ret;
-}
-int futhark_entry_step(struct futhark_context *ctx,
-                       struct futhark_opaque_state **out0, const float in0,
-                       const struct futhark_opaque_state *in1)
-{
-    struct memblock board_mem_5945;
-    
-    board_mem_5945.references = NULL;
-    
-    int64_t implz2080U_5855;
-    float nameless_5856;
-    int64_t nb_columns_5858;
-    int64_t nb_rows_5859;
-    int64_t sizze_5860;
-    struct memblock out_mem_5965;
-    
-    out_mem_5965.references = NULL;
-    
-    int64_t scalar_out_5966;
-    int64_t scalar_out_5967;
-    int64_t scalar_out_5968;
-    int ret = 0;
-    
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    nameless_5856 = in0;
-    board_mem_5945 = in1->v0->mem;
-    implz2080U_5855 = in1->v0->shape[0];
-    nb_columns_5858 = in1->v1;
-    nb_rows_5859 = in1->v2;
-    sizze_5860 = in1->v3;
-    if (!(true && implz2080U_5855 == in1->v0->shape[0])) {
-        ret = 1;
-        if (!ctx->error)
-            ctx->error =
-                msgprintf("Error: entry point arguments have invalid sizes.\n");
-    } else {
-        ret = futrts_step(ctx, &out_mem_5965, &scalar_out_5966,
-                          &scalar_out_5967, &scalar_out_5968, board_mem_5945,
-                          implz2080U_5855, nameless_5856, nb_columns_5858,
-                          nb_rows_5859, sizze_5860);
-        if (ret == 0) {
-            assert((*out0 =
-                    (struct futhark_opaque_state *) malloc(sizeof(struct futhark_opaque_state))) !=
-                NULL);
-            assert(((*out0)->v0 =
-                    (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d))) !=
-                NULL);
-            (*out0)->v0->mem = out_mem_5965;
-            (*out0)->v0->shape[0] = implz2080U_5855;
-            (*out0)->v1 = scalar_out_5966;
-            (*out0)->v2 = scalar_out_5967;
-            (*out0)->v3 = scalar_out_5968;
-        }
-    }
-    lock_unlock(&ctx->lock);
-    return ret;
-}
-int futhark_entry_wheel(struct futhark_context *ctx,
-                        struct futhark_opaque_state **out0, const int32_t in0,
-                        const int32_t in1, const
-                        struct futhark_opaque_state *in2)
-{
-    struct memblock board_mem_5945;
-    
-    board_mem_5945.references = NULL;
-    
-    int64_t implz2080U_5874;
-    int32_t dx_5875;
-    int32_t dy_5876;
-    int64_t nb_columns_5878;
-    int64_t nb_rows_5879;
-    int64_t sizze_5880;
-    struct memblock out_mem_5965;
-    
-    out_mem_5965.references = NULL;
-    
-    int64_t scalar_out_5966;
-    int64_t scalar_out_5967;
-    int64_t scalar_out_5968;
-    int ret = 0;
-    
-    lock_lock(&ctx->lock);
-    worker_local = &ctx->scheduler.workers[0];
-    dx_5875 = in0;
-    dy_5876 = in1;
-    board_mem_5945 = in2->v0->mem;
-    implz2080U_5874 = in2->v0->shape[0];
-    nb_columns_5878 = in2->v1;
-    nb_rows_5879 = in2->v2;
-    sizze_5880 = in2->v3;
-    if (!(true && (true && implz2080U_5874 == in2->v0->shape[0]))) {
-        ret = 1;
-        if (!ctx->error)
-            ctx->error =
-                msgprintf("Error: entry point arguments have invalid sizes.\n");
-    } else {
-        ret = futrts_wheel(ctx, &out_mem_5965, &scalar_out_5966,
-                           &scalar_out_5967, &scalar_out_5968, board_mem_5945,
-                           implz2080U_5874, dx_5875, dy_5876, nb_columns_5878,
-                           nb_rows_5879, sizze_5880);
-        if (ret == 0) {
-            assert((*out0 =
-                    (struct futhark_opaque_state *) malloc(sizeof(struct futhark_opaque_state))) !=
-                NULL);
-            assert(((*out0)->v0 =
-                    (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d))) !=
-                NULL);
-            (*out0)->v0->mem = out_mem_5965;
-            (*out0)->v0->shape[0] = implz2080U_5874;
-            (*out0)->v1 = scalar_out_5966;
-            (*out0)->v2 = scalar_out_5967;
-            (*out0)->v3 = scalar_out_5968;
-        }
-    }
-    lock_unlock(&ctx->lock);
-    return ret;
-}
diff --git a/game_of_life/gol.fut b/game_of_life/gol.fut
deleted file mode 100644
index da907c68aa88400cf61f247bfcc5c200dd107381..0000000000000000000000000000000000000000
--- a/game_of_life/gol.fut
+++ /dev/null
@@ -1,34 +0,0 @@
-import "./lib/github.com/diku-dk/lys/lys"
-
-type sized_state [n] = {board: [n]i8, nb_rows: i64, nb_columns:i64, size:i64}
-
-type^ state = sized_state []
-
-let keydown (key: i32) (s: state) = s
-let event (e: event) (s: state): state = s
-
-entry mouse (buttons: i32) (x: i32) (y: i32) (s: state): state =
-  event (#mouse {buttons, x, y}) s
-
-entry wheel (dx: i32) (dy: i32) (s: state): state =
-  event (#wheel {dx, dy}) s
-
-entry key (e: i32) (key: i32) (s: state): state =
-  let e' = if e == 0 then #keydown {key} else #keyup {key}
-  in event e' s
-
-entry resize (h: i64) (w: i64) (s: state): state = s
-
-let get_cell_index (x:i64) (y:i64) (nb_columns:i64) :i64 = (y * nb_columns + x)
-
-entry step (_: f32) (s: state): state = s
-
-entry render (s: state): [][]argb.colour =
-    let ridxs = iota s.nb_rows
-    let cidxs = iota s.nb_columns
-    in map (\y ->
-         map (\x -> if s.board[get_cell_index x y s.nb_columns] == 1 then argb.white else argb.black) cidxs)
-       ridxs
-
-entry init [n] (board: [n]i8) (nb_rows: i64) (nb_columns: i64) (size:i64) : state =
-    { board = board, nb_rows = nb_rows, nb_columns = nb_columns, size = size }
diff --git a/game_of_life/gol.h b/game_of_life/gol.h
deleted file mode 100644
index dca27183d50680d61c3ff4ed213266b924554909..0000000000000000000000000000000000000000
--- a/game_of_life/gol.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#pragma once
-
-// Headers
-
-#include <stdint.h>
-#include <stddef.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <float.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Initialisation
-
-struct futhark_context_config ;
-struct futhark_context_config *futhark_context_config_new(void);
-void futhark_context_config_free(struct futhark_context_config *cfg);
-void futhark_context_config_set_debugging(struct futhark_context_config *cfg,
-                                          int flag);
-void futhark_context_config_set_profiling(struct futhark_context_config *cfg,
-                                          int flag);
-void futhark_context_config_set_logging(struct futhark_context_config *cfg,
-                                        int flag);
-void futhark_context_config_set_num_threads(struct futhark_context_config *cfg,
-                                            int n);
-struct futhark_context ;
-struct futhark_context *futhark_context_new(struct futhark_context_config *cfg);
-void futhark_context_free(struct futhark_context *ctx);
-int futhark_context_sync(struct futhark_context *ctx);
-int futhark_context_config_set_size(struct futhark_context_config *cfg, const
-                                    char *size_name, size_t size_value);
-int futhark_get_num_sizes(void);
-const char *futhark_get_size_name(int);
-const char *futhark_get_size_class(int);
-
-// Arrays
-
-struct futhark_i8_1d ;
-struct futhark_i8_1d *futhark_new_i8_1d(struct futhark_context *ctx, const
-                                        int8_t *data, int64_t dim0);
-struct futhark_i8_1d *futhark_new_raw_i8_1d(struct futhark_context *ctx, const
-                                            char *data, int offset,
-                                            int64_t dim0);
-int futhark_free_i8_1d(struct futhark_context *ctx, struct futhark_i8_1d *arr);
-int futhark_values_i8_1d(struct futhark_context *ctx, struct futhark_i8_1d *arr,
-                         int8_t *data);
-char *futhark_values_raw_i8_1d(struct futhark_context *ctx,
-                               struct futhark_i8_1d *arr);
-const int64_t *futhark_shape_i8_1d(struct futhark_context *ctx,
-                                   struct futhark_i8_1d *arr);
-struct futhark_u32_2d ;
-struct futhark_u32_2d *futhark_new_u32_2d(struct futhark_context *ctx, const
-                                          uint32_t *data, int64_t dim0,
-                                          int64_t dim1);
-struct futhark_u32_2d *futhark_new_raw_u32_2d(struct futhark_context *ctx, const
-                                              char *data, int offset,
-                                              int64_t dim0, int64_t dim1);
-int futhark_free_u32_2d(struct futhark_context *ctx,
-                        struct futhark_u32_2d *arr);
-int futhark_values_u32_2d(struct futhark_context *ctx,
-                          struct futhark_u32_2d *arr, uint32_t *data);
-char *futhark_values_raw_u32_2d(struct futhark_context *ctx,
-                                struct futhark_u32_2d *arr);
-const int64_t *futhark_shape_u32_2d(struct futhark_context *ctx,
-                                    struct futhark_u32_2d *arr);
-
-// Opaque values
-
-struct futhark_opaque_state ;
-int futhark_free_opaque_state(struct futhark_context *ctx,
-                              struct futhark_opaque_state *obj);
-int futhark_store_opaque_state(struct futhark_context *ctx, const
-                               struct futhark_opaque_state *obj, void **p,
-                               size_t *n);
-struct futhark_opaque_state
-*futhark_restore_opaque_state(struct futhark_context *ctx, const void *p);
-
-// Entry points
-
-int futhark_entry_init(struct futhark_context *ctx,
-                       struct futhark_opaque_state **out0, const
-                       struct futhark_i8_1d *in0, const int64_t in1, const
-                       int64_t in2, const int64_t in3);
-int futhark_entry_key(struct futhark_context *ctx,
-                      struct futhark_opaque_state **out0, const int32_t in0,
-                      const int32_t in1, const
-                      struct futhark_opaque_state *in2);
-int futhark_entry_mouse(struct futhark_context *ctx,
-                        struct futhark_opaque_state **out0, const int32_t in0,
-                        const int32_t in1, const int32_t in2, const
-                        struct futhark_opaque_state *in3);
-int futhark_entry_render(struct futhark_context *ctx,
-                         struct futhark_u32_2d **out0, const
-                         struct futhark_opaque_state *in0);
-int futhark_entry_resize(struct futhark_context *ctx,
-                         struct futhark_opaque_state **out0, const int64_t in0,
-                         const int64_t in1, const
-                         struct futhark_opaque_state *in2);
-int futhark_entry_step(struct futhark_context *ctx,
-                       struct futhark_opaque_state **out0, const float in0,
-                       const struct futhark_opaque_state *in1);
-int futhark_entry_wheel(struct futhark_context *ctx,
-                        struct futhark_opaque_state **out0, const int32_t in0,
-                        const int32_t in1, const
-                        struct futhark_opaque_state *in2);
-
-// Miscellaneous
-
-char *futhark_context_report(struct futhark_context *ctx);
-char *futhark_context_get_error(struct futhark_context *ctx);
-void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f);
-void futhark_context_pause_profiling(struct futhark_context *ctx);
-void futhark_context_unpause_profiling(struct futhark_context *ctx);
-int futhark_context_clear_caches(struct futhark_context *ctx);
-#define FUTHARK_BACKEND_multicore
-#ifdef __cplusplus
-}
-#endif
diff --git a/game_of_life/lib/github.com/athas/matte/.gitignore b/game_of_life/lib/github.com/athas/matte/.gitignore
deleted file mode 100644
index 3d8fd0fe6760e20a08eee8bc30d61cf8883d0648..0000000000000000000000000000000000000000
--- a/game_of_life/lib/github.com/athas/matte/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-*
-!.gitignore
-!*.fut
diff --git a/game_of_life/lib/github.com/athas/matte/colour.fut b/game_of_life/lib/github.com/athas/matte/colour.fut
deleted file mode 100644
index 4d47177c1087302ee8da3d86e3c939868c086c91..0000000000000000000000000000000000000000
--- a/game_of_life/lib/github.com/athas/matte/colour.fut
+++ /dev/null
@@ -1,186 +0,0 @@
--- | Colour manipulation library.
---
--- Adapted from the [Gloss](https://hackage.haskell.org/package/gloss)
--- library by Ben Lippmeier.
-
--- | A colour that can be converted back and forth between an RGBA
--- representation.  Not very useful by itself, but using just this
--- interface one can generate a lot of other useful functions via the
--- colourspace parametric module.
-module type colour = {
-  type colour
-
-  -- | Construct a colour from R, G, B and A channels, each of which
-  -- must be a floating-point number between 0.0 and 1.0.  The
-  -- concrete representation need not be able to handle the full
-  -- precision of each channel.  Thus, `from_rgba` and `to_rgba` need
-  -- not be inverse of each other (but should be close).
-  val from_rgba: f32 -> f32 -> f32 -> f32 -> colour
-
-  -- | Convert a colour to four R, G, B and A channels, each of which
-  -- is a floating-point number between 0.0 and 1.0.
-  val to_rgba: colour -> (f32, f32, f32, f32)
-}
-
--- | A colour representation that encodes the four RGBA channels as a
--- byte each in a 32-bit word, using the order A-R-G-B.
-module argb_colour: colour with colour = u32 = {
-  -- ARGB storage.
-  type colour = u32
-
-  let clamp_channel (x: f32): f32 =
-    if x < 0f32 then 0f32 else if x > 1f32 then 1f32 else x
-
-  let from_rgba (r: f32) (g: f32) (b: f32) (a: f32): colour =
-    ((u32.f32 (clamp_channel a * 255) << 24) |
-     (u32.f32 (clamp_channel r * 255) << 16) |
-     (u32.f32 (clamp_channel g * 255) << 8)  |
-     (u32.f32 (clamp_channel b * 255)))
-
-  let to_rgba (x: colour): (f32,f32,f32,f32) =
-    (f32.u32 ((x>>16) & 0xFF) / 255,
-     f32.u32 ((x>>8) & 0xFF) / 255,
-     f32.u32 ((x>>0) & 0xFF) / 255,
-     f32.u32 ((x>>24) & 0xFF) / 255)
-}
-
--- | A colour representation and a host of useful functions and constants.
-module type colourspace = {
-  include colour
-
-  -- | Add RGB components of a color component-wise, then normalise
-  -- them to the highest resulting one. The alpha components are
-  -- averaged.
-  val add: colour -> colour -> colour
-
-  -- | Add RGBA components of a color component-wise, capping them at
-  -- the maximum.
-  val add_linear: colour -> colour -> colour
-
-  val mult: colour -> colour -> colour
-  val scale: colour -> f32 -> colour
-  val mix: f32 -> colour -> f32 -> colour -> colour
-
-  -- | Brighten 20%.
-  val bright: colour -> colour
-  -- | Dim 20%.
-  val dim: colour -> colour
-  -- | 20% lighter.
-  val light: colour -> colour
-  -- | 20% darker.
-  val dark: colour -> colour
-
-  -- Basic colours
-  val black: colour
-  val red: colour
-  val green: colour
-  val blue: colour
-  val white: colour
-  val brown: colour
-
-  -- Derived colours
-  val yellow: colour
-  val orange: colour
-  val magenta: colour
-  val violet: colour
-
-  -- | Grayness from 0-1.
-  val gray: f32 -> colour
-}
-
--- | Given a colour representation, construct a colourspace with all
--- the handy functions and constants.
-module colourspace(C: colour): colourspace with colour = C.colour = {
-  open C
-
-  let from_rgb_normalised (r: f32) (g: f32) (b: f32): colour =
-    let m = f32.max r (f32.max g b)
-    in from_rgba (r / m) (g / m) (b / m) 1f32
-
-  -- Normalise a color to the value of its largest RGB component.
-  let normalised_colour (r: f32) (g: f32) (b: f32) (a: f32): colour =
-    let m = f32.max r (f32.max g b)
-    in from_rgba (r / m) (g / m) (b / m) a
-
-  let add (x: colour) (y: colour): colour =
-    let (r1,g1,b1,a1) = to_rgba x
-    let (r2,g2,b2,a2) = to_rgba y
-    in normalised_colour
-       (f32.max r1 r2)
-       (f32.max g1 g2)
-       (f32.max b1 b2)
-       ((a1+a2)/2f32)
-
-  let add_linear (x: colour) (y: colour): colour =
-    let (r1,g1,b1,a1) = to_rgba x
-    let (r2,g2,b2,a2) = to_rgba y
-    in from_rgba (r1+r2) (g1+g2) (b1+b2) (a1+a2)
-
-  let mult (x: colour) (y: colour): colour =
-    let (r1,g1,b1,a1) = to_rgba x
-    let (r2,g2,b2,a2) = to_rgba y
-    in from_rgba (r1*r2) (g1*g2) (b1*b2) (a1*a2)
-
-  let scale (x: colour) (s: f32): colour =
-    let (r,g,b,a) = to_rgba x
-    in from_rgba (r*s) (g*s) (b*s) (a*s)
-
-  let mix (m1: f32) (c1: colour) (m2: f32) (c2: colour): colour =
-    let (r1,g1,b1,a1) = to_rgba c1
-    let (r2,g2,b2,a2) = to_rgba c2
-
-    let m12 = m1 + m2
-    let m1' = m1 / m12
-    let m2' = m2 / m12
-
-    let r1s = r1 * r1
-    let r2s = r2 * r2
-
-    let g1s = g1 * g1
-    let g2s = g2 * g2
-
-    let b1s = b1 * b1
-    let b2s = b2 * b2
-
-    in from_rgba (f32.sqrt (m1' * r1s + m2' * r2s))
-                 (f32.sqrt (m1' * g1s + m2' * g2s))
-                 (f32.sqrt (m1' * b1s + m2' * b2s))
-                 ((m1 * a1 + m2 * a2) / m12)
-
-
-  let bright (c: colour): colour =
-    let (r,g,b,a) = to_rgba c
-    in from_rgba (r * 1.2f32) (g * 1.2f32) (b * 1.2f32) a
-
-  let dim (c: colour): colour =
-    let (r,g,b,a) = to_rgba c
-    in from_rgba (r * 0.8f32) (g * 0.8f32) (b * 0.8f32) a
-
-  let light (c: colour): colour =
-    let (r,g,b,a) = to_rgba c
-    in from_rgba (r + 0.2f32) (g + 0.2f32) (b + 0.2f32) a
-
-  let dark (c: colour): colour =
-    let (r,g,b,a) = to_rgba c
-    in from_rgba (r - 0.2f32) (g - 0.2f32) (b - 0.2f32) a
-
-  -- Basic colours
-  let black: colour = from_rgba 0f32 0f32 0f32 1f32
-  let red: colour = from_rgba 1f32 0f32 0f32 1f32
-  let green: colour = from_rgba 0f32 1f32 0f32 1f32
-  let blue: colour = from_rgba 0f32 0f32 1f32 1f32
-  let white: colour = from_rgba 1f32 1f32 1f32 1f32
-  let brown: colour = from_rgba 0.49f32 0.19f32 0.11f32 1f32
-
-  -- Derived colours
-  let yellow: colour = add red green
-  let orange: colour = add yellow red
-  let magenta: colour = add red blue
-  let violet: colour = add magenta blue
-
-  let gray (d: f32): colour = from_rgba d d d 1f32
-}
-
--- | An ARGB colour space - simply `colourspace`@term applied to
--- `argb_colour`@term.
-module argb: colourspace with colour = argb_colour.colour = colourspace argb_colour
diff --git a/game_of_life/lib/github.com/athas/matte/colour_test.fut b/game_of_life/lib/github.com/athas/matte/colour_test.fut
deleted file mode 100644
index f2e5eed7425380e45f831f11ea32936c6327578b..0000000000000000000000000000000000000000
--- a/game_of_life/lib/github.com/athas/matte/colour_test.fut
+++ /dev/null
@@ -1,17 +0,0 @@
--- | ignore
-
--- Proper tests of this library require drawing colours to the screen,
--- I think.
-
-import "colour"
-
--- ==
--- entry: basic_mix
--- input {} output {0.7058824f32 0.7058824f32 0.7058824f32 1.0f32}
-entry basic_mix =
-  argb.to_rgba (argb.mix 0.5f32 argb.white 0.5f32 argb.black)
-
--- ==
--- entry: is_argb
--- input {} output {0xFF000000u32}
-entry is_argb: u32 = argb.black
diff --git a/game_of_life/lib/github.com/diku-dk/lys/Inconsolata-Regular.ttf b/game_of_life/lib/github.com/diku-dk/lys/Inconsolata-Regular.ttf
deleted file mode 100644
index 592ccd20073f76a663c56fe0176397149782565c..0000000000000000000000000000000000000000
Binary files a/game_of_life/lib/github.com/diku-dk/lys/Inconsolata-Regular.ttf and /dev/null differ
diff --git a/game_of_life/lib/github.com/diku-dk/lys/common.mk b/game_of_life/lib/github.com/diku-dk/lys/common.mk
deleted file mode 100644
index b6756ebf6c9d6d52606c4fd58e88fcc8c6384e0b..0000000000000000000000000000000000000000
--- a/game_of_life/lib/github.com/diku-dk/lys/common.mk
+++ /dev/null
@@ -1,36 +0,0 @@
-.PHONY: all run clean
-
-PROGNAME?=lys
-
-all: $(PROGNAME)
-
-LYS_TTF=1
-
-ifeq ($(shell test futhark.pkg -nt lib; echo $$?),0)
-$(PROGNAME):
-	futhark pkg sync
-	@make # The sync might have resulted in a new Makefile.
-else
-include lib/github.com/diku-dk/lys/setup_flags.mk
-$(PROGNAME): $(PROGNAME)_wrapper.o $(PROGNAME)_printf.h lib/github.com/diku-dk/lys/liblys.c lib/github.com/diku-dk/lys/liblys.h lib/github.com/diku-dk/lys/context_setup.c lib/github.com/diku-dk/lys/context_setup.h lib/github.com/diku-dk/lys/main.c
-	gcc lib/github.com/diku-dk/lys/liblys.c lib/github.com/diku-dk/lys/context_setup.c lib/github.com/diku-dk/lys/main.c -I. -DPROGHEADER='"$(PROGNAME)_wrapper.h"' -DPRINTFHEADER='"$(PROGNAME)_printf.h"' $(PROGNAME)_wrapper.o -o $@ $(CFLAGS) $(LDFLAGS)
-endif
-
-$(PROGNAME)_printf.h: $(PROGNAME)_wrapper.c
-	python3 lib/github.com/diku-dk/lys/gen_printf.py $@ $<
-
-# We do not want warnings and such for the generated code.
-$(PROGNAME)_wrapper.o: $(PROGNAME)_wrapper.c
-	gcc -o $@ -c $< $(NOWARN_CFLAGS)
-
-%.c: %.fut
-	futhark $(LYS_BACKEND) --library $<
-
-%_wrapper.fut: lib/github.com/diku-dk/lys/genlys.fut $(PROG_FUT_DEPS)
-	cat $< | sed 's/"lys"/"$(PROGNAME)"/' > $@
-
-run: $(PROGNAME)
-	./$(PROGNAME)
-
-clean:
-	rm -f $(PROGNAME) $(PROGNAME).c $(PROGNAME).h $(PROGNAME)_wrapper.* $(PROGNAME)_printf.h *.o
diff --git a/game_of_life/lib/github.com/diku-dk/lys/context_setup.c b/game_of_life/lib/github.com/diku-dk/lys/context_setup.c
deleted file mode 100644
index 96a387b2713eb49b1097c1888bc788facb0cbc8b..0000000000000000000000000000000000000000
--- a/game_of_life/lib/github.com/diku-dk/lys/context_setup.c
+++ /dev/null
@@ -1,50 +0,0 @@
-#include "context_setup.h"
-
-void lys_setup_futhark_context(const char *deviceopt, bool device_interactive,
-                               struct futhark_context_config* *futcfg,
-                               struct futhark_context* *futctx,
-                               char* *opencl_device_name) {
-  *futcfg = futhark_context_config_new();
-  assert(*futcfg != NULL);
-
-#if defined(FUTHARK_BACKEND_opencl) || defined(FUTHARK_BACKEND_cuda)
-  if (deviceopt != NULL) {
-    futhark_context_config_set_device(*futcfg, deviceopt);
-  }
-#else
-  (void)deviceopt;
-#endif
-
-#ifdef FUTHARK_BACKEND_opencl
-  if (device_interactive) {
-    futhark_context_config_select_device_interactively(*futcfg);
-  }
-#else
-  (void)device_interactive;
-#endif
-
-  *futctx = futhark_context_new(*futcfg);
-  assert(*futctx != NULL);
-
-#ifdef FUTHARK_BACKEND_opencl
-  cl_device_id device;
-  assert(clGetCommandQueueInfo(futhark_context_get_command_queue(*futctx),
-                               CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL)
-         == CL_SUCCESS);
-
-  size_t dev_name_size;
-  assert(clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &dev_name_size)
-         == CL_SUCCESS);
-  *opencl_device_name = malloc(dev_name_size);
-  assert(clGetDeviceInfo(device, CL_DEVICE_NAME, dev_name_size, *opencl_device_name, NULL)
-         == CL_SUCCESS);
-#else
-  *opencl_device_name = NULL;
-#endif
-}
-
-int64_t lys_wall_time() {
-  struct timeval time;
-  assert(gettimeofday(&time,NULL) == 0);
-  return time.tv_sec * 1000000 + time.tv_usec;
-}
diff --git a/game_of_life/lib/github.com/diku-dk/lys/context_setup.h b/game_of_life/lib/github.com/diku-dk/lys/context_setup.h
deleted file mode 100644
index d613bd7396d107c2db8e5a1296ea9484f6e00f51..0000000000000000000000000000000000000000
--- a/game_of_life/lib/github.com/diku-dk/lys/context_setup.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef LIBLYS_CONTEXT_SETUP
-#define LIBLYS_CONTEXT_SETUP
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <time.h>
-#include <sys/time.h>
-
-#include PROGHEADER
-
-void lys_setup_futhark_context(const char *deviceopt, bool device_interactive,
-                               struct futhark_context_config* *futcfg,
-                               struct futhark_context* *futctx,
-                               char* *opencl_device_name);
-
-int64_t lys_wall_time();
-
-#define FUT_CHECK(ctx, x) _fut_check(ctx, x, __FILE__, __LINE__)
-static inline void _fut_check(struct futhark_context *ctx, int res,
-                              const char *file, int line) {
-  if (res != 0) {
-    fprintf(stderr, "%s:%d: Futhark error %d: %s\n",
-            file, line, res, futhark_context_get_error(ctx));
-    exit(EXIT_FAILURE);
-  }
-}
-
-#endif
diff --git a/game_of_life/lib/github.com/diku-dk/lys/default.nix b/game_of_life/lib/github.com/diku-dk/lys/default.nix
deleted file mode 100644
index f45fb1f91bdd28da811700f342f6e449b61d06a6..0000000000000000000000000000000000000000
--- a/game_of_life/lib/github.com/diku-dk/lys/default.nix
+++ /dev/null
@@ -1,5 +0,0 @@
-with import <nixpkgs> {};
-stdenv.mkDerivation {
-    name = "lys";
-    buildInputs = [ pkgconfig SDL2 SDL2_ttf ocl-icd opencl-headers ];
-}
diff --git a/game_of_life/lib/github.com/diku-dk/lys/gen_printf.py b/game_of_life/lib/github.com/diku-dk/lys/gen_printf.py
deleted file mode 100644
index f1f448c031adab7c701377934c695542ace17b39..0000000000000000000000000000000000000000
--- a/game_of_life/lib/github.com/diku-dk/lys/gen_printf.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import re
-
-out_file, in_file = sys.argv[1:]
-
-with open(in_file) as f:
-    contents = f.read()
-
-start = contents.find('futhark_entry_text_content')
-end = contents.find(')', start)
-types = re.findall('([^ ]+) \*out\d+,', contents[start:end])
-out_vars = ['out{}'.format(i) for i in range(len(types))]
-
-with open(out_file, 'w') as f:
-    print('#include <stdio.h>', file=f)
-    print('#include "lib/github.com/diku-dk/lys/liblys.h"', file=f)
-    print('', file=f)
-    if len(types) == 0:
-        print('#define UNUSED(x) (void)(x)', file=f)
-    print('void build_text(const struct lys_context *ctx, char* dest, size_t dest_len, const char* format, float render_milliseconds, char* **sum_names) {', file=f)
-    if len(types) > 0:
-        for v, t in zip(out_vars, types):
-            print('  union {{ {} val; char* sum_name; }} {};'.format(t, v), file=f)
-        print('  FUT_CHECK(ctx->fut, futhark_entry_text_content(ctx->fut, {}, render_milliseconds, ctx->state));'.format(', '.join('&{}.val'.format(v) for v in out_vars)), file=f)
-        for v, i in zip(out_vars, range(len(out_vars))):
-            print('  if (sum_names[{}] != NULL) {{'.format(i), file=f)
-            print('    {v}.sum_name = sum_names[{i}][(int32_t) {v}.val];'.format(v=v, i=i), file=f)
-            print('  }', file=f)
-        print('  snprintf(dest, dest_len, format, {});'.format(', '.join((s + ('.sum_name' if t == 'int32_t' else '.val')) for s, t in zip(out_vars, types))), file=f)
-    else:
-        for x in ['ctx', 'render_milliseconds', 'sum_names']:
-            print('UNUSED({});'.format(x), file=f)
-        print('  snprintf(dest, dest_len, "%s", format);', file=f)
-    print('}', file=f)
-    print('', file=f)
-    print('size_t n_printf_arguments() {', file=f)
-    print('  return {};'.format(len(types)), file=f)
-    print('}', file=f)
diff --git a/game_of_life/lib/github.com/diku-dk/lys/genlys.fut b/game_of_life/lib/github.com/diku-dk/lys/genlys.fut
deleted file mode 100644
index e9264eccf42893840802894f2545e663b472893b..0000000000000000000000000000000000000000
--- a/game_of_life/lib/github.com/diku-dk/lys/genlys.fut
+++ /dev/null
@@ -1,41 +0,0 @@
--- | ignore
-
--- This file exists as a wrapper that defines entry points in the
--- specific form that liblys.c requires.  It is copied into place and
--- modified by the rules in common.mk.
-
-module m = import "lys"
-
-type^ state = m.lys.state
-
-entry init (seed: u32) (h: i32) (w: i32): state =
-  m.lys.init seed (i64.i32 h) (i64.i32 w)
-
-entry grab_mouse: bool =
-  m.lys.grab_mouse
-
-entry resize (h: i32) (w: i32) (s: state): state =
-  m.lys.resize (i64.i32 h) (i64.i32 w) s
-
-entry key (e: i32) (key: i32) (s: state): state =
-  let e' = if e == 0 then #keydown {key} else #keyup {key}
-  in m.lys.event e' s
-
-entry mouse (buttons: i32) (x: i32) (y: i32) (s: state): state =
-  m.lys.event (#mouse {buttons, x, y}) s
-
-entry wheel (dx: i32) (dy: i32) (s: state): state =
-  m.lys.event (#wheel {dx, dy}) s
-
-entry step (td: f32) (s: state): state =
-  m.lys.event (#step td) s
-
-entry render (s: state) = m.lys.render s
-
-entry text_colour (s: state): u32 =
-  m.lys.text_colour s
-
-entry text_format: []u8 = m.lys.text_format ()
-
-entry text_content (render_duration: f32) (s: state) =
-  m.lys.text_content render_duration s
diff --git a/game_of_life/lib/github.com/diku-dk/lys/liblys.c b/game_of_life/lib/github.com/diku-dk/lys/liblys.c
deleted file mode 100644
index e7d425206edc70a26e85e2cd9a04c3429b7c3b9e..0000000000000000000000000000000000000000
--- a/game_of_life/lib/github.com/diku-dk/lys/liblys.c
+++ /dev/null
@@ -1,269 +0,0 @@
-// Convenience framework for writing visualisations with Futhark and
-// C/SDL.
-//
-// Based on initial SDL wrapper code by Jakob Stokholm Bertelsen.
-
-#include "liblys.h"
-
-
-static void trigger_event(struct lys_context *ctx, enum lys_event event) {
-  ctx->event_handler(ctx, event);
-}
-
-static void window_size_updated(struct lys_context *ctx, int newx, int newy) {
-  // https://stackoverflow.com/a/40122002
-  ctx->wnd_surface = SDL_GetWindowSurface(ctx->wnd);
-  SDL_ASSERT(ctx->wnd_surface != NULL);
-
-  ctx->width = newx;
-  ctx->height = newy;
-
-  struct futhark_opaque_state *new_state;
-  FUT_CHECK(ctx->fut, futhark_entry_resize(ctx->fut, &new_state, ctx->height, ctx->width, ctx->state));
-  futhark_free_opaque_state(ctx->fut, ctx->state);
-  ctx->state = new_state;
-
-  ctx->wnd_surface = SDL_GetWindowSurface(ctx->wnd);
-  SDL_ASSERT(ctx->wnd_surface != NULL);
-
-  if (ctx->data != NULL) {
-    free(ctx->data);
-  }
-  ctx->data = malloc(ctx->width * ctx->height * sizeof(uint32_t));
-  assert(ctx->data != NULL);
-
-  if (ctx->surface != NULL) {
-    SDL_FreeSurface(ctx->surface);
-  }
-  ctx->surface = SDL_CreateRGBSurfaceFrom(ctx->data, ctx->width, ctx->height,
-                                          32, ctx->width * sizeof(uint32_t), 0xFF0000, 0xFF00, 0xFF, 0x00000000);
-  SDL_ASSERT(ctx->surface != NULL);
-
-  trigger_event(ctx, LYS_WINDOW_SIZE_UPDATED);
-}
-
-static void mouse_event(struct lys_context *ctx, Uint32 state, int x, int y) {
-  // We ignore mouse events if we are running a program that would
-  // like mouse grab, but where we have temporarily taken the mouse
-  // back from it (to e.g. resize the window).
-  if (ctx->grab_mouse != ctx->mouse_grabbed) {
-    return;
-  }
-
-  struct futhark_opaque_state *new_state;
-  FUT_CHECK(ctx->fut, futhark_entry_mouse(ctx->fut, &new_state, state, x, y, ctx->state));
-  futhark_free_opaque_state(ctx->fut, ctx->state);
-  ctx->state = new_state;
-}
-
-static void wheel_event(struct lys_context *ctx, int x, int y) {
-  struct futhark_opaque_state *new_state;
-  FUT_CHECK(ctx->fut, futhark_entry_wheel(ctx->fut, &new_state, x, y, ctx->state));
-  futhark_free_opaque_state(ctx->fut, ctx->state);
-  ctx->state = new_state;
-}
-
-static void handle_sdl_events(struct lys_context *ctx) {
-  SDL_Event event;
-
-  while (SDL_PollEvent(&event) == 1) {
-    switch (event.type) {
-    case SDL_WINDOWEVENT:
-      switch (event.window.event) {
-      case SDL_WINDOWEVENT_RESIZED:
-        {
-          int newx = (int)event.window.data1;
-          int newy = (int)event.window.data2;
-          window_size_updated(ctx, newx, newy);
-          break;
-        }
-      }
-      break;
-    case SDL_QUIT:
-      ctx->running = 0;
-      break;
-    case SDL_MOUSEMOTION:
-      if (ctx->grab_mouse) {
-        mouse_event(ctx, event.motion.state, event.motion.xrel, event.motion.yrel);
-      } else {
-        mouse_event(ctx, event.motion.state, event.motion.x, event.motion.y);
-      }
-      break;
-    case SDL_MOUSEBUTTONDOWN:
-    case SDL_MOUSEBUTTONUP:
-      if (ctx->grab_mouse && !ctx->mouse_grabbed) {
-        assert(SDL_SetRelativeMouseMode(1) == 0);
-        ctx->mouse_grabbed = 1;
-      }
-
-      if (ctx->grab_mouse) {
-        mouse_event(ctx, 1<<(event.button.button-1), event.motion.xrel, event.motion.yrel);
-      } else {
-        mouse_event(ctx, 1<<(event.button.button-1), event.motion.x, event.motion.y);
-      }
-      break;
-    case SDL_MOUSEWHEEL:
-      wheel_event(ctx, event.wheel.x, event.wheel.y);
-      break;
-    case SDL_KEYDOWN:
-    case SDL_KEYUP:
-      switch (event.key.keysym.sym) {
-      case SDLK_ESCAPE:
-        if (ctx->grab_mouse && ctx->mouse_grabbed) {
-          assert(SDL_SetRelativeMouseMode(0) == 0);
-          ctx->mouse_grabbed = 0;
-        } else if (event.key.type == SDL_KEYDOWN) {
-          ctx->running = 0;
-        }
-        break;
-      case SDLK_F1:
-        if (event.key.type == SDL_KEYDOWN) {
-          trigger_event(ctx, LYS_F1);
-        }
-        break;
-      default:
-        {
-          struct futhark_opaque_state *new_state;
-          int e = event.key.type == SDL_KEYDOWN ? 0 : 1;
-          FUT_CHECK(ctx->fut, futhark_entry_key(ctx->fut, &new_state,
-                                                e, event.key.keysym.sym, ctx->state));
-          futhark_free_opaque_state(ctx->fut, ctx->state);
-          ctx->state = new_state;
-        }
-      }
-    }
-  }
-}
-
-static void sdl_loop(struct lys_context *ctx) {
-  struct futhark_u32_2d *out_arr;
-
-  while (ctx->running) {
-    int64_t now = lys_wall_time();
-    float delta = ((float)(now - ctx->last_time))/1000000.0;
-    ctx->fps = (ctx->fps*0.9 + (1/delta)*0.1);
-    ctx->last_time = now;
-    struct futhark_opaque_state *new_state;
-    FUT_CHECK(ctx->fut, futhark_entry_step(ctx->fut, &new_state, delta, ctx->state));
-    futhark_free_opaque_state(ctx->fut, ctx->state);
-    ctx->state = new_state;
-
-    FUT_CHECK(ctx->fut, futhark_entry_render(ctx->fut, &out_arr, ctx->state));
-    FUT_CHECK(ctx->fut, futhark_values_u32_2d(ctx->fut, out_arr, ctx->data));
-    FUT_CHECK(ctx->fut, futhark_free_u32_2d(ctx->fut, out_arr));
-
-    SDL_ASSERT(SDL_BlitSurface(ctx->surface, NULL, ctx->wnd_surface, NULL)==0);
-
-    trigger_event(ctx, LYS_LOOP_ITERATION);
-
-    SDL_ASSERT(SDL_UpdateWindowSurface(ctx->wnd) == 0);
-
-    int delay =  1000.0/ctx->max_fps - delta*1000.0;
-    if (delay > 0) {
-      SDL_Delay(delay);
-    }
-
-    handle_sdl_events(ctx);
-  }
-}
-
-void lys_run_sdl(struct lys_context *ctx) {
-  struct futhark_context *fut = ctx->fut;
-
-  ctx->last_time = lys_wall_time();
-
-  ctx->wnd =
-    SDL_CreateWindow("Lys",
-                     SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED,
-                     ctx->width, ctx->height,
-                     ctx->sdl_flags |
-                     SDL_RENDERER_ACCELERATED |
-                     SDL_RENDERER_PRESENTVSYNC);
-  SDL_ASSERT(ctx->wnd != NULL);
-
-  window_size_updated(ctx, ctx->width, ctx->height);
-
-  ctx->running = 1;
-  ctx->mouse_grabbed = 0;
-
-  if (ctx->grab_mouse) {
-    assert(SDL_SetRelativeMouseMode(1) == 0);
-    ctx->mouse_grabbed = 1;
-  }
-
-  trigger_event(ctx, LYS_LOOP_START);
-
-  sdl_loop(ctx);
-
-  FUT_CHECK(fut, futhark_free_opaque_state(fut, ctx->state));
-
-  trigger_event(ctx, LYS_LOOP_END);
-
-  SDL_FreeSurface(ctx->surface);
-  // do not free wnd_surface (see SDL_GetWindowSurface)
-  SDL_DestroyWindow(ctx->wnd);
-  SDL_Quit();
-}
-
-void lys_setup(struct lys_context *ctx, int width, int height, int max_fps, int sdl_flags) {
-  memset(ctx, 0, sizeof(struct lys_context));
-  ctx->width = width;
-  ctx->height = height;
-  ctx->fps = 0;
-  ctx->max_fps = max_fps;
-  ctx->sdl_flags = sdl_flags;
-
-  SDL_ASSERT(SDL_Init(SDL_INIT_EVERYTHING) == 0);
-}
-
-#ifdef LYS_TTF
-void draw_text(struct lys_context *ctx,
-               TTF_Font *font, int font_size,
-               char* buffer, int32_t colour,
-               int y_start, int x_start) {
-  SDL_Surface *text_surface;
-  SDL_Rect offset_rect;
-
-  SDL_Color sdl_colour =
-      { .a = (colour >> 24) & 0xff,
-        .r = (colour >> 16) & 0xff,
-        .g = (colour >> 8) & 0xff,
-        .b = colour & 0xff };
-
-  offset_rect.x = x_start;
-  int y = y_start;
-  while (true) {
-    char* buffer_start = buffer;
-
-    bool no_more_text = false;
-    while (true) {
-      if (*buffer == '\n') {
-        *buffer = '\0';
-        break;
-      } else if (*buffer == '\0') {
-        no_more_text = true;
-        break;
-      }
-      buffer++;
-    }
-
-    if (*buffer_start != '\0') {
-      text_surface = TTF_RenderUTF8_Blended(font, buffer_start, sdl_colour);
-      SDL_ASSERT(text_surface != NULL);
-      offset_rect.y = y;
-      offset_rect.w = text_surface->w;
-      offset_rect.h = text_surface->h;
-      SDL_ASSERT(SDL_BlitSurface(text_surface, NULL,
-                                 ctx->wnd_surface, &offset_rect) == 0);
-      SDL_FreeSurface(text_surface);
-    }
-
-    if (no_more_text) {
-      break;
-    } else {
-      buffer++;
-      y += font_size;
-    }
-  }
-}
-#endif
diff --git a/game_of_life/lib/github.com/diku-dk/lys/liblys.h b/game_of_life/lib/github.com/diku-dk/lys/liblys.h
deleted file mode 100644
index 4c0e7750a068fe3a5e5db2d138ac96d22dba1753..0000000000000000000000000000000000000000
--- a/game_of_life/lib/github.com/diku-dk/lys/liblys.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef LIBLYS_HEADER
-#define LIBLYS_HEADER
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <assert.h>
-#include <SDL2/SDL.h>
-#include <SDL2/SDL_ttf.h>
-
-#include PROGHEADER
-
-#include "context_setup.h"
-
-enum lys_event {
-  LYS_LOOP_START,
-  LYS_LOOP_ITERATION,
-  LYS_LOOP_END,
-  LYS_WINDOW_SIZE_UPDATED,
-  LYS_F1
-};
-
-struct lys_context {
-  struct futhark_context *fut;
-  struct futhark_opaque_state *state;
-  SDL_Window *wnd;
-  SDL_Surface *wnd_surface;
-  SDL_Surface *surface;
-  int width;
-  int height;
-  uint32_t *data;
-  int64_t last_time;
-  bool running;
-  bool grab_mouse;
-  bool mouse_grabbed;
-  float fps;
-  int max_fps;
-  int sdl_flags;
-  void* event_handler_data;
-  void (*event_handler)(struct lys_context*, enum lys_event);
-};
-
-#define SDL_ASSERT(x) _sdl_assert(x, __FILE__, __LINE__)
-static inline void _sdl_assert(int res, const char *file, int line) {
-  if (res == 0) {
-    fprintf(stderr, "%s:%d: SDL error %d: %s\n",
-            file, line, res, SDL_GetError());
-    exit(EXIT_FAILURE);
-  }
-}
-
-void lys_setup(struct lys_context *ctx, int width, int height, int max_fps, int sdl_flags);
-
-void lys_run_sdl(struct lys_context *ctx);
-
-#ifdef LYS_TTF
-void draw_text(struct lys_context *ctx, TTF_Font *font, int font_size, char* buffer, int32_t colour,
-               int x_start, int y_start);
-#endif
-
-#endif
diff --git a/game_of_life/lib/github.com/diku-dk/lys/lys.fut b/game_of_life/lib/github.com/diku-dk/lys/lys.fut
deleted file mode 100644
index e1039d62bf658bfde2c9bd76372699cd9267727f..0000000000000000000000000000000000000000
--- a/game_of_life/lib/github.com/diku-dk/lys/lys.fut
+++ /dev/null
@@ -1,366 +0,0 @@
--- | Lights, camera, action!
---
--- Making use of Lys requires hooking into (or duplicating) its custom
--- Makefile rules, so you should also read the [usage
--- section](https://github.com/diku-dk/lys/blob/master/README.md#general-usage)
--- of the README.
---
--- On the Futhark side, you need to define a module called `lys` that
--- implements the module type `lys`@mtype.  You can do this directly,
--- or use some of the various conveniences defined in this file.  For
--- example, if you do not care about showing any text, you can use
--- `lys_no_text`@mtype.
-
--- | For convenience, re-export the colour module.
-open import "../../athas/matte/colour"
-
--- | UTF-8 encoded string.  This is what is produced by string
--- literals in Futhark code.
-type string [n] = [n]u8
-
--- | An event is sent when something has happened that might cause the
--- state of the program to change, or just when some time has passed.
--- It is permissible to ignore all of these events.  Things that must
--- not be ignored are separate functions in `lys`@mtype.
---
--- * `#step x`: `x` seconds have passed since `init` or the last time
---   this event was received.
---
--- * `#keydown {key}`: `key` has pressed.
---
--- * `#keyup {key}`: `key` has been released.
---
--- * `#mouse {buttons, x, y}`: The mouse has been moved or clicked.
---   `buttons` is a bit mask indicating which button(s) are held down,
---   and the `x`/`y` the new position of the mouse.
---
--- * `#wheel {dx, dy}`: The mouse wheel has been used.  Note that there can
---   be multiple wheels; this is why the `dy` direction also makes
---   sense.  In most cases, however, only the `dy` will be non-zero.
-type event = #step f32
-           | #keydown {key:i32}
-           | #keyup {key:i32}
-           | #mouse {buttons:i32, x:i32, y:i32}
-           | #wheel {dx:i32, dy:i32}
-
--- | The core subset of the module type of Lys applications.  This is useful if
--- you need a Lys application with custom initialisation or without text
--- rendering.
-module type lys_core = {
-  -- | The state maintained by this Lys application.  Most functions
-  -- will take the current state and return a new state.
-  type~ state
-
-  -- | An event occured.  It is permissible to ignore any of these
-  -- events by returning the same state unchanged.
-  val event : event -> state -> state
-
-  -- | The window was resized.
-  val resize : (h: i64) -> (w: i64) -> state -> state
-
-  -- | The function for rendering a screen image in row-major order
-  -- (height by width).  The size of the array returned must match the
-  -- last dimensions provided to the state (via `init`@term or
-  -- `resize`@term).
-  val render : state -> [][]argb.colour
-}
-
--- | The module type of Lys applications.  If you define a module
--- called `lys` that has this module type, then the autogenerated Lys
--- wrapper application can automatically define the entry point
--- functions that allows Lys to communicate with the C program that
--- actually implements the user interaction.
-module type lys = {
-  include lys_core
-
-  -- | Initial state for a given window size.  A random seed is passed
-  -- in.  Don't treat this as a true random number (it's currently
-  -- just a timestamp), but use it for initialising a proper RNG.
-  val init : (seed: u32) -> (h: i64) -> (w: i64) -> state
-
-  -- | If true, the program will grab the mouse, and all positions
-  -- reported via the `mouse`@term function will be relative to the
-  -- last time `mouse`@term was called.  If in doubt, leave this
-  -- `false`.
-  val grab_mouse : bool
-
-  -- | Show helpful text in the upper-left corner.  Specify in printf format
-  -- with extensions: '%[string1|string2|...]' prints a string but takes an
-  -- index into the given list of strings, separated by '|'.  For example,
-  -- '%[circle|square]' prints 'circle' if passed the i32 value 0, and 'square'
-  -- if passed 1.
-
-  val text_format : () -> string []
-  -- | The content must be a scalar or a tuple of scalars.
-  type text_content
-  val text_content : (fps: f32) -> state -> text_content
-  -- | The colour can vary based on the state.
-  val text_colour : state -> argb.colour
-}
-
--- | A module type for the simple case where we don't want any text.
--- You can define the `lys` module to have this module type instead of
--- `lys`@mtype.  For maximal convenience, you can `open`
--- `lys_no_text`@module inside your module definition.
-module type lys_no_text = lys with text_content = ()
-
--- | A convenience module that can be `open`ed to give dummy
--- definitions for the text-related functionality.
-module lys_no_text = {
-  let text_format () = ""
-  type text_content = ()
-  let text_content _ _ = ()
-  let text_colour _ = argb.black
-}
-
--- | A dummy lys module that just produces a black rectangle and does
--- nothing in response to events.
-module lys: lys_no_text = {
-  type state = {h: i64, w: i64}
-  let init _ h w = {h,w}
-  let event _ s = s
-  let resize h w _ = {h,w}
-  let grab_mouse = false
-  let render {h,w} = replicate w argb.black |> replicate h
-  open lys_no_text
-}
-
--- The following values are taken from
--- https://wiki.libsdl.org/SDLKeycodeLookup
-
-let SDLK_UNKNOWN: i32 = 0x00
-let SDLK_BACKSPACE: i32 = 0x08
-let SDLK_TAB: i32 = 0x09
-let SDLK_RETURN: i32 = 0x0D
-let SDLK_ESCAPE: i32 = 0x1B
-let SDLK_SPACE: i32 = 0x20
-let SDLK_EXCLAIM: i32 = 0x21
-let SDLK_QUOTEDBL: i32 = 0x22
-let SDLK_HASH: i32 = 0x23
-let SDLK_DOLLAR: i32 = 0x24
-let SDLK_PERCENT: i32 = 0x25
-let SDLK_AMPERSAND: i32 = 0x26
-let SDLK_QUOTE: i32 = 0x27
-let SDLK_LEFTPAREN: i32 = 0x28
-let SDLK_RIGHTPAREN: i32 = 0x29
-let SDLK_ASTERISK: i32 = 0x2A
-let SDLK_PLUS: i32 = 0x2B
-let SDLK_COMMA: i32 = 0x2C
-let SDLK_MINUS: i32 = 0x2D
-let SDLK_PERIOD: i32 = 0x2E
-let SDLK_SLASH: i32 = 0x2F
-let SDLK_0: i32 = 0x30
-let SDLK_1: i32 = 0x31
-let SDLK_2: i32 = 0x32
-let SDLK_3: i32 = 0x33
-let SDLK_4: i32 = 0x34
-let SDLK_5: i32 = 0x35
-let SDLK_6: i32 = 0x36
-let SDLK_7: i32 = 0x37
-let SDLK_8: i32 = 0x38
-let SDLK_9: i32 = 0x39
-let SDLK_COLON: i32 = 0x3A
-let SDLK_SEMICOLON: i32 = 0x3B
-let SDLK_LESS: i32 = 0x3C
-let SDLK_EQUALS: i32 = 0x3D
-let SDLK_GREATER: i32 = 0x3E
-let SDLK_QUESTION: i32 = 0x3F
-let SDLK_AT: i32 = 0x40
-let SDLK_LEFTBRACKET: i32 = 0x5B
-let SDLK_BACKSLASH: i32 = 0x5C
-let SDLK_RIGHTBRACKET: i32 = 0x5D
-let SDLK_CARET: i32 = 0x5E
-let SDLK_UNDERSCORE: i32 = 0x5F
-let SDLK_BACKQUOTE: i32 = 0x60
-let SDLK_a: i32 = 0x61
-let SDLK_b: i32 = 0x62
-let SDLK_c: i32 = 0x63
-let SDLK_d: i32 = 0x64
-let SDLK_e: i32 = 0x65
-let SDLK_f: i32 = 0x66
-let SDLK_g: i32 = 0x67
-let SDLK_h: i32 = 0x68
-let SDLK_i: i32 = 0x69
-let SDLK_j: i32 = 0x6A
-let SDLK_k: i32 = 0x6B
-let SDLK_l: i32 = 0x6C
-let SDLK_m: i32 = 0x6D
-let SDLK_n: i32 = 0x6E
-let SDLK_o: i32 = 0x6F
-let SDLK_p: i32 = 0x70
-let SDLK_q: i32 = 0x71
-let SDLK_r: i32 = 0x72
-let SDLK_s: i32 = 0x73
-let SDLK_t: i32 = 0x74
-let SDLK_u: i32 = 0x75
-let SDLK_v: i32 = 0x76
-let SDLK_w: i32 = 0x77
-let SDLK_x: i32 = 0x78
-let SDLK_y: i32 = 0x79
-let SDLK_z: i32 = 0x7A
-let SDLK_DELETE: i32 = 0x7F
-let SDLK_CAPSLOCK: i32 = 0x40000039
-let SDLK_F1: i32 = 0x4000003A
-let SDLK_F2: i32 = 0x4000003B
-let SDLK_F3: i32 = 0x4000003C
-let SDLK_F4: i32 = 0x4000003D
-let SDLK_F5: i32 = 0x4000003E
-let SDLK_F6: i32 = 0x4000003F
-let SDLK_F7: i32 = 0x40000040
-let SDLK_F8: i32 = 0x40000041
-let SDLK_F9: i32 = 0x40000042
-let SDLK_F10: i32 = 0x40000043
-let SDLK_F11: i32 = 0x40000044
-let SDLK_F12: i32 = 0x40000045
-let SDLK_PRINTSCREEN: i32 = 0x40000046
-let SDLK_SCROLLLOCK: i32 = 0x40000047
-let SDLK_PAUSE: i32 = 0x40000048
-let SDLK_INSERT: i32 = 0x40000049
-let SDLK_HOME: i32 = 0x4000004A
-let SDLK_PAGEUP: i32 = 0x4000004B
-let SDLK_END: i32 = 0x4000004D
-let SDLK_PAGEDOWN: i32 = 0x4000004E
-let SDLK_RIGHT: i32 = 0x4000004F
-let SDLK_LEFT: i32 = 0x40000050
-let SDLK_DOWN: i32 = 0x40000051
-let SDLK_UP: i32 = 0x40000052
-let SDLK_NUMLOCKCLEAR: i32 = 0x40000053
-let SDLK_KP_DIVIDE: i32 = 0x40000054
-let SDLK_KP_MULTIPLY: i32 = 0x40000055
-let SDLK_KP_MINUS: i32 = 0x40000056
-let SDLK_KP_PLUS: i32 = 0x40000057
-let SDLK_KP_ENTER: i32 = 0x40000058
-let SDLK_KP_1: i32 = 0x40000059
-let SDLK_KP_2: i32 = 0x4000005A
-let SDLK_KP_3: i32 = 0x4000005B
-let SDLK_KP_4: i32 = 0x4000005C
-let SDLK_KP_5: i32 = 0x4000005D
-let SDLK_KP_6: i32 = 0x4000005E
-let SDLK_KP_7: i32 = 0x4000005F
-let SDLK_KP_8: i32 = 0x40000060
-let SDLK_KP_9: i32 = 0x40000061
-let SDLK_KP_0: i32 = 0x40000062
-let SDLK_KP_PERIOD: i32 = 0x40000063
-let SDLK_APPLICATION: i32 = 0x40000065
-let SDLK_POWER: i32 = 0x40000066
-let SDLK_KP_EQUALS: i32 = 0x40000067
-let SDLK_F13: i32 = 0x40000068
-let SDLK_F14: i32 = 0x40000069
-let SDLK_F15: i32 = 0x4000006A
-let SDLK_F16: i32 = 0x4000006B
-let SDLK_F17: i32 = 0x4000006C
-let SDLK_F18: i32 = 0x4000006D
-let SDLK_F19: i32 = 0x4000006E
-let SDLK_F20: i32 = 0x4000006F
-let SDLK_F21: i32 = 0x40000070
-let SDLK_F22: i32 = 0x40000071
-let SDLK_F23: i32 = 0x40000072
-let SDLK_F24: i32 = 0x40000073
-let SDLK_EXECUTE: i32 = 0x40000074
-let SDLK_HELP: i32 = 0x40000075
-let SDLK_MENU: i32 = 0x40000076
-let SDLK_SELECT: i32 = 0x40000077
-let SDLK_STOP: i32 = 0x40000078
-let SDLK_AGAIN: i32 = 0x40000079
-let SDLK_UNDO: i32 = 0x4000007A
-let SDLK_CUT: i32 = 0x4000007B
-let SDLK_COPY: i32 = 0x4000007C
-let SDLK_PASTE: i32 = 0x4000007D
-let SDLK_FIND: i32 = 0x4000007E
-let SDLK_MUTE: i32 = 0x4000007F
-let SDLK_VOLUMEUP: i32 = 0x40000080
-let SDLK_VOLUMEDOWN: i32 = 0x40000081
-let SDLK_KP_COMMA: i32 = 0x40000085
-let SDLK_KP_EQUALSAS400: i32 = 0x40000086
-let SDLK_ALTERASE: i32 = 0x40000099
-let SDLK_SYSREQ: i32 = 0x4000009A
-let SDLK_CANCEL: i32 = 0x4000009B
-let SDLK_CLEAR: i32 = 0x4000009C
-let SDLK_PRIOR: i32 = 0x4000009D
-let SDLK_RETURN2: i32 = 0x4000009E
-let SDLK_SEPARATOR: i32 = 0x4000009F
-let SDLK_OUT: i32 = 0x400000A0
-let SDLK_OPER: i32 = 0x400000A1
-let SDLK_CLEARAGAIN: i32 = 0x400000A2
-let SDLK_CRSEL: i32 = 0x400000A3
-let SDLK_EXSEL: i32 = 0x400000A4
-let SDLK_KP_00: i32 = 0x400000B0
-let SDLK_KP_000: i32 = 0x400000B1
-let SDLK_THOUSANDSSEPARATOR: i32 = 0x400000B2
-let SDLK_DECIMALSEPARATOR: i32 = 0x400000B3
-let SDLK_CURRENCYUNIT: i32 = 0x400000B4
-let SDLK_CURRENCYSUBUNIT: i32 = 0x400000B5
-let SDLK_KP_LEFTPAREN: i32 = 0x400000B6
-let SDLK_KP_RIGHTPAREN: i32 = 0x400000B7
-let SDLK_KP_LEFTBRACE: i32 = 0x400000B8
-let SDLK_KP_RIGHTBRACE: i32 = 0x400000B9
-let SDLK_KP_TAB: i32 = 0x400000BA
-let SDLK_KP_BACKSPACE: i32 = 0x400000BB
-let SDLK_KP_A: i32 = 0x400000BC
-let SDLK_KP_B: i32 = 0x400000BD
-let SDLK_KP_C: i32 = 0x400000BE
-let SDLK_KP_D: i32 = 0x400000BF
-let SDLK_KP_E: i32 = 0x400000C0
-let SDLK_KP_F: i32 = 0x400000C1
-let SDLK_KP_XOR: i32 = 0x400000C2
-let SDLK_KP_POWER: i32 = 0x400000C3
-let SDLK_KP_PERCENT: i32 = 0x400000C4
-let SDLK_KP_LESS: i32 = 0x400000C5
-let SDLK_KP_GREATER: i32 = 0x400000C6
-let SDLK_KP_AMPERSAND: i32 = 0x400000C7
-let SDLK_KP_DBLAMPERSAND: i32 = 0x400000C8
-let SDLK_KP_VERTICALBAR: i32 = 0x400000C9
-let SDLK_KP_DBLVERTICALBAR: i32 = 0x400000CA
-let SDLK_KP_COLON: i32 = 0x400000CB
-let SDLK_KP_HASH: i32 = 0x400000CC
-let SDLK_KP_SPACE: i32 = 0x400000CD
-let SDLK_KP_AT: i32 = 0x400000CE
-let SDLK_KP_EXCLAM: i32 = 0x400000CF
-let SDLK_KP_MEMSTORE: i32 = 0x400000D0
-let SDLK_KP_MEMRECALL: i32 = 0x400000D1
-let SDLK_KP_MEMCLEAR: i32 = 0x400000D2
-let SDLK_KP_MEMADD: i32 = 0x400000D3
-let SDLK_KP_MEMSUBTRACT: i32 = 0x400000D4
-let SDLK_KP_MEMMULTIPLY: i32 = 0x400000D5
-let SDLK_KP_MEMDIVIDE: i32 = 0x400000D6
-let SDLK_KP_PLUSMINUS: i32 = 0x400000D7
-let SDLK_KP_CLEAR: i32 = 0x400000D8
-let SDLK_KP_CLEARENTRY: i32 = 0x400000D9
-let SDLK_KP_BINARY: i32 = 0x400000DA
-let SDLK_KP_OCTAL: i32 = 0x400000DB
-let SDLK_KP_DECIMAL: i32 = 0x400000DC
-let SDLK_KP_HEXADECIMAL: i32 = 0x400000DD
-let SDLK_LCTRL: i32 = 0x400000E0
-let SDLK_LSHIFT: i32 = 0x400000E1
-let SDLK_LALT: i32 = 0x400000E2
-let SDLK_LGUI: i32 = 0x400000E3
-let SDLK_RCTRL: i32 = 0x400000E4
-let SDLK_RSHIFT: i32 = 0x400000E5
-let SDLK_RALT: i32 = 0x400000E6
-let SDLK_RGUI: i32 = 0x400000E7
-let SDLK_MODE: i32 = 0x40000101
-let SDLK_AUDIONEXT: i32 = 0x40000102
-let SDLK_AUDIOPREV: i32 = 0x40000103
-let SDLK_AUDIOSTOP: i32 = 0x40000104
-let SDLK_AUDIOPLAY: i32 = 0x40000105
-let SDLK_AUDIOMUTE: i32 = 0x40000106
-let SDLK_MEDIASELECT: i32 = 0x40000107
-let SDLK_WWW: i32 = 0x40000108
-let SDLK_MAIL: i32 = 0x40000109
-let SDLK_CALCULATOR: i32 = 0x4000010A
-let SDLK_COMPUTER: i32 = 0x4000010B
-let SDLK_AC_SEARCH: i32 = 0x4000010C
-let SDLK_AC_HOME: i32 = 0x4000010D
-let SDLK_AC_BACK: i32 = 0x4000010E
-let SDLK_AC_FORWARD: i32 = 0x4000010F
-let SDLK_AC_STOP: i32 = 0x40000110
-let SDLK_AC_REFRESH: i32 = 0x40000111
-let SDLK_AC_BOOKMARKS: i32 = 0x40000112
-let SDLK_BRIGHTNESSDOWN: i32 = 0x40000113
-let SDLK_BRIGHTNESSUP: i32 = 0x40000114
-let SDLK_DISPLAYSWITCH: i32 = 0x40000115
-let SDLK_KBDILLUMTOGGLE: i32 = 0x40000116
-let SDLK_KBDILLUMDOWN: i32 = 0x40000117
-let SDLK_KBDILLUMUP: i32 = 0x40000118
-let SDLK_EJECT: i32 = 0x40000119
-let SDLK_SLEEP: i32 = 0x4000011A
diff --git a/game_of_life/lib/github.com/diku-dk/lys/main.c b/game_of_life/lib/github.com/diku-dk/lys/main.c
deleted file mode 100644
index 2c24d1fce5124f45245379eda1a9c1a6074031ef..0000000000000000000000000000000000000000
--- a/game_of_life/lib/github.com/diku-dk/lys/main.c
+++ /dev/null
@@ -1,355 +0,0 @@
-#include "liblys.h"
-#include PRINTFHEADER
-
-#define _XOPEN_SOURCE
-#include <unistd.h>
-#include <getopt.h>
-
-#define INITIAL_WIDTH 800
-#define INITIAL_HEIGHT 600
-
-struct lys_text {
-  TTF_Font *font;
-  char* font_path;
-  int font_size;
-  char* text_format;
-  char* text_buffer;
-  size_t text_buffer_len;
-  bool show_text;
-  char* **sum_names;
-};
-
-void loop_start(struct lys_context *ctx, struct lys_text *text) {
-  struct futhark_u8_1d *text_format_array;
-  FUT_CHECK(ctx->fut, futhark_entry_text_format(ctx->fut, &text_format_array));
-  size_t text_format_len = futhark_shape_u8_1d(ctx->fut, text_format_array)[0];
-  text->text_format = malloc(sizeof(char) * (text_format_len + 1));
-  assert(text->text_format != NULL);
-  FUT_CHECK(ctx->fut, futhark_values_u8_1d(ctx->fut, text_format_array, (unsigned char*) text->text_format));
-  FUT_CHECK(ctx->fut, futhark_context_sync(ctx->fut));
-  text->text_format[text_format_len] = '\0';
-  FUT_CHECK(ctx->fut, futhark_free_u8_1d(ctx->fut, text_format_array));
-
-  text->sum_names = (char* **) malloc(sizeof(char* *) * n_printf_arguments());
-  assert(text->sum_names != NULL);
-
-  text->text_buffer_len = text_format_len;
-  size_t i_arg = -1;
-  for (size_t i = 0; i < text_format_len; i++) {
-    if (text->text_format[i] == '%' &&
-        i + 1 < text_format_len && text->text_format[i + 1] != '%') {
-      i_arg++;
-      if (text->text_format[i + 1] == '[') {
-        text->text_format[i + 1] = 's';
-        size_t end_pos;
-        size_t n_choices = 1;
-        bool found_end = false;
-        for (end_pos = i + 2; end_pos < text_format_len; end_pos++) {
-          if (text->text_format[end_pos] == '|') {
-            n_choices++;
-          } else if (text->text_format[end_pos] == ']') {
-            found_end = true;
-            break;
-          }
-        }
-        assert(found_end);
-        text->sum_names[i_arg] = (char* *) malloc(sizeof(char*) * (n_choices + 1));
-        assert(text->sum_names[i_arg] != NULL);
-        text->sum_names[i_arg][n_choices] = NULL;
-        char* temp_choice = (char*) malloc(sizeof(char) * (end_pos - i - n_choices));
-        assert(temp_choice != NULL);
-        size_t choice_cur = 0;
-        size_t i_choice = 0;
-        for (size_t j = i + 2; j < end_pos + 1; j++) {
-          if (text->text_format[j] == '|' || text->text_format[j] == ']') {
-            temp_choice[choice_cur] = '\0';
-            text->sum_names[i_arg][i_choice] = (char*) malloc(sizeof(char) * (choice_cur + 1));
-            assert(text->sum_names[i_arg][i_choice] != NULL);
-            strncpy(text->sum_names[i_arg][i_choice], temp_choice, choice_cur + 1);
-            choice_cur = 0;
-            i_choice++;
-          } else {
-            temp_choice[choice_cur] = text->text_format[j];
-            choice_cur++;
-          }
-        }
-        free(temp_choice);
-        size_t shift_left = end_pos - i - 1;
-        for (size_t j = end_pos + 1; j < text_format_len; j++) {
-          text->text_format[j - shift_left] = text->text_format[j];
-        }
-        text_format_len -= shift_left;
-        text->text_format[text_format_len] = '\0';
-        i++;
-      } else {
-        text->sum_names[i_arg] = NULL;
-        text->text_buffer_len += 20; // estimate
-      }
-    }
-  }
-
-  text->text_buffer = malloc(sizeof(char) * text->text_buffer_len);
-  assert(text->text_buffer != NULL);
-  text->text_buffer[0] = '\0';
-
-  text->show_text = true;
-}
-
-void loop_iteration(struct lys_context *ctx, struct lys_text *text) {
-  if (!text->show_text) {
-    return;
-  }
-
-  build_text(ctx, text->text_buffer, text->text_buffer_len, text->text_format,
-             ctx->fps, text->sum_names);
-  if (*(text->text_buffer) != '\0') {
-    int32_t text_colour;
-    FUT_CHECK(ctx->fut,
-              futhark_entry_text_colour(ctx->fut, (uint32_t*) &text_colour,
-                                        ctx->state));
-    draw_text(ctx, text->font, text->font_size, text->text_buffer, text_colour, 10, 10);
-  }
-}
-
-void loop_end(struct lys_text *text) {
-  free(text->text_format);
-  free(text->text_buffer);
-
-  for (size_t i = 0; i < n_printf_arguments(); i++) {
-    if (text->sum_names[i] != NULL) {
-      size_t j = 0;
-      while (text->sum_names[i][j] != NULL) {
-        free(text->sum_names[i][j]);
-        j++;
-      }
-      free(text->sum_names[i]);
-    }
-  }
-  free(text->sum_names);
-}
-
-int font_size_from_dimensions(int width, int height) {
-  int size, font_size;
-  if (height < width) {
-    size = height;
-  } else {
-    size = width;
-  }
-  font_size = size / 45;
-  if (font_size < 14) {
-    font_size = 14;
-  } else if (font_size > 32) {
-    font_size = 32;
-  }
-  return font_size;
-}
-
-void window_size_updated(struct lys_context *ctx, struct lys_text *text) {
-  text->font_size = font_size_from_dimensions(ctx->width, ctx->height);
-  TTF_CloseFont(text->font);
-  text->font = TTF_OpenFont(text->font_path, text->font_size);
-  SDL_ASSERT(text->font != NULL);
-}
-
-void f1(struct lys_text *text) {
-  text->show_text = !text->show_text;
-}
-
-void handle_event(struct lys_context *ctx, enum lys_event event) {
-  struct lys_text *text = (struct lys_text *) ctx->event_handler_data;
-  switch (event) {
-  case LYS_LOOP_START:
-    loop_start(ctx, text);
-    break;
-  case LYS_LOOP_ITERATION:
-    loop_iteration(ctx, text);
-    break;
-  case LYS_LOOP_END:
-    loop_end(text);
-    break;
-  case LYS_WINDOW_SIZE_UPDATED:
-    window_size_updated(ctx, text);
-    break;
-  case LYS_F1:
-    f1(text);
-  }
-}
-
-void do_bench(struct futhark_context *fut, int height, int width, int n, const char *operation) {
-  struct futhark_opaque_state *state;
-  int64_t start, end;
-  FUT_CHECK(fut, futhark_entry_init(fut, &state, (int32_t) lys_wall_time(), height, width));
-  futhark_context_sync(fut);
-  bool do_step = false, do_render = false;
-
-  if (strstr(operation, "step") != NULL) {
-    do_step = true;
-  }
-
-  if (strstr(operation, "render") != NULL) {
-    do_render = true;
-  }
-
-  start = lys_wall_time();
-  for (int i = 0; i < n; i++) {
-    if (do_step) {
-      struct futhark_opaque_state *new_state;
-      FUT_CHECK(fut, futhark_entry_step(fut, &new_state, 1.0/n, state));
-      futhark_free_opaque_state(fut, state);
-      state = new_state;
-    }
-    if (do_render) {
-      struct futhark_u32_2d *out_arr;
-      FUT_CHECK(fut, futhark_entry_render(fut, &out_arr, state));
-      FUT_CHECK(fut, futhark_free_u32_2d(fut, out_arr));
-    }
-  }
-  futhark_context_sync(fut);
-  end = lys_wall_time();
-
-  printf("Rendered %d frames in %fs (%f FPS)\n",
-         n, ((double)end-start)/1000000,
-         n / (((double)end-start)/1000000));
-
-  FUT_CHECK(fut, futhark_free_opaque_state(fut, state));
-}
-
-void usage(char **argv) {
-  printf("Usage: %s options...\n", argv[0]);
-  puts("Options:");
-  puts("  -?      Print this help and exit.");
-  puts("  -w INT  Set the initial width of the window.");
-  puts("  -h INT  Set the initial height of the window.");
-  puts("  -R      Disallow resizing the window.");
-  puts("  -d DEV  Set the computation device.");
-  puts("  -r INT  Maximum frames per second.");
-  puts("  -i      Select execution device interactively.");
-  puts("  -b <render|step>  Benchmark program.");
-}
-
-int main(int argc, char** argv) {
-  int width = INITIAL_WIDTH, height = INITIAL_HEIGHT, max_fps = 60;
-  bool allow_resize = true;
-  char *deviceopt = NULL;
-  bool device_interactive = false;
-  char *benchopt = NULL;
-
-  int c;
-  while ( (c = getopt(argc, argv, "w:h:r:Rd:b:i")) != -1) {
-    switch (c) {
-    case 'w':
-      width = atoi(optarg);
-      if (width <= 0) {
-        fprintf(stderr, "'%s' is not a valid width.\n", optarg);
-        exit(EXIT_FAILURE);
-      }
-      break;
-    case 'h':
-      height = atoi(optarg);
-      if (height <= 0) {
-        fprintf(stderr, "'%s' is not a valid width.\n", optarg);
-        exit(EXIT_FAILURE);
-      }
-      break;
-    case 'r':
-      max_fps = atoi(optarg);
-      if (max_fps <= 0) {
-        fprintf(stderr, "'%s' is not a valid framerate.\n", optarg);
-        exit(EXIT_FAILURE);
-      }
-      break;
-    case 'R':
-      allow_resize = false;
-      break;
-    case 'd':
-      deviceopt = optarg;
-      break;
-    case 'i':
-      device_interactive = true;
-      break;
-    case 'b':
-      if (strcmp(optarg, "render") == 0 ||
-          strcmp(optarg, "step") == 0) {
-        benchopt = optarg;
-      } else {
-        fprintf(stderr, "Use -b <render|step>\n");
-        return EXIT_FAILURE;
-      }
-      break;
-    case '?':
-      usage(argv);
-      return EXIT_SUCCESS;
-    default:
-      fprintf(stderr, "unknown option: %c\n", c);
-      usage(argv);
-      return EXIT_FAILURE;
-    }
-  }
-
-  if (optind < argc) {
-    fprintf(stderr, "Excess non-options: ");
-    while (optind < argc)
-      fprintf(stderr, "%s ", argv[optind++]);
-    fprintf(stderr, "\n");
-    exit(EXIT_FAILURE);
-  }
-
-  char font_path_rel[] = "/lib/github.com/diku-dk/lys/Inconsolata-Regular.ttf";
-  char* font_path = malloc(sizeof(char) * strlen(argv[0]) + sizeof(font_path_rel));
-  assert(font_path != NULL);
-  strcpy(font_path, argv[0]);
-  char *last_dash = strrchr(font_path, '/');
-  if (last_dash != NULL) {
-    *last_dash = '\0';
-  }
-  strcat(font_path, font_path_rel);
-
-  int sdl_flags = 0;
-  if (allow_resize) {
-    sdl_flags |= SDL_WINDOW_RESIZABLE;
-  }
-
-  struct lys_context ctx;
-  struct futhark_context_config *futcfg;
-  lys_setup(&ctx, width, height, max_fps, sdl_flags);
-
-  char* opencl_device_name = NULL;
-  lys_setup_futhark_context(deviceopt, device_interactive,
-                            &futcfg, &ctx.fut, &opencl_device_name);
-  if (opencl_device_name != NULL) {
-    printf("Using OpenCL device: %s\n", opencl_device_name);
-    printf("Use -d or -i to change this.\n");
-    free(opencl_device_name);
-  }
-
-  FUT_CHECK(ctx.fut, futhark_entry_grab_mouse(ctx.fut, &ctx.grab_mouse));
-
-  struct lys_text text;
-  ctx.event_handler_data = (void*) &text;
-  ctx.event_handler = handle_event;
-
-  SDL_ASSERT(TTF_Init() == 0);
-
-  text.font_path = font_path;
-  text.font_size = font_size_from_dimensions(ctx.width, ctx.height);
-  text.font = TTF_OpenFont(text.font_path, text.font_size);
-  SDL_ASSERT(text.font != NULL);
-
-  if (benchopt != NULL) {
-    do_bench(ctx.fut, height, width, max_fps, benchopt);
-  } else {
-    int32_t seed = (int32_t) lys_wall_time();
-    futhark_entry_init(ctx.fut, &ctx.state,
-                       seed, ctx.height, ctx.width);
-    lys_run_sdl(&ctx);
-    free(ctx.data);
-  }
-
-  TTF_CloseFont(text.font);
-  free(font_path);
-
-  futhark_context_free(ctx.fut);
-  futhark_context_config_free(futcfg);
-
-  return EXIT_SUCCESS;
-}
diff --git a/game_of_life/lib/github.com/diku-dk/lys/setup_flags.mk b/game_of_life/lib/github.com/diku-dk/lys/setup_flags.mk
deleted file mode 100644
index 872f590661e3e7b1eacdffd1e113db758dee934f..0000000000000000000000000000000000000000
--- a/game_of_life/lib/github.com/diku-dk/lys/setup_flags.mk
+++ /dev/null
@@ -1,43 +0,0 @@
-LYS_BACKEND?=opencl
-LYS_TTF?=0
-
-ifeq ($(origin PROG_FUT_DEPS), undefined)
-PROG_FUT_DEPS:=$(shell ls *.fut; find lib -name \*.fut)
-endif
-
-PKG_CFLAGS_PKGS=sdl2
-ifeq ($(LYS_TTF),1)
-PKG_CFLAGS_PKGS+= SDL2_ttf
-endif
-
-PKG_CFLAGS=$(shell pkg-config --cflags $(PKG_CFLAGS_PKGS))
-
-BASE_LDFLAGS=-lm -lSDL2
-ifeq ($(LYS_TTF),1)
-BASE_LDFLAGS+= -lSDL2_ttf
-endif
-
-NOWARN_CFLAGS=-std=c11 -O
-
-CFLAGS?=$(NOWARN_CFLAGS) $(PKG_CFLAGS) -Wall -Wextra -pedantic
-ifeq ($(LYS_TTF),1)
-CFLAGS+= -DLYS_TTF
-endif
-
-ifeq ($(LYS_BACKEND),opencl)
-OS=$(shell uname -s)
-ifeq ($(OS),Darwin)
-DEVICE_LDFLAGS=-framework OpenCL
-else
-DEVICE_LDFLAGS=-lOpenCL
-endif
-else ifeq ($(LYS_BACKEND),cuda)
-DEVICE_LDFLAGS=-lcuda -lnvrtc
-else ifeq ($(LYS_BACKEND),c)
-DEVICE_LDFLAGS=
-else ifeq ($(LYS_BACKEND),multicore)
-DEVICE_LDFLAGS=-lpthread
-else
-$(error Unknown LYS_BACKEND: $(LYS_BACKEND).  Must be 'opencl', 'cuda', 'multicore', or 'c')
-endif
-LDFLAGS?=$(BASE_LDFLAGS) $(DEVICE_LDFLAGS)
diff --git a/game_of_life/libfpmpi.a b/game_of_life/libfpmpi.a
deleted file mode 100644
index d1781ab3a3db4e7f94fc91be679cf5dafb3ae95e..0000000000000000000000000000000000000000
Binary files a/game_of_life/libfpmpi.a and /dev/null differ
diff --git a/game_of_life/main.c b/game_of_life/main.c
deleted file mode 100644
index 5fc18443fd5dd78c5516635fe8a54926f83030ae..0000000000000000000000000000000000000000
--- a/game_of_life/main.c
+++ /dev/null
@@ -1,153 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-#include "../lib/fpmpi.h"
-#include "../lib/fp.h"
-#include "gol.h"
-#include "lib/github.com/diku-dk/lys/liblys.h"
-
-#define NB_ROWS (800)
-#define NB_COLUMNS (800)
-#define BOARD_SIZE (NB_ROWS * NB_COLUMNS)
-#define NB_NEIGHBOURS 8
-
-#define MAX_FPS (60)
-
-int8_t board[NB_ROWS][NB_COLUMNS] = {0};
-int my_rank;
-
-typedef struct tuple2 {
-    int8_t cell;
-    int8_t neighbours[NB_NEIGHBOURS];
-} tuple2_t;
-
-void init_board() {
-    for (int y = 0; y < NB_ROWS; ++y) {
-        for (int x = 0; x < NB_COLUMNS; ++x) {
-            board[y][x] = rand() % 2;
-        }
-    }
-}
-
-void *get_neighbours(void *index) {
-    int cell_x = *(int *) index % NB_ROWS;
-    int cell_y = *(int *) index / NB_COLUMNS;
-    int8_t *neighbours = calloc(8, sizeof(int8_t));
-    int i = 0;
-    for (int y = -1; y <= 1; ++y) {
-        for (int x = -1; x <= 1; ++x) {
-            if (y == 0 && x == 0) continue;
-            int neigh_y = cell_y + y;
-            if (neigh_y < 0) {
-                neigh_y = NB_ROWS - 1;
-            } else if (neigh_y >= NB_ROWS) {
-                neigh_y = 0;
-            }
-            int neigh_x = cell_x + x;
-            if (neigh_x < 0) {
-                neigh_x = NB_COLUMNS - 1;
-            } else if (neigh_x >= NB_COLUMNS) {
-                neigh_x = 0;
-            }
-            neighbours[i++] = board[neigh_y][neigh_x];
-        }
-    }
-    return neighbours;
-}
-
-void fold_sum(void *acc, void *neighbour) {
-    int8_t *acc8 = acc;
-    int8_t *neighbour8 = neighbour;
-    *acc8 += *neighbour8;
-}
-
-void *next_state(void *element) {
-    tuple2_t *tuple2 = (tuple2_t *) element;
-    int8_t initial_value = 0;
-    int8_t *nb_cells_alive = local_fold_left(tuple2->neighbours, 8, FPMPI_INT8, FPMPI_INT8, fold_sum, &initial_value);
-    int8_t *next_state = calloc(1, sizeof(int8_t));
-    *next_state = (tuple2->cell == 1 && (*nb_cells_alive == 2 || *nb_cells_alive == 3)) ||
-                  (tuple2->cell == 0 && *nb_cells_alive == 3);
-    return next_state;
-}
-
-void *zip_cell_neigh(void *cell, void *neighs) {
-    tuple2_t *tuple2 = calloc(1, sizeof(tuple2_t));
-    tuple2->cell = *(int8_t *) cell;
-    memcpy(tuple2->neighbours, neighs, NB_NEIGHBOURS * sizeof(int8_t));
-    return tuple2;
-}
-
-void handle_event(struct lys_context *ctx, enum lys_event event) {
-    MPI_Bcast(&board[0][0], BOARD_SIZE, MPI_INT8_T, FPMPI_ROOT_RANK, MPI_COMM_WORLD);
-
-    fpmpi_result_t indexes = iota(BOARD_SIZE, MPI_COMM_WORLD);
-//    printf("Indexes OK: %d\n", indexes.count);
-
-    fpmpi_result_t neighbours = map(indexes.content, BOARD_SIZE, FPMPI_INT32, NB_NEIGHBOURS * FPMPI_INT8,
-                                    get_neighbours,
-                                    MPI_COMM_WORLD);
-//    printf("Neighbours OK: %d\n", neighbours.count);
-    fpmpi_result_t board_with_neighbours = zip(&board[0][0], neighbours.content, BOARD_SIZE, FPMPI_INT8, 8 * FPMPI_INT8,
-                                               sizeof(tuple2_t), zip_cell_neigh, MPI_COMM_WORLD);
-//    printf("Board with Neigh OK\n");
-    fpmpi_result_t new_board = map(board_with_neighbours.content, BOARD_SIZE, sizeof(tuple2_t), FPMPI_INT8, next_state,
-                                   MPI_COMM_WORLD);
-//    printf("New Board OK\n");
-
-    if (my_rank == FPMPI_ROOT_RANK) {
-        memcpy(&board[0][0], new_board.content, BOARD_SIZE);
-        struct futhark_i8_1d *fut_new_board = futhark_new_i8_1d(ctx->fut, &board[0][0], BOARD_SIZE);
-        futhark_entry_init(ctx->fut, &ctx->state, fut_new_board, NB_ROWS, NB_COLUMNS, BOARD_SIZE);
-        free(indexes.content);
-        free(neighbours.content);
-        free(board_with_neighbours.content);
-        free(new_board.content);
-        futhark_free_i8_1d(ctx->fut, fut_new_board);
-    }
-}
-
-uint32_t *run_interactive(struct futhark_context *fut_ctx, int width, int height, struct futhark_i8_1d *fut_board) {
-    struct lys_context ctx = {0};
-    lys_setup(&ctx, width, height, MAX_FPS, 0);
-
-    ctx.fut = fut_ctx;
-    ctx.event_handler_data = NULL;
-    ctx.event_handler = handle_event;
-
-    futhark_entry_init(ctx.fut, &ctx.state, fut_board, NB_ROWS, NB_COLUMNS, BOARD_SIZE);
-    lys_run_sdl(&ctx);
-    return ctx.data;
-}
-
-int main(int argc, char *argv[]) {
-    MPI_Init(&argc, &argv);
-    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
-    if (my_rank == FPMPI_ROOT_RANK) {
-        struct futhark_context_config *fut_cfg;
-        struct futhark_context *fut_ctx;
-        char *deviceopt = NULL;
-        bool device_interactive = true;
-        char *opencl_device_name = NULL;
-
-        lys_setup_futhark_context(deviceopt, device_interactive, &fut_cfg, &fut_ctx, &opencl_device_name);
-        if (opencl_device_name != NULL) {
-            fprintf(stdout, "Using OpenCL device: %s\n", opencl_device_name);
-        }
-        init_board();
-        struct futhark_i8_1d *fut_board = futhark_new_i8_1d(fut_ctx, &board[0][0], BOARD_SIZE);
-        run_interactive(fut_ctx, NB_COLUMNS, NB_ROWS, fut_board);
-
-        free(opencl_device_name);
-        futhark_free_i8_1d(fut_ctx, fut_board);
-        futhark_context_config_free(fut_cfg);
-        futhark_context_free(fut_ctx);
-    } else {
-        for (;;) {
-            handle_event(NULL, LYS_LOOP_ITERATION);
-        }
-    }
-
-    MPI_Finalize();
-    return 0;
-}
diff --git a/lib/.gitignore b/lib/.gitignore
deleted file mode 100644
index ef86935756e61596d26595a4e7faa9459b8b7dec..0000000000000000000000000000000000000000
--- a/lib/.gitignore
+++ /dev/null
@@ -1,383 +0,0 @@
-### macOS template
-# General
-.DS_Store
-.AppleDouble
-.LSOverride
-
-# Icon must end with two \r
-Icon
-
-# Thumbnails
-._*
-
-# Files that might appear in the root of a volume
-.DocumentRevisions-V100
-.fseventsd
-.Spotlight-V100
-.TemporaryItems
-.Trashes
-.VolumeIcon.icns
-.com.apple.timemachine.donotpresent
-
-# Directories potentially created on remote AFP share
-.AppleDB
-.AppleDesktop
-Network Trash Folder
-Temporary Items
-.apdisk
-
-### Windows template
-# Windows thumbnail cache files
-Thumbs.db
-Thumbs.db:encryptable
-ehthumbs.db
-ehthumbs_vista.db
-
-# Dump file
-*.stackdump
-
-# Folder config file
-[Dd]esktop.ini
-
-# Recycle Bin used on file shares
-$RECYCLE.BIN/
-
-# Windows Installer files
-*.cab
-*.msi
-*.msix
-*.msm
-*.msp
-
-# Windows shortcuts
-*.lnk
-
-### C template
-# Prerequisites
-*.d
-
-# Object files
-*.o
-*.ko
-*.obj
-*.elf
-
-# Linker output
-*.ilk
-*.map
-*.exp
-
-# Precompiled Headers
-*.gch
-*.pch
-
-# Libraries
-*.lib
-*.a
-*.la
-*.lo
-
-# Shared objects (inc. Windows DLLs)
-*.dll
-*.so
-*.so.*
-*.dylib
-
-# Executables
-*.exe
-*.out
-*.app
-*.i*86
-*.x86_64
-*.hex
-
-# Debug files
-*.dSYM/
-*.su
-*.idb
-*.pdb
-
-# Kernel Module Compile Results
-*.mod*
-*.cmd
-.tmp_versions/
-modules.order
-Module.symvers
-Mkfile.old
-dkms.conf
-
-### macOS template
-# General
-.DS_Store
-.AppleDouble
-.LSOverride
-
-# Icon must end with two \r
-Icon
-
-# Thumbnails
-._*
-
-# Files that might appear in the root of a volume
-.DocumentRevisions-V100
-.fseventsd
-.Spotlight-V100
-.TemporaryItems
-.Trashes
-.VolumeIcon.icns
-.com.apple.timemachine.donotpresent
-
-# Directories potentially created on remote AFP share
-.AppleDB
-.AppleDesktop
-Network Trash Folder
-Temporary Items
-.apdisk
-
-### Linux template
-*~
-
-# temporary files which can be created if a process still has a handle open of a deleted file
-.fuse_hidden*
-
-# KDE directory preferences
-.directory
-
-# Linux trash folder which might appear on any partition or disk
-.Trash-*
-
-# .nfs files are created when an open file is removed but is still being accessed
-.nfs*
-
-### C template
-# Prerequisites
-*.d
-
-# Object files
-*.o
-*.ko
-*.obj
-*.elf
-
-# Linker output
-*.ilk
-*.map
-*.exp
-
-# Precompiled Headers
-*.gch
-*.pch
-
-# Libraries
-*.lib
-*.a
-*.la
-*.lo
-
-# Shared objects (inc. Windows DLLs)
-*.dll
-*.so
-*.so.*
-*.dylib
-
-# Executables
-*.exe
-*.out
-*.app
-*.i*86
-*.x86_64
-*.hex
-
-# Debug files
-*.dSYM/
-*.su
-*.idb
-*.pdb
-
-# Kernel Module Compile Results
-*.mod*
-*.cmd
-.tmp_versions/
-modules.order
-Module.symvers
-Mkfile.old
-dkms.conf
-
-### C template
-# Prerequisites
-*.d
-
-# Object files
-*.o
-*.ko
-*.obj
-*.elf
-
-# Linker output
-*.ilk
-*.map
-*.exp
-
-# Precompiled Headers
-*.gch
-*.pch
-
-# Libraries
-*.lib
-*.a
-*.la
-*.lo
-
-# Shared objects (inc. Windows DLLs)
-*.dll
-*.so
-*.so.*
-*.dylib
-
-# Executables
-*.exe
-*.out
-*.app
-*.i*86
-*.x86_64
-*.hex
-
-# Debug files
-*.dSYM/
-*.su
-*.idb
-*.pdb
-
-# Kernel Module Compile Results
-*.mod*
-*.cmd
-.tmp_versions/
-modules.order
-Module.symvers
-Mkfile.old
-dkms.conf
-
-### macOS template
-# General
-.DS_Store
-.AppleDouble
-.LSOverride
-
-# Icon must end with two \r
-Icon
-
-# Thumbnails
-._*
-
-# Files that might appear in the root of a volume
-.DocumentRevisions-V100
-.fseventsd
-.Spotlight-V100
-.TemporaryItems
-.Trashes
-.VolumeIcon.icns
-.com.apple.timemachine.donotpresent
-
-# Directories potentially created on remote AFP share
-.AppleDB
-.AppleDesktop
-Network Trash Folder
-Temporary Items
-.apdisk
-
-### Windows template
-# Windows thumbnail cache files
-Thumbs.db
-Thumbs.db:encryptable
-ehthumbs.db
-ehthumbs_vista.db
-
-# Dump file
-*.stackdump
-
-# Folder config file
-[Dd]esktop.ini
-
-# Recycle Bin used on file shares
-$RECYCLE.BIN/
-
-# Windows Installer files
-*.cab
-*.msi
-*.msix
-*.msm
-*.msp
-
-# Windows shortcuts
-*.lnk
-
-# User-specific stuff
-.idea/**/workspace.xml
-.idea/**/tasks.xml
-.idea/**/usage.statistics.xml
-.idea/**/dictionaries
-.idea/**/shelf
-
-# Generated files
-.idea/**/contentModel.xml
-
-# Sensitive or high-churn files
-.idea/**/dataSources/
-.idea/**/dataSources.ids
-.idea/**/dataSources.local.xml
-.idea/**/sqlDataSources.xml
-.idea/**/dynamic.xml
-.idea/**/uiDesigner.xml
-.idea/**/dbnavigator.xml
-
-# Gradle
-.idea/**/gradle.xml
-.idea/**/libraries
-
-# Gradle and Maven with auto-import
-# When using Gradle or Maven with auto-import, you should exclude module files,
-# since they will be recreated, and may cause churn.  Uncomment if using
-# auto-import.
-# .idea/artifacts
-# .idea/compiler.xml
-# .idea/jarRepositories.xml
-# .idea/modules.xml
-# .idea/*.iml
-# .idea/modules
-# *.iml
-# *.ipr
-
-# CMake
-cmake-build-*/
-
-# Mongo Explorer plugin
-.idea/**/mongoSettings.xml
-
-# File-based project format
-*.iws
-
-# IntelliJ
-out/
-
-# mpeltonen/sbt-idea plugin
-.idea_modules/
-
-# JIRA plugin
-atlassian-ide-plugin.xml
-
-# Cursive Clojure plugin
-.idea/replstate.xml
-
-# Crashlytics plugin (for Android Studio and IntelliJ)
-com_crashlytics_export_strings.xml
-crashlytics.properties
-crashlytics-build.properties
-fabric.properties
-
-# Editor-based Rest Client
-.idea/httpRequests
-
-# Android studio 3.1+ serialized cache file
-.idea/caches/build_file_checksums.ser
-
-.idea
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
deleted file mode 100644
index fe86691c8a4d7dc80de0394d5fb6898c90237ec2..0000000000000000000000000000000000000000
--- a/lib/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-cmake_minimum_required(VERSION 3.17)
-project(fpmpi C)
-
-set(CMAKE_C_STANDARD 11)
-
-if (CMAKE_BUILD_TYPE MATCHES Debug)
-    set(GCC_COMPILE_FLAGS "-Wall -Wextra -pedantic -fsanitize=undefined -fsanitize=address")
-    if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-        set(GCC_COMPILE_FLAGS "${GCC_COMPILE_FLAGS} -fsanitize=leak")
-    endif ()
-elseif (CMAKE_BUILD_TYPE MATCHES Release)
-    set(GCC_COMPILE_FLAGS "-O2")
-elseif (CMAKE_BUILD_TYPE MATCHES Benchmark)
-    set(GCC_COMPILE_FLAGS "-DBENCHMARK -O2")
-endif ()
-
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GCC_COMPILE_FLAGS}")
-
-find_package(MPI REQUIRED)
-include_directories(${MPI_C_INCLUDE_PATH})
-
-add_library(fpmpi fpmpi.c fpmpi.h fp.c fp.h dispatch.c dispatch.h)
-target_link_libraries(fpmpi ${MPI_C_LIBRARIES})
-
-add_executable(fpmpi_benchmark benchmark/benchmark.c)
-target_link_libraries(fpmpi_benchmark fpmpi)
-
-add_executable(fpmpi_tests tests/tests.c)
-target_link_libraries(fpmpi_tests fpmpi)
diff --git a/lib/Makefile b/lib/Makefile
deleted file mode 100644
index be8c2a3a7e9dc1e85adaacea01e04d38294d2fde..0000000000000000000000000000000000000000
--- a/lib/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-all: release debug benchmark
-
-release:
-	mkdir -p "cmake-build-release"
-	cmake -DCMAKE_BUILD_TYPE=Release -Bcmake-build-release
-	$(MAKE) -C cmake-build-release all
-
-debug:
-	mkdir -p "cmake-build-debug"
-	cmake -DCMAKE_BUILD_TYPE=Debug -Bcmake-build-debug
-	$(MAKE) -C cmake-build-release all
-
-benchmark:
-	mkdir -p "cmake-build-benchmark"
-	cmake -DCMAKE_BUILD_TYPE=Benchmark -Bcmake-build-benchmark
-	$(MAKE) -C cmake-build-benchmark all
-
-.PHONY: all release benchmark
diff --git a/lib/benchmark/benchmark.c b/lib/benchmark/benchmark.c
deleted file mode 100644
index 60e0163f7e2ef7aee5504188ac33747a191ecdbc..0000000000000000000000000000000000000000
--- a/lib/benchmark/benchmark.c
+++ /dev/null
@@ -1,149 +0,0 @@
-#include <mpi.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "../fpmpi.h"
-
-#define BENCHMARK_MAP 1
-#define BENCHMARK_FILTER 2
-#define BENCHMARK_REDUCE 3
-#define BENCHMARK_FIND 4
-#define BENCHMARK_FOLD_LEFT 5
-#define BENCHMARK_FOLD_RIGHT 6
-#define BENCHMARK_SORT 7
-#define BENCHMARK_SCAN 8
-#define BENCHMARK_IOTA 9
-#define BENCHMARK_ZIP 10
-
-void *map_mul_int(void *element) {
-    int *result = calloc(1, sizeof(int));
-    *result = (*(int *) (element)) * 2;
-    return (void *) result;
-}
-
-bool filter_only_even(void *element) {
-    int element32 = *(int *) element;
-    return element32 % 2 == 0;
-}
-
-bool find_divide_by_three(void *element) {
-    int element32 = *(int *) element;
-    return element32 % 3 == 0;
-}
-
-void reduce_sum(void *accumulator, void *current_value) {
-    int *accumulator32 = (int *) accumulator;
-    int current_value32 = *(int *) current_value;
-    *accumulator32 = (*accumulator32 + current_value32);
-}
-
-void fold_left_sub(void *accumulator, void *current_value) {
-    int *accumulator32 = (int *) accumulator;
-    int current_value32 = *(int *) current_value;
-    *accumulator32 = (*accumulator32 - current_value32);
-}
-
-void fold_right_sub(void *current_value, void *accumulator) {
-    int *accumulator32 = (int *) accumulator;
-    int current_value32 = *(int *) current_value;
-    *accumulator32 = (current_value32 - *accumulator32);
-}
-
-bool sort_asc(void *left, void *right) {
-    int left32 = *(int *) left;
-    int right32 = *(int *) right;
-    return left32 < right32;
-}
-
-int main(int argc, char *argv[]) {
-    if(argc < 4) {
-        printf("Missing argv parameters.\n");
-        exit(0);
-    }
-
-    MPI_Init(&argc, &argv);
-
-    int my_rank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
-
-    int benchmark = atoi(argv[1]);
-    int times = atoi(argv[2]);
-    int N = atoi(argv[3]);
-
-    int *array1 = NULL;
-    int *array2 = NULL;
-    if (benchmark == BENCHMARK_REDUCE || benchmark == BENCHMARK_FOLD_LEFT || benchmark == BENCHMARK_FOLD_RIGHT ||
-        my_rank == FPMPI_ROOT_RANK) {
-        array1 = calloc(N, sizeof(int));
-        array2 = calloc(N, sizeof(int));
-        for (int i = 0; i < N; ++i) {
-            if (benchmark == BENCHMARK_SORT) {
-                array1[i] = rand();
-                array2[i] = rand();
-            } else {
-                array1[i] = i;
-                array2[i] = i;
-            }
-        }
-    }
-
-    for (int i = 0; i < times; ++i) {
-        fpmpi_result_t result;
-        switch (benchmark) {
-            case BENCHMARK_MAP: {
-                result = map(array1, N, FPMPI_INT32, FPMPI_INT32, map_mul_int, MPI_COMM_WORLD);
-            }
-                break;
-            case BENCHMARK_FILTER: {
-                result = filter(array1, N, FPMPI_INT32, filter_only_even, MPI_COMM_WORLD);
-            }
-                break;
-            case BENCHMARK_REDUCE: {
-                result = reduce(array1, N, FPMPI_INT32, reduce_sum, MPI_COMM_WORLD);
-            }
-                break;
-            case BENCHMARK_FIND: {
-                result = find(array1, N, FPMPI_INT32, find_divide_by_three, MPI_COMM_WORLD);
-            }
-                break;
-            case BENCHMARK_FOLD_LEFT: {
-                int initial_value = 0;
-                result = fold_left(array1, N, FPMPI_INT32, FPMPI_INT32, fold_left_sub, &initial_value, MPI_COMM_WORLD);
-            }
-                break;
-            case BENCHMARK_FOLD_RIGHT: {
-                int initial_value = 0;
-                result = fold_right(array1, N, FPMPI_INT32, FPMPI_INT32, fold_right_sub, &initial_value, MPI_COMM_WORLD);
-            }
-                break;
-            case BENCHMARK_SORT: {
-                result = sort(array1, N, FPMPI_INT32, FPMPI_MERGE_SORT, sort_asc, MPI_COMM_WORLD);
-            }
-                break;
-            case BENCHMARK_SCAN: {
-//                result = sort(array1, N, FPMPI_INT32, FPMPI_MERGE_SORT, sort_asc, MPI_COMM_WORLD);
-            }
-                break;
-            case BENCHMARK_IOTA: {
-                result = iota(N, MPI_COMM_WORLD);
-            }
-                break;
-            case BENCHMARK_ZIP: {
-//                result = zip(array1, array2, FPMPI_INT32, FPMPI_MERGE_SORT, sort_asc, MPI_COMM_WORLD);
-            }
-                break;
-            default:
-                MPI_Finalize();
-                exit(0);
-        }
-
-        if (my_rank == FPMPI_ROOT_RANK) {
-            free(result.content);
-        }
-    }
-    if (benchmark == BENCHMARK_REDUCE || benchmark == BENCHMARK_FOLD_LEFT || benchmark == BENCHMARK_FOLD_RIGHT ||
-        my_rank == FPMPI_ROOT_RANK) {
-        free(array1);
-    }
-    MPI_Finalize();
-    return 0;
-}
diff --git a/lib/dispatch.c b/lib/dispatch.c
deleted file mode 100644
index 755405841750938d40de6cb1976521dfc97f3ba1..0000000000000000000000000000000000000000
--- a/lib/dispatch.c
+++ /dev/null
@@ -1,36 +0,0 @@
-#include <stdlib.h>
-#include "dispatch.h"
-
-dispatch_t dispatch_init(int count, int type, int out_type, int world_size, int root) {
-    int nb_columns_per_process = count / world_size;
-    int remaining_columns = count % world_size;
-
-    dispatch_t dispatch = {
-            .in_counts8 = calloc(world_size, sizeof(int)),
-            .in_displacements8 = calloc(world_size, sizeof(int)),
-            .out_counts8 = calloc(world_size, sizeof(int)),
-            .out_displacements8 = calloc(world_size, sizeof(int)),
-    };
-
-    for (int i = 0; i < world_size; ++i) {
-        int root_nb_columns = nb_columns_per_process + remaining_columns;
-        int nb_columns = (i == root) ? root_nb_columns : nb_columns_per_process;
-        if (i == 0) {
-            dispatch.in_displacements8[i] = 0;
-            dispatch.out_displacements8[i] = 0;
-        } else {
-            dispatch.in_displacements8[i] = dispatch.in_displacements8[i - 1] + dispatch.in_counts8[i - 1];
-            dispatch.out_displacements8[i] = dispatch.out_displacements8[i - 1] + dispatch.out_counts8[i - 1];
-        }
-        dispatch.in_counts8[i] = nb_columns * type;
-        dispatch.out_counts8[i] = nb_columns * out_type;
-    }
-    return dispatch;
-}
-
-void dispatch_destroy(dispatch_t *dispatch) {
-    free(dispatch->in_displacements8);
-    free(dispatch->in_counts8);
-    free(dispatch->out_displacements8);
-    free(dispatch->out_counts8);
-}
diff --git a/lib/dispatch.h b/lib/dispatch.h
deleted file mode 100644
index bdb08d83ea2ae9e51b2da503624a0b3543f57028..0000000000000000000000000000000000000000
--- a/lib/dispatch.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _DISPATCH_H_
-#define _DISPATCH_H_
-
-typedef struct dispatch {
-    int *in_counts8;
-    int *in_displacements8;
-    int *out_counts8;
-    int *out_displacements8;
-} dispatch_t;
-
-dispatch_t dispatch_init(int count, int type, int out_type, int world_size, int root);
-
-void dispatch_destroy(dispatch_t *dispatch);
-
-#endif //_DISPATCH_H_
diff --git a/lib/fp.c b/lib/fp.c
deleted file mode 100644
index 3ca8106389f5f0473ea4e474809c3d2151b77c65..0000000000000000000000000000000000000000
--- a/lib/fp.c
+++ /dev/null
@@ -1,164 +0,0 @@
-#include "fp.h"
-#include "fpmpi.h"
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-
-void *local_map(void *array, int count, int type, int map_type, void *f(void *)) {
-    uint8_t *array8 = (uint8_t *) array;
-    void *output = calloc(count, map_type);
-    uint8_t *output8 = (uint8_t *) output;
-
-    for (int i = 0; i < count; ++i) {
-        void *result = f(array8 + i * type);
-        memcpy(output8, result, (size_t) map_type);
-        output8 += map_type;
-        free(result);
-    }
-    return output;
-}
-
-void *local_filter(void *array, int count, int type, bool f(void *), int *output_count) {
-    uint8_t *array8 = (uint8_t *) array;
-    void *output = calloc(count, type);
-    uint8_t *output8 = (uint8_t *) output;
-
-    for (int i = 0; i < count; ++i) {
-        uint8_t *element8 = array8 + i * type;
-        if (f(element8)) {
-            memcpy(output8, element8, type);
-            output8 += type;
-            ++(*output_count);
-        }
-    }
-    return output;
-}
-
-void *local_fold_left(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value) {
-    uint8_t *array8 = (uint8_t *) array;
-    void *accumulator = calloc(1, fold_type);
-    int i = 0;
-    /* initial_value is NULL for reduce */
-    if (initial_value == NULL) {
-        memcpy(accumulator, array8, type);
-        ++i;
-    } else {
-        memcpy(accumulator, initial_value, fold_type);
-    }
-
-    for (; i < count; ++i) {
-        f(accumulator, array8 + i * type);
-    }
-    return accumulator;
-}
-
-void *local_fold_right(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value) {
-    uint8_t *array8 = (uint8_t *) array;
-    void *accumulator = calloc(1, fold_type);
-    memcpy(accumulator, initial_value, fold_type);
-    for (int i = count - 1; i >= 0; --i) {
-        f(array8 + i * type, accumulator);
-    }
-    return accumulator;
-}
-
-void *local_find(void *array, int count, int type, bool f(void *)) {
-    uint8_t *array8 = (uint8_t *) array;
-    for (int i = 0; i < count; ++i) {
-        if (f(array8 + i * type)) {
-            return array8 + i * type;
-        }
-    }
-    return NULL;
-}
-
-// https://gist.github.com/hackrio1/a11c8499ed68f5df6c30e53d1c3fe076
-static void merge_sort(void *array, void *work_array, int type, bool f(void *, void *), int i, int j) {
-    uint8_t *array8 = (uint8_t *) array;
-    uint8_t *work_array8 = (uint8_t *) work_array;
-    if (j <= i) {
-        return; // the subsection is empty or a single element
-    }
-    int mid = (i + j) / 2;
-
-    // left sub-array is a[i .. mid]
-    // right sub-array is a[mid + 1 .. j]
-
-    merge_sort(array, work_array, type, f, i, mid);     // sort the left sub-array recursively
-    merge_sort(array, work_array, type, f, mid + 1, j);     // sort the right sub-array recursively
-
-    int pointer_left = i;       // pointer_left points to the beginning of the left sub-array
-    int pointer_right = mid + 1;        // pointer_right points to the beginning of the right sub-array
-    int k;      // k is the loop counter
-
-    // we loop from i to j to fill each element of the final merged array
-    for (k = i; k <= j; k++) {
-        if (pointer_left == mid + 1) {      // left pointer has reached the limit
-            memcpy(work_array8 + k * type, array8 + pointer_right * type, type);
-            pointer_right++;
-        } else if (pointer_right == j + 1) {        // right pointer has reached the limit
-            memcpy(work_array8 + k * type, array8 + pointer_left * type, type);
-            pointer_left++;
-        } else if (f(array8 + pointer_left * type,
-                     array8 + pointer_right * type)) {        // pointer left points to smaller element
-            memcpy(work_array8 + k * type, array8 + pointer_left * type, type);
-            pointer_left++;
-        } else {        // pointer right points to smaller element
-            memcpy(work_array8 + k * type, array8 + pointer_right * type, type);
-            pointer_right++;
-        }
-    }
-
-    for (k = i; k <= j; k++) {      // copy the elements from work_array[] to array[]
-        memcpy(array8 + k * type, work_array8 + k * type, type);
-    }
-}
-
-void local_sort(void *array, int count, int type, int sort_method, bool f(void *, void *)) {
-    switch (sort_method) {
-        case FPMPI_MERGE_SORT: {
-            void *work_array = calloc(count, type);
-            merge_sort(array, work_array, type, f, 0, count - 1);
-            free(work_array);
-        }
-        default:
-            break;
-    }
-}
-
-void *local_scan(void *array, int count, int type, int scan_type, void *f(void *, void *), void *initial_value) {
-    uint8_t *array8 = (uint8_t *) array;
-    void *accumulators = calloc(count + 1, scan_type);
-    uint8_t *accumulators8 = (uint8_t *) accumulators;
-    memcpy(accumulators, initial_value, scan_type);
-
-    for (int i = 0; i < count; ++i) {
-        void *accumulator = f(accumulators8 + i * scan_type, array8 + i * type);
-        memcpy(accumulators8 + (i + 1) * scan_type, accumulator, scan_type);
-        free(accumulator);
-    }
-    return accumulators;
-}
-
-void *local_zip(void *array1, void *array2, int count, int type1, int type2, int tuple_type, void *f(void *, void *)) {
-    uint8_t *array1_8 = (uint8_t *) array1;
-    uint8_t *array2_8 = (uint8_t *) array2;
-
-    void *output = calloc(count, tuple_type);
-    uint8_t *output8 = (uint8_t *) output;
-
-    for (int i = 0; i < count; ++i) {
-        void *tuple = f(array1_8 + i * type1, array2_8 + i * type2);
-        memcpy(output8 + i * tuple_type, tuple, tuple_type);
-        free(tuple);
-    }
-    return output;
-}
-
-int *local_iota(int start, int count) {
-    int *output = calloc(count, sizeof(int));
-    for (int i = 0; i < count; ++i) {
-        output[i] = start++;
-    }
-    return output;
-}
diff --git a/lib/fp.h b/lib/fp.h
deleted file mode 100644
index 0d7f6c23c974f2e8a5d6b4f9bf52a5fa972c251e..0000000000000000000000000000000000000000
--- a/lib/fp.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _FP_H_
-#define _FP_H_
-
-#include <stdbool.h>
-#include "fpmpi.h"
-
-void *local_map(void *array, int count, int type, int map_type, void *f(void *));
-
-void *local_filter(void *array, int count, int type, bool f(void *), int *output_count);
-
-void *local_fold_left(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value);
-
-void *local_fold_right(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value);
-
-void *local_find(void *array, int count, int type, bool f(void *));
-
-void local_sort(void *array, int count, int type, int sort_method, bool f(void *, void *));
-
-void *local_scan(void *array, int count, int type, int scan_type, void *f(void *, void *), void *initial_value);
-
-void *local_zip(void *array1, void *array2, int count, int type1, int type2, int tuple_type, void *f(void *, void *));
-
-int *local_iota(int start, int count);
-
-#endif //_FP_H_
diff --git a/lib/fpmpi.c b/lib/fpmpi.c
deleted file mode 100644
index b92e9657695fe57288dd4d77d30f52516ecf5f61..0000000000000000000000000000000000000000
--- a/lib/fpmpi.c
+++ /dev/null
@@ -1,546 +0,0 @@
-#include <stdlib.h>
-
-#ifdef BENCHMARK
-#include <stdio.h>
-#endif
-
-#include <string.h>
-#include <stdio.h>
-#include "fpmpi.h"
-#include "fp.h"
-#include "dispatch.h"
-
-#define TAG_FILTER_LOCAL_OUTPUT_COUNT 0
-#define TAG_FIND_HAS_RESULT 1
-#define TAG_FIND_LOCAL_OUTPUT 2
-#define TAG_FOLD_LEFT_ACCUMULATOR 3
-#define TAG_FOLD_RIGHT_ACCUMULATOR 4
-#define TAG_SORT_LOCAL_OUTPUT 5
-#define TAG_SCAN_ACCUMULATORS 6
-#define TAG_ZIP_START_ARRAYS 7
-
-#define min(a, b) (((a) <= (b)) ? (a) : (b))
-#define set_count(my_rank, count) my_rank == FPMPI_ROOT_RANK ? count : 0
-
-int get_world_size(MPI_Comm comm) {
-    int world_size;
-    MPI_Comm_size(comm, &world_size);
-    return world_size;
-}
-
-int get_my_rank(MPI_Comm comm) {
-    int my_rank;
-    MPI_Comm_rank(comm, &my_rank);
-    return my_rank;
-}
-
-fpmpi_result_t map(void *array, int count, int type, int map_type, void *f(void *), MPI_Comm comm) {
-#ifdef BENCHMARK
-    double start = MPI_Wtime();
-#endif
-    int my_rank = get_my_rank(comm);
-    int world_size = get_world_size(comm);
-
-    dispatch_t dispatch = dispatch_init(count, type, map_type, world_size, FPMPI_ROOT_RANK);
-
-    void *local_array = calloc(dispatch.in_counts8[my_rank], sizeof(uint8_t));
-    MPI_Scatterv(array, dispatch.in_counts8, dispatch.in_displacements8, MPI_UINT8_T, local_array,
-                 dispatch.in_counts8[my_rank], MPI_UINT8_T, FPMPI_ROOT_RANK, comm);
-
-
-    int local_count = dispatch.in_counts8[my_rank] / type;
-    void *local_output = local_map(local_array, local_count, type, map_type, f);
-
-    void *result = NULL;
-    if (my_rank == FPMPI_ROOT_RANK) {
-        result = calloc(count, map_type);
-    }
-    MPI_Gatherv(local_output, dispatch.out_counts8[my_rank], MPI_UINT8_T, result, dispatch.out_counts8,
-                dispatch.out_displacements8, MPI_UINT8_T, FPMPI_ROOT_RANK, comm);
-
-
-    free(local_array);
-    free(local_output);
-    dispatch_destroy(&dispatch);
-
-#ifdef BENCHMARK
-    double finish = MPI_Wtime();
-    if (my_rank == FPMPI_ROOT_RANK) {
-        printf("%d;%f\n", world_size, finish - start);
-    }
-#endif
-    return (fpmpi_result_t) {
-            .content = result,
-            .count = set_count(my_rank, count),
-    };
-}
-
-fpmpi_result_t filter(void *array, int count, int type, bool f(void *), MPI_Comm comm) {
-#ifdef BENCHMARK
-    double start = MPI_Wtime();
-#endif
-    int my_rank = get_my_rank(comm);
-    int world_size = get_world_size(comm);
-
-    dispatch_t dispatch = dispatch_init(count, type, type, world_size, FPMPI_ROOT_RANK);
-
-    void *local_array = calloc(dispatch.in_counts8[my_rank], sizeof(uint8_t));
-    MPI_Scatterv(array, dispatch.in_counts8, dispatch.in_displacements8, MPI_UINT8_T, local_array,
-                 dispatch.in_counts8[my_rank], MPI_UINT8_T, FPMPI_ROOT_RANK, comm);
-
-    int local_array_count = dispatch.in_counts8[my_rank] / type;
-    int local_output_count = 0;
-    void *local_output = local_filter(local_array, local_array_count, type, f, &local_output_count);
-
-    MPI_Request request;
-    MPI_Isend(&local_output_count, 1, MPI_INT, FPMPI_ROOT_RANK, TAG_FILTER_LOCAL_OUTPUT_COUNT, comm, &request);
-
-    void *result = NULL;
-    int result_count = 0;
-    if (my_rank == FPMPI_ROOT_RANK) {
-        for (int i = 0; i < world_size; ++i) {
-            MPI_Recv(&dispatch.out_counts8[i], 1, MPI_INT, i, TAG_FILTER_LOCAL_OUTPUT_COUNT, comm, MPI_STATUS_IGNORE);
-            result_count += dispatch.out_counts8[i];
-            dispatch.out_counts8[i] *= type;
-            dispatch.out_displacements8[i] =
-                    i == 0 ? 0 : dispatch.out_displacements8[i - 1] + dispatch.out_counts8[i - 1];
-        }
-        result = calloc(result_count, type);
-    }
-
-    MPI_Wait(&request, MPI_STATUS_IGNORE);
-
-    MPI_Gatherv(local_output, local_output_count * type, MPI_UINT8_T, result, dispatch.out_counts8,
-                dispatch.out_displacements8, MPI_UINT8_T, FPMPI_ROOT_RANK, MPI_COMM_WORLD);
-
-    free(local_array);
-    free(local_output);
-    dispatch_destroy(&dispatch);
-
-#ifdef BENCHMARK
-    double finish = MPI_Wtime();
-    if (my_rank == FPMPI_ROOT_RANK) {
-        printf("%d;%f\n", world_size, finish - start);
-    }
-#endif
-    return (fpmpi_result_t) {
-            .content = result,
-            .count = set_count(my_rank, result_count),
-    };
-}
-
-fpmpi_result_t reduce(void *array, int count, int type, void f(void *, void *), MPI_Comm comm) {
-    return fold_left(array, count, type, type, f, NULL, comm);
-}
-
-fpmpi_result_t
-fold_left(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value, MPI_Comm comm) {
-#ifdef BENCHMARK
-    double start = MPI_Wtime();
-#endif
-    int my_rank = get_my_rank(comm);
-    int world_size = get_world_size(comm);
-
-    void *accumulator = calloc(1, fold_type);
-    if (my_rank != 0) {
-        MPI_Recv(accumulator, 1 * fold_type, MPI_UINT8_T, my_rank - 1, TAG_FOLD_LEFT_ACCUMULATOR, comm,
-                 MPI_STATUS_IGNORE);
-        initial_value = accumulator;
-    }
-
-    void *local_result = local_fold_left(array, count, type, fold_type, f, initial_value);
-
-    int dest = my_rank == world_size - 1 ? FPMPI_ROOT_RANK : my_rank + 1;
-
-    /* Isend because if dest == my_rank, a deadlock will occur, MPI_Recv is after */
-    MPI_Request request = {0};
-    MPI_Isend(local_result, 1 * fold_type, MPI_UINT8_T, dest, TAG_FOLD_LEFT_ACCUMULATOR, comm, &request);
-
-
-    void *result = NULL;
-    if (my_rank == FPMPI_ROOT_RANK) {
-        result = calloc(1, fold_type);
-        MPI_Recv(result, 1 * fold_type, MPI_UINT8_T, world_size - 1, TAG_FOLD_LEFT_ACCUMULATOR, comm,
-                 MPI_STATUS_IGNORE);
-    }
-
-    MPI_Wait(&request, MPI_STATUS_IGNORE);
-
-    free(local_result);
-    free(accumulator);
-
-#ifdef BENCHMARK
-    double finish = MPI_Wtime();
-    if (my_rank == FPMPI_ROOT_RANK) {
-        printf("%d;%f\n", world_size, finish - start);
-    }
-#endif
-    return (fpmpi_result_t) {
-            .content = result,
-            .count = set_count(my_rank, 1),
-    };
-}
-
-fpmpi_result_t
-fold_right(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value,
-           MPI_Comm comm) {
-#ifdef BENCHMARK
-    double start = MPI_Wtime();
-#endif
-    int my_rank = get_my_rank(comm);
-    int world_size = get_world_size(comm);
-
-    void *accumulator = calloc(1, fold_type);
-    if (my_rank != world_size - 1) {
-        MPI_Recv(accumulator, 1 * fold_type, MPI_UINT8_T, my_rank + 1, TAG_FOLD_RIGHT_ACCUMULATOR, comm,
-                 MPI_STATUS_IGNORE);
-        initial_value = accumulator;
-    }
-
-    void *local_result = local_fold_right(array, count, type, fold_type, f, initial_value);
-
-    int dest = my_rank == 0 ? FPMPI_ROOT_RANK : my_rank - 1;
-    /* Isend because if dest == my_rank, a deadlock will occur, MPI_Recv will be after */
-    MPI_Request request = {0};
-    MPI_Isend(local_result, 1 * fold_type, MPI_UINT8_T, dest, TAG_FOLD_RIGHT_ACCUMULATOR, comm, &request);
-
-    void *result = NULL;
-    if (my_rank == FPMPI_ROOT_RANK) {
-        result = calloc(1, fold_type);
-        MPI_Recv(result, 1 * fold_type, MPI_UINT8_T, 0, TAG_FOLD_RIGHT_ACCUMULATOR, comm,
-                 MPI_STATUS_IGNORE);
-    }
-
-    MPI_Wait(&request, MPI_STATUS_IGNORE);
-
-    free(local_result);
-    free(accumulator);
-
-#ifdef BENCHMARK
-    double finish = MPI_Wtime();
-    if (my_rank == FPMPI_ROOT_RANK) {
-        printf("%d;%f\n", world_size, finish - start);
-    }
-#endif
-    return (fpmpi_result_t) {
-            .content = result,
-            .count = set_count(my_rank, 1),
-    };
-}
-
-fpmpi_result_t find(void *array, int count, int type, bool f(void *), MPI_Comm comm) {
-#ifdef BENCHMARK
-    double start = MPI_Wtime();
-#endif
-    int my_rank = get_my_rank(comm);
-    int world_size = get_world_size(comm);
-
-    dispatch_t dispatch = dispatch_init(count, type, type, world_size, FPMPI_ROOT_RANK);
-
-    void *local_array = calloc(dispatch.in_counts8[my_rank], sizeof(uint8_t));
-    MPI_Scatterv(array, dispatch.in_counts8, dispatch.in_displacements8, MPI_UINT8_T, local_array,
-                 dispatch.in_counts8[my_rank], MPI_UINT8_T, FPMPI_ROOT_RANK, comm);
-
-    int local_count = dispatch.in_counts8[my_rank] / type;
-    void *local_output = local_find(local_array, local_count, type, f);
-    bool local_has_result = local_output != NULL;
-
-    MPI_Request requests[2] = {0};
-    MPI_Isend(&local_has_result, 1, MPI_C_BOOL, FPMPI_ROOT_RANK, TAG_FIND_HAS_RESULT, comm, &requests[0]);
-
-    if (local_has_result) {
-        MPI_Isend(local_output, 1 * type, MPI_UINT8_T, FPMPI_ROOT_RANK, TAG_FIND_LOCAL_OUTPUT, comm, &requests[1]);
-    }
-
-    void *result = NULL;
-    if (my_rank == FPMPI_ROOT_RANK) {
-        uint8_t *results = calloc(world_size, type);
-        int result_index = INT32_MAX;
-        MPI_Status status;
-        bool has_result = false;
-        for (int i = 0; i < world_size; ++i) {
-            MPI_Recv(&has_result, 1, MPI_C_BOOL, MPI_ANY_SOURCE, TAG_FIND_HAS_RESULT, comm, &status);
-            if (has_result) {
-                MPI_Recv(results + status.MPI_SOURCE * type, 1 * type, MPI_UINT8_T, status.MPI_SOURCE,
-                         TAG_FIND_LOCAL_OUTPUT, comm, MPI_STATUS_IGNORE);
-                result_index = min(status.MPI_SOURCE, result_index);
-            }
-        }
-        if (result_index != INT32_MAX) {
-            result = calloc(1, type);
-            memcpy(result, results + result_index * type, type);
-        }
-        free(results);
-    }
-
-    MPI_Waitall(local_has_result + 1, requests, MPI_STATUSES_IGNORE);
-
-    free(local_array);
-    dispatch_destroy(&dispatch);
-
-#ifdef BENCHMARK
-    double finish = MPI_Wtime();
-    if (my_rank == FPMPI_ROOT_RANK) {
-        printf("%d;%f\n", world_size, finish - start);
-    }
-#endif
-    return (fpmpi_result_t) {
-            .content = result,
-            .count = set_count(my_rank, result != NULL),
-    };
-}
-
-// https://www.geeksforgeeks.org/merge-k-sorted-arrays/
-static void
-marge_arrays(void *array1, void *array2, int count1, int count2, int type, bool f(void *, void *), void *result_array) {
-    int i = 0, j = 0, k = 0;
-    uint8_t *array1_8 = (uint8_t *) array1;
-    uint8_t *array2_8 = (uint8_t *) array2;
-    uint8_t *result_array8 = (uint8_t *) result_array;
-
-    // Traverse both array
-    while (i < count1 && j < count2) {
-        // Check if current element of first
-        // array is smaller than current element
-        // of second array. If yes, store first
-        // array element and increment first array
-        // index. Otherwise do same with second array
-        if (f(array1_8 + i * type, array2_8 + j * type)) {
-            memcpy(result_array8 + k * type, array1_8 + i * type, type);
-            ++k, ++i;
-        } else {
-            memcpy(result_array8 + k * type, array2_8 + j * type, type);
-            ++k, ++j;
-        }
-    }
-
-    // Store remaining elements of first array
-    while (i < count1) {
-        memcpy(result_array8 + k * type, array1_8 + i * type, type);
-        ++k, ++i;
-    }
-
-    // Store remaining elements of second array
-    while (j < count2) {
-        memcpy(result_array8 + k * type, array2_8 + j * type, type);
-        ++k, ++j;
-    }
-}
-
-fpmpi_result_t sort(void *array, int count, int type, int sort_method, bool f(void *, void *), MPI_Comm comm) {
-#ifdef BENCHMARK
-    double start = MPI_Wtime();
-#endif
-    int my_rank = get_my_rank(comm);
-    int world_size = get_world_size(comm);
-
-    dispatch_t dispatch = dispatch_init(count, type, type, world_size, FPMPI_ROOT_RANK);
-
-    void *local_array = calloc(dispatch.in_counts8[my_rank], sizeof(uint8_t));
-    MPI_Scatterv(array, dispatch.in_counts8, dispatch.in_displacements8, MPI_UINT8_T, local_array,
-                 dispatch.in_counts8[my_rank], MPI_UINT8_T, FPMPI_ROOT_RANK, comm);
-
-    int local_count = dispatch.in_counts8[my_rank] / type;
-    local_sort(local_array, local_count, type, sort_method, f);
-
-    void *result = NULL;
-    if (my_rank == FPMPI_ROOT_RANK) {
-        result = calloc(local_count, type);
-        memcpy(result, local_array, dispatch.in_counts8[my_rank]);
-
-        if (world_size > 1) {
-            int current_count = local_count;
-            int recv_count8 = dispatch.in_counts8[(my_rank + 1) % world_size];
-            int recv_count = dispatch.in_counts8[(my_rank + 1) % world_size] / type;
-            void *recv_buffer = calloc(recv_count8, sizeof(uint8_t));
-
-            for (int i = 0; i < world_size - 1; ++i) {
-                MPI_Recv(recv_buffer, recv_count8, MPI_UINT8_T, MPI_ANY_SOURCE, TAG_SORT_LOCAL_OUTPUT, comm,
-                         MPI_STATUS_IGNORE);
-                void *tmp_result = calloc(current_count + recv_count, type);
-                marge_arrays(result, recv_buffer, current_count, recv_count, type, f, tmp_result);
-                free(result);
-                result = tmp_result;
-                current_count += recv_count;
-            }
-            free(recv_buffer);
-        }
-    } else {
-        MPI_Send(local_array, dispatch.in_counts8[my_rank], MPI_UINT8_T, FPMPI_ROOT_RANK, TAG_SORT_LOCAL_OUTPUT, comm);
-    }
-
-    free(local_array);
-    dispatch_destroy(&dispatch);
-
-#ifdef BENCHMARK
-    double finish = MPI_Wtime();
-    if (my_rank == FPMPI_ROOT_RANK) {
-        printf("%d;%f\n", world_size, finish - start);
-    }
-#endif
-    return (fpmpi_result_t) {
-            .content = result,
-            .count = count,
-    };
-}
-
-
-fpmpi_result_t
-scan(void *array, int count, int type, int scan_type, void *f(void *, void *), void *initial_value, MPI_Comm comm) {
-#ifdef BENCHMARK
-    double start = MPI_Wtime();
-#endif
-    int my_rank = get_my_rank(comm);
-    int world_size = get_world_size(comm);
-
-    void *current_accumulators = NULL;
-    int current_accumulators_count8 = 0;
-
-    if (my_rank != 0) {
-        MPI_Status status = {0};
-        MPI_Probe(my_rank - 1, TAG_SCAN_ACCUMULATORS, MPI_COMM_WORLD, &status);
-        MPI_Get_count(&status, MPI_UINT8_T, &current_accumulators_count8);
-        current_accumulators = calloc(current_accumulators_count8, sizeof(uint8_t));
-
-        MPI_Recv(current_accumulators, current_accumulators_count8, MPI_UINT8_T, my_rank - 1, TAG_SCAN_ACCUMULATORS,
-                 comm, MPI_STATUS_IGNORE);
-        /* Initial value is the last accumulator value */
-        initial_value = ((uint8_t *) current_accumulators) + current_accumulators_count8 - scan_type;
-    } else {
-        current_accumulators = calloc(count + 1, scan_type);
-        current_accumulators_count8 = (count + 1) * scan_type;
-    }
-
-    void *local_accumulators = local_scan(array, count, type, scan_type, f, initial_value);
-    int local_accumulators_count8 = (count + 1) * scan_type;
-
-    if (my_rank != 0) {
-        /* First accumulators is ignored because it will be duplicated */
-        uint8_t *local_accumulators8 = (uint8_t *) local_accumulators + scan_type;
-        local_accumulators_count8 = (count) * scan_type;
-        current_accumulators = realloc(current_accumulators, current_accumulators_count8 + local_accumulators_count8);
-        uint8_t *current_accumulators8 = (uint8_t *) current_accumulators + current_accumulators_count8;
-        current_accumulators_count8 += local_accumulators_count8;
-        memcpy(current_accumulators8, local_accumulators8, local_accumulators_count8);
-    } else {
-        memcpy(current_accumulators, local_accumulators, local_accumulators_count8);
-    }
-
-    int dest = my_rank == world_size - 1 ? FPMPI_ROOT_RANK : my_rank + 1;
-
-    /* Isend because if dest == my_rank, a deadlock will occur, MPI_Recv is after */
-    MPI_Request request = {0};
-    MPI_Isend(current_accumulators, current_accumulators_count8, MPI_UINT8_T, dest, TAG_SCAN_ACCUMULATORS, comm,
-              &request);
-
-    void *result = NULL;
-    int recv_count = 0;
-    if (my_rank == FPMPI_ROOT_RANK) {
-        MPI_Status status = {0};
-        MPI_Probe(world_size - 1, TAG_SCAN_ACCUMULATORS, MPI_COMM_WORLD, &status);
-        MPI_Get_count(&status, MPI_UINT8_T, &recv_count);
-        result = calloc(recv_count, sizeof(uint8_t));
-        MPI_Recv(result, recv_count, MPI_UINT8_T, world_size - 1, TAG_SCAN_ACCUMULATORS, comm,
-                 MPI_STATUS_IGNORE);
-    }
-
-    MPI_Wait(&request, MPI_STATUS_IGNORE);
-
-    free(local_accumulators);
-    free(current_accumulators);
-
-#ifdef BENCHMARK
-    double finish = MPI_Wtime();
-    if (my_rank == FPMPI_ROOT_RANK) {
-        printf("%d;%f\n", world_size, finish - start);
-    }
-#endif
-    return (fpmpi_result_t) {
-            .content = result,
-            .count = set_count(my_rank, recv_count / scan_type),
-    };
-}
-
-fpmpi_result_t iota(int n, MPI_Comm comm) {
-#ifdef BENCHMARK
-    double start = MPI_Wtime();
-#endif
-    int my_rank = get_my_rank(comm);
-    int world_size = get_world_size(comm);
-    dispatch_t dispatch = dispatch_init(n, FPMPI_INT32, FPMPI_INT32, world_size, my_rank);
-
-    int local_start = dispatch.in_displacements8[my_rank] / FPMPI_INT32;
-    int local_n = dispatch.in_counts8[my_rank] / FPMPI_INT32;
-    void *local_output = local_iota(local_start, local_n);
-
-    void *result = NULL;
-    if (my_rank == FPMPI_ROOT_RANK) {
-        result = calloc(n, FPMPI_INT32);
-    }
-    MPI_Gatherv(local_output, dispatch.out_counts8[my_rank], MPI_UINT8_T, result, dispatch.out_counts8,
-                dispatch.out_displacements8, MPI_UINT8_T, FPMPI_ROOT_RANK, comm);
-
-    free(local_output);
-    dispatch_destroy(&dispatch);
-#ifdef BENCHMARK
-    double finish = MPI_Wtime();
-    if (my_rank == FPMPI_ROOT_RANK) {
-        printf("%d;%f\n", world_size, finish - start);
-    }
-#endif
-    return (fpmpi_result_t) {
-            .content = result,
-            .count = set_count(my_rank, n)
-    };
-}
-
-fpmpi_result_t zip(void *array1, void *array2, int count, int type1, int type2, int tuple_type, void *f(void *, void *),
-                   MPI_Comm comm) {
-#ifdef BENCHMARK
-    double start = MPI_Wtime();
-#endif
-    int my_rank = get_my_rank(comm);
-    int world_size = get_world_size(comm);
-
-    dispatch_t dispatch1 = dispatch_init(count, type1, type1, world_size, FPMPI_ROOT_RANK);
-    dispatch_t dispatch2 = dispatch_init(count, type2, type2, world_size, FPMPI_ROOT_RANK);
-    dispatch_t dispatch3 = dispatch_init(count, tuple_type, tuple_type, world_size, FPMPI_ROOT_RANK);
-
-    void *local_array1 = calloc(dispatch1.in_counts8[my_rank], sizeof(uint8_t));
-    MPI_Scatterv(array1, dispatch1.in_counts8, dispatch1.in_displacements8, MPI_UINT8_T, local_array1,
-                 dispatch1.in_counts8[my_rank], MPI_UINT8_T, FPMPI_ROOT_RANK, comm);
-
-    void *local_array2 = calloc(dispatch2.in_counts8[my_rank], sizeof(uint8_t));
-    MPI_Scatterv(array2, dispatch2.in_counts8, dispatch2.in_displacements8, MPI_UINT8_T, local_array2,
-                 dispatch2.in_counts8[my_rank], MPI_UINT8_T, FPMPI_ROOT_RANK, comm);
-
-    int local_count = dispatch1.in_counts8[my_rank] / type1;
-    void *local_output = local_zip(local_array1, local_array2, local_count, type1, type2, tuple_type, f);
-
-    void *result = NULL;
-    if (my_rank == FPMPI_ROOT_RANK) {
-        result = calloc(count, tuple_type);
-    }
-    MPI_Gatherv(local_output, dispatch3.out_counts8[my_rank], MPI_UINT8_T, result, dispatch3.out_counts8,
-                dispatch3.out_displacements8, MPI_UINT8_T, FPMPI_ROOT_RANK, comm);
-
-
-    free(local_array1);
-    free(local_array2);
-    free(local_output);
-
-    dispatch_destroy(&dispatch1);
-    dispatch_destroy(&dispatch2);
-    dispatch_destroy(&dispatch3);
-
-#ifdef BENCHMARK
-    double finish = MPI_Wtime();
-    if (my_rank == FPMPI_ROOT_RANK) {
-        printf("%d;%f\n", world_size, finish - start);
-    }
-#endif
-    return (fpmpi_result_t) {
-            .content = result,
-            .count = set_count(my_rank, count)
-    };
-}
diff --git a/lib/fpmpi.h b/lib/fpmpi.h
deleted file mode 100644
index 02ae54bbd5580a95ed229c81b92d18d396291fcb..0000000000000000000000000000000000000000
--- a/lib/fpmpi.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef FPMPI_LIBRARY_H
-#define FPMPI_LIBRARY_H
-
-#include <mpi.h>
-#include <stdint.h>
-#include <stdbool.h>
-
-#define FPMPI_INT8 sizeof(int8_t)
-#define FPMPI_UINT8 sizeof(uint8_t)
-
-#define FPMPI_INT16 sizeof(int16_t)
-#define FPMPI_UINT16 sizeof(uint16_t)
-
-#define FPMPI_INT32 sizeof(int32_t)
-#define FPMPI_UINT32 sizeof(uint32_t)
-
-#define FPMPI_INT64 sizeof(int64_t)
-#define FPMPI_UINT64 sizeof(int64_t)
-
-#define FPMPI_DOUBLE sizeof(double)
-
-#define FPMPI_ROOT_RANK 0
-
-#define FPMPI_MERGE_SORT 1
-
-typedef struct fpmpi_result {
-    void *content;
-    int count;
-} fpmpi_result_t;
-
-fpmpi_result_t map(void *array, int count, int type, int map_type, void *f(void *), MPI_Comm comm);
-
-fpmpi_result_t filter(void *array, int count, int type, bool f(void *), MPI_Comm comm);
-
-fpmpi_result_t reduce(void *array, int count, int type, void f(void *, void *), MPI_Comm comm);
-
-fpmpi_result_t
-fold_left(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value, MPI_Comm comm);
-
-fpmpi_result_t
-fold_right(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value, MPI_Comm comm);
-
-fpmpi_result_t
-scan(void *array, int count, int type, int scan_type, void *f(void *, void *), void *initial_value, MPI_Comm comm);
-
-fpmpi_result_t sort(void *array, int count, int type, int sort_method, bool f(void *, void *), MPI_Comm comm);
-
-fpmpi_result_t find(void *array, int count, int type, bool f(void *), MPI_Comm comm);
-
-fpmpi_result_t iota(int n, MPI_Comm comm);
-
-fpmpi_result_t zip(void *array1, void *array2, int count, int type1, int type2, int tuple_type, void *f(void *, void *),
-                   MPI_Comm comm);
-
-//fpmpi_result_t unzip(tuple2_t *array, int count, int type1, int type2, MPI_Comm comm);
-
-//fpmpi_result_t flat_map(void *array, int dimensions[2], int type, MPI_Comm comm);
-// map2, map (zip xs ys)
-// rotate
-
-#endif //FPMPI_LIBRARY_H
diff --git a/lib/tests/tests.c b/lib/tests/tests.c
deleted file mode 100644
index 6edb8afb08dc92d1aeadf06b294ffb85a281bef9..0000000000000000000000000000000000000000
--- a/lib/tests/tests.c
+++ /dev/null
@@ -1,453 +0,0 @@
-#include <mpi.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <assert.h>
-#include "../fpmpi.h"
-#include "../fp.h"
-
-#define TEST_MAP 1
-#define TEST_FILTER 2
-#define TEST_REDUCE 3
-#define TEST_FIND 4
-#define TEST_FOLD_LEFT 5
-#define TEST_FOLD_RIGHT 6
-#define TEST_SORT 7
-#define TEST_SCAN 8
-#define TEST_IOTA 9
-#define TEST_ZIP 10
-
-#define N 12
-
-void *map_mul_int(void *element) {
-    int *result = calloc(1, sizeof(int));
-    *result = (*(int *) (element)) * 2;
-    return (void *) result;
-}
-
-void *map_mul_int_double(void *element) {
-    double *result = calloc(1, sizeof(double));
-    *result = (*(int *) (element)) * 2.0;
-    return (void *) result;
-}
-
-bool filter_only_even(void *element) {
-    int element32 = *(int *) element;
-    return element32 % 2 == 0;
-}
-
-bool find_divide_by_three(void *element) {
-    int element32 = *(int *) element;
-    return element32 % 3 == 0;
-}
-
-bool find_fifty(void *element) {
-    int element32 = *(int *) element;
-    return element32 == 50;
-}
-
-void reduce_sum(void *accumulator, void *current_value) {
-    int *accumulator32 = (int *) accumulator;
-    int current_value32 = *(int *) current_value;
-    *accumulator32 = (*accumulator32 + current_value32);
-}
-
-void fold_left_sub(void *accumulator, void *current_value) {
-    int *accumulator32 = (int *) accumulator;
-    int current_value32 = *(int *) current_value;
-    *accumulator32 = (*accumulator32 - current_value32);
-}
-
-void fold_left_sub_int8_int(void *accumulator, void *current_value) {
-    int *accumulator32 = (int *) accumulator;
-    int current_value32 = (int) (*(char *) current_value);
-    *accumulator32 = (*accumulator32 - current_value32);
-}
-
-void fold_right_sub(void *current_value, void *accumulator) {
-    int *accumulator32 = (int *) accumulator;
-    int current_value32 = *(int *) current_value;
-    *accumulator32 = (current_value32 - *accumulator32);
-}
-
-void fold_right_sub_int8_int(void *current_value, void *accumulator) {
-    int *accumulator32 = (int *) accumulator;
-    int current_value32 = (int) (*(char *) current_value);
-    *accumulator32 = (current_value32 - *accumulator32);
-}
-
-void *scan_add(void *accumulator, void *current_value) {
-    int *accumulator32 = (int *) accumulator;
-    int current_value32 = *(int *) current_value;
-    int *new_accumulator = calloc(1, sizeof(int));
-    *new_accumulator = (*accumulator32 + current_value32);
-    return new_accumulator;
-}
-
-void tests_map(int my_rank) {
-    int array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
-    {
-        printf("Test map 1...\n");
-        fpmpi_result_t result = map(array, N, FPMPI_INT32, FPMPI_INT32, map_mul_int, MPI_COMM_WORLD);
-        if (my_rank == FPMPI_ROOT_RANK) {
-            int expected_content[N] = {2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24};
-            int *content = result.content;
-
-            assert(result.content != NULL);
-            assert(result.count == N);
-            for (int i = 0; i < result.count; ++i) {
-                assert(content[i] == expected_content[i]);
-            }
-            free(result.content);
-        } else {
-            assert(result.content == NULL);
-            assert(result.count == 0);
-        }
-    }
-
-    {
-        printf("Test map 2...\n");
-        fpmpi_result_t result = map(array, N, FPMPI_INT32, FPMPI_DOUBLE, map_mul_int_double, MPI_COMM_WORLD);
-        if (my_rank == FPMPI_ROOT_RANK) {
-            double expected_content[N] = {2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0};
-            double *content = result.content;
-
-            assert(result.content != NULL);
-            assert(result.count == N);
-            for (int i = 0; i < result.count; ++i) {
-                assert(content[i] == expected_content[i]);
-            }
-
-            free(result.content);
-        } else {
-            assert(result.content == NULL);
-            assert(result.count == 0);
-        }
-    }
-}
-
-void tests_filter(int my_rank) {
-    int array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
-    {
-        printf("Test filter 1...\n");
-        fpmpi_result_t result = filter(array, N, FPMPI_INT32, filter_only_even, MPI_COMM_WORLD);
-        if (my_rank == FPMPI_ROOT_RANK) {
-            int expected_content[N / 2] = {2, 4, 6, 8, 10, 12};
-            int *content = result.content;
-
-            assert(result.count == 6);
-            for (int i = 0; i < result.count; ++i) {
-                assert(content[i] == expected_content[i]);
-            }
-            free(result.content);
-        } else {
-            assert(result.content == NULL);
-            assert(result.count == 0);
-        }
-    }
-}
-
-void tests_reduce(int my_rank) {
-    int array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
-    {
-        printf("Test reduce 1...\n");
-        fpmpi_result_t result = reduce(array, N, FPMPI_INT32, reduce_sum, MPI_COMM_WORLD);
-        if (my_rank == FPMPI_ROOT_RANK) {
-            int expected_content = 468;
-            int *content = result.content;
-
-            assert(result.count == 1);
-            assert(*content == expected_content);
-            free(result.content);
-        } else {
-            assert(result.content == NULL);
-            assert(result.count == 0);
-        }
-    }
-}
-
-void tests_find(int my_rank) {
-    int array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
-    {
-        printf("Test find 1...\n");
-        fpmpi_result_t result = find(array, N, FPMPI_INT32, find_divide_by_three, MPI_COMM_WORLD);
-        if (my_rank == FPMPI_ROOT_RANK) {
-            int expected_content = 3;
-            int *content = result.content;
-
-            assert(result.count == 1);
-            assert(*content == expected_content);
-            free(result.content);
-        } else {
-            assert(result.content == NULL);
-            assert(result.count == 0);
-        }
-    }
-
-    {
-        printf("Test find 2...\n");
-        fpmpi_result_t result = find(array, N, FPMPI_INT32, find_fifty, MPI_COMM_WORLD);
-        if (my_rank == FPMPI_ROOT_RANK) {
-            int *content = result.content;
-
-            assert(result.count == 0);
-            assert(content == NULL);
-            free(result.content);
-        } else {
-            assert(result.content == NULL);
-            assert(result.count == 0);
-        }
-    }
-}
-
-void tests_fold_left(int my_rank) {
-    {
-        printf("Test fold_left 1...\n");
-        int array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
-        int initial_value = 0;
-        fpmpi_result_t result = fold_left(array, N, FPMPI_INT32, FPMPI_INT32, fold_left_sub, &initial_value,
-                                          MPI_COMM_WORLD);
-        if (my_rank == FPMPI_ROOT_RANK) {
-            int expected_content = -468;
-            int *content = result.content;
-
-            assert(result.count == 1);
-            assert(*content == expected_content);
-            free(result.content);
-        } else {
-            assert(result.content == NULL);
-            assert(result.count == 0);
-        }
-    }
-    {
-        printf("Test fold_left 2...\n");
-        char array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
-        double initial_value = 0;
-        fpmpi_result_t result = fold_left(array, N, FPMPI_INT8, FPMPI_INT32, fold_left_sub_int8_int, &initial_value,
-                                          MPI_COMM_WORLD);
-        if (my_rank == FPMPI_ROOT_RANK) {
-            double expected_content = -468.0;
-            int *content = result.content;
-
-            assert(result.count == 1);
-            assert(*content == expected_content);
-            free(result.content);
-        } else {
-            assert(result.content == NULL);
-            assert(result.count == 0);
-        }
-    }
-}
-
-void tests_fold_right(int my_rank) {
-    {
-        printf("Test fold_left 1...\n");
-        int array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
-        int initial_value = 0;
-        fpmpi_result_t result = fold_right(array, N, FPMPI_INT32, FPMPI_INT32, fold_right_sub, &initial_value,
-                                           MPI_COMM_WORLD);
-        if (my_rank == FPMPI_ROOT_RANK) {
-            int expected_content = -36;
-            int *content = result.content;
-
-            assert(result.count == 1);
-            assert(*content == expected_content);
-            free(result.content);
-        } else {
-            assert(result.content == NULL);
-            assert(result.count == 0);
-        }
-    }
-
-    {
-        printf("Test fold_left 2...\n");
-        char array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
-        int initial_value = 0;
-        fpmpi_result_t result = fold_right(array, N, FPMPI_INT8, FPMPI_INT32, fold_right_sub_int8_int, &initial_value,
-                                           MPI_COMM_WORLD);
-        if (my_rank == FPMPI_ROOT_RANK) {
-            int expected_content = -36;
-            int *content = result.content;
-
-            assert(result.count == 1);
-            assert(*content == expected_content);
-            free(result.content);
-        } else {
-            assert(result.content == NULL);
-            assert(result.count == 0);
-        }
-    }
-}
-
-bool sort_asc(void *left, void *right) {
-    int left32 = *(int *) left;
-    int right32 = *(int *) right;
-    return left32 < right32;
-}
-
-bool sort_dsc(void *left, void *right) {
-    int left32 = *(int *) left;
-    int right32 = *(int *) right;
-    return left32 > right32;
-}
-
-void tests_sort(int my_rank) {
-    int array[N] = {18, 15, 83, 56, 41, 100, 71, 7, 69, 23, 36, 77};
-    {
-        printf("Test sort 1...\n");
-        fpmpi_result_t result = sort(array, N, FPMPI_INT32, FPMPI_MERGE_SORT, sort_asc, MPI_COMM_WORLD);
-        if (my_rank == FPMPI_ROOT_RANK) {
-            int expected_content[N] = {7, 15, 18, 23, 36, 41, 56, 69, 71, 77, 83, 100};
-            int *content = result.content;
-
-            assert(result.count == N);
-            for (int i = 0; i < result.count; ++i) {
-                assert(content[i] == expected_content[i]);
-            }
-            free(result.content);
-        } else {
-            assert(result.content == NULL);
-            assert(result.count == N);
-        }
-    }
-    {
-        printf("Test sort 2...\n");
-        fpmpi_result_t result = sort(array, N, FPMPI_INT32, FPMPI_MERGE_SORT, sort_dsc, MPI_COMM_WORLD);
-        if (my_rank == FPMPI_ROOT_RANK) {
-            int expected_content[N] = {100, 83, 77, 71, 69, 56, 41, 36, 23, 18, 15, 7};
-            int *content = result.content;
-
-            assert(result.count == N);
-            for (int i = 0; i < result.count; ++i) {
-                assert(content[i] == expected_content[i]);
-            }
-            free(result.content);
-        } else {
-            assert(result.content == NULL);
-            assert(result.count == N);
-        }
-    }
-}
-
-void tests_scan(int my_rank) {
-    {
-        printf("Test scan 1...\n");
-        int array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
-        int initial_value = 0;
-        fpmpi_result_t result = scan(array, N, FPMPI_INT32, FPMPI_INT32, scan_add, &initial_value, MPI_COMM_WORLD);
-        if (my_rank == FPMPI_ROOT_RANK) {
-            int expected_content = 468;
-            int *content = result.content;
-
-            assert(result.count == 73);
-            assert(content[result.count - 1] == expected_content);
-            free(result.content);
-        } else {
-            assert(result.content == NULL);
-            assert(result.count == 0);
-        }
-    }
-}
-
-void tests_iota(int my_rank) {
-    {
-        printf("Test iota 1...\n");
-        fpmpi_result_t result = iota(N, MPI_COMM_WORLD);
-        if (my_rank == FPMPI_ROOT_RANK) {
-            int expected_content[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-            int *content = result.content;
-
-            assert(result.count == N);
-            for (int i = 0; i < result.count; ++i) {
-                assert(content[i] == expected_content[i]);
-            }
-            free(result.content);
-        } else {
-            assert(result.content == NULL);
-            assert(result.count == 0);
-        }
-    }
-}
-
-//void tests_zip(int my_rank) {
-//    int array1[N] = {18, 15, 83, 56, 41, 100, 71, 7, 69, 23, 36, 77};
-//    char array2[N] = {5, 91, 70, 96, 9, 98, 37, 1, 13, 3, 42, 7};
-//    {
-//        printf("Test zip 1...\n");
-//        fpmpi_result_t result = zip(array1, array2, N, FPMPI_INT32, FPMPI_INT8, MPI_COMM_WORLD);
-//        if (my_rank == FPMPI_ROOT_RANK) {
-//            tuple2_t expected_content[N] = {
-//                    {.first = &array1[0], .second = &array2[0]},
-//                    {.first = &array1[1], .second = &array2[1]},
-//                    {.first = &array1[2], .second = &array2[2]},
-//                    {.first = &array1[3], .second = &array2[3]},
-//                    {.first = &array1[4], .second = &array2[4]},
-//                    {.first = &array1[5], .second = &array2[5]},
-//                    {.first = &array1[6], .second = &array2[6]},
-//                    {.first = &array1[7], .second = &array2[7]},
-//                    {.first = &array1[8], .second = &array2[8]},
-//                    {.first = &array1[9], .second = &array2[9]},
-//                    {.first = &array1[10], .second = &array2[10]},
-//                    {.first = &array1[11], .second = &array2[11]},
-//            };
-//            tuple2_t *content = result.content;
-//            assert(result.count == N);
-//            for (int i = 0; i < result.count; ++i) {
-//                assert(content[i].first == expected_content[i].first);
-//                assert(content[i].second == expected_content[i].second);
-//            }
-//            free(result.content);
-//        } else {
-//            assert(result.content == NULL);
-//            assert(result.count == 0);
-//        }
-//    }
-//}
-
-int main(int argc, char *argv[]) {
-    MPI_Init(&argc, &argv);
-
-    int my_rank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
-
-    for (int i = TEST_MAP; i <= TEST_ZIP; ++i) {
-        switch (i) {
-            case TEST_MAP:
-                tests_map(my_rank);
-                break;
-            case TEST_FILTER:
-                tests_filter(my_rank);
-                break;
-            case TEST_REDUCE:
-                tests_reduce(my_rank);
-                break;
-            case TEST_FIND:
-                tests_find(my_rank);
-                break;
-            case TEST_FOLD_LEFT:
-                tests_fold_left(my_rank);
-                break;
-            case TEST_FOLD_RIGHT:
-                tests_fold_right(my_rank);
-                break;
-            case TEST_SORT:
-                tests_sort(my_rank);
-                break;
-            case TEST_SCAN:
-                tests_scan(my_rank);
-                break;
-            case TEST_IOTA:
-                tests_iota(my_rank);
-                break;
-            case TEST_ZIP:
-//                tests_zip(my_rank);
-                break;
-            default:
-                MPI_Finalize();
-                exit(0);
-        }
-        MPI_Barrier(MPI_COMM_WORLD);
-    }
-
-    MPI_Finalize();
-    return 0;
-}