diff --git a/futmpi/.gitignore b/futmpi/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..3c9d1274d19e37270fa4a2fbf476cec51486b874 --- /dev/null +++ b/futmpi/.gitignore @@ -0,0 +1,128 @@ +### C template +# Prerequisites +*.d + +# Object files +*.o +*.ko +*.obj +*.elf + +# Linker output +*.ilk +*.map +*.exp + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ +*.su +*.idb +*.pdb + +# Kernel Module Compile Results +*.mod* +*.cmd +.tmp_versions/ +modules.order +Module.symvers +Mkfile.old +dkms.conf + +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +.idea diff --git a/futmpi/CMakeLists.txt b/futmpi/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..56ff99060bf09698ddc9654930499a43e8e16e2e --- /dev/null +++ b/futmpi/CMakeLists.txt @@ -0,0 +1,29 @@ +cmake_minimum_required(VERSION 3.19) +project(futmpi C) + +set(CMAKE_C_STANDARD 11) + +if (CMAKE_BUILD_TYPE MATCHES Debug) + set(GCC_COMPILE_FLAGS "-DDEBUG -Wall -Wextra -Wconversion -pedantic -fsanitize=undefined -fsanitize=address") + if (CMAKE_SYSTEM_NAME MATCHES "Linux") + set(GCC_COMPILE_FLAGS "${GCC_COMPILE_FLAGS} -fsanitize=leak") + endif () +elseif (CMAKE_BUILD_TYPE MATCHES Release) + set(GCC_COMPILE_FLAGS "-O3") +endif () + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GCC_COMPILE_FLAGS}") + +if (CMAKE_SYSTEM_NAME MATCHES "Darwin") + include_directories(/usr/local/include) +endif () + +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + execute_process(COMMAND sdl2-config --cflags OUTPUT_VARIABLE SDL2_C_FLAGS) +endif () + +find_package(MPI REQUIRED) +include_directories(${MPI_C_INCLUDE_PATH}) + +add_executable(futmpi main.c gol.h gol.c gfx.c gfx.h) +target_link_libraries(futmpi ${MPI_C_LIBRARIES} SDL2 m "-framework OpenCL") diff --git a/futmpi/gfx.c b/futmpi/gfx.c new file mode 100644 index 
0000000000000000000000000000000000000000..bc1ae9d8f1da68180d286fb6213761ff32511de4 --- /dev/null +++ b/futmpi/gfx.c @@ -0,0 +1,95 @@ +/// @file gfx.c +/// @author Florent Gluck +/// @date November 6, 2016 +/// Helper routines to render pixels in fullscreen graphic mode. +/// Uses the SDL2 library. + +#include "gfx.h" + +/// Create a fullscreen graphic window. +/// @param title Title of the window. +/// @param width Width of the window in pixels. +/// @param height Height of the window in pixels. +/// @return a pointer to the graphic context or NULL if it failed. +struct gfx_context_t *gfx_create(char *title, uint width, uint height) { + if (SDL_Init(SDL_INIT_VIDEO) != 0) goto error; + SDL_Window *window = SDL_CreateWindow(title, SDL_WINDOWPOS_CENTERED, + SDL_WINDOWPOS_CENTERED, width, height, SDL_WINDOW_RESIZABLE); + SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, 0); + SDL_Texture *texture = SDL_CreateTexture(renderer, SDL_PIXELFORMAT_ARGB8888, + SDL_TEXTUREACCESS_STREAMING, width, height); + uint32_t *pixels = malloc(width * height * sizeof(uint32_t)); + struct gfx_context_t *ctxt = malloc(sizeof(struct gfx_context_t)); + + if (!window || !renderer || !texture || !pixels || !ctxt) goto error; + + ctxt->renderer = renderer; + ctxt->texture = texture; + ctxt->window = window; + ctxt->width = width; + ctxt->height = height; + ctxt->pixels = pixels; + + SDL_ShowCursor(SDL_DISABLE); + gfx_clear(ctxt, COLOR_BLACK); + return ctxt; + + error: + return NULL; +} + +/// Draw a pixel in the specified graphic context. +/// @param ctxt Graphic context where the pixel is to be drawn. +/// @param x X coordinate of the pixel. +/// @param y Y coordinate of the pixel. +/// @param color Color of the pixel. +void gfx_putpixel(struct gfx_context_t *ctxt, int x, int y, uint32_t color) { + if (x < ctxt->width && y < ctxt->height) + ctxt->pixels[ctxt->width * y + x] = color; +} + +/// Clear the specified graphic context. +/// @param ctxt Graphic context to clear. 
+/// @param color Color to use. +void gfx_clear(struct gfx_context_t *ctxt, uint32_t color) { + int n = ctxt->width * ctxt->height; + while (n) + ctxt->pixels[--n] = color; +} + +/// Display the graphic context. +/// @param ctxt Graphic context to clear. +void gfx_present(struct gfx_context_t *ctxt) { + SDL_UpdateTexture(ctxt->texture, NULL, ctxt->pixels, ctxt->width * sizeof(uint32_t)); + SDL_RenderCopy(ctxt->renderer, ctxt->texture, NULL, NULL); + SDL_RenderPresent(ctxt->renderer); +} + +/// Destroy a graphic window. +/// @param ctxt Graphic context of the window to close. +void *gfx_destroy(struct gfx_context_t *ctxt) { + SDL_ShowCursor(SDL_ENABLE); + SDL_DestroyTexture(ctxt->texture); + SDL_DestroyRenderer(ctxt->renderer); + SDL_DestroyWindow(ctxt->window); + free(ctxt->pixels); + ctxt->texture = NULL; + ctxt->renderer = NULL; + ctxt->window = NULL; + ctxt->pixels = NULL; + SDL_Quit(); + free(ctxt); + return NULL; +} + +/// If a key was pressed, returns its key code (non blocking call). +/// List of key codes: https://wiki.libsdl.org/SDL_Keycode +/// SDL_PumpEvents() must be called before. +/// @return 0 if escape was not pressed. 
+SDL_Keycode gfx_keypressed() { + const Uint8 *state = SDL_GetKeyboardState(NULL); + if (state && state[SDL_SCANCODE_ESCAPE]) { + return SDLK_ESCAPE; + } + return 0; +} diff --git a/futmpi/gfx.h b/futmpi/gfx.h new file mode 100644 index 0000000000000000000000000000000000000000..d6604aeab83d63749ab6ab6424557e282b50e511 --- /dev/null +++ b/futmpi/gfx.h @@ -0,0 +1,43 @@ +#ifndef _GFX_H_ +#define _GFX_H_ + +#include <stdint.h> +#include <SDL2/SDL.h> + +#define MAKE_COLOR(r, g, b) ((uint32_t)b|((uint32_t)g<<8)|((uint32_t)r<<16)) + +#define COLOR_BLACK 0x00000000 +#define COLOR_RED 0x00FF0000 +#define COLOR_GREEN 0x0000FF00 +#define COLOR_BLUE 0x000000FF +#define COLOR_WHITE 0x00FFFFFF +#define COLOR_YELLOW 0x00FFFF00 + +typedef unsigned int uint; +typedef unsigned long ulong; +typedef unsigned char uchar; + +struct gfx_context_t { + SDL_Window *window; + SDL_Renderer *renderer; + SDL_Texture *texture; + uint32_t *pixels; + int width; + int height; +}; + +extern void gfx_putpixel(struct gfx_context_t *ctxt, int x, int y, uint32_t color); + +extern void gfx_clear(struct gfx_context_t *ctxt, uint32_t color); + +extern struct gfx_context_t *gfx_create(char *text, uint width, uint height); + +extern void *gfx_destroy(struct gfx_context_t *ctxt); + +extern void gfx_present(struct gfx_context_t *ctxt); + +extern SDL_Keycode gfx_keypressed(); + +extern SDL_EventType poll_event(); + +#endif diff --git a/futmpi/gol.c b/futmpi/gol.c new file mode 100644 index 0000000000000000000000000000000000000000..cc1a72e8f68e773aeaebb18d63c30f26f3f69767 --- /dev/null +++ b/futmpi/gol.c @@ -0,0 +1,4707 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#ifdef __GNUC__ +#pragma GCC diagnostic ignored "-Wunused-function" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wparentheses" +#pragma GCC diagnostic ignored "-Wunused-label" +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif +#ifdef __clang__ +#pragma clang diagnostic ignored 
"-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wparentheses" +#pragma clang diagnostic ignored "-Wunused-label" +#endif +// Headers + +#include <stdint.h> +#include <stddef.h> +#include <stdbool.h> +#include <stdio.h> +#include <float.h> +#define CL_TARGET_OPENCL_VERSION 120 +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#ifdef __APPLE__ +#define CL_SILENCE_DEPRECATION +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// Initialisation + +struct futhark_context_config ; +struct futhark_context_config *futhark_context_config_new(void); +void futhark_context_config_free(struct futhark_context_config *cfg); +void futhark_context_config_add_build_option(struct futhark_context_config *cfg, + const char *opt); +void futhark_context_config_set_debugging(struct futhark_context_config *cfg, + int flag); +void futhark_context_config_set_profiling(struct futhark_context_config *cfg, + int flag); +void futhark_context_config_set_logging(struct futhark_context_config *cfg, + int flag); +void futhark_context_config_set_device(struct futhark_context_config *cfg, const + char *s); +void futhark_context_config_set_platform(struct futhark_context_config *cfg, + const char *s); +void +futhark_context_config_select_device_interactively(struct futhark_context_config *cfg); +void futhark_context_config_list_devices(struct futhark_context_config *cfg); +void futhark_context_config_dump_program_to(struct futhark_context_config *cfg, + const char *path); +void +futhark_context_config_load_program_from(struct futhark_context_config *cfg, + const char *path); +void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg, + const char *path); +void futhark_context_config_load_binary_from(struct futhark_context_config *cfg, + const char *path); +void +futhark_context_config_set_default_group_size(struct futhark_context_config *cfg, + int size); +void 
+futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg, + int num); +void +futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, + int num); +void +futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, + int num); +void +futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, + int num); +int futhark_context_config_set_size(struct futhark_context_config *cfg, const + char *size_name, size_t size_value); +struct futhark_context ; +struct futhark_context *futhark_context_new(struct futhark_context_config *cfg); +struct futhark_context +*futhark_context_new_with_command_queue(struct futhark_context_config *cfg, + cl_command_queue queue); +void futhark_context_free(struct futhark_context *ctx); +cl_command_queue futhark_context_get_command_queue(struct futhark_context *ctx); +int futhark_get_num_sizes(void); +const char *futhark_get_size_name(int); +const char *futhark_get_size_class(int); + +// Arrays + +struct futhark_i8_2d ; +struct futhark_i8_2d *futhark_new_i8_2d(struct futhark_context *ctx, const + int8_t *data, int64_t dim0, + int64_t dim1); +struct futhark_i8_2d *futhark_new_raw_i8_2d(struct futhark_context *ctx, const + cl_mem data, int offset, + int64_t dim0, int64_t dim1); +int futhark_free_i8_2d(struct futhark_context *ctx, struct futhark_i8_2d *arr); +int futhark_values_i8_2d(struct futhark_context *ctx, struct futhark_i8_2d *arr, + int8_t *data); +cl_mem futhark_values_raw_i8_2d(struct futhark_context *ctx, + struct futhark_i8_2d *arr); +const int64_t *futhark_shape_i8_2d(struct futhark_context *ctx, + struct futhark_i8_2d *arr); + +// Opaque values + + +// Entry points + +int futhark_entry_get_envelope(struct futhark_context *ctx, + struct futhark_i8_2d **out0, const + struct futhark_i8_2d *in0); +int futhark_entry_next_chunk_board(struct futhark_context *ctx, + struct futhark_i8_2d **out0, const + struct futhark_i8_2d *in0, const + struct 
futhark_i8_2d *in1);

// Miscellaneous

int futhark_context_sync(struct futhark_context *ctx);
char *futhark_context_report(struct futhark_context *ctx);
char *futhark_context_get_error(struct futhark_context *ctx);
void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f);
void futhark_context_pause_profiling(struct futhark_context *ctx);
void futhark_context_unpause_profiling(struct futhark_context *ctx);
int futhark_context_clear_caches(struct futhark_context *ctx);
#define FUTHARK_BACKEND_opencl
#ifdef __cplusplus
}
#endif
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include <stdint.h>
#undef NDEBUG
#include <assert.h>
#include <stdarg.h>
// Start of util.h.
//
// Various helper functions that are useful in all generated C code.

#include <errno.h>
#include <string.h>

static const char *fut_progname = "(embedded Futhark)";

// Print "<progname>: <formatted message>" to stderr and terminate the
// process with exit status 'eval'.  Does not return.
static void futhark_panic(int eval, const char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  fprintf(stderr, "%s: ", fut_progname);
  vfprintf(stderr, fmt, ap);
  va_end(ap);
  exit(eval);
}

// For generating arbitrary-sized error messages.  It is the callers
// responsibility to free the buffer at some point.  Returns NULL if
// the message cannot be formatted or memory cannot be allocated.
static char* msgprintf(const char *s, ...) {
  va_list vl;

  // First pass: measure.  Each traversal of the argument list needs its
  // own va_start/va_end pair.
  va_start(vl, s);
  int len = vsnprintf(NULL, 0, s, vl);
  va_end(vl);
  if (len < 0) {
    return NULL;
  }

  size_t needed = 1 + (size_t)len;
  char *buffer = (char*) malloc(needed);
  if (buffer == NULL) {
    return NULL;
  }

  // Second pass: format into the exactly-sized buffer.
  va_start(vl, s);
  vsnprintf(buffer, needed, s, vl);
  va_end(vl);
  return buffer;
}


// If 'errval' is nonzero, print the formatted message together with the
// calling function, the line and an error description to stderr, then
// exit with 'errval'.  The description is strerror(errno) when
// 'sets_errno' is nonzero and the decimal value of 'errval' otherwise.
static inline void check_err(int errval, int sets_errno, const char *fun, int line,
                             const char *msg, ...) {
  if (errval) {
    // Large enough for any 32-bit int including sign and terminator.
    char errnum[12];
    snprintf(errnum, sizeof(errnum), "%d", errval);

    // Capture the description before any stdio call can disturb errno.
    const char *desc = sets_errno ? strerror(errno) : errnum;

    va_list vl;
    va_start(vl, msg);

    fprintf(stderr, "ERROR: ");
    vfprintf(stderr, msg, vl);
    va_end(vl);
    fprintf(stderr, " in %s() at line %d with error code %s\n",
            fun, line, desc);
    exit(errval);
  }
}

#define CHECK_ERR(err, msg...) check_err(err, 0, __func__, __LINE__, msg)
#define CHECK_ERRNO(err, msg...) check_err(err, 1, __func__, __LINE__, msg)

// Read the rest of an open file into a NUL-terminated string; returns
// NULL on error.  When 'size' is non-NULL it receives the number of
// bytes remaining in the file, as soon as that number is known; it is
// left untouched if the file position cannot be determined.
static void* fslurp_file(FILE *f, size_t *size) {
  long start = ftell(f);
  if (start < 0 || fseek(f, 0, SEEK_END) != 0) {
    return NULL;
  }

  // A failed ftell() yields -1, which is also caught by 'end < start'.
  long end = ftell(f);
  if (end < start || fseek(f, start, SEEK_SET) != 0) {
    return NULL;
  }

  size_t src_size = (size_t)(end - start);
  unsigned char *s = (unsigned char*) malloc(src_size + 1);
  if (s != NULL) {
    if (fread(s, 1, src_size, f) != src_size) {
      free(s);
      s = NULL;
    } else {
      s[src_size] = '\0';
    }
  }

  if (size) {
    *size = src_size;
  }

  return s;
}

// Read a file into a NUL-terminated string; returns NULL on error.
static void* slurp_file(const char *filename, size_t *size) {
  FILE *f = fopen(filename, "rb"); // To avoid Windows messing with linebreaks.
  if (f == NULL) return NULL;
  unsigned char *s = fslurp_file(f, size);
  fclose(f);
  return s;
}

// Dump 'n' bytes from 'buf' into the file at the designated location.
// Returns 0 on success.
static int dump_file(const char *file, const void *buf, size_t n) {
  // Binary mode, matching slurp_file(), so the bytes are written verbatim.
  FILE *f = fopen(file, "wb");

  if (f == NULL) {
    return 1;
  }

  if (fwrite(buf, sizeof(char), n, f) != n) {
    // Do not leak the stream when the write comes up short.
    fclose(f);
    return 1;
  }

  if (fclose(f) != 0) {
    return 1;
  }

  return 0;
}

struct str_builder {
  char *str;
  size_t capacity; // Size of buffer.
  size_t used; // Bytes used, *not* including final zero.
};

// Initialise 'b' as an empty string with a small initial buffer.
// Terminates the process if the buffer cannot be allocated.
static void str_builder_init(struct str_builder *b) {
  b->capacity = 10;
  b->used = 0;
  b->str = malloc(b->capacity);
  if (b->str == NULL) {
    futhark_panic(1, "Out of memory while creating string builder.\n");
  }
  b->str[0] = 0;
}

// Append a printf-style formatted string to 'b', growing the buffer by
// doubling as needed.  'b' must have been set up by str_builder_init().
// Terminates the process if the buffer cannot be grown; leaves 'b'
// unchanged if the format cannot be processed.
static void str_builder(struct str_builder *b, const char *s, ...) {
  va_list vl;

  va_start(vl, s);
  int len = vsnprintf(NULL, 0, s, vl);
  va_end(vl);
  if (len < 0) {
    return;
  }
  size_t needed = (size_t)len;

  if (b->capacity < b->used + needed + 1) {
    size_t new_capacity = b->capacity;
    while (new_capacity < b->used + needed + 1) {
      new_capacity *= 2;
    }
    // Keep the old pointer until realloc() is known to have succeeded.
    char *grown = realloc(b->str, new_capacity);
    if (grown == NULL) {
      futhark_panic(1, "Out of memory while growing string builder.\n");
    }
    b->str = grown;
    b->capacity = new_capacity;
  }

  va_start(vl, s); // Must re-init.
  vsnprintf(b->str+b->used, b->capacity-b->used, s, vl);
  va_end(vl);
  b->used += needed;
}

// End of util.h.

// Start of timing.h.
+ +// The function get_wall_time() returns the wall time in microseconds +// (with an unspecified offset). + +#ifdef _WIN32 + +#include <windows.h> + +static int64_t get_wall_time(void) { + LARGE_INTEGER time,freq; + assert(QueryPerformanceFrequency(&freq)); + assert(QueryPerformanceCounter(&time)); + return ((double)time.QuadPart / freq.QuadPart) * 1000000; +} + +#else +// Assuming POSIX + +#include <time.h> +#include <sys/time.h> + +static int64_t get_wall_time(void) { + struct timeval time; + assert(gettimeofday(&time,NULL) == 0); + return time.tv_sec * 1000000 + time.tv_usec; +} + +static int64_t get_wall_time_ns(void) { + struct timespec time; + assert(clock_gettime(CLOCK_REALTIME, &time) == 0); + return time.tv_sec * 1000000000 + time.tv_nsec; +} + +#endif + +// End of timing.h. + +#ifdef _MSC_VER +#define inline __inline +#endif +#include <string.h> +#include <string.h> +#include <errno.h> +#include <assert.h> +#include <ctype.h> +#define CL_TARGET_OPENCL_VERSION 120 +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#ifdef __APPLE__ +#define CL_SILENCE_DEPRECATION +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif + +// Start of lock.h. + +// A very simple cross-platform implementation of locks. Uses +// pthreads on Unix and some Windows thing there. Futhark's +// host-level code is not multithreaded, but user code may be, so we +// need some mechanism for ensuring atomic access to API functions. +// This is that mechanism. It is not exposed to user code at all, so +// we do not have to worry about name collisions. + +#ifdef _WIN32 + +typedef HANDLE lock_t; + +static void create_lock(lock_t *lock) { + *lock = CreateMutex(NULL, // Default security attributes. + FALSE, // Initially unlocked. + NULL); // Unnamed. 
+} + +static void lock_lock(lock_t *lock) { + assert(WaitForSingleObject(*lock, INFINITE) == WAIT_OBJECT_0); +} + +static void lock_unlock(lock_t *lock) { + assert(ReleaseMutex(*lock)); +} + +static void free_lock(lock_t *lock) { + CloseHandle(*lock); +} + +#else +// Assuming POSIX + +#include <pthread.h> + +typedef pthread_mutex_t lock_t; + +static void create_lock(lock_t *lock) { + int r = pthread_mutex_init(lock, NULL); + assert(r == 0); +} + +static void lock_lock(lock_t *lock) { + int r = pthread_mutex_lock(lock); + assert(r == 0); +} + +static void lock_unlock(lock_t *lock) { + int r = pthread_mutex_unlock(lock); + assert(r == 0); +} + +static void free_lock(lock_t *lock) { + // Nothing to do for pthreads. + (void)lock; +} + +#endif + +// End of lock.h. + +static inline uint8_t add8(uint8_t x, uint8_t y) +{ + return x + y; +} +static inline uint16_t add16(uint16_t x, uint16_t y) +{ + return x + y; +} +static inline uint32_t add32(uint32_t x, uint32_t y) +{ + return x + y; +} +static inline uint64_t add64(uint64_t x, uint64_t y) +{ + return x + y; +} +static inline uint8_t sub8(uint8_t x, uint8_t y) +{ + return x - y; +} +static inline uint16_t sub16(uint16_t x, uint16_t y) +{ + return x - y; +} +static inline uint32_t sub32(uint32_t x, uint32_t y) +{ + return x - y; +} +static inline uint64_t sub64(uint64_t x, uint64_t y) +{ + return x - y; +} +static inline uint8_t mul8(uint8_t x, uint8_t y) +{ + return x * y; +} +static inline uint16_t mul16(uint16_t x, uint16_t y) +{ + return x * y; +} +static inline uint32_t mul32(uint32_t x, uint32_t y) +{ + return x * y; +} +static inline uint64_t mul64(uint64_t x, uint64_t y) +{ + return x * y; +} +static inline uint8_t udiv8(uint8_t x, uint8_t y) +{ + return x / y; +} +static inline uint16_t udiv16(uint16_t x, uint16_t y) +{ + return x / y; +} +static inline uint32_t udiv32(uint32_t x, uint32_t y) +{ + return x / y; +} +static inline uint64_t udiv64(uint64_t x, uint64_t y) +{ + return x / y; +} +static inline uint8_t 
udiv_up8(uint8_t x, uint8_t y) +{ + return (x + y - 1) / y; +} +static inline uint16_t udiv_up16(uint16_t x, uint16_t y) +{ + return (x + y - 1) / y; +} +static inline uint32_t udiv_up32(uint32_t x, uint32_t y) +{ + return (x + y - 1) / y; +} +static inline uint64_t udiv_up64(uint64_t x, uint64_t y) +{ + return (x + y - 1) / y; +} +static inline uint8_t umod8(uint8_t x, uint8_t y) +{ + return x % y; +} +static inline uint16_t umod16(uint16_t x, uint16_t y) +{ + return x % y; +} +static inline uint32_t umod32(uint32_t x, uint32_t y) +{ + return x % y; +} +static inline uint64_t umod64(uint64_t x, uint64_t y) +{ + return x % y; +} +static inline uint8_t udiv_safe8(uint8_t x, uint8_t y) +{ + return y == 0 ? 0 : x / y; +} +static inline uint16_t udiv_safe16(uint16_t x, uint16_t y) +{ + return y == 0 ? 0 : x / y; +} +static inline uint32_t udiv_safe32(uint32_t x, uint32_t y) +{ + return y == 0 ? 0 : x / y; +} +static inline uint64_t udiv_safe64(uint64_t x, uint64_t y) +{ + return y == 0 ? 0 : x / y; +} +static inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) +{ + return y == 0 ? 0 : (x + y - 1) / y; +} +static inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) +{ + return y == 0 ? 0 : (x + y - 1) / y; +} +static inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) +{ + return y == 0 ? 0 : (x + y - 1) / y; +} +static inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) +{ + return y == 0 ? 0 : (x + y - 1) / y; +} +static inline uint8_t umod_safe8(uint8_t x, uint8_t y) +{ + return y == 0 ? 0 : x % y; +} +static inline uint16_t umod_safe16(uint16_t x, uint16_t y) +{ + return y == 0 ? 0 : x % y; +} +static inline uint32_t umod_safe32(uint32_t x, uint32_t y) +{ + return y == 0 ? 0 : x % y; +} +static inline uint64_t umod_safe64(uint64_t x, uint64_t y) +{ + return y == 0 ? 0 : x % y; +} +static inline int8_t sdiv8(int8_t x, int8_t y) +{ + int8_t q = x / y; + int8_t r = x % y; + + return q - ((r != 0 && r < 0 != y < 0) ? 
1 : 0); +} +static inline int16_t sdiv16(int16_t x, int16_t y) +{ + int16_t q = x / y; + int16_t r = x % y; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} +static inline int32_t sdiv32(int32_t x, int32_t y) +{ + int32_t q = x / y; + int32_t r = x % y; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} +static inline int64_t sdiv64(int64_t x, int64_t y) +{ + int64_t q = x / y; + int64_t r = x % y; + + return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); +} +static inline int8_t sdiv_up8(int8_t x, int8_t y) +{ + return sdiv8(x + y - 1, y); +} +static inline int16_t sdiv_up16(int16_t x, int16_t y) +{ + return sdiv16(x + y - 1, y); +} +static inline int32_t sdiv_up32(int32_t x, int32_t y) +{ + return sdiv32(x + y - 1, y); +} +static inline int64_t sdiv_up64(int64_t x, int64_t y) +{ + return sdiv64(x + y - 1, y); +} +static inline int8_t smod8(int8_t x, int8_t y) +{ + int8_t r = x % y; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} +static inline int16_t smod16(int16_t x, int16_t y) +{ + int16_t r = x % y; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} +static inline int32_t smod32(int32_t x, int32_t y) +{ + int32_t r = x % y; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} +static inline int64_t smod64(int64_t x, int64_t y) +{ + int64_t r = x % y; + + return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); +} +static inline int8_t sdiv_safe8(int8_t x, int8_t y) +{ + return y == 0 ? 0 : sdiv8(x, y); +} +static inline int16_t sdiv_safe16(int16_t x, int16_t y) +{ + return y == 0 ? 0 : sdiv16(x, y); +} +static inline int32_t sdiv_safe32(int32_t x, int32_t y) +{ + return y == 0 ? 0 : sdiv32(x, y); +} +static inline int64_t sdiv_safe64(int64_t x, int64_t y) +{ + return y == 0 ? 
0 : sdiv64(x, y); +} +static inline int8_t sdiv_up_safe8(int8_t x, int8_t y) +{ + return sdiv_safe8(x + y - 1, y); +} +static inline int16_t sdiv_up_safe16(int16_t x, int16_t y) +{ + return sdiv_safe16(x + y - 1, y); +} +static inline int32_t sdiv_up_safe32(int32_t x, int32_t y) +{ + return sdiv_safe32(x + y - 1, y); +} +static inline int64_t sdiv_up_safe64(int64_t x, int64_t y) +{ + return sdiv_safe64(x + y - 1, y); +} +static inline int8_t smod_safe8(int8_t x, int8_t y) +{ + return y == 0 ? 0 : smod8(x, y); +} +static inline int16_t smod_safe16(int16_t x, int16_t y) +{ + return y == 0 ? 0 : smod16(x, y); +} +static inline int32_t smod_safe32(int32_t x, int32_t y) +{ + return y == 0 ? 0 : smod32(x, y); +} +static inline int64_t smod_safe64(int64_t x, int64_t y) +{ + return y == 0 ? 0 : smod64(x, y); +} +static inline int8_t squot8(int8_t x, int8_t y) +{ + return x / y; +} +static inline int16_t squot16(int16_t x, int16_t y) +{ + return x / y; +} +static inline int32_t squot32(int32_t x, int32_t y) +{ + return x / y; +} +static inline int64_t squot64(int64_t x, int64_t y) +{ + return x / y; +} +static inline int8_t srem8(int8_t x, int8_t y) +{ + return x % y; +} +static inline int16_t srem16(int16_t x, int16_t y) +{ + return x % y; +} +static inline int32_t srem32(int32_t x, int32_t y) +{ + return x % y; +} +static inline int64_t srem64(int64_t x, int64_t y) +{ + return x % y; +} +static inline int8_t squot_safe8(int8_t x, int8_t y) +{ + return y == 0 ? 0 : x / y; +} +static inline int16_t squot_safe16(int16_t x, int16_t y) +{ + return y == 0 ? 0 : x / y; +} +static inline int32_t squot_safe32(int32_t x, int32_t y) +{ + return y == 0 ? 0 : x / y; +} +static inline int64_t squot_safe64(int64_t x, int64_t y) +{ + return y == 0 ? 0 : x / y; +} +static inline int8_t srem_safe8(int8_t x, int8_t y) +{ + return y == 0 ? 0 : x % y; +} +static inline int16_t srem_safe16(int16_t x, int16_t y) +{ + return y == 0 ? 
0 : x % y; +} +static inline int32_t srem_safe32(int32_t x, int32_t y) +{ + return y == 0 ? 0 : x % y; +} +static inline int64_t srem_safe64(int64_t x, int64_t y) +{ + return y == 0 ? 0 : x % y; +} +static inline int8_t smin8(int8_t x, int8_t y) +{ + return x < y ? x : y; +} +static inline int16_t smin16(int16_t x, int16_t y) +{ + return x < y ? x : y; +} +static inline int32_t smin32(int32_t x, int32_t y) +{ + return x < y ? x : y; +} +static inline int64_t smin64(int64_t x, int64_t y) +{ + return x < y ? x : y; +} +static inline uint8_t umin8(uint8_t x, uint8_t y) +{ + return x < y ? x : y; +} +static inline uint16_t umin16(uint16_t x, uint16_t y) +{ + return x < y ? x : y; +} +static inline uint32_t umin32(uint32_t x, uint32_t y) +{ + return x < y ? x : y; +} +static inline uint64_t umin64(uint64_t x, uint64_t y) +{ + return x < y ? x : y; +} +static inline int8_t smax8(int8_t x, int8_t y) +{ + return x < y ? y : x; +} +static inline int16_t smax16(int16_t x, int16_t y) +{ + return x < y ? y : x; +} +static inline int32_t smax32(int32_t x, int32_t y) +{ + return x < y ? y : x; +} +static inline int64_t smax64(int64_t x, int64_t y) +{ + return x < y ? y : x; +} +static inline uint8_t umax8(uint8_t x, uint8_t y) +{ + return x < y ? y : x; +} +static inline uint16_t umax16(uint16_t x, uint16_t y) +{ + return x < y ? y : x; +} +static inline uint32_t umax32(uint32_t x, uint32_t y) +{ + return x < y ? y : x; +} +static inline uint64_t umax64(uint64_t x, uint64_t y) +{ + return x < y ? 
y : x; +} +static inline uint8_t shl8(uint8_t x, uint8_t y) +{ + return x << y; +} +static inline uint16_t shl16(uint16_t x, uint16_t y) +{ + return x << y; +} +static inline uint32_t shl32(uint32_t x, uint32_t y) +{ + return x << y; +} +static inline uint64_t shl64(uint64_t x, uint64_t y) +{ + return x << y; +} +static inline uint8_t lshr8(uint8_t x, uint8_t y) +{ + return x >> y; +} +static inline uint16_t lshr16(uint16_t x, uint16_t y) +{ + return x >> y; +} +static inline uint32_t lshr32(uint32_t x, uint32_t y) +{ + return x >> y; +} +static inline uint64_t lshr64(uint64_t x, uint64_t y) +{ + return x >> y; +} +static inline int8_t ashr8(int8_t x, int8_t y) +{ + return x >> y; +} +static inline int16_t ashr16(int16_t x, int16_t y) +{ + return x >> y; +} +static inline int32_t ashr32(int32_t x, int32_t y) +{ + return x >> y; +} +static inline int64_t ashr64(int64_t x, int64_t y) +{ + return x >> y; +} +static inline uint8_t and8(uint8_t x, uint8_t y) +{ + return x & y; +} +static inline uint16_t and16(uint16_t x, uint16_t y) +{ + return x & y; +} +static inline uint32_t and32(uint32_t x, uint32_t y) +{ + return x & y; +} +static inline uint64_t and64(uint64_t x, uint64_t y) +{ + return x & y; +} +static inline uint8_t or8(uint8_t x, uint8_t y) +{ + return x | y; +} +static inline uint16_t or16(uint16_t x, uint16_t y) +{ + return x | y; +} +static inline uint32_t or32(uint32_t x, uint32_t y) +{ + return x | y; +} +static inline uint64_t or64(uint64_t x, uint64_t y) +{ + return x | y; +} +static inline uint8_t xor8(uint8_t x, uint8_t y) +{ + return x ^ y; +} +static inline uint16_t xor16(uint16_t x, uint16_t y) +{ + return x ^ y; +} +static inline uint32_t xor32(uint32_t x, uint32_t y) +{ + return x ^ y; +} +static inline uint64_t xor64(uint64_t x, uint64_t y) +{ + return x ^ y; +} +static inline bool ult8(uint8_t x, uint8_t y) +{ + return x < y; +} +static inline bool ult16(uint16_t x, uint16_t y) +{ + return x < y; +} +static inline bool ult32(uint32_t x, 
uint32_t y) +{ + return x < y; +} +static inline bool ult64(uint64_t x, uint64_t y) +{ + return x < y; +} +static inline bool ule8(uint8_t x, uint8_t y) +{ + return x <= y; +} +static inline bool ule16(uint16_t x, uint16_t y) +{ + return x <= y; +} +static inline bool ule32(uint32_t x, uint32_t y) +{ + return x <= y; +} +static inline bool ule64(uint64_t x, uint64_t y) +{ + return x <= y; +} +static inline bool slt8(int8_t x, int8_t y) +{ + return x < y; +} +static inline bool slt16(int16_t x, int16_t y) +{ + return x < y; +} +static inline bool slt32(int32_t x, int32_t y) +{ + return x < y; +} +static inline bool slt64(int64_t x, int64_t y) +{ + return x < y; +} +static inline bool sle8(int8_t x, int8_t y) +{ + return x <= y; +} +static inline bool sle16(int16_t x, int16_t y) +{ + return x <= y; +} +static inline bool sle32(int32_t x, int32_t y) +{ + return x <= y; +} +static inline bool sle64(int64_t x, int64_t y) +{ + return x <= y; +} +static inline int8_t pow8(int8_t x, int8_t y) +{ + int8_t res = 1, rem = y; + + while (rem != 0) { + if (rem & 1) + res *= x; + rem >>= 1; + x *= x; + } + return res; +} +static inline int16_t pow16(int16_t x, int16_t y) +{ + int16_t res = 1, rem = y; + + while (rem != 0) { + if (rem & 1) + res *= x; + rem >>= 1; + x *= x; + } + return res; +} +static inline int32_t pow32(int32_t x, int32_t y) +{ + int32_t res = 1, rem = y; + + while (rem != 0) { + if (rem & 1) + res *= x; + rem >>= 1; + x *= x; + } + return res; +} +static inline int64_t pow64(int64_t x, int64_t y) +{ + int64_t res = 1, rem = y; + + while (rem != 0) { + if (rem & 1) + res *= x; + rem >>= 1; + x *= x; + } + return res; +} +static inline bool itob_i8_bool(int8_t x) +{ + return x; +} +static inline bool itob_i16_bool(int16_t x) +{ + return x; +} +static inline bool itob_i32_bool(int32_t x) +{ + return x; +} +static inline bool itob_i64_bool(int64_t x) +{ + return x; +} +static inline int8_t btoi_bool_i8(bool x) +{ + return x; +} +static inline int16_t 
btoi_bool_i16(bool x) +{ + return x; +} +static inline int32_t btoi_bool_i32(bool x) +{ + return x; +} +static inline int64_t btoi_bool_i64(bool x) +{ + return x; +} +#define sext_i8_i8(x) ((int8_t) (int8_t) x) +#define sext_i8_i16(x) ((int16_t) (int8_t) x) +#define sext_i8_i32(x) ((int32_t) (int8_t) x) +#define sext_i8_i64(x) ((int64_t) (int8_t) x) +#define sext_i16_i8(x) ((int8_t) (int16_t) x) +#define sext_i16_i16(x) ((int16_t) (int16_t) x) +#define sext_i16_i32(x) ((int32_t) (int16_t) x) +#define sext_i16_i64(x) ((int64_t) (int16_t) x) +#define sext_i32_i8(x) ((int8_t) (int32_t) x) +#define sext_i32_i16(x) ((int16_t) (int32_t) x) +#define sext_i32_i32(x) ((int32_t) (int32_t) x) +#define sext_i32_i64(x) ((int64_t) (int32_t) x) +#define sext_i64_i8(x) ((int8_t) (int64_t) x) +#define sext_i64_i16(x) ((int16_t) (int64_t) x) +#define sext_i64_i32(x) ((int32_t) (int64_t) x) +#define sext_i64_i64(x) ((int64_t) (int64_t) x) +#define zext_i8_i8(x) ((int8_t) (uint8_t) x) +#define zext_i8_i16(x) ((int16_t) (uint8_t) x) +#define zext_i8_i32(x) ((int32_t) (uint8_t) x) +#define zext_i8_i64(x) ((int64_t) (uint8_t) x) +#define zext_i16_i8(x) ((int8_t) (uint16_t) x) +#define zext_i16_i16(x) ((int16_t) (uint16_t) x) +#define zext_i16_i32(x) ((int32_t) (uint16_t) x) +#define zext_i16_i64(x) ((int64_t) (uint16_t) x) +#define zext_i32_i8(x) ((int8_t) (uint32_t) x) +#define zext_i32_i16(x) ((int16_t) (uint32_t) x) +#define zext_i32_i32(x) ((int32_t) (uint32_t) x) +#define zext_i32_i64(x) ((int64_t) (uint32_t) x) +#define zext_i64_i8(x) ((int8_t) (uint64_t) x) +#define zext_i64_i16(x) ((int16_t) (uint64_t) x) +#define zext_i64_i32(x) ((int32_t) (uint64_t) x) +#define zext_i64_i64(x) ((int64_t) (uint64_t) x) +#if defined(__OPENCL_VERSION__) +static int32_t futrts_popc8(int8_t x) +{ + return popcount(x); +} +static int32_t futrts_popc16(int16_t x) +{ + return popcount(x); +} +static int32_t futrts_popc32(int32_t x) +{ + return popcount(x); +} +static int32_t futrts_popc64(int64_t x) +{ 
+ return popcount(x); +} +#elif defined(__CUDA_ARCH__) +static int32_t futrts_popc8(int8_t x) +{ + return __popc(zext_i8_i32(x)); +} +static int32_t futrts_popc16(int16_t x) +{ + return __popc(zext_i16_i32(x)); +} +static int32_t futrts_popc32(int32_t x) +{ + return __popc(x); +} +static int32_t futrts_popc64(int64_t x) +{ + return __popcll(x); +} +#else +static int32_t futrts_popc8(int8_t x) +{ + int c = 0; + + for (; x; ++c) + x &= x - 1; + return c; +} +static int32_t futrts_popc16(int16_t x) +{ + int c = 0; + + for (; x; ++c) + x &= x - 1; + return c; +} +static int32_t futrts_popc32(int32_t x) +{ + int c = 0; + + for (; x; ++c) + x &= x - 1; + return c; +} +static int32_t futrts_popc64(int64_t x) +{ + int c = 0; + + for (; x; ++c) + x &= x - 1; + return c; +} +#endif +#if defined(__OPENCL_VERSION__) +static uint8_t futrts_mul_hi8(uint8_t a, uint8_t b) +{ + return mul_hi(a, b); +} +static uint16_t futrts_mul_hi16(uint16_t a, uint16_t b) +{ + return mul_hi(a, b); +} +static uint32_t futrts_mul_hi32(uint32_t a, uint32_t b) +{ + return mul_hi(a, b); +} +static uint64_t futrts_mul_hi64(uint64_t a, uint64_t b) +{ + return mul_hi(a, b); +} +#elif defined(__CUDA_ARCH__) +static uint8_t futrts_mul_hi8(uint8_t a, uint8_t b) +{ + uint16_t aa = a; + uint16_t bb = b; + + return aa * bb >> 8; +} +static uint16_t futrts_mul_hi16(uint16_t a, uint16_t b) +{ + uint32_t aa = a; + uint32_t bb = b; + + return aa * bb >> 16; +} +static uint32_t futrts_mul_hi32(uint32_t a, uint32_t b) +{ + return mulhi(a, b); +} +static uint64_t futrts_mul_hi64(uint64_t a, uint64_t b) +{ + return mul64hi(a, b); +} +#else +static uint8_t futrts_mul_hi8(uint8_t a, uint8_t b) +{ + uint16_t aa = a; + uint16_t bb = b; + + return aa * bb >> 8; +} +static uint16_t futrts_mul_hi16(uint16_t a, uint16_t b) +{ + uint32_t aa = a; + uint32_t bb = b; + + return aa * bb >> 16; +} +static uint32_t futrts_mul_hi32(uint32_t a, uint32_t b) +{ + uint64_t aa = a; + uint64_t bb = b; + + return aa * bb >> 32; +} +static 
uint64_t futrts_mul_hi64(uint64_t a, uint64_t b) +{ + __uint128_t aa = a; + __uint128_t bb = b; + + return aa * bb >> 64; +} +#endif +#if defined(__OPENCL_VERSION__) +static uint8_t futrts_mad_hi8(uint8_t a, uint8_t b, uint8_t c) +{ + return mad_hi(a, b, c); +} +static uint16_t futrts_mad_hi16(uint16_t a, uint16_t b, uint16_t c) +{ + return mad_hi(a, b, c); +} +static uint32_t futrts_mad_hi32(uint32_t a, uint32_t b, uint32_t c) +{ + return mad_hi(a, b, c); +} +static uint64_t futrts_mad_hi64(uint64_t a, uint64_t b, uint64_t c) +{ + return mad_hi(a, b, c); +} +#else +static uint8_t futrts_mad_hi8(uint8_t a, uint8_t b, uint8_t c) +{ + return futrts_mul_hi8(a, b) + c; +} +static uint16_t futrts_mad_hi16(uint16_t a, uint16_t b, uint16_t c) +{ + return futrts_mul_hi16(a, b) + c; +} +static uint32_t futrts_mad_hi32(uint32_t a, uint32_t b, uint32_t c) +{ + return futrts_mul_hi32(a, b) + c; +} +static uint64_t futrts_mad_hi64(uint64_t a, uint64_t b, uint64_t c) +{ + return futrts_mul_hi64(a, b) + c; +} +#endif +#if defined(__OPENCL_VERSION__) +static int32_t futrts_clzz8(int8_t x) +{ + return clz(x); +} +static int32_t futrts_clzz16(int16_t x) +{ + return clz(x); +} +static int32_t futrts_clzz32(int32_t x) +{ + return clz(x); +} +static int32_t futrts_clzz64(int64_t x) +{ + return clz(x); +} +#elif defined(__CUDA_ARCH__) +static int32_t futrts_clzz8(int8_t x) +{ + return __clz(zext_i8_i32(x)) - 24; +} +static int32_t futrts_clzz16(int16_t x) +{ + return __clz(zext_i16_i32(x)) - 16; +} +static int32_t futrts_clzz32(int32_t x) +{ + return __clz(x); +} +static int32_t futrts_clzz64(int64_t x) +{ + return __clzll(x); +} +#else +static int32_t futrts_clzz8(int8_t x) +{ + int n = 0; + int bits = sizeof(x) * 8; + + for (int i = 0; i < bits; i++) { + if (x < 0) + break; + n++; + x <<= 1; + } + return n; +} +static int32_t futrts_clzz16(int16_t x) +{ + int n = 0; + int bits = sizeof(x) * 8; + + for (int i = 0; i < bits; i++) { + if (x < 0) + break; + n++; + x <<= 1; + } + return n; 
+} +static int32_t futrts_clzz32(int32_t x) +{ + int n = 0; + int bits = sizeof(x) * 8; + + for (int i = 0; i < bits; i++) { + if (x < 0) + break; + n++; + x <<= 1; + } + return n; +} +static int32_t futrts_clzz64(int64_t x) +{ + int n = 0; + int bits = sizeof(x) * 8; + + for (int i = 0; i < bits; i++) { + if (x < 0) + break; + n++; + x <<= 1; + } + return n; +} +#endif +#if defined(__OPENCL_VERSION__) +static int32_t futrts_ctzz8(int8_t x) +{ + int i = 0; + + for (; i < 8 && (x & 1) == 0; i++, x >>= 1) + ; + return i; +} +static int32_t futrts_ctzz16(int16_t x) +{ + int i = 0; + + for (; i < 16 && (x & 1) == 0; i++, x >>= 1) + ; + return i; +} +static int32_t futrts_ctzz32(int32_t x) +{ + int i = 0; + + for (; i < 32 && (x & 1) == 0; i++, x >>= 1) + ; + return i; +} +static int32_t futrts_ctzz64(int64_t x) +{ + int i = 0; + + for (; i < 64 && (x & 1) == 0; i++, x >>= 1) + ; + return i; +} +#elif defined(__CUDA_ARCH__) +static int32_t futrts_ctzz8(int8_t x) +{ + int y = __ffs(x); + + return y == 0 ? 8 : y - 1; +} +static int32_t futrts_ctzz16(int16_t x) +{ + int y = __ffs(x); + + return y == 0 ? 16 : y - 1; +} +static int32_t futrts_ctzz32(int32_t x) +{ + int y = __ffs(x); + + return y == 0 ? 32 : y - 1; +} +static int32_t futrts_ctzz64(int64_t x) +{ + int y = __ffsll(x); + + return y == 0 ? 64 : y - 1; +} +#else +static int32_t futrts_ctzz8(int8_t x) +{ + return x == 0 ? 8 : __builtin_ctz((uint32_t) x); +} +static int32_t futrts_ctzz16(int16_t x) +{ + return x == 0 ? 16 : __builtin_ctz((uint32_t) x); +} +static int32_t futrts_ctzz32(int32_t x) +{ + return x == 0 ? 32 : __builtin_ctz(x); +} +static int32_t futrts_ctzz64(int64_t x) +{ + return x == 0 ? 
64 : __builtin_ctzll(x); +} +#endif +static inline float fdiv32(float x, float y) +{ + return x / y; +} +static inline float fadd32(float x, float y) +{ + return x + y; +} +static inline float fsub32(float x, float y) +{ + return x - y; +} +static inline float fmul32(float x, float y) +{ + return x * y; +} +static inline float fmin32(float x, float y) +{ + return fmin(x, y); +} +static inline float fmax32(float x, float y) +{ + return fmax(x, y); +} +static inline float fpow32(float x, float y) +{ + return pow(x, y); +} +static inline bool cmplt32(float x, float y) +{ + return x < y; +} +static inline bool cmple32(float x, float y) +{ + return x <= y; +} +static inline float sitofp_i8_f32(int8_t x) +{ + return (float) x; +} +static inline float sitofp_i16_f32(int16_t x) +{ + return (float) x; +} +static inline float sitofp_i32_f32(int32_t x) +{ + return (float) x; +} +static inline float sitofp_i64_f32(int64_t x) +{ + return (float) x; +} +static inline float uitofp_i8_f32(uint8_t x) +{ + return (float) x; +} +static inline float uitofp_i16_f32(uint16_t x) +{ + return (float) x; +} +static inline float uitofp_i32_f32(uint32_t x) +{ + return (float) x; +} +static inline float uitofp_i64_f32(uint64_t x) +{ + return (float) x; +} +static inline int8_t fptosi_f32_i8(float x) +{ + return (int8_t) x; +} +static inline int16_t fptosi_f32_i16(float x) +{ + return (int16_t) x; +} +static inline int32_t fptosi_f32_i32(float x) +{ + return (int32_t) x; +} +static inline int64_t fptosi_f32_i64(float x) +{ + return (int64_t) x; +} +static inline uint8_t fptoui_f32_i8(float x) +{ + return (uint8_t) x; +} +static inline uint16_t fptoui_f32_i16(float x) +{ + return (uint16_t) x; +} +static inline uint32_t fptoui_f32_i32(float x) +{ + return (uint32_t) x; +} +static inline uint64_t fptoui_f32_i64(float x) +{ + return (uint64_t) x; +} +static inline double fdiv64(double x, double y) +{ + return x / y; +} +static inline double fadd64(double x, double y) +{ + return x + y; +} +static 
inline double fsub64(double x, double y) +{ + return x - y; +} +static inline double fmul64(double x, double y) +{ + return x * y; +} +static inline double fmin64(double x, double y) +{ + return fmin(x, y); +} +static inline double fmax64(double x, double y) +{ + return fmax(x, y); +} +static inline double fpow64(double x, double y) +{ + return pow(x, y); +} +static inline bool cmplt64(double x, double y) +{ + return x < y; +} +static inline bool cmple64(double x, double y) +{ + return x <= y; +} +static inline double sitofp_i8_f64(int8_t x) +{ + return (double) x; +} +static inline double sitofp_i16_f64(int16_t x) +{ + return (double) x; +} +static inline double sitofp_i32_f64(int32_t x) +{ + return (double) x; +} +static inline double sitofp_i64_f64(int64_t x) +{ + return (double) x; +} +static inline double uitofp_i8_f64(uint8_t x) +{ + return (double) x; +} +static inline double uitofp_i16_f64(uint16_t x) +{ + return (double) x; +} +static inline double uitofp_i32_f64(uint32_t x) +{ + return (double) x; +} +static inline double uitofp_i64_f64(uint64_t x) +{ + return (double) x; +} +static inline int8_t fptosi_f64_i8(double x) +{ + return (int8_t) x; +} +static inline int16_t fptosi_f64_i16(double x) +{ + return (int16_t) x; +} +static inline int32_t fptosi_f64_i32(double x) +{ + return (int32_t) x; +} +static inline int64_t fptosi_f64_i64(double x) +{ + return (int64_t) x; +} +static inline uint8_t fptoui_f64_i8(double x) +{ + return (uint8_t) x; +} +static inline uint16_t fptoui_f64_i16(double x) +{ + return (uint16_t) x; +} +static inline uint32_t fptoui_f64_i32(double x) +{ + return (uint32_t) x; +} +static inline uint64_t fptoui_f64_i64(double x) +{ + return (uint64_t) x; +} +static inline float fpconv_f32_f32(float x) +{ + return (float) x; +} +static inline double fpconv_f32_f64(float x) +{ + return (double) x; +} +static inline float fpconv_f64_f32(double x) +{ + return (float) x; +} +static inline double fpconv_f64_f64(double x) +{ + return (double) x; 
+} +static inline bool futrts_isnan32(float x) +{ + return isnan(x); +} +static inline bool futrts_isinf32(float x) +{ + return isinf(x); +} +#ifdef __OPENCL_VERSION__ +static inline float futrts_log32(float x) +{ + return log(x); +} +static inline float futrts_log2_32(float x) +{ + return log2(x); +} +static inline float futrts_log10_32(float x) +{ + return log10(x); +} +static inline float futrts_sqrt32(float x) +{ + return sqrt(x); +} +static inline float futrts_exp32(float x) +{ + return exp(x); +} +static inline float futrts_cos32(float x) +{ + return cos(x); +} +static inline float futrts_sin32(float x) +{ + return sin(x); +} +static inline float futrts_tan32(float x) +{ + return tan(x); +} +static inline float futrts_acos32(float x) +{ + return acos(x); +} +static inline float futrts_asin32(float x) +{ + return asin(x); +} +static inline float futrts_atan32(float x) +{ + return atan(x); +} +static inline float futrts_cosh32(float x) +{ + return cosh(x); +} +static inline float futrts_sinh32(float x) +{ + return sinh(x); +} +static inline float futrts_tanh32(float x) +{ + return tanh(x); +} +static inline float futrts_acosh32(float x) +{ + return acosh(x); +} +static inline float futrts_asinh32(float x) +{ + return asinh(x); +} +static inline float futrts_atanh32(float x) +{ + return atanh(x); +} +static inline float futrts_atan2_32(float x, float y) +{ + return atan2(x, y); +} +static inline float futrts_hypot32(float x, float y) +{ + return hypot(x, y); +} +static inline float futrts_gamma32(float x) +{ + return tgamma(x); +} +static inline float futrts_lgamma32(float x) +{ + return lgamma(x); +} +static inline float fmod32(float x, float y) +{ + return fmod(x, y); +} +static inline float futrts_round32(float x) +{ + return rint(x); +} +static inline float futrts_floor32(float x) +{ + return floor(x); +} +static inline float futrts_ceil32(float x) +{ + return ceil(x); +} +static inline float futrts_lerp32(float v0, float v1, float t) +{ + return mix(v0, v1, 
t); +} +static inline float futrts_mad32(float a, float b, float c) +{ + return mad(a, b, c); +} +static inline float futrts_fma32(float a, float b, float c) +{ + return fma(a, b, c); +} +#else +static inline float futrts_log32(float x) +{ + return logf(x); +} +static inline float futrts_log2_32(float x) +{ + return log2f(x); +} +static inline float futrts_log10_32(float x) +{ + return log10f(x); +} +static inline float futrts_sqrt32(float x) +{ + return sqrtf(x); +} +static inline float futrts_exp32(float x) +{ + return expf(x); +} +static inline float futrts_cos32(float x) +{ + return cosf(x); +} +static inline float futrts_sin32(float x) +{ + return sinf(x); +} +static inline float futrts_tan32(float x) +{ + return tanf(x); +} +static inline float futrts_acos32(float x) +{ + return acosf(x); +} +static inline float futrts_asin32(float x) +{ + return asinf(x); +} +static inline float futrts_atan32(float x) +{ + return atanf(x); +} +static inline float futrts_cosh32(float x) +{ + return coshf(x); +} +static inline float futrts_sinh32(float x) +{ + return sinhf(x); +} +static inline float futrts_tanh32(float x) +{ + return tanhf(x); +} +static inline float futrts_acosh32(float x) +{ + return acoshf(x); +} +static inline float futrts_asinh32(float x) +{ + return asinhf(x); +} +static inline float futrts_atanh32(float x) +{ + return atanhf(x); +} +static inline float futrts_atan2_32(float x, float y) +{ + return atan2f(x, y); +} +static inline float futrts_hypot32(float x, float y) +{ + return hypotf(x, y); +} +static inline float futrts_gamma32(float x) +{ + return tgammaf(x); +} +static inline float futrts_lgamma32(float x) +{ + return lgammaf(x); +} +static inline float fmod32(float x, float y) +{ + return fmodf(x, y); +} +static inline float futrts_round32(float x) +{ + return rintf(x); +} +static inline float futrts_floor32(float x) +{ + return floorf(x); +} +static inline float futrts_ceil32(float x) +{ + return ceilf(x); +} +static inline float 
futrts_lerp32(float v0, float v1, float t) +{ + return v0 + (v1 - v0) * t; +} +static inline float futrts_mad32(float a, float b, float c) +{ + return a * b + c; +} +static inline float futrts_fma32(float a, float b, float c) +{ + return fmaf(a, b, c); +} +#endif +static inline int32_t futrts_to_bits32(float x) +{ + union { + float f; + int32_t t; + } p; + + p.f = x; + return p.t; +} +static inline float futrts_from_bits32(int32_t x) +{ + union { + int32_t f; + float t; + } p; + + p.f = x; + return p.t; +} +static inline float fsignum32(float x) +{ + return futrts_isnan32(x) ? x : (x > 0) - (x < 0); +} +static inline double futrts_log64(double x) +{ + return log(x); +} +static inline double futrts_log2_64(double x) +{ + return log2(x); +} +static inline double futrts_log10_64(double x) +{ + return log10(x); +} +static inline double futrts_sqrt64(double x) +{ + return sqrt(x); +} +static inline double futrts_exp64(double x) +{ + return exp(x); +} +static inline double futrts_cos64(double x) +{ + return cos(x); +} +static inline double futrts_sin64(double x) +{ + return sin(x); +} +static inline double futrts_tan64(double x) +{ + return tan(x); +} +static inline double futrts_acos64(double x) +{ + return acos(x); +} +static inline double futrts_asin64(double x) +{ + return asin(x); +} +static inline double futrts_atan64(double x) +{ + return atan(x); +} +static inline double futrts_cosh64(double x) +{ + return cosh(x); +} +static inline double futrts_sinh64(double x) +{ + return sinh(x); +} +static inline double futrts_tanh64(double x) +{ + return tanh(x); +} +static inline double futrts_acosh64(double x) +{ + return acosh(x); +} +static inline double futrts_asinh64(double x) +{ + return asinh(x); +} +static inline double futrts_atanh64(double x) +{ + return atanh(x); +} +static inline double futrts_atan2_64(double x, double y) +{ + return atan2(x, y); +} +static inline double futrts_hypot64(double x, double y) +{ + return hypot(x, y); +} +static inline double 
futrts_gamma64(double x) +{ + return tgamma(x); +} +static inline double futrts_lgamma64(double x) +{ + return lgamma(x); +} +static inline double futrts_fma64(double a, double b, double c) +{ + return fma(a, b, c); +} +static inline double futrts_round64(double x) +{ + return rint(x); +} +static inline double futrts_ceil64(double x) +{ + return ceil(x); +} +static inline double futrts_floor64(double x) +{ + return floor(x); +} +static inline bool futrts_isnan64(double x) +{ + return isnan(x); +} +static inline bool futrts_isinf64(double x) +{ + return isinf(x); +} +static inline int64_t futrts_to_bits64(double x) +{ + union { + double f; + int64_t t; + } p; + + p.f = x; + return p.t; +} +static inline double futrts_from_bits64(int64_t x) +{ + union { + int64_t f; + double t; + } p; + + p.f = x; + return p.t; +} +static inline double fmod64(double x, double y) +{ + return fmod(x, y); +} +static inline double fsignum64(double x) +{ + return futrts_isnan64(x) ? x : (x > 0) - (x < 0); +} +#ifdef __OPENCL_VERSION__ +static inline double futrts_lerp64(double v0, double v1, double t) +{ + return mix(v0, v1, t); +} +static inline double futrts_mad64(double a, double b, double c) +{ + return mad(a, b, c); +} +#else +static inline double futrts_lerp64(double v0, double v1, double t) +{ + return v0 + (v1 - v0) * t; +} +static inline double futrts_mad64(double a, double b, double c) +{ + return a * b + c; +} +#endif +static int init_constants(struct futhark_context *); +static int free_constants(struct futhark_context *); +struct memblock_device { + int *references; + cl_mem mem; + int64_t size; + const char *desc; +} ; +struct memblock { + int *references; + char *mem; + int64_t size; + const char *desc; +} ; +typedef cl_mem fl_mem_t; +// Start of free_list.h. + +// An entry in the free list. May be invalid, to avoid having to +// deallocate entries as soon as they are removed. There is also a +// tag, to help with memory reuse. 
+struct free_list_entry { + size_t size; + fl_mem_t mem; + const char *tag; + unsigned char valid; +}; + +struct free_list { + struct free_list_entry *entries; // Pointer to entries. + int capacity; // Number of entries. + int used; // Number of valid entries. +}; + +static void free_list_init(struct free_list *l) { + l->capacity = 30; // Picked arbitrarily. + l->used = 0; + l->entries = (struct free_list_entry*) malloc(sizeof(struct free_list_entry) * l->capacity); + for (int i = 0; i < l->capacity; i++) { + l->entries[i].valid = 0; + } +} + +// Remove invalid entries from the free list. +static void free_list_pack(struct free_list *l) { + int p = 0; + for (int i = 0; i < l->capacity; i++) { + if (l->entries[i].valid) { + l->entries[p] = l->entries[i]; + if (i > p) { + l->entries[i].valid = 0; + } + p++; + } + } + + // Now p is the number of used elements. We don't want it to go + // less than the default capacity (although in practice it's OK as + // long as it doesn't become 1). + if (p < 30) { + p = 30; + } + l->entries = realloc(l->entries, p * sizeof(struct free_list_entry)); + l->capacity = p; +} + +static void free_list_destroy(struct free_list *l) { + assert(l->used == 0); + free(l->entries); +} + +static int free_list_find_invalid(struct free_list *l) { + int i; + for (i = 0; i < l->capacity; i++) { + if (!l->entries[i].valid) { + break; + } + } + return i; +} + +static void free_list_insert(struct free_list *l, size_t size, fl_mem_t mem, const char *tag) { + int i = free_list_find_invalid(l); + + if (i == l->capacity) { + // List is full; so we have to grow it. + int new_capacity = l->capacity * 2 * sizeof(struct free_list_entry); + l->entries = realloc(l->entries, new_capacity); + for (int j = 0; j < l->capacity; j++) { + l->entries[j+l->capacity].valid = 0; + } + l->capacity *= 2; + } + + // Now 'i' points to the first invalid entry. 
+ l->entries[i].valid = 1; + l->entries[i].size = size; + l->entries[i].mem = mem; + l->entries[i].tag = tag; + + l->used++; +} + +// Find and remove a memory block of the indicated tag, or if that +// does not exist, another memory block with exactly the desired size. +// Returns 0 on success. +static int free_list_find(struct free_list *l, size_t size, + size_t *size_out, fl_mem_t *mem_out) { + int size_match = -1; + int i; + for (i = 0; i < l->capacity; i++) { + if (l->entries[i].valid && + size <= l->entries[i].size && + (size_match < 0 || l->entries[i].size < l->entries[size_match].size)) { + // If this entry is valid, has sufficient size, and is smaller than the + // best entry found so far, use this entry. + size_match = i; + } + } + + if (size_match >= 0) { + l->entries[size_match].valid = 0; + *size_out = l->entries[size_match].size; + *mem_out = l->entries[size_match].mem; + l->used--; + return 0; + } else { + return 1; + } +} + +// Remove the first block in the free list. Returns 0 if a block was +// removed, and nonzero if the free list was already empty. +static int free_list_first(struct free_list *l, fl_mem_t *mem_out) { + for (int i = 0; i < l->capacity; i++) { + if (l->entries[i].valid) { + l->entries[i].valid = 0; + *mem_out = l->entries[i].mem; + l->used--; + return 0; + } + } + + return 1; +} + +// End of free_list.h. + +// Start of opencl.h. + +#define OPENCL_SUCCEED_FATAL(e) opencl_succeed_fatal(e, #e, __FILE__, __LINE__) +#define OPENCL_SUCCEED_NONFATAL(e) opencl_succeed_nonfatal(e, #e, __FILE__, __LINE__) +// Take care not to override an existing error. +#define OPENCL_SUCCEED_OR_RETURN(e) { \ + char *serror = OPENCL_SUCCEED_NONFATAL(e); \ + if (serror) { \ + if (!ctx->error) { \ + ctx->error = serror; \ + return bad; \ + } else { \ + free(serror); \ + } \ + } \ + } + +// OPENCL_SUCCEED_OR_RETURN returns the value of the variable 'bad' in +// scope. By default, it will be this one. Create a local variable +// of some other type if needed. 
This is a bit of a hack, but it +// saves effort in the code generator. +static const int bad = 1; + +struct opencl_config { + int debugging; + int profiling; + int logging; + int preferred_device_num; + const char *preferred_platform; + const char *preferred_device; + int ignore_blacklist; + + const char* dump_program_to; + const char* load_program_from; + const char* dump_binary_to; + const char* load_binary_from; + + size_t default_group_size; + size_t default_num_groups; + size_t default_tile_size; + size_t default_reg_tile_size; + size_t default_threshold; + + int default_group_size_changed; + int default_tile_size_changed; + + int num_sizes; + const char **size_names; + const char **size_vars; + int64_t *size_values; + const char **size_classes; +}; + +static void opencl_config_init(struct opencl_config *cfg, + int num_sizes, + const char *size_names[], + const char *size_vars[], + int64_t *size_values, + const char *size_classes[]) { + cfg->debugging = 0; + cfg->logging = 0; + cfg->profiling = 0; + cfg->preferred_device_num = 0; + cfg->preferred_platform = ""; + cfg->preferred_device = ""; + cfg->ignore_blacklist = 0; + cfg->dump_program_to = NULL; + cfg->load_program_from = NULL; + cfg->dump_binary_to = NULL; + cfg->load_binary_from = NULL; + + // The following are dummy sizes that mean the concrete defaults + // will be set during initialisation via hardware-inspection-based + // heuristics. + cfg->default_group_size = 0; + cfg->default_num_groups = 0; + cfg->default_tile_size = 0; + cfg->default_reg_tile_size = 0; + cfg->default_threshold = 0; + + cfg->default_group_size_changed = 0; + cfg->default_tile_size_changed = 0; + + cfg->num_sizes = num_sizes; + cfg->size_names = size_names; + cfg->size_vars = size_vars; + cfg->size_values = size_values; + cfg->size_classes = size_classes; +} + +// A record of something that happened. 
+struct profiling_record { + cl_event *event; + int *runs; + int64_t *runtime; +}; + +struct opencl_context { + cl_device_id device; + cl_context ctx; + cl_command_queue queue; + + struct opencl_config cfg; + + struct free_list free_list; + + size_t max_group_size; + size_t max_num_groups; + size_t max_tile_size; + size_t max_threshold; + size_t max_local_memory; + + size_t lockstep_width; + + struct profiling_record *profiling_records; + int profiling_records_capacity; + int profiling_records_used; +}; + +struct opencl_device_option { + cl_platform_id platform; + cl_device_id device; + cl_device_type device_type; + char *platform_name; + char *device_name; +}; + +// This function must be defined by the user. It is invoked by +// setup_opencl() after the platform and device has been found, but +// before the program is loaded. Its intended use is to tune +// constants based on the selected platform and device. +static void post_opencl_setup(struct opencl_context*, struct opencl_device_option*); + +static char *strclone(const char *str) { + size_t size = strlen(str) + 1; + char *copy = (char*) malloc(size); + if (copy == NULL) { + return NULL; + } + + memcpy(copy, str, size); + return copy; +} + +static const char* opencl_error_string(cl_int err) +{ + switch (err) { + case CL_SUCCESS: return "Success!"; + case CL_DEVICE_NOT_FOUND: return "Device not found."; + case CL_DEVICE_NOT_AVAILABLE: return "Device not available"; + case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure"; + case CL_OUT_OF_RESOURCES: return "Out of resources"; + case CL_OUT_OF_HOST_MEMORY: return "Out of host memory"; + case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available"; + case CL_MEM_COPY_OVERLAP: return "Memory copy overlap"; + case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported"; + case 
CL_BUILD_PROGRAM_FAILURE: return "Program build failure"; + case CL_MAP_FAILURE: return "Map failure"; + case CL_INVALID_VALUE: return "Invalid value"; + case CL_INVALID_DEVICE_TYPE: return "Invalid device type"; + case CL_INVALID_PLATFORM: return "Invalid platform"; + case CL_INVALID_DEVICE: return "Invalid device"; + case CL_INVALID_CONTEXT: return "Invalid context"; + case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties"; + case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue"; + case CL_INVALID_HOST_PTR: return "Invalid host pointer"; + case CL_INVALID_MEM_OBJECT: return "Invalid memory object"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor"; + case CL_INVALID_IMAGE_SIZE: return "Invalid image size"; + case CL_INVALID_SAMPLER: return "Invalid sampler"; + case CL_INVALID_BINARY: return "Invalid binary"; + case CL_INVALID_BUILD_OPTIONS: return "Invalid build options"; + case CL_INVALID_PROGRAM: return "Invalid program"; + case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable"; + case CL_INVALID_KERNEL_NAME: return "Invalid kernel name"; + case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition"; + case CL_INVALID_KERNEL: return "Invalid kernel"; + case CL_INVALID_ARG_INDEX: return "Invalid argument index"; + case CL_INVALID_ARG_VALUE: return "Invalid argument value"; + case CL_INVALID_ARG_SIZE: return "Invalid argument size"; + case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments"; + case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension"; + case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size"; + case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size"; + case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset"; + case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list"; + case CL_INVALID_EVENT: return "Invalid event"; + case CL_INVALID_OPERATION: return "Invalid operation"; + case CL_INVALID_GL_OBJECT: return "Invalid OpenGL 
object"; + case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size"; + case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level"; + default: return "Unknown"; + } +} + +static void opencl_succeed_fatal(unsigned int ret, + const char *call, + const char *file, + int line) { + if (ret != CL_SUCCESS) { + futhark_panic(-1, "%s:%d: OpenCL call\n %s\nfailed with error code %d (%s)\n", + file, line, call, ret, opencl_error_string(ret)); + } +} + +static char* opencl_succeed_nonfatal(unsigned int ret, + const char *call, + const char *file, + int line) { + if (ret != CL_SUCCESS) { + return msgprintf("%s:%d: OpenCL call\n %s\nfailed with error code %d (%s)\n", + file, line, call, ret, opencl_error_string(ret)); + } else { + return NULL; + } +} + +static void set_preferred_platform(struct opencl_config *cfg, const char *s) { + cfg->preferred_platform = s; + cfg->ignore_blacklist = 1; +} + +static void set_preferred_device(struct opencl_config *cfg, const char *s) { + int x = 0; + if (*s == '#') { + s++; + while (isdigit(*s)) { + x = x * 10 + (*s++)-'0'; + } + // Skip trailing spaces. 
+ while (isspace(*s)) { + s++; + } + } + cfg->preferred_device = s; + cfg->preferred_device_num = x; + cfg->ignore_blacklist = 1; +} + +static char* opencl_platform_info(cl_platform_id platform, + cl_platform_info param) { + size_t req_bytes; + char *info; + + OPENCL_SUCCEED_FATAL(clGetPlatformInfo(platform, param, 0, NULL, &req_bytes)); + + info = (char*) malloc(req_bytes); + + OPENCL_SUCCEED_FATAL(clGetPlatformInfo(platform, param, req_bytes, info, NULL)); + + return info; +} + +static char* opencl_device_info(cl_device_id device, + cl_device_info param) { + size_t req_bytes; + char *info; + + OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device, param, 0, NULL, &req_bytes)); + + info = (char*) malloc(req_bytes); + + OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device, param, req_bytes, info, NULL)); + + return info; +} + +static void opencl_all_device_options(struct opencl_device_option **devices_out, + size_t *num_devices_out) { + size_t num_devices = 0, num_devices_added = 0; + + cl_platform_id *all_platforms; + cl_uint *platform_num_devices; + + cl_uint num_platforms; + + // Find the number of platforms. + OPENCL_SUCCEED_FATAL(clGetPlatformIDs(0, NULL, &num_platforms)); + + // Make room for them. + all_platforms = calloc(num_platforms, sizeof(cl_platform_id)); + platform_num_devices = calloc(num_platforms, sizeof(cl_uint)); + + // Fetch all the platforms. + OPENCL_SUCCEED_FATAL(clGetPlatformIDs(num_platforms, all_platforms, NULL)); + + // Count the number of devices for each platform, as well as the + // total number of devices. + for (cl_uint i = 0; i < num_platforms; i++) { + if (clGetDeviceIDs(all_platforms[i], CL_DEVICE_TYPE_ALL, + 0, NULL, &platform_num_devices[i]) == CL_SUCCESS) { + num_devices += platform_num_devices[i]; + } else { + platform_num_devices[i] = 0; + } + } + + // Make room for all the device options. 
+ struct opencl_device_option *devices = + calloc(num_devices, sizeof(struct opencl_device_option)); + + // Loop through the platforms, getting information about their devices. + for (cl_uint i = 0; i < num_platforms; i++) { + cl_platform_id platform = all_platforms[i]; + cl_uint num_platform_devices = platform_num_devices[i]; + + if (num_platform_devices == 0) { + continue; + } + + char *platform_name = opencl_platform_info(platform, CL_PLATFORM_NAME); + cl_device_id *platform_devices = + calloc(num_platform_devices, sizeof(cl_device_id)); + + // Fetch all the devices. + OPENCL_SUCCEED_FATAL(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, + num_platform_devices, platform_devices, NULL)); + + // Loop through the devices, adding them to the devices array. + for (cl_uint i = 0; i < num_platform_devices; i++) { + char *device_name = opencl_device_info(platform_devices[i], CL_DEVICE_NAME); + devices[num_devices_added].platform = platform; + devices[num_devices_added].device = platform_devices[i]; + OPENCL_SUCCEED_FATAL(clGetDeviceInfo(platform_devices[i], CL_DEVICE_TYPE, + sizeof(cl_device_type), + &devices[num_devices_added].device_type, + NULL)); + // We don't want the structs to share memory, so copy the platform name. + // Each device name is already unique. + devices[num_devices_added].platform_name = strclone(platform_name); + devices[num_devices_added].device_name = device_name; + num_devices_added++; + } + free(platform_devices); + free(platform_name); + } + free(all_platforms); + free(platform_num_devices); + + *devices_out = devices; + *num_devices_out = num_devices; +} + +// Returns 0 on success. 
+static int list_devices(void) { + struct opencl_device_option *devices; + size_t num_devices; + + opencl_all_device_options(&devices, &num_devices); + + const char *cur_platform = ""; + for (size_t i = 0; i < num_devices; i++) { + struct opencl_device_option device = devices[i]; + if (strcmp(cur_platform, device.platform_name) != 0) { + printf("Platform: %s\n", device.platform_name); + cur_platform = device.platform_name; + } + printf("[%d]: %s\n", (int)i, device.device_name); + } + + // Free all the platform and device names. + for (size_t j = 0; j < num_devices; j++) { + free(devices[j].platform_name); + free(devices[j].device_name); + } + free(devices); + + return 0; +} + +// Returns 0 on success. +static int select_device_interactively(struct opencl_config *cfg) { + struct opencl_device_option *devices; + size_t num_devices; + int ret = 1; + + opencl_all_device_options(&devices, &num_devices); + + printf("Choose OpenCL device:\n"); + const char *cur_platform = ""; + for (size_t i = 0; i < num_devices; i++) { + struct opencl_device_option device = devices[i]; + if (strcmp(cur_platform, device.platform_name) != 0) { + printf("Platform: %s\n", device.platform_name); + cur_platform = device.platform_name; + } + printf("[%d] %s\n", (int)i, device.device_name); + } + + int selection; + printf("Choice: "); + if (scanf("%d", &selection) == 1) { + ret = 0; + cfg->preferred_platform = ""; + cfg->preferred_device = ""; + cfg->preferred_device_num = selection; + cfg->ignore_blacklist = 1; + } + + // Free all the platform and device names. 
+ for (size_t j = 0; j < num_devices; j++) { + free(devices[j].platform_name); + free(devices[j].device_name); + } + free(devices); + + return ret; +} + +static int is_blacklisted(const char *platform_name, const char *device_name, + const struct opencl_config *cfg) { + if (strcmp(cfg->preferred_platform, "") != 0 || + strcmp(cfg->preferred_device, "") != 0) { + return 0; + } else if (strstr(platform_name, "Apple") != NULL && + strstr(device_name, "Intel(R) Core(TM)") != NULL) { + return 1; + } else { + return 0; + } +} + +static struct opencl_device_option get_preferred_device(const struct opencl_config *cfg) { + struct opencl_device_option *devices; + size_t num_devices; + + opencl_all_device_options(&devices, &num_devices); + + int num_device_matches = 0; + + for (size_t i = 0; i < num_devices; i++) { + struct opencl_device_option device = devices[i]; + if (strstr(device.platform_name, cfg->preferred_platform) != NULL && + strstr(device.device_name, cfg->preferred_device) != NULL && + (cfg->ignore_blacklist || + !is_blacklisted(device.platform_name, device.device_name, cfg)) && + num_device_matches++ == cfg->preferred_device_num) { + // Free all the platform and device names, except the ones we have chosen. 
+ for (size_t j = 0; j < num_devices; j++) { + if (j != i) { + free(devices[j].platform_name); + free(devices[j].device_name); + } + } + free(devices); + return device; + } + } + + futhark_panic(1, "Could not find acceptable OpenCL device.\n"); + exit(1); // Never reached +} + +static void describe_device_option(struct opencl_device_option device) { + fprintf(stderr, "Using platform: %s\n", device.platform_name); + fprintf(stderr, "Using device: %s\n", device.device_name); +} + +static cl_build_status build_opencl_program(cl_program program, cl_device_id device, const char* options) { + cl_int clBuildProgram_error = clBuildProgram(program, 1, &device, options, NULL, NULL); + + // Avoid termination due to CL_BUILD_PROGRAM_FAILURE + if (clBuildProgram_error != CL_SUCCESS && + clBuildProgram_error != CL_BUILD_PROGRAM_FAILURE) { + OPENCL_SUCCEED_FATAL(clBuildProgram_error); + } + + cl_build_status build_status; + OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, + device, + CL_PROGRAM_BUILD_STATUS, + sizeof(cl_build_status), + &build_status, + NULL)); + + if (build_status != CL_SUCCESS) { + char *build_log; + size_t ret_val_size; + OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size)); + + build_log = (char*) malloc(ret_val_size+1); + OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL)); + + // The spec technically does not say whether the build log is zero-terminated, so let's be careful. + build_log[ret_val_size] = '\0'; + + fprintf(stderr, "Build log:\n%s\n", build_log); + + free(build_log); + } + + return build_status; +} + +// Fields in a bitmask indicating which types we must be sure are +// available. +enum opencl_required_type { OPENCL_F64 = 1 }; + +// We take as input several strings representing the program, because +// C does not guarantee that the compiler supports particularly large +// literals. 
Notably, Visual C has a limit of 2048 characters. The +// array must be NULL-terminated. +static cl_program setup_opencl_with_command_queue(struct opencl_context *ctx, + cl_command_queue queue, + const char *srcs[], + int required_types, + const char *extra_build_opts[]) { + int error; + + free_list_init(&ctx->free_list); + ctx->queue = queue; + + OPENCL_SUCCEED_FATAL(clGetCommandQueueInfo(ctx->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx->ctx, NULL)); + + // Fill out the device info. This is redundant work if we are + // called from setup_opencl() (which is the common case), but I + // doubt it matters much. + struct opencl_device_option device_option; + OPENCL_SUCCEED_FATAL(clGetCommandQueueInfo(ctx->queue, CL_QUEUE_DEVICE, + sizeof(cl_device_id), + &device_option.device, + NULL)); + OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_PLATFORM, + sizeof(cl_platform_id), + &device_option.platform, + NULL)); + OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_TYPE, + sizeof(cl_device_type), + &device_option.device_type, + NULL)); + device_option.platform_name = opencl_platform_info(device_option.platform, CL_PLATFORM_NAME); + device_option.device_name = opencl_device_info(device_option.device, CL_DEVICE_NAME); + + ctx->device = device_option.device; + + if (required_types & OPENCL_F64) { + cl_uint supported; + OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, + sizeof(cl_uint), &supported, NULL)); + if (!supported) { + futhark_panic(1, "Program uses double-precision floats, but this is not supported on the chosen device: %s\n", + device_option.device_name); + } + } + + size_t max_group_size; + OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(size_t), &max_group_size, NULL)); + + size_t max_tile_size = sqrt(max_group_size); + + cl_ulong max_local_memory; + OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, 
CL_DEVICE_LOCAL_MEM_SIZE, + sizeof(size_t), &max_local_memory, NULL)); + + // Futhark reserves 4 bytes for bookkeeping information. + max_local_memory -= 4; + + // The OpenCL implementation may reserve some local memory bytes for + // various purposes. In principle, we should use + // clGetKernelWorkGroupInfo() to figure out for each kernel how much + // is actually available, but our current code generator design + // makes this infeasible. Instead, we have this nasty hack where we + // arbitrarily subtract some bytes, based on empirical measurements + // (but which might be arbitrarily wrong). Fortunately, we rarely + // try to really push the local memory usage. + if (strstr(device_option.platform_name, "NVIDIA CUDA") != NULL) { + max_local_memory -= 12; + } else if (strstr(device_option.platform_name, "AMD") != NULL) { + max_local_memory -= 16; + } + + // Make sure this function is defined. + post_opencl_setup(ctx, &device_option); + + if (max_group_size < ctx->cfg.default_group_size) { + if (ctx->cfg.default_group_size_changed) { + fprintf(stderr, "Note: Device limits default group size to %zu (down from %zu).\n", + max_group_size, ctx->cfg.default_group_size); + } + ctx->cfg.default_group_size = max_group_size; + } + + if (max_tile_size < ctx->cfg.default_tile_size) { + if (ctx->cfg.default_tile_size_changed) { + fprintf(stderr, "Note: Device limits default tile size to %zu (down from %zu).\n", + max_tile_size, ctx->cfg.default_tile_size); + } + ctx->cfg.default_tile_size = max_tile_size; + } + + ctx->max_group_size = max_group_size; + ctx->max_tile_size = max_tile_size; // No limit. + ctx->max_threshold = ctx->max_num_groups = 0; // No limit. + ctx->max_local_memory = max_local_memory; + + // Now we go through all the sizes, clamp them to the valid range, + // or set them to the default. 
+ for (int i = 0; i < ctx->cfg.num_sizes; i++) { + const char *size_class = ctx->cfg.size_classes[i]; + int64_t *size_value = &ctx->cfg.size_values[i]; + const char* size_name = ctx->cfg.size_names[i]; + int64_t max_value = 0, default_value = 0; + + if (strstr(size_class, "group_size") == size_class) { + max_value = max_group_size; + default_value = ctx->cfg.default_group_size; + } else if (strstr(size_class, "num_groups") == size_class) { + max_value = max_group_size; // Futhark assumes this constraint. + default_value = ctx->cfg.default_num_groups; + // XXX: as a quick and dirty hack, use twice as many threads for + // histograms by default. We really should just be smarter + // about sizes somehow. + if (strstr(size_name, ".seghist_") != NULL) { + default_value *= 2; + } + } else if (strstr(size_class, "tile_size") == size_class) { + max_value = sqrt(max_group_size); + default_value = ctx->cfg.default_tile_size; + } else if (strstr(size_class, "reg_tile_size") == size_class) { + max_value = 0; // No limit. + default_value = ctx->cfg.default_reg_tile_size; + } else if (strstr(size_class, "threshold") == size_class) { + // Threshold can be as large as it takes. + default_value = ctx->cfg.default_threshold; + } else { + // Bespoke sizes have no limit or default. 
+ } + if (*size_value == 0) { + *size_value = default_value; + } else if (max_value > 0 && *size_value > max_value) { + fprintf(stderr, "Note: Device limits %s to %d (down from %d)\n", + size_name, (int)max_value, (int)*size_value); + *size_value = max_value; + } + } + + if (ctx->lockstep_width == 0) { + ctx->lockstep_width = 1; + } + + if (ctx->cfg.logging) { + fprintf(stderr, "Lockstep width: %d\n", (int)ctx->lockstep_width); + fprintf(stderr, "Default group size: %d\n", (int)ctx->cfg.default_group_size); + fprintf(stderr, "Default number of groups: %d\n", (int)ctx->cfg.default_num_groups); + } + + char *fut_opencl_src = NULL; + cl_program prog; + error = CL_SUCCESS; + + if (ctx->cfg.load_binary_from == NULL) { + size_t src_size = 0; + + // Maybe we have to read OpenCL source from somewhere else (used for debugging). + if (ctx->cfg.load_program_from != NULL) { + fut_opencl_src = slurp_file(ctx->cfg.load_program_from, NULL); + assert(fut_opencl_src != NULL); + } else { + // Construct the OpenCL source concatenating all the fragments. 
+ for (const char **src = srcs; src && *src; src++) { + src_size += strlen(*src); + } + + fut_opencl_src = (char*) malloc(src_size + 1); + + size_t n, i; + for (i = 0, n = 0; srcs && srcs[i]; i++) { + strncpy(fut_opencl_src+n, srcs[i], src_size-n); + n += strlen(srcs[i]); + } + fut_opencl_src[src_size] = 0; + } + + if (ctx->cfg.dump_program_to != NULL) { + if (ctx->cfg.debugging) { + fprintf(stderr, "Dumping OpenCL source to %s...\n", ctx->cfg.dump_program_to); + } + + dump_file(ctx->cfg.dump_program_to, fut_opencl_src, strlen(fut_opencl_src)); + } + + if (ctx->cfg.debugging) { + fprintf(stderr, "Creating OpenCL program...\n"); + } + + const char* src_ptr[] = {fut_opencl_src}; + prog = clCreateProgramWithSource(ctx->ctx, 1, src_ptr, &src_size, &error); + OPENCL_SUCCEED_FATAL(error); + } else { + if (ctx->cfg.debugging) { + fprintf(stderr, "Loading OpenCL binary from %s...\n", ctx->cfg.load_binary_from); + } + size_t binary_size; + unsigned char *fut_opencl_bin = + (unsigned char*) slurp_file(ctx->cfg.load_binary_from, &binary_size); + assert(fut_opencl_bin != NULL); + const unsigned char *binaries[1] = { fut_opencl_bin }; + cl_int status = 0; + + prog = clCreateProgramWithBinary(ctx->ctx, 1, &device_option.device, + &binary_size, binaries, + &status, &error); + + OPENCL_SUCCEED_FATAL(status); + OPENCL_SUCCEED_FATAL(error); + } + + int compile_opts_size = 1024; + + for (int i = 0; i < ctx->cfg.num_sizes; i++) { + compile_opts_size += strlen(ctx->cfg.size_names[i]) + 20; + } + + for (int i = 0; extra_build_opts[i] != NULL; i++) { + compile_opts_size += strlen(extra_build_opts[i] + 1); + } + + char *compile_opts = (char*) malloc(compile_opts_size); + + int w = snprintf(compile_opts, compile_opts_size, + "-DLOCKSTEP_WIDTH=%d ", + (int)ctx->lockstep_width); + + for (int i = 0; i < ctx->cfg.num_sizes; i++) { + w += snprintf(compile_opts+w, compile_opts_size-w, + "-D%s=%d ", + ctx->cfg.size_vars[i], + (int)ctx->cfg.size_values[i]); + } + + for (int i = 0; 
extra_build_opts[i] != NULL; i++) { + w += snprintf(compile_opts+w, compile_opts_size-w, + "%s ", extra_build_opts[i]); + } + + if (ctx->cfg.debugging) { + fprintf(stderr, "OpenCL compiler options: %s\n", compile_opts); + fprintf(stderr, "Building OpenCL program...\n"); + } + OPENCL_SUCCEED_FATAL(build_opencl_program(prog, device_option.device, compile_opts)); + + free(compile_opts); + free(fut_opencl_src); + + if (ctx->cfg.dump_binary_to != NULL) { + if (ctx->cfg.debugging) { + fprintf(stderr, "Dumping OpenCL binary to %s...\n", ctx->cfg.dump_binary_to); + } + + size_t binary_size; + OPENCL_SUCCEED_FATAL(clGetProgramInfo(prog, CL_PROGRAM_BINARY_SIZES, + sizeof(size_t), &binary_size, NULL)); + unsigned char *binary = (unsigned char*) malloc(binary_size); + unsigned char *binaries[1] = { binary }; + OPENCL_SUCCEED_FATAL(clGetProgramInfo(prog, CL_PROGRAM_BINARIES, + sizeof(unsigned char*), binaries, NULL)); + + dump_file(ctx->cfg.dump_binary_to, binary, binary_size); + } + + return prog; +} + +static cl_program setup_opencl(struct opencl_context *ctx, + const char *srcs[], + int required_types, + const char *extra_build_opts[]) { + + ctx->lockstep_width = 0; // Real value set later. + + struct opencl_device_option device_option = get_preferred_device(&ctx->cfg); + + if (ctx->cfg.logging) { + describe_device_option(device_option); + } + + // Note that NVIDIA's OpenCL requires the platform property + cl_context_properties properties[] = { + CL_CONTEXT_PLATFORM, + (cl_context_properties)device_option.platform, + 0 + }; + + cl_int clCreateContext_error; + ctx->ctx = clCreateContext(properties, 1, &device_option.device, NULL, NULL, &clCreateContext_error); + OPENCL_SUCCEED_FATAL(clCreateContext_error); + + cl_int clCreateCommandQueue_error; + cl_command_queue queue = + clCreateCommandQueue(ctx->ctx, + device_option.device, + ctx->cfg.profiling ? 
CL_QUEUE_PROFILING_ENABLE : 0, + &clCreateCommandQueue_error); + OPENCL_SUCCEED_FATAL(clCreateCommandQueue_error); + + return setup_opencl_with_command_queue(ctx, queue, srcs, required_types, extra_build_opts); +} + +// Count up the runtime all the profiling_records that occured during execution. +// Also clears the buffer of profiling_records. +static cl_int opencl_tally_profiling_records(struct opencl_context *ctx) { + cl_int err; + for (int i = 0; i < ctx->profiling_records_used; i++) { + struct profiling_record record = ctx->profiling_records[i]; + + cl_ulong start_t, end_t; + + if ((err = clGetEventProfilingInfo(*record.event, + CL_PROFILING_COMMAND_START, + sizeof(start_t), + &start_t, + NULL)) != CL_SUCCESS) { + return err; + } + + if ((err = clGetEventProfilingInfo(*record.event, + CL_PROFILING_COMMAND_END, + sizeof(end_t), + &end_t, + NULL)) != CL_SUCCESS) { + return err; + } + + // OpenCL provides nanosecond resolution, but we want + // microseconds. + *record.runs += 1; + *record.runtime += (end_t - start_t)/1000; + + if ((err = clReleaseEvent(*record.event)) != CL_SUCCESS) { + return err; + } + free(record.event); + } + + ctx->profiling_records_used = 0; + + return CL_SUCCESS; +} + +// If profiling, produce an event associated with a profiling record. +static cl_event* opencl_get_event(struct opencl_context *ctx, int *runs, int64_t *runtime) { + if (ctx->profiling_records_used == ctx->profiling_records_capacity) { + ctx->profiling_records_capacity *= 2; + ctx->profiling_records = + realloc(ctx->profiling_records, + ctx->profiling_records_capacity * + sizeof(struct profiling_record)); + } + cl_event *event = malloc(sizeof(cl_event)); + ctx->profiling_records[ctx->profiling_records_used].event = event; + ctx->profiling_records[ctx->profiling_records_used].runs = runs; + ctx->profiling_records[ctx->profiling_records_used].runtime = runtime; + ctx->profiling_records_used++; + return event; +} + +// Allocate memory from driver. 
The problem is that OpenCL may perform +// lazy allocation, so we cannot know whether an allocation succeeded +// until the first time we try to use it. Hence we immediately +// perform a write to see if the allocation succeeded. This is slow, +// but the assumption is that this operation will be rare (most things +// will go through the free list). +static int opencl_alloc_actual(struct opencl_context *ctx, size_t size, cl_mem *mem_out) { + int error; + *mem_out = clCreateBuffer(ctx->ctx, CL_MEM_READ_WRITE, size, NULL, &error); + + if (error != CL_SUCCESS) { + return error; + } + + int x = 2; + error = clEnqueueWriteBuffer(ctx->queue, *mem_out, 1, 0, sizeof(x), &x, 0, NULL, NULL); + + // No need to wait for completion here. clWaitForEvents() cannot + // return mem object allocation failures. This implies that the + // buffer is faulted onto the device on enqueue. (Observation by + // Andreas Kloeckner.) + + return error; +} + +static int opencl_alloc(struct opencl_context *ctx, size_t min_size, const char *tag, cl_mem *mem_out) { + (void)tag; + if (min_size < sizeof(int)) { + min_size = sizeof(int); + } + + size_t size; + + if (free_list_find(&ctx->free_list, min_size, &size, mem_out) == 0) { + // Successfully found a free block. Is it big enough? + // + // FIXME: we might also want to check whether the block is *too + // big*, to avoid internal fragmentation. However, this can + // sharply impact performance on programs where arrays change size + // frequently. Fortunately, such allocations are usually fairly + // short-lived, as they are necessarily within a loop, so the risk + // of internal fragmentation resulting in an OOM situation is + // limited. However, it would be preferable if we could go back + // and *shrink* oversize allocations when we encounter an OOM + // condition. That is technically feasible, since we do not + // expose OpenCL pointer values directly to the application, but + // instead rely on a level of indirection. 
+ if (size >= min_size) { + if (ctx->cfg.debugging) { + fprintf(stderr, "No need to allocate: Found a block in the free list.\n"); + } + + return CL_SUCCESS; + } else { + if (ctx->cfg.debugging) { + fprintf(stderr, "Found a free block, but it was too small.\n"); + } + + // Not just right - free it. + int error = clReleaseMemObject(*mem_out); + if (error != CL_SUCCESS) { + return error; + } + } + } + + // We have to allocate a new block from the driver. If the + // allocation does not succeed, then we might be in an out-of-memory + // situation. We now start freeing things from the free list until + // we think we have freed enough that the allocation will succeed. + // Since we don't know how far the allocation is from fitting, we + // have to check after every deallocation. This might be pretty + // expensive. Let's hope that this case is hit rarely. + + if (ctx->cfg.debugging) { + fprintf(stderr, "Actually allocating the desired block.\n"); + } + + int error = opencl_alloc_actual(ctx, min_size, mem_out); + + while (error == CL_MEM_OBJECT_ALLOCATION_FAILURE) { + if (ctx->cfg.debugging) { + fprintf(stderr, "Out of OpenCL memory: releasing entry from the free list...\n"); + } + cl_mem mem; + if (free_list_first(&ctx->free_list, &mem) == 0) { + error = clReleaseMemObject(mem); + if (error != CL_SUCCESS) { + return error; + } + } else { + break; + } + error = opencl_alloc_actual(ctx, min_size, mem_out); + } + + return error; +} + +static int opencl_free(struct opencl_context *ctx, cl_mem mem, const char *tag) { + size_t size; + cl_mem existing_mem; + + // If there is already a block with this tag, then remove it. 
+ if (free_list_find(&ctx->free_list, -1, &size, &existing_mem) == 0) { + int error = clReleaseMemObject(existing_mem); + if (error != CL_SUCCESS) { + return error; + } + } + + int error = clGetMemObjectInfo(mem, CL_MEM_SIZE, sizeof(size_t), &size, NULL); + + if (error == CL_SUCCESS) { + free_list_insert(&ctx->free_list, size, mem, tag); + } + + return error; +} + +static int opencl_free_all(struct opencl_context *ctx) { + cl_mem mem; + free_list_pack(&ctx->free_list); + while (free_list_first(&ctx->free_list, &mem) == 0) { + int error = clReleaseMemObject(mem); + if (error != CL_SUCCESS) { + return error; + } + } + + return CL_SUCCESS; +} + +// Free everything that belongs to 'ctx', but do not free 'ctx' +// itself. +static void teardown_opencl(struct opencl_context *ctx) { + (void)opencl_tally_profiling_records(ctx); + free(ctx->profiling_records); + (void)opencl_free_all(ctx); + (void)clReleaseCommandQueue(ctx->queue); + (void)clReleaseContext(ctx->ctx); +} + +// End of opencl.h. + +static const char *opencl_program[] = + {"#ifdef cl_clang_storage_class_specifiers\n#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable\n#endif\n#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n__kernel void dummy_kernel(__global unsigned char *dummy, int n)\n{\n const int thread_gid = get_global_id(0);\n \n if (thread_gid >= n)\n return;\n}\n#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\ntypedef char int8_t;\ntypedef short int16_t;\ntypedef int int32_t;\ntypedef long int64_t;\ntypedef uchar uint8_t;\ntypedef ushort uint16_t;\ntypedef uint uint32_t;\ntypedef ulong uint64_t;\n#ifdef cl_nv_pragma_unroll\nstatic inline void mem_fence_global()\n{\n asm(\"membar.gl;\");\n}\n#else\nstatic inline void mem_fence_global()\n{\n mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n}\n#endif\nstatic inline void mem_fence_local()\n{\n mem_fence(CLK_LOCAL_MEM_FENCE);\n}\nstatic inline 
uint8_t add8(uint8_t x, uint8_t y)\n{\n return x + y;\n}\nstatic inline uint16_t add16(uint16_t x, uint16_t y)\n{\n return x + y;\n}\nstatic inline uint32_t add32(uint32_t x, uint32_t y)\n{\n return x + y;\n}\nstatic inline uint64_t add64(uint64_t x, uint64_t y)\n{\n return x + y;\n}\nstatic inline uint8_t sub8(uint8_t x, uint8_t y)\n{\n return x - y;\n}\nstatic inline uint16_t sub16(uint16_t x, uint16_t y)\n{\n return x - y;\n}\nstatic inline uint32_t sub32(uint32_t x, uint32_t y)\n{\n return x - y;\n}\nstatic inline uint64_t sub64(uint64_t x, uint64_t y)\n{\n return x - y;\n}\nstatic inline uint8_t mul8(uint8_t x, uint8_t y)\n{\n return x * y;\n}\nstatic inline uint16_t mul16(uint16_t x, uint16_t y)\n{\n return x * y;\n}\nstatic inline uint32_t mul32(uint32_t x, uint32_t y)\n{\n return x * y;\n}\nstatic inline uint64_t mul64(uint64_t x, uint64_t y)\n{\n return x * y;\n}\nstatic inline uint8_t udiv8(uint8_t x, uint8_t y)\n{\n return x / y;\n}\nstatic inline uint16_t udiv16(uint16_t x, uint16_t y)\n{\n return x / y;\n}\nstatic inline uint32_t udiv32(uint3", + "2_t x, uint32_t y)\n{\n return x / y;\n}\nstatic inline uint64_t udiv64(uint64_t x, uint64_t y)\n{\n return x / y;\n}\nstatic inline uint8_t udiv_up8(uint8_t x, uint8_t y)\n{\n return (x + y - 1) / y;\n}\nstatic inline uint16_t udiv_up16(uint16_t x, uint16_t y)\n{\n return (x + y - 1) / y;\n}\nstatic inline uint32_t udiv_up32(uint32_t x, uint32_t y)\n{\n return (x + y - 1) / y;\n}\nstatic inline uint64_t udiv_up64(uint64_t x, uint64_t y)\n{\n return (x + y - 1) / y;\n}\nstatic inline uint8_t umod8(uint8_t x, uint8_t y)\n{\n return x % y;\n}\nstatic inline uint16_t umod16(uint16_t x, uint16_t y)\n{\n return x % y;\n}\nstatic inline uint32_t umod32(uint32_t x, uint32_t y)\n{\n return x % y;\n}\nstatic inline uint64_t umod64(uint64_t x, uint64_t y)\n{\n return x % y;\n}\nstatic inline uint8_t udiv_safe8(uint8_t x, uint8_t y)\n{\n return y == 0 ? 
0 : x / y;\n}\nstatic inline uint16_t udiv_safe16(uint16_t x, uint16_t y)\n{\n return y == 0 ? 0 : x / y;\n}\nstatic inline uint32_t udiv_safe32(uint32_t x, uint32_t y)\n{\n return y == 0 ? 0 : x / y;\n}\nstatic inline uint64_t udiv_safe64(uint64_t x, uint64_t y)\n{\n return y == 0 ? 0 : x / y;\n}\nstatic inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y)\n{\n return y == 0 ? 0 : (x + y - 1) / y;\n}\nstatic inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y)\n{\n return y == 0 ? 0 : (x + y - 1) / y;\n}\nstatic inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y)\n{\n return y == 0 ? 0 : (x + y - 1) / y;\n}\nstatic inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y)\n{\n return y == 0 ? 0 : (x + y - 1) / y;\n}\nstatic inline uint8_t umod_safe8(uint8_t x, uint8_t y)\n{\n return y == 0 ? 0 : x % y;\n}\nstatic inline uint16_t umod_safe16(uint16_t x, uint16_t y)\n{\n return y == 0 ? 0 : x % y;\n}\nstatic inline uint32_t umod_safe32(uint32_t x, uint32_t y)\n{\n return y == 0 ? 0 : x % y;\n}\nstatic inline uint64_t umod_safe64(uint64_t x, uint64_t y)\n{\n return y == 0 ? 0 : x % y;\n}\nstatic inline int8_t sdiv8(int8_t x, int8_t y)\n{\n int8_t q =", + " x / y;\n int8_t r = x % y;\n \n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\nstatic inline int16_t sdiv16(int16_t x, int16_t y)\n{\n int16_t q = x / y;\n int16_t r = x % y;\n \n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\nstatic inline int32_t sdiv32(int32_t x, int32_t y)\n{\n int32_t q = x / y;\n int32_t r = x % y;\n \n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\nstatic inline int64_t sdiv64(int64_t x, int64_t y)\n{\n int64_t q = x / y;\n int64_t r = x % y;\n \n return q - ((r != 0 && r < 0 != y < 0) ? 
1 : 0);\n}\nstatic inline int8_t sdiv_up8(int8_t x, int8_t y)\n{\n return sdiv8(x + y - 1, y);\n}\nstatic inline int16_t sdiv_up16(int16_t x, int16_t y)\n{\n return sdiv16(x + y - 1, y);\n}\nstatic inline int32_t sdiv_up32(int32_t x, int32_t y)\n{\n return sdiv32(x + y - 1, y);\n}\nstatic inline int64_t sdiv_up64(int64_t x, int64_t y)\n{\n return sdiv64(x + y - 1, y);\n}\nstatic inline int8_t smod8(int8_t x, int8_t y)\n{\n int8_t r = x % y;\n \n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\nstatic inline int16_t smod16(int16_t x, int16_t y)\n{\n int16_t r = x % y;\n \n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\nstatic inline int32_t smod32(int32_t x, int32_t y)\n{\n int32_t r = x % y;\n \n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\nstatic inline int64_t smod64(int64_t x, int64_t y)\n{\n int64_t r = x % y;\n \n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\nstatic inline int8_t sdiv_safe8(int8_t x, int8_t y)\n{\n return y == 0 ? 0 : sdiv8(x, y);\n}\nstatic inline int16_t sdiv_safe16(int16_t x, int16_t y)\n{\n return y == 0 ? 0 : sdiv16(x, y);\n}\nstatic inline int32_t sdiv_safe32(int32_t x, int32_t y)\n{\n return y == 0 ? 0 : sdiv32(x, y);\n}\nstatic inline int64_t sdiv_safe64(int64_t x, int64_t y)\n{\n return y == 0 ? 0 : sdiv64(x, y);\n}\nstatic inline int8_t sdiv_up_safe8(int8_t x, int8_t y)\n{\n return ", + "sdiv_safe8(x + y - 1, y);\n}\nstatic inline int16_t sdiv_up_safe16(int16_t x, int16_t y)\n{\n return sdiv_safe16(x + y - 1, y);\n}\nstatic inline int32_t sdiv_up_safe32(int32_t x, int32_t y)\n{\n return sdiv_safe32(x + y - 1, y);\n}\nstatic inline int64_t sdiv_up_safe64(int64_t x, int64_t y)\n{\n return sdiv_safe64(x + y - 1, y);\n}\nstatic inline int8_t smod_safe8(int8_t x, int8_t y)\n{\n return y == 0 ? 0 : smod8(x, y);\n}\nstatic inline int16_t smod_safe16(int16_t x, int16_t y)\n{\n return y == 0 ? 
0 : smod16(x, y);\n}\nstatic inline int32_t smod_safe32(int32_t x, int32_t y)\n{\n return y == 0 ? 0 : smod32(x, y);\n}\nstatic inline int64_t smod_safe64(int64_t x, int64_t y)\n{\n return y == 0 ? 0 : smod64(x, y);\n}\nstatic inline int8_t squot8(int8_t x, int8_t y)\n{\n return x / y;\n}\nstatic inline int16_t squot16(int16_t x, int16_t y)\n{\n return x / y;\n}\nstatic inline int32_t squot32(int32_t x, int32_t y)\n{\n return x / y;\n}\nstatic inline int64_t squot64(int64_t x, int64_t y)\n{\n return x / y;\n}\nstatic inline int8_t srem8(int8_t x, int8_t y)\n{\n return x % y;\n}\nstatic inline int16_t srem16(int16_t x, int16_t y)\n{\n return x % y;\n}\nstatic inline int32_t srem32(int32_t x, int32_t y)\n{\n return x % y;\n}\nstatic inline int64_t srem64(int64_t x, int64_t y)\n{\n return x % y;\n}\nstatic inline int8_t squot_safe8(int8_t x, int8_t y)\n{\n return y == 0 ? 0 : x / y;\n}\nstatic inline int16_t squot_safe16(int16_t x, int16_t y)\n{\n return y == 0 ? 0 : x / y;\n}\nstatic inline int32_t squot_safe32(int32_t x, int32_t y)\n{\n return y == 0 ? 0 : x / y;\n}\nstatic inline int64_t squot_safe64(int64_t x, int64_t y)\n{\n return y == 0 ? 0 : x / y;\n}\nstatic inline int8_t srem_safe8(int8_t x, int8_t y)\n{\n return y == 0 ? 0 : x % y;\n}\nstatic inline int16_t srem_safe16(int16_t x, int16_t y)\n{\n return y == 0 ? 0 : x % y;\n}\nstatic inline int32_t srem_safe32(int32_t x, int32_t y)\n{\n return y == 0 ? 0 : x % y;\n}\nstatic inline int64_t srem_safe64(int64_t x, int64_t y)\n{\n return ", + "y == 0 ? 0 : x % y;\n}\nstatic inline int8_t smin8(int8_t x, int8_t y)\n{\n return x < y ? x : y;\n}\nstatic inline int16_t smin16(int16_t x, int16_t y)\n{\n return x < y ? x : y;\n}\nstatic inline int32_t smin32(int32_t x, int32_t y)\n{\n return x < y ? x : y;\n}\nstatic inline int64_t smin64(int64_t x, int64_t y)\n{\n return x < y ? x : y;\n}\nstatic inline uint8_t umin8(uint8_t x, uint8_t y)\n{\n return x < y ? 
x : y;\n}\nstatic inline uint16_t umin16(uint16_t x, uint16_t y)\n{\n return x < y ? x : y;\n}\nstatic inline uint32_t umin32(uint32_t x, uint32_t y)\n{\n return x < y ? x : y;\n}\nstatic inline uint64_t umin64(uint64_t x, uint64_t y)\n{\n return x < y ? x : y;\n}\nstatic inline int8_t smax8(int8_t x, int8_t y)\n{\n return x < y ? y : x;\n}\nstatic inline int16_t smax16(int16_t x, int16_t y)\n{\n return x < y ? y : x;\n}\nstatic inline int32_t smax32(int32_t x, int32_t y)\n{\n return x < y ? y : x;\n}\nstatic inline int64_t smax64(int64_t x, int64_t y)\n{\n return x < y ? y : x;\n}\nstatic inline uint8_t umax8(uint8_t x, uint8_t y)\n{\n return x < y ? y : x;\n}\nstatic inline uint16_t umax16(uint16_t x, uint16_t y)\n{\n return x < y ? y : x;\n}\nstatic inline uint32_t umax32(uint32_t x, uint32_t y)\n{\n return x < y ? y : x;\n}\nstatic inline uint64_t umax64(uint64_t x, uint64_t y)\n{\n return x < y ? y : x;\n}\nstatic inline uint8_t shl8(uint8_t x, uint8_t y)\n{\n return x << y;\n}\nstatic inline uint16_t shl16(uint16_t x, uint16_t y)\n{\n return x << y;\n}\nstatic inline uint32_t shl32(uint32_t x, uint32_t y)\n{\n return x << y;\n}\nstatic inline uint64_t shl64(uint64_t x, uint64_t y)\n{\n return x << y;\n}\nstatic inline uint8_t lshr8(uint8_t x, uint8_t y)\n{\n return x >> y;\n}\nstatic inline uint16_t lshr16(uint16_t x, uint16_t y)\n{\n return x >> y;\n}\nstatic inline uint32_t lshr32(uint32_t x, uint32_t y)\n{\n return x >> y;\n}\nstatic inline uint64_t lshr64(uint64_t x, uint64_t y)\n{\n return x >> y;\n}\nstatic inline int8_t ashr8(int8_t x, int8_t y)\n{\n return x >> y;\n}\n", + "static inline int16_t ashr16(int16_t x, int16_t y)\n{\n return x >> y;\n}\nstatic inline int32_t ashr32(int32_t x, int32_t y)\n{\n return x >> y;\n}\nstatic inline int64_t ashr64(int64_t x, int64_t y)\n{\n return x >> y;\n}\nstatic inline uint8_t and8(uint8_t x, uint8_t y)\n{\n return x & y;\n}\nstatic inline uint16_t and16(uint16_t x, uint16_t y)\n{\n return x & y;\n}\nstatic 
inline uint32_t and32(uint32_t x, uint32_t y)\n{\n return x & y;\n}\nstatic inline uint64_t and64(uint64_t x, uint64_t y)\n{\n return x & y;\n}\nstatic inline uint8_t or8(uint8_t x, uint8_t y)\n{\n return x | y;\n}\nstatic inline uint16_t or16(uint16_t x, uint16_t y)\n{\n return x | y;\n}\nstatic inline uint32_t or32(uint32_t x, uint32_t y)\n{\n return x | y;\n}\nstatic inline uint64_t or64(uint64_t x, uint64_t y)\n{\n return x | y;\n}\nstatic inline uint8_t xor8(uint8_t x, uint8_t y)\n{\n return x ^ y;\n}\nstatic inline uint16_t xor16(uint16_t x, uint16_t y)\n{\n return x ^ y;\n}\nstatic inline uint32_t xor32(uint32_t x, uint32_t y)\n{\n return x ^ y;\n}\nstatic inline uint64_t xor64(uint64_t x, uint64_t y)\n{\n return x ^ y;\n}\nstatic inline bool ult8(uint8_t x, uint8_t y)\n{\n return x < y;\n}\nstatic inline bool ult16(uint16_t x, uint16_t y)\n{\n return x < y;\n}\nstatic inline bool ult32(uint32_t x, uint32_t y)\n{\n return x < y;\n}\nstatic inline bool ult64(uint64_t x, uint64_t y)\n{\n return x < y;\n}\nstatic inline bool ule8(uint8_t x, uint8_t y)\n{\n return x <= y;\n}\nstatic inline bool ule16(uint16_t x, uint16_t y)\n{\n return x <= y;\n}\nstatic inline bool ule32(uint32_t x, uint32_t y)\n{\n return x <= y;\n}\nstatic inline bool ule64(uint64_t x, uint64_t y)\n{\n return x <= y;\n}\nstatic inline bool slt8(int8_t x, int8_t y)\n{\n return x < y;\n}\nstatic inline bool slt16(int16_t x, int16_t y)\n{\n return x < y;\n}\nstatic inline bool slt32(int32_t x, int32_t y)\n{\n return x < y;\n}\nstatic inline bool slt64(int64_t x, int64_t y)\n{\n return x < y;\n}\nstatic inline bool sle8(int8_t x, int8_t y)\n{\n retur", + "n x <= y;\n}\nstatic inline bool sle16(int16_t x, int16_t y)\n{\n return x <= y;\n}\nstatic inline bool sle32(int32_t x, int32_t y)\n{\n return x <= y;\n}\nstatic inline bool sle64(int64_t x, int64_t y)\n{\n return x <= y;\n}\nstatic inline int8_t pow8(int8_t x, int8_t y)\n{\n int8_t res = 1, rem = y;\n \n while (rem != 0) {\n if (rem & 1)\n res 
*= x;\n rem >>= 1;\n x *= x;\n }\n return res;\n}\nstatic inline int16_t pow16(int16_t x, int16_t y)\n{\n int16_t res = 1, rem = y;\n \n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n x *= x;\n }\n return res;\n}\nstatic inline int32_t pow32(int32_t x, int32_t y)\n{\n int32_t res = 1, rem = y;\n \n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n x *= x;\n }\n return res;\n}\nstatic inline int64_t pow64(int64_t x, int64_t y)\n{\n int64_t res = 1, rem = y;\n \n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n x *= x;\n }\n return res;\n}\nstatic inline bool itob_i8_bool(int8_t x)\n{\n return x;\n}\nstatic inline bool itob_i16_bool(int16_t x)\n{\n return x;\n}\nstatic inline bool itob_i32_bool(int32_t x)\n{\n return x;\n}\nstatic inline bool itob_i64_bool(int64_t x)\n{\n return x;\n}\nstatic inline int8_t btoi_bool_i8(bool x)\n{\n return x;\n}\nstatic inline int16_t btoi_bool_i16(bool x)\n{\n return x;\n}\nstatic inline int32_t btoi_bool_i32(bool x)\n{\n return x;\n}\nstatic inline int64_t btoi_bool_i64(bool x)\n{\n return x;\n}\n#define sext_i8_i8(x) ((int8_t) (int8_t) x)\n#define sext_i8_i16(x) ((int16_t) (int8_t) x)\n#define sext_i8_i32(x) ((int32_t) (int8_t) x)\n#define sext_i8_i64(x) ((int64_t) (int8_t) x)\n#define sext_i16_i8(x) ((int8_t) (int16_t) x)\n#define sext_i16_i16(x) ((int16_t) (int16_t) x)\n#define sext_i16_i32(x) ((int32_t) (int16_t) x)\n#define sext_i16_i64(x) ((int64_t) (int16_t) x)\n#define sext_i32_i8(x) ((int8_t) (int32_t) x)\n#define sext_i32_i16(x) (", + "(int16_t) (int32_t) x)\n#define sext_i32_i32(x) ((int32_t) (int32_t) x)\n#define sext_i32_i64(x) ((int64_t) (int32_t) x)\n#define sext_i64_i8(x) ((int8_t) (int64_t) x)\n#define sext_i64_i16(x) ((int16_t) (int64_t) x)\n#define sext_i64_i32(x) ((int32_t) (int64_t) x)\n#define sext_i64_i64(x) ((int64_t) (int64_t) x)\n#define zext_i8_i8(x) ((int8_t) (uint8_t) x)\n#define zext_i8_i16(x) ((int16_t) (uint8_t) x)\n#define zext_i8_i32(x) ((int32_t) (uint8_t) 
x)\n#define zext_i8_i64(x) ((int64_t) (uint8_t) x)\n#define zext_i16_i8(x) ((int8_t) (uint16_t) x)\n#define zext_i16_i16(x) ((int16_t) (uint16_t) x)\n#define zext_i16_i32(x) ((int32_t) (uint16_t) x)\n#define zext_i16_i64(x) ((int64_t) (uint16_t) x)\n#define zext_i32_i8(x) ((int8_t) (uint32_t) x)\n#define zext_i32_i16(x) ((int16_t) (uint32_t) x)\n#define zext_i32_i32(x) ((int32_t) (uint32_t) x)\n#define zext_i32_i64(x) ((int64_t) (uint32_t) x)\n#define zext_i64_i8(x) ((int8_t) (uint64_t) x)\n#define zext_i64_i16(x) ((int16_t) (uint64_t) x)\n#define zext_i64_i32(x) ((int32_t) (uint64_t) x)\n#define zext_i64_i64(x) ((int64_t) (uint64_t) x)\n#if defined(__OPENCL_VERSION__)\nstatic int32_t futrts_popc8(int8_t x)\n{\n return popcount(x);\n}\nstatic int32_t futrts_popc16(int16_t x)\n{\n return popcount(x);\n}\nstatic int32_t futrts_popc32(int32_t x)\n{\n return popcount(x);\n}\nstatic int32_t futrts_popc64(int64_t x)\n{\n return popcount(x);\n}\n#elif defined(__CUDA_ARCH__)\nstatic int32_t futrts_popc8(int8_t x)\n{\n return __popc(zext_i8_i32(x));\n}\nstatic int32_t futrts_popc16(int16_t x)\n{\n return __popc(zext_i16_i32(x));\n}\nstatic int32_t futrts_popc32(int32_t x)\n{\n return __popc(x);\n}\nstatic int32_t futrts_popc64(int64_t x)\n{\n return __popcll(x);\n}\n#else\nstatic int32_t futrts_popc8(int8_t x)\n{\n int c = 0;\n \n for (; x; ++c)\n x &= x - 1;\n return c;\n}\nstatic int32_t futrts_popc16(int16_t x)\n{\n int c = 0;\n \n for (; x; ++c)\n x &= x - 1;\n return c;\n}\nstatic int32_t futrts_popc32(int32_t x)\n{\n int c = 0;\n \n ", + " for (; x; ++c)\n x &= x - 1;\n return c;\n}\nstatic int32_t futrts_popc64(int64_t x)\n{\n int c = 0;\n \n for (; x; ++c)\n x &= x - 1;\n return c;\n}\n#endif\n#if defined(__OPENCL_VERSION__)\nstatic uint8_t futrts_mul_hi8(uint8_t a, uint8_t b)\n{\n return mul_hi(a, b);\n}\nstatic uint16_t futrts_mul_hi16(uint16_t a, uint16_t b)\n{\n return mul_hi(a, b);\n}\nstatic uint32_t futrts_mul_hi32(uint32_t a, uint32_t b)\n{\n return 
mul_hi(a, b);\n}\nstatic uint64_t futrts_mul_hi64(uint64_t a, uint64_t b)\n{\n return mul_hi(a, b);\n}\n#elif defined(__CUDA_ARCH__)\nstatic uint8_t futrts_mul_hi8(uint8_t a, uint8_t b)\n{\n uint16_t aa = a;\n uint16_t bb = b;\n \n return aa * bb >> 8;\n}\nstatic uint16_t futrts_mul_hi16(uint16_t a, uint16_t b)\n{\n uint32_t aa = a;\n uint32_t bb = b;\n \n return aa * bb >> 16;\n}\nstatic uint32_t futrts_mul_hi32(uint32_t a, uint32_t b)\n{\n return mulhi(a, b);\n}\nstatic uint64_t futrts_mul_hi64(uint64_t a, uint64_t b)\n{\n return mul64hi(a, b);\n}\n#else\nstatic uint8_t futrts_mul_hi8(uint8_t a, uint8_t b)\n{\n uint16_t aa = a;\n uint16_t bb = b;\n \n return aa * bb >> 8;\n}\nstatic uint16_t futrts_mul_hi16(uint16_t a, uint16_t b)\n{\n uint32_t aa = a;\n uint32_t bb = b;\n \n return aa * bb >> 16;\n}\nstatic uint32_t futrts_mul_hi32(uint32_t a, uint32_t b)\n{\n uint64_t aa = a;\n uint64_t bb = b;\n \n return aa * bb >> 32;\n}\nstatic uint64_t futrts_mul_hi64(uint64_t a, uint64_t b)\n{\n __uint128_t aa = a;\n __uint128_t bb = b;\n \n return aa * bb >> 64;\n}\n#endif\n#if defined(__OPENCL_VERSION__)\nstatic uint8_t futrts_mad_hi8(uint8_t a, uint8_t b, uint8_t c)\n{\n return mad_hi(a, b, c);\n}\nstatic uint16_t futrts_mad_hi16(uint16_t a, uint16_t b, uint16_t c)\n{\n return mad_hi(a, b, c);\n}\nstatic uint32_t futrts_mad_hi32(uint32_t a, uint32_t b, uint32_t c)\n{\n return mad_hi(a, b, c);\n}\nstatic uint64_t futrts_mad_hi64(uint64_t a, uint64_t b, uint64_t c)\n{\n return mad_hi(a, b, c);\n}\n#else\nstatic uint8_t ", + "futrts_mad_hi8(uint8_t a, uint8_t b, uint8_t c)\n{\n return futrts_mul_hi8(a, b) + c;\n}\nstatic uint16_t futrts_mad_hi16(uint16_t a, uint16_t b, uint16_t c)\n{\n return futrts_mul_hi16(a, b) + c;\n}\nstatic uint32_t futrts_mad_hi32(uint32_t a, uint32_t b, uint32_t c)\n{\n return futrts_mul_hi32(a, b) + c;\n}\nstatic uint64_t futrts_mad_hi64(uint64_t a, uint64_t b, uint64_t c)\n{\n return futrts_mul_hi64(a, b) + c;\n}\n#endif\n#if 
defined(__OPENCL_VERSION__)\nstatic int32_t futrts_clzz8(int8_t x)\n{\n return clz(x);\n}\nstatic int32_t futrts_clzz16(int16_t x)\n{\n return clz(x);\n}\nstatic int32_t futrts_clzz32(int32_t x)\n{\n return clz(x);\n}\nstatic int32_t futrts_clzz64(int64_t x)\n{\n return clz(x);\n}\n#elif defined(__CUDA_ARCH__)\nstatic int32_t futrts_clzz8(int8_t x)\n{\n return __clz(zext_i8_i32(x)) - 24;\n}\nstatic int32_t futrts_clzz16(int16_t x)\n{\n return __clz(zext_i16_i32(x)) - 16;\n}\nstatic int32_t futrts_clzz32(int32_t x)\n{\n return __clz(x);\n}\nstatic int32_t futrts_clzz64(int64_t x)\n{\n return __clzll(x);\n}\n#else\nstatic int32_t futrts_clzz8(int8_t x)\n{\n int n = 0;\n int bits = sizeof(x) * 8;\n \n for (int i = 0; i < bits; i++) {\n if (x < 0)\n break;\n n++;\n x <<= 1;\n }\n return n;\n}\nstatic int32_t futrts_clzz16(int16_t x)\n{\n int n = 0;\n int bits = sizeof(x) * 8;\n \n for (int i = 0; i < bits; i++) {\n if (x < 0)\n break;\n n++;\n x <<= 1;\n }\n return n;\n}\nstatic int32_t futrts_clzz32(int32_t x)\n{\n int n = 0;\n int bits = sizeof(x) * 8;\n \n for (int i = 0; i < bits; i++) {\n if (x < 0)\n break;\n n++;\n x <<= 1;\n }\n return n;\n}\nstatic int32_t futrts_clzz64(int64_t x)\n{\n int n = 0;\n int bits = sizeof(x) * 8;\n \n for (int i = 0; i < bits; i++) {\n if (x < 0)\n break;\n n++;\n x <<= 1;\n }\n return n;\n}\n#endif\n#if defined(__OPENCL_VERSION__)\nstatic int32_t futrts_ctzz8(int8_t x)\n{\n int", + " i = 0;\n \n for (; i < 8 && (x & 1) == 0; i++, x >>= 1)\n ;\n return i;\n}\nstatic int32_t futrts_ctzz16(int16_t x)\n{\n int i = 0;\n \n for (; i < 16 && (x & 1) == 0; i++, x >>= 1)\n ;\n return i;\n}\nstatic int32_t futrts_ctzz32(int32_t x)\n{\n int i = 0;\n \n for (; i < 32 && (x & 1) == 0; i++, x >>= 1)\n ;\n return i;\n}\nstatic int32_t futrts_ctzz64(int64_t x)\n{\n int i = 0;\n \n for (; i < 64 && (x & 1) == 0; i++, x >>= 1)\n ;\n return i;\n}\n#elif defined(__CUDA_ARCH__)\nstatic int32_t futrts_ctzz8(int8_t x)\n{\n int y = __ffs(x);\n \n return y 
== 0 ? 8 : y - 1;\n}\nstatic int32_t futrts_ctzz16(int16_t x)\n{\n int y = __ffs(x);\n \n return y == 0 ? 16 : y - 1;\n}\nstatic int32_t futrts_ctzz32(int32_t x)\n{\n int y = __ffs(x);\n \n return y == 0 ? 32 : y - 1;\n}\nstatic int32_t futrts_ctzz64(int64_t x)\n{\n int y = __ffsll(x);\n \n return y == 0 ? 64 : y - 1;\n}\n#else\nstatic int32_t futrts_ctzz8(int8_t x)\n{\n return x == 0 ? 8 : __builtin_ctz((uint32_t) x);\n}\nstatic int32_t futrts_ctzz16(int16_t x)\n{\n return x == 0 ? 16 : __builtin_ctz((uint32_t) x);\n}\nstatic int32_t futrts_ctzz32(int32_t x)\n{\n return x == 0 ? 32 : __builtin_ctz(x);\n}\nstatic int32_t futrts_ctzz64(int64_t x)\n{\n return x == 0 ? 64 : __builtin_ctzll(x);\n}\n#endif\nstatic inline float fdiv32(float x, float y)\n{\n return x / y;\n}\nstatic inline float fadd32(float x, float y)\n{\n return x + y;\n}\nstatic inline float fsub32(float x, float y)\n{\n return x - y;\n}\nstatic inline float fmul32(float x, float y)\n{\n return x * y;\n}\nstatic inline float fmin32(float x, float y)\n{\n return fmin(x, y);\n}\nstatic inline float fmax32(float x, float y)\n{\n return fmax(x, y);\n}\nstatic inline float fpow32(float x, float y)\n{\n return pow(x, y);\n}\nstatic inline bool cmplt32(float x, float y)\n{\n return x < y;\n}\nstatic inline bool cmple32(float x, float y)\n{\n return x <= y;\n}\nstatic inline float sitofp_i8_f32(int8_t x)\n{\n return (floa", + "t) x;\n}\nstatic inline float sitofp_i16_f32(int16_t x)\n{\n return (float) x;\n}\nstatic inline float sitofp_i32_f32(int32_t x)\n{\n return (float) x;\n}\nstatic inline float sitofp_i64_f32(int64_t x)\n{\n return (float) x;\n}\nstatic inline float uitofp_i8_f32(uint8_t x)\n{\n return (float) x;\n}\nstatic inline float uitofp_i16_f32(uint16_t x)\n{\n return (float) x;\n}\nstatic inline float uitofp_i32_f32(uint32_t x)\n{\n return (float) x;\n}\nstatic inline float uitofp_i64_f32(uint64_t x)\n{\n return (float) x;\n}\nstatic inline int8_t fptosi_f32_i8(float x)\n{\n return (int8_t) 
x;\n}\nstatic inline int16_t fptosi_f32_i16(float x)\n{\n return (int16_t) x;\n}\nstatic inline int32_t fptosi_f32_i32(float x)\n{\n return (int32_t) x;\n}\nstatic inline int64_t fptosi_f32_i64(float x)\n{\n return (int64_t) x;\n}\nstatic inline uint8_t fptoui_f32_i8(float x)\n{\n return (uint8_t) x;\n}\nstatic inline uint16_t fptoui_f32_i16(float x)\n{\n return (uint16_t) x;\n}\nstatic inline uint32_t fptoui_f32_i32(float x)\n{\n return (uint32_t) x;\n}\nstatic inline uint64_t fptoui_f32_i64(float x)\n{\n return (uint64_t) x;\n}\nstatic inline bool futrts_isnan32(float x)\n{\n return isnan(x);\n}\nstatic inline bool futrts_isinf32(float x)\n{\n return isinf(x);\n}\n#ifdef __OPENCL_VERSION__\nstatic inline float futrts_log32(float x)\n{\n return log(x);\n}\nstatic inline float futrts_log2_32(float x)\n{\n return log2(x);\n}\nstatic inline float futrts_log10_32(float x)\n{\n return log10(x);\n}\nstatic inline float futrts_sqrt32(float x)\n{\n return sqrt(x);\n}\nstatic inline float futrts_exp32(float x)\n{\n return exp(x);\n}\nstatic inline float futrts_cos32(float x)\n{\n return cos(x);\n}\nstatic inline float futrts_sin32(float x)\n{\n return sin(x);\n}\nstatic inline float futrts_tan32(float x)\n{\n return tan(x);\n}\nstatic inline float futrts_acos32(float x)\n{\n return acos(x);\n}\nstatic inline float futrts_asin32(float x)\n{\n return asin(x);\n}\nstatic inline float futrts_atan32(float x)\n{\n return atan(x);\n}", + "\nstatic inline float futrts_cosh32(float x)\n{\n return cosh(x);\n}\nstatic inline float futrts_sinh32(float x)\n{\n return sinh(x);\n}\nstatic inline float futrts_tanh32(float x)\n{\n return tanh(x);\n}\nstatic inline float futrts_acosh32(float x)\n{\n return acosh(x);\n}\nstatic inline float futrts_asinh32(float x)\n{\n return asinh(x);\n}\nstatic inline float futrts_atanh32(float x)\n{\n return atanh(x);\n}\nstatic inline float futrts_atan2_32(float x, float y)\n{\n return atan2(x, y);\n}\nstatic inline float futrts_hypot32(float x, float 
y)\n{\n return hypot(x, y);\n}\nstatic inline float futrts_gamma32(float x)\n{\n return tgamma(x);\n}\nstatic inline float futrts_lgamma32(float x)\n{\n return lgamma(x);\n}\nstatic inline float fmod32(float x, float y)\n{\n return fmod(x, y);\n}\nstatic inline float futrts_round32(float x)\n{\n return rint(x);\n}\nstatic inline float futrts_floor32(float x)\n{\n return floor(x);\n}\nstatic inline float futrts_ceil32(float x)\n{\n return ceil(x);\n}\nstatic inline float futrts_lerp32(float v0, float v1, float t)\n{\n return mix(v0, v1, t);\n}\nstatic inline float futrts_mad32(float a, float b, float c)\n{\n return mad(a, b, c);\n}\nstatic inline float futrts_fma32(float a, float b, float c)\n{\n return fma(a, b, c);\n}\n#else\nstatic inline float futrts_log32(float x)\n{\n return logf(x);\n}\nstatic inline float futrts_log2_32(float x)\n{\n return log2f(x);\n}\nstatic inline float futrts_log10_32(float x)\n{\n return log10f(x);\n}\nstatic inline float futrts_sqrt32(float x)\n{\n return sqrtf(x);\n}\nstatic inline float futrts_exp32(float x)\n{\n return expf(x);\n}\nstatic inline float futrts_cos32(float x)\n{\n return cosf(x);\n}\nstatic inline float futrts_sin32(float x)\n{\n return sinf(x);\n}\nstatic inline float futrts_tan32(float x)\n{\n return tanf(x);\n}\nstatic inline float futrts_acos32(float x)\n{\n return acosf(x);\n}\nstatic inline float futrts_asin32(float x)\n{\n return asinf(x);\n}\nstatic inline float futrts_atan32(float x)\n{\n return ata", + "nf(x);\n}\nstatic inline float futrts_cosh32(float x)\n{\n return coshf(x);\n}\nstatic inline float futrts_sinh32(float x)\n{\n return sinhf(x);\n}\nstatic inline float futrts_tanh32(float x)\n{\n return tanhf(x);\n}\nstatic inline float futrts_acosh32(float x)\n{\n return acoshf(x);\n}\nstatic inline float futrts_asinh32(float x)\n{\n return asinhf(x);\n}\nstatic inline float futrts_atanh32(float x)\n{\n return atanhf(x);\n}\nstatic inline float futrts_atan2_32(float x, float y)\n{\n return atan2f(x, 
y);\n}\nstatic inline float futrts_hypot32(float x, float y)\n{\n return hypotf(x, y);\n}\nstatic inline float futrts_gamma32(float x)\n{\n return tgammaf(x);\n}\nstatic inline float futrts_lgamma32(float x)\n{\n return lgammaf(x);\n}\nstatic inline float fmod32(float x, float y)\n{\n return fmodf(x, y);\n}\nstatic inline float futrts_round32(float x)\n{\n return rintf(x);\n}\nstatic inline float futrts_floor32(float x)\n{\n return floorf(x);\n}\nstatic inline float futrts_ceil32(float x)\n{\n return ceilf(x);\n}\nstatic inline float futrts_lerp32(float v0, float v1, float t)\n{\n return v0 + (v1 - v0) * t;\n}\nstatic inline float futrts_mad32(float a, float b, float c)\n{\n return a * b + c;\n}\nstatic inline float futrts_fma32(float a, float b, float c)\n{\n return fmaf(a, b, c);\n}\n#endif\nstatic inline int32_t futrts_to_bits32(float x)\n{\n union {\n float f;\n int32_t t;\n } p;\n \n p.f = x;\n return p.t;\n}\nstatic inline float futrts_from_bits32(int32_t x)\n{\n union {\n int32_t f;\n float t;\n } p;\n \n p.f = x;\n return p.t;\n}\nstatic inline float fsignum32(float x)\n{\n return futrts_isnan32(x) ? 
x : (x > 0) - (x < 0);\n}\n// Start of atomics.h\n\ninline int32_t atomic_xchg_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicExch((int32_t*)p, x);\n#else\n return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_xchg_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicExch(", + "(int32_t*)p, x);\n#else\n return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_cmpxchg_i32_global(volatile __global int32_t *p,\n int32_t cmp, int32_t val) {\n#ifdef FUTHARK_CUDA\n return atomicCAS((int32_t*)p, cmp, val);\n#else\n return atomic_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int32_t atomic_cmpxchg_i32_local(volatile __local int32_t *p,\n int32_t cmp, int32_t val) {\n#ifdef FUTHARK_CUDA\n return atomicCAS((int32_t*)p, cmp, val);\n#else\n return atomic_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int32_t atomic_add_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((int32_t*)p, x);\n#else\n return atomic_add(p, x);\n#endif\n}\n\ninline int32_t atomic_add_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((int32_t*)p, x);\n#else\n return atomic_add(p, x);\n#endif\n}\n\ninline float atomic_fadd_f32_global(volatile __global float *p, float x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((float*)p, x);\n#else\n union { int32_t i; float f; } old;\n union { int32_t i; float f; } assumed;\n old.f = *p;\n do {\n assumed.f = old.f;\n old.f = old.f + x;\n old.i = atomic_cmpxchg_i32_global((volatile __global int32_t*)p, assumed.i, old.i);\n } while (assumed.i != old.i);\n return old.f;\n#endif\n}\n\ninline float atomic_fadd_f32_local(volatile __local float *p, float x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((float*)p, x);\n#else\n union { int32_t i; float f; } old;\n union { int32_t i; float f; } assumed;\n old.f = *p;\n do {\n assumed.f = old.f;\n old.f = old.f + x;\n old.i = atomic_cmpxchg_i32_local((volatile __local 
int32_t*)p, assumed.i, old.i);\n } while (assumed.i != old.i);\n return old.f;\n#endif\n}\n\ninline int32_t atomic_smax_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((int32_t*)p, x);\n#else\n return atomic_max(p, x);\n#endif\n}\n\ninline int32_t atomic_smax_i32_local(volatile", + " __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((int32_t*)p, x);\n#else\n return atomic_max(p, x);\n#endif\n}\n\ninline int32_t atomic_smin_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((int32_t*)p, x);\n#else\n return atomic_min(p, x);\n#endif\n}\n\ninline int32_t atomic_smin_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((int32_t*)p, x);\n#else\n return atomic_min(p, x);\n#endif\n}\n\ninline uint32_t atomic_umax_i32_global(volatile __global uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((uint32_t*)p, x);\n#else\n return atomic_max(p, x);\n#endif\n}\n\ninline uint32_t atomic_umax_i32_local(volatile __local uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((uint32_t*)p, x);\n#else\n return atomic_max(p, x);\n#endif\n}\n\ninline uint32_t atomic_umin_i32_global(volatile __global uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((uint32_t*)p, x);\n#else\n return atomic_min(p, x);\n#endif\n}\n\ninline uint32_t atomic_umin_i32_local(volatile __local uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((uint32_t*)p, x);\n#else\n return atomic_min(p, x);\n#endif\n}\n\ninline int32_t atomic_and_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAnd((int32_t*)p, x);\n#else\n return atomic_and(p, x);\n#endif\n}\n\ninline int32_t atomic_and_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAnd((int32_t*)p, x);\n#else\n return atomic_and(p, x);\n#endif\n}\n\ninline int32_t 
atomic_or_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicOr((int32_t*)p, x);\n#else\n return atomic_or(p, x);\n#endif\n}\n\ninline int32_t atomic_or_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicOr((int32_t*)p, x);\n#else\n return atomic_or(p, x);\n#endif\n}\n\ninline int32_t atomic_xor_i32_global(volatile __global int32_t *p, int3", + "2_t x) {\n#ifdef FUTHARK_CUDA\n return atomicXor((int32_t*)p, x);\n#else\n return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_xor_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicXor((int32_t*)p, x);\n#else\n return atomic_xor(p, x);\n#endif\n}\n\n// Start of 64 bit atomics\n\ninline int64_t atomic_xchg_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicExch((uint64_t*)p, x);\n#else\n return atom_xor(p, x);\n#endif\n}\n\ninline int64_t atomic_xchg_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicExch((uint64_t*)p, x);\n#else\n return atom_xor(p, x);\n#endif\n}\n\ninline int64_t atomic_cmpxchg_i64_global(volatile __global int64_t *p,\n int64_t cmp, int64_t val) {\n#ifdef FUTHARK_CUDA\n return atomicCAS((uint64_t*)p, cmp, val);\n#else\n return atom_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int64_t atomic_cmpxchg_i64_local(volatile __local int64_t *p,\n int64_t cmp, int64_t val) {\n#ifdef FUTHARK_CUDA\n return atomicCAS((uint64_t*)p, cmp, val);\n#else\n return atom_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int64_t atomic_add_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((uint64_t*)p, x);\n#else\n return atom_add(p, x);\n#endif\n}\n\ninline int64_t atomic_add_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((uint64_t*)p, x);\n#else\n return atom_add(p, x);\n#endif\n}\n\n#ifdef FUTHARK_F64_ENABLED\n\ninline double 
atomic_fadd_f64_global(volatile __global double *p, double x) {\n#if defined(FUTHARK_CUDA) && __CUDA_ARCH__ >= 600\n return atomicAdd((double*)p, x);\n#else\n union { int64_t i; double f; } old;\n union { int64_t i; double f; } assumed;\n old.f = *p;\n do {\n assumed.f = old.f;\n old.f = old.f + x;\n old.i = atomic_cmpxchg_i64_global((volatile __global int64_t*)p, assumed.i, old.i);\n } while (assumed.i != old.i);\n ", + "return old.f;\n#endif\n}\n\ninline double atomic_fadd_f64_local(volatile __local double *p, double x) {\n#if defined(FUTHARK_CUDA) && __CUDA_ARCH__ >= 600\n return atomicAdd((double*)p, x);\n#else\n union { int64_t i; double f; } old;\n union { int64_t i; double f; } assumed;\n old.f = *p;\n do {\n assumed.f = old.f;\n old.f = old.f + x;\n old.i = atomic_cmpxchg_i64_local((volatile __local int64_t*)p, assumed.i, old.i);\n } while (assumed.i != old.i);\n return old.f;\n#endif\n}\n\n#endif\n\ninline int64_t atomic_smax_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((int64_t*)p, x);\n#else\n return atom_max(p, x);\n#endif\n}\n\ninline int64_t atomic_smax_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((int64_t*)p, x);\n#else\n return atom_max(p, x);\n#endif\n}\n\ninline int64_t atomic_smin_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((int64_t*)p, x);\n#else\n return atom_min(p, x);\n#endif\n}\n\ninline int64_t atomic_smin_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((int64_t*)p, x);\n#else\n return atom_min(p, x);\n#endif\n}\n\ninline uint64_t atomic_umax_i64_global(volatile __global uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((uint64_t*)p, x);\n#else\n return atom_max(p, x);\n#endif\n}\n\ninline uint64_t atomic_umax_i64_local(volatile __local uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((uint64_t*)p, x);\n#else\n return 
atom_max(p, x);\n#endif\n}\n\ninline uint64_t atomic_umin_i64_global(volatile __global uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((uint64_t*)p, x);\n#else\n return atom_min(p, x);\n#endif\n}\n\ninline uint64_t atomic_umin_i64_local(volatile __local uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((uint64_t*)p, x);\n#else\n return atom_min(p, x);\n#endif\n}\n\ninline int64_t atomic_and_i64_global(volatile __global int64_t *p, int64_t x) ", + "{\n#ifdef FUTHARK_CUDA\n return atomicAnd((int64_t*)p, x);\n#else\n return atom_and(p, x);\n#endif\n}\n\ninline int64_t atomic_and_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAnd((int64_t*)p, x);\n#else\n return atom_and(p, x);\n#endif\n}\n\ninline int64_t atomic_or_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicOr((int64_t*)p, x);\n#else\n return atom_or(p, x);\n#endif\n}\n\ninline int64_t atomic_or_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicOr((int64_t*)p, x);\n#else\n return atom_or(p, x);\n#endif\n}\n\ninline int64_t atomic_xor_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicXor((int64_t*)p, x);\n#else\n return atom_xor(p, x);\n#endif\n}\n\ninline int64_t atomic_xor_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicXor((int64_t*)p, x);\n#else\n return atom_xor(p, x);\n#endif\n}\n\n// End of atomics.h\n\n\n\n\n__kernel void get_envelopezicopy_9955(int64_t n_9485, int64_t i_9490, __global\n unsigned char *chunk_board_mem_9941,\n __global unsigned char *mem_9943)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t copy_gtid_9955;\n int32_t copy_ltid_9956;\n int32_t copy_gid_9957;\n \n copy_gtid_9955 = get_global_id(0);\n copy_ltid_9956 = get_local_id(0);\n copy_gid_9957 = get_group_id(0);\n if (slt64(sext_i32_i64(copy_gtid_9955), 
n_9485)) {\n ((__global int8_t *) mem_9943)[n_9485 + sext_i32_i64(copy_gtid_9955)] =\n ((__global int8_t *) chunk_board_mem_9941)[i_9490 +\n sext_i32_i64(copy_gtid_9955) *\n n_9485];\n }\n \n error_0:\n return;\n}\n__kernel void get_envelopezicopy_9960(int64_t n_9485, __global\n ", + " unsigned char *chunk_board_mem_9941,\n __global unsigned char *mem_9943)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t copy_gtid_9960;\n int32_t copy_ltid_9961;\n int32_t copy_gid_9962;\n \n copy_gtid_9960 = get_global_id(0);\n copy_ltid_9961 = get_local_id(0);\n copy_gid_9962 = get_group_id(0);\n if (slt64(sext_i32_i64(copy_gtid_9960), n_9485)) {\n ((__global int8_t *) mem_9943)[(int64_t) 3 * n_9485 +\n sext_i32_i64(copy_gtid_9960)] =\n ((__global\n int8_t *) chunk_board_mem_9941)[sext_i32_i64(copy_gtid_9960) *\n n_9485];\n }\n \n error_0:\n return;\n}\n__kernel void next_chunk_boardzicopy_9965(int64_t n_9500, int64_t m_9501,\n __global unsigned char *mem_9948,\n __global unsigned char *mem_9950)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t copy_gtid_9965;\n int32_t copy_ltid_9966;\n int32_t copy_gid_9967;\n \n copy_gtid_9965 = get_global_id(0);\n copy_ltid_9966 = get_local_id(0);\n copy_gid_9967 = get_group_id(0);\n if (slt64(sext_i32_i64(copy_gtid_9965), n_9500 * n_9500)) {\n ((__global int8_t *) mem_9950)[squot64(sext_i32_i64(copy_gtid_9965),\n n_9500) * n_9500 +\n (sext_i32_i64(copy_gtid_9965) -\n squot64(sext_i32_i64(copy_gtid_9965),\n n_9500) * n_9500)] = ((__global\n int8_t *) mem_9948)[m_9501 +\n (int64_t) 1 +\n ", + " (squot64(sext_i32_i64(copy_gtid_9965),\n n_9500) *\n m_9501 +\n (sext_i32_i64(copy_gtid_9965) -\n squot64(sext_i32_i64(copy_gtid_9965),\n n_9500) *\n n_9500))];\n }\n \n error_0:\n return;\n}\n__kernel void next_chunk_boardzisegmap_9619(__global int *global_failure,\n int failure_is_an_option, __global\n int64_t *global_failure_args,\n int64_t n_9500, int64_t 
m_9501,\n __global\n unsigned char *chunk_board_mem_9941,\n __global\n unsigned char *envelope_board_mem_9942,\n __global unsigned char *mem_9945)\n{\n #define segmap_group_sizze_9734 (next_chunk_boardzisegmap_group_sizze_9622)\n \n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n if (*global_failure >= 0)\n return;\n \n int32_t global_tid_9955;\n int32_t local_tid_9956;\n int64_t group_sizze_9959;\n int32_t wave_sizze_9958;\n int32_t group_tid_9957;\n \n global_tid_9955 = get_global_id(0);\n local_tid_9956 = get_local_id(0);\n group_sizze_9959 = get_local_size(0);\n wave_sizze_9958 = LOCKSTEP_WIDTH;\n group_tid_99", + "57 = get_group_id(0);\n \n int32_t phys_tid_9619;\n \n phys_tid_9619 = global_tid_9955;\n \n int64_t gtid_9617;\n \n gtid_9617 = squot64(sext_i32_i64(group_tid_9957) * segmap_group_sizze_9734 +\n sext_i32_i64(local_tid_9956), m_9501);\n \n int64_t gtid_9618;\n \n gtid_9618 = sext_i32_i64(group_tid_9957) * segmap_group_sizze_9734 +\n sext_i32_i64(local_tid_9956) - squot64(sext_i32_i64(group_tid_9957) *\n segmap_group_sizze_9734 +\n sext_i32_i64(local_tid_9956),\n m_9501) * m_9501;\n if (slt64(gtid_9617, m_9501) && slt64(gtid_9618, m_9501)) {\n bool index_primexp_9904 = gtid_9617 == (int64_t) 0;\n int8_t defunc_0_f_res_9740;\n \n if (index_primexp_9904) {\n int8_t defunc_0_f_res_t_res_9745 = ((__global\n int8_t *) envelope_board_mem_9942)[gtid_9618];\n \n defunc_0_f_res_9740 = defunc_0_f_res_t_res_9745;\n } else {\n int64_t y_9746 = sub64(m_9501, (int64_t) 1);\n bool cond_9747 = gtid_9618 == y_9746;\n int8_t defunc_0_f_res_f_res_9748;\n \n if (cond_9747) {\n int8_t defunc_0_f_res_f_res_t_res_9753 = ((__global\n int8_t *) envelope_board_mem_9942)[m_9501 +\n gtid_9617];\n \n defunc_0_f_res_f_res_9748 = defunc_0_f_res_f_res_t_res_9753;\n } else {\n bool cond_9754 = gtid_9617 == y_9746;\n int8_t defunc_0_f_res_f_res_f_res_9755;\n \n if (cond_9754) {\n int8_t defunc_0_f_res_f_res_f_res_t_res_9760 = ((__global\n ", + " 
int8_t *) envelope_board_mem_9942)[(int64_t) 2 *\n m_9501 +\n gtid_9618];\n \n defunc_0_f_res_f_res_f_res_9755 =\n defunc_0_f_res_f_res_f_res_t_res_9760;\n } else {\n bool cond_9761 = gtid_9618 == (int64_t) 0;\n int8_t defunc_0_f_res_f_res_f_res_f_res_9762;\n \n if (cond_9761) {\n int8_t defunc_0_f_res_f_res_f_res_f_res_t_res_9767 =\n ((__global\n int8_t *) envelope_board_mem_9942)[(int64_t) 3 *\n m_9501 +\n gtid_9617];\n \n defunc_0_f_res_f_res_f_res_f_res_9762 =\n defunc_0_f_res_f_res_f_res_f_res_t_res_9767;\n } else {\n int64_t i_9768 = sub64(gtid_9617, (int64_t) 1);\n bool x_9769 = sle64((int64_t) 0, i_9768);\n bool y_9770 = slt64(i_9768, n_9500);\n bool bounds_check_9771 = x_9769 && y_9770;\n int64_t i_9772 = sub64(gtid_9618, (int64_t) 1);\n bool x_9773 = sle64((int64_t) 0, i_9772);\n bool y_9774 = slt64(i_9772, n_9500);\n bool bounds_check_9775 = x_9773 && y_9774;\n bool index_ok_9776 = bounds_check_9771 &&\n bounds_check_9775;\n bool index_certs_9777;\n \n if (!index_ok_9776) {\n {\n ", + " if (atomic_cmpxchg_i32_global(global_failure,\n -1, 0) == -1) {\n global_failure_args[0] = i_9768;\n global_failure_args[1] = i_9772;\n global_failure_args[2] = n_9500;\n global_failure_args[3] = n_9500;\n ;\n }\n return;\n }\n }\n \n int8_t defunc_0_f_res_f_res_f_res_f_res_f_res_9778 =\n ((__global\n int8_t *) chunk_board_mem_9941)[i_9768 *\n n_9500 +\n i_9772];\n \n defunc_0_f_res_f_res_f_res_f_res_9762 =\n defunc_0_f_res_f_res_f_res_f_res_f_res_9778;\n }\n defunc_0_f_res_f_res_f_res_9755 =\n defunc_0_f_res_f_res_f_res_f_res_9762;\n }\n defunc_0_f_res_f_res_9748 = defunc_0_f_res_f_res_f_res_9755;\n }\n defunc_0_f_res_9740 = defunc_0_f_res_f_res_9748;\n }\n ((__global int8_t *) mem_9945)[gtid_9617 * m_9501 + gtid_9618] =\n defunc_0_f_res_9740;\n }\n \n error_0:\n return;\n #undef segmap_group_sizze_9734\n}\n__kernel void next_chunk_boardzisegmap_9790(__global int *global_failure,\n int64_t m_9501, __global\n unsigned char *mem_9945, __global\n unsigned char 
*mem_9948)\n{\n #define segmap_group_sizze_9865 (next_chunk_boardzisegmap_group_sizze_9793)\n \n const int ", + "block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n if (*global_failure >= 0)\n return;\n \n int32_t global_tid_9960;\n int32_t local_tid_9961;\n int64_t group_sizze_9964;\n int32_t wave_sizze_9963;\n int32_t group_tid_9962;\n \n global_tid_9960 = get_global_id(0);\n local_tid_9961 = get_local_id(0);\n group_sizze_9964 = get_local_size(0);\n wave_sizze_9963 = LOCKSTEP_WIDTH;\n group_tid_9962 = get_group_id(0);\n \n int32_t phys_tid_9790;\n \n phys_tid_9790 = global_tid_9960;\n \n int64_t gtid_9788;\n \n gtid_9788 = squot64(sext_i32_i64(group_tid_9962) * segmap_group_sizze_9865 +\n sext_i32_i64(local_tid_9961), m_9501);\n \n int64_t gtid_9789;\n \n gtid_9789 = sext_i32_i64(group_tid_9962) * segmap_group_sizze_9865 +\n sext_i32_i64(local_tid_9961) - squot64(sext_i32_i64(group_tid_9962) *\n segmap_group_sizze_9865 +\n sext_i32_i64(local_tid_9961),\n m_9501) * m_9501;\n if (slt64(gtid_9788, m_9501) && slt64(gtid_9789, m_9501)) {\n int64_t i_p_o_9935 = add64((int64_t) -1, gtid_9788);\n int64_t rot_i_9936 = smod64(i_p_o_9935, m_9501);\n int64_t i_p_o_9937 = add64((int64_t) -1, gtid_9789);\n int64_t rot_i_9938 = smod64(i_p_o_9937, m_9501);\n int8_t x_9868 = ((__global int8_t *) mem_9945)[rot_i_9936 * m_9501 +\n rot_i_9938];\n int64_t rot_i_9934 = smod64(gtid_9789, m_9501);\n int8_t x_9869 = ((__global int8_t *) mem_9945)[rot_i_9936 * m_9501 +\n rot_i_9934];\n int64_t i_p_o_9929 = add64((int64_t) 1, gtid_9789);\n int64_t rot_i_9930 = smod64(i_p_o_9929, m_9501);\n int8_t x_9870 = ((__global int8_t *) mem_9945)[rot_i_9936 * m_9501 +\n ", + " rot_i_9930];\n int64_t rot_i_9924 = smod64(gtid_9788, m_9501);\n int8_t x_9871 = ((__global int8_t *) mem_9945)[rot_i_9924 * m_9501 +\n rot_i_9938];\n int8_t x_9872 = ((__global int8_t *) mem_9945)[rot_i_9924 * m_9501 +\n rot_i_9930];\n int64_t i_p_o_9915 = add64((int64_t) 1, gtid_9788);\n int64_t 
rot_i_9916 = smod64(i_p_o_9915, m_9501);\n int8_t x_9873 = ((__global int8_t *) mem_9945)[rot_i_9916 * m_9501 +\n rot_i_9938];\n int8_t x_9874 = ((__global int8_t *) mem_9945)[rot_i_9916 * m_9501 +\n rot_i_9934];\n int8_t x_9875 = ((__global int8_t *) mem_9945)[rot_i_9916 * m_9501 +\n rot_i_9930];\n int8_t x_9876 = ((__global int8_t *) mem_9945)[gtid_9788 * m_9501 +\n gtid_9789];\n int8_t x_9877 = add8(x_9868, x_9869);\n int8_t x_9878 = add8(x_9870, x_9877);\n int8_t x_9879 = add8(x_9871, x_9878);\n int8_t x_9880 = add8(x_9872, x_9879);\n int8_t x_9881 = add8(x_9873, x_9880);\n int8_t x_9882 = add8(x_9874, x_9881);\n int8_t defunc_2_f_res_9883 = add8(x_9875, x_9882);\n bool cond_9884 = x_9876 == (int8_t) 1;\n bool cond_9885 = defunc_2_f_res_9883 == (int8_t) 2;\n bool cond_t_res_f_res_9886 = defunc_2_f_res_9883 == (int8_t) 3;\n bool x_9887 = !cond_9885;\n bool y_9888 = cond_t_res_f_res_9886 && x_9887;\n bool cond_t_res_9889 = cond_9885 || y_9888;\n bool x_9890 = cond_9884 && cond_t_res_9889;\n bool cond_9891 = x_9876 == (int8_t) 0;\n bool x_9892 = cond_t_res_f_res_9886 && cond_9891;\n bool x_9893 = !x_9890;\n bool y_9894 = x_9892 && x_9893;\n bool cond_9895 = x_9890 || y_9894", + ";\n int8_t defunc_1_f_res_9896 = btoi_bool_i8(cond_9895);\n \n ((__global int8_t *) mem_9948)[gtid_9788 * m_9501 + gtid_9789] =\n defunc_1_f_res_9896;\n }\n \n error_0:\n return;\n #undef segmap_group_sizze_9865\n}\n", + NULL}; +static const char *size_names[] = {"get_envelope.group_size_9958", + "get_envelope.group_size_9963", + "next_chunk_board.group_size_9968", + "next_chunk_board.segmap_group_size_9622", + "next_chunk_board.segmap_group_size_9793"}; +static const char *size_vars[] = {"get_envelopezigroup_sizze_9958", + "get_envelopezigroup_sizze_9963", + "next_chunk_boardzigroup_sizze_9968", + "next_chunk_boardzisegmap_group_sizze_9622", + "next_chunk_boardzisegmap_group_sizze_9793"}; +static const char *size_classes[] = {"group_size", "group_size", "group_size", + "group_size", 
"group_size"}; +struct sizes { + int64_t get_envelopezigroup_sizze_9958; + int64_t get_envelopezigroup_sizze_9963; + int64_t next_chunk_boardzigroup_sizze_9968; + int64_t next_chunk_boardzisegmap_group_sizze_9622; + int64_t next_chunk_boardzisegmap_group_sizze_9793; +} ; +struct futhark_context_config { + struct opencl_config opencl; + int64_t sizes[5]; + int num_build_opts; + const char **build_opts; +} ; +struct futhark_context_config *futhark_context_config_new(void) +{ + struct futhark_context_config *cfg = + (struct futhark_context_config *) malloc(sizeof(struct futhark_context_config)); + + if (cfg == NULL) + return NULL; + cfg->num_build_opts = 0; + cfg->build_opts = (const char **) malloc(sizeof(const char *)); + cfg->build_opts[0] = NULL; + cfg->sizes[0] = 0; + cfg->sizes[1] = 0; + cfg->sizes[2] = 0; + cfg->sizes[3] = 0; + cfg->sizes[4] = 0; + opencl_config_init(&cfg->opencl, 5, size_names, size_vars, cfg->sizes, + size_classes); + return cfg; +} +void futhark_context_config_free(struct futhark_context_config *cfg) +{ + free(cfg->build_opts); + free(cfg); +} +void futhark_context_config_add_build_option(struct futhark_context_config *cfg, + const char *opt) +{ + cfg->build_opts[cfg->num_build_opts] = opt; + cfg->num_build_opts++; + cfg->build_opts = (const char **) realloc(cfg->build_opts, + (cfg->num_build_opts + 1) * + sizeof(const char *)); + cfg->build_opts[cfg->num_build_opts] = NULL; +} +void futhark_context_config_set_debugging(struct futhark_context_config *cfg, + int flag) +{ + cfg->opencl.profiling = cfg->opencl.logging = cfg->opencl.debugging = flag; +} +void futhark_context_config_set_profiling(struct futhark_context_config *cfg, + int flag) +{ + cfg->opencl.profiling = flag; +} +void futhark_context_config_set_logging(struct futhark_context_config *cfg, + int flag) +{ + cfg->opencl.logging = flag; +} +void futhark_context_config_set_device(struct futhark_context_config *cfg, const + char *s) +{ + set_preferred_device(&cfg->opencl, s); +} +void 
futhark_context_config_set_platform(struct futhark_context_config *cfg, + const char *s) +{ + set_preferred_platform(&cfg->opencl, s); +} +void futhark_context_config_select_device_interactively(struct futhark_context_config *cfg) +{ + select_device_interactively(&cfg->opencl); +} +void futhark_context_config_list_devices(struct futhark_context_config *cfg) +{ + (void) cfg; + list_devices(); +} +void futhark_context_config_dump_program_to(struct futhark_context_config *cfg, + const char *path) +{ + cfg->opencl.dump_program_to = path; +} +void futhark_context_config_load_program_from(struct futhark_context_config *cfg, + const char *path) +{ + cfg->opencl.load_program_from = path; +} +void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg, + const char *path) +{ + cfg->opencl.dump_binary_to = path; +} +void futhark_context_config_load_binary_from(struct futhark_context_config *cfg, + const char *path) +{ + cfg->opencl.load_binary_from = path; +} +void futhark_context_config_set_default_group_size(struct futhark_context_config *cfg, + int size) +{ + cfg->opencl.default_group_size = size; + cfg->opencl.default_group_size_changed = 1; +} +void futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg, + int num) +{ + cfg->opencl.default_num_groups = num; +} +void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, + int size) +{ + cfg->opencl.default_tile_size = size; + cfg->opencl.default_tile_size_changed = 1; +} +void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, + int size) +{ + cfg->opencl.default_reg_tile_size = size; +} +void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, + int size) +{ + cfg->opencl.default_threshold = size; +} +int futhark_context_config_set_size(struct futhark_context_config *cfg, const + char *size_name, size_t size_value) +{ + for (int i = 0; i < 5; i++) { + if (strcmp(size_name, size_names[i]) 
== 0) { + cfg->sizes[i] = size_value; + return 0; + } + } + if (strcmp(size_name, "default_group_size") == 0) { + cfg->opencl.default_group_size = size_value; + return 0; + } + if (strcmp(size_name, "default_num_groups") == 0) { + cfg->opencl.default_num_groups = size_value; + return 0; + } + if (strcmp(size_name, "default_threshold") == 0) { + cfg->opencl.default_threshold = size_value; + return 0; + } + if (strcmp(size_name, "default_tile_size") == 0) { + cfg->opencl.default_tile_size = size_value; + return 0; + } + if (strcmp(size_name, "default_reg_tile_size") == 0) { + cfg->opencl.default_reg_tile_size = size_value; + return 0; + } + return 1; +} +struct futhark_context { + int detail_memory; + int debugging; + int profiling; + int profiling_paused; + int logging; + lock_t lock; + char *error; + FILE *log; + int64_t peak_mem_usage_device; + int64_t cur_mem_usage_device; + int64_t peak_mem_usage_default; + int64_t cur_mem_usage_default; + struct { + int dummy; + } constants; + int total_runs; + long total_runtime; + cl_kernel get_envelopezicopy_9955; + cl_kernel get_envelopezicopy_9960; + cl_kernel next_chunk_boardzicopy_9965; + cl_kernel next_chunk_boardzisegmap_9619; + cl_kernel next_chunk_boardzisegmap_9790; + int64_t copy_dev_to_dev_total_runtime; + int copy_dev_to_dev_runs; + int64_t copy_dev_to_host_total_runtime; + int copy_dev_to_host_runs; + int64_t copy_host_to_dev_total_runtime; + int copy_host_to_dev_runs; + int64_t copy_scalar_to_dev_total_runtime; + int copy_scalar_to_dev_runs; + int64_t copy_scalar_from_dev_total_runtime; + int copy_scalar_from_dev_runs; + int64_t get_envelopezicopy_9955_total_runtime; + int get_envelopezicopy_9955_runs; + int64_t get_envelopezicopy_9960_total_runtime; + int get_envelopezicopy_9960_runs; + int64_t next_chunk_boardzicopy_9965_total_runtime; + int next_chunk_boardzicopy_9965_runs; + int64_t next_chunk_boardzisegmap_9619_total_runtime; + int next_chunk_boardzisegmap_9619_runs; + int64_t 
next_chunk_boardzisegmap_9790_total_runtime; + int next_chunk_boardzisegmap_9790_runs; + cl_mem global_failure; + cl_mem global_failure_args; + struct opencl_context opencl; + struct sizes sizes; + cl_int failure_is_an_option; +} ; +void post_opencl_setup(struct opencl_context *ctx, + struct opencl_device_option *option) +{ + if ((ctx->lockstep_width == 0 && strstr(option->platform_name, + "NVIDIA CUDA") != NULL) && + (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) { + ctx->lockstep_width = 32; + } + if ((ctx->lockstep_width == 0 && strstr(option->platform_name, + "AMD Accelerated Parallel Processing") != + NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == + CL_DEVICE_TYPE_GPU) { + ctx->lockstep_width = 32; + } + if ((ctx->lockstep_width == 0 && strstr(option->platform_name, "") != + NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == + CL_DEVICE_TYPE_GPU) { + ctx->lockstep_width = 1; + } + if ((ctx->cfg.default_num_groups == 0 && strstr(option->platform_name, + "") != NULL) && + (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) { + size_t MAX_COMPUTE_UNITS_val = 0; + + clGetDeviceInfo(ctx->device, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(MAX_COMPUTE_UNITS_val), &MAX_COMPUTE_UNITS_val, + NULL); + ctx->cfg.default_num_groups = 4 * MAX_COMPUTE_UNITS_val; + } + if ((ctx->cfg.default_group_size == 0 && strstr(option->platform_name, + "") != NULL) && + (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) { + ctx->cfg.default_group_size = 256; + } + if ((ctx->cfg.default_tile_size == 0 && strstr(option->platform_name, "") != + NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == + CL_DEVICE_TYPE_GPU) { + ctx->cfg.default_tile_size = 32; + } + if ((ctx->cfg.default_reg_tile_size == 0 && strstr(option->platform_name, + "") != NULL) && + (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) { + ctx->cfg.default_reg_tile_size = 2; + } + if ((ctx->cfg.default_threshold == 0 && strstr(option->platform_name, "") 
!= + NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == + CL_DEVICE_TYPE_GPU) { + ctx->cfg.default_threshold = 32768; + } + if ((ctx->lockstep_width == 0 && strstr(option->platform_name, "") != + NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == + CL_DEVICE_TYPE_CPU) { + ctx->lockstep_width = 1; + } + if ((ctx->cfg.default_num_groups == 0 && strstr(option->platform_name, + "") != NULL) && + (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) { + size_t MAX_COMPUTE_UNITS_val = 0; + + clGetDeviceInfo(ctx->device, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(MAX_COMPUTE_UNITS_val), &MAX_COMPUTE_UNITS_val, + NULL); + ctx->cfg.default_num_groups = MAX_COMPUTE_UNITS_val; + } + if ((ctx->cfg.default_group_size == 0 && strstr(option->platform_name, + "") != NULL) && + (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) { + ctx->cfg.default_group_size = 32; + } + if ((ctx->cfg.default_tile_size == 0 && strstr(option->platform_name, "") != + NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == + CL_DEVICE_TYPE_CPU) { + ctx->cfg.default_tile_size = 4; + } + if ((ctx->cfg.default_reg_tile_size == 0 && strstr(option->platform_name, + "") != NULL) && + (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) { + ctx->cfg.default_reg_tile_size = 1; + } + if ((ctx->cfg.default_threshold == 0 && strstr(option->platform_name, "") != + NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == + CL_DEVICE_TYPE_CPU) { + size_t MAX_COMPUTE_UNITS_val = 0; + + clGetDeviceInfo(ctx->device, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(MAX_COMPUTE_UNITS_val), &MAX_COMPUTE_UNITS_val, + NULL); + ctx->cfg.default_threshold = MAX_COMPUTE_UNITS_val; + } +} +static void init_context_early(struct futhark_context_config *cfg, + struct futhark_context *ctx) +{ + ctx->opencl.cfg = cfg->opencl; + ctx->detail_memory = cfg->opencl.debugging; + ctx->debugging = cfg->opencl.debugging; + ctx->profiling = cfg->opencl.profiling; + ctx->profiling_paused = 0; + ctx->logging = 
cfg->opencl.logging; + ctx->error = NULL; + ctx->log = stderr; + ctx->opencl.profiling_records_capacity = 200; + ctx->opencl.profiling_records_used = 0; + ctx->opencl.profiling_records = + malloc(ctx->opencl.profiling_records_capacity * + sizeof(struct profiling_record)); + create_lock(&ctx->lock); + ctx->failure_is_an_option = 0; + ctx->peak_mem_usage_device = 0; + ctx->cur_mem_usage_device = 0; + ctx->peak_mem_usage_default = 0; + ctx->cur_mem_usage_default = 0; + ctx->total_runs = 0; + ctx->total_runtime = 0; + ctx->copy_dev_to_dev_total_runtime = 0; + ctx->copy_dev_to_dev_runs = 0; + ctx->copy_dev_to_host_total_runtime = 0; + ctx->copy_dev_to_host_runs = 0; + ctx->copy_host_to_dev_total_runtime = 0; + ctx->copy_host_to_dev_runs = 0; + ctx->copy_scalar_to_dev_total_runtime = 0; + ctx->copy_scalar_to_dev_runs = 0; + ctx->copy_scalar_from_dev_total_runtime = 0; + ctx->copy_scalar_from_dev_runs = 0; + ctx->get_envelopezicopy_9955_total_runtime = 0; + ctx->get_envelopezicopy_9955_runs = 0; + ctx->get_envelopezicopy_9960_total_runtime = 0; + ctx->get_envelopezicopy_9960_runs = 0; + ctx->next_chunk_boardzicopy_9965_total_runtime = 0; + ctx->next_chunk_boardzicopy_9965_runs = 0; + ctx->next_chunk_boardzisegmap_9619_total_runtime = 0; + ctx->next_chunk_boardzisegmap_9619_runs = 0; + ctx->next_chunk_boardzisegmap_9790_total_runtime = 0; + ctx->next_chunk_boardzisegmap_9790_runs = 0; +} +static int init_context_late(struct futhark_context_config *cfg, + struct futhark_context *ctx, cl_program prog) +{ + cl_int error; + cl_int no_error = -1; + + ctx->global_failure = clCreateBuffer(ctx->opencl.ctx, CL_MEM_READ_WRITE | + CL_MEM_COPY_HOST_PTR, sizeof(cl_int), + &no_error, &error); + OPENCL_SUCCEED_OR_RETURN(error); + // The +1 is to avoid zero-byte allocations. 
+ ctx->global_failure_args = clCreateBuffer(ctx->opencl.ctx, + CL_MEM_READ_WRITE, + sizeof(int64_t) * (4 + 1), NULL, + &error); + OPENCL_SUCCEED_OR_RETURN(error); + { + ctx->get_envelopezicopy_9955 = clCreateKernel(prog, + "get_envelopezicopy_9955", + &error); + OPENCL_SUCCEED_FATAL(error); + if (ctx->debugging) + fprintf(ctx->log, "Created kernel %s.\n", "get_envelope.copy_9955"); + } + { + ctx->get_envelopezicopy_9960 = clCreateKernel(prog, + "get_envelopezicopy_9960", + &error); + OPENCL_SUCCEED_FATAL(error); + if (ctx->debugging) + fprintf(ctx->log, "Created kernel %s.\n", "get_envelope.copy_9960"); + } + { + ctx->next_chunk_boardzicopy_9965 = clCreateKernel(prog, + "next_chunk_boardzicopy_9965", + &error); + OPENCL_SUCCEED_FATAL(error); + if (ctx->debugging) + fprintf(ctx->log, "Created kernel %s.\n", + "next_chunk_board.copy_9965"); + } + { + ctx->next_chunk_boardzisegmap_9619 = clCreateKernel(prog, + "next_chunk_boardzisegmap_9619", + &error); + OPENCL_SUCCEED_FATAL(error); + OPENCL_SUCCEED_FATAL(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619, + 0, sizeof(cl_mem), + &ctx->global_failure)); + OPENCL_SUCCEED_FATAL(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619, + 2, sizeof(cl_mem), + &ctx->global_failure_args)); + if (ctx->debugging) + fprintf(ctx->log, "Created kernel %s.\n", + "next_chunk_board.segmap_9619"); + } + { + ctx->next_chunk_boardzisegmap_9790 = clCreateKernel(prog, + "next_chunk_boardzisegmap_9790", + &error); + OPENCL_SUCCEED_FATAL(error); + OPENCL_SUCCEED_FATAL(clSetKernelArg(ctx->next_chunk_boardzisegmap_9790, + 0, sizeof(cl_mem), + &ctx->global_failure)); + if (ctx->debugging) + fprintf(ctx->log, "Created kernel %s.\n", + "next_chunk_board.segmap_9790"); + } + ctx->sizes.get_envelopezigroup_sizze_9958 = cfg->sizes[0]; + ctx->sizes.get_envelopezigroup_sizze_9963 = cfg->sizes[1]; + ctx->sizes.next_chunk_boardzigroup_sizze_9968 = cfg->sizes[2]; + ctx->sizes.next_chunk_boardzisegmap_group_sizze_9622 = cfg->sizes[3]; + 
ctx->sizes.next_chunk_boardzisegmap_group_sizze_9793 = cfg->sizes[4]; + init_constants(ctx); + // Clear the free list of any deallocations that occurred while initialising constants. + OPENCL_SUCCEED_OR_RETURN(opencl_free_all(&ctx->opencl)); + // The program will be properly freed after all the kernels have also been freed. + OPENCL_SUCCEED_OR_RETURN(clReleaseProgram(prog)); + return futhark_context_sync(ctx); +} +struct futhark_context *futhark_context_new(struct futhark_context_config *cfg) +{ + struct futhark_context *ctx = + (struct futhark_context *) malloc(sizeof(struct futhark_context)); + + if (ctx == NULL) + return NULL; + + int required_types = 0; + + init_context_early(cfg, ctx); + + cl_program prog = setup_opencl(&ctx->opencl, opencl_program, required_types, + cfg->build_opts); + + init_context_late(cfg, ctx, prog); + return ctx; +} +struct futhark_context *futhark_context_new_with_command_queue(struct futhark_context_config *cfg, + cl_command_queue queue) +{ + struct futhark_context *ctx = + (struct futhark_context *) malloc(sizeof(struct futhark_context)); + + if (ctx == NULL) + return NULL; + + int required_types = 0; + + init_context_early(cfg, ctx); + + cl_program prog = setup_opencl_with_command_queue(&ctx->opencl, queue, + opencl_program, + required_types, + cfg->build_opts); + + init_context_late(cfg, ctx, prog); + return ctx; +} +void futhark_context_free(struct futhark_context *ctx) +{ + free_constants(ctx); + free_lock(&ctx->lock); + OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->get_envelopezicopy_9955)); + OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->get_envelopezicopy_9960)); + OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->next_chunk_boardzicopy_9965)); + OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->next_chunk_boardzisegmap_9619)); + OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->next_chunk_boardzisegmap_9790)); + teardown_opencl(&ctx->opencl); + free(ctx); +} +int futhark_context_sync(struct futhark_context *ctx) +{ + cl_int failure_idx = -1; + + if 
(ctx->failure_is_an_option) { + OPENCL_SUCCEED_OR_RETURN(clEnqueueReadBuffer(ctx->opencl.queue, + ctx->global_failure, + CL_FALSE, 0, + sizeof(cl_int), + &failure_idx, 0, NULL, + ctx->profiling_paused || + !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl, + &ctx->copy_scalar_from_dev_runs, + &ctx->copy_scalar_from_dev_total_runtime))); + ctx->failure_is_an_option = 0; + } + OPENCL_SUCCEED_OR_RETURN(clFinish(ctx->opencl.queue)); + if (failure_idx >= 0) { + cl_int no_failure = -1; + + OPENCL_SUCCEED_OR_RETURN(clEnqueueWriteBuffer(ctx->opencl.queue, + ctx->global_failure, + CL_TRUE, 0, + sizeof(cl_int), + &no_failure, 0, NULL, + NULL)); + + int64_t args[4 + 1]; + + OPENCL_SUCCEED_OR_RETURN(clEnqueueReadBuffer(ctx->opencl.queue, + ctx->global_failure_args, + CL_TRUE, 0, sizeof(args), + &args, 0, NULL, + ctx->profiling_paused || + !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl, + &ctx->copy_dev_to_host_runs, + &ctx->copy_dev_to_host_total_runtime))); + switch (failure_idx) { + + case 0: + { + ctx->error = + msgprintf("Index [%lld, %lld] out of bounds for array of shape [%lld][%lld].\n-> #0 gol.fut:27:36-55\n #1 /prelude/soacs.fut:59:3-10\n #2 /prelude/array.fut:195:3-17\n #3 /prelude/functional.fut:39:59-65\n #4 /prelude/soacs.fut:59:3-10\n #5 /prelude/array.fut:203:3-34\n #6 gol.fut:18:5-27:56\n #7 gol.fut:31:27-66\n #8 gol.fut:30:1-40:43\n", + args[0], args[1], args[2], args[3]); + break; + } + } + return 1; + } + return 0; +} +cl_command_queue futhark_context_get_command_queue(struct futhark_context *ctx) +{ + return ctx->opencl.queue; +} +static int memblock_unref_device(struct futhark_context *ctx, + struct memblock_device *block, const + char *desc) +{ + if (block->references != NULL) { + *block->references -= 1; + if (ctx->detail_memory) + fprintf(ctx->log, + "Unreferencing block %s (allocated as %s) in %s: %d references remaining.\n", + desc, block->desc, "space 'device'", *block->references); + if (*block->references == 0) { + 
ctx->cur_mem_usage_device -= block->size; + OPENCL_SUCCEED_OR_RETURN(opencl_free(&ctx->opencl, block->mem, + desc)); + free(block->references); + if (ctx->detail_memory) + fprintf(ctx->log, + "%lld bytes freed (now allocated: %lld bytes)\n", + (long long) block->size, + (long long) ctx->cur_mem_usage_device); + } + block->references = NULL; + } + return 0; +} +static int memblock_alloc_device(struct futhark_context *ctx, + struct memblock_device *block, int64_t size, + const char *desc) +{ + if (size < 0) + futhark_panic(1, + "Negative allocation of %lld bytes attempted for %s in %s.\n", + (long long) size, desc, "space 'device'", + ctx->cur_mem_usage_device); + + int ret = memblock_unref_device(ctx, block, desc); + + ctx->cur_mem_usage_device += size; + if (ctx->detail_memory) + fprintf(ctx->log, + "Allocating %lld bytes for %s in %s (then allocated: %lld bytes)", + (long long) size, desc, "space 'device'", + (long long) ctx->cur_mem_usage_device); + if (ctx->cur_mem_usage_device > ctx->peak_mem_usage_device) { + ctx->peak_mem_usage_device = ctx->cur_mem_usage_device; + if (ctx->detail_memory) + fprintf(ctx->log, " (new peak).\n"); + } else if (ctx->detail_memory) + fprintf(ctx->log, ".\n"); + OPENCL_SUCCEED_OR_RETURN(opencl_alloc(&ctx->opencl, size, desc, + &block->mem)); + block->references = (int *) malloc(sizeof(int)); + *block->references = 1; + block->size = size; + block->desc = desc; + return ret; +} +static int memblock_set_device(struct futhark_context *ctx, + struct memblock_device *lhs, + struct memblock_device *rhs, const + char *lhs_desc) +{ + int ret = memblock_unref_device(ctx, lhs, lhs_desc); + + if (rhs->references != NULL) + (*rhs->references)++; + *lhs = *rhs; + return ret; +} +static int memblock_unref(struct futhark_context *ctx, struct memblock *block, + const char *desc) +{ + if (block->references != NULL) { + *block->references -= 1; + if (ctx->detail_memory) + fprintf(ctx->log, + "Unreferencing block %s (allocated as %s) in %s: %d 
references remaining.\n", + desc, block->desc, "default space", *block->references); + if (*block->references == 0) { + ctx->cur_mem_usage_default -= block->size; + free(block->mem); + free(block->references); + if (ctx->detail_memory) + fprintf(ctx->log, + "%lld bytes freed (now allocated: %lld bytes)\n", + (long long) block->size, + (long long) ctx->cur_mem_usage_default); + } + block->references = NULL; + } + return 0; +} +static int memblock_alloc(struct futhark_context *ctx, struct memblock *block, + int64_t size, const char *desc) +{ + if (size < 0) + futhark_panic(1, + "Negative allocation of %lld bytes attempted for %s in %s.\n", + (long long) size, desc, "default space", + ctx->cur_mem_usage_default); + + int ret = memblock_unref(ctx, block, desc); + + ctx->cur_mem_usage_default += size; + if (ctx->detail_memory) + fprintf(ctx->log, + "Allocating %lld bytes for %s in %s (then allocated: %lld bytes)", + (long long) size, desc, "default space", + (long long) ctx->cur_mem_usage_default); + if (ctx->cur_mem_usage_default > ctx->peak_mem_usage_default) { + ctx->peak_mem_usage_default = ctx->cur_mem_usage_default; + if (ctx->detail_memory) + fprintf(ctx->log, " (new peak).\n"); + } else if (ctx->detail_memory) + fprintf(ctx->log, ".\n"); + block->mem = (char *) malloc(size); + block->references = (int *) malloc(sizeof(int)); + *block->references = 1; + block->size = size; + block->desc = desc; + return ret; +} +static int memblock_set(struct futhark_context *ctx, struct memblock *lhs, + struct memblock *rhs, const char *lhs_desc) +{ + int ret = memblock_unref(ctx, lhs, lhs_desc); + + if (rhs->references != NULL) + (*rhs->references)++; + *lhs = *rhs; + return ret; +} +int futhark_get_num_sizes(void) +{ + return sizeof(size_names) / sizeof(size_names[0]); +} +const char *futhark_get_size_name(int i) +{ + return size_names[i]; +} +const char *futhark_get_size_class(int i) +{ + return size_classes[i]; +} +char *futhark_context_report(struct futhark_context *ctx) +{ 
    /* Body of futhark_context_report: synchronise first; no report is
     * produced (NULL is returned) when synchronisation fails. */
    if (futhark_context_sync(ctx) != 0)
        return NULL;

    struct str_builder builder;

    str_builder_init(&builder);
    if (ctx->detail_memory || ctx->profiling || ctx->logging) {
        str_builder(&builder,
                    "Peak memory usage for space 'device': %lld bytes.\n",
                    (long long) ctx->peak_mem_usage_device);
        { }
    }
    if (ctx->profiling) {
        OPENCL_SUCCEED_FATAL(opencl_tally_profiling_records(&ctx->opencl));
        /* One line per operation: run count, average and total runtime.  The
         * divisor is forced to 1 when an operation never ran.  Every
         * operation is also added to ctx->total_runtime / ctx->total_runs,
         * which are not reset in this function, so they keep growing across
         * repeated calls. */
        str_builder(&builder,
                    "copy_dev_to_dev ran %5d times; avg: %8ldus; total: %8ldus\n",
                    ctx->copy_dev_to_dev_runs,
                    (long) ctx->copy_dev_to_dev_total_runtime /
                    (ctx->copy_dev_to_dev_runs !=
                     0 ? ctx->copy_dev_to_dev_runs : 1),
                    (long) ctx->copy_dev_to_dev_total_runtime);
        ctx->total_runtime += ctx->copy_dev_to_dev_total_runtime;
        ctx->total_runs += ctx->copy_dev_to_dev_runs;
        str_builder(&builder,
                    "copy_dev_to_host ran %5d times; avg: %8ldus; total: %8ldus\n",
                    ctx->copy_dev_to_host_runs,
                    (long) ctx->copy_dev_to_host_total_runtime /
                    (ctx->copy_dev_to_host_runs !=
                     0 ? ctx->copy_dev_to_host_runs : 1),
                    (long) ctx->copy_dev_to_host_total_runtime);
        ctx->total_runtime += ctx->copy_dev_to_host_total_runtime;
        ctx->total_runs += ctx->copy_dev_to_host_runs;
        str_builder(&builder,
                    "copy_host_to_dev ran %5d times; avg: %8ldus; total: %8ldus\n",
                    ctx->copy_host_to_dev_runs,
                    (long) ctx->copy_host_to_dev_total_runtime /
                    (ctx->copy_host_to_dev_runs !=
                     0 ? ctx->copy_host_to_dev_runs : 1),
                    (long) ctx->copy_host_to_dev_total_runtime);
        ctx->total_runtime += ctx->copy_host_to_dev_total_runtime;
        ctx->total_runs += ctx->copy_host_to_dev_runs;
        str_builder(&builder,
                    "copy_scalar_to_dev ran %5d times; avg: %8ldus; total: %8ldus\n",
                    ctx->copy_scalar_to_dev_runs,
                    (long) ctx->copy_scalar_to_dev_total_runtime /
                    (ctx->copy_scalar_to_dev_runs !=
                     0 ? ctx->copy_scalar_to_dev_runs : 1),
                    (long) ctx->copy_scalar_to_dev_total_runtime);
        ctx->total_runtime += ctx->copy_scalar_to_dev_total_runtime;
        ctx->total_runs += ctx->copy_scalar_to_dev_runs;
        str_builder(&builder,
                    "copy_scalar_from_dev ran %5d times; avg: %8ldus; total: %8ldus\n",
                    ctx->copy_scalar_from_dev_runs,
                    (long) ctx->copy_scalar_from_dev_total_runtime /
                    (ctx->copy_scalar_from_dev_runs !=
                     0 ? ctx->copy_scalar_from_dev_runs : 1),
                    (long) ctx->copy_scalar_from_dev_total_runtime);
        ctx->total_runtime += ctx->copy_scalar_from_dev_total_runtime;
        ctx->total_runs += ctx->copy_scalar_from_dev_runs;
        str_builder(&builder,
                    "get_envelope.copy_9955 ran %5d times; avg: %8ldus; total: %8ldus\n",
                    ctx->get_envelopezicopy_9955_runs,
                    (long) ctx->get_envelopezicopy_9955_total_runtime /
                    (ctx->get_envelopezicopy_9955_runs !=
                     0 ? ctx->get_envelopezicopy_9955_runs : 1),
                    (long) ctx->get_envelopezicopy_9955_total_runtime);
        ctx->total_runtime += ctx->get_envelopezicopy_9955_total_runtime;
        ctx->total_runs += ctx->get_envelopezicopy_9955_runs;
        str_builder(&builder,
                    "get_envelope.copy_9960 ran %5d times; avg: %8ldus; total: %8ldus\n",
                    ctx->get_envelopezicopy_9960_runs,
                    (long) ctx->get_envelopezicopy_9960_total_runtime /
                    (ctx->get_envelopezicopy_9960_runs !=
                     0 ? ctx->get_envelopezicopy_9960_runs : 1),
                    (long) ctx->get_envelopezicopy_9960_total_runtime);
        ctx->total_runtime += ctx->get_envelopezicopy_9960_total_runtime;
        ctx->total_runs += ctx->get_envelopezicopy_9960_runs;
        str_builder(&builder,
                    "next_chunk_board.copy_9965 ran %5d times; avg: %8ldus; total: %8ldus\n",
                    ctx->next_chunk_boardzicopy_9965_runs,
                    (long) ctx->next_chunk_boardzicopy_9965_total_runtime /
                    (ctx->next_chunk_boardzicopy_9965_runs !=
                     0 ? ctx->next_chunk_boardzicopy_9965_runs : 1),
                    (long) ctx->next_chunk_boardzicopy_9965_total_runtime);
        ctx->total_runtime += ctx->next_chunk_boardzicopy_9965_total_runtime;
        ctx->total_runs += ctx->next_chunk_boardzicopy_9965_runs;
        str_builder(&builder,
                    "next_chunk_board.segmap_9619 ran %5d times; avg: %8ldus; total: %8ldus\n",
                    ctx->next_chunk_boardzisegmap_9619_runs,
                    (long) ctx->next_chunk_boardzisegmap_9619_total_runtime /
                    (ctx->next_chunk_boardzisegmap_9619_runs !=
                     0 ? ctx->next_chunk_boardzisegmap_9619_runs : 1),
                    (long) ctx->next_chunk_boardzisegmap_9619_total_runtime);
        ctx->total_runtime += ctx->next_chunk_boardzisegmap_9619_total_runtime;
        ctx->total_runs += ctx->next_chunk_boardzisegmap_9619_runs;
        str_builder(&builder,
                    "next_chunk_board.segmap_9790 ran %5d times; avg: %8ldus; total: %8ldus\n",
                    ctx->next_chunk_boardzisegmap_9790_runs,
                    (long) ctx->next_chunk_boardzisegmap_9790_total_runtime /
                    (ctx->next_chunk_boardzisegmap_9790_runs !=
                     0 ? ctx->next_chunk_boardzisegmap_9790_runs : 1),
                    (long) ctx->next_chunk_boardzisegmap_9790_total_runtime);
        ctx->total_runtime += ctx->next_chunk_boardzisegmap_9790_total_runtime;
        ctx->total_runs += ctx->next_chunk_boardzisegmap_9790_runs;
        str_builder(&builder, "%d operations with cumulative runtime: %6ldus\n",
                    ctx->total_runs, ctx->total_runtime);
    }
    /* NOTE(review): presumably the caller takes ownership of the built
     * string - confirm against str_builder. */
    return builder.str;
}

/* Return the pending error message and clear it from the context, so a second
 * call returns NULL until a new error is recorded.
 * NOTE(review): presumably the caller must free the returned string - confirm. */
char *futhark_context_get_error(struct futhark_context *ctx)
{
    char *error = ctx->error;

    ctx->error = NULL;
    return error;
}

/* Redirect the context's log output to `f`. */
void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f)
{
    ctx->log = f;
}

/* Set the profiling-paused flag; while it is set, operations in this file
 * pass NULL instead of a profiling event. */
void futhark_context_pause_profiling(struct futhark_context *ctx)
{
    ctx->profiling_paused = 1;
}

/* Clear the profiling-paused flag. */
void futhark_context_unpause_profiling(struct futhark_context *ctx)
{
    ctx->profiling_paused = 0;
}

/* Reset the peak-memory statistics and, if no error is pending, ask the
 * OpenCL layer to free everything it is holding.  Runs under the context
 * lock. */
int futhark_context_clear_caches(struct futhark_context *ctx)
{
    lock_lock(&ctx->lock);
    ctx->peak_mem_usage_device = 0;
    ctx->peak_mem_usage_default = 0;
    if (ctx->error ==
NULL)
        ctx->error = OPENCL_SUCCEED_NONFATAL(opencl_free_all(&ctx->opencl));
    lock_unlock(&ctx->lock);
    return ctx->error != NULL;
}
/* Forward declarations of the two compiled entry-point bodies. */
static int futrts_get_envelope(struct futhark_context *ctx,
                               struct memblock_device *out_mem_p_9970,
                               struct memblock_device chunk_board_mem_9941,
                               int64_t n_9485);
static int futrts_next_chunk_board(struct futhark_context *ctx,
                                   struct memblock_device *out_mem_p_9981,
                                   struct memblock_device chunk_board_mem_9941,
                                   struct memblock_device envelope_board_mem_9942,
                                   int64_t n_9500, int64_t m_9501);
/* This program has no constants to set up; always returns 0. */
static int init_constants(struct futhark_context *ctx)
{
    (void) ctx;

    int err = 0;


    cleanup:
    return err;
}
/* Nothing to release; always returns 0. */
static int free_constants(struct futhark_context *ctx)
{
    (void) ctx;
    return 0;
}
/*
 * Build a 4 * n_9485 byte device buffer from the chunk board and hand it to
 * *out_mem_p_9970.
 *
 * The buffer is filled by two buffer-to-buffer copies of n_9485 bytes each
 * and by the kernels get_envelopezicopy_9955 and get_envelopezicopy_9960.
 * NOTE(review): which regions the two kernels write is not visible here;
 * presumably they fill the remaining two n_9485-byte segments - confirm
 * against the kernel source.
 *
 * Returns 0 on success.  Returns 1 with ctx->error set when index 0 or index
 * n_9485 - 1 is out of bounds for a dimension of size n_9485.
 */
static int futrts_get_envelope(struct futhark_context *ctx,
                               struct memblock_device *out_mem_p_9970,
                               struct memblock_device chunk_board_mem_9941,
                               int64_t n_9485)
{
    (void) ctx;

    int err = 0;
    struct memblock_device out_mem_9954;

    out_mem_9954.references = NULL;

    /* Bounds check for index 0: fails when n_9485 <= 0. */
    bool y_9487 = slt64((int64_t) 0, n_9485);
    bool index_certs_9488;

    if (!y_9487) {
        ctx->error = msgprintf("Error: %s%lld%s%lld%s\n\nBacktrace:\n%s",
                               "Index [", (int64_t) 0,
                               "] out of bounds for array of shape [", n_9485,
                               "].",
                               "-> #0 gol.fut:43:17-30\n #1 gol.fut:42:1-48:33\n");
        if (memblock_unref_device(ctx, &out_mem_9954, "out_mem_9954") != 0)
            return 1;
        return 1;
    }

    /* Bounds check for the last index, n_9485 - 1. */
    int64_t i_9490 = sub64(n_9485, (int64_t) 1);
    bool x_9491 = sle64((int64_t) 0, i_9490);
    bool y_9492 = slt64(i_9490, n_9485);
    bool bounds_check_9493 = x_9491 && y_9492;
    bool index_certs_9494;

    if (!bounds_check_9493) {
        ctx->error = msgprintf("Error: %s%lld%s%lld%s\n\nBacktrace:\n%s",
                               "Index [", i_9490,
                               "] out of bounds for array of shape [", n_9485,
                               "].",
                               "-> #0 gol.fut:44:17-32\n #1 gol.fut:42:1-48:33\n");
        if (memblock_unref_device(ctx, &out_mem_9954, "out_mem_9954") != 0)
            return 1;
        return 1;
    }

    int64_t bytes_9942 = (int64_t) 4 * n_9485;
    struct memblock_device mem_9943;

    mem_9943.references = NULL;
    if (memblock_alloc_device(ctx, &mem_9943, bytes_9942, "mem_9943")) {
        err = 1;
        goto cleanup;
    }
    /* Copy n_9485 bytes from offset 0 of the chunk board to offset 0 of the
     * result. */
    if (n_9485 > 0) {
        OPENCL_SUCCEED_OR_RETURN(clEnqueueCopyBuffer(ctx->opencl.queue,
                                                     chunk_board_mem_9941.mem,
                                                     mem_9943.mem, (int64_t) 0,
                                                     (int64_t) 0, n_9485, 0,
                                                     NULL,
                                                     ctx->profiling_paused ||
                                                     !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
                                                                                               &ctx->copy_dev_to_dev_runs,
                                                                                               &ctx->copy_dev_to_dev_total_runtime)));
        if (ctx->debugging)
            OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue));
    }

    int64_t group_sizze_9958;

    group_sizze_9958 = ctx->sizes.get_envelopezigroup_sizze_9958;

    int64_t num_groups_9959;

    num_groups_9959 = sdiv_up64(n_9485, group_sizze_9958);
    /* Kernel arguments: n, last index, source board, destination buffer. */
    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->get_envelopezicopy_9955, 0,
                                            sizeof(n_9485), &n_9485));
    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->get_envelopezicopy_9955, 1,
                                            sizeof(i_9490), &i_9490));
    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->get_envelopezicopy_9955, 2,
                                            sizeof(chunk_board_mem_9941.mem),
                                            &chunk_board_mem_9941.mem));
    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->get_envelopezicopy_9955, 3,
                                            sizeof(mem_9943.mem),
                                            &mem_9943.mem));
    /* The launch is skipped when the global work size would be zero. */
    if (1 * ((size_t) num_groups_9959 * (size_t) group_sizze_9958) != 0) {
        const size_t global_work_sizze_9971[1] = {(size_t) num_groups_9959 *
                                                  (size_t) group_sizze_9958};
        const size_t local_work_sizze_9975[1] = {group_sizze_9958};
        int64_t time_start_9972 = 0, time_end_9973 = 0;

        if (ctx->debugging) {
            fprintf(ctx->log, "Launching %s with global work size [",
                    "get_envelope.copy_9955");
            fprintf(ctx->log, "%zu", global_work_sizze_9971[0]);
            fprintf(ctx->log, "] and local work size [");
            fprintf(ctx->log, "%zu", local_work_sizze_9975[0]);
            fprintf(ctx->log, "]; local memory parameters sum to %d bytes.\n",
                    (int) 0);
            time_start_9972 = get_wall_time();
        }
        OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->opencl.queue,
                                                        ctx->get_envelopezicopy_9955,
                                                        1, NULL,
                                                        global_work_sizze_9971,
                                                        local_work_sizze_9975,
                                                        0, NULL,
                                                        ctx->profiling_paused ||
                                                        !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
                                                                                                  &ctx->get_envelopezicopy_9955_runs,
                                                                                                  &ctx->get_envelopezicopy_9955_total_runtime)));
        /* In debugging mode the queue is drained so that wall-clock time
         * covers the kernel itself. */
        if (ctx->debugging) {
            OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue));
            time_end_9973 = get_wall_time();

            long time_diff_9974 = time_end_9973 - time_start_9972;

            fprintf(ctx->log, "kernel %s runtime: %ldus\n",
                    "get_envelope.copy_9955", time_diff_9974);
        }
    }
    /* Copy n_9485 bytes from source offset (n - 1) * n to destination offset
     * 2 * n. */
    if (n_9485 > 0) {
        OPENCL_SUCCEED_OR_RETURN(clEnqueueCopyBuffer(ctx->opencl.queue,
                                                     chunk_board_mem_9941.mem,
                                                     mem_9943.mem, i_9490 *
                                                     n_9485, (int64_t) 2 *
                                                     n_9485, n_9485, 0, NULL,
                                                     ctx->profiling_paused ||
                                                     !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
                                                                                               &ctx->copy_dev_to_dev_runs,
                                                                                               &ctx->copy_dev_to_dev_total_runtime)));
        if (ctx->debugging)
            OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue));
    }

    int64_t group_sizze_9963;

    group_sizze_9963 = ctx->sizes.get_envelopezigroup_sizze_9963;

    int64_t num_groups_9964;

    num_groups_9964 = sdiv_up64(n_9485, group_sizze_9963);
    /* Kernel arguments: n, source board, destination buffer. */
    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->get_envelopezicopy_9960, 0,
                                            sizeof(n_9485), &n_9485));
    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->get_envelopezicopy_9960, 1,
                                            sizeof(chunk_board_mem_9941.mem),
                                            &chunk_board_mem_9941.mem));
    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->get_envelopezicopy_9960, 2,
                                            sizeof(mem_9943.mem),
                                            &mem_9943.mem));
    if (1 * ((size_t) num_groups_9964 * (size_t) group_sizze_9963) != 0) {
        const size_t global_work_sizze_9976[1] = {(size_t) num_groups_9964 *
                                                  (size_t) group_sizze_9963};
        const size_t local_work_sizze_9980[1] = {group_sizze_9963};
        int64_t time_start_9977 = 0, time_end_9978 = 0;

        if (ctx->debugging) {
            fprintf(ctx->log, "Launching %s with global work size [",
                    "get_envelope.copy_9960");
            fprintf(ctx->log, "%zu", global_work_sizze_9976[0]);
            fprintf(ctx->log, "] and local work size [");
            fprintf(ctx->log, "%zu", local_work_sizze_9980[0]);
            fprintf(ctx->log, "]; local memory parameters sum to %d bytes.\n",
                    (int) 0);
            time_start_9977 = get_wall_time();
        }
        OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->opencl.queue,
                                                        ctx->get_envelopezicopy_9960,
                                                        1, NULL,
                                                        global_work_sizze_9976,
                                                        local_work_sizze_9980,
                                                        0, NULL,
                                                        ctx->profiling_paused ||
                                                        !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl,
                                                                                                  &ctx->get_envelopezicopy_9960_runs,
                                                                                                  &ctx->get_envelopezicopy_9960_total_runtime)));
        if (ctx->debugging) {
            OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue));
            time_end_9978 = get_wall_time();

            long time_diff_9979 = time_end_9978 - time_start_9977;

            fprintf(ctx->log, "kernel %s runtime: %ldus\n",
                    "get_envelope.copy_9960", time_diff_9979);
        }
    }
    /* Hand the buffer to the caller: out_mem_9954 and then *out_mem_p_9970
     * each take a reference, after which the two local references are
     * dropped. */
    if (memblock_set_device(ctx, &out_mem_9954, &mem_9943, "mem_9943") != 0)
        return 1;
    (*out_mem_p_9970).references = NULL;
    if (memblock_set_device(ctx, &*out_mem_p_9970, &out_mem_9954,
                            "out_mem_9954") != 0)
        return 1;
    if (memblock_unref_device(ctx, &mem_9943, "mem_9943") != 0)
        return 1;
    if (memblock_unref_device(ctx, &out_mem_9954, "out_mem_9954") != 0)
        return 1;

    cleanup:
    { }
    return err;
}
static int futrts_next_chunk_board(struct futhark_context *ctx,
                                   struct memblock_device *out_mem_p_9981,
                                   struct memblock_device chunk_board_mem_9941,
                                   struct memblock_device envelope_board_mem_9942,
                                   int64_t n_9500, int64_t m_9501)
{
    (void) ctx;

    int err = 0;
    struct memblock_device out_mem_9954;

    out_mem_9954.references = NULL;

    int64_t nest_sizze_9733 = m_9501 * m_9501;
    int64_t segmap_group_sizze_9734;

    segmap_group_sizze_9734 =
        ctx->sizes.next_chunk_boardzisegmap_group_sizze_9622;

    int64_t segmap_usable_groups_9735 = sdiv_up64(nest_sizze_9733,
                                                  segmap_group_sizze_9734);
    struct memblock_device mem_9945;

    mem_9945.references = NULL;
    if (memblock_alloc_device(ctx, &mem_9945,
nest_sizze_9733, "mem_9945")) { + err = 1; + goto cleanup; + } + if (ctx->debugging) + fprintf(ctx->log, "%s\n", "\n# SegMap"); /* first SegMap kernel (segmap_9619): args 5, 6 and 7 are the chunk buffer, the envelope buffer and mem_9945; args 0 and 2 are not set in this function */ + OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619, + 1, + sizeof(ctx->failure_is_an_option), + &ctx->failure_is_an_option)); + OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619, + 3, sizeof(n_9500), &n_9500)); + OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619, + 4, sizeof(m_9501), &m_9501)); + OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619, + 5, sizeof(chunk_board_mem_9941.mem), + &chunk_board_mem_9941.mem)); + OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619, + 6, + sizeof(envelope_board_mem_9942.mem), + &envelope_board_mem_9942.mem)); + OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9619, + 7, sizeof(mem_9945.mem), + &mem_9945.mem)); + if (1 * ((size_t) segmap_usable_groups_9735 * + (size_t) segmap_group_sizze_9734) != 0) { + const size_t global_work_sizze_9982[1] = + {(size_t) segmap_usable_groups_9735 * + (size_t) segmap_group_sizze_9734}; + const size_t local_work_sizze_9986[1] = {segmap_group_sizze_9734}; + int64_t time_start_9983 = 0, time_end_9984 = 0; + + if (ctx->debugging) { + fprintf(ctx->log, "Launching %s with global work size [", + "next_chunk_board.segmap_9619"); + fprintf(ctx->log, "%zu", global_work_sizze_9982[0]); + fprintf(ctx->log, "] and local work size ["); + fprintf(ctx->log, "%zu", local_work_sizze_9986[0]); + fprintf(ctx->log, "]; local memory parameters sum to %d bytes.\n", + (int) 0); + time_start_9983 = get_wall_time(); + } + OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->opencl.queue, + ctx->next_chunk_boardzisegmap_9619, + 1, NULL, + global_work_sizze_9982, + local_work_sizze_9986, + 0, NULL, + ctx->profiling_paused || + !ctx->profiling ?
NULL : opencl_get_event(&ctx->opencl, + &ctx->next_chunk_boardzisegmap_9619_runs, + &ctx->next_chunk_boardzisegmap_9619_total_runtime))); + if (ctx->debugging) { + OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue)); + time_end_9984 = get_wall_time(); + + long time_diff_9985 = time_end_9984 - time_start_9983; + + fprintf(ctx->log, "kernel %s runtime: %ldus\n", + "next_chunk_board.segmap_9619", time_diff_9985); + } + } + ctx->failure_is_an_option = 1; /* this flag is what makes futhark_values_i8_2d use a non-blocking read followed by futhark_context_sync */ + + int64_t segmap_group_sizze_9865; + + segmap_group_sizze_9865 = + ctx->sizes.next_chunk_boardzisegmap_group_sizze_9793; + + int64_t segmap_usable_groups_9866 = sdiv_up64(nest_sizze_9733, + segmap_group_sizze_9865); + struct memblock_device mem_9948; + + mem_9948.references = NULL; + if (memblock_alloc_device(ctx, &mem_9948, nest_sizze_9733, "mem_9948")) { + err = 1; + goto cleanup; + } + if (ctx->debugging) + fprintf(ctx->log, "%s\n", "\n# SegMap"); /* second SegMap kernel (segmap_9790): args 2 and 3 are mem_9945 and mem_9948 */ + OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9790, + 1, sizeof(m_9501), &m_9501)); + OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9790, + 2, sizeof(mem_9945.mem), + &mem_9945.mem)); + OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzisegmap_9790, + 3, sizeof(mem_9948.mem), + &mem_9948.mem)); + if (1 * ((size_t) segmap_usable_groups_9866 * + (size_t) segmap_group_sizze_9865) != 0) { + const size_t global_work_sizze_9987[1] = + {(size_t) segmap_usable_groups_9866 * + (size_t) segmap_group_sizze_9865}; + const size_t local_work_sizze_9991[1] = {segmap_group_sizze_9865}; + int64_t time_start_9988 = 0, time_end_9989 = 0; + + if (ctx->debugging) { + fprintf(ctx->log, "Launching %s with global work size [", + "next_chunk_board.segmap_9790"); + fprintf(ctx->log, "%zu", global_work_sizze_9987[0]); + fprintf(ctx->log, "] and local work size ["); + fprintf(ctx->log, "%zu", local_work_sizze_9991[0]); + fprintf(ctx->log, "]; local memory parameters sum to %d bytes.\n", + (int) 0); + time_start_9988 = get_wall_time(); + }
+ OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->opencl.queue, + ctx->next_chunk_boardzisegmap_9790, + 1, NULL, + global_work_sizze_9987, + local_work_sizze_9991, + 0, NULL, + ctx->profiling_paused || + !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl, + &ctx->next_chunk_boardzisegmap_9790_runs, + &ctx->next_chunk_boardzisegmap_9790_total_runtime))); + if (ctx->debugging) { + OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue)); + time_end_9989 = get_wall_time(); + + long time_diff_9990 = time_end_9989 - time_start_9988; + + fprintf(ctx->log, "kernel %s runtime: %ldus\n", + "next_chunk_board.segmap_9790", time_diff_9990); + } + } + if (memblock_unref_device(ctx, &mem_9945, "mem_9945") != 0) + return 1; + + int64_t j_9597 = add64((int64_t) 1, n_9500); + bool empty_slice_9598 = n_9500 == (int64_t) 0; + bool zzero_leq_i_p_m_t_s_9599 = sle64((int64_t) 0, n_9500); + bool i_p_m_t_s_leq_w_9600 = slt64(n_9500, m_9501); + bool i_lte_j_9601 = sle64((int64_t) 1, j_9597); + bool y_9602 = zzero_leq_i_p_m_t_s_9599 && i_p_m_t_s_leq_w_9600; + bool y_9603 = i_lte_j_9601 && y_9602; + bool ok_or_empty_9604 = empty_slice_9598 || y_9603; + bool index_ok_9605 = ok_or_empty_9604 && ok_or_empty_9604; + bool index_certs_9606; + + if (!index_ok_9605) { /* the slice [1:j, 1:j] does not fit in [m][m]: report it with the gol.fut backtrace and fail */ + ctx->error = + msgprintf("Error: %s%lld%s%lld%s%lld%s%lld%s%lld%s%lld%s\n\nBacktrace:\n%s", + "Index [", (int64_t) 1, ":", j_9597, ", ", (int64_t) 1, + ":", j_9597, "] out of bounds for array of shape [", + m_9501, "][", m_9501, "].", + "-> #0 gol.fut:40:8-31\n #1 gol.fut:30:1-40:43\n"); + if (memblock_unref_device(ctx, &mem_9948, "mem_9948") != 0) + return 1; + if (memblock_unref_device(ctx, &mem_9945, "mem_9945") != 0) + return 1; + if (memblock_unref_device(ctx, &out_mem_9954, "out_mem_9954") != 0) + return 1; + return 1; + } + + int64_t bytes_9949 = n_9500 * n_9500; + struct memblock_device mem_9950; + + mem_9950.references = NULL; + if (memblock_alloc_device(ctx, &mem_9950, bytes_9949, "mem_9950")) { + err = 1; + goto cleanup;
+ } + + int64_t group_sizze_9968; /* work-group size for copy kernel 9965 (args: n, m, mem_9948, mem_9950) */ + + group_sizze_9968 = ctx->sizes.next_chunk_boardzigroup_sizze_9968; + + int64_t num_groups_9969; + + num_groups_9969 = sdiv_up64(n_9500 * n_9500, group_sizze_9968); + OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzicopy_9965, 0, + sizeof(n_9500), &n_9500)); + OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzicopy_9965, 1, + sizeof(m_9501), &m_9501)); + OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzicopy_9965, 2, + sizeof(mem_9948.mem), + &mem_9948.mem)); + OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->next_chunk_boardzicopy_9965, 3, + sizeof(mem_9950.mem), + &mem_9950.mem)); + if (1 * ((size_t) num_groups_9969 * (size_t) group_sizze_9968) != 0) { + const size_t global_work_sizze_9992[1] = {(size_t) num_groups_9969 * + (size_t) group_sizze_9968}; + const size_t local_work_sizze_9996[1] = {group_sizze_9968}; + int64_t time_start_9993 = 0, time_end_9994 = 0; + + if (ctx->debugging) { + fprintf(ctx->log, "Launching %s with global work size [", + "next_chunk_board.copy_9965"); + fprintf(ctx->log, "%zu", global_work_sizze_9992[0]); + fprintf(ctx->log, "] and local work size ["); + fprintf(ctx->log, "%zu", local_work_sizze_9996[0]); + fprintf(ctx->log, "]; local memory parameters sum to %d bytes.\n", + (int) 0); + time_start_9993 = get_wall_time(); + } + OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->opencl.queue, + ctx->next_chunk_boardzicopy_9965, + 1, NULL, + global_work_sizze_9992, + local_work_sizze_9996, + 0, NULL, + ctx->profiling_paused || + !ctx->profiling ?
NULL : opencl_get_event(&ctx->opencl, + &ctx->next_chunk_boardzicopy_9965_runs, + &ctx->next_chunk_boardzicopy_9965_total_runtime))); + if (ctx->debugging) { + OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue)); + time_end_9994 = get_wall_time(); + + long time_diff_9995 = time_end_9994 - time_start_9993; + + fprintf(ctx->log, "kernel %s runtime: %ldus\n", + "next_chunk_board.copy_9965", time_diff_9995); + } + } + if (memblock_unref_device(ctx, &mem_9948, "mem_9948") != 0) + return 1; + if (memblock_set_device(ctx, &out_mem_9954, &mem_9950, "mem_9950") != 0) + return 1; + (*out_mem_p_9981).references = NULL; + if (memblock_set_device(ctx, &*out_mem_p_9981, &out_mem_9954, + "out_mem_9954") != 0) + return 1; + if (memblock_unref_device(ctx, &mem_9950, "mem_9950") != 0) + return 1; + if (memblock_unref_device(ctx, &mem_9948, "mem_9948") != 0) + return 1; + if (memblock_unref_device(ctx, &mem_9945, "mem_9945") != 0) + return 1; + if (memblock_unref_device(ctx, &out_mem_9954, "out_mem_9954") != 0) + return 1; + + cleanup: + { } + return err; +} +struct futhark_i8_2d { /* a device memory block together with its two dimensions */ + struct memblock_device mem; + int64_t shape[2]; +} ; +struct futhark_i8_2d *futhark_new_i8_2d(struct futhark_context *ctx, const + int8_t *data, int64_t dim0, + int64_t dim1) +{ /* Allocates a dim0*dim1-byte device array and fills it from the host buffer data with a blocking write. Returns the new array, or NULL when the host or the device allocation fails. NOTE(review): OPENCL_SUCCEED_OR_RETURN is defined outside this chunk - confirm whether its return path also has to release ctx->lock. */ + struct futhark_i8_2d *bad = NULL; + struct futhark_i8_2d *arr = + (struct futhark_i8_2d *) malloc(sizeof(struct futhark_i8_2d)); + + if (arr == NULL) + return bad; + lock_lock(&ctx->lock); + arr->mem.references = NULL; + if (memblock_alloc_device(ctx, &arr->mem, (size_t) (dim0 * dim1) * 1, + "arr->mem")) + { lock_unlock(&ctx->lock); free(arr); return bad; } /* do not leave with the lock held or leak the handle, which the caller never sees */ + arr->shape[0] = dim0; + arr->shape[1] = dim1; + if ((size_t) (dim0 * dim1) * 1 > 0) + OPENCL_SUCCEED_OR_RETURN(clEnqueueWriteBuffer(ctx->opencl.queue, + arr->mem.mem, CL_TRUE, 0, + (size_t) (dim0 * dim1) * + 1, data + 0, 0, NULL, + ctx->profiling_paused || + !ctx->profiling ?
NULL : opencl_get_event(&ctx->opencl, + &ctx->copy_dev_to_host_runs, + &ctx->copy_dev_to_host_total_runtime))); + lock_unlock(&ctx->lock); + return arr; +} +struct futhark_i8_2d *futhark_new_raw_i8_2d(struct futhark_context *ctx, const + cl_mem data, int offset, + int64_t dim0, int64_t dim1) +{ /* Like futhark_new_i8_2d, but the contents are dim0*dim1 bytes copied from the device buffer data starting at byte offset. Returns the new array, or NULL when the host or the device allocation fails. */ + struct futhark_i8_2d *bad = NULL; + struct futhark_i8_2d *arr = + (struct futhark_i8_2d *) malloc(sizeof(struct futhark_i8_2d)); + + if (arr == NULL) + return bad; + lock_lock(&ctx->lock); + arr->mem.references = NULL; + if (memblock_alloc_device(ctx, &arr->mem, (size_t) (dim0 * dim1) * 1, + "arr->mem")) + { lock_unlock(&ctx->lock); free(arr); return bad; } /* same cleanup as in futhark_new_i8_2d */ + arr->shape[0] = dim0; + arr->shape[1] = dim1; + if ((size_t) (dim0 * dim1) * 1 > 0) { + OPENCL_SUCCEED_OR_RETURN(clEnqueueCopyBuffer(ctx->opencl.queue, data, + arr->mem.mem, offset, 0, + (size_t) (dim0 * dim1) * 1, + 0, NULL, + ctx->profiling_paused || + !ctx->profiling ? NULL : opencl_get_event(&ctx->opencl, + &ctx->copy_dev_to_dev_runs, + &ctx->copy_dev_to_dev_total_runtime))); + if (ctx->debugging) + OPENCL_SUCCEED_FATAL(clFinish(ctx->opencl.queue)); + } + lock_unlock(&ctx->lock); + return arr; +} +int futhark_free_i8_2d(struct futhark_context *ctx, struct futhark_i8_2d *arr) +{ /* Drops the array's device block and frees the handle. Returns 0 on success, or 1 when the unref fails; in that case arr is not freed. */ + lock_lock(&ctx->lock); + if (memblock_unref_device(ctx, &arr->mem, "arr->mem") != 0) + { lock_unlock(&ctx->lock); return 1; } /* release the lock on the failure path as well */ + lock_unlock(&ctx->lock); + free(arr); + return 0; +} +int futhark_values_i8_2d(struct futhark_context *ctx, struct futhark_i8_2d *arr, + int8_t *data) +{ /* Reads the whole array into the host buffer data. The read is non-blocking and followed by futhark_context_sync when ctx->failure_is_an_option is set, blocking otherwise. Returns 0 on success, 1 when that sync fails. */ + lock_lock(&ctx->lock); + if ((size_t) (arr->shape[0] * arr->shape[1]) * 1 > 0) { + OPENCL_SUCCEED_OR_RETURN(clEnqueueReadBuffer(ctx->opencl.queue, + arr->mem.mem, + ctx->failure_is_an_option ? CL_FALSE : CL_TRUE, + 0, + (size_t) (arr->shape[0] * + arr->shape[1]) * + 1, data + 0, 0, NULL, + ctx->profiling_paused || + !ctx->profiling ?
NULL : opencl_get_event(&ctx->opencl, + &ctx->copy_host_to_dev_runs, + &ctx->copy_host_to_dev_total_runtime))); + if (ctx->failure_is_an_option && futhark_context_sync(ctx) != 0) + { lock_unlock(&ctx->lock); return 1; } /* release the lock on the failure path as well */ + } + lock_unlock(&ctx->lock); + return 0; +} +cl_mem futhark_values_raw_i8_2d(struct futhark_context *ctx, + struct futhark_i8_2d *arr) +{ /* Returns the underlying cl_mem handle; nothing is copied. */ + (void) ctx; + return arr->mem.mem; +} +const int64_t *futhark_shape_i8_2d(struct futhark_context *ctx, + struct futhark_i8_2d *arr) +{ /* Returns a pointer to the two dimensions stored in arr. */ + (void) ctx; + return arr->shape; +} +int futhark_entry_get_envelope(struct futhark_context *ctx, + struct futhark_i8_2d **out0, const + struct futhark_i8_2d *in0) +{ /* Entry point get_envelope: in0 must be square (n by n). On success stores a newly heap-allocated 4-by-n array in *out0 and returns 0; on failure returns non-zero and does not write *out0. */ + struct memblock_device chunk_board_mem_9941; + + chunk_board_mem_9941.references = NULL; + + int64_t n_9485; + struct memblock_device out_mem_9954; + + out_mem_9954.references = NULL; + + int ret = 0; + + lock_lock(&ctx->lock); + chunk_board_mem_9941 = in0->mem; + n_9485 = in0->shape[0]; + n_9485 = in0->shape[1]; + if (!(n_9485 == in0->shape[0] && n_9485 == in0->shape[1])) { + ret = 1; + if (!ctx->error) + ctx->error = + msgprintf("Error: entry point arguments have invalid sizes.\n"); + } else { + ret = futrts_get_envelope(ctx, &out_mem_9954, chunk_board_mem_9941, + n_9485); + if (ret == 0) { + *out0 = + (struct futhark_i8_2d *) malloc(sizeof(struct futhark_i8_2d)); + assert(*out0 != NULL); /* the allocation is kept outside assert so it still happens when NDEBUG is defined */ + (*out0)->mem = out_mem_9954; + (*out0)->shape[0] = 4; + (*out0)->shape[1] = n_9485; + } + } + lock_unlock(&ctx->lock); + return ret; +} +int futhark_entry_next_chunk_board(struct futhark_context *ctx, + struct futhark_i8_2d **out0, const + struct futhark_i8_2d *in0, const + struct futhark_i8_2d *in1) +{ /* Entry point next_chunk_board: in0 must be n by n and in1 must be 4 by m. On success stores a newly heap-allocated n-by-n array in *out0 and returns 0; on failure returns non-zero and does not write *out0. */ + struct memblock_device chunk_board_mem_9941; + + chunk_board_mem_9941.references = NULL; + + struct memblock_device envelope_board_mem_9942; + + envelope_board_mem_9942.references = NULL; + + int64_t n_9500; + int64_t m_9501; + struct memblock_device out_mem_9954; + + out_mem_9954.references = NULL; + + int ret = 0; + + lock_lock(&ctx->lock); +
chunk_board_mem_9941 = in0->mem; + n_9500 = in0->shape[0]; + n_9500 = in0->shape[1]; + envelope_board_mem_9942 = in1->mem; + m_9501 = in1->shape[1]; + if (!((n_9500 == in0->shape[0] && n_9500 == in0->shape[1]) && (4 == + in1->shape[0] && + m_9501 == + in1->shape[1]))) { + ret = 1; + if (!ctx->error) + ctx->error = + msgprintf("Error: entry point arguments have invalid sizes.\n"); + } else { + ret = futrts_next_chunk_board(ctx, &out_mem_9954, chunk_board_mem_9941, + envelope_board_mem_9942, n_9500, m_9501); + if (ret == 0) { + *out0 = + (struct futhark_i8_2d *) malloc(sizeof(struct futhark_i8_2d)); + assert(*out0 != NULL); /* the allocation is kept outside assert so it still happens when NDEBUG is defined */ + (*out0)->mem = out_mem_9954; + (*out0)->shape[0] = n_9500; + (*out0)->shape[1] = n_9500; + } + } + lock_unlock(&ctx->lock); + return ret; +} diff --git a/futmpi/gol.fut b/futmpi/gol.fut new file mode 100644 index 0000000000000000000000000000000000000000..25eae32c3d2eff33780351abcfe311ac6e6b6200 --- /dev/null +++ b/futmpi/gol.fut @@ -0,0 +1,48 @@ +let count_neighbours [n] (board: [n][n]i8) : [n][n]i8 = -- for every cell, the sum of its eight neighbours; rotate makes the board edges wrap around + let north = rotate (-1) board + let south = rotate 1 board + let east = map(rotate 1) board + let west = map(rotate (-1)) board + + let north_east = map(rotate 1) north + let north_west = map(rotate (-1)) north + let south_east = map(rotate 1) south + let south_west = map(rotate (-1)) south + + in map3 (\(nwr,nr,ner) (wr, br, er) (swr, sr, ser) -> + map3 (\(nw,n,ne) (w, _, e) (sw, s, se) -> nw + n + ne + w + e + sw + s + se) + (zip3 nwr nr ner) (zip3 wr br er) (zip3 swr sr ser)) + (zip3 north_west north north_east) (zip3 west board east) (zip3 south_west south south_east) + +let augment_board [n][m] (chunk_board :[n][n]i8) (envelope_board: [4][m]i8): [m][m]i8 = -- m-by-m board whose outer ring comes from envelope_board (rows 0 north, 1 east, 2 south, 3 west) and whose interior is chunk_board shifted by one + tabulate_2d (m) (m) (\i j -> + -- North + if (i == 0) then envelope_board[0,j] + -- East + else if (j == m-1) then envelope_board[1,i] + -- South + else if (i == m-1) then envelope_board[2,j] + -- West + else if (j == 0) then envelope_board[3,i] + else chunk_board[i-1,j-1]) + + +entry
next_chunk_board [n][m] (chunk_board :[n][n]i8) (envelope_board: [4][m]i8) :[n][n]i8 = -- one Game of Life step: a cell is 1 next if it is 1 with 2 or 3 live neighbours, or 0 with exactly 3; the envelope ring is sliced off again at the end + let augmented_board = augment_board chunk_board envelope_board + let neighbours = count_neighbours augmented_board + let next_board = map2 (\augmented_board_r neighbours_r -> + map2(\cell nb_alive_cells -> + if (cell == 1 && (nb_alive_cells == 2 || nb_alive_cells == 3)) || (cell == 0 && nb_alive_cells == 3) + then 1 + else 0) + augmented_board_r neighbours_r) + augmented_board neighbours + in next_board[1:n+1, 1:n+1] :> [n][n]i8 + +entry get_envelope [n] (chunk_board: [n][n]i8): [4][n]i8 = -- the four border lines of the chunk, in the order [north, east, south, west]; east and west are columns taken via transpose + let north = chunk_board[0] + let south = chunk_board[n-1] + let tr_chunk_board = transpose chunk_board + let east = tr_chunk_board[n-1] + let west = tr_chunk_board[0] + in [north, east, south, west] diff --git a/futmpi/gol.h b/futmpi/gol.h new file mode 100644 index 0000000000000000000000000000000000000000..4d300c14e4b8472bf7f626c29741a99a92542ed0 --- /dev/null +++ b/futmpi/gol.h @@ -0,0 +1,122 @@ +#pragma once + +// Headers + +#include <stdint.h> +#include <stddef.h> +#include <stdbool.h> +#include <stdio.h> +#include <float.h> +#define CL_TARGET_OPENCL_VERSION 120 +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#ifdef __APPLE__ +#define CL_SILENCE_DEPRECATION +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// Initialisation + +struct futhark_context_config ; +struct futhark_context_config *futhark_context_config_new(void); +void futhark_context_config_free(struct futhark_context_config *cfg); +void futhark_context_config_add_build_option(struct futhark_context_config *cfg, + const char *opt); +void futhark_context_config_set_debugging(struct futhark_context_config *cfg, + int flag); +void futhark_context_config_set_profiling(struct futhark_context_config *cfg, + int flag); +void futhark_context_config_set_logging(struct futhark_context_config *cfg, + int flag); +void futhark_context_config_set_device(struct futhark_context_config *cfg,
const + char *s); +void futhark_context_config_set_platform(struct futhark_context_config *cfg, + const char *s); +void +futhark_context_config_select_device_interactively(struct futhark_context_config *cfg); +void futhark_context_config_list_devices(struct futhark_context_config *cfg); +void futhark_context_config_dump_program_to(struct futhark_context_config *cfg, + const char *path); +void +futhark_context_config_load_program_from(struct futhark_context_config *cfg, + const char *path); +void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg, + const char *path); +void futhark_context_config_load_binary_from(struct futhark_context_config *cfg, + const char *path); +void +futhark_context_config_set_default_group_size(struct futhark_context_config *cfg, + int size); +void +futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg, + int num); +void +futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, + int num); +void +futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, + int num); +void +futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, + int num); +int futhark_context_config_set_size(struct futhark_context_config *cfg, const + char *size_name, size_t size_value); +struct futhark_context ; +struct futhark_context *futhark_context_new(struct futhark_context_config *cfg); +struct futhark_context +*futhark_context_new_with_command_queue(struct futhark_context_config *cfg, + cl_command_queue queue); +void futhark_context_free(struct futhark_context *ctx); +cl_command_queue futhark_context_get_command_queue(struct futhark_context *ctx); +int futhark_get_num_sizes(void); +const char *futhark_get_size_name(int); +const char *futhark_get_size_class(int); + +// Arrays + +struct futhark_i8_2d ; +struct futhark_i8_2d *futhark_new_i8_2d(struct futhark_context *ctx, const + int8_t *data, int64_t dim0, + int64_t dim1); +struct futhark_i8_2d 
*futhark_new_raw_i8_2d(struct futhark_context *ctx, const + cl_mem data, int offset, + int64_t dim0, int64_t dim1); +int futhark_free_i8_2d(struct futhark_context *ctx, struct futhark_i8_2d *arr); +int futhark_values_i8_2d(struct futhark_context *ctx, struct futhark_i8_2d *arr, + int8_t *data); +cl_mem futhark_values_raw_i8_2d(struct futhark_context *ctx, + struct futhark_i8_2d *arr); +const int64_t *futhark_shape_i8_2d(struct futhark_context *ctx, + struct futhark_i8_2d *arr); + +// Opaque values + + +// Entry points + +int futhark_entry_get_envelope(struct futhark_context *ctx, + struct futhark_i8_2d **out0, const + struct futhark_i8_2d *in0); /* in0 is n by n; *out0 receives a new 4-by-n array; returns 0 on success */ +int futhark_entry_next_chunk_board(struct futhark_context *ctx, + struct futhark_i8_2d **out0, const + struct futhark_i8_2d *in0, const + struct futhark_i8_2d *in1); /* in0 is n by n, in1 is 4 by m; *out0 receives a new n-by-n array; returns 0 on success */ + +// Miscellaneous + +int futhark_context_sync(struct futhark_context *ctx); +char *futhark_context_report(struct futhark_context *ctx); +char *futhark_context_get_error(struct futhark_context *ctx); +void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f); +void futhark_context_pause_profiling(struct futhark_context *ctx); +void futhark_context_unpause_profiling(struct futhark_context *ctx); +int futhark_context_clear_caches(struct futhark_context *ctx); +#define FUTHARK_BACKEND_opencl +#ifdef __cplusplus +} +#endif diff --git a/futmpi/main.c b/futmpi/main.c new file mode 100644 index 0000000000000000000000000000000000000000..ab2413acc9b2a7600b88ea85d6324bb1d803ab18 --- /dev/null +++ b/futmpi/main.c @@ -0,0 +1,346 @@ +#include <stdio.h> +#include <stdint.h> +#include <mpi.h> +#include <math.h> +#include <stdbool.h> +#include <unistd.h> +#include "gol.h" +#include "gfx.h" + +#define BOARD_N 800 + +#define INDEX_2D_TO_1D(y, x, nb_columns) ((y) * (nb_columns) + (x)) /* row-major offset of cell (y, x) in a grid with nb_columns columns; every argument is parenthesized so expressions can be passed safely */ + +#define NORTH_INDEX 0 +#define EAST_INDEX 1 +#define SOUTH_INDEX 2 +#define WEST_INDEX 3 + +#define NORTH_ROW_TAG 0 +#define EAST_COLUMN_TAG 1 +#define SOUTH_ROW_TAG 2 +#define
WEST_COLUMN_TAG 3 + +#define NORTH_EAST_CELL_TAG 4 +#define SOUTH_EAST_CELL_TAG 5 +#define SOUTH_WEST_CELL_TAG 6 +#define NORTH_WEST_CELL_TAG 7 + +#define CHUNK_BOARD_TAG 8 + +//void printChunkBoard(int8_t *chunkBoard, int n1, int n2) { +// for (int i = 0; i < n1; ++i) { +// for (int j = 0; j < n2; ++j) { +// printf("%d ", chunkBoard[INDEX_2D_TO_1D(i, j, n2)]); +// } +// printf("\n"); +// } +//} + +int createGridCommunicators(MPI_Comm *cartComm, MPI_Comm *rowComm, MPI_Comm *colComm, int nProc) { /* Builds a gridN x gridN periodic Cartesian communicator plus its row and column sub-communicators; returns gridN = (int) sqrt(nProc). */ + int gridN = (int) sqrt(nProc); + int dimensions[2] = {gridN, gridN}; + int periods[2] = {true, true}; // Periodic in both dimensions: the process grid wraps around + + MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 0, cartComm); /* reorder = 0 keeps Cartesian ranks equal to MPI_COMM_WORLD ranks, which the diagonal exchanges and the board gathering rely on */ + + /* Create row communicator */ + int remainDims[2] = {false, true}; + MPI_Cart_sub(*cartComm, remainDims, rowComm); + + /* Create column communicator */ + remainDims[0] = true; // rows + remainDims[1] = false; // columns + MPI_Cart_sub(*cartComm, remainDims, colComm); + return gridN; +} + +int *divideBoard(int n, int chunkN, int nProc) { /* Returns a heap-allocated (calloc) array of nProc (y, x) pairs: the top-left board coordinate of each rank's chunk, assigned left to right and then top to bottom. The caller owns the array; the allocation result is not checked here. */ + int *indexes = calloc((size_t) nProc * 2, sizeof(int)); + for (int i = 0, y = 0, x = 0; i < nProc; ++i) { + indexes[i * 2] = y; + indexes[i * 2 + 1] = x; + + x += (int) chunkN; + if (x >= (int) n) { + x = 0; + y += (int) chunkN; + } + } + return indexes; +} + +void initChunkBoard(int8_t *chunkBoard, int chunkN) { /* Fills the chunkN x chunkN chunk with rand() % 2 values. */ + for (int i = 0; i < chunkN; ++i) { + for (int j = 0; j < chunkN; ++j) { + chunkBoard[INDEX_2D_TO_1D(i, j, chunkN)] = rand() % 2; + } + } +} + +void shareAndBuildEnvelope(int8_t *chunkBoardMyEnvelope, int8_t *chunkBoardEnvelope, MPI_Comm rowComm, + MPI_Comm colComm, int gridN, int coordinates[2], int chunkN, int chunkM) { /* Exchanges border data with the eight neighbouring processes and assembles chunkBoardEnvelope (4 rows of chunkM cells: north, east, south, west). chunkBoardMyEnvelope holds this chunk's own 4 x chunkN borders. Edge rows and columns travel over colComm and rowComm and land in columns 1..chunkN; the four corner cells travel over MPI_COMM_WORLD and land in columns 0 and chunkM - 1. Blocks until all 16 requests complete. Assumes chunkM is chunkN plus 2. */ + int coordinateY = coordinates[0]; + int coordinateX = coordinates[1]; + MPI_Request requests[16] = {0}; + int iRequest = 0; + + // North + { + int8_t *chunkBoardMyEnvelopeNorth = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(NORTH_INDEX, 0, chunkN)]; + int8_t *chunkBoardEnvelopeNorth =
&chunkBoardEnvelope[INDEX_2D_TO_1D(NORTH_INDEX, 1, chunkM)]; + int destSource = (coordinateY - 1) < 0 ? (gridN - 1) : (coordinateY - 1); + + MPI_Isend(chunkBoardMyEnvelopeNorth, chunkN, MPI_INT8_T, destSource, NORTH_ROW_TAG, colComm, + &requests[iRequest++]); + /* Neighbour send south row, which correspond to north envelope */ + MPI_Irecv(chunkBoardEnvelopeNorth, chunkN, MPI_INT8_T, destSource, SOUTH_ROW_TAG, colComm, + &requests[iRequest++]); + } + + // East + { + int8_t *chunkBoardMyEnvelopeEast = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(EAST_INDEX, 0, chunkN)]; + int8_t *chunkBoardEnvelopeEast = &chunkBoardEnvelope[INDEX_2D_TO_1D(EAST_INDEX, 1, chunkM)]; + int destSource = (coordinateX + 1) % gridN; + + MPI_Isend(chunkBoardMyEnvelopeEast, chunkN, MPI_INT8_T, destSource, EAST_COLUMN_TAG, rowComm, + &requests[iRequest++]); + /* Neighbour send west column, which correspond to east envelope */ + MPI_Irecv(chunkBoardEnvelopeEast, chunkN, MPI_INT8_T, destSource, WEST_COLUMN_TAG, rowComm, + &requests[iRequest++]); + } + + // South + { + int8_t *chunkBoardMyEnvelopeSouth = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(SOUTH_INDEX, 0, chunkN)]; + int8_t *chunkBoardEnvelopeSouth = &chunkBoardEnvelope[INDEX_2D_TO_1D(SOUTH_INDEX, 1, chunkM)]; + int destSource = (coordinateY + 1) % gridN; + + MPI_Isend(chunkBoardMyEnvelopeSouth, chunkN, MPI_INT8_T, destSource, SOUTH_ROW_TAG, colComm, + &requests[iRequest++]); + /* Neighbour send north row, which correspond to south envelope */ + MPI_Irecv(chunkBoardEnvelopeSouth, chunkN, MPI_INT8_T, destSource, NORTH_ROW_TAG, colComm, + &requests[iRequest++]); + } + + // West + { + int8_t *chunkBoardMyEnvelopeWest = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(WEST_INDEX, 0, chunkN)]; + int8_t *chunkBoardEnvelopeWest = &chunkBoardEnvelope[INDEX_2D_TO_1D(WEST_INDEX, 1, chunkM)]; + int destSource = (coordinateX - 1) < 0 ?
(gridN - 1) : (coordinateX - 1); + + MPI_Isend(chunkBoardMyEnvelopeWest, chunkN, MPI_INT8_T, destSource, WEST_COLUMN_TAG, rowComm, + &requests[iRequest++]); + /* Neighbour send east column, which correspond to west envelope */ + MPI_Irecv(chunkBoardEnvelopeWest, chunkN, MPI_INT8_T, destSource, EAST_COLUMN_TAG, rowComm, + &requests[iRequest++]); + } + + int8_t missingCells[4] = {0}; + + // North-East + { + int8_t *chunkBoardMyEnvelopeNorthEast = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(NORTH_INDEX, chunkN - 1, chunkN)]; + int destSrcY = (coordinateY - 1) < 0 ? gridN - 1 : coordinateY - 1; + int destSrcX = (coordinateX + 1) % gridN; + int destSource = INDEX_2D_TO_1D(destSrcY, destSrcX, gridN); + + MPI_Isend(chunkBoardMyEnvelopeNorthEast, 1, MPI_INT8_T, destSource, NORTH_EAST_CELL_TAG, MPI_COMM_WORLD, + &requests[iRequest++]); + MPI_Irecv(&missingCells[1], 1, MPI_INT8_T, destSource, SOUTH_WEST_CELL_TAG, MPI_COMM_WORLD, + &requests[iRequest++]); + } + + // South-East + { + int8_t *chunkBoardMyEnvelopeSouthEast = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(SOUTH_INDEX, chunkN - 1, chunkN)]; + int destSrcY = (coordinateY + 1) % gridN; + int destSrcX = (coordinateX + 1) % gridN; + int destSource = INDEX_2D_TO_1D(destSrcY, destSrcX, gridN); + + MPI_Isend(chunkBoardMyEnvelopeSouthEast, 1, MPI_INT8_T, destSource, SOUTH_EAST_CELL_TAG, MPI_COMM_WORLD, + &requests[iRequest++]); + MPI_Irecv(&missingCells[2], 1, MPI_INT8_T, destSource, NORTH_WEST_CELL_TAG, MPI_COMM_WORLD, + &requests[iRequest++]); + } + + // South-West + { + int8_t *chunkBoardMyEnvelopeSouthWest = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(SOUTH_INDEX, 0, chunkN)]; + int destSrcY = (coordinateY + 1) % gridN; + int destSrcX = (coordinateX - 1) < 0 ?
gridN - 1 : coordinateX - 1; + int destSource = INDEX_2D_TO_1D(destSrcY, destSrcX, gridN); + + MPI_Isend(chunkBoardMyEnvelopeSouthWest, 1, MPI_INT8_T, destSource, SOUTH_WEST_CELL_TAG, MPI_COMM_WORLD, + &requests[iRequest++]); + MPI_Irecv(&missingCells[3], 1, MPI_INT8_T, destSource, NORTH_EAST_CELL_TAG, MPI_COMM_WORLD, + &requests[iRequest++]); + } + + // North-West + { + int8_t *chunkBoardMyEnvelopeNorthWest = &chunkBoardMyEnvelope[INDEX_2D_TO_1D(NORTH_INDEX, 0, chunkN)]; + int destSrcY = (coordinateY - 1) < 0 ? gridN - 1 : coordinateY - 1; + int destSrcX = (coordinateX - 1) < 0 ? gridN - 1 : coordinateX - 1; + int destSource = INDEX_2D_TO_1D(destSrcY, destSrcX, gridN); + + MPI_Isend(chunkBoardMyEnvelopeNorthWest, 1, MPI_INT8_T, destSource, NORTH_WEST_CELL_TAG, MPI_COMM_WORLD, + &requests[iRequest++]); + MPI_Irecv(&missingCells[0], 1, MPI_INT8_T, destSource, SOUTH_EAST_CELL_TAG, MPI_COMM_WORLD, + &requests[iRequest]); + } + + MPI_Waitall(16, requests, MPI_STATUSES_IGNORE); /* corners go in columns 0 and chunkM - 1; column chunkN still belongs to the received edge and must not be overwritten */ + + chunkBoardEnvelope[INDEX_2D_TO_1D(NORTH_INDEX, chunkM - 1, chunkM)] = chunkBoardEnvelope[INDEX_2D_TO_1D( + EAST_INDEX, 0, chunkM)] = missingCells[1]; + chunkBoardEnvelope[INDEX_2D_TO_1D(SOUTH_INDEX, chunkM - 1, chunkM)] = chunkBoardEnvelope[INDEX_2D_TO_1D(EAST_INDEX, + chunkM - 1, + chunkM)] = missingCells[2]; + chunkBoardEnvelope[INDEX_2D_TO_1D(SOUTH_INDEX, 0, chunkM)] = chunkBoardEnvelope[INDEX_2D_TO_1D(WEST_INDEX, + chunkM - 1, + chunkM)] = missingCells[3]; + chunkBoardEnvelope[INDEX_2D_TO_1D(NORTH_INDEX, 0, chunkM)] = chunkBoardEnvelope[INDEX_2D_TO_1D(WEST_INDEX, + 0, + chunkM)] = missingCells[0]; +} + +void chunkBoardToBoard(int8_t *board, int n, const int8_t *chunkBoard, int chunkN, const int *indexes, int rank) { /* Copies a chunkN x chunkN chunk into the n-column board at the (y, x) origin stored for rank in indexes. */ + int y = indexes[rank * 2]; + int x = indexes[rank * 2 + 1]; + + for (int i = 0; i < chunkN; ++i) { + for (int j = 0; j < chunkN; ++j) { + board[INDEX_2D_TO_1D(y + i, x + j, n)] = chunkBoard[INDEX_2D_TO_1D(i, j, chunkN)]; + } + } +} + +int main(int argc, char *argv[]) { + int myRank; + int
nProc; + + /* MPI Initialization */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myRank); + MPI_Comm_size(MPI_COMM_WORLD, &nProc); + srand((unsigned int) myRank); + + MPI_Comm cartComm, rowComm, colComm; + int gridN = createGridCommunicators(&cartComm, &rowComm, &colComm, nProc); + + int myCartRank; + MPI_Comm_rank(cartComm, &myCartRank); + + int coordinates[2] = {0}; + MPI_Cart_coords(cartComm, myCartRank, 2, coordinates); + + /* Futhark Initialization */ + struct futhark_context_config *contextConfig = futhark_context_config_new(); + futhark_context_config_set_device(contextConfig, "AMD"); + struct futhark_context *futharkContext = futhark_context_new(contextConfig); + + /* GFX Initialization */ + struct gfx_context_t *gfxContext = myRank == 0 ? gfx_create("Game of Life", BOARD_N, BOARD_N) : NULL; + if (myRank == 0 && !gfxContext) { + fprintf(stderr, "Graphic mode initialization failed!\n"); + return EXIT_FAILURE; + } + if (myRank == 0) { + SDL_ShowCursor(SDL_ENABLE); + } + + /* GoL Initialization */ + int chunkN = (int) (BOARD_N / sqrt(nProc)); + int chunkNN = chunkN * chunkN; + int chunkM = chunkN + 2; + int *indexes = divideBoard(BOARD_N, chunkN, nProc); + + int8_t *board = myRank == 0 ? 
calloc(BOARD_N * BOARD_N, sizeof(int8_t)) : NULL; + int8_t *chunkBoard = calloc((size_t) chunkNN, sizeof(int8_t)); + int8_t *chunkBoardMyEnvelope = calloc(((size_t) (4 * chunkN)), sizeof(int8_t)); + int8_t *chunkBoardEnvelope = calloc(((size_t) (4 * chunkM)), sizeof(int8_t)); + + initChunkBoard(chunkBoard, chunkN); + + bool exit = false; + while (!exit) { + struct futhark_i8_2d *futChunkBoard = futhark_new_i8_2d(futharkContext, chunkBoard, chunkN, chunkN); + futhark_context_sync(futharkContext); + struct futhark_i8_2d *futChunkBoardMyEnvelope; + futhark_entry_get_envelope(futharkContext, &futChunkBoardMyEnvelope, futChunkBoard); + futhark_context_sync(futharkContext); + futhark_values_i8_2d(futharkContext, futChunkBoardMyEnvelope, chunkBoardMyEnvelope); + futhark_context_sync(futharkContext); + + shareAndBuildEnvelope(chunkBoardMyEnvelope, chunkBoardEnvelope, rowComm, colComm, gridN, coordinates, chunkN, + chunkM); + + struct futhark_i8_2d *futChunkBoardEnvelope = futhark_new_i8_2d(futharkContext, chunkBoardEnvelope, 4, chunkM); + futhark_context_sync(futharkContext); + struct futhark_i8_2d *futNextChunkBoard; + futhark_entry_next_chunk_board(futharkContext, &futNextChunkBoard, futChunkBoard, futChunkBoardEnvelope); + futhark_context_sync(futharkContext); + futhark_values_i8_2d(futharkContext, futNextChunkBoard, chunkBoard); + futhark_context_sync(futharkContext); + + if (myRank == 0) { + chunkBoardToBoard(board, BOARD_N, chunkBoard, chunkN, indexes, myRank); + int8_t *tmpChunkBoard = calloc((size_t) chunkNN, sizeof(int8_t)); + MPI_Status status = {0}; + for (int i = 0; i < nProc - 1; ++i) { + MPI_Recv(tmpChunkBoard, chunkNN, MPI_INT8_T, MPI_ANY_SOURCE, CHUNK_BOARD_TAG, MPI_COMM_WORLD, &status); + chunkBoardToBoard(board, BOARD_N, tmpChunkBoard, chunkN, indexes, status.MPI_SOURCE); + } + free(tmpChunkBoard); + } else { + MPI_Send(chunkBoard, chunkNN, MPI_INT8_T, 0, CHUNK_BOARD_TAG, MPI_COMM_WORLD); + } + + if (myRank == 0) { + SDL_PumpEvents(); + SDL_Event event; + 
SDL_PollEvent(&event); + + exit = gfx_keypressed() == SDLK_ESCAPE || + (event.type == SDL_WINDOWEVENT && event.window.event == SDL_WINDOWEVENT_CLOSE); + + gfx_clear(gfxContext, COLOR_BLACK); + for (int y = 0; y < BOARD_N; ++y) { + for (int x = 0; x < BOARD_N; ++x) { + int cell = (int) board[INDEX_2D_TO_1D(y, x, BOARD_N)]; + gfx_putpixel(gfxContext, x, y, MAKE_COLOR(cell * 255, cell * 255, cell * 255)); + } + } + gfx_present(gfxContext); + } + + futhark_context_sync(futharkContext); + futhark_free_i8_2d(futharkContext, futChunkBoard); + futhark_free_i8_2d(futharkContext, futChunkBoardMyEnvelope); + futhark_free_i8_2d(futharkContext, futChunkBoardEnvelope); + futhark_free_i8_2d(futharkContext, futNextChunkBoard); + + MPI_Bcast(&exit, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD); + usleep(16666); + } + + free(chunkBoard); + free(chunkBoardEnvelope); + free(chunkBoardMyEnvelope); + + if (myRank == 0) { + free(board); + gfx_destroy(gfxContext); + } + + futhark_context_free(futharkContext); + futhark_context_config_free(contextConfig); + + MPI_Comm_free(&cartComm); + MPI_Comm_free(&rowComm); + MPI_Comm_free(&colComm); + MPI_Finalize(); + return 0; +} diff --git a/game_of_life/CMakeLists.txt b/game_of_life/CMakeLists.txt deleted file mode 100644 index 1722508afa1f0bd1a9b108727b9b01cb6f9bfb21..0000000000000000000000000000000000000000 --- a/game_of_life/CMakeLists.txt +++ /dev/null @@ -1,55 +0,0 @@ -cmake_minimum_required(VERSION 3.17) -project(game_of_life C) - -set(CMAKE_C_STANDARD 11) - -include_directories(".") - -if (CMAKE_BUILD_TYPE MATCHES Debug) - set(GCC_COMPILE_FLAGS "-Wall -Wextra -pedantic -fsanitize=address -fsanitize=null") - if (CMAKE_SYSTEM_NAME MATCHES "Linux") - set(GCC_COMPILE_FLAGS "${GCC_COMPILE_FLAGS} -fsanitize=leak") - endif () -elseif (CMAKE_BUILD_TYPE MATCHES Release) - set(GCC_COMPILE_FLAGS "-g") -endif () - -if (CMAKE_SYSTEM_NAME MATCHES "Linux") - execute_process(COMMAND sdl2-config --cflags OUTPUT_VARIABLE SDL2_C_FLAGS) - 
include_directories(${SDL2_C_FLAGS}) -endif () - -if (CMAKE_SYSTEM_NAME MATCHES "Darwin") - include_directories(/usr/local/include) -endif () - -find_package(MPI REQUIRED) -include_directories(${MPI_C_INCLUDE_PATH}) - -set(CMAKE_MACRO_FLAGS -DPROGHEADER='\"${CMAKE_CURRENT_SOURCE_DIR}/gol.h\"') - -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GCC_COMPILE_FLAGS} ${CMAKE_MACRO_FLAGS}") - -add_custom_target( - futhark_opencl - COMMAND futhark opencl ${CMAKE_CURRENT_SOURCE_DIR}/gol.fut --library -) -add_executable(game_of_life_opencl gol.c gol.h main.c lib/github.com/diku-dk/lys/liblys.c lib/github.com/diku-dk/lys/liblys.h lib/github.com/diku-dk/lys/context_setup.c lib/github.com/diku-dk/lys/context_setup.h ../lib/fpmpi.c ../lib/fpmpi.h ../lib/fp.h ../lib/fp.c ../lib/dispatch.c ../lib/dispatch.h) - -if (CMAKE_SYSTEM_NAME MATCHES "Darwin") - target_link_libraries(game_of_life_opencl "-framework OpenCL" m SDL2 ${MPI_C_LIBRARIES}) -endif () - -if (CMAKE_SYSTEM_NAME MATCHES "Linux") - target_link_libraries(game_of_life_opencl OpenCL m SDL2 ${MPI_C_LIBRARIES}) -endif () - -add_dependencies(game_of_life_opencl futhark_opencl) - -add_custom_target( - futhark_multicore - COMMAND futhark multicore ${CMAKE_CURRENT_SOURCE_DIR}/gol.fut --library -) -add_executable(game_of_life_multicore gol.c gol.h main.c lib/github.com/diku-dk/lys/liblys.c lib/github.com/diku-dk/lys/liblys.h lib/github.com/diku-dk/lys/context_setup.c lib/github.com/diku-dk/lys/context_setup.h ../lib/fpmpi.c ../lib/fpmpi.h ../lib/fp.h ../lib/fp.c ../lib/dispatch.c ../lib/dispatch.h) -add_dependencies(game_of_life_multicore futhark_multicore) -target_link_libraries(game_of_life_multicore m pthread SDL2 ${MPI_C_LIBRARIES}) diff --git a/game_of_life/Makefile b/game_of_life/Makefile deleted file mode 100644 index 7cc239aa59d9de044fe41a2e9dd8997551f4e5c0..0000000000000000000000000000000000000000 --- a/game_of_life/Makefile +++ /dev/null @@ -1,34 +0,0 @@ -all: release debug - -release: - mkdir -p "cmake-build-release" - cmake 
-DCMAKE_BUILD_TYPE=Release -Bcmake-build-release - $(MAKE) -C cmake-build-release all - -release/multicore: - mkdir -p "cmake-build-release" - cmake -DCMAKE_BUILD_TYPE=Release -Bcmake-build-release - $(MAKE) -C cmake-build-release game_of_life_multicore - -release/opencl: - mkdir -p "cmake-build-release" - cmake -DCMAKE_BUILD_TYPE=Release -Bcmake-build-release - $(MAKE) -C cmake-build-release game_of_life_opencl - -debug: - mkdir -p "cmake-build-debug" - cmake -DCMAKE_BUILD_TYPE=Debug -Bcmake-build-debug - $(MAKE) -C cmake-build-release all - - -debug/multicore: - mkdir -p "cmake-build-debug" - cmake -DCMAKE_BUILD_TYPE=Debug -Bcmake-build-debug - $(MAKE) -C cmake-build-debug game_of_life_multicore - -debug/opencl: - mkdir -p "cmake-build-debug" - cmake -DCMAKE_BUILD_TYPE=Debug -Bcmake-build-debug - $(MAKE) -C cmake-build-debug game_of_life_opencl - -.PHONY: release release/multicore release/opencl debug debug/multicore debug/opencl diff --git a/game_of_life/README.md b/game_of_life/README.md deleted file mode 100644 index d31463ccac39ea0bdbc3238426e898f5471f4f9d..0000000000000000000000000000000000000000 --- a/game_of_life/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# Jeu de la vie en Futhark/C - -Le but de ce projet est de créer le jeu de la vie en Futhark + C avec l'affichage du monde dans une fenêtre SDL gérée par Futhark. -La contrainte de cette version est que le monde est représenté dans un tableau en une dimension. 
- -## Construire le projet - -* Exécuter la commande `futhark pkg sync` -* Exécuter la commande `make` -* Les exécutables sont présents dans le dossier `cmake-build-debug` et/ou `cmake-build-release` - * `./game_of_life_opencl` - * `./game_of_life_multicore` diff --git a/game_of_life/futhark.pkg b/game_of_life/futhark.pkg deleted file mode 100644 index 80bc4b6b4457688252b547676b59fe0e1c1e71a7..0000000000000000000000000000000000000000 --- a/game_of_life/futhark.pkg +++ /dev/null @@ -1,3 +0,0 @@ -require { - github.com/diku-dk/lys 0.1.12 #34e5ff985fefac9a9627d49e26a19ef5352e7019 -} diff --git a/game_of_life/gol.c b/game_of_life/gol.c deleted file mode 100644 index 90ca1da44a0de5d1684a998ab3b68f315fea7141..0000000000000000000000000000000000000000 --- a/game_of_life/gol.c +++ /dev/null @@ -1,5273 +0,0 @@ -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif -#ifdef __GNUC__ -#pragma GCC diagnostic ignored "-Wunused-function" -#pragma GCC diagnostic ignored "-Wunused-variable" -#pragma GCC diagnostic ignored "-Wparentheses" -#pragma GCC diagnostic ignored "-Wunused-label" -#pragma GCC diagnostic ignored "-Wunused-but-set-variable" -#endif -#ifdef __clang__ -#pragma clang diagnostic ignored "-Wunused-function" -#pragma clang diagnostic ignored "-Wunused-variable" -#pragma clang diagnostic ignored "-Wparentheses" -#pragma clang diagnostic ignored "-Wunused-label" -#endif -// Headers - -#include <stdint.h> -#include <stddef.h> -#include <stdbool.h> -#include <stdio.h> -#include <float.h> - -#ifdef __cplusplus -extern "C" { -#endif - -// Initialisation - -struct futhark_context_config ; -struct futhark_context_config *futhark_context_config_new(void); -void futhark_context_config_free(struct futhark_context_config *cfg); -void futhark_context_config_set_debugging(struct futhark_context_config *cfg, - int flag); -void futhark_context_config_set_profiling(struct futhark_context_config *cfg, - int flag); -void futhark_context_config_set_logging(struct futhark_context_config *cfg, 
- int flag); -void futhark_context_config_set_num_threads(struct futhark_context_config *cfg, - int n); -struct futhark_context ; -struct futhark_context *futhark_context_new(struct futhark_context_config *cfg); -void futhark_context_free(struct futhark_context *ctx); -int futhark_context_sync(struct futhark_context *ctx); -int futhark_context_config_set_size(struct futhark_context_config *cfg, const - char *size_name, size_t size_value); -int futhark_get_num_sizes(void); -const char *futhark_get_size_name(int); -const char *futhark_get_size_class(int); - -// Arrays - -struct futhark_i8_1d ; -struct futhark_i8_1d *futhark_new_i8_1d(struct futhark_context *ctx, const - int8_t *data, int64_t dim0); -struct futhark_i8_1d *futhark_new_raw_i8_1d(struct futhark_context *ctx, const - char *data, int offset, - int64_t dim0); -int futhark_free_i8_1d(struct futhark_context *ctx, struct futhark_i8_1d *arr); -int futhark_values_i8_1d(struct futhark_context *ctx, struct futhark_i8_1d *arr, - int8_t *data); -char *futhark_values_raw_i8_1d(struct futhark_context *ctx, - struct futhark_i8_1d *arr); -const int64_t *futhark_shape_i8_1d(struct futhark_context *ctx, - struct futhark_i8_1d *arr); -struct futhark_u32_2d ; -struct futhark_u32_2d *futhark_new_u32_2d(struct futhark_context *ctx, const - uint32_t *data, int64_t dim0, - int64_t dim1); -struct futhark_u32_2d *futhark_new_raw_u32_2d(struct futhark_context *ctx, const - char *data, int offset, - int64_t dim0, int64_t dim1); -int futhark_free_u32_2d(struct futhark_context *ctx, - struct futhark_u32_2d *arr); -int futhark_values_u32_2d(struct futhark_context *ctx, - struct futhark_u32_2d *arr, uint32_t *data); -char *futhark_values_raw_u32_2d(struct futhark_context *ctx, - struct futhark_u32_2d *arr); -const int64_t *futhark_shape_u32_2d(struct futhark_context *ctx, - struct futhark_u32_2d *arr); - -// Opaque values - -struct futhark_opaque_state ; -int futhark_free_opaque_state(struct futhark_context *ctx, - struct 
futhark_opaque_state *obj); -int futhark_store_opaque_state(struct futhark_context *ctx, const - struct futhark_opaque_state *obj, void **p, - size_t *n); -struct futhark_opaque_state -*futhark_restore_opaque_state(struct futhark_context *ctx, const void *p); - -// Entry points - -int futhark_entry_init(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const - struct futhark_i8_1d *in0, const int64_t in1, const - int64_t in2, const int64_t in3); -int futhark_entry_key(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const int32_t in0, - const int32_t in1, const - struct futhark_opaque_state *in2); -int futhark_entry_mouse(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const int32_t in0, - const int32_t in1, const int32_t in2, const - struct futhark_opaque_state *in3); -int futhark_entry_render(struct futhark_context *ctx, - struct futhark_u32_2d **out0, const - struct futhark_opaque_state *in0); -int futhark_entry_resize(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const int64_t in0, - const int64_t in1, const - struct futhark_opaque_state *in2); -int futhark_entry_step(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const float in0, - const struct futhark_opaque_state *in1); -int futhark_entry_wheel(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const int32_t in0, - const int32_t in1, const - struct futhark_opaque_state *in2); - -// Miscellaneous - -char *futhark_context_report(struct futhark_context *ctx); -char *futhark_context_get_error(struct futhark_context *ctx); -void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f); -void futhark_context_pause_profiling(struct futhark_context *ctx); -void futhark_context_unpause_profiling(struct futhark_context *ctx); -int futhark_context_clear_caches(struct futhark_context *ctx); -#define FUTHARK_BACKEND_multicore -#ifdef __cplusplus -} -#endif -#include <stdio.h> -#include <stdlib.h> 
-#include <stdbool.h> -#include <math.h> -#include <stdint.h> -#undef NDEBUG -#include <assert.h> -#include <stdarg.h> -// Start of util.h. -// -// Various helper functions that are useful in all generated C code. - -#include <errno.h> -#include <string.h> - -static const char *fut_progname = "(embedded Futhark)"; - -static void futhark_panic(int eval, const char *fmt, ...) { - va_list ap; - va_start(ap, fmt); - fprintf(stderr, "%s: ", fut_progname); - vfprintf(stderr, fmt, ap); - va_end(ap); - exit(eval); -} - -// For generating arbitrary-sized error messages. It is the callers -// responsibility to free the buffer at some point. -static char* msgprintf(const char *s, ...) { - va_list vl; - va_start(vl, s); - size_t needed = 1 + (size_t)vsnprintf(NULL, 0, s, vl); - char *buffer = (char*) malloc(needed); - va_start(vl, s); // Must re-init. - vsnprintf(buffer, needed, s, vl); - return buffer; -} - - -static inline void check_err(int errval, int sets_errno, const char *fun, int line, - const char *msg, ...) { - if (errval) { - char errnum[10]; - - va_list vl; - va_start(vl, msg); - - fprintf(stderr, "ERROR: "); - vfprintf(stderr, msg, vl); - fprintf(stderr, " in %s() at line %d with error code %s\n", - fun, line, - sets_errno ? strerror(errno) : errnum); - exit(errval); - } -} - -#define CHECK_ERR(err, msg...) check_err(err, 0, __func__, __LINE__, msg) -#define CHECK_ERRNO(err, msg...) check_err(err, 1, __func__, __LINE__, msg) - -// Read the rest of an open file into a NUL-terminated string; returns -// NULL on error. 
-static void* fslurp_file(FILE *f, size_t *size) { - size_t start = ftell(f); - fseek(f, 0, SEEK_END); - size_t src_size = ftell(f)-start; - fseek(f, start, SEEK_SET); - unsigned char *s = (unsigned char*) malloc(src_size + 1); - if (fread(s, 1, src_size, f) != src_size) { - free(s); - s = NULL; - } else { - s[src_size] = '\0'; - } - - if (size) { - *size = src_size; - } - - return s; -} - -// Read a file into a NUL-terminated string; returns NULL on error. -static void* slurp_file(const char *filename, size_t *size) { - FILE *f = fopen(filename, "rb"); // To avoid Windows messing with linebreaks. - if (f == NULL) return NULL; - unsigned char *s = fslurp_file(f, size); - fclose(f); - return s; -} - -// Dump 'n' bytes from 'buf' into the file at the designated location. -// Returns 0 on success. -static int dump_file(const char *file, const void *buf, size_t n) { - FILE *f = fopen(file, "w"); - - if (f == NULL) { - return 1; - } - - if (fwrite(buf, sizeof(char), n, f) != n) { - return 1; - } - - if (fclose(f) != 0) { - return 1; - } - - return 0; -} - -struct str_builder { - char *str; - size_t capacity; // Size of buffer. - size_t used; // Bytes used, *not* including final zero. -}; - -static void str_builder_init(struct str_builder *b) { - b->capacity = 10; - b->used = 0; - b->str = malloc(b->capacity); - b->str[0] = 0; -} - -static void str_builder(struct str_builder *b, const char *s, ...) { - va_list vl; - va_start(vl, s); - size_t needed = (size_t)vsnprintf(NULL, 0, s, vl); - - while (b->capacity < b->used + needed + 1) { - b->capacity *= 2; - b->str = realloc(b->str, b->capacity); - } - - va_start(vl, s); // Must re-init. - vsnprintf(b->str+b->used, b->capacity-b->used, s, vl); - b->used += needed; -} - -// End of util.h. - -// Start of timing.h. - -// The function get_wall_time() returns the wall time in microseconds -// (with an unspecified offset). 
- -#ifdef _WIN32 - -#include <windows.h> - -static int64_t get_wall_time(void) { - LARGE_INTEGER time,freq; - assert(QueryPerformanceFrequency(&freq)); - assert(QueryPerformanceCounter(&time)); - return ((double)time.QuadPart / freq.QuadPart) * 1000000; -} - -#else -// Assuming POSIX - -#include <time.h> -#include <sys/time.h> - -static int64_t get_wall_time(void) { - struct timeval time; - assert(gettimeofday(&time,NULL) == 0); - return time.tv_sec * 1000000 + time.tv_usec; -} - -static int64_t get_wall_time_ns(void) { - struct timespec time; - assert(clock_gettime(CLOCK_REALTIME, &time) == 0); - return time.tv_sec * 1000000000 + time.tv_nsec; -} - -#endif - -// End of timing.h. - -#ifdef _MSC_VER -#define inline __inline -#endif -#include <string.h> -#include <string.h> -#include <errno.h> -#include <assert.h> -#include <ctype.h> - -// Start of lock.h. - -// A very simple cross-platform implementation of locks. Uses -// pthreads on Unix and some Windows thing there. Futhark's -// host-level code is not multithreaded, but user code may be, so we -// need some mechanism for ensuring atomic access to API functions. -// This is that mechanism. It is not exposed to user code at all, so -// we do not have to worry about name collisions. - -#ifdef _WIN32 - -typedef HANDLE lock_t; - -static void create_lock(lock_t *lock) { - *lock = CreateMutex(NULL, // Default security attributes. - FALSE, // Initially unlocked. - NULL); // Unnamed. 
-} - -static void lock_lock(lock_t *lock) { - assert(WaitForSingleObject(*lock, INFINITE) == WAIT_OBJECT_0); -} - -static void lock_unlock(lock_t *lock) { - assert(ReleaseMutex(*lock)); -} - -static void free_lock(lock_t *lock) { - CloseHandle(*lock); -} - -#else -// Assuming POSIX - -#include <pthread.h> - -typedef pthread_mutex_t lock_t; - -static void create_lock(lock_t *lock) { - int r = pthread_mutex_init(lock, NULL); - assert(r == 0); -} - -static void lock_lock(lock_t *lock) { - int r = pthread_mutex_lock(lock); - assert(r == 0); -} - -static void lock_unlock(lock_t *lock) { - int r = pthread_mutex_unlock(lock); - assert(r == 0); -} - -static void free_lock(lock_t *lock) { - // Nothing to do for pthreads. - (void)lock; -} - -#endif - -// End of lock.h. - -static inline uint8_t add8(uint8_t x, uint8_t y) -{ - return x + y; -} -static inline uint16_t add16(uint16_t x, uint16_t y) -{ - return x + y; -} -static inline uint32_t add32(uint32_t x, uint32_t y) -{ - return x + y; -} -static inline uint64_t add64(uint64_t x, uint64_t y) -{ - return x + y; -} -static inline uint8_t sub8(uint8_t x, uint8_t y) -{ - return x - y; -} -static inline uint16_t sub16(uint16_t x, uint16_t y) -{ - return x - y; -} -static inline uint32_t sub32(uint32_t x, uint32_t y) -{ - return x - y; -} -static inline uint64_t sub64(uint64_t x, uint64_t y) -{ - return x - y; -} -static inline uint8_t mul8(uint8_t x, uint8_t y) -{ - return x * y; -} -static inline uint16_t mul16(uint16_t x, uint16_t y) -{ - return x * y; -} -static inline uint32_t mul32(uint32_t x, uint32_t y) -{ - return x * y; -} -static inline uint64_t mul64(uint64_t x, uint64_t y) -{ - return x * y; -} -static inline uint8_t udiv8(uint8_t x, uint8_t y) -{ - return x / y; -} -static inline uint16_t udiv16(uint16_t x, uint16_t y) -{ - return x / y; -} -static inline uint32_t udiv32(uint32_t x, uint32_t y) -{ - return x / y; -} -static inline uint64_t udiv64(uint64_t x, uint64_t y) -{ - return x / y; -} -static inline uint8_t 
udiv_up8(uint8_t x, uint8_t y) -{ - return (x + y - 1) / y; -} -static inline uint16_t udiv_up16(uint16_t x, uint16_t y) -{ - return (x + y - 1) / y; -} -static inline uint32_t udiv_up32(uint32_t x, uint32_t y) -{ - return (x + y - 1) / y; -} -static inline uint64_t udiv_up64(uint64_t x, uint64_t y) -{ - return (x + y - 1) / y; -} -static inline uint8_t umod8(uint8_t x, uint8_t y) -{ - return x % y; -} -static inline uint16_t umod16(uint16_t x, uint16_t y) -{ - return x % y; -} -static inline uint32_t umod32(uint32_t x, uint32_t y) -{ - return x % y; -} -static inline uint64_t umod64(uint64_t x, uint64_t y) -{ - return x % y; -} -static inline uint8_t udiv_safe8(uint8_t x, uint8_t y) -{ - return y == 0 ? 0 : x / y; -} -static inline uint16_t udiv_safe16(uint16_t x, uint16_t y) -{ - return y == 0 ? 0 : x / y; -} -static inline uint32_t udiv_safe32(uint32_t x, uint32_t y) -{ - return y == 0 ? 0 : x / y; -} -static inline uint64_t udiv_safe64(uint64_t x, uint64_t y) -{ - return y == 0 ? 0 : x / y; -} -static inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) -{ - return y == 0 ? 0 : (x + y - 1) / y; -} -static inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) -{ - return y == 0 ? 0 : (x + y - 1) / y; -} -static inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) -{ - return y == 0 ? 0 : (x + y - 1) / y; -} -static inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) -{ - return y == 0 ? 0 : (x + y - 1) / y; -} -static inline uint8_t umod_safe8(uint8_t x, uint8_t y) -{ - return y == 0 ? 0 : x % y; -} -static inline uint16_t umod_safe16(uint16_t x, uint16_t y) -{ - return y == 0 ? 0 : x % y; -} -static inline uint32_t umod_safe32(uint32_t x, uint32_t y) -{ - return y == 0 ? 0 : x % y; -} -static inline uint64_t umod_safe64(uint64_t x, uint64_t y) -{ - return y == 0 ? 0 : x % y; -} -static inline int8_t sdiv8(int8_t x, int8_t y) -{ - int8_t q = x / y; - int8_t r = x % y; - - return q - ((r != 0 && r < 0 != y < 0) ? 
1 : 0); -} -static inline int16_t sdiv16(int16_t x, int16_t y) -{ - int16_t q = x / y; - int16_t r = x % y; - - return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); -} -static inline int32_t sdiv32(int32_t x, int32_t y) -{ - int32_t q = x / y; - int32_t r = x % y; - - return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); -} -static inline int64_t sdiv64(int64_t x, int64_t y) -{ - int64_t q = x / y; - int64_t r = x % y; - - return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); -} -static inline int8_t sdiv_up8(int8_t x, int8_t y) -{ - return sdiv8(x + y - 1, y); -} -static inline int16_t sdiv_up16(int16_t x, int16_t y) -{ - return sdiv16(x + y - 1, y); -} -static inline int32_t sdiv_up32(int32_t x, int32_t y) -{ - return sdiv32(x + y - 1, y); -} -static inline int64_t sdiv_up64(int64_t x, int64_t y) -{ - return sdiv64(x + y - 1, y); -} -static inline int8_t smod8(int8_t x, int8_t y) -{ - int8_t r = x % y; - - return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); -} -static inline int16_t smod16(int16_t x, int16_t y) -{ - int16_t r = x % y; - - return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); -} -static inline int32_t smod32(int32_t x, int32_t y) -{ - int32_t r = x % y; - - return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); -} -static inline int64_t smod64(int64_t x, int64_t y) -{ - int64_t r = x % y; - - return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); -} -static inline int8_t sdiv_safe8(int8_t x, int8_t y) -{ - return y == 0 ? 0 : sdiv8(x, y); -} -static inline int16_t sdiv_safe16(int16_t x, int16_t y) -{ - return y == 0 ? 0 : sdiv16(x, y); -} -static inline int32_t sdiv_safe32(int32_t x, int32_t y) -{ - return y == 0 ? 0 : sdiv32(x, y); -} -static inline int64_t sdiv_safe64(int64_t x, int64_t y) -{ - return y == 0 ? 
0 : sdiv64(x, y); -} -static inline int8_t sdiv_up_safe8(int8_t x, int8_t y) -{ - return sdiv_safe8(x + y - 1, y); -} -static inline int16_t sdiv_up_safe16(int16_t x, int16_t y) -{ - return sdiv_safe16(x + y - 1, y); -} -static inline int32_t sdiv_up_safe32(int32_t x, int32_t y) -{ - return sdiv_safe32(x + y - 1, y); -} -static inline int64_t sdiv_up_safe64(int64_t x, int64_t y) -{ - return sdiv_safe64(x + y - 1, y); -} -static inline int8_t smod_safe8(int8_t x, int8_t y) -{ - return y == 0 ? 0 : smod8(x, y); -} -static inline int16_t smod_safe16(int16_t x, int16_t y) -{ - return y == 0 ? 0 : smod16(x, y); -} -static inline int32_t smod_safe32(int32_t x, int32_t y) -{ - return y == 0 ? 0 : smod32(x, y); -} -static inline int64_t smod_safe64(int64_t x, int64_t y) -{ - return y == 0 ? 0 : smod64(x, y); -} -static inline int8_t squot8(int8_t x, int8_t y) -{ - return x / y; -} -static inline int16_t squot16(int16_t x, int16_t y) -{ - return x / y; -} -static inline int32_t squot32(int32_t x, int32_t y) -{ - return x / y; -} -static inline int64_t squot64(int64_t x, int64_t y) -{ - return x / y; -} -static inline int8_t srem8(int8_t x, int8_t y) -{ - return x % y; -} -static inline int16_t srem16(int16_t x, int16_t y) -{ - return x % y; -} -static inline int32_t srem32(int32_t x, int32_t y) -{ - return x % y; -} -static inline int64_t srem64(int64_t x, int64_t y) -{ - return x % y; -} -static inline int8_t squot_safe8(int8_t x, int8_t y) -{ - return y == 0 ? 0 : x / y; -} -static inline int16_t squot_safe16(int16_t x, int16_t y) -{ - return y == 0 ? 0 : x / y; -} -static inline int32_t squot_safe32(int32_t x, int32_t y) -{ - return y == 0 ? 0 : x / y; -} -static inline int64_t squot_safe64(int64_t x, int64_t y) -{ - return y == 0 ? 0 : x / y; -} -static inline int8_t srem_safe8(int8_t x, int8_t y) -{ - return y == 0 ? 0 : x % y; -} -static inline int16_t srem_safe16(int16_t x, int16_t y) -{ - return y == 0 ? 
0 : x % y; -} -static inline int32_t srem_safe32(int32_t x, int32_t y) -{ - return y == 0 ? 0 : x % y; -} -static inline int64_t srem_safe64(int64_t x, int64_t y) -{ - return y == 0 ? 0 : x % y; -} -static inline int8_t smin8(int8_t x, int8_t y) -{ - return x < y ? x : y; -} -static inline int16_t smin16(int16_t x, int16_t y) -{ - return x < y ? x : y; -} -static inline int32_t smin32(int32_t x, int32_t y) -{ - return x < y ? x : y; -} -static inline int64_t smin64(int64_t x, int64_t y) -{ - return x < y ? x : y; -} -static inline uint8_t umin8(uint8_t x, uint8_t y) -{ - return x < y ? x : y; -} -static inline uint16_t umin16(uint16_t x, uint16_t y) -{ - return x < y ? x : y; -} -static inline uint32_t umin32(uint32_t x, uint32_t y) -{ - return x < y ? x : y; -} -static inline uint64_t umin64(uint64_t x, uint64_t y) -{ - return x < y ? x : y; -} -static inline int8_t smax8(int8_t x, int8_t y) -{ - return x < y ? y : x; -} -static inline int16_t smax16(int16_t x, int16_t y) -{ - return x < y ? y : x; -} -static inline int32_t smax32(int32_t x, int32_t y) -{ - return x < y ? y : x; -} -static inline int64_t smax64(int64_t x, int64_t y) -{ - return x < y ? y : x; -} -static inline uint8_t umax8(uint8_t x, uint8_t y) -{ - return x < y ? y : x; -} -static inline uint16_t umax16(uint16_t x, uint16_t y) -{ - return x < y ? y : x; -} -static inline uint32_t umax32(uint32_t x, uint32_t y) -{ - return x < y ? y : x; -} -static inline uint64_t umax64(uint64_t x, uint64_t y) -{ - return x < y ? 
y : x; -} -static inline uint8_t shl8(uint8_t x, uint8_t y) -{ - return x << y; -} -static inline uint16_t shl16(uint16_t x, uint16_t y) -{ - return x << y; -} -static inline uint32_t shl32(uint32_t x, uint32_t y) -{ - return x << y; -} -static inline uint64_t shl64(uint64_t x, uint64_t y) -{ - return x << y; -} -static inline uint8_t lshr8(uint8_t x, uint8_t y) -{ - return x >> y; -} -static inline uint16_t lshr16(uint16_t x, uint16_t y) -{ - return x >> y; -} -static inline uint32_t lshr32(uint32_t x, uint32_t y) -{ - return x >> y; -} -static inline uint64_t lshr64(uint64_t x, uint64_t y) -{ - return x >> y; -} -static inline int8_t ashr8(int8_t x, int8_t y) -{ - return x >> y; -} -static inline int16_t ashr16(int16_t x, int16_t y) -{ - return x >> y; -} -static inline int32_t ashr32(int32_t x, int32_t y) -{ - return x >> y; -} -static inline int64_t ashr64(int64_t x, int64_t y) -{ - return x >> y; -} -static inline uint8_t and8(uint8_t x, uint8_t y) -{ - return x & y; -} -static inline uint16_t and16(uint16_t x, uint16_t y) -{ - return x & y; -} -static inline uint32_t and32(uint32_t x, uint32_t y) -{ - return x & y; -} -static inline uint64_t and64(uint64_t x, uint64_t y) -{ - return x & y; -} -static inline uint8_t or8(uint8_t x, uint8_t y) -{ - return x | y; -} -static inline uint16_t or16(uint16_t x, uint16_t y) -{ - return x | y; -} -static inline uint32_t or32(uint32_t x, uint32_t y) -{ - return x | y; -} -static inline uint64_t or64(uint64_t x, uint64_t y) -{ - return x | y; -} -static inline uint8_t xor8(uint8_t x, uint8_t y) -{ - return x ^ y; -} -static inline uint16_t xor16(uint16_t x, uint16_t y) -{ - return x ^ y; -} -static inline uint32_t xor32(uint32_t x, uint32_t y) -{ - return x ^ y; -} -static inline uint64_t xor64(uint64_t x, uint64_t y) -{ - return x ^ y; -} -static inline bool ult8(uint8_t x, uint8_t y) -{ - return x < y; -} -static inline bool ult16(uint16_t x, uint16_t y) -{ - return x < y; -} -static inline bool ult32(uint32_t x, 
uint32_t y) -{ - return x < y; -} -static inline bool ult64(uint64_t x, uint64_t y) -{ - return x < y; -} -static inline bool ule8(uint8_t x, uint8_t y) -{ - return x <= y; -} -static inline bool ule16(uint16_t x, uint16_t y) -{ - return x <= y; -} -static inline bool ule32(uint32_t x, uint32_t y) -{ - return x <= y; -} -static inline bool ule64(uint64_t x, uint64_t y) -{ - return x <= y; -} -static inline bool slt8(int8_t x, int8_t y) -{ - return x < y; -} -static inline bool slt16(int16_t x, int16_t y) -{ - return x < y; -} -static inline bool slt32(int32_t x, int32_t y) -{ - return x < y; -} -static inline bool slt64(int64_t x, int64_t y) -{ - return x < y; -} -static inline bool sle8(int8_t x, int8_t y) -{ - return x <= y; -} -static inline bool sle16(int16_t x, int16_t y) -{ - return x <= y; -} -static inline bool sle32(int32_t x, int32_t y) -{ - return x <= y; -} -static inline bool sle64(int64_t x, int64_t y) -{ - return x <= y; -} -static inline int8_t pow8(int8_t x, int8_t y) -{ - int8_t res = 1, rem = y; - - while (rem != 0) { - if (rem & 1) - res *= x; - rem >>= 1; - x *= x; - } - return res; -} -static inline int16_t pow16(int16_t x, int16_t y) -{ - int16_t res = 1, rem = y; - - while (rem != 0) { - if (rem & 1) - res *= x; - rem >>= 1; - x *= x; - } - return res; -} -static inline int32_t pow32(int32_t x, int32_t y) -{ - int32_t res = 1, rem = y; - - while (rem != 0) { - if (rem & 1) - res *= x; - rem >>= 1; - x *= x; - } - return res; -} -static inline int64_t pow64(int64_t x, int64_t y) -{ - int64_t res = 1, rem = y; - - while (rem != 0) { - if (rem & 1) - res *= x; - rem >>= 1; - x *= x; - } - return res; -} -static inline bool itob_i8_bool(int8_t x) -{ - return x; -} -static inline bool itob_i16_bool(int16_t x) -{ - return x; -} -static inline bool itob_i32_bool(int32_t x) -{ - return x; -} -static inline bool itob_i64_bool(int64_t x) -{ - return x; -} -static inline int8_t btoi_bool_i8(bool x) -{ - return x; -} -static inline int16_t 
btoi_bool_i16(bool x) -{ - return x; -} -static inline int32_t btoi_bool_i32(bool x) -{ - return x; -} -static inline int64_t btoi_bool_i64(bool x) -{ - return x; -} -#define sext_i8_i8(x) ((int8_t) (int8_t) x) -#define sext_i8_i16(x) ((int16_t) (int8_t) x) -#define sext_i8_i32(x) ((int32_t) (int8_t) x) -#define sext_i8_i64(x) ((int64_t) (int8_t) x) -#define sext_i16_i8(x) ((int8_t) (int16_t) x) -#define sext_i16_i16(x) ((int16_t) (int16_t) x) -#define sext_i16_i32(x) ((int32_t) (int16_t) x) -#define sext_i16_i64(x) ((int64_t) (int16_t) x) -#define sext_i32_i8(x) ((int8_t) (int32_t) x) -#define sext_i32_i16(x) ((int16_t) (int32_t) x) -#define sext_i32_i32(x) ((int32_t) (int32_t) x) -#define sext_i32_i64(x) ((int64_t) (int32_t) x) -#define sext_i64_i8(x) ((int8_t) (int64_t) x) -#define sext_i64_i16(x) ((int16_t) (int64_t) x) -#define sext_i64_i32(x) ((int32_t) (int64_t) x) -#define sext_i64_i64(x) ((int64_t) (int64_t) x) -#define zext_i8_i8(x) ((int8_t) (uint8_t) x) -#define zext_i8_i16(x) ((int16_t) (uint8_t) x) -#define zext_i8_i32(x) ((int32_t) (uint8_t) x) -#define zext_i8_i64(x) ((int64_t) (uint8_t) x) -#define zext_i16_i8(x) ((int8_t) (uint16_t) x) -#define zext_i16_i16(x) ((int16_t) (uint16_t) x) -#define zext_i16_i32(x) ((int32_t) (uint16_t) x) -#define zext_i16_i64(x) ((int64_t) (uint16_t) x) -#define zext_i32_i8(x) ((int8_t) (uint32_t) x) -#define zext_i32_i16(x) ((int16_t) (uint32_t) x) -#define zext_i32_i32(x) ((int32_t) (uint32_t) x) -#define zext_i32_i64(x) ((int64_t) (uint32_t) x) -#define zext_i64_i8(x) ((int8_t) (uint64_t) x) -#define zext_i64_i16(x) ((int16_t) (uint64_t) x) -#define zext_i64_i32(x) ((int32_t) (uint64_t) x) -#define zext_i64_i64(x) ((int64_t) (uint64_t) x) -#if defined(__OPENCL_VERSION__) -static int32_t futrts_popc8(int8_t x) -{ - return popcount(x); -} -static int32_t futrts_popc16(int16_t x) -{ - return popcount(x); -} -static int32_t futrts_popc32(int32_t x) -{ - return popcount(x); -} -static int32_t futrts_popc64(int64_t x) -{ 
- return popcount(x); -} -#elif defined(__CUDA_ARCH__) -static int32_t futrts_popc8(int8_t x) -{ - return __popc(zext_i8_i32(x)); -} -static int32_t futrts_popc16(int16_t x) -{ - return __popc(zext_i16_i32(x)); -} -static int32_t futrts_popc32(int32_t x) -{ - return __popc(x); -} -static int32_t futrts_popc64(int64_t x) -{ - return __popcll(x); -} -#else -static int32_t futrts_popc8(int8_t x) -{ - int c = 0; - - for (; x; ++c) - x &= x - 1; - return c; -} -static int32_t futrts_popc16(int16_t x) -{ - int c = 0; - - for (; x; ++c) - x &= x - 1; - return c; -} -static int32_t futrts_popc32(int32_t x) -{ - int c = 0; - - for (; x; ++c) - x &= x - 1; - return c; -} -static int32_t futrts_popc64(int64_t x) -{ - int c = 0; - - for (; x; ++c) - x &= x - 1; - return c; -} -#endif -#if defined(__OPENCL_VERSION__) -static uint8_t futrts_mul_hi8(uint8_t a, uint8_t b) -{ - return mul_hi(a, b); -} -static uint16_t futrts_mul_hi16(uint16_t a, uint16_t b) -{ - return mul_hi(a, b); -} -static uint32_t futrts_mul_hi32(uint32_t a, uint32_t b) -{ - return mul_hi(a, b); -} -static uint64_t futrts_mul_hi64(uint64_t a, uint64_t b) -{ - return mul_hi(a, b); -} -#elif defined(__CUDA_ARCH__) -static uint8_t futrts_mul_hi8(uint8_t a, uint8_t b) -{ - uint16_t aa = a; - uint16_t bb = b; - - return aa * bb >> 8; -} -static uint16_t futrts_mul_hi16(uint16_t a, uint16_t b) -{ - uint32_t aa = a; - uint32_t bb = b; - - return aa * bb >> 16; -} -static uint32_t futrts_mul_hi32(uint32_t a, uint32_t b) -{ - return mulhi(a, b); -} -static uint64_t futrts_mul_hi64(uint64_t a, uint64_t b) -{ - return mul64hi(a, b); -} -#else -static uint8_t futrts_mul_hi8(uint8_t a, uint8_t b) -{ - uint16_t aa = a; - uint16_t bb = b; - - return aa * bb >> 8; -} -static uint16_t futrts_mul_hi16(uint16_t a, uint16_t b) -{ - uint32_t aa = a; - uint32_t bb = b; - - return aa * bb >> 16; -} -static uint32_t futrts_mul_hi32(uint32_t a, uint32_t b) -{ - uint64_t aa = a; - uint64_t bb = b; - - return aa * bb >> 32; -} -static 
uint64_t futrts_mul_hi64(uint64_t a, uint64_t b) -{ - __uint128_t aa = a; - __uint128_t bb = b; - - return aa * bb >> 64; -} -#endif -#if defined(__OPENCL_VERSION__) -static uint8_t futrts_mad_hi8(uint8_t a, uint8_t b, uint8_t c) -{ - return mad_hi(a, b, c); -} -static uint16_t futrts_mad_hi16(uint16_t a, uint16_t b, uint16_t c) -{ - return mad_hi(a, b, c); -} -static uint32_t futrts_mad_hi32(uint32_t a, uint32_t b, uint32_t c) -{ - return mad_hi(a, b, c); -} -static uint64_t futrts_mad_hi64(uint64_t a, uint64_t b, uint64_t c) -{ - return mad_hi(a, b, c); -} -#else -static uint8_t futrts_mad_hi8(uint8_t a, uint8_t b, uint8_t c) -{ - return futrts_mul_hi8(a, b) + c; -} -static uint16_t futrts_mad_hi16(uint16_t a, uint16_t b, uint16_t c) -{ - return futrts_mul_hi16(a, b) + c; -} -static uint32_t futrts_mad_hi32(uint32_t a, uint32_t b, uint32_t c) -{ - return futrts_mul_hi32(a, b) + c; -} -static uint64_t futrts_mad_hi64(uint64_t a, uint64_t b, uint64_t c) -{ - return futrts_mul_hi64(a, b) + c; -} -#endif -#if defined(__OPENCL_VERSION__) -static int32_t futrts_clzz8(int8_t x) -{ - return clz(x); -} -static int32_t futrts_clzz16(int16_t x) -{ - return clz(x); -} -static int32_t futrts_clzz32(int32_t x) -{ - return clz(x); -} -static int32_t futrts_clzz64(int64_t x) -{ - return clz(x); -} -#elif defined(__CUDA_ARCH__) -static int32_t futrts_clzz8(int8_t x) -{ - return __clz(zext_i8_i32(x)) - 24; -} -static int32_t futrts_clzz16(int16_t x) -{ - return __clz(zext_i16_i32(x)) - 16; -} -static int32_t futrts_clzz32(int32_t x) -{ - return __clz(x); -} -static int32_t futrts_clzz64(int64_t x) -{ - return __clzll(x); -} -#else -static int32_t futrts_clzz8(int8_t x) -{ - int n = 0; - int bits = sizeof(x) * 8; - - for (int i = 0; i < bits; i++) { - if (x < 0) - break; - n++; - x <<= 1; - } - return n; -} -static int32_t futrts_clzz16(int16_t x) -{ - int n = 0; - int bits = sizeof(x) * 8; - - for (int i = 0; i < bits; i++) { - if (x < 0) - break; - n++; - x <<= 1; - } - return n; 
-} -static int32_t futrts_clzz32(int32_t x) -{ - int n = 0; - int bits = sizeof(x) * 8; - - for (int i = 0; i < bits; i++) { - if (x < 0) - break; - n++; - x <<= 1; - } - return n; -} -static int32_t futrts_clzz64(int64_t x) -{ - int n = 0; - int bits = sizeof(x) * 8; - - for (int i = 0; i < bits; i++) { - if (x < 0) - break; - n++; - x <<= 1; - } - return n; -} -#endif -#if defined(__OPENCL_VERSION__) -static int32_t futrts_ctzz8(int8_t x) -{ - int i = 0; - - for (; i < 8 && (x & 1) == 0; i++, x >>= 1) - ; - return i; -} -static int32_t futrts_ctzz16(int16_t x) -{ - int i = 0; - - for (; i < 16 && (x & 1) == 0; i++, x >>= 1) - ; - return i; -} -static int32_t futrts_ctzz32(int32_t x) -{ - int i = 0; - - for (; i < 32 && (x & 1) == 0; i++, x >>= 1) - ; - return i; -} -static int32_t futrts_ctzz64(int64_t x) -{ - int i = 0; - - for (; i < 64 && (x & 1) == 0; i++, x >>= 1) - ; - return i; -} -#elif defined(__CUDA_ARCH__) -static int32_t futrts_ctzz8(int8_t x) -{ - int y = __ffs(x); - - return y == 0 ? 8 : y - 1; -} -static int32_t futrts_ctzz16(int16_t x) -{ - int y = __ffs(x); - - return y == 0 ? 16 : y - 1; -} -static int32_t futrts_ctzz32(int32_t x) -{ - int y = __ffs(x); - - return y == 0 ? 32 : y - 1; -} -static int32_t futrts_ctzz64(int64_t x) -{ - int y = __ffsll(x); - - return y == 0 ? 64 : y - 1; -} -#else -static int32_t futrts_ctzz8(int8_t x) -{ - return x == 0 ? 8 : __builtin_ctz((uint32_t) x); -} -static int32_t futrts_ctzz16(int16_t x) -{ - return x == 0 ? 16 : __builtin_ctz((uint32_t) x); -} -static int32_t futrts_ctzz32(int32_t x) -{ - return x == 0 ? 32 : __builtin_ctz(x); -} -static int32_t futrts_ctzz64(int64_t x) -{ - return x == 0 ? 
64 : __builtin_ctzll(x); -} -#endif -static inline float fdiv32(float x, float y) -{ - return x / y; -} -static inline float fadd32(float x, float y) -{ - return x + y; -} -static inline float fsub32(float x, float y) -{ - return x - y; -} -static inline float fmul32(float x, float y) -{ - return x * y; -} -static inline float fmin32(float x, float y) -{ - return fmin(x, y); -} -static inline float fmax32(float x, float y) -{ - return fmax(x, y); -} -static inline float fpow32(float x, float y) -{ - return pow(x, y); -} -static inline bool cmplt32(float x, float y) -{ - return x < y; -} -static inline bool cmple32(float x, float y) -{ - return x <= y; -} -static inline float sitofp_i8_f32(int8_t x) -{ - return (float) x; -} -static inline float sitofp_i16_f32(int16_t x) -{ - return (float) x; -} -static inline float sitofp_i32_f32(int32_t x) -{ - return (float) x; -} -static inline float sitofp_i64_f32(int64_t x) -{ - return (float) x; -} -static inline float uitofp_i8_f32(uint8_t x) -{ - return (float) x; -} -static inline float uitofp_i16_f32(uint16_t x) -{ - return (float) x; -} -static inline float uitofp_i32_f32(uint32_t x) -{ - return (float) x; -} -static inline float uitofp_i64_f32(uint64_t x) -{ - return (float) x; -} -static inline int8_t fptosi_f32_i8(float x) -{ - return (int8_t) x; -} -static inline int16_t fptosi_f32_i16(float x) -{ - return (int16_t) x; -} -static inline int32_t fptosi_f32_i32(float x) -{ - return (int32_t) x; -} -static inline int64_t fptosi_f32_i64(float x) -{ - return (int64_t) x; -} -static inline uint8_t fptoui_f32_i8(float x) -{ - return (uint8_t) x; -} -static inline uint16_t fptoui_f32_i16(float x) -{ - return (uint16_t) x; -} -static inline uint32_t fptoui_f32_i32(float x) -{ - return (uint32_t) x; -} -static inline uint64_t fptoui_f32_i64(float x) -{ - return (uint64_t) x; -} -static inline double fdiv64(double x, double y) -{ - return x / y; -} -static inline double fadd64(double x, double y) -{ - return x + y; -} -static 
inline double fsub64(double x, double y) -{ - return x - y; -} -static inline double fmul64(double x, double y) -{ - return x * y; -} -static inline double fmin64(double x, double y) -{ - return fmin(x, y); -} -static inline double fmax64(double x, double y) -{ - return fmax(x, y); -} -static inline double fpow64(double x, double y) -{ - return pow(x, y); -} -static inline bool cmplt64(double x, double y) -{ - return x < y; -} -static inline bool cmple64(double x, double y) -{ - return x <= y; -} -static inline double sitofp_i8_f64(int8_t x) -{ - return (double) x; -} -static inline double sitofp_i16_f64(int16_t x) -{ - return (double) x; -} -static inline double sitofp_i32_f64(int32_t x) -{ - return (double) x; -} -static inline double sitofp_i64_f64(int64_t x) -{ - return (double) x; -} -static inline double uitofp_i8_f64(uint8_t x) -{ - return (double) x; -} -static inline double uitofp_i16_f64(uint16_t x) -{ - return (double) x; -} -static inline double uitofp_i32_f64(uint32_t x) -{ - return (double) x; -} -static inline double uitofp_i64_f64(uint64_t x) -{ - return (double) x; -} -static inline int8_t fptosi_f64_i8(double x) -{ - return (int8_t) x; -} -static inline int16_t fptosi_f64_i16(double x) -{ - return (int16_t) x; -} -static inline int32_t fptosi_f64_i32(double x) -{ - return (int32_t) x; -} -static inline int64_t fptosi_f64_i64(double x) -{ - return (int64_t) x; -} -static inline uint8_t fptoui_f64_i8(double x) -{ - return (uint8_t) x; -} -static inline uint16_t fptoui_f64_i16(double x) -{ - return (uint16_t) x; -} -static inline uint32_t fptoui_f64_i32(double x) -{ - return (uint32_t) x; -} -static inline uint64_t fptoui_f64_i64(double x) -{ - return (uint64_t) x; -} -static inline float fpconv_f32_f32(float x) -{ - return (float) x; -} -static inline double fpconv_f32_f64(float x) -{ - return (double) x; -} -static inline float fpconv_f64_f32(double x) -{ - return (float) x; -} -static inline double fpconv_f64_f64(double x) -{ - return (double) x; 
-} -static inline bool futrts_isnan32(float x) -{ - return isnan(x); -} -static inline bool futrts_isinf32(float x) -{ - return isinf(x); -} -#ifdef __OPENCL_VERSION__ -static inline float futrts_log32(float x) -{ - return log(x); -} -static inline float futrts_log2_32(float x) -{ - return log2(x); -} -static inline float futrts_log10_32(float x) -{ - return log10(x); -} -static inline float futrts_sqrt32(float x) -{ - return sqrt(x); -} -static inline float futrts_exp32(float x) -{ - return exp(x); -} -static inline float futrts_cos32(float x) -{ - return cos(x); -} -static inline float futrts_sin32(float x) -{ - return sin(x); -} -static inline float futrts_tan32(float x) -{ - return tan(x); -} -static inline float futrts_acos32(float x) -{ - return acos(x); -} -static inline float futrts_asin32(float x) -{ - return asin(x); -} -static inline float futrts_atan32(float x) -{ - return atan(x); -} -static inline float futrts_cosh32(float x) -{ - return cosh(x); -} -static inline float futrts_sinh32(float x) -{ - return sinh(x); -} -static inline float futrts_tanh32(float x) -{ - return tanh(x); -} -static inline float futrts_acosh32(float x) -{ - return acosh(x); -} -static inline float futrts_asinh32(float x) -{ - return asinh(x); -} -static inline float futrts_atanh32(float x) -{ - return atanh(x); -} -static inline float futrts_atan2_32(float x, float y) -{ - return atan2(x, y); -} -static inline float futrts_gamma32(float x) -{ - return tgamma(x); -} -static inline float futrts_lgamma32(float x) -{ - return lgamma(x); -} -static inline float fmod32(float x, float y) -{ - return fmod(x, y); -} -static inline float futrts_round32(float x) -{ - return rint(x); -} -static inline float futrts_floor32(float x) -{ - return floor(x); -} -static inline float futrts_ceil32(float x) -{ - return ceil(x); -} -static inline float futrts_lerp32(float v0, float v1, float t) -{ - return mix(v0, v1, t); -} -static inline float futrts_mad32(float a, float b, float c) -{ - return 
mad(a, b, c); -} -static inline float futrts_fma32(float a, float b, float c) -{ - return fma(a, b, c); -} -#else -static inline float futrts_log32(float x) -{ - return logf(x); -} -static inline float futrts_log2_32(float x) -{ - return log2f(x); -} -static inline float futrts_log10_32(float x) -{ - return log10f(x); -} -static inline float futrts_sqrt32(float x) -{ - return sqrtf(x); -} -static inline float futrts_exp32(float x) -{ - return expf(x); -} -static inline float futrts_cos32(float x) -{ - return cosf(x); -} -static inline float futrts_sin32(float x) -{ - return sinf(x); -} -static inline float futrts_tan32(float x) -{ - return tanf(x); -} -static inline float futrts_acos32(float x) -{ - return acosf(x); -} -static inline float futrts_asin32(float x) -{ - return asinf(x); -} -static inline float futrts_atan32(float x) -{ - return atanf(x); -} -static inline float futrts_cosh32(float x) -{ - return coshf(x); -} -static inline float futrts_sinh32(float x) -{ - return sinhf(x); -} -static inline float futrts_tanh32(float x) -{ - return tanhf(x); -} -static inline float futrts_acosh32(float x) -{ - return acoshf(x); -} -static inline float futrts_asinh32(float x) -{ - return asinhf(x); -} -static inline float futrts_atanh32(float x) -{ - return atanhf(x); -} -static inline float futrts_atan2_32(float x, float y) -{ - return atan2f(x, y); -} -static inline float futrts_gamma32(float x) -{ - return tgammaf(x); -} -static inline float futrts_lgamma32(float x) -{ - return lgammaf(x); -} -static inline float fmod32(float x, float y) -{ - return fmodf(x, y); -} -static inline float futrts_round32(float x) -{ - return rintf(x); -} -static inline float futrts_floor32(float x) -{ - return floorf(x); -} -static inline float futrts_ceil32(float x) -{ - return ceilf(x); -} -static inline float futrts_lerp32(float v0, float v1, float t) -{ - return v0 + (v1 - v0) * t; -} -static inline float futrts_mad32(float a, float b, float c) -{ - return a * b + c; -} -static 
inline float futrts_fma32(float a, float b, float c) -{ - return fmaf(a, b, c); -} -#endif -static inline int32_t futrts_to_bits32(float x) -{ - union { - float f; - int32_t t; - } p; - - p.f = x; - return p.t; -} -static inline float futrts_from_bits32(int32_t x) -{ - union { - int32_t f; - float t; - } p; - - p.f = x; - return p.t; -} -static inline float fsignum32(float x) -{ - return futrts_isnan32(x) ? x : (x > 0) - (x < 0); -} -static inline double futrts_log64(double x) -{ - return log(x); -} -static inline double futrts_log2_64(double x) -{ - return log2(x); -} -static inline double futrts_log10_64(double x) -{ - return log10(x); -} -static inline double futrts_sqrt64(double x) -{ - return sqrt(x); -} -static inline double futrts_exp64(double x) -{ - return exp(x); -} -static inline double futrts_cos64(double x) -{ - return cos(x); -} -static inline double futrts_sin64(double x) -{ - return sin(x); -} -static inline double futrts_tan64(double x) -{ - return tan(x); -} -static inline double futrts_acos64(double x) -{ - return acos(x); -} -static inline double futrts_asin64(double x) -{ - return asin(x); -} -static inline double futrts_atan64(double x) -{ - return atan(x); -} -static inline double futrts_cosh64(double x) -{ - return cosh(x); -} -static inline double futrts_sinh64(double x) -{ - return sinh(x); -} -static inline double futrts_tanh64(double x) -{ - return tanh(x); -} -static inline double futrts_acosh64(double x) -{ - return acosh(x); -} -static inline double futrts_asinh64(double x) -{ - return asinh(x); -} -static inline double futrts_atanh64(double x) -{ - return atanh(x); -} -static inline double futrts_atan2_64(double x, double y) -{ - return atan2(x, y); -} -static inline double futrts_gamma64(double x) -{ - return tgamma(x); -} -static inline double futrts_lgamma64(double x) -{ - return lgamma(x); -} -static inline double futrts_fma64(double a, double b, double c) -{ - return fma(a, b, c); -} -static inline double futrts_round64(double 
x) -{ - return rint(x); -} -static inline double futrts_ceil64(double x) -{ - return ceil(x); -} -static inline double futrts_floor64(double x) -{ - return floor(x); -} -static inline bool futrts_isnan64(double x) -{ - return isnan(x); -} -static inline bool futrts_isinf64(double x) -{ - return isinf(x); -} -static inline int64_t futrts_to_bits64(double x) -{ - union { - double f; - int64_t t; - } p; - - p.f = x; - return p.t; -} -static inline double futrts_from_bits64(int64_t x) -{ - union { - int64_t f; - double t; - } p; - - p.f = x; - return p.t; -} -static inline double fmod64(double x, double y) -{ - return fmod(x, y); -} -static inline double fsignum64(double x) -{ - return futrts_isnan64(x) ? x : (x > 0) - (x < 0); -} -#ifdef __OPENCL_VERSION__ -static inline double futrts_lerp64(double v0, double v1, double t) -{ - return mix(v0, v1, t); -} -static inline double futrts_mad64(double a, double b, double c) -{ - return mad(a, b, c); -} -#else -static inline double futrts_lerp64(double v0, double v1, double t) -{ - return v0 + (v1 - v0) * t; -} -static inline double futrts_mad64(double a, double b, double c) -{ - return a * b + c; -} -#endif -static int init_constants(struct futhark_context *); -static int free_constants(struct futhark_context *); -struct memblock { - int *references; - char *mem; - int64_t size; - const char *desc; -} ; -// start of scheduler.h - -// First, the API that the generated code will access. In principle, -// we could then compile the scheduler separately and link an object -// file with the generated code. In practice, we will embed all of -// this in the generated code. - -// Scheduler handle. -struct scheduler; - -// Initialise a scheduler (and start worker threads). -static int scheduler_init(struct scheduler *scheduler, - int num_workers, - double kappa); - -// Shut down a scheduler (and destroy worker threads). 
static int scheduler_destroy(struct scheduler *scheduler);

// Figure out the smallest amount of work that amortises task
// creation.  The result is written through `kappa`; struct scheduler
// documents its kappa field as a time in nanoseconds.
static int determine_kappa(double *kappa);

// How a segop should be scheduled.
enum scheduling {
  DYNAMIC,
  STATIC
};

// How a given task should be executed.  Filled out by the scheduler
// and passed to the segop function
struct scheduler_info {
  // NOTE(review): the three fields below presumably describe how the
  // iteration space is split (iterations per subtask, leftover
  // iterations, number of subtasks) -- confirm against
  // scheduler_prepare_task, which is not visible from here.
  int64_t iter_pr_subtask;
  int64_t remainder;
  int nsubtasks;
  enum scheduling sched;
  int wake_up_threads;

  // Shared per-task accumulators.  compute_chunk_size() divides
  // *task_time by *task_iter to obtain a per-iteration cost.
  int64_t *task_time;
  int64_t *task_iter;
};

// A segop function.  This is what you hand the scheduler for
// execution.
typedef int (*segop_fn)(void* args,
                        int64_t iterations,
                        int tid,
                        struct scheduler_info info);

// A task for the scheduler to execute.
struct scheduler_segop {
  void *args;
  // NOTE(review): presumably one of these is chosen depending on
  // whether the segop is launched at the top level or from inside
  // another parallel task -- confirm in scheduler_prepare_task.
  segop_fn top_level_fn;
  segop_fn nested_fn;
  int64_t iterations;
  enum scheduling sched;

  // Pointers to timer and iter associated with the task
  int64_t *task_time;
  int64_t *task_iter;

  // For debugging
  const char* name;
};

// Takes a segop description; defined later in the file.
static inline int scheduler_prepare_task(struct scheduler *scheduler,
                                         struct scheduler_segop *task);

// A parloop function.  struct subtask stores one of these together
// with the `args`, `start`, `end` and `id` values of the subtask.
typedef int (*parloop_fn)(void* args,
                          int64_t start,
                          int64_t end,
                          int subtask_id,
                          int tid);

// A parallel parloop task.
struct scheduler_parloop {
  void* args;
  parloop_fn fn;
  int64_t iterations;
  struct scheduler_info info;

  // For debugging
  const char* name;
};

// Takes a parloop description; defined later in the file.
static inline int scheduler_execute_task(struct scheduler *scheduler,
                                         struct scheduler_parloop *task);

// Then the API implementation.
- -#include <signal.h> - -#if defined(_WIN32) -#include <windows.h> -#elif defined(__APPLE__) -#include <sys/sysctl.h> -// For getting cpu usage of threads -#include <mach/mach.h> -#include <sys/resource.h> -#elif defined(__linux__) -#include <sys/sysinfo.h> -#include <sys/resource.h> -#include <signal.h> -#endif - -/* Multicore Utility functions */ - -/* A wrapper for getting rusage on Linux and MacOS */ -/* TODO maybe figure out this for windows */ -static inline int getrusage_thread(struct rusage *rusage) -{ - int err = -1; -#if defined(__APPLE__) - thread_basic_info_data_t info = { 0 }; - mach_msg_type_number_t info_count = THREAD_BASIC_INFO_COUNT; - kern_return_t kern_err; - - kern_err = thread_info(mach_thread_self(), - THREAD_BASIC_INFO, - (thread_info_t)&info, - &info_count); - if (kern_err == KERN_SUCCESS) { - memset(rusage, 0, sizeof(struct rusage)); - rusage->ru_utime.tv_sec = info.user_time.seconds; - rusage->ru_utime.tv_usec = info.user_time.microseconds; - rusage->ru_stime.tv_sec = info.system_time.seconds; - rusage->ru_stime.tv_usec = info.system_time.microseconds; - err = 0; - } else { - errno = EINVAL; - } -#elif defined(__linux__) - err = getrusage(RUSAGE_THREAD, rusage); -#endif - return err; -} - -/* returns the number of logical cores */ -static int num_processors() -{ -#if defined(_WIN32) -/* https://docs.microsoft.com/en-us/windows/win32/api/sysinfoapi/ns-sysinfoapi-system_info */ - SYSTEM_INFO sysinfo; - GetSystemInfo(&sysinfo); - int ncores = sysinfo.dwNumberOfProcessors; - fprintf(stderr, "Found %d cores on your Windows machine\n Is that correct?\n", ncores); - return ncores; -#elif defined(__APPLE__) - int ncores; - size_t ncores_size = sizeof(ncores); - CHECK_ERRNO(sysctlbyname("hw.logicalcpu", &ncores, &ncores_size, NULL, 0), - "sysctlbyname (hw.logicalcpu)"); - return ncores; -#elif defined(__linux__) - return get_nprocs(); -#else - fprintf(stderr, "operating system not recognised\n"); - return -1; -#endif -} - -static unsigned int 
g_seed; - -// Used to seed the generator. -static inline void fast_srand(unsigned int seed) { - g_seed = seed; -} - -// Compute a pseudorandom integer. -// Output value in range [0, 32767] -static inline unsigned int fast_rand(void) { - g_seed = (214013*g_seed+2531011); - return (g_seed>>16)&0x7FFF; -} - -struct subtask_queue { - int capacity; // Size of the buffer. - int first; // Index of the start of the ring buffer. - int num_used; // Number of used elements in the buffer. - struct subtask **buffer; - - pthread_mutex_t mutex; // Mutex used for synchronisation. - pthread_cond_t cond; // Condition variable used for synchronisation. - int dead; - -#if defined(MCPROFILE) - /* Profiling fields */ - uint64_t time_enqueue; - uint64_t time_dequeue; - uint64_t n_dequeues; - uint64_t n_enqueues; -#endif -}; - -/* A subtask that can be executed by a worker */ -struct subtask { - /* The parloop function */ - parloop_fn fn; - /* Execution parameters */ - void* args; - int64_t start, end; - int id; - - /* Dynamic scheduling parameters */ - int chunkable; - int64_t chunk_size; - - /* Shared variables across subtasks */ - volatile int *counter; // Counter for ongoing subtasks - // Shared task timers and iterators - int64_t *task_time; - int64_t *task_iter; - - /* For debugging */ - const char *name; -}; - - -struct worker { - pthread_t thread; - struct scheduler *scheduler; /* Reference to the scheduler struct the worker belongs to*/ - struct subtask_queue q; - int dead; - int tid; /* Just a thread id */ - - /* "thread local" time fields used for online algorithm */ - uint64_t timer; - uint64_t total; - int nested; /* How nested the current computation is */ - - // Profiling fields - int output_usage; /* Whether to dump thread usage */ - uint64_t time_spent_working; /* Time spent in parloop functions */ -}; - -static inline void output_worker_usage(struct worker *worker) -{ - struct rusage usage; - CHECK_ERRNO(getrusage_thread(&usage), "getrusage_thread"); - struct timeval 
user_cpu_time = usage.ru_utime; - struct timeval sys_cpu_time = usage.ru_stime; - fprintf(stderr, "tid: %2d - work time %10llu us - user time: %10llu us - sys: %10llu us\n", - worker->tid, - (long long unsigned)worker->time_spent_working / 1000, - (long long unsigned)(user_cpu_time.tv_sec * 1000000 + user_cpu_time.tv_usec), - (long long unsigned)(sys_cpu_time.tv_sec * 1000000 + sys_cpu_time.tv_usec)); -} - -/* Doubles the size of the queue */ -static inline int subtask_queue_grow_queue(struct subtask_queue *subtask_queue) { - - int new_capacity = 2 * subtask_queue->capacity; -#ifdef MCDEBUG - fprintf(stderr, "Growing queue to %d\n", subtask_queue->capacity * 2); -#endif - - struct subtask **new_buffer = calloc(new_capacity, sizeof(struct subtask*)); - for (int i = 0; i < subtask_queue->num_used; i++) { - new_buffer[i] = subtask_queue->buffer[(subtask_queue->first + i) % subtask_queue->capacity]; - } - - free(subtask_queue->buffer); - subtask_queue->buffer = new_buffer; - subtask_queue->capacity = new_capacity; - subtask_queue->first = 0; - - return 0; -} - -// Initialise a job queue with the given capacity. The queue starts out -// empty. Returns non-zero on error. -static inline int subtask_queue_init(struct subtask_queue *subtask_queue, int capacity) -{ - assert(subtask_queue != NULL); - memset(subtask_queue, 0, sizeof(struct subtask_queue)); - - subtask_queue->capacity = capacity; - subtask_queue->buffer = calloc(capacity, sizeof(struct subtask*)); - if (subtask_queue->buffer == NULL) { - return -1; - } - - CHECK_ERRNO(pthread_mutex_init(&subtask_queue->mutex, NULL), "pthread_mutex_init"); - CHECK_ERRNO(pthread_cond_init(&subtask_queue->cond, NULL), "pthread_cond_init"); - - return 0; -} - -// Destroy the job queue. Blocks until the queue is empty before it -// is destroyed. 
-static inline int subtask_queue_destroy(struct subtask_queue *subtask_queue) -{ - assert(subtask_queue != NULL); - - CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock"); - - while (subtask_queue->num_used != 0) { - CHECK_ERR(pthread_cond_wait(&subtask_queue->cond, &subtask_queue->mutex), "pthread_cond_wait"); - } - - // Queue is now empty. Let's kill it! - subtask_queue->dead = 1; - free(subtask_queue->buffer); - CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast"); - CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); - - return 0; -} - -static inline void dump_queue(struct worker *worker) -{ - struct subtask_queue *subtask_queue = &worker->q; - CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock"); - for (int i = 0; i < subtask_queue->num_used; i++) { - struct subtask * subtask = subtask_queue->buffer[(subtask_queue->first + i) % subtask_queue->capacity]; - printf("queue tid %d with %d task %s\n", worker->tid, i, subtask->name); - } - CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast"); - CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); -} - -// Push an element onto the end of the job queue. Blocks if the -// subtask_queue is full (its size is equal to its capacity). Returns -// non-zero on error. It is an error to push a job onto a queue that -// has been destroyed. -static inline int subtask_queue_enqueue(struct worker *worker, struct subtask *subtask ) -{ - assert(worker != NULL); - struct subtask_queue *subtask_queue = &worker->q; - -#ifdef MCPROFILE - uint64_t start = get_wall_time(); -#endif - - CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock"); - // Wait until there is room in the subtask_queue. 
- while (subtask_queue->num_used == subtask_queue->capacity && !subtask_queue->dead) { - if (subtask_queue->num_used == subtask_queue->capacity) { - CHECK_ERR(subtask_queue_grow_queue(subtask_queue), "subtask_queue_grow_queue"); - continue; - } - CHECK_ERR(pthread_cond_wait(&subtask_queue->cond, &subtask_queue->mutex), "pthread_cond_wait"); - } - - if (subtask_queue->dead) { - CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); - return -1; - } - - // If we made it past the loop, there is room in the subtask_queue. - subtask_queue->buffer[(subtask_queue->first + subtask_queue->num_used) % subtask_queue->capacity] = subtask; - subtask_queue->num_used++; - -#ifdef MCPROFILE - uint64_t end = get_wall_time(); - subtask_queue->time_enqueue += (end - start); - subtask_queue->n_enqueues++; -#endif - // Broadcast a reader (if any) that there is now an element. - CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast"); - CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); - - return 0; -} - - -/* Like subtask_queue_dequeue, but with two differences: - 1) the subtask is stolen from the __front__ of the queue - 2) returns immediately if there is no subtasks queued, - as we dont' want to block on another workers queue and -*/ -static inline int subtask_queue_steal(struct worker *worker, - struct subtask **subtask) -{ - struct subtask_queue *subtask_queue = &worker->q; - assert(subtask_queue != NULL); - -#ifdef MCPROFILE - uint64_t start = get_wall_time(); -#endif - CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock"); - - if (subtask_queue->num_used == 0) { - CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast"); - CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); - return 1; - } - - if (subtask_queue->dead) { - CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); - return -1; - } - - // 
Tasks gets stolen from the "front" - struct subtask *cur_back = subtask_queue->buffer[subtask_queue->first]; - struct subtask *new_subtask = NULL; - int remaining_iter = cur_back->end - cur_back->start; - // If subtask is chunkable, we steal half of the iterations - if (cur_back->chunkable && remaining_iter > 1) { - int64_t half = remaining_iter / 2; - new_subtask = malloc(sizeof(struct subtask)); - *new_subtask = *cur_back; - new_subtask->start = cur_back->end - half; - cur_back->end = new_subtask->start; - __atomic_fetch_add(cur_back->counter, 1, __ATOMIC_RELAXED); - } else { - new_subtask = cur_back; - subtask_queue->num_used--; - subtask_queue->first = (subtask_queue->first + 1) % subtask_queue->capacity; - } - *subtask = new_subtask; - - if (*subtask == NULL) { - CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthred_mutex_unlock"); - return 1; - } - -#ifdef MCPROFILE - uint64_t end = get_wall_time(); - subtask_queue->time_dequeue += (end - start); - subtask_queue->n_dequeues++; -#endif - - // Broadcast a writer (if any) that there is now room for more. - CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast"); - CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); - - return 0; -} - - -// Pop an element from the back of the job queue. 
-// Optional argument can be provided to block or not -static inline int subtask_queue_dequeue(struct worker *worker, - struct subtask **subtask, int blocking) -{ - assert(worker != NULL); - struct subtask_queue *subtask_queue = &worker->q; - -#ifdef MCPROFILE - uint64_t start = get_wall_time(); -#endif - - CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock"); - if (subtask_queue->num_used == 0 && !blocking) { - CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); - return 1; - } - // Try to steal some work while the subtask_queue is empty - while (subtask_queue->num_used == 0 && !subtask_queue->dead) { - pthread_cond_wait(&subtask_queue->cond, &subtask_queue->mutex); - } - - if (subtask_queue->dead) { - CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); - return -1; - } - - // dequeue pops from the back - *subtask = subtask_queue->buffer[(subtask_queue->first + subtask_queue->num_used - 1) % subtask_queue->capacity]; - subtask_queue->num_used--; - - if (*subtask == NULL) { - assert(!"got NULL ptr"); - CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthred_mutex_unlock"); - return -1; - } - -#ifdef MCPROFILE - uint64_t end = get_wall_time(); - subtask_queue->time_dequeue += (end - start); - subtask_queue->n_dequeues++; -#endif - - // Broadcast a writer (if any) that there is now room for more. - CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast"); - CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); - - return 0; -} - -static inline int subtask_queue_is_empty(struct subtask_queue *subtask_queue) -{ - return subtask_queue->num_used == 0; -} - -/* Scheduler definitions */ - -struct scheduler { - struct worker *workers; - int num_threads; - - // If there is work to steal => active_work > 0 - volatile int active_work; - - // Only one error can be returned at the time now. 
Maybe we can - // provide a stack like structure for pushing errors onto if we wish - // to backpropagte multiple errors - volatile int error; - - // kappa time unit in nanoseconds - double kappa; -}; - - -// Thread local variable worker struct -// Note that, accesses to tls variables are expensive -// Minimize direct references to this variable -__thread struct worker* worker_local = NULL; - -static int64_t total_now(int64_t total, int64_t time) { - return total + (get_wall_time_ns() - time); -} - -static int random_other_worker(struct scheduler *scheduler, int my_id) { - int my_num_workers = scheduler->num_threads; - assert(my_num_workers != 1); - int i = fast_rand() % (my_num_workers - 1); - if (i >= my_id) { - i++; - } -#ifdef MCDEBUG - assert(i >= 0); - assert(i < my_num_workers); - assert(i != my_id); -#endif - - return i; -} - - -static inline int64_t compute_chunk_size(double kappa, struct subtask* subtask) -{ - double C = (double)*subtask->task_time / (double)*subtask->task_iter; - if (C == 0.0F) C += DBL_EPSILON; - return smax64((int64_t)(kappa / C), 1); -} - -/* Takes a chunk from subtask and enqueues the remaining iterations onto the worker's queue */ -/* A no-op if the subtask is not chunkable */ -static inline struct subtask* chunk_subtask(struct worker* worker, struct subtask *subtask) -{ - if (subtask->chunkable) { - // Do we have information from previous runs avaliable - if (*subtask->task_iter > 0) { - subtask->chunk_size = compute_chunk_size(worker->scheduler->kappa, subtask); - assert(subtask->chunk_size > 0); - } - int64_t remaining_iter = subtask->end - subtask->start; - assert(remaining_iter > 0); - if (remaining_iter > subtask->chunk_size) { - struct subtask *new_subtask = malloc(sizeof(struct subtask)); - *new_subtask = *subtask; - // increment the subtask join counter to account for new subtask - __atomic_fetch_add(subtask->counter, 1, __ATOMIC_RELAXED); - // Update range parameters - subtask->end = subtask->start + subtask->chunk_size; - 
new_subtask->start = subtask->end; - subtask_queue_enqueue(worker, new_subtask); - } - } - return subtask; -} - -static inline int run_subtask(struct worker* worker, struct subtask* subtask) -{ - assert(subtask != NULL); - assert(worker != NULL); - - subtask = chunk_subtask(worker, subtask); - worker->total = 0; - worker->timer = get_wall_time_ns(); -#if defined(MCPROFILE) - int64_t start = worker->timer; -#endif - worker->nested++; - int err = subtask->fn(subtask->args, subtask->start, subtask->end, - subtask->chunkable ? worker->tid : subtask->id, - worker->tid); - worker->nested--; - // Some error occured during some other subtask - // so we just clean-up and return - if (worker->scheduler->error != 0) { - // Even a failed task counts as finished. - __atomic_fetch_sub(subtask->counter, 1, __ATOMIC_RELAXED); - free(subtask); - return 0; - } - if (err != 0) { - __atomic_store_n(&worker->scheduler->error, err, __ATOMIC_RELAXED); - } - // Total sequential time spent - int64_t time_elapsed = total_now(worker->total, worker->timer); -#if defined(MCPROFILE) - worker->time_spent_working += get_wall_time_ns() - start; -#endif - int64_t iter = subtask->end - subtask->start; - // report measurements - // These updates should really be done using a single atomic CAS operation - __atomic_fetch_add(subtask->task_time, time_elapsed, __ATOMIC_RELAXED); - __atomic_fetch_add(subtask->task_iter, iter, __ATOMIC_RELAXED); - // We need a fence here, since if the counter is decremented before either - // of the two above are updated bad things can happen, e.g. 
if they are stack-allocated - __atomic_thread_fence(__ATOMIC_SEQ_CST); - __atomic_fetch_sub(subtask->counter, 1, __ATOMIC_RELAXED); - free(subtask); - return 0; -} - - -static inline int is_small(struct scheduler_segop *task, struct scheduler *scheduler, int *nsubtasks) -{ - int64_t time = *task->task_time; - int64_t iter = *task->task_iter; - - if (task->sched == DYNAMIC || iter == 0) { - *nsubtasks = scheduler->num_threads; - return 0; - } - - // Estimate the constant C - double C = (double)time / (double)iter; - double cur_task_iter = (double) task->iterations; - - // Returns true if the task is small i.e. - // if the number of iterations times C is smaller - // than the overhead of subtask creation - if (C == 0.0F || C * cur_task_iter < scheduler->kappa) { - *nsubtasks = 1; - return 1; - } - - // Else compute how many subtasks this tasks should create - int64_t min_iter_pr_subtask = smax64(scheduler->kappa / C, 1); - *nsubtasks = smin64(smax64(task->iterations / min_iter_pr_subtask, 1), scheduler->num_threads); - - return 0; -} - -// TODO make this prettier -static inline struct subtask* create_subtask(parloop_fn fn, - void* args, - const char* name, - volatile int* counter, - int64_t *timer, - int64_t *iter, - int64_t start, int64_t end, - int chunkable, - int64_t chunk_size, - int id) -{ - struct subtask* subtask = malloc(sizeof(struct subtask)); - if (subtask == NULL) { - assert(!"malloc failed in create_subtask"); - return NULL; - } - subtask->fn = fn; - subtask->args = args; - - subtask->counter = counter; - subtask->task_time = timer; - subtask->task_iter = iter; - - subtask->start = start; - subtask->end = end; - subtask->id = id; - subtask->chunkable = chunkable; - subtask->chunk_size = chunk_size; - - subtask->name = name; - return subtask; -} - -static int dummy_counter = 0; -static int64_t dummy_timer = 0; -static int64_t dummy_iter = 0; - -static int dummy_fn(void *args, int64_t start, int64_t end, int subtask_id, int tid) { - (void)args; - 
(void)start; - (void)end; - (void)subtask_id; - (void)tid; - return 0; -} - -// Wake up threads, who are blocking by pushing a dummy task -// onto their queue -static inline void wake_up_threads(struct scheduler *scheduler, int start_tid, int end_tid) { - -#if defined(MCDEBUG) - assert(start_tid >= 1); - assert(end_tid <= scheduler->num_threads); -#endif - for (int i = start_tid; i < end_tid; i++) { - struct subtask *subtask = create_subtask(dummy_fn, NULL, "dummy_fn", - &dummy_counter, - &dummy_timer, &dummy_iter, - 0, 0, - 0, 0, - 0); - CHECK_ERR(subtask_queue_enqueue(&scheduler->workers[i], subtask), "subtask_queue_enqueue"); - } -} - -static inline int is_finished(struct worker *worker) { - return worker->dead && subtask_queue_is_empty(&worker->q); -} - -// Try to steal from a random queue -static inline int steal_from_random_worker(struct worker* worker) -{ - int my_id = worker->tid; - struct scheduler* scheduler = worker->scheduler; - int k = random_other_worker(scheduler, my_id); - struct worker *worker_k = &scheduler->workers[k]; - struct subtask* subtask = NULL; - int retval = subtask_queue_steal(worker_k, &subtask); - if (retval == 0) { - subtask_queue_enqueue(worker, subtask); - return 1; - } - return 0; -} - - -static inline void *scheduler_worker(void* args) -{ - struct worker *worker = (struct worker*) args; - struct scheduler *scheduler = worker->scheduler; - worker_local = worker; - struct subtask *subtask = NULL; - - while(!is_finished(worker)) { - if (!subtask_queue_is_empty(&worker->q)) { - int retval = subtask_queue_dequeue(worker, &subtask, 0); - if (retval == 0) { - assert(subtask != NULL); - CHECK_ERR(run_subtask(worker, subtask), "run_subtask"); - } // else someone stole our work - - } else if (scheduler->active_work) { /* steal */ - while (!is_finished(worker) && scheduler->active_work) { - if (steal_from_random_worker(worker)) { - break; - } - } - } else { /* go back to sleep and wait for work */ - int retval = 
subtask_queue_dequeue(worker, &subtask, 1); - if (retval == 0) { - assert(subtask != NULL); - CHECK_ERR(run_subtask(worker, subtask), "run_subtask"); - } - } - } - - assert(subtask_queue_is_empty(&worker->q)); -#if defined(MCPROFILE) - if (worker->output_usage) - output_worker_usage(worker); -#endif - return NULL; -} - - -static inline int scheduler_execute_parloop(struct scheduler *scheduler, - struct scheduler_parloop *task, - int64_t *timer) -{ - - struct worker *worker = worker_local; - - struct scheduler_info info = task->info; - int64_t iter_pr_subtask = info.iter_pr_subtask; - int64_t remainder = info.remainder; - int nsubtasks = info.nsubtasks; - volatile int join_counter = nsubtasks; - - // Shared timer used to sum up all - // sequential work from each subtask - int64_t task_timer = 0; - int64_t task_iter = 0; - - enum scheduling sched = info.sched; - /* If each subtasks should be processed in chunks */ - int chunkable = sched == STATIC ? 0 : 1; - int64_t chunk_size = 1; // The initial chunk size when no info is avaliable - - - if (info.wake_up_threads || sched == DYNAMIC) - __atomic_add_fetch(&scheduler->active_work, nsubtasks, __ATOMIC_RELAXED); - - int64_t start = 0; - int64_t end = iter_pr_subtask + (int64_t)(remainder != 0); - for (int subtask_id = 0; subtask_id < nsubtasks; subtask_id++) { - struct subtask *subtask = create_subtask(task->fn, task->args, task->name, - &join_counter, - &task_timer, &task_iter, - start, end, - chunkable, chunk_size, - subtask_id); - assert(subtask != NULL); - // In most cases we will never have more subtasks than workers, - // but there can be exceptions (e.g. the kappa tuning function). - struct worker *subtask_worker = - worker->nested - ? 
&scheduler->workers[worker->tid] - : &scheduler->workers[subtask_id % scheduler->num_threads]; - CHECK_ERR(subtask_queue_enqueue(subtask_worker, subtask), - "subtask_queue_enqueue"); - // Update range params - start = end; - end += iter_pr_subtask + ((subtask_id + 1) < remainder); - } - - if (info.wake_up_threads) { - wake_up_threads(scheduler, nsubtasks, scheduler->num_threads); - } - - // Join (wait for subtasks to finish) - while(join_counter != 0) { - if (!subtask_queue_is_empty(&worker->q)) { - struct subtask *subtask = NULL; - int err = subtask_queue_dequeue(worker, &subtask, 0); - if (err == 0 ) { - CHECK_ERR(run_subtask(worker, subtask), "run_subtask"); - } - } else { - if (steal_from_random_worker(worker)) { - struct subtask *subtask = NULL; - int err = subtask_queue_dequeue(worker, &subtask, 0); - if (err == 0) { - CHECK_ERR(run_subtask(worker, subtask), "run_subtask"); - } - } - } - } - - - if (info.wake_up_threads || sched == DYNAMIC) { - __atomic_sub_fetch(&scheduler->active_work, nsubtasks, __ATOMIC_RELAXED); - } - - // Write back timing results of all sequential work - (*timer) += task_timer; - return scheduler->error; -} - - -static inline int scheduler_execute_task(struct scheduler *scheduler, - struct scheduler_parloop *task) -{ - - struct worker *worker = worker_local; - - int err = 0; - - // How much sequential work was performed by the task - int64_t task_timer = 0; - - /* Execute task sequential or parallel based on decision made earlier */ - if (task->info.nsubtasks == 1) { - int64_t start = get_wall_time_ns(); - err = task->fn(task->args, 0, task->iterations, 0, worker->tid); - int64_t end = get_wall_time_ns(); - task_timer = end - start; - worker->time_spent_working += task_timer; - // Report time measurements - // TODO the update of both of these should really be a single atomic!! 
- __atomic_fetch_add(task->info.task_time, task_timer, __ATOMIC_RELAXED); - __atomic_fetch_add(task->info.task_iter, task->iterations, __ATOMIC_RELAXED); - } else { - // Add "before" time if we already are inside a task - int64_t time_before = 0; - if (worker->nested > 0) { - time_before = total_now(worker->total, worker->timer); - } - - err = scheduler_execute_parloop(scheduler, task, &task_timer); - - // Report time measurements - // TODO the update of both of these should really be a single atomic!! - __atomic_fetch_add(task->info.task_time, task_timer, __ATOMIC_RELAXED); - __atomic_fetch_add(task->info.task_iter, task->iterations, __ATOMIC_RELAXED); - - // Update timers to account for new timings - worker->total = time_before + task_timer; - worker->timer = get_wall_time_ns(); - } - - - return err; -} - -/* Decide on how schedule the incoming task i.e. how many subtasks and - to run sequential or (potentially nested) parallel code body */ -static inline int scheduler_prepare_task(struct scheduler* scheduler, - struct scheduler_segop *task) -{ - assert(task != NULL); - - struct worker *worker = worker_local; - struct scheduler_info info; - info.task_time = task->task_time; - info.task_iter = task->task_iter; - - int nsubtasks; - // Decide if task should be scheduled sequentially - if (is_small(task, scheduler, &nsubtasks)) { - info.iter_pr_subtask = task->iterations; - info.remainder = 0; - info.nsubtasks = nsubtasks; - return task->top_level_fn(task->args, task->iterations, worker->tid, info); - } else { - info.iter_pr_subtask = task->iterations / nsubtasks; - info.remainder = task->iterations % nsubtasks; - info.sched = task->sched; - switch (task->sched) { - case STATIC: - info.nsubtasks = info.iter_pr_subtask == 0 ? 
info.remainder : ((task->iterations - info.remainder) / info.iter_pr_subtask); - break; - case DYNAMIC: - // As any thread can take any subtasks, we are being safe with using - // an upper bound on the number of tasks such that the task allocate enough memory - info.nsubtasks = info.iter_pr_subtask == 0 ? info.remainder : nsubtasks; - break; - default: - assert(!"Got unknown scheduling"); - } - } - - info.wake_up_threads = 0; - // We only use the nested parallel segop function if we can't exchaust all cores - // using the outer most level - if (task->nested_fn != NULL && info.nsubtasks < scheduler->num_threads && info.nsubtasks == task->iterations) { - if (worker->nested == 0) - info.wake_up_threads = 1; - return task->nested_fn(task->args, task->iterations, worker->tid, info); - } - - return task->top_level_fn(task->args, task->iterations, worker->tid, info); -} - -// Now some code for finding the proper value of kappa on a given -// machine (the smallest amount of work that amortises the cost of -// task creation). - -struct tuning_struct { - int32_t *free_tuning_res; - int32_t *array; -}; - -// Reduction function over an integer array -static int tuning_loop(void *args, int64_t start, int64_t end, - int flat_tid, int tid) { - (void)flat_tid; - (void)tid; - - int err = 0; - struct tuning_struct *tuning_struct = (struct tuning_struct *) args; - int32_t *array = tuning_struct->array; - int32_t *tuning_res = tuning_struct->free_tuning_res; - - int32_t sum = 0; - for (int i = start; i < end; i++) { - int32_t y = array[i]; - sum = add32(sum, y); - } - *tuning_res = sum; - return err; -} - -// The main entry point for the tuning process. Sets the provided -// variable ``kappa``. 
-static int determine_kappa(double *kappa) { - int err = 0; - - int64_t iterations = 100000000; - int64_t tuning_time = 0; - int64_t tuning_iter = 0; - - int32_t *array = malloc(sizeof(int32_t) * iterations); - for (int64_t i = 0; i < iterations; i++) { - array[i] = fast_rand(); - } - - int64_t start_tuning = get_wall_time_ns(); - /* **************************** */ - /* Run sequential reduce first' */ - /* **************************** */ - int64_t tuning_sequentiual_start = get_wall_time_ns(); - struct tuning_struct tuning_struct; - int32_t tuning_res; - tuning_struct.free_tuning_res = &tuning_res; - tuning_struct.array = array; - - err = tuning_loop(&tuning_struct, 0, iterations, 0, 0); - int64_t tuning_sequentiual_end = get_wall_time_ns(); - int64_t sequential_elapsed = tuning_sequentiual_end - tuning_sequentiual_start; - - double C = (double)sequential_elapsed / (double)iterations; - fprintf(stderr, " Time for sequential run is %lld - Found C %f\n", (long long)sequential_elapsed, C); - - /* ********************** */ - /* Now run tuning process */ - /* ********************** */ - // Setup a scheduler with a single worker - struct scheduler scheduler; - scheduler.num_threads = 1; - scheduler.workers = malloc(sizeof(struct worker)); - worker_local = &scheduler.workers[0]; - worker_local->tid = 0; - CHECK_ERR(subtask_queue_init(&scheduler.workers[0].q, 1024), - "failed to init queue for worker %d\n", 0); - - // Start tuning for kappa - double kappa_tune = 1000; // Initial kappa is 1 us - double ratio; - int64_t time_elapsed; - while(1) { - int64_t min_iter_pr_subtask = (int64_t) (kappa_tune / C) == 0 ? 
1 : (kappa_tune / C); - int nsubtasks = iterations / min_iter_pr_subtask; - struct scheduler_info info; - info.iter_pr_subtask = min_iter_pr_subtask; - - info.nsubtasks = iterations / min_iter_pr_subtask; - info.remainder = iterations % min_iter_pr_subtask; - info.task_time = &tuning_time; - info.task_iter = &tuning_iter; - info.sched = STATIC; - - struct scheduler_parloop parloop; - parloop.name = "tuning_loop"; - parloop.fn = tuning_loop; - parloop.args = &tuning_struct; - parloop.iterations = iterations; - parloop.info = info; - - int64_t tuning_chunked_start = get_wall_time_ns(); - int determine_kappa_err = - scheduler_execute_task(&scheduler, - &parloop); - assert(determine_kappa_err == 0); - int64_t tuning_chunked_end = get_wall_time_ns(); - time_elapsed = tuning_chunked_end - tuning_chunked_start; - - ratio = (double)time_elapsed / (double)sequential_elapsed; - if (ratio < 1.055) { - break; - } - kappa_tune += 100; // Increase by 100 ns at the time - fprintf(stderr, "nsubtask %d - kappa %f - ratio %f\n", nsubtasks, kappa_tune, ratio); - } - - int64_t end_tuning = get_wall_time_ns(); - fprintf(stderr, "tuning took %lld ns and found kappa %f - time %lld - ratio %f\n", - (long long)end_tuning - start_tuning, - kappa_tune, - (long long)time_elapsed, - ratio); - *kappa = kappa_tune; - - // Clean-up - CHECK_ERR(subtask_queue_destroy(&scheduler.workers[0].q), "failed to destroy queue"); - free(array); - free(scheduler.workers); - return err; -} - -static int scheduler_init(struct scheduler *scheduler, - int num_workers, - double kappa) { - assert(num_workers > 0); - - scheduler->kappa = kappa; - scheduler->num_threads = num_workers; - scheduler->active_work = 0; - scheduler->error = 0; - - scheduler->workers = calloc(num_workers, sizeof(struct worker)); - - const int queue_capacity = 1024; - - worker_local = &scheduler->workers[0]; - worker_local->tid = 0; - worker_local->scheduler = scheduler; - CHECK_ERR(subtask_queue_init(&worker_local->q, queue_capacity), - 
"failed to init queue for worker %d\n", 0); - - for (int i = 1; i < num_workers; i++) { - struct worker *cur_worker = &scheduler->workers[i]; - memset(cur_worker, 0, sizeof(struct worker)); - cur_worker->tid = i; - cur_worker->output_usage = 0; - cur_worker->scheduler = scheduler; - CHECK_ERR(subtask_queue_init(&cur_worker->q, queue_capacity), - "failed to init queue for worker %d\n", i); - - CHECK_ERR(pthread_create(&cur_worker->thread, - NULL, - &scheduler_worker, - cur_worker), - "Failed to create worker %d\n", i); - } - - return 0; -} - -static int scheduler_destroy(struct scheduler *scheduler) { - // First mark them all as dead. - for (int i = 1; i < scheduler->num_threads; i++) { - struct worker *cur_worker = &scheduler->workers[i]; - cur_worker->dead = 1; - } - - // Then destroy their task queues (this will wake up the threads and - // make them do their shutdown). - for (int i = 1; i < scheduler->num_threads; i++) { - struct worker *cur_worker = &scheduler->workers[i]; - subtask_queue_destroy(&cur_worker->q); - } - - // Then actually wait for them to stop. 
- for (int i = 1; i < scheduler->num_threads; i++) { - struct worker *cur_worker = &scheduler->workers[i]; - CHECK_ERR(pthread_join(scheduler->workers[i].thread, NULL), "pthread_join"); - } - - free(scheduler->workers); - - return 0; -} - -// End of scheduler.h - -struct futhark_context_config { - int debugging; - int profiling; - int num_threads; -} ; -struct futhark_context_config *futhark_context_config_new(void) -{ - struct futhark_context_config *cfg = - (struct futhark_context_config *) malloc(sizeof(struct futhark_context_config)); - - if (cfg == NULL) - return NULL; - cfg->debugging = 0; - cfg->profiling = 0; - cfg->num_threads = 0; - return cfg; -} -void futhark_context_config_free(struct futhark_context_config *cfg) -{ - free(cfg); -} -void futhark_context_config_set_debugging(struct futhark_context_config *cfg, - int detail) -{ - cfg->debugging = detail; -} -void futhark_context_config_set_profiling(struct futhark_context_config *cfg, - int flag) -{ - cfg->profiling = flag; -} -void futhark_context_config_set_logging(struct futhark_context_config *cfg, - int detail) -{ - /* Does nothing for this backend. 
*/ - (void) cfg; - (void) detail; -} -void futhark_context_config_set_num_threads(struct futhark_context_config *cfg, - int n) -{ - cfg->num_threads = n; -} -struct futhark_context { - struct scheduler scheduler; - int detail_memory; - int debugging; - int profiling; - int profiling_paused; - int logging; - lock_t lock; - char *error; - FILE *log; - int total_runs; - long total_runtime; - int64_t peak_mem_usage_default; - int64_t cur_mem_usage_default; - struct { - int dummy; - } constants; - int64_t *futhark_mc_segmap_parloop_6011_total_runtime; - int *futhark_mc_segmap_parloop_6011_runs; - int64_t *futhark_mc_segmap_parloop_6011_iter; - int64_t futhark_mc_segmap_parloop_6011_total_total_runtime; - int futhark_mc_segmap_parloop_6011_total_runs; - int64_t futhark_mc_segmap_parloop_6011_total_iter; - int64_t *futhark_mc_segmap_task_6009_total_runtime; - int *futhark_mc_segmap_task_6009_runs; - int64_t *futhark_mc_segmap_task_6009_iter; - int64_t futhark_mc_segmap_task_6009_total_time; - int64_t futhark_mc_segmap_task_6009_total_iter; - int64_t *futhark_mc_segmap_parloop_6020_total_runtime; - int *futhark_mc_segmap_parloop_6020_runs; - int64_t *futhark_mc_segmap_parloop_6020_iter; - int64_t futhark_mc_segmap_parloop_6020_total_total_runtime; - int futhark_mc_segmap_parloop_6020_total_runs; - int64_t futhark_mc_segmap_parloop_6020_total_iter; - int64_t *futhark_mc_segmap_task_6018_total_runtime; - int *futhark_mc_segmap_task_6018_runs; - int64_t *futhark_mc_segmap_task_6018_iter; - int64_t futhark_mc_segmap_task_6018_total_time; - int64_t futhark_mc_segmap_task_6018_total_iter; - int64_t *futhark_mc_segmap_parloop_6015_total_runtime; - int *futhark_mc_segmap_parloop_6015_runs; - int64_t *futhark_mc_segmap_parloop_6015_iter; - int64_t futhark_mc_segmap_parloop_6015_total_total_runtime; - int futhark_mc_segmap_parloop_6015_total_runs; - int64_t futhark_mc_segmap_parloop_6015_total_iter; - int64_t *futhark_mc_segmap_nested_task_6013_total_runtime; - int 
*futhark_mc_segmap_nested_task_6013_runs; - int64_t *futhark_mc_segmap_nested_task_6013_iter; - int64_t tuning_timing; - int64_t tuning_iter; -} ; -struct futhark_context *futhark_context_new(struct futhark_context_config *cfg) -{ - struct futhark_context *ctx = - (struct futhark_context *) malloc(sizeof(struct futhark_context)); - - if (ctx == NULL) - return NULL; - fast_srand(time(0)); - ctx->detail_memory = cfg->debugging; - ctx->debugging = cfg->debugging; - ctx->profiling = cfg->profiling; - ctx->profiling_paused = 0; - ctx->logging = 0; - ctx->error = NULL; - ctx->log = stderr; - create_lock(&ctx->lock); - - int tune_kappa = 0; - double kappa = 5.1f * 1000; - - if (tune_kappa) { - if (determine_kappa(&kappa) != 0) - return NULL; - } - if (scheduler_init(&ctx->scheduler, cfg->num_threads > - 0 ? cfg->num_threads : num_processors(), kappa) != 0) - return NULL; - ctx->peak_mem_usage_default = 0; - ctx->cur_mem_usage_default = 0; - ctx->futhark_mc_segmap_parloop_6011_total_runtime = calloc(sizeof(int64_t), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_parloop_6011_runs = calloc(sizeof(int), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_parloop_6011_iter = calloc(sizeof(sizeof(int64_t)), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_parloop_6011_total_total_runtime = 0; - ctx->futhark_mc_segmap_parloop_6011_total_runs = 0; - ctx->futhark_mc_segmap_parloop_6011_total_iter = 0; - ctx->futhark_mc_segmap_task_6009_total_runtime = calloc(sizeof(int64_t), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_task_6009_runs = calloc(sizeof(int), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_task_6009_iter = calloc(sizeof(sizeof(int64_t)), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_task_6009_total_time = 0; - ctx->futhark_mc_segmap_task_6009_total_iter = 0; - ctx->futhark_mc_segmap_parloop_6020_total_runtime = calloc(sizeof(int64_t), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_parloop_6020_runs = 
calloc(sizeof(int), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_parloop_6020_iter = calloc(sizeof(sizeof(int64_t)), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_parloop_6020_total_total_runtime = 0; - ctx->futhark_mc_segmap_parloop_6020_total_runs = 0; - ctx->futhark_mc_segmap_parloop_6020_total_iter = 0; - ctx->futhark_mc_segmap_task_6018_total_runtime = calloc(sizeof(int64_t), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_task_6018_runs = calloc(sizeof(int), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_task_6018_iter = calloc(sizeof(sizeof(int64_t)), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_task_6018_total_time = 0; - ctx->futhark_mc_segmap_task_6018_total_iter = 0; - ctx->futhark_mc_segmap_parloop_6015_total_runtime = calloc(sizeof(int64_t), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_parloop_6015_runs = calloc(sizeof(int), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_parloop_6015_iter = calloc(sizeof(sizeof(int64_t)), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_parloop_6015_total_total_runtime = 0; - ctx->futhark_mc_segmap_parloop_6015_total_runs = 0; - ctx->futhark_mc_segmap_parloop_6015_total_iter = 0; - ctx->futhark_mc_segmap_nested_task_6013_total_runtime = - calloc(sizeof(int64_t), ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_nested_task_6013_runs = calloc(sizeof(int), - ctx->scheduler.num_threads); - ctx->futhark_mc_segmap_nested_task_6013_iter = - calloc(sizeof(sizeof(int64_t)), ctx->scheduler.num_threads); - init_constants(ctx); - return ctx; -} -void futhark_context_free(struct futhark_context *ctx) -{ - free_constants(ctx); - (void) scheduler_destroy(&ctx->scheduler); - free_lock(&ctx->lock); - free(ctx); -} -int futhark_context_sync(struct futhark_context *ctx) -{ - (void) ctx; - return 0; -} -static const char *size_names[0]; -static const char *size_vars[0]; -static const char *size_classes[0]; -int futhark_context_config_set_size(struct 
futhark_context_config *cfg, const - char *size_name, size_t size_value) -{ - (void) cfg; - (void) size_name; - (void) size_value; - return 1; -} -static int memblock_unref(struct futhark_context *ctx, struct memblock *block, - const char *desc) -{ - if (block->references != NULL) { - *block->references -= 1; - if (ctx->detail_memory) - fprintf(ctx->log, - "Unreferencing block %s (allocated as %s) in %s: %d references remaining.\n", - desc, block->desc, "default space", *block->references); - if (*block->references == 0) { - ctx->cur_mem_usage_default -= block->size; - free(block->mem); - free(block->references); - if (ctx->detail_memory) - fprintf(ctx->log, - "%lld bytes freed (now allocated: %lld bytes)\n", - (long long) block->size, - (long long) ctx->cur_mem_usage_default); - } - block->references = NULL; - } - return 0; -} -static int memblock_alloc(struct futhark_context *ctx, struct memblock *block, - int64_t size, const char *desc) -{ - if (size < 0) - futhark_panic(1, - "Negative allocation of %lld bytes attempted for %s in %s.\n", - (long long) size, desc, "default space", - ctx->cur_mem_usage_default); - - int ret = memblock_unref(ctx, block, desc); - - ctx->cur_mem_usage_default += size; - if (ctx->detail_memory) - fprintf(ctx->log, - "Allocating %lld bytes for %s in %s (then allocated: %lld bytes)", - (long long) size, desc, "default space", - (long long) ctx->cur_mem_usage_default); - if (ctx->cur_mem_usage_default > ctx->peak_mem_usage_default) { - ctx->peak_mem_usage_default = ctx->cur_mem_usage_default; - if (ctx->detail_memory) - fprintf(ctx->log, " (new peak).\n"); - } else if (ctx->detail_memory) - fprintf(ctx->log, ".\n"); - block->mem = (char *) malloc(size); - block->references = (int *) malloc(sizeof(int)); - *block->references = 1; - block->size = size; - block->desc = desc; - return ret; -} -static int memblock_set(struct futhark_context *ctx, struct memblock *lhs, - struct memblock *rhs, const char *lhs_desc) -{ - int ret = 
memblock_unref(ctx, lhs, lhs_desc); - - if (rhs->references != NULL) - (*rhs->references)++; - *lhs = *rhs; - return ret; -} -int futhark_get_num_sizes(void) -{ - return sizeof(size_names) / sizeof(size_names[0]); -} -const char *futhark_get_size_name(int i) -{ - return size_names[i]; -} -const char *futhark_get_size_class(int i) -{ - return size_classes[i]; -} -char *futhark_context_report(struct futhark_context *ctx) -{ - struct str_builder builder; - - str_builder_init(&builder); - if (ctx->detail_memory || ctx->profiling || ctx->logging) { - { } - } - if (ctx->profiling) { - for (int i = 0; i < ctx->scheduler.num_threads; i++) - fprintf(ctx->log, - "tid %2d - futhark_mc_segmap_parloop_6011 ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n", - i, ctx->futhark_mc_segmap_parloop_6011_runs[i], - (long) ctx->futhark_mc_segmap_parloop_6011_total_runtime[i] / - (ctx->futhark_mc_segmap_parloop_6011_runs[i] != - 0 ? ctx->futhark_mc_segmap_parloop_6011_runs[i] : 1), - (long) ctx->futhark_mc_segmap_parloop_6011_total_runtime[i], - (double) ctx->futhark_mc_segmap_parloop_6011_total_runtime[i] / - (ctx->futhark_mc_segmap_parloop_6011_iter[i] == - 0 ? 1 : (double) ctx->futhark_mc_segmap_parloop_6011_iter[i]), - (long) ctx->futhark_mc_segmap_parloop_6011_iter[i], - (long) ctx->futhark_mc_segmap_parloop_6011_iter[i] / - (ctx->futhark_mc_segmap_parloop_6011_runs[i] != - 0 ? ctx->futhark_mc_segmap_parloop_6011_runs[i] : 1)); - fprintf(ctx->log, - " futhark_mc_segmap_parloop_6011_total ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n", - ctx->futhark_mc_segmap_parloop_6011_total_runs, - (long) ctx->futhark_mc_segmap_parloop_6011_total_total_runtime / - (ctx->futhark_mc_segmap_parloop_6011_total_runs != - 0 ? 
ctx->futhark_mc_segmap_parloop_6011_total_runs : 1), - (long) ctx->futhark_mc_segmap_parloop_6011_total_total_runtime, - (double) ctx->futhark_mc_segmap_parloop_6011_total_total_runtime / - (ctx->futhark_mc_segmap_parloop_6011_total_iter == - 0 ? 1 : (double) ctx->futhark_mc_segmap_parloop_6011_total_iter), - (long) ctx->futhark_mc_segmap_parloop_6011_total_iter, - (long) ctx->futhark_mc_segmap_parloop_6011_total_iter / - (ctx->futhark_mc_segmap_parloop_6011_total_runs != - 0 ? ctx->futhark_mc_segmap_parloop_6011_total_runs : 1)); - ctx->total_runtime += - ctx->futhark_mc_segmap_parloop_6011_total_total_runtime; - ctx->total_runs += ctx->futhark_mc_segmap_parloop_6011_total_runs; - for (int i = 0; i < ctx->scheduler.num_threads; i++) - fprintf(ctx->log, - "tid %2d - futhark_mc_segmap_parloop_6020 ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n", - i, ctx->futhark_mc_segmap_parloop_6020_runs[i], - (long) ctx->futhark_mc_segmap_parloop_6020_total_runtime[i] / - (ctx->futhark_mc_segmap_parloop_6020_runs[i] != - 0 ? ctx->futhark_mc_segmap_parloop_6020_runs[i] : 1), - (long) ctx->futhark_mc_segmap_parloop_6020_total_runtime[i], - (double) ctx->futhark_mc_segmap_parloop_6020_total_runtime[i] / - (ctx->futhark_mc_segmap_parloop_6020_iter[i] == - 0 ? 1 : (double) ctx->futhark_mc_segmap_parloop_6020_iter[i]), - (long) ctx->futhark_mc_segmap_parloop_6020_iter[i], - (long) ctx->futhark_mc_segmap_parloop_6020_iter[i] / - (ctx->futhark_mc_segmap_parloop_6020_runs[i] != - 0 ? ctx->futhark_mc_segmap_parloop_6020_runs[i] : 1)); - fprintf(ctx->log, - " futhark_mc_segmap_parloop_6020_total ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n", - ctx->futhark_mc_segmap_parloop_6020_total_runs, - (long) ctx->futhark_mc_segmap_parloop_6020_total_total_runtime / - (ctx->futhark_mc_segmap_parloop_6020_total_runs != - 0 ? 
ctx->futhark_mc_segmap_parloop_6020_total_runs : 1), - (long) ctx->futhark_mc_segmap_parloop_6020_total_total_runtime, - (double) ctx->futhark_mc_segmap_parloop_6020_total_total_runtime / - (ctx->futhark_mc_segmap_parloop_6020_total_iter == - 0 ? 1 : (double) ctx->futhark_mc_segmap_parloop_6020_total_iter), - (long) ctx->futhark_mc_segmap_parloop_6020_total_iter, - (long) ctx->futhark_mc_segmap_parloop_6020_total_iter / - (ctx->futhark_mc_segmap_parloop_6020_total_runs != - 0 ? ctx->futhark_mc_segmap_parloop_6020_total_runs : 1)); - ctx->total_runtime += - ctx->futhark_mc_segmap_parloop_6020_total_total_runtime; - ctx->total_runs += ctx->futhark_mc_segmap_parloop_6020_total_runs; - for (int i = 0; i < ctx->scheduler.num_threads; i++) - fprintf(ctx->log, - "tid %2d - futhark_mc_segmap_task_6018 ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n", - i, ctx->futhark_mc_segmap_task_6018_runs[i], - (long) ctx->futhark_mc_segmap_task_6018_total_runtime[i] / - (ctx->futhark_mc_segmap_task_6018_runs[i] != - 0 ? ctx->futhark_mc_segmap_task_6018_runs[i] : 1), - (long) ctx->futhark_mc_segmap_task_6018_total_runtime[i], - (double) ctx->futhark_mc_segmap_task_6018_total_runtime[i] / - (ctx->futhark_mc_segmap_task_6018_iter[i] == - 0 ? 1 : (double) ctx->futhark_mc_segmap_task_6018_iter[i]), - (long) ctx->futhark_mc_segmap_task_6018_iter[i], - (long) ctx->futhark_mc_segmap_task_6018_iter[i] / - (ctx->futhark_mc_segmap_task_6018_runs[i] != - 0 ? ctx->futhark_mc_segmap_task_6018_runs[i] : 1)); - for (int i = 0; i < ctx->scheduler.num_threads; i++) - fprintf(ctx->log, - "tid %2d - futhark_mc_segmap_parloop_6015 ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n", - i, ctx->futhark_mc_segmap_parloop_6015_runs[i], - (long) ctx->futhark_mc_segmap_parloop_6015_total_runtime[i] / - (ctx->futhark_mc_segmap_parloop_6015_runs[i] != - 0 ? 
ctx->futhark_mc_segmap_parloop_6015_runs[i] : 1), - (long) ctx->futhark_mc_segmap_parloop_6015_total_runtime[i], - (double) ctx->futhark_mc_segmap_parloop_6015_total_runtime[i] / - (ctx->futhark_mc_segmap_parloop_6015_iter[i] == - 0 ? 1 : (double) ctx->futhark_mc_segmap_parloop_6015_iter[i]), - (long) ctx->futhark_mc_segmap_parloop_6015_iter[i], - (long) ctx->futhark_mc_segmap_parloop_6015_iter[i] / - (ctx->futhark_mc_segmap_parloop_6015_runs[i] != - 0 ? ctx->futhark_mc_segmap_parloop_6015_runs[i] : 1)); - fprintf(ctx->log, - " futhark_mc_segmap_parloop_6015_total ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n", - ctx->futhark_mc_segmap_parloop_6015_total_runs, - (long) ctx->futhark_mc_segmap_parloop_6015_total_total_runtime / - (ctx->futhark_mc_segmap_parloop_6015_total_runs != - 0 ? ctx->futhark_mc_segmap_parloop_6015_total_runs : 1), - (long) ctx->futhark_mc_segmap_parloop_6015_total_total_runtime, - (double) ctx->futhark_mc_segmap_parloop_6015_total_total_runtime / - (ctx->futhark_mc_segmap_parloop_6015_total_iter == - 0 ? 1 : (double) ctx->futhark_mc_segmap_parloop_6015_total_iter), - (long) ctx->futhark_mc_segmap_parloop_6015_total_iter, - (long) ctx->futhark_mc_segmap_parloop_6015_total_iter / - (ctx->futhark_mc_segmap_parloop_6015_total_runs != - 0 ? ctx->futhark_mc_segmap_parloop_6015_total_runs : 1)); - ctx->total_runtime += - ctx->futhark_mc_segmap_parloop_6015_total_total_runtime; - ctx->total_runs += ctx->futhark_mc_segmap_parloop_6015_total_runs; - for (int i = 0; i < ctx->scheduler.num_threads; i++) - fprintf(ctx->log, - "tid %2d - futhark_mc_segmap_task_6009 ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n", - i, ctx->futhark_mc_segmap_task_6009_runs[i], - (long) ctx->futhark_mc_segmap_task_6009_total_runtime[i] / - (ctx->futhark_mc_segmap_task_6009_runs[i] != - 0 ? 
ctx->futhark_mc_segmap_task_6009_runs[i] : 1), - (long) ctx->futhark_mc_segmap_task_6009_total_runtime[i], - (double) ctx->futhark_mc_segmap_task_6009_total_runtime[i] / - (ctx->futhark_mc_segmap_task_6009_iter[i] == - 0 ? 1 : (double) ctx->futhark_mc_segmap_task_6009_iter[i]), - (long) ctx->futhark_mc_segmap_task_6009_iter[i], - (long) ctx->futhark_mc_segmap_task_6009_iter[i] / - (ctx->futhark_mc_segmap_task_6009_runs[i] != - 0 ? ctx->futhark_mc_segmap_task_6009_runs[i] : 1)); - for (int i = 0; i < ctx->scheduler.num_threads; i++) - fprintf(ctx->log, - "tid %2d - futhark_mc_segmap_nested_task_6013 ran %10d times; avg: %10ldus; total: %10ldus; time pr. iter %9.6f; iters %9ld; avg %ld\n", - i, ctx->futhark_mc_segmap_nested_task_6013_runs[i], - (long) ctx->futhark_mc_segmap_nested_task_6013_total_runtime[i] / - (ctx->futhark_mc_segmap_nested_task_6013_runs[i] != - 0 ? ctx->futhark_mc_segmap_nested_task_6013_runs[i] : 1), - (long) ctx->futhark_mc_segmap_nested_task_6013_total_runtime[i], - (double) ctx->futhark_mc_segmap_nested_task_6013_total_runtime[i] / - (ctx->futhark_mc_segmap_nested_task_6013_iter[i] == - 0 ? 1 : (double) ctx->futhark_mc_segmap_nested_task_6013_iter[i]), - (long) ctx->futhark_mc_segmap_nested_task_6013_iter[i], - (long) ctx->futhark_mc_segmap_nested_task_6013_iter[i] / - (ctx->futhark_mc_segmap_nested_task_6013_runs[i] != - 0 ? 
ctx->futhark_mc_segmap_nested_task_6013_runs[i] : 1)); - } - return builder.str; -} -char *futhark_context_get_error(struct futhark_context *ctx) -{ - char *error = ctx->error; - - ctx->error = NULL; - return error; -} -void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f) -{ - ctx->log = f; -} -void futhark_context_pause_profiling(struct futhark_context *ctx) -{ - ctx->profiling_paused = 1; -} -void futhark_context_unpause_profiling(struct futhark_context *ctx) -{ - ctx->profiling_paused = 0; -} -int futhark_context_clear_caches(struct futhark_context *ctx) -{ - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - ctx->peak_mem_usage_default = 0; - lock_unlock(&ctx->lock); - return ctx->error != NULL; -} -static int futrts_init(struct futhark_context *ctx, - struct memblock *out_mem_p_5993, - int64_t *out_scalar_out_5994, - int64_t *out_scalar_out_5995, - int64_t *out_scalar_out_5996, - struct memblock board_mem_5945, int64_t n_5861, - int64_t nb_rows_5863, int64_t nb_columns_5864, - int64_t sizze_5865); -static int futrts_key(struct futhark_context *ctx, - struct memblock *out_mem_p_5997, - int64_t *out_scalar_out_5998, - int64_t *out_scalar_out_5999, - int64_t *out_scalar_out_6000, - struct memblock board_mem_5945, int64_t implz2080U_5881, - int32_t e_5882, int32_t key_5883, int64_t nb_columns_5885, - int64_t nb_rows_5886, int64_t sizze_5887); -static int futrts_mouse(struct futhark_context *ctx, - struct memblock *out_mem_p_6001, - int64_t *out_scalar_out_6002, - int64_t *out_scalar_out_6003, - int64_t *out_scalar_out_6004, - struct memblock board_mem_5945, int64_t implz2080U_5866, - int32_t buttons_5867, int32_t x_5868, int32_t y_5869, - int64_t nb_columns_5871, int64_t nb_rows_5872, - int64_t sizze_5873); -static int futrts_render(struct futhark_context *ctx, - struct memblock *out_mem_p_6005, - int64_t *out_out_arrsizze_6006, - int64_t *out_out_arrsizze_6007, - struct memblock board_mem_5945, - int64_t implz2080U_5888, 
int64_t nb_columns_5890, - int64_t nb_rows_5891, int64_t sizze_5892); -static int futrts_resizze(struct futhark_context *ctx, - struct memblock *out_mem_p_6021, - int64_t *out_scalar_out_6022, - int64_t *out_scalar_out_6023, - int64_t *out_scalar_out_6024, - struct memblock board_mem_5945, - int64_t implz2080U_5848, int64_t h_5849, - int64_t w_5850, int64_t nb_columns_5852, - int64_t nb_rows_5853, int64_t sizze_5854); -static int futrts_step(struct futhark_context *ctx, - struct memblock *out_mem_p_6025, - int64_t *out_scalar_out_6026, - int64_t *out_scalar_out_6027, - int64_t *out_scalar_out_6028, - struct memblock board_mem_5945, int64_t implz2080U_5855, - float nameless_5856, int64_t nb_columns_5858, - int64_t nb_rows_5859, int64_t sizze_5860); -static int futrts_wheel(struct futhark_context *ctx, - struct memblock *out_mem_p_6029, - int64_t *out_scalar_out_6030, - int64_t *out_scalar_out_6031, - int64_t *out_scalar_out_6032, - struct memblock board_mem_5945, int64_t implz2080U_5874, - int32_t dx_5875, int32_t dy_5876, - int64_t nb_columns_5878, int64_t nb_rows_5879, - int64_t sizze_5880); -static int init_constants(struct futhark_context *ctx) -{ - (void) ctx; - - int err = 0; - - - cleanup: - return err; -} -static int free_constants(struct futhark_context *ctx) -{ - (void) ctx; - return 0; -} -struct futhark_mc_task_6008 { - struct futhark_context *ctx; - int64_t free_implz2080U_5888; - int64_t free_nb_columns_5890; - char *free_board_mem_5945; - int64_t free_bytes_5946; - char *free_mem_5964; -} ; -struct futhark_mc_segmap_parloop_struct_6010 { - struct futhark_context *ctx; - int64_t free_implz2080U_5888; - int64_t free_nb_columns_5890; - char *free_board_mem_5945; - int64_t free_bytes_5946; - char *free_mem_5964; -} ; -static int futhark_mc_segmap_parloop_6011(void *args, int64_t start, - int64_t end, int flat_tid_5915, - int tid) -{ - int err = 0; - struct futhark_mc_segmap_parloop_struct_6010 - *futhark_mc_segmap_parloop_struct_6010 = - (struct 
futhark_mc_segmap_parloop_struct_6010 *) args; - struct futhark_context *ctx = futhark_mc_segmap_parloop_struct_6010->ctx; - uint64_t futhark_mc_segmap_parloop_6011_start = 0; - - if (ctx->profiling && !ctx->profiling_paused) - futhark_mc_segmap_parloop_6011_start = get_wall_time(); - - int64_t implz2080U_5888 = - futhark_mc_segmap_parloop_struct_6010->free_implz2080U_5888; - int64_t nb_columns_5890 = - futhark_mc_segmap_parloop_struct_6010->free_nb_columns_5890; - struct memblock board_mem_5945 = {.desc ="board_mem_5945", .mem = - futhark_mc_segmap_parloop_struct_6010->free_board_mem_5945, - .size =0, .references =NULL}; - int64_t bytes_5946 = futhark_mc_segmap_parloop_struct_6010->free_bytes_5946; - struct memblock mem_5964 = {.desc ="mem_5964", .mem = - futhark_mc_segmap_parloop_struct_6010->free_mem_5964, - .size =0, .references =NULL}; - size_t mem_5949_cached_sizze_6012 = 0; - char *mem_5949 = NULL; - int64_t iterations = end - start; - int64_t iter_5969 = start; - - if (mem_5949_cached_sizze_6012 < (size_t) bytes_5946) { - mem_5949 = realloc(mem_5949, bytes_5946); - mem_5949_cached_sizze_6012 = bytes_5946; - } - for (; iter_5969 < end; iter_5969++) { - if (ctx->debugging) - fprintf(ctx->log, "%s\n", "SegMap fbody"); - - int64_t gtid_5916; - - gtid_5916 = iter_5969; - - int64_t x_5903; - - x_5903 = mul64(nb_columns_5890, gtid_5916); - for (int64_t i_5970 = 0; i_5970 < nb_columns_5890; i_5970++) { - int64_t get_cell_index_res_5972 = add64(x_5903, i_5970); - bool x_5973 = sle64((int64_t) 0, get_cell_index_res_5972); - bool y_5974 = slt64(get_cell_index_res_5972, implz2080U_5888); - bool bounds_check_5975 = x_5973 && y_5974; - bool index_certs_5976; - - if (!bounds_check_5975) { - ctx->error = - msgprintf("Error: %s%lld%s%lld%s\n\nBacktrace:\n%s", - "Index [", get_cell_index_res_5972, - "] out of bounds for array of shape [", - implz2080U_5888, "].", - "-> #0 /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:30:24-63\n #1 
/home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:30:10-107\n #2 /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:29:8-31:12\n #3 /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:26:1-31:12\n"); - return 1; - } - - int8_t x_5977 = - ((int8_t *) board_mem_5945.mem)[get_cell_index_res_5972]; - bool cond_5978 = x_5977 == (int8_t) 1; - int32_t defunc_0_f_res_5979; - - if (cond_5978) { - defunc_0_f_res_5979 = -1; - } else { - defunc_0_f_res_5979 = -16777216; - } - ((int32_t *) mem_5949)[i_5970] = defunc_0_f_res_5979; - } - memmove(mem_5964.mem + gtid_5916 * nb_columns_5890 * (int64_t) 4, - mem_5949 + (int64_t) 0, nb_columns_5890 * - (int64_t) sizeof(int32_t)); - } - - cleanup: - { } - free(mem_5949); - if (ctx->profiling && !ctx->profiling_paused) { - uint64_t futhark_mc_segmap_parloop_6011_end = get_wall_time(); - uint64_t elapsed = futhark_mc_segmap_parloop_6011_end - - futhark_mc_segmap_parloop_6011_start; - - ctx->futhark_mc_segmap_parloop_6011_runs[tid]++; - ctx->futhark_mc_segmap_parloop_6011_total_runtime[tid] += elapsed; - ctx->futhark_mc_segmap_parloop_6011_iter[tid] += iterations; - } - return err; -} -int futhark_mc_segmap_task_6009(void *args, int64_t iterations, int tid, - struct scheduler_info info) -{ - int err = 0; - int flat_tid_5915 = tid; - int num_tasks_5968 = info.nsubtasks; - struct futhark_mc_task_6008 *futhark_mc_task_6008 = - (struct futhark_mc_task_6008 *) args; - struct futhark_context *ctx = futhark_mc_task_6008->ctx; - uint64_t futhark_mc_segmap_task_6009_start = 0; - - if (ctx->profiling && !ctx->profiling_paused) - futhark_mc_segmap_task_6009_start = get_wall_time(); - - int64_t implz2080U_5888 = futhark_mc_task_6008->free_implz2080U_5888; - int64_t nb_columns_5890 = futhark_mc_task_6008->free_nb_columns_5890; - struct memblock board_mem_5945 = {.desc ="board_mem_5945", .mem = - futhark_mc_task_6008->free_board_mem_5945, - .size =0, .references =NULL}; - int64_t 
bytes_5946 = futhark_mc_task_6008->free_bytes_5946; - struct memblock mem_5964 = {.desc ="mem_5964", .mem = - futhark_mc_task_6008->free_mem_5964, .size =0, - .references =NULL}; - int64_t iter_5969; - struct futhark_mc_segmap_parloop_struct_6010 - futhark_mc_segmap_parloop_struct_6010; - - futhark_mc_segmap_parloop_struct_6010.ctx = ctx; - futhark_mc_segmap_parloop_struct_6010.free_implz2080U_5888 = - implz2080U_5888; - futhark_mc_segmap_parloop_struct_6010.free_nb_columns_5890 = - nb_columns_5890; - futhark_mc_segmap_parloop_struct_6010.free_board_mem_5945 = - board_mem_5945.mem; - futhark_mc_segmap_parloop_struct_6010.free_bytes_5946 = bytes_5946; - futhark_mc_segmap_parloop_struct_6010.free_mem_5964 = mem_5964.mem; - - struct scheduler_parloop futhark_mc_segmap_parloop_6011_task; - - futhark_mc_segmap_parloop_6011_task.name = "futhark_mc_segmap_parloop_6011"; - futhark_mc_segmap_parloop_6011_task.fn = futhark_mc_segmap_parloop_6011; - futhark_mc_segmap_parloop_6011_task.args = - &futhark_mc_segmap_parloop_struct_6010; - futhark_mc_segmap_parloop_6011_task.iterations = iterations; - futhark_mc_segmap_parloop_6011_task.info = info; - - uint64_t futhark_mc_segmap_parloop_6011_total_start = 0; - - if (ctx->profiling && !ctx->profiling_paused) - futhark_mc_segmap_parloop_6011_total_start = get_wall_time(); - - int futhark_mc_segmap_parloop_6011_err = - scheduler_execute_task(&ctx->scheduler, - &futhark_mc_segmap_parloop_6011_task); - - if (futhark_mc_segmap_parloop_6011_err != 0) { - err = 1; - goto cleanup; - } - if (ctx->profiling && !ctx->profiling_paused) { - uint64_t futhark_mc_segmap_parloop_6011_total_end = get_wall_time(); - uint64_t elapsed = futhark_mc_segmap_parloop_6011_total_end - - futhark_mc_segmap_parloop_6011_total_start; - - __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6011_total_runs, 1, - __ATOMIC_RELAXED); - __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6011_total_total_runtime, - elapsed, __ATOMIC_RELAXED); - 
__atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6011_total_iter, - iterations, __ATOMIC_RELAXED); - } - if (ctx->profiling && !ctx->profiling_paused) { - uint64_t futhark_mc_segmap_task_6009_end = get_wall_time(); - uint64_t elapsed = futhark_mc_segmap_task_6009_end - - futhark_mc_segmap_task_6009_start; - - ctx->futhark_mc_segmap_task_6009_runs[tid]++; - ctx->futhark_mc_segmap_task_6009_total_runtime[tid] += elapsed; - ctx->futhark_mc_segmap_task_6009_iter[tid] += iterations; - } - - cleanup: - { } - return err; -} -struct futhark_mc_segmap_parloop_struct_6014 { - struct futhark_context *ctx; - int64_t free_implz2080U_5888; - int64_t free_nb_columns_5890; - char *free_board_mem_5945; - int64_t free_bytes_5946; - char *free_mem_5964; -} ; -struct futhark_mc_task_6017 { - struct futhark_context *ctx; - int64_t free_implz2080U_5888; - int64_t free_x_5922; - char *free_board_mem_5945; - char *free_mem_5947; -} ; -struct futhark_mc_segmap_parloop_struct_6019 { - struct futhark_context *ctx; - int64_t free_implz2080U_5888; - int64_t free_x_5922; - char *free_board_mem_5945; - char *free_mem_5947; -} ; -static int futhark_mc_segmap_parloop_6020(void *args, int64_t start, - int64_t end, int flat_tid_5919, - int tid) -{ - int err = 0; - struct futhark_mc_segmap_parloop_struct_6019 - *futhark_mc_segmap_parloop_struct_6019 = - (struct futhark_mc_segmap_parloop_struct_6019 *) args; - struct futhark_context *ctx = futhark_mc_segmap_parloop_struct_6019->ctx; - uint64_t futhark_mc_segmap_parloop_6020_start = 0; - - if (ctx->profiling && !ctx->profiling_paused) - futhark_mc_segmap_parloop_6020_start = get_wall_time(); - - int64_t implz2080U_5888 = - futhark_mc_segmap_parloop_struct_6019->free_implz2080U_5888; - int64_t x_5922 = futhark_mc_segmap_parloop_struct_6019->free_x_5922; - struct memblock board_mem_5945 = {.desc ="board_mem_5945", .mem = - futhark_mc_segmap_parloop_struct_6019->free_board_mem_5945, - .size =0, .references =NULL}; - struct memblock mem_5947 = {.desc 
="mem_5947", .mem = - futhark_mc_segmap_parloop_struct_6019->free_mem_5947, - .size =0, .references =NULL}; - int64_t iterations = end - start; - int64_t iter_5992 = start; - - for (; iter_5992 < end; iter_5992++) { - if (ctx->debugging) - fprintf(ctx->log, "%s\n", "SegMap fbody"); - - int64_t gtid_5920; - - gtid_5920 = iter_5992; - - int64_t get_cell_index_res_5983; - - get_cell_index_res_5983 = add64(gtid_5920, x_5922); - - bool x_5984 = sle64((int64_t) 0, get_cell_index_res_5983); - bool y_5985 = slt64(get_cell_index_res_5983, implz2080U_5888); - bool bounds_check_5986 = x_5984 && y_5985; - bool index_certs_5987; - - if (!bounds_check_5986) { - ctx->error = msgprintf("Error: %s%lld%s%lld%s\n\nBacktrace:\n%s", - "Index [", get_cell_index_res_5983, - "] out of bounds for array of shape [", - implz2080U_5888, "].", - "-> #0 /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:30:24-63\n #1 /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:30:10-107\n #2 /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:29:8-31:12\n #3 /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:26:1-31:12\n"); - return 1; - } - - int8_t x_5988 = - ((int8_t *) board_mem_5945.mem)[get_cell_index_res_5983]; - bool cond_5989 = x_5988 == (int8_t) 1; - int32_t defunc_0_f_res_5990; - - if (cond_5989) { - defunc_0_f_res_5990 = -1; - } else { - defunc_0_f_res_5990 = -16777216; - } - ((int32_t *) mem_5947.mem)[gtid_5920] = defunc_0_f_res_5990; - } - - cleanup: - { } - if (ctx->profiling && !ctx->profiling_paused) { - uint64_t futhark_mc_segmap_parloop_6020_end = get_wall_time(); - uint64_t elapsed = futhark_mc_segmap_parloop_6020_end - - futhark_mc_segmap_parloop_6020_start; - - ctx->futhark_mc_segmap_parloop_6020_runs[tid]++; - ctx->futhark_mc_segmap_parloop_6020_total_runtime[tid] += elapsed; - ctx->futhark_mc_segmap_parloop_6020_iter[tid] += iterations; - } - return err; -} -int 
futhark_mc_segmap_task_6018(void *args, int64_t iterations, int tid, - struct scheduler_info info) -{ - int err = 0; - int flat_tid_5919 = tid; - int num_tasks_5991 = info.nsubtasks; - struct futhark_mc_task_6017 *futhark_mc_task_6017 = - (struct futhark_mc_task_6017 *) args; - struct futhark_context *ctx = futhark_mc_task_6017->ctx; - uint64_t futhark_mc_segmap_task_6018_start = 0; - - if (ctx->profiling && !ctx->profiling_paused) - futhark_mc_segmap_task_6018_start = get_wall_time(); - - int64_t implz2080U_5888 = futhark_mc_task_6017->free_implz2080U_5888; - int64_t x_5922 = futhark_mc_task_6017->free_x_5922; - struct memblock board_mem_5945 = {.desc ="board_mem_5945", .mem = - futhark_mc_task_6017->free_board_mem_5945, - .size =0, .references =NULL}; - struct memblock mem_5947 = {.desc ="mem_5947", .mem = - futhark_mc_task_6017->free_mem_5947, .size =0, - .references =NULL}; - int64_t iter_5992; - struct futhark_mc_segmap_parloop_struct_6019 - futhark_mc_segmap_parloop_struct_6019; - - futhark_mc_segmap_parloop_struct_6019.ctx = ctx; - futhark_mc_segmap_parloop_struct_6019.free_implz2080U_5888 = - implz2080U_5888; - futhark_mc_segmap_parloop_struct_6019.free_x_5922 = x_5922; - futhark_mc_segmap_parloop_struct_6019.free_board_mem_5945 = - board_mem_5945.mem; - futhark_mc_segmap_parloop_struct_6019.free_mem_5947 = mem_5947.mem; - - struct scheduler_parloop futhark_mc_segmap_parloop_6020_task; - - futhark_mc_segmap_parloop_6020_task.name = "futhark_mc_segmap_parloop_6020"; - futhark_mc_segmap_parloop_6020_task.fn = futhark_mc_segmap_parloop_6020; - futhark_mc_segmap_parloop_6020_task.args = - &futhark_mc_segmap_parloop_struct_6019; - futhark_mc_segmap_parloop_6020_task.iterations = iterations; - futhark_mc_segmap_parloop_6020_task.info = info; - - uint64_t futhark_mc_segmap_parloop_6020_total_start = 0; - - if (ctx->profiling && !ctx->profiling_paused) - futhark_mc_segmap_parloop_6020_total_start = get_wall_time(); - - int futhark_mc_segmap_parloop_6020_err = - 
scheduler_execute_task(&ctx->scheduler, - &futhark_mc_segmap_parloop_6020_task); - - if (futhark_mc_segmap_parloop_6020_err != 0) { - err = 1; - goto cleanup; - } - if (ctx->profiling && !ctx->profiling_paused) { - uint64_t futhark_mc_segmap_parloop_6020_total_end = get_wall_time(); - uint64_t elapsed = futhark_mc_segmap_parloop_6020_total_end - - futhark_mc_segmap_parloop_6020_total_start; - - __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6020_total_runs, 1, - __ATOMIC_RELAXED); - __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6020_total_total_runtime, - elapsed, __ATOMIC_RELAXED); - __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6020_total_iter, - iterations, __ATOMIC_RELAXED); - } - if (ctx->profiling && !ctx->profiling_paused) { - uint64_t futhark_mc_segmap_task_6018_end = get_wall_time(); - uint64_t elapsed = futhark_mc_segmap_task_6018_end - - futhark_mc_segmap_task_6018_start; - - ctx->futhark_mc_segmap_task_6018_runs[tid]++; - ctx->futhark_mc_segmap_task_6018_total_runtime[tid] += elapsed; - ctx->futhark_mc_segmap_task_6018_iter[tid] += iterations; - } - - cleanup: - { } - return err; -} -static int futhark_mc_segmap_parloop_6015(void *args, int64_t start, - int64_t end, int flat_tid_5917, - int tid) -{ - int err = 0; - struct futhark_mc_segmap_parloop_struct_6014 - *futhark_mc_segmap_parloop_struct_6014 = - (struct futhark_mc_segmap_parloop_struct_6014 *) args; - struct futhark_context *ctx = futhark_mc_segmap_parloop_struct_6014->ctx; - uint64_t futhark_mc_segmap_parloop_6015_start = 0; - - if (ctx->profiling && !ctx->profiling_paused) - futhark_mc_segmap_parloop_6015_start = get_wall_time(); - - int64_t implz2080U_5888 = - futhark_mc_segmap_parloop_struct_6014->free_implz2080U_5888; - int64_t nb_columns_5890 = - futhark_mc_segmap_parloop_struct_6014->free_nb_columns_5890; - struct memblock board_mem_5945 = {.desc ="board_mem_5945", .mem = - futhark_mc_segmap_parloop_struct_6014->free_board_mem_5945, - .size =0, .references =NULL}; - int64_t 
bytes_5946 = futhark_mc_segmap_parloop_struct_6014->free_bytes_5946; - struct memblock mem_5964 = {.desc ="mem_5964", .mem = - futhark_mc_segmap_parloop_struct_6014->free_mem_5964, - .size =0, .references =NULL}; - size_t mem_5947_cached_sizze_6016 = 0; - char *mem_5947 = NULL; - int64_t iterations = end - start; - int64_t iter_5982 = start; - - if (mem_5947_cached_sizze_6016 < (size_t) bytes_5946) { - mem_5947 = realloc(mem_5947, bytes_5946); - mem_5947_cached_sizze_6016 = bytes_5946; - } - for (; iter_5982 < end; iter_5982++) { - if (ctx->debugging) - fprintf(ctx->log, "%s\n", "SegMap fbody"); - - int64_t gtid_5918; - - gtid_5918 = iter_5982; - - int64_t x_5922; - - x_5922 = mul64(nb_columns_5890, gtid_5918); - - int64_t flat_tid_5919 = (int64_t) 0; - int32_t num_tasks_5991; - struct futhark_mc_task_6017 futhark_mc_task_6017; - - futhark_mc_task_6017.ctx = ctx; - futhark_mc_task_6017.free_implz2080U_5888 = implz2080U_5888; - futhark_mc_task_6017.free_x_5922 = x_5922; - futhark_mc_task_6017.free_board_mem_5945 = board_mem_5945.mem; - futhark_mc_task_6017.free_mem_5947 = mem_5947; - - struct scheduler_segop futhark_mc_task_6017_task; - - futhark_mc_task_6017_task.args = &futhark_mc_task_6017; - futhark_mc_task_6017_task.top_level_fn = futhark_mc_segmap_task_6018; - futhark_mc_task_6017_task.name = "futhark_mc_segmap_task_6018"; - futhark_mc_task_6017_task.iterations = nb_columns_5890; - futhark_mc_task_6017_task.task_time = - &ctx->futhark_mc_segmap_task_6018_total_time; - futhark_mc_task_6017_task.task_iter = - &ctx->futhark_mc_segmap_task_6018_total_iter; - futhark_mc_task_6017_task.sched = STATIC; - futhark_mc_task_6017_task.nested_fn = NULL; - - int futhark_mc_segmap_task_6018_err = - scheduler_prepare_task(&ctx->scheduler, &futhark_mc_task_6017_task); - - if (futhark_mc_segmap_task_6018_err != 0) { - err = 1; - goto cleanup; - } - memmove(mem_5964.mem + gtid_5918 * nb_columns_5890 * (int64_t) 4, - mem_5947 + (int64_t) 0, nb_columns_5890 * - (int64_t) 
sizeof(int32_t)); - } - - cleanup: - { } - free(mem_5947); - if (ctx->profiling && !ctx->profiling_paused) { - uint64_t futhark_mc_segmap_parloop_6015_end = get_wall_time(); - uint64_t elapsed = futhark_mc_segmap_parloop_6015_end - - futhark_mc_segmap_parloop_6015_start; - - ctx->futhark_mc_segmap_parloop_6015_runs[tid]++; - ctx->futhark_mc_segmap_parloop_6015_total_runtime[tid] += elapsed; - ctx->futhark_mc_segmap_parloop_6015_iter[tid] += iterations; - } - return err; -} -int futhark_mc_segmap_nested_task_6013(void *args, int64_t iterations, int tid, - struct scheduler_info info) -{ - int err = 0; - int flat_tid_5917 = tid; - int num_tasks_5968 = info.nsubtasks; - struct futhark_mc_task_6008 *futhark_mc_task_6008 = - (struct futhark_mc_task_6008 *) args; - struct futhark_context *ctx = futhark_mc_task_6008->ctx; - uint64_t futhark_mc_segmap_nested_task_6013_start = 0; - - if (ctx->profiling && !ctx->profiling_paused) - futhark_mc_segmap_nested_task_6013_start = get_wall_time(); - - int64_t implz2080U_5888 = futhark_mc_task_6008->free_implz2080U_5888; - int64_t nb_columns_5890 = futhark_mc_task_6008->free_nb_columns_5890; - struct memblock board_mem_5945 = {.desc ="board_mem_5945", .mem = - futhark_mc_task_6008->free_board_mem_5945, - .size =0, .references =NULL}; - int64_t bytes_5946 = futhark_mc_task_6008->free_bytes_5946; - struct memblock mem_5964 = {.desc ="mem_5964", .mem = - futhark_mc_task_6008->free_mem_5964, .size =0, - .references =NULL}; - int64_t iter_5982; - struct futhark_mc_segmap_parloop_struct_6014 - futhark_mc_segmap_parloop_struct_6014; - - futhark_mc_segmap_parloop_struct_6014.ctx = ctx; - futhark_mc_segmap_parloop_struct_6014.free_implz2080U_5888 = - implz2080U_5888; - futhark_mc_segmap_parloop_struct_6014.free_nb_columns_5890 = - nb_columns_5890; - futhark_mc_segmap_parloop_struct_6014.free_board_mem_5945 = - board_mem_5945.mem; - futhark_mc_segmap_parloop_struct_6014.free_bytes_5946 = bytes_5946; - 
futhark_mc_segmap_parloop_struct_6014.free_mem_5964 = mem_5964.mem; - - struct scheduler_parloop futhark_mc_segmap_parloop_6015_task; - - futhark_mc_segmap_parloop_6015_task.name = "futhark_mc_segmap_parloop_6015"; - futhark_mc_segmap_parloop_6015_task.fn = futhark_mc_segmap_parloop_6015; - futhark_mc_segmap_parloop_6015_task.args = - &futhark_mc_segmap_parloop_struct_6014; - futhark_mc_segmap_parloop_6015_task.iterations = iterations; - futhark_mc_segmap_parloop_6015_task.info = info; - - uint64_t futhark_mc_segmap_parloop_6015_total_start = 0; - - if (ctx->profiling && !ctx->profiling_paused) - futhark_mc_segmap_parloop_6015_total_start = get_wall_time(); - - int futhark_mc_segmap_parloop_6015_err = - scheduler_execute_task(&ctx->scheduler, - &futhark_mc_segmap_parloop_6015_task); - - if (futhark_mc_segmap_parloop_6015_err != 0) { - err = 1; - goto cleanup; - } - if (ctx->profiling && !ctx->profiling_paused) { - uint64_t futhark_mc_segmap_parloop_6015_total_end = get_wall_time(); - uint64_t elapsed = futhark_mc_segmap_parloop_6015_total_end - - futhark_mc_segmap_parloop_6015_total_start; - - __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6015_total_runs, 1, - __ATOMIC_RELAXED); - __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6015_total_total_runtime, - elapsed, __ATOMIC_RELAXED); - __atomic_fetch_add(&ctx->futhark_mc_segmap_parloop_6015_total_iter, - iterations, __ATOMIC_RELAXED); - } - if (ctx->profiling && !ctx->profiling_paused) { - uint64_t futhark_mc_segmap_nested_task_6013_end = get_wall_time(); - uint64_t elapsed = futhark_mc_segmap_nested_task_6013_end - - futhark_mc_segmap_nested_task_6013_start; - - ctx->futhark_mc_segmap_nested_task_6013_runs[tid]++; - ctx->futhark_mc_segmap_nested_task_6013_total_runtime[tid] += elapsed; - ctx->futhark_mc_segmap_nested_task_6013_iter[tid] += iterations; - } - - cleanup: - { } - return err; -} -static int futrts_init(struct futhark_context *ctx, - struct memblock *out_mem_p_5993, - int64_t 
*out_scalar_out_5994, - int64_t *out_scalar_out_5995, - int64_t *out_scalar_out_5996, - struct memblock board_mem_5945, int64_t n_5861, - int64_t nb_rows_5863, int64_t nb_columns_5864, - int64_t sizze_5865) -{ - (void) ctx; - - int err = 0; - struct memblock out_mem_5965; - - out_mem_5965.references = NULL; - - int64_t scalar_out_5966; - int64_t scalar_out_5967; - int64_t scalar_out_5968; - - if (memblock_set(ctx, &out_mem_5965, &board_mem_5945, "board_mem_5945") != - 0) - return 1; - scalar_out_5966 = nb_columns_5864; - scalar_out_5967 = nb_rows_5863; - scalar_out_5968 = sizze_5865; - (*out_mem_p_5993).references = NULL; - if (memblock_set(ctx, &*out_mem_p_5993, &out_mem_5965, "out_mem_5965") != 0) - return 1; - *out_scalar_out_5994 = scalar_out_5966; - *out_scalar_out_5995 = scalar_out_5967; - *out_scalar_out_5996 = scalar_out_5968; - if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0) - return 1; - - cleanup: - { } - return err; -} -static int futrts_key(struct futhark_context *ctx, - struct memblock *out_mem_p_5997, - int64_t *out_scalar_out_5998, - int64_t *out_scalar_out_5999, - int64_t *out_scalar_out_6000, - struct memblock board_mem_5945, int64_t implz2080U_5881, - int32_t e_5882, int32_t key_5883, int64_t nb_columns_5885, - int64_t nb_rows_5886, int64_t sizze_5887) -{ - (void) ctx; - - int err = 0; - struct memblock out_mem_5965; - - out_mem_5965.references = NULL; - - int64_t scalar_out_5966; - int64_t scalar_out_5967; - int64_t scalar_out_5968; - - if (memblock_set(ctx, &out_mem_5965, &board_mem_5945, "board_mem_5945") != - 0) - return 1; - scalar_out_5966 = nb_columns_5885; - scalar_out_5967 = nb_rows_5886; - scalar_out_5968 = sizze_5887; - (*out_mem_p_5997).references = NULL; - if (memblock_set(ctx, &*out_mem_p_5997, &out_mem_5965, "out_mem_5965") != 0) - return 1; - *out_scalar_out_5998 = scalar_out_5966; - *out_scalar_out_5999 = scalar_out_5967; - *out_scalar_out_6000 = scalar_out_5968; - if (memblock_unref(ctx, &out_mem_5965, 
"out_mem_5965") != 0) - return 1; - - cleanup: - { } - return err; -} -static int futrts_mouse(struct futhark_context *ctx, - struct memblock *out_mem_p_6001, - int64_t *out_scalar_out_6002, - int64_t *out_scalar_out_6003, - int64_t *out_scalar_out_6004, - struct memblock board_mem_5945, int64_t implz2080U_5866, - int32_t buttons_5867, int32_t x_5868, int32_t y_5869, - int64_t nb_columns_5871, int64_t nb_rows_5872, - int64_t sizze_5873) -{ - (void) ctx; - - int err = 0; - struct memblock out_mem_5965; - - out_mem_5965.references = NULL; - - int64_t scalar_out_5966; - int64_t scalar_out_5967; - int64_t scalar_out_5968; - - if (memblock_set(ctx, &out_mem_5965, &board_mem_5945, "board_mem_5945") != - 0) - return 1; - scalar_out_5966 = nb_columns_5871; - scalar_out_5967 = nb_rows_5872; - scalar_out_5968 = sizze_5873; - (*out_mem_p_6001).references = NULL; - if (memblock_set(ctx, &*out_mem_p_6001, &out_mem_5965, "out_mem_5965") != 0) - return 1; - *out_scalar_out_6002 = scalar_out_5966; - *out_scalar_out_6003 = scalar_out_5967; - *out_scalar_out_6004 = scalar_out_5968; - if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0) - return 1; - - cleanup: - { } - return err; -} -static int futrts_render(struct futhark_context *ctx, - struct memblock *out_mem_p_6005, - int64_t *out_out_arrsizze_6006, - int64_t *out_out_arrsizze_6007, - struct memblock board_mem_5945, - int64_t implz2080U_5888, int64_t nb_columns_5890, - int64_t nb_rows_5891, int64_t sizze_5892) -{ - (void) ctx; - - int err = 0; - struct memblock out_mem_5965; - - out_mem_5965.references = NULL; - - int64_t out_arrsizze_5966; - int64_t out_arrsizze_5967; - bool bounds_invalid_upwards_5893 = slt64(nb_rows_5891, (int64_t) 0); - bool valid_5894 = !bounds_invalid_upwards_5893; - bool range_valid_c_5895; - - if (!valid_5894) { - ctx->error = msgprintf("Error: %s%lld%s%lld%s%lld%s\n\nBacktrace:\n%s", - "Range ", (int64_t) 0, "..", (int64_t) 1, "..<", - nb_rows_5891, " is invalid.", - "-> #0 
/prelude/array.fut:90:3-10\n #1 /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:27:17-30\n #2 /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:26:1-31:12\n"); - if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0) - return 1; - return 1; - } - - bool bounds_invalid_upwards_5897 = slt64(nb_columns_5890, (int64_t) 0); - bool valid_5898 = !bounds_invalid_upwards_5897; - bool range_valid_c_5899; - - if (!valid_5898) { - ctx->error = msgprintf("Error: %s%lld%s%lld%s%lld%s\n\nBacktrace:\n%s", - "Range ", (int64_t) 0, "..", (int64_t) 1, "..<", - nb_columns_5890, " is invalid.", - "-> #0 /prelude/array.fut:90:3-10\n #1 /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:28:17-33\n #2 /home/baptistecdr/Documents/Cours/projet-de-bachelor/game_of_life/gol.fut:26:1-31:12\n"); - if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0) - return 1; - return 1; - } - - int64_t binop_x_5963 = nb_columns_5890 * nb_rows_5891; - int64_t bytes_5962 = (int64_t) 4 * binop_x_5963; - struct memblock mem_5964; - - mem_5964.references = NULL; - if (memblock_alloc(ctx, &mem_5964, bytes_5962, "mem_5964")) { - err = 1; - goto cleanup; - } - - int64_t bytes_5946 = (int64_t) 4 * nb_columns_5890; - int64_t flat_tid_5915 = (int64_t) 0; - int32_t num_tasks_5968; - int64_t flat_tid_5917; - - flat_tid_5917 = (int64_t) 0; - - struct futhark_mc_task_6008 futhark_mc_task_6008; - - futhark_mc_task_6008.ctx = ctx; - futhark_mc_task_6008.free_implz2080U_5888 = implz2080U_5888; - futhark_mc_task_6008.free_nb_columns_5890 = nb_columns_5890; - futhark_mc_task_6008.free_board_mem_5945 = board_mem_5945.mem; - futhark_mc_task_6008.free_bytes_5946 = bytes_5946; - futhark_mc_task_6008.free_mem_5964 = mem_5964.mem; - - struct scheduler_segop futhark_mc_task_6008_task; - - futhark_mc_task_6008_task.args = &futhark_mc_task_6008; - futhark_mc_task_6008_task.top_level_fn = futhark_mc_segmap_task_6009; - futhark_mc_task_6008_task.name = 
"futhark_mc_segmap_task_6009"; - futhark_mc_task_6008_task.iterations = nb_rows_5891; - futhark_mc_task_6008_task.task_time = - &ctx->futhark_mc_segmap_task_6009_total_time; - futhark_mc_task_6008_task.task_iter = - &ctx->futhark_mc_segmap_task_6009_total_iter; - futhark_mc_task_6008_task.sched = STATIC; - futhark_mc_task_6008_task.nested_fn = futhark_mc_segmap_nested_task_6013; - - int futhark_mc_segmap_task_6009_err = - scheduler_prepare_task(&ctx->scheduler, &futhark_mc_task_6008_task); - - if (futhark_mc_segmap_task_6009_err != 0) { - err = 1; - goto cleanup; - } - out_arrsizze_5966 = nb_rows_5891; - out_arrsizze_5967 = nb_columns_5890; - if (memblock_set(ctx, &out_mem_5965, &mem_5964, "mem_5964") != 0) - return 1; - (*out_mem_p_6005).references = NULL; - if (memblock_set(ctx, &*out_mem_p_6005, &out_mem_5965, "out_mem_5965") != 0) - return 1; - *out_out_arrsizze_6006 = out_arrsizze_5966; - *out_out_arrsizze_6007 = out_arrsizze_5967; - if (memblock_unref(ctx, &mem_5964, "mem_5964") != 0) - return 1; - if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0) - return 1; - - cleanup: - { } - return err; -} -static int futrts_resizze(struct futhark_context *ctx, - struct memblock *out_mem_p_6021, - int64_t *out_scalar_out_6022, - int64_t *out_scalar_out_6023, - int64_t *out_scalar_out_6024, - struct memblock board_mem_5945, - int64_t implz2080U_5848, int64_t h_5849, - int64_t w_5850, int64_t nb_columns_5852, - int64_t nb_rows_5853, int64_t sizze_5854) -{ - (void) ctx; - - int err = 0; - struct memblock out_mem_5965; - - out_mem_5965.references = NULL; - - int64_t scalar_out_5966; - int64_t scalar_out_5967; - int64_t scalar_out_5968; - - if (memblock_set(ctx, &out_mem_5965, &board_mem_5945, "board_mem_5945") != - 0) - return 1; - scalar_out_5966 = nb_columns_5852; - scalar_out_5967 = nb_rows_5853; - scalar_out_5968 = sizze_5854; - (*out_mem_p_6021).references = NULL; - if (memblock_set(ctx, &*out_mem_p_6021, &out_mem_5965, "out_mem_5965") != 0) - return 1; - 
*out_scalar_out_6022 = scalar_out_5966; - *out_scalar_out_6023 = scalar_out_5967; - *out_scalar_out_6024 = scalar_out_5968; - if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0) - return 1; - - cleanup: - { } - return err; -} -static int futrts_step(struct futhark_context *ctx, - struct memblock *out_mem_p_6025, - int64_t *out_scalar_out_6026, - int64_t *out_scalar_out_6027, - int64_t *out_scalar_out_6028, - struct memblock board_mem_5945, int64_t implz2080U_5855, - float nameless_5856, int64_t nb_columns_5858, - int64_t nb_rows_5859, int64_t sizze_5860) -{ - (void) ctx; - - int err = 0; - struct memblock out_mem_5965; - - out_mem_5965.references = NULL; - - int64_t scalar_out_5966; - int64_t scalar_out_5967; - int64_t scalar_out_5968; - - if (memblock_set(ctx, &out_mem_5965, &board_mem_5945, "board_mem_5945") != - 0) - return 1; - scalar_out_5966 = nb_columns_5858; - scalar_out_5967 = nb_rows_5859; - scalar_out_5968 = sizze_5860; - (*out_mem_p_6025).references = NULL; - if (memblock_set(ctx, &*out_mem_p_6025, &out_mem_5965, "out_mem_5965") != 0) - return 1; - *out_scalar_out_6026 = scalar_out_5966; - *out_scalar_out_6027 = scalar_out_5967; - *out_scalar_out_6028 = scalar_out_5968; - if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0) - return 1; - - cleanup: - { } - return err; -} -static int futrts_wheel(struct futhark_context *ctx, - struct memblock *out_mem_p_6029, - int64_t *out_scalar_out_6030, - int64_t *out_scalar_out_6031, - int64_t *out_scalar_out_6032, - struct memblock board_mem_5945, int64_t implz2080U_5874, - int32_t dx_5875, int32_t dy_5876, - int64_t nb_columns_5878, int64_t nb_rows_5879, - int64_t sizze_5880) -{ - (void) ctx; - - int err = 0; - struct memblock out_mem_5965; - - out_mem_5965.references = NULL; - - int64_t scalar_out_5966; - int64_t scalar_out_5967; - int64_t scalar_out_5968; - - if (memblock_set(ctx, &out_mem_5965, &board_mem_5945, "board_mem_5945") != - 0) - return 1; - scalar_out_5966 = nb_columns_5878; - 
scalar_out_5967 = nb_rows_5879; - scalar_out_5968 = sizze_5880; - (*out_mem_p_6029).references = NULL; - if (memblock_set(ctx, &*out_mem_p_6029, &out_mem_5965, "out_mem_5965") != 0) - return 1; - *out_scalar_out_6030 = scalar_out_5966; - *out_scalar_out_6031 = scalar_out_5967; - *out_scalar_out_6032 = scalar_out_5968; - if (memblock_unref(ctx, &out_mem_5965, "out_mem_5965") != 0) - return 1; - - cleanup: - { } - return err; -} -struct futhark_u32_2d { - struct memblock mem; - int64_t shape[2]; -} ; -struct futhark_u32_2d *futhark_new_u32_2d(struct futhark_context *ctx, const - uint32_t *data, int64_t dim0, - int64_t dim1) -{ - struct futhark_u32_2d *bad = NULL; - struct futhark_u32_2d *arr = - (struct futhark_u32_2d *) malloc(sizeof(struct futhark_u32_2d)); - - if (arr == NULL) - return bad; - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - arr->mem.references = NULL; - if (memblock_alloc(ctx, &arr->mem, (size_t) (dim0 * dim1) * - sizeof(uint32_t), "arr->mem")) - return NULL; - arr->shape[0] = dim0; - arr->shape[1] = dim1; - memmove(arr->mem.mem + 0, data + 0, (size_t) (dim0 * dim1) * - sizeof(uint32_t)); - lock_unlock(&ctx->lock); - return arr; -} -struct futhark_u32_2d *futhark_new_raw_u32_2d(struct futhark_context *ctx, const - char *data, int offset, - int64_t dim0, int64_t dim1) -{ - struct futhark_u32_2d *bad = NULL; - struct futhark_u32_2d *arr = - (struct futhark_u32_2d *) malloc(sizeof(struct futhark_u32_2d)); - - if (arr == NULL) - return bad; - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - arr->mem.references = NULL; - if (memblock_alloc(ctx, &arr->mem, (size_t) (dim0 * dim1) * - sizeof(uint32_t), "arr->mem")) - return NULL; - arr->shape[0] = dim0; - arr->shape[1] = dim1; - memmove(arr->mem.mem + 0, data + offset, (size_t) (dim0 * dim1) * - sizeof(uint32_t)); - lock_unlock(&ctx->lock); - return arr; -} -int futhark_free_u32_2d(struct futhark_context *ctx, struct futhark_u32_2d *arr) -{ - 
lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - if (memblock_unref(ctx, &arr->mem, "arr->mem") != 0) - return 1; - lock_unlock(&ctx->lock); - free(arr); - return 0; -} -int futhark_values_u32_2d(struct futhark_context *ctx, - struct futhark_u32_2d *arr, uint32_t *data) -{ - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - memmove(data + 0, arr->mem.mem + 0, (size_t) (arr->shape[0] * - arr->shape[1]) * - sizeof(uint32_t)); - lock_unlock(&ctx->lock); - return 0; -} -char *futhark_values_raw_u32_2d(struct futhark_context *ctx, - struct futhark_u32_2d *arr) -{ - (void) ctx; - return arr->mem.mem; -} -const int64_t *futhark_shape_u32_2d(struct futhark_context *ctx, - struct futhark_u32_2d *arr) -{ - (void) ctx; - return arr->shape; -} -struct futhark_i8_1d { - struct memblock mem; - int64_t shape[1]; -} ; -struct futhark_i8_1d *futhark_new_i8_1d(struct futhark_context *ctx, const - int8_t *data, int64_t dim0) -{ - struct futhark_i8_1d *bad = NULL; - struct futhark_i8_1d *arr = - (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d)); - - if (arr == NULL) - return bad; - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - arr->mem.references = NULL; - if (memblock_alloc(ctx, &arr->mem, (size_t) dim0 * sizeof(int8_t), - "arr->mem")) - return NULL; - arr->shape[0] = dim0; - memmove(arr->mem.mem + 0, data + 0, (size_t) dim0 * sizeof(int8_t)); - lock_unlock(&ctx->lock); - return arr; -} -struct futhark_i8_1d *futhark_new_raw_i8_1d(struct futhark_context *ctx, const - char *data, int offset, - int64_t dim0) -{ - struct futhark_i8_1d *bad = NULL; - struct futhark_i8_1d *arr = - (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d)); - - if (arr == NULL) - return bad; - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - arr->mem.references = NULL; - if (memblock_alloc(ctx, &arr->mem, (size_t) dim0 * sizeof(int8_t), - "arr->mem")) - return NULL; - arr->shape[0] = dim0; - 
memmove(arr->mem.mem + 0, data + offset, (size_t) dim0 * sizeof(int8_t)); - lock_unlock(&ctx->lock); - return arr; -} -int futhark_free_i8_1d(struct futhark_context *ctx, struct futhark_i8_1d *arr) -{ - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - if (memblock_unref(ctx, &arr->mem, "arr->mem") != 0) - return 1; - lock_unlock(&ctx->lock); - free(arr); - return 0; -} -int futhark_values_i8_1d(struct futhark_context *ctx, struct futhark_i8_1d *arr, - int8_t *data) -{ - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - memmove(data + 0, arr->mem.mem + 0, (size_t) arr->shape[0] * - sizeof(int8_t)); - lock_unlock(&ctx->lock); - return 0; -} -char *futhark_values_raw_i8_1d(struct futhark_context *ctx, - struct futhark_i8_1d *arr) -{ - (void) ctx; - return arr->mem.mem; -} -const int64_t *futhark_shape_i8_1d(struct futhark_context *ctx, - struct futhark_i8_1d *arr) -{ - (void) ctx; - return arr->shape; -} -struct futhark_opaque_state { - struct futhark_i8_1d *v0; - int64_t v1; - int64_t v2; - int64_t v3; -} ; -int futhark_free_opaque_state(struct futhark_context *ctx, - struct futhark_opaque_state *obj) -{ - int ret = 0, tmp; - - if (obj->v0 != NULL && (tmp = futhark_free_i8_1d(ctx, obj->v0)) != 0) - ret = tmp; - free(obj); - return ret; -} -int futhark_store_opaque_state(struct futhark_context *ctx, const - struct futhark_opaque_state *obj, void **p, - size_t *n) -{ - int ret = 0; - int64_t size_0 = 7 + 1 * sizeof(int64_t) + futhark_shape_i8_1d(ctx, - obj->v0)[0] * - 1; - int64_t size_1 = 7 + 0 * sizeof(int64_t) + 1 * 8; - int64_t size_2 = 7 + 0 * sizeof(int64_t) + 1 * 8; - int64_t size_3 = 7 + 0 * sizeof(int64_t) + 1 * 8; - - *n = size_0 + size_1 + size_2 + size_3; - if (p != NULL && *p == NULL) - *p = malloc(*n); - if (p != NULL) { - unsigned char *out = *p; - - *out++ = 'b'; - *out++ = 2; - *out++ = 1; - memcpy(out, " i8", 4); - out += 4; - memcpy(out, futhark_shape_i8_1d(ctx, obj->v0), 1 * sizeof(int64_t)); - out += 1 * 
sizeof(int64_t); - ret |= futhark_values_i8_1d(ctx, obj->v0, (void *) out); - out += futhark_shape_i8_1d(ctx, obj->v0)[0] * sizeof(int8_t); - *out++ = 'b'; - *out++ = 2; - *out++ = 0; - memcpy(out, " i64", 4); - out += 4; - memcpy(out, &obj->v1, sizeof(obj->v1)); - out += sizeof(obj->v1); - *out++ = 'b'; - *out++ = 2; - *out++ = 0; - memcpy(out, " i64", 4); - out += 4; - memcpy(out, &obj->v2, sizeof(obj->v2)); - out += sizeof(obj->v2); - *out++ = 'b'; - *out++ = 2; - *out++ = 0; - memcpy(out, " i64", 4); - out += 4; - memcpy(out, &obj->v3, sizeof(obj->v3)); - out += sizeof(obj->v3); - } - return ret; -} -struct futhark_opaque_state *futhark_restore_opaque_state(struct futhark_context *ctx, - const void *p) -{ - int err = 0; - const unsigned char *src = p; - struct futhark_opaque_state *obj = - malloc(sizeof(struct futhark_opaque_state)); - int64_t shape_0[1]; - - err |= *src++ != 'b'; - err |= *src++ != 2; - err |= *src++ != 1; - err |= memcmp(src, " i8", 4) != 0; - src += 4; - if (err == 0) { - memcpy(shape_0, src, 1 * sizeof(int64_t)); - src += 1 * sizeof(int64_t); - } - - const void *data_0 = src; - - obj->v0 = NULL; - src += shape_0[0] * sizeof(int8_t); - err |= *src++ != 'b'; - err |= *src++ != 2; - err |= *src++ != 0; - err |= memcmp(src, " i64", 4) != 0; - src += 4; - if (err == 0) { - src += 0 * sizeof(int64_t); - } - - const void *data_1 = src; - - src += sizeof(obj->v1); - err |= *src++ != 'b'; - err |= *src++ != 2; - err |= *src++ != 0; - err |= memcmp(src, " i64", 4) != 0; - src += 4; - if (err == 0) { - src += 0 * sizeof(int64_t); - } - - const void *data_2 = src; - - src += sizeof(obj->v2); - err |= *src++ != 'b'; - err |= *src++ != 2; - err |= *src++ != 0; - err |= memcmp(src, " i64", 4) != 0; - src += 4; - if (err == 0) { - src += 0 * sizeof(int64_t); - } - - const void *data_3 = src; - - src += sizeof(obj->v3); - if (err == 0) { - obj->v0 = futhark_new_i8_1d(ctx, data_0, shape_0[0]); - if (obj->v0 == NULL) - err = 1; - memcpy(&obj->v1, data_1, 
sizeof(obj->v1)); - memcpy(&obj->v2, data_2, sizeof(obj->v2)); - memcpy(&obj->v3, data_3, sizeof(obj->v3)); - } - if (err != 0) { - int ret = 0, tmp; - - if (obj->v0 != NULL && (tmp = futhark_free_i8_1d(ctx, obj->v0)) != 0) - ret = tmp; - free(obj); - obj = NULL; - } - return obj; -} -int futhark_entry_init(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const - struct futhark_i8_1d *in0, const int64_t in1, const - int64_t in2, const int64_t in3) -{ - struct memblock board_mem_5945; - - board_mem_5945.references = NULL; - - int64_t n_5861; - int64_t nb_rows_5863; - int64_t nb_columns_5864; - int64_t sizze_5865; - struct memblock out_mem_5965; - - out_mem_5965.references = NULL; - - int64_t scalar_out_5966; - int64_t scalar_out_5967; - int64_t scalar_out_5968; - int ret = 0; - - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - board_mem_5945 = in0->mem; - n_5861 = in0->shape[0]; - nb_rows_5863 = in1; - nb_columns_5864 = in2; - sizze_5865 = in3; - if (!(n_5861 == in0->shape[0] && (true && (true && true)))) { - ret = 1; - if (!ctx->error) - ctx->error = - msgprintf("Error: entry point arguments have invalid sizes.\n"); - } else { - ret = futrts_init(ctx, &out_mem_5965, &scalar_out_5966, - &scalar_out_5967, &scalar_out_5968, board_mem_5945, - n_5861, nb_rows_5863, nb_columns_5864, sizze_5865); - if (ret == 0) { - assert((*out0 = - (struct futhark_opaque_state *) malloc(sizeof(struct futhark_opaque_state))) != - NULL); - assert(((*out0)->v0 = - (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d))) != - NULL); - (*out0)->v0->mem = out_mem_5965; - (*out0)->v0->shape[0] = n_5861; - (*out0)->v1 = scalar_out_5966; - (*out0)->v2 = scalar_out_5967; - (*out0)->v3 = scalar_out_5968; - } - } - lock_unlock(&ctx->lock); - return ret; -} -int futhark_entry_key(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const int32_t in0, - const int32_t in1, const struct futhark_opaque_state *in2) -{ - struct memblock 
board_mem_5945; - - board_mem_5945.references = NULL; - - int64_t implz2080U_5881; - int32_t e_5882; - int32_t key_5883; - int64_t nb_columns_5885; - int64_t nb_rows_5886; - int64_t sizze_5887; - struct memblock out_mem_5965; - - out_mem_5965.references = NULL; - - int64_t scalar_out_5966; - int64_t scalar_out_5967; - int64_t scalar_out_5968; - int ret = 0; - - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - e_5882 = in0; - key_5883 = in1; - board_mem_5945 = in2->v0->mem; - implz2080U_5881 = in2->v0->shape[0]; - nb_columns_5885 = in2->v1; - nb_rows_5886 = in2->v2; - sizze_5887 = in2->v3; - if (!(true && (true && implz2080U_5881 == in2->v0->shape[0]))) { - ret = 1; - if (!ctx->error) - ctx->error = - msgprintf("Error: entry point arguments have invalid sizes.\n"); - } else { - ret = futrts_key(ctx, &out_mem_5965, &scalar_out_5966, &scalar_out_5967, - &scalar_out_5968, board_mem_5945, implz2080U_5881, - e_5882, key_5883, nb_columns_5885, nb_rows_5886, - sizze_5887); - if (ret == 0) { - assert((*out0 = - (struct futhark_opaque_state *) malloc(sizeof(struct futhark_opaque_state))) != - NULL); - assert(((*out0)->v0 = - (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d))) != - NULL); - (*out0)->v0->mem = out_mem_5965; - (*out0)->v0->shape[0] = implz2080U_5881; - (*out0)->v1 = scalar_out_5966; - (*out0)->v2 = scalar_out_5967; - (*out0)->v3 = scalar_out_5968; - } - } - lock_unlock(&ctx->lock); - return ret; -} -int futhark_entry_mouse(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const int32_t in0, - const int32_t in1, const int32_t in2, const - struct futhark_opaque_state *in3) -{ - struct memblock board_mem_5945; - - board_mem_5945.references = NULL; - - int64_t implz2080U_5866; - int32_t buttons_5867; - int32_t x_5868; - int32_t y_5869; - int64_t nb_columns_5871; - int64_t nb_rows_5872; - int64_t sizze_5873; - struct memblock out_mem_5965; - - out_mem_5965.references = NULL; - - int64_t scalar_out_5966; - int64_t 
scalar_out_5967; - int64_t scalar_out_5968; - int ret = 0; - - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - buttons_5867 = in0; - x_5868 = in1; - y_5869 = in2; - board_mem_5945 = in3->v0->mem; - implz2080U_5866 = in3->v0->shape[0]; - nb_columns_5871 = in3->v1; - nb_rows_5872 = in3->v2; - sizze_5873 = in3->v3; - if (!(true && (true && (true && implz2080U_5866 == in3->v0->shape[0])))) { - ret = 1; - if (!ctx->error) - ctx->error = - msgprintf("Error: entry point arguments have invalid sizes.\n"); - } else { - ret = futrts_mouse(ctx, &out_mem_5965, &scalar_out_5966, - &scalar_out_5967, &scalar_out_5968, board_mem_5945, - implz2080U_5866, buttons_5867, x_5868, y_5869, - nb_columns_5871, nb_rows_5872, sizze_5873); - if (ret == 0) { - assert((*out0 = - (struct futhark_opaque_state *) malloc(sizeof(struct futhark_opaque_state))) != - NULL); - assert(((*out0)->v0 = - (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d))) != - NULL); - (*out0)->v0->mem = out_mem_5965; - (*out0)->v0->shape[0] = implz2080U_5866; - (*out0)->v1 = scalar_out_5966; - (*out0)->v2 = scalar_out_5967; - (*out0)->v3 = scalar_out_5968; - } - } - lock_unlock(&ctx->lock); - return ret; -} -int futhark_entry_render(struct futhark_context *ctx, - struct futhark_u32_2d **out0, const - struct futhark_opaque_state *in0) -{ - struct memblock board_mem_5945; - - board_mem_5945.references = NULL; - - int64_t implz2080U_5888; - int64_t nb_columns_5890; - int64_t nb_rows_5891; - int64_t sizze_5892; - struct memblock out_mem_5965; - - out_mem_5965.references = NULL; - - int64_t out_arrsizze_5966; - int64_t out_arrsizze_5967; - int ret = 0; - - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - board_mem_5945 = in0->v0->mem; - implz2080U_5888 = in0->v0->shape[0]; - nb_columns_5890 = in0->v1; - nb_rows_5891 = in0->v2; - sizze_5892 = in0->v3; - if (!(implz2080U_5888 == in0->v0->shape[0])) { - ret = 1; - if (!ctx->error) - ctx->error = - msgprintf("Error: entry point 
arguments have invalid sizes.\n"); - } else { - ret = futrts_render(ctx, &out_mem_5965, &out_arrsizze_5966, - &out_arrsizze_5967, board_mem_5945, implz2080U_5888, - nb_columns_5890, nb_rows_5891, sizze_5892); - if (ret == 0) { - assert((*out0 = - (struct futhark_u32_2d *) malloc(sizeof(struct futhark_u32_2d))) != - NULL); - (*out0)->mem = out_mem_5965; - (*out0)->shape[0] = out_arrsizze_5966; - (*out0)->shape[1] = out_arrsizze_5967; - } - } - lock_unlock(&ctx->lock); - return ret; -} -int futhark_entry_resize(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const int64_t in0, - const int64_t in1, const - struct futhark_opaque_state *in2) -{ - struct memblock board_mem_5945; - - board_mem_5945.references = NULL; - - int64_t implz2080U_5848; - int64_t h_5849; - int64_t w_5850; - int64_t nb_columns_5852; - int64_t nb_rows_5853; - int64_t sizze_5854; - struct memblock out_mem_5965; - - out_mem_5965.references = NULL; - - int64_t scalar_out_5966; - int64_t scalar_out_5967; - int64_t scalar_out_5968; - int ret = 0; - - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - h_5849 = in0; - w_5850 = in1; - board_mem_5945 = in2->v0->mem; - implz2080U_5848 = in2->v0->shape[0]; - nb_columns_5852 = in2->v1; - nb_rows_5853 = in2->v2; - sizze_5854 = in2->v3; - if (!(true && (true && implz2080U_5848 == in2->v0->shape[0]))) { - ret = 1; - if (!ctx->error) - ctx->error = - msgprintf("Error: entry point arguments have invalid sizes.\n"); - } else { - ret = futrts_resizze(ctx, &out_mem_5965, &scalar_out_5966, - &scalar_out_5967, &scalar_out_5968, board_mem_5945, - implz2080U_5848, h_5849, w_5850, nb_columns_5852, - nb_rows_5853, sizze_5854); - if (ret == 0) { - assert((*out0 = - (struct futhark_opaque_state *) malloc(sizeof(struct futhark_opaque_state))) != - NULL); - assert(((*out0)->v0 = - (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d))) != - NULL); - (*out0)->v0->mem = out_mem_5965; - (*out0)->v0->shape[0] = implz2080U_5848; - 
(*out0)->v1 = scalar_out_5966; - (*out0)->v2 = scalar_out_5967; - (*out0)->v3 = scalar_out_5968; - } - } - lock_unlock(&ctx->lock); - return ret; -} -int futhark_entry_step(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const float in0, - const struct futhark_opaque_state *in1) -{ - struct memblock board_mem_5945; - - board_mem_5945.references = NULL; - - int64_t implz2080U_5855; - float nameless_5856; - int64_t nb_columns_5858; - int64_t nb_rows_5859; - int64_t sizze_5860; - struct memblock out_mem_5965; - - out_mem_5965.references = NULL; - - int64_t scalar_out_5966; - int64_t scalar_out_5967; - int64_t scalar_out_5968; - int ret = 0; - - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - nameless_5856 = in0; - board_mem_5945 = in1->v0->mem; - implz2080U_5855 = in1->v0->shape[0]; - nb_columns_5858 = in1->v1; - nb_rows_5859 = in1->v2; - sizze_5860 = in1->v3; - if (!(true && implz2080U_5855 == in1->v0->shape[0])) { - ret = 1; - if (!ctx->error) - ctx->error = - msgprintf("Error: entry point arguments have invalid sizes.\n"); - } else { - ret = futrts_step(ctx, &out_mem_5965, &scalar_out_5966, - &scalar_out_5967, &scalar_out_5968, board_mem_5945, - implz2080U_5855, nameless_5856, nb_columns_5858, - nb_rows_5859, sizze_5860); - if (ret == 0) { - assert((*out0 = - (struct futhark_opaque_state *) malloc(sizeof(struct futhark_opaque_state))) != - NULL); - assert(((*out0)->v0 = - (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d))) != - NULL); - (*out0)->v0->mem = out_mem_5965; - (*out0)->v0->shape[0] = implz2080U_5855; - (*out0)->v1 = scalar_out_5966; - (*out0)->v2 = scalar_out_5967; - (*out0)->v3 = scalar_out_5968; - } - } - lock_unlock(&ctx->lock); - return ret; -} -int futhark_entry_wheel(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const int32_t in0, - const int32_t in1, const - struct futhark_opaque_state *in2) -{ - struct memblock board_mem_5945; - - board_mem_5945.references = NULL; - - 
int64_t implz2080U_5874; - int32_t dx_5875; - int32_t dy_5876; - int64_t nb_columns_5878; - int64_t nb_rows_5879; - int64_t sizze_5880; - struct memblock out_mem_5965; - - out_mem_5965.references = NULL; - - int64_t scalar_out_5966; - int64_t scalar_out_5967; - int64_t scalar_out_5968; - int ret = 0; - - lock_lock(&ctx->lock); - worker_local = &ctx->scheduler.workers[0]; - dx_5875 = in0; - dy_5876 = in1; - board_mem_5945 = in2->v0->mem; - implz2080U_5874 = in2->v0->shape[0]; - nb_columns_5878 = in2->v1; - nb_rows_5879 = in2->v2; - sizze_5880 = in2->v3; - if (!(true && (true && implz2080U_5874 == in2->v0->shape[0]))) { - ret = 1; - if (!ctx->error) - ctx->error = - msgprintf("Error: entry point arguments have invalid sizes.\n"); - } else { - ret = futrts_wheel(ctx, &out_mem_5965, &scalar_out_5966, - &scalar_out_5967, &scalar_out_5968, board_mem_5945, - implz2080U_5874, dx_5875, dy_5876, nb_columns_5878, - nb_rows_5879, sizze_5880); - if (ret == 0) { - assert((*out0 = - (struct futhark_opaque_state *) malloc(sizeof(struct futhark_opaque_state))) != - NULL); - assert(((*out0)->v0 = - (struct futhark_i8_1d *) malloc(sizeof(struct futhark_i8_1d))) != - NULL); - (*out0)->v0->mem = out_mem_5965; - (*out0)->v0->shape[0] = implz2080U_5874; - (*out0)->v1 = scalar_out_5966; - (*out0)->v2 = scalar_out_5967; - (*out0)->v3 = scalar_out_5968; - } - } - lock_unlock(&ctx->lock); - return ret; -} diff --git a/game_of_life/gol.fut b/game_of_life/gol.fut deleted file mode 100644 index da907c68aa88400cf61f247bfcc5c200dd107381..0000000000000000000000000000000000000000 --- a/game_of_life/gol.fut +++ /dev/null @@ -1,34 +0,0 @@ -import "./lib/github.com/diku-dk/lys/lys" - -type sized_state [n] = {board: [n]i8, nb_rows: i64, nb_columns:i64, size:i64} - -type^ state = sized_state [] - -let keydown (key: i32) (s: state) = s -let event (e: event) (s: state): state = s - -entry mouse (buttons: i32) (x: i32) (y: i32) (s: state): state = - event (#mouse {buttons, x, y}) s - -entry wheel (dx: i32) 
(dy: i32) (s: state): state = - event (#wheel {dx, dy}) s - -entry key (e: i32) (key: i32) (s: state): state = - let e' = if e == 0 then #keydown {key} else #keyup {key} - in event e' s - -entry resize (h: i64) (w: i64) (s: state): state = s - -let get_cell_index (x:i64) (y:i64) (nb_columns:i64) :i64 = (y * nb_columns + x) - -entry step (_: f32) (s: state): state = s - -entry render (s: state): [][]argb.colour = - let ridxs = iota s.nb_rows - let cidxs = iota s.nb_columns - in map (\y -> - map (\x -> if s.board[get_cell_index x y s.nb_columns] == 1 then argb.white else argb.black) cidxs) - ridxs - -entry init [n] (board: [n]i8) (nb_rows: i64) (nb_columns: i64) (size:i64) : state = - { board = board, nb_rows = nb_rows, nb_columns = nb_columns, size = size } diff --git a/game_of_life/gol.h b/game_of_life/gol.h deleted file mode 100644 index dca27183d50680d61c3ff4ed213266b924554909..0000000000000000000000000000000000000000 --- a/game_of_life/gol.h +++ /dev/null @@ -1,120 +0,0 @@ -#pragma once - -// Headers - -#include <stdint.h> -#include <stddef.h> -#include <stdbool.h> -#include <stdio.h> -#include <float.h> - -#ifdef __cplusplus -extern "C" { -#endif - -// Initialisation - -struct futhark_context_config ; -struct futhark_context_config *futhark_context_config_new(void); -void futhark_context_config_free(struct futhark_context_config *cfg); -void futhark_context_config_set_debugging(struct futhark_context_config *cfg, - int flag); -void futhark_context_config_set_profiling(struct futhark_context_config *cfg, - int flag); -void futhark_context_config_set_logging(struct futhark_context_config *cfg, - int flag); -void futhark_context_config_set_num_threads(struct futhark_context_config *cfg, - int n); -struct futhark_context ; -struct futhark_context *futhark_context_new(struct futhark_context_config *cfg); -void futhark_context_free(struct futhark_context *ctx); -int futhark_context_sync(struct futhark_context *ctx); -int futhark_context_config_set_size(struct 
futhark_context_config *cfg, const - char *size_name, size_t size_value); -int futhark_get_num_sizes(void); -const char *futhark_get_size_name(int); -const char *futhark_get_size_class(int); - -// Arrays - -struct futhark_i8_1d ; -struct futhark_i8_1d *futhark_new_i8_1d(struct futhark_context *ctx, const - int8_t *data, int64_t dim0); -struct futhark_i8_1d *futhark_new_raw_i8_1d(struct futhark_context *ctx, const - char *data, int offset, - int64_t dim0); -int futhark_free_i8_1d(struct futhark_context *ctx, struct futhark_i8_1d *arr); -int futhark_values_i8_1d(struct futhark_context *ctx, struct futhark_i8_1d *arr, - int8_t *data); -char *futhark_values_raw_i8_1d(struct futhark_context *ctx, - struct futhark_i8_1d *arr); -const int64_t *futhark_shape_i8_1d(struct futhark_context *ctx, - struct futhark_i8_1d *arr); -struct futhark_u32_2d ; -struct futhark_u32_2d *futhark_new_u32_2d(struct futhark_context *ctx, const - uint32_t *data, int64_t dim0, - int64_t dim1); -struct futhark_u32_2d *futhark_new_raw_u32_2d(struct futhark_context *ctx, const - char *data, int offset, - int64_t dim0, int64_t dim1); -int futhark_free_u32_2d(struct futhark_context *ctx, - struct futhark_u32_2d *arr); -int futhark_values_u32_2d(struct futhark_context *ctx, - struct futhark_u32_2d *arr, uint32_t *data); -char *futhark_values_raw_u32_2d(struct futhark_context *ctx, - struct futhark_u32_2d *arr); -const int64_t *futhark_shape_u32_2d(struct futhark_context *ctx, - struct futhark_u32_2d *arr); - -// Opaque values - -struct futhark_opaque_state ; -int futhark_free_opaque_state(struct futhark_context *ctx, - struct futhark_opaque_state *obj); -int futhark_store_opaque_state(struct futhark_context *ctx, const - struct futhark_opaque_state *obj, void **p, - size_t *n); -struct futhark_opaque_state -*futhark_restore_opaque_state(struct futhark_context *ctx, const void *p); - -// Entry points - -int futhark_entry_init(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const - 
struct futhark_i8_1d *in0, const int64_t in1, const - int64_t in2, const int64_t in3); -int futhark_entry_key(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const int32_t in0, - const int32_t in1, const - struct futhark_opaque_state *in2); -int futhark_entry_mouse(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const int32_t in0, - const int32_t in1, const int32_t in2, const - struct futhark_opaque_state *in3); -int futhark_entry_render(struct futhark_context *ctx, - struct futhark_u32_2d **out0, const - struct futhark_opaque_state *in0); -int futhark_entry_resize(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const int64_t in0, - const int64_t in1, const - struct futhark_opaque_state *in2); -int futhark_entry_step(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const float in0, - const struct futhark_opaque_state *in1); -int futhark_entry_wheel(struct futhark_context *ctx, - struct futhark_opaque_state **out0, const int32_t in0, - const int32_t in1, const - struct futhark_opaque_state *in2); - -// Miscellaneous - -char *futhark_context_report(struct futhark_context *ctx); -char *futhark_context_get_error(struct futhark_context *ctx); -void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f); -void futhark_context_pause_profiling(struct futhark_context *ctx); -void futhark_context_unpause_profiling(struct futhark_context *ctx); -int futhark_context_clear_caches(struct futhark_context *ctx); -#define FUTHARK_BACKEND_multicore -#ifdef __cplusplus -} -#endif diff --git a/game_of_life/lib/github.com/athas/matte/.gitignore b/game_of_life/lib/github.com/athas/matte/.gitignore deleted file mode 100644 index 3d8fd0fe6760e20a08eee8bc30d61cf8883d0648..0000000000000000000000000000000000000000 --- a/game_of_life/lib/github.com/athas/matte/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -* -!.gitignore -!*.fut diff --git a/game_of_life/lib/github.com/athas/matte/colour.fut 
b/game_of_life/lib/github.com/athas/matte/colour.fut deleted file mode 100644 index 4d47177c1087302ee8da3d86e3c939868c086c91..0000000000000000000000000000000000000000 --- a/game_of_life/lib/github.com/athas/matte/colour.fut +++ /dev/null @@ -1,186 +0,0 @@ --- | Colour manipulation library. --- --- Adapted from the [Gloss](https://hackage.haskell.org/package/gloss) --- library by Ben Lippmeier. - --- | A colour that can be converted back and forth between an RGBA --- representation. Not very useful by itself, but using just this --- interface one can generate a lot of other useful functions via the --- colourspace parametric module. -module type colour = { - type colour - - -- | Construct a colour from R, G, B and A channels, each of which - -- must be a floating-point number between 0.0 and 1.0. The - -- concrete representation need not be able to handle the full - -- precision of each channel. Thus, `from_rgba` and `to_rgba` need - -- not be inverse of each other (but should be close). - val from_rgba: f32 -> f32 -> f32 -> f32 -> colour - - -- | Convert a colour to four R, G, B and A channels, each of which - -- is a floating-point number between 0.0 and 1.0. - val to_rgba: colour -> (f32, f32, f32, f32) -} - --- | A colour representation that encodes the four RGBA channels as a --- byte each in a 32-bit word, using the order A-R-G-B. -module argb_colour: colour with colour = u32 = { - -- ARGB storage. 
- type colour = u32 - - let clamp_channel (x: f32): f32 = - if x < 0f32 then 0f32 else if x > 1f32 then 1f32 else x - - let from_rgba (r: f32) (g: f32) (b: f32) (a: f32): colour = - ((u32.f32 (clamp_channel a * 255) << 24) | - (u32.f32 (clamp_channel r * 255) << 16) | - (u32.f32 (clamp_channel g * 255) << 8) | - (u32.f32 (clamp_channel b * 255))) - - let to_rgba (x: colour): (f32,f32,f32,f32) = - (f32.u32 ((x>>16) & 0xFF) / 255, - f32.u32 ((x>>8) & 0xFF) / 255, - f32.u32 ((x>>0) & 0xFF) / 255, - f32.u32 ((x>>24) & 0xFF) / 255) -} - --- | A colour representation and a host of useful functions and constants. -module type colourspace = { - include colour - - -- | Add RGB components of a color component-wise, then normalise - -- them to the highest resulting one. The alpha components are - -- averaged. - val add: colour -> colour -> colour - - -- | Add RGBA components of a color component-wise, capping them at - -- the maximum. - val add_linear: colour -> colour -> colour - - val mult: colour -> colour -> colour - val scale: colour -> f32 -> colour - val mix: f32 -> colour -> f32 -> colour -> colour - - -- | Brighten 20%. - val bright: colour -> colour - -- | Dim 20%. - val dim: colour -> colour - -- | 20% lighter. - val light: colour -> colour - -- | 20% darker. - val dark: colour -> colour - - -- Basic colours - val black: colour - val red: colour - val green: colour - val blue: colour - val white: colour - val brown: colour - - -- Derived colours - val yellow: colour - val orange: colour - val magenta: colour - val violet: colour - - -- | Grayness from 0-1. - val gray: f32 -> colour -} - --- | Given a colour representation, construct a colourspace with all --- the handy functions and constants. 
-module colourspace(C: colour): colourspace with colour = C.colour = { - open C - - let from_rgb_normalised (r: f32) (g: f32) (b: f32): colour = - let m = f32.max r (f32.max g b) - in from_rgba (r / m) (g / m) (b / m) 1f32 - - -- Normalise a color to the value of its largest RGB component. - let normalised_colour (r: f32) (g: f32) (b: f32) (a: f32): colour = - let m = f32.max r (f32.max g b) - in from_rgba (r / m) (g / m) (b / m) a - - let add (x: colour) (y: colour): colour = - let (r1,g1,b1,a1) = to_rgba x - let (r2,g2,b2,a2) = to_rgba y - in normalised_colour - (f32.max r1 r2) - (f32.max g1 g2) - (f32.max b1 b2) - ((a1+a2)/2f32) - - let add_linear (x: colour) (y: colour): colour = - let (r1,g1,b1,a1) = to_rgba x - let (r2,g2,b2,a2) = to_rgba y - in from_rgba (r1+r2) (g1+g2) (b1+b2) (a1+a2) - - let mult (x: colour) (y: colour): colour = - let (r1,g1,b1,a1) = to_rgba x - let (r2,g2,b2,a2) = to_rgba y - in from_rgba (r1*r2) (g1*g2) (b1*b2) (a1*a2) - - let scale (x: colour) (s: f32): colour = - let (r,g,b,a) = to_rgba x - in from_rgba (r*s) (g*s) (b*s) (a*s) - - let mix (m1: f32) (c1: colour) (m2: f32) (c2: colour): colour = - let (r1,g1,b1,a1) = to_rgba c1 - let (r2,g2,b2,a2) = to_rgba c2 - - let m12 = m1 + m2 - let m1' = m1 / m12 - let m2' = m2 / m12 - - let r1s = r1 * r1 - let r2s = r2 * r2 - - let g1s = g1 * g1 - let g2s = g2 * g2 - - let b1s = b1 * b1 - let b2s = b2 * b2 - - in from_rgba (f32.sqrt (m1' * r1s + m2' * r2s)) - (f32.sqrt (m1' * g1s + m2' * g2s)) - (f32.sqrt (m1' * b1s + m2' * b2s)) - ((m1 * a1 + m2 * a2) / m12) - - - let bright (c: colour): colour = - let (r,g,b,a) = to_rgba c - in from_rgba (r * 1.2f32) (g * 1.2f32) (b * 1.2f32) a - - let dim (c: colour): colour = - let (r,g,b,a) = to_rgba c - in from_rgba (r * 0.8f32) (g * 0.8f32) (b * 0.8f32) a - - let light (c: colour): colour = - let (r,g,b,a) = to_rgba c - in from_rgba (r + 0.2f32) (g + 0.2f32) (b + 0.2f32) a - - let dark (c: colour): colour = - let (r,g,b,a) = to_rgba c - in from_rgba (r - 
0.2f32) (g - 0.2f32) (b - 0.2f32) a - - -- Basic colours - let black: colour = from_rgba 0f32 0f32 0f32 1f32 - let red: colour = from_rgba 1f32 0f32 0f32 1f32 - let green: colour = from_rgba 0f32 1f32 0f32 1f32 - let blue: colour = from_rgba 0f32 0f32 1f32 1f32 - let white: colour = from_rgba 1f32 1f32 1f32 1f32 - let brown: colour = from_rgba 0.49f32 0.19f32 0.11f32 1f32 - - -- Derived colours - let yellow: colour = add red green - let orange: colour = add yellow red - let magenta: colour = add red blue - let violet: colour = add magenta blue - - let gray (d: f32): colour = from_rgba d d d 1f32 -} - --- | An ARGB colour space - simply `colourspace`@term applied to --- `argb_colour`@term. -module argb: colourspace with colour = argb_colour.colour = colourspace argb_colour diff --git a/game_of_life/lib/github.com/athas/matte/colour_test.fut b/game_of_life/lib/github.com/athas/matte/colour_test.fut deleted file mode 100644 index f2e5eed7425380e45f831f11ea32936c6327578b..0000000000000000000000000000000000000000 --- a/game_of_life/lib/github.com/athas/matte/colour_test.fut +++ /dev/null @@ -1,17 +0,0 @@ --- | ignore - --- Proper tests of this library require drawing colours to the screen, --- I think. 
- -import "colour" - --- == --- entry: basic_mix --- input {} output {0.7058824f32 0.7058824f32 0.7058824f32 1.0f32} -entry basic_mix = - argb.to_rgba (argb.mix 0.5f32 argb.white 0.5f32 argb.black) - --- == --- entry: is_argb --- input {} output {0xFF000000u32} -entry is_argb: u32 = argb.black diff --git a/game_of_life/lib/github.com/diku-dk/lys/Inconsolata-Regular.ttf b/game_of_life/lib/github.com/diku-dk/lys/Inconsolata-Regular.ttf deleted file mode 100644 index 592ccd20073f76a663c56fe0176397149782565c..0000000000000000000000000000000000000000 Binary files a/game_of_life/lib/github.com/diku-dk/lys/Inconsolata-Regular.ttf and /dev/null differ diff --git a/game_of_life/lib/github.com/diku-dk/lys/common.mk b/game_of_life/lib/github.com/diku-dk/lys/common.mk deleted file mode 100644 index b6756ebf6c9d6d52606c4fd58e88fcc8c6384e0b..0000000000000000000000000000000000000000 --- a/game_of_life/lib/github.com/diku-dk/lys/common.mk +++ /dev/null @@ -1,36 +0,0 @@ -.PHONY: all run clean - -PROGNAME?=lys - -all: $(PROGNAME) - -LYS_TTF=1 - -ifeq ($(shell test futhark.pkg -nt lib; echo $$?),0) -$(PROGNAME): - futhark pkg sync - @make # The sync might have resulted in a new Makefile. -else -include lib/github.com/diku-dk/lys/setup_flags.mk -$(PROGNAME): $(PROGNAME)_wrapper.o $(PROGNAME)_printf.h lib/github.com/diku-dk/lys/liblys.c lib/github.com/diku-dk/lys/liblys.h lib/github.com/diku-dk/lys/context_setup.c lib/github.com/diku-dk/lys/context_setup.h lib/github.com/diku-dk/lys/main.c - gcc lib/github.com/diku-dk/lys/liblys.c lib/github.com/diku-dk/lys/context_setup.c lib/github.com/diku-dk/lys/main.c -I. -DPROGHEADER='"$(PROGNAME)_wrapper.h"' -DPRINTFHEADER='"$(PROGNAME)_printf.h"' $(PROGNAME)_wrapper.o -o $@ $(CFLAGS) $(LDFLAGS) -endif - -$(PROGNAME)_printf.h: $(PROGNAME)_wrapper.c - python3 lib/github.com/diku-dk/lys/gen_printf.py $@ $< - -# We do not want warnings and such for the generated code. 
-$(PROGNAME)_wrapper.o: $(PROGNAME)_wrapper.c - gcc -o $@ -c $< $(NOWARN_CFLAGS) - -%.c: %.fut - futhark $(LYS_BACKEND) --library $< - -%_wrapper.fut: lib/github.com/diku-dk/lys/genlys.fut $(PROG_FUT_DEPS) - cat $< | sed 's/"lys"/"$(PROGNAME)"/' > $@ - -run: $(PROGNAME) - ./$(PROGNAME) - -clean: - rm -f $(PROGNAME) $(PROGNAME).c $(PROGNAME).h $(PROGNAME)_wrapper.* $(PROGNAME)_printf.h *.o diff --git a/game_of_life/lib/github.com/diku-dk/lys/context_setup.c b/game_of_life/lib/github.com/diku-dk/lys/context_setup.c deleted file mode 100644 index 96a387b2713eb49b1097c1888bc788facb0cbc8b..0000000000000000000000000000000000000000 --- a/game_of_life/lib/github.com/diku-dk/lys/context_setup.c +++ /dev/null @@ -1,50 +0,0 @@ -#include "context_setup.h" - -void lys_setup_futhark_context(const char *deviceopt, bool device_interactive, - struct futhark_context_config* *futcfg, - struct futhark_context* *futctx, - char* *opencl_device_name) { - *futcfg = futhark_context_config_new(); - assert(*futcfg != NULL); - -#if defined(FUTHARK_BACKEND_opencl) || defined(FUTHARK_BACKEND_cuda) - if (deviceopt != NULL) { - futhark_context_config_set_device(*futcfg, deviceopt); - } -#else - (void)deviceopt; -#endif - -#ifdef FUTHARK_BACKEND_opencl - if (device_interactive) { - futhark_context_config_select_device_interactively(*futcfg); - } -#else - (void)device_interactive; -#endif - - *futctx = futhark_context_new(*futcfg); - assert(*futctx != NULL); - -#ifdef FUTHARK_BACKEND_opencl - cl_device_id device; - assert(clGetCommandQueueInfo(futhark_context_get_command_queue(*futctx), - CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL) - == CL_SUCCESS); - - size_t dev_name_size; - assert(clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &dev_name_size) - == CL_SUCCESS); - *opencl_device_name = malloc(dev_name_size); - assert(clGetDeviceInfo(device, CL_DEVICE_NAME, dev_name_size, *opencl_device_name, NULL) - == CL_SUCCESS); -#else - *opencl_device_name = NULL; -#endif -} - -int64_t 
lys_wall_time() { - struct timeval time; - assert(gettimeofday(&time,NULL) == 0); - return time.tv_sec * 1000000 + time.tv_usec; -} diff --git a/game_of_life/lib/github.com/diku-dk/lys/context_setup.h b/game_of_life/lib/github.com/diku-dk/lys/context_setup.h deleted file mode 100644 index d613bd7396d107c2db8e5a1296ea9484f6e00f51..0000000000000000000000000000000000000000 --- a/game_of_life/lib/github.com/diku-dk/lys/context_setup.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef LIBLYS_CONTEXT_SETUP -#define LIBLYS_CONTEXT_SETUP - -#include <stdio.h> -#include <stdlib.h> -#include <assert.h> -#include <time.h> -#include <sys/time.h> - -#include PROGHEADER - -void lys_setup_futhark_context(const char *deviceopt, bool device_interactive, - struct futhark_context_config* *futcfg, - struct futhark_context* *futctx, - char* *opencl_device_name); - -int64_t lys_wall_time(); - -#define FUT_CHECK(ctx, x) _fut_check(ctx, x, __FILE__, __LINE__) -static inline void _fut_check(struct futhark_context *ctx, int res, - const char *file, int line) { - if (res != 0) { - fprintf(stderr, "%s:%d: Futhark error %d: %s\n", - file, line, res, futhark_context_get_error(ctx)); - exit(EXIT_FAILURE); - } -} - -#endif diff --git a/game_of_life/lib/github.com/diku-dk/lys/default.nix b/game_of_life/lib/github.com/diku-dk/lys/default.nix deleted file mode 100644 index f45fb1f91bdd28da811700f342f6e449b61d06a6..0000000000000000000000000000000000000000 --- a/game_of_life/lib/github.com/diku-dk/lys/default.nix +++ /dev/null @@ -1,5 +0,0 @@ -with import <nixpkgs> {}; -stdenv.mkDerivation { - name = "lys"; - buildInputs = [ pkgconfig SDL2 SDL2_ttf ocl-icd opencl-headers ]; -} diff --git a/game_of_life/lib/github.com/diku-dk/lys/gen_printf.py b/game_of_life/lib/github.com/diku-dk/lys/gen_printf.py deleted file mode 100644 index f1f448c031adab7c701377934c695542ace17b39..0000000000000000000000000000000000000000 --- a/game_of_life/lib/github.com/diku-dk/lys/gen_printf.py +++ /dev/null @@ -1,40 +0,0 @@ 
-#!/usr/bin/env python3 - -import sys -import re - -out_file, in_file = sys.argv[1:] - -with open(in_file) as f: - contents = f.read() - -start = contents.find('futhark_entry_text_content') -end = contents.find(')', start) -types = re.findall('([^ ]+) \*out\d+,', contents[start:end]) -out_vars = ['out{}'.format(i) for i in range(len(types))] - -with open(out_file, 'w') as f: - print('#include <stdio.h>', file=f) - print('#include "lib/github.com/diku-dk/lys/liblys.h"', file=f) - print('', file=f) - if len(types) == 0: - print('#define UNUSED(x) (void)(x)', file=f) - print('void build_text(const struct lys_context *ctx, char* dest, size_t dest_len, const char* format, float render_milliseconds, char* **sum_names) {', file=f) - if len(types) > 0: - for v, t in zip(out_vars, types): - print(' union {{ {} val; char* sum_name; }} {};'.format(t, v), file=f) - print(' FUT_CHECK(ctx->fut, futhark_entry_text_content(ctx->fut, {}, render_milliseconds, ctx->state));'.format(', '.join('&{}.val'.format(v) for v in out_vars)), file=f) - for v, i in zip(out_vars, range(len(out_vars))): - print(' if (sum_names[{}] != NULL) {{'.format(i), file=f) - print(' {v}.sum_name = sum_names[{i}][(int32_t) {v}.val];'.format(v=v, i=i), file=f) - print(' }', file=f) - print(' snprintf(dest, dest_len, format, {});'.format(', '.join((s + ('.sum_name' if t == 'int32_t' else '.val')) for s, t in zip(out_vars, types))), file=f) - else: - for x in ['ctx', 'render_milliseconds', 'sum_names']: - print('UNUSED({});'.format(x), file=f) - print(' snprintf(dest, dest_len, "%s", format);', file=f) - print('}', file=f) - print('', file=f) - print('size_t n_printf_arguments() {', file=f) - print(' return {};'.format(len(types)), file=f) - print('}', file=f) diff --git a/game_of_life/lib/github.com/diku-dk/lys/genlys.fut b/game_of_life/lib/github.com/diku-dk/lys/genlys.fut deleted file mode 100644 index e9264eccf42893840802894f2545e663b472893b..0000000000000000000000000000000000000000 --- 
a/game_of_life/lib/github.com/diku-dk/lys/genlys.fut +++ /dev/null @@ -1,41 +0,0 @@ --- | ignore - --- This file exists as a wrapper that defines entry points in the --- specific form that liblys.c requires. It is copied into place and --- modified by the rules in common.mk. - -module m = import "lys" - -type^ state = m.lys.state - -entry init (seed: u32) (h: i32) (w: i32): state = - m.lys.init seed (i64.i32 h) (i64.i32 w) - -entry grab_mouse: bool = - m.lys.grab_mouse - -entry resize (h: i32) (w: i32) (s: state): state = - m.lys.resize (i64.i32 h) (i64.i32 w) s - -entry key (e: i32) (key: i32) (s: state): state = - let e' = if e == 0 then #keydown {key} else #keyup {key} - in m.lys.event e' s - -entry mouse (buttons: i32) (x: i32) (y: i32) (s: state): state = - m.lys.event (#mouse {buttons, x, y}) s - -entry wheel (dx: i32) (dy: i32) (s: state): state = - m.lys.event (#wheel {dx, dy}) s - -entry step (td: f32) (s: state): state = - m.lys.event (#step td) s - -entry render (s: state) = m.lys.render s - -entry text_colour (s: state): u32 = - m.lys.text_colour s - -entry text_format: []u8 = m.lys.text_format () - -entry text_content (render_duration: f32) (s: state) = - m.lys.text_content render_duration s diff --git a/game_of_life/lib/github.com/diku-dk/lys/liblys.c b/game_of_life/lib/github.com/diku-dk/lys/liblys.c deleted file mode 100644 index e7d425206edc70a26e85e2cd9a04c3429b7c3b9e..0000000000000000000000000000000000000000 --- a/game_of_life/lib/github.com/diku-dk/lys/liblys.c +++ /dev/null @@ -1,269 +0,0 @@ -// Convenience framework for writing visualisations with Futhark and -// C/SDL. -// -// Based on initial SDL wrapper code by Jakob Stokholm Bertelsen. 
- -#include "liblys.h" - - -static void trigger_event(struct lys_context *ctx, enum lys_event event) { - ctx->event_handler(ctx, event); -} - -static void window_size_updated(struct lys_context *ctx, int newx, int newy) { - // https://stackoverflow.com/a/40122002 - ctx->wnd_surface = SDL_GetWindowSurface(ctx->wnd); - SDL_ASSERT(ctx->wnd_surface != NULL); - - ctx->width = newx; - ctx->height = newy; - - struct futhark_opaque_state *new_state; - FUT_CHECK(ctx->fut, futhark_entry_resize(ctx->fut, &new_state, ctx->height, ctx->width, ctx->state)); - futhark_free_opaque_state(ctx->fut, ctx->state); - ctx->state = new_state; - - ctx->wnd_surface = SDL_GetWindowSurface(ctx->wnd); - SDL_ASSERT(ctx->wnd_surface != NULL); - - if (ctx->data != NULL) { - free(ctx->data); - } - ctx->data = malloc(ctx->width * ctx->height * sizeof(uint32_t)); - assert(ctx->data != NULL); - - if (ctx->surface != NULL) { - SDL_FreeSurface(ctx->surface); - } - ctx->surface = SDL_CreateRGBSurfaceFrom(ctx->data, ctx->width, ctx->height, - 32, ctx->width * sizeof(uint32_t), 0xFF0000, 0xFF00, 0xFF, 0x00000000); - SDL_ASSERT(ctx->surface != NULL); - - trigger_event(ctx, LYS_WINDOW_SIZE_UPDATED); -} - -static void mouse_event(struct lys_context *ctx, Uint32 state, int x, int y) { - // We ignore mouse events if we are running a program that would - // like mouse grab, but where we have temporarily taken the mouse - // back from it (to e.g. resize the window). 
- if (ctx->grab_mouse != ctx->mouse_grabbed) { - return; - } - - struct futhark_opaque_state *new_state; - FUT_CHECK(ctx->fut, futhark_entry_mouse(ctx->fut, &new_state, state, x, y, ctx->state)); - futhark_free_opaque_state(ctx->fut, ctx->state); - ctx->state = new_state; -} - -static void wheel_event(struct lys_context *ctx, int x, int y) { - struct futhark_opaque_state *new_state; - FUT_CHECK(ctx->fut, futhark_entry_wheel(ctx->fut, &new_state, x, y, ctx->state)); - futhark_free_opaque_state(ctx->fut, ctx->state); - ctx->state = new_state; -} - -static void handle_sdl_events(struct lys_context *ctx) { - SDL_Event event; - - while (SDL_PollEvent(&event) == 1) { - switch (event.type) { - case SDL_WINDOWEVENT: - switch (event.window.event) { - case SDL_WINDOWEVENT_RESIZED: - { - int newx = (int)event.window.data1; - int newy = (int)event.window.data2; - window_size_updated(ctx, newx, newy); - break; - } - } - break; - case SDL_QUIT: - ctx->running = 0; - break; - case SDL_MOUSEMOTION: - if (ctx->grab_mouse) { - mouse_event(ctx, event.motion.state, event.motion.xrel, event.motion.yrel); - } else { - mouse_event(ctx, event.motion.state, event.motion.x, event.motion.y); - } - break; - case SDL_MOUSEBUTTONDOWN: - case SDL_MOUSEBUTTONUP: - if (ctx->grab_mouse && !ctx->mouse_grabbed) { - assert(SDL_SetRelativeMouseMode(1) == 0); - ctx->mouse_grabbed = 1; - } - - if (ctx->grab_mouse) { - mouse_event(ctx, 1<<(event.button.button-1), event.motion.xrel, event.motion.yrel); - } else { - mouse_event(ctx, 1<<(event.button.button-1), event.motion.x, event.motion.y); - } - break; - case SDL_MOUSEWHEEL: - wheel_event(ctx, event.wheel.x, event.wheel.y); - break; - case SDL_KEYDOWN: - case SDL_KEYUP: - switch (event.key.keysym.sym) { - case SDLK_ESCAPE: - if (ctx->grab_mouse && ctx->mouse_grabbed) { - assert(SDL_SetRelativeMouseMode(0) == 0); - ctx->mouse_grabbed = 0; - } else if (event.key.type == SDL_KEYDOWN) { - ctx->running = 0; - } - break; - case SDLK_F1: - if (event.key.type == 
SDL_KEYDOWN) { - trigger_event(ctx, LYS_F1); - } - break; - default: - { - struct futhark_opaque_state *new_state; - int e = event.key.type == SDL_KEYDOWN ? 0 : 1; - FUT_CHECK(ctx->fut, futhark_entry_key(ctx->fut, &new_state, - e, event.key.keysym.sym, ctx->state)); - futhark_free_opaque_state(ctx->fut, ctx->state); - ctx->state = new_state; - } - } - } - } -} - -static void sdl_loop(struct lys_context *ctx) { - struct futhark_u32_2d *out_arr; - - while (ctx->running) { - int64_t now = lys_wall_time(); - float delta = ((float)(now - ctx->last_time))/1000000.0; - ctx->fps = (ctx->fps*0.9 + (1/delta)*0.1); - ctx->last_time = now; - struct futhark_opaque_state *new_state; - FUT_CHECK(ctx->fut, futhark_entry_step(ctx->fut, &new_state, delta, ctx->state)); - futhark_free_opaque_state(ctx->fut, ctx->state); - ctx->state = new_state; - - FUT_CHECK(ctx->fut, futhark_entry_render(ctx->fut, &out_arr, ctx->state)); - FUT_CHECK(ctx->fut, futhark_values_u32_2d(ctx->fut, out_arr, ctx->data)); - FUT_CHECK(ctx->fut, futhark_free_u32_2d(ctx->fut, out_arr)); - - SDL_ASSERT(SDL_BlitSurface(ctx->surface, NULL, ctx->wnd_surface, NULL)==0); - - trigger_event(ctx, LYS_LOOP_ITERATION); - - SDL_ASSERT(SDL_UpdateWindowSurface(ctx->wnd) == 0); - - int delay = 1000.0/ctx->max_fps - delta*1000.0; - if (delay > 0) { - SDL_Delay(delay); - } - - handle_sdl_events(ctx); - } -} - -void lys_run_sdl(struct lys_context *ctx) { - struct futhark_context *fut = ctx->fut; - - ctx->last_time = lys_wall_time(); - - ctx->wnd = - SDL_CreateWindow("Lys", - SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, - ctx->width, ctx->height, - ctx->sdl_flags | - SDL_RENDERER_ACCELERATED | - SDL_RENDERER_PRESENTVSYNC); - SDL_ASSERT(ctx->wnd != NULL); - - window_size_updated(ctx, ctx->width, ctx->height); - - ctx->running = 1; - ctx->mouse_grabbed = 0; - - if (ctx->grab_mouse) { - assert(SDL_SetRelativeMouseMode(1) == 0); - ctx->mouse_grabbed = 1; - } - - trigger_event(ctx, LYS_LOOP_START); - - sdl_loop(ctx); - - 
FUT_CHECK(fut, futhark_free_opaque_state(fut, ctx->state)); - - trigger_event(ctx, LYS_LOOP_END); - - SDL_FreeSurface(ctx->surface); - // do not free wnd_surface (see SDL_GetWindowSurface) - SDL_DestroyWindow(ctx->wnd); - SDL_Quit(); -} - -void lys_setup(struct lys_context *ctx, int width, int height, int max_fps, int sdl_flags) { - memset(ctx, 0, sizeof(struct lys_context)); - ctx->width = width; - ctx->height = height; - ctx->fps = 0; - ctx->max_fps = max_fps; - ctx->sdl_flags = sdl_flags; - - SDL_ASSERT(SDL_Init(SDL_INIT_EVERYTHING) == 0); -} - -#ifdef LYS_TTF -void draw_text(struct lys_context *ctx, - TTF_Font *font, int font_size, - char* buffer, int32_t colour, - int y_start, int x_start) { - SDL_Surface *text_surface; - SDL_Rect offset_rect; - - SDL_Color sdl_colour = - { .a = (colour >> 24) & 0xff, - .r = (colour >> 16) & 0xff, - .g = (colour >> 8) & 0xff, - .b = colour & 0xff }; - - offset_rect.x = x_start; - int y = y_start; - while (true) { - char* buffer_start = buffer; - - bool no_more_text = false; - while (true) { - if (*buffer == '\n') { - *buffer = '\0'; - break; - } else if (*buffer == '\0') { - no_more_text = true; - break; - } - buffer++; - } - - if (*buffer_start != '\0') { - text_surface = TTF_RenderUTF8_Blended(font, buffer_start, sdl_colour); - SDL_ASSERT(text_surface != NULL); - offset_rect.y = y; - offset_rect.w = text_surface->w; - offset_rect.h = text_surface->h; - SDL_ASSERT(SDL_BlitSurface(text_surface, NULL, - ctx->wnd_surface, &offset_rect) == 0); - SDL_FreeSurface(text_surface); - } - - if (no_more_text) { - break; - } else { - buffer++; - y += font_size; - } - } -} -#endif diff --git a/game_of_life/lib/github.com/diku-dk/lys/liblys.h b/game_of_life/lib/github.com/diku-dk/lys/liblys.h deleted file mode 100644 index 4c0e7750a068fe3a5e5db2d138ac96d22dba1753..0000000000000000000000000000000000000000 --- a/game_of_life/lib/github.com/diku-dk/lys/liblys.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef LIBLYS_HEADER -#define LIBLYS_HEADER - 
-#include <stdio.h> -#include <stdlib.h> -#include <stdbool.h> -#include <assert.h> -#include <SDL2/SDL.h> -#include <SDL2/SDL_ttf.h> - -#include PROGHEADER - -#include "context_setup.h" - -enum lys_event { - LYS_LOOP_START, - LYS_LOOP_ITERATION, - LYS_LOOP_END, - LYS_WINDOW_SIZE_UPDATED, - LYS_F1 -}; - -struct lys_context { - struct futhark_context *fut; - struct futhark_opaque_state *state; - SDL_Window *wnd; - SDL_Surface *wnd_surface; - SDL_Surface *surface; - int width; - int height; - uint32_t *data; - int64_t last_time; - bool running; - bool grab_mouse; - bool mouse_grabbed; - float fps; - int max_fps; - int sdl_flags; - void* event_handler_data; - void (*event_handler)(struct lys_context*, enum lys_event); -}; - -#define SDL_ASSERT(x) _sdl_assert(x, __FILE__, __LINE__) -static inline void _sdl_assert(int res, const char *file, int line) { - if (res == 0) { - fprintf(stderr, "%s:%d: SDL error %d: %s\n", - file, line, res, SDL_GetError()); - exit(EXIT_FAILURE); - } -} - -void lys_setup(struct lys_context *ctx, int width, int height, int max_fps, int sdl_flags); - -void lys_run_sdl(struct lys_context *ctx); - -#ifdef LYS_TTF -void draw_text(struct lys_context *ctx, TTF_Font *font, int font_size, char* buffer, int32_t colour, - int x_start, int y_start); -#endif - -#endif diff --git a/game_of_life/lib/github.com/diku-dk/lys/lys.fut b/game_of_life/lib/github.com/diku-dk/lys/lys.fut deleted file mode 100644 index e1039d62bf658bfde2c9bd76372699cd9267727f..0000000000000000000000000000000000000000 --- a/game_of_life/lib/github.com/diku-dk/lys/lys.fut +++ /dev/null @@ -1,366 +0,0 @@ --- | Lights, camera, action! --- --- Making use of Lys requires hooking into (or duplicating) its custom --- Makefile rules, so you should also read the [usage --- section](https://github.com/diku-dk/lys/blob/master/README.md#general-usage) --- of the README. --- --- On the Futhark side, you need to define a module called `lys` that --- implements the module type `lys`@mtype. 
You can do this directly, --- or use some of the various conveniences defined in this file. For --- example, if you do not care about showing any text, you can use --- `lys_no_text`@mtype. - --- | For convenience, re-export the colour module. -open import "../../athas/matte/colour" - --- | UTF-8 encoded string. This is what is produced by string --- literals in Futhark code. -type string [n] = [n]u8 - --- | An event is sent when something has happened that might cause the --- state of the program to change, or just when some time has passed. --- It is permissible to ignore all of these events. Things that must --- not be ignored are separate functions in `lys`@mtype. --- --- * `#step x`: `x` seconds have passed since `init` or the last time --- this event was received. --- --- * `#keydown {key}`: `key` has pressed. --- --- * `#keyup {key}`: `key` has been released. --- --- * `#mouse {buttons, x, y}`: The mouse has been moved or clicked. --- `buttons` is a bit mask indicating which button(s) are held down, --- and the `x`/`y` the new position of the mouse. --- --- * `#wheel {dx, dy}`: The mouse wheel has been used. Note that there can --- be multiple wheels; this is why the `dy` direction also makes --- sense. In most cases, however, only the `dy` will be non-zero. -type event = #step f32 - | #keydown {key:i32} - | #keyup {key:i32} - | #mouse {buttons:i32, x:i32, y:i32} - | #wheel {dx:i32, dy:i32} - --- | The core subset of the module type of Lys applications. This is useful if --- you need a Lys application with custom initialisation or without text --- rendering. -module type lys_core = { - -- | The state maintained by this Lys application. Most functions - -- will take the current state and return a new state. - type~ state - - -- | An event occured. It is permissible to ignore any of these - -- events by returning the same state unchanged. - val event : event -> state -> state - - -- | The window was resized. 
- val resize : (h: i64) -> (w: i64) -> state -> state - - -- | The function for rendering a screen image in row-major order - -- (height by width). The size of the array returned must match the - -- last dimensions provided to the state (via `init`@term or - -- `resize`@term). - val render : state -> [][]argb.colour -} - --- | The module type of Lys applications. If you define a module --- called `lys` that has this module type, then the autogenerated Lys --- wrapper application can automatically define the entry point --- functions that allows Lys to communicate with the C program that --- actually implements the user interaction. -module type lys = { - include lys_core - - -- | Initial state for a given window size. A random seed is passed - -- in. Don't treat this as a true random number (it's currently - -- just a timestamp), but use it for initialising a proper RNG. - val init : (seed: u32) -> (h: i64) -> (w: i64) -> state - - -- | If true, the program will grab the mouse, and all positions - -- reported via the `mouse`@term function will be relative to the - -- last time `mouse`@term was called. If in doubt, leave this - -- `false`. - val grab_mouse : bool - - -- | Show helpful text in the upper-left corner. Specify in printf format - -- with extensions: '%[string1|string2|...]' prints a string but takes an - -- index into the given list of strings, separated by '|'. For example, - -- '%[circle|square]' prints 'circle' if passed the i32 value 0, and 'square' - -- if passed 1. - - val text_format : () -> string [] - -- | The content must be a scalar or a tuple of scalars. - type text_content - val text_content : (fps: f32) -> state -> text_content - -- | The colour can vary based on the state. - val text_colour : state -> argb.colour -} - --- | A module type for the simple case where we don't want any text. --- You can define the `lys` module to have this module type instead of --- `lys`@mtype. 
For maximal convenience, you can `open` --- `lys_no_text`@module inside your module definition. -module type lys_no_text = lys with text_content = () - --- | A convenience module that can be `open`ed to give dummy --- definitions for the text-related functionality. -module lys_no_text = { - let text_format () = "" - type text_content = () - let text_content _ _ = () - let text_colour _ = argb.black -} - --- | A dummy lys module that just produces a black rectangle and does --- nothing in response to events. -module lys: lys_no_text = { - type state = {h: i64, w: i64} - let init _ h w = {h,w} - let event _ s = s - let resize h w _ = {h,w} - let grab_mouse = false - let render {h,w} = replicate w argb.black |> replicate h - open lys_no_text -} - --- The following values are taken from --- https://wiki.libsdl.org/SDLKeycodeLookup - -let SDLK_UNKNOWN: i32 = 0x00 -let SDLK_BACKSPACE: i32 = 0x08 -let SDLK_TAB: i32 = 0x09 -let SDLK_RETURN: i32 = 0x0D -let SDLK_ESCAPE: i32 = 0x1B -let SDLK_SPACE: i32 = 0x20 -let SDLK_EXCLAIM: i32 = 0x21 -let SDLK_QUOTEDBL: i32 = 0x22 -let SDLK_HASH: i32 = 0x23 -let SDLK_DOLLAR: i32 = 0x24 -let SDLK_PERCENT: i32 = 0x25 -let SDLK_AMPERSAND: i32 = 0x26 -let SDLK_QUOTE: i32 = 0x27 -let SDLK_LEFTPAREN: i32 = 0x28 -let SDLK_RIGHTPAREN: i32 = 0x29 -let SDLK_ASTERISK: i32 = 0x2A -let SDLK_PLUS: i32 = 0x2B -let SDLK_COMMA: i32 = 0x2C -let SDLK_MINUS: i32 = 0x2D -let SDLK_PERIOD: i32 = 0x2E -let SDLK_SLASH: i32 = 0x2F -let SDLK_0: i32 = 0x30 -let SDLK_1: i32 = 0x31 -let SDLK_2: i32 = 0x32 -let SDLK_3: i32 = 0x33 -let SDLK_4: i32 = 0x34 -let SDLK_5: i32 = 0x35 -let SDLK_6: i32 = 0x36 -let SDLK_7: i32 = 0x37 -let SDLK_8: i32 = 0x38 -let SDLK_9: i32 = 0x39 -let SDLK_COLON: i32 = 0x3A -let SDLK_SEMICOLON: i32 = 0x3B -let SDLK_LESS: i32 = 0x3C -let SDLK_EQUALS: i32 = 0x3D -let SDLK_GREATER: i32 = 0x3E -let SDLK_QUESTION: i32 = 0x3F -let SDLK_AT: i32 = 0x40 -let SDLK_LEFTBRACKET: i32 = 0x5B -let SDLK_BACKSLASH: i32 = 0x5C -let SDLK_RIGHTBRACKET: i32 = 
0x5D -let SDLK_CARET: i32 = 0x5E -let SDLK_UNDERSCORE: i32 = 0x5F -let SDLK_BACKQUOTE: i32 = 0x60 -let SDLK_a: i32 = 0x61 -let SDLK_b: i32 = 0x62 -let SDLK_c: i32 = 0x63 -let SDLK_d: i32 = 0x64 -let SDLK_e: i32 = 0x65 -let SDLK_f: i32 = 0x66 -let SDLK_g: i32 = 0x67 -let SDLK_h: i32 = 0x68 -let SDLK_i: i32 = 0x69 -let SDLK_j: i32 = 0x6A -let SDLK_k: i32 = 0x6B -let SDLK_l: i32 = 0x6C -let SDLK_m: i32 = 0x6D -let SDLK_n: i32 = 0x6E -let SDLK_o: i32 = 0x6F -let SDLK_p: i32 = 0x70 -let SDLK_q: i32 = 0x71 -let SDLK_r: i32 = 0x72 -let SDLK_s: i32 = 0x73 -let SDLK_t: i32 = 0x74 -let SDLK_u: i32 = 0x75 -let SDLK_v: i32 = 0x76 -let SDLK_w: i32 = 0x77 -let SDLK_x: i32 = 0x78 -let SDLK_y: i32 = 0x79 -let SDLK_z: i32 = 0x7A -let SDLK_DELETE: i32 = 0x7F -let SDLK_CAPSLOCK: i32 = 0x40000039 -let SDLK_F1: i32 = 0x4000003A -let SDLK_F2: i32 = 0x4000003B -let SDLK_F3: i32 = 0x4000003C -let SDLK_F4: i32 = 0x4000003D -let SDLK_F5: i32 = 0x4000003E -let SDLK_F6: i32 = 0x4000003F -let SDLK_F7: i32 = 0x40000040 -let SDLK_F8: i32 = 0x40000041 -let SDLK_F9: i32 = 0x40000042 -let SDLK_F10: i32 = 0x40000043 -let SDLK_F11: i32 = 0x40000044 -let SDLK_F12: i32 = 0x40000045 -let SDLK_PRINTSCREEN: i32 = 0x40000046 -let SDLK_SCROLLLOCK: i32 = 0x40000047 -let SDLK_PAUSE: i32 = 0x40000048 -let SDLK_INSERT: i32 = 0x40000049 -let SDLK_HOME: i32 = 0x4000004A -let SDLK_PAGEUP: i32 = 0x4000004B -let SDLK_END: i32 = 0x4000004D -let SDLK_PAGEDOWN: i32 = 0x4000004E -let SDLK_RIGHT: i32 = 0x4000004F -let SDLK_LEFT: i32 = 0x40000050 -let SDLK_DOWN: i32 = 0x40000051 -let SDLK_UP: i32 = 0x40000052 -let SDLK_NUMLOCKCLEAR: i32 = 0x40000053 -let SDLK_KP_DIVIDE: i32 = 0x40000054 -let SDLK_KP_MULTIPLY: i32 = 0x40000055 -let SDLK_KP_MINUS: i32 = 0x40000056 -let SDLK_KP_PLUS: i32 = 0x40000057 -let SDLK_KP_ENTER: i32 = 0x40000058 -let SDLK_KP_1: i32 = 0x40000059 -let SDLK_KP_2: i32 = 0x4000005A -let SDLK_KP_3: i32 = 0x4000005B -let SDLK_KP_4: i32 = 0x4000005C -let SDLK_KP_5: i32 = 0x4000005D -let SDLK_KP_6: i32 = 
0x4000005E -let SDLK_KP_7: i32 = 0x4000005F -let SDLK_KP_8: i32 = 0x40000060 -let SDLK_KP_9: i32 = 0x40000061 -let SDLK_KP_0: i32 = 0x40000062 -let SDLK_KP_PERIOD: i32 = 0x40000063 -let SDLK_APPLICATION: i32 = 0x40000065 -let SDLK_POWER: i32 = 0x40000066 -let SDLK_KP_EQUALS: i32 = 0x40000067 -let SDLK_F13: i32 = 0x40000068 -let SDLK_F14: i32 = 0x40000069 -let SDLK_F15: i32 = 0x4000006A -let SDLK_F16: i32 = 0x4000006B -let SDLK_F17: i32 = 0x4000006C -let SDLK_F18: i32 = 0x4000006D -let SDLK_F19: i32 = 0x4000006E -let SDLK_F20: i32 = 0x4000006F -let SDLK_F21: i32 = 0x40000070 -let SDLK_F22: i32 = 0x40000071 -let SDLK_F23: i32 = 0x40000072 -let SDLK_F24: i32 = 0x40000073 -let SDLK_EXECUTE: i32 = 0x40000074 -let SDLK_HELP: i32 = 0x40000075 -let SDLK_MENU: i32 = 0x40000076 -let SDLK_SELECT: i32 = 0x40000077 -let SDLK_STOP: i32 = 0x40000078 -let SDLK_AGAIN: i32 = 0x40000079 -let SDLK_UNDO: i32 = 0x4000007A -let SDLK_CUT: i32 = 0x4000007B -let SDLK_COPY: i32 = 0x4000007C -let SDLK_PASTE: i32 = 0x4000007D -let SDLK_FIND: i32 = 0x4000007E -let SDLK_MUTE: i32 = 0x4000007F -let SDLK_VOLUMEUP: i32 = 0x40000080 -let SDLK_VOLUMEDOWN: i32 = 0x40000081 -let SDLK_KP_COMMA: i32 = 0x40000085 -let SDLK_KP_EQUALSAS400: i32 = 0x40000086 -let SDLK_ALTERASE: i32 = 0x40000099 -let SDLK_SYSREQ: i32 = 0x4000009A -let SDLK_CANCEL: i32 = 0x4000009B -let SDLK_CLEAR: i32 = 0x4000009C -let SDLK_PRIOR: i32 = 0x4000009D -let SDLK_RETURN2: i32 = 0x4000009E -let SDLK_SEPARATOR: i32 = 0x4000009F -let SDLK_OUT: i32 = 0x400000A0 -let SDLK_OPER: i32 = 0x400000A1 -let SDLK_CLEARAGAIN: i32 = 0x400000A2 -let SDLK_CRSEL: i32 = 0x400000A3 -let SDLK_EXSEL: i32 = 0x400000A4 -let SDLK_KP_00: i32 = 0x400000B0 -let SDLK_KP_000: i32 = 0x400000B1 -let SDLK_THOUSANDSSEPARATOR: i32 = 0x400000B2 -let SDLK_DECIMALSEPARATOR: i32 = 0x400000B3 -let SDLK_CURRENCYUNIT: i32 = 0x400000B4 -let SDLK_CURRENCYSUBUNIT: i32 = 0x400000B5 -let SDLK_KP_LEFTPAREN: i32 = 0x400000B6 -let SDLK_KP_RIGHTPAREN: i32 = 0x400000B7 -let 
SDLK_KP_LEFTBRACE: i32 = 0x400000B8 -let SDLK_KP_RIGHTBRACE: i32 = 0x400000B9 -let SDLK_KP_TAB: i32 = 0x400000BA -let SDLK_KP_BACKSPACE: i32 = 0x400000BB -let SDLK_KP_A: i32 = 0x400000BC -let SDLK_KP_B: i32 = 0x400000BD -let SDLK_KP_C: i32 = 0x400000BE -let SDLK_KP_D: i32 = 0x400000BF -let SDLK_KP_E: i32 = 0x400000C0 -let SDLK_KP_F: i32 = 0x400000C1 -let SDLK_KP_XOR: i32 = 0x400000C2 -let SDLK_KP_POWER: i32 = 0x400000C3 -let SDLK_KP_PERCENT: i32 = 0x400000C4 -let SDLK_KP_LESS: i32 = 0x400000C5 -let SDLK_KP_GREATER: i32 = 0x400000C6 -let SDLK_KP_AMPERSAND: i32 = 0x400000C7 -let SDLK_KP_DBLAMPERSAND: i32 = 0x400000C8 -let SDLK_KP_VERTICALBAR: i32 = 0x400000C9 -let SDLK_KP_DBLVERTICALBAR: i32 = 0x400000CA -let SDLK_KP_COLON: i32 = 0x400000CB -let SDLK_KP_HASH: i32 = 0x400000CC -let SDLK_KP_SPACE: i32 = 0x400000CD -let SDLK_KP_AT: i32 = 0x400000CE -let SDLK_KP_EXCLAM: i32 = 0x400000CF -let SDLK_KP_MEMSTORE: i32 = 0x400000D0 -let SDLK_KP_MEMRECALL: i32 = 0x400000D1 -let SDLK_KP_MEMCLEAR: i32 = 0x400000D2 -let SDLK_KP_MEMADD: i32 = 0x400000D3 -let SDLK_KP_MEMSUBTRACT: i32 = 0x400000D4 -let SDLK_KP_MEMMULTIPLY: i32 = 0x400000D5 -let SDLK_KP_MEMDIVIDE: i32 = 0x400000D6 -let SDLK_KP_PLUSMINUS: i32 = 0x400000D7 -let SDLK_KP_CLEAR: i32 = 0x400000D8 -let SDLK_KP_CLEARENTRY: i32 = 0x400000D9 -let SDLK_KP_BINARY: i32 = 0x400000DA -let SDLK_KP_OCTAL: i32 = 0x400000DB -let SDLK_KP_DECIMAL: i32 = 0x400000DC -let SDLK_KP_HEXADECIMAL: i32 = 0x400000DD -let SDLK_LCTRL: i32 = 0x400000E0 -let SDLK_LSHIFT: i32 = 0x400000E1 -let SDLK_LALT: i32 = 0x400000E2 -let SDLK_LGUI: i32 = 0x400000E3 -let SDLK_RCTRL: i32 = 0x400000E4 -let SDLK_RSHIFT: i32 = 0x400000E5 -let SDLK_RALT: i32 = 0x400000E6 -let SDLK_RGUI: i32 = 0x400000E7 -let SDLK_MODE: i32 = 0x40000101 -let SDLK_AUDIONEXT: i32 = 0x40000102 -let SDLK_AUDIOPREV: i32 = 0x40000103 -let SDLK_AUDIOSTOP: i32 = 0x40000104 -let SDLK_AUDIOPLAY: i32 = 0x40000105 -let SDLK_AUDIOMUTE: i32 = 0x40000106 -let SDLK_MEDIASELECT: i32 = 0x40000107 -let 
SDLK_WWW: i32 = 0x40000108 -let SDLK_MAIL: i32 = 0x40000109 -let SDLK_CALCULATOR: i32 = 0x4000010A -let SDLK_COMPUTER: i32 = 0x4000010B -let SDLK_AC_SEARCH: i32 = 0x4000010C -let SDLK_AC_HOME: i32 = 0x4000010D -let SDLK_AC_BACK: i32 = 0x4000010E -let SDLK_AC_FORWARD: i32 = 0x4000010F -let SDLK_AC_STOP: i32 = 0x40000110 -let SDLK_AC_REFRESH: i32 = 0x40000111 -let SDLK_AC_BOOKMARKS: i32 = 0x40000112 -let SDLK_BRIGHTNESSDOWN: i32 = 0x40000113 -let SDLK_BRIGHTNESSUP: i32 = 0x40000114 -let SDLK_DISPLAYSWITCH: i32 = 0x40000115 -let SDLK_KBDILLUMTOGGLE: i32 = 0x40000116 -let SDLK_KBDILLUMDOWN: i32 = 0x40000117 -let SDLK_KBDILLUMUP: i32 = 0x40000118 -let SDLK_EJECT: i32 = 0x40000119 -let SDLK_SLEEP: i32 = 0x4000011A diff --git a/game_of_life/lib/github.com/diku-dk/lys/main.c b/game_of_life/lib/github.com/diku-dk/lys/main.c deleted file mode 100644 index 2c24d1fce5124f45245379eda1a9c1a6074031ef..0000000000000000000000000000000000000000 --- a/game_of_life/lib/github.com/diku-dk/lys/main.c +++ /dev/null @@ -1,355 +0,0 @@ -#include "liblys.h" -#include PRINTFHEADER - -#define _XOPEN_SOURCE -#include <unistd.h> -#include <getopt.h> - -#define INITIAL_WIDTH 800 -#define INITIAL_HEIGHT 600 - -struct lys_text { - TTF_Font *font; - char* font_path; - int font_size; - char* text_format; - char* text_buffer; - size_t text_buffer_len; - bool show_text; - char* **sum_names; -}; - -void loop_start(struct lys_context *ctx, struct lys_text *text) { - struct futhark_u8_1d *text_format_array; - FUT_CHECK(ctx->fut, futhark_entry_text_format(ctx->fut, &text_format_array)); - size_t text_format_len = futhark_shape_u8_1d(ctx->fut, text_format_array)[0]; - text->text_format = malloc(sizeof(char) * (text_format_len + 1)); - assert(text->text_format != NULL); - FUT_CHECK(ctx->fut, futhark_values_u8_1d(ctx->fut, text_format_array, (unsigned char*) text->text_format)); - FUT_CHECK(ctx->fut, futhark_context_sync(ctx->fut)); - text->text_format[text_format_len] = '\0'; - FUT_CHECK(ctx->fut, 
futhark_free_u8_1d(ctx->fut, text_format_array)); - - text->sum_names = (char* **) malloc(sizeof(char* *) * n_printf_arguments()); - assert(text->sum_names != NULL); - - text->text_buffer_len = text_format_len; - size_t i_arg = -1; - for (size_t i = 0; i < text_format_len; i++) { - if (text->text_format[i] == '%' && - i + 1 < text_format_len && text->text_format[i + 1] != '%') { - i_arg++; - if (text->text_format[i + 1] == '[') { - text->text_format[i + 1] = 's'; - size_t end_pos; - size_t n_choices = 1; - bool found_end = false; - for (end_pos = i + 2; end_pos < text_format_len; end_pos++) { - if (text->text_format[end_pos] == '|') { - n_choices++; - } else if (text->text_format[end_pos] == ']') { - found_end = true; - break; - } - } - assert(found_end); - text->sum_names[i_arg] = (char* *) malloc(sizeof(char*) * (n_choices + 1)); - assert(text->sum_names[i_arg] != NULL); - text->sum_names[i_arg][n_choices] = NULL; - char* temp_choice = (char*) malloc(sizeof(char) * (end_pos - i - n_choices)); - assert(temp_choice != NULL); - size_t choice_cur = 0; - size_t i_choice = 0; - for (size_t j = i + 2; j < end_pos + 1; j++) { - if (text->text_format[j] == '|' || text->text_format[j] == ']') { - temp_choice[choice_cur] = '\0'; - text->sum_names[i_arg][i_choice] = (char*) malloc(sizeof(char) * (choice_cur + 1)); - assert(text->sum_names[i_arg][i_choice] != NULL); - strncpy(text->sum_names[i_arg][i_choice], temp_choice, choice_cur + 1); - choice_cur = 0; - i_choice++; - } else { - temp_choice[choice_cur] = text->text_format[j]; - choice_cur++; - } - } - free(temp_choice); - size_t shift_left = end_pos - i - 1; - for (size_t j = end_pos + 1; j < text_format_len; j++) { - text->text_format[j - shift_left] = text->text_format[j]; - } - text_format_len -= shift_left; - text->text_format[text_format_len] = '\0'; - i++; - } else { - text->sum_names[i_arg] = NULL; - text->text_buffer_len += 20; // estimate - } - } - } - - text->text_buffer = malloc(sizeof(char) * 
text->text_buffer_len); - assert(text->text_buffer != NULL); - text->text_buffer[0] = '\0'; - - text->show_text = true; -} - -void loop_iteration(struct lys_context *ctx, struct lys_text *text) { - if (!text->show_text) { - return; - } - - build_text(ctx, text->text_buffer, text->text_buffer_len, text->text_format, - ctx->fps, text->sum_names); - if (*(text->text_buffer) != '\0') { - int32_t text_colour; - FUT_CHECK(ctx->fut, - futhark_entry_text_colour(ctx->fut, (uint32_t*) &text_colour, - ctx->state)); - draw_text(ctx, text->font, text->font_size, text->text_buffer, text_colour, 10, 10); - } -} - -void loop_end(struct lys_text *text) { - free(text->text_format); - free(text->text_buffer); - - for (size_t i = 0; i < n_printf_arguments(); i++) { - if (text->sum_names[i] != NULL) { - size_t j = 0; - while (text->sum_names[i][j] != NULL) { - free(text->sum_names[i][j]); - j++; - } - free(text->sum_names[i]); - } - } - free(text->sum_names); -} - -int font_size_from_dimensions(int width, int height) { - int size, font_size; - if (height < width) { - size = height; - } else { - size = width; - } - font_size = size / 45; - if (font_size < 14) { - font_size = 14; - } else if (font_size > 32) { - font_size = 32; - } - return font_size; -} - -void window_size_updated(struct lys_context *ctx, struct lys_text *text) { - text->font_size = font_size_from_dimensions(ctx->width, ctx->height); - TTF_CloseFont(text->font); - text->font = TTF_OpenFont(text->font_path, text->font_size); - SDL_ASSERT(text->font != NULL); -} - -void f1(struct lys_text *text) { - text->show_text = !text->show_text; -} - -void handle_event(struct lys_context *ctx, enum lys_event event) { - struct lys_text *text = (struct lys_text *) ctx->event_handler_data; - switch (event) { - case LYS_LOOP_START: - loop_start(ctx, text); - break; - case LYS_LOOP_ITERATION: - loop_iteration(ctx, text); - break; - case LYS_LOOP_END: - loop_end(text); - break; - case LYS_WINDOW_SIZE_UPDATED: - window_size_updated(ctx, 
text); - break; - case LYS_F1: - f1(text); - } -} - -void do_bench(struct futhark_context *fut, int height, int width, int n, const char *operation) { - struct futhark_opaque_state *state; - int64_t start, end; - FUT_CHECK(fut, futhark_entry_init(fut, &state, (int32_t) lys_wall_time(), height, width)); - futhark_context_sync(fut); - bool do_step = false, do_render = false; - - if (strstr(operation, "step") != NULL) { - do_step = true; - } - - if (strstr(operation, "render") != NULL) { - do_render = true; - } - - start = lys_wall_time(); - for (int i = 0; i < n; i++) { - if (do_step) { - struct futhark_opaque_state *new_state; - FUT_CHECK(fut, futhark_entry_step(fut, &new_state, 1.0/n, state)); - futhark_free_opaque_state(fut, state); - state = new_state; - } - if (do_render) { - struct futhark_u32_2d *out_arr; - FUT_CHECK(fut, futhark_entry_render(fut, &out_arr, state)); - FUT_CHECK(fut, futhark_free_u32_2d(fut, out_arr)); - } - } - futhark_context_sync(fut); - end = lys_wall_time(); - - printf("Rendered %d frames in %fs (%f FPS)\n", - n, ((double)end-start)/1000000, - n / (((double)end-start)/1000000)); - - FUT_CHECK(fut, futhark_free_opaque_state(fut, state)); -} - -void usage(char **argv) { - printf("Usage: %s options...\n", argv[0]); - puts("Options:"); - puts(" -? 
Print this help and exit."); - puts(" -w INT Set the initial width of the window."); - puts(" -h INT Set the initial height of the window."); - puts(" -R Disallow resizing the window."); - puts(" -d DEV Set the computation device."); - puts(" -r INT Maximum frames per second."); - puts(" -i Select execution device interactively."); - puts(" -b <render|step> Benchmark program."); -} - -int main(int argc, char** argv) { - int width = INITIAL_WIDTH, height = INITIAL_HEIGHT, max_fps = 60; - bool allow_resize = true; - char *deviceopt = NULL; - bool device_interactive = false; - char *benchopt = NULL; - - int c; - while ( (c = getopt(argc, argv, "w:h:r:Rd:b:i")) != -1) { - switch (c) { - case 'w': - width = atoi(optarg); - if (width <= 0) { - fprintf(stderr, "'%s' is not a valid width.\n", optarg); - exit(EXIT_FAILURE); - } - break; - case 'h': - height = atoi(optarg); - if (height <= 0) { - fprintf(stderr, "'%s' is not a valid width.\n", optarg); - exit(EXIT_FAILURE); - } - break; - case 'r': - max_fps = atoi(optarg); - if (max_fps <= 0) { - fprintf(stderr, "'%s' is not a valid framerate.\n", optarg); - exit(EXIT_FAILURE); - } - break; - case 'R': - allow_resize = false; - break; - case 'd': - deviceopt = optarg; - break; - case 'i': - device_interactive = true; - break; - case 'b': - if (strcmp(optarg, "render") == 0 || - strcmp(optarg, "step") == 0) { - benchopt = optarg; - } else { - fprintf(stderr, "Use -b <render|step>\n"); - return EXIT_FAILURE; - } - break; - case '?': - usage(argv); - return EXIT_SUCCESS; - default: - fprintf(stderr, "unknown option: %c\n", c); - usage(argv); - return EXIT_FAILURE; - } - } - - if (optind < argc) { - fprintf(stderr, "Excess non-options: "); - while (optind < argc) - fprintf(stderr, "%s ", argv[optind++]); - fprintf(stderr, "\n"); - exit(EXIT_FAILURE); - } - - char font_path_rel[] = "/lib/github.com/diku-dk/lys/Inconsolata-Regular.ttf"; - char* font_path = malloc(sizeof(char) * strlen(argv[0]) + sizeof(font_path_rel)); - 
assert(font_path != NULL); - strcpy(font_path, argv[0]); - char *last_dash = strrchr(font_path, '/'); - if (last_dash != NULL) { - *last_dash = '\0'; - } - strcat(font_path, font_path_rel); - - int sdl_flags = 0; - if (allow_resize) { - sdl_flags |= SDL_WINDOW_RESIZABLE; - } - - struct lys_context ctx; - struct futhark_context_config *futcfg; - lys_setup(&ctx, width, height, max_fps, sdl_flags); - - char* opencl_device_name = NULL; - lys_setup_futhark_context(deviceopt, device_interactive, - &futcfg, &ctx.fut, &opencl_device_name); - if (opencl_device_name != NULL) { - printf("Using OpenCL device: %s\n", opencl_device_name); - printf("Use -d or -i to change this.\n"); - free(opencl_device_name); - } - - FUT_CHECK(ctx.fut, futhark_entry_grab_mouse(ctx.fut, &ctx.grab_mouse)); - - struct lys_text text; - ctx.event_handler_data = (void*) &text; - ctx.event_handler = handle_event; - - SDL_ASSERT(TTF_Init() == 0); - - text.font_path = font_path; - text.font_size = font_size_from_dimensions(ctx.width, ctx.height); - text.font = TTF_OpenFont(text.font_path, text.font_size); - SDL_ASSERT(text.font != NULL); - - if (benchopt != NULL) { - do_bench(ctx.fut, height, width, max_fps, benchopt); - } else { - int32_t seed = (int32_t) lys_wall_time(); - futhark_entry_init(ctx.fut, &ctx.state, - seed, ctx.height, ctx.width); - lys_run_sdl(&ctx); - free(ctx.data); - } - - TTF_CloseFont(text.font); - free(font_path); - - futhark_context_free(ctx.fut); - futhark_context_config_free(futcfg); - - return EXIT_SUCCESS; -} diff --git a/game_of_life/lib/github.com/diku-dk/lys/setup_flags.mk b/game_of_life/lib/github.com/diku-dk/lys/setup_flags.mk deleted file mode 100644 index 872f590661e3e7b1eacdffd1e113db758dee934f..0000000000000000000000000000000000000000 --- a/game_of_life/lib/github.com/diku-dk/lys/setup_flags.mk +++ /dev/null @@ -1,43 +0,0 @@ -LYS_BACKEND?=opencl -LYS_TTF?=0 - -ifeq ($(origin PROG_FUT_DEPS), undefined) -PROG_FUT_DEPS:=$(shell ls *.fut; find lib -name \*.fut) -endif - 
-PKG_CFLAGS_PKGS=sdl2 -ifeq ($(LYS_TTF),1) -PKG_CFLAGS_PKGS+= SDL2_ttf -endif - -PKG_CFLAGS=$(shell pkg-config --cflags $(PKG_CFLAGS_PKGS)) - -BASE_LDFLAGS=-lm -lSDL2 -ifeq ($(LYS_TTF),1) -BASE_LDFLAGS+= -lSDL2_ttf -endif - -NOWARN_CFLAGS=-std=c11 -O - -CFLAGS?=$(NOWARN_CFLAGS) $(PKG_CFLAGS) -Wall -Wextra -pedantic -ifeq ($(LYS_TTF),1) -CFLAGS+= -DLYS_TTF -endif - -ifeq ($(LYS_BACKEND),opencl) -OS=$(shell uname -s) -ifeq ($(OS),Darwin) -DEVICE_LDFLAGS=-framework OpenCL -else -DEVICE_LDFLAGS=-lOpenCL -endif -else ifeq ($(LYS_BACKEND),cuda) -DEVICE_LDFLAGS=-lcuda -lnvrtc -else ifeq ($(LYS_BACKEND),c) -DEVICE_LDFLAGS= -else ifeq ($(LYS_BACKEND),multicore) -DEVICE_LDFLAGS=-lpthread -else -$(error Unknown LYS_BACKEND: $(LYS_BACKEND). Must be 'opencl', 'cuda', 'multicore', or 'c') -endif -LDFLAGS?=$(BASE_LDFLAGS) $(DEVICE_LDFLAGS) diff --git a/game_of_life/libfpmpi.a b/game_of_life/libfpmpi.a deleted file mode 100644 index d1781ab3a3db4e7f94fc91be679cf5dafb3ae95e..0000000000000000000000000000000000000000 Binary files a/game_of_life/libfpmpi.a and /dev/null differ diff --git a/game_of_life/main.c b/game_of_life/main.c deleted file mode 100644 index 5fc18443fd5dd78c5516635fe8a54926f83030ae..0000000000000000000000000000000000000000 --- a/game_of_life/main.c +++ /dev/null @@ -1,153 +0,0 @@ -#include <stdio.h> -#include <stdlib.h> -#include <mpi.h> -#include "../lib/fpmpi.h" -#include "../lib/fp.h" -#include "gol.h" -#include "lib/github.com/diku-dk/lys/liblys.h" - -#define NB_ROWS (800) -#define NB_COLUMNS (800) -#define BOARD_SIZE (NB_ROWS * NB_COLUMNS) -#define NB_NEIGHBOURS 8 - -#define MAX_FPS (60) - -int8_t board[NB_ROWS][NB_COLUMNS] = {0}; -int my_rank; - -typedef struct tuple2 { - int8_t cell; - int8_t neighbours[NB_NEIGHBOURS]; -} tuple2_t; - -void init_board() { - for (int y = 0; y < NB_ROWS; ++y) { - for (int x = 0; x < NB_COLUMNS; ++x) { - board[y][x] = rand() % 2; - } - } -} - -void *get_neighbours(void *index) { - int cell_x = *(int *) index % NB_ROWS; - int 
cell_y = *(int *) index / NB_COLUMNS; - int8_t *neighbours = calloc(8, sizeof(int8_t)); - int i = 0; - for (int y = -1; y <= 1; ++y) { - for (int x = -1; x <= 1; ++x) { - if (y == 0 && x == 0) continue; - int neigh_y = cell_y + y; - if (neigh_y < 0) { - neigh_y = NB_ROWS - 1; - } else if (neigh_y >= NB_ROWS) { - neigh_y = 0; - } - int neigh_x = cell_x + x; - if (neigh_x < 0) { - neigh_x = NB_COLUMNS - 1; - } else if (neigh_x >= NB_COLUMNS) { - neigh_x = 0; - } - neighbours[i++] = board[neigh_y][neigh_x]; - } - } - return neighbours; -} - -void fold_sum(void *acc, void *neighbour) { - int8_t *acc8 = acc; - int8_t *neighbour8 = neighbour; - *acc8 += *neighbour8; -} - -void *next_state(void *element) { - tuple2_t *tuple2 = (tuple2_t *) element; - int8_t initial_value = 0; - int8_t *nb_cells_alive = local_fold_left(tuple2->neighbours, 8, FPMPI_INT8, FPMPI_INT8, fold_sum, &initial_value); - int8_t *next_state = calloc(1, sizeof(int8_t)); - *next_state = (tuple2->cell == 1 && (*nb_cells_alive == 2 || *nb_cells_alive == 3)) || - (tuple2->cell == 0 && *nb_cells_alive == 3); - return next_state; -} - -void *zip_cell_neigh(void *cell, void *neighs) { - tuple2_t *tuple2 = calloc(1, sizeof(tuple2_t)); - tuple2->cell = *(int8_t *) cell; - memcpy(tuple2->neighbours, neighs, NB_NEIGHBOURS * sizeof(int8_t)); - return tuple2; -} - -void handle_event(struct lys_context *ctx, enum lys_event event) { - MPI_Bcast(&board[0][0], BOARD_SIZE, MPI_INT8_T, FPMPI_ROOT_RANK, MPI_COMM_WORLD); - - fpmpi_result_t indexes = iota(BOARD_SIZE, MPI_COMM_WORLD); -// printf("Indexes OK: %d\n", indexes.count); - - fpmpi_result_t neighbours = map(indexes.content, BOARD_SIZE, FPMPI_INT32, NB_NEIGHBOURS * FPMPI_INT8, - get_neighbours, - MPI_COMM_WORLD); -// printf("Neighbours OK: %d\n", neighbours.count); - fpmpi_result_t board_with_neighbours = zip(&board[0][0], neighbours.content, BOARD_SIZE, FPMPI_INT8, 8 * FPMPI_INT8, - sizeof(tuple2_t), zip_cell_neigh, MPI_COMM_WORLD); -// printf("Board with Neigh 
OK\n"); - fpmpi_result_t new_board = map(board_with_neighbours.content, BOARD_SIZE, sizeof(tuple2_t), FPMPI_INT8, next_state, - MPI_COMM_WORLD); -// printf("New Board OK\n"); - - if (my_rank == FPMPI_ROOT_RANK) { - memcpy(&board[0][0], new_board.content, BOARD_SIZE); - struct futhark_i8_1d *fut_new_board = futhark_new_i8_1d(ctx->fut, &board[0][0], BOARD_SIZE); - futhark_entry_init(ctx->fut, &ctx->state, fut_new_board, NB_ROWS, NB_COLUMNS, BOARD_SIZE); - free(indexes.content); - free(neighbours.content); - free(board_with_neighbours.content); - free(new_board.content); - futhark_free_i8_1d(ctx->fut, fut_new_board); - } -} - -uint32_t *run_interactive(struct futhark_context *fut_ctx, int width, int height, struct futhark_i8_1d *fut_board) { - struct lys_context ctx = {0}; - lys_setup(&ctx, width, height, MAX_FPS, 0); - - ctx.fut = fut_ctx; - ctx.event_handler_data = NULL; - ctx.event_handler = handle_event; - - futhark_entry_init(ctx.fut, &ctx.state, fut_board, NB_ROWS, NB_COLUMNS, BOARD_SIZE); - lys_run_sdl(&ctx); - return ctx.data; -} - -int main(int argc, char *argv[]) { - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - if (my_rank == FPMPI_ROOT_RANK) { - struct futhark_context_config *fut_cfg; - struct futhark_context *fut_ctx; - char *deviceopt = NULL; - bool device_interactive = true; - char *opencl_device_name = NULL; - - lys_setup_futhark_context(deviceopt, device_interactive, &fut_cfg, &fut_ctx, &opencl_device_name); - if (opencl_device_name != NULL) { - fprintf(stdout, "Using OpenCL device: %s\n", opencl_device_name); - } - init_board(); - struct futhark_i8_1d *fut_board = futhark_new_i8_1d(fut_ctx, &board[0][0], BOARD_SIZE); - run_interactive(fut_ctx, NB_COLUMNS, NB_ROWS, fut_board); - - free(opencl_device_name); - futhark_free_i8_1d(fut_ctx, fut_board); - futhark_context_config_free(fut_cfg); - futhark_context_free(fut_ctx); - } else { - for (;;) { - handle_event(NULL, LYS_LOOP_ITERATION); - } - } - - MPI_Finalize(); - return 0; -} 
diff --git a/lib/.gitignore b/lib/.gitignore deleted file mode 100644 index ef86935756e61596d26595a4e7faa9459b8b7dec..0000000000000000000000000000000000000000 --- a/lib/.gitignore +++ /dev/null @@ -1,383 +0,0 @@ -### macOS template -# General -.DS_Store -.AppleDouble -.LSOverride - -# Icon must end with two \r -Icon - -# Thumbnails -._* - -# Files that might appear in the root of a volume -.DocumentRevisions-V100 -.fseventsd -.Spotlight-V100 -.TemporaryItems -.Trashes -.VolumeIcon.icns -.com.apple.timemachine.donotpresent - -# Directories potentially created on remote AFP share -.AppleDB -.AppleDesktop -Network Trash Folder -Temporary Items -.apdisk - -### Windows template -# Windows thumbnail cache files -Thumbs.db -Thumbs.db:encryptable -ehthumbs.db -ehthumbs_vista.db - -# Dump file -*.stackdump - -# Folder config file -[Dd]esktop.ini - -# Recycle Bin used on file shares -$RECYCLE.BIN/ - -# Windows Installer files -*.cab -*.msi -*.msix -*.msm -*.msp - -# Windows shortcuts -*.lnk - -### C template -# Prerequisites -*.d - -# Object files -*.o -*.ko -*.obj -*.elf - -# Linker output -*.ilk -*.map -*.exp - -# Precompiled Headers -*.gch -*.pch - -# Libraries -*.lib -*.a -*.la -*.lo - -# Shared objects (inc. 
Windows DLLs) -*.dll -*.so -*.so.* -*.dylib - -# Executables -*.exe -*.out -*.app -*.i*86 -*.x86_64 -*.hex - -# Debug files -*.dSYM/ -*.su -*.idb -*.pdb - -# Kernel Module Compile Results -*.mod* -*.cmd -.tmp_versions/ -modules.order -Module.symvers -Mkfile.old -dkms.conf - -### macOS template -# General -.DS_Store -.AppleDouble -.LSOverride - -# Icon must end with two \r -Icon - -# Thumbnails -._* - -# Files that might appear in the root of a volume -.DocumentRevisions-V100 -.fseventsd -.Spotlight-V100 -.TemporaryItems -.Trashes -.VolumeIcon.icns -.com.apple.timemachine.donotpresent - -# Directories potentially created on remote AFP share -.AppleDB -.AppleDesktop -Network Trash Folder -Temporary Items -.apdisk - -### Linux template -*~ - -# temporary files which can be created if a process still has a handle open of a deleted file -.fuse_hidden* - -# KDE directory preferences -.directory - -# Linux trash folder which might appear on any partition or disk -.Trash-* - -# .nfs files are created when an open file is removed but is still being accessed -.nfs* - -### C template -# Prerequisites -*.d - -# Object files -*.o -*.ko -*.obj -*.elf - -# Linker output -*.ilk -*.map -*.exp - -# Precompiled Headers -*.gch -*.pch - -# Libraries -*.lib -*.a -*.la -*.lo - -# Shared objects (inc. Windows DLLs) -*.dll -*.so -*.so.* -*.dylib - -# Executables -*.exe -*.out -*.app -*.i*86 -*.x86_64 -*.hex - -# Debug files -*.dSYM/ -*.su -*.idb -*.pdb - -# Kernel Module Compile Results -*.mod* -*.cmd -.tmp_versions/ -modules.order -Module.symvers -Mkfile.old -dkms.conf - -### C template -# Prerequisites -*.d - -# Object files -*.o -*.ko -*.obj -*.elf - -# Linker output -*.ilk -*.map -*.exp - -# Precompiled Headers -*.gch -*.pch - -# Libraries -*.lib -*.a -*.la -*.lo - -# Shared objects (inc. 
Windows DLLs) -*.dll -*.so -*.so.* -*.dylib - -# Executables -*.exe -*.out -*.app -*.i*86 -*.x86_64 -*.hex - -# Debug files -*.dSYM/ -*.su -*.idb -*.pdb - -# Kernel Module Compile Results -*.mod* -*.cmd -.tmp_versions/ -modules.order -Module.symvers -Mkfile.old -dkms.conf - -### macOS template -# General -.DS_Store -.AppleDouble -.LSOverride - -# Icon must end with two \r -Icon - -# Thumbnails -._* - -# Files that might appear in the root of a volume -.DocumentRevisions-V100 -.fseventsd -.Spotlight-V100 -.TemporaryItems -.Trashes -.VolumeIcon.icns -.com.apple.timemachine.donotpresent - -# Directories potentially created on remote AFP share -.AppleDB -.AppleDesktop -Network Trash Folder -Temporary Items -.apdisk - -### Windows template -# Windows thumbnail cache files -Thumbs.db -Thumbs.db:encryptable -ehthumbs.db -ehthumbs_vista.db - -# Dump file -*.stackdump - -# Folder config file -[Dd]esktop.ini - -# Recycle Bin used on file shares -$RECYCLE.BIN/ - -# Windows Installer files -*.cab -*.msi -*.msix -*.msm -*.msp - -# Windows shortcuts -*.lnk - -# User-specific stuff -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/**/usage.statistics.xml -.idea/**/dictionaries -.idea/**/shelf - -# Generated files -.idea/**/contentModel.xml - -# Sensitive or high-churn files -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/**/dbnavigator.xml - -# Gradle -.idea/**/gradle.xml -.idea/**/libraries - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. 
-# .idea/artifacts -# .idea/compiler.xml -# .idea/jarRepositories.xml -# .idea/modules.xml -# .idea/*.iml -# .idea/modules -# *.iml -# *.ipr - -# CMake -cmake-build-*/ - -# Mongo Explorer plugin -.idea/**/mongoSettings.xml - -# File-based project format -*.iws - -# IntelliJ -out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Cursive Clojure plugin -.idea/replstate.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -# Editor-based Rest Client -.idea/httpRequests - -# Android studio 3.1+ serialized cache file -.idea/caches/build_file_checksums.ser - -.idea diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt deleted file mode 100644 index fe86691c8a4d7dc80de0394d5fb6898c90237ec2..0000000000000000000000000000000000000000 --- a/lib/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -cmake_minimum_required(VERSION 3.17) -project(fpmpi C) - -set(CMAKE_C_STANDARD 11) - -if (CMAKE_BUILD_TYPE MATCHES Debug) - set(GCC_COMPILE_FLAGS "-Wall -Wextra -pedantic -fsanitize=undefined -fsanitize=address") - if (CMAKE_SYSTEM_NAME MATCHES "Linux") - set(GCC_COMPILE_FLAGS "${GCC_COMPILE_FLAGS} -fsanitize=leak") - endif () -elseif (CMAKE_BUILD_TYPE MATCHES Release) - set(GCC_COMPILE_FLAGS "-O2") -elseif (CMAKE_BUILD_TYPE MATCHES Benchmark) - set(GCC_COMPILE_FLAGS "-DBENCHMARK -O2") -endif () - -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GCC_COMPILE_FLAGS}") - -find_package(MPI REQUIRED) -include_directories(${MPI_C_INCLUDE_PATH}) - -add_library(fpmpi fpmpi.c fpmpi.h fp.c fp.h dispatch.c dispatch.h) -target_link_libraries(fpmpi ${MPI_C_LIBRARIES}) - -add_executable(fpmpi_benchmark benchmark/benchmark.c) -target_link_libraries(fpmpi_benchmark fpmpi) - -add_executable(fpmpi_tests tests/tests.c) -target_link_libraries(fpmpi_tests fpmpi) diff --git a/lib/Makefile b/lib/Makefile deleted file mode 100644 index 
be8c2a3a7e9dc1e85adaacea01e04d38294d2fde..0000000000000000000000000000000000000000 --- a/lib/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -all: release debug benchmark - -release: - mkdir -p "cmake-build-release" - cmake -DCMAKE_BUILD_TYPE=Release -Bcmake-build-release - $(MAKE) -C cmake-build-release all - -debug: - mkdir -p "cmake-build-debug" - cmake -DCMAKE_BUILD_TYPE=Debug -Bcmake-build-debug - $(MAKE) -C cmake-build-release all - -benchmark: - mkdir -p "cmake-build-benchmark" - cmake -DCMAKE_BUILD_TYPE=Benchmark -Bcmake-build-benchmark - $(MAKE) -C cmake-build-benchmark all - -.PHONY: all release benchmark diff --git a/lib/benchmark/benchmark.c b/lib/benchmark/benchmark.c deleted file mode 100644 index 60e0163f7e2ef7aee5504188ac33747a191ecdbc..0000000000000000000000000000000000000000 --- a/lib/benchmark/benchmark.c +++ /dev/null @@ -1,149 +0,0 @@ -#include <mpi.h> -#include <stdlib.h> -#include <stdio.h> -#include "../fpmpi.h" - -#define BENCHMARK_MAP 1 -#define BENCHMARK_FILTER 2 -#define BENCHMARK_REDUCE 3 -#define BENCHMARK_FIND 4 -#define BENCHMARK_FOLD_LEFT 5 -#define BENCHMARK_FOLD_RIGHT 6 -#define BENCHMARK_SORT 7 -#define BENCHMARK_SCAN 8 -#define BENCHMARK_IOTA 9 -#define BENCHMARK_ZIP 10 - -void *map_mul_int(void *element) { - int *result = calloc(1, sizeof(int)); - *result = (*(int *) (element)) * 2; - return (void *) result; -} - -bool filter_only_even(void *element) { - int element32 = *(int *) element; - return element32 % 2 == 0; -} - -bool find_divide_by_three(void *element) { - int element32 = *(int *) element; - return element32 % 3 == 0; -} - -void reduce_sum(void *accumulator, void *current_value) { - int *accumulator32 = (int *) accumulator; - int current_value32 = *(int *) current_value; - *accumulator32 = (*accumulator32 + current_value32); -} - -void fold_left_sub(void *accumulator, void *current_value) { - int *accumulator32 = (int *) accumulator; - int current_value32 = *(int *) current_value; - *accumulator32 = (*accumulator32 - 
current_value32); -} - -void fold_right_sub(void *current_value, void *accumulator) { - int *accumulator32 = (int *) accumulator; - int current_value32 = *(int *) current_value; - *accumulator32 = (current_value32 - *accumulator32); -} - -bool sort_asc(void *left, void *right) { - int left32 = *(int *) left; - int right32 = *(int *) right; - return left32 < right32; -} - -int main(int argc, char *argv[]) { - if(argc < 4) { - printf("Missing argv parameters.\n"); - exit(0); - } - - MPI_Init(&argc, &argv); - - int my_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - - int benchmark = atoi(argv[1]); - int times = atoi(argv[2]); - int N = atoi(argv[3]); - - int *array1 = NULL; - int *array2 = NULL; - if (benchmark == BENCHMARK_REDUCE || benchmark == BENCHMARK_FOLD_LEFT || benchmark == BENCHMARK_FOLD_RIGHT || - my_rank == FPMPI_ROOT_RANK) { - array1 = calloc(N, sizeof(int)); - array2 = calloc(N, sizeof(int)); - for (int i = 0; i < N; ++i) { - if (benchmark == BENCHMARK_SORT) { - array1[i] = rand(); - array2[i] = rand(); - } else { - array1[i] = i; - array2[i] = i; - } - } - } - - for (int i = 0; i < times; ++i) { - fpmpi_result_t result; - switch (benchmark) { - case BENCHMARK_MAP: { - result = map(array1, N, FPMPI_INT32, FPMPI_INT32, map_mul_int, MPI_COMM_WORLD); - } - break; - case BENCHMARK_FILTER: { - result = filter(array1, N, FPMPI_INT32, filter_only_even, MPI_COMM_WORLD); - } - break; - case BENCHMARK_REDUCE: { - result = reduce(array1, N, FPMPI_INT32, reduce_sum, MPI_COMM_WORLD); - } - break; - case BENCHMARK_FIND: { - result = find(array1, N, FPMPI_INT32, find_divide_by_three, MPI_COMM_WORLD); - } - break; - case BENCHMARK_FOLD_LEFT: { - int initial_value = 0; - result = fold_left(array1, N, FPMPI_INT32, FPMPI_INT32, fold_left_sub, &initial_value, MPI_COMM_WORLD); - } - break; - case BENCHMARK_FOLD_RIGHT: { - int initial_value = 0; - result = fold_right(array1, N, FPMPI_INT32, FPMPI_INT32, fold_right_sub, &initial_value, MPI_COMM_WORLD); - } - break; - case 
BENCHMARK_SORT: { - result = sort(array1, N, FPMPI_INT32, FPMPI_MERGE_SORT, sort_asc, MPI_COMM_WORLD); - } - break; - case BENCHMARK_SCAN: { -// result = sort(array1, N, FPMPI_INT32, FPMPI_MERGE_SORT, sort_asc, MPI_COMM_WORLD); - } - break; - case BENCHMARK_IOTA: { - result = iota(N, MPI_COMM_WORLD); - } - break; - case BENCHMARK_ZIP: { -// result = zip(array1, array2, FPMPI_INT32, FPMPI_MERGE_SORT, sort_asc, MPI_COMM_WORLD); - } - break; - default: - MPI_Finalize(); - exit(0); - } - - if (my_rank == FPMPI_ROOT_RANK) { - free(result.content); - } - } - if (benchmark == BENCHMARK_REDUCE || benchmark == BENCHMARK_FOLD_LEFT || benchmark == BENCHMARK_FOLD_RIGHT || - my_rank == FPMPI_ROOT_RANK) { - free(array1); - } - MPI_Finalize(); - return 0; -} diff --git a/lib/dispatch.c b/lib/dispatch.c deleted file mode 100644 index 755405841750938d40de6cb1976521dfc97f3ba1..0000000000000000000000000000000000000000 --- a/lib/dispatch.c +++ /dev/null @@ -1,36 +0,0 @@ -#include <stdlib.h> -#include "dispatch.h" - -dispatch_t dispatch_init(int count, int type, int out_type, int world_size, int root) { - int nb_columns_per_process = count / world_size; - int remaining_columns = count % world_size; - - dispatch_t dispatch = { - .in_counts8 = calloc(world_size, sizeof(int)), - .in_displacements8 = calloc(world_size, sizeof(int)), - .out_counts8 = calloc(world_size, sizeof(int)), - .out_displacements8 = calloc(world_size, sizeof(int)), - }; - - for (int i = 0; i < world_size; ++i) { - int root_nb_columns = nb_columns_per_process + remaining_columns; - int nb_columns = (i == root) ? 
root_nb_columns : nb_columns_per_process; - if (i == 0) { - dispatch.in_displacements8[i] = 0; - dispatch.out_displacements8[i] = 0; - } else { - dispatch.in_displacements8[i] = dispatch.in_displacements8[i - 1] + dispatch.in_counts8[i - 1]; - dispatch.out_displacements8[i] = dispatch.out_displacements8[i - 1] + dispatch.out_counts8[i - 1]; - } - dispatch.in_counts8[i] = nb_columns * type; - dispatch.out_counts8[i] = nb_columns * out_type; - } - return dispatch; -} - -void dispatch_destroy(dispatch_t *dispatch) { - free(dispatch->in_displacements8); - free(dispatch->in_counts8); - free(dispatch->out_displacements8); - free(dispatch->out_counts8); -} diff --git a/lib/dispatch.h b/lib/dispatch.h deleted file mode 100644 index bdb08d83ea2ae9e51b2da503624a0b3543f57028..0000000000000000000000000000000000000000 --- a/lib/dispatch.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _DISPATCH_H_ -#define _DISPATCH_H_ - -typedef struct dispatch { - int *in_counts8; - int *in_displacements8; - int *out_counts8; - int *out_displacements8; -} dispatch_t; - -dispatch_t dispatch_init(int count, int type, int out_type, int world_size, int root); - -void dispatch_destroy(dispatch_t *dispatch); - -#endif //_DISPATCH_H_ diff --git a/lib/fp.c b/lib/fp.c deleted file mode 100644 index 3ca8106389f5f0473ea4e474809c3d2151b77c65..0000000000000000000000000000000000000000 --- a/lib/fp.c +++ /dev/null @@ -1,164 +0,0 @@ -#include "fp.h" -#include "fpmpi.h" -#include <stdint.h> -#include <stdlib.h> -#include <string.h> - -void *local_map(void *array, int count, int type, int map_type, void *f(void *)) { - uint8_t *array8 = (uint8_t *) array; - void *output = calloc(count, map_type); - uint8_t *output8 = (uint8_t *) output; - - for (int i = 0; i < count; ++i) { - void *result = f(array8 + i * type); - memcpy(output8, result, (size_t) map_type); - output8 += map_type; - free(result); - } - return output; -} - -void *local_filter(void *array, int count, int type, bool f(void *), int *output_count) { - 
uint8_t *array8 = (uint8_t *) array; - void *output = calloc(count, type); - uint8_t *output8 = (uint8_t *) output; - - for (int i = 0; i < count; ++i) { - uint8_t *element8 = array8 + i * type; - if (f(element8)) { - memcpy(output8, element8, type); - output8 += type; - ++(*output_count); - } - } - return output; -} - -void *local_fold_left(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value) { - uint8_t *array8 = (uint8_t *) array; - void *accumulator = calloc(1, fold_type); - int i = 0; - /* initial_value is NULL for reduce */ - if (initial_value == NULL) { - memcpy(accumulator, array8, type); - ++i; - } else { - memcpy(accumulator, initial_value, fold_type); - } - - for (; i < count; ++i) { - f(accumulator, array8 + i * type); - } - return accumulator; -} - -void *local_fold_right(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value) { - uint8_t *array8 = (uint8_t *) array; - void *accumulator = calloc(1, fold_type); - memcpy(accumulator, initial_value, fold_type); - for (int i = count - 1; i >= 0; --i) { - f(array8 + i * type, accumulator); - } - return accumulator; -} - -void *local_find(void *array, int count, int type, bool f(void *)) { - uint8_t *array8 = (uint8_t *) array; - for (int i = 0; i < count; ++i) { - if (f(array8 + i * type)) { - return array8 + i * type; - } - } - return NULL; -} - -// https://gist.github.com/hackrio1/a11c8499ed68f5df6c30e53d1c3fe076 -static void merge_sort(void *array, void *work_array, int type, bool f(void *, void *), int i, int j) { - uint8_t *array8 = (uint8_t *) array; - uint8_t *work_array8 = (uint8_t *) work_array; - if (j <= i) { - return; // the subsection is empty or a single element - } - int mid = (i + j) / 2; - - // left sub-array is a[i .. mid] - // right sub-array is a[mid + 1 .. 
j] - - merge_sort(array, work_array, type, f, i, mid); // sort the left sub-array recursively - merge_sort(array, work_array, type, f, mid + 1, j); // sort the right sub-array recursively - - int pointer_left = i; // pointer_left points to the beginning of the left sub-array - int pointer_right = mid + 1; // pointer_right points to the beginning of the right sub-array - int k; // k is the loop counter - - // we loop from i to j to fill each element of the final merged array - for (k = i; k <= j; k++) { - if (pointer_left == mid + 1) { // left pointer has reached the limit - memcpy(work_array8 + k * type, array8 + pointer_right * type, type); - pointer_right++; - } else if (pointer_right == j + 1) { // right pointer has reached the limit - memcpy(work_array8 + k * type, array8 + pointer_left * type, type); - pointer_left++; - } else if (f(array8 + pointer_left * type, - array8 + pointer_right * type)) { // pointer left points to smaller element - memcpy(work_array8 + k * type, array8 + pointer_left * type, type); - pointer_left++; - } else { // pointer right points to smaller element - memcpy(work_array8 + k * type, array8 + pointer_right * type, type); - pointer_right++; - } - } - - for (k = i; k <= j; k++) { // copy the elements from work_array[] to array[] - memcpy(array8 + k * type, work_array8 + k * type, type); - } -} - -void local_sort(void *array, int count, int type, int sort_method, bool f(void *, void *)) { - switch (sort_method) { - case FPMPI_MERGE_SORT: { - void *work_array = calloc(count, type); - merge_sort(array, work_array, type, f, 0, count - 1); - free(work_array); - } - default: - break; - } -} - -void *local_scan(void *array, int count, int type, int scan_type, void *f(void *, void *), void *initial_value) { - uint8_t *array8 = (uint8_t *) array; - void *accumulators = calloc(count + 1, scan_type); - uint8_t *accumulators8 = (uint8_t *) accumulators; - memcpy(accumulators, initial_value, scan_type); - - for (int i = 0; i < count; ++i) { - void 
*accumulator = f(accumulators8 + i * scan_type, array8 + i * type); - memcpy(accumulators8 + (i + 1) * scan_type, accumulator, scan_type); - free(accumulator); - } - return accumulators; -} - -void *local_zip(void *array1, void *array2, int count, int type1, int type2, int tuple_type, void *f(void *, void *)) { - uint8_t *array1_8 = (uint8_t *) array1; - uint8_t *array2_8 = (uint8_t *) array2; - - void *output = calloc(count, tuple_type); - uint8_t *output8 = (uint8_t *) output; - - for (int i = 0; i < count; ++i) { - void *tuple = f(array1_8 + i * type1, array2_8 + i * type2); - memcpy(output8 + i * tuple_type, tuple, tuple_type); - free(tuple); - } - return output; -} - -int *local_iota(int start, int count) { - int *output = calloc(count, sizeof(int)); - for (int i = 0; i < count; ++i) { - output[i] = start++; - } - return output; -} diff --git a/lib/fp.h b/lib/fp.h deleted file mode 100644 index 0d7f6c23c974f2e8a5d6b4f9bf52a5fa972c251e..0000000000000000000000000000000000000000 --- a/lib/fp.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _FP_H_ -#define _FP_H_ - -#include <stdbool.h> -#include "fpmpi.h" - -void *local_map(void *array, int count, int type, int map_type, void *f(void *)); - -void *local_filter(void *array, int count, int type, bool f(void *), int *output_count); - -void *local_fold_left(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value); - -void *local_fold_right(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value); - -void *local_find(void *array, int count, int type, bool f(void *)); - -void local_sort(void *array, int count, int type, int sort_method, bool f(void *, void *)); - -void *local_scan(void *array, int count, int type, int scan_type, void *f(void *, void *), void *initial_value); - -void *local_zip(void *array1, void *array2, int count, int type1, int type2, int tuple_type, void *f(void *, void *)); - -int *local_iota(int start, int count); - -#endif 
//_FP_H_ diff --git a/lib/fpmpi.c b/lib/fpmpi.c deleted file mode 100644 index b92e9657695fe57288dd4d77d30f52516ecf5f61..0000000000000000000000000000000000000000 --- a/lib/fpmpi.c +++ /dev/null @@ -1,546 +0,0 @@ -#include <stdlib.h> - -#ifdef BENCHMARK -#include <stdio.h> -#endif - -#include <string.h> -#include <stdio.h> -#include "fpmpi.h" -#include "fp.h" -#include "dispatch.h" - -#define TAG_FILTER_LOCAL_OUTPUT_COUNT 0 -#define TAG_FIND_HAS_RESULT 1 -#define TAG_FIND_LOCAL_OUTPUT 2 -#define TAG_FOLD_LEFT_ACCUMULATOR 3 -#define TAG_FOLD_RIGHT_ACCUMULATOR 4 -#define TAG_SORT_LOCAL_OUTPUT 5 -#define TAG_SCAN_ACCUMULATORS 6 -#define TAG_ZIP_START_ARRAYS 7 - -#define min(a, b) (((a) <= (b)) ? (a) : (b)) -#define set_count(my_rank, count) my_rank == FPMPI_ROOT_RANK ? count : 0 - -int get_world_size(MPI_Comm comm) { - int world_size; - MPI_Comm_size(comm, &world_size); - return world_size; -} - -int get_my_rank(MPI_Comm comm) { - int my_rank; - MPI_Comm_rank(comm, &my_rank); - return my_rank; -} - -fpmpi_result_t map(void *array, int count, int type, int map_type, void *f(void *), MPI_Comm comm) { -#ifdef BENCHMARK - double start = MPI_Wtime(); -#endif - int my_rank = get_my_rank(comm); - int world_size = get_world_size(comm); - - dispatch_t dispatch = dispatch_init(count, type, map_type, world_size, FPMPI_ROOT_RANK); - - void *local_array = calloc(dispatch.in_counts8[my_rank], sizeof(uint8_t)); - MPI_Scatterv(array, dispatch.in_counts8, dispatch.in_displacements8, MPI_UINT8_T, local_array, - dispatch.in_counts8[my_rank], MPI_UINT8_T, FPMPI_ROOT_RANK, comm); - - - int local_count = dispatch.in_counts8[my_rank] / type; - void *local_output = local_map(local_array, local_count, type, map_type, f); - - void *result = NULL; - if (my_rank == FPMPI_ROOT_RANK) { - result = calloc(count, map_type); - } - MPI_Gatherv(local_output, dispatch.out_counts8[my_rank], MPI_UINT8_T, result, dispatch.out_counts8, - dispatch.out_displacements8, MPI_UINT8_T, FPMPI_ROOT_RANK, comm); - - - 
free(local_array); - free(local_output); - dispatch_destroy(&dispatch); - -#ifdef BENCHMARK - double finish = MPI_Wtime(); - if (my_rank == FPMPI_ROOT_RANK) { - printf("%d;%f\n", world_size, finish - start); - } -#endif - return (fpmpi_result_t) { - .content = result, - .count = set_count(my_rank, count), - }; -} - -fpmpi_result_t filter(void *array, int count, int type, bool f(void *), MPI_Comm comm) { -#ifdef BENCHMARK - double start = MPI_Wtime(); -#endif - int my_rank = get_my_rank(comm); - int world_size = get_world_size(comm); - - dispatch_t dispatch = dispatch_init(count, type, type, world_size, FPMPI_ROOT_RANK); - - void *local_array = calloc(dispatch.in_counts8[my_rank], sizeof(uint8_t)); - MPI_Scatterv(array, dispatch.in_counts8, dispatch.in_displacements8, MPI_UINT8_T, local_array, - dispatch.in_counts8[my_rank], MPI_UINT8_T, FPMPI_ROOT_RANK, comm); - - int local_array_count = dispatch.in_counts8[my_rank] / type; - int local_output_count = 0; - void *local_output = local_filter(local_array, local_array_count, type, f, &local_output_count); - - MPI_Request request; - MPI_Isend(&local_output_count, 1, MPI_INT, FPMPI_ROOT_RANK, TAG_FILTER_LOCAL_OUTPUT_COUNT, comm, &request); - - void *result = NULL; - int result_count = 0; - if (my_rank == FPMPI_ROOT_RANK) { - for (int i = 0; i < world_size; ++i) { - MPI_Recv(&dispatch.out_counts8[i], 1, MPI_INT, i, TAG_FILTER_LOCAL_OUTPUT_COUNT, comm, MPI_STATUS_IGNORE); - result_count += dispatch.out_counts8[i]; - dispatch.out_counts8[i] *= type; - dispatch.out_displacements8[i] = - i == 0 ? 
0 : dispatch.out_displacements8[i - 1] + dispatch.out_counts8[i - 1]; - } - result = calloc(result_count, type); - } - - MPI_Wait(&request, MPI_STATUS_IGNORE); - - MPI_Gatherv(local_output, local_output_count * type, MPI_UINT8_T, result, dispatch.out_counts8, - dispatch.out_displacements8, MPI_UINT8_T, FPMPI_ROOT_RANK, MPI_COMM_WORLD); - - free(local_array); - free(local_output); - dispatch_destroy(&dispatch); - -#ifdef BENCHMARK - double finish = MPI_Wtime(); - if (my_rank == FPMPI_ROOT_RANK) { - printf("%d;%f\n", world_size, finish - start); - } -#endif - return (fpmpi_result_t) { - .content = result, - .count = set_count(my_rank, result_count), - }; -} - -fpmpi_result_t reduce(void *array, int count, int type, void f(void *, void *), MPI_Comm comm) { - return fold_left(array, count, type, type, f, NULL, comm); -} - -fpmpi_result_t -fold_left(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value, MPI_Comm comm) { -#ifdef BENCHMARK - double start = MPI_Wtime(); -#endif - int my_rank = get_my_rank(comm); - int world_size = get_world_size(comm); - - void *accumulator = calloc(1, fold_type); - if (my_rank != 0) { - MPI_Recv(accumulator, 1 * fold_type, MPI_UINT8_T, my_rank - 1, TAG_FOLD_LEFT_ACCUMULATOR, comm, - MPI_STATUS_IGNORE); - initial_value = accumulator; - } - - void *local_result = local_fold_left(array, count, type, fold_type, f, initial_value); - - int dest = my_rank == world_size - 1 ? 
FPMPI_ROOT_RANK : my_rank + 1; - - /* Isend because if dest == my_rank, a deadlock will occur, MPI_Recv is after */ - MPI_Request request = {0}; - MPI_Isend(local_result, 1 * fold_type, MPI_UINT8_T, dest, TAG_FOLD_LEFT_ACCUMULATOR, comm, &request); - - - void *result = NULL; - if (my_rank == FPMPI_ROOT_RANK) { - result = calloc(1, fold_type); - MPI_Recv(result, 1 * fold_type, MPI_UINT8_T, world_size - 1, TAG_FOLD_LEFT_ACCUMULATOR, comm, - MPI_STATUS_IGNORE); - } - - MPI_Wait(&request, MPI_STATUS_IGNORE); - - free(local_result); - free(accumulator); - -#ifdef BENCHMARK - double finish = MPI_Wtime(); - if (my_rank == FPMPI_ROOT_RANK) { - printf("%d;%f\n", world_size, finish - start); - } -#endif - return (fpmpi_result_t) { - .content = result, - .count = set_count(my_rank, 1), - }; -} - -fpmpi_result_t -fold_right(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value, - MPI_Comm comm) { -#ifdef BENCHMARK - double start = MPI_Wtime(); -#endif - int my_rank = get_my_rank(comm); - int world_size = get_world_size(comm); - - void *accumulator = calloc(1, fold_type); - if (my_rank != world_size - 1) { - MPI_Recv(accumulator, 1 * fold_type, MPI_UINT8_T, my_rank + 1, TAG_FOLD_RIGHT_ACCUMULATOR, comm, - MPI_STATUS_IGNORE); - initial_value = accumulator; - } - - void *local_result = local_fold_right(array, count, type, fold_type, f, initial_value); - - int dest = my_rank == 0 ? 
FPMPI_ROOT_RANK : my_rank - 1; - /* Isend because if dest == my_rank, a deadlock will occur, MPI_Recv will be after */ - MPI_Request request = {0}; - MPI_Isend(local_result, 1 * fold_type, MPI_UINT8_T, dest, TAG_FOLD_RIGHT_ACCUMULATOR, comm, &request); - - void *result = NULL; - if (my_rank == FPMPI_ROOT_RANK) { - result = calloc(1, fold_type); - MPI_Recv(result, 1 * fold_type, MPI_UINT8_T, 0, TAG_FOLD_RIGHT_ACCUMULATOR, comm, - MPI_STATUS_IGNORE); - } - - MPI_Wait(&request, MPI_STATUS_IGNORE); - - free(local_result); - free(accumulator); - -#ifdef BENCHMARK - double finish = MPI_Wtime(); - if (my_rank == FPMPI_ROOT_RANK) { - printf("%d;%f\n", world_size, finish - start); - } -#endif - return (fpmpi_result_t) { - .content = result, - .count = set_count(my_rank, 1), - }; -} - -fpmpi_result_t find(void *array, int count, int type, bool f(void *), MPI_Comm comm) { -#ifdef BENCHMARK - double start = MPI_Wtime(); -#endif - int my_rank = get_my_rank(comm); - int world_size = get_world_size(comm); - - dispatch_t dispatch = dispatch_init(count, type, type, world_size, FPMPI_ROOT_RANK); - - void *local_array = calloc(dispatch.in_counts8[my_rank], sizeof(uint8_t)); - MPI_Scatterv(array, dispatch.in_counts8, dispatch.in_displacements8, MPI_UINT8_T, local_array, - dispatch.in_counts8[my_rank], MPI_UINT8_T, FPMPI_ROOT_RANK, comm); - - int local_count = dispatch.in_counts8[my_rank] / type; - void *local_output = local_find(local_array, local_count, type, f); - bool local_has_result = local_output != NULL; - - MPI_Request requests[2] = {0}; - MPI_Isend(&local_has_result, 1, MPI_C_BOOL, FPMPI_ROOT_RANK, TAG_FIND_HAS_RESULT, comm, &requests[0]); - - if (local_has_result) { - MPI_Isend(local_output, 1 * type, MPI_UINT8_T, FPMPI_ROOT_RANK, TAG_FIND_LOCAL_OUTPUT, comm, &requests[1]); - } - - void *result = NULL; - if (my_rank == FPMPI_ROOT_RANK) { - uint8_t *results = calloc(world_size, type); - int result_index = INT32_MAX; - MPI_Status status; - bool has_result = false; - for (int i 
= 0; i < world_size; ++i) { - MPI_Recv(&has_result, 1, MPI_C_BOOL, MPI_ANY_SOURCE, TAG_FIND_HAS_RESULT, comm, &status); - if (has_result) { - MPI_Recv(results + status.MPI_SOURCE * type, 1 * type, MPI_UINT8_T, status.MPI_SOURCE, - TAG_FIND_LOCAL_OUTPUT, comm, MPI_STATUS_IGNORE); - result_index = min(status.MPI_SOURCE, result_index); - } - } - if (result_index != INT32_MAX) { - result = calloc(1, type); - memcpy(result, results + result_index * type, type); - } - free(results); - } - - MPI_Waitall(local_has_result + 1, requests, MPI_STATUSES_IGNORE); - - free(local_array); - dispatch_destroy(&dispatch); - -#ifdef BENCHMARK - double finish = MPI_Wtime(); - if (my_rank == FPMPI_ROOT_RANK) { - printf("%d;%f\n", world_size, finish - start); - } -#endif - return (fpmpi_result_t) { - .content = result, - .count = set_count(my_rank, result != NULL), - }; -} - -// https://www.geeksforgeeks.org/merge-k-sorted-arrays/ -static void -marge_arrays(void *array1, void *array2, int count1, int count2, int type, bool f(void *, void *), void *result_array) { - int i = 0, j = 0, k = 0; - uint8_t *array1_8 = (uint8_t *) array1; - uint8_t *array2_8 = (uint8_t *) array2; - uint8_t *result_array8 = (uint8_t *) result_array; - - // Traverse both array - while (i < count1 && j < count2) { - // Check if current element of first - // array is smaller than current element - // of second array. If yes, store first - // array element and increment first array - // index. 
Otherwise do same with second array - if (f(array1_8 + i * type, array2_8 + j * type)) { - memcpy(result_array8 + k * type, array1_8 + i * type, type); - ++k, ++i; - } else { - memcpy(result_array8 + k * type, array2_8 + j * type, type); - ++k, ++j; - } - } - - // Store remaining elements of first array - while (i < count1) { - memcpy(result_array8 + k * type, array1_8 + i * type, type); - ++k, ++i; - } - - // Store remaining elements of second array - while (j < count2) { - memcpy(result_array8 + k * type, array2_8 + j * type, type); - ++k, ++j; - } -} - -fpmpi_result_t sort(void *array, int count, int type, int sort_method, bool f(void *, void *), MPI_Comm comm) { -#ifdef BENCHMARK - double start = MPI_Wtime(); -#endif - int my_rank = get_my_rank(comm); - int world_size = get_world_size(comm); - - dispatch_t dispatch = dispatch_init(count, type, type, world_size, FPMPI_ROOT_RANK); - - void *local_array = calloc(dispatch.in_counts8[my_rank], sizeof(uint8_t)); - MPI_Scatterv(array, dispatch.in_counts8, dispatch.in_displacements8, MPI_UINT8_T, local_array, - dispatch.in_counts8[my_rank], MPI_UINT8_T, FPMPI_ROOT_RANK, comm); - - int local_count = dispatch.in_counts8[my_rank] / type; - local_sort(local_array, local_count, type, sort_method, f); - - void *result = NULL; - if (my_rank == FPMPI_ROOT_RANK) { - result = calloc(local_count, type); - memcpy(result, local_array, dispatch.in_counts8[my_rank]); - - if (world_size > 1) { - int current_count = local_count; - int recv_count8 = dispatch.in_counts8[(my_rank + 1) % world_size]; - int recv_count = dispatch.in_counts8[(my_rank + 1) % world_size] / type; - void *recv_buffer = calloc(recv_count8, sizeof(uint8_t)); - - for (int i = 0; i < world_size - 1; ++i) { - MPI_Recv(recv_buffer, recv_count8, MPI_UINT8_T, MPI_ANY_SOURCE, TAG_SORT_LOCAL_OUTPUT, comm, - MPI_STATUS_IGNORE); - void *tmp_result = calloc(current_count + recv_count, type); - marge_arrays(result, recv_buffer, current_count, recv_count, type, f, tmp_result); 
- free(result); - result = tmp_result; - current_count += recv_count; - } - free(recv_buffer); - } - } else { - MPI_Send(local_array, dispatch.in_counts8[my_rank], MPI_UINT8_T, FPMPI_ROOT_RANK, TAG_SORT_LOCAL_OUTPUT, comm); - } - - free(local_array); - dispatch_destroy(&dispatch); - -#ifdef BENCHMARK - double finish = MPI_Wtime(); - if (my_rank == FPMPI_ROOT_RANK) { - printf("%d;%f\n", world_size, finish - start); - } -#endif - return (fpmpi_result_t) { - .content = result, - .count = count, - }; -} - - -fpmpi_result_t -scan(void *array, int count, int type, int scan_type, void *f(void *, void *), void *initial_value, MPI_Comm comm) { -#ifdef BENCHMARK - double start = MPI_Wtime(); -#endif - int my_rank = get_my_rank(comm); - int world_size = get_world_size(comm); - - void *current_accumulators = NULL; - int current_accumulators_count8 = 0; - - if (my_rank != 0) { - MPI_Status status = {0}; - MPI_Probe(my_rank - 1, TAG_SCAN_ACCUMULATORS, MPI_COMM_WORLD, &status); - MPI_Get_count(&status, MPI_UINT8_T, ¤t_accumulators_count8); - current_accumulators = calloc(current_accumulators_count8, sizeof(uint8_t)); - - MPI_Recv(current_accumulators, current_accumulators_count8, MPI_UINT8_T, my_rank - 1, TAG_SCAN_ACCUMULATORS, - comm, MPI_STATUS_IGNORE); - /* Initial value is the last accumulator value */ - initial_value = ((uint8_t *) current_accumulators) + current_accumulators_count8 - scan_type; - } else { - current_accumulators = calloc(count + 1, scan_type); - current_accumulators_count8 = (count + 1) * scan_type; - } - - void *local_accumulators = local_scan(array, count, type, scan_type, f, initial_value); - int local_accumulators_count8 = (count + 1) * scan_type; - - if (my_rank != 0) { - /* First accumulators is ignored because it will be duplicated */ - uint8_t *local_accumulators8 = (uint8_t *) local_accumulators + scan_type; - local_accumulators_count8 = (count) * scan_type; - current_accumulators = realloc(current_accumulators, current_accumulators_count8 + 
local_accumulators_count8); - uint8_t *current_accumulators8 = (uint8_t *) current_accumulators + current_accumulators_count8; - current_accumulators_count8 += local_accumulators_count8; - memcpy(current_accumulators8, local_accumulators8, local_accumulators_count8); - } else { - memcpy(current_accumulators, local_accumulators, local_accumulators_count8); - } - - int dest = my_rank == world_size - 1 ? FPMPI_ROOT_RANK : my_rank + 1; - - /* Isend because if dest == my_rank, a deadlock will occur, MPI_Recv is after */ - MPI_Request request = {0}; - MPI_Isend(current_accumulators, current_accumulators_count8, MPI_UINT8_T, dest, TAG_SCAN_ACCUMULATORS, comm, - &request); - - void *result = NULL; - int recv_count = 0; - if (my_rank == FPMPI_ROOT_RANK) { - MPI_Status status = {0}; - MPI_Probe(world_size - 1, TAG_SCAN_ACCUMULATORS, MPI_COMM_WORLD, &status); - MPI_Get_count(&status, MPI_UINT8_T, &recv_count); - result = calloc(recv_count, sizeof(uint8_t)); - MPI_Recv(result, recv_count, MPI_UINT8_T, world_size - 1, TAG_SCAN_ACCUMULATORS, comm, - MPI_STATUS_IGNORE); - } - - MPI_Wait(&request, MPI_STATUS_IGNORE); - - free(local_accumulators); - free(current_accumulators); - -#ifdef BENCHMARK - double finish = MPI_Wtime(); - if (my_rank == FPMPI_ROOT_RANK) { - printf("%d;%f\n", world_size, finish - start); - } -#endif - return (fpmpi_result_t) { - .content = result, - .count = set_count(my_rank, recv_count / scan_type), - }; -} - -fpmpi_result_t iota(int n, MPI_Comm comm) { -#ifdef BENCHMARK - double start = MPI_Wtime(); -#endif - int my_rank = get_my_rank(comm); - int world_size = get_world_size(comm); - dispatch_t dispatch = dispatch_init(n, FPMPI_INT32, FPMPI_INT32, world_size, my_rank); - - int local_start = dispatch.in_displacements8[my_rank] / FPMPI_INT32; - int local_n = dispatch.in_counts8[my_rank] / FPMPI_INT32; - void *local_output = local_iota(local_start, local_n); - - void *result = NULL; - if (my_rank == FPMPI_ROOT_RANK) { - result = calloc(n, FPMPI_INT32); - } - 
MPI_Gatherv(local_output, dispatch.out_counts8[my_rank], MPI_UINT8_T, result, dispatch.out_counts8, - dispatch.out_displacements8, MPI_UINT8_T, FPMPI_ROOT_RANK, comm); - - free(local_output); - dispatch_destroy(&dispatch); -#ifdef BENCHMARK - double finish = MPI_Wtime(); - if (my_rank == FPMPI_ROOT_RANK) { - printf("%d;%f\n", world_size, finish - start); - } -#endif - return (fpmpi_result_t) { - .content = result, - .count = set_count(my_rank, n) - }; -} - -fpmpi_result_t zip(void *array1, void *array2, int count, int type1, int type2, int tuple_type, void *f(void *, void *), - MPI_Comm comm) { -#ifdef BENCHMARK - double start = MPI_Wtime(); -#endif - int my_rank = get_my_rank(comm); - int world_size = get_world_size(comm); - - dispatch_t dispatch1 = dispatch_init(count, type1, type1, world_size, FPMPI_ROOT_RANK); - dispatch_t dispatch2 = dispatch_init(count, type2, type2, world_size, FPMPI_ROOT_RANK); - dispatch_t dispatch3 = dispatch_init(count, tuple_type, tuple_type, world_size, FPMPI_ROOT_RANK); - - void *local_array1 = calloc(dispatch1.in_counts8[my_rank], sizeof(uint8_t)); - MPI_Scatterv(array1, dispatch1.in_counts8, dispatch1.in_displacements8, MPI_UINT8_T, local_array1, - dispatch1.in_counts8[my_rank], MPI_UINT8_T, FPMPI_ROOT_RANK, comm); - - void *local_array2 = calloc(dispatch2.in_counts8[my_rank], sizeof(uint8_t)); - MPI_Scatterv(array2, dispatch2.in_counts8, dispatch2.in_displacements8, MPI_UINT8_T, local_array2, - dispatch2.in_counts8[my_rank], MPI_UINT8_T, FPMPI_ROOT_RANK, comm); - - int local_count = dispatch1.in_counts8[my_rank] / type1; - void *local_output = local_zip(local_array1, local_array2, local_count, type1, type2, tuple_type, f); - - void *result = NULL; - if (my_rank == FPMPI_ROOT_RANK) { - result = calloc(count, tuple_type); - } - MPI_Gatherv(local_output, dispatch3.out_counts8[my_rank], MPI_UINT8_T, result, dispatch3.out_counts8, - dispatch3.out_displacements8, MPI_UINT8_T, FPMPI_ROOT_RANK, comm); - - - free(local_array1); - 
free(local_array2); - free(local_output); - - dispatch_destroy(&dispatch1); - dispatch_destroy(&dispatch2); - dispatch_destroy(&dispatch3); - -#ifdef BENCHMARK - double finish = MPI_Wtime(); - if (my_rank == FPMPI_ROOT_RANK) { - printf("%d;%f\n", world_size, finish - start); - } -#endif - return (fpmpi_result_t) { - .content = result, - .count = set_count(my_rank, count) - }; -} diff --git a/lib/fpmpi.h b/lib/fpmpi.h deleted file mode 100644 index 02ae54bbd5580a95ed229c81b92d18d396291fcb..0000000000000000000000000000000000000000 --- a/lib/fpmpi.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef FPMPI_LIBRARY_H -#define FPMPI_LIBRARY_H - -#include <mpi.h> -#include <stdint.h> -#include <stdbool.h> - -#define FPMPI_INT8 sizeof(int8_t) -#define FPMPI_UINT8 sizeof(uint8_t) - -#define FPMPI_INT16 sizeof(int16_t) -#define FPMPI_UINT16 sizeof(uint16_t) - -#define FPMPI_INT32 sizeof(int32_t) -#define FPMPI_UINT32 sizeof(uint32_t) - -#define FPMPI_INT64 sizeof(int64_t) -#define FPMPI_UINT64 sizeof(int64_t) - -#define FPMPI_DOUBLE sizeof(double) - -#define FPMPI_ROOT_RANK 0 - -#define FPMPI_MERGE_SORT 1 - -typedef struct fpmpi_result { - void *content; - int count; -} fpmpi_result_t; - -fpmpi_result_t map(void *array, int count, int type, int map_type, void *f(void *), MPI_Comm comm); - -fpmpi_result_t filter(void *array, int count, int type, bool f(void *), MPI_Comm comm); - -fpmpi_result_t reduce(void *array, int count, int type, void f(void *, void *), MPI_Comm comm); - -fpmpi_result_t -fold_left(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value, MPI_Comm comm); - -fpmpi_result_t -fold_right(void *array, int count, int type, int fold_type, void f(void *, void *), void *initial_value, MPI_Comm comm); - -fpmpi_result_t -scan(void *array, int count, int type, int scan_type, void *f(void *, void *), void *initial_value, MPI_Comm comm); - -fpmpi_result_t sort(void *array, int count, int type, int sort_method, bool f(void *, void *), MPI_Comm 
comm); - -fpmpi_result_t find(void *array, int count, int type, bool f(void *), MPI_Comm comm); - -fpmpi_result_t iota(int n, MPI_Comm comm); - -fpmpi_result_t zip(void *array1, void *array2, int count, int type1, int type2, int tuple_type, void *f(void *, void *), - MPI_Comm comm); - -//fpmpi_result_t unzip(tuple2_t *array, int count, int type1, int type2, MPI_Comm comm); - -//fpmpi_result_t flat_map(void *array, int dimensions[2], int type, MPI_Comm comm); -// map2, map (zip xs ys) -// rotate - -#endif //FPMPI_LIBRARY_H diff --git a/lib/tests/tests.c b/lib/tests/tests.c deleted file mode 100644 index 6edb8afb08dc92d1aeadf06b294ffb85a281bef9..0000000000000000000000000000000000000000 --- a/lib/tests/tests.c +++ /dev/null @@ -1,453 +0,0 @@ -#include <mpi.h> -#include <stdlib.h> -#include <stdio.h> -#include <assert.h> -#include "../fpmpi.h" -#include "../fp.h" - -#define TEST_MAP 1 -#define TEST_FILTER 2 -#define TEST_REDUCE 3 -#define TEST_FIND 4 -#define TEST_FOLD_LEFT 5 -#define TEST_FOLD_RIGHT 6 -#define TEST_SORT 7 -#define TEST_SCAN 8 -#define TEST_IOTA 9 -#define TEST_ZIP 10 - -#define N 12 - -void *map_mul_int(void *element) { - int *result = calloc(1, sizeof(int)); - *result = (*(int *) (element)) * 2; - return (void *) result; -} - -void *map_mul_int_double(void *element) { - double *result = calloc(1, sizeof(double)); - *result = (*(int *) (element)) * 2.0; - return (void *) result; -} - -bool filter_only_even(void *element) { - int element32 = *(int *) element; - return element32 % 2 == 0; -} - -bool find_divide_by_three(void *element) { - int element32 = *(int *) element; - return element32 % 3 == 0; -} - -bool find_fifty(void *element) { - int element32 = *(int *) element; - return element32 == 50; -} - -void reduce_sum(void *accumulator, void *current_value) { - int *accumulator32 = (int *) accumulator; - int current_value32 = *(int *) current_value; - *accumulator32 = (*accumulator32 + current_value32); -} - -void fold_left_sub(void *accumulator, 
void *current_value) { - int *accumulator32 = (int *) accumulator; - int current_value32 = *(int *) current_value; - *accumulator32 = (*accumulator32 - current_value32); -} - -void fold_left_sub_int8_int(void *accumulator, void *current_value) { - int *accumulator32 = (int *) accumulator; - int current_value32 = (int) (*(char *) current_value); - *accumulator32 = (*accumulator32 - current_value32); -} - -void fold_right_sub(void *current_value, void *accumulator) { - int *accumulator32 = (int *) accumulator; - int current_value32 = *(int *) current_value; - *accumulator32 = (current_value32 - *accumulator32); -} - -void fold_right_sub_int8_int(void *current_value, void *accumulator) { - int *accumulator32 = (int *) accumulator; - int current_value32 = (int) (*(char *) current_value); - *accumulator32 = (current_value32 - *accumulator32); -} - -void *scan_add(void *accumulator, void *current_value) { - int *accumulator32 = (int *) accumulator; - int current_value32 = *(int *) current_value; - int *new_accumulator = calloc(1, sizeof(int)); - *new_accumulator = (*accumulator32 + current_value32); - return new_accumulator; -} - -void tests_map(int my_rank) { - int array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - { - printf("Test map 1...\n"); - fpmpi_result_t result = map(array, N, FPMPI_INT32, FPMPI_INT32, map_mul_int, MPI_COMM_WORLD); - if (my_rank == FPMPI_ROOT_RANK) { - int expected_content[N] = {2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24}; - int *content = result.content; - - assert(result.content != NULL); - assert(result.count == N); - for (int i = 0; i < result.count; ++i) { - assert(content[i] == expected_content[i]); - } - free(result.content); - } else { - assert(result.content == NULL); - assert(result.count == 0); - } - } - - { - printf("Test map 2...\n"); - fpmpi_result_t result = map(array, N, FPMPI_INT32, FPMPI_DOUBLE, map_mul_int_double, MPI_COMM_WORLD); - if (my_rank == FPMPI_ROOT_RANK) { - double expected_content[N] = {2.0, 4.0, 6.0, 8.0, 10.0, 
12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0}; - double *content = result.content; - - assert(result.content != NULL); - assert(result.count == N); - for (int i = 0; i < result.count; ++i) { - assert(content[i] == expected_content[i]); - } - - free(result.content); - } else { - assert(result.content == NULL); - assert(result.count == 0); - } - } -} - -void tests_filter(int my_rank) { - int array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - { - printf("Test filter 1...\n"); - fpmpi_result_t result = filter(array, N, FPMPI_INT32, filter_only_even, MPI_COMM_WORLD); - if (my_rank == FPMPI_ROOT_RANK) { - int expected_content[N / 2] = {2, 4, 6, 8, 10, 12}; - int *content = result.content; - - assert(result.count == 6); - for (int i = 0; i < result.count; ++i) { - assert(content[i] == expected_content[i]); - } - free(result.content); - } else { - assert(result.content == NULL); - assert(result.count == 0); - } - } -} - -void tests_reduce(int my_rank) { - int array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - { - printf("Test reduce 1...\n"); - fpmpi_result_t result = reduce(array, N, FPMPI_INT32, reduce_sum, MPI_COMM_WORLD); - if (my_rank == FPMPI_ROOT_RANK) { - int expected_content = 468; - int *content = result.content; - - assert(result.count == 1); - assert(*content == expected_content); - free(result.content); - } else { - assert(result.content == NULL); - assert(result.count == 0); - } - } -} - -void tests_find(int my_rank) { - int array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - { - printf("Test find 1...\n"); - fpmpi_result_t result = find(array, N, FPMPI_INT32, find_divide_by_three, MPI_COMM_WORLD); - if (my_rank == FPMPI_ROOT_RANK) { - int expected_content = 3; - int *content = result.content; - - assert(result.count == 1); - assert(*content == expected_content); - free(result.content); - } else { - assert(result.content == NULL); - assert(result.count == 0); - } - } - - { - printf("Test find 2...\n"); - fpmpi_result_t result = find(array, N, FPMPI_INT32, 
find_fifty, MPI_COMM_WORLD); - if (my_rank == FPMPI_ROOT_RANK) { - int *content = result.content; - - assert(result.count == 0); - assert(content == NULL); - free(result.content); - } else { - assert(result.content == NULL); - assert(result.count == 0); - } - } -} - -void tests_fold_left(int my_rank) { - { - printf("Test fold_left 1...\n"); - int array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - int initial_value = 0; - fpmpi_result_t result = fold_left(array, N, FPMPI_INT32, FPMPI_INT32, fold_left_sub, &initial_value, - MPI_COMM_WORLD); - if (my_rank == FPMPI_ROOT_RANK) { - int expected_content = -468; - int *content = result.content; - - assert(result.count == 1); - assert(*content == expected_content); - free(result.content); - } else { - assert(result.content == NULL); - assert(result.count == 0); - } - } - { - printf("Test fold_left 2...\n"); - char array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - double initial_value = 0; - fpmpi_result_t result = fold_left(array, N, FPMPI_INT8, FPMPI_INT32, fold_left_sub_int8_int, &initial_value, - MPI_COMM_WORLD); - if (my_rank == FPMPI_ROOT_RANK) { - double expected_content = -468.0; - int *content = result.content; - - assert(result.count == 1); - assert(*content == expected_content); - free(result.content); - } else { - assert(result.content == NULL); - assert(result.count == 0); - } - } -} - -void tests_fold_right(int my_rank) { - { - printf("Test fold_left 1...\n"); - int array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - int initial_value = 0; - fpmpi_result_t result = fold_right(array, N, FPMPI_INT32, FPMPI_INT32, fold_right_sub, &initial_value, - MPI_COMM_WORLD); - if (my_rank == FPMPI_ROOT_RANK) { - int expected_content = -36; - int *content = result.content; - - assert(result.count == 1); - assert(*content == expected_content); - free(result.content); - } else { - assert(result.content == NULL); - assert(result.count == 0); - } - } - - { - printf("Test fold_left 2...\n"); - char array[N] = {1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12}; - int initial_value = 0; - fpmpi_result_t result = fold_right(array, N, FPMPI_INT8, FPMPI_INT32, fold_right_sub_int8_int, &initial_value, - MPI_COMM_WORLD); - if (my_rank == FPMPI_ROOT_RANK) { - int expected_content = -36; - int *content = result.content; - - assert(result.count == 1); - assert(*content == expected_content); - free(result.content); - } else { - assert(result.content == NULL); - assert(result.count == 0); - } - } -} - -bool sort_asc(void *left, void *right) { - int left32 = *(int *) left; - int right32 = *(int *) right; - return left32 < right32; -} - -bool sort_dsc(void *left, void *right) { - int left32 = *(int *) left; - int right32 = *(int *) right; - return left32 > right32; -} - -void tests_sort(int my_rank) { - int array[N] = {18, 15, 83, 56, 41, 100, 71, 7, 69, 23, 36, 77}; - { - printf("Test sort 1...\n"); - fpmpi_result_t result = sort(array, N, FPMPI_INT32, FPMPI_MERGE_SORT, sort_asc, MPI_COMM_WORLD); - if (my_rank == FPMPI_ROOT_RANK) { - int expected_content[N] = {7, 15, 18, 23, 36, 41, 56, 69, 71, 77, 83, 100}; - int *content = result.content; - - assert(result.count == N); - for (int i = 0; i < result.count; ++i) { - assert(content[i] == expected_content[i]); - } - free(result.content); - } else { - assert(result.content == NULL); - assert(result.count == N); - } - } - { - printf("Test sort 2...\n"); - fpmpi_result_t result = sort(array, N, FPMPI_INT32, FPMPI_MERGE_SORT, sort_dsc, MPI_COMM_WORLD); - if (my_rank == FPMPI_ROOT_RANK) { - int expected_content[N] = {100, 83, 77, 71, 69, 56, 41, 36, 23, 18, 15, 7}; - int *content = result.content; - - assert(result.count == N); - for (int i = 0; i < result.count; ++i) { - assert(content[i] == expected_content[i]); - } - free(result.content); - } else { - assert(result.content == NULL); - assert(result.count == N); - } - } -} - -void tests_scan(int my_rank) { - { - printf("Test scan 1...\n"); - int array[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - int 
initial_value = 0; - fpmpi_result_t result = scan(array, N, FPMPI_INT32, FPMPI_INT32, scan_add, &initial_value, MPI_COMM_WORLD); - if (my_rank == FPMPI_ROOT_RANK) { - int expected_content = 468; - int *content = result.content; - - assert(result.count == 73); - assert(content[result.count - 1] == expected_content); - free(result.content); - } else { - assert(result.content == NULL); - assert(result.count == 0); - } - } -} - -void tests_iota(int my_rank) { - { - printf("Test iota 1...\n"); - fpmpi_result_t result = iota(N, MPI_COMM_WORLD); - if (my_rank == FPMPI_ROOT_RANK) { - int expected_content[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - int *content = result.content; - - assert(result.count == N); - for (int i = 0; i < result.count; ++i) { - assert(content[i] == expected_content[i]); - } - free(result.content); - } else { - assert(result.content == NULL); - assert(result.count == 0); - } - } -} - -//void tests_zip(int my_rank) { -// int array1[N] = {18, 15, 83, 56, 41, 100, 71, 7, 69, 23, 36, 77}; -// char array2[N] = {5, 91, 70, 96, 9, 98, 37, 1, 13, 3, 42, 7}; -// { -// printf("Test zip 1...\n"); -// fpmpi_result_t result = zip(array1, array2, N, FPMPI_INT32, FPMPI_INT8, MPI_COMM_WORLD); -// if (my_rank == FPMPI_ROOT_RANK) { -// tuple2_t expected_content[N] = { -// {.first = &array1[0], .second = &array2[0]}, -// {.first = &array1[1], .second = &array2[1]}, -// {.first = &array1[2], .second = &array2[2]}, -// {.first = &array1[3], .second = &array2[3]}, -// {.first = &array1[4], .second = &array2[4]}, -// {.first = &array1[5], .second = &array2[5]}, -// {.first = &array1[6], .second = &array2[6]}, -// {.first = &array1[7], .second = &array2[7]}, -// {.first = &array1[8], .second = &array2[8]}, -// {.first = &array1[9], .second = &array2[9]}, -// {.first = &array1[10], .second = &array2[10]}, -// {.first = &array1[11], .second = &array2[11]}, -// }; -// tuple2_t *content = result.content; -// assert(result.count == N); -// for (int i = 0; i < result.count; 
++i) { -// assert(content[i].first == expected_content[i].first); -// assert(content[i].second == expected_content[i].second); -// } -// free(result.content); -// } else { -// assert(result.content == NULL); -// assert(result.count == 0); -// } -// } -//} - -int main(int argc, char *argv[]) { - MPI_Init(&argc, &argv); - - int my_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - - for (int i = TEST_MAP; i <= TEST_ZIP; ++i) { - switch (i) { - case TEST_MAP: - tests_map(my_rank); - break; - case TEST_FILTER: - tests_filter(my_rank); - break; - case TEST_REDUCE: - tests_reduce(my_rank); - break; - case TEST_FIND: - tests_find(my_rank); - break; - case TEST_FOLD_LEFT: - tests_fold_left(my_rank); - break; - case TEST_FOLD_RIGHT: - tests_fold_right(my_rank); - break; - case TEST_SORT: - tests_sort(my_rank); - break; - case TEST_SCAN: - tests_scan(my_rank); - break; - case TEST_IOTA: - tests_iota(my_rank); - break; - case TEST_ZIP: -// tests_zip(my_rank); - break; - default: - MPI_Finalize(); - exit(0); - } - MPI_Barrier(MPI_COMM_WORLD); - } - - MPI_Finalize(); - return 0; -}