diff --git a/include/fmpi_data.h b/include/fmpi_data.h index 562a73dc407c6ecd56f4404d0b332157131bf4e4..478bc13f1f1fc556ea3d2d1eed47715b587000e8 100644 --- a/include/fmpi_data.h +++ b/include/fmpi_data.h @@ -47,7 +47,8 @@ typedef struct fmpi_data { size_t size; //!< Total size `(cnt * type.size)`. size_t dim_len[FMPI_DIM_MAX]; //!< Length of each dimension. size_t dim_cnt; //!< Number of dimensions. - void * raw; //!< Pointer to the start of the data. + void * raw; //!< Pointer to user data. + void * gpu; //!< Pointer to data returned by the GPU. } fmpi_data; /*============================================================================== PUBLIC FUNCTION diff --git a/include/fmpi_task.h b/include/fmpi_task.h index 74e091cdf0648e2d558b0f863f989ea91c72acb4..223e2de84d05c6f49c62a8a8455e0ca1534a3986 100644 --- a/include/fmpi_task.h +++ b/include/fmpi_task.h @@ -59,7 +59,6 @@ typedef struct fmpi_task_args { struct fmpi_data in[FMPI_TASK_ARGS_MAX]; //!< TODO struct fmpi_data out; //!< TODO - void * out_raw; size_t cnt; //!< TODO } fmpi_task_args; /*------------------------------------------------------------------------------ diff --git a/include/internal/generic/fmpi_task_generic.h b/include/internal/generic/fmpi_task_generic.h index cfcac37b3f62a5a9dcf005e8cf6e48440e3de99c..f1624e7dc1155b6f3b2789b2ed6aa96118566cb9 100644 --- a/include/internal/generic/fmpi_task_generic.h +++ b/include/internal/generic/fmpi_task_generic.h @@ -51,15 +51,15 @@ _Pragma("GCC diagnostic ignored \"-Wcast-qual\"")\ if(args->out.type.derived == FMPI_TYPE_ARRAY) { \ if(args->out.dim_cnt == 1) { \ CPL_MAP_FIXED(FMPI_PRIV_TASK_RET_FUNC, CPL_EMPTY, \ - (FUNC, N, 1, ctx->fut->ctx, args->out_raw, args->in), FMPI_TYPE_REAL) \ + (FUNC, N, 1, ctx->fut->ctx, args->out.gpu, args->in), FMPI_TYPE_REAL) \ } \ if(args->out.dim_cnt == 2) { \ CPL_MAP_FIXED(FMPI_PRIV_TASK_RET_FUNC, CPL_EMPTY, \ - (FUNC, N, 2, ctx->fut->ctx, args->out_raw, args->in), FMPI_TYPE_REAL) \ + (FUNC, N, 2, ctx->fut->ctx, args->out.gpu, args->in), FMPI_TYPE_REAL) \ } \ if(args->out.dim_cnt == 3) { \ CPL_MAP_FIXED(FMPI_PRIV_TASK_RET_FUNC, CPL_EMPTY, \ - (FUNC, N, 3, ctx->fut->ctx, args->out_raw, args->in), FMPI_TYPE_REAL) \ + (FUNC, N, 3, ctx->fut->ctx, args->out.gpu, args->in), FMPI_TYPE_REAL) \ } \ } \ return futhark_entry_##FUNC(ctx->fut->ctx, args->out.raw, FMPI_PRIV_TASK_ARGS_##N(args->in)); \ @@ -80,7 +80,6 @@ _Pragma("GCC diagnostic warning \"-Wincompatible-pointer-types\"")\ #define FMPI_PRIV_TASK_REGISTER_1(FUNC, TYPE, ctx, stencil, arg_out) \ fmpi_task_register_##TYPE((ctx), FUNC##_0, #FUNC, (stencil), &(struct fmpi_task_args){ \ .out = (arg_out), \ - .out_raw = NULL, \ .cnt = 0 \ }) @@ -88,42 +87,41 @@ _Pragma("GCC diagnostic warning \"-Wincompatible-pointer-types\"")\ fmpi_task_register_##TYPE((ctx), FUNC##_##N, #FUNC, (stencil), &(struct fmpi_task_args){ \ .in = {__VA_ARGS__}, \ .out = (arg_out), \ - .out_raw = NULL, \ .cnt = N \ }) -#define FMPI_PRIV_TASK_ARGS_1(args_in) (args_in)[0].raw -#define FMPI_PRIV_TASK_ARGS_2(args_in) FMPI_PRIV_TASK_ARGS_1(args_in), (args_in)[1].raw -#define FMPI_PRIV_TASK_ARGS_3(args_in) FMPI_PRIV_TASK_ARGS_2(args_in), (args_in)[2].raw -#define FMPI_PRIV_TASK_ARGS_4(args_in) FMPI_PRIV_TASK_ARGS_3(args_in), (args_in)[3].raw -#define FMPI_PRIV_TASK_ARGS_5(args_in) FMPI_PRIV_TASK_ARGS_4(args_in), (args_in)[4].raw -#define FMPI_PRIV_TASK_ARGS_6(args_in) FMPI_PRIV_TASK_ARGS_5(args_in), (args_in)[5].raw -#define FMPI_PRIV_TASK_ARGS_7(args_in) FMPI_PRIV_TASK_ARGS_6(args_in), (args_in)[6].raw -#define FMPI_PRIV_TASK_ARGS_8(args_in) FMPI_PRIV_TASK_ARGS_7(args_in), (args_in)[7].raw -#define FMPI_PRIV_TASK_ARGS_9(args_in) FMPI_PRIV_TASK_ARGS_8(args_in), (args_in)[8].raw -#define FMPI_PRIV_TASK_ARGS_10(args_in) FMPI_PRIV_TASK_ARGS_9(args_in), (args_in)[9].raw -#define FMPI_PRIV_TASK_ARGS_11(args_in) FMPI_PRIV_TASK_ARGS_10(args_in), (args_in)[10].raw -#define FMPI_PRIV_TASK_ARGS_12(args_in) FMPI_PRIV_TASK_ARGS_11(args_in), (args_in)[11].raw -#define FMPI_PRIV_TASK_ARGS_13(args_in) FMPI_PRIV_TASK_ARGS_12(args_in), (args_in)[12].raw -#define FMPI_PRIV_TASK_ARGS_14(args_in) FMPI_PRIV_TASK_ARGS_13(args_in), (args_in)[13].raw -#define FMPI_PRIV_TASK_ARGS_15(args_in) FMPI_PRIV_TASK_ARGS_14(args_in), (args_in)[14].raw -#define FMPI_PRIV_TASK_ARGS_16(args_in) FMPI_PRIV_TASK_ARGS_15(args_in), (args_in)[15].raw -#define FMPI_PRIV_TASK_ARGS_17(args_in) FMPI_PRIV_TASK_ARGS_16(args_in), (args_in)[16].raw -#define FMPI_PRIV_TASK_ARGS_18(args_in) FMPI_PRIV_TASK_ARGS_17(args_in), (args_in)[17].raw -#define FMPI_PRIV_TASK_ARGS_19(args_in) FMPI_PRIV_TASK_ARGS_18(args_in), (args_in)[18].raw -#define FMPI_PRIV_TASK_ARGS_20(args_in) FMPI_PRIV_TASK_ARGS_19(args_in), (args_in)[19].raw -#define FMPI_PRIV_TASK_ARGS_21(args_in) FMPI_PRIV_TASK_ARGS_20(args_in), (args_in)[20].raw -#define FMPI_PRIV_TASK_ARGS_22(args_in) FMPI_PRIV_TASK_ARGS_21(args_in), (args_in)[21].raw -#define FMPI_PRIV_TASK_ARGS_23(args_in) FMPI_PRIV_TASK_ARGS_22(args_in), (args_in)[22].raw -#define FMPI_PRIV_TASK_ARGS_24(args_in) FMPI_PRIV_TASK_ARGS_23(args_in), (args_in)[23].raw -#define FMPI_PRIV_TASK_ARGS_25(args_in) FMPI_PRIV_TASK_ARGS_24(args_in), (args_in)[24].raw -#define FMPI_PRIV_TASK_ARGS_26(args_in) FMPI_PRIV_TASK_ARGS_25(args_in), (args_in)[25].raw -#define FMPI_PRIV_TASK_ARGS_27(args_in) FMPI_PRIV_TASK_ARGS_26(args_in), (args_in)[26].raw -#define FMPI_PRIV_TASK_ARGS_28(args_in) FMPI_PRIV_TASK_ARGS_27(args_in), (args_in)[27].raw -#define FMPI_PRIV_TASK_ARGS_29(args_in) FMPI_PRIV_TASK_ARGS_28(args_in), (args_in)[28].raw -#define FMPI_PRIV_TASK_ARGS_30(args_in) FMPI_PRIV_TASK_ARGS_29(args_in), (args_in)[29].raw -#define FMPI_PRIV_TASK_ARGS_31(args_in) FMPI_PRIV_TASK_ARGS_30(args_in), (args_in)[30].raw -#define FMPI_PRIV_TASK_ARGS_32(args_in) FMPI_PRIV_TASK_ARGS_31(args_in), (args_in)[31].raw +#define FMPI_PRIV_TASK_ARGS_1(args_in) (args_in)[0].gpu +#define FMPI_PRIV_TASK_ARGS_2(args_in) FMPI_PRIV_TASK_ARGS_1(args_in), (args_in)[1].gpu +#define FMPI_PRIV_TASK_ARGS_3(args_in) FMPI_PRIV_TASK_ARGS_2(args_in), (args_in)[2].gpu +#define FMPI_PRIV_TASK_ARGS_4(args_in) FMPI_PRIV_TASK_ARGS_3(args_in), (args_in)[3].gpu +#define FMPI_PRIV_TASK_ARGS_5(args_in) FMPI_PRIV_TASK_ARGS_4(args_in), (args_in)[4].gpu +#define FMPI_PRIV_TASK_ARGS_6(args_in) FMPI_PRIV_TASK_ARGS_5(args_in), (args_in)[5].gpu +#define FMPI_PRIV_TASK_ARGS_7(args_in) FMPI_PRIV_TASK_ARGS_6(args_in), (args_in)[6].gpu +#define FMPI_PRIV_TASK_ARGS_8(args_in) FMPI_PRIV_TASK_ARGS_7(args_in), (args_in)[7].gpu +#define FMPI_PRIV_TASK_ARGS_9(args_in) FMPI_PRIV_TASK_ARGS_8(args_in), (args_in)[8].gpu +#define FMPI_PRIV_TASK_ARGS_10(args_in) FMPI_PRIV_TASK_ARGS_9(args_in), (args_in)[9].gpu +#define FMPI_PRIV_TASK_ARGS_11(args_in) FMPI_PRIV_TASK_ARGS_10(args_in), (args_in)[10].gpu +#define FMPI_PRIV_TASK_ARGS_12(args_in) FMPI_PRIV_TASK_ARGS_11(args_in), (args_in)[11].gpu +#define FMPI_PRIV_TASK_ARGS_13(args_in) FMPI_PRIV_TASK_ARGS_12(args_in), (args_in)[12].gpu +#define FMPI_PRIV_TASK_ARGS_14(args_in) FMPI_PRIV_TASK_ARGS_13(args_in), (args_in)[13].gpu +#define FMPI_PRIV_TASK_ARGS_15(args_in) FMPI_PRIV_TASK_ARGS_14(args_in), (args_in)[14].gpu +#define FMPI_PRIV_TASK_ARGS_16(args_in) FMPI_PRIV_TASK_ARGS_15(args_in), (args_in)[15].gpu +#define FMPI_PRIV_TASK_ARGS_17(args_in) FMPI_PRIV_TASK_ARGS_16(args_in), (args_in)[16].gpu +#define FMPI_PRIV_TASK_ARGS_18(args_in) FMPI_PRIV_TASK_ARGS_17(args_in), (args_in)[17].gpu +#define FMPI_PRIV_TASK_ARGS_19(args_in) FMPI_PRIV_TASK_ARGS_18(args_in), (args_in)[18].gpu +#define FMPI_PRIV_TASK_ARGS_20(args_in) FMPI_PRIV_TASK_ARGS_19(args_in), (args_in)[19].gpu +#define FMPI_PRIV_TASK_ARGS_21(args_in) FMPI_PRIV_TASK_ARGS_20(args_in), (args_in)[20].gpu +#define FMPI_PRIV_TASK_ARGS_22(args_in) FMPI_PRIV_TASK_ARGS_21(args_in), (args_in)[21].gpu +#define FMPI_PRIV_TASK_ARGS_23(args_in) FMPI_PRIV_TASK_ARGS_22(args_in), (args_in)[22].gpu +#define FMPI_PRIV_TASK_ARGS_24(args_in) FMPI_PRIV_TASK_ARGS_23(args_in), (args_in)[23].gpu +#define FMPI_PRIV_TASK_ARGS_25(args_in) FMPI_PRIV_TASK_ARGS_24(args_in), (args_in)[24].gpu +#define FMPI_PRIV_TASK_ARGS_26(args_in) FMPI_PRIV_TASK_ARGS_25(args_in), (args_in)[25].gpu +#define FMPI_PRIV_TASK_ARGS_27(args_in) FMPI_PRIV_TASK_ARGS_26(args_in), (args_in)[26].gpu +#define FMPI_PRIV_TASK_ARGS_28(args_in) FMPI_PRIV_TASK_ARGS_27(args_in), (args_in)[27].gpu +#define FMPI_PRIV_TASK_ARGS_29(args_in) FMPI_PRIV_TASK_ARGS_28(args_in), (args_in)[28].gpu +#define FMPI_PRIV_TASK_ARGS_30(args_in) FMPI_PRIV_TASK_ARGS_29(args_in), (args_in)[29].gpu +#define FMPI_PRIV_TASK_ARGS_31(args_in) FMPI_PRIV_TASK_ARGS_30(args_in), (args_in)[30].gpu +#define FMPI_PRIV_TASK_ARGS_32(args_in) FMPI_PRIV_TASK_ARGS_31(args_in), (args_in)[31].gpu #define FMPI_PRIV_TASK_REGISTER_2(...) FMPI_PRIV_TASK_REGISTER_N(1, __VA_ARGS__) #define FMPI_PRIV_TASK_REGISTER_3(...) FMPI_PRIV_TASK_REGISTER_N(2, __VA_ARGS__) diff --git a/src/fmpi_data.c b/src/fmpi_data.c index a3909835efbc9174fe64a5ed88204fd2636b4c9d..63b4744b377b10f7d6a8a6b04262c77dc0a47293 100644 --- a/src/fmpi_data.c +++ b/src/fmpi_data.c @@ -49,7 +49,8 @@ struct fmpi_data fmpi_data_out_##T( \ .size = sizeof(T), \ .dim_len = {0, 0, 0}, \ .dim_cnt = 0, \ - .raw = data \ + .raw = data, \ + .gpu = NULL \ }; \ } @@ -74,7 +75,8 @@ struct fmpi_data fmpi_data_##D##d_in_##T( \ .size = cnt * sizeof(T), \ .dim_len = {x, y, z}, \ .dim_cnt = (D), \ - .raw = data \ + .raw = data, \ + .gpu = NULL \ }; \ } \ struct fmpi_data fmpi_data_##D##d_out_##T( \ @@ -95,7 +97,8 @@ struct fmpi_data fmpi_data_##D##d_out_##T( \ .size = cnt * sizeof(T), \ .dim_len = {x, y, z}, \ .dim_cnt = (D), \ - .raw = data \ + .raw = data, \ + .gpu = NULL \ }; \ } diff --git a/src/fmpi_task.c b/src/fmpi_task.c index a0147c107c1903f67ab39d56d74aa61ccfde8fa0..11477f8f223a9d019dc0082bc66db0fea133e375 100644 --- a/src/fmpi_task.c +++ b/src/fmpi_task.c @@ -65,20 +65,21 @@ struct fmpi_task fmpi_task_register_sync( FMPI_RAISE_ERROR(ctx->err_handler, "FMPI", "fmpi_new_domain() failed!"); continue; } - //! @todo Could fmpi_futhark_new_data_async() could be called here? - void * data = fmpi_futhark_new_data_sync( + //! @todo Could fmpi_futhark_new_data_async() be called here instead? + void * gpu_data = fmpi_futhark_new_data_sync( ctx->fut, task.domains[i].parts[rank].raw, task.domains[i].data.type.base, task.domains[i].data.dim_cnt, task.domains[i].parts[rank].dim_len[0], task.domains[i].parts[rank].dim_len[1], task.domains[i].parts[rank].dim_len[2] ); - if(data == NULL) { + if(gpu_data == NULL) { FMPI_RAISE_ERROR(ctx->err_handler, "FMPI", "fmpi_futhark_new_data_sync() failed!" ); } - task.args.in[i].raw = data; + task.domains[i].parts[rank].gpu = gpu_data; + task.args.in[i].gpu = gpu_data; } ctx->tasks[ctx->task_cnt++] = task; fmpi_futhark_sync(ctx->fut); @@ -110,19 +111,20 @@ struct fmpi_task fmpi_task_register_async( FMPI_RAISE_ERROR(ctx->err_handler, "FMPI", "fmpi_new_domain() failed!"); continue; } - void * data = fmpi_futhark_new_data_async( + void * gpu_data = fmpi_futhark_new_data_async( ctx->fut, task.domains[i].parts[rank].raw, task.domains[i].data.type.base, task.domains[i].data.dim_cnt, task.domains[i].parts[rank].dim_len[0], task.domains[i].parts[rank].dim_len[1], task.domains[i].parts[rank].dim_len[2] ); - if(data == NULL) { + if(gpu_data == NULL) { FMPI_RAISE_ERROR(ctx->err_handler, "FMPI", "fmpi_futhark_new_data_async() failed!" ); } - task.args.in[i].raw = data; + task.domains[i].parts[rank].gpu = gpu_data; + task.args.in[i].gpu = gpu_data; } ctx->tasks[ctx->task_cnt++] = task; return task; @@ -140,7 +142,7 @@ int fmpi_task_run_sync( fmpi_futhark_check_error(ctx->fut, "task->func"); if(task->args.out.type.derived == FMPI_TYPE_ARRAY) { void * out = fmpi_futhark_get_data_sync( - ctx->fut, task->args.out_raw, task->args.out.raw, + ctx->fut, task->args.out.gpu, task->args.out.raw, task->args.out.type.base, task->args.out.dim_cnt ); if(out == NULL) { @@ -149,7 +151,7 @@ int fmpi_task_run_sync( ); } const int err = fmpi_futhark_free_data_sync( - ctx->fut, task->args.out_raw, task->args.out.type.base, + ctx->fut, task->args.out.gpu, task->args.out.type.base, task->args.out.dim_cnt ); if(err != 0) {