diff --git a/include/fmpi_data.h b/include/fmpi_data.h
index 562a73dc407c6ecd56f4404d0b332157131bf4e4..478bc13f1f1fc556ea3d2d1eed47715b587000e8 100644
--- a/include/fmpi_data.h
+++ b/include/fmpi_data.h
@@ -47,7 +47,8 @@ typedef struct fmpi_data {
     size_t size;                  //!< Total size `(cnt * type.size)`.
     size_t dim_len[FMPI_DIM_MAX]; //!< Length of each dimension.
     size_t dim_cnt;               //!< Number of dimensions.
-    void * raw;                   //!< Pointer to the start of the data.
+    void * raw;                   //!< Pointer to user data.
+    void * gpu;                   //!< Futhark-side pointer for this data (e.g. from fmpi_futhark_new_data_*()); NULL until set.
 } fmpi_data;
 /*==============================================================================
     PUBLIC FUNCTION
diff --git a/include/fmpi_task.h b/include/fmpi_task.h
index 74e091cdf0648e2d558b0f863f989ea91c72acb4..223e2de84d05c6f49c62a8a8455e0ca1534a3986 100644
--- a/include/fmpi_task.h
+++ b/include/fmpi_task.h
@@ -59,7 +59,6 @@
 typedef struct fmpi_task_args {
     struct fmpi_data in[FMPI_TASK_ARGS_MAX]; //!< TODO
     struct fmpi_data out;                        //!< TODO
-    void * out_raw;
     size_t cnt;                                  //!< TODO
 } fmpi_task_args;
 /*------------------------------------------------------------------------------
diff --git a/include/internal/generic/fmpi_task_generic.h b/include/internal/generic/fmpi_task_generic.h
index cfcac37b3f62a5a9dcf005e8cf6e48440e3de99c..f1624e7dc1155b6f3b2789b2ed6aa96118566cb9 100644
--- a/include/internal/generic/fmpi_task_generic.h
+++ b/include/internal/generic/fmpi_task_generic.h
@@ -51,15 +51,15 @@ _Pragma("GCC diagnostic ignored \"-Wcast-qual\"")\
     if(args->out.type.derived == FMPI_TYPE_ARRAY) { \
         if(args->out.dim_cnt == 1) { \
             CPL_MAP_FIXED(FMPI_PRIV_TASK_RET_FUNC, CPL_EMPTY, \
-                (FUNC, N, 1, ctx->fut->ctx, args->out_raw, args->in), FMPI_TYPE_REAL) \
+                (FUNC, N, 1, ctx->fut->ctx, args->out.gpu, args->in), FMPI_TYPE_REAL) \
         } \
         if(args->out.dim_cnt == 2) { \
             CPL_MAP_FIXED(FMPI_PRIV_TASK_RET_FUNC, CPL_EMPTY, \
-                (FUNC, N, 2, ctx->fut->ctx, args->out_raw, args->in), FMPI_TYPE_REAL) \
+                (FUNC, N, 2, ctx->fut->ctx, args->out.gpu, args->in), FMPI_TYPE_REAL) \
         } \
         if(args->out.dim_cnt == 3) { \
             CPL_MAP_FIXED(FMPI_PRIV_TASK_RET_FUNC, CPL_EMPTY, \
-                (FUNC, N, 3, ctx->fut->ctx, args->out_raw, args->in), FMPI_TYPE_REAL) \
+                (FUNC, N, 3, ctx->fut->ctx, args->out.gpu, args->in), FMPI_TYPE_REAL) \
         } \
     } \
     return futhark_entry_##FUNC(ctx->fut->ctx, args->out.raw, FMPI_PRIV_TASK_ARGS_##N(args->in)); \
@@ -80,7 +80,6 @@ _Pragma("GCC diagnostic warning \"-Wincompatible-pointer-types\"")\
 #define FMPI_PRIV_TASK_REGISTER_1(FUNC, TYPE, ctx, stencil, arg_out) \
     fmpi_task_register_##TYPE((ctx), FUNC##_0, #FUNC, (stencil), &(struct fmpi_task_args){ \
         .out = (arg_out), \
-        .out_raw = NULL, \
         .cnt = 0 \
     })
 
@@ -88,42 +87,41 @@ _Pragma("GCC diagnostic warning \"-Wincompatible-pointer-types\"")\
     fmpi_task_register_##TYPE((ctx), FUNC##_##N, #FUNC, (stencil), &(struct fmpi_task_args){ \
         .in = {__VA_ARGS__}, \
         .out = (arg_out), \
-        .out_raw = NULL, \
         .cnt = N \
     })
 
-#define FMPI_PRIV_TASK_ARGS_1(args_in) (args_in)[0].raw
-#define FMPI_PRIV_TASK_ARGS_2(args_in) FMPI_PRIV_TASK_ARGS_1(args_in), (args_in)[1].raw
-#define FMPI_PRIV_TASK_ARGS_3(args_in) FMPI_PRIV_TASK_ARGS_2(args_in), (args_in)[2].raw
-#define FMPI_PRIV_TASK_ARGS_4(args_in) FMPI_PRIV_TASK_ARGS_3(args_in), (args_in)[3].raw
-#define FMPI_PRIV_TASK_ARGS_5(args_in) FMPI_PRIV_TASK_ARGS_4(args_in), (args_in)[4].raw
-#define FMPI_PRIV_TASK_ARGS_6(args_in) FMPI_PRIV_TASK_ARGS_5(args_in), (args_in)[5].raw
-#define FMPI_PRIV_TASK_ARGS_7(args_in) FMPI_PRIV_TASK_ARGS_6(args_in), (args_in)[6].raw
-#define FMPI_PRIV_TASK_ARGS_8(args_in) FMPI_PRIV_TASK_ARGS_7(args_in), (args_in)[7].raw
-#define FMPI_PRIV_TASK_ARGS_9(args_in) FMPI_PRIV_TASK_ARGS_8(args_in), (args_in)[8].raw
-#define FMPI_PRIV_TASK_ARGS_10(args_in) FMPI_PRIV_TASK_ARGS_9(args_in), (args_in)[9].raw
-#define FMPI_PRIV_TASK_ARGS_11(args_in) FMPI_PRIV_TASK_ARGS_10(args_in), (args_in)[10].raw
-#define FMPI_PRIV_TASK_ARGS_12(args_in) FMPI_PRIV_TASK_ARGS_11(args_in), (args_in)[11].raw
-#define FMPI_PRIV_TASK_ARGS_13(args_in) FMPI_PRIV_TASK_ARGS_12(args_in), (args_in)[12].raw
-#define FMPI_PRIV_TASK_ARGS_14(args_in) FMPI_PRIV_TASK_ARGS_13(args_in), (args_in)[13].raw
-#define FMPI_PRIV_TASK_ARGS_15(args_in) FMPI_PRIV_TASK_ARGS_14(args_in), (args_in)[14].raw
-#define FMPI_PRIV_TASK_ARGS_16(args_in) FMPI_PRIV_TASK_ARGS_15(args_in), (args_in)[15].raw
-#define FMPI_PRIV_TASK_ARGS_17(args_in) FMPI_PRIV_TASK_ARGS_16(args_in), (args_in)[16].raw
-#define FMPI_PRIV_TASK_ARGS_18(args_in) FMPI_PRIV_TASK_ARGS_17(args_in), (args_in)[17].raw
-#define FMPI_PRIV_TASK_ARGS_19(args_in) FMPI_PRIV_TASK_ARGS_18(args_in), (args_in)[18].raw
-#define FMPI_PRIV_TASK_ARGS_20(args_in) FMPI_PRIV_TASK_ARGS_19(args_in), (args_in)[19].raw
-#define FMPI_PRIV_TASK_ARGS_21(args_in) FMPI_PRIV_TASK_ARGS_20(args_in), (args_in)[20].raw
-#define FMPI_PRIV_TASK_ARGS_22(args_in) FMPI_PRIV_TASK_ARGS_21(args_in), (args_in)[21].raw
-#define FMPI_PRIV_TASK_ARGS_23(args_in) FMPI_PRIV_TASK_ARGS_22(args_in), (args_in)[22].raw
-#define FMPI_PRIV_TASK_ARGS_24(args_in) FMPI_PRIV_TASK_ARGS_23(args_in), (args_in)[23].raw
-#define FMPI_PRIV_TASK_ARGS_25(args_in) FMPI_PRIV_TASK_ARGS_24(args_in), (args_in)[24].raw
-#define FMPI_PRIV_TASK_ARGS_26(args_in) FMPI_PRIV_TASK_ARGS_25(args_in), (args_in)[25].raw
-#define FMPI_PRIV_TASK_ARGS_27(args_in) FMPI_PRIV_TASK_ARGS_26(args_in), (args_in)[26].raw
-#define FMPI_PRIV_TASK_ARGS_28(args_in) FMPI_PRIV_TASK_ARGS_27(args_in), (args_in)[27].raw
-#define FMPI_PRIV_TASK_ARGS_29(args_in) FMPI_PRIV_TASK_ARGS_28(args_in), (args_in)[28].raw
-#define FMPI_PRIV_TASK_ARGS_30(args_in) FMPI_PRIV_TASK_ARGS_29(args_in), (args_in)[29].raw
-#define FMPI_PRIV_TASK_ARGS_31(args_in) FMPI_PRIV_TASK_ARGS_30(args_in), (args_in)[30].raw
-#define FMPI_PRIV_TASK_ARGS_32(args_in) FMPI_PRIV_TASK_ARGS_31(args_in), (args_in)[31].raw
+#define FMPI_PRIV_TASK_ARGS_1(args_in) (args_in)[0].gpu /* First input's .gpu pointer; the _N variants below append inputs 1..N-1. */
+#define FMPI_PRIV_TASK_ARGS_2(args_in) FMPI_PRIV_TASK_ARGS_1(args_in), (args_in)[1].gpu
+#define FMPI_PRIV_TASK_ARGS_3(args_in) FMPI_PRIV_TASK_ARGS_2(args_in), (args_in)[2].gpu
+#define FMPI_PRIV_TASK_ARGS_4(args_in) FMPI_PRIV_TASK_ARGS_3(args_in), (args_in)[3].gpu
+#define FMPI_PRIV_TASK_ARGS_5(args_in) FMPI_PRIV_TASK_ARGS_4(args_in), (args_in)[4].gpu
+#define FMPI_PRIV_TASK_ARGS_6(args_in) FMPI_PRIV_TASK_ARGS_5(args_in), (args_in)[5].gpu
+#define FMPI_PRIV_TASK_ARGS_7(args_in) FMPI_PRIV_TASK_ARGS_6(args_in), (args_in)[6].gpu
+#define FMPI_PRIV_TASK_ARGS_8(args_in) FMPI_PRIV_TASK_ARGS_7(args_in), (args_in)[7].gpu
+#define FMPI_PRIV_TASK_ARGS_9(args_in) FMPI_PRIV_TASK_ARGS_8(args_in), (args_in)[8].gpu
+#define FMPI_PRIV_TASK_ARGS_10(args_in) FMPI_PRIV_TASK_ARGS_9(args_in), (args_in)[9].gpu
+#define FMPI_PRIV_TASK_ARGS_11(args_in) FMPI_PRIV_TASK_ARGS_10(args_in), (args_in)[10].gpu
+#define FMPI_PRIV_TASK_ARGS_12(args_in) FMPI_PRIV_TASK_ARGS_11(args_in), (args_in)[11].gpu
+#define FMPI_PRIV_TASK_ARGS_13(args_in) FMPI_PRIV_TASK_ARGS_12(args_in), (args_in)[12].gpu
+#define FMPI_PRIV_TASK_ARGS_14(args_in) FMPI_PRIV_TASK_ARGS_13(args_in), (args_in)[13].gpu
+#define FMPI_PRIV_TASK_ARGS_15(args_in) FMPI_PRIV_TASK_ARGS_14(args_in), (args_in)[14].gpu
+#define FMPI_PRIV_TASK_ARGS_16(args_in) FMPI_PRIV_TASK_ARGS_15(args_in), (args_in)[15].gpu
+#define FMPI_PRIV_TASK_ARGS_17(args_in) FMPI_PRIV_TASK_ARGS_16(args_in), (args_in)[16].gpu
+#define FMPI_PRIV_TASK_ARGS_18(args_in) FMPI_PRIV_TASK_ARGS_17(args_in), (args_in)[17].gpu
+#define FMPI_PRIV_TASK_ARGS_19(args_in) FMPI_PRIV_TASK_ARGS_18(args_in), (args_in)[18].gpu
+#define FMPI_PRIV_TASK_ARGS_20(args_in) FMPI_PRIV_TASK_ARGS_19(args_in), (args_in)[19].gpu
+#define FMPI_PRIV_TASK_ARGS_21(args_in) FMPI_PRIV_TASK_ARGS_20(args_in), (args_in)[20].gpu
+#define FMPI_PRIV_TASK_ARGS_22(args_in) FMPI_PRIV_TASK_ARGS_21(args_in), (args_in)[21].gpu
+#define FMPI_PRIV_TASK_ARGS_23(args_in) FMPI_PRIV_TASK_ARGS_22(args_in), (args_in)[22].gpu
+#define FMPI_PRIV_TASK_ARGS_24(args_in) FMPI_PRIV_TASK_ARGS_23(args_in), (args_in)[23].gpu
+#define FMPI_PRIV_TASK_ARGS_25(args_in) FMPI_PRIV_TASK_ARGS_24(args_in), (args_in)[24].gpu
+#define FMPI_PRIV_TASK_ARGS_26(args_in) FMPI_PRIV_TASK_ARGS_25(args_in), (args_in)[25].gpu
+#define FMPI_PRIV_TASK_ARGS_27(args_in) FMPI_PRIV_TASK_ARGS_26(args_in), (args_in)[26].gpu
+#define FMPI_PRIV_TASK_ARGS_28(args_in) FMPI_PRIV_TASK_ARGS_27(args_in), (args_in)[27].gpu
+#define FMPI_PRIV_TASK_ARGS_29(args_in) FMPI_PRIV_TASK_ARGS_28(args_in), (args_in)[28].gpu
+#define FMPI_PRIV_TASK_ARGS_30(args_in) FMPI_PRIV_TASK_ARGS_29(args_in), (args_in)[29].gpu
+#define FMPI_PRIV_TASK_ARGS_31(args_in) FMPI_PRIV_TASK_ARGS_30(args_in), (args_in)[30].gpu
+#define FMPI_PRIV_TASK_ARGS_32(args_in) FMPI_PRIV_TASK_ARGS_31(args_in), (args_in)[31].gpu
 
 #define FMPI_PRIV_TASK_REGISTER_2(...)  FMPI_PRIV_TASK_REGISTER_N(1, __VA_ARGS__)
 #define FMPI_PRIV_TASK_REGISTER_3(...)  FMPI_PRIV_TASK_REGISTER_N(2, __VA_ARGS__)
diff --git a/src/fmpi_data.c b/src/fmpi_data.c
index a3909835efbc9174fe64a5ed88204fd2636b4c9d..63b4744b377b10f7d6a8a6b04262c77dc0a47293 100644
--- a/src/fmpi_data.c
+++ b/src/fmpi_data.c
@@ -49,7 +49,8 @@ struct fmpi_data fmpi_data_out_##T( \
         .size = sizeof(T), \
         .dim_len = {0, 0, 0}, \
         .dim_cnt = 0, \
-        .raw = data \
+        .raw = data, \
+        .gpu = NULL \
     }; \
 }
 
@@ -74,7 +75,8 @@ struct fmpi_data fmpi_data_##D##d_in_##T( \
         .size = cnt * sizeof(T), \
         .dim_len = {x, y, z}, \
         .dim_cnt = (D), \
-        .raw = data \
+        .raw = data, \
+        .gpu = NULL \
     }; \
 } \
 struct fmpi_data fmpi_data_##D##d_out_##T( \
@@ -95,7 +97,8 @@ struct fmpi_data fmpi_data_##D##d_out_##T( \
         .size = cnt * sizeof(T), \
         .dim_len = {x, y, z}, \
         .dim_cnt = (D), \
-        .raw = data \
+        .raw = data, \
+        .gpu = NULL \
     }; \
 }
 
diff --git a/src/fmpi_task.c b/src/fmpi_task.c
index a0147c107c1903f67ab39d56d74aa61ccfde8fa0..11477f8f223a9d019dc0082bc66db0fea133e375 100644
--- a/src/fmpi_task.c
+++ b/src/fmpi_task.c
@@ -65,20 +65,21 @@ struct fmpi_task fmpi_task_register_sync(
             FMPI_RAISE_ERROR(ctx->err_handler, "FMPI", "fmpi_new_domain() failed!");
             continue;
         }
-        //! @todo Could fmpi_futhark_new_data_async() could be called here?
-        void * data = fmpi_futhark_new_data_sync(
+        //! @todo Could fmpi_futhark_new_data_async() be called here instead?
+        void * gpu_data = fmpi_futhark_new_data_sync(
             ctx->fut, task.domains[i].parts[rank].raw, task.domains[i].data.type.base,
             task.domains[i].data.dim_cnt,
             task.domains[i].parts[rank].dim_len[0],
             task.domains[i].parts[rank].dim_len[1],
             task.domains[i].parts[rank].dim_len[2]
         );
-        if(data == NULL) {
+        if(gpu_data == NULL) { // NOTE(review): no `continue` here, unlike the fmpi_new_domain() check above.
             FMPI_RAISE_ERROR(ctx->err_handler, "FMPI",
                 "fmpi_futhark_new_data_sync() failed!"
             );
         }
-        task.args.in[i].raw = data;
+        task.domains[i].parts[rank].gpu = gpu_data; // Still assigned when gpu_data is NULL, if FMPI_RAISE_ERROR returns - TODO confirm.
+        task.args.in[i].gpu = gpu_data;             // Read back through FMPI_PRIV_TASK_ARGS_*() as a task input argument.
     }
     ctx->tasks[ctx->task_cnt++] = task;
     fmpi_futhark_sync(ctx->fut);
@@ -110,19 +111,20 @@ struct fmpi_task fmpi_task_register_async(
             FMPI_RAISE_ERROR(ctx->err_handler, "FMPI", "fmpi_new_domain() failed!");
             continue;
         }
-        void * data = fmpi_futhark_new_data_async(
+        void * gpu_data = fmpi_futhark_new_data_async(
             ctx->fut, task.domains[i].parts[rank].raw, task.domains[i].data.type.base,
             task.domains[i].data.dim_cnt,
             task.domains[i].parts[rank].dim_len[0],
             task.domains[i].parts[rank].dim_len[1],
             task.domains[i].parts[rank].dim_len[2]
         );
-        if(data == NULL) {
+        if(gpu_data == NULL) { // NOTE(review): no `continue` here, unlike the fmpi_new_domain() check above.
             FMPI_RAISE_ERROR(ctx->err_handler, "FMPI",
                 "fmpi_futhark_new_data_async() failed!"
             );
         }
-        task.args.in[i].raw = data;
+        task.domains[i].parts[rank].gpu = gpu_data; // Still assigned when gpu_data is NULL, if FMPI_RAISE_ERROR returns - TODO confirm.
+        task.args.in[i].gpu = gpu_data;             // Read back through FMPI_PRIV_TASK_ARGS_*() as a task input argument.
     }
     ctx->tasks[ctx->task_cnt++] = task;
     return task;
@@ -140,7 +142,7 @@ int fmpi_task_run_sync(
     fmpi_futhark_check_error(ctx->fut, "task->func");
     if(task->args.out.type.derived == FMPI_TYPE_ARRAY) {
         void * out = fmpi_futhark_get_data_sync(
-            ctx->fut, task->args.out_raw, task->args.out.raw,
+            ctx->fut, task->args.out.gpu, task->args.out.raw,
             task->args.out.type.base, task->args.out.dim_cnt
         );
         if(out == NULL) {
@@ -149,7 +151,7 @@ int fmpi_task_run_sync(
             );
         }
         const int err = fmpi_futhark_free_data_sync(
-            ctx->fut, task->args.out_raw, task->args.out.type.base,
+            ctx->fut, task->args.out.gpu, task->args.out.type.base,
             task->args.out.dim_cnt
         );
         if(err != 0) {