src/grid/cuda/cuda.c

    CUDA backend for GPUs

    This relies on the CUDA and NVRTC libraries which can be installed on Debian systems using:

    apt install nvidia-driver nvidia-cuda-dev
    #include <stdlib.h>
    #include <stdio.h>
    #include <stdbool.h>
    #include <string.h>
    #include <math.h>
    #include <assert.h>
    #include <sys/stat.h>
    #include <errno.h>
    #include "a32.h"
    
    typedef struct { double x, y, z; } coord;
    typedef float real;
    typedef struct { int i; } scalar;
    extern int datasize;
    
    typedef struct {
      coord p, * box, n; // region
      int level; // level
    } RegionParameters;
    
    extern int N;
    extern double X0, Y0, Z0, L0;
    extern struct { int x, y; } Dimensions;
    
    typedef struct _External External;
    
    struct _External {
      char * name;    // the name of the variable
      void * pointer; // a pointer to the data
      int type;       // the type of the variable
      int nd;         // the number of pointer dereferences or attribute offset or enum constant
      char reduct;    // the reduction operation
      char global;    // is it a global variable?
      char constant;  // is it a constant?
      void * data;    // the dimensions (int *) for arrays or the code (char *) for functions
      scalar s;       // used for reductions on GPUs
      External * externals, * next;
      int used;
    };
    
    #include "../../ast/symbols.h"
    
    enum typedef_kind_t {
      sym_SCALAR = sym_root + 1,
      sym_VECTOR,
      sym_TENSOR,
      sym_COORD,
      sym__COORD,
      sym_VEC4,
      sym_IVEC
    };
    
    #define min(a,b) ((a) < (b) ? (a) : (b))
    #define max(a,b) ((a) > (b) ? (a) : (b))
    
    #define sysrealloc realloc
    
    #include "../gpu/backend.h"
    
    #include <cuda.h>
    #include <nvrtc.h>
    
    static CUdeviceptr ssbo = 0;
    static CUdevice dev = 0;
    static CUcontext ctx = 0;
    static CUstream stream = 0;
    
    #define CUDA_CHECK(x)                                                   \
      do {                                                                  \
        CUresult err = (x);                                                 \
        if (err != CUDA_SUCCESS) {                                          \
          const char *msg;                                                  \
          cuGetErrorName(err, &msg);                                        \
          fprintf(stderr, "%s:%d: CUDA error: %s\n", __FILE__, __LINE__, msg); \
          exit(1);                                                          \
        }                                                                   \
      } while(0)
    
    #define NVRTC_CHECK(x)                          \
      do {                                          \
        nvrtcResult err = (x);                      \
        if (err != NVRTC_SUCCESS) {                 \
          fprintf(stderr, "%s:%d: NVRTC error: %s\n", __FILE__, __LINE__,   \
                  nvrtcGetErrorString(err));        \
          exit(1);                                  \
        }                                           \
      } while(0)
    
    typedef struct {
      CUdeviceptr location;
      int type, nd, local;
      size_t size;
      void * pointer, * last;
    } MyUniform;
    
    struct _Shader {
      unsigned ng[2], nwg[2];
      CUdeviceptr _data, csOrigin, ssbo;
      MyUniform * uniforms;
      CUmodule module;
      CUfunction kernel;
    };
    
    static void * memcpycheck (CUdeviceptr location, void * pointer, void * last, size_t size)
    {
      if (last && !memcmp (pointer, last, size))
        return last;
      CUDA_CHECK (cuMemcpyHtoDAsync (location, pointer, size, stream));
      if (!last) last = malloc (size);
      return memcpy (last, pointer, size);
    }
    
    void free_shader (Shader * s)
    {
      for (MyUniform * g = s->uniforms; g && g->type; g++)
        free (g->last);
      free (s->uniforms);
      free (s);
    }
    
    static
    void architecture (char * arch)
    {
      int major, minor;
      CUdevice cuDevice;
      CUDA_CHECK (cuDeviceGet(&cuDevice, 0));
      CUDA_CHECK (cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                        cuDevice));
      CUDA_CHECK (cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                        cuDevice));
      sprintf (arch, "--gpu-architecture=sm_%d%d", major, minor);
      // fixme: not sure whether this should be compute_%d%d or sm_%d%d ??
    }
    
    static char * compile_ptx (const char * fs, const char * arch,
                               const char * func, const char * file, int line,
                               size_t * ptxSize)
    {
      //  fputs (fs, stderr);
      nvrtcProgram prog;
      NVRTC_CHECK(nvrtcCreateProgram (&prog, fs,
                                      "kernel.cu",
                                      0,
                                      NULL,
                                      NULL
                                      ));
      const char *opts[] = {
        "--std=c++11",
        arch,
        "-default-device",
        "-diag-suppress=177",
        "--ptxas-options=-O3",
        "--extra-device-vectorization",
        "--restrict",
        "-use_fast_math",
      };
    
      nvrtcResult compile_res = nvrtcCompileProgram (prog, sizeof(opts)/sizeof(char *), opts);
    
      if (compile_res != NVRTC_SUCCESS) {
        size_t logSize;
        NVRTC_CHECK (nvrtcGetProgramLogSize (prog, &logSize));
        if (logSize > 1) {
          char * log = (char *) malloc (logSize);
          NVRTC_CHECK (nvrtcGetProgramLog (prog, log));
          // fputs (log, stderr);
          char * error = gpu_errors (log, fs, NULL, "CUDA");
          fputs (error, stderr);
          free (error);
          free (log);
        }
        return NULL;
      }
    
      // ------------------------------------------------------------
      // Get PTX
      // ------------------------------------------------------------
    
      NVRTC_CHECK (nvrtcGetPTXSize (prog, ptxSize));
      char * ptx = malloc (*ptxSize);
      NVRTC_CHECK (nvrtcGetPTX (prog, ptx));
      NVRTC_CHECK (nvrtcDestroyProgram (&prog));
    
    #if SINGLE_PRECISION
      //  fputs (ptx, stderr);
      if (strstr (ptx, ".f64"))
        fprintf (stderr, "%s:%d: warning: CUDA: found FP64 assembly in single precision mode\n",
                 file, line);
    #endif // SINGLE_PRECISION
    
      return ptx;
    }
    
    static
    int create_tmpdir (const char * path)
    {
      struct stat st;
      if (stat (path, &st) == 0) {
        if (S_ISDIR (st.st_mode))
          return 0; // Directory exists
        else {
          errno = ENOTDIR;
          return -1; // Path exists but is not a directory
        }
      }
      // Directory does not exist, try to create it
      if (mkdir (path, 0755) == 0)
        return 0; // Successfully created
      return -1; // Failed to create
    }
    
    Shader * load_normal_shader (const char * fs,
                                 const char * func, const char * file, int line)
    {
      char arch[] = "--gpu-architecture=compute_????";
      architecture (arch);
    
      // ---------------------------------------------------------------
      // Try to read from a compilation cache (by default in /tmp/buda/)
      // ---------------------------------------------------------------
      
      Adler32Hash hasha;
      a32_hash_init (&hasha);
      a32_hash_add (&hasha, fs, strlen (fs));
      a32_hash_add (&hasha, arch, strlen (arch));
      uint32_t hash = a32_hash (&hasha);
    
      const char * tmpdir = getenv ("TMPDIR"), * tmp = tmpdir ? tmpdir : "/tmp";
      char cache[strlen(tmp) + strlen("/buda/ffffffff") + 1];
      sprintf (cache, "%s/buda", tmp);
      if (create_tmpdir (cache)) {
        fprintf (stderr, "%s:%d: cannot create temporary directory '%s'\n", \
                 __FILE__, __LINE__, cache);
        perror ("");
        exit (1);
      }
      sprintf (cache, "%s/buda/%x", tmp, hash);
      char * ptx;
      struct stat st;
      if (stat (cache, &st) == 0) { // found in cache
        FILE * fp = fopen (cache, "r");
        assert (fp);
        ptx = malloc (st.st_size);
        assert (fread (ptx, 1, st.st_size, fp) == st.st_size);
        fclose (fp);
      }
      else { // not found in cache
        size_t size;
        ptx = compile_ptx (fs, arch, func, file, line, &size);
        if (!ptx)
          return NULL;
        FILE * fp = fopen (cache, "w");
        if (fp) {
          assert (fwrite (ptx, 1, size, fp) == size);
          fclose (fp);
        }
      }
      
      // ------------------------------------------------------------
      // Load PTX module
      // ------------------------------------------------------------
    
      Shader * shader = calloc (1, sizeof (Shader));
      CUDA_CHECK (cuModuleLoadData (&shader->module, ptx));
      free (ptx);
      CUDA_CHECK (cuModuleGetFunction (&shader->kernel, shader->module, func));
      return shader;
    }
    
    bool gpu_init_context (GPUData ** data)
    {
      bool initialized = ctx;
      if (!initialized) {
        CUDA_CHECK (cuInit (0));
        CUDA_CHECK (cuDeviceGet (&dev, 0));
        CUDA_CHECK (cuCtxCreate (&ctx, 0, dev));
      }
      *data = NULL;
      return !initialized;
    }
    
    void gpu_free_context (GPUData * data)
    {
      if (ssbo) {
        CUDA_CHECK (cuMemFree (ssbo));
        ssbo = 0;
      }
      GPUContext.current_size = 0;
    }
    
    void realloc_ssbo (size_t field_size)
    {
      if (!datasize)
        return;
      size_t totalsize = field_size*datasize;
      assert (totalsize > GPUContext.current_size);
      CUdeviceptr ptr;
      CUDA_CHECK (cuMemAlloc (&ptr, totalsize)); // fixme: allocates memory twice
      if (GPUContext.current_size > 0) {
        CUDA_CHECK (cuMemcpyDtoD (ptr, ssbo, GPUContext.current_size));
        CUDA_CHECK (cuMemFree (ssbo));
      }
      ssbo = ptr;
      GPUContext.current_size = totalsize;
    }
    
    void gpu_cpu_sync_scalar (int i, int block, char * data, size_t field_size, SyncMode mode)
    {
      size_t size = field_size*sizeof(real), offset = i*size, totalsize = block*size;
      char * cd = data + offset;
      CUdeviceptr gd = ssbo + offset;
      if (mode == GPU_READ)
        CUDA_CHECK (cuMemcpyDtoH (cd, gd, totalsize));
      else if (mode == GPU_WRITE)
        CUDA_CHECK (cuMemcpyHtoD (gd, cd, totalsize));
      else
        assert (false);
    }
    
    void reset_scalar (int i, int block, size_t field_size, double val)
    {
      size_t size = field_size*sizeof(real);
      size_t offset = i*size, totalsize = max(block, 1)*size;
    #if SINGLE_PRECISION
      float fval = val;
      uint32_t bits;
      memcpy (&bits, &fval, sizeof(bits));
      CUDA_CHECK (cuMemsetD32 (ssbo + offset, bits, totalsize/sizeof(float)));
    #else
      fprintf (stderr, "%s:%d: error: not implemented yet\n");
    #endif
    }
    
    void finalize_shader (Shader * shader, External * externals, External * merged,
                          unsigned ng[2], unsigned nwg[2])
    {
      for (int i = 0; i < 2; i++)
        shader->ng[i] = ng[i], shader->nwg[i] = nwg[i];

    Get the SSBO pointer.

      size_t size;
      CUDA_CHECK (cuModuleGetGlobal(&shader->_data, &size, shader->module, "_data"));
      assert (size == sizeof (ssbo));

    Get the csOrigin pointer.

      CUDA_CHECK (cuModuleGetGlobal(&shader->csOrigin, &size, shader->module, "csOrigin"));
      assert (size == 2*sizeof (int));

    Make list of uniforms

      for (External * g = merged; g; g = g->next)
        g->used = 0;
      int index = 1;
      for (External * g = externals; g && g->name; g++)
        g->used = index++;
      int nuniforms = 0;
      for (const External * g = merged; g; g = g->next) {
        if (g->name[0] == '.') continue;
        if (IS_EXTERNAL_CONSTANT(g)) continue;
        if (g->type == sym_function_declaration || g->type == sym_function_definition) continue;
        if (g->type == sym_INT && (!strcmp (g->name, "N") ||
    			       !strcmp (g->name, "nl") ||
    			       !strcmp (g->name, "bc_period_x") ||
    			       !strcmp (g->name, "bc_period_y")))
          continue;
        if (g->type == sym_INT ||
    	g->type == sym_LONG ||
    	g->type == sym_FLOAT ||
    	g->type == sym_DOUBLE ||
    	g->type == sym__COORD ||
    	g->type == sym_COORD ||
    	g->type == sym_BOOL ||
    	g->type == sym_VEC4) {
          char * name = str_append (NULL, EXTERNAL_NAME (g));
          CUdeviceptr location = 0;
          size_t size;
          cuModuleGetGlobal (&location, &size, shader->module, name);
          if (location) {
            //        fprintf (stderr, "%s %d %ld\n", name, g->type, size);
    	// not an array or just a one-dimensional array
    	assert (!g->nd);
    	assert (!g->data || ((int *)g->data)[1] == 0);
    	int nd = g->data ? ((int *)g->data)[0] : 1;
            switch (g->type) {
            case sym_INT: case sym_LONG:
              assert (size == sizeof(int)*nd); break;
            case sym_FLOAT:
              assert (size == sizeof(float)*nd); break;
            case sym_VEC4:
              nd *= 4;
              assert (size == sizeof(float)*nd); break;
            case sym_BOOL:
              assert (size == sizeof(bool)*nd); break;
    #if SINGLE_PRECISION
            case sym_DOUBLE:
              assert (size == sizeof(float)*nd); break;
            case sym__COORD:
              nd *= 2;
              assert (size == sizeof(float)*nd); break;
            case sym_COORD:
              nd *= 3;
              assert (size == sizeof(float)*nd); break;
    #else // DOUBLE_PRECISION
            case sym_DOUBLE:
              assert (size == sizeof(double)*nd); break;
            case sym__COORD:
              nd *= 2;
              assert (size == sizeof(double)*nd); break;
            case sym_COORD:
              nd *= 3;
              assert (size == sizeof(double)*nd); break;
    #endif // DOUBLE_PRECISION
            default:
              assert (false);
            }
    	shader->uniforms = realloc (shader->uniforms, (nuniforms + 2)*sizeof(MyUniform));
    	shader->uniforms[nuniforms] = (MyUniform){
    	  .location = location, .type = g->type, .nd = nd, .size = size,
    	  .local = g->global == 1 ? -1 : g->used - 1,
    	  .pointer = g->global == 1 ? g->pointer : NULL
            };
    	shader->uniforms[nuniforms + 1].type = 0;
    	nuniforms++;
    	// uniforms refering to local variables must be in the 'externals' local list
    	assert (g->global == 1 || g->used);
          }
          else
            fprintf (stderr, "%s not found\n", name);
          free (name);
        }
      }
    }
    
    void post_setup_shader (Shader * shader, External * externals)
    {

    Set SSBO pointer.

      assert (ssbo);
      if (shader->ssbo != ssbo) {
        CUDA_CHECK (cuMemcpyHtoDAsync (shader->_data, &ssbo, sizeof (ssbo), stream));
        shader->ssbo = ssbo;
      }

    Set uniforms

      for (MyUniform * g = shader->uniforms; g && g->type; g++) {
        void * pointer = g->pointer;
        if (!pointer) {
          assert (g->local >= 0);
          pointer = externals[g->local].pointer;
        }
        switch (g->type) {
        case sym_INT: case sym_FLOAT: case sym_VEC4: case sym_BOOL:
          g->last = memcpycheck (g->location, pointer, g->last, g->size);
          break;
        case sym_LONG: {
          int p[g->nd];
          long * data = pointer;
          for (int i = 0; i < g->nd; i++)
    	p[i] = data[i];
          g->last = memcpycheck (g->location, p, g->last, g->size);
          break;
        }
    #if SINGLE_PRECISION
        case sym_DOUBLE: case sym__COORD: case sym_COORD: {
          float p[g->nd];
          double * data = pointer;
          for (int i = 0; i < g->nd; i++)
    	p[i] = data[i];
          g->last = memcpycheck (g->location, p, g->last, g->size);
          break;
        }
    #else // DOUBLE_PRECISION
        case sym_DOUBLE: case sym__COORD: case sym_COORD:
          g->last = memcpycheck (g->location, pointer, g->last, g->size);
          break;
    #endif // DOUBLE_PRECISION
        default:
          assert (false);
        }
      }
    }
    
    int run_shader (const Shader * shader, const RegionParameters * region)
    {

    Render

    If this is a foreach_point() iteration, we access a single point

      int Nl = region->level > 0 ? 1 << (region->level - 1) : N/Dimensions.x;
      if (region->n.x == 1 && region->n.y == 1) {
        int csOrigin[] = {
          (region->p.x - X0)/L0*Nl*Dimensions.x,
          (region->p.y - Y0)/L0*Nl*Dimensions.x
        };
        CUDA_CHECK (cuMemcpyHtoDAsync (shader->csOrigin, csOrigin, 2*sizeof(int), stream));
        assert (!GPUContext.fragment_shader);
        CUDA_CHECK (cuLaunchKernel (shader->kernel,
                                    1, 1, 1,
                                    1, 1, 1,
                                    0, stream, NULL, NULL));
      }

    This is a region

      else if (region->n.x || region->n.y) {
    #if 1
        assert (false);
    #else
        float vsScale[] = {
          (region->box[1].x - region->box[0].x)/L0,
          (region->box[1].y - region->box[0].y)/L0
        };
        float vsOrigin[] = { (region->box[0].x - X0)/L0, (region->box[0].y - Y0)/L0 };
        GL_C (glUniform2fv (1, 1, vsOrigin));
        GL_C (glUniform2fv (2, 1, vsScale));
        assert (GPUContext.fragment_shader);
        GL_C (glMemoryBarrier (GL_SHADER_STORAGE_BARRIER_BIT));
        GL_C (glDrawArrays (GL_TRIANGLES, 0, 6));
    #endif
      }
    
      else {
        assert (!GPUContext.fragment_shader);
        CUDA_CHECK (cuLaunchKernel (shader->kernel,
                                    shader->ng[0], shader->ng[1], 1,
                                    shader->nwg[0], shader->nwg[1], 1,
                                    0, stream, NULL, NULL));
      }
      return Nl;
    }
    
    void gpu_free_solver (void)
    {
      CUDA_CHECK (cuCtxSynchronize ());
      CUDA_CHECK (cuCtxDestroy (ctx));
      ctx = NULL;
    }
    
    void gpu_synchronize()
    {
      if (ctx)
        CUDA_CHECK (cuCtxSynchronize ());
    }

    Reductions

    static char kernel_source[] =
    "#define REDUCE(reduced,rhs) reduced += rhs                                          \n"
    "extern \"C\"\n"
    "__global__ void reduce (const float* input, float* output, int n){\n"
    "    __shared__ float sdata[256];                       \n"
    "    unsigned int tid = threadIdx.x;                    \n"
    "    unsigned int i = blockIdx.x * blockDim.x * 2 + tid;\n"
    "    float reduced = 0.0f;                              \n"
    "    if (i < n)                                         \n"
    "        REDUCE (reduced, input[i]);                    \n"
    "    if (i + blockDim.x < n)                            \n"
    "        REDUCE (reduced, input[i + blockDim.x]);       \n"
    "    sdata[tid] = reduced;                              \n"
    "    __syncthreads();                                   \n"
    "                                                       \n"
    "    for (unsigned int s = blockDim.x/2; s > 32; s >>= 1){\n"
    "        if (tid < s)                                    \n"
    "            REDUCE (sdata[tid], sdata[tid + s]);        \n"
    "        __syncthreads();                                \n"
    "    }                                                   \n"
    "    if (tid < 32) {                                     \n"
    "        volatile float* smem = sdata;                   \n"
    "        REDUCE (smem[tid], smem[tid + 32]);             \n"
    "        REDUCE (smem[tid], smem[tid + 16]);             \n"
    "        REDUCE (smem[tid], smem[tid + 8]);              \n"
    "        REDUCE (smem[tid], smem[tid + 4]);              \n"
    "        REDUCE (smem[tid], smem[tid + 2]);              \n"
    "        REDUCE (smem[tid], smem[tid + 1]);              \n"
    "    }                                                   \n"
    "    if (tid == 0)                                       \n"
    "        output[blockIdx.x] = sdata[0];                  \n"
    "}                                                       \n";
    
    static CUfunction compile_kernel (const char * start, const char * op)
    {
      nvrtcProgram prog;
      char * s = kernel_source + strlen("#define REDUCE(reduced,rhs) ");
      memcpy (s, op, strlen (op));
      s += strlen(op); while (*s != '\n') *s++ = ' ';
      s = strstr (kernel_source, "float reduced = ");
      s += strlen ("float reduced = ");
      memcpy (s, start, strlen (start));
      s += strlen(start); while (*s != '\n') *s++ = ' '; 
      NVRTC_CHECK (nvrtcCreateProgram (&prog, kernel_source, "reduce.cu",
                                       0, NULL, NULL));
      char arch[] = "--gpu-architecture=compute_????";
      architecture (arch);
      const char* options[] = {
        arch,
        "--std=c++11",
        "--use_fast_math"
      };
      if (nvrtcCompileProgram (prog, 3, options) != NVRTC_SUCCESS) {
        //    fputs (kernel_source, stderr);
        size_t log_size;
        NVRTC_CHECK (nvrtcGetProgramLogSize (prog, &log_size));
        if (log_size) {
          char * log = (char *)malloc (log_size);
          NVRTC_CHECK (nvrtcGetProgramLog (prog, log));
          fprintf (stderr, "%s:%d: %s\n", __FILE__, __LINE__, log);
          free (log);
        }
        exit (EXIT_FAILURE);
      }
    
      /*    Extract PTX    */
      size_t ptx_size;
      NVRTC_CHECK (nvrtcGetPTXSize (prog, &ptx_size));
      char * ptx = (char *) malloc (ptx_size);
      NVRTC_CHECK (nvrtcGetPTX (prog, ptx));
      NVRTC_CHECK (nvrtcDestroyProgram (&prog));
    
      /*    Load module    */
      CUmodule module;
      CUDA_CHECK (cuModuleLoadData (&module, ptx));
      free (ptx);
    
      CUfunction kernel;
      CUDA_CHECK(cuModuleGetFunction (&kernel, module, "reduce"));
      return kernel;
    }
    
    static
    float cuda_reduce (CUdeviceptr d_input, const size_t N, const char op)
    {
      /*    Compile kernel with NVRTC    */
      
      CUfunction kernel;
      switch (op) {
      case '+': {
        static CUfunction k = 0;
        if (!k) k = compile_kernel ("0.0f;", "reduced += rhs");
        kernel = k;
        break;
      }
      case 'm': {
        static CUfunction k = 0;
        if (!k) k = compile_kernel ("3e38f;", "reduced = min(reduced,rhs)");
        kernel = k;
        break;
      }
      case 'M': {
        static CUfunction k = 0;
        if (!k) k = compile_kernel ("-3e38f;", "reduced = max(reduced,rhs)");
        kernel = k;
        break;
      }
      default:
        assert (0);
      }
    
      /*    Allocate device memory    */
    
      static size_t Np = 0;
      static CUdeviceptr d_output_a = 0, d_output_b = 0;
      if (N > Np) {
        const size_t max_blocks = (N + 511)/512;
        if (d_output_a) {
          CUDA_CHECK (cuMemFree (d_output_a));
          CUDA_CHECK (cuMemFree (d_output_b));
        }
        CUDA_CHECK (cuMemAlloc (&d_output_a, max_blocks*sizeof(float)));
        CUDA_CHECK (cuMemAlloc (&d_output_b, max_blocks*sizeof(float)));
        Np = N;
      }
        
      /*    Multi-pass reduction    */
    
      CUdeviceptr input = d_input, output = d_output_a;
      size_t current_n = N;
      while (current_n > 1) {
        const size_t threads = 256;
        size_t blocks = (current_n + threads*2 - 1)/(threads*2);
        void * args[] = {&input, &output, &current_n};
        CUDA_CHECK (cuLaunchKernel (kernel,
                                    blocks, 1, 1, threads,
                                    1, 1, 0, stream,
                                    args, NULL));
        //    CUDA_CHECK (cuCtxSynchronize());
        current_n = blocks;
        input = output;
        output = (output == d_output_a) ? d_output_b : d_output_a;
      }
    
      /*        Copy result back        */  
      float gpu_reduce;
      CUDA_CHECK (cuMemcpyDtoH (&gpu_reduce, input, sizeof(float)));
    #if 0
      /*    Cleanup    */
      CUDA_CHECK(cuMemFree(d_output_a));
      CUDA_CHECK(cuMemFree(d_output_b));
      CUDA_CHECK(cuModuleUnload(module));
    #endif
      return gpu_reduce;
    }
    
    double gpu_reduction (size_t offset,
                          const char op,
                          const RegionParameters * region,
                          GPUData * data,
                          size_t nb)
    {
      if (region->n.x == 1 && region->n.y == 1) {
        int i = (region->p.x - X0)/L0*N;
        int j = (region->p.y - Y0)/L0*N;
        if (i < 0 || i >= N || j < 0 || j >= N)
          return 0.;
        offset += i*N + j;
        nb = 1;
      }
      return cuda_reduce (ssbo + offset*sizeof(real), nb, op);
    }