hip.c

HIP backend for GPUs

HIP backend for GPUs

This uses the HIP driver interface from AMD as well as the “run-time-compiler” HIPRTC. HIP can be installed on Debian systems using:

apt install hipcc

Since the HIP driver interface is meant to be compatible with CUDA, this file was (mostly) translated automatically from the CUDA backend using

hipify-perl -hip-kernel-execution-syntax ../cuda/cuda.c | \
  sed -e 's/CUDA_/HIP_/g' -e 's/NVRTC_/HIPRTC_/g' > hip.c

In principle this should work on both NVidia and AMD cards. Note however that Basilisk GPU development is done on Nvidia cards so that the AMD version is not as well tested.

#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <sys/stat.h>
#include <errno.h>
#include "a32.h"

typedef struct { double x, y, z; } coord;
typedef float real;
typedef struct { int i; } scalar;
extern int datasize;

typedef struct {
  coord p, * box, n; // region
  int level; // level
} RegionParameters;

extern int N;
extern double X0, Y0, Z0, L0;
extern struct { int x, y; } Dimensions;

typedef struct _External External;

struct _External {
  char * name;    // the name of the variable
  void * pointer; // a pointer to the data
  int type;       // the type of the variable
  int nd;         // the number of pointer dereferences or attribute offset or enum constant
  char reduct;    // the reduction operation
  char global;    // is it a global variable?
  char constant;  // is it a constant?
  void * data;    // the dimensions (int *) for arrays or the code (char *) for functions
  scalar s;       // used for reductions on GPUs
  External * externals, * next;
  int used;
};

#include "../../ast/symbols.h"

enum typedef_kind_t {
  sym_SCALAR = sym_root + 1,
  sym_VECTOR,
  sym_TENSOR,
  sym_COORD,
  sym__COORD,
  sym_VEC4,
  sym_IVEC
};

#define min(a,b) ((a) < (b) ? (a) : (b))
#define max(a,b) ((a) > (b) ? (a) : (b))

#define sysrealloc realloc

#include "../gpu/backend.h"

#include <hip/hip_runtime.h>
#include <hip/hiprtc.h>

static hipDeviceptr_t ssbo = 0;
#ifndef __HIP_PLATFORM_AMD__
static hipDevice_t dev = 0;
#endif
static hipCtx_t ctx = 0;
static hipStream_t stream = 0;

#define HIP_CHECK(x)                                                   \
  do {                                                                  \
    hipError_t err = (x);                                               \
    if (err != hipSuccess) {                                            \
      const char *msg =                                                 \
        hipGetErrorName(err);                                           \
      fprintf(stderr, "%s:%d: HIP error: %s\n", __FILE__, __LINE__, msg); \
      exit(1);                                                          \
    }                                                                   \
  } while(0)

#define HIPRTC_CHECK(x)                                                 \
  do {                                                                  \
    hiprtcResult err = (x);                                             \
    if (err != HIPRTC_SUCCESS) {                                        \
      fprintf(stderr, "%s:%d: HIPRTC error: %s\n", __FILE__, __LINE__,  \
              hiprtcGetErrorString(err));                               \
      exit(1);                                                          \
    }                                                                   \
  } while(0)

typedef struct {
  hipDeviceptr_t location;
  int type, nd, local;
  size_t size;
  void * pointer, * last;
} MyUniform;

struct _Shader {
  unsigned ng[2], nwg[2];
  hipDeviceptr_t _data, csOrigin, ssbo;
  MyUniform * uniforms;
  hipModule_t module;
  hipFunction_t kernel;
};

static void * memcpycheck (hipDeviceptr_t location, void * pointer, void * last,
                           size_t size)
{
  if (last && !memcmp (pointer, last, size))
    return last;
  HIP_CHECK (hipMemcpyAsync ((void *)location, pointer, size,
                             hipMemcpyHostToDevice, stream));
  if (!last) last = malloc (size);
  return memcpy (last, pointer, size);
}

void free_shader (Shader * s)
{
  for (MyUniform * g = s->uniforms; g && g->type; g++)
    free (g->last);
  free (s->uniforms);
  free (s);
}

static
void architecture (char * arch)
{
#ifdef __HIP_PLATFORM_AMD__
  // For AMD GPUs, use gcnArchName from device properties (e.g., gfx906, gfx90a, gfx1030)
  hipDeviceProp_t props;
  HIP_CHECK (hipGetDeviceProperties(&props, 0));
  sprintf (arch, "--offload-arch=%s", props.gcnArchName);
#else
  // For NVIDIA GPUs via HIP-on-CUDA, use sm_ architecture
  int major, minor;
  hipDevice_t cuDevice;
  HIP_CHECK (hipDeviceGet(&cuDevice, 0));
  HIP_CHECK (hipDeviceGetAttribute (&major, hipDeviceAttributeComputeCapabilityMajor,
                                    cuDevice));
  HIP_CHECK (hipDeviceGetAttribute (&minor, hipDeviceAttributeComputeCapabilityMinor,
                                    cuDevice));
  sprintf (arch, "--gpu-architecture=sm_%d%d", major, minor);
#endif
}

static char * compile_ptx (const char * fs, const char * arch,
                           const char * func, const char * file, int line,
                           size_t * ptxSize)
{
  //  fputs (fs, stderr);
  hiprtcProgram prog;
  HIPRTC_CHECK(hiprtcCreateProgram (&prog, fs,
                                    "kernel.cu",
                                    0,
                                    NULL,
                                    NULL
                                    ));
#ifdef __HIP_PLATFORM_AMD__
  // AMD ROCm - use HIP-compatible compiler options
  const char *opts[] = {
    "--std=c++14",
    arch,
    "-ffast-math",
  };
#else
  // NVIDIA via HIP-on-CUDA - use NVRTC-compatible options
  const char *opts[] = {
    "--std=c++11",
    arch,
    "-default-device",
    "-diag-suppress=177",
    "--ptxas-options=-O3",
    "--extra-device-vectorization",
    "--restrict",
    "-use_fast_math",
  };
#endif

  hiprtcResult compile_res = hiprtcCompileProgram (prog, sizeof(opts)/sizeof(char *), opts);

  if (compile_res != HIPRTC_SUCCESS) {
    size_t logSize;
    HIPRTC_CHECK (hiprtcGetProgramLogSize (prog, &logSize));
    if (logSize > 1) {
      char * log = (char *) malloc (logSize);
      HIPRTC_CHECK (hiprtcGetProgramLog (prog, log));
      // fputs (log, stderr);
      char * error = gpu_errors (log, fs, NULL, "HIP");
      fputs (error, stderr);
      free (error);
      free (log);
    }
    return NULL;
  }

  // ------------------------------------------------------------
  // Get Code (PTX for NVIDIA, or HSACO/GCN for AMD)
  // ------------------------------------------------------------

  HIPRTC_CHECK (hiprtcGetCodeSize (prog, ptxSize));
  char * ptx = malloc (*ptxSize);
  HIPRTC_CHECK (hiprtcGetCode (prog, ptx));
  HIPRTC_CHECK (hiprtcDestroyProgram (&prog));

#ifdef __HIP_PLATFORM_AMD__
  // For AMD GPUs: code is HSACO binary, not PTX
  // No FP64 check needed here as it's platform-dependent
#else
  // For NVIDIA GPUs via HIP: verify single precision mode
#if SINGLE_PRECISION
  if (strstr (ptx, ".f64"))
    fprintf (stderr, "%s:%d: warning: HIP: found FP64 assembly in single precision mode\n",
             file, line);
#endif // SINGLE_PRECISION
#endif

  return ptx;
}

static
int create_tmpdir (const char * path)
{
  struct stat st;
  if (stat (path, &st) == 0) {
    if (S_ISDIR (st.st_mode))
      return 0; // Directory exists
    else {
      errno = ENOTDIR;
      return -1; // Path exists but is not a directory
    }
  }
  // Directory does not exist, try to create it
  if (mkdir (path, 0755) == 0)
    return 0; // Successfully created
  return -1; // Failed to create
}

Shader * load_normal_shader (const char * fs, const char * func, const char * file, int line)
{
  char arch[64] = "";
  architecture (arch);

  // ---------------------------------------------------------------
  // Try to read from a compilation cache (by default in /tmp/buda/)
  // ---------------------------------------------------------------
  
  Adler32Hash hasha;
  a32_hash_init (&hasha);
  a32_hash_add (&hasha, fs, strlen (fs));
  a32_hash_add (&hasha, arch, strlen (arch));
  uint32_t hash = a32_hash (&hasha);

  const char * tmpdir = getenv ("TMPDIR"), * tmp = tmpdir ? tmpdir : "/tmp";
  char cache[strlen(tmp) + strlen("/buda/ffffffff") + 1];
  sprintf (cache, "%s/buda", tmp);
  if (create_tmpdir (cache)) {
    fprintf (stderr, "%s:%d: cannot create temporary directory '%s'\n", \
             __FILE__, __LINE__, cache);
    perror ("");
    exit (1);
  }
  sprintf (cache, "%s/buda/%x", tmp, hash);
  char * ptx;
  struct stat st;
  if (stat (cache, &st) == 0) { // found in cache
    FILE * fp = fopen (cache, "r");
    assert (fp);
    ptx = malloc (st.st_size);
    assert (fread (ptx, 1, st.st_size, fp) == st.st_size);
    fclose (fp);
  }
  else { // not found in cache
    size_t size;
    ptx = compile_ptx (fs, arch, func, file, line, &size);
    if (!ptx)
      return NULL;
    FILE * fp = fopen (cache, "w");
    if (fp) {
      assert (fwrite (ptx, 1, size, fp) == size);
      fclose (fp);
    }
  }

  // ------------------------------------------------------------
  // Load binary module
  // ------------------------------------------------------------

  Shader * shader = calloc (1, sizeof (Shader));
  HIP_CHECK (hipModuleLoadData (&shader->module, ptx));
  free (ptx);
  HIP_CHECK (hipModuleGetFunction (&shader->kernel, shader->module, func));
  return shader;
}

bool gpu_init_context (GPUData ** data)
{
  bool initialized = ctx;
  if (!initialized) {
#ifdef __HIP_PLATFORM_AMD__
    // AMD ROCm - use Runtime API (implicit context)
    HIP_CHECK (hipSetDevice(0));
    HIP_CHECK (hipStreamCreate(&stream));
    ctx = (hipCtx_t)1;  // flag to indicate initialized
#else
    // NVIDIA via HIP-on-CUDA - use Driver API
    HIP_CHECK (hipInit (0));
    HIP_CHECK (hipDeviceGet (&dev, 0));
    HIP_CHECK (hipCtxCreate (&ctx, 0, dev));
#endif
  }
  *data = NULL;
  return !initialized;
}

void gpu_free_context (GPUData * data)
{
  if (ssbo) {
    HIP_CHECK (hipFree ((void *) ssbo));
    ssbo = 0;
  }
  GPUContext.current_size = 0;
}

void realloc_ssbo (size_t field_size)
{
  if (!datasize)
    return;
  size_t totalsize = field_size*datasize;
  assert (totalsize > GPUContext.current_size);
  hipDeviceptr_t ptr;
  HIP_CHECK (hipMalloc ((void **) &ptr, totalsize)); // fixme: allocates memory twice
  if (GPUContext.current_size > 0) {
    HIP_CHECK (hipMemcpy ((void *)ptr, (void *)ssbo, GPUContext.current_size, hipMemcpyDeviceToDevice));
    HIP_CHECK (hipFree ((void *) ssbo));
  }
  ssbo = ptr;
  GPUContext.current_size = totalsize;
}

void gpu_cpu_sync_scalar (int i, int block, char * data, size_t field_size, SyncMode mode)
{
  size_t size = field_size*sizeof(real), offset = i*size, totalsize = block*size;
  char * cd = data + offset;
  hipDeviceptr_t gd = ssbo + offset;
  if (mode == GPU_READ)
    HIP_CHECK (hipMemcpy (cd, (void *)gd, totalsize, hipMemcpyDeviceToHost));
  else if (mode == GPU_WRITE)
    HIP_CHECK (hipMemcpy ((void *)gd, cd, totalsize, hipMemcpyHostToDevice));
  else
    assert (false);
}

void reset_scalar (int i, int block, size_t field_size, double val)
{
  size_t size = field_size*sizeof(real);
  size_t offset = i*size, totalsize = max(block, 1)*size;
  if (val == 0.)
    HIP_CHECK (hipMemset ((void *)(ssbo + offset), 0, totalsize));
  else {
#if SINGLE_PRECISION
    float fval = val;
    uint32_t bits;
    memcpy (&bits, &fval, sizeof(bits));
#ifdef __HIP_PLATFORM_AMD__
    // Note: For non-zero values, need to use a kernel or hipMemcpy pattern fill
    float * temp = (float *) malloc (totalsize);
    for (size_t j = 0; j < totalsize/sizeof(float); j++)
      temp[j] = fval;
    HIP_CHECK (hipMemcpy ((void *)(ssbo + offset), temp, totalsize, hipMemcpyHostToDevice));
    free (temp);
#else
    HIP_CHECK (hipMemsetD32 (ssbo + offset, bits, totalsize/sizeof(float)));
#endif
#else
    fprintf (stderr, "%s:%d: error: not implemented yet\n", __FILE__, __LINE__);
#endif
  }
}

void finalize_shader (Shader * shader, External * externals, External * merged,
                      unsigned ng[2], unsigned nwg[2])
{
  for (int i = 0; i < 2; i++)
    shader->ng[i] = ng[i], shader->nwg[i] = nwg[i];

Get the SSBO pointer.

  size_t size;
  HIP_CHECK (hipModuleGetGlobal(&shader->_data, &size, shader->module, "_data"));
  assert (size == sizeof (ssbo));

Get the csOrigin pointer.

  HIP_CHECK (hipModuleGetGlobal(&shader->csOrigin, &size, shader->module, "csOrigin"));
  assert (size == 2*sizeof (int));

Make list of uniforms

  for (External * g = merged; g; g = g->next)
    g->used = 0;
  int index = 1;
  for (External * g = externals; g && g->name; g++)
    g->used = index++;
  int nuniforms = 0;
  for (const External * g = merged; g; g = g->next) {
    if (g->name[0] == '.') continue;
    if (IS_EXTERNAL_CONSTANT(g)) continue;
    if (g->type == sym_function_declaration || g->type == sym_function_definition) continue;
    if (g->type == sym_INT && (!strcmp (g->name, "N") ||
			       !strcmp (g->name, "nl") ||
			       !strcmp (g->name, "bc_period_x") ||
			       !strcmp (g->name, "bc_period_y")))
      continue;
    if (g->type == sym_INT ||
	g->type == sym_LONG ||
	g->type == sym_FLOAT ||
	g->type == sym_DOUBLE ||
	g->type == sym__COORD ||
	g->type == sym_COORD ||
	g->type == sym_BOOL ||
	g->type == sym_VEC4) {
      char * name = str_append (NULL, EXTERNAL_NAME (g));
      hipDeviceptr_t location = 0;
      size_t size;
      hipModuleGetGlobal (&location, &size, shader->module, name);
      if (location) {
        //        fprintf (stderr, "%s %d %ld\n", name, g->type, size);
	// not an array or just a one-dimensional array
	assert (!g->nd);
	assert (!g->data || ((int *)g->data)[1] == 0);
	int nd = g->data ? ((int *)g->data)[0] : 1;
        switch (g->type) {
        case sym_INT: case sym_LONG:
          assert (size == sizeof(int)*nd); break;
        case sym_FLOAT:
          assert (size == sizeof(float)*nd); break;
        case sym_VEC4:
          nd *= 4;
          assert (size == sizeof(float)*nd); break;
        case sym_BOOL:
          assert (size == sizeof(bool)*nd); break;
#if SINGLE_PRECISION
        case sym_DOUBLE:
          assert (size == sizeof(float)*nd); break;
        case sym__COORD:
          nd *= 2;
          assert (size == sizeof(float)*nd); break;
        case sym_COORD:
          nd *= 3;
          assert (size == sizeof(float)*nd); break;
#else // DOUBLE_PRECISION
        case sym_DOUBLE:
          assert (size == sizeof(double)*nd); break;
        case sym__COORD:
          nd *= 2;
          assert (size == sizeof(double)*nd); break;
        case sym_COORD:
          nd *= 3;
          assert (size == sizeof(double)*nd); break;
#endif // DOUBLE_PRECISION
        default:
          assert (false);
        }
	shader->uniforms = realloc (shader->uniforms, (nuniforms + 2)*sizeof(MyUniform));
	shader->uniforms[nuniforms] = (MyUniform){
	  .location = location, .type = g->type, .nd = nd, .size = size,
	  .local = g->global == 1 ? -1 : g->used - 1,
	  .pointer = g->global == 1 ? g->pointer : NULL
        };
	shader->uniforms[nuniforms + 1].type = 0;
	nuniforms++;
	// uniforms refering to local variables must be in the 'externals' local list
	assert (g->global == 1 || g->used);
      }
      else
        fprintf (stderr, "%s not found\n", name);
      free (name);
    }
  }
}

void post_setup_shader (Shader * shader, External * externals)
{

Set SSBO pointer.

  assert (ssbo);
  if (shader->ssbo != ssbo) {
    HIP_CHECK (hipMemcpyAsync ((void *)shader->_data, &ssbo, sizeof (ssbo),
                               hipMemcpyHostToDevice, stream));
    shader->ssbo = ssbo;
  }

Set uniforms

  for (MyUniform * g = shader->uniforms; g && g->type; g++) {
    void * pointer = g->pointer;
    if (!pointer) {
      assert (g->local >= 0);
      pointer = externals[g->local].pointer;
    }
    switch (g->type) {
    case sym_INT: case sym_FLOAT: case sym_VEC4: case sym_BOOL:
      g->last = memcpycheck (g->location, pointer, g->last, g->size);
      break;
    case sym_LONG: {
      int p[g->nd];
      long * data = pointer;
      for (int i = 0; i < g->nd; i++)
	p[i] = data[i];
      g->last = memcpycheck (g->location, p, g->last, g->size);
      break;
    }
#if SINGLE_PRECISION
    case sym_DOUBLE: case sym__COORD: case sym_COORD: {
      float p[g->nd];
      double * data = pointer;
      for (int i = 0; i < g->nd; i++)
	p[i] = data[i];
      g->last = memcpycheck (g->location, p, g->last, g->size);
      break;
    }
#else // DOUBLE_PRECISION
    case sym_DOUBLE: case sym__COORD: case sym_COORD:
      g->last = memcpycheck (g->location, pointer, g->last, g->size);
      break;
#endif // DOUBLE_PRECISION
    default:
      assert (false);
    }
  }
}

int run_shader (const Shader * shader, const RegionParameters * region)
{

Render

If this is a foreach_point() iteration, we access a single point

  int Nl = region->level > 0 ? 1 << (region->level - 1) : N/Dimensions.x;
  if (region->n.x == 1 && region->n.y == 1) {
    int csOrigin[] = {
      (region->p.x - X0)/L0*Nl*Dimensions.x,
      (region->p.y - Y0)/L0*Nl*Dimensions.x
    };
    HIP_CHECK (hipMemcpyHtoDAsync (shader->csOrigin, csOrigin, 2*sizeof(int), stream));
    assert (!GPUContext.fragment_shader);
    HIP_CHECK (hipModuleLaunchKernel (shader->kernel,
                                      1, 1, 1,
                                      1, 1, 1,
                                      0, stream, NULL, NULL));
  }

This is a region

  else if (region->n.x || region->n.y) {
#if 1
    assert (false);
#else
    float vsScale[] = {
      (region->box[1].x - region->box[0].x)/L0,
      (region->box[1].y - region->box[0].y)/L0
    };
    float vsOrigin[] = { (region->box[0].x - X0)/L0, (region->box[0].y - Y0)/L0 };
    GL_C (glUniform2fv (1, 1, vsOrigin));
    GL_C (glUniform2fv (2, 1, vsScale));
    assert (GPUContext.fragment_shader);
    GL_C (glMemoryBarrier (GL_SHADER_STORAGE_BARRIER_BIT));
    GL_C (glDrawArrays (GL_TRIANGLES, 0, 6));
#endif
  }

  else {
    assert (!GPUContext.fragment_shader);
    HIP_CHECK (hipModuleLaunchKernel (shader->kernel,
                                      shader->ng[0], shader->ng[1], 1,
                                      shader->nwg[0], shader->nwg[1], 1,
                                      0, stream, NULL, NULL));
  }
  return Nl;
}

void gpu_free_solver (void)
{
#ifdef __HIP_PLATFORM_AMD__
  // AMD ROCm - use Runtime API
  HIP_CHECK (hipDeviceSynchronize ());
  if (stream) {
    HIP_CHECK (hipStreamDestroy (stream));
    stream = 0;
  }
  HIP_CHECK (hipDeviceReset ());
#else
  // NVIDIA via HIP-on-CUDA - use Driver API
  HIP_CHECK (hipCtxSynchronize ());
  HIP_CHECK (hipCtxDestroy (ctx));
#endif
  ctx = 0;
}

void gpu_synchronize()
{
  if (ctx) {
#ifdef __HIP_PLATFORM_AMD__
    HIP_CHECK (hipDeviceSynchronize ());
#else
    HIP_CHECK (hipCtxSynchronize ());
#endif
  }
}

Reductions

static char kernel_source[] =
"#define REDUCE(reduced,rhs) reduced += rhs                                          \n"
"extern \"C\"\n"
"__global__ void reduce (const float* input, float* output, int n){\n"
"    __shared__ float sdata[256];                       \n"
"    unsigned int tid = threadIdx.x;                    \n"
"    unsigned int i = blockIdx.x * blockDim.x * 2 + tid;\n"
"    float reduced = 0.0f;                              \n"
"    if (i < n)                                         \n"
"        REDUCE (reduced, input[i]);                    \n"
"    if (i + blockDim.x < n)                            \n"
"        REDUCE (reduced, input[i + blockDim.x]);       \n"
"    sdata[tid] = reduced;                              \n"
"    __syncthreads();                                   \n"
"                                                       \n"
"    for (unsigned int s = blockDim.x/2; s > 32; s >>= 1){\n"
"        if (tid < s)                                    \n"
"            REDUCE (sdata[tid], sdata[tid + s]);        \n"
"        __syncthreads();                                \n"
"    }                                                   \n"
"    if (tid < 32) {                                     \n"
"        volatile float* smem = sdata;                   \n"
"        REDUCE (smem[tid], smem[tid + 32]);             \n"
"        REDUCE (smem[tid], smem[tid + 16]);             \n"
"        REDUCE (smem[tid], smem[tid + 8]);              \n"
"        REDUCE (smem[tid], smem[tid + 4]);              \n"
"        REDUCE (smem[tid], smem[tid + 2]);              \n"
"        REDUCE (smem[tid], smem[tid + 1]);              \n"
"    }                                                   \n"
"    if (tid == 0)                                       \n"
"        output[blockIdx.x] = sdata[0];                  \n"
"}                                                       \n";

static hipFunction_t compile_kernel (const char * start, const char * op)
{
  hiprtcProgram prog;
  char * s = kernel_source + strlen("#define REDUCE(reduced,rhs) ");
  memcpy (s, op, strlen (op));
  s += strlen(op); while (*s != '\n') *s++ = ' ';
  s = strstr (kernel_source, "float reduced = ");
  s += strlen ("float reduced = ");
  memcpy (s, start, strlen (start));
  s += strlen(start); while (*s != '\n') *s++ = ' '; 
  HIPRTC_CHECK (hiprtcCreateProgram (&prog, kernel_source, "reduce.cu",
                                     0, NULL, NULL));
  char arch[64];
  architecture (arch);
#ifdef __HIP_PLATFORM_AMD__
  // AMD ROCm - use HIP-compatible options
  const char* options[] = {
    arch,
    "--std=c++14",
    "-ffast-math"
  };
#else
  // NVIDIA via HIP-on-CUDA
  const char* options[] = {
    arch,
    "--std=c++11",
    "--use_fast_math"
  };
#endif
  if (hiprtcCompileProgram (prog, 3, options) != HIPRTC_SUCCESS) {
    //    fputs (kernel_source, stderr);
    size_t log_size;
    HIPRTC_CHECK (hiprtcGetProgramLogSize (prog, &log_size));
    if (log_size) {
      char * log = (char *)malloc (log_size);
      HIPRTC_CHECK (hiprtcGetProgramLog (prog, log));
      fprintf (stderr, "%s:%d: %s\n", __FILE__, __LINE__, log);
      free (log);
    }
    exit (EXIT_FAILURE);
  }

  /*    Extract PTX    */
  size_t ptx_size;
  HIPRTC_CHECK (hiprtcGetCodeSize (prog, &ptx_size));
  char * ptx = (char *) malloc (ptx_size);
  HIPRTC_CHECK (hiprtcGetCode (prog, ptx));
  HIPRTC_CHECK (hiprtcDestroyProgram (&prog));

  /*    Load module    */
  hipModule_t module;
  HIP_CHECK (hipModuleLoadData (&module, ptx));
  free (ptx);

  hipFunction_t kernel;
  HIP_CHECK(hipModuleGetFunction (&kernel, module, "reduce"));
  return kernel;
}

static
float cuda_reduce (hipDeviceptr_t d_input, const size_t N, const char op)
{
  /*    Compile kernel with HIPRTC    */
  
  hipFunction_t kernel;
  switch (op) {
  case '+': {
    static hipFunction_t k = 0;
    if (!k) k = compile_kernel ("0.0f;", "reduced += rhs");
    kernel = k;
    break;
  }
  case 'm': {
    static hipFunction_t k = 0;
    if (!k) k = compile_kernel ("3e38f;", "reduced = min(reduced,rhs)");
    kernel = k;
    break;
  }
  case 'M': {
    static hipFunction_t k = 0;
    if (!k) k = compile_kernel ("-3e38f;", "reduced = max(reduced,rhs)");
    kernel = k;
    break;
  }
  default:
    assert (0);
  }

  /*    Allocate device memory    */

  static size_t Np = 0;
  static hipDeviceptr_t d_output_a = 0, d_output_b = 0;
  if (N > Np) {
    const size_t max_blocks = (N + 511)/512;
    if (d_output_a) {
      HIP_CHECK (hipFree ((void *) d_output_a));
      HIP_CHECK (hipFree ((void *) d_output_b));
    }
    HIP_CHECK (hipMalloc ((void **) &d_output_a, max_blocks*sizeof(float)));
    HIP_CHECK (hipMalloc ((void **) &d_output_b, max_blocks*sizeof(float)));
    Np = N;
  }
    
  /*    Multi-pass reduction    */

  hipDeviceptr_t input = d_input, output = d_output_a;
  size_t current_n = N;
  while (current_n > 1) {
    const size_t threads = 256;
    size_t blocks = (current_n + threads*2 - 1)/(threads*2);
    void * args[] = {&input, &output, &current_n};
    HIP_CHECK (hipModuleLaunchKernel (kernel,
                                      blocks, 1, 1, threads,
                                      1, 1, 0, stream,
                                      args, NULL));
    //    HIP_CHECK (hipCtxSynchronize());
    current_n = blocks;
    input = output;
    output = (output == d_output_a) ? d_output_b : d_output_a;
  }

  /*        Copy result back        */  
  float gpu_reduce;
  HIP_CHECK (hipMemcpy (&gpu_reduce, (void *)input, sizeof(float), hipMemcpyDeviceToHost));
#if 0
  /*    Cleanup    */
  HIP_CHECK(hipFree(d_output_a));
  HIP_CHECK(hipFree(d_output_b));
  HIP_CHECK(hipModuleUnload(module));
#endif
  return gpu_reduce;
}

double gpu_reduction (size_t offset,
                      const char op,
                      const RegionParameters * region,
                      GPUData * data,
                      size_t nb)
{
  if (region->n.x == 1 && region->n.y == 1) {
    int i = (region->p.x - X0)/L0*N;
    int j = (region->p.y - Y0)/L0*N;
    if (i < 0 || i >= N || j < 0 || j >= N)
      return 0.;
    offset += i*N + j;
    nb = 1;
  }
  return cuda_reduce (ssbo + offset*sizeof(real), nb, op);
}