src/grid/gpu/reduction.h

    double cpu_reduction (GLuint * src, size_t offset, size_t nb, const char op)
    {
      GL_C (glMemoryBarrier (GL_BUFFER_UPDATE_BARRIER_BIT));
      double result;
      switch (op) {
      case '+': result =   0.;   break;
      case 'M': result = - HUGE; break;
      case 'm': result =   HUGE; break;
      default: assert (false);
      }
      offset *= sizeof(real), nb *= sizeof(real);
      int index = offset/GPUContext.max_ssbo_size;
      offset -= index*GPUContext.max_ssbo_size;
      while (nb) {
        GL_C (glBindBuffer (GL_SHADER_STORAGE_BUFFER, src[index]));
        size_t size = min (nb, GPUContext.max_ssbo_size - offset);
        real * a = glMapBufferRange (GL_SHADER_STORAGE_BUFFER, offset, size, GL_MAP_READ_BIT);
        assert (a);
        switch (op) {
        case '+':
          for (size_t i = 0; i < size/sizeof(real); i++, a++)
    	result += *a;
          break;
        case 'M':
          for (size_t i = 0; i < size/sizeof(real); i++, a++)
    	result = fmax (result, *a);
          break;
        case 'm':
          for (size_t i = 0; i < size/sizeof(real); i++, a++)
    	result = fmin (result, *a);
          break;
        default: assert (false);
        }
        assert (glUnmapBuffer (GL_SHADER_STORAGE_BUFFER));
        nb -= size, offset = 0, index++;
      }
      GL_C (glBindBuffer (GL_SHADER_STORAGE_BUFFER, 0));
      return result;
    }
    
    double gpu_reduction (size_t offset, const char op, const RegionParameters * region, size_t nb)
    {
      const int stride = 64, nwgr = 64;
      bool is_foreach_point = (region->n.x == 1 && region->n.y == 1);
      if (!is_foreach_point && nb < nwgr*stride)
        return cpu_reduction (GPUContext.ssbo, offset, nb, op);
      
      GLuint * br = gpu_grid->reduct;
      if (!br[0]) {
        GL_C (glGenBuffers (2, br));
        for (int i = 0; i < 2; i++) {
          GL_C (glBindBuffer (GL_SHADER_STORAGE_BUFFER, br[i]));
          size_t size = (sq((size_t)N + 1)/stride + 1)*sizeof(real);
          assert (size < GPUContext.max_ssbo_size); // must fit within a single SSBO
          GL_C (glBufferData (GL_SHADER_STORAGE_BUFFER, size, NULL, GL_DYNAMIC_READ));
        }
        GL_C (glBindBuffer (GL_SHADER_STORAGE_BUFFER, 0));
      }
      
      const char * operation;
      static const char * opsum =
        "  real reduct = 0.;\n"
        "  for (uint j = 0; j < stride; j++, index++) {\n"
        "    val = _data_val(index);\n"
        "    reduct += val;\n"
        "  }\n";
      static const char * opmin =
        "  real reduct = val;\n"
        "  for (uint j = 0; j < stride; j++, index++) {\n"
        "    val = _data_val(index);\n"
        "    if (val < reduct) reduct = val;\n"
        "  }\n";
      static const char * opmax =
        "  real reduct = val;\n"
        "  for (uint j = 0; j < stride; j++, index++) {\n"
        "    val = _data_val(index);\n"
        "    if (val > reduct) reduct = val;\n"
        "  }\n";
      switch (op) {
      case '+': operation = opsum; break;
      case 'M': operation = opmax; break;
      case 'm': operation = opmin; break;
      default: // unknown reduction operation
        assert (false);
      }
    
      char nwgrs[20], strides[20];
      snprintf (nwgrs, 19, "%d", nwgr);
      snprintf (strides, 19, "%d", stride);
      
      char * fs =
        str_append (NULL,
    		"#version 430\n", glsl_preproc,
    		"layout (std430, binding = 0) writeonly buffer _reduct_layout {"
    		" real _reduct[]; };\n"
    		"layout (std430, binding = 1) readonly buffer _data_layout { real f[]; } _data");
      if (GPUContext.nssbo > 1) {
        char a[20], s[30];
        snprintf (a, 19, "%d", GPUContext.nssbo);
        snprintf (s, 29, "%ld", GPUContext.max_ssbo_size/sizeof(real));
        fs = str_append (fs, "[", a, "];\n"
    		     "#define _data_val(index) _data[(index)/", s, "].f[(index)%", s, "]\n");
      }
      else
        fs = str_append (fs, ";\n"
    		     "#define _data_val(index) _data.f[index]\n");
      fs = str_append (fs,
    		   "layout (location = 3) uniform uint offset;\n"
    		   "layout (location = 4) uniform uint nb;\n"
    		   "layout (location = 5) uniform uint nbr;\n"
    		   "layout (local_size_x = ", nwgrs, ") in;\n"
    		   "void main() {\n"
    		   "if (gl_GlobalInvocationID.x < nb) {\n"
    		   "  uint stride = ", strides, ";\n"
    		   "  uint index = stride*gl_GlobalInvocationID.x;\n"
    		   "  if (index + stride > nbr) stride = nbr - index;\n"
    		   "  index += offset;\n"
    		   "  real val = _data_val(index);\n",
    		   operation,
    		   "  _reduct[gl_GlobalInvocationID.x] = reduct;\n"
    		   "}}\n");
      Shader * shader;
      Adler32Hash hash;
      a32_hash_init (&hash);
      a32_hash_add (&hash, fs, strlen (fs));
      shader = load_shader (fs, a32_hash (&hash), NULL);
      assert (shader);
      if (shader->id != GPUContext.current_shader) {
        GL_C (glUseProgram (shader->id));
        GPUContext.current_shader = shader->id;
      }
      const GLint loffset = 3, lnb = 4, lnbr = 5;
    
      int start = offset*sizeof(real)/GPUContext.max_ssbo_size;
      offset -= start*GPUContext.max_ssbo_size/sizeof(real);
      for (int i = start; i < GPUContext.nssbo; i++)
        GL_C (glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 1 + i - start, GPUContext.ssbo[i]));
    
      if (is_foreach_point) {
        real result = 0.;
        int i = (region->p.x - X0)/L0*N;
        int j = (region->p.y - Y0)/L0*N;
        if (i >= 0 && i < N && j >= 0 && j < N) {
          offset += i*N + j;
          GL_C (glUniform1ui (loffset, offset));
          GL_C (glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 0, br[0]));
          GL_C (glUniform1ui (lnbr, 1));
          GL_C (glUniform1ui (lnb, 1));
          GL_C (glMemoryBarrier (GL_SHADER_STORAGE_BARRIER_BIT));
          GL_C (glDispatchCompute (1, 1, 1));
          GL_C (glBindBuffer (GL_SHADER_STORAGE_BUFFER, br[0]));
          GL_C (glGetBufferSubData (GL_SHADER_STORAGE_BUFFER, 0, sizeof (real), &result));
          GL_C (glBindBuffer (GL_SHADER_STORAGE_BUFFER, 0));
        }
        return result;
      }
      
      GL_C (glUniform1ui (loffset, offset));
      int src = 0, dst = 1;
      while (nb >= nwgr*stride) {
        GL_C (glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 0, br[dst]));
        GL_C (glUniform1ui (lnbr, nb));
        if (nb % stride) {
          nb /= stride;
          nb++;
        }
        else
          nb /= stride;
        int ng = nb/nwgr;
        if (ng*nwgr < nb)
          ng++;
        GL_C (glUniform1ui (lnb, nb));
        GL_C (glMemoryBarrier (GL_SHADER_STORAGE_BARRIER_BIT));
        GL_C (glDispatchCompute (ng, 1, 1));
        swap (int, src, dst);
        if (offset) {
          GL_C (glUniform1ui (loffset, 0));
          offset = 0;
        }
        GL_C (glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 1, br[src]));
      }
    
      return cpu_reduction (br + src, 0, nb, op);
    }