src/grid/gpu/Benchmarks/bump2D-gpu

    rm -f bump2D-gpu.ctst && CFLAGS='-fopenmp -DBENCHMARK -DSINGLE_PRECISION' make bump2D-gpu.ctst
    
    for i in 6 7 8 9 10 11; do OMP_NUM_THREADS=8 ./bump2D-gpu/bump2D-gpu $i; done
    
    # Multigrid, 217 steps, 0.584182 CPU, 0.07575 real, 1.17e+07 points.step/s, 27 var
    # Multigrid, 437 steps, 2.08056 CPU, 0.2613 real, 2.74e+07 points.step/s, 27 var
    # Multigrid, 885 steps, 12.9367 CPU, 1.619 real, 3.58e+07 points.step/s, 27 var
    # Multigrid, 1789 steps, 99.9439 CPU, 12.5 real, 3.75e+07 points.step/s, 27 var
    # Multigrid, 3615 steps, 1110.68 CPU, 138.9 real, 2.73e+07 points.step/s, 27 var
    # Multigrid, 7303 steps, 9532.31 CPU, 1192 real, 2.57e+07 points.step/s, 27 var
    
    rm -f bump2D-gpu.tst && CFLAGS='-DBENCHMARK' make bump2D-gpu.tst
    
    OpenGL renderer string: Mesa Intel(R) UHD Graphics (TGL GT1) (0x9a60)
    Video memory: 3072MB
    
    for i in 6 7 8 9 10 11; do ./bump2D-gpu/bump2D-gpu $i; done
    
    # Cartesian (GPU), 217 steps, 1.72295 CPU, 1.806 real, 4.92e+05 points.step/s, 28 var
    # Cartesian (GPU), 437 steps, 0.32242 CPU, 0.5966 real, 1.2e+07 points.step/s, 28 var
    # Cartesian (GPU), 885 steps, 1.96664 CPU, 3.117 real, 1.86e+07 points.step/s, 28 var
    # Cartesian (GPU), 1789 steps, 3.42581 CPU, 11.82 real, 3.97e+07 points.step/s, 28 var
    # Cartesian (GPU), 3615 steps, 6.07372 CPU, 107.7 real, 3.52e+07 points.step/s, 28 var
    # Cartesian (GPU), 7303 steps, 15.5081 CPU, 955 real, 3.21e+07 points.step/s, 28 var
    
    OpenGL renderer string: NVIDIA GeForce RTX 3050 Ti Laptop GPU/PCIe/SSE2
    Dedicated video memory: 4096 MB
    
    for i in 6 7 8 9 10 11; do __NV_PRIME_RENDER_OFFLOAD=1 __GLX_VENDOR_LIBRARY_NAME=nvidia ./bump2D-gpu/bump2D-gpu $i; done
    
    # Cartesian (GPU), 217 steps, 0.061795 CPU, 0.06184 real, 1.44e+07 points.step/s, 28 var
    # Cartesian (GPU), 437 steps, 0.124741 CPU, 0.1248 real, 5.74e+07 points.step/s, 28 var
    # Cartesian (GPU), 885 steps, 0.417376 CPU, 0.4174 real, 1.39e+08 points.step/s, 28 var
    # Cartesian (GPU), 1789 steps, 2.01836 CPU, 2.018 real, 2.32e+08 points.step/s, 28 var
    # Cartesian (GPU), 3615 steps, 12.9819 CPU, 12.98 real, 2.92e+08 points.step/s, 28 var
    # Cartesian (GPU), 7303 steps, 100.6 CPU, 100.6 real, 3.04e+08 points.step/s, 28 var
    
    OpenGL renderer string: Quadro RTX 6000/PCIe/SSE2
    Dedicated video memory: 24576 MB
    
    for i in 6 7 8 9 10 11; do ./bump2D-gpu/bump2D-gpu $i; done
    
    # Cartesian (GPU), 217 steps, 0.410228 CPU, 0.4102 real, 2.17e+06 points.step/s, 28 var
    # Cartesian (GPU), 437 steps, 0.079351 CPU, 0.07935 real, 9.02e+07 points.step/s, 28 var
    # Cartesian (GPU), 885 steps, 0.567819 CPU, 0.5678 real, 1.02e+08 points.step/s, 28 var
    # Cartesian (GPU), 1789 steps, 1.1364 CPU, 1.136 real, 4.13e+08 points.step/s, 28 var
    # Cartesian (GPU), 3615 steps, 4.36244 CPU, 4.363 real, 8.69e+08 points.step/s, 28 var
    # Cartesian (GPU), 7303 steps, 29.2246 CPU, 29.23 real, 1.05e+09 points.step/s, 28 var
    
    rm -f bump2D-gpu.tst && CFLAGS='-DBENCHMARK -DTRACE=3' make bump2D-gpu.tst
    # Ignore the diff error reported by this make step, since the log has not been generated
    
    OpenGL renderer string: NVIDIA GeForce RTX 3050 Ti Laptop GPU/PCIe/SSE2
    Dedicated video memory: 4096 MB
    
    __NV_PRIME_RENDER_OFFLOAD=1 __GLX_VENDOR_LIBRARY_NAME=nvidia bump2D-gpu/bump2D-gpu 10
    
    # Cartesian (GPU), 3615 steps, 13.3367 CPU, 13.34 real, 2.84e+08 points.step/s, 28 var
       calls    total     self   % total   function
        7230     5.12     4.23     31.7%   foreach():/src/saint-venant.h:275
        7230     2.60     2.57     19.2%   foreach():/src/utils.h:266
        7230     2.50     2.47     18.5%   foreach():/src/saint-venant.h:321
        7230     2.41     2.38     17.8%   foreach():/src/saint-venant.h:129
        7230     0.69     0.69      5.1%   gpu_reduction():/src/saint-venant.h:207
        7230    10.84     0.59      4.4%   update_saint_venant():/src/saint-venant.h:331
       28923     0.31     0.18      1.4%   setup_shader():/src/grid/gpu/grid.h:1403
    
    OpenGL renderer string: Quadro RTX 6000/PCIe/SSE2
    Dedicated video memory: 24576 MB
    
    bump2D-gpu/bump2D-gpu 10
    
    # Cartesian (GPU), 3615 steps, 4.54817 CPU, 4.548 real, 8.33e+08 points.step/s, 28 var
       calls    total     self   % total   function
        7230     1.75     1.29     28.3%   foreach():/home/user/basilisk/src/saint-venant.h:275
        7230     0.93     0.91     20.0%   foreach():/home/user/basilisk/src/utils.h:266
        7230     0.77     0.75     16.4%   foreach():/home/user/basilisk/src/saint-venant.h:321
        7230     0.74     0.70     15.4%   foreach():/home/user/basilisk/src/saint-venant.h:129
        7230     0.32     0.32      7.0%   gpu_reduction():/home/user/basilisk/src/saint-venant.h:207
        7230     3.78     0.27      6.0%   update_saint_venant():/home/user/basilisk/src/saint-venant.h:331
       28923     0.23     0.12      2.7%   setup_shader():/home/user/basilisk/src/grid/gpu/grid.h:1402
          27     0.12     0.12      2.5%   gpu_cpu_sync_scalar():/home/user/basilisk/src/grid/gpu/grid.h:977
    
    ...
    Device: Mesa Intel(R) UHD Graphics (TGL GT1) (0x9a60)
    ...
    Video memory: 3072MB
    
    ./bump2D-gpu/bump2D-gpu 10
    
    # Cartesian (GPU), 3615 steps, 14.4777 CPU, 121 real, 3.13e+07 points.step/s, 28 var
       calls    total     self   % total   function
        7230    53.22    44.82     37.0%   foreach():/src/saint-venant.h:275
        7230    29.79    29.36     24.3%   foreach():/src/utils.h:266
        7230    19.10    18.53     15.3%   foreach():/src/saint-venant.h:321
        7230    16.67    16.05     13.3%   foreach():/src/saint-venant.h:129
        7230     7.57     7.55      6.2%   gpu_reduction():/src/saint-venant.h:207
       28923     2.45     1.96      1.6%   setup_shader():/src/grid/gpu/grid.h:1396
        7230   103.84     1.73      1.4%   update_saint_venant():/src/saint-venant.h:331
    
    CFLAGS='-DTRACE=2 -grid=cartesian -fopenmp' make bump2D.tst
    cd bump2D
    OMP_NUM_THREADS=8 ./bump2D 10
    
    # Cartesian, 3615 steps, 2115.91 CPU, 264.5 real, 1.43e+07 points.step/s, 27 var
       calls    total     self   % total   function
        7230   223.97   222.45     84.1%   update_saint_venant():/src/saint-venant.h:331
        7230    40.40    40.40     15.3%   advance_saint_venant():/src/saint-venant.h:130
       14460     1.52     1.52      0.6%   boundary_internal():/src/grid/cartesian-common.h:530
           1   264.52     0.14      0.1%   run():/src/predictor-corrector.h:75