/**
# Parallel scalability

This code can be used to test the scalability of common operations
(traversal, restriction, boundary conditions, etc.) and their
combinations, in particular the multigrid Poisson solver. */

#include "poisson.h"
#include "utils.h"

scalar a[], b[];

static void mpi_print (timer t, int i, long tnc, const char * name)
{
  double mpi[npe()];
  timing s = timer_timing (t, i, tnc, mpi);
#if 0
  scalar wt[];
  foreach()
    wt[] = mpi[pid()];
  char fname[80];
  sprintf (fname, "%s-%d.ppm", name, npe());
  FILE * fp = pid() ? NULL : fopen (fname, "w");
  output_ppm (wt, fp, n = 512);
#endif
  if (pid() == 0) {

    /**
    *s.min/i*, *s.avg/i*, *s.max/i* are the minimum, average and maximum
    times spent in communication routines. */

    printf ("%d %g %g %g %s %g %g %g %.2g%% %ld %ld",
            npe(), s.cpu/i, s.real/i, s.speed, name,
            s.min/i, s.avg/i, s.max/i, 100.*s.avg/s.real,
            s.mem, s.tnc/i);

    /**
    We also output the times spent in communication routines for each
    process. */

#if 0
    for (int j = 0; j < npe(); j++)
      printf (" %g", mpi[j]/i);
#endif

    /**
    If [memory profiling](/src/README.mtrace) is enabled we output the
    maximum allocated memory (in kilobytes). */

@if MTRACE
    printf (" %ld", pmtrace.max/1024);
@endif
    printf ("\n");
  }
  MPI_Barrier (MPI_COMM_WORLD);
}

int main (int argc, char * argv[])
{
  int maxlevel = argc > 1 ? atoi(argv[1]) : (dimension == 2 ? 8 : 5);
  timer t;

  double speed = 1e6; // approximate speed in points/sec/core
  long tnc = ((long)1) << (dimension*maxlevel);
  int i, nloops = max(0.1*speed*npe()/(double)tnc, 1);
  if (!pid())
    fprintf (stderr, "grid: %s\nnloops = %d\n", GRIDNAME, nloops);
  if (tnc*nloops/(speed*npe()) > 100.) {
    fprintf (stderr, "this run would probably take more than 100 seconds."
             " Aborting ...\n");
    exit(1);
  }

  size (1[0]);
  init_grid (N);
  foreach()
    a[] = 0., b[] = 0.;
  poisson (a, b); // to force allocation of extra fields
  MPI_Barrier (MPI_COMM_WORLD);
  t = timer_start();
  init_grid (1 << maxlevel);
  tnc = 0;
  foreach(reduction(+:tnc))
    tnc++;
  mpi_print (t, 1, tnc, "refine");
#if TREE
  assert (tree_is_full());
#endif

  /**
  We fill `a` with a simple function. */

  MPI_Barrier (MPI_COMM_WORLD);
  i = 0;
  t = timer_start();
  while (i < nloops) {
    foreach()
      a[] = cos(pi*x)*cos(pi*y)*cos(pi*z);
#if 0
    boundary ({a});
#else
    prof_start ("barrier");
    MPI_Barrier (MPI_COMM_WORLD);
    prof_stop();
#endif
    i++;
  }
  mpi_print (t, i, tnc*i, "cos");
  boundary ({a});

  /**
  Here we compute
  $$
  b = \nabla^2 a
  $$
  using a 5-point (7-point in 3D) Laplacian operator. */

  MPI_Barrier (MPI_COMM_WORLD);
  i = 0;
  t = timer_start();
  while (i < nloops) {
    foreach() {
      b[] = 0.;
      foreach_dimension()
        b[] += a[1] + a[-1];
      b[] = (b[] - 2.*dimension*a[])/sq(Delta);
    }
    boundary ({b});
    i++;
  }
  mpi_print (t, i, tnc*i, "laplacian");

  /**
  Something simpler: the sum of `b` over the entire mesh. */

  MPI_Barrier (MPI_COMM_WORLD);
  i = 0;
  t = timer_start();
  double sum = 0.;
  while (i < nloops) {
    sum = 0.;
    foreach(reduction(+:sum))
      sum += b[];
    i++;
  }
  mpi_print (t, i, tnc*i, "sum");
  if (pid() == 0)
    fprintf (stderr, "sum: %g\n", sum);

  /**
  The restriction operator. */

  MPI_Barrier (MPI_COMM_WORLD);
  i = 0;
  t = timer_start();
  while (i < nloops) {
    boundary ({b});
    restriction ({b});
    i++;
  }
  mpi_print (t, i, tnc*i, "restriction");
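
  /**
  The same pattern, i.e. synchronize, start the timer, repeat the
  operation `nloops` times and report with `mpi_print()`, can be reused
  to time any other operation. As an illustrative sketch (not part of
  the benchmark run), timing only the application of boundary
  conditions on `a` could be written as

~~~c
  MPI_Barrier (MPI_COMM_WORLD);
  i = 0;
  t = timer_start();
  while (i < nloops) {
    boundary ({a}); // update of boundary/ghost values only
    i++;
  }
  mpi_print (t, i, tnc*i, "boundary");
~~~
  */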

  /**
  And finally the Poisson solver. */

  MPI_Barrier (MPI_COMM_WORLD);
  i = 0;
  TOLERANCE = HUGE;
  t = timer_start();
  while (i < nloops) {
    poisson (a, b);
    i++;
  }
  mpi_print (t, i, tnc*i, "poisson");

  scalar e[];
  foreach()
    e[] = a[] - cos(pi*x)*cos(pi*y)*cos(pi*z);
  double max = normf(e).max;
  if (pid() == 0)
    fprintf (stderr, "error: %g\n", max);
  assert (normf(e).max < 1e-10);
  //  output_ppm (e, file = "error.png");

  // map of the memory addresses of a[], used by the (disabled) output below
  sum = 0.;
  int n = 0;
  foreach (serial) {
    e[] = (long) &(a[]);
    sum += e[];
    n++;
  }
  foreach()
    e[] -= sum/n;
#if 0
  FILE * fp = pid() ? NULL : fopen ("map.ppm", "w");
  output_ppm (e, fp, n = 512);
#endif

  // minimum and maximum number of cells per process, i.e. the load balance
  int nmin = n, nmax = n;
  mpi_all_reduce (nmin, MPI_INT, MPI_MIN);
  mpi_all_reduce (nmax, MPI_INT, MPI_MAX);
  if (pid() == 0)
    fprintf (stderr, "balance %d %d\n", nmin, nmax);
}

/**
## How to run on Occigen

This test is run on
[Occigen](https://www.cines.fr/calcul/materiels/occigen/configuration/).
The C99 source code is generated on a system with *qcc* installed and
then copied to Occigen using something like

~~~bash
qcc -grid=octree -source mpi-laplacian.c
scp _mpi-laplacian.c popinet@occigen.cines.fr:
~~~

On Occigen one can then compile the code using

~~~bash
module load bullxmpi
module load intel
mpicc -Wall -std=c99 -O2 -D_MPI=1 -D_GNU_SOURCE=1 _mpi-laplacian.c \
  -o mpi-laplacian -lm
~~~

The following script (*run.sh*) can then be used to run the code

~~~bash
#!/bin/bash
#SBATCH -J basilisk
#SBATCH --nodes=1
#SBATCH --ntasks=2
#SBATCH --ntasks-per-node=2
#SBATCH --threads-per-core=1
#SBATCH --time=00:10:00
#SBATCH --output basilisk.output
#SBATCH --exclusive

LEVEL=11

module purge
module load bullxmpi
module load intel

srun --mpi=pmi2 -K1 --resv-ports -n $SLURM_NTASKS ./mpi-laplacian $LEVEL \
  2> log-$LEVEL-$SLURM_NTASKS > out-$LEVEL-$SLURM_NTASKS
~~~

LEVEL is varied from 9 (~134 million grid points) to 11 (~8.6 billion
grid points) and the number of processes up to 16384 using something like

~~~bash
sbatch --ntasks=1 --ntasks-per-node=1 run.sh
sbatch --ntasks=2 --ntasks-per-node=2 run.sh
sbatch --ntasks=4 --ntasks-per-node=4 run.sh
sbatch --ntasks=8 --ntasks-per-node=8 run.sh
sbatch --ntasks=16 --ntasks-per-node=16 run.sh
sbatch --ntasks=32 --ntasks-per-node=16 --nodes=2 run.sh
sbatch --ntasks=64 --ntasks-per-node=16 --nodes=4 run.sh
sbatch --ntasks=128 --ntasks-per-node=16 --nodes=8 run.sh
sbatch --ntasks=256 --ntasks-per-node=16 --nodes=16 run.sh
sbatch --ntasks=512 --ntasks-per-node=16 --nodes=32 run.sh
sbatch --ntasks=1024 --ntasks-per-node=16 --nodes=64 run.sh
sbatch --ntasks=2048 --ntasks-per-node=16 --nodes=128 run.sh
sbatch --ntasks=4096 --ntasks-per-node=16 --nodes=256 run.sh
sbatch --ntasks=8192 --ntasks-per-node=16 --nodes=512 run.sh
sbatch --ntasks=16384 --ntasks-per-node=24 --nodes=683 run.sh
~~~

The results can then be collected using

~~~bash
tar czvf occigen.tgz out-*-*
~~~

## Results on Occigen

The memory usage per core is given below. The increase for a large
number of cores corresponds to the memory overhead of communication
buffers, etc.

~~~gnuplot Memory usage on Occigen
# generate results for Occigen
cd 'occigen/3D'

# generate weak scaling curves
! bash weak.sh > weak

set logscale
set logscale x 2
set grid
set xrange [2:32768]
set format x '2^{%L}'
set xtics 2
set xlabel "# of cores"
set ylabel "Memory/core (GB)"
minlevel=9
maxlevel=11
plot [][0.1:] for [i=minlevel:maxlevel] \
  '< sh table.sh poisson '.i u 1:($2/$1) t ''.i.' levels' w lp, \
  18/x**0.9 t '18/x^{0.9}'
cd '../..'
~~~

The wall-clock time for one iteration of the multigrid Poisson solver is
given below. The pink lines connect points corresponding to weak (or
*iso-granular*) scaling, i.e. the problem size and the number of cores
are both multiplied by eight from one point to the next. Ideal weak
scaling would give horizontal lines (i.e. a constant computation time
when the problem size and the number of cores increase proportionally).
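
Along such a weak-scaling curve the number of grid points per core is
constant,
$$
\frac{2^{3\,\mathrm{LEVEL}}}{\text{\# of cores}} = \text{constant},
$$
so that, to give purely illustrative numbers, level 9 on 16 cores and
level 10 on 128 cores correspond to the same granularity of
$2^{27}/16 = 2^{30}/128 \approx 8.4$ million grid points per core.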

~~~gnuplot Wall-clock time on Occigen for the Poisson solver
cd 'occigen/3D'
set ylabel 'Time (sec)'
plot [][0.01:100] for [i=minlevel:maxlevel] \
  '< sh time.sh poisson '.i u 1:2 t ''.i.' levels' w lp, \
  'weak' u 1:2 w lp t 'weak scaling', \
  600/x**0.95 t '600/x^{0.95}'
cd '../..'
~~~

The time spent in communication routines is illustrated below.

~~~gnuplot Communication time on Occigen for the Poisson solver
cd 'occigen/3D'
plot [][1e-2:10] for [i=minlevel:maxlevel] \
  '< sh time.sh poisson '.i u 1:3 w lp t ''.i.' levels', \
  4.5/x**0.65 t '4.5/x^{0.65}'
cd '../..'
~~~

Similar results are obtained for a pure Laplacian.

~~~gnuplot Wall-clock time on Occigen for the Laplacian
cd 'occigen/3D'
plot [][1e-4:10] for [i=minlevel:maxlevel] \
  '< sh time.sh laplacian '.i u 1:2 w lp t ''.i.' levels', \
  50/x**0.93 t '50/x^{0.93}'
cd '../..'
~~~

~~~gnuplot Communication time on Occigen for the Laplacian
cd 'occigen/3D'
plot [][1e-4:1] for [i=minlevel:maxlevel] \
  '< sh time.sh laplacian '.i u 1:3 w lp t ''.i.' levels', \
  2./x**0.7 t '2/x^{0.7}'
cd '../..'
~~~

And for the restriction operator.

~~~gnuplot Wall-clock time on Occigen for the restriction operator
cd 'occigen/3D'
plot [][:1] for [i=minlevel:maxlevel] \
  '< sh time.sh restriction '.i u 1:2 w lp t ''.i.' levels', \
  18/x**0.85 t '18/x^{0.85}'
cd '../..'
~~~

~~~gnuplot Communication time on Occigen for the restriction operator
cd 'occigen/3D'
plot [][1e-3:1] for [i=minlevel:maxlevel] \
  '< sh time.sh restriction '.i u 1:3 w lp t ''.i.' levels', \
  2.8/x**0.66 t '2.8/x^{0.66}'
cd '../..'
~~~
*/
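
/**
The power-law fits in the figures above can be read as a rough measure of
the departure from ideal strong scaling, which would correspond to
$$
t \propto \frac{1}{\text{\# of cores}},
$$
i.e. an exponent of one: the wall-clock times decrease with exponents
close to one (0.95 for the Poisson solver, 0.93 for the Laplacian and
0.85 for the restriction operator), while the communication times
decrease more slowly, with exponents between 0.65 and 0.7. */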