src/test/mpi-laplacian.c

Parallel scalability

This code can be used to test the scalability of common operations (traversal, restriction, boundary conditions, etc.) and their combinations, in particular the multigrid Poisson solver.

#include "poisson.h"
#include "utils.h"

scalar a[], b[];

static void mpi_print (timer t, int i, size_t tnc,
		       const char * name)
{
  double mpi[npe()];
  // per-process communication times are gathered in mpi[] and the
  // corresponding global statistics in s
  timing s = timer_timing (t, i, tnc, mpi);

#if 0
  scalar wt[];
  foreach()
    wt[] = mpi[pid()];
  char fname[80];
  sprintf (fname, "%s-%d.ppm", name, npe());
  FILE * fp = pid() ? NULL : fopen (fname, "w");
  output_ppm (wt, fp, n = 512);
#endif
  
  if (pid() == 0) {

s.min/i, s.avg/i and s.max/i are the minimum, average and maximum (across processes) times spent in communication routines.

    printf ("%d %g %g %g %s %g %g %g %.2g%% %ld %ld",
	    npe(), s.cpu/i, s.real/i, s.speed, name, s.min/i, s.avg/i, s.max/i,
	    100.*s.avg/s.real, s.mem, s.tnc/i);
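
The output columns are thus: the number of processes, the CPU and wall-clock times per iteration, the speed (in grid points per second), the name of the operation, the minimum, average and maximum per-process communication times per iteration, the average fraction of wall-clock time spent in communications, the memory field reported by timer_timing() and the number of grid points traversed per iteration.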

We can also output the times spent in communication routines by each process (this is disabled by default).

#if 0
    for (int j = 0; j < npe(); j++)
      printf (" %g", mpi[j]/i);
#endif

If memory profiling is enabled, we output the maximum allocated memory (in kilobytes).

@if MTRACE
    printf (" %ld", pmtrace.max/1024);
@endif
    
    printf ("\n");
  }
  
  MPI_Barrier (MPI_COMM_WORLD);
}

int main (int argc, char * argv[])
{
  int maxlevel = argc > 1 ? atoi(argv[1]) : (dimension == 2 ? 8 : 5);
  timer t;

  double speed = 1e6; // approx speed in points/sec/core
  size_t tnc = ((size_t)1) << (dimension*maxlevel);
  int i, nloops = max(0.1*speed*npe()/(double)tnc, 1);
  if (!pid())
    fprintf (stderr, "grid: %s\nnloops = %d\n", GRIDNAME, nloops);

  if (tnc*nloops/(speed*npe()) > 100.) {
    fprintf (stderr, "this run would probably take more than 100 seconds."
	     " Aborting ...\n");
    exit(1);
  }
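
For example, in 2D with maxlevel = 8 (i.e. tnc = 2^16 = 65536 points) on four processes, nloops = max(0.1 × 10^6 × 4/65536, 1) ≈ 6, and the estimated runtime of each timed operation is tnc × nloops/(speed × npe()) ≈ 0.1 seconds, at the assumed speed of one million points per second and per core (more expensive operations, such as the Poisson solver, will of course take longer).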

  init_grid (N);
  foreach()
    a[] = b[] = 0.;
  boundary ({a, b});
  poisson (a, b); // to force allocation of extra fields
  
  MPI_Barrier (MPI_COMM_WORLD);
  t = timer_start();
  init_grid (1 << maxlevel);
  tnc = 0;
  foreach(reduction(+:tnc))
    tnc++;
  mpi_print (t, 1, tnc, "refine");

#if TREE  
  assert (tree_is_full());
#endif

We fill a with a simple function.

  MPI_Barrier (MPI_COMM_WORLD);
  i = 0;
  t = timer_start();
  while (i < nloops) {
    foreach()
      a[] = cos(pi*x)*cos(pi*y)*cos(pi*z);
#if 0
    boundary ({a});
#else
    /* rather than applying the boundary conditions we just synchronize the
       processes: the barrier is wrapped in prof_start()/prof_stop() so that
       the waiting time is accounted for in the communication statistics */
    prof_start ("barrier");
    MPI_Barrier (MPI_COMM_WORLD);
    prof_stop();
#endif
    i++;
  }
  mpi_print (t, i, tnc*i, "cos");
  boundary ({a});

Here we compute b = ∇²a (the Laplacian of a) using the standard (2·dimension + 1)-point stencil.
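
In two dimensions this is the classical five-point formula
$$
b_{i,j} = \frac{a_{i+1,j} + a_{i-1,j} + a_{i,j+1} + a_{i,j-1} - 4\,a_{i,j}}{\Delta^2}
$$
which the code below generalises to any dimension using foreach_dimension().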

  MPI_Barrier (MPI_COMM_WORLD);
  i = 0;
  t = timer_start();
  while (i < nloops) {
    foreach() {
      b[] = 0.;
      foreach_dimension()
        b[] += a[1] + a[-1];
      b[] = (b[] - 2.*dimension*a[])/sq(Delta);
    }
    boundary ({b});
    i++;
  }
  mpi_print (t, i, tnc*i, "laplacian");

Something simpler: the sum of b over the entire mesh.
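
Under MPI (and ignoring OpenMP), the reduction(+:sum) clause amounts to accumulating a per-process partial sum and then combining the partial sums with MPI_Allreduce, roughly as sketched below (the mpi_all_reduce() calls at the end of this file do the same thing for the cell counts).

  double local = 0.;
  foreach()     // traverses only the cells owned by this process
    local += b[];
  MPI_Allreduce (&local, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);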

  MPI_Barrier (MPI_COMM_WORLD);
  i = 0;
  t = timer_start();
  double sum = 0.;
  while (i < nloops) {
    sum = 0.;
    foreach(reduction(+:sum))
      sum += b[];
    i++;
  }
  mpi_print (t, i, tnc*i, "sum");
  if (pid() == 0)
    fprintf (stderr, "sum: %g\n", sum);

The restriction operator.
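
It computes the values of b on the coarser levels of the grid hierarchy. For a plain cell-centered scalar the default restriction is simply the average of the 2^dimension children of each coarse cell, conceptually something like the sketch below (restriction_average_sketch() is an illustration, not the actual implementation).

static void restriction_average_sketch (Point point, scalar s)
{
  // the coarse (parent) cell value is the average of its children
  double sum = 0.;
  foreach_child()
    sum += s[];
  s[] = sum/(1 << dimension);
}

On a distributed tree this also involves communications whenever a parent cell and its children are owned by different processes.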

  MPI_Barrier (MPI_COMM_WORLD);
  i = 0;
  t = timer_start();
  while (i < nloops) {
    boundary ({b});
    restriction ({b});
    i++;
  }
  mpi_print (t, i, tnc*i, "restriction");

And finally the Poisson solver. The tolerance is set to HUGE below so that the solver performs only its minimum number of multigrid cycles rather than iterating to convergence: the timing is thus (essentially) per multigrid cycle.
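
As an aside, in an actual simulation one would keep a finite tolerance so that the solver iterates until the residual is small enough, e.g. something like (the value below is only illustrative and is not used in this benchmark):

  TOLERANCE = 1e-4;  // hypothetical tolerance
  poisson (a, b);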

  MPI_Barrier (MPI_COMM_WORLD);
  i = 0;
  TOLERANCE = HUGE;
  t = timer_start();
  while (i < nloops) {
    poisson (a, b);
    i++;
  }
  mpi_print (t, i, tnc*i, "poisson");

  scalar e[];
  foreach()
    e[] = a[] - cos(pi*x)*cos(pi*y)*cos(pi*z);
  double max = normf(e).max;
  if (pid() == 0)
    fprintf (stderr, "error: %g\n", max);
  assert (max < 1e-10);
  //  output_ppm (e, file = "error.png");

  /* Build a map of the memory addresses of field a (relative to the
     per-process mean): this can be used to visualise the memory layout and
     the domain decomposition. */
  sum = 0.;
  int n = 0;
  foreach() {
    e[] = (long) &(a[]);
    sum += e[];
    n++;
  }
  foreach()
    e[] -= sum/n;
#if 0
  FILE * fp = pid() ? NULL : fopen ("map.ppm", "w");
  output_ppm (e, fp, n = 512);
#endif
  
  // check the load balance: the minimum and maximum number of cells per process
  int nmin = n, nmax = n;
  mpi_all_reduce (nmin, MPI_INT, MPI_MIN);
  mpi_all_reduce (nmax, MPI_INT, MPI_MAX);
  if (pid() == 0)
    fprintf (stderr, "balance %d %d\n", nmin, nmax);
}

How to run on Occigen

This test is run on Occigen. The C99 source code is generated on a system with qcc installed and then copied to Occigen using something like

qcc -grid=octree -source mpi-laplacian.c
scp _mpi-laplacian.c popinet@occigen.cines.fr:

On Occigen one can then compile the code using

module load bullxmpi
module load intel
mpicc -Wall -std=c99 -O2 -D_MPI=1 -D_GNU_SOURCE=1 _mpi-laplacian.c -o mpi-laplacian -lm

The following script (run.sh) can then be used to run the code

#!/bin/bash
#SBATCH -J basilisk
#SBATCH --nodes=1
#SBATCH --ntasks=2
#SBATCH --ntasks-per-node=2
#SBATCH --threads-per-core=1
#SBATCH --time=00:10:00
#SBATCH --output basilisk.output
#SBATCH --exclusive

LEVEL=11

module purge
module load bullxmpi
module load intel

srun --mpi=pmi2 -K1 --resv-ports -n $SLURM_NTASKS ./mpi-laplacian $LEVEL \
     2> log-$LEVEL-$SLURM_NTASKS > out-$LEVEL-$SLURM_NTASKS

LEVEL is varied from 9 (~134 million grid points) to 11 (~8.6 billion grid points) and the number of processes from 1 up to 16384, using something like

sbatch --ntasks=1 --ntasks-per-node=1 run.sh
sbatch --ntasks=2 --ntasks-per-node=2 run.sh
sbatch --ntasks=4 --ntasks-per-node=4 run.sh
sbatch --ntasks=8 --ntasks-per-node=8 run.sh
sbatch --ntasks=16 --ntasks-per-node=16 run.sh
sbatch --ntasks=32 --ntasks-per-node=16 --nodes=2 run.sh
sbatch --ntasks=64 --ntasks-per-node=16 --nodes=4 run.sh
sbatch --ntasks=128 --ntasks-per-node=16 --nodes=8 run.sh
sbatch --ntasks=256 --ntasks-per-node=16 --nodes=16 run.sh
sbatch --ntasks=512 --ntasks-per-node=16 --nodes=32 run.sh
sbatch --ntasks=1024 --ntasks-per-node=16 --nodes=64 run.sh
sbatch --ntasks=2048 --ntasks-per-node=16 --nodes=128 run.sh
sbatch --ntasks=4096 --ntasks-per-node=16 --nodes=256 run.sh
sbatch --ntasks=8192 --ntasks-per-node=16 --nodes=512 run.sh
sbatch --ntasks=16384 --ntasks-per-node=24 --nodes=683 run.sh

The results can then be collected using

tar czvf occigen.tgz out-*-*

Results on Occigen

The memory usage per core is given below. The increase for large numbers of cores corresponds to the memory overhead of communication buffers, etc.

Figure: Memory usage on Occigen

The wall-clock time for one iteration of the multigrid Poisson solver is given below.

The pink lines connect points corresponding to weak (or iso-granular) scaling, i.e. multiplying both the problem size and the number of cores by eight. Ideal weak scaling would give horizontal lines, i.e. constant computation time for proportionally-increasing problem sizes and numbers of cores.

Figure: Wall-clock time on Occigen for the Poisson solver

The time spent in communication routines is illustrated below.

Figure: Communication time on Occigen for the Poisson solver

Similar results are obtained for a pure Laplacian.

Figure: Wall-clock time on Occigen for the Laplacian

Figure: Communication time on Occigen for the Laplacian

And for the restriction operator.

Figure: Wall-clock time on Occigen for the restriction operator

Figure: Communication time on Occigen for the restriction operator