src/test/mpi-laplacian.c
Parallel scalability
This code can be used to test the scalability of common operations (traversal, restriction, boundary conditions etc…) and their combinations, in particular the multigrid Poisson solver.
#include "poisson.h"
#include "utils.h"
scalar a[], b[];
static void mpi_print (timer t, int i, long tnc,
const char * name)
{
double mpi[npe()];
timing s = timer_timing (t, i, tnc, mpi);
#if 0
scalar wt[];
foreach()
  wt[] = mpi[pid()];
char fname[80];
sprintf (fname, "%s-%d.ppm", name, npe());
FILE * fp = pid() ? NULL : fopen (fname, "w");
output_ppm (wt, fp, n = 512);
#endif
if (pid() == 0) {
s.min/i, s.avg/i and s.max/i are the minimum, average and maximum times spent in communication routines.
printf ("%d %g %g %g %s %g %g %g %.2g%% %ld %ld",
        npe(), s.cpu/i, s.real/i, s.speed, name, s.min/i, s.avg/i, s.max/i,
        100.*s.avg/s.real, s.mem, s.tnc/i);
We also output the times spent in communication routines for each process.
#if 0
for (int j = 0; j < npe(); j++)
printf (" %g", mpi[j]/i);
#endif
If memory profiling is enabled we output the maximum allocated memory (in kilobytes).
@if MTRACE
  printf (" %ld", pmtrace.max/1024);
@endif
printf ("\n");
}
MPI_Barrier (MPI_COMM_WORLD);
}
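For reference, a line written by mpi_print therefore has the following layout (column names inferred from the format string and the timing fields above, so this is an annotation rather than a specification):

npe  cpu/i  real/i  speed  name  min/i  avg/i  max/i  %comm  mem  points/i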
int main (int argc, char * argv[])
{
int maxlevel = argc > 1 ? atoi(argv[1]) : (dimension == 2 ? 8 : 5);
timer t;
double speed = 1e6; // approx speed in points/sec/core
long tnc = ((long)1) << (dimension*maxlevel);
int i, nloops = max(0.1*speed*npe()/(double)tnc, 1);
if (!pid())
fprintf (stderr, "grid: %s\nnloops = %d\n", GRIDNAME, nloops);
if (tnc*nloops/(speed*npe()) > 100.) {
fprintf (stderr, "this run would probably take more than 100 seconds."
" Aborting ...\n");
exit(1);
}
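As a small worked example of this sizing (not part of the test itself, just the arithmetic with the default values above):

/* 3D default: maxlevel = 5 on a single process
   tnc    = 2^(3*5) = 32768 points
   nloops = max(0.1*1e6*1/32768, 1) = 3
   estimated run time = tnc*nloops/(speed*npe()) = 32768*3/1e6 ~ 0.1 s,
   well below the 100-second limit checked above. */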
size (1[0]);
init_grid (N);
foreach()
  a[] = 0., b[] = 0.;
poisson (a, b); // to force allocation of extra fields

MPI_Barrier (MPI_COMM_WORLD);
t = timer_start();
init_grid (1 << maxlevel);
tnc = 0;
foreach(reduction(+:tnc))
  tnc++;
mpi_print (t, 1, tnc, "refine");
#if TREE
assert (tree_is_full());
#endif
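The timing labelled refine therefore measures the cost of building the uniformly refined grid at maxlevel (plus one parallel traversal used to count the cells), and on tree grids the assertion above checks that the tree is indeed fully refined.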
We fill a with a simple function.
MPI_Barrier (MPI_COMM_WORLD);
i = 0;
t = timer_start();
while (i < nloops) {
  foreach()
    a[] = cos(pi*x)*cos(pi*y)*cos(pi*z);
#if 0
  boundary ({a});
#else
  prof_start ("barrier");
  MPI_Barrier (MPI_COMM_WORLD);
  prof_stop ();
#endif
  i++;
}
mpi_print (t, i, tnc*i, "cos");
boundary ({a});
Here we compute
$$
b = \nabla^2 a
$$
using the standard 5-point (in 2D; 7-point in 3D) Laplacian stencil.
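Written out (a restatement of the loop below, with D the number of dimensions and Δ the cell size):
$$
b_i = \frac{\sum_{d=1}^{D} \left(a_{i+e_d} + a_{i-e_d}\right) - 2D\,a_i}{\Delta^2}
$$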
MPI_Barrier (MPI_COMM_WORLD);
i = 0;
t = timer_start();
while (i < nloops) {
  foreach() {
    b[] = 0.;
    foreach_dimension()
      b[] += a[1] + a[-1];
    b[] = (b[] - 2.*dimension*a[])/sq(Delta);
  }
  boundary ({b});
  i++;
}
mpi_print (t, i, tnc*i, "laplacian");
Something simpler: the sum of b over the entire mesh.
MPI_Barrier (MPI_COMM_WORLD);
i = 0;
t = timer_start();
double sum = 0.;
while (i < nloops) {
  sum = 0.;
  foreach(reduction(+:sum))
    sum += b[];
  i++;
}
mpi_print (t, i, tnc*i, "sum");
if (pid() == 0)
fprintf (stderr, "sum: %g\n", sum);
The restriction operator.
MPI_Barrier (MPI_COMM_WORLD);
i = 0;
t = timer_start();
while (i < nloops) {
  boundary ({b});
  restriction ({b});
  i++;
}
mpi_print (t, i, tnc*i, "restriction");
And finally the Poisson solver.
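The tolerance is set to HUGE below so that the solver does not iterate to convergence: what is timed is essentially the cost of a single multigrid cycle, consistent with the results presented further down.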
MPI_Barrier (MPI_COMM_WORLD);
i = 0;
TOLERANCE = HUGE;
t = timer_start();
while (i < nloops) {
  poisson (a, b);
  i++;
}
mpi_print (t, i, tnc*i, "poisson");
scalar e[];
foreach()
  e[] = a[] - cos(pi*x)*cos(pi*y)*cos(pi*z);
double max = normf(e).max;
if (pid() == 0)
fprintf (stderr, "error: %g\n", max);
assert (normf(e).max < 1e-10);
// output_ppm (e, file = "error.png");
sum = 0.;
int n = 0;
// build a crude partition map (using the memory address of a[] as a
// per-process marker) and count the cells owned by this process
foreach (serial) {
  e[] = (long) &(a[]);
  sum += e[];
  n++;
}
foreach()
  e[] -= sum/n;
#if 0
FILE * fp = pid() ? NULL : fopen ("map.ppm", "w");
output_ppm (e, fp, n = 512);
#endif
int nmin = n, nmax = n;
mpi_all_reduce (nmin, MPI_INT, MPI_MIN);
mpi_all_reduce (nmax, MPI_INT, MPI_MAX);
if (pid() == 0)
fprintf (stderr, "balance %d %d\n", nmin, nmax);
}
How to run on Occigen
This test is run on Occigen. The C99 source code is generated on a system with qcc installed and then copied to Occigen using something like
qcc -grid=octree -source mpi-laplacian.c
scp _mpi-laplacian.c popinet@occigen.cines.fr:
On Occigen one can then compile the code using
module load bullxmpi
module load intel
mpicc -Wall -std=c99 -O2 -D_MPI=1 -D_GNU_SOURCE=1 _mpi-laplacian.c \
-o mpi-laplacian -lm
The following script (run.sh) can then be used to run the code
#!/bin/bash
#SBATCH -J basilisk
#SBATCH --nodes=1
#SBATCH --ntasks=2
#SBATCH --ntasks-per-node=2
#SBATCH --threads-per-core=1
#SBATCH --time=00:10:00
#SBATCH --output basilisk.output
#SBATCH --exclusive
LEVEL=11
module purge
module load bullxmpi
module load intel
srun --mpi=pmi2 -K1 --resv-ports -n $SLURM_NTASKS ./mpi-laplacian $LEVEL \
2> log-$LEVEL-$SLURM_NTASKS > out-$LEVEL-$SLURM_NTASKS
LEVEL is varied from 9 (~134 million grid points) to 11 (~8.6 billion grid points), and the number of processes is varied up to 16384, using something like
sbatch --ntasks=1 --ntasks-per-node=1 run.sh
sbatch --ntasks=2 --ntasks-per-node=2 run.sh
sbatch --ntasks=4 --ntasks-per-node=4 run.sh
sbatch --ntasks=8 --ntasks-per-node=8 run.sh
sbatch --ntasks=16 --ntasks-per-node=16 run.sh
sbatch --ntasks=32 --ntasks-per-node=16 --nodes=2 run.sh
sbatch --ntasks=64 --ntasks-per-node=16 --nodes=4 run.sh
sbatch --ntasks=128 --ntasks-per-node=16 --nodes=8 run.sh
sbatch --ntasks=256 --ntasks-per-node=16 --nodes=16 run.sh
sbatch --ntasks=512 --ntasks-per-node=16 --nodes=32 run.sh
sbatch --ntasks=1024 --ntasks-per-node=16 --nodes=64 run.sh
sbatch --ntasks=2048 --ntasks-per-node=16 --nodes=128 run.sh
sbatch --ntasks=4096 --ntasks-per-node=16 --nodes=256 run.sh
sbatch --ntasks=8192 --ntasks-per-node=16 --nodes=512 run.sh
sbatch --ntasks=16384 --ntasks-per-node=24 --nodes=683 run.sh
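The grid sizes quoted above follow directly from the (uniformly refined) octree level:
$$
N_{\text{points}} = 2^{3\,\text{LEVEL}}, \qquad 2^{27} \approx 1.3\times10^{8}, \qquad 2^{33} \approx 8.6\times10^{9}.
$$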
The results can then be collected using
tar czvf occigen.tgz out-*-*
Results on Occigen
The memory usage per core is given below. The increase for a large number of cores corresponds to the memory overhead of communication buffers etc…
# generate results for Occigen
cd 'occigen/3D'
# generate weak scaling curves
! bash weak.sh > weak
set logscale
set logscale x 2
set grid
set xrange [2:32768]
set format x '2^{%L}'
set xtics 2
set xlabel "# of cores"
set ylabel "Memory/core (GB)"
minlevel=9
maxlevel=11
plot [][0.1:] for [i=minlevel:maxlevel] \
'< sh table.sh poisson '.i u 1:($2/$1) t ''.i.' levels' w lp, \
18/x**0.9 t '18/x^{0.9}'
cd '../..'
The wall-clock time for one iteration of the multigrid Poisson solver is given below.
The pink lines connect points corresponding to weak (or iso-granular) scaling, i.e. multiplying both the problem size and the number of cores by eight (for example, going from level 9 to level 10 while using eight times as many cores keeps the number of grid points per core constant). Ideal weak scaling would give horizontal lines, i.e. a constant computation time for proportionally-increasing problem sizes and numbers of cores.
cd 'occigen/3D'
set ylabel 'Time (sec)'
plot [][0.01:100] for [i=minlevel:maxlevel] \
'< sh time.sh poisson '.i u 1:2 t ''.i.' levels' w lp, \
'weak' u 1:2 w lp t 'weak scaling', \
600/x**0.95 t '600/x^{0.95}'
cd '../..'
The time spent in communication routines is illustrated below.
cd 'occigen/3D'
plot [][1e-2:10] for [i=minlevel:maxlevel] \
'< sh time.sh poisson '.i u 1:3 w lp t ''.i.' levels', \
4.5/x**0.65 t '4.5/x^{0.65}'
cd '../..'
Similar results are obtained for a pure Laplacian.
cd 'occigen/3D'
plot [][1e-4:10] for [i=minlevel:maxlevel] \
'< sh time.sh laplacian '.i u 1:2 w lp t ''.i.' levels', \
50/x**0.93 t '50/x^{0.93}'
cd '../..'
cd 'occigen/3D'
plot [][1e-4:1] for [i=minlevel:maxlevel] \
'< sh time.sh laplacian '.i u 1:3 w lp t ''.i.' levels', \
2./x**0.7 t '2/x^{0.7}'
cd '../..'
And for the restriction operator.
cd 'occigen/3D'
plot [][:1] for [i=minlevel:maxlevel] \
'< sh time.sh restriction '.i u 1:2 w lp t ''.i.' levels', \
18/x**0.85 t '18/x^{0.85}'
cd '../..'
cd 'occigen/3D'
plot [][1e-3:1] for [i=minlevel:maxlevel] \
'< sh time.sh restriction '.i u 1:3 w lp t ''.i.' levels', \
2.8/x**0.66 t '2.8/x^{0.66}'
cd '../..'