/**
# Parallel scalability

This code can be used to test the scalability of common operations
(traversal, restriction, boundary conditions etc.) and their
combinations, in particular the multigrid Poisson solver. */

#include "poisson.h"
#include "utils.h"
#include "check_restriction.h"

scalar a[], b[];

static void mpi_print (timer t, int i, long tnc,
			const char * name)
{
  double mpi[npe()];
  timing s = timer_timing (t, i, tnc, mpi);
  if (pid() == 0) {

    /**
    *s.min/i*, *s.avg/i* and *s.max/i* are the minimum, average and
    maximum times spent in communication routines. */

    printf ("%d %g %g %g %s %g %g %g %.2g%% %ld %ld ",
	    npe(), s.cpu/i, s.real/i, s.speed, name,
	    s.min/i, s.avg/i, s.max/i, 100.*s.avg/s.real,
	    s.mem, s.tnc);

    /**
    We also output the times spent in communication routines for each
    process. */

    for (int j = 0; j < npe(); j++)
      printf (" %g", mpi[j]/i);
    printf ("\n");
  }
  MPI_Barrier (MPI_COMM_WORLD);
}

int main (int argc, char * argv[])
{
  int maxlevel = argc > 1 ? atoi(argv[1]) : 6;
  int minlevel = argc > 2 ? atoi(argv[2]) : 5;
  timer t;

  size (1. [0]); // dimensionless
  init_grid (1);
  refine (level <= minlevel*(1. - sqrt(sq((x - 0.5) - 0.1) +
				       sq((y - 0.5) - 0.1))));
  foreach()
    a[] = b[] = 0.;
  poisson (a, b); // to force allocation of extra fields
  MPI_Barrier (MPI_COMM_WORLD);
  t = timer_start();
  refine (level <= maxlevel*(1. - sqrt(sq((x - 0.5) - 0.1) +
				       sq((y - 0.5) - 0.1))));
  check_restriction (a);
  long tnc = 0;
  foreach(reduction(+:tnc))
    tnc++;
  mpi_print (t, 1, tnc, "refine");

  if (npe() <= 8)
    debug_mpi (qerr);

  int nloops, i;

  /**
  We fill `a` with a simple function. */

  MPI_Barrier (MPI_COMM_WORLD);
  t = timer_start();
  i = nloops = npe();
  while (i--) {
    foreach()
      a[] = cos(pi*x)*cos(pi*y);
#if 0
    boundary ({a});
#else
    prof_start ("barrier");
    MPI_Barrier (MPI_COMM_WORLD);
    prof_stop();
#endif
  }
  mpi_print (t, nloops, tnc*nloops, "cos");
  boundary ({a});

  /**
  Here we compute
  $$
  b = \nabla^2 a
  $$
  using a 5-point Laplacian operator. */

  MPI_Barrier (MPI_COMM_WORLD);
  t = timer_start();
  i = nloops = npe();
  while (i--) {
    foreach()
      b[] = (a[0,1] + a[1,0] + a[0,-1] + a[-1,0] - 4.*a[])/sq(Delta);
    boundary ({b});
  }
  mpi_print (t, nloops, tnc*nloops, "laplacian");

  /**
  Something simpler: the sum of `b` over the entire mesh. */

  MPI_Barrier (MPI_COMM_WORLD);
  t = timer_start();
  i = nloops = npe();
  double sum = 0.;
  while (i--) {
    sum = 0.;
    foreach(reduction(+:sum))
      sum += sq(Delta)*b[];
  }
  mpi_print (t, nloops, tnc*nloops, "sum");
  fprintf (qerr, "sum: %g\n", sum);

  /**
  The restriction operator. */

  MPI_Barrier (MPI_COMM_WORLD);
  t = timer_start();
  i = nloops = 1;
  while (i--)
    restriction ({b});
  mpi_print (t, nloops, tnc*nloops, "restriction");

  /**
  And finally the Poisson solver. */

  MPI_Barrier (MPI_COMM_WORLD);
  t = timer_start();
  i = nloops = 1;
  TOLERANCE = HUGE;
  while (i--)
    poisson (a, b);
  mpi_print (t, nloops, tnc*nloops, "poisson");

  scalar e[];
  foreach()
    e[] = a[] - cos(pi*x)*cos(pi*y);
  fprintf (qerr, "error: %g\n", normf(e).max);
  //  output_ppm (e, file = "error.png", n = 512);
  //  assert (normf(e).max < 0.4);

  int n = 0;
  foreach (serial)
    n++;
  int nmin = n, nmax = n;
  mpi_all_reduce (nmin, MPI_INT, MPI_MIN);
  mpi_all_reduce (nmax, MPI_INT, MPI_MAX);
  fprintf (qerr, "balance %d %d\n", nmin, nmax);
}
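/**
## Running the test locally

Before submitting to a cluster, the test can be compiled and run on a
workstation as a quick sanity check. This is only a sketch: it
assumes a working MPI toolchain (*mpicc*/*mpirun*) is available and
reuses the compilation flags of the Curie script below. The two
command-line arguments are the maximum and minimum refinement levels.

~~~bash
# generate the C99 source, compile with MPI and run on 4 processes
qcc -source mpi-laplacian.c
mpicc -O2 -Wall -std=c99 -D_MPI=1 _mpi-laplacian.c -o mpi-laplacian -lm
mpirun -np 4 ./mpi-laplacian 8 6
~~~
*/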
/**
## How to run on Curie

This test is run on
[Curie](http://www-hpc.cea.fr/en/complexe/tgcc-curie.htm). The C99
source code is generated on a system with *qcc* installed and then
copied to Curie using something like

~~~bash
qcc -source mpi-laplacian.c
scp _mpi-laplacian.c popinets@curie-fr.ccc.cea.fr:
~~~

On Curie, the following script (*run.sh*) is used to compile and run
the code

~~~bash
#!/bin/bash
#MSUB -r mpi-laplacian
#MSUB -n 32
#MSUB -T 600
####MSUB -Q test
#MSUB -o basilisk_%I.out
#MSUB -e basilisk_%I.log
#MSUB -q standard
#MSUB -A gen7325
#MSUB -w

LEVEL=10

set -x
cd ${BRIDGE_MSUB_PWD}
mpicc -O2 -Wall -std=c99 -D_MPI=1 _mpi-laplacian.c -o mpi-laplacian -lm
rm -f trace-*
ccc_mprun -n ${BRIDGE_MSUB_NPROC} ./mpi-laplacian ${LEVEL} 8 \
    2> log-${LEVEL}-${BRIDGE_MSUB_NPROC} > out-${LEVEL}-${BRIDGE_MSUB_NPROC}
~~~

LEVEL is varied from 10 (~1 million grid points) to 15 (~1 billion
grid points) and the number of processes up to 16384, using something
like

~~~bash
for i in 16 32 64 128 256 512 1024 2048 4096 8192 16384; do ccc_msub -n $i run.sh; done
~~~

The results can then be collected using

~~~bash
tar czvf curie.tgz out-*-*
~~~

(see the end of this page for a sketch of how to extract the timings
from these files).

## Results on Curie

The memory usage per core is given below. The curves are a model (see
[mpi-laplacian.plot](mpi-laplacian.plot) for details). The increase
for large numbers of cores corresponds to the memory overhead of
communication buffers etc.

![Memory usage on Curie](mpi-circle/curie/memory.png)

The wall-clock time for one iteration of the multigrid Poisson solver
is given below. The red lines are a model of strong scaling. For a
low enough number of cores, close to perfect scaling is obtained,
with a best-fit computation speed close to 1.5 million grid
points/core. The pink lines connect points corresponding to weak (or
*iso-granular*) scaling, i.e. quadrupling both the problem size and
the number of cores. Ideal weak scaling would give horizontal lines
(i.e. constant computation time for proportionally-increasing problem
sizes and numbers of cores).

![Wall-clock time on Curie for the Poisson solver](mpi-circle/curie/poisson.png)

The time spent in communication routines is illustrated below,
together with the model (corresponding to the communication part of
the total time in the previous figure).

![Communication time on Curie for the Poisson solver](mpi-circle/curie/poisson-mpi.png)

Similar results are obtained for a pure Laplacian, with a best-fit
speed of order 25 million grid points/core. Since this speed is much
larger than for the Poisson solver (i.e. much less computation is
done per grid point), communication costs are relatively more
important and the Laplacian does not scale as well.

![Wall-clock time on Curie for the Laplacian](mpi-circle/curie/laplacian.png)

![Communication time on Curie for the Laplacian](mpi-circle/curie/laplacian-mpi.png)

Similarly, the pure restriction operator scales quite well, with a
best-fit speed of around 30 million grid points/core.

![Wall-clock time on Curie for the restriction operator](mpi-circle/curie/restriction.png)

![Communication time on Curie for the restriction operator](mpi-circle/curie/restriction-mpi.png)
*/
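/**
## Extracting the timings

This is a minimal post-processing sketch (not part of the original
workflow): it assumes the `out-*-*` files collected above have been
extracted in the current directory and relies on the column layout of
`mpi_print()` (column 1 is the number of processes, column 3 the
wall-clock time per iteration).

~~~bash
# gather the Poisson solver timings for a given LEVEL,
# sorted by number of processes
LEVEL=10
for f in out-$LEVEL-*; do
  grep poisson "$f" | awk '{ print $1, $3 }'
done | sort -n > poisson-$LEVEL
~~~

The resulting two-column files (number of processes, wall-clock time)
can then be plotted, for example with gnuplot, to reproduce the
scaling curves above.
*/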