src/test/mpi-circle.c

/**
# Parallel scalability

This code can be used to test the parallel scalability of common
operations (traversal, restriction, boundary conditions, etc.) and of
their combinations, in particular the multigrid Poisson solver. */
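
/**
As a minimal sketch (assuming a working MPI installation with *mpirun*
available), the test can be compiled and run locally in the same way
as on Curie below, using something like

~~~bash
qcc -source mpi-circle.c
mpicc -O2 -Wall -std=c99 -D_MPI=1 _mpi-circle.c -o mpi-circle -lm
mpirun -np 4 ./mpi-circle 8 5
~~~

where the number of processes is arbitrary and the two optional
arguments are the maximum and minimum refinement levels (defaulting to
6 and 5). */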

#include "poisson.h"
#include "utils.h"
#include "check_restriction.h"

scalar a[], b[];

static void mpi_print (timer t, int i, size_t tnc,
		       const char * name)
{
  double mpi[npe()];
  timing s = timer_timing (t, i, tnc, mpi);

  if (pid() == 0) {
    /**
    *s.min/i*, *s.avg/i*, *s.max/i* are the minimum, average and maximum
    times spent in communication routines. */

    printf ("%d %g %g %g %s %g %g %g %.2g%% %ld %ld ",
	    npe(), s.cpu/i, s.real/i, s.speed, name, s.min/i, s.avg/i, s.max/i,
	    100.*s.avg/s.real, s.mem, s.tnc);

    /**
    We also output the times spent in communication routines for each process. */

    for (int j = 0; j < npe(); j++)
      printf (" %g", mpi[j]/i);
    printf ("\n");
  }
    
  MPI_Barrier (MPI_COMM_WORLD);
}
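
/**
Each line written by *mpi_print()* thus contains: the number of
processes, the CPU and wall-clock times per iteration, the speed
reported by *timer_timing()*, the name of the operation, the minimum,
average and maximum times per iteration spent in communication, the
average communication time as a percentage of the wall-clock time, the
memory usage and the total number of cells, followed by the
per-process communication times. */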

int main (int argc, char * argv[])
{
  int maxlevel = argc > 1 ? atoi(argv[1]) : 6;
  int minlevel = argc > 2 ? atoi(argv[2]) : 5;
  timer t;

  init_grid (1);
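
  /**
  The initial mesh is refined around the point (0.6,0.6): cells are
  refined as long as their level satisfies
  $$
  \text{level} \leq \text{minlevel}\left(1 - \sqrt{(x - 0.6)^2 + (y - 0.6)^2}\right)
  $$
  so that the resolution is highest near this point and decreases
  linearly with the distance to it. The timed refinement below uses
  the same criterion with *maxlevel*. */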
  refine (level <= minlevel*(1. - sqrt(sq((x - 0.5) - 0.1) +
                                       sq((y - 0.5) - 0.1))));
  
  foreach()
    a[] = b[] = 0.;
  boundary ({a, b});
  poisson (a, b); // to force allocation of extra fields
  
  MPI_Barrier (MPI_COMM_WORLD);
  t = timer_start();
  refine (level <= maxlevel*(1. - sqrt(sq((x - 0.5) - 0.1) +
				       sq((y - 0.5) - 0.1))));
  check_restriction (a);
  
  size_t tnc = 0;
  foreach(reduction(+:tnc))
    tnc++;
  mpi_print (t, 1, tnc, "refine");
  
  if (npe() <= 8)
    debug_mpi (stderr);
  
  int nloops, i;

  /**
  We fill `a` with a simple function. Note that `boundary()` is only
  applied after the timed loop: inside the loop a (profiled) barrier
  is used instead, so that this timing measures the cost of a pure
  traversal. */

  MPI_Barrier (MPI_COMM_WORLD);
  t = timer_start();
  i = nloops = npe();
  while (i--) {
    foreach()
      a[] = cos(pi*x)*cos(pi*y);
#if 0
    boundary ({a});
#else
    prof_start ("barrier");
    MPI_Barrier (MPI_COMM_WORLD);
    prof_stop();
#endif
  }
  mpi_print (t, nloops, tnc*nloops, "cos");
  boundary ({a});

  /**
  Here we compute
  $$
  b = \nabla^2 a
  $$
  using a 5-point Laplacian operator. */
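
  /**
  In stencil form (with $\Delta$ the local cell size) the loop below
  evaluates
  $$
  b_{i,j} = \frac{a_{i+1,j} + a_{i-1,j} + a_{i,j+1} + a_{i,j-1} - 4\,a_{i,j}}{\Delta^2}
  $$
  in each cell. */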

  MPI_Barrier (MPI_COMM_WORLD);
  t = timer_start();
  i = nloops = npe();
  while (i--) {
    foreach()
      b[] = (a[0,1] + a[1,0] + a[0,-1] + a[-1,0] - 4.*a[])/sq(Delta);
    boundary ({b});
  }
  mpi_print (t, nloops, tnc*nloops, "laplacian");
  
  /**
  Something simpler: the sum of `b` over the entire mesh. */
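
  /**
  Since $a = \cos(\pi x)\cos(\pi y)$ on the unit square, the exact
  integral of its Laplacian vanishes,
  $$
  \int_0^1\int_0^1 \nabla^2 a \, dx \, dy =
  -2\pi^2\int_0^1\cos(\pi x) \, dx\int_0^1\cos(\pi y) \, dy = 0
  $$
  so the sum reported in the log should be close to zero. */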

  MPI_Barrier (MPI_COMM_WORLD);
  t = timer_start();
  i = nloops = npe();
  double sum = 0.;
  while (i--) {
    sum = 0.;
    foreach(reduction(+:sum))
      sum += sq(Delta)*b[];
  }
  mpi_print (t, nloops, tnc*nloops, "sum");
  fprintf (stderr, "sum: %g\n", sum);

  /**
  The restriction operator. */
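
  /**
  For plain scalar fields the default restriction is simple averaging:
  in 2D each coarse cell gets the mean
  $$
  b_{\text{coarse}} = \frac{1}{4}\sum_{k=1}^{4} b_{\text{fine},k}
  $$
  of the values of its four children. */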

  MPI_Barrier (MPI_COMM_WORLD);
  t = timer_start();
  i = nloops = 1;
  while (i--) {
    boundary ({b});
    restriction ({b});
  }
  mpi_print (t, nloops, tnc*nloops, "restriction");

  /**
  And finally the Poisson solver. `TOLERANCE` is set to `HUGE` so that
  the solver does not iterate to convergence: only a single multigrid
  cycle is timed, which corresponds to the "one iteration of the
  Poisson solver" reported in the results below. */

  MPI_Barrier (MPI_COMM_WORLD);
  t = timer_start();
  i = nloops = 1;
  TOLERANCE = HUGE;
  while (i--)
    poisson (a, b);
  mpi_print (t, nloops, tnc*nloops, "poisson");

  scalar e[];
  foreach()
    e[] = a[] - cos(pi*x)*cos(pi*y);
  fprintf (stderr, "error: %g\n", normf(e).max);
  //  output_ppm (e, file = "error.png", n = 512);
  //  assert (normf(e).max < 0.4);
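
  /**
  Finally we check the load balance: the minimum and maximum numbers
  of leaf cells held by the processes. */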
  
  int n = 0;
  foreach()
    n++;
  int nmin = n, nmax = n;
  mpi_all_reduce (nmin, MPI_INT, MPI_MIN);
  mpi_all_reduce (nmax, MPI_INT, MPI_MAX);
  fprintf (stderr, "balance %d %d\n", nmin, nmax);
}

/**
## How to run on Curie

This test is run on
[Curie](http://www-hpc.cea.fr/en/complexe/tgcc-curie.htm). The C99
source code is generated on a system with *qcc* installed and then
copied to Curie using something like

~~~bash
qcc -source mpi-laplacian.c
scp _mpi-laplacian.c popinets@curie-fr.ccc.cea.fr:
~~~

On Curie the following script (*run.sh*) is used to compile and run
the code

~~~bash
#!/bin/bash
#MSUB -r mpi-laplacian
#MSUB -n 32
#MSUB -T 600
####MSUB -Q test
#MSUB -o basilisk_%I.out
#MSUB -e basilisk_%I.log
#MSUB -q standard
#MSUB -A gen7325
#MSUB -w

LEVEL=10

set -x
cd ${BRIDGE_MSUB_PWD}
mpicc -O2 -Wall -std=c99 -D_MPI=1 _mpi-laplacian.c -o mpi-laplacian -lm
rm -f trace-*
ccc_mprun -n ${BRIDGE_MSUB_NPROC} ./mpi-laplacian ${LEVEL} 8 \
    2> log-${LEVEL}-${BRIDGE_MSUB_NPROC} > out-${LEVEL}-${BRIDGE_MSUB_NPROC}
~~~

`LEVEL` is varied from 10 (~1 million grid points) to 15 (~1 billion
grid points) and the number of processes is varied up to 16384, using
something like

~~~bash
for i in 16 32 64 128 256 512 1024 2048 4096 8192 16384; do 
  ccc_msub -n $i run.sh; 
done
~~~

The results can then be collected using

~~~bash
tar czvf curie.tgz out-*-*
~~~
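
and copied back for post-processing, e.g. (assuming the same account
as above)

~~~bash
scp popinets@curie-fr.ccc.cea.fr:curie.tgz .
~~~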

## Results on Curie

The memory usage per core is given below. The curves correspond to a
model (see [mpi-laplacian.plot]() for details). The increase at large
numbers of cores reflects the memory overhead of communication buffers
and related bookkeeping.

![Memory usage on Curie](curie/memory.png)

The wall-clock time for one iteration of the multigrid Poisson solver
is given below. The red lines are a model of strong scaling. For low
enough numbers of cores, close to perfect scaling is obtained, with a
best-fit computation speed close to 1.5 million grid points/s per core.

The pink lines connect points corresponding to weak (or
*iso-granular*) scaling, i.e. quadrupling both the problem size and
the number of cores (in 2D, increasing `LEVEL` by one quadruples the
number of grid points). Ideal weak scaling would give horizontal lines
(i.e. constant computation time for proportionally-increasing problem
sizes and numbers of cores).

![Wall-clock time on Curie for the Poisson solver](curie/poisson.png)

The time spent in communication routines is illustrated below,
together with the model (corresponding to the communication part of
the total time in the previous figure).

![Communication time on Curie for the Poisson solver](curie/poisson-mpi.png)

Similar results are obtained for the pure Laplacian, with a best-fit
speed of order 25 million grid points/s per core. Since each grid
point requires much less computation than for the Poisson solver,
communication accounts for a larger fraction of the total time and the
Laplacian consequently does not scale as well.

![Wall-clock time on Curie for the Laplacian](curie/laplacian.png)

![Communication time on Curie for the Laplacian](curie/laplacian-mpi.png)

Similarly, the pure restriction operator scales quite well, with a
best-fit speed of around 30 million grid points/s per core.

![Wall-clock time on Curie for the restriction operator](curie/restriction.png)

![Communication time on Curie for the restriction operator](curie/restriction-mpi.png)

*/