sandbox/bugs/dump_integer_overflow.c
Dumps are mangled when using 3D multigrid over a certain level
When using the 3D multigrid above a certain refinement level, dump files are mangled and restoring them fails with the following error:
restore(): error: the number of processes don't match: 0 != 1024
even though the correct number of processes was used.
#include "grid/multigrid3D.h"
#include "utils.h"
#include "output.h"
#define LEVEL 8 // You need to change this to 10 or 11 to see the bug
// Reproducer: initialise a 3D multigrid with N = 2^LEVEL, fill one scalar
// field, write it with dump() and immediately read it back with restore().
int main(){
// NOTE(review): L0 is presumably the domain size and X0, Y0, Z0 its origin,
// so this would centre a unit box on zero -- confirm against the Basilisk docs.
L0 = 1.0;
X0 = Y0 = Z0 = -L0 / 2;
// N = 2^LEVEL is the resolution handed to init_grid().
N = 1 << (LEVEL);
init_grid(N);
scalar p[];
// Fill the field with x in every cell visited by foreach().
foreach(){
p[] = x;
}
// Write the file, then read the same file back; the reported error message
// is emitted by restore().
dump("restore.dump");
restore("restore.dump");
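}
The issue can be traced to output.h. The offset is a long,
but the product index[]*cell_size is not converted to
long before the multiplication, which can lead to an integer
overflow when index[] and cell_size are large.
The patch below also widens the 1 << dimension*(depth + 1) shifts to long
in output.h and grid/multigrid-mpi.h.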
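A race condition between process 0 writing the header and the other processes
writing their data can also lead to mangled dumps; the patch makes process 0
create the file and write the header before the other processes open it.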
--- a/output.h
+++ b/output.h
@@ -1154,10 +1154,13 @@
strcpy (name, file);
if (!unbuffered)
strcat (name, "~");
- FILE * fh = fopen (name, "w");
- if (fh == NULL) {
- perror (name);
- exit (1);
+ FILE * fh = NULL;
+ if (pid() == 0) {
+ fh = fopen (name, "w");
+ if (fh == NULL) {
+ perror (name);
+ exit (1);
+ }
}
scalar * dlist = dump_list (list, zero);
@@ -1172,8 +1175,20 @@
MPI_Barrier (MPI_COMM_WORLD);
#endif
- if (pid() == 0)
+ if (pid() == 0) {
dump_header (fh, &header, slist);
+ fflush (fh);
+ }
+
+ MPI_Barrier (MPI_COMM_WORLD);
+
+ if (pid() != 0) {
+ fh = fopen (name, "r+");
+ if (fh == NULL) {
+ perror (name);
+ exit (1);
+ }
+ }
scalar index = {-1};
@@ -1190,7 +1205,7 @@
foreach_cell() {
// fixme: this won't work when combining MPI and mask()
if (is_local(cell)) {
- long offset = sizeofheader + index[]*cell_size;
+ long offset = sizeofheader + (long)index[] * (long)cell_size;
if (pos != offset) {
fseek (fh, offset, SEEK_SET);
pos = offset;
@@ -1323,8 +1338,8 @@
#if MULTIGRID_MPI
long cell_size = sizeof(unsigned) + header.len*sizeof(double);
- long offset = pid()*((1 << dimension*(header.depth + 1)) - 1)/
- ((1 << dimension) - 1)*cell_size;
+ long offset = pid()*((1L << dimension*(header.depth + 1)) - 1)/
+ ((1L << dimension) - 1)*cell_size;
if (fseek (fp, offset, SEEK_CUR) < 0) {
perror ("restore(): error while seeking");
exit (1);
--- a/grid/multigrid-mpi.h
+++ b/grid/multigrid-mpi.h
@@ -232,9 +232,9 @@
{
long i;
if (leaves)
- i = pid()*(1 << dimension*depth());
+ i = (long)pid() * (1L << (dimension * depth()));
else
- i = pid()*((1 << dimension*(depth() + 1)) - 1)/((1 << dimension) - 1);
+ i = (long)pid() * ((1L << (dimension * (depth() + 1))) - 1) / ((1L << dimension) - 1);
foreach_cell() {
if (!leaves || is_leaf(cell))