Use large sort chunks, since that part can be parallelized

This commit is contained in:
Eric Fischer 2016-04-05 10:08:46 -07:00
parent 2a4be4f6d4
commit c0a0aef060

View File

@ -338,8 +338,8 @@ static void merge(struct merge *merges, int nmerges, unsigned char *map, FILE *f
fwrite_check(geom_map + ix->start, 1, ix->end - ix->start, geom_out, "merge geometry");
*geompos += ix->end - ix->start;
// Count this as a half-accomplishment, since we already half-counted it
*progress += (ix->end - ix->start) / 2;
// Count this as an 75%-accomplishment, since we already 25%-counted it
*progress += (ix->end - ix->start) * 3 / 4;
if (!quiet && 100 * *progress / *progress_max != *progress_reported) {
fprintf(stderr, "Reordering geometry: %lld%% \r", 100 * *progress / *progress_max);
*progress_reported = 100 * *progress / *progress_max;
@ -1179,8 +1179,8 @@ void radix1(int *geomfds_in, int *indexfds_in, int inputs, int prefix, int split
fwrite_check(geommap + ix.start, ix.end - ix.start, 1, geomfiles[which], "geom");
sub_geompos[which] += ix.end - ix.start;
// Count this as a half-accomplishment, since we will copy again
*progress += (ix.end - ix.start) / 2;
// Count this as a 25%-accomplishment, since we will copy again
*progress += (ix.end - ix.start) / 4;
if (!quiet && 100 * *progress / *progress_max != *progress_reported) {
fprintf(stderr, "Reordering geometry: %lld%% \r", 100 * *progress / *progress_max);
*progress_reported = 100 * *progress / *progress_max;
@ -1241,10 +1241,14 @@ void radix1(int *geomfds_in, int *indexfds_in, int inputs, int prefix, int split
int bytes = sizeof(struct index);
int page = sysconf(_SC_PAGESIZE);
long long unit = (50 * 1024 * 1024 / bytes) * bytes;
while (unit % page != 0) {
unit += bytes;
// Don't try to sort more than 2GB at once,
// which used to crash Macs and may still
long long max_unit = 2LL * 1024 * 1024 * 1024;
long long unit = ((indexpos / CPUS + bytes - 1) / bytes) * bytes;
if (unit > max_unit) {
unit = max_unit;
}
unit = ((unit + page - 1) / page) * page;
int nmerges = (indexpos + unit - 1) / unit;
struct merge merges[nmerges];
@ -1334,6 +1338,13 @@ void radix1(int *geomfds_in, int *indexfds_in, int inputs, int prefix, int split
fwrite_check(geommap + ix.start, ix.end - ix.start, 1, geomfile, "geom");
*geompos_out += ix.end - ix.start;
// Count this as an 75%-accomplishment, since we already 25%-counted it
*progress += (ix.end - ix.start) * 3 / 4;
if (!quiet && 100 * *progress / *progress_max != *progress_reported) {
fprintf(stderr, "Reordering geometry: %lld%% \r", 100 * *progress / *progress_max);
*progress_reported = 100 * *progress / *progress_max;
}
ix.start = pos;
ix.end = *geompos_out;
fwrite_check(&ix, sizeof(struct index), 1, indexfile, "index");
@ -1355,7 +1366,7 @@ void radix1(int *geomfds_in, int *indexfds_in, int inputs, int prefix, int split
// progress. So increase the total amount of progress to report by
// the additional progress that will happpen, which may move the
// counter backward but will be an honest estimate of the work remaining.
*progress_max += geomst.st_size / 2;
*progress_max += geomst.st_size / 4;
radix1(&geomfds[i], &indexfds[i], 1, prefix + splitbits, availfiles / 4, mem, tmpdir, availfiles, geomfile, indexfile, geompos_out, progress, progress_max, progress_reported);
already_closed = 1;
@ -1415,6 +1426,11 @@ void radix(struct reader *reader, int nreaders, FILE *geomfile, int geomfd, FILE
mem = (long long) pages * pagesize;
#endif
// Don't use huge numbers of files that will trouble the file system
if (rl.rlim_cur > 5000) {
rl.rlim_cur = 5000;
}
long long availfiles = rl.rlim_cur - 2 * nreaders // each reader has a geom and an index
- 4 // pool, meta, mbtiles, mbtiles journal
- 4 // top-level geom and index output, both FILE and fd