From dfcdd03b19630bba244a97e30a69a2d791785d4a Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Fri, 23 Oct 2015 11:59:57 -0700 Subject: [PATCH 01/19] Factor out the JSON token reading loop Conflicts: geojson.c --- geojson.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/geojson.c b/geojson.c index 6248b57..68aaf46 100644 --- a/geojson.c +++ b/geojson.c @@ -864,14 +864,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max jp = json_begin_file(fp); - int layer; - if (nlayers == 1) { - layer = 0; - } else { - layer = source; - } - - parse_json(jp, reading, &seq, &metapos, &geompos, &indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox); + parse_json(jp, reading, &seq, &metapos, &geompos, &indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, source, droprate, file_bbox); json_end(jp); fclose(fp); From c6dfae26cb4bce87d37d0547918e8d97c9bd915b Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Fri, 23 Oct 2015 14:58:50 -0700 Subject: [PATCH 02/19] Add a function to disconnect a JSON object from the parse tree --- jsonpull.c | 12 +++++++++++- jsonpull.h | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/jsonpull.c b/jsonpull.c index fc85b92..212ca2c 100644 --- a/jsonpull.c +++ b/jsonpull.c @@ -568,11 +568,19 @@ void json_free(json_object *o) { free(o->string); } + json_disconnect(o); + + free(o); +} + +void json_disconnect(json_object *o) { // Expunge references to this as an array element // or a hash key or value. if (o->parent != NULL) { if (o->parent->type == JSON_ARRAY) { + int i; + for (i = 0; i < o->parent->length; i++) { if (o->parent->array[i] == o) { break; @@ -586,6 +594,8 @@ void json_free(json_object *o) { } if (o->parent->type == JSON_HASH) { + int i; + for (i = 0; i < o->parent->length; i++) { if (o->parent->keys[i] == o) { o->parent->keys[i] = fabricate_object(o->parent, JSON_NULL); @@ -612,5 +622,5 @@ void json_free(json_object *o) { } } - free(o); + o->parent = NULL; } diff --git a/jsonpull.h b/jsonpull.h index f5ccbee..645f47c 100644 --- a/jsonpull.h +++ b/jsonpull.h @@ -59,5 +59,6 @@ json_object *json_read_tree(json_pull *j); json_object *json_read(json_pull *j); json_object *json_read_separators(json_pull *j, json_separator_callback cb, void *state); void json_free(json_object *j); +void json_disconnect(json_object *j); json_object *json_hash_get(json_object *o, const char *s); From dd2a4b0fc82919765031d9c10c873e4e922f4cf2 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Fri, 18 Dec 2015 15:12:27 -0800 Subject: [PATCH 03/19] Pass the input line number around instead of the JSON parser itself --- geojson.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/geojson.c b/geojson.c index 68aaf46..84bd530 100644 --- a/geojson.c +++ b/geojson.c @@ -138,9 +138,9 @@ void serialize_string(FILE *out, const char *s, long long *fpos, const char *fna *fpos += len + 1; } -void parse_geometry(int t, json_object *j, long long *bbox, long long *fpos, FILE *out, int op, const char *fname, json_pull *source, long long *wx, long long *wy, int *initialized) { +void parse_geometry(int t, json_object *j, long long *bbox, long long *fpos, FILE *out, int op, const char *fname, int line, long long *wx, long long *wy, int *initialized) { if (j == NULL || j->type != JSON_ARRAY) { - fprintf(stderr, "%s:%d: expected array for 
type %d\n", fname, source->line, t); + fprintf(stderr, "%s:%d: expected array for type %d\n", fname, line, t); return; } @@ -156,7 +156,7 @@ void parse_geometry(int t, json_object *j, long long *bbox, long long *fpos, FIL } } - parse_geometry(within, j->array[i], bbox, fpos, out, op, fname, source, wx, wy, initialized); + parse_geometry(within, j->array[i], bbox, fpos, out, op, fname, line, wx, wy, initialized); } } else { if (j->length >= 2 && j->array[0]->type == JSON_NUMBER && j->array[1]->type == JSON_NUMBER) { @@ -169,7 +169,7 @@ void parse_geometry(int t, json_object *j, long long *bbox, long long *fpos, FIL static int warned = 0; if (!warned) { - fprintf(stderr, "%s:%d: ignoring dimensions beyond two\n", fname, source->line); + fprintf(stderr, "%s:%d: ignoring dimensions beyond two\n", fname, line); warned = 1; } } @@ -203,7 +203,7 @@ void parse_geometry(int t, json_object *j, long long *bbox, long long *fpos, FIL *wx = x; *wy = y; } else { - fprintf(stderr, "%s:%d: malformed point\n", fname, source->line); + fprintf(stderr, "%s:%d: malformed point\n", fname, line); } } @@ -437,12 +437,12 @@ long long addpool(struct memfile *poolfile, struct memfile *treefile, char *s, c return off; } -int serialize_geometry(json_object *geometry, json_object *properties, const char *reading, json_pull *jp, long long *seq, long long *metapos, long long *geompos, long long *indexpos, struct pool *exclude, struct pool *include, int exclude_all, FILE *metafile, FILE *geomfile, FILE *indexfile, struct memfile *poolfile, struct memfile *treefile, const char *fname, int maxzoom, int basezoom, int layer, double droprate, long long *file_bbox, json_object *tippecanoe) { +int serialize_geometry(json_object *geometry, json_object *properties, const char *reading, int line, long long *seq, long long *metapos, long long *geompos, long long *indexpos, struct pool *exclude, struct pool *include, int exclude_all, FILE *metafile, FILE *geomfile, FILE *indexfile, struct memfile *poolfile, struct memfile *treefile, const char *fname, int maxzoom, int basezoom, int layer, double droprate, long long *file_bbox, json_object *tippecanoe) { json_object *geometry_type = json_hash_get(geometry, "type"); if (geometry_type == NULL) { static int warned = 0; if (!warned) { - fprintf(stderr, "%s:%d: null geometry (additional not reported)\n", reading, jp->line); + fprintf(stderr, "%s:%d: null geometry (additional not reported)\n", reading, line); warned = 1; } @@ -450,13 +450,13 @@ int serialize_geometry(json_object *geometry, json_object *properties, const cha } if (geometry_type->type != JSON_STRING) { - fprintf(stderr, "%s:%d: geometry without type\n", reading, jp->line); + fprintf(stderr, "%s:%d: geometry without type\n", reading, line); return 0; } json_object *coordinates = json_hash_get(geometry, "coordinates"); if (coordinates == NULL || coordinates->type != JSON_ARRAY) { - fprintf(stderr, "%s:%d: feature without coordinates array\n", reading, jp->line); + fprintf(stderr, "%s:%d: feature without coordinates array\n", reading, line); return 0; } @@ -467,7 +467,7 @@ int serialize_geometry(json_object *geometry, json_object *properties, const cha } } if (t >= GEOM_TYPES) { - fprintf(stderr, "%s:%d: Can't handle geometry type %s\n", reading, jp->line, geometry_type->string); + fprintf(stderr, "%s:%d: Can't handle geometry type %s\n", reading, line, geometry_type->string); return 0; } @@ -533,7 +533,7 @@ int serialize_geometry(json_object *geometry, json_object *properties, const cha } else if (properties->values[i] != NULL 
&& (properties->values[i]->type == JSON_NULL)) { ; } else { - fprintf(stderr, "%s:%d: Unsupported property type for %s\n", reading, jp->line, properties->keys[i]->string); + fprintf(stderr, "%s:%d: Unsupported property type for %s\n", reading, line, properties->keys[i]->string); continue; } } @@ -560,7 +560,7 @@ int serialize_geometry(json_object *geometry, json_object *properties, const cha serialize_long_long(geomfile, metastart, geompos, fname); long long wx = initial_x, wy = initial_y; - parse_geometry(t, coordinates, bbox, geompos, geomfile, VT_MOVETO, fname, jp, &wx, &wy, &initialized); + parse_geometry(t, coordinates, bbox, geompos, geomfile, VT_MOVETO, fname, line, &wx, &wy, &initialized); serialize_byte(geomfile, VT_END, geompos, fname); /* @@ -691,7 +691,7 @@ void parse_json(json_pull *jp, const char *reading, long long *seq, long long *m } found_geometries++; - serialize_geometry(j, NULL, reading, jp, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, NULL); + serialize_geometry(j, NULL, reading, jp->line, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, NULL); json_free(j); continue; } @@ -726,10 +726,10 @@ void parse_json(json_pull *jp, const char *reading, long long *seq, long long *m if (geometries != NULL) { int g; for (g = 0; g < geometries->length; g++) { - serialize_geometry(geometries->array[g], properties, reading, jp, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe); + serialize_geometry(geometries->array[g], properties, reading, jp->line, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe); } } else { - serialize_geometry(geometry, properties, reading, jp, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe); + serialize_geometry(geometry, properties, reading, jp->line, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe); } json_free(j); From ed90c7b53ab65d31fadc9364ac02c7fa8a382d5b Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Mon, 26 Oct 2015 16:05:01 -0700 Subject: [PATCH 04/19] Use memory-mapped I/O for GeoJSON reading if possible Conflicts: geojson.c --- geojson.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/geojson.c b/geojson.c index 84bd530..8d11af6 100644 --- a/geojson.c +++ b/geojson.c @@ -743,6 +743,35 @@ void parse_json(json_pull *jp, const char *reading, long long *seq, long long *m } } +struct jsonmap { + char *map; + long long off; + long long end; +}; + +int json_map_read(struct json_pull *jp, char *buffer, int n) { + struct jsonmap *jm = jp->source; + + if (jm->off + n >= jm->end) { + n = jm->end - jm->off; + } + + memcpy(buffer, jm->map + jm->off, n); + jm->off += n; + + return n; +} + +struct json_pull *json_begin_map(char *map, long long len) { + struct jsonmap *jm = malloc(sizeof(struct jsonmap)); + + jm->map = map; + 
jm->off = 0; + jm->end = len; + + return json_begin(json_map_read, jm); +} + int read_json(int argc, char **argv, char *fname, const char *layername, int maxzoom, int minzoom, int basezoom, double basezoom_marker_width, sqlite3 *outdb, struct pool *exclude, struct pool *include, int exclude_all, double droprate, int buffer, const char *tmpdir, double gamma, char *prevent, char *additional) { int ret = EXIT_SUCCESS; @@ -848,26 +877,49 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max for (source = 0; source < nsources; source++) { json_pull *jp; const char *reading; - FILE *fp; + int fd; if (source >= argc) { reading = "standard input"; - fp = stdin; + fd = 0; } else { reading = argv[source]; - fp = fopen(argv[source], "r"); - if (fp == NULL) { + fd = open(argv[source], O_RDONLY); + if (fd < 0) { perror(argv[source]); continue; } } - jp = json_begin_file(fp); + struct stat st; + char *map = NULL; + off_t off; - parse_json(jp, reading, &seq, &metapos, &geompos, &indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, source, droprate, file_bbox); + if (fstat(fd, &st) == 0) { + off = lseek(fd, 0, SEEK_CUR); + if (off >= 0) { + map = mmap(NULL, st.st_size - off, PROT_READ, MAP_PRIVATE, fd, off); + } + } - json_end(jp); - fclose(fp); + if (map != NULL && map != MAP_FAILED) { + jp = json_begin_map(map, st.st_size - off); + parse_json(jp, reading, &seq, &metapos, &geompos, &indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, source, droprate, file_bbox); + free(jp->source); + json_end(jp); + } else { + FILE *fp = fdopen(fd, "r"); + if (fp == NULL) { + perror(argv[source]); + close(fd); + continue; + } + + jp = json_begin_file(fp); + parse_json(jp, reading, &seq, &metapos, &geompos, &indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, source, droprate, file_bbox); + json_end(jp); + fclose(fp); + } } fclose(metafile); From 2159d464d093a22f762b4aa6d7768d56599202d0 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Fri, 18 Dec 2015 15:59:07 -0800 Subject: [PATCH 05/19] Segment the file into input chunks. Allow commas at the top level. 
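The hunk below divides the memory-mapped input into THREADS roughly equal byte
ranges and advances each interior boundary to the next newline, so no parser
starts in the middle of a feature line; removing the "comma at top level"
error lets a per-segment parser, which now sees the ',' separators between
features without their enclosing array, keep going instead of bailing out.
A minimal standalone sketch of that boundary computation, with an
illustrative name (split_segments is not a function from this patch):

    /* Split [start, end) of a mapped buffer into n segments whose interior
     * boundaries land on '\n', so no input line is cut in half.
     * segs must have room for n + 1 entries. */
    static void split_segments(const char *map, long long start, long long end,
                               long long *segs, int n) {
        int i;

        segs[0] = start;
        segs[n] = end;

        for (i = 1; i < n; i++) {
            segs[i] = start + (end - start) * i / n;

            /* advance to the next line boundary */
            while (segs[i] < end && map[segs[i]] != '\n') {
                segs[i]++;
            }
        }
    }

Leaving a boundary pointing at the '\n' itself is harmless here, since the
JSON reader skips whitespace between tokens.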
--- geojson.c | 26 ++++++++++++++++++++++---- jsonpull.c | 23 ++++++++++------------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/geojson.c b/geojson.c index 8d11af6..07c4d45 100644 --- a/geojson.c +++ b/geojson.c @@ -903,10 +903,28 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } if (map != NULL && map != MAP_FAILED) { - jp = json_begin_map(map, st.st_size - off); - parse_json(jp, reading, &seq, &metapos, &geompos, &indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, source, droprate, file_bbox); - free(jp->source); - json_end(jp); +#define THREADS 10 + long long segs[THREADS + 1]; + segs[0] = 0; + segs[THREADS] = st.st_size - off; + + int i; + for (i = 1; i < THREADS; i++) { + segs[i] = off + (st.st_size - off) * i / THREADS; + + while (segs[i] < st.st_size && map[segs[i]] != '\n') { + segs[i]++; + } + + printf("%d %lld\n", i, segs[i]); + } + + for (i = 0; i < THREADS; i++) { + jp = json_begin_map(map + segs[i], segs[i + 1] - segs[i]); + parse_json(jp, reading, &seq, &metapos, &geompos, &indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, source, droprate, file_bbox); + free(jp->source); + json_end(jp); + } } else { FILE *fp = fdopen(fd, "r"); if (fp == NULL) { diff --git a/jsonpull.c b/jsonpull.c index 212ca2c..de2eaae 100644 --- a/jsonpull.c +++ b/jsonpull.c @@ -332,20 +332,17 @@ again: /////////////////////////// Comma if (c == ',') { - if (j->container == NULL) { - j->error = "Found comma at top level"; - return NULL; - } + if (j->container != NULL) { + if (j->container->expect != JSON_COMMA) { + j->error = "Found unexpected comma"; + return NULL; + } - if (j->container->expect != JSON_COMMA) { - j->error = "Found unexpected comma"; - return NULL; - } - - if (j->container->type == JSON_HASH) { - j->container->expect = JSON_KEY; - } else { - j->container->expect = JSON_ITEM; + if (j->container->type == JSON_HASH) { + j->container->expect = JSON_KEY; + } else { + j->container->expect = JSON_ITEM; + } } if (cb != NULL) { From f6dfe0ace0b2fa8bda0eefd88c113f0af602a536 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Fri, 18 Dec 2015 16:47:48 -0800 Subject: [PATCH 06/19] WIP on splitting geometry-reading state into per-thread structures --- geojson.c | 320 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 197 insertions(+), 123 deletions(-) diff --git a/geojson.c b/geojson.c index 07c4d45..e502748 100644 --- a/geojson.c +++ b/geojson.c @@ -272,6 +272,7 @@ struct index { long long start; long long end; unsigned long long index; + int segment; }; int indexcmp(const void *v1, const void *v2) { @@ -437,7 +438,7 @@ long long addpool(struct memfile *poolfile, struct memfile *treefile, char *s, c return off; } -int serialize_geometry(json_object *geometry, json_object *properties, const char *reading, int line, long long *seq, long long *metapos, long long *geompos, long long *indexpos, struct pool *exclude, struct pool *include, int exclude_all, FILE *metafile, FILE *geomfile, FILE *indexfile, struct memfile *poolfile, struct memfile *treefile, const char *fname, int maxzoom, int basezoom, int layer, double droprate, long long *file_bbox, json_object *tippecanoe) { +int serialize_geometry(json_object *geometry, json_object *properties, const char *reading, int line, long long *seq, long long *metapos, long long *geompos, long long *indexpos, struct pool *exclude, struct pool *include, int 
exclude_all, FILE *metafile, FILE *geomfile, FILE *indexfile, struct memfile *poolfile, struct memfile *treefile, const char *fname, int maxzoom, int basezoom, int layer, double droprate, long long *file_bbox, json_object *tippecanoe, int segment) { json_object *geometry_type = json_hash_get(geometry, "type"); if (geometry_type == NULL) { static int warned = 0; @@ -595,6 +596,7 @@ int serialize_geometry(json_object *geometry, json_object *properties, const cha struct index index; index.start = geomstart; index.end = *geompos; + index.segment = segment; // Calculate the center even if off the edge of the plane, // and then mask to bring it back into the addressable area @@ -626,7 +628,7 @@ int serialize_geometry(json_object *geometry, json_object *properties, const cha return 1; } -void parse_json(json_pull *jp, const char *reading, long long *seq, long long *metapos, long long *geompos, long long *indexpos, struct pool *exclude, struct pool *include, int exclude_all, FILE *metafile, FILE *geomfile, FILE *indexfile, struct memfile *poolfile, struct memfile *treefile, char *fname, int maxzoom, int basezoom, int layer, double droprate, long long *file_bbox) { +void parse_json(json_pull *jp, const char *reading, long long *seq, long long *metapos, long long *geompos, long long *indexpos, struct pool *exclude, struct pool *include, int exclude_all, FILE *metafile, FILE *geomfile, FILE *indexfile, struct memfile *poolfile, struct memfile *treefile, char *fname, int maxzoom, int basezoom, int layer, double droprate, long long *file_bbox, int segment) { long long found_hashes = 0; long long found_features = 0; long long found_geometries = 0; @@ -691,7 +693,7 @@ void parse_json(json_pull *jp, const char *reading, long long *seq, long long *m } found_geometries++; - serialize_geometry(j, NULL, reading, jp->line, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, NULL); + serialize_geometry(j, NULL, reading, jp->line, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, NULL, segment); json_free(j); continue; } @@ -726,10 +728,10 @@ void parse_json(json_pull *jp, const char *reading, long long *seq, long long *m if (geometries != NULL) { int g; for (g = 0; g < geometries->length; g++) { - serialize_geometry(geometries->array[g], properties, reading, jp->line, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe); + serialize_geometry(geometries->array[g], properties, reading, jp->line, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe, segment); } } else { - serialize_geometry(geometry, properties, reading, jp->line, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe); + serialize_geometry(geometry, properties, reading, jp->line, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe, segment); } json_free(j); @@ -772,90 +774,130 @@ struct json_pull *json_begin_map(char 
*map, long long len) { return json_begin(json_map_read, jm); } +struct reader { + char *metaname; + char *poolname; + char *treename; + char *geomname; + char *indexname; + + int metafd; + int poolfd; + int treefd; + int geomfd; + int indexfd; + + FILE *metafile; + struct memfile *poolfile; + struct memfile *treefile; + FILE *geomfile; + FILE *indexfile; + + long long metapos; + long long geompos; + long long indexpos; + + long long *file_bbox; + + struct stat geomst; + struct stat metast; + + char *geom_map; +}; + int read_json(int argc, char **argv, char *fname, const char *layername, int maxzoom, int minzoom, int basezoom, double basezoom_marker_width, sqlite3 *outdb, struct pool *exclude, struct pool *include, int exclude_all, double droprate, int buffer, const char *tmpdir, double gamma, char *prevent, char *additional) { int ret = EXIT_SUCCESS; - char metaname[strlen(tmpdir) + strlen("/meta.XXXXXXXX") + 1]; - char poolname[strlen(tmpdir) + strlen("/pool.XXXXXXXX") + 1]; - char treename[strlen(tmpdir) + strlen("/tree.XXXXXXXX") + 1]; - char geomname[strlen(tmpdir) + strlen("/geom.XXXXXXXX") + 1]; - char indexname[strlen(tmpdir) + strlen("/index.XXXXXXXX") + 1]; +#define THREADS 10 + struct reader reader[THREADS]; + int i; + for (i = 0; i < THREADS; i++) { + struct reader *r = reader + i; - sprintf(metaname, "%s%s", tmpdir, "/meta.XXXXXXXX"); - sprintf(poolname, "%s%s", tmpdir, "/pool.XXXXXXXX"); - sprintf(treename, "%s%s", tmpdir, "/tree.XXXXXXXX"); - sprintf(geomname, "%s%s", tmpdir, "/geom.XXXXXXXX"); - sprintf(indexname, "%s%s", tmpdir, "/index.XXXXXXXX"); + r->metaname = malloc(strlen(tmpdir) + strlen("/meta.XXXXXXXX") + 1); + r->poolname = malloc(strlen(tmpdir) + strlen("/pool.XXXXXXXX") + 1); + r->treename = malloc(strlen(tmpdir) + strlen("/tree.XXXXXXXX") + 1); + r->geomname = malloc(strlen(tmpdir) + strlen("/geom.XXXXXXXX") + 1); + r->indexname = malloc(strlen(tmpdir) + strlen("/index.XXXXXXXX") + 1); - int metafd = mkstemp(metaname); - if (metafd < 0) { - perror(metaname); - exit(EXIT_FAILURE); - } - int poolfd = mkstemp(poolname); - if (poolfd < 0) { - perror(poolname); - exit(EXIT_FAILURE); - } - int treefd = mkstemp(treename); - if (treefd < 0) { - perror(treename); - exit(EXIT_FAILURE); - } - int geomfd = mkstemp(geomname); - if (geomfd < 0) { - perror(geomname); - exit(EXIT_FAILURE); - } - int indexfd = mkstemp(indexname); - if (indexfd < 0) { - perror(indexname); - exit(EXIT_FAILURE); + sprintf(r->metaname, "%s%s", tmpdir, "/meta.XXXXXXXX"); + sprintf(r->poolname, "%s%s", tmpdir, "/pool.XXXXXXXX"); + sprintf(r->treename, "%s%s", tmpdir, "/tree.XXXXXXXX"); + sprintf(r->geomname, "%s%s", tmpdir, "/geom.XXXXXXXX"); + sprintf(r->indexname, "%s%s", tmpdir, "/index.XXXXXXXX"); + + r->metafd = mkstemp(r->metaname); + if (r->metafd < 0) { + perror(r->metaname); + exit(EXIT_FAILURE); + } + r->poolfd = mkstemp(r->poolname); + if (r->poolfd < 0) { + perror(r->poolname); + exit(EXIT_FAILURE); + } + r->treefd = mkstemp(r->treename); + if (r->treefd < 0) { + perror(r->treename); + exit(EXIT_FAILURE); + } + r->geomfd = mkstemp(r->geomname); + if (r->geomfd < 0) { + perror(r->geomname); + exit(EXIT_FAILURE); + } + r->indexfd = mkstemp(r->indexname); + if (r->indexfd < 0) { + perror(r->indexname); + exit(EXIT_FAILURE); + } + + r->metafile = fopen(r->metaname, "wb"); + if (r->metafile == NULL) { + perror(r->metaname); + exit(EXIT_FAILURE); + } + r->poolfile = memfile_open(r->poolfd); + if (r->poolfile == NULL) { + perror(r->poolname); + exit(EXIT_FAILURE); + } + r->treefile = 
memfile_open(r->treefd); + if (r->treefile == NULL) { + perror(r->treename); + exit(EXIT_FAILURE); + } + r->geomfile = fopen(r->geomname, "wb"); + if (r->geomfile == NULL) { + perror(r->geomname); + exit(EXIT_FAILURE); + } + r->indexfile = fopen(r->indexname, "wb"); + if (r->indexfile == NULL) { + perror(r->indexname); + exit(EXIT_FAILURE); + } + r->metapos = 0; + r->geompos = 0; + r->indexpos = 0; + + unlink(r->metaname); + unlink(r->poolname); + unlink(r->treename); + unlink(r->geomname); + unlink(r->indexname); + + // To distinguish a null value + { + struct stringpool p; + memfile_write(r->treefile, &p, sizeof(struct stringpool)); + } + + r->file_bbox = malloc(4 * sizeof(long long)); + r->file_bbox[0] = r->file_bbox[1] = UINT_MAX; + r->file_bbox[2] = r->file_bbox[3] = 0; } - FILE *metafile = fopen(metaname, "wb"); - if (metafile == NULL) { - perror(metaname); - exit(EXIT_FAILURE); - } - struct memfile *poolfile = memfile_open(poolfd); - if (poolfile == NULL) { - perror(poolname); - exit(EXIT_FAILURE); - } - struct memfile *treefile = memfile_open(treefd); - if (treefile == NULL) { - perror(treename); - exit(EXIT_FAILURE); - } - FILE *geomfile = fopen(geomname, "wb"); - if (geomfile == NULL) { - perror(geomname); - exit(EXIT_FAILURE); - } - FILE *indexfile = fopen(indexname, "wb"); - if (indexfile == NULL) { - perror(indexname); - exit(EXIT_FAILURE); - } - long long metapos = 0; - long long geompos = 0; - long long indexpos = 0; - - unlink(metaname); - unlink(poolname); - unlink(treename); - unlink(geomname); - unlink(indexname); - - // To distinguish a null value - { - struct stringpool p; - memfile_write(treefile, &p, sizeof(struct stringpool)); - } - - long long file_bbox[] = {UINT_MAX, UINT_MAX, 0, 0}; - unsigned midx = 0, midy = 0; long long seq = 0; int nlayers; @@ -903,7 +945,6 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } if (map != NULL && map != MAP_FAILED) { -#define THREADS 10 long long segs[THREADS + 1]; segs[0] = 0; segs[THREADS] = st.st_size - off; @@ -921,7 +962,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max for (i = 0; i < THREADS; i++) { jp = json_begin_map(map + segs[i], segs[i + 1] - segs[i]); - parse_json(jp, reading, &seq, &metapos, &geompos, &indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, source, droprate, file_bbox); + parse_json(jp, reading, &seq, &reader[i].metapos, &reader[i].geompos, &reader[i].indexpos, exclude, include, exclude_all, reader[i].metafile, reader[i].geomfile, reader[i].indexfile, reader[i].poolfile, reader[i].treefile, fname, maxzoom, basezoom, source, droprate, reader[i].file_bbox, i); free(jp->source); json_end(jp); } @@ -934,45 +975,35 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } jp = json_begin_file(fp); - parse_json(jp, reading, &seq, &metapos, &geompos, &indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, source, droprate, file_bbox); + parse_json(jp, reading, &seq, &reader[0].metapos, &reader[0].geompos, &reader[0].indexpos, exclude, include, exclude_all, reader[0].metafile, reader[0].geomfile, reader[0].indexfile, reader[0].poolfile, reader[0].treefile, fname, maxzoom, basezoom, source, droprate, reader[0].file_bbox, 0); json_end(jp); fclose(fp); } } - fclose(metafile); - fclose(geomfile); - fclose(indexfile); - memfile_close(treefile); + for (i = 0; i < THREADS; i++) { + 
fclose(reader[i].metafile); + fclose(reader[i].geomfile); + fclose(reader[i].indexfile); + memfile_close(reader[i].treefile); - struct stat geomst; - struct stat metast; + if (fstat(reader[i].geomfd, &reader[i].geomst) != 0) { + perror("stat geom\n"); + exit(EXIT_FAILURE); + } + if (fstat(reader[i].metafd, &reader[i].metast) != 0) { + perror("stat meta\n"); + exit(EXIT_FAILURE); + } - if (fstat(geomfd, &geomst) != 0) { - perror("stat geom\n"); - exit(EXIT_FAILURE); + if (reader[i].geomst.st_size == 0 || reader[i].metast.st_size == 0) { + fprintf(stderr, "did not read any valid geometries\n"); // XXX + // exit(EXIT_FAILURE); + } } - if (fstat(metafd, &metast) != 0) { - perror("stat meta\n"); - exit(EXIT_FAILURE); - } - - if (geomst.st_size == 0 || metast.st_size == 0) { - fprintf(stderr, "did not read any valid geometries\n"); - exit(EXIT_FAILURE); - } - - char *meta = (char *) mmap(NULL, metast.st_size, PROT_READ, MAP_PRIVATE, metafd, 0); - if (meta == MAP_FAILED) { - perror("mmap meta"); - exit(EXIT_FAILURE); - } - - char *stringpool = poolfile->map; struct pool file_keys1[nlayers]; struct pool *file_keys[nlayers]; - int i; for (i = 0; i < nlayers; i++) { pool_init(&file_keys1[i], 0); file_keys[i] = &file_keys1[i]; @@ -1020,6 +1051,12 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } } + // XXX Join the sub-indices together, and then + + long long indexpos = 0; // XXX + int indexfd = 0; // XXX + char *indexname = "XXX"; // XXX + /* Sort the index by geometry */ { @@ -1304,15 +1341,23 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } unlink(indexname); - char *geom_map = mmap(NULL, geomst.st_size, PROT_READ, MAP_PRIVATE, geomfd, 0); - if (geom_map == MAP_FAILED) { - perror("mmap unsorted geometry"); - exit(EXIT_FAILURE); - } - if (close(geomfd) != 0) { - perror("close unsorted geometry"); + for (i = 0; i < THREADS; i++) { + reader[i].geom_map = mmap(NULL, reader[i].geomst.st_size, PROT_READ, MAP_PRIVATE, reader[i].geomfd, 0); + if (reader[i].geom_map == MAP_FAILED) { + perror("mmap unsorted geometry"); + exit(EXIT_FAILURE); + } + if (close(reader[i].geomfd) != 0) { + perror("close unsorted geometry"); + } } + char *geomname = "XXX"; // XXX + FILE *geomfile = NULL; // XXX + int geomfd = 0; // XXX + long long geompos = 0; // XXX + struct stat geomst; // XXX + sprintf(geomname, "%s%s", tmpdir, "/geom.XXXXXXXX"); geomfd = mkstemp(geomname); if (geomfd < 0) { @@ -1337,7 +1382,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max long long sum = 0; long long progress = 0; for (i = 0; i < indexpos / sizeof(struct index); i++) { - fwrite_check(geom_map + index_map[i].start, sizeof(char), index_map[i].end - index_map[i].start, geomfile, fname); + fwrite_check(reader[index_map[i].segment].geom_map + index_map[i].start, sizeof(char), index_map[i].end - index_map[i].start, geomfile, fname); sum += index_map[i].end - index_map[i].start; long long p = 1000 * i / (indexpos / sizeof(struct index)); @@ -1357,8 +1402,11 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max if (munmap(index_map, indexpos) != 0) { perror("unmap sorted index"); } - if (munmap(geom_map, geomst.st_size) != 0) { - perror("unmap unsorted geometry"); + + for (i = 0; i < THREADS; i++) { + if (munmap(reader[i].geom_map, reader[i].geomst.st_size) != 0) { + perror("unmap unsorted geometry"); + } } if (close(indexfd) != 0) { perror("close sorted index"); @@ -1390,9 +1438,19 @@ int read_json(int argc, char 
**argv, char *fname, const char *layername, int max } if (!quiet) { - fprintf(stderr, "%lld features, %lld bytes of geometry, %lld bytes of metadata, %lld bytes of string pool\n", seq, (long long) geomst.st_size, (long long) metast.st_size, poolfile->off); + for (i = 0; i < THREADS; i++) { + fprintf(stderr, "%lld features, %lld bytes of geometry, %lld bytes of metadata, %lld bytes of string pool\n", seq, (long long) reader[i].geomst.st_size, (long long) reader[i].metast.st_size, reader[i].poolfile->off); + } } + char *stringpool = poolfile->map; + char *meta = (char *) mmap(NULL, metast.st_size, PROT_READ, MAP_PRIVATE, metafd, 0); + if (meta == MAP_FAILED) { + perror("mmap meta"); + exit(EXIT_FAILURE); + } + + unsigned midx = 0, midy = 0; int written = traverse_zooms(fd, size, meta, stringpool, file_keys, &midx, &midy, layernames, maxzoom, minzoom, basezoom, outdb, droprate, buffer, fname, tmpdir, gamma, nlayers, prevent, additional, full_detail, low_detail, min_detail); if (maxzoom != written) { @@ -1420,6 +1478,22 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max midlat = (maxlat + minlat) / 2; midlon = (maxlon + minlon) / 2; + long long file_bbox[4] = { UINT_MAX, UINT_MAX, 0, 0 }; + for (i = 0; i < THREADS; i++) { + if (reader[i].file_bbox[0] < file_bbox[0]) { + file_bbox[0] = reader[i].file_bbox[0]; + } + if (reader[i].file_bbox[1] < file_bbox[1]) { + file_bbox[1] = reader[i].file_bbox[1]; + } + if (reader[i].file_bbox[2] > file_bbox[2]) { + file_bbox[2] = reader[i].file_bbox[2]; + } + if (reader[i].file_bbox[3] > file_bbox[3]) { + file_bbox[3] = reader[i].file_bbox[3]; + } + } + // If the bounding box extends off the plane on either side, // a feature wrapped across the date line, so the width of the // bounding box is the whole world. 
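With the per-thread reader structures above, every input segment gets its own
temporary metadata, geometry, index, and string-pool files, and each index
entry now records which segment produced it, so a record is addressed by
segment plus local offset: the sorted-geometry copy already resolves it as
reader[index_map[i].segment].geom_map + index_map[i].start, and the later
patches do the same for the combined metadata and string pool through
per-segment base offsets (meta_off, pool_off). A minimal sketch of that
addressing scheme, with illustrative names that do not appear in the patches:

    #define NREADERS 4 /* stands in for THREADS/CPUS in the patches */

    /* After the per-reader files are concatenated, base[] records where each
     * reader's data begins, so (segment, local offset) still finds a record. */
    struct combined {
        char *map;                /* concatenation of the per-reader files */
        long long base[NREADERS]; /* start of each reader's data within it */
    };

    static char *record(struct combined *c, int segment, long long off) {
        return c->map + c->base[segment] + off;
    }

In the patches this corresponds to expressions like
metabase + metastart + meta_off[segment] in tile.cc.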
From 2dc99698d236de897ffb3bf4892eb295e9624829 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Mon, 21 Dec 2015 16:17:47 -0800 Subject: [PATCH 07/19] Stub out enough to get it to compile --- geojson.c | 47 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/geojson.c b/geojson.c index e502748..7e078d2 100644 --- a/geojson.c +++ b/geojson.c @@ -917,7 +917,6 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max int source; for (source = 0; source < nsources; source++) { - json_pull *jp; const char *reading; int fd; @@ -961,7 +960,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } for (i = 0; i < THREADS; i++) { - jp = json_begin_map(map + segs[i], segs[i + 1] - segs[i]); + json_pull *jp = json_begin_map(map + segs[i], segs[i + 1] - segs[i]); parse_json(jp, reading, &seq, &reader[i].metapos, &reader[i].geompos, &reader[i].indexpos, exclude, include, exclude_all, reader[i].metafile, reader[i].geomfile, reader[i].indexfile, reader[i].poolfile, reader[i].treefile, fname, maxzoom, basezoom, source, droprate, reader[i].file_bbox, i); free(jp->source); json_end(jp); @@ -974,7 +973,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max continue; } - jp = json_begin_file(fp); + json_pull *jp = json_begin_file(fp); parse_json(jp, reading, &seq, &reader[0].metapos, &reader[0].geompos, &reader[0].indexpos, exclude, include, exclude_all, reader[0].metafile, reader[0].geomfile, reader[0].indexfile, reader[0].poolfile, reader[0].treefile, fname, maxzoom, basezoom, source, droprate, reader[0].file_bbox, 0); json_end(jp); fclose(fp); @@ -1051,11 +1050,41 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } } - // XXX Join the sub-indices together, and then + /* Join the sub-indices together */ - long long indexpos = 0; // XXX - int indexfd = 0; // XXX - char *indexname = "XXX"; // XXX + char indexname[strlen(tmpdir) + strlen("/index.XXXXXXXX") + 1]; + sprintf(indexname, "%s%s", tmpdir, "/index.XXXXXXXX"); + + int indexfd = mkstemp(indexname); + if (indexfd < 0) { + perror(indexname); + exit(EXIT_FAILURE); + } + + FILE *indexfile = fopen(indexname, "wb"); + if (indexfile == NULL) { + perror(indexname); + exit(EXIT_FAILURE); + } + + unlink(indexname); + + long long indexpos = 0; + for (i = 0; i < THREADS; i++) { + if (reader[i].indexpos > 0) { + void *map = mmap(NULL, reader[i].indexpos, PROT_READ, MAP_PRIVATE, reader[i].indexfd, 0); + if (map == MAP_FAILED) { + perror("mmap"); + exit(EXIT_FAILURE); + } + if (fwrite(map, reader[i].indexpos, 1, indexfile) != 1) { + perror("Reunify index"); + exit(EXIT_FAILURE); + } + munmap(map, reader[i].indexpos); + indexpos += reader[i].indexpos; + } + } /* Sort the index by geometry */ @@ -1443,6 +1472,8 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } } +#if 0 /* XXX */ + char *stringpool = poolfile->map; char *meta = (char *) mmap(NULL, metast.st_size, PROT_READ, MAP_PRIVATE, metafd, 0); if (meta == MAP_FAILED) { @@ -1534,6 +1565,8 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max pool_free_strings(&file_keys1[i]); free(layernames[i]); } + +#endif return ret; } From 1f335eec44637c786d0e93be0ea36cd9517e72b9 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Mon, 21 Dec 2015 17:00:07 -0800 Subject: [PATCH 08/19] Get rid of a global I had forgotten about --- geojson.c | 21 +++++++++++---------- memfile.c | 1 + memfile.h 
| 1 + 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/geojson.c b/geojson.c index 7e078d2..f0f48d8 100644 --- a/geojson.c +++ b/geojson.c @@ -344,7 +344,6 @@ struct stringpool { long long right; long long off; }; -long long pooltree = 0; static unsigned char swizzle[256] = { 0x00, 0xBF, 0x18, 0xDE, 0x93, 0xC9, 0xB1, 0x5E, 0xDF, 0xBE, 0x72, 0x5A, 0xBB, 0x42, 0x64, 0xC6, @@ -383,7 +382,7 @@ int swizzlecmp(char *a, char *b) { } long long addpool(struct memfile *poolfile, struct memfile *treefile, char *s, char type) { - long long *sp = &pooltree; + long long *sp = &treefile->tree; while (*sp != 0) { int cmp = swizzlecmp(s, poolfile->map + ((struct stringpool *) (treefile->map + *sp))->off + 1); @@ -403,7 +402,7 @@ long long addpool(struct memfile *poolfile, struct memfile *treefile, char *s, c // *sp is probably in the memory-mapped file, and will move if the file grows. long long ssp; - if (sp == &pooltree) { + if (sp == &treefile->tree) { ssp = -1; } else { ssp = ((char *) sp) - treefile->map; @@ -431,7 +430,7 @@ long long addpool(struct memfile *poolfile, struct memfile *treefile, char *s, c } if (ssp == -1) { - pooltree = p; + treefile->tree = p; } else { *((long long *) (treefile->map + ssp)) = p; } @@ -934,7 +933,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max struct stat st; char *map = NULL; - off_t off; + off_t off = 0; if (fstat(fd, &st) == 0) { off = lseek(fd, 0, SEEK_CUR); @@ -1085,6 +1084,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max indexpos += reader[i].indexpos; } } + fclose(indexfile); /* Sort the index by geometry */ @@ -1381,11 +1381,12 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } } - char *geomname = "XXX"; // XXX - FILE *geomfile = NULL; // XXX - int geomfd = 0; // XXX - long long geompos = 0; // XXX - struct stat geomst; // XXX + char geomname[strlen(tmpdir) + strlen("/geom.XXXXXXXX") + 1]; + + FILE *geomfile; + int geomfd; + long long geompos = 0; + struct stat geomst; sprintf(geomname, "%s%s", tmpdir, "/geom.XXXXXXXX"); geomfd = mkstemp(geomname); diff --git a/memfile.c b/memfile.c index 454628c..68b7d7d 100644 --- a/memfile.c +++ b/memfile.c @@ -26,6 +26,7 @@ struct memfile *memfile_open(int fd) { mf->map = map; mf->len = INCREMENT; mf->off = 0; + mf->tree = 0; return mf; } diff --git a/memfile.h b/memfile.h index b536a6c..e8fa936 100644 --- a/memfile.h +++ b/memfile.h @@ -3,6 +3,7 @@ struct memfile { char *map; long long len; long long off; + long long tree; }; struct memfile *memfile_open(int fd); From de60545da889ab7533f254959b91bccfb659c2ac Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Mon, 21 Dec 2015 17:21:18 -0800 Subject: [PATCH 09/19] Include the segment number in the serialized geometry --- geojson.c | 1 + tile.cc | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/geojson.c b/geojson.c index f0f48d8..f40d556 100644 --- a/geojson.c +++ b/geojson.c @@ -558,6 +558,7 @@ int serialize_geometry(json_object *geometry, json_object *properties, const cha serialize_int(geomfile, tippecanoe_maxzoom, geompos, fname); } + serialize_int(geomfile, segment, geompos, fname); serialize_long_long(geomfile, metastart, geompos, fname); long long wx = initial_x, wy = initial_y; parse_geometry(t, coordinates, bbox, geompos, geomfile, VT_MOVETO, fname, line, &wx, &wy, &initialized); diff --git a/tile.cc b/tile.cc index 22f4068..0f4eaac 100644 --- a/tile.cc +++ b/tile.cc @@ -362,7 +362,7 @@ struct sll { } }; -void 
rewrite(drawvec &geom, int z, int nextzoom, int maxzoom, long long *bbox, unsigned tx, unsigned ty, int buffer, int line_detail, int *within, long long *geompos, FILE **geomfile, const char *fname, signed char t, int layer, long long metastart, signed char feature_minzoom, int child_shards, int max_zoom_increment, long long seq, int tippecanoe_minzoom, int tippecanoe_maxzoom) { +void rewrite(drawvec &geom, int z, int nextzoom, int maxzoom, long long *bbox, unsigned tx, unsigned ty, int buffer, int line_detail, int *within, long long *geompos, FILE **geomfile, const char *fname, signed char t, int layer, long long metastart, signed char feature_minzoom, int child_shards, int max_zoom_increment, long long seq, int tippecanoe_minzoom, int tippecanoe_maxzoom, int segment) { if (geom.size() > 0 && nextzoom <= maxzoom) { int xo, yo; int span = 1 << (nextzoom - z); @@ -441,6 +441,7 @@ void rewrite(drawvec &geom, int z, int nextzoom, int maxzoom, long long *bbox, u if (tippecanoe_maxzoom != -1) { serialize_int(geomfile[j], tippecanoe_maxzoom, geompos, fname); } + serialize_int(geomfile[j], segment, &geompos[j], fname); serialize_long_long(geomfile[j], metastart, &geompos[j], fname); long long wx = initial_x, wy = initial_y; @@ -556,6 +557,9 @@ long long write_tile(char **geoms, char *metabase, char *stringpool, int z, unsi } layer >>= 2; + int segment; + deserialize_int(geoms, &segment); + long long metastart; deserialize_long_long(geoms, &metastart); char *meta = metabase + metastart; @@ -630,7 +634,7 @@ long long write_tile(char **geoms, char *metabase, char *stringpool, int z, unsi } if (line_detail == detail && fraction == 1) { /* only write out the next zoom once, even if we retry */ - rewrite(geom, z, nextzoom, maxzoom, bbox, tx, ty, buffer, line_detail, within, geompos, geomfile, fname, t, layer, metastart, feature_minzoom, child_shards, max_zoom_increment, original_seq, tippecanoe_minzoom, tippecanoe_maxzoom); + rewrite(geom, z, nextzoom, maxzoom, bbox, tx, ty, buffer, line_detail, within, geompos, geomfile, fname, t, layer, metastart, feature_minzoom, child_shards, max_zoom_increment, original_seq, tippecanoe_minzoom, tippecanoe_maxzoom, segment); } if (z < minzoom) { From aaf9e87bce543b860dc67f307971245f640da423 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Mon, 21 Dec 2015 17:46:04 -0800 Subject: [PATCH 10/19] Reassemble the combined string pool and metadata files --- geojson.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/geojson.c b/geojson.c index f40d556..29f1598 100644 --- a/geojson.c +++ b/geojson.c @@ -1474,15 +1474,97 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } } -#if 0 /* XXX */ + // Create a combined string pool and a combined metadata file + // but keep track of the offsets into it since we still need + // segment+offset to find the data. 
- char *stringpool = poolfile->map; - char *meta = (char *) mmap(NULL, metast.st_size, PROT_READ, MAP_PRIVATE, metafd, 0); + long long pool_off[THREADS]; + long long metafile_off[THREADS]; + + char poolname[strlen(tmpdir) + strlen("/pool.XXXXXXXX") + 1]; + sprintf(poolname, "%s%s", tmpdir, "/pool.XXXXXXXX"); + + int poolfd = mkstemp(poolname); + if (poolfd < 0) { + perror(poolname); + exit(EXIT_FAILURE); + } + + FILE *poolfile = fopen(poolname, "wb"); + if (poolfile == NULL) { + perror(poolname); + exit(EXIT_FAILURE); + } + + unlink(poolname); + + char metaname[strlen(tmpdir) + strlen("/meta.XXXXXXXX") + 1]; + sprintf(metaname, "%s%s", tmpdir, "/meta.XXXXXXXX"); + + int metafd = mkstemp(metaname); + if (metafd < 0) { + perror(metaname); + exit(EXIT_FAILURE); + } + + FILE *metafile = fopen(metaname, "wb"); + if (metafile == NULL) { + perror(metaname); + exit(EXIT_FAILURE); + } + + unlink(metaname); + + long long metapos = 0; + long long poolpos = 0; + + for (i = 0; i < THREADS; i++) { + if (reader[i].metapos > 0) { + void *map = mmap(NULL, reader[i].metapos, PROT_READ, MAP_PRIVATE, reader[i].metafd, 0); + if (map == MAP_FAILED) { + perror("mmap"); + exit(EXIT_FAILURE); + } + if (fwrite(map, reader[i].metapos, 1, metafile) != 1) { + perror("Reunify meta"); + exit(EXIT_FAILURE); + } + munmap(map, reader[i].metapos); + } + + metafile_off[i] = metapos; + metapos += reader[i].metapos; + close(reader[i].metafd); + + if (reader[i].poolfile->off > 0) { + if (fwrite(reader[i].poolfile->map, reader[i].poolfile->off, 1, poolfile) != 1) { + perror("Reunify string pool"); + exit(EXIT_FAILURE); + } + } + + pool_off[i] = poolpos; + poolpos += reader[i].poolfile->off; + memfile_close(reader[i].poolfile); + } + + fclose(poolfile); + fclose(metafile); + + char *meta = (char *) mmap(NULL, metapos, PROT_READ, MAP_PRIVATE, metafd, 0); if (meta == MAP_FAILED) { perror("mmap meta"); exit(EXIT_FAILURE); } + char *stringpool = (char *) mmap(NULL, poolpos, PROT_READ, MAP_PRIVATE, poolfd, 0); + if (stringpool == MAP_FAILED) { + perror("mmap string pool"); + exit(EXIT_FAILURE); + } + +#if 0 /* XXX */ + unsigned midx = 0, midy = 0; int written = traverse_zooms(fd, size, meta, stringpool, file_keys, &midx, &midy, layernames, maxzoom, minzoom, basezoom, outdb, droprate, buffer, fname, tmpdir, gamma, nlayers, prevent, additional, full_detail, low_detail, min_detail); From bd081c864ed30ba2c7fe6b79eb47da4c18bd0238 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Mon, 21 Dec 2015 18:00:07 -0800 Subject: [PATCH 11/19] It tiles again! --- geojson.c | 16 ++++++++-------- tile.cc | 14 +++++++++----- tile.h | 2 +- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/geojson.c b/geojson.c index 29f1598..37a12f8 100644 --- a/geojson.c +++ b/geojson.c @@ -1479,7 +1479,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max // segment+offset to find the data. 
long long pool_off[THREADS]; - long long metafile_off[THREADS]; + long long meta_off[THREADS]; char poolname[strlen(tmpdir) + strlen("/pool.XXXXXXXX") + 1]; sprintf(poolname, "%s%s", tmpdir, "/pool.XXXXXXXX"); @@ -1532,7 +1532,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max munmap(map, reader[i].metapos); } - metafile_off[i] = metapos; + meta_off[i] = metapos; metapos += reader[i].metapos; close(reader[i].metafd); @@ -1563,10 +1563,8 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max exit(EXIT_FAILURE); } -#if 0 /* XXX */ - unsigned midx = 0, midy = 0; - int written = traverse_zooms(fd, size, meta, stringpool, file_keys, &midx, &midy, layernames, maxzoom, minzoom, basezoom, outdb, droprate, buffer, fname, tmpdir, gamma, nlayers, prevent, additional, full_detail, low_detail, min_detail); + int written = traverse_zooms(fd, size, meta, stringpool, file_keys, &midx, &midy, layernames, maxzoom, minzoom, basezoom, outdb, droprate, buffer, fname, tmpdir, gamma, nlayers, prevent, additional, full_detail, low_detail, min_detail, meta_off, pool_off); if (maxzoom != written) { fprintf(stderr, "\n\n\n*** NOTE TILES ONLY COMPLETE THROUGH ZOOM %d ***\n\n\n", written); @@ -1574,14 +1572,17 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max ret = EXIT_FAILURE; } - if (munmap(meta, metast.st_size) != 0) { + if (munmap(meta, metapos) != 0) { perror("munmap meta"); } if (close(metafd) < 0) { perror("close meta"); } - if (memfile_close(poolfile) != 0) { + if (munmap(stringpool, poolpos) != 0) { + perror("munmap stringpool"); + } + if (close(poolfd) < 0) { perror("close pool"); } @@ -1650,7 +1651,6 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max free(layernames[i]); } -#endif return ret; } diff --git a/tile.cc b/tile.cc index 0f4eaac..6fe6509 100644 --- a/tile.cc +++ b/tile.cc @@ -464,7 +464,7 @@ void rewrite(drawvec &geom, int z, int nextzoom, int maxzoom, long long *bbox, u } } -long long write_tile(char **geoms, char *metabase, char *stringpool, int z, unsigned tx, unsigned ty, int detail, int min_detail, int basezoom, struct pool **file_keys, char **layernames, sqlite3 *outdb, double droprate, int buffer, const char *fname, FILE **geomfile, int minzoom, int maxzoom, double todo, char *geomstart, volatile long long *along, double gamma, int nlayers, char *prevent, char *additional, int child_shards) { +long long write_tile(char **geoms, char *metabase, char *stringpool, int z, unsigned tx, unsigned ty, int detail, int min_detail, int basezoom, struct pool **file_keys, char **layernames, sqlite3 *outdb, double droprate, int buffer, const char *fname, FILE **geomfile, int minzoom, int maxzoom, double todo, char *geomstart, volatile long long *along, double gamma, int nlayers, char *prevent, char *additional, int child_shards, long long *meta_off, long long *pool_off) { int line_detail; double fraction = 1; @@ -562,7 +562,7 @@ long long write_tile(char **geoms, char *metabase, char *stringpool, int z, unsi long long metastart; deserialize_long_long(geoms, &metastart); - char *meta = metabase + metastart; + char *meta = metabase + metastart + meta_off[segment]; long long bbox[4]; drawvec geom = decode_geometry(geoms, z, tx, ty, line_detail, bbox); @@ -757,7 +757,7 @@ long long write_tile(char **geoms, char *metabase, char *stringpool, int z, unsi c.coalesced = false; c.original_seq = original_seq; - decode_meta(&meta, stringpool, keys[layer], values[layer], 
file_keys[layer], &c.meta); + decode_meta(&meta, stringpool + pool_off[segment], keys[layer], values[layer], file_keys[layer], &c.meta); features[layer].push_back(c); } } @@ -926,6 +926,8 @@ struct write_tile_args { int full_detail; int low_detail; volatile long long *most; + long long *meta_off; + long long *pool_off; }; void *run_thread(void *vargs) { @@ -965,7 +967,7 @@ void *run_thread(void *vargs) { // fprintf(stderr, "%d/%u/%u\n", z, x, y); - long long len = write_tile(&geom, arg->metabase, arg->stringpool, z, x, y, z == arg->maxzoom ? arg->full_detail : arg->low_detail, arg->min_detail, arg->basezoom, arg->file_keys, arg->layernames, arg->outdb, arg->droprate, arg->buffer, arg->fname, arg->geomfile, arg->minzoom, arg->maxzoom, arg->todo, geomstart, arg->along, arg->gamma, arg->nlayers, arg->prevent, arg->additional, arg->child_shards); + long long len = write_tile(&geom, arg->metabase, arg->stringpool, z, x, y, z == arg->maxzoom ? arg->full_detail : arg->low_detail, arg->min_detail, arg->basezoom, arg->file_keys, arg->layernames, arg->outdb, arg->droprate, arg->buffer, arg->fname, arg->geomfile, arg->minzoom, arg->maxzoom, arg->todo, geomstart, arg->along, arg->gamma, arg->nlayers, arg->prevent, arg->additional, arg->child_shards, arg->meta_off, arg->pool_off); if (len < 0) { int *err = (int *) malloc(sizeof(int)); @@ -1001,7 +1003,7 @@ void *run_thread(void *vargs) { return NULL; } -int traverse_zooms(int *geomfd, off_t *geom_size, char *metabase, char *stringpool, struct pool **file_keys, unsigned *midx, unsigned *midy, char **layernames, int maxzoom, int minzoom, int basezoom, sqlite3 *outdb, double droprate, int buffer, const char *fname, const char *tmpdir, double gamma, int nlayers, char *prevent, char *additional, int full_detail, int low_detail, int min_detail) { +int traverse_zooms(int *geomfd, off_t *geom_size, char *metabase, char *stringpool, struct pool **file_keys, unsigned *midx, unsigned *midy, char **layernames, int maxzoom, int minzoom, int basezoom, sqlite3 *outdb, double droprate, int buffer, const char *fname, const char *tmpdir, double gamma, int nlayers, char *prevent, char *additional, int full_detail, int low_detail, int min_detail, long long *meta_off, long long *pool_off) { int i; for (i = 0; i <= maxzoom; i++) { long long most = 0; @@ -1125,6 +1127,8 @@ int traverse_zooms(int *geomfd, off_t *geom_size, char *metabase, char *stringpo args[thread].full_detail = full_detail; args[thread].low_detail = low_detail; args[thread].most = &most; // locked with var_lock + args[thread].meta_off = meta_off; + args[thread].pool_off = pool_off; args[thread].tasks = dispatches[thread].tasks; diff --git a/tile.h b/tile.h index 2ac893f..f7df52f 100644 --- a/tile.h +++ b/tile.h @@ -27,7 +27,7 @@ struct pool_val *deserialize_string(char **f, struct pool *p, int type); long long write_tile(char **geom, char *metabase, char *stringpool, unsigned *file_bbox, int z, unsigned x, unsigned y, int detail, int min_detail, int basezoom, struct pool **file_keys, char **layernames, sqlite3 *outdb, double droprate, int buffer, const char *fname, FILE **geomfile, int file_minzoom, int file_maxzoom, double todo, char *geomstart, long long along, double gamma, int nlayers, char *prevent, char *additional); -int traverse_zooms(int *geomfd, off_t *geom_size, char *metabase, char *stringpool, struct pool **file_keys, unsigned *midx, unsigned *midy, char **layernames, int maxzoom, int minzoom, int basezoom, sqlite3 *outdb, double droprate, int buffer, const char *fname, const char *tmpdir, 
double gamma, int nlayers, char *prevent, char *additional, int full_detail, int low_detail, int min_detail); +int traverse_zooms(int *geomfd, off_t *geom_size, char *metabase, char *stringpool, struct pool **file_keys, unsigned *midx, unsigned *midy, char **layernames, int maxzoom, int minzoom, int basezoom, sqlite3 *outdb, double droprate, int buffer, const char *fname, const char *tmpdir, double gamma, int nlayers, char *prevent, char *additional, int full_detail, int low_detail, int min_detail, long long *meta_off, long long *pool_off); extern unsigned initial_x, initial_y; extern int geometry_scale; From 40ec317c3626b560c97fee801702d9a29a7dea53 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Tue, 22 Dec 2015 14:02:31 -0800 Subject: [PATCH 12/19] Launch a separate thread to read each segment --- geojson.c | 123 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 98 insertions(+), 25 deletions(-) diff --git a/geojson.c b/geojson.c index 37a12f8..8544027 100644 --- a/geojson.c +++ b/geojson.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "jsonpull.h" #include "tile.h" @@ -745,6 +746,38 @@ void parse_json(json_pull *jp, const char *reading, long long *seq, long long *m } } +struct parse_json_args { + json_pull *jp; + const char *reading; + long long *seq; + long long *metapos; + long long *geompos; + long long *indexpos; + struct pool *exclude; + struct pool *include; + int exclude_all; + FILE *metafile; + FILE *geomfile; + FILE *indexfile; + struct memfile *poolfile; + struct memfile *treefile; + char *fname; + int maxzoom; + int basezoom; + int layer; + double droprate; + long long *file_bbox; + int segment; +}; + +void *run_parse_json(void *v) { + struct parse_json_args *pja = v; + + parse_json(pja->jp, pja->reading, pja->seq, pja->metapos, pja->geompos, pja->indexpos, pja->exclude, pja->include, pja->exclude_all, pja->metafile, pja->geomfile, pja->indexfile, pja->poolfile, pja->treefile, pja->fname, pja->maxzoom, pja->basezoom, pja->layer, pja->droprate, pja->file_bbox, pja->segment); + + return NULL; +} + struct jsonmap { char *map; long long off; @@ -808,10 +841,9 @@ struct reader { int read_json(int argc, char **argv, char *fname, const char *layername, int maxzoom, int minzoom, int basezoom, double basezoom_marker_width, sqlite3 *outdb, struct pool *exclude, struct pool *include, int exclude_all, double droprate, int buffer, const char *tmpdir, double gamma, char *prevent, char *additional) { int ret = EXIT_SUCCESS; -#define THREADS 10 - struct reader reader[THREADS]; + struct reader reader[CPUS]; int i; - for (i = 0; i < THREADS; i++) { + for (i = 0; i < CPUS; i++) { struct reader *r = reader + i; r->metaname = malloc(strlen(tmpdir) + strlen("/meta.XXXXXXXX") + 1); @@ -944,13 +976,13 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } if (map != NULL && map != MAP_FAILED) { - long long segs[THREADS + 1]; + long long segs[CPUS + 1]; segs[0] = 0; - segs[THREADS] = st.st_size - off; + segs[CPUS] = st.st_size - off; int i; - for (i = 1; i < THREADS; i++) { - segs[i] = off + (st.st_size - off) * i / THREADS; + for (i = 1; i < CPUS; i++) { + segs[i] = off + (st.st_size - off) * i / CPUS; while (segs[i] < st.st_size && map[segs[i]] != '\n') { segs[i]++; @@ -959,11 +991,47 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max printf("%d %lld\n", i, segs[i]); } - for (i = 0; i < THREADS; i++) { - json_pull *jp = json_begin_map(map + segs[i], segs[i + 1] - segs[i]); - parse_json(jp, 
reading, &seq, &reader[i].metapos, &reader[i].geompos, &reader[i].indexpos, exclude, include, exclude_all, reader[i].metafile, reader[i].geomfile, reader[i].indexfile, reader[i].poolfile, reader[i].treefile, fname, maxzoom, basezoom, source, droprate, reader[i].file_bbox, i); - free(jp->source); - json_end(jp); + struct parse_json_args pja[CPUS]; + pthread_t pthreads[CPUS]; + + for (i = 0; i < CPUS; i++) { + pja[i].jp = json_begin_map(map + segs[i], segs[i + 1] - segs[i]); + pja[i].reading = reading; + pja[i].seq = &seq; + pja[i].metapos = &reader[i].metapos; + pja[i].geompos = &reader[i].geompos; + pja[i].indexpos = &reader[i].indexpos; + pja[i].exclude = exclude; + pja[i].include = include; + pja[i].exclude_all = exclude_all; + pja[i].metafile = reader[i].metafile; + pja[i].geomfile = reader[i].geomfile; + pja[i].indexfile = reader[i].indexfile; + pja[i].poolfile = reader[i].poolfile; + pja[i].treefile = reader[i].treefile; + pja[i].fname = fname; + pja[i].maxzoom = maxzoom; + pja[i].basezoom = basezoom; + pja[i].layer = source; + pja[i].droprate = droprate; + pja[i].file_bbox = reader[i].file_bbox; + pja[i].segment = i; + + if (pthread_create(&pthreads[i], NULL, run_parse_json, &pja[i]) != 0) { + perror("pthread_create"); + exit(EXIT_FAILURE); + } + } + + for (i = 0; i < CPUS; i++) { + void *retval; + + if (pthread_join(pthreads[i], &retval) != 0) { + perror("pthread_join"); + } + + free(pja[i].jp->source); + json_end(pja[i].jp); } } else { FILE *fp = fdopen(fd, "r"); @@ -980,7 +1048,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } } - for (i = 0; i < THREADS; i++) { + for (i = 0; i < CPUS; i++) { fclose(reader[i].metafile); fclose(reader[i].geomfile); fclose(reader[i].indexfile); @@ -1070,7 +1138,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max unlink(indexname); long long indexpos = 0; - for (i = 0; i < THREADS; i++) { + for (i = 0; i < CPUS; i++) { if (reader[i].indexpos > 0) { void *map = mmap(NULL, reader[i].indexpos, PROT_READ, MAP_PRIVATE, reader[i].indexfd, 0); if (map == MAP_FAILED) { @@ -1371,12 +1439,17 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } unlink(indexname); - for (i = 0; i < THREADS; i++) { - reader[i].geom_map = mmap(NULL, reader[i].geomst.st_size, PROT_READ, MAP_PRIVATE, reader[i].geomfd, 0); - if (reader[i].geom_map == MAP_FAILED) { - perror("mmap unsorted geometry"); - exit(EXIT_FAILURE); + for (i = 0; i < CPUS; i++) { + reader[i].geom_map = NULL; + + if (reader[i].geomst.st_size > 0) { + reader[i].geom_map = mmap(NULL, reader[i].geomst.st_size, PROT_READ, MAP_PRIVATE, reader[i].geomfd, 0); + if (reader[i].geom_map == MAP_FAILED) { + perror("mmap unsorted geometry"); + exit(EXIT_FAILURE); + } } + if (close(reader[i].geomfd) != 0) { perror("close unsorted geometry"); } @@ -1434,7 +1507,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max perror("unmap sorted index"); } - for (i = 0; i < THREADS; i++) { + for (i = 0; i < CPUS; i++) { if (munmap(reader[i].geom_map, reader[i].geomst.st_size) != 0) { perror("unmap unsorted geometry"); } @@ -1469,7 +1542,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } if (!quiet) { - for (i = 0; i < THREADS; i++) { + for (i = 0; i < CPUS; i++) { fprintf(stderr, "%lld features, %lld bytes of geometry, %lld bytes of metadata, %lld bytes of string pool\n", seq, (long long) reader[i].geomst.st_size, (long long) reader[i].metast.st_size, 
reader[i].poolfile->off); } } @@ -1478,8 +1551,8 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max // but keep track of the offsets into it since we still need // segment+offset to find the data. - long long pool_off[THREADS]; - long long meta_off[THREADS]; + long long pool_off[CPUS]; + long long meta_off[CPUS]; char poolname[strlen(tmpdir) + strlen("/pool.XXXXXXXX") + 1]; sprintf(poolname, "%s%s", tmpdir, "/pool.XXXXXXXX"); @@ -1518,7 +1591,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max long long metapos = 0; long long poolpos = 0; - for (i = 0; i < THREADS; i++) { + for (i = 0; i < CPUS; i++) { if (reader[i].metapos > 0) { void *map = mmap(NULL, reader[i].metapos, PROT_READ, MAP_PRIVATE, reader[i].metafd, 0); if (map == MAP_FAILED) { @@ -1595,7 +1668,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max midlon = (maxlon + minlon) / 2; long long file_bbox[4] = { UINT_MAX, UINT_MAX, 0, 0 }; - for (i = 0; i < THREADS; i++) { + for (i = 0; i < CPUS; i++) { if (reader[i].file_bbox[0] < file_bbox[0]) { file_bbox[0] = reader[i].file_bbox[0]; } From ca4d1beb7c24e6df02ba04c89205b0e8fc7ff3c7 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Tue, 22 Dec 2015 14:27:38 -0800 Subject: [PATCH 13/19] Guard against trying to map an empty string pool into memory --- geojson.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/geojson.c b/geojson.c index 5378a92..e715cfc 100644 --- a/geojson.c +++ b/geojson.c @@ -1633,10 +1633,13 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max exit(EXIT_FAILURE); } - char *stringpool = (char *) mmap(NULL, poolpos, PROT_READ, MAP_PRIVATE, poolfd, 0); - if (stringpool == MAP_FAILED) { - perror("mmap string pool"); - exit(EXIT_FAILURE); + char *stringpool = NULL; + if (poolpos > 0) { // Will be 0 if -X was specified + stringpool = (char *) mmap(NULL, poolpos, PROT_READ, MAP_PRIVATE, poolfd, 0); + if (stringpool == MAP_FAILED) { + perror("mmap string pool"); + exit(EXIT_FAILURE); + } } unsigned midx = 0, midy = 0; From fde3aa45de8d0fcebf97458bc47aa26c25a65253 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Tue, 22 Dec 2015 14:58:19 -0800 Subject: [PATCH 14/19] Make parallel reading a command-line option --- README.md | 5 +++++ geojson.c | 47 +++++++++++++++++++++++++++-------------------- man/tippecanoe.1 | 9 +++++++++ 3 files changed, 41 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 3ad7f94..8620c32 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,11 @@ Options * -o _file_.mbtiles: Name the output file. * -f: Delete the mbtiles file if it already exists instead of giving an error * -t _directory_: Put the temporary files in _directory_. + * -P: Use multiple threads to read different parts of each input file at once. + This will only work if the input is line-delimited JSON with each Feature on its + own line, because it knows nothing of the top-level structure around the Features. + In addition, it only works if the input is a named file that can be mapped into memory + rather than a stream that can only be read sequentially. 
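The newline alignment this option depends on is done in `read_json` when the input is split across threads. Below is a minimal sketch of that split, assuming the whole input is already memory-mapped; `split_segments` and `nsegs` are illustrative names, not identifiers from the patch, which does the same work in-line over `segs[]` and `CPUS`:

```c
/*
 * Divide a mapped buffer of length len into nsegs slices whose
 * boundaries fall on newlines, so no Feature line is split between
 * threads.  segs must have room for nsegs + 1 entries; thread i then
 * parses the bytes from segs[i] up to segs[i + 1].
 */
static void split_segments(const char *map, long long len, long long *segs, int nsegs) {
	segs[0] = 0;
	segs[nsegs] = len;

	for (int i = 1; i < nsegs; i++) {
		segs[i] = len * i / nsegs;

		/* advance the boundary to the next newline */
		while (segs[i] < len && map[segs[i]] != '\n') {
			segs[i]++;
		}
	}
}
```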
### Zoom levels and resolution diff --git a/geojson.c b/geojson.c index e715cfc..60ffa37 100644 --- a/geojson.c +++ b/geojson.c @@ -823,15 +823,15 @@ struct reader { int geomfd; int indexfd; - FILE *metafile; - struct memfile *poolfile; - struct memfile *treefile; - FILE *geomfile; - FILE *indexfile; + FILE *metafile; + struct memfile *poolfile; + struct memfile *treefile; + FILE *geomfile; + FILE *indexfile; - long long metapos; - long long geompos; - long long indexpos; + long long metapos; + long long geompos; + long long indexpos; long long *file_bbox; @@ -841,7 +841,7 @@ struct reader { char *geom_map; }; -int read_json(int argc, char **argv, char *fname, const char *layername, int maxzoom, int minzoom, int basezoom, double basezoom_marker_width, sqlite3 *outdb, struct pool *exclude, struct pool *include, int exclude_all, double droprate, int buffer, const char *tmpdir, double gamma, char *prevent, char *additional) { +int read_json(int argc, char **argv, char *fname, const char *layername, int maxzoom, int minzoom, int basezoom, double basezoom_marker_width, sqlite3 *outdb, struct pool *exclude, struct pool *include, int exclude_all, double droprate, int buffer, const char *tmpdir, double gamma, char *prevent, char *additional, int read_parallel) { int ret = EXIT_SUCCESS; struct reader reader[CPUS]; @@ -971,10 +971,12 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max char *map = NULL; off_t off = 0; - if (fstat(fd, &st) == 0) { - off = lseek(fd, 0, SEEK_CUR); - if (off >= 0) { - map = mmap(NULL, st.st_size - off, PROT_READ, MAP_PRIVATE, fd, off); + if (read_parallel) { + if (fstat(fd, &st) == 0) { + off = lseek(fd, 0, SEEK_CUR); + if (off >= 0) { + map = mmap(NULL, st.st_size - off, PROT_READ, MAP_PRIVATE, fd, off); + } } } @@ -1067,8 +1069,8 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } if (reader[i].geomst.st_size == 0 || reader[i].metast.st_size == 0) { - fprintf(stderr, "did not read any valid geometries\n"); // XXX - // exit(EXIT_FAILURE); + fprintf(stderr, "did not read any valid geometries\n"); // XXX + // exit(EXIT_FAILURE); } } @@ -1634,7 +1636,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } char *stringpool = NULL; - if (poolpos > 0) { // Will be 0 if -X was specified + if (poolpos > 0) { // Will be 0 if -X was specified stringpool = (char *) mmap(NULL, poolpos, PROT_READ, MAP_PRIVATE, poolfd, 0); if (stringpool == MAP_FAILED) { perror("mmap string pool"); @@ -1673,7 +1675,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max midlat = (maxlat + minlat) / 2; midlon = (maxlon + minlon) / 2; - long long file_bbox[4] = { UINT_MAX, UINT_MAX, 0, 0 }; + long long file_bbox[4] = {UINT_MAX, UINT_MAX, 0, 0}; for (i = 0; i < CPUS; i++) { if (reader[i].file_bbox[0] < file_bbox[0]) { file_bbox[0] = reader[i].file_bbox[0]; @@ -1763,13 +1765,14 @@ int main(int argc, char **argv) { pool_init(&exclude, 0); pool_init(&include, 0); int exclude_all = 0; + int read_parallel = 0; for (i = 0; i < 256; i++) { prevent[i] = 0; additional[i] = 0; } - while ((i = getopt(argc, argv, "l:n:z:Z:d:D:m:o:x:y:r:b:fXt:g:p:vqa:B:")) != -1) { + while ((i = getopt(argc, argv, "l:n:z:Z:d:D:m:o:x:y:r:b:fXt:g:p:vqa:B:P")) != -1) { switch (i) { case 'n': name = optarg; @@ -1882,8 +1885,12 @@ int main(int argc, char **argv) { fprintf(stderr, VERSION); exit(EXIT_FAILURE); + case 'P': + read_parallel = 1; + break; + default: - fprintf(stderr, "Usage: %s -o out.mbtiles [-n 
name] [-l layername] [-z maxzoom] [-Z minzoom] [-B basezoom] [-d detail] [-D lower-detail] [-m min-detail] [-x excluded-field ...] [-y included-field ...] [-X] [-r droprate] [-b buffer] [-t tmpdir] [-a rco] [-p sfkld] [-q] [file.json ...]\n", argv[0]); + fprintf(stderr, "Usage: %s -o out.mbtiles [-n name] [-l layername] [-z maxzoom] [-Z minzoom] [-B basezoom] [-d detail] [-D lower-detail] [-m min-detail] [-x excluded-field ...] [-y included-field ...] [-X] [-r droprate] [-b buffer] [-t tmpdir] [-a rco] [-p sfkld] [-q] [-P] [file.json ...]\n", argv[0]); exit(EXIT_FAILURE); } } @@ -1927,7 +1934,7 @@ int main(int argc, char **argv) { sqlite3 *outdb = mbtiles_open(outdir, argv); int ret = EXIT_SUCCESS; - ret = read_json(argc - optind, argv + optind, name ? name : outdir, layer, maxzoom, minzoom, basezoom, basezoom_marker_width, outdb, &exclude, &include, exclude_all, droprate, buffer, tmpdir, gamma, prevent, additional); + ret = read_json(argc - optind, argv + optind, name ? name : outdir, layer, maxzoom, minzoom, basezoom, basezoom_marker_width, outdb, &exclude, &include, exclude_all, droprate, buffer, tmpdir, gamma, prevent, additional, read_parallel); mbtiles_close(outdb, argv); diff --git a/man/tippecanoe.1 b/man/tippecanoe.1 index 28d1828..f8a3798 100644 --- a/man/tippecanoe.1 +++ b/man/tippecanoe.1 @@ -5,6 +5,9 @@ Builds vector tilesets \[la]http://geojson.org/\[ra] features. This is a tool for making maps from huge datasets \[la]MADE_WITH.md\[ra]\&. +.PP +[Build Status](https://travis\-ci.org/mapbox/tippecanoe.svg) +\[la]https://travis-ci.org/mapbox/tippecanoe\[ra] .SH Intent .PP The goal of Tippecanoe is to enable making a scale\-independent view of your data, @@ -67,6 +70,12 @@ specified, the files are all merged into the single named layer. \-f: Delete the mbtiles file if it already exists instead of giving an error .IP \(bu 2 \-t \fIdirectory\fP: Put the temporary files in \fIdirectory\fP\&. +.IP \(bu 2 +\-P: Use multiple threads to read different parts of each input file at once. +This will only work if the input is line\-delimited JSON with each Feature on its +own line, because it knows nothing of the top\-level structure around the Features. +In addition, it only works if the input is a named file that can be mapped into memory +rather than a stream that can only be read sequentially. 
.RE .SS Zoom levels and resolution .RS From 02cf4d46ade9ad41eece17f158faeb1c4a7ae768 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Tue, 22 Dec 2015 15:42:51 -0800 Subject: [PATCH 15/19] Close files that were being left open --- geojson.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/geojson.c b/geojson.c index 60ffa37..a8882d3 100644 --- a/geojson.c +++ b/geojson.c @@ -1154,7 +1154,12 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max perror("Reunify index"); exit(EXIT_FAILURE); } - munmap(map, reader[i].indexpos); + if (munmap(map, reader[i].indexpos) != 0) { + perror("unmap unmerged index"); + } + if (close(reader[i].indexfd) != 0) { + perror("close unmerged index"); + } indexpos += reader[i].indexpos; } } @@ -1454,10 +1459,6 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max exit(EXIT_FAILURE); } } - - if (close(reader[i].geomfd) != 0) { - perror("close unsorted geometry"); - } } char geomname[strlen(tmpdir) + strlen("/geom.XXXXXXXX") + 1]; @@ -1516,6 +1517,9 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max if (munmap(reader[i].geom_map, reader[i].geomst.st_size) != 0) { perror("unmap unsorted geometry"); } + if (close(reader[i].geomfd) != 0) { + perror("close unsorted geometry"); + } } if (close(indexfd) != 0) { perror("close sorted index"); @@ -1607,12 +1611,16 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max perror("Reunify meta"); exit(EXIT_FAILURE); } - munmap(map, reader[i].metapos); + if (munmap(map, reader[i].metapos) != 0) { + perror("unmap unmerged meta"); + } } meta_off[i] = metapos; metapos += reader[i].metapos; - close(reader[i].metafd); + if (close(reader[i].metafd) != 0) { + perror("close unmerged meta"); + } if (reader[i].poolfile->off > 0) { if (fwrite(reader[i].poolfile->map, reader[i].poolfile->off, 1, poolfile) != 1) { @@ -1660,8 +1668,10 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max perror("close meta"); } - if (munmap(stringpool, poolpos) != 0) { - perror("munmap stringpool"); + if (poolpos > 0) { + if (munmap(stringpool, poolpos) != 0) { + perror("munmap stringpool"); + } } if (close(poolfd) < 0) { perror("close pool"); From 2fd00413cf25b6b25b26f49671ec436e679da079 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Tue, 22 Dec 2015 16:29:45 -0800 Subject: [PATCH 16/19] Clean up progress indicator and feature sequence --- geojson.c | 67 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/geojson.c b/geojson.c index a8882d3..f1e427a 100644 --- a/geojson.c +++ b/geojson.c @@ -441,7 +441,7 @@ long long addpool(struct memfile *poolfile, struct memfile *treefile, char *s, c return off; } -int serialize_geometry(json_object *geometry, json_object *properties, const char *reading, int line, long long *seq, long long *metapos, long long *geompos, long long *indexpos, struct pool *exclude, struct pool *include, int exclude_all, FILE *metafile, FILE *geomfile, FILE *indexfile, struct memfile *poolfile, struct memfile *treefile, const char *fname, int maxzoom, int basezoom, int layer, double droprate, long long *file_bbox, json_object *tippecanoe, int segment) { +int serialize_geometry(json_object *geometry, json_object *properties, const char *reading, int line, long long *layer_seq, volatile long long *progress_seq, long long *metapos, long long *geompos, long long *indexpos, struct 
pool *exclude, struct pool *include, int exclude_all, FILE *metafile, FILE *geomfile, FILE *indexfile, struct memfile *poolfile, struct memfile *treefile, const char *fname, int maxzoom, int basezoom, int layer, double droprate, long long *file_bbox, json_object *tippecanoe, int segment) { json_object *geometry_type = json_hash_get(geometry, "type"); if (geometry_type == NULL) { static int warned = 0; @@ -552,7 +552,7 @@ int serialize_geometry(json_object *geometry, json_object *properties, const cha long long geomstart = *geompos; serialize_byte(geomfile, mb_geometry[t], geompos, fname); - serialize_long_long(geomfile, *seq, geompos, fname); + serialize_long_long(geomfile, *layer_seq, geompos, fname); serialize_long_long(geomfile, (layer << 2) | ((tippecanoe_minzoom != -1) << 1) | (tippecanoe_maxzoom != -1), geompos, fname); if (tippecanoe_minzoom != -1) { @@ -622,17 +622,18 @@ int serialize_geometry(json_object *geometry, json_object *properties, const cha } } - if (*seq % 10000 == 0) { + if (*progress_seq % 10000 == 0) { if (!quiet) { - fprintf(stderr, "Read %.2f million features\r", *seq / 1000000.0); + fprintf(stderr, "Read %.2f million features\r", *progress_seq / 1000000.0); } } - (*seq)++; + (*progress_seq)++; + (*layer_seq)++; return 1; } -void parse_json(json_pull *jp, const char *reading, long long *seq, long long *metapos, long long *geompos, long long *indexpos, struct pool *exclude, struct pool *include, int exclude_all, FILE *metafile, FILE *geomfile, FILE *indexfile, struct memfile *poolfile, struct memfile *treefile, char *fname, int maxzoom, int basezoom, int layer, double droprate, long long *file_bbox, int segment) { +void parse_json(json_pull *jp, const char *reading, long long *layer_seq, volatile long long *progress_seq, long long *metapos, long long *geompos, long long *indexpos, struct pool *exclude, struct pool *include, int exclude_all, FILE *metafile, FILE *geomfile, FILE *indexfile, struct memfile *poolfile, struct memfile *treefile, char *fname, int maxzoom, int basezoom, int layer, double droprate, long long *file_bbox, int segment) { long long found_hashes = 0; long long found_features = 0; long long found_geometries = 0; @@ -697,7 +698,7 @@ void parse_json(json_pull *jp, const char *reading, long long *seq, long long *m } found_geometries++; - serialize_geometry(j, NULL, reading, jp->line, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, NULL, segment); + serialize_geometry(j, NULL, reading, jp->line, layer_seq, progress_seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, NULL, segment); json_free(j); continue; } @@ -732,10 +733,10 @@ void parse_json(json_pull *jp, const char *reading, long long *seq, long long *m if (geometries != NULL) { int g; for (g = 0; g < geometries->length; g++) { - serialize_geometry(geometries->array[g], properties, reading, jp->line, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe, segment); + serialize_geometry(geometries->array[g], properties, reading, jp->line, layer_seq, progress_seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe, 
segment); } } else { - serialize_geometry(geometry, properties, reading, jp->line, seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe, segment); + serialize_geometry(geometry, properties, reading, jp->line, layer_seq, progress_seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe, segment); } json_free(j); @@ -745,14 +746,15 @@ void parse_json(json_pull *jp, const char *reading, long long *seq, long long *m if (!quiet) { fprintf(stderr, " \r"); - // (stderr, "Read 10000.00 million features\r", *seq / 1000000.0); + // (stderr, "Read 10000.00 million features\r", *progress_seq / 1000000.0); } } struct parse_json_args { json_pull *jp; const char *reading; - long long *seq; + long long *layer_seq; + volatile long long *progress_seq; long long *metapos; long long *geompos; long long *indexpos; @@ -776,7 +778,7 @@ struct parse_json_args { void *run_parse_json(void *v) { struct parse_json_args *pja = v; - parse_json(pja->jp, pja->reading, pja->seq, pja->metapos, pja->geompos, pja->indexpos, pja->exclude, pja->include, pja->exclude_all, pja->metafile, pja->geomfile, pja->indexfile, pja->poolfile, pja->treefile, pja->fname, pja->maxzoom, pja->basezoom, pja->layer, pja->droprate, pja->file_bbox, pja->segment); + parse_json(pja->jp, pja->reading, pja->layer_seq, pja->progress_seq, pja->metapos, pja->geompos, pja->indexpos, pja->exclude, pja->include, pja->exclude_all, pja->metafile, pja->geomfile, pja->indexfile, pja->poolfile, pja->treefile, pja->fname, pja->maxzoom, pja->basezoom, pja->layer, pja->droprate, pja->file_bbox, pja->segment); return NULL; } @@ -933,7 +935,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max r->file_bbox[2] = r->file_bbox[3] = 0; } - long long seq = 0; + volatile long long progress_seq = 0; int nlayers; if (layername != NULL) { @@ -992,8 +994,13 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max while (segs[i] < st.st_size && map[segs[i]] != '\n') { segs[i]++; } + } - printf("%d %lld\n", i, segs[i]); + long long layer_seq[CPUS]; + for (i = 0; i < CPUS; i++) { + // To preserve feature ordering, unique id for each segment + // begins with that segment's offset into the input + layer_seq[i] = segs[i]; } struct parse_json_args pja[CPUS]; @@ -1002,7 +1009,8 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max for (i = 0; i < CPUS; i++) { pja[i].jp = json_begin_map(map + segs[i], segs[i + 1] - segs[i]); pja[i].reading = reading; - pja[i].seq = &seq; + pja[i].layer_seq = &layer_seq[i]; + pja[i].progress_seq = &progress_seq; pja[i].metapos = &reader[i].metapos; pja[i].geompos = &reader[i].geompos; pja[i].indexpos = &reader[i].indexpos; @@ -1046,8 +1054,9 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max continue; } + long long layer_seq = 0; json_pull *jp = json_begin_file(fp); - parse_json(jp, reading, &seq, &reader[0].metapos, &reader[0].geompos, &reader[0].indexpos, exclude, include, exclude_all, reader[0].metafile, reader[0].geomfile, reader[0].indexfile, reader[0].poolfile, reader[0].treefile, fname, maxzoom, basezoom, source, droprate, reader[0].file_bbox, 0); + parse_json(jp, reading, &layer_seq, &progress_seq, &reader[0].metapos, &reader[0].geompos, &reader[0].indexpos, exclude, include, 
exclude_all, reader[0].metafile, reader[0].geomfile, reader[0].indexfile, reader[0].poolfile, reader[0].treefile, fname, maxzoom, basezoom, source, droprate, reader[0].file_bbox, 0); json_end(jp); fclose(fp); } @@ -1067,11 +1076,6 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max perror("stat meta\n"); exit(EXIT_FAILURE); } - - if (reader[i].geomst.st_size == 0 || reader[i].metast.st_size == 0) { - fprintf(stderr, "did not read any valid geometries\n"); // XXX - // exit(EXIT_FAILURE); - } } struct pool file_keys1[nlayers]; @@ -1514,8 +1518,10 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } for (i = 0; i < CPUS; i++) { - if (munmap(reader[i].geom_map, reader[i].geomst.st_size) != 0) { - perror("unmap unsorted geometry"); + if (reader[i].geomst.st_size > 0) { + if (munmap(reader[i].geom_map, reader[i].geomst.st_size) != 0) { + perror("unmap unsorted geometry"); + } } if (close(reader[i].geomfd) != 0) { perror("close unsorted geometry"); @@ -1550,12 +1556,6 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max size[j] = 0; } - if (!quiet) { - for (i = 0; i < CPUS; i++) { - fprintf(stderr, "%lld features, %lld bytes of geometry, %lld bytes of metadata, %lld bytes of string pool\n", seq, (long long) reader[i].geomst.st_size, (long long) reader[i].metast.st_size, reader[i].poolfile->off); - } - } - // Create a combined string pool and a combined metadata file // but keep track of the offsets into it since we still need // segment+offset to find the data. @@ -1652,6 +1652,15 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } } + if (geompos == 0 || metapos == 0) { + fprintf(stderr, "did not read any valid geometries\n"); + exit(EXIT_FAILURE); + } + + if (!quiet) { + fprintf(stderr, "%lld features, %lld bytes of geometry, %lld bytes of metadata, %lld bytes of string pool\n", progress_seq, geompos, metapos, poolpos); + } + unsigned midx = 0, midy = 0; int written = traverse_zooms(fd, size, meta, stringpool, file_keys, &midx, &midy, layernames, maxzoom, minzoom, basezoom, outdb, droprate, buffer, fname, tmpdir, gamma, nlayers, prevent, additional, full_detail, low_detail, min_detail, meta_off, pool_off); From 228a4d6bb91306c3884c32a7df55c24ac01ffafe Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Tue, 22 Dec 2015 16:58:27 -0800 Subject: [PATCH 17/19] Make the geometric origin a per-reader property for thread safety --- geojson.c | 43 +++++++++++++++++++++++++++---------------- geometry.cc | 2 +- geometry.hh | 2 +- tile.cc | 18 +++++++++++------- tile.h | 2 +- 5 files changed, 41 insertions(+), 26 deletions(-) diff --git a/geojson.c b/geojson.c index f1e427a..996fb18 100644 --- a/geojson.c +++ b/geojson.c @@ -33,9 +33,7 @@ int full_detail = -1; int min_detail = 7; int quiet = 0; -unsigned initial_x = 0, initial_y = 0; int geometry_scale = 0; -int initialized = 0; #define GEOM_POINT 0 /* array of positions */ #define GEOM_MULTIPOINT 1 /* array of arrays of positions */ @@ -142,7 +140,7 @@ void serialize_string(FILE *out, const char *s, long long *fpos, const char *fna *fpos += len + 1; } -void parse_geometry(int t, json_object *j, long long *bbox, long long *fpos, FILE *out, int op, const char *fname, int line, long long *wx, long long *wy, int *initialized) { +void parse_geometry(int t, json_object *j, long long *bbox, long long *fpos, FILE *out, int op, const char *fname, int line, long long *wx, long long *wy, int *initialized, unsigned *initial_x, unsigned 
*initial_y) { if (j == NULL || j->type != JSON_ARRAY) { fprintf(stderr, "%s:%d: expected array for type %d\n", fname, line, t); return; @@ -160,7 +158,7 @@ void parse_geometry(int t, json_object *j, long long *bbox, long long *fpos, FIL } } - parse_geometry(within, j->array[i], bbox, fpos, out, op, fname, line, wx, wy, initialized); + parse_geometry(within, j->array[i], bbox, fpos, out, op, fname, line, wx, wy, initialized, initial_x, initial_y); } } else { if (j->length >= 2 && j->array[0]->type == JSON_NUMBER && j->array[1]->type == JSON_NUMBER) { @@ -194,8 +192,8 @@ void parse_geometry(int t, json_object *j, long long *bbox, long long *fpos, FIL } if (!*initialized) { - initial_x = x; - initial_y = y; + *initial_x = (x >> geometry_scale) << geometry_scale; + *initial_y = (y >> geometry_scale) << geometry_scale; *wx = x; *wy = y; *initialized = 1; @@ -441,7 +439,7 @@ long long addpool(struct memfile *poolfile, struct memfile *treefile, char *s, c return off; } -int serialize_geometry(json_object *geometry, json_object *properties, const char *reading, int line, long long *layer_seq, volatile long long *progress_seq, long long *metapos, long long *geompos, long long *indexpos, struct pool *exclude, struct pool *include, int exclude_all, FILE *metafile, FILE *geomfile, FILE *indexfile, struct memfile *poolfile, struct memfile *treefile, const char *fname, int maxzoom, int basezoom, int layer, double droprate, long long *file_bbox, json_object *tippecanoe, int segment) { +int serialize_geometry(json_object *geometry, json_object *properties, const char *reading, int line, long long *layer_seq, volatile long long *progress_seq, long long *metapos, long long *geompos, long long *indexpos, struct pool *exclude, struct pool *include, int exclude_all, FILE *metafile, FILE *geomfile, FILE *indexfile, struct memfile *poolfile, struct memfile *treefile, const char *fname, int maxzoom, int basezoom, int layer, double droprate, long long *file_bbox, json_object *tippecanoe, int segment, int *initialized, unsigned *initial_x, unsigned *initial_y) { json_object *geometry_type = json_hash_get(geometry, "type"); if (geometry_type == NULL) { static int warned = 0; @@ -564,8 +562,8 @@ int serialize_geometry(json_object *geometry, json_object *properties, const cha serialize_int(geomfile, segment, geompos, fname); serialize_long_long(geomfile, metastart, geompos, fname); - long long wx = initial_x, wy = initial_y; - parse_geometry(t, coordinates, bbox, geompos, geomfile, VT_MOVETO, fname, line, &wx, &wy, &initialized); + long long wx = *initial_x, wy = *initial_y; + parse_geometry(t, coordinates, bbox, geompos, geomfile, VT_MOVETO, fname, line, &wx, &wy, initialized, initial_x, initial_y); serialize_byte(geomfile, VT_END, geompos, fname); /* @@ -633,7 +631,7 @@ int serialize_geometry(json_object *geometry, json_object *properties, const cha return 1; } -void parse_json(json_pull *jp, const char *reading, long long *layer_seq, volatile long long *progress_seq, long long *metapos, long long *geompos, long long *indexpos, struct pool *exclude, struct pool *include, int exclude_all, FILE *metafile, FILE *geomfile, FILE *indexfile, struct memfile *poolfile, struct memfile *treefile, char *fname, int maxzoom, int basezoom, int layer, double droprate, long long *file_bbox, int segment) { +void parse_json(json_pull *jp, const char *reading, long long *layer_seq, volatile long long *progress_seq, long long *metapos, long long *geompos, long long *indexpos, struct pool *exclude, struct pool *include, int exclude_all, 
FILE *metafile, FILE *geomfile, FILE *indexfile, struct memfile *poolfile, struct memfile *treefile, char *fname, int maxzoom, int basezoom, int layer, double droprate, long long *file_bbox, int segment, int *initialized, unsigned *initial_x, unsigned *initial_y) { long long found_hashes = 0; long long found_features = 0; long long found_geometries = 0; @@ -698,7 +696,7 @@ void parse_json(json_pull *jp, const char *reading, long long *layer_seq, volati } found_geometries++; - serialize_geometry(j, NULL, reading, jp->line, layer_seq, progress_seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, NULL, segment); + serialize_geometry(j, NULL, reading, jp->line, layer_seq, progress_seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, NULL, segment, initialized, initial_x, initial_y); json_free(j); continue; } @@ -733,10 +731,10 @@ void parse_json(json_pull *jp, const char *reading, long long *layer_seq, volati if (geometries != NULL) { int g; for (g = 0; g < geometries->length; g++) { - serialize_geometry(geometries->array[g], properties, reading, jp->line, layer_seq, progress_seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe, segment); + serialize_geometry(geometries->array[g], properties, reading, jp->line, layer_seq, progress_seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe, segment, initialized, initial_x, initial_y); } } else { - serialize_geometry(geometry, properties, reading, jp->line, layer_seq, progress_seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe, segment); + serialize_geometry(geometry, properties, reading, jp->line, layer_seq, progress_seq, metapos, geompos, indexpos, exclude, include, exclude_all, metafile, geomfile, indexfile, poolfile, treefile, fname, maxzoom, basezoom, layer, droprate, file_bbox, tippecanoe, segment, initialized, initial_x, initial_y); } json_free(j); @@ -773,12 +771,15 @@ struct parse_json_args { double droprate; long long *file_bbox; int segment; + int *initialized; + unsigned *initial_x; + unsigned *initial_y; }; void *run_parse_json(void *v) { struct parse_json_args *pja = v; - parse_json(pja->jp, pja->reading, pja->layer_seq, pja->progress_seq, pja->metapos, pja->geompos, pja->indexpos, pja->exclude, pja->include, pja->exclude_all, pja->metafile, pja->geomfile, pja->indexfile, pja->poolfile, pja->treefile, pja->fname, pja->maxzoom, pja->basezoom, pja->layer, pja->droprate, pja->file_bbox, pja->segment); + parse_json(pja->jp, pja->reading, pja->layer_seq, pja->progress_seq, pja->metapos, pja->geompos, pja->indexpos, pja->exclude, pja->include, pja->exclude_all, pja->metafile, pja->geomfile, pja->indexfile, pja->poolfile, pja->treefile, pja->fname, pja->maxzoom, pja->basezoom, pja->layer, pja->droprate, pja->file_bbox, pja->segment, pja->initialized, pja->initial_x, pja->initial_y); return NULL; } @@ -937,6 +938,12 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max volatile long long progress_seq = 0; + int 
initialized[CPUS]; + unsigned initial_x[CPUS], initial_y[CPUS]; + for (i = 0; i < CPUS; i++) { + initialized[i] = initial_x[i] = initial_y[i] = 0; + } + int nlayers; if (layername != NULL) { nlayers = 1; @@ -1029,6 +1036,9 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max pja[i].droprate = droprate; pja[i].file_bbox = reader[i].file_bbox; pja[i].segment = i; + pja[i].initialized = &initialized[i]; + pja[i].initial_x = &initial_x[i]; + pja[i].initial_y = &initial_y[i]; if (pthread_create(&pthreads[i], NULL, run_parse_json, &pja[i]) != 0) { perror("pthread_create"); @@ -1056,7 +1066,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max long long layer_seq = 0; json_pull *jp = json_begin_file(fp); - parse_json(jp, reading, &layer_seq, &progress_seq, &reader[0].metapos, &reader[0].geompos, &reader[0].indexpos, exclude, include, exclude_all, reader[0].metafile, reader[0].geomfile, reader[0].indexfile, reader[0].poolfile, reader[0].treefile, fname, maxzoom, basezoom, source, droprate, reader[0].file_bbox, 0); + parse_json(jp, reading, &layer_seq, &progress_seq, &reader[0].metapos, &reader[0].geompos, &reader[0].indexpos, exclude, include, exclude_all, reader[0].metafile, reader[0].geomfile, reader[0].indexfile, reader[0].poolfile, reader[0].treefile, fname, maxzoom, basezoom, source, droprate, reader[0].file_bbox, 0, &initialized[0], &initial_x[0], &initial_y[0]); json_end(jp); fclose(fp); } @@ -1498,6 +1508,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max for (i = 0; i < indexpos / sizeof(struct index); i++) { fwrite_check(reader[index_map[i].segment].geom_map + index_map[i].start, sizeof(char), index_map[i].end - index_map[i].start, geomfile, fname); sum += index_map[i].end - index_map[i].start; + geompos += index_map[i].end - index_map[i].start; long long p = 1000 * i / (indexpos / sizeof(struct index)); if (p != progress) { @@ -1662,7 +1673,7 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } unsigned midx = 0, midy = 0; - int written = traverse_zooms(fd, size, meta, stringpool, file_keys, &midx, &midy, layernames, maxzoom, minzoom, basezoom, outdb, droprate, buffer, fname, tmpdir, gamma, nlayers, prevent, additional, full_detail, low_detail, min_detail, meta_off, pool_off); + int written = traverse_zooms(fd, size, meta, stringpool, file_keys, &midx, &midy, layernames, maxzoom, minzoom, basezoom, outdb, droprate, buffer, fname, tmpdir, gamma, nlayers, prevent, additional, full_detail, low_detail, min_detail, meta_off, pool_off, initial_x, initial_y); if (maxzoom != written) { fprintf(stderr, "\n\n\n*** NOTE TILES ONLY COMPLETE THROUGH ZOOM %d ***\n\n\n", written); diff --git a/geometry.cc b/geometry.cc index ce30af2..0005a5c 100644 --- a/geometry.cc +++ b/geometry.cc @@ -18,7 +18,7 @@ extern "C" { #include "projection.h" } -drawvec decode_geometry(char **meta, int z, unsigned tx, unsigned ty, int detail, long long *bbox) { +drawvec decode_geometry(char **meta, int z, unsigned tx, unsigned ty, int detail, long long *bbox, unsigned initial_x, unsigned initial_y) { drawvec out; bbox[0] = LONG_LONG_MAX; diff --git a/geometry.hh b/geometry.hh index c6019d1..3f057ec 100644 --- a/geometry.hh +++ b/geometry.hh @@ -16,7 +16,7 @@ struct draw { typedef std::vector drawvec; -drawvec decode_geometry(char **meta, int z, unsigned tx, unsigned ty, int detail, long long *bbox); +drawvec decode_geometry(char **meta, int z, unsigned tx, unsigned ty, int detail, long long 
*bbox, unsigned initial_x, unsigned initial_y); void to_tile_scale(drawvec &geom, int z, int detail); drawvec remove_noop(drawvec geom, int type, int shift); drawvec clip_point(drawvec &geom, int z, int detail, long long buffer); diff --git a/tile.cc b/tile.cc index f13ed6c..600922e 100644 --- a/tile.cc +++ b/tile.cc @@ -363,7 +363,7 @@ struct sll { } }; -void rewrite(drawvec &geom, int z, int nextzoom, int maxzoom, long long *bbox, unsigned tx, unsigned ty, int buffer, int line_detail, int *within, long long *geompos, FILE **geomfile, const char *fname, signed char t, int layer, long long metastart, signed char feature_minzoom, int child_shards, int max_zoom_increment, long long seq, int tippecanoe_minzoom, int tippecanoe_maxzoom, int segment) { +void rewrite(drawvec &geom, int z, int nextzoom, int maxzoom, long long *bbox, unsigned tx, unsigned ty, int buffer, int line_detail, int *within, long long *geompos, FILE **geomfile, const char *fname, signed char t, int layer, long long metastart, signed char feature_minzoom, int child_shards, int max_zoom_increment, long long seq, int tippecanoe_minzoom, int tippecanoe_maxzoom, int segment, unsigned *initial_x, unsigned *initial_y) { if (geom.size() > 0 && nextzoom <= maxzoom) { int xo, yo; int span = 1 << (nextzoom - z); @@ -444,7 +444,7 @@ void rewrite(drawvec &geom, int z, int nextzoom, int maxzoom, long long *bbox, u } serialize_int(geomfile[j], segment, &geompos[j], fname); serialize_long_long(geomfile[j], metastart, &geompos[j], fname); - long long wx = initial_x, wy = initial_y; + long long wx = initial_x[segment], wy = initial_y[segment]; for (unsigned u = 0; u < geom.size(); u++) { serialize_byte(geomfile[j], geom[u].op, &geompos[j], fname); @@ -465,7 +465,7 @@ void rewrite(drawvec &geom, int z, int nextzoom, int maxzoom, long long *bbox, u } } -long long write_tile(char **geoms, char *metabase, char *stringpool, int z, unsigned tx, unsigned ty, int detail, int min_detail, int basezoom, struct pool **file_keys, char **layernames, sqlite3 *outdb, double droprate, int buffer, const char *fname, FILE **geomfile, int minzoom, int maxzoom, double todo, char *geomstart, volatile long long *along, double gamma, int nlayers, char *prevent, char *additional, int child_shards, long long *meta_off, long long *pool_off) { +long long write_tile(char **geoms, char *metabase, char *stringpool, int z, unsigned tx, unsigned ty, int detail, int min_detail, int basezoom, struct pool **file_keys, char **layernames, sqlite3 *outdb, double droprate, int buffer, const char *fname, FILE **geomfile, int minzoom, int maxzoom, double todo, char *geomstart, volatile long long *along, double gamma, int nlayers, char *prevent, char *additional, int child_shards, long long *meta_off, long long *pool_off, unsigned *initial_x, unsigned *initial_y) { int line_detail; double fraction = 1; @@ -566,7 +566,7 @@ long long write_tile(char **geoms, char *metabase, char *stringpool, int z, unsi char *meta = metabase + metastart + meta_off[segment]; long long bbox[4]; - drawvec geom = decode_geometry(geoms, z, tx, ty, line_detail, bbox); + drawvec geom = decode_geometry(geoms, z, tx, ty, line_detail, bbox, initial_x[segment], initial_y[segment]); signed char feature_minzoom; deserialize_byte(geoms, &feature_minzoom); @@ -635,7 +635,7 @@ long long write_tile(char **geoms, char *metabase, char *stringpool, int z, unsi } if (line_detail == detail && fraction == 1) { /* only write out the next zoom once, even if we retry */ - rewrite(geom, z, nextzoom, maxzoom, bbox, tx, ty, 
buffer, line_detail, within, geompos, geomfile, fname, t, layer, metastart, feature_minzoom, child_shards, max_zoom_increment, original_seq, tippecanoe_minzoom, tippecanoe_maxzoom, segment); + rewrite(geom, z, nextzoom, maxzoom, bbox, tx, ty, buffer, line_detail, within, geompos, geomfile, fname, t, layer, metastart, feature_minzoom, child_shards, max_zoom_increment, original_seq, tippecanoe_minzoom, tippecanoe_maxzoom, segment, initial_x, initial_y); } if (z < minzoom) { @@ -929,6 +929,8 @@ struct write_tile_args { volatile long long *most; long long *meta_off; long long *pool_off; + unsigned *initial_x; + unsigned *initial_y; }; void *run_thread(void *vargs) { @@ -968,7 +970,7 @@ void *run_thread(void *vargs) { // fprintf(stderr, "%d/%u/%u\n", z, x, y); - long long len = write_tile(&geom, arg->metabase, arg->stringpool, z, x, y, z == arg->maxzoom ? arg->full_detail : arg->low_detail, arg->min_detail, arg->basezoom, arg->file_keys, arg->layernames, arg->outdb, arg->droprate, arg->buffer, arg->fname, arg->geomfile, arg->minzoom, arg->maxzoom, arg->todo, geomstart, arg->along, arg->gamma, arg->nlayers, arg->prevent, arg->additional, arg->child_shards, arg->meta_off, arg->pool_off); + long long len = write_tile(&geom, arg->metabase, arg->stringpool, z, x, y, z == arg->maxzoom ? arg->full_detail : arg->low_detail, arg->min_detail, arg->basezoom, arg->file_keys, arg->layernames, arg->outdb, arg->droprate, arg->buffer, arg->fname, arg->geomfile, arg->minzoom, arg->maxzoom, arg->todo, geomstart, arg->along, arg->gamma, arg->nlayers, arg->prevent, arg->additional, arg->child_shards, arg->meta_off, arg->pool_off, arg->initial_x, arg->initial_y); if (len < 0) { int *err = (int *) malloc(sizeof(int)); @@ -1004,7 +1006,7 @@ void *run_thread(void *vargs) { return NULL; } -int traverse_zooms(int *geomfd, off_t *geom_size, char *metabase, char *stringpool, struct pool **file_keys, unsigned *midx, unsigned *midy, char **layernames, int maxzoom, int minzoom, int basezoom, sqlite3 *outdb, double droprate, int buffer, const char *fname, const char *tmpdir, double gamma, int nlayers, char *prevent, char *additional, int full_detail, int low_detail, int min_detail, long long *meta_off, long long *pool_off) { +int traverse_zooms(int *geomfd, off_t *geom_size, char *metabase, char *stringpool, struct pool **file_keys, unsigned *midx, unsigned *midy, char **layernames, int maxzoom, int minzoom, int basezoom, sqlite3 *outdb, double droprate, int buffer, const char *fname, const char *tmpdir, double gamma, int nlayers, char *prevent, char *additional, int full_detail, int low_detail, int min_detail, long long *meta_off, long long *pool_off, unsigned *initial_x, unsigned *initial_y) { int i; for (i = 0; i <= maxzoom; i++) { long long most = 0; @@ -1130,6 +1132,8 @@ int traverse_zooms(int *geomfd, off_t *geom_size, char *metabase, char *stringpo args[thread].most = &most; // locked with var_lock args[thread].meta_off = meta_off; args[thread].pool_off = pool_off; + args[thread].initial_x = initial_x; + args[thread].initial_y = initial_y; args[thread].tasks = dispatches[thread].tasks; diff --git a/tile.h b/tile.h index f7df52f..bca0d4f 100644 --- a/tile.h +++ b/tile.h @@ -27,7 +27,7 @@ struct pool_val *deserialize_string(char **f, struct pool *p, int type); long long write_tile(char **geom, char *metabase, char *stringpool, unsigned *file_bbox, int z, unsigned x, unsigned y, int detail, int min_detail, int basezoom, struct pool **file_keys, char **layernames, sqlite3 *outdb, double droprate, int buffer, const char 
*fname, FILE **geomfile, int file_minzoom, int file_maxzoom, double todo, char *geomstart, long long along, double gamma, int nlayers, char *prevent, char *additional); -int traverse_zooms(int *geomfd, off_t *geom_size, char *metabase, char *stringpool, struct pool **file_keys, unsigned *midx, unsigned *midy, char **layernames, int maxzoom, int minzoom, int basezoom, sqlite3 *outdb, double droprate, int buffer, const char *fname, const char *tmpdir, double gamma, int nlayers, char *prevent, char *additional, int full_detail, int low_detail, int min_detail, long long *meta_off, long long *pool_off); +int traverse_zooms(int *geomfd, off_t *geom_size, char *metabase, char *stringpool, struct pool **file_keys, unsigned *midx, unsigned *midy, char **layernames, int maxzoom, int minzoom, int basezoom, sqlite3 *outdb, double droprate, int buffer, const char *fname, const char *tmpdir, double gamma, int nlayers, char *prevent, char *additional, int full_detail, int low_detail, int min_detail, long long *meta_off, long long *pool_off, unsigned *initial_x, unsigned *initial_y); extern unsigned initial_x, initial_y; extern int geometry_scale; From 2b378ceb9ff86dfa3b14a46093ac7385565e3536 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Mon, 4 Jan 2016 13:39:34 -0800 Subject: [PATCH 18/19] Use multiple threads to sort the features --- geojson.c | 125 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 93 insertions(+), 32 deletions(-) diff --git a/geojson.c b/geojson.c index 996fb18..0662e8b 100644 --- a/geojson.c +++ b/geojson.c @@ -844,6 +844,64 @@ struct reader { char *geom_map; }; +struct sort_arg { + int task; + int cpus; + long long indexpos; + struct merge *merges; + int indexfd; + int nmerges; + long long unit; + int bytes; +}; + +void *run_sort(void *v) { + struct sort_arg *a = v; + + long long start; + for (start = a->task * a->unit; start < a->indexpos; start += a->unit * a->cpus) { + long long end = start + a->unit; + if (end > a->indexpos) { + end = a->indexpos; + } + + if (a->nmerges != 1) { + if (!quiet) { + fprintf(stderr, "Sorting part %lld of %d \r", start / a->unit + 1, a->nmerges); + } + } + + a->merges[start / a->unit].start = start; + a->merges[start / a->unit].end = end; + a->merges[start / a->unit].next = NULL; + + // MAP_PRIVATE to avoid disk writes if it fits in memory + void *map = mmap(NULL, end - start, PROT_READ | PROT_WRITE, MAP_PRIVATE, a->indexfd, start); + if (map == MAP_FAILED) { + perror("mmap"); + exit(EXIT_FAILURE); + } + + qsort(map, (end - start) / a->bytes, a->bytes, indexcmp); + + // Sorting and then copying avoids disk access to + // write out intermediate stages of the sort. 
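+			// Note: the MAP_PRIVATE mapping above is copy-on-write, so the
+			// element swaps qsort makes never reach the file; only the single
+			// memcpy into the MAP_SHARED mapping below is written back to the
+			// index on disk.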
+ + void *map2 = mmap(NULL, end - start, PROT_READ | PROT_WRITE, MAP_SHARED, a->indexfd, start); + if (map2 == MAP_FAILED) { + perror("mmap (write)"); + exit(EXIT_FAILURE); + } + + memcpy(map2, map, end - start); + + munmap(map, end - start); + munmap(map2, end - start); + } + + return NULL; +} + int read_json(int argc, char **argv, char *fname, const char *layername, int maxzoom, int minzoom, int basezoom, double basezoom_marker_width, sqlite3 *outdb, struct pool *exclude, struct pool *include, int exclude_all, double droprate, int buffer, const char *tmpdir, double gamma, char *prevent, char *additional, int read_parallel) { int ret = EXIT_SUCCESS; @@ -1187,6 +1245,12 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max fprintf(stderr, "Sorting %lld features\n", (long long) indexpos / bytes); } + // XXX On machines with different page sizes, doing the sorting + // in different-sized chunks can cause features with the same + // index (i.e., the same bbox) to appear in different orders + // because the sort is unstable. This doesn't seem worth spending + // more memory to fix, but could make tests unstable. + int page = sysconf(_SC_PAGESIZE); long long unit = (50 * 1024 * 1024 / bytes) * bytes; while (unit % page != 0) { @@ -1196,45 +1260,32 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max int nmerges = (indexpos + unit - 1) / unit; struct merge merges[nmerges]; - long long start; - for (start = 0; start < indexpos; start += unit) { - long long end = start + unit; - if (end > indexpos) { - end = indexpos; - } + pthread_t pthreads[CPUS]; + struct sort_arg args[CPUS]; - if (nmerges != 1) { - if (!quiet) { - fprintf(stderr, "Sorting part %lld of %d\r", start / unit + 1, nmerges); - } - } + int i; + for (i = 0; i < CPUS; i++) { + args[i].task = i; + args[i].cpus = CPUS; + args[i].indexpos = indexpos; + args[i].merges = merges; + args[i].indexfd = indexfd; + args[i].nmerges = nmerges; + args[i].unit = unit; + args[i].bytes = bytes; - merges[start / unit].start = start; - merges[start / unit].end = end; - merges[start / unit].next = NULL; - - // MAP_PRIVATE to avoid disk writes if it fits in memory - void *map = mmap(NULL, end - start, PROT_READ | PROT_WRITE, MAP_PRIVATE, indexfd, start); - if (map == MAP_FAILED) { - perror("mmap"); + if (pthread_create(&pthreads[i], NULL, run_sort, &args[i]) != 0) { + perror("pthread_create"); exit(EXIT_FAILURE); } + } - qsort(map, (end - start) / bytes, bytes, indexcmp); + for (i = 0; i < CPUS; i++) { + void *retval; - // Sorting and then copying avoids the need to - // write out intermediate stages of the sort. - - void *map2 = mmap(NULL, end - start, PROT_READ | PROT_WRITE, MAP_SHARED, indexfd, start); - if (map2 == MAP_FAILED) { - perror("mmap (write)"); - exit(EXIT_FAILURE); + if (pthread_join(pthreads[i], &retval) != 0) { + perror("pthread_join"); } - - memcpy(map2, map, end - start); - - munmap(map, end - start); - munmap(map2, end - start); } if (nmerges != 1) { @@ -1505,7 +1556,17 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max long long i; long long sum = 0; long long progress = 0; + unsigned long long prev = 0; for (i = 0; i < indexpos / sizeof(struct index); i++) { + if (index_map[i].index < prev) { + unsigned x, y; + decode(index_map[i].index, &x, &y); + double lat, lon; + tile2latlon(x, y, 32, &lat, &lon); + fprintf(stderr, "Internal error: index out of order. 
%lld vs %lld: %lf,%lf\n", index_map[i].index, prev, lat, lon); + } + prev = index_map[i].index; + fwrite_check(reader[index_map[i].segment].geom_map + index_map[i].start, sizeof(char), index_map[i].end - index_map[i].start, geomfile, fname); sum += index_map[i].end - index_map[i].start; geompos += index_map[i].end - index_map[i].start; From 1d772615098eca435d34ea46c7d1150b883f3f19 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Mon, 4 Jan 2016 17:20:38 -0800 Subject: [PATCH 19/19] Bump version number --- CHANGELOG.md | 4 ++++ version.h | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ed8a05..a15b744 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 1.6.0 + +* Add option of parallelized input when reading from a line-delimited file + ## 1.5.1 * Fix internal error when number of CPUs is not a power of 2 diff --git a/version.h b/version.h index 21c9cfb..7ca57b9 100644 --- a/version.h +++ b/version.h @@ -1 +1 @@ -#define VERSION "tippecanoe v1.5.1\n" +#define VERSION "tippecanoe v1.6.0\n"
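Taken together, the parallel-read patches reduce to the pattern below: one argument struct and one thread per newline-aligned segment, each writing to its own reader's temporary files, joined before those files are merged. This is a simplified sketch rather than the patch code itself; `parse_segment`, `struct segment_args`, and `NSEGS` stand in for `run_parse_json`, `struct parse_json_args`, and `CPUS`, and the body that would call `parse_json` is stubbed out.

```c
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NSEGS 4 /* stand-in for CPUS */

struct segment_args {
	const char *start; /* start of this thread's slice of the mapped input */
	long long len;     /* slice length, ending on a newline boundary */
	int segment;       /* index of the per-reader temporary files to write */
};

/* Stand-in for run_parse_json(): each thread parses only its own slice
 * and appends to its own reader's meta/geom/index files, so the readers
 * never contend with each other. */
static void *parse_segment(void *v) {
	struct segment_args *a = v;
	fprintf(stderr, "segment %d: %lld bytes\n", a->segment, a->len);
	return NULL;
}

static void parse_in_parallel(const char *map, const long long *segs) {
	struct segment_args args[NSEGS];
	pthread_t threads[NSEGS];

	for (int i = 0; i < NSEGS; i++) {
		args[i].start = map + segs[i];
		args[i].len = segs[i + 1] - segs[i];
		args[i].segment = i;

		if (pthread_create(&threads[i], NULL, parse_segment, &args[i]) != 0) {
			perror("pthread_create");
			exit(EXIT_FAILURE);
		}
	}

	for (int i = 0; i < NSEGS; i++) {
		if (pthread_join(threads[i], NULL) != 0) {
			perror("pthread_join");
		}
	}
}
```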