From cde1e60603146b9f261922b6e136cf287e221d4f Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Wed, 17 Jun 2015 17:18:08 -0700 Subject: [PATCH] Use a string pool to avoid duplicating keys and values --- geojson.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++----- tile.cc | 20 ++++++++++---- tile.h | 2 +- 3 files changed, 92 insertions(+), 13 deletions(-) diff --git a/geojson.c b/geojson.c index f1effa0..d8e6483 100644 --- a/geojson.c +++ b/geojson.c @@ -214,7 +214,7 @@ struct pool_val *deserialize_string(char **f, struct pool *p, int type) { return ret; } -int traverse_zooms(int geomfd[4], off_t geom_size[4], char *metabase, unsigned *file_bbox, struct pool **file_keys, unsigned *midx, unsigned *midy, char **layernames, int maxzoom, int minzoom, sqlite3 *outdb, double droprate, int buffer, const char *fname, const char *tmpdir, double gamma, int nlayers, char *prevent) { +int traverse_zooms(int geomfd[4], off_t geom_size[4], char *metabase, char *stringpool, unsigned *file_bbox, struct pool **file_keys, unsigned *midx, unsigned *midy, char **layernames, int maxzoom, int minzoom, sqlite3 *outdb, double droprate, int buffer, const char *fname, const char *tmpdir, double gamma, int nlayers, char *prevent) { int i; for (i = 0; i <= maxzoom; i++) { long long most = 0; @@ -275,7 +275,7 @@ int traverse_zooms(int geomfd[4], off_t geom_size[4], char *metabase, unsigned * // fprintf(stderr, "%d/%u/%u\n", z, x, y); - long long len = write_tile(&geom, metabase, file_bbox, z, x, y, z == maxzoom ? full_detail : low_detail, min_detail, maxzoom, file_keys, layernames, outdb, droprate, buffer, fname, sub, minzoom, maxzoom, todo, geomstart, along, gamma, nlayers, prevent); + long long len = write_tile(&geom, metabase, stringpool, file_bbox, z, x, y, z == maxzoom ? full_detail : low_detail, min_detail, maxzoom, file_keys, layernames, outdb, droprate, buffer, fname, sub, minzoom, maxzoom, todo, geomstart, along, gamma, nlayers, prevent); if (len < 0) { return i - 1; @@ -381,14 +381,49 @@ static void merge(struct merge *merges, int nmerges, unsigned char *map, FILE *f } } +struct stringpool { + char *s; + struct stringpool *left; + struct stringpool *right; + long long off; +} *pooltree = NULL; + +long long addpool(FILE *poolfile, long long *poolpos, char *s) { + struct stringpool **sp = &pooltree; + + while (*sp != NULL) { + int cmp = strcmp(s, (*sp)->s); + if (cmp < 0) { + sp = &((*sp)->left); + } else if (cmp > 0) { + sp = &((*sp)->right); + } else { + return (*sp)->off; + } + } + + *sp = malloc(sizeof(struct stringpool)); + (*sp)->s = strdup(s); // XXX really should be mapped from the pool itself + (*sp)->left = NULL; + (*sp)->right = NULL; + (*sp)->off = *poolpos; + + fwrite_check(s, strlen(s) + 1, sizeof(char), poolfile, "string pool"); + *poolpos += strlen(s) + 1; + + return (*sp)->off; +} + int read_json(int argc, char **argv, char *fname, const char *layername, int maxzoom, int minzoom, sqlite3 *outdb, struct pool *exclude, struct pool *include, int exclude_all, double droprate, int buffer, const char *tmpdir, double gamma, char *prevent) { int ret = EXIT_SUCCESS; char metaname[strlen(tmpdir) + strlen("/meta.XXXXXXXX") + 1]; + char poolname[strlen(tmpdir) + strlen("/pool.XXXXXXXX") + 1]; char geomname[strlen(tmpdir) + strlen("/geom.XXXXXXXX") + 1]; char indexname[strlen(tmpdir) + strlen("/index.XXXXXXXX") + 1]; sprintf(metaname, "%s%s", tmpdir, "/meta.XXXXXXXX"); + sprintf(poolname, "%s%s", tmpdir, "/pool.XXXXXXXX"); sprintf(geomname, "%s%s", tmpdir, "/geom.XXXXXXXX"); sprintf(indexname, "%s%s", tmpdir, "/index.XXXXXXXX"); @@ -397,6 +432,11 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max perror(metaname); exit(EXIT_FAILURE); } + int poolfd = mkstemp(poolname); + if (poolfd < 0) { + perror(poolname); + exit(EXIT_FAILURE); + } int geomfd = mkstemp(geomname); if (geomfd < 0) { perror(geomname); @@ -413,6 +453,11 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max perror(metaname); exit(EXIT_FAILURE); } + FILE *poolfile = fopen(poolname, "wb"); + if (poolfile == NULL) { + perror(poolname); + exit(EXIT_FAILURE); + } FILE *geomfile = fopen(geomname, "wb"); if (geomfile == NULL) { perror(geomname); @@ -424,13 +469,19 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max exit(EXIT_FAILURE); } long long metapos = 0; + long long poolpos = 0; long long geompos = 0; long long indexpos = 0; unlink(metaname); + unlink(poolname); unlink(geomname); unlink(indexname); + // So we still have a legitimate map even if no metadata + fprintf(poolfile, "\n"); + poolpos++; + unsigned file_bbox[] = {UINT_MAX, UINT_MAX, 0, 0}; unsigned midx = 0, midy = 0; long long seq = 0; @@ -592,8 +643,8 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max serialize_int(metafile, m, &metapos, fname); for (i = 0; i < m; i++) { serialize_int(metafile, metatype[i], &metapos, fname); - serialize_string(metafile, metakey[i], &metapos, fname); - serialize_string(metafile, metaval[i], &metapos, fname); + serialize_long_long(metafile, addpool(poolfile, &poolpos, metakey[i]), &metapos, fname); + serialize_long_long(metafile, addpool(poolfile, &poolpos, metaval[i]), &metapos, fname); } long long geomstart = geompos; @@ -667,11 +718,13 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max } fclose(metafile); + fclose(poolfile); fclose(geomfile); fclose(indexfile); struct stat geomst; struct stat metast; + struct stat poolst; if (fstat(geomfd, &geomst) != 0) { perror("stat geom\n"); @@ -681,6 +734,10 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max perror("stat meta\n"); exit(EXIT_FAILURE); } + if (fstat(poolfd, &poolst) != 0) { + perror("stat pool\n"); + exit(EXIT_FAILURE); + } if (geomst.st_size == 0 || metast.st_size == 0) { fprintf(stderr, "did not read any valid geometries\n"); @@ -693,6 +750,12 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max exit(EXIT_FAILURE); } + char *stringpool = (char *) mmap(NULL, poolst.st_size, PROT_READ, MAP_PRIVATE, poolfd, 0); + if (stringpool == MAP_FAILED) { + perror("mmap stringpool"); + exit(EXIT_FAILURE); + } + struct pool file_keys1[nlayers]; struct pool *file_keys[nlayers]; int i; @@ -915,9 +978,9 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max size[j] = 0; } - fprintf(stderr, "%lld features, %lld bytes of geometry, %lld bytes of metadata\n", seq, (long long) geomst.st_size, (long long) metast.st_size); + fprintf(stderr, "%lld features, %lld bytes of geometry, %lld bytes of metadata, %lld bytes of string pool\n", seq, (long long) geomst.st_size, (long long) metast.st_size, (long long) poolst.st_size); - int written = traverse_zooms(fd, size, meta, file_bbox, file_keys, &midx, &midy, layernames, maxzoom, minzoom, outdb, droprate, buffer, fname, tmpdir, gamma, nlayers, prevent); + int written = traverse_zooms(fd, size, meta, stringpool, file_bbox, file_keys, &midx, &midy, layernames, maxzoom, minzoom, outdb, droprate, buffer, fname, tmpdir, gamma, nlayers, prevent); if (maxzoom != written) { fprintf(stderr, "\n\n\n*** NOTE TILES ONLY COMPLETE THROUGH ZOOM %d ***\n\n\n", written); @@ -928,11 +991,17 @@ int read_json(int argc, char **argv, char *fname, const char *layername, int max if (munmap(meta, metast.st_size) != 0) { perror("munmap meta"); } - if (close(metafd) < 0) { perror("close meta"); } + if (munmap(stringpool, poolst.st_size) != 0) { + perror("munmap pool"); + } + if (close(poolfd) < 0) { + perror("close pool"); + } + double minlat = 0, minlon = 0, maxlat = 0, maxlon = 0, midlat = 0, midlon = 0; tile2latlon(midx, midy, maxzoom, &maxlat, &minlon); diff --git a/tile.cc b/tile.cc index 059a618..1cd0691 100644 --- a/tile.cc +++ b/tile.cc @@ -186,7 +186,17 @@ int coalindexcmp(const struct coalesce *c1, const struct coalesce *c2) { return cmp; } -void decode_meta(char **meta, struct pool *keys, struct pool *values, struct pool *file_keys, std::vector *intmeta, char *only) { +struct pool_val *retrieve_string(char **f, struct pool *p, int type, char *stringpool) { + struct pool_val *ret; + long long off; + + deserialize_long_long(f, &off); + ret = pool(p, stringpool + off, type); + + return ret; +} + +void decode_meta(char **meta, char *stringpool, struct pool *keys, struct pool *values, struct pool *file_keys, std::vector *intmeta, char *only) { int m; deserialize_int(meta, &m); @@ -194,13 +204,13 @@ void decode_meta(char **meta, struct pool *keys, struct pool *values, struct poo for (i = 0; i < m; i++) { int t; deserialize_int(meta, &t); - struct pool_val *key = deserialize_string(meta, keys, VT_STRING); + struct pool_val *key = retrieve_string(meta, keys, VT_STRING, stringpool); if (only != NULL && (strcmp(key->s, only) != 0)) { deserialize_int(meta, &t); *meta += t; } else { - struct pool_val *value = deserialize_string(meta, values, t); + struct pool_val *value = retrieve_string(meta, values, t, stringpool); intmeta->push_back(key->n); intmeta->push_back(value->n); @@ -349,7 +359,7 @@ void evaluate(std::vector &features, char *metabase, struct pool *file } #endif -long long write_tile(char **geoms, char *metabase, unsigned *file_bbox, int z, unsigned tx, unsigned ty, int detail, int min_detail, int basezoom, struct pool **file_keys, char **layernames, sqlite3 *outdb, double droprate, int buffer, const char *fname, FILE *geomfile[4], int file_minzoom, int file_maxzoom, double todo, char *geomstart, long long along, double gamma, int nlayers, char *prevent) { +long long write_tile(char **geoms, char *metabase, char *stringpool, unsigned *file_bbox, int z, unsigned tx, unsigned ty, int detail, int min_detail, int basezoom, struct pool **file_keys, char **layernames, sqlite3 *outdb, double droprate, int buffer, const char *fname, FILE *geomfile[4], int file_minzoom, int file_maxzoom, double todo, char *geomstart, long long along, double gamma, int nlayers, char *prevent) { int line_detail; static bool evaluated = false; double oprogress = 0; @@ -614,7 +624,7 @@ long long write_tile(char **geoms, char *metabase, unsigned *file_bbox, int z, u c.metasrc = meta; c.coalesced = false; - decode_meta(&meta, keys[layer], values[layer], file_keys[layer], &c.meta, NULL); + decode_meta(&meta, stringpool, keys[layer], values[layer], file_keys[layer], &c.meta, NULL); features[layer].push_back(c); } } diff --git a/tile.h b/tile.h index 3193231..5494c20 100644 --- a/tile.h +++ b/tile.h @@ -25,4 +25,4 @@ void deserialize_uint(char **f, unsigned *n); void deserialize_byte(char **f, signed char *n); struct pool_val *deserialize_string(char **f, struct pool *p, int type); -long long write_tile(char **geom, char *metabase, unsigned *file_bbox, int z, unsigned x, unsigned y, int detail, int min_detail, int basezoom, struct pool **file_keys, char **layernames, sqlite3 *outdb, double droprate, int buffer, const char *fname, FILE *geomfile[4], int file_minzoom, int file_maxzoom, double todo, char *geomstart, long long along, double gamma, int nlayers, char *prevent); +long long write_tile(char **geom, char *metabase, char *stringpool, unsigned *file_bbox, int z, unsigned x, unsigned y, int detail, int min_detail, int basezoom, struct pool **file_keys, char **layernames, sqlite3 *outdb, double droprate, int buffer, const char *fname, FILE *geomfile[4], int file_minzoom, int file_maxzoom, double todo, char *geomstart, long long along, double gamma, int nlayers, char *prevent);