Merge pull request #568 from mapbox/gzip-input

Directly support gzipped GeoJSON files as input
This commit is contained in:
Eric Fischer 2018-05-15 00:08:51 +02:00 committed by GitHub
commit 6db02e8457
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 141 additions and 229 deletions

View File

@ -1,3 +1,7 @@
## 1.28.0
* Directly support gzipped GeoJSON as input files
## 1.27.16
* Fix thread safety issues related to the out-of-disk-space checker

View File

@ -85,9 +85,11 @@ SPACE = $(NULL) $(NULL)
test: tippecanoe tippecanoe-decode $(addsuffix .check,$(TESTS)) raw-tiles-test parallel-test pbf-test join-test enumerate-test decode-test join-filter-test unit json-tool-test allow-existing-test csv-test
./unit
suffixes = json json.gz
# Work around Makefile and filename punctuation limits: _ for space, @ for :, % for /
%.json.check:
./tippecanoe -a@ -f -o $@.mbtiles $(subst @,:,$(subst %,/,$(subst _, ,$(patsubst %.json.check,%,$(word 4,$(subst /, ,$@)))))) $(wildcard $(subst $(SPACE),/,$(wordlist 1,2,$(subst /, ,$@)))/*.json) < /dev/null
./tippecanoe -a@ -f -o $@.mbtiles $(subst @,:,$(subst %,/,$(subst _, ,$(patsubst %.json.check,%,$(word 4,$(subst /, ,$@)))))) $(foreach suffix,$(suffixes),$(wildcard $(subst $(SPACE),/,$(wordlist 1,2,$(subst /, ,$@)))/*.$(suffix))) < /dev/null
./tippecanoe-decode $@.mbtiles > $@.out
cmp $@.out $(patsubst %.check,%,$@)
rm $@.out $@.mbtiles
@ -101,7 +103,8 @@ fewer-tests: tippecanoe tippecanoe-decode geobuf-test raw-tiles-test parallel-te
# XXX Use proper makefile rules instead of a for loop
%.json.checkbuf:
for i in $(wildcard $(subst $(SPACE),/,$(wordlist 1,2,$(subst /, ,$@)))/*.json); do ./tippecanoe-json-tool -w $$i | ./node_modules/geobuf/bin/json2geobuf > $$i.geobuf; done
./tippecanoe -a@ -f -o $@.mbtiles $(subst @,:,$(subst %,/,$(subst _, ,$(patsubst %.json.checkbuf,%,$(word 4,$(subst /, ,$@)))))) $(addsuffix .geobuf,$(wildcard $(subst $(SPACE),/,$(wordlist 1,2,$(subst /, ,$@)))/*.json)) < /dev/null
for i in $(wildcard $(subst $(SPACE),/,$(wordlist 1,2,$(subst /, ,$@)))/*.json.gz); do gzip -dc $$i | ./tippecanoe-json-tool -w | ./node_modules/geobuf/bin/json2geobuf > $$i.geobuf; done
./tippecanoe -a@ -f -o $@.mbtiles $(subst @,:,$(subst %,/,$(subst _, ,$(patsubst %.json.checkbuf,%,$(word 4,$(subst /, ,$@)))))) $(foreach suffix,$(suffixes),$(addsuffix .geobuf,$(wildcard $(subst $(SPACE),/,$(wordlist 1,2,$(subst /, ,$@)))/*.$(suffix)))) < /dev/null
./tippecanoe-decode $@.mbtiles | sed 's/checkbuf/check/g' > $@.out
cmp $@.out $(patsubst %.checkbuf,%,$@)
rm $@.out $@.mbtiles
@ -160,7 +163,7 @@ pbf-test:
rm tests/pbf/11-328-791.3857.vector.pbf.out
enumerate-test:
./tippecanoe -z5 -f -o tests/ne_110m_admin_0_countries/out/enum.mbtiles tests/ne_110m_admin_0_countries/in.json
./tippecanoe -z5 -f -o tests/ne_110m_admin_0_countries/out/enum.mbtiles tests/ne_110m_admin_0_countries/in.json.gz
./tippecanoe-enumerate tests/ne_110m_admin_0_countries/out/enum.mbtiles > tests/ne_110m_admin_0_countries/out/enum.check
cmp tests/ne_110m_admin_0_countries/out/enum tests/ne_110m_admin_0_countries/out/enum.check
rm tests/ne_110m_admin_0_countries/out/enum.mbtiles tests/ne_110m_admin_0_countries/out/enum.check
@ -280,7 +283,7 @@ csv-test:
prep-test: $(TESTS)
tests/%.json: Makefile tippecanoe tippecanoe-decode
./tippecanoe -f -o $@.check.mbtiles $(subst @,:,$(subst %,/,$(subst _, ,$(patsubst %.json,%,$(word 4,$(subst /, ,$@)))))) $(wildcard $(subst $(SPACE),/,$(wordlist 1,2,$(subst /, ,$@)))/*.json)
./tippecanoe -f -o $@.check.mbtiles $(subst @,:,$(subst %,/,$(subst _, ,$(patsubst %.json,%,$(word 4,$(subst /, ,$@)))))) $(foreach suffix,$(suffixes),$(wildcard $(subst $(SPACE),/,$(wordlist 1,2,$(subst /, ,$@)))/*.$(suffix)))
./tippecanoe-decode $@.check.mbtiles > $@
cmp $(patsubst %.check,%,$@) $@
rm $@.check.mbtiles

View File

@ -56,7 +56,7 @@ Usage
-----
```sh
$ tippecanoe -o file.mbtiles [options] [file.json file.geobuf ...]
$ tippecanoe -o file.mbtiles [options] [file.json file.json.gz file.geobuf ...]
```
If no files are specified, it reads GeoJSON from the standard input.
@ -142,6 +142,7 @@ If your input is formatted as newline-delimited GeoJSON, use `-P` to make input
### Input files and layer names
* _name_`.json` or _name_`.geojson`: Read the named GeoJSON input file into a layer called _name_.
* _name_`.json.gz` or _name_`.geojson.gz`: Read the named gzipped GeoJSON input file into a layer called _name_.
* _name_`.geobuf`: Read the named Geobuf input file into a layer called _name_.
* _name_`.csv`: Read the named CSV input file into a layer called _name_.
* `-l` _name_ or `--layer=`_name_: Use the specified layer name instead of deriving a name from the input filename or output tileset. If there are multiple input files

151
main.cpp
View File

@ -26,6 +26,7 @@
#include <getopt.h>
#include <signal.h>
#include <sys/time.h>
#include <limits.h>
#include <zlib.h>
#include <algorithm>
#include <vector>
#include <string>
@ -476,9 +477,96 @@ void do_read_parallel(char *map, long long len, long long initial_offset, const
}
}
static ssize_t read_stream(json_pull *j, char *buffer, size_t n);
// Input abstraction over either a plain stdio stream or a gzip-compressed
// stream. Exactly one of `fp` and `gz` is expected to be non-NULL; every
// method checks `gz` first. Instances are heap-allocated (see streamfdopen
// and streamfpopen) and destroy themselves in fclose().
struct STREAM {
	FILE *fp = NULL;   // plain, uncompressed input
	gzFile gz = NULL;  // gzip-compressed input

	// Close the underlying stream and free this object.
	// Returns the underlying close status (0 / Z_OK on success).
	// NOTE: `delete this` — the object must never be used after this call.
	int fclose() {
		int ret;

		if (gz != NULL) {
			ret = gzclose(gz);
		} else {
			ret = ::fclose(fp);
		}

		delete this;
		return ret;
	}

	// Return the next byte without consuming it, or EOF at end of input.
	int peekc() {
		if (gz != NULL) {
			int c = gzgetc(gz);
			if (c != EOF) {
				gzungetc(c, gz);
			}
			return c;
		} else {
			int c = getc(fp);
			if (c != EOF) {
				ungetc(c, fp);
			}
			return c;
		}
	}

	// Read up to `count` bytes into `out`. Returns the number of bytes
	// actually read (which may be short), or 0 at end of input. Exits the
	// process on a gzip decompression error.
	size_t read(char *out, size_t count) {
		if (gz != NULL) {
			// gzread takes an unsigned length and returns an int, and
			// zlib documents failure when len does not fit in an int.
			// Clamp the request to INT_MAX; callers of read callbacks
			// already tolerate short reads, so this stays correct.
			if (count > (size_t) INT_MAX) {
				count = (size_t) INT_MAX;
			}
			int ret = gzread(gz, out, count);
			if (ret < 0) {
				fprintf(stderr, "%s: Error reading compressed data\n", *av);
				exit(EXIT_FAILURE);
			}
			return ret;
		} else {
			return ::fread(out, 1, count, fp);
		}
	}

	// Start a pull-parser whose input is drawn from this stream
	// via the read_stream callback.
	json_pull *json_begin() {
		return ::json_begin(read_stream, this);
	}
};
// json_pull read callback: forwards the request to the STREAM object
// stashed in the parser's source pointer.
static ssize_t read_stream(json_pull *j, char *buffer, size_t n) {
	STREAM *stream = (STREAM *) j->source;
	return stream->read(buffer, n);
}
// Wrap an already-open file descriptor in a heap-allocated STREAM.
// Files whose names end in ".gz" are opened for transparent gzip
// decompression; anything else is read as a plain stdio stream.
// Exits the process if the descriptor cannot be adopted either way.
STREAM *streamfdopen(int fd, const char *mode, std::string const &fname) {
	STREAM *stream = new STREAM;
	stream->fp = NULL;
	stream->gz = NULL;

	size_t len = fname.size();
	bool gzipped = len > 3 && fname.compare(len - 3, 3, ".gz") == 0;

	if (gzipped) {
		stream->gz = gzdopen(fd, mode);
		if (stream->gz == NULL) {
			fprintf(stderr, "%s: %s: Decompression error\n", *av, fname.c_str());
			exit(EXIT_FAILURE);
		}
	} else {
		stream->fp = fdopen(fd, mode);
		if (stream->fp == NULL) {
			perror(fname.c_str());
			exit(EXIT_FAILURE);
		}
	}

	return stream;
}
// Wrap an existing stdio FILE * (never gzip-compressed) in a
// heap-allocated STREAM.
STREAM *streamfpopen(FILE *fp) {
	STREAM *stream = new STREAM;
	stream->gz = NULL;
	stream->fp = fp;
	return stream;
}
struct read_parallel_arg {
int fd = 0;
FILE *fp = NULL;
STREAM *fp = NULL;
long long offset = 0;
long long len = 0;
std::atomic<int> *is_parsing = NULL;
@ -532,7 +620,7 @@ void *run_read_parallel(void *v) {
if (munmap(map, rpa->len) != 0) {
perror("munmap source file");
}
if (fclose(rpa->fp) != 0) {
if (rpa->fp->fclose() != 0) {
perror("close source file");
exit(EXIT_FAILURE);
}
@ -543,7 +631,7 @@ void *run_read_parallel(void *v) {
return NULL;
}
void start_parsing(int fd, FILE *fp, long long offset, long long len, std::atomic<int> *is_parsing, pthread_t *parallel_parser, bool &parser_created, const char *reading, std::vector<struct reader> *readers, std::atomic<long long> *progress_seq, std::set<std::string> *exclude, std::set<std::string> *include, int exclude_all, json_object *filter, int basezoom, int source, std::vector<std::map<std::string, layermap_entry> > &layermaps, int *initialized, unsigned *initial_x, unsigned *initial_y, int maxzoom, std::string layername, bool uses_gamma, std::map<std::string, int> const *attribute_types, int separator, double *dist_sum, size_t *dist_count, bool want_dist, bool filters) {
void start_parsing(int fd, STREAM *fp, long long offset, long long len, std::atomic<int> *is_parsing, pthread_t *parallel_parser, bool &parser_created, const char *reading, std::vector<struct reader> *readers, std::atomic<long long> *progress_seq, std::set<std::string> *exclude, std::set<std::string> *include, int exclude_all, json_object *filter, int basezoom, int source, std::vector<std::map<std::string, layermap_entry> > &layermaps, int *initialized, unsigned *initial_x, unsigned *initial_y, int maxzoom, std::string layername, bool uses_gamma, std::map<std::string, int> const *attribute_types, int separator, double *dist_sum, size_t *dist_count, bool want_dist, bool filters) {
// This has to kick off an intermediate thread to start the parser threads,
// so the main thread can get back to reading the next input stage while
// the intermediate thread waits for the completion of the parser threads.
@ -1173,30 +1261,24 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
}
std::string trunc = std::string(use);
std::vector<std::string> trim = {
".json",
".geojson",
".geobuf",
".mbtiles",
".gz",
};
// Trim .json or .mbtiles from the name
while (true) {
ssize_t cp;
cp = trunc.find(".json");
if (cp >= 0 && (size_t) cp + 5 == trunc.size()) {
trunc = trunc.substr(0, cp);
continue;
bool again = true;
while (again) {
again = false;
for (size_t i = 0; i < trim.size(); i++) {
if (trunc.size() > trim[i].size() && trunc.substr(trunc.size() - trim[i].size()) == trim[i]) {
trunc = trunc.substr(0, trunc.size() - trim[i].size());
again = true;
}
}
cp = trunc.find(".geojson");
if (cp >= 0 && (size_t) cp + 8 == trunc.size()) {
trunc = trunc.substr(0, cp);
continue;
}
cp = trunc.find(".geobuf");
if (cp >= 0 && (size_t) cp + 7 == trunc.size()) {
trunc = trunc.substr(0, cp);
continue;
}
cp = trunc.find(".mbtiles");
if (cp >= 0 && (size_t) cp + 8 == trunc.size()) {
trunc = trunc.substr(0, cp);
continue;
}
break;
}
// Trim out characters that can't be part of selector
@ -1394,7 +1476,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
int read_parallel_this = read_parallel ? '\n' : 0;
if (1) {
if (!(sources[source].file.size() > 3 && sources[source].file.substr(sources[source].file.size() - 3) == std::string(".gz"))) {
if (fstat(fd, &st) == 0) {
off = lseek(fd, 0, SEEK_CUR);
if (off >= 0) {
@ -1439,7 +1521,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
exit(EXIT_FAILURE);
}
} else {
FILE *fp = fdopen(fd, "r");
STREAM *fp = streamfdopen(fd, "r", sources[layer].file);
if (fp == NULL) {
perror(sources[layer].file.c_str());
if (close(fd) != 0) {
@ -1449,10 +1531,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
continue;
}
int c = getc(fp);
if (c != EOF) {
ungetc(c, fp);
}
int c = fp->peekc();
if (c == 0x1E) {
read_parallel_this = 0x1E;
}
@ -1487,7 +1566,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
char buf[READ_BUF];
int n;
while ((n = fread(buf, sizeof(char), READ_BUF, fp)) > 0) {
while ((n = fp->read(buf, READ_BUF)) > 0) {
fwrite_check(buf, sizeof(char), n, readfp, reading.c_str());
ahead += n;
@ -1506,7 +1585,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
}
fflush(readfp);
start_parsing(readfd, readfp, initial_offset, ahead, &is_parsing, &parallel_parser, parser_created, reading.c_str(), &readers, &progress_seq, exclude, include, exclude_all, filter, basezoom, layer, layermaps, initialized, initial_x, initial_y, maxzoom, sources[layer].layer, gamma != 0, attribute_types, read_parallel_this, &dist_sum, &dist_count, guess_maxzoom, prefilter != NULL || postfilter != NULL);
start_parsing(readfd, streamfpopen(readfp), initial_offset, ahead, &is_parsing, &parallel_parser, parser_created, reading.c_str(), &readers, &progress_seq, exclude, include, exclude_all, filter, basezoom, layer, layermaps, initialized, initial_x, initial_y, maxzoom, sources[layer].layer, gamma != 0, attribute_types, read_parallel_this, &dist_sum, &dist_count, guess_maxzoom, prefilter != NULL || postfilter != NULL);
initial_offset += ahead;
overall_offset += ahead;
@ -1543,7 +1622,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
fflush(readfp);
if (ahead > 0) {
start_parsing(readfd, readfp, initial_offset, ahead, &is_parsing, &parallel_parser, parser_created, reading.c_str(), &readers, &progress_seq, exclude, include, exclude_all, filter, basezoom, layer, layermaps, initialized, initial_x, initial_y, maxzoom, sources[layer].layer, gamma != 0, attribute_types, read_parallel_this, &dist_sum, &dist_count, guess_maxzoom, prefilter != NULL || postfilter != NULL);
start_parsing(readfd, streamfpopen(readfp), initial_offset, ahead, &is_parsing, &parallel_parser, parser_created, reading.c_str(), &readers, &progress_seq, exclude, include, exclude_all, filter, basezoom, layer, layermaps, initialized, initial_x, initial_y, maxzoom, sources[layer].layer, gamma != 0, attribute_types, read_parallel_this, &dist_sum, &dist_count, guess_maxzoom, prefilter != NULL || postfilter != NULL);
if (parser_created) {
if (pthread_join(parallel_parser, NULL) != 0) {
@ -1559,7 +1638,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
// Plain serial reading
std::atomic<long long> layer_seq(overall_offset);
json_pull *jp = json_begin_file(fp);
json_pull *jp = fp->json_begin();
struct serialization_state sst;
sst.fname = reading.c_str();
@ -1591,7 +1670,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
checkdisk(&readers);
}
if (fclose(fp) != 0) {
if (fp->fclose() != 0) {
perror("fclose input");
exit(EXIT_FAILURE);
}

View File

@ -54,7 +54,7 @@ compiler errors.
.PP
.RS
.nf
$ tippecanoe \-o file.mbtiles [options] [file.json file.geobuf ...]
$ tippecanoe \-o file.mbtiles [options] [file.json file.json.gz file.geobuf ...]
.fi
.RE
.PP
@ -149,6 +149,8 @@ or if metadata fields can't be set. You probably don't want to use this.
.IP \(bu 2
\fIname\fP\fB\fC\&.json\fR or \fIname\fP\fB\fC\&.geojson\fR: Read the named GeoJSON input file into a layer called \fIname\fP\&.
.IP \(bu 2
\fIname\fP\fB\fC\&.json.gz\fR or \fIname\fP\fB\fC\&.geojson.gz\fR: Read the named gzipped GeoJSON input file into a layer called \fIname\fP\&.
.IP \(bu 2
\fIname\fP\fB\fC\&.geobuf\fR: Read the named Geobuf input file into a layer called \fIname\fP\&.
.IP \(bu 2
\fIname\fP\fB\fC\&.csv\fR: Read the named CSV input file into a layer called \fIname\fP\&.

File diff suppressed because one or more lines are too long

Binary file not shown.

View File

@ -1390,7 +1390,7 @@ serial_feature next_feature(FILE *geoms, std::atomic<long long> *geompos_in, cha
// Remove nulls, now that the filter has run
for (ssize_t i = sf.keys.size() - 1; i >= 0; i--) {
for (ssize_t i = (ssize_t) sf.keys.size() - 1; i >= 0; i--) {
int type = (stringpool + pool_off[sf.segment])[sf.values[i]];
if (type == mvt_null) {
@ -2367,10 +2367,16 @@ long long write_tile(FILE *geoms, std::atomic<long long> *geompos_in, char *meta
mingap_fraction = mingap_fraction * max_tile_size / compressed.size() * 0.90;
unsigned long long mg = choose_mingap(indices, mingap_fraction);
if (mg <= mingap) {
mg = (mingap + 1) * 1.5;
double nmg = (mingap + 1) * 1.5;
if (mg <= mingap) {
if (nmg <= mingap || nmg > ULONG_MAX) {
mg = ULONG_MAX;
} else {
mg = nmg;
if (mg <= mingap) {
mg = ULONG_MAX;
}
}
}
mingap = mg;

View File

@ -1,6 +1,6 @@
#ifndef VERSION_HPP
#define VERSION_HPP
#define VERSION "tippecanoe v1.27.16\n"
#define VERSION "tippecanoe v1.28.0\n"
#endif