From c2093329b19c3a0b2080f744bce8167bfed60aa4 Mon Sep 17 00:00:00 2001
From: Eric Fischer
Date: Wed, 20 Sep 2017 15:38:37 -0700
Subject: [PATCH] Read Shapefile and DBF headers and skim through the files

---
Reviewer notes (below the "---", so not part of the commit message):

 - This copy of the patch had its line breaks and indentation collapsed
   and every angle-bracketed token removed. Indentation, template
   arguments, and system header names below are reconstructed and must
   be confirmed against the tree. The author's e-mail address is still
   missing, and the blob hashes on the "index" lines predate the changes
   listed here.
 - shapefile.cpp: the byte decoders widen to unsigned before shifting;
   toDouble() copies with memcpy() instead of dereferencing a cast
   pointer; the variable-length arrays sized from file contents are
   replaced by std::vector; record lengths are computed in 64 bits and
   checked against the file length from the .shp header; the DBF column
   loop no longer reads past its buffer; a zero DBF record length is
   rejected; short reads at end of file are reported separately from
   I/O errors.
 - main.cpp: parse_shapefile() is passed sources[layer].layer while the
   file name comes from sources[source].file. Presumably this mirrors
   existing code; confirm that sources[source].layer is not what is
   meant.

 Makefile      |   2 +-
 main.cpp      |  44 ++++++++++++++++
 shapefile.cpp | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++
 shapefile.hpp |  13 +++++
 4 files changed, 256 insertions(+), 1 deletion(-)
 create mode 100644 shapefile.cpp
 create mode 100644 shapefile.hpp

diff --git a/Makefile b/Makefile
index b4b2773..ad99009 100644
--- a/Makefile
+++ b/Makefile
@@ -46,7 +46,7 @@ C = $(wildcard *.c) $(wildcard *.cpp)
 INCLUDES = -I/usr/local/include -I.
 LIBS = -L/usr/local/lib
 
-tippecanoe: geojson.o jsonpull/jsonpull.o tile.o pool.o mbtiles.o geometry.o projection.o memfile.o mvt.o serial.o main.o text.o dirtiles.o plugin.o read_json.o write_json.o geobuf.o evaluator.o
+tippecanoe: geojson.o jsonpull/jsonpull.o tile.o pool.o mbtiles.o geometry.o projection.o memfile.o mvt.o serial.o main.o text.o dirtiles.o plugin.o read_json.o write_json.o geobuf.o shapefile.o evaluator.o
 	$(CXX) $(PG) $(LIBS) $(FINAL_FLAGS) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) -lm -lz -lsqlite3 -lpthread
 
 tippecanoe-enumerate: enumerate.o
diff --git a/main.cpp b/main.cpp
index fffaea1..50e8339 100644
--- a/main.cpp
+++ b/main.cpp
@@ -51,6 +51,7 @@
 #include "main.hpp"
 #include "geojson.hpp"
 #include "geobuf.hpp"
+#include "shapefile.hpp"
 #include "geometry.hpp"
 #include "serial.hpp"
 #include "options.hpp"
@@ -1267,6 +1268,49 @@ int read_input(std::vector &sources, char *fname, int maxzoom, int minzo
 			continue;
 		}
 
+		if (sources[source].file.size() > 4 && sources[source].file.substr(sources[source].file.size() - 4) == std::string(".shp")) {
+			long long layer_seq[CPUS];
+			double dist_sums[CPUS];
+			size_t dist_counts[CPUS];
+			struct serialization_state sst[CPUS];
+
+			// XXX factor out this duplicated setup
+			for (size_t i = 0; i < CPUS; i++) {
+				layer_seq[i] = overall_offset;
+				dist_sums[i] = 0;
+				dist_counts[i] = 0;
+
+				sst[i].fname = reading.c_str();
+				sst[i].line = 0;
+				sst[i].layer_seq = &layer_seq[i];
+				sst[i].progress_seq = &progress_seq;
+				sst[i].readers = readers;
+				sst[i].segment = i;
+				sst[i].initial_x = &initial_x[i];
+				sst[i].initial_y = &initial_y[i];
+				sst[i].initialized = &initialized[i];
+				sst[i].dist_sum = &dist_sums[i];
+				sst[i].dist_count = &dist_counts[i];
+				sst[i].want_dist = guess_maxzoom;
+				sst[i].maxzoom = maxzoom;
+				sst[i].filters = prefilter != NULL || postfilter != NULL;
+				sst[i].uses_gamma = uses_gamma;
+				sst[i].layermap = &layermaps[i];
+				sst[i].exclude = exclude;
+				sst[i].include = include;
+				sst[i].exclude_all = exclude_all;
+				sst[i].filter = filter;
+				sst[i].basezoom = basezoom;
+				sst[i].attribute_types = attribute_types;
+			}
+
+			parse_shapefile(sst, sources[source].file, layer, sources[layer].layer);
+
+			overall_offset = layer_seq[0];
+			checkdisk(readers, CPUS);
+			continue;
+		}
+
 		struct stat st;
 		char *map = NULL;
 		off_t off = 0;
diff --git a/shapefile.cpp b/shapefile.cpp
new file mode 100644
index 0000000..841f182
--- /dev/null
+++ b/shapefile.cpp
@@ -0,0 +1,198 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <vector>
+#include "shapefile.hpp"
+
+// Decode the 4 bytes at ba as a little-endian unsigned 32-bit integer.
+// Each byte is widened to unsigned before shifting so that a top byte
+// >= 0x80 is never shifted into the sign bit of a promoted int.
+static unsigned int read32le(const unsigned char *ba) {
+	return ((unsigned int) ba[0]) |
+	       ((unsigned int) ba[1] << 8) |
+	       ((unsigned int) ba[2] << 16) |
+	       ((unsigned int) ba[3] << 24);
+}
+
+// Decode the 2 bytes at ba as a little-endian unsigned 16-bit integer.
+static unsigned int read16le(const unsigned char *ba) {
+	return ((unsigned int) ba[0]) |
+	       ((unsigned int) ba[1] << 8);
+}
+
+// Decode the 8 bytes at ba as a little-endian unsigned 64-bit integer.
+static unsigned long long read64le(const unsigned char *ba) {
+	return ((unsigned long long) read32le(ba)) |
+	       (((unsigned long long) read32le(ba + 4)) << 32);
+}
+
+// Decode the 4 bytes at ba as a big-endian unsigned 32-bit integer.
+static unsigned int read32be(const unsigned char *ba) {
+	return ((unsigned int) ba[0] << 24) |
+	       ((unsigned int) ba[1] << 16) |
+	       ((unsigned int) ba[2] << 8) |
+	       ((unsigned int) ba[3]);
+}
+
+// Reinterpret the 8 bytes at ba as a double in host byte order.
+// The bytes are copied rather than dereferenced through a cast pointer
+// because ba may not be suitably aligned for a double.
+static double toDouble(const unsigned char *ba) {
+	if (sizeof(double) != 8) {
+		fprintf(stderr, "Internal error: wrong floating point size\n");
+		exit(EXIT_FAILURE);
+	}
+
+	double d;
+	memcpy(&d, ba, sizeof(d));
+	return d;
+}
+
+// Read exactly len bytes from fp into buf, or exit. perror() is used only
+// when the stream reports an error; a short read at end of file is not an
+// errno condition, so it gets its own message.
+static void read_or_die(FILE *fp, unsigned char *buf, size_t len, const char *what) {
+	if (fread(buf, 1, len, fp) != len) {
+		if (ferror(fp)) {
+			perror(what);
+		} else {
+			fprintf(stderr, "%s: unexpected end of file\n", what);
+		}
+		exit(EXIT_FAILURE);
+	}
+}
+
+// Skim through the shapefile fname and the .dbf file alongside it:
+// validate both headers, collect the DBF column descriptors, and read
+// every attribute record together with its geometry record, checking
+// that the two files stay in step. Exits on any inconsistency.
+// sst, layer, and layername are not used yet.
+void parse_shapefile(struct serialization_state *sst, std::string fname, int layer, std::string layername) {
+	std::string dbfname = fname.substr(0, fname.size() - 3) + "dbf";
+
+	FILE *shp = fopen(fname.c_str(), "rb");
+	if (shp == NULL) {
+		perror(fname.c_str());
+		exit(EXIT_FAILURE);
+	}
+	FILE *dbf = fopen(dbfname.c_str(), "rb");
+	if (dbf == NULL) {
+		perror(dbfname.c_str());
+		exit(EXIT_FAILURE);
+	}
+
+	unsigned char shpheader[100];
+	read_or_die(shp, shpheader, sizeof(shpheader), "read shapefile header");
+
+	unsigned int magic = read32be(shpheader);
+	unsigned int version = read32le(shpheader + 28);
+
+	if (magic != 9994 || version != 1000) {
+		fprintf(stderr, "%s: not a shapefile (%u %u)\n", fname.c_str(), magic, version);
+		exit(EXIT_FAILURE);
+	}
+
+	// The header gives the file length in 16-bit words, including the
+	// 100-byte header itself. What is left bounds every record length,
+	// so a corrupt record cannot request an enormous buffer.
+	unsigned long long shp_len = 2ULL * read32be(shpheader + 24);
+	if (shp_len < sizeof(shpheader)) {
+		fprintf(stderr, "%s: impossible shapefile length %llu\n", fname.c_str(), shp_len);
+		exit(EXIT_FAILURE);
+	}
+	unsigned long long remaining = shp_len - sizeof(shpheader);
+
+	unsigned char dbfheader[32];
+	read_or_die(dbf, dbfheader, sizeof(dbfheader), "read dbf header");
+
+	unsigned int dbnrec = read32le(dbfheader + 4);
+	unsigned int dbheaderlen = read16le(dbfheader + 8);
+	unsigned int dbreclen = read16le(dbfheader + 10);
+
+	if (dbheaderlen <= 32) {
+		fprintf(stderr, "Impossible length for DBF column header %u\n", dbheaderlen);
+		exit(EXIT_FAILURE);
+	}
+	if (dbreclen == 0) {
+		fprintf(stderr, "Impossible length for DBF record %u\n", dbreclen);
+		exit(EXIT_FAILURE);
+	}
+
+	// Heap buffers instead of variable-length arrays: VLAs are not
+	// standard C++, and these sizes come straight from the input files.
+	size_t dbcol_len = dbheaderlen - 32;
+	std::vector<unsigned char> dbcolumns(dbcol_len);
+	read_or_die(dbf, &dbcolumns[0], dbcol_len, "read dbf column header");
+
+	std::vector<std::string> columns;
+	std::vector<int> column_widths;
+	std::vector<char> column_types;
+
+	// Column descriptors are 32 bytes each and are followed by a 1-byte
+	// 0x0D terminator. Only descriptors that lie entirely inside the
+	// buffer are examined, so an odd header length cannot read past it.
+	for (size_t i = 0; i + 32 <= dbcol_len; i += 32) {
+		if (dbcolumns[i] == 0x0D) {
+			break;
+		}
+
+		size_t j;
+		for (j = i; j < i + 10; j++) {
+			if (dbcolumns[j] == '\0') {
+				break;
+			}
+		}
+
+		columns.push_back(std::string((char *) &dbcolumns[i], j - i));
+		column_widths.push_back(dbcolumns[i + 16]);
+		column_types.push_back(dbcolumns[i + 11]);
+	}
+
+	std::vector<unsigned char> db(dbreclen);
+	std::vector<unsigned char> geom_buf;
+	unsigned seq = 0;
+	while (fread(&db[0], dbreclen, 1, dbf) == 1) {
+		unsigned char shlen[8];
+		if (fread(shlen, 8, 1, shp) != 1) {
+			fprintf(stderr, "Attributes with no shape\n");
+			exit(EXIT_FAILURE);
+		}
+
+		seq++;
+		unsigned fileseq = read32be(shlen);
+		if (fileseq != seq) {
+			fprintf(stderr, "Shapefile out of sequence: found %u for record %u\n", fileseq, seq);
+			exit(EXIT_FAILURE);
+		}
+
+		// The content length is in 16-bit words; compute it in 64 bits
+		// so that doubling a large value cannot wrap around.
+		unsigned long long geom_len = 2ULL * read32be(shlen + 4);
+		if (remaining < sizeof(shlen) || geom_len > remaining - sizeof(shlen)) {
+			fprintf(stderr, "Shapefile record %u is longer than the rest of the file\n", seq);
+			exit(EXIT_FAILURE);
+		}
+		remaining -= sizeof(shlen) + geom_len;
+
+		geom_buf.resize(geom_len);
+		if (geom_len != 0 && fread(&geom_buf[0], 1, geom_len, shp) != geom_len) {
+			fprintf(stderr, "End of file reading geometry\n");
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	if (seq != dbnrec) {
+		fprintf(stderr, "Unexpected number of attributes: %u instead of %u\n", seq, dbnrec);
+		exit(EXIT_FAILURE);
+	}
+
+	if (fclose(shp) != 0) {
+		perror("fclose");
+		exit(EXIT_FAILURE);
+	}
+	if (fclose(dbf) != 0) {
+		perror("fclose");
+		exit(EXIT_FAILURE);
+	}
+}
diff --git a/shapefile.hpp b/shapefile.hpp
new file mode 100644
index 0000000..a51dd9b
--- /dev/null
+++ b/shapefile.hpp
@@ -0,0 +1,13 @@
+#ifndef SHAPEFILE_HPP
+#define SHAPEFILE_HPP
+
+#include <stdio.h>
+#include <set>
+#include <map>
+#include <string>
+#include "mbtiles.hpp"
+#include "serial.hpp"
+
+void parse_shapefile(struct serialization_state *sst, std::string fname, int layer, std::string layername);
+
+#endif