#include #include #include #include #include #include #include #include #include #include "jsonpull/jsonpull.h" #include "csv.hpp" #include "text.hpp" int fail = EXIT_SUCCESS; bool wrap = false; const char *extract = NULL; FILE *csvfile = NULL; std::vector header; std::vector fields; std::string buffered; int buffered_type = -1; // 0: nothing yet // 1: buffered a line // 2: wrote the line and the wrapper int buffer_state = 0; std::vector decode32(const char *s) { std::vector utf32; while (*s != '\0') { unsigned long b = *(s++) & 0xFF; if (b < 0x80) { utf32.push_back(b); } else if ((b & 0xe0) == 0xc0) { unsigned long c = (b & 0x1f) << 6; unsigned long b1 = *(s++) & 0xFF; if ((b1 & 0xc0) == 0x80) { c |= b1 & 0x3f; utf32.push_back(c); } else { s--; utf32.push_back(0xfffd); } } else if ((b & 0xf0) == 0xe0) { unsigned long c = (b & 0x0f) << 12; unsigned long b1 = *(s++) & 0xFF; if ((b1 & 0xc0) == 0x80) { c |= (b1 & 0x3f) << 6; unsigned long b2 = *(s++) & 0xFF; if ((b2 & 0xc0) == 0x80) { c |= b2 & 0x3f; utf32.push_back(c); } else { s -= 2; utf32.push_back(0xfffd); } } else { s--; utf32.push_back(0xfffd); } } else if ((b & 0xf8) == 0xf0) { unsigned long c = (b & 0x07) << 18; unsigned long b1 = *(s++) & 0xFF; if ((b1 & 0xc0) == 0x80) { c |= (b1 & 0x3f) << 12; unsigned long b2 = *(s++) & 0xFF; if ((b2 & 0xc0) == 0x80) { c |= (b2 & 0x3f) << 6; unsigned long b3 = *(s++) & 0xFF; if ((b3 & 0xc0) == 0x80) { c |= b3 & 0x3f; utf32.push_back(c); } else { s -= 3; utf32.push_back(0xfffd); } } else { s -= 2; utf32.push_back(0xfffd); } } else { s -= 1; utf32.push_back(0xfffd); } } else { utf32.push_back(0xfffd); } } return utf32; } // This uses a really weird encoding for strings // so that they will sort in UTF-32 order in spite of quoting std::string sort_quote(const char *s) { std::vector utf32 = decode32(s); std::string ret; for (size_t i = 0; i < utf32.size(); i++) { if (utf32[i] < 0xD800) { char buf[7]; sprintf(buf, "\\u%04lu", utf32[i]); ret.append(std::string(buf)); } else { unsigned long c = utf32[i]; if (c <= 0x7f) { ret.push_back(c); } else if (c <= 0x7ff) { ret.push_back(0xc0 | (c >> 6)); ret.push_back(0x80 | (c & 0x3f)); } else if (c <= 0xffff) { ret.push_back(0xe0 | (c >> 12)); ret.push_back(0x80 | ((c >> 6) & 0x3f)); ret.push_back(0x80 | (c & 0x3f)); } else { ret.push_back(0xf0 | (c >> 18)); ret.push_back(0x80 | ((c >> 12) & 0x3f)); ret.push_back(0x80 | ((c >> 6) & 0x3f)); ret.push_back(0x80 | (c & 0x3f)); } } } return ret; } void out(std::string const &s, int type, json_object *properties) { if (extract != NULL) { std::string extracted = sort_quote("null"); bool found = false; json_object *o = json_hash_get(properties, extract); if (o != NULL) { found = true; if (o->type == JSON_STRING || o->type == JSON_NUMBER) { extracted = sort_quote(o->string); } else { // Don't really know what to do about sort quoting // for arbitrary objects const char *out = json_stringify(o); extracted = sort_quote(out); free((void *) out); } } if (!found) { static bool warned = false; if (!warned) { fprintf(stderr, "Warning: extract key \"%s\" not found in JSON\n", extract); warned = true; } } printf("{\"%s\":%s}\n", extracted.c_str(), s.c_str()); return; } if (!wrap) { printf("%s\n", s.c_str()); return; } if (buffer_state == 0) { buffered = s; buffered_type = type; buffer_state = 1; return; } if (buffer_state == 1) { if (buffered_type == 1) { printf("{\"type\":\"FeatureCollection\",\"features\":[\n"); } else { printf("{\"type\":\"GeometryCollection\",\"geometries\":[\n"); } printf("%s\n", buffered.c_str()); buffer_state = 2; } printf(",\n%s\n", s.c_str()); if (type != buffered_type) { fprintf(stderr, "Error: mix of bare geometries and features\n"); exit(EXIT_FAILURE); } } std::string prev_joinkey; void join_csv(json_object *j) { if (header.size() == 0) { std::string s = csv_getline(csvfile); if (s.size() == 0) { fprintf(stderr, "Couldn't get column header from CSV file\n"); exit(EXIT_FAILURE); } std::string err = check_utf8(s); if (err != "") { fprintf(stderr, "%s\n", err.c_str()); exit(EXIT_FAILURE); } header = csv_split(s.c_str()); for (size_t i = 0; i < header.size(); i++) { header[i] = csv_dequote(header[i]); } if (header.size() == 0) { fprintf(stderr, "No columns in CSV header \"%s\"\n", s.c_str()); exit(EXIT_FAILURE); } } json_object *properties = json_hash_get(j, "properties"); json_object *key = NULL; if (properties != NULL) { key = json_hash_get(properties, header[0].c_str()); } if (key == NULL) { static bool warned = false; if (!warned) { fprintf(stderr, "Warning: couldn't find CSV key \"%s\" in JSON\n", header[0].c_str()); warned = true; } return; } std::string joinkey; if (key->type == JSON_STRING || key->type == JSON_NUMBER) { joinkey = key->string; } else { const char *s = json_stringify(key); joinkey = s; free((void *) s); } if (joinkey < prev_joinkey) { fprintf(stderr, "GeoJSON file is out of sort: \"%s\" follows \"%s\"\n", joinkey.c_str(), prev_joinkey.c_str()); exit(EXIT_FAILURE); } prev_joinkey = joinkey; if (fields.size() == 0 || joinkey > fields[0]) { std::string prevkey; if (fields.size() > 0) { prevkey = fields[0]; } while (true) { std::string s = csv_getline(csvfile); if (s.size() == 0) { fields.clear(); break; } std::string err = check_utf8(s); if (err != "") { fprintf(stderr, "%s\n", err.c_str()); exit(EXIT_FAILURE); } fields = csv_split(s.c_str()); for (size_t i = 0; i < fields.size(); i++) { fields[i] = csv_dequote(fields[i]); } if (fields.size() > 0 && fields[0] < prevkey) { fprintf(stderr, "CSV file is out of sort: \"%s\" follows \"%s\"\n", fields[0].c_str(), prevkey.c_str()); exit(EXIT_FAILURE); } if (fields.size() > 0 && fields[0] >= joinkey) { break; } if (fields.size() > 0) { prevkey = fields[0]; } } } if (fields.size() > 0 && joinkey == fields[0]) { // This knows more about the structure of JSON objects than it ought to properties->keys = (json_object **) realloc((void *) properties->keys, (properties->length + 32 + fields.size()) * sizeof(json_object *)); properties->values = (json_object **) realloc((void *) properties->values, (properties->length + 32 + fields.size()) * sizeof(json_object *)); if (properties->keys == NULL || properties->values == NULL) { perror("realloc"); exit(EXIT_FAILURE); } for (size_t i = 1; i < fields.size(); i++) { std::string k = header[i]; std::string v = fields[i]; json_type attr_type = JSON_STRING; if (v.size() > 0) { if (v[0] == '"') { v = csv_dequote(v); } else if (is_number(v)) { attr_type = JSON_NUMBER; } // This knows more about the structure of JSON objects than it ought to json_object *ko = (json_object *) malloc(sizeof(json_object)); json_object *vo = (json_object *) malloc(sizeof(json_object)); if (ko == NULL || vo == NULL) { perror("malloc"); exit(EXIT_FAILURE); } ko->type = JSON_STRING; vo->type = attr_type; ko->parent = vo->parent = properties; ko->array = vo->array = NULL; ko->keys = vo->keys = NULL; ko->values = vo->values = NULL; ko->parser = vo->parser = properties->parser; ko->string = strdup(k.c_str()); vo->string = strdup(v.c_str()); if (ko->string == NULL || vo->string == NULL) { perror("strdup"); exit(EXIT_FAILURE); } ko->length = strlen(ko->string); vo->length = strlen(vo->string); vo->number = atof(vo->string); properties->keys[properties->length] = ko; properties->values[properties->length] = vo; properties->length++; } } } } void process(FILE *fp, const char *fname) { json_pull *jp = json_begin_file(fp); while (1) { json_object *j = json_read(jp); if (j == NULL) { if (jp->error != NULL) { fprintf(stderr, "%s:%d: %s\n", fname, jp->line, jp->error); } json_free(jp->root); break; } json_object *type = json_hash_get(j, "type"); if (type == NULL || type->type != JSON_STRING) { continue; } if (strcmp(type->string, "Feature") == 0) { if (csvfile != NULL) { join_csv(j); } char *s = json_stringify(j); out(s, 1, json_hash_get(j, "properties")); free(s); json_free(j); } else if (strcmp(type->string, "Point") == 0 || strcmp(type->string, "MultiPoint") == 0 || strcmp(type->string, "LineString") == 0 || strcmp(type->string, "MultiLineString") == 0 || strcmp(type->string, "MultiPolygon") == 0) { int is_geometry = 1; if (j->parent != NULL) { if (j->parent->type == JSON_ARRAY && j->parent->parent != NULL) { if (j->parent->parent->type == JSON_HASH) { json_object *geometries = json_hash_get(j->parent->parent, "geometries"); if (geometries != NULL) { // Parent of Parent must be a GeometryCollection is_geometry = 0; } } } else if (j->parent->type == JSON_HASH) { json_object *geometry = json_hash_get(j->parent, "geometry"); if (geometry != NULL) { // Parent must be a Feature is_geometry = 0; } } } if (is_geometry) { char *s = json_stringify(j); out(s, 2, NULL); free(s); json_free(j); } } else if (strcmp(type->string, "FeatureCollection") == 0) { json_free(j); } } json_end(jp); } int main(int argc, char **argv) { const char *csv = NULL; struct option long_options[] = { {"wrap", no_argument, 0, 'w'}, {"extract", required_argument, 0, 'e'}, {"csv", required_argument, 0, 'c'}, {0, 0, 0, 0}, }; std::string getopt_str; for (size_t lo = 0; long_options[lo].name != NULL; lo++) { if (long_options[lo].val > ' ') { getopt_str.push_back(long_options[lo].val); if (long_options[lo].has_arg == required_argument) { getopt_str.push_back(':'); } } } extern int optind; int i; while ((i = getopt_long(argc, argv, getopt_str.c_str(), long_options, NULL)) != -1) { switch (i) { case 'w': wrap = true; break; case 'e': extract = optarg; break; case 'c': csv = optarg; break; default: fprintf(stderr, "Unexpected option -%c\n", i); exit(EXIT_FAILURE); } } if (extract != NULL && wrap) { fprintf(stderr, "%s: --wrap and --extract not supported together\n", argv[0]); exit(EXIT_FAILURE); } if (csv != NULL) { csvfile = fopen(csv, "r"); if (csvfile == NULL) { perror(csv); exit(EXIT_FAILURE); } } if (optind >= argc) { process(stdin, "standard input"); } else { for (i = optind; i < argc; i++) { FILE *f = fopen(argv[i], "r"); if (f == NULL) { perror(argv[i]); exit(EXIT_FAILURE); } process(f, argv[i]); fclose(f); } } if (buffer_state == 1) { printf("%s\n", buffered.c_str()); } else if (buffer_state == 2) { printf("]}\n"); } if (csvfile != NULL) { if (fclose(csvfile) != 0) { perror("close"); exit(EXIT_FAILURE); } } return fail; }