From 27139eb295e0d0298874b4736713f0512c615530 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Tue, 26 Sep 2017 13:02:54 -0700 Subject: [PATCH] Guess that any non-UTF-8 attributes are encoded as ISO-8859-1 --- shapefile.cpp | 29 +++++++++++++++++++++++++++-- text.cpp | 20 +++++++++++++++++++- text.hpp | 3 ++- 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/shapefile.cpp b/shapefile.cpp index 35c642a..1be484d 100644 --- a/shapefile.cpp +++ b/shapefile.cpp @@ -4,6 +4,7 @@ #include "serial.hpp" #include "projection.hpp" #include "main.hpp" +#include "text.hpp" #include "milo/dtoa_milo.h" static void check(size_t bits, void *p, void *end) { @@ -41,6 +42,30 @@ static double toDouble(unsigned char *ba) { return *((double *) ba); } +std::string forceutf8(std::string const &s) { + if (check_utf8(s).size() == 0) { + return s; + } + + std::string out; + for (size_t i = 0; i < s.size(); i++) { + to_utf8(s[i] & 0xFF, out); + } + + static bool warned = false; + if (!warned) { + std::string trimmed = out; + while (trimmed.size() > 0 && trimmed[trimmed.size() - 1] == ' ') { + trimmed.pop_back(); + } + + fprintf(stderr, "Warning: string \"%s\" is not UTF-8; assuming ISO-8859-1\n", trimmed.c_str()); + warned = true; + } + + return out; +} + drawvec decode_geometry(unsigned char *data, size_t len, int *type) { drawvec dv; @@ -235,7 +260,7 @@ void parse_shapefile(struct serialization_state *sst, std::string fname, int lay } } - columns.push_back(std::string((char *) dbcolumns + i, j - i)); + columns.push_back(forceutf8(std::string((char *) dbcolumns + i, j - i))); column_widths.push_back(dbcolumns[i + 16]); column_types.push_back(dbcolumns[i + 11]); } @@ -274,7 +299,7 @@ void parse_shapefile(struct serialization_state *sst, std::string fname, int lay size_t dbp = 1; for (size_t i = 0; i < columns.size(); i++) { - std::string s = std::string((char *) (db + dbp), column_widths[i]); + std::string s = forceutf8(std::string((char *) (db + dbp), column_widths[i])); dbp += 
/*
 * Appends the UTF-8 encoding of the code point `ch` to `s`.
 *
 *   ch <= 0x7F    one byte
 *   ch <= 0x7FF   two bytes
 *   ch <= 0xFFFF  three bytes
 *   otherwise     four bytes
 *
 * The three-byte range includes 0xFFFF itself; sending it to the
 * four-byte branch would produce an overlong (invalid) sequence.
 *
 * No validation is performed: surrogate values and values above
 * 0x10FFFF are encoded mechanically using the rules above.
 */
void to_utf8(unsigned ch, std::string &s) {
	if (ch <= 0x7F) {
		s.push_back(static_cast<char>(ch));
	} else if (ch <= 0x7FF) {
		s.push_back(static_cast<char>(0xC0 | (ch >> 6)));
		s.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
	} else if (ch <= 0xFFFF) {
		s.push_back(static_cast<char>(0xE0 | (ch >> 12)));
		s.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3F)));
		s.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
	} else {
		s.push_back(static_cast<char>(0xF0 | (ch >> 18)));
		s.push_back(static_cast<char>(0x80 | ((ch >> 12) & 0x3F)));
		s.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3F)));
		s.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
	}
}