Guess that any non-UTF-8 attributes are encoded as ISO-8859-1

This commit is contained in:
Eric Fischer 2017-09-26 13:02:54 -07:00
parent d75258cc43
commit 27139eb295
3 changed files with 48 additions and 4 deletions

View File

@ -4,6 +4,7 @@
#include "serial.hpp"
#include "projection.hpp"
#include "main.hpp"
#include "text.hpp"
#include "milo/dtoa_milo.h"
static void check(size_t bits, void *p, void *end) {
@ -41,6 +42,30 @@ static double toDouble(unsigned char *ba) {
return *((double *) ba);
}
// Return `s` as UTF-8 text.
//
// If check_utf8() reports no error for `s`, it is returned unchanged.
// Otherwise every byte of `s` is treated as a single code point in the
// range 0..255 (i.e. ISO-8859-1) and re-encoded through to_utf8().
//
// The first time a conversion happens, a warning showing the converted
// string (with trailing spaces removed) is written to stderr; later
// conversions are silent.
std::string forceutf8(std::string const &s) {
	if (check_utf8(s).empty()) {
		return s;
	}

	std::string converted;
	for (char byte : s) {
		// Mask so that a signed char never becomes a huge unsigned value.
		to_utf8(byte & 0xFF, converted);
	}

	static bool already_warned = false;
	if (already_warned) {
		return converted;
	}

	// Only the text shown in the warning is trimmed; the returned string
	// keeps its trailing spaces.
	size_t keep = converted.size();
	while (keep > 0 && converted[keep - 1] == ' ') {
		keep--;
	}
	std::string shown(converted, 0, keep);

	fprintf(stderr, "Warning: string \"%s\" is not UTF-8; assuming ISO-8859-1\n", shown.c_str());
	already_warned = true;

	return converted;
}
drawvec decode_geometry(unsigned char *data, size_t len, int *type) {
drawvec dv;
@ -235,7 +260,7 @@ void parse_shapefile(struct serialization_state *sst, std::string fname, int lay
}
}
columns.push_back(std::string((char *) dbcolumns + i, j - i));
columns.push_back(forceutf8(std::string((char *) dbcolumns + i, j - i)));
column_widths.push_back(dbcolumns[i + 16]);
column_types.push_back(dbcolumns[i + 11]);
}
@ -274,7 +299,7 @@ void parse_shapefile(struct serialization_state *sst, std::string fname, int lay
size_t dbp = 1;
for (size_t i = 0; i < columns.size(); i++) {
std::string s = std::string((char *) (db + dbp), column_widths[i]);
std::string s = forceutf8(std::string((char *) (db + dbp), column_widths[i]));
dbp += column_widths[i];
while (s.size() > 0 && s[s.size() - 1] == ' ') {

View File

@ -5,7 +5,7 @@
* Returns an empty string if `s` is valid utf8;
* otherwise returns an error message.
*/
std::string check_utf8(std::string s) {
std::string check_utf8(std::string const &s) {
for (size_t i = 0; i < s.size(); i++) {
size_t fail = 0;
@ -122,3 +122,21 @@ std::string truncate16(std::string const &s, size_t runes) {
return std::string(s, 0, lastgood - start);
}
// Append the UTF-8 encoding of code point `ch` to `s`.
//
//   ch <= 0x7F    -> 1 byte
//   ch <= 0x7FF   -> 2 bytes
//   ch <= 0xFFFF  -> 3 bytes
//   otherwise     -> 4 bytes
//
// No validation is performed: surrogate values and values above 0x10FFFF
// are encoded mechanically, so callers must pass a sensible code point.
void to_utf8(unsigned ch, std::string &s) {
	if (ch <= 0x7F) {
		s.push_back(static_cast<char>(ch));
	} else if (ch <= 0x7FF) {
		s.push_back(static_cast<char>(0xC0 | (ch >> 6)));
		s.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
	} else if (ch <= 0xFFFF) {
		// The bound is inclusive: U+FFFF still fits in three bytes. Using
		// four bytes for it would produce an overlong (invalid) sequence.
		s.push_back(static_cast<char>(0xE0 | (ch >> 12)));
		s.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3F)));
		s.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
	} else {
		s.push_back(static_cast<char>(0xF0 | (ch >> 18)));
		s.push_back(static_cast<char>(0x80 | ((ch >> 12) & 0x3F)));
		s.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3F)));
		s.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
	}
}

View File

@ -3,8 +3,9 @@
#include <string>
std::string check_utf8(std::string text);
std::string check_utf8(std::string const &text);
const char *utf8_next(const char *s, long *c);
std::string truncate16(std::string const &s, size_t runes);
void to_utf8(unsigned c, std::string &s);
#endif