From 4891348684f74d3ec913df93f342b2ddcfa8828c Mon Sep 17 00:00:00 2001 From: Jeremy Lakeman Date: Mon, 29 Jun 2015 14:30:26 +0930 Subject: [PATCH] Ensure unicode strings are correctly handled --- .../servaldna/ServalDClient.java | 2 +- .../servaldna/meshms/MeshMSCommon.java | 6 +-- .../servaldna/rhizome/RhizomeCommon.java | 6 +-- .../rhizome/RhizomeIncompleteManifest.java | 4 +- strbuf_helpers.c | 53 +++++++++++++++---- 5 files changed, 53 insertions(+), 18 deletions(-) diff --git a/java/org/servalproject/servaldna/ServalDClient.java b/java/org/servalproject/servaldna/ServalDClient.java index d1dfc508..fe65743f 100644 --- a/java/org/servalproject/servaldna/ServalDClient.java +++ b/java/org/servalproject/servaldna/ServalDClient.java @@ -182,7 +182,7 @@ public class ServalDClient implements ServalDHttpConnectionFactory } conn.setAllowUserInteraction(false); try { - conn.addRequestProperty("Authorization", "Basic " + Base64.encode((restfulUsername + ":" + restfulPassword).getBytes("US-ASCII"))); + conn.addRequestProperty("Authorization", "Basic " + Base64.encode((restfulUsername + ":" + restfulPassword).getBytes("UTF-8"))); } catch (UnsupportedEncodingException e) { throw new ServalDInterfaceException("invalid RESTful password", e); diff --git a/java/org/servalproject/servaldna/meshms/MeshMSCommon.java b/java/org/servalproject/servaldna/meshms/MeshMSCommon.java index 46c80ef7..099f0f0c 100644 --- a/java/org/servalproject/servaldna/meshms/MeshMSCommon.java +++ b/java/org/servalproject/servaldna/meshms/MeshMSCommon.java @@ -47,14 +47,14 @@ public class MeshMSCommon if (!"application/json".equals(conn.getContentType())) throw new ServalDInterfaceException("unexpected HTTP Content-Type: " + conn.getContentType()); if (conn.getResponseCode() == HttpURLConnection.HTTP_FORBIDDEN) { - JSONTokeniser json = new JSONTokeniser(new InputStreamReader(conn.getErrorStream(), "US-ASCII")); + JSONTokeniser json = new JSONTokeniser(new InputStreamReader(conn.getErrorStream(), "UTF-8")); Status status = decodeRestfulStatus(json); throwRestfulResponseExceptions(status, conn.getURL()); throw new ServalDInterfaceException("unexpected MeshMS status = " + status.meshms_status_code + ", \"" + status.meshms_status_message + "\""); } for (int code: expected_response_codes) { if (conn.getResponseCode() == code) { - JSONTokeniser json = new JSONTokeniser(new InputStreamReader(conn.getInputStream(), "US-ASCII")); + JSONTokeniser json = new JSONTokeniser(new InputStreamReader(conn.getInputStream(), "UTF-8")); return json; } } @@ -121,7 +121,7 @@ public class MeshMSCommon conn.setRequestProperty("Content-Type", "multipart/form-data; boundary=" + boundary); conn.connect(); OutputStream ost = conn.getOutputStream(); - PrintStream wr = new PrintStream(ost, false, "US-ASCII"); + PrintStream wr = new PrintStream(ost, false, "UTF-8"); wr.print("--" + boundary + "\r\n"); wr.print("Content-Disposition: form-data; name=\"message\"\r\n"); wr.print("Content-Type: text/plain; charset=utf-8\r\n"); diff --git a/java/org/servalproject/servaldna/rhizome/RhizomeCommon.java b/java/org/servalproject/servaldna/rhizome/RhizomeCommon.java index d98901b6..1159e2a5 100644 --- a/java/org/servalproject/servaldna/rhizome/RhizomeCommon.java +++ b/java/org/servalproject/servaldna/rhizome/RhizomeCommon.java @@ -90,7 +90,7 @@ public class RhizomeCommon if (!conn.getContentType().equals("application/json")) throw new ServalDInterfaceException("unexpected HTTP Content-Type: " + conn.getContentType()); if (status.http_status_code >= 300) { - JSONTokeniser json = new JSONTokeniser(new InputStreamReader(conn.getErrorStream(), "US-ASCII")); + JSONTokeniser json = new JSONTokeniser(new InputStreamReader(conn.getErrorStream(), "UTF-8")); decodeRestfulStatus(status, json); } if (status.http_status_code == HttpURLConnection.HTTP_FORBIDDEN) @@ -123,7 +123,7 @@ public class RhizomeCommon Status status = receiveResponse(conn, expected_response_codes); if (!conn.getContentType().equals("application/json")) throw new ServalDInterfaceException("unexpected HTTP Content-Type: " + conn.getContentType()); - return new JSONTokeniser(new InputStreamReader(status.input_stream, "US-ASCII")); + return new JSONTokeniser(new InputStreamReader(status.input_stream, "UTF-8")); } protected static void decodeHeaderBundleStatus(Status status, HttpURLConnection conn) throws ServalDInterfaceException @@ -382,7 +382,7 @@ public class RhizomeCommon conn.setRequestProperty("Content-Type", "multipart/form-data; boundary=" + boundary); conn.connect(); OutputStream ost = conn.getOutputStream(); - PrintStream wr = new PrintStream(ost, false, "US-ASCII"); + PrintStream wr = new PrintStream(ost, false, "UTF-8"); wr.print(new Object(){}.getClass().getEnclosingClass().getName()); if (author != null) { wr.print("\r\n--" + boundary + "\r\n"); diff --git a/java/org/servalproject/servaldna/rhizome/RhizomeIncompleteManifest.java b/java/org/servalproject/servaldna/rhizome/RhizomeIncompleteManifest.java index 5dbc50a1..f6613675 100644 --- a/java/org/servalproject/servaldna/rhizome/RhizomeIncompleteManifest.java +++ b/java/org/servalproject/servaldna/rhizome/RhizomeIncompleteManifest.java @@ -82,7 +82,7 @@ public class RhizomeIncompleteManifest { */ public void toTextFormat(OutputStream os) throws IOException { - OutputStreamWriter osw = new OutputStreamWriter(os, "US-ASCII"); + OutputStreamWriter osw = new OutputStreamWriter(os, "UTF-8"); if (id != null) osw.write("id=" + id.toHex() + "\n"); if (version != null) @@ -161,7 +161,7 @@ public class RhizomeIncompleteManifest { public void parseTextFormat(InputStream in) throws IOException, RhizomeManifestParseException { try { - InputStreamReader inr = new InputStreamReader(in, "US-ASCII"); + InputStreamReader inr = new InputStreamReader(in, "UTF-8"); int pos = 0; int lnum = 1; int eq = -1; diff --git a/strbuf_helpers.c b/strbuf_helpers.c index 140957de..08276624 100644 --- a/strbuf_helpers.c +++ b/strbuf_helpers.c @@ -663,7 +663,35 @@ strbuf strbuf_json_boolean(strbuf sb, int boolean) return sb; } -static void _json_char(strbuf sb, char c) +static const uint32_t offsetsFromUTF8[6] = { + 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL +}; + +// is start of UTF sequence +static uint8_t isutf(char c) { + return (c & 0xC0) != 0x80; +} + +static uint32_t u8_nextchar(const char *s, unsigned *i) +{ + if (!s[*i]) + return 0; + + uint32_t ch = 0; + int sz = 0; + + do { + ch <<= 6; + ch += (unsigned char)s[(*i)++]; + sz++; + } while (s[*i] && !isutf(s[*i])); + ch -= offsetsFromUTF8[sz-1]; + + return ch; +} + +static void _json_char(strbuf sb, uint32_t c) { if (c == '"' || c == '\\') { strbuf_putc(sb, '\\'); @@ -679,8 +707,8 @@ static void _json_char(strbuf sb, char c) strbuf_puts(sb, "\\r"); else if (c == '\t') strbuf_puts(sb, "\\t"); - else if (iscntrl(c)) - strbuf_sprintf(sb, "\\u%04X", (unsigned char) c); + else if (c>0x7f || iscntrl(c)) + strbuf_sprintf(sb, "\\u%04X", c); else strbuf_putc(sb, c); } @@ -689,8 +717,10 @@ strbuf strbuf_json_string(strbuf sb, const char *str) { if (str) { strbuf_putc(sb, '"'); - for (; *str; ++str) - _json_char(sb, *str); + unsigned pos=0; + uint32_t c; + while((c = u8_nextchar(str, &pos))) + _json_char(sb, c); strbuf_putc(sb, '"'); } else strbuf_json_null(sb); @@ -699,10 +729,15 @@ strbuf strbuf_json_string(strbuf sb, const char *str) strbuf strbuf_json_string_len(strbuf sb, const char *str, size_t strlen) { - strbuf_putc(sb, '"'); - for (; strlen; --strlen, ++str) - _json_char(sb, *str); - strbuf_putc(sb, '"'); + if (str && strlen){ + strbuf_putc(sb, '"'); + unsigned pos=0; + uint32_t c; + while(pos