From 7fcbf9d85ceef2c5c292e0f8873a4afea3633b4b Mon Sep 17 00:00:00 2001 From: Joel Dice Date: Fri, 14 Aug 2009 08:52:31 -0600 Subject: [PATCH] fix reading 2-byte UTF-8 constants --- src/machine.cpp | 35 +++++++++++++++++++++-------------- test/Strings.java | 29 +++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 14 deletions(-) create mode 100644 test/Strings.java diff --git a/src/machine.cpp b/src/machine.cpp index 9905f63ddf..53a134393c 100644 --- a/src/machine.cpp +++ b/src/machine.cpp @@ -19,6 +19,8 @@ using namespace vm; namespace { +const unsigned NoByte = 0xFFFF; + bool find(Thread* t, Thread* o) { @@ -540,9 +542,21 @@ makeByteArray(Thread* t, const char* format, va_list a) return s; } +unsigned +readByte(Stream& s, unsigned* value) +{ + if (*value == NoByte) { + return s.read1(); + } else { + unsigned r = *value; + *value = NoByte; + return r; + } +} + object parseUtf8NonAscii(Thread* t, Stream& s, object bytesSoFar, unsigned byteCount, - unsigned sourceIndex, unsigned lastByteRead) + unsigned sourceIndex, unsigned byteA, unsigned byteB) { PROTECT(t, bytesSoFar); @@ -554,15 +568,14 @@ parseUtf8NonAscii(Thread* t, Stream& s, object bytesSoFar, unsigned byteCount, charArrayBody(t, value, vi) = byteArrayBody(t, bytesSoFar, vi); } - unsigned a = lastByteRead; - unsigned si = sourceIndex; - while (true) { + for (unsigned si = sourceIndex; si < length; ++si) { + unsigned a = readByte(s, &byteA); if (a & 0x80) { if (a & 0x20) { // 3 bytes si += 2; assert(t, si < length); - unsigned b = s.read1(); + unsigned b = readByte(s, &byteB); unsigned c = s.read1(); charArrayBody(t, value, vi++) = ((a & 0xf) << 12) | ((b & 0x3f) << 6) | (c & 0x3f); @@ -570,7 +583,7 @@ parseUtf8NonAscii(Thread* t, Stream& s, object bytesSoFar, unsigned byteCount, // 2 bytes ++ si; assert(t, si < length); - unsigned b = s.read1(); + unsigned b = readByte(s, &byteB); if (a == 0xC0 and b == 0x80) { charArrayBody(t, value, vi++) = 0; @@ -580,12 +593,6 @@ parseUtf8NonAscii(Thread* t, Stream& s, object bytesSoFar, unsigned byteCount, } } else { charArrayBody(t, value, vi++) = a; - } - - if (++si < length) { - a = s.read1(); - } else { - break; } } @@ -611,7 +618,7 @@ parseUtf8(Thread* t, Stream& s, unsigned length) if (a & 0x80) { if (a & 0x20) { // 3 bytes - return parseUtf8NonAscii(t, s, value, vi, si, a); + return parseUtf8NonAscii(t, s, value, vi, si, a, NoByte); } else { // 2 bytes unsigned b = s.read1(); @@ -621,7 +628,7 @@ parseUtf8(Thread* t, Stream& s, unsigned length) assert(t, si < length); byteArrayBody(t, value, vi++) = 0; } else { - return parseUtf8NonAscii(t, s, value, vi, si, a); + return parseUtf8NonAscii(t, s, value, vi, si, a, b); } } } else { diff --git a/test/Strings.java b/test/Strings.java new file mode 100644 index 0000000000..c71fffa1aa --- /dev/null +++ b/test/Strings.java @@ -0,0 +1,29 @@ +public class Strings { + private static void expect(boolean v) { + if (! v) throw new RuntimeException(); + } + + public static void main(String[] args) { + expect(new String(new byte[] { 99, 111, 109, 46, 101, 99, 111, 118, 97, + 116, 101, 46, 110, 97, 116, 46, 98, 117, + 115, 46, 83, 121, 109, 98, 111, 108 }) + .equals("com.ecovate.nat.bus.Symbol")); + + final String months = "Jan\u00aeFeb\u00aeMar\u00ae"; + + System.out.println(months.split("\u00ae")[0]); + System.out.println(months.length()); + System.out.println(months); + for (int i = 0; i < months.length(); ++i) { + System.out.print(Integer.toHexString(months.charAt(i)) + " "); + } + System.out.println(); + + expect(months.split("\u00ae").length == 3); + + StringBuilder sb = new StringBuilder(); + sb.append('$'); + sb.append('2'); + expect(sb.substring(1).equals("2")); + } +}