fix reading 2-byte UTF-8 constants

This commit is contained in:
Joel Dice 2009-08-14 08:52:31 -06:00
parent 6d54b6cec8
commit 7fcbf9d85c
2 changed files with 50 additions and 14 deletions

View File

@ -19,6 +19,8 @@ using namespace vm;
namespace {
const unsigned NoByte = 0xFFFF;
bool
find(Thread* t, Thread* o)
{
@ -540,9 +542,21 @@ makeByteArray(Thread* t, const char* format, va_list a)
return s;
}
// Returns the next byte to decode.  `value` is a one-byte look-ahead
// slot: when it holds anything other than NoByte, that byte is handed
// back and the slot is reset to NoByte so it is consumed only once;
// when it holds NoByte, the byte is read from the stream `s` instead.
unsigned
readByte(Stream& s, unsigned* value)
{
  const unsigned pending = *value;

  if (pending != NoByte) {
    // A byte was already pulled from the stream by the caller; use it
    // and mark the slot as empty.
    *value = NoByte;
    return pending;
  }

  return s.read1();
}
object
parseUtf8NonAscii(Thread* t, Stream& s, object bytesSoFar, unsigned byteCount,
unsigned sourceIndex, unsigned lastByteRead)
unsigned sourceIndex, unsigned byteA, unsigned byteB)
{
PROTECT(t, bytesSoFar);
@ -554,15 +568,14 @@ parseUtf8NonAscii(Thread* t, Stream& s, object bytesSoFar, unsigned byteCount,
charArrayBody(t, value, vi) = byteArrayBody(t, bytesSoFar, vi);
}
unsigned a = lastByteRead;
unsigned si = sourceIndex;
while (true) {
for (unsigned si = sourceIndex; si < length; ++si) {
unsigned a = readByte(s, &byteA);
if (a & 0x80) {
if (a & 0x20) {
// 3 bytes
si += 2;
assert(t, si < length);
unsigned b = s.read1();
unsigned b = readByte(s, &byteB);
unsigned c = s.read1();
charArrayBody(t, value, vi++)
= ((a & 0xf) << 12) | ((b & 0x3f) << 6) | (c & 0x3f);
@ -570,7 +583,7 @@ parseUtf8NonAscii(Thread* t, Stream& s, object bytesSoFar, unsigned byteCount,
// 2 bytes
++ si;
assert(t, si < length);
unsigned b = s.read1();
unsigned b = readByte(s, &byteB);
if (a == 0xC0 and b == 0x80) {
charArrayBody(t, value, vi++) = 0;
@ -580,12 +593,6 @@ parseUtf8NonAscii(Thread* t, Stream& s, object bytesSoFar, unsigned byteCount,
}
} else {
charArrayBody(t, value, vi++) = a;
}
if (++si < length) {
a = s.read1();
} else {
break;
}
}
@ -611,7 +618,7 @@ parseUtf8(Thread* t, Stream& s, unsigned length)
if (a & 0x80) {
if (a & 0x20) {
// 3 bytes
return parseUtf8NonAscii(t, s, value, vi, si, a);
return parseUtf8NonAscii(t, s, value, vi, si, a, NoByte);
} else {
// 2 bytes
unsigned b = s.read1();
@ -621,7 +628,7 @@ parseUtf8(Thread* t, Stream& s, unsigned length)
assert(t, si < length);
byteArrayBody(t, value, vi++) = 0;
} else {
return parseUtf8NonAscii(t, s, value, vi, si, a);
return parseUtf8NonAscii(t, s, value, vi, si, a, b);
}
}
} else {

29
test/Strings.java Normal file
View File

@ -0,0 +1,29 @@
/**
 * Small string regression test: builds a string from raw bytes, splits and
 * inspects a string constant containing U+00AE (a character that takes two
 * bytes in UTF-8), and takes a substring of a StringBuilder.
 *
 * <p>Diagnostic output is written to standard output; any failed check
 * throws a {@link RuntimeException}.
 */
public class Strings {
  /**
   * Throws a {@link RuntimeException} unless the given condition holds.
   *
   * @param condition the condition that must be true for the test to pass
   */
  private static void expect(boolean condition) {
    if (condition) {
      return;
    }
    throw new RuntimeException();
  }

  /**
   * Runs all checks in order.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    // Byte values spelling out "com.ecovate.nat.bus.Symbol".
    final byte[] symbolBytes = {
      99, 111, 109, 46, 101, 99, 111, 118, 97,
      116, 101, 46, 110, 97, 116, 46, 98, 117,
      115, 46, 83, 121, 109, 98, 111, 108
    };
    final String symbolName = new String(symbolBytes);
    expect(symbolName.equals("com.ecovate.nat.bus.Symbol"));

    // Three month abbreviations, each followed by U+00AE as a separator.
    final String months = "Jan\u00aeFeb\u00aeMar\u00ae";
    final String[] parts = months.split("\u00ae");

    System.out.println(parts[0]);
    System.out.println(months.length());
    System.out.println(months);

    // Dump every UTF-16 code unit in hex so a mis-decoded constant is
    // visible in the output.
    for (char unit : months.toCharArray()) {
      System.out.print(Integer.toHexString(unit) + " ");
    }
    System.out.println();

    // The trailing separator yields an empty last element, which split
    // drops, leaving exactly the three month names.
    expect(parts.length == 3);

    final StringBuilder sb = new StringBuilder().append('$').append('2');
    final String tail = sb.substring(1);
    expect(tail.equals("2"));
  }
}