Read 2- and 3-byte UTF-8 constants gracefully (but not correctly)

This commit is contained in:
Eric Scharff 2007-10-25 17:58:53 -06:00
parent 25275933ce
commit 6342fc7ccb

View File

@ -510,15 +510,25 @@ parseUtf8(Thread* t, Stream& s, unsigned length)
for (unsigned si = 0; si < length; ++si) { for (unsigned si = 0; si < length; ++si) {
unsigned a = s.read1(); unsigned a = s.read1();
if (a & 0x80) { if (a & 0x80) {
++ si; // todo: handle non-ASCII characters properly
assert(t, si < length); if (a & 0x20) {
// 3 bytes
unsigned b = s.read1(); si += 2;
assert(t, si < length);
if (a == 0xC0 and b == 0x80) { /*unsigned b = */s.read1();
byteArrayBody(t, value, vi++) = 0; /*unsigned c = */s.read1();
byteArrayBody(t, value, vi++) = '_';
} else { } else {
abort(t); // todo: handle non-ASCII characters // 2 bytes
++ si;
assert(t, si < length);
unsigned b = s.read1();
if (a == 0xC0 and b == 0x80) {
byteArrayBody(t, value, vi++) = 0;
} else {
byteArrayBody(t, value, vi++) = '_';
}
} }
} else { } else {
byteArrayBody(t, value, vi++) = a; byteArrayBody(t, value, vi++) = a;