From 6342fc7ccb085fea631d460da987a17052a62ef4 Mon Sep 17 00:00:00 2001 From: Eric Scharff Date: Thu, 25 Oct 2007 17:58:53 -0600 Subject: [PATCH] Read 2- and 3-byte UTF8 constants gracefully (but not correctly) --- src/machine.cpp | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/machine.cpp b/src/machine.cpp index 4afc085dd5..63f6233d82 100644 --- a/src/machine.cpp +++ b/src/machine.cpp @@ -510,15 +510,25 @@ parseUtf8(Thread* t, Stream& s, unsigned length) for (unsigned si = 0; si < length; ++si) { unsigned a = s.read1(); if (a & 0x80) { - ++ si; - assert(t, si < length); - - unsigned b = s.read1(); - - if (a == 0xC0 and b == 0x80) { - byteArrayBody(t, value, vi++) = 0; + // todo: handle non-ASCII characters properly + if (a & 0x20) { + // 3 bytes + si += 2; + assert(t, si < length); + /*unsigned b = */s.read1(); + /*unsigned c = */s.read1(); + byteArrayBody(t, value, vi++) = '_'; } else { - abort(t); // todo: handle non-ASCII characters + // 2 bytes + ++ si; + assert(t, si < length); + unsigned b = s.read1(); + + if (a == 0xC0 and b == 0x80) { + byteArrayBody(t, value, vi++) = 0; + } else { + byteArrayBody(t, value, vi++) = '_'; + } } } else { byteArrayBody(t, value, vi++) = a;