From c63668c1ce6bccd99acb063d6b26aacf910c9c6e Mon Sep 17 00:00:00 2001 From: Joel Dice Date: Sat, 4 Aug 2012 16:11:27 -0600 Subject: [PATCH] fix ArrayIndexOutOfBoundsException when decoding a UTF-8 stream --- classpath/avian/Utf8.java | 24 +++++++-- classpath/java/io/InputStreamReader.java | 57 ++++++++++++++++++--- test/Strings.java | 64 +++++++++++++++++++++++- 3 files changed, 134 insertions(+), 11 deletions(-) diff --git a/classpath/avian/Utf8.java b/classpath/avian/Utf8.java index 84c4c28058..cb116043c0 100644 --- a/classpath/avian/Utf8.java +++ b/classpath/avian/Utf8.java @@ -50,9 +50,18 @@ public class Utf8 { while (i < offset+length) { int x = s8[i++]; if ((x & 0x080) == 0x0) { // 1 byte char - if (x == 0) ++i; // 2 byte null char + if (x == 0) { // 2 byte null char + if (i == offset + length) { + return null; + } + ++ i; + } cram(buf, j++, x); } else if ((x & 0x0e0) == 0x0c0) { // 2 byte char + if (i == offset + length) { + return null; + } + if (!isMultiByte) { buf = widen(buf, j, length-1); isMultiByte = true; @@ -60,6 +69,10 @@ public class Utf8 { int y = s8[i++]; cram(buf, j++, ((x & 0x1f) << 6) | (y & 0x3f)); } else if ((x & 0x0f0) == 0x0e0) { // 3 byte char + if (i + 1 >= offset + length) { + return null; + } + if (!isMultiByte) { buf = widen(buf, j, length-2); isMultiByte = true; @@ -74,8 +87,13 @@ public class Utf8 { public static char[] decode16(byte[] s8, int offset, int length) { Object decoded = decode(s8, offset, length); - if (decoded instanceof char[]) return (char[])decoded; - return (char[])widen(decoded, length, length); + if (decoded == null) { + return null; + } else if (decoded instanceof char[]) { + return (char[])decoded; + } else { + return (char[])widen(decoded, length, length); + } } private static void cram(Object data, int index, int val) { diff --git a/classpath/java/io/InputStreamReader.java b/classpath/java/io/InputStreamReader.java index 98f145cd92..9b35187d29 100644 --- a/classpath/java/io/InputStreamReader.java +++ 
b/classpath/java/io/InputStreamReader.java @@ -13,6 +13,8 @@ package java.io; import avian.Utf8; public class InputStreamReader extends Reader { + private static final int MultibytePadding = 4; + private final InputStream in; public InputStreamReader(InputStream in) { @@ -28,19 +30,60 @@ public class InputStreamReader extends Reader { throw new UnsupportedEncodingException(encoding); } } - public int read(char[] b, int offset, int length) throws IOException { - byte[] buffer = new byte[length]; - int c = in.read(buffer); + if (length == 0) { + return 0; + } - if (c <= 0) return c; + byte[] buffer = new byte[length + MultibytePadding]; + int bufferLength = length; + int bufferOffset = 0; + while (true) { + int c = in.read(buffer, bufferOffset, bufferLength); - char[] buffer16 = Utf8.decode16(buffer, 0, c); + if (c <= 0) { + if (bufferOffset > 0) { + // if we've reached the end of the stream while trying to + // read a multibyte character, we still need to return any + // completely-decoded characters, plus \ufffd to indicate an + // unknown character + c = 1; + while (bufferOffset > 0) { + char[] buffer16 = Utf8.decode16(buffer, 0, bufferOffset); - System.arraycopy(buffer16, 0, b, offset, buffer16.length); + if (buffer16 != null) { + System.arraycopy(buffer16, 0, b, offset, buffer16.length); + + c = buffer16.length + 1; + break; + } else { + -- bufferOffset; + } + } - return buffer16.length; + b[offset + c - 1] = '\ufffd'; + } + + return c; + } + + bufferOffset += c; + + char[] buffer16 = Utf8.decode16(buffer, 0, bufferOffset); + + if (buffer16 != null) { + bufferOffset = 0; + + System.arraycopy(buffer16, 0, b, offset, buffer16.length); + + return buffer16.length; + } else { + // the buffer ended in an incomplete multibyte character, so + // we try to read another byte at a time until it's complete + bufferLength = 1; + } + } } public void close() throws IOException { diff --git a/test/Strings.java b/test/Strings.java index d98c1f13f9..bfc498ae65 100644 --- 
a/test/Strings.java +++ b/test/Strings.java @@ -21,7 +21,66 @@ public class Strings { return true; } - public static void main(String[] args) { + private static void testDecode(final boolean prematureEOS) throws Exception { + java.io.Reader r = new java.io.InputStreamReader + (new java.io.InputStream() { + int state = 0; + + public int read() { + throw new UnsupportedOperationException(); + } + + public int read(byte[] b, int offset, int length) { + if (length == 0) return 0; + + switch (state) { + case 0: + b[offset] = (byte) 0xc2; + state = 1; + return 1; + + case 1: + b[offset] = (byte) 0xae; + state = 2; + return 1; + + case 2: + b[offset] = (byte) 0xea; + state = 3; + return 1; + + case 3: + b[offset] = (byte) 0xba; + state = prematureEOS ? 5 : 4; + return 1; + + case 4: + b[offset] = (byte) 0xaf; + state = 5; + return 1; + + case 5: + return -1; + + default: + throw new RuntimeException(); + } + } + }); + + char[] buffer = new char[2]; + int offset = 0; + while (offset < buffer.length) { + int c = r.read(buffer, offset, buffer.length - offset); + if (c == -1) break; + offset += c; + } + + expect(new String(buffer, 0, offset).equals + (prematureEOS ? "\u00ae\ufffd" : "\u00ae\uaeaf")); + } + + public static void main(String[] args) throws Exception { expect(new String(new byte[] { 99, 111, 109, 46, 101, 99, 111, 118, 97, 116, 101, 46, 110, 97, 116, 46, 98, 117, 115, 46, 83, 121, 109, 98, 111, 108 }) @@ -77,5 +136,8 @@ public class Strings { expect(Character.forDigit(Character.digit('b', 16), 16) == 'b'); expect(Character.forDigit(Character.digit('f', 16), 16) == 'f'); expect(Character.forDigit(Character.digit('z', 36), 36) == 'z'); + + testDecode(false); + testDecode(true); } }