fix ArrayIndexOutOfBoundsException when decoding a UTF-8 stream

This commit is contained in:
Joel Dice 2012-08-04 16:11:27 -06:00
parent 852d77d0b5
commit c63668c1ce
3 changed files with 134 additions and 11 deletions

View File

@ -50,9 +50,18 @@ public class Utf8 {
while (i < offset+length) { while (i < offset+length) {
int x = s8[i++]; int x = s8[i++];
if ((x & 0x080) == 0x0) { // 1 byte char if ((x & 0x080) == 0x0) { // 1 byte char
if (x == 0) ++i; // 2 byte null char if (x == 0) { // 2 byte null char
if (i == offset + length) {
return null;
}
++ i;
}
cram(buf, j++, x); cram(buf, j++, x);
} else if ((x & 0x0e0) == 0x0c0) { // 2 byte char } else if ((x & 0x0e0) == 0x0c0) { // 2 byte char
if (i == offset + length) {
return null;
}
if (!isMultiByte) { if (!isMultiByte) {
buf = widen(buf, j, length-1); buf = widen(buf, j, length-1);
isMultiByte = true; isMultiByte = true;
@ -60,6 +69,10 @@ public class Utf8 {
int y = s8[i++]; int y = s8[i++];
cram(buf, j++, ((x & 0x1f) << 6) | (y & 0x3f)); cram(buf, j++, ((x & 0x1f) << 6) | (y & 0x3f));
} else if ((x & 0x0f0) == 0x0e0) { // 3 byte char } else if ((x & 0x0f0) == 0x0e0) { // 3 byte char
if (i + 1 >= offset + length) {
return null;
}
if (!isMultiByte) { if (!isMultiByte) {
buf = widen(buf, j, length-2); buf = widen(buf, j, length-2);
isMultiByte = true; isMultiByte = true;
@ -74,9 +87,14 @@ public class Utf8 {
public static char[] decode16(byte[] s8, int offset, int length) { public static char[] decode16(byte[] s8, int offset, int length) {
Object decoded = decode(s8, offset, length); Object decoded = decode(s8, offset, length);
if (decoded instanceof char[]) return (char[])decoded; if (decoded == null) {
return null;
} else if (decoded instanceof char[]) {
return (char[])decoded;
} else {
return (char[])widen(decoded, length, length); return (char[])widen(decoded, length, length);
} }
}
private static void cram(Object data, int index, int val) { private static void cram(Object data, int index, int val) {
if (data instanceof byte[]) ((byte[])data)[index] = (byte)val; if (data instanceof byte[]) ((byte[])data)[index] = (byte)val;

View File

@ -13,6 +13,8 @@ package java.io;
import avian.Utf8; import avian.Utf8;
public class InputStreamReader extends Reader { public class InputStreamReader extends Reader {
private static final int MultibytePadding = 4;
private final InputStream in; private final InputStream in;
public InputStreamReader(InputStream in) { public InputStreamReader(InputStream in) {
@ -29,18 +31,59 @@ public class InputStreamReader extends Reader {
} }
} }
public int read(char[] b, int offset, int length) throws IOException { public int read(char[] b, int offset, int length) throws IOException {
byte[] buffer = new byte[length]; if (length == 0) {
int c = in.read(buffer); return 0;
}
if (c <= 0) return c; byte[] buffer = new byte[length + MultibytePadding];
int bufferLength = length;
int bufferOffset = 0;
while (true) {
int c = in.read(buffer, bufferOffset, bufferLength);
char[] buffer16 = Utf8.decode16(buffer, 0, c); if (c <= 0) {
if (bufferOffset > 0) {
// if we've reached the end of the stream while trying to
// read a multibyte character, we still need to return any
// completely-decoded characters, plus \ufffd to indicate an
// unknown character
c = 1;
while (bufferOffset > 0) {
char[] buffer16 = Utf8.decode16(buffer, 0, bufferOffset);
if (buffer16 != null) {
System.arraycopy(buffer16, 0, b, offset, buffer16.length);
c = buffer16.length + 1;
break;
} else {
-- bufferOffset;
}
}
b[offset + c - 1] = '\ufffd';
}
return c;
}
bufferOffset += c;
char[] buffer16 = Utf8.decode16(buffer, 0, bufferOffset);
if (buffer16 != null) {
bufferOffset = 0;
System.arraycopy(buffer16, 0, b, offset, buffer16.length); System.arraycopy(buffer16, 0, b, offset, buffer16.length);
return buffer16.length; return buffer16.length;
} else {
// the buffer ended in an incomplete multibyte character, so
// we try to read another byte at a time until it's complete
bufferLength = 1;
}
}
} }
public void close() throws IOException { public void close() throws IOException {

View File

@ -21,7 +21,66 @@ public class Strings {
return true; return true;
} }
public static void main(String[] args) { private static void testDecode(final boolean prematureEOS) throws Exception {
java.io.Reader r = new java.io.InputStreamReader
(new java.io.InputStream() {
int state = 0;
public int read() {
throw new UnsupportedOperationException();
}
public int read(byte[] b, int offset, int length) {
if (length == 0) return 0;
switch (state) {
case 0:
b[offset] = (byte) 0xc2;
state = 1;
return 1;
case 1:
b[offset] = (byte) 0xae;
state = 2;
return 1;
case 2:
b[offset] = (byte) 0xea;
state = 3;
return 1;
case 3:
b[offset] = (byte) 0xba;
state = prematureEOS ? 5 : 4;
return 1;
case 4:
b[offset] = (byte) 0xaf;
state = 5;
return 1;
case 5:
return -1;
default:
throw new RuntimeException();
}
}
});
char[] buffer = new char[2];
int offset = 0;
while (offset < buffer.length) {
int c = r.read(buffer, offset, buffer.length - offset);
if (c == -1) break;
offset += c;
}
expect(new String(buffer, 0, offset).equals
(prematureEOS ? "\u00ae\ufffd" : "\u00ae\uaeaf"));
}
public static void main(String[] args) throws Exception {
expect(new String(new byte[] { 99, 111, 109, 46, 101, 99, 111, 118, 97, expect(new String(new byte[] { 99, 111, 109, 46, 101, 99, 111, 118, 97,
116, 101, 46, 110, 97, 116, 46, 98, 117, 116, 101, 46, 110, 97, 116, 46, 98, 117,
115, 46, 83, 121, 109, 98, 111, 108 }) 115, 46, 83, 121, 109, 98, 111, 108 })
@ -77,5 +136,8 @@ public class Strings {
expect(Character.forDigit(Character.digit('b', 16), 16) == 'b'); expect(Character.forDigit(Character.digit('b', 16), 16) == 'b');
expect(Character.forDigit(Character.digit('f', 16), 16) == 'f'); expect(Character.forDigit(Character.digit('f', 16), 16) == 'f');
expect(Character.forDigit(Character.digit('z', 36), 36) == 'z'); expect(Character.forDigit(Character.digit('z', 36), 36) == 'z');
testDecode(false);
testDecode(true);
} }
} }