From 1f7f9319c377faf6830e6aea575d64e13724faca Mon Sep 17 00:00:00 2001
From: "J. Treadwell"
Date: Tue, 11 Nov 2008 17:43:11 -0700
Subject: [PATCH] Added UTF-8 support (still absent with string literals)!

---
 classpath/java/lang/String.java | 159 ++++++++++++++++++++++++++++++--
 1 file changed, 151 insertions(+), 8 deletions(-)

diff --git a/classpath/java/lang/String.java b/classpath/java/lang/String.java
index 74f05c4d55..89b33ad39e 100644
--- a/classpath/java/lang/String.java
+++ b/classpath/java/lang/String.java
@@ -12,11 +12,12 @@ package java.lang;
 
 import java.io.UnsupportedEncodingException;
 import java.util.regex.Pattern;
+import java.io.ByteArrayOutputStream;
 
 public final class String implements Comparable, CharSequence {
-  private Object data;
-  private int offset;
-  private int length;
+  private final Object data;
+  private final int offset;
+  private final int length;
   private int hashCode;
 
   public String(char[] data, int offset, int length, boolean copy) {
@@ -43,6 +44,10 @@ public final class String implements Comparable, CharSequence {
     this(data, 0, data.length);
   }
 
+  public String(String s) {
+    this(s.toCharArray());
+  }
+
   public String(byte[] data, String charset)
     throws UnsupportedEncodingException
   {
@@ -65,16 +70,22 @@ public final class String implements Comparable, CharSequence {
         (offset + " < 0 or " + offset + " + " + length + " > " + l);
     }
 
+    // Bytes with the high bit set have to be decoded, which needs a copy.
+    if(!copy && isUTF8(data)) copy = true;
+
     if (copy) {
       Object c;
       if (data instanceof char[]) {
         c = new char[length];
+        System.arraycopy(data, offset, c, 0, length);
       } else {
-        c = new byte[length];
+        c = decodeUTF8((byte[])data, offset, length);
+        // Decoding may yield fewer elements than bytes, for either array type.
+        length = (c instanceof char[]) ? ((char[])c).length : ((byte[])c).length;
       }
-      System.arraycopy(data, offset, c, 0, length);
 
       this.data = c;
+      this.offset = 0;
       this.length = length;
     } else {
       this.data = data;
@@ -83,6 +94,134 @@ public final class String implements Comparable, CharSequence {
     }
   }
 
+  /**
+   * Reports whether the given array object is a byte array holding at
+   * least one byte with its high bit set, i.e. content that is not plain
+   * 7-bit ASCII and therefore has to go through decodeUTF8.
+   *
+   * Note that the whole array is scanned, not just the range the string
+   * will cover.
+   */
+  private static boolean isUTF8(Object data) {
+    if(!(data instanceof byte[])) return false;
+    byte[] b = (byte[])data;
+    for(int i = 0; i < b.length; ++i) {
+      if(((int)b[i] & 0x080) != 0) return true;
+    }
+    return false;
+  }
+
+  /**
+   * Encodes length chars of s16, starting at offset, as UTF-8 bytes.
+   *
+   * Two deviations from standard UTF-8: the null char (U+0000) is written
+   * as two zero bytes (decodeUTF8 expects exactly that), and surrogates
+   * are encoded individually as three bytes each rather than combined
+   * into a four byte sequence.
+   */
+  private static byte[] encodeUTF8(char[] s16, int offset, int length) {
+    ByteArrayOutputStream buf = new ByteArrayOutputStream();
+    for(int i = offset; i < offset+length; ++i) {
+      char c = s16[i];
+      if(c == '\u0000') { // null char
+        buf.write(0);
+        buf.write(0);
+      } else if(c < 0x080) { // 1 byte char
+        buf.write(c);
+      } else if(c < 0x0800) { // 2 byte char
+        buf.write(0x0c0 | (c >>> 6));
+        buf.write(0x080 | (c & 0x03f));
+      } else { // 3 byte char
+        buf.write(0x0e0 | ((c >>> 12) & 0x0f));
+        buf.write(0x080 | ((c >>> 6) & 0x03f));
+        buf.write(0x080 | (c & 0x03f));
+      }
+    }
+    return buf.toByteArray();
+  }
+
+  /**
+   * Stores val at index in data, which is either a byte[] or a char[];
+   * val is narrowed to the element type of the array.
+   */
+  private static void decodeUTF8_insert(Object data, int index, int val) {
+    if(data instanceof byte[]) ((byte[])data)[index] = (byte)val;
+    else ((char[])data)[index] = (char)val;
+  }
+
+  /**
+   * Returns a new char[] of the given capacity whose first length
+   * elements are the (unsigned) values of the first length bytes of data.
+   */
+  private static Object decodeUTF8_widen(Object data, int length, int capacity) {
+    byte[] src = (byte[])data;
+    char[] result = new char[capacity];
+    for(int i = 0; i < length; ++i) result[i] = (char)((int)src[i] & 0x0ff);
+    return result;
+  }
+
+  /**
+   * Returns data, a byte[] or char[], cut down to exactly length elements.
+   * The array itself is returned when it already has that size.
+   */
+  private static Object decodeUTF8_trim(Object data, int length) {
+    if(data instanceof byte[]) {
+      byte[] bytes = (byte[])data;
+      if(bytes.length == length) return data;
+      byte[] result = new byte[length];
+      System.arraycopy(bytes, 0, result, 0, length);
+      return result;
+    }
+    if(((char[])data).length == length) return data;
+    char[] result = new char[length];
+    System.arraycopy(data, 0, result, 0, length);
+    return result;
+  }
+
+  /**
+   * Decodes length bytes of s8, starting at offset, as UTF-8.
+   *
+   * The result is a byte[] while only single byte sequences have been
+   * seen and a char[] as soon as a two or three byte sequence occurs;
+   * either way the array holds exactly the decoded characters. A zero
+   * byte is taken as the first half of the two byte null char written
+   * by encodeUTF8, so the byte after it is skipped. Bytes that start no
+   * recognized sequence are dropped, and a multi-byte sequence cut off
+   * by the end of the range ends decoding.
+   */
+  private static Object decodeUTF8(byte[] s8, int offset, int length) {
+    // At most one character per input byte, so length is enough room.
+    Object buf = new byte[length];
+    boolean isMultiByte = false;
+    int end = offset+length;
+    int i=offset, j=0;
+    while(i < end) {
+      int x = s8[i++];
+      if((x & 0x080) == 0x0) { // 1 byte char
+        if(x == 0) ++i; // 2 byte null char
+        decodeUTF8_insert(buf, j++, x);
+      } else if((x & 0x0e0) == 0x0c0) { // 2 byte char
+        if(i >= end) break; // truncated: never read past the range
+        if(!isMultiByte) {
+          buf = decodeUTF8_widen(buf, j, length-1);
+          isMultiByte = true;
+        }
+        int y = s8[i++];
+        decodeUTF8_insert(buf, j++, ((x & 0x1f) << 6) | (y & 0x3f));
+      } else if((x & 0x0f0) == 0x0e0) { // 3 byte char
+        if(i+1 >= end) break; // truncated: never read past the range
+        if(!isMultiByte) {
+          buf = decodeUTF8_widen(buf, j, length-2);
+          isMultiByte = true;
+        }
+        int y = s8[i++]; int z = s8[i++];
+        decodeUTF8_insert(buf, j++, ((x & 0xf) << 12) | ((y & 0x3f) << 6) | (z & 0x3f));
+      }
+    }
+
+    return decodeUTF8_trim(buf, j);
+  }
+
   public String toString() {
     return this;
   }
@@ -341,9 +480,13 @@ public final class String implements Comparable, CharSequence {
   }
 
   public byte[] getBytes() {
-    byte[] b = new byte[length];
-    getBytes(0, length, b, 0);
-    return b;
+    if(data instanceof byte[]) {
+      byte[] b = new byte[length];
+      getBytes(0, length, b, 0);
+      return b;
+    }
+    // char[] backed strings are encoded with the scheme decodeUTF8 reads.
+    return encodeUTF8((char[])data, offset, length);
   }
 
   public byte[] getBytes(String format)