Added UTF-8 support (still absent with string literals)!

2025-04-18 08:10:47 +00:00 · 2008-11-11 17:43:11 -07:00 · 2008-11-11 17:43:11 -07:00 · 1f7f9319c3
commit 1f7f9319c3
parent 0bef625500
1 changed files with 99 additions and 8 deletions
--- a/classpath/java/lang/String.java
+++ b/classpath/java/lang/String.java
@ -12,11 +12,12 @@ package java.lang;

 import java.io.UnsupportedEncodingException;
 import java.util.regex.Pattern;
+import java.io.ByteArrayOutputStream;

 public final class String implements Comparable<String>, CharSequence {
-  private Object data;
-  private int offset;
-  private int length;
+  private final Object data;
+  private final int offset;
+  private final int length;
  private int hashCode;

  public String(char[] data, int offset, int length, boolean copy) {
@ -43,6 +44,10 @@ public final class String implements Comparable<String>, CharSequence {
    this(data, 0, data.length);
  }

+  /**
+   * Creates a string from the characters of {@code s} by delegating to
+   * another constructor of this class with the result of
+   * {@code s.toCharArray()}.
+   *
+   * @param s the string whose characters are used; it is dereferenced
+   *          immediately, so {@code null} causes a NullPointerException
+   */
+  public String(String s) {
+    this(s.toCharArray());
+  }
+
  public String(byte[] data, String charset)
    throws UnsupportedEncodingException
    {
@ -65,16 +70,20 @@ public final class String implements Comparable<String>, CharSequence {
        (offset + " < 0 or " + offset + " + " + length + " > " + l);
    }

+    if(!copy && isUTF8(data)) copy = true;
+
    if (copy) {
      Object c;
      if (data instanceof char[]) {
        c = new char[length];
+        System.arraycopy(data, offset, c, 0, length);
      } else {
-        c = new byte[length];
+        c = decodeUTF8((byte[])data, offset, length);
+        if(c instanceof char[]) length = ((char[])c).length;
      }
-      System.arraycopy(data, offset, c, 0, length);
      
      this.data = c;
+      this.offset = 0;
      this.length = length;
    } else {
      this.data = data;
@ -83,6 +92,85 @@ public final class String implements Comparable<String>, CharSequence {
    }
  }

+  /**
+   * Reports whether {@code data} is a byte array holding at least one byte
+   * with its high bit (0x80) set, i.e. a byte outside 7-bit ASCII. The whole
+   * array is scanned, and no validation of UTF-8 structure is performed.
+   *
+   * @param data candidate array; anything that is not a {@code byte[]}
+   *             (including {@code null}) yields {@code false}
+   * @return {@code true} if some byte of the array has the high bit set
+   */
+  private static boolean isUTF8(Object data) {
+    if (data instanceof byte[]) {
+      for (byte candidate : (byte[]) data) {
+        // A byte whose bit 7 is set is negative once sign-extended.
+        if (candidate < 0) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Encodes {@code length} UTF-16 code units of {@code s16}, starting at
+   * {@code offset}, into bytes.
+   *
+   * Each code unit is encoded on its own: values below 0x80 take one byte,
+   * values below 0x800 take two bytes, everything else takes three bytes.
+   * The null character is the exception and is written as two zero bytes.
+   *
+   * @param s16    source characters
+   * @param offset index of the first character to encode
+   * @param length number of characters to encode
+   * @return a newly allocated array holding the encoded bytes
+   */
+  private static byte[] encodeUTF8(char[] s16, int offset, int length) {
+    final ByteArrayOutputStream out = new ByteArrayOutputStream();
+    final int end = offset + length;
+    int pos = offset;
+    while (pos < end) {
+      final int unit = s16[pos];
+      pos++;
+      if (unit >= 0x0800) {
+        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
+        out.write(0x0e0 | ((unit >>> 12) & 0x0f));
+        out.write(0x080 | ((unit >>> 6) & 0x03f));
+        out.write(0x080 | (unit & 0x03f));
+      } else if (unit >= 0x080) {
+        // Two bytes: 110xxxxx 10xxxxxx
+        out.write(0x0c0 | (unit >>> 6));
+        out.write(0x080 | (unit & 0x03f));
+      } else if (unit != 0) {
+        // Plain 7-bit character.
+        out.write(unit);
+      } else {
+        // Null character: emitted as a pair of zero bytes.
+        out.write(0);
+        out.write(0);
+      }
+    }
+    return out.toByteArray();
+  }
+
+  /**
+   * Stores {@code val} at {@code index} of {@code data}, narrowing it to the
+   * element type of the array: {@code byte} when {@code data} is a
+   * {@code byte[]}, {@code char} otherwise (in which case {@code data} is
+   * cast to {@code char[]}).
+   *
+   * @param data  destination array, a {@code byte[]} or a {@code char[]}
+   * @param index position to write
+   * @param val   value to store after narrowing
+   */
+  private static void decodeUTF8_insert(Object data, int index, int val) {
+    if (!(data instanceof byte[])) {
+      char[] wide = (char[]) data;
+      wide[index] = (char) val;
+      return;
+    }
+    byte[] narrow = (byte[]) data;
+    narrow[index] = (byte) val;
+  }
+
+  /**
+   * Copies the first {@code length} bytes of {@code data} into a new
+   * {@code char[]} of size {@code capacity}, treating every byte as an
+   * unsigned value in the range 0..255.
+   *
+   * @param data     source, which must be a {@code byte[]}
+   * @param length   number of leading bytes to carry over
+   * @param capacity size of the returned array
+   * @return the new {@code char[]}; elements from {@code length} onward are
+   *         left at zero
+   */
+  private static Object decodeUTF8_widen(Object data, int length, int capacity) {
+    final byte[] narrow = (byte[]) data;
+    final char[] wide = new char[capacity];
+    int k = 0;
+    while (k < length) {
+      // Mask off the sign extension so 0x80..0xff stay in 128..255.
+      wide[k] = (char) (narrow[k] & 0x0ff);
+      k++;
+    }
+    return wide;
+  }
+
+  /**
+   * Shrinks a {@code char[]} buffer to exactly {@code length} elements.
+   *
+   * A {@code byte[]} argument is returned unchanged, whatever its size, and
+   * so is a {@code char[]} that already has the requested size. Otherwise
+   * the first {@code length} characters are copied into a new array.
+   *
+   * @param data   a {@code byte[]} or {@code char[]} buffer
+   * @param length number of leading elements that are in use
+   * @return {@code data} itself, or a new {@code char[]} of size
+   *         {@code length}
+   */
+  private static Object decodeUTF8_trim(Object data, int length) {
+    if (!(data instanceof byte[])) {
+      char[] wide = (char[]) data;
+      if (wide.length != length) {
+        char[] exact = new char[length];
+        System.arraycopy(wide, 0, exact, 0, length);
+        return exact;
+      }
+    }
+    return data;
+  }
+
+  /**
+   * Decodes the bytes in {@code s8[offset .. offset+length)} into characters.
+   *
+   * One-, two- and three-byte sequences are recognised. A zero byte is
+   * treated as the first half of the two-byte null form written by
+   * {@code encodeUTF8}, so the byte after it is skipped. Bytes with any
+   * other lead pattern (stray continuation bytes, four-byte leads) produce
+   * no output. A multi-byte sequence cut off by the end of the window is
+   * dropped instead of reading bytes beyond {@code offset+length}.
+   *
+   * Output is collected in a {@code byte[]} until the first multi-byte
+   * sequence is seen, then moved to a {@code char[]}.
+   *
+   * @param s8     source bytes
+   * @param offset index of the first byte to decode
+   * @param length number of bytes to decode
+   * @return a {@code byte[]} of exactly {@code length} elements when every
+   *         input byte produced one single-byte character; otherwise a
+   *         {@code char[]} whose size equals the number of decoded
+   *         characters
+   */
+  private static Object decodeUTF8(byte[] s8, int offset, int length) {
+    final int end = offset + length;
+    // At most one character is produced per input byte of the window.
+    Object buf = new byte[length > 0 ? length : 0];
+    boolean isMultiByte = false;
+    int i=offset, j=0;
+    while(i < end) {
+      int x = s8[i++];
+      if((x & 0x080) == 0x0) {          // 1 byte char
+        if(x == 0) ++i;                 // 2 byte null char
+        decodeUTF8_insert(buf, j++, x);
+      } else if((x & 0x0e0) == 0x0c0) { // 2 byte char
+        if(i >= end) break;             // continuation byte is missing
+        if(!isMultiByte) {
+          // This sequence uses two bytes, so at most length-1 chars result.
+          buf = decodeUTF8_widen(buf, j, length-1);
+          isMultiByte = true;
+        }
+        int y = s8[i++];
+        decodeUTF8_insert(buf, j++, ((x & 0x1f) << 6) | (y & 0x3f));
+      } else if((x & 0x0f0) == 0x0e0) { // 3 byte char
+        if(i + 1 >= end) break;         // continuation bytes are missing
+        if(!isMultiByte) {
+          // This sequence uses three bytes, so at most length-2 chars result.
+          buf = decodeUTF8_widen(buf, j, length-2);
+          isMultiByte = true;
+        }
+        int y = s8[i++]; int z = s8[i++];
+        decodeUTF8_insert(buf, j++, ((x & 0xf) << 12) | ((y & 0x3f) << 6) | (z & 0x3f));
+      }
+    }
+
+    if(!isMultiByte && j != length) {
+      // The constructor re-reads the length only from a char[] result, so a
+      // byte[] holding fewer than length characters would leave trailing
+      // zero characters in the string. Hand back an exactly sized char[].
+      return decodeUTF8_widen(buf, j, j);
+    }
+    return decodeUTF8_trim(buf, j);
+  }
+
  /**
   * Returns this same instance; no new object is created.
   */
  public String toString() {
    return this;
  }
@ -341,9 +429,12 @@ public final class String implements Comparable<String>, CharSequence {
  }

  /**
   * Returns the contents of this string as a newly created byte array.
   *
   * When the backing data is a byte[], {@code length} bytes are filled in
   * through {@code getBytes(0, length, b, 0)}; otherwise the backing char[]
   * is passed to {@code encodeUTF8} together with {@code offset} and
   * {@code length}.
   */
  public byte[] getBytes() {
-    byte[] b = new byte[length];
-    getBytes(0, length, b, 0);
-    return b;
+    // Byte-backed strings take the same path the method used before.
+    if(data instanceof byte[]) {
+      byte[] b = new byte[length];
+      getBytes(0, length, b, 0);
+      return b;
+    }
+    // Anything else is cast to char[] and encoded.
+    return encodeUTF8((char[])data, offset, length);
  }

  public byte[] getBytes(String format)