ARM and UTF-8 work

This commit is contained in:
JET
2010-04-14 09:26:50 -06:00
parent 3e5b2cbc7b
commit b2f5e71d22
8 changed files with 305 additions and 324 deletions

View File

@ -15,8 +15,8 @@ import java.util.regex.Pattern;
import java.util.Comparator;
import java.util.Formatter;
import java.util.Locale;
import java.io.ByteArrayOutputStream;
import java.io.Serializable;
import avian.Utf8;
public final class String
implements Comparable<String>, CharSequence, Serializable
@ -112,7 +112,7 @@ public final class String
(offset + " < 0 or " + offset + " + " + length + " > " + l);
}
if(!copy && isUTF8(data)) copy = true;
if(!copy && Utf8.test(data)) copy = true;
if (copy) {
Object c;
@ -120,7 +120,7 @@ public final class String
c = new char[length];
System.arraycopy(data, offset, c, 0, length);
} else {
c = decodeUTF8((byte[])data, offset, length);
c = Utf8.decode((byte[])data, offset, length);
if(c instanceof char[]) length = ((char[])c).length;
}
@ -134,85 +134,6 @@ public final class String
}
}
/**
 * Reports whether {@code data} is a {@code byte[]} holding at least one
 * byte with its high bit (0x80) set.
 *
 * @param data the candidate backing array; anything other than a
 *             {@code byte[]} yields {@code false}
 * @return {@code true} if some byte in the array has bit 0x80 set
 */
private static boolean isUTF8(Object data) {
  if (data instanceof byte[]) {
    for (byte value : (byte[]) data) {
      // A signed byte is negative exactly when bit 0x80 is set.
      if (value < 0) {
        return true;
      }
    }
  }
  return false;
}
/**
 * Encodes a range of UTF-16 code units into a byte sequence.
 *
 * <p>Each {@code char} is encoded independently: values below 0x80 take one
 * byte, values below 0x800 take two bytes, and everything else takes three.
 * The value U+0000 is the exception and is written as two zero bytes.
 *
 * @param s16    the source characters
 * @param offset index of the first character to encode
 * @param length number of characters to encode
 * @return a newly allocated array containing the encoded bytes
 */
private static byte[] encodeUTF8(char[] s16, int offset, int length) {
  final ByteArrayOutputStream out = new ByteArrayOutputStream();
  final int end = offset + length;
  int index = offset;
  while (index < end) {
    final int unit = s16[index++];
    if (unit == 0) {
      // U+0000 is emitted as a pair of zero bytes.
      out.write(0);
      out.write(0);
    } else if (unit < 0x080) {
      // Single byte: 0xxxxxxx
      out.write(unit);
    } else if (unit < 0x0800) {
      // Two bytes: 110xxxxx 10xxxxxx
      out.write(0x0c0 | (unit >>> 6));
      out.write(0x080 | (unit & 0x03f));
    } else {
      // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
      out.write(0x0e0 | ((unit >>> 12) & 0x0f));
      out.write(0x080 | ((unit >>> 6) & 0x03f));
      out.write(0x080 | (unit & 0x03f));
    }
  }
  return out.toByteArray();
}
/**
 * Stores {@code val} at {@code index} in a decode buffer that is either a
 * {@code byte[]} or a {@code char[]}, narrowing the value to the element
 * type of whichever array was supplied.
 *
 * @param data  the destination buffer ({@code byte[]} or {@code char[]})
 * @param index position to write
 * @param val   value to store
 */
private static void decodeUTF8_insert(Object data, int index, int val) {
  if (!(data instanceof byte[])) {
    char[] wide = (char[]) data;
    wide[index] = (char) val;
    return;
  }
  byte[] narrow = (byte[]) data;
  narrow[index] = (byte) val;
}
/**
 * Converts a byte-based decode buffer into a char-based one.
 *
 * <p>The first {@code length} bytes are copied into a new {@code char[]} of
 * size {@code capacity}, each byte being treated as an unsigned value
 * (0..255).
 *
 * @param data     the existing buffer; must be a {@code byte[]}
 * @param length   number of leading elements to carry over
 * @param capacity size of the char array to allocate
 * @return the newly allocated {@code char[]}
 */
private static Object decodeUTF8_widen(Object data, int length, int capacity) {
  final byte[] narrow = (byte[]) data;
  final char[] wide = new char[capacity];
  int k = 0;
  while (k < length) {
    // Mask so that bytes >= 0x80 do not sign-extend into the char.
    wide[k] = (char) (narrow[k] & 0x0ff);
    k++;
  }
  return wide;
}
/**
 * Shrinks a char-based decode buffer to exactly {@code length} elements.
 *
 * <p>A {@code byte[]} buffer is returned untouched, as is a {@code char[]}
 * whose size already equals {@code length}; otherwise the first
 * {@code length} chars are copied into a new array.
 *
 * @param data   the decode buffer ({@code byte[]} or {@code char[]})
 * @param length number of valid elements in the buffer
 * @return the original buffer, or a trimmed copy of it
 */
private static Object decodeUTF8_trim(Object data, int length) {
  if (!(data instanceof byte[])) {
    char[] chars = (char[]) data;
    if (chars.length != length) {
      char[] exact = new char[length];
      System.arraycopy(chars, 0, exact, 0, length);
      return exact;
    }
  }
  return data;
}
/**
 * Decodes a range of bytes into characters.
 *
 * <p>Decoding starts into a {@code byte[]}; the buffer is switched to a
 * {@code char[]} the first time a two- or three-byte sequence is met. A zero
 * byte is treated as the first half of a two-byte null and the byte after it
 * is skipped. Bytes that start neither a one-, two- nor three-byte sequence
 * are skipped without producing output.
 *
 * @param s8     the encoded bytes
 * @param offset index of the first byte to decode
 * @param length number of bytes to decode
 * @return a {@code byte[]} when only single-byte sequences were seen,
 *         otherwise a {@code char[]} sized to the decoded character count
 */
private static Object decodeUTF8(byte[] s8, int offset, int length) {
  Object out = new byte[length];
  final int end = offset + length;
  boolean widened = false;
  int written = 0;
  int pos = offset;
  while (pos < end) {
    final int lead = s8[pos++];

    if ((lead & 0x080) == 0x0) {
      // Single-byte sequence; a zero byte also consumes its companion byte.
      if (lead == 0) {
        ++pos;
      }
      decodeUTF8_insert(out, written++, lead);
      continue;
    }

    final boolean twoByte = (lead & 0x0e0) == 0x0c0;
    final boolean threeByte = (lead & 0x0f0) == 0x0e0;
    if (!twoByte && !threeByte) {
      // Not a recognised lead byte: drop it and move on.
      continue;
    }

    if (!widened) {
      // A multi-byte sequence yields one char from 2 (or 3) bytes, so the
      // char buffer needs at most length-1 (or length-2) slots.
      out = decodeUTF8_widen(out, written, twoByte ? length - 1 : length - 2);
      widened = true;
    }

    final int decoded;
    if (twoByte) {
      final int second = s8[pos++];
      decoded = ((lead & 0x1f) << 6) | (second & 0x3f);
    } else {
      final int second = s8[pos++];
      final int third = s8[pos++];
      decoded = ((lead & 0xf) << 12) | ((second & 0x3f) << 6) | (third & 0x3f);
    }
    decodeUTF8_insert(out, written++, decoded);
  }
  return decodeUTF8_trim(out, written);
}
/**
 * Returns this string itself; no copy or conversion is performed.
 *
 * @return this instance
 */
public String toString() {
return this;
}
@ -494,7 +415,7 @@ public final class String
getBytes(0, length, b, 0);
return b;
}
return encodeUTF8((char[])data, offset, length);
return Utf8.encode((char[])data, offset, length);
}
public byte[] getBytes(String format)