diff --git a/.gitignore b/.gitignore index 588446f90d..8800bb0b0c 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ bin /lib /distrib *.pdb +*.swp diff --git a/classpath/java/util/regex/CharacterMatcher.java b/classpath/java/util/regex/CharacterMatcher.java new file mode 100644 index 0000000000..36a74fe4c4 --- /dev/null +++ b/classpath/java/util/regex/CharacterMatcher.java @@ -0,0 +1,332 @@ +/* Copyright (c) 2008-2013, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all copies. + + There is NO WARRANTY for this software. See license.txt for + details. */ + +package java.util.regex; + +/** + * A class to match classes of characters. + *

+ * This class is intended to be the workhorse behind character classes + * such as {@code [a-z]}. + *

+ * @author Johannes Schindelin + */ +class CharacterMatcher { + private boolean[] map; + private boolean inversePattern; + + public static CharacterMatcher parse(String description) { + return parse(description.toCharArray()); + } + + public static CharacterMatcher parse(char[] description) { + Parser parser = new Parser(description); + CharacterMatcher result = parser.parseClass(); + if (parser.getEndOffset() != description.length) { + throw new RuntimeException("Short character class @" + + parser.getEndOffset() + ": " + new String(description)); + } + return result; + } + + public boolean matches(char c) { + int index = c; + return (map.length > index && map[index]) ^ inversePattern; + } + + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("["); + if (inversePattern) { + builder.append("^"); + } + for (int i = 0; i < map.length; ++ i) { + if (!map[i]) { + continue; + } + builder.append(i >= ' ' && i <= 0x7f ? + "" + (char)i : ("\\x" + Integer.toHexString(i))); + int j = i + 1; + while (j < map.length && map[j]) { + ++ j; + } + -- j; + if (j > i) { + if (j > i + 1) { + builder.append('-'); + } + builder.append(j >= ' ' && j <= 0x7f ? 
+ "" + (char)j : ("\\x" + Integer.toHexString(j))); + i = j; + } + } + builder.append("]"); + return builder.toString(); + } + + private static String specialClass(int c) { + if ('d' == c) { + return "[0-9]"; + } + if ('D' == c) { + return "[^0-9]"; + } + if ('s' == c) { + return "[ \\t\\n\\x0B\\f\\r]"; + } + if ('S' == c) { + return "[^ \\t\\n\\x0B\\f\\r]"; + } + if ('w' == c) { + return "[a-zA-Z_0-9]"; + } + if ('W' == c) { + return "[^a-zA-Z_0-9]"; + } + return null; + } + + private CharacterMatcher(boolean[] map, boolean inversePattern) { + this.map = map; + this.inversePattern = inversePattern; + } + + private void setMatch(int c) { + ensureCapacity(c + 1); + map[c] = true; + } + + private void ensureCapacity(int length) { + if (map.length >= length) { + return; + } + int size = map.length; + if (size < 32) { + size = 32; + } + while (size < length) { + size <<= 1; + } + map = java.util.Arrays.copyOf(map, size); + } + + private void merge(CharacterMatcher other) { + boolean inversePattern = this.inversePattern || other.inversePattern; + if ((map.length < other.map.length) ^ inversePattern) { + map = java.util.Arrays.copyOf(map, other.map.length); + } + for (int i = 0; i < map.length; ++ i) { + map[i] = (matches((char)i) || other.matches((char)i)) ^ inversePattern; + } + this.inversePattern = inversePattern; + } + + private void intersect(CharacterMatcher other) { + boolean inversePattern = this.inversePattern && other.inversePattern; + if ((map.length > other.map.length) ^ inversePattern) { + map = java.util.Arrays.copyOf(map, other.map.length); + } + for (int i = 0; i < map.length; ++ i) { + map[i] = (matches((char)i) && other.matches((char)i)) ^ inversePattern; + } + this.inversePattern = inversePattern; + } + + static class Parser { + private final char[] description; + private int offset; + + public Parser(char[] description) { + this.description = description; + } + + public int getEndOffset() { + return offset; + } + + /** + * Parses an escaped 
character. + * + * @param start the offset after the backslash + * @return the escaped character, or -1 if no character was recognized + */ + public int parseEscapedCharacter(int start) { + offset = start; + return parseEscapedCharacter(); + } + + private int parseEscapedCharacter() { + if (offset == description.length) { + throw new IllegalArgumentException("Short escaped character"); + } + char c = description[offset++]; + if (c == '0') { + int len = digits(offset, 3, 8); + if (len == 3 && description[offset] > '3') { + --len; + } + c = (char)Integer.parseInt(new String(description, offset, len), 8); + offset += len; + return c; + } + if (c == 'x' || c == 'u') { + int len = digits(offset, 4, 16); + c = (char)Integer.parseInt(new String(description, offset, len), 16); + offset += len; + return c; + } + switch (c) { + case 'a': + return 0x0007; + case 'e': + return 0x001B; + case 'f': + return 0x000C; + case 'n': + return 0x000A; + case 'r': + return 0x000D; + case 't': + return 0x0009; + case '\\': + case '.': + case '*': + case '+': + case '?': + case '|': + case '[': + case ']': + case '{': + case '}': + case '(': + case ')': + case '^': + case '$': + return c; + } + return -1; + } + + public int digits(int offset, int maxLength, int base) { + for (int i = 0; ; ++i) { + if (i == maxLength || offset + i >= description.length) { + return i; + } + int value = description[offset + i] - '0'; + if (value < 0) { + return i; + } + if (base > 10 && value >= 10) { + value += 10 - (value >= 'a' - '0' ? 
'a' - '0' : 'A' - '0'); + } + if (value >= base) { + return i; + } + } + } + + public CharacterMatcher parseClass(int start) { + offset = start; + return parseClass(); + } + + public CharacterMatcher parseClass() { + if (description[offset] != '[') { + if (description[offset] == '\\') { + String range = specialClass(description[++ offset]); + if (range != null) { + ++ offset; + return CharacterMatcher.parse(range); + } + } + return null; + } + CharacterMatcher matcher = new CharacterMatcher(new boolean[0], + description[++ offset] == '^'); + if (matcher.inversePattern) { + ++ offset; + } + + int previous = -1; + boolean firstCharacter = true; + for (;;) { + if (offset >= description.length) { + unsupported("short regex"); + } + char c = description[offset++]; + if (c == '-' && !firstCharacter && description[offset] != ']') { + if (previous < 0) { + unsupported("invalid range"); + } + int rangeEnd = description[offset]; + if ('\\' == rangeEnd) { + rangeEnd = parseEscapedCharacter(); + if (rangeEnd < 0) { + unsupported("invalid range"); + } + } + matcher.ensureCapacity(rangeEnd + 1); + for (int j = previous + 1; j <= rangeEnd; j++) { + matcher.map[j] = true; + } + } else if (c == '\\') { + int saved = offset; + previous = parseEscapedCharacter(); + if (previous < 0) { + offset = saved - 1; + CharacterMatcher clazz = parseClass(); + if (clazz == null) { + unsupported("escape"); + } + matcher.merge(clazz); + } else { + matcher.setMatch(previous); + } + } else if (c == '[') { + Parser parser = new Parser(description); + CharacterMatcher other = parser.parseClass(offset - 1); + if (other == null) { + unsupported("invalid merge"); + } + matcher.merge(other); + offset = parser.getEndOffset(); + previous = -1; + } else if (c == '&') { + if (offset + 2 > description.length || description[offset] != '&' + || description[offset + 1] != '[') { + unsupported("operation"); + } + Parser parser = new Parser(description); + CharacterMatcher other = parser.parseClass(offset + 1); + 
if (other == null) { + unsupported("invalid intersection"); + } + matcher.intersect(other); + offset = parser.getEndOffset(); + previous = -1; + } else if (c == ']') { + break; + } else { + previous = c; + matcher.setMatch(previous); + } + firstCharacter = false; + } + + return matcher; + } + + private void unsupported(String msg) throws UnsupportedOperationException { + throw new UnsupportedOperationException("Unsupported " + msg + " @" + + offset + ": " + + new String(description, 0, description.length)); + } + } +} diff --git a/classpath/java/util/regex/Compiler.java b/classpath/java/util/regex/Compiler.java new file mode 100644 index 0000000000..0cf50fcee4 --- /dev/null +++ b/classpath/java/util/regex/Compiler.java @@ -0,0 +1,533 @@ +/* Copyright (c) 2008-2013, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all copies. + + There is NO WARRANTY for this software. See license.txt for + details. */ + +package java.util.regex; + +import java.util.ArrayList; +import java.util.Stack; + +/** + * Compiles regular expressions into {@link PikeVM}s. + * + * @author Johannes Schindelin + */ +class Compiler implements PikeVMOpcodes { + private final static CharacterMatcher regularCharacter = + CharacterMatcher.parse("[^\\\\.*+?|\\[\\]{}()^$]"); + + private static class Output { + private int[] program; + private int offset; + private int groupCount = -1; + private int findPreambleSize; + private ArrayList classes; + private ArrayList lookarounds; + + public Output(Expression expr) { + // try-run to determine the code size + expr.writeCode(this); + program = new int[offset]; + offset = 0; + groupCount = -1; + classes = new ArrayList(); + lookarounds = new ArrayList(); + // write it out! 
+ expr.writeCode(this); + } + + public void add(int opcode) { + if (program != null) { + program[offset] = opcode; + } + offset++; + } + + public int markJump() { + return offset++; + } + + public void setJump(int mark) { + if (program != null) { + program[mark] = offset; + } + } + + public void markFindPreambleEnd() { + findPreambleSize = offset; + } + + public PikeVM toVM() { + CharacterMatcher[] classes = new CharacterMatcher[this.classes.size()]; + this.classes.toArray(classes); + PikeVM[] lookarounds = new PikeVM[this.lookarounds.size()]; + this.lookarounds.toArray(lookarounds); + return new PikeVM(program, findPreambleSize, groupCount, classes, + lookarounds); + } + + public int addClass(CharacterMatcher characterClass) { + if (program == null) { + return -1; + } + int result = classes.size(); + classes.add(characterClass); + return result; + } + + public int addLookaround(PikeVM lookaround) { + if (program == null) { + return -1; + } + int result = lookarounds.size(); + lookarounds.add(lookaround); + return result; + } + } + + private abstract class Expression { + protected abstract void writeCode(Output output); + } + + private class CharacterRange extends Expression { + private final CharacterMatcher characterClass; + + public CharacterRange(CharacterMatcher characterClass) { + this.characterClass = characterClass; + } + + protected void writeCode(Output output) { + output.add(CHARACTER_CLASS); + output.add(output.addClass(characterClass)); + } + + public String toString() { + return characterClass.toString(); + } + } + + private class Repeat extends Expression { + private Expression expr; + private int minCount, maxCount; + private boolean greedy; + + public Repeat(Expression expr, int minCount, int maxCount, boolean greedy) { + if (minCount < 0) { + throw new RuntimeException("Unexpected min count: " + minCount); + } + if (maxCount != -1) { + if (maxCount == 0) { + throw new RuntimeException("Unexpected max count: " + maxCount); + } + if (minCount > 
maxCount) { + throw new RuntimeException("Unexpected range: " + minCount + ", " + maxCount); + } + } + this.expr = expr; + this.minCount = minCount; + this.maxCount = maxCount; + this.greedy = greedy; + } + + protected void writeCode(Output output) { + int start = output.offset; + int splitJmp = greedy ? SPLIT_JMP : SPLIT; + int split = greedy ? SPLIT : SPLIT_JMP; + for (int i = 1; i < minCount; ++ i) { + expr.writeCode(output); + } + if (maxCount == -1) { + if (minCount > 0) { + int jump = output.offset; + expr.writeCode(output); + output.add(splitJmp); + output.add(jump); + } else { + output.add(split); + int jump = output.markJump(); + expr.writeCode(output); + output.add(splitJmp); + output.add(start + 2); + output.setJump(jump); + } + } else { + if (minCount > 0) { + expr.writeCode(output); + } + if (maxCount > minCount) { + int[] jumps = new int[maxCount - minCount]; + for (int i = 0; i < jumps.length; ++ i) { + output.add(split); + jumps[i] = output.markJump(); + expr.writeCode(output); + } + for (int jump : jumps) { + output.setJump(jump); + } + } + } + } + + public String toString() { + String qualifier = greedy ? "" : "?"; + if (minCount == 0 && maxCount < 2) { + return expr.toString() + (minCount < 0 ? "*" : "?") + qualifier; + } + if (minCount == 1 && maxCount < 0) { + return expr.toString() + "+" + qualifier; + } + return expr.toString() + "{" + minCount + "," + + (maxCount < 0 ? 
"" : "" + maxCount) + "}" + qualifier; + } + } + + private class Group extends Expression { + private final boolean capturing; + + private ArrayList list = new ArrayList(); + private ArrayList alternatives; + + public Group(boolean capturing, ArrayList initialList) { + this.capturing = capturing; + if (initialList != null) { + list.addAll(initialList); + } + } + + public void push(Expression expr) { + list.add(expr); + } + + public void push(final int c) { + push(new Expression() { + public void writeCode(Output output) { + output.add(c); + } + + public String toString() { + if (c >= 0) { + return "" + (char)c; + } + switch (c) { + case DOT: + return "."; + case WORD_BOUNDARY: + return "\\b"; + case NON_WORD_BOUNDARY: + return "\\B"; + case LINE_START: + return "^"; + case LINE_END: + return "$"; + default: + throw new RuntimeException("Unhandled opcode: " + c); + } + } + }); + } + + public void startAlternative() { + if (alternatives == null) { + alternatives = new ArrayList(); + } + alternatives.add(new Group(false, list)); + list.clear(); + } + + public Expression pop() { + Expression result = list.remove(list.size() - 1); + return result; + } + + protected void writeCode(Output output) { + int groupIndex = -1; + if (capturing) { + groupIndex = ++ output.groupCount; + output.add(SAVE_OFFSET); + output.add(2 * groupIndex); + } + int[] jumps = null; + if (alternatives != null) { + jumps = new int[alternatives.size()]; + int i = 0; + for (Group alternative : alternatives) { + output.add(SPLIT); + int jump = output.markJump(); + alternative.writeCode(output); + output.add(JMP); + jumps[i++] = output.markJump(); + output.setJump(jump); + } + } + for (Expression expr : list) { + expr.writeCode(output); + } + if (jumps != null) { + for (int jump : jumps) { + output.setJump(jump); + } + } + if (capturing) { + output.add(SAVE_OFFSET); + output.add(2 * groupIndex + 1); + } + } + + public String toString() { + StringBuilder builder = new StringBuilder(); + if (alternatives 
!= null || list.size() > 1) { + builder.append('('); + if (!capturing) { + builder.append("?:"); + } + } + if (alternatives != null) { + for (Group alternative : alternatives) { + builder.append(alternative).append('|'); + } + } + for (Expression expr : list) { + builder.append(expr); + } + if (alternatives != null || list.size() > 1) { + builder.append(')'); + } + return builder.toString(); + } + } + + private class Lookaround extends Expression { + private final Group group = new Group(false, null); + private final boolean forward, negative; + + public Lookaround(boolean forward, boolean negative) { + this.forward = forward; + this.negative = negative; + } + + @Override + protected void writeCode(Output output) { + PikeVM vm = new Output(group).toVM(); + if (!forward) { + vm.reverse(); + } + output.add(forward ? + (negative ? NEGATIVE_LOOKAHEAD : LOOKAHEAD) : + (negative ? NEGATIVE_LOOKAHEAD : LOOKBEHIND)); + output.add(output.addLookaround(vm)); + } + + public String toString() { + String inner = group.toString(); + if (inner.startsWith("(?:")) { + inner = inner.substring(3); + } else { + inner += ")"; + } + return "(?=" + inner; + } + } + + private class Group0 extends Expression { + private final Group group; + + public Group0() { + group = new Group(true, null); + } + + public void writeCode(Output output) { + // find() preamble + int start = output.offset; + output.add(SPLIT_JMP); + output.add(start + 5); + output.add(DOTALL); + output.add(SPLIT); + output.add(start + 2); + output.markFindPreambleEnd(); + group.writeCode(output); + } + + public String toString() { + String inner = group.toString(); + return inner.startsWith("(?:") && inner.endsWith(")") ? 
+ inner.substring(1, inner.length() - 1) : inner; + } + } + + private Group0 root; + private Stack groups; + + public Compiler() { + root = new Group0(); + groups = new Stack(); + groups.add(root.group); + } + + public Pattern compile(String regex) { + char[] array = regex.toCharArray(); + CharacterMatcher.Parser characterClassParser = + new CharacterMatcher.Parser(array); + for (int index = 0; index < array.length; ++ index) { + char c = array[index]; + Group current = groups.peek(); + if (regularCharacter.matches(c)) { + current.push(c); + continue; + } + switch (c) { + case '.': + current.push(DOT); + continue; + case '\\': + int unescaped = characterClassParser.parseEscapedCharacter(index + 1); + if (unescaped >= 0) { + index = characterClassParser.getEndOffset() - 1; + current.push((char)unescaped); + continue; + } + CharacterMatcher characterClass = characterClassParser.parseClass(index); + if (characterClass != null) { + index = characterClassParser.getEndOffset() - 1; + current.push(new CharacterRange(characterClass)); + continue; + } + switch (array[index + 1]) { + case 'b': + index++; + current.push(WORD_BOUNDARY); + continue; + case 'B': + index++; + current.push(NON_WORD_BOUNDARY); + continue; + } + throw new RuntimeException("Parse error @" + index + ": " + regex); + case '?': + case '*': + case '+': { + boolean greedy = true; + if (index + 1 < array.length && array[index + 1] == '?') { + greedy = false; + ++ index; + } + current.push(new Repeat(current.pop(), + c == '+' ? 1 : 0, c == '?' ? 1 : -1, greedy)); + continue; + } + case '{': { + ++ index; + int length = characterClassParser.digits(index, 8, 10); + int min = Integer.parseInt(regex.substring(index, index + length)); + int max = min; + index += length - 1; + c = index + 1 < array.length ? array[index + 1] : 0; + if (c == ',') { + ++ index; + length = characterClassParser.digits(index + 1, 8, 10); + max = length == 0 ? 
-1 : + Integer.parseInt(regex.substring(index + 1, index + 1 + length)); + index += length; + c = index + 1< array.length ? array[index + 1] : 0; + } + if (c != '}') { + throw new RuntimeException("Invalid quantifier @" + index + ": " + + regex); + } + ++ index; + boolean greedy = true; + if (index + 1 < array.length && array[index + 1] == '?') { + ++ index; + greedy = false; + } + current.push(new Repeat(current.pop(), min, max, greedy)); + continue; + } + case '(': { + boolean capturing = true; + if (index + 1 < array.length && array[index + 1] == '?') { + index += 2; + if (index >= array.length) { + throw new RuntimeException("Short pattern @" + index + ": " + + regex); + } + c = array[index]; + boolean lookAhead = true; + if (c == '<') { + if (++ index >= array.length) { + throw new RuntimeException("Short pattern @" + index + ": " + + regex); + } + lookAhead = false; + c = array[index]; + if (c != '=' && c != '!') { + throw new IllegalArgumentException("Named groups not supported @" + + index + ": " + regex); + } + } + switch (c) { + case ':': + capturing = false; + break; + case '!': + case '=': { + capturing = false; + Lookaround lookaround = new Lookaround(lookAhead, c == '!'); + current.push(lookaround); + groups.push(lookaround.group); + continue; + } + default: + throw new UnsupportedOperationException("Not yet supported: " + + regex.substring(index)); + } + } + current.push(groups.push(new Group(capturing, null))); + continue; + } + case ')': + if (groups.size() < 2) { + throw new RuntimeException("Invalid group close @" + index + ": " + + regex); + } + groups.pop(); + continue; + case '[': { + CharacterMatcher matcher = characterClassParser.parseClass(index); + if (matcher == null) { + throw new RuntimeException("Invalid range @" + index + ": " + regex); + } + current.push(new CharacterRange(matcher)); + index = characterClassParser.getEndOffset() - 1; + continue; + } + case '|': + current.startAlternative(); + continue; + case '^': + 
current.push(LINE_START); + continue; + case '$': + current.push(LINE_END); + continue; + default: + throw new RuntimeException("Parse error @" + index + ": " + regex); + } + } + if (groups.size() != 1) { + throw new IllegalArgumentException("Unclosed groups: (" + + (groups.size() - 1) + "): " + regex); + } + PikeVM vm = new Output(root).toVM(); + String plain = vm.isPlainString(); + if (plain != null) { + return new TrivialPattern(regex, plain, 0); + } + return new RegexPattern(regex, 0, vm); + } +} diff --git a/classpath/java/util/regex/Matcher.java b/classpath/java/util/regex/Matcher.java index 4397931fdf..89f8306cc5 100644 --- a/classpath/java/util/regex/Matcher.java +++ b/classpath/java/util/regex/Matcher.java @@ -15,27 +15,23 @@ package java.util.regex; * * @author zsombor and others */ -public class Matcher { - private final Pattern pattern; - private CharSequence input; - private int start; - private int end; +public abstract class Matcher { + protected CharSequence input; + protected int start; + protected int end; - Matcher(Pattern pattern, CharSequence input) { - this.pattern = pattern; - this.input = input; + public Matcher(CharSequence input) { + reset(input); } - public boolean matches() { - if (pattern.pattern().equals(input.toString())) { - start = 0; - end = input.length(); - return true; - } else { - return false; - } + public abstract boolean matches(); + + public boolean find() { + return find(end); } + public abstract boolean find(int start); + public Matcher reset() { return reset(input); } @@ -47,10 +43,6 @@ public class Matcher { return this; } - public int start() { - return start; - } - public String replaceAll(String replacement) { return replace(replacement, Integer.MAX_VALUE); } @@ -59,7 +51,7 @@ public class Matcher { return replace(replacement, 1); } - private String replace(String replacement, int limit) { + protected String replace(String replacement, int limit) { reset(); StringBuilder sb = null; @@ -88,23 +80,40 @@ public class 
Matcher { return sb.toString(); } + public int start() { + return start; + } + public int end() { return end; } - public boolean find() { - return find(end); + public String group() { + return input.subSequence(start, end).toString(); } - public boolean find(int start) { - String p = pattern.pattern(); - int i = Pattern.indexOf(input, p, start); - if (i >= 0) { - this.start = i; - this.end = i + p.length(); - return true; - } else { - return false; + public int start(int group) { + if (group == 0) { + return start(); } + throw new UnsupportedOperationException(); + } + + public int end(int group) { + if (group == 0) { + return end(); + } + throw new UnsupportedOperationException(); + } + + public String group(int group) { + if (group == 0) { + return group(); + } + throw new UnsupportedOperationException(); + } + + public int groupCount() { + return 0; } } diff --git a/classpath/java/util/regex/Pattern.java b/classpath/java/util/regex/Pattern.java index b9c84eb6f3..be63b73e29 100644 --- a/classpath/java/util/regex/Pattern.java +++ b/classpath/java/util/regex/Pattern.java @@ -10,9 +10,8 @@ package java.util.regex; -import java.util.Iterator; +import java.util.ArrayList; import java.util.List; -import java.util.LinkedList; /** * This is a work in progress. 
@@ -20,7 +19,7 @@ import java.util.LinkedList; * @author zsombor and others * */ -public class Pattern { +public abstract class Pattern implements PikeVMOpcodes { public static final int UNIX_LINES = 1; public static final int CASE_INSENSITIVE = 2; @@ -35,112 +34,26 @@ public class Pattern { private final String pattern; protected Pattern(String pattern, int flags) { - this.pattern = trivial(pattern); + this.pattern = pattern; this.patternFlags = flags; } - private static String trivial(String pattern) { - StringBuffer buffer = new StringBuffer(); - for (int i = 0; i < pattern.length(); ++i) { - char c = pattern.charAt(i); - switch (c) { - case '\\': - if (++i == pattern.length()) { - break; - } - c = pattern.charAt(i); - if (c == '0') { - int len = digits(pattern, ++i, 3, 8); - if (len == 3 && pattern.charAt(i) > '3') { - --len; - } - c = (char)Integer.parseInt(pattern.substring(i, i + len), 8); - i += len - 1; - } else if (c == 'x' || c == 'u') { - int len = digits(pattern, ++i, 4, 16); - c = (char)Integer.parseInt(pattern.substring(i, i + len), 16); - i += len - 1; - } else { - c = unescape(pattern.charAt(i)); - } - if (c != -1) { - break; - } - // fallthru - case '.': - case '*': - case '+': - case '?': - case '|': - case '[': - case ']': - case '{': - case '}': - case '(': - case ')': - case '^': - case '$': - throw new UnsupportedOperationException - ("only trivial regular expressions are supported so far (" + pattern + ")"); - } - buffer.append(c); - } - return buffer.toString(); - } - - private static int digits(String s, int offset, int maxLength, int base) { - for (int i = 0; ; ++i) { - if (i == maxLength || offset + i >= s.length()) { - return i; - } - int value = s.charAt(offset + i) - '0'; - if (value < 0) { - return i; - } - if (base > 10 && value >= 10) { - value += 10 - (value >= 'a' - '0' ? 
'a' - '0' : 'A' - '0'); - } - if (value >= base) { - return i; - } - } - } - - private static char unescape(char c) { - switch (c) { - case '\\': - return c; - case 'a': - return 0x0007; - case 'e': - return 0x001B; - case 'f': - return 0x000C; - case 'n': - return 0x000A; - case 'r': - return 0x000D; - case 't': - return 0x0009; - } - return (char)-1; - } - public static Pattern compile(String regex) { - return new Pattern(regex, 0); + return compile(regex, 0); } public static Pattern compile(String regex, int flags) { - return new Pattern(regex, flags); + if (flags != 0) { + throw new UnsupportedOperationException("TODO"); + } + return new Compiler().compile(regex); } public int flags() { return patternFlags; } - public Matcher matcher(CharSequence input) { - return new Matcher(this, input); - } + public abstract Matcher matcher(CharSequence input); public static boolean matches(String regex, CharSequence input) { return Pattern.compile(regex).matcher(input).matches(); @@ -155,79 +68,22 @@ public class Pattern { } public String[] split(CharSequence input, int limit) { - boolean strip; - if (limit < 0) { - strip = false; + if (limit <= 0) { limit = Integer.MAX_VALUE; - } else if (limit == 0) { - strip = true; - limit = Integer.MAX_VALUE; - } else { - strip = false; } - - List list = new LinkedList(); - int index = 0; - int trailing = 0; - int patternLength = pattern.length(); - while (index < input.length() && list.size() < limit - 1) { - int i; - if (patternLength == 0) { - if (list.size() == 0) { - i = 0; - } else { - i = index + 1; - } - } else { - i = indexOf(input, pattern, index); - } - - if (i >= 0) { - if (patternLength != 0 && i == index) { - ++ trailing; - } else { - trailing = 0; - } - - list.add(input.subSequence(index, i)); - index = i + patternLength; - } else { + Matcher matcher = matcher(input); + List result = new ArrayList(); + int offset = 0; + for (;;) { + if (result.size() >= limit || !matcher.find()) { break; } + 
result.add(input.subSequence(offset, matcher.start()).toString()); + offset = matcher.end(); } - - if (strip && index > 0 && index == input.length()) { - ++ trailing; - } else { - trailing = 0; + if (offset == 0 || offset < input.length()) { + result.add(input.subSequence(offset, input.length()).toString()); } - list.add(input.subSequence(index, input.length())); - - String[] result = new String[list.size() - trailing]; - int i = 0; - for (Iterator it = list.iterator(); - it.hasNext() && i < result.length; ++ i) - { - result[i] = it.next().toString(); - } - return result; - } - - static int indexOf(CharSequence haystack, CharSequence needle, int start) { - if (needle.length() == 0) return start; - - for (int i = start; i < haystack.length() - needle.length() + 1; ++i) { - int j = 0; - for (; j < needle.length(); ++j) { - if (haystack.charAt(i + j) != needle.charAt(j)) { - break; - } - } - if (j == needle.length()) { - return i; - } - } - - return -1; + return result.toArray(new String[result.size()]); } } diff --git a/classpath/java/util/regex/PikeVM.java b/classpath/java/util/regex/PikeVM.java new file mode 100644 index 0000000000..d34ef068a3 --- /dev/null +++ b/classpath/java/util/regex/PikeVM.java @@ -0,0 +1,629 @@ +/* Copyright (c) 2008-2013, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all copies. + + There is NO WARRANTY for this software. See license.txt for + details. */ + +package java.util.regex; + +/** + * A minimal implementation of a regular expression engine. + * + * @author Johannes Schindelin + */ +class PikeVM implements PikeVMOpcodes { + private final int[] program; + private final int groupCount; + private final int offsetsCount; + /* + * For find(), we do not want to anchor the match at the start offset. 
Our + * compiler allows this by prefixing the code with an implicit '(?:.*?)'. For + * regular matches() calls, we want to skip that code and start at {@code + * findPrefixLength} instead. + */ + private final int findPrefixLength; + private final CharacterMatcher[] classes; + private final PikeVM[] lookarounds; + private final static CharacterMatcher wordCharacter = + CharacterMatcher.parse("\\w"); + private final static CharacterMatcher lineTerminator = + CharacterMatcher.parse("[\n\r\u0085\u2028\u2029]"); + private boolean multiLine; + + public interface Result { + void set(int[] start, int[] end); + } + + protected PikeVM(int[] program, int findPrefixLength, int groupCount, + CharacterMatcher[] classes, PikeVM[] lookarounds) + { + this.program = program; + this.findPrefixLength = findPrefixLength; + this.groupCount = groupCount; + offsetsCount = 2 * groupCount + 2; + this.classes = classes; + this.lookarounds = lookarounds; + } + + /** + * The current thread states. + *

+ * The threads are identified by their program counter. The rationale: as all + * threads are executed in lock-step, i.e. for the same character in the + * string to be matched, it does not make sense for two threads to be at the + * same program counter -- they would both do exactly the same for the rest of + * the execution. + *

+ *

+ * For efficiency, the threads are kept in a linked list that actually lives + * in an array indexed by the program counter, pointing to the next thread's + * program counter, in the order of high to low priority. + *

+ *

+ * Program counters which have no associated thread are marked as -1. + * The program counter associated with the least-priority thread (the last one + * in the linked list) is marked as -2 to be able to tell it apart from + * unscheduled threads. + *

+ *

+ * We actually never need to have an explicit value for the priority; the + * ordering is sufficient: whenever a new thread is to be scheduled and it is + * found to be scheduled already, it was already scheduled by a + * higher-priority thread. + *

+   */
+  private class ThreadQueue {
+    // program counters of the first and of the last thread in the list; both
+    // are -1 while the queue is empty
+    private int head, tail;
+    // next[pc] is 1 + the next thread's pc; 0 means that no successor is
+    // recorded for pc (pc is then either the tail or not scheduled at all)
+    private int[] next;
+    // offsets[pc][2 * group] is 1 + start offset, offsets[pc][2 * group + 1]
+    // is 1 + end offset (see setResult()); a stored 0 reads back as -1
+    private int[][] offsets;
+
+    /**
+     * Creates an empty queue with one slot per program counter, including
+     * {@code program.length}.
+     */
+    public ThreadQueue() {
+      head = tail = -1;
+      next = new int[program.length + 1];
+      offsets = new int[program.length + 1][];
+    }
+
+    /**
+     * Creates a queue holding a single thread at {@code startPC} that owns a
+     * fresh offsets array of {@code offsetsCount} entries.
+     *
+     * @param startPC
+     *          the program counter of the initial thread
+     */
+    public ThreadQueue(int startPC) {
+      head = tail = startPC;
+      next = new int[program.length + 1];
+      offsets = new int[program.length + 1][];
+      offsets[head] = new int[offsetsCount];
+    }
+
+    /**
+     * Moves the first thread of this queue to the end of {@code into}.
+     * <p>
+     * Threads whose program counter is already scheduled in {@code into} are
+     * removed from this queue without being moved; the search then continues
+     * with the next thread.
+     * </p>
+     *
+     * @param into
+     *          the queue to append the thread to
+     * @return the program counter of the moved thread, or -1 if this queue
+     *         ran out of threads
+     */
+    public int queueOneImmediately(ThreadQueue into) {
+      for (;;) {
+        if (head < 0) {
+          return -1;
+        }
+        // hand the head over first: queueNext() still needs offsets[head]
+        boolean wasQueued = queueNext(head, head, into);
+        int pc = head;
+        // unlink the head from this queue, whether or not it was accepted
+        if (head == tail) {
+          head = tail = -1;
+        } else {
+          head = next[pc] - 1;
+          next[pc] = 0;
+        }
+        offsets[pc] = null;
+        if (wasQueued) {
+          into.tail = pc;
+          return pc;
+        }
+      }
+    }
+
+    /**
+     * Schedules the instruction at {@code nextPC} to be executed immediately.
+     * <p>
+     * For non-matching steps (SPLIT, SAVE_OFFSET, etc) we need to schedule the
+     * corresponding program counter(s) to be handled right after this opcode,
+     * before advancing to the next character.
+     * </p>
+     * <p>
+     * To achieve this, we insert the program counter to-be-scheduled in the
+     * linked thread list at the current position, but only if it has not been
+     * scheduled yet: if it has, a higher-priority thread already reached that
+     * state.
+     * </p>
+     * <p>
+     * In contrast to {@link #queueNext(int, int, ThreadQueue)}, this method
+     * works on the current step's thread list.
+     * </p>
+     *
+     * @param currentPC
+     *          the current program counter
+     * @param nextPC
+     *          the program counter to schedule
+     * @param copyThreadState
+     *          whether to spawn off a new thread, i.e. whether the scheduled
+     *          thread gets its own copy of the current thread's offsets
+     *          instead of sharing the array
+     * @return whether the step was queued (i.e. no thread was queued for the
+     *         same {@code nextPC} already)
+     */
+    public boolean queueImmediately(int currentPC, int nextPC,
+        boolean copyThreadState) {
+      if (isScheduled(nextPC)) {
+        return false;
+      }
+      int[] offsets = this.offsets[currentPC];
+      if (copyThreadState) {
+        offsets = java.util.Arrays.copyOf(offsets, offsetsCount);
+      }
+      // splice nextPC into the list directly behind currentPC
+      if (currentPC == tail) {
+        tail = nextPC;
+      } else {
+        next[nextPC] = next[currentPC];
+      }
+      this.offsets[nextPC] = offsets;
+      next[currentPC] = nextPC + 1;
+      return true;
+    }
+
+    /**
+     * Schedules the instruction at {@code nextPC} to be executed in the next
+     * step.
+     * <p>
+     * This method advances the current thread to the next program counter, to
+     * be executed after reading the next character.
+     * </p>
+     *
+     * @param currentPC
+     *          the current program counter
+     * @param nextPC
+     *          the program counter to schedule
+     * @param next
+     *          the thread state of the next step
+     * @return whether the step was queued (i.e. no thread was queued for the
+     *         same {@code nextPC} already)
+     */
+    private boolean queueNext(int currentPC, int nextPC, ThreadQueue next) {
+      if (next.tail < 0) {
+        next.head = nextPC;
+      } else if (next.isScheduled(nextPC)) {
+        return false;
+      } else {
+        next.next[next.tail] = nextPC + 1;
+      }
+      // the offsets array is handed over as is, not copied
+      next.offsets[nextPC] = offsets[currentPC];
+      next.tail = nextPC;
+      return true;
+    }
+
+    /**
+     * Records {@code offset} in slot {@code index} of the offsets of the
+     * thread at {@code pc} (stored as {@code offset + 1}).
+     */
+    public void saveOffset(int pc, int index, int offset) {
+      offsets[pc][index] = offset + 1;
+    }
+
+    /**
+     * Reports the group offsets of the thread at {@code program.length} to
+     * {@code result}.
+     *
+     * @param result
+     *          the receiver of the groups' start and end offsets
+     */
+    public void setResult(Result result) {
+      // copy offsets
+      int[] offsets = this.offsets[program.length];
+      int[] groupStart = new int[groupCount + 1];
+      int[] groupEnd = new int[groupCount + 1];
+      for (int j = 0; j <= groupCount; ++j) {
+        // undo the +1 bias applied by saveOffset(); unset slots become -1
+        groupStart[j] = offsets[2 * j] - 1;
+        groupEnd[j] = offsets[2 * j + 1] - 1;
+      }
+      result.set(groupStart, groupEnd);
+    }
+
+    /**
+     * Removes every thread whose group 0 does not start at {@code start}.
+     *
+     * @param start
+     *          the start offset that the remaining threads must share
+     */
+    private void mustStartMatchAt(int start) {
+      int previous = -1;
+      for (int pc = head; pc >= 0; ) {
+        int nextPC = next[pc] - 1;
+        if (start + 1 == offsets[pc][0]) {
+          previous = pc;
+        } else {
+          next[pc] = 0;
+          offsets[pc] = null;
+          // NOTE(review): dropping the tail empties the whole queue, even if
+          // earlier threads were kept; presumably a kept thread never
+          // precedes a dropped tail -- confirm
+          if (pc == tail) {
+            head = tail = -1;
+          } else if (previous < 0) {
+            head = nextPC;
+          } else {
+            next[previous] = 1 + nextPC;
+          }
+        }
+        pc = nextPC;
+      }
+    }
+
+    /** Returns the start offset of group 0 of the thread at {@code pc}. */
+    private int startOffset(int pc) {
+      return offsets[pc][0] - 1;
+    }
+
+    /** Returns whether no thread is queued. */
+    public boolean isEmpty() {
+      return head < 0;
+    }
+
+    /**
+     * Returns whether {@code pc} is the tail of the list or has a successor
+     * recorded.
+     */
+    public boolean isScheduled(int pc) {
+      return pc == tail || next[pc] > 0;
+    }
+
+    /**
+     * Returns the program counter following {@code pc} in the list.
+     *
+     * @param pc
+     *          a program counter, or a negative value to ask for the head
+     * @return the following program counter, or -1 if no successor is
+     *         recorded
+     */
+    public int next(int pc) {
+      return pc < 0 ? head : next[pc] - 1;
+    }
+
+    /** Unlinks all threads, drops their offsets and empties the queue. */
+    public void clean() {
+      for (int pc = head; pc >= 0; ) {
+        int nextPC = next[pc] - 1;
+        next[pc] = 0;
+        offsets[pc] = null;
+        pc = nextPC;
+      }
+      head = tail = -1;
+    }
+  }
+
+  /**
+   * Executes the Pike VM defined by the program.
+   *

+ * The idea is to execute threads in parallel, at each step executing them + * from the highest priority thread to the lowest one. In contrast to most + * regular expression engines, the Thompson/Pike one gets away with linear + * complexity because the string is matched from left to right, at each step + * executing a number of threads bounded by the length of the program: if two + * threads would execute at the same instruction pointer of the program, we + * need only consider the higher-priority one. + *

+ *

+ * This implementation is based on the description of Russ Cox. + *

+   *
+   * @param characters
+   *          the characters of the input to match against
+   * @param start
+   *          the start offset where to match
+   * @param end
+   *          the end offset; if it is not greater than {@code start}, the
+   *          input is traversed backwards
+   * @param anchorStart
+   *          whether the match must start at {@code start}
+   * @param anchorEnd
+   *          whether the match must end at {@code end}
+   * @param result
+   *          the {@link Result} to store the groups' offsets in, if
+   *          successful; may be {@code null} if only the verdict is needed
+   * @return whether a match was found
+   */
+  public boolean matches(char[] characters, int start, int end,
+      boolean anchorStart, boolean anchorEnd, Result result)
+  {
+    ThreadQueue current = new ThreadQueue();
+    ThreadQueue next = new ThreadQueue();
+
+    // initialize the first thread; an anchored match skips the first
+    // findPrefixLength instructions of the program
+    int startPC = anchorStart ? findPrefixLength : 0;
+    ThreadQueue queued = new ThreadQueue(startPC);
+
+    boolean foundMatch = false;
+    // walk the input backwards if end is not greater than start
+    int step = end > start ? +1 : -1;
+    for (int i = start; i != end + step; i += step) {
+      if (queued.isEmpty()) {
+        // no threads left
+        return foundMatch;
+      }
+
+      // the last step (i == end) runs with c == 0 instead of an input
+      // character
+      char c = i != end ? characters[i] : 0;
+      int pc = -1;
+      for (;;) {
+        // follow the current step's thread list; once it is exhausted, pull
+        // the next pending thread over from the queued ones
+        pc = current.next(pc);
+        if (pc < 0) {
+          pc = queued.queueOneImmediately(current);
+        }
+        if (pc < 0) {
+          break;
+        }
+
+        // pc == program.length is a match!
+        if (pc == program.length) {
+          if (anchorEnd && i != end) {
+            // not at the end yet: this thread does not count, go on with
+            // the next one
+            continue;
+          }
+          if (result == null) {
+            // only interested in a match, no need to go on
+            return true;
+          }
+          current.setResult(result);
+
+          // now that we found a match, even higher-priority matches must match
+          // at the same start offset
+          if (!anchorStart) {
+            next.mustStartMatchAt(current.startOffset(pc));
+          }
+          foundMatch = true;
+          // the remaining threads of this step are not executed
+          break;
+        }
+
+        int opcode = program[pc];
+        switch (opcode) {
+        case DOT:
+          if (c != '\0' && c != '\r' && c != '\n') {
+            current.queueNext(pc, pc + 1, next);
+          }
+          break;
+        case DOTALL:
+          current.queueNext(pc, pc + 1, next);
+          break;
+        case WORD_BOUNDARY:
+        case NON_WORD_BOUNDARY: {
+          // c2 is the character visited in the previous step, or -1 if there
+          // is none
+          int i2 = i - step;
+          int c2 = i2 < 0 || i2 >= characters.length ?
+            -1 : characters[i2];
+          switch (opcode) {
+          case WORD_BOUNDARY:
+            if ((c2 < 0 || !wordCharacter.matches((char)c2))) {
+              if (wordCharacter.matches(c)) {
+                current.queueImmediately(pc, pc + 1, false);
+              }
+            } else if (i >= 0 && i < characters.length &&
+                !wordCharacter.matches(c)) {
+              current.queueImmediately(pc, pc + 1, false);
+            }
+            break;
+          case NON_WORD_BOUNDARY:
+            if ((c2 < 0 || !wordCharacter.matches((char)c2))) {
+              if (i >= 0 && i < characters.length &&
+                  !wordCharacter.matches(c)) {
+                current.queueImmediately(pc, pc + 1, false);
+              }
+            } else if (wordCharacter.matches(c)) {
+              current.queueImmediately(pc, pc + 1, false);
+            }
+            break;
+          }
+          break;
+        }
+        case LINE_START:
+          if (i == 0 || (multiLine &&
+              lineTerminator.matches(characters[i - 1]))) {
+            current.queueImmediately(pc, pc + 1, false);
+          }
+          break;
+        case LINE_END:
+          if (i == characters.length || (multiLine &&
+              lineTerminator.matches(c))) {
+            current.queueImmediately(pc, pc + 1, false);
+          }
+          break;
+        case CHARACTER_CLASS:
+          // the operand is an index into the classes table
+          if (classes[program[pc + 1]].matches(c)) {
+            current.queueNext(pc, pc + 2, next);
+          }
+          break;
+        case LOOKAHEAD:
+          // look-arounds run a separate machine, anchored at the current
+          // offset: look-aheads forwards to the end of the input,
+          // look-behinds backwards from i - 1
+          if (lookarounds[program[pc + 1]].matches(characters,
+              i, characters.length, true, false, null)) {
+            current.queueImmediately(pc, pc + 2, false);
+          }
+          break;
+        case LOOKBEHIND:
+          if (lookarounds[program[pc + 1]].matches(characters,
+              i - 1, -1, true, false, null)) {
+            current.queueImmediately(pc, pc + 2, false);
+          }
+          break;
+        case NEGATIVE_LOOKAHEAD:
+          if (!lookarounds[program[pc + 1]].matches(characters,
+              i, characters.length, true, false, null)) {
+            current.queueImmediately(pc, pc + 2, false);
+          }
+          break;
+        case NEGATIVE_LOOKBEHIND:
+          if (!lookarounds[program[pc + 1]].matches(characters,
+              i - 1, -1, true, false, null)) {
+            current.queueImmediately(pc, pc + 2, false);
+          }
+          break;
+        /* immediate opcodes, i.e. thread continues within the same step */
+        case SAVE_OFFSET:
+          // offsets are only tracked if the caller asked for a result
+          if (result != null) {
+            int index = program[pc + 1];
+            current.saveOffset(pc, index, i);
+          }
+          current.queueImmediately(pc, pc + 2, false);
+          break;
+        case SPLIT:
+          // both targets are inserted directly behind pc, so the one queued
+          // last is executed first
+          current.queueImmediately(pc, program[pc + 1], true);
+          current.queueImmediately(pc, pc + 2, false);
+          break;
+        case SPLIT_JMP:
+          current.queueImmediately(pc, pc + 2, true);
+          current.queueImmediately(pc, program[pc + 1], false);
+          break;
+        case JMP:
+          current.queueImmediately(pc, program[pc + 1], false);
+          break;
+        default:
+          // values in the range of char stand for that literal character
+          if (program[pc] >= 0 && program[pc] <= 0xffff) {
+            if (c == (char)program[pc]) {
+              current.queueNext(pc, pc + 1, next);
+            }
+            break;
+          }
+          throw new RuntimeException("Invalid opcode: " + opcode
+            + " at pc " + pc);
+        }
+      }
+      // clean linked thread list (and states)
+      current.clean();
+
+      // prepare for next step
+      ThreadQueue swap = queued;
+      queued = next;
+      next = swap;
+    }
+    return foundMatch;
+  }
+
+  /**
+   * Determines whether this machine recognizes a pattern without special
+   * operators.
+   *

+ * In case that the regular expression is actually a plain string without any + * special operators, we can avoid using a full-blown Pike VM and instead fall + * back to using the much faster {@link TrivialPattern}. + *

+   *
+   * @return the string to match, or null if the machine recognizes a
+   *         non-trivial pattern
+   */
+  public String isPlainString() {
+    // the pattern proper is expected to sit behind the find preamble, framed
+    // by SAVE_OFFSET 0 and SAVE_OFFSET 1; peel those off where present
+    int from = findPrefixLength;
+    boolean opensGroup0 = from + 1 < program.length &&
+      program[from] == SAVE_OFFSET && program[from + 1] == 0;
+    if (opensGroup0) {
+      from += 2;
+    }
+    int to = program.length;
+    boolean closesGroup0 = to > from + 1 &&
+      program[to - 2] == SAVE_OFFSET && program[to - 1] == 1;
+    if (closesGroup0) {
+      to -= 2;
+    }
+
+    // whatever remains must consist of literal characters only; opcodes are
+    // negative
+    char[] literal = new char[to - from];
+    for (int pc = from; pc < to; ++ pc) {
+      int instruction = program[pc];
+      if (instruction < 0) {
+        return null;
+      }
+      literal[pc - from] = (char)instruction;
+    }
+    return new String(literal);
+  }
+
+  // number of program slots taken by the opcode: 2 for the opcodes between
+  // SINGLE_ARG_START and SINGLE_ARG_END (inclusive), 1 otherwise
+  private static int length(int opcode) {
+    if (opcode > SINGLE_ARG_START || opcode < SINGLE_ARG_END) {
+      return 1;
+    }
+    return 2;
+  }
+
+  // whether the opcode lies between JMP and SPLIT (inclusive)
+  private static boolean isJump(int opcode) {
+    return JMP <= opcode && opcode <= SPLIT;
+  }
+
+  /**
+   * Reverses the program (effectively matching the reverse pattern).
+   *

+ * It is a well-known fact that any regular expression can be reordered + * trivially into an equivalent regular expression to be applied in backward + * direction (coming in real handy for look-behind expressions). + *

+ *

+ * Example: instead of matching the sequence "aaaabb" with the pattern "a+b+", + * we can match the reverse sequence "bbaaaa" with the pattern "b+a+". + *

+ *

+ * One caveat: while the reverse pattern is equivalent in the sense that it + * matches if, and only if, the original pattern matches the forward + * direction, the same is not true for submatches. Consider the input "a" and + * the pattern "(a?)a?": when matching in forward direction the captured group + * is "a", while the backward direction will yield the empty string. For that + * reason, Java dictates that capturing groups in look-behind patterns are + * ignored. + *

+   */
+  public void reverse() {
+    // everything from findPrefixLength up to the end of the program is
+    // reversed; the instructions before that stay where they are
+    final int from = findPrefixLength;
+    final int to = program.length;
+    reverse(from, to);
+  }
+
+  /**
+   * Reverses a specific part of the program (to match in reverse direction).
+   *

+ * This is the work-horse of {@link #reverse()}. + *

+ *

+ * To visualize the process of reversing a program, let's look at it as a + * directed graph (each jump is represented by an "X", + * non-jumping steps are represented by "o"s, arrows show the + * direction of the flow, SPLITs spawn two arrows): + * + *

+   * o -> X -> X -> o -> X    o -> o
+   * ^    |     \         \___^____^
+   *  \__/       \____________|
+   * 
+ * + * The concept of reversing the program is most easily explained as follows: if + * we insert auxiliary nodes "Y" for jump targets, the graph looks + * like this instead: + * + *
+   * Y -> o -> X -> X -> o -> X    Y -> o -> Y -> o
+   * ^         |     \         \___^_________^
+   *  \_______/       \____________|
+   * 
+ * + * It is now obvious that reversing the program is equivalent to reversing all + * arrows, simply deleting all Xs and substituting each Y + * with a jump. Note that the reverse program will have the same number of + * JMPs, but they will not be associated with the same arrows: + * + *
+   * X <- o <- o    X <- o <- X <- o
+   * |    ^    ^____|________/
+   *  \__/ \_______/
+   * 
+ * + *

+   * @param start
+   *          start reversing the program with this instruction
+   * @param end
+   *          stop reversing at this instruction (this must be either an index
+   *          aligned exactly with an instruction, or exactly
+   *          {@code program.length}).
+   */
+  private void reverse(int start, int end) {
+    // Pass 1: build the list of jump targets
+    // newJumps chains, per target, all jumps pointing at it: newJumps[target]
+    // holds 1 + the pc of the jump seen last, and the operand slot of each
+    // jump (pc + 1) links to the one seen before; 0 ends the chain
+    int[] newJumps = new int[end + 1];
+    // brokenArrows[pc] marks the instruction directly behind a JMP, i.e. one
+    // that is not reached by falling through from its predecessor
+    boolean[] brokenArrows = new boolean[end + 1];
+    for (int pc = start; pc < end; pc += length(program[pc])) {
+      if (isJump(program[pc])) {
+        int target = program[pc + 1];
+        newJumps[pc + 1] = newJumps[target];
+        newJumps[target] = pc + 1;
+        if (program[pc] == JMP) {
+          brokenArrows[pc + 2] = true;
+        }
+      }
+    }
+
+    // Pass 2: determine mapped program counters
+    // the new program is laid out from the back: every jump that targets pc
+    // takes two slots there, and pc itself keeps its length unless it is a
+    // jump (the old jumps are not copied over)
+    int[] mapping = new int[end];
+    for (int pc = start, mappedPC = end; mappedPC > 0
+        && pc < end; pc += length(program[pc])) {
+      for (int jump = newJumps[pc]; jump > 0; jump = newJumps[jump]) {
+        mappedPC -= 2;
+      }
+      if (!isJump(program[pc])) {
+        mappedPC -= length(program[pc]);
+      }
+      mapping[pc] = mappedPC;
+    }
+
+    // Pass 3: write the new program
+    int[] reverse = new int[end];
+    for (int pc = start, mappedPC = end; mappedPC > 0;
+        pc += length(program[pc])) {
+      boolean brokenArrow = brokenArrows[pc];
+      for (int jump = newJumps[pc]; jump > 0; jump = newJumps[jump]) {
+        // jump - 1 is the pc of the old jump instruction; the new jump
+        // points at the slot that instruction was mapped to
+        reverse[--mappedPC] = mapping[jump - 1];
+        if (brokenArrow) {
+          // only the first jump written for a broken arrow becomes a JMP
+          reverse[--mappedPC] = JMP;
+          brokenArrow = false;
+        } else {
+          reverse[--mappedPC] =
+            program[jump - 1] == SPLIT_JMP ? SPLIT_JMP : SPLIT;
+        }
+      }
+      // pc == end is visited only for the jumps that target the end
+      if (pc == end) {
+        break;
+      }
+      if (!isJump(program[pc])) {
+        // copy the instruction, operand included, filling slots from the
+        // back
+        for (int i = length(program[pc]); i-- > 0; ) {
+          reverse[--mappedPC] = program[pc + i];
+        }
+      }
+    }
+    System.arraycopy(reverse, start, program, start, end - start);
+  }
+}
diff --git a/classpath/java/util/regex/PikeVMOpcodes.java b/classpath/java/util/regex/PikeVMOpcodes.java
new file mode 100644
index 0000000000..d932aec870
--- /dev/null
+++ b/classpath/java/util/regex/PikeVMOpcodes.java
@@ -0,0 +1,45 @@
+/* Copyright (c) 2008-2013, Avian Contributors
+
+   Permission to use, copy, modify, and/or distribute this software
+   for any purpose with or without fee is hereby granted, provided
+   that the above copyright notice and this permission notice appear
+   in all copies.
+
+   There is NO WARRANTY for this software. See license.txt for
+   details. */
+
+package java.util.regex;
+
+/**
+ * Opcodes for the Pike VM.
+ *

+ * See {@link PikeVM}. + *

+ *
+ * @author Johannes Schindelin
+ */
+interface PikeVMOpcodes {
+  int DOT = -1; // fields of an interface are implicitly public static final
+  int DOTALL = -2;
+
+  int WORD_BOUNDARY = -10;
+  int NON_WORD_BOUNDARY = -11;
+  int LINE_START = -12;
+  int LINE_END = -13;
+
+  int CHARACTER_CLASS = -20;
+
+  int LOOKAHEAD = -30;
+  int LOOKBEHIND = -31;
+  int NEGATIVE_LOOKAHEAD = -32;
+  int NEGATIVE_LOOKBEHIND = -33;
+
+  int SAVE_OFFSET = -40;
+
+  int SPLIT = -50;
+  int SPLIT_JMP = -51; // this split prefers to jump
+  int JMP = -52;
+
+  int SINGLE_ARG_START = CHARACTER_CLASS; // bounds of the opcodes that
+  int SINGLE_ARG_END = JMP;               // are followed by one operand
+}
diff --git a/classpath/java/util/regex/RegexMatcher.java b/classpath/java/util/regex/RegexMatcher.java
new file mode 100644
index 0000000000..145b15a704
--- /dev/null
+++ b/classpath/java/util/regex/RegexMatcher.java
@@ -0,0 +1,80 @@
+/* Copyright (c) 2008-2013, Avian Contributors
+
+   Permission to use, copy, modify, and/or distribute this software
+   for any purpose with or without fee is hereby granted, provided
+   that the above copyright notice and this permission notice appear
+   in all copies.
+
+   There is NO WARRANTY for this software. See license.txt for
+   details. */
+
+package java.util.regex;
+
+/**
+ * A minimal implementation of a regular expression matcher.
+ *
+ * @author Johannes Schindelin
+ */
+public class RegexMatcher extends Matcher {
+  private final PikeVM vm;
+  private char[] array;
+  int[] groupStart, groupEnd;
+
+  RegexMatcher(PikeVM vm, CharSequence string) {
+    super(string);
+    this.vm = vm;
+  }
+
+  // callback through which the Pike VM hands over the offsets of a match
+  private final PikeVM.Result adapter = new PikeVM.Result() {
+    public void set(int[] starts, int[] ends) {
+      RegexMatcher.this.start = starts[0];
+      RegexMatcher.this.end = ends[0];
+      RegexMatcher.this.groupStart = starts;
+      RegexMatcher.this.groupEnd = ends;
+    }
+  };
+
+  public Matcher reset() {
+    end = -1;
+    start = end;
+    return this;
+  }
+
+  public Matcher reset(CharSequence input) {
+    this.input = input;
+    array = input.toString().toCharArray();
+    return reset();
+  }
+
+  public boolean matches() {
+    return vm.matches(array, 0, array.length, true, true, adapter);
+  }
+
+  public boolean find() {
+    // start == end (empty match, or -1 after a reset): go one position on
+    return find(start == end ? end + 1 : end);
+  }
+
+  public boolean find(int offset) {
+    return vm.matches(array, offset, array.length, false, false, adapter);
+  }
+
+  public int start(int group) {
+    return groupStart[group];
+  }
+
+  public int end(int group) {
+    return groupEnd[group];
+  }
+
+  public String group(int group) {
+    final int from = start(group);
+    // a negative start offset means that there is no text for this group
+    return from < 0 ? null : new String(array, from, end(group) - from);
+  }
+
+  public int groupCount() {
+    return groupStart.length - 1;
+  }
+}
diff --git a/classpath/java/util/regex/RegexPattern.java b/classpath/java/util/regex/RegexPattern.java
new file mode 100644
index 0000000000..bceb90cfc4
--- /dev/null
+++ b/classpath/java/util/regex/RegexPattern.java
@@ -0,0 +1,57 @@
+/* Copyright (c) 2008-2013, Avian Contributors
+
+   Permission to use, copy, modify, and/or distribute this software
+   for any purpose with or without fee is hereby granted, provided
+   that the above copyright notice and this permission notice appear
+   in all copies.
+
+   There is NO WARRANTY for this software. See license.txt for
+   details. */
+
+package java.util.regex;
+
+/**
+ * A minimal implementation of a regular expression engine.
+ *

+ * Intended as a permissively-licensed drop-in replacement for Oracle JDK's + * regular expression engine, this class uses the Pike VM implemented in + * {@link PikeVM} to match regular expressions. + *

+ *

+ * The Pike VM not only has a nicer runtime performance than Oracle JDK's + * backtracking approach -- O(n*m) instead of O(2^m) where + * n is the length of the regular expression pattern (after normalizing + * {<n>} quantifiers) and m the length of the text to match against + * the pattern -- but also supports arbitrary-sized look-behinds. + *

+ *

+ * The current implementation supports all regular expression constructs + * supported by Oracle JDK's regular expression engine except for the following + * ones: + *

    + *
  • control characters: \cX
  • + *
  • extended character classes: \p{...}
  • + *
  • extended boundary matchers: \A,\G,\Z,\z
  • + *
  • possessive quantifiers: X?+
  • + *
  • back references: \<n>, \k<name>
  • + *
  • long escape: \Q, \E
  • + *
  • named groups: (?<name>X)
  • + *
  • flags: (?idmsuxU)
  • + *
  • independent, non-capturing group: (?>X)
  • + *
+ *

+ *
+ * @author Johannes Schindelin
+ */
+public class RegexPattern extends Pattern {
+  private final PikeVM vm;
+
+  public RegexMatcher matcher(CharSequence string) {
+    return new RegexMatcher(vm, string);
+  }
+
+  RegexPattern(String regex, int flags, PikeVM vm) {
+    super(regex, flags);
+    this.vm = vm;
+  }
+}
diff --git a/classpath/java/util/regex/TrivialMatcher.java b/classpath/java/util/regex/TrivialMatcher.java
new file mode 100644
index 0000000000..2b735f83e1
--- /dev/null
+++ b/classpath/java/util/regex/TrivialMatcher.java
@@ -0,0 +1,48 @@
+/* Copyright (c) 2008-2013, Avian Contributors
+
+   Permission to use, copy, modify, and/or distribute this software
+   for any purpose with or without fee is hereby granted, provided
+   that the above copyright notice and this permission notice appear
+   in all copies.
+
+   There is NO WARRANTY for this software. See license.txt for
+   details. */
+
+package java.util.regex;
+
+/**
+ * This is a work in progress.
+ *
+ * @author zsombor and others
+ */
+class TrivialMatcher extends Matcher {
+  private final String pattern;
+
+  TrivialMatcher(String pattern, CharSequence input) {
+    super(input);
+    this.pattern = pattern;
+  }
+
+  public boolean matches() {
+    if (pattern.equals(input.toString())) {
+      start = 0;
+      end = input.length();
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  public boolean find(int start) {
+    String p = pattern;
+    int i = TrivialPattern.indexOf(input, p, start);
+    if (i >= 0) {
+      this.start = i;
+      this.end = i + p.length();
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
+
diff --git a/classpath/java/util/regex/TrivialPattern.java b/classpath/java/util/regex/TrivialPattern.java
new file mode 100644
index 0000000000..1041e1bfaa
--- /dev/null
+++ b/classpath/java/util/regex/TrivialPattern.java
@@ -0,0 +1,134 @@
+/* Copyright (c) 2008-2013, Avian Contributors
+
+   Permission to use, copy, modify, and/or distribute this software
+   for any purpose with or without fee is hereby granted, provided
+   that the above copyright notice and this permission notice appear
+   in all copies.
+
+   There is NO WARRANTY for this software. See license.txt for
+   details. */
+
+package java.util.regex;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.LinkedList;
+
+/**
+ * This is a work in progress.
+ *
+ * @author zsombor and others
+ *
+ */
+public class TrivialPattern extends Pattern {
+
+  private final String unescaped;
+
+  TrivialPattern(String pattern, String unescaped, int flags) {
+    super(pattern, flags);
+    this.unescaped = unescaped;
+  }
+
+  public Matcher matcher(CharSequence input) {
+    return new TrivialMatcher(unescaped, input);
+  }
+
+  /**
+   * Splits the input around literal occurrences of the pattern.
+   *
+   * @param input
+   *          the character sequence to split
+   * @param limit
+   *          the maximum number of pieces if positive; zero lifts the
+   *          maximum and strips trailing empty pieces, a negative value
+   *          lifts the maximum and keeps them
+   * @return the pieces in order of appearance
+   */
+  public String[] split(CharSequence input, int limit) {
+    boolean strip;
+    if (limit < 0) {
+      strip = false;
+      limit = Integer.MAX_VALUE;
+    } else if (limit == 0) {
+      strip = true;
+      limit = Integer.MAX_VALUE;
+    } else {
+      strip = false;
+    }
+
+    List<CharSequence> list = new LinkedList<CharSequence>();
+    int index = 0;
+    int trailing = 0;
+    int patternLength = unescaped.length();
+    while (index < input.length() && list.size() < limit - 1) {
+      int i;
+      if (patternLength == 0) {
+        if (list.size() == 0) {
+          i = 0;
+        } else {
+          i = index + 1;
+        }
+      } else {
+        i = indexOf(input, unescaped, index);
+      }
+
+      if (i >= 0) {
+        if (patternLength != 0 && i == index) {
+          ++ trailing;
+        } else {
+          trailing = 0;
+        }
+
+        list.add(input.subSequence(index, i));
+        index = i + patternLength;
+      } else {
+        break;
+      }
+    }
+
+    if (strip && index > 0 && index == input.length()) {
+      ++ trailing;
+    } else {
+      trailing = 0;
+    }
+    list.add(input.subSequence(index, input.length()));
+
+    String[] result = new String[list.size() - trailing];
+    int i = 0;
+    for (Iterator<CharSequence> it = list.iterator();
+         it.hasNext() && i < result.length; ++ i)
+    {
+      result[i] = it.next().toString();
+    }
+    return result;
+  }
+
+  /**
+   * Finds the first occurrence of {@code needle} in {@code haystack} at or
+   * after offset {@code start}.
+   *
+   * @return the offset of the occurrence, or a negative value if there is
+   *         none
+   */
+  static int indexOf(CharSequence haystack, CharSequence needle, int start) {
+    if (needle.length() == 0) {
+      // the empty needle is found at every offset up to and including the
+      // end of the haystack, but not beyond it
+      return start <= haystack.length() ? start : -1;
+    }
+
+    for (int i = start; i < haystack.length() - needle.length() + 1; ++i) {
+      int j = 0;
+      for (; j < needle.length(); ++j) {
+        if (haystack.charAt(i + j) != needle.charAt(j)) {
+          break;
+        }
+      }
+      if (j == needle.length()) {
+        return i;
+      }
+    }
+
+    return -1;
+  }
+}
diff --git a/test/Regex.java b/test/Regex.java
new file mode 100644
index 0000000000..22108dde5a
--- /dev/null
+++ b/test/Regex.java
@@ -0,0 +1,96 @@
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class Regex {
+  private static void expect(boolean v) {
+    if (! v) throw new RuntimeException();
+  }
+
+  private static Matcher getMatcher(String regex, String string) {
+    return Pattern.compile(regex).matcher(string);
+  }
+
+  private static void expectMatch(String regex, String string) {
+    expect(getMatcher(regex, string).matches());
+  }
+
+  private static void expectNoMatch(String regex, String string) {
+    expect(!getMatcher(regex, string).matches());
+  }
+
+  private static void expectGroups(String regex, String string,
+                                   String... groups) {
+    Matcher matcher = getMatcher(regex, string);
+    expect(matcher.matches());
+    expect(matcher.groupCount() == groups.length);
+    for (int i = 1; i <= groups.length; ++i) {
+      if (groups[i - 1] == null) {
+        expect(matcher.group(i) == null);
+      } else {
+        expect(groups[i - 1].equals(matcher.group(i)));
+      }
+    }
+  }
+
+  private static void expectFind(String regex, String string,
+                                 String... matches)
+  {
+    Matcher matcher = getMatcher(regex, string);
+    int i = 0;
+    while (i < matches.length) {
+      expect(matcher.find());
+      expect(matches[i++].equals(matcher.group()));
+    }
+    expect(!matcher.find());
+  }
+
+  private static void expectSplit(String regex, String string,
+                                  String... list)
+  {
+    String[] array = Pattern.compile(regex).split(string);
+    expect(array.length == list.length);
+    for (int i = 0; i < list.length; ++ i) {
+      expect(list[i].equals(array[i]));
+    }
+  }
+
+  public static void main(String[] args) {
+    expectMatch("a(bb)?a", "abba");
+    expectNoMatch("a(bb)?a", "abbba");
+    expectNoMatch("a(bb)?a", "abbaa");
+    expectGroups("a(a*?)(a?)(a??)(a+)(a*)a", "aaaaaa", "", "a", "", "aaa", "");
+    expectMatch("...", "abc");
+    expectNoMatch(".", "\n");
+    expectGroups("a(bb)*a", "abbbba", "bb");
+    expectGroups("a(bb)?(bb)+a", "abba", null, "bb");
+    expectFind(" +", "Hello , world! ", " ", " ", " ");
+    expectMatch("[0-9A-Fa-f]+", "08ef");
+    expectNoMatch("[0-9A-Fa-f]+", "08@ef");
+    expectGroups("(?:a)", "a");
+    expectGroups("a|(b|c)", "a", (String)null);
+    expectGroups("a|(b|c)", "c", "c");
+    expectGroups("(?=a)a", "a");
+    expectGroups(".*(o)(?<=[A-Z][a-z]*)", "Hello", "o");
+    expectNoMatch("(?!a).", "a");
+    expectMatch("[\\d]", "0");
+    expectMatch("\\0777", "?7");
+    expectMatch("\\a", "\007");
+    expectMatch("\\\\", "\\");
+    expectMatch("\\x4A", "J");
+    expectMatch("\\x61", "a");
+    expectMatch("\\078", "\0078");
+    expectSplit("(?<=\\w)(?=\\W)|(?<=\\W)(?=\\w)", "a + b * x",
+                "a", " + ", "b", " * ", "x");
+    expectMatch("[0-9[def]]", "f");
+    expectNoMatch("[a-z&&[^d-f]]", "f");
+    expectSplit("^H", "Hello\nHobbes!", "", "ello\nHobbes!");
+    expectSplit("o.*?$", "Hello\r\nHobbes!", "Hello\r\nH");
+    expectSplit("\\b", "a+ b + c\nd", "", "a", "+ ", "b", " + ", "c", "\n", "d");
+    expectSplit("\\B", "Hi Cal!", "H", "i C", "a", "l!");
+    expectMatch("a{2,5}", "aaaa");
+    expectGroups("a??(a{2,5}?)", "aaaa", "aaaa");
+    expectGroups("a??(a{3}?)", "aaaa", "aaa");
+    expectNoMatch("a(a{3}?)", "aaaaa");
+    expectMatch("a(a{3,}?)", "aaaaa");
+  }
+}