Merge pull request #105 from dscho/regex

Support (the most common subset of) regular expressions
This commit is contained in:
Joshua Warner 2013-12-04 11:57:26 -08:00
commit fe9ac94629
12 changed files with 1994 additions and 196 deletions

1
.gitignore vendored
View File

@ -8,3 +8,4 @@ bin
/lib
/distrib
*.pdb
*.swp

View File

@ -0,0 +1,332 @@
/* Copyright (c) 2008-2013, Avian Contributors
Permission to use, copy, modify, and/or distribute this software
for any purpose with or without fee is hereby granted, provided
that the above copyright notice and this permission notice appear
in all copies.
There is NO WARRANTY for this software. See license.txt for
details. */
package java.util.regex;
/**
* A class to match classes of characters.
* <p>
* This class is intended to be the working horse behind character classes
* such as {@code [a-z]}.
* </p>
* @author Johannes Schindelin
*/
class CharacterMatcher {
private boolean[] map;
private boolean inversePattern;
public static CharacterMatcher parse(String description) {
return parse(description.toCharArray());
}
public static CharacterMatcher parse(char[] description) {
Parser parser = new Parser(description);
CharacterMatcher result = parser.parseClass();
if (parser.getEndOffset() != description.length) {
throw new RuntimeException("Short character class @"
+ parser.getEndOffset() + ": " + new String(description));
}
return result;
}
public boolean matches(char c) {
int index = c;
return (map.length > index && map[index]) ^ inversePattern;
}
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append("[");
if (inversePattern) {
builder.append("^");
}
for (int i = 0; i < map.length; ++ i) {
if (!map[i]) {
continue;
}
builder.append(i >= ' ' && i <= 0x7f ?
"" + (char)i : ("\\x" + Integer.toHexString(i)));
int j = i + 1;
while (j < map.length && map[j]) {
++ j;
}
-- j;
if (j > i) {
if (j > i + 1) {
builder.append('-');
}
builder.append(j >= ' ' && j <= 0x7f ?
"" + (char)j : ("\\x" + Integer.toHexString(j)));
i = j;
}
}
builder.append("]");
return builder.toString();
}
private static String specialClass(int c) {
if ('d' == c) {
return "[0-9]";
}
if ('D' == c) {
return "[^0-9]";
}
if ('s' == c) {
return "[ \\t\\n\\x0B\\f\\r]";
}
if ('S' == c) {
return "[^ \\t\\n\\x0B\\f\\r]";
}
if ('w' == c) {
return "[a-zA-Z_0-9]";
}
if ('W' == c) {
return "[^a-zA-Z_0-9]";
}
return null;
}
private CharacterMatcher(boolean[] map, boolean inversePattern) {
this.map = map;
this.inversePattern = inversePattern;
}
private void setMatch(int c) {
ensureCapacity(c + 1);
map[c] = true;
}
private void ensureCapacity(int length) {
if (map.length >= length) {
return;
}
int size = map.length;
if (size < 32) {
size = 32;
}
while (size < length) {
size <<= 1;
}
map = java.util.Arrays.copyOf(map, size);
}
private void merge(CharacterMatcher other) {
boolean inversePattern = this.inversePattern || other.inversePattern;
if ((map.length < other.map.length) ^ inversePattern) {
map = java.util.Arrays.copyOf(map, other.map.length);
}
for (int i = 0; i < map.length; ++ i) {
map[i] = (matches((char)i) || other.matches((char)i)) ^ inversePattern;
}
this.inversePattern = inversePattern;
}
private void intersect(CharacterMatcher other) {
boolean inversePattern = this.inversePattern && other.inversePattern;
if ((map.length > other.map.length) ^ inversePattern) {
map = java.util.Arrays.copyOf(map, other.map.length);
}
for (int i = 0; i < map.length; ++ i) {
map[i] = (matches((char)i) && other.matches((char)i)) ^ inversePattern;
}
this.inversePattern = inversePattern;
}
static class Parser {
private final char[] description;
private int offset;
public Parser(char[] description) {
this.description = description;
}
public int getEndOffset() {
return offset;
}
/**
* Parses an escaped character.
*
* @param start the offset <u>after</u> the backslash
* @return the escaped character, or -1 if no character was recognized
*/
public int parseEscapedCharacter(int start) {
offset = start;
return parseEscapedCharacter();
}
private int parseEscapedCharacter() {
if (offset == description.length) {
throw new IllegalArgumentException("Short escaped character");
}
char c = description[offset++];
if (c == '0') {
int len = digits(offset, 3, 8);
if (len == 3 && description[offset] > '3') {
--len;
}
c = (char)Integer.parseInt(new String(description, offset, len), 8);
offset += len;
return c;
}
if (c == 'x' || c == 'u') {
int len = digits(offset, 4, 16);
c = (char)Integer.parseInt(new String(description, offset, len), 16);
offset += len;
return c;
}
switch (c) {
case 'a':
return 0x0007;
case 'e':
return 0x001B;
case 'f':
return 0x000C;
case 'n':
return 0x000A;
case 'r':
return 0x000D;
case 't':
return 0x0009;
case '\\':
case '.':
case '*':
case '+':
case '?':
case '|':
case '[':
case ']':
case '{':
case '}':
case '(':
case ')':
case '^':
case '$':
return c;
}
return -1;
}
public int digits(int offset, int maxLength, int base) {
for (int i = 0; ; ++i) {
if (i == maxLength || offset + i >= description.length) {
return i;
}
int value = description[offset + i] - '0';
if (value < 0) {
return i;
}
if (base > 10 && value >= 10) {
value += 10 - (value >= 'a' - '0' ? 'a' - '0' : 'A' - '0');
}
if (value >= base) {
return i;
}
}
}
public CharacterMatcher parseClass(int start) {
offset = start;
return parseClass();
}
public CharacterMatcher parseClass() {
if (description[offset] != '[') {
if (description[offset] == '\\') {
String range = specialClass(description[++ offset]);
if (range != null) {
++ offset;
return CharacterMatcher.parse(range);
}
}
return null;
}
CharacterMatcher matcher = new CharacterMatcher(new boolean[0],
description[++ offset] == '^');
if (matcher.inversePattern) {
++ offset;
}
int previous = -1;
boolean firstCharacter = true;
for (;;) {
if (offset >= description.length) {
unsupported("short regex");
}
char c = description[offset++];
if (c == '-' && !firstCharacter && description[offset] != ']') {
if (previous < 0) {
unsupported("invalid range");
}
int rangeEnd = description[offset];
if ('\\' == rangeEnd) {
rangeEnd = parseEscapedCharacter();
if (rangeEnd < 0) {
unsupported("invalid range");
}
}
matcher.ensureCapacity(rangeEnd + 1);
for (int j = previous + 1; j <= rangeEnd; j++) {
matcher.map[j] = true;
}
} else if (c == '\\') {
int saved = offset;
previous = parseEscapedCharacter();
if (previous < 0) {
offset = saved - 1;
CharacterMatcher clazz = parseClass();
if (clazz == null) {
unsupported("escape");
}
matcher.merge(clazz);
} else {
matcher.setMatch(previous);
}
} else if (c == '[') {
Parser parser = new Parser(description);
CharacterMatcher other = parser.parseClass(offset - 1);
if (other == null) {
unsupported("invalid merge");
}
matcher.merge(other);
offset = parser.getEndOffset();
previous = -1;
} else if (c == '&') {
if (offset + 2 > description.length || description[offset] != '&'
|| description[offset + 1] != '[') {
unsupported("operation");
}
Parser parser = new Parser(description);
CharacterMatcher other = parser.parseClass(offset + 1);
if (other == null) {
unsupported("invalid intersection");
}
matcher.intersect(other);
offset = parser.getEndOffset();
previous = -1;
} else if (c == ']') {
break;
} else {
previous = c;
matcher.setMatch(previous);
}
firstCharacter = false;
}
return matcher;
}
private void unsupported(String msg) throws UnsupportedOperationException {
throw new UnsupportedOperationException("Unsupported " + msg + " @"
+ offset + ": "
+ new String(description, 0, description.length));
}
}
}

View File

@ -0,0 +1,533 @@
/* Copyright (c) 2008-2013, Avian Contributors
Permission to use, copy, modify, and/or distribute this software
for any purpose with or without fee is hereby granted, provided
that the above copyright notice and this permission notice appear
in all copies.
There is NO WARRANTY for this software. See license.txt for
details. */
package java.util.regex;
import java.util.ArrayList;
import java.util.Stack;
/**
* Compiles regular expressions into {@link PikeVM}s.
*
* @author Johannes Schindelin
*/
class Compiler implements PikeVMOpcodes {
private final static CharacterMatcher regularCharacter =
CharacterMatcher.parse("[^\\\\.*+?|\\[\\]{}()^$]");
/**
 * Collects the program generated for one expression tree.
 * <p>
 * Code generation runs twice over the same tree: a dry run (while
 * {@code program} is still {@code null}) that only advances
 * {@code offset} to learn the program size, and a second run that fills
 * the freshly allocated array.
 * </p>
 */
private static class Output {
  private int[] program;
  private int offset;
  private int groupCount = -1;
  private int findPreambleSize;
  private ArrayList<CharacterMatcher> classes;
  private ArrayList<PikeVM> lookarounds;

  /**
   * Generates the program for the given expression.
   *
   * @param expr the root of the expression tree
   */
  public Output(Expression expr) {
    // dry run: nothing is stored, only the size is measured
    expr.writeCode(this);
    int codeSize = offset;

    // rewind all counters and do it for real
    classes = new ArrayList<CharacterMatcher>();
    lookarounds = new ArrayList<PikeVM>();
    groupCount = -1;
    offset = 0;
    program = new int[codeSize];
    expr.writeCode(this);
  }

  /** Appends one program word (counted, but not stored, in the dry run). */
  public void add(int opcode) {
    int position = offset++;
    if (program != null) {
      program[position] = opcode;
    }
  }

  /**
   * Reserves one program word for a jump target to be filled in later.
   *
   * @return the position of the reserved word
   */
  public int markJump() {
    int mark = offset;
    offset = mark + 1;
    return mark;
  }

  /** Stores the current offset as the target of a previously marked jump. */
  public void setJump(int mark) {
    if (program == null) {
      return;
    }
    program[mark] = offset;
  }

  /** Remembers the current offset as the end of the find() preamble. */
  public void markFindPreambleEnd() {
    findPreambleSize = offset;
  }

  /** Wraps the generated program and its side tables into a VM. */
  public PikeVM toVM() {
    CharacterMatcher[] classArray =
      classes.toArray(new CharacterMatcher[classes.size()]);
    PikeVM[] lookaroundArray =
      lookarounds.toArray(new PikeVM[lookarounds.size()]);
    return new PikeVM(program, findPreambleSize, groupCount, classArray,
      lookaroundArray);
  }

  /**
   * Registers a character class.
   *
   * @return the index of the class, or -1 during the dry run
   */
  public int addClass(CharacterMatcher characterClass) {
    if (program == null) {
      return -1;
    }
    classes.add(characterClass);
    return classes.size() - 1;
  }

  /**
   * Registers the VM of a look-around.
   *
   * @return the index of the VM, or -1 during the dry run
   */
  public int addLookaround(PikeVM lookaround) {
    if (program == null) {
      return -1;
    }
    lookarounds.add(lookaround);
    return lookarounds.size() - 1;
  }
}
/**
* A node of the parsed regular expression.
*/
private abstract class Expression {
/**
* Emits the program words for this node into the given output. This is
* called once per code generation pass, i.e. more than once per node.
*/
protected abstract void writeCode(Output output);
}
/**
 * An expression matching one character out of a character class.
 */
private class CharacterRange extends Expression {
  private final CharacterMatcher characterClass;

  public CharacterRange(CharacterMatcher characterClass) {
    this.characterClass = characterClass;
  }

  /** Emits CHARACTER_CLASS followed by the index of the registered class. */
  protected void writeCode(Output output) {
    output.add(CHARACTER_CLASS);
    int classIndex = output.addClass(characterClass);
    output.add(classIndex);
  }

  /** Returns the bracket notation of the underlying class. */
  public String toString() {
    String text = characterClass.toString();
    return text;
  }
}
/**
 * A quantified expression: {@code x?}, {@code x*}, {@code x+},
 * {@code x{min,max}} and their reluctant variants.
 * <p>
 * A {@code maxCount} of -1 means "unbounded".
 * </p>
 */
private class Repeat extends Expression {
  private Expression expr;
  private int minCount, maxCount;
  private boolean greedy;

  /**
   * @param expr the expression to repeat
   * @param minCount the minimal number of repetitions, at least 0
   * @param maxCount the maximal number of repetitions, or -1 for unbounded
   * @param greedy whether to prefer more repetitions over fewer
   * @throws RuntimeException if the counts are negative, zero-bounded or
   *         out of order
   */
  public Repeat(Expression expr, int minCount, int maxCount, boolean greedy) {
    if (minCount < 0) {
      throw new RuntimeException("Unexpected min count: " + minCount);
    }
    if (maxCount != -1) {
      if (maxCount == 0) {
        throw new RuntimeException("Unexpected max count: " + maxCount);
      }
      if (minCount > maxCount) {
        throw new RuntimeException("Unexpected range: " + minCount + ", " + maxCount);
      }
    }
    this.expr = expr;
    this.minCount = minCount;
    this.maxCount = maxCount;
    this.greedy = greedy;
  }

  /**
   * Emits the repeated expression by unrolling it.
   * <p>
   * Mandatory repetitions are written out back to back; an unbounded tail
   * becomes a loop and each optional repetition of a bounded tail is
   * guarded by its own split.
   * </p>
   * <p>
   * NOTE(review): the repeated expression is emitted several times, so a
   * capturing group inside it runs Group.writeCode() (which increments
   * output.groupCount) once per copy -- confirm whether that is intended.
   * </p>
   */
  protected void writeCode(Output output) {
    int start = output.offset;
    // the opcode pair is swapped for reluctant quantifiers
    int splitJmp = greedy ? SPLIT_JMP : SPLIT;
    int split = greedy ? SPLIT : SPLIT_JMP;
    // all mandatory copies but the last one
    for (int i = 1; i < minCount; ++ i) {
      expr.writeCode(output);
    }
    if (maxCount == -1) {
      if (minCount > 0) {
        // x+ style: the last mandatory copy doubles as the loop body
        int jump = output.offset;
        expr.writeCode(output);
        output.add(splitJmp);
        output.add(jump);
      } else {
        // x* style: nothing was emitted before, so the loop body starts
        // right after the two words of the leading split
        output.add(split);
        int jump = output.markJump();
        expr.writeCode(output);
        output.add(splitJmp);
        output.add(start + 2);
        output.setJump(jump);
      }
    } else {
      if (minCount > 0) {
        // the last mandatory copy
        expr.writeCode(output);
      }
      if (maxCount > minCount) {
        // every optional copy can bail out to the common end
        int[] jumps = new int[maxCount - minCount];
        for (int i = 0; i < jumps.length; ++ i) {
          output.add(split);
          jumps[i] = output.markJump();
          expr.writeCode(output);
        }
        for (int jump : jumps) {
          output.setJump(jump);
        }
      }
    }
  }

  /**
   * Renders the quantifier in regular expression syntax, using the short
   * forms {@code ?}, {@code *} and {@code +} where they apply.
   */
  public String toString() {
    String qualifier = greedy ? "" : "?";
    if (minCount == 0 && maxCount < 2) {
      // maxCount is either -1 (unbounded) or 1 here; it is the maximum,
      // not the minimum, which tells "*" apart from "?"
      return expr.toString() + (maxCount < 0 ? "*" : "?") + qualifier;
    }
    if (minCount == 1 && maxCount < 0) {
      return expr.toString() + "+" + qualifier;
    }
    return expr.toString() + "{" + minCount + ","
      + (maxCount < 0 ? "" : "" + maxCount) + "}" + qualifier;
  }
}
/**
* A (capturing or non-capturing) sequence of expressions, optionally
* preceded by alternatives.
* <p>
* While parsing, {@code list} holds the expressions of the alternative
* currently being read; completed alternatives are moved into
* {@code alternatives} by {@link #startAlternative()}.
* </p>
*/
private class Group extends Expression {
private final boolean capturing;
private ArrayList<Expression> list = new ArrayList<Expression>();
private ArrayList<Group> alternatives;
/**
* @param capturing whether the group records its start and end offsets
* @param initialList expressions to copy into the group, or {@code null}
*/
public Group(boolean capturing, ArrayList<Expression> initialList) {
this.capturing = capturing;
if (initialList != null) {
list.addAll(initialList);
}
}
/** Appends an expression to the current alternative. */
public void push(Expression expr) {
list.add(expr);
}
/**
* Appends a single program word: either a literal character
* (non-negative) or one of the argument-less opcodes (negative).
*/
public void push(final int c) {
push(new Expression() {
public void writeCode(Output output) {
output.add(c);
}
public String toString() {
if (c >= 0) {
return "" + (char)c;
}
switch (c) {
case DOT:
return ".";
case WORD_BOUNDARY:
return "\\b";
case NON_WORD_BOUNDARY:
return "\\B";
case LINE_START:
return "^";
case LINE_END:
return "$";
default:
throw new RuntimeException("Unhandled opcode: " + c);
}
}
});
}
/**
* Closes the current alternative (on {@code |}): what was collected so far
* is copied into a new non-capturing group and the list starts over.
*/
public void startAlternative() {
if (alternatives == null) {
alternatives = new ArrayList<Group>();
}
alternatives.add(new Group(false, list));
list.clear();
}
/**
* Removes and returns the most recently pushed expression, so that a
* quantifier can wrap it.
* NOTE(review): on an empty list (a quantifier with nothing before it)
* list.size() - 1 is -1 and the removal fails with an index exception
* instead of a parse error -- confirm whether callers should check first.
*/
public Expression pop() {
Expression result = list.remove(list.size() - 1);
return result;
}
/**
* Emits the group: an optional SAVE_OFFSET pair around the body, and for
* each completed alternative a SPLIT over it followed by a JMP to the
* common end.
*/
protected void writeCode(Output output) {
int groupIndex = -1;
if (capturing) {
// groups are numbered in the order in which their code is written
groupIndex = ++ output.groupCount;
output.add(SAVE_OFFSET);
output.add(2 * groupIndex);
}
int[] jumps = null;
if (alternatives != null) {
jumps = new int[alternatives.size()];
int i = 0;
for (Group alternative : alternatives) {
output.add(SPLIT);
// target of the SPLIT: the code following this alternative
int jump = output.markJump();
alternative.writeCode(output);
output.add(JMP);
// target of the JMP: patched below, once the end of the group is known
jumps[i++] = output.markJump();
output.setJump(jump);
}
}
// the last (or only) alternative
for (Expression expr : list) {
expr.writeCode(output);
}
if (jumps != null) {
for (int jump : jumps) {
output.setJump(jump);
}
}
if (capturing) {
output.add(SAVE_OFFSET);
output.add(2 * groupIndex + 1);
}
}
/**
* Renders the group; parentheses are only printed when the group has
* alternatives or more than one element.
*/
public String toString() {
StringBuilder builder = new StringBuilder();
if (alternatives != null || list.size() > 1) {
builder.append('(');
if (!capturing) {
builder.append("?:");
}
}
if (alternatives != null) {
for (Group alternative : alternatives) {
builder.append(alternative).append('|');
}
}
for (Expression expr : list) {
builder.append(expr);
}
if (alternatives != null || list.size() > 1) {
builder.append(')');
}
return builder.toString();
}
}
/**
 * A look-ahead or look-behind assertion, positive or negative.
 * <p>
 * The body of the assertion is compiled into a VM of its own, which the
 * enclosing program refers to by index.
 * </p>
 */
private class Lookaround extends Expression {
  private final Group group = new Group(false, null);
  private final boolean forward, negative;

  /**
   * @param forward {@code true} for look-ahead, {@code false} for
   *        look-behind
   * @param negative whether the assertion requires the body not to match
   */
  public Lookaround(boolean forward, boolean negative) {
    this.forward = forward;
    this.negative = negative;
  }

  /**
   * Compiles the body into a separate VM (reversed for look-behind) and
   * emits the look-around opcode followed by the index of that VM.
   */
  @Override
  protected void writeCode(Output output) {
    PikeVM vm = new Output(group).toVM();
    if (!forward) {
      vm.reverse();
    }
    int opcode;
    if (forward) {
      opcode = negative ? NEGATIVE_LOOKAHEAD : LOOKAHEAD;
    } else {
      // NOTE(review): a negative look-behind emits NEGATIVE_LOOKAHEAD here,
      // which looks like a copy-paste slip; presumably a dedicated
      // look-behind opcode was intended -- confirm against PikeVMOpcodes
      // before changing it.
      opcode = negative ? NEGATIVE_LOOKAHEAD : LOOKBEHIND;
    }
    output.add(opcode);
    output.add(output.addLookaround(vm));
  }

  /**
   * Renders the assertion as {@code (?=...)}, {@code (?!...)},
   * {@code (?<=...)} or {@code (?<!...)}, depending on direction and
   * polarity.
   */
  public String toString() {
    String inner = group.toString();
    if (inner.startsWith("(?:")) {
      // reuse the closing parenthesis printed by the group
      inner = inner.substring(3);
    } else {
      inner += ")";
    }
    String prefix = "(?" + (forward ? "" : "<") + (negative ? "!" : "=");
    return prefix + inner;
  }
}
/**
* The root of the expression tree: the implicit capturing group wrapping the
* whole pattern, preceded by the preamble used by find().
*/
private class Group0 extends Expression {
private final Group group;
public Group0() {
group = new Group(true, null);
}
/**
* Emits a five-word preamble that lets a match start at any offset,
* followed by the code of the pattern itself. The end of the preamble is
* recorded in the output.
*/
public void writeCode(Output output) {
// find() preamble
int start = output.offset;
output.add(SPLIT_JMP);
// start + 5 is the first word after the preamble
output.add(start + 5);
output.add(DOTALL);
output.add(SPLIT);
// start + 2 is the DOTALL above, i.e. consume one more character
output.add(start + 2);
output.markFindPreambleEnd();
group.writeCode(output);
}
/**
* Renders the pattern; a rendering of the form {@code (?:...)} is printed
* without the opening parenthesis and the closing one.
*/
public String toString() {
String inner = group.toString();
return inner.startsWith("(?:") && inner.endsWith(")") ?
inner.substring(1, inner.length() - 1) : inner;
}
}
// the root of the expression tree being built
private Group0 root;
// the groups which are currently open, innermost on top
private Stack<Group> groups;
/**
* Creates a compiler whose group stack holds only the root group. The
* parser state lives in the instance and is not reset by compile().
*/
public Compiler() {
root = new Group0();
groups = new Stack<Group>();
groups.add(root.group);
}
/**
* Compiles a regular expression.
* <p>
* The pattern is read from left to right; every construct is pushed onto
* the innermost open group. Afterwards, the expression tree is turned into
* a {@link PikeVM}. If the VM reports that the pattern is a plain string, a
* {@link TrivialPattern} is returned, otherwise a {@link RegexPattern}.
* </p>
*
* @param regex the regular expression
* @return the compiled pattern
* @throws RuntimeException on syntax errors
* @throws IllegalArgumentException on unclosed groups, truncated escapes
* and named groups
* @throws UnsupportedOperationException on unsupported constructs
*/
public Pattern compile(String regex) {
char[] array = regex.toCharArray();
CharacterMatcher.Parser characterClassParser =
new CharacterMatcher.Parser(array);
for (int index = 0; index < array.length; ++ index) {
char c = array[index];
Group current = groups.peek();
// anything that is not a meta character stands for itself
if (regularCharacter.matches(c)) {
current.push(c);
continue;
}
switch (c) {
case '.':
current.push(DOT);
continue;
case '\\':
// try in turn: escaped character, predefined class, word boundary
int unescaped = characterClassParser.parseEscapedCharacter(index + 1);
if (unescaped >= 0) {
// the loop increment steps onto the end offset
index = characterClassParser.getEndOffset() - 1;
current.push((char)unescaped);
continue;
}
CharacterMatcher characterClass = characterClassParser.parseClass(index);
if (characterClass != null) {
index = characterClassParser.getEndOffset() - 1;
current.push(new CharacterRange(characterClass));
continue;
}
switch (array[index + 1]) {
case 'b':
index++;
current.push(WORD_BOUNDARY);
continue;
case 'B':
index++;
current.push(NON_WORD_BOUNDARY);
continue;
}
throw new RuntimeException("Parse error @" + index + ": " + regex);
case '?':
case '*':
case '+': {
// a directly following '?' makes the quantifier reluctant
boolean greedy = true;
if (index + 1 < array.length && array[index + 1] == '?') {
greedy = false;
++ index;
}
// the quantifier wraps the expression pushed last
current.push(new Repeat(current.pop(),
c == '+' ? 1 : 0, c == '?' ? 1 : -1, greedy));
continue;
}
case '{': {
// {min}, {min,} or {min,max}
++ index;
int length = characterClassParser.digits(index, 8, 10);
// NOTE(review): without digits after '{', length is 0 and parseInt() is
// handed an empty string
int min = Integer.parseInt(regex.substring(index, index + length));
int max = min;
index += length - 1;
c = index + 1 < array.length ? array[index + 1] : 0;
if (c == ',') {
++ index;
length = characterClassParser.digits(index + 1, 8, 10);
// no digits after the comma: unbounded
max = length == 0 ? -1 :
Integer.parseInt(regex.substring(index + 1, index + 1 + length));
index += length;
c = index + 1< array.length ? array[index + 1] : 0;
}
if (c != '}') {
throw new RuntimeException("Invalid quantifier @" + index + ": "
+ regex);
}
++ index;
boolean greedy = true;
if (index + 1 < array.length && array[index + 1] == '?') {
++ index;
greedy = false;
}
current.push(new Repeat(current.pop(), min, max, greedy));
continue;
}
case '(': {
boolean capturing = true;
if (index + 1 < array.length && array[index + 1] == '?') {
// "(?" introduces a non-capturing group or a look-around
index += 2;
if (index >= array.length) {
throw new RuntimeException("Short pattern @" + index + ": "
+ regex);
}
c = array[index];
boolean lookAhead = true;
if (c == '<') {
// "(?<" must continue as a look-behind
if (++ index >= array.length) {
throw new RuntimeException("Short pattern @" + index + ": "
+ regex);
}
lookAhead = false;
c = array[index];
if (c != '=' && c != '!') {
throw new IllegalArgumentException("Named groups not supported @"
+ index + ": " + regex);
}
}
switch (c) {
case ':':
capturing = false;
break;
case '!':
case '=': {
capturing = false;
Lookaround lookaround = new Lookaround(lookAhead, c == '!');
current.push(lookaround);
// the body is collected in the look-around's own group until ')'
groups.push(lookaround.group);
continue;
}
default:
throw new UnsupportedOperationException("Not yet supported: "
+ regex.substring(index));
}
}
// the new group is both an element of the current group and the
// innermost open group
current.push(groups.push(new Group(capturing, null)));
continue;
}
case ')':
// the root group must never be closed
if (groups.size() < 2) {
throw new RuntimeException("Invalid group close @" + index + ": "
+ regex);
}
groups.pop();
continue;
case '[': {
CharacterMatcher matcher = characterClassParser.parseClass(index);
if (matcher == null) {
throw new RuntimeException("Invalid range @" + index + ": " + regex);
}
current.push(new CharacterRange(matcher));
index = characterClassParser.getEndOffset() - 1;
continue;
}
case '|':
current.startAlternative();
continue;
case '^':
current.push(LINE_START);
continue;
case '$':
current.push(LINE_END);
continue;
default:
throw new RuntimeException("Parse error @" + index + ": " + regex);
}
}
// only the root group may still be open
if (groups.size() != 1) {
throw new IllegalArgumentException("Unclosed groups: ("
+ (groups.size() - 1) + "): " + regex);
}
PikeVM vm = new Output(root).toVM();
String plain = vm.isPlainString();
if (plain != null) {
return new TrivialPattern(regex, plain, 0);
}
return new RegexPattern(regex, 0, vm);
}
}

View File

@ -15,27 +15,23 @@ package java.util.regex;
*
* @author zsombor and others
*/
public class Matcher {
private final Pattern pattern;
private CharSequence input;
private int start;
private int end;
public abstract class Matcher {
protected CharSequence input;
protected int start;
protected int end;
Matcher(Pattern pattern, CharSequence input) {
this.pattern = pattern;
this.input = input;
public Matcher(CharSequence input) {
reset(input);
}
public boolean matches() {
if (pattern.pattern().equals(input.toString())) {
start = 0;
end = input.length();
return true;
} else {
return false;
}
public abstract boolean matches();
public boolean find() {
return find(end);
}
public abstract boolean find(int start);
public Matcher reset() {
return reset(input);
}
@ -47,10 +43,6 @@ public class Matcher {
return this;
}
public int start() {
return start;
}
public String replaceAll(String replacement) {
return replace(replacement, Integer.MAX_VALUE);
}
@ -59,7 +51,7 @@ public class Matcher {
return replace(replacement, 1);
}
private String replace(String replacement, int limit) {
protected String replace(String replacement, int limit) {
reset();
StringBuilder sb = null;
@ -88,23 +80,40 @@ public class Matcher {
return sb.toString();
}
public int start() {
return start;
}
public int end() {
return end;
}
public boolean find() {
return find(end);
public String group() {
return input.subSequence(start, end).toString();
}
public boolean find(int start) {
String p = pattern.pattern();
int i = Pattern.indexOf(input, p, start);
if (i >= 0) {
this.start = i;
this.end = i + p.length();
return true;
} else {
return false;
/**
 * Returns the start offset of the given group.
 *
 * @param group the group index; only group 0 (the whole match) is handled
 *        here
 * @throws UnsupportedOperationException for any other group
 */
public int start(int group) {
  if (group != 0) {
    throw new UnsupportedOperationException();
  }
  return start();
}
/**
 * Returns the end offset of the given group.
 *
 * @param group the group index; only group 0 (the whole match) is handled
 *        here
 * @throws UnsupportedOperationException for any other group
 */
public int end(int group) {
  if (group != 0) {
    throw new UnsupportedOperationException();
  }
  return end();
}
/**
 * Returns the text matched by the given group.
 *
 * @param group the group index; only group 0 (the whole match) is handled
 *        here
 * @throws UnsupportedOperationException for any other group
 */
public String group(int group) {
  if (group != 0) {
    throw new UnsupportedOperationException();
  }
  return group();
}
/**
* Returns the number of capturing groups, which is always 0 in this
* implementation.
*/
public int groupCount() {
return 0;
}
}

View File

@ -10,9 +10,8 @@
package java.util.regex;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.List;
import java.util.LinkedList;
/**
* This is a work in progress.
@ -20,7 +19,7 @@ import java.util.LinkedList;
* @author zsombor and others
*
*/
public class Pattern {
public abstract class Pattern implements PikeVMOpcodes {
public static final int UNIX_LINES = 1;
public static final int CASE_INSENSITIVE = 2;
@ -35,112 +34,26 @@ public class Pattern {
private final String pattern;
protected Pattern(String pattern, int flags) {
this.pattern = trivial(pattern);
this.pattern = pattern;
this.patternFlags = flags;
}
private static String trivial(String pattern) {
StringBuffer buffer = new StringBuffer();
for (int i = 0; i < pattern.length(); ++i) {
char c = pattern.charAt(i);
switch (c) {
case '\\':
if (++i == pattern.length()) {
break;
}
c = pattern.charAt(i);
if (c == '0') {
int len = digits(pattern, ++i, 3, 8);
if (len == 3 && pattern.charAt(i) > '3') {
--len;
}
c = (char)Integer.parseInt(pattern.substring(i, i + len), 8);
i += len - 1;
} else if (c == 'x' || c == 'u') {
int len = digits(pattern, ++i, 4, 16);
c = (char)Integer.parseInt(pattern.substring(i, i + len), 16);
i += len - 1;
} else {
c = unescape(pattern.charAt(i));
}
if (c != -1) {
break;
}
// fallthru
case '.':
case '*':
case '+':
case '?':
case '|':
case '[':
case ']':
case '{':
case '}':
case '(':
case ')':
case '^':
case '$':
throw new UnsupportedOperationException
("only trivial regular expressions are supported so far (" + pattern + ")");
}
buffer.append(c);
}
return buffer.toString();
}
private static int digits(String s, int offset, int maxLength, int base) {
for (int i = 0; ; ++i) {
if (i == maxLength || offset + i >= s.length()) {
return i;
}
int value = s.charAt(offset + i) - '0';
if (value < 0) {
return i;
}
if (base > 10 && value >= 10) {
value += 10 - (value >= 'a' - '0' ? 'a' - '0' : 'A' - '0');
}
if (value >= base) {
return i;
}
}
}
private static char unescape(char c) {
switch (c) {
case '\\':
return c;
case 'a':
return 0x0007;
case 'e':
return 0x001B;
case 'f':
return 0x000C;
case 'n':
return 0x000A;
case 'r':
return 0x000D;
case 't':
return 0x0009;
}
return (char)-1;
}
public static Pattern compile(String regex) {
return new Pattern(regex, 0);
return compile(regex, 0);
}
public static Pattern compile(String regex, int flags) {
return new Pattern(regex, flags);
if (flags != 0) {
throw new UnsupportedOperationException("TODO");
}
return new Compiler().compile(regex);
}
public int flags() {
return patternFlags;
}
public Matcher matcher(CharSequence input) {
return new Matcher(this, input);
}
public abstract Matcher matcher(CharSequence input);
public static boolean matches(String regex, CharSequence input) {
return Pattern.compile(regex).matcher(input).matches();
@ -155,79 +68,22 @@ public class Pattern {
}
public String[] split(CharSequence input, int limit) {
boolean strip;
if (limit < 0) {
strip = false;
if (limit <= 0) {
limit = Integer.MAX_VALUE;
} else if (limit == 0) {
strip = true;
limit = Integer.MAX_VALUE;
} else {
strip = false;
}
List<CharSequence> list = new LinkedList();
int index = 0;
int trailing = 0;
int patternLength = pattern.length();
while (index < input.length() && list.size() < limit - 1) {
int i;
if (patternLength == 0) {
if (list.size() == 0) {
i = 0;
} else {
i = index + 1;
}
} else {
i = indexOf(input, pattern, index);
}
if (i >= 0) {
if (patternLength != 0 && i == index) {
++ trailing;
} else {
trailing = 0;
}
list.add(input.subSequence(index, i));
index = i + patternLength;
} else {
Matcher matcher = matcher(input);
List<String> result = new ArrayList<String>();
int offset = 0;
for (;;) {
if (result.size() >= limit || !matcher.find()) {
break;
}
result.add(input.subSequence(offset, matcher.start()).toString());
offset = matcher.end();
}
if (strip && index > 0 && index == input.length()) {
++ trailing;
} else {
trailing = 0;
if (offset == 0 || offset < input.length()) {
result.add(input.subSequence(offset, input.length()).toString());
}
list.add(input.subSequence(index, input.length()));
String[] result = new String[list.size() - trailing];
int i = 0;
for (Iterator<CharSequence> it = list.iterator();
it.hasNext() && i < result.length; ++ i)
{
result[i] = it.next().toString();
}
return result;
}
static int indexOf(CharSequence haystack, CharSequence needle, int start) {
if (needle.length() == 0) return start;
for (int i = start; i < haystack.length() - needle.length() + 1; ++i) {
int j = 0;
for (; j < needle.length(); ++j) {
if (haystack.charAt(i + j) != needle.charAt(j)) {
break;
}
}
if (j == needle.length()) {
return i;
}
}
return -1;
return result.toArray(new String[result.size()]);
}
}

View File

@ -0,0 +1,629 @@
/* Copyright (c) 2008-2013, Avian Contributors
Permission to use, copy, modify, and/or distribute this software
for any purpose with or without fee is hereby granted, provided
that the above copyright notice and this permission notice appear
in all copies.
There is NO WARRANTY for this software. See license.txt for
details. */
package java.util.regex;
/**
* A minimal implementation of a regular expression engine.
*
* @author Johannes Schindelin
*/
class PikeVM implements PikeVMOpcodes {
private final int[] program;
private final int groupCount;
private final int offsetsCount;
/*
* For find(), we do not want to anchor the match at the start offset. Our
* compiler allows this by prefixing the code with an implicit '(?:.*?)'. For
* regular matches() calls, we want to skip that code and start at {@code
* findPrefixLength} instead.
*/
private final int findPrefixLength;
private final CharacterMatcher[] classes;
private final PikeVM[] lookarounds;
private final static CharacterMatcher wordCharacter =
CharacterMatcher.parse("\\w");
private final static CharacterMatcher lineTerminator =
CharacterMatcher.parse("[\n\r\u0085\u2028\u2029]");
private boolean multiLine;
/**
* Receives the group offsets of a successful match.
*/
public interface Result {
/**
* @param start the start offset of each group, indexed by group number
* @param end the end offset of each group, indexed by group number
*/
void set(int[] start, int[] end);
}
/**
* Creates a VM for a compiled program.
*
* @param program the program words
* @param findPrefixLength the offset of the first program word after the
* find() prefix
* @param groupCount the highest group index; two offsets (start and end)
* are tracked for each of the groups 0..groupCount
* @param classes the character classes referenced by the program
* @param lookarounds the VMs of the look-arounds referenced by the program
*/
protected PikeVM(int[] program, int findPrefixLength, int groupCount,
CharacterMatcher[] classes, PikeVM[] lookarounds)
{
this.program = program;
this.findPrefixLength = findPrefixLength;
this.groupCount = groupCount;
offsetsCount = 2 * groupCount + 2;
this.classes = classes;
this.lookarounds = lookarounds;
}
/**
* The current thread states.
* <p>
* The threads are identified by their program counter. The rationale: as all
* threads are executed in lock-step, i.e. for the same character in the
* string to be matched, it does not make sense for two threads to be at the
* same program counter -- they would both do exactly the same for the rest of
* the execution.
* </p>
* <p>
* For efficiency, the threads are kept in a linked list that actually lives
* in an array indexed by the program counter, pointing to the next thread's
* program counter, in the order of high to low priority.
* </p>
* <p>
* The {@code next} array stores 1 + the program counter of the following
* thread, so 0 means "no successor": the program counter is either not
* scheduled at all or belongs to the least-priority thread (the last one in
* the linked list). The two cases are told apart by comparing against
* {@code tail}.
* </p>
* <p>
* We actually never need to have an explicit value for the priority, the
* ordering is sufficient: whenever a new thread is to be scheduled and it is
* found to be scheduled already, it was already scheduled by a
* higher-priority thread.
* </p>
*/
private class ThreadQueue {
private int head, tail;
// next[pc] is 1 + the next thread's pc
private int[] next;
// offsets[pc][2 * group] is 1 + start offset
private int[][] offsets;
public ThreadQueue() {
head = tail = -1;
next = new int[program.length + 1];
offsets = new int[program.length + 1][];
}
public ThreadQueue(int startPC) {
head = tail = startPC;
next = new int[program.length + 1];
offsets = new int[program.length + 1][];
offsets[head] = new int[offsetsCount];
}
public int queueOneImmediately(ThreadQueue into) {
for (;;) {
if (head < 0) {
return -1;
}
boolean wasQueued = queueNext(head, head, into);
int pc = head;
if (head == tail) {
head = tail = -1;
} else {
head = next[pc] - 1;
next[pc] = 0;
}
offsets[pc] = null;
if (wasQueued) {
into.tail = pc;
return pc;
}
}
}
/**
* Schedules the instruction at {@code nextPC} to be executed immediately.
* <p>
* For non-matching steps (SPLIT, SAVE_STATE, etc) we need to schedule the
* corresponding program counter(s) to be handled right after this opcode,
* before advancing to the next character.
* </p>
* <p>
* To achieve this, we insert the program counter to-be-scheduled in the
* linked thread list at the current position, but only if it has not been
* scheduled yet: if it has, a higher-priority thread already reached that
* state.
* </p>
* <p>
* In contrast to {@link #queueNext(int, int, ThreadQueue)}, this method
* works on the current step's thread list.
* </p>
*
* @param currentPC
* the current program counter
* @param nextPC
* the program counter to schedule
* @param copyThreadState
* whether to spawn off a new thread
* @return whether the step was queued (i.e. no thread was queued for the
* same {@code nextPC} already)
*/
public boolean queueImmediately(int currentPC, int nextPC,
boolean copyThreadState) {
if (isScheduled(nextPC)) {
return false;
}
int[] offsets = this.offsets[currentPC];
if (copyThreadState) {
offsets = java.util.Arrays.copyOf(offsets, offsetsCount);
}
if (currentPC == tail) {
tail = nextPC;
} else {
next[nextPC] = next[currentPC];
}
this.offsets[nextPC] = offsets;
next[currentPC] = nextPC + 1;
return true;
}
/**
* Schedules the instruction at {@code nextPC} to be executed in the next
* step.
* <p>
* This method advances the current thread to the next program counter, to
* be executed after reading the next character.
* </p>
*
* @param currentPC
* the current program counter
* @param nextPC
* the program counter to schedule
* @param next
* the thread state of the next step
* @return whether the step was queued (i.e. no thread was queued for the
* same {@code nextPC} already)
*/
private boolean queueNext(int currentPC, int nextPC, ThreadQueue next) {
if (next.tail < 0) {
next.head = nextPC;
} else if (next.isScheduled(nextPC)) {
return false;
} else {
next.next[next.tail] = nextPC + 1;
}
next.offsets[nextPC] = offsets[currentPC];
next.tail = nextPC;
return true;
}
public void saveOffset(int pc, int index, int offset) {
offsets[pc][index] = offset + 1;
}
public void setResult(Result result) {
// copy offsets
int[] offsets = this.offsets[program.length];
int[] groupStart = new int[groupCount + 1];
int[] groupEnd = new int[groupCount + 1];
for (int j = 0; j <= groupCount; ++j) {
groupStart[j] = offsets[2 * j] - 1;
groupEnd[j] = offsets[2 * j + 1] - 1;
}
result.set(groupStart, groupEnd);
}
/**
 * Removes every queued thread whose match does not start at {@code start}.
 * <p>
 * Slot 0 of a thread's offsets array holds the start offset incremented by
 * one, hence the comparison against {@code start + 1}. Threads that are
 * removed have their link and their offsets array cleared so that
 * {@link #isScheduled} no longer reports them.
 * </p>
 * <p>
 * Removing the tail only unlinks the tail: threads retained earlier in the
 * list stay queued and the last retained thread becomes the new tail.
 * </p>
 *
 * @param start
 *          the start offset the surviving threads must have recorded
 */
private void mustStartMatchAt(int start) {
  // last thread that was kept so far, or -1 if none
  int previous = -1;
  for (int pc = head; pc >= 0; ) {
    // links are stored as "pc + 1"; remember the successor before unlinking
    int nextPC = next[pc] - 1;
    if (start + 1 == offsets[pc][0]) {
      previous = pc;
    } else {
      next[pc] = 0;
      offsets[pc] = null;
      if (pc == tail) {
        // the removed thread was the last one: the list now ends at the
        // last retained thread (or is empty if there is none)
        tail = previous;
        if (previous < 0) {
          head = -1;
        } else {
          next[previous] = 0;
        }
      } else if (previous < 0) {
        head = nextPC;
      } else {
        next[previous] = 1 + nextPC;
      }
    }
    pc = nextPC;
  }
}
/**
 * Reads back the offset stored in slot 0 of the thread queued at
 * {@code pc}, undoing the increment applied when it was saved.
 *
 * @param pc
 *          the program counter identifying the thread
 * @return the value of slot 0 minus one
 */
private int startOffset(int pc) {
  final int[] threadOffsets = offsets[pc];
  return threadOffsets[0] - 1;
}
/**
 * Tells whether no thread is queued.
 *
 * @return {@code true} if the list has no head
 */
public boolean isEmpty() {
  if (head >= 0) {
    return false;
  }
  return true;
}
/**
 * Tells whether a thread is queued at the given program counter.
 * <p>
 * A queued entry either is the tail of the list or has a non-zero link to
 * its successor.
 * </p>
 *
 * @param pc
 *          the program counter to look up
 * @return whether {@code pc} is part of the list
 */
public boolean isScheduled(int pc) {
  if (pc == tail) {
    return true;
  }
  return next[pc] > 0;
}
/**
 * Iterates over the queued program counters.
 *
 * @param pc
 *          the program counter visited last, or a negative value to start
 *          the iteration
 * @return the head of the list if {@code pc} is negative, otherwise the
 *         successor of {@code pc}; negative when there is none
 */
public int next(int pc) {
  if (pc < 0) {
    return head;
  }
  // links are stored as "pc + 1", so 0 turns into -1 here
  return next[pc] - 1;
}
/**
 * Empties the queue, clearing the link and the offsets array of every
 * thread reachable from the head.
 */
public void clean() {
  int pc = head;
  while (pc >= 0) {
    // fetch the successor before the link is wiped
    final int following = next[pc] - 1;
    offsets[pc] = null;
    next[pc] = 0;
    pc = following;
  }
  tail = -1;
  head = -1;
}
}
/**
* Executes the Pike VM defined by the program.
* <p>
* The idea is to execute threads in parallel, at each step executing them
* from the highest priority thread to the lowest one. In contrast to most
* regular expression engines, the Thompson/Pike one gets away with linear
* complexity because the string is matched from left to right, at each step
* executing a number of threads bounded by the length of the program: if two
* threads would execute at the same instruction pointer of the program, we
* need only consider the higher-priority one.
* </p>
* <p>
* This implementation is based on the description of <a
* href="http://swtch.com/%7Ersc/regexp/regexp2.html">Russ Cox</a>.
* </p>
* <p>
* The input is scanned forwards when {@code end} is greater than
* {@code start} and backwards otherwise (the look-behind opcodes call this
* method with an end offset of {@code -1}). One extra step is executed at
* {@code end} itself, with a NUL standing in for the missing character.
* </p>
*
* @param characters
* the characters to match against
* @param start
* the start offset where to match
* @param end
* the end offset
* @param anchorStart
* whether the match must start at {@code start}
* @param anchorEnd
* whether the match must end at {@code end}
* @param result
* the {@link Result} to store the groups' offsets in, if successful;
* may be {@code null} if only the return value is of interest, in which
* case the method returns as soon as any match is found
* @return whether a match was found
*/
public boolean matches(char[] characters, int start, int end,
boolean anchorStart, boolean anchorEnd, Result result)
{
ThreadQueue current = new ThreadQueue();
ThreadQueue next = new ThreadQueue();
// initialize the first thread
int startPC = anchorStart ? findPrefixLength : 0;
ThreadQueue queued = new ThreadQueue(startPC);
boolean foundMatch = false;
// direction of the scan: backwards unless end lies behind start
int step = end > start ? +1 : -1;
// note: the loop body also runs for i == end
for (int i = start; i != end + step; i += step) {
if (queued.isEmpty()) {
// no threads left
return foundMatch;
}
// at the end offset there is no character to read; use NUL instead
char c = i != end ? characters[i] : 0;
int pc = -1;
for (;;) {
// continue with the current step's list; once that is exhausted, pull
// over one more thread from the ones queued by the previous step
pc = current.next(pc);
if (pc < 0) {
pc = queued.queueOneImmediately(current);
}
if (pc < 0) {
break;
}
// pc == program.length is a match!
if (pc == program.length) {
if (anchorEnd && i != end) {
// the match does not end at the end offset: ignore this thread
continue;
}
if (result == null) {
// only interested in a match, no need to go on
return true;
}
current.setResult(result);
// now that we found a match, even higher-priority matches must match
// at the same start offset
if (!anchorStart) {
next.mustStartMatchAt(current.startOffset(pc));
}
foundMatch = true;
// the remaining threads of this step are not executed anymore
break;
}
int opcode = program[pc];
switch (opcode) {
case DOT:
if (c != '\0' && c != '\r' && c != '\n') {
current.queueNext(pc, pc + 1, next);
}
break;
case DOTALL:
current.queueNext(pc, pc + 1, next);
break;
case WORD_BOUNDARY:
case NON_WORD_BOUNDARY: {
// c2 is the character read in the previous step, or -1 if there is none
int i2 = i - step;
int c2 = i2 < 0 || i2 >= characters.length ? -1 : characters[i2];
switch (opcode) {
case WORD_BOUNDARY:
if ((c2 < 0 || !wordCharacter.matches((char)c2))) {
if (wordCharacter.matches(c)) {
current.queueImmediately(pc, pc + 1, false);
}
} else if (i >= 0 && i < characters.length &&
!wordCharacter.matches(c)) {
current.queueImmediately(pc, pc + 1, false);
}
break;
case NON_WORD_BOUNDARY:
if ((c2 < 0 || !wordCharacter.matches((char)c2))) {
if (i >= 0 && i < characters.length &&
!wordCharacter.matches(c)) {
current.queueImmediately(pc, pc + 1, false);
}
} else if (wordCharacter.matches(c)) {
current.queueImmediately(pc, pc + 1, false);
}
break;
}
break;
}
case LINE_START:
// NOTE(review): when scanning backwards with multiLine set, i can be -1
// here, which makes characters[i - 1] an out-of-bounds access -- confirm
// that line anchors never end up in programs that are run backwards
if (i == 0 || (multiLine &&
lineTerminator.matches(characters[i - 1]))) {
current.queueImmediately(pc, pc + 1, false);
}
break;
case LINE_END:
if (i == characters.length || (multiLine &&
lineTerminator.matches(c))) {
current.queueImmediately(pc, pc + 1, false);
}
break;
case CHARACTER_CLASS:
// the argument is an index into the classes array
if (classes[program[pc + 1]].matches(c)) {
current.queueNext(pc, pc + 2, next);
}
break;
// the look-around opcodes run a nested machine (the argument is an index
// into the lookarounds array) anchored at the current offset; look-aheads
// scan forwards to the end of the input, look-behinds scan backwards
case LOOKAHEAD:
if (lookarounds[program[pc + 1]].matches(characters,
i, characters.length, true, false, null)) {
current.queueImmediately(pc, pc + 2, false);
}
break;
case LOOKBEHIND:
if (lookarounds[program[pc + 1]].matches(characters,
i - 1, -1, true, false, null)) {
current.queueImmediately(pc, pc + 2, false);
}
break;
case NEGATIVE_LOOKAHEAD:
if (!lookarounds[program[pc + 1]].matches(characters,
i, characters.length, true, false, null)) {
current.queueImmediately(pc, pc + 2, false);
}
break;
case NEGATIVE_LOOKBEHIND:
if (!lookarounds[program[pc + 1]].matches(characters,
i - 1, -1, true, false, null)) {
current.queueImmediately(pc, pc + 2, false);
}
break;
/* immediate opcodes, i.e. thread continues within the same step */
case SAVE_OFFSET:
// offsets are only tracked if the caller wants to see them
if (result != null) {
int index = program[pc + 1];
current.saveOffset(pc, index, i);
}
current.queueImmediately(pc, pc + 2, false);
break;
case SPLIT:
// the jump target gets a copy of the thread state, the fall-through
// keeps the original one
current.queueImmediately(pc, program[pc + 1], true);
current.queueImmediately(pc, pc + 2, false);
break;
case SPLIT_JMP:
// like SPLIT, but here the fall-through gets the copy
current.queueImmediately(pc, pc + 2, true);
current.queueImmediately(pc, program[pc + 1], false);
break;
case JMP:
current.queueImmediately(pc, program[pc + 1], false);
break;
default:
// any value in the range of a char is a literal character to match
if (program[pc] >= 0 && program[pc] <= 0xffff) {
if (c == (char)program[pc]) {
current.queueNext(pc, pc + 1, next);
}
break;
}
throw new RuntimeException("Invalid opcode: " + opcode
+ " at pc " + pc);
}
}
// clean linked thread list (and states)
current.clean();
// prepare for next step
ThreadQueue swap = queued;
queued = next;
next = swap;
}
return foundMatch;
}
/**
 * Extracts the literal string this machine matches, if it matches nothing
 * but a literal string.
 * <p>
 * A regular expression without any special operators does not need a
 * full-blown Pike VM; the caller can fall back to the much faster
 * {@link TrivialPattern} instead.
 * </p>
 *
 * @return the string to match, or null if the program contains any opcode
 *         (i.e. any negative value) other than the enclosing
 *         {@code SAVE_OFFSET 0} / {@code SAVE_OFFSET 1} pair
 */
public String isPlainString() {
  // skip the find preamble and, if present, the leading SAVE_OFFSET 0
  int from = findPrefixLength;
  if (from + 1 < program.length
      && program[from] == SAVE_OFFSET && program[from + 1] == 0) {
    from += 2;
  }
  // likewise ignore a trailing SAVE_OFFSET 1
  int to = program.length;
  if (to > from + 1
      && program[to - 2] == SAVE_OFFSET && program[to - 1] == 1) {
    to -= 2;
  }
  final char[] literal = new char[to - from];
  for (int pc = from; pc < to; ++pc) {
    final int code = program[pc];
    if (code < 0) {
      // an opcode: not a plain string
      return null;
    }
    literal[pc - from] = (char)code;
  }
  return new String(literal);
}
/**
 * Determines how many program slots an instruction occupies.
 *
 * @param opcode
 *          the opcode (or literal character) of the instruction
 * @return 2 for values between {@code SINGLE_ARG_END} and
 *         {@code SINGLE_ARG_START} (inclusive), 1 otherwise
 */
private static int length(int opcode) {
  final boolean hasArgument =
    opcode >= SINGLE_ARG_END && opcode <= SINGLE_ARG_START;
  return hasArgument ? 2 : 1;
}
/**
 * Tells whether an opcode lies in the range from {@code JMP} to
 * {@code SPLIT} (inclusive).
 *
 * @param opcode
 *          the opcode to test
 * @return whether the opcode is one of the jumping ones
 */
private static boolean isJump(int opcode) {
  return opcode >= JMP && opcode <= SPLIT;
}
/**
 * Turns the program into one matching the reverse pattern.
 * <p>
 * Any regular expression can be reordered into an equivalent one that is
 * applied in backward direction, which is what look-behind expressions
 * need. For instance, rather than matching "aaaabb" against "a+b+", the
 * reversed input "bbaaaa" can be matched against "b+a+".
 * </p>
 * <p>
 * The equivalence only covers whether there is a match at all; submatches
 * may differ. Given the input "a" and the pattern "(a?)a?", the forward
 * direction captures "a" while the backward direction captures the empty
 * string. For that reason, Java dictates that capturing groups in
 * look-behind patterns are ignored.
 * </p>
 * <p>
 * Everything from the end of the find prefix up to the end of the program
 * is reversed.
 * </p>
 */
public void reverse() {
  final int programEnd = program.length;
  reverse(findPrefixLength, programEnd);
}
/**
* Reverses a specific part of the program (to match in reverse direction).
* <p>
* This is the work-horse of {@link #reverse()}.
* </p>
* <p>
* To visualize the process of reversing a program, let's look at it as a
* directed graph (each jump is represented by an "<tt>X</tt>", non-jumping
* steps are represented by "<tt>o</tt>"s, arrows show the direction of the
* flow, <code>SPLIT</code>s spawn two arrows):
*
* <pre>
* o -> X -> X -> o -> X o -> o
* ^ | \ \___^____^
* \__/ \____________|
* </pre>
*
* The concept of reversing the program is most easily explained as follows:
* if we insert auxiliary nodes "<tt>Y</tt>" for jump targets, the graph looks
* like this instead:
*
* <pre>
* Y -> o -> X -> X -> o -> X Y -> o -> Y -> o
* ^ | \ \___^_________^
* \_______/ \____________|
* </pre>
*
* It is now obvious that reversing the program is equivalent to reversing all
* arrows, simply deleting all <tt>X</tt>s and substituting each <tt>Y</tt>
* with a jump. Note that the reverse program will have the same number of
* <tt>JMP</tt>s, but they will not be associated with the same arrows!:
*
* <pre>
* X <- o <- o X <- o <- X <- o
* | ^ ^____|________/
* \__/ \_______/
* </pre>
*
* </p>
* <p>
* The reversed instructions are written back into the same slots of the
* program.
* </p>
* @param start
* start reversing the program with this instruction
* @param end
* stop reversing at this instruction (this must be either an index
* aligned exactly with an instruction, or exactly
* {@code program.length}).
*/
private void reverse(int start, int end) {
// Pass 1: build the list of jump targets
// newJumps[target] heads a chain of all jumps pointing at target; each
// chain element is the index of a jump's argument slot (always > 0), and
// newJumps[that index] holds the next element, 0 terminating the chain
int[] newJumps = new int[end + 1];
// marks instructions directly following a JMP, i.e. those which are not
// reached by falling through from their predecessor
boolean[] brokenArrows = new boolean[end + 1];
for (int pc = start; pc < end; pc += length(program[pc])) {
if (isJump(program[pc])) {
int target = program[pc + 1];
newJumps[pc + 1] = newJumps[target];
newJumps[target] = pc + 1;
if (program[pc] == JMP) {
brokenArrows[pc + 2] = true;
}
}
}
// Pass 2: determine mapped program counters
// the new program is laid out from the end towards the start: every jump
// arriving at an instruction takes up two slots in front of it, and
// non-jumping instructions keep their length
int[] mapping = new int[end];
for (int pc = start, mappedPC = end; mappedPC > 0
&& pc < end; pc += length(program[pc])) {
for (int jump = newJumps[pc]; jump > 0; jump = newJumps[jump]) {
mappedPC -= 2;
}
if (!isJump(program[pc])) {
mappedPC -= length(program[pc]);
}
mapping[pc] = mappedPC;
}
// Pass 3: write the new program
int[] reverse = new int[end];
for (int pc = start, mappedPC = end; mappedPC > 0;
pc += length(program[pc])) {
boolean brokenArrow = brokenArrows[pc];
// emit one jump per arrow that used to arrive here; its target is the
// mapped location of the jump the arrow originated from
for (int jump = newJumps[pc]; jump > 0; jump = newJumps[jump]) {
reverse[--mappedPC] = mapping[jump - 1];
if (brokenArrow) {
// nothing falls through into this instruction, so the first emitted
// jump is an unconditional one
reverse[--mappedPC] = JMP;
brokenArrow = false;
} else {
reverse[--mappedPC] =
program[jump - 1] == SPLIT_JMP ? SPLIT_JMP : SPLIT;
}
}
// pc == end only serves to emit the jumps for arrows pointing at the end
if (pc == end) {
break;
}
if (!isJump(program[pc])) {
// copy the instruction including its argument, if any
for (int i = length(program[pc]); i-- > 0; ) {
reverse[--mappedPC] = program[pc + i];
}
}
}
System.arraycopy(reverse, start, program, start, end - start);
}
}

View File

@ -0,0 +1,45 @@
/* Copyright (c) 2008-2013, Avian Contributors
Permission to use, copy, modify, and/or distribute this software
for any purpose with or without fee is hereby granted, provided
that the above copyright notice and this permission notice appear
in all copies.
There is NO WARRANTY for this software. See license.txt for
details. */
package java.util.regex;
/**
 * The instruction set of the Pike VM.
 * <p>
 * All opcodes are negative numbers; see {@link PikeVM} for the way they are
 * executed.
 * </p>
 *
 * @author Johannes Schindelin
 */
interface PikeVMOpcodes {
  // fields of an interface are implicitly public, static and final

  /* opcodes without argument */
  int DOT = -1;
  int DOTALL = -2;

  int WORD_BOUNDARY = -10;
  int NON_WORD_BOUNDARY = -11;
  int LINE_START = -12;
  int LINE_END = -13;

  /* opcodes followed by a single argument */
  int CHARACTER_CLASS = -20;

  int LOOKAHEAD = -30;
  int LOOKBEHIND = -31;
  int NEGATIVE_LOOKAHEAD = -32;
  int NEGATIVE_LOOKBEHIND = -33;

  int SAVE_OFFSET = -40;

  int SPLIT = -50;
  int SPLIT_JMP = -51; // this split prefers to jump
  int JMP = -52;

  /* bounds of the range of opcodes that are followed by an argument */
  int SINGLE_ARG_START = CHARACTER_CLASS;
  int SINGLE_ARG_END = JMP;
}

View File

@ -0,0 +1,80 @@
/* Copyright (c) 2008-2013, Avian Contributors
Permission to use, copy, modify, and/or distribute this software
for any purpose with or without fee is hereby granted, provided
that the above copyright notice and this permission notice appear
in all copies.
There is NO WARRANTY for this software. See license.txt for
details. */
package java.util.regex;
/**
 * A minimal implementation of a regular expression matcher.
 * <p>
 * All the matching is delegated to a {@link PikeVM}; this class merely
 * keeps the input as a character array and remembers the group offsets of
 * the most recent successful match.
 * </p>
 *
 * @author Johannes Schindelin
 */
public class RegexMatcher extends Matcher {
  private final PikeVM vm;
  // set by reset(CharSequence); presumably the superclass constructor calls
  // that method -- verify against Matcher
  private char[] array;
  // group offsets of the last successful match; null before the first one
  int[] groupStart, groupEnd;

  RegexMatcher(PikeVM vm, CharSequence string) {
    super(string);
    this.vm = vm;
  }

  // receives the offsets from the VM whenever a match was found
  private final PikeVM.Result adapter = new PikeVM.Result() {
    public void set(int[] start, int[] end) {
      RegexMatcher.this.start = start[0];
      RegexMatcher.this.end = end[0];
      RegexMatcher.this.groupStart = start;
      RegexMatcher.this.groupEnd = end;
    }
  };

  public Matcher reset() {
    start = end = -1;
    return this;
  }

  public Matcher reset(CharSequence input) {
    this.input = input;
    array = input.toString().toCharArray();
    return reset();
  }

  /**
   * Matches the whole input against the pattern.
   *
   * @return whether the complete input matches
   */
  public boolean matches() {
    return vm.matches(array, 0, array.length, true, true, adapter);
  }

  /**
   * Looks for the next match, continuing behind the previous one.
   * <p>
   * After an empty match, the search resumes one character further on, as
   * the same empty match would be found again otherwise. If that position
   * lies beyond the end of the input, there cannot be another match.
   * </p>
   *
   * @return whether another match was found
   */
  public boolean find() {
    int offset = end + (start == end ? 1 : 0);
    if (offset > array.length) {
      // the previous match was empty and sat at the very end of the input;
      // asking the VM to start behind the end would make it scan backwards
      // from an invalid index
      return false;
    }
    return find(offset);
  }

  /**
   * Looks for the first match starting at or behind the given offset.
   *
   * @param offset
   *          the index into the input at which to start searching
   * @return whether a match was found
   */
  public boolean find(int offset) {
    return vm.matches(array, offset, array.length, false, false, adapter);
  }

  public int start(int group) {
    return groupStart[group];
  }

  public int end(int group) {
    return groupEnd[group];
  }

  /**
   * Returns the text captured by the given group in the last match.
   *
   * @param group
   *          the group number, 0 denoting the whole match
   * @return the captured text, or null if the group's start offset is
   *         negative
   */
  public String group(int group) {
    int offset = start(group);
    if (offset < 0) {
      return null;
    }
    int length = end(group) - offset;
    return new String(array, offset, length);
  }

  public int groupCount() {
    // index 0 is the whole match and does not count as a group
    return groupStart.length - 1;
  }
}

View File

@ -0,0 +1,57 @@
/* Copyright (c) 2008-2013, Avian Contributors
Permission to use, copy, modify, and/or distribute this software
for any purpose with or without fee is hereby granted, provided
that the above copyright notice and this permission notice appear
in all copies.
There is NO WARRANTY for this software. See license.txt for
details. */
package java.util.regex;
/**
 * A minimal regular expression engine backed by a {@link PikeVM}.
 * <p>
 * This class is meant to be a permissively-licensed drop-in replacement for
 * Oracle JDK's regular expression engine.
 * </p>
 * <p>
 * Compared to the backtracking approach of Oracle JDK, the Pike VM has a
 * nicer runtime performance -- <i>O(n*m)</i> instead of <i>O(2^m)</i>, with
 * <i>n</i> being the length of the regular expression pattern (after
 * normalizing {&lt;n&gt;} quantifiers) and <i>m</i> the length of the text
 * to match against the pattern -- and it supports arbitrary-sized
 * look-behinds, too.
 * </p>
 * <p>
 * Of the constructs supported by Oracle JDK's regular expression engine,
 * the current implementation lacks the following ones:
 * <ul>
 * <li>control characters: \cX</li>
 * <li>extended character classes: \p{...}</li>
 * <li>extended boundary matchers: \A,\G,\Z,\z</li>
 * <li>possessive quantifiers: X?+</li>
 * <li>back references: \&lt;n&gt;, \k&lt;name&gt;</li>
 * <li>long escape: \Q, \E</li>
 * <li>named groups: (?&lt;name&gt;X)</li>
 * <li>flags: (?idmsuxU)</li>
 * <li>independent, non-capturing group: (?>X)</li>
 * </ul>
 * </p>
 *
 * @author Johannes Schindelin
 */
public class RegexPattern extends Pattern {
  // the compiled program executing this pattern
  private PikeVM vm;

  RegexPattern(String regex, int flags, PikeVM vm) {
    super(regex, flags);
    this.vm = vm;
  }

  /**
   * Creates a matcher applying this pattern to the given input.
   *
   * @param string
   *          the text to match against
   * @return a new matcher sharing this pattern's VM
   */
  public RegexMatcher matcher(CharSequence string) {
    final RegexMatcher matcher = new RegexMatcher(vm, string);
    return matcher;
  }
}

View File

@ -0,0 +1,48 @@
/* Copyright (c) 2008-2013, Avian Contributors
Permission to use, copy, modify, and/or distribute this software
for any purpose with or without fee is hereby granted, provided
that the above copyright notice and this permission notice appear
in all copies.
There is NO WARRANTY for this software. See license.txt for
details. */
package java.util.regex;
/**
 * A matcher for patterns which are plain strings. This is a work in
 * progress.
 *
 * @author zsombor and others
 */
class TrivialMatcher extends Matcher {
  // the literal text to look for
  private final String pattern;

  TrivialMatcher(String pattern, CharSequence input) {
    super(input);
    this.pattern = pattern;
  }

  /**
   * Tests whether the whole input equals the pattern, recording the match
   * offsets if so.
   */
  public boolean matches() {
    if (!pattern.equals(input.toString())) {
      return false;
    }
    start = 0;
    end = input.length();
    return true;
  }

  /**
   * Searches for the pattern at or behind the given offset, recording the
   * match offsets if it was found.
   */
  public boolean find(int start) {
    final int hit = TrivialPattern.indexOf(input, pattern, start);
    if (hit < 0) {
      return false;
    }
    this.start = hit;
    this.end = hit + pattern.length();
    return true;
  }
}

View File

@ -0,0 +1,112 @@
/* Copyright (c) 2008-2013, Avian Contributors
Permission to use, copy, modify, and/or distribute this software
for any purpose with or without fee is hereby granted, provided
that the above copyright notice and this permission notice appear
in all copies.
There is NO WARRANTY for this software. See license.txt for
details. */
package java.util.regex;
import java.util.Iterator;
import java.util.List;
import java.util.LinkedList;
/**
 * A pattern consisting of nothing but a literal string. This is a work in
 * progress.
 *
 * @author zsombor and others
 *
 */
public class TrivialPattern extends Pattern {
  // the literal text this pattern matches
  private final String unescaped;

  TrivialPattern(String pattern, String unescaped, int flags) {
    super(pattern, flags);
    this.unescaped = unescaped;
  }

  public Matcher matcher(CharSequence input) {
    return new TrivialMatcher(unescaped, input);
  }

  /**
   * Splits the input around occurrences of the literal.
   *
   * @param input
   *          the text to split
   * @param limit
   *          if positive, the maximal number of pieces; if zero, trailing
   *          empty pieces are stripped; if negative, there is no limit and
   *          nothing is stripped
   * @return the pieces, in the order they occur in the input
   */
  public String[] split(CharSequence input, int limit) {
    // only a limit of exactly zero asks for stripping
    final boolean strip = limit == 0;
    if (limit <= 0) {
      limit = Integer.MAX_VALUE;
    }

    final List<CharSequence> pieces = new LinkedList<CharSequence>();
    final int inputLength = input.length();
    final int separatorLength = unescaped.length();
    int position = 0;
    // number of consecutive empty pieces at the end of the list
    int emptyRun = 0;
    while (position < inputLength && pieces.size() < limit - 1) {
      final int found;
      if (separatorLength != 0) {
        found = indexOf(input, unescaped, position);
      } else {
        // an empty separator matches in front of the input and then after
        // every single character
        found = pieces.size() == 0 ? 0 : position + 1;
      }
      if (found < 0) {
        break;
      }
      emptyRun = separatorLength != 0 && found == position ? emptyRun + 1 : 0;
      pieces.add(input.subSequence(position, found));
      position = found + separatorLength;
    }

    // the remainder counts as trailing empty piece only if we are to strip
    // and the input was consumed completely
    emptyRun = strip && position > 0 && position == inputLength
      ? emptyRun + 1 : 0;
    pieces.add(input.subSequence(position, inputLength));

    final String[] result = new String[pieces.size() - emptyRun];
    final Iterator<CharSequence> iterator = pieces.iterator();
    for (int k = 0; k < result.length && iterator.hasNext(); ++k) {
      result[k] = iterator.next().toString();
    }
    return result;
  }

  /**
   * Finds the first occurrence of a character sequence in another one.
   *
   * @param haystack
   *          the text to search in
   * @param needle
   *          the text to search for
   * @param start
   *          the index at which to start searching
   * @return the index of the first occurrence at or behind {@code start},
   *         {@code start} itself if the needle is empty, or -1 if there is
   *         no occurrence
   */
  static int indexOf(CharSequence haystack, CharSequence needle, int start) {
    final int needleLength = needle.length();
    if (needleLength == 0) {
      return start;
    }
    final int last = haystack.length() - needleLength;
    candidates:
    for (int at = start; at <= last; ++at) {
      for (int k = 0; k < needleLength; ++k) {
        if (haystack.charAt(at + k) != needle.charAt(k)) {
          continue candidates;
        }
      }
      return at;
    }
    return -1;
  }
}

96
test/Regex.java Normal file
View File

@ -0,0 +1,96 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Regression tests for the regular expression engine; any failed
 * expectation aborts the run with a RuntimeException.
 */
public class Regex {
  /** Fails the test run unless the condition holds. */
  private static void expect(boolean v) {
    if (v) {
      return;
    }
    throw new RuntimeException();
  }

  /** Compiles the regular expression and applies it to the string. */
  private static Matcher getMatcher(String regex, String string) {
    Pattern pattern = Pattern.compile(regex);
    return pattern.matcher(string);
  }

  /** Expects the regular expression to match the complete string. */
  private static void expectMatch(String regex, String string) {
    Matcher matcher = getMatcher(regex, string);
    expect(matcher.matches());
  }

  /** Expects the regular expression not to match the complete string. */
  private static void expectNoMatch(String regex, String string) {
    Matcher matcher = getMatcher(regex, string);
    expect(!matcher.matches());
  }

  /**
   * Expects a complete match whose capturing groups have the given values
   * (null standing for a group that did not take part in the match).
   */
  private static void expectGroups(String regex, String string,
      String... groups) {
    Matcher matcher = getMatcher(regex, string);
    expect(matcher.matches());
    expect(matcher.groupCount() == groups.length);
    for (int index = 0; index < groups.length; ++index) {
      String expected = groups[index];
      // group 0 is the whole match, so the groups are 1-based
      String actual = matcher.group(index + 1);
      expect(expected == null ? actual == null : expected.equals(actual));
    }
  }

  /** Expects find() to yield exactly the given matches, in that order. */
  private static void expectFind(String regex, String string,
      String... matches)
  {
    Matcher matcher = getMatcher(regex, string);
    for (String expected : matches) {
      expect(matcher.find());
      expect(expected.equals(matcher.group()));
    }
    expect(!matcher.find());
  }

  /** Expects splitting the string to yield exactly the given pieces. */
  private static void expectSplit(String regex, String string,
      String... list)
  {
    String[] pieces = Pattern.compile(regex).split(string);
    expect(pieces.length == list.length);
    for (int index = 0; index < list.length; ++index) {
      expect(list[index].equals(pieces[index]));
    }
  }

  public static void main(String[] args) {
    // quantifiers and groups
    expectMatch("a(bb)?a", "abba");
    expectNoMatch("a(bb)?a", "abbba");
    expectNoMatch("a(bb)?a", "abbaa");
    expectGroups("a(a*?)(a?)(a??)(a+)(a*)a", "aaaaaa", "", "a", "", "aaa", "");
    expectMatch("...", "abc");
    expectNoMatch(".", "\n");
    expectGroups("a(bb)*a", "abbbba", "bb");
    expectGroups("a(bb)?(bb)+a", "abba", null, "bb");
    expectFind(" +", "Hello , world! ", " ", " ", " ");
    // character classes, alternatives and look-arounds
    expectMatch("[0-9A-Fa-f]+", "08ef");
    expectNoMatch("[0-9A-Fa-f]+", "08@ef");
    expectGroups("(?:a)", "a");
    expectGroups("a|(b|c)", "a", (String)null);
    expectGroups("a|(b|c)", "c", "c");
    expectGroups("(?=a)a", "a");
    expectGroups(".*(o)(?<=[A-Z][a-z]*)", "Hello", "o");
    expectNoMatch("(?!a).", "a");
    // escapes
    expectMatch("[\\d]", "0");
    expectMatch("\\0777", "?7");
    expectMatch("\\a", "\007");
    expectMatch("\\\\", "\\");
    expectMatch("\\x4A", "J");
    expectMatch("\\x61", "a");
    expectMatch("\\078", "\0078");
    // splitting, nested classes and boundaries
    expectSplit("(?<=\\w)(?=\\W)|(?<=\\W)(?=\\w)", "a + b * x",
      "a", " + ", "b", " * ", "x");
    expectMatch("[0-9[def]]", "f");
    expectNoMatch("[a-z&&[^d-f]]", "f");
    expectSplit("^H", "Hello\nHobbes!", "", "ello\nHobbes!");
    expectSplit("o.*?$", "Hello\r\nHobbes!", "Hello\r\nH");
    expectSplit("\\b", "a+ b + c\nd", "", "a", "+ ", "b", " + ", "c", "\n", "d");
    expectSplit("\\B", "Hi Cal!", "H", "i C", "a", "l!");
    // counted repetitions
    expectMatch("a{2,5}", "aaaa");
    expectGroups("a??(a{2,5}?)", "aaaa", "aaaa");
    expectGroups("a??(a{3}?)", "aaaa", "aaa");
    expectNoMatch("a(a{3}?)", "aaaaa");
    expectMatch("a(a{3,}?)", "aaaaa");
  }
}