mirror of
https://github.com/corda/corda.git
synced 2025-01-08 14:03:06 +00:00
Merge pull request #105 from dscho/regex
Support (the most common subset of) regular expressions
This commit is contained in:
commit
fe9ac94629
1
.gitignore
vendored
1
.gitignore
vendored
@ -8,3 +8,4 @@ bin
|
|||||||
/lib
|
/lib
|
||||||
/distrib
|
/distrib
|
||||||
*.pdb
|
*.pdb
|
||||||
|
*.swp
|
||||||
|
332
classpath/java/util/regex/CharacterMatcher.java
Normal file
332
classpath/java/util/regex/CharacterMatcher.java
Normal file
@ -0,0 +1,332 @@
|
|||||||
|
/* Copyright (c) 2008-2013, Avian Contributors
|
||||||
|
|
||||||
|
Permission to use, copy, modify, and/or distribute this software
|
||||||
|
for any purpose with or without fee is hereby granted, provided
|
||||||
|
that the above copyright notice and this permission notice appear
|
||||||
|
in all copies.
|
||||||
|
|
||||||
|
There is NO WARRANTY for this software. See license.txt for
|
||||||
|
details. */
|
||||||
|
|
||||||
|
package java.util.regex;
|
||||||
|
|
||||||
|
/**
 * A class to match classes of characters.
 * <p>
 * This class is intended to be the workhorse behind character classes
 * such as {@code [a-z]}.
 * </p>
 * <p>
 * A matcher is a {@code boolean} table indexed by character code plus an
 * inversion flag. Characters beyond the end of the table count as "not set",
 * so a matcher whose flag is set matches every character that is not set in
 * the table.
 * </p>
 * @author Johannes Schindelin
 */
class CharacterMatcher {
  /** Table of explicitly listed characters, indexed by character code. */
  private boolean[] map;
  /** When true, the matcher accepts exactly the characters NOT in the table. */
  private boolean inversePattern;

  /**
   * Parses a complete character class description.
   *
   * @param description the class, e.g. {@code [a-z]} or {@code \d}
   * @return the matcher, or {@code null} for an empty description-free match
   *   (only possible when nothing was recognized and nothing was left over)
   * @throws RuntimeException if the description is not consumed completely
   */
  public static CharacterMatcher parse(String description) {
    return parse(description.toCharArray());
  }

  /**
   * Parses a complete character class description.
   *
   * @param description the characters of the class description
   * @return the matcher described by the whole array
   * @throws RuntimeException if parsing stops before the end of the array
   */
  public static CharacterMatcher parse(char[] description) {
    Parser parser = new Parser(description);
    CharacterMatcher result = parser.parseClass();
    if (parser.getEndOffset() != description.length) {
      throw new RuntimeException("Short character class @"
        + parser.getEndOffset() + ": " + new String(description));
    }
    return result;
  }

  /**
   * Tests whether this class contains the given character.
   *
   * @param c the character to test
   * @return true if the character belongs to the class
   */
  public boolean matches(char c) {
    int index = c;
    return (map.length > index && map[index]) ^ inversePattern;
  }

  /**
   * Renders the class in bracket notation, collapsing runs of adjacent
   * characters into ranges. Intended for diagnostics.
   */
  public String toString() {
    StringBuilder builder = new StringBuilder();
    builder.append("[");
    if (inversePattern) {
      builder.append("^");
    }
    for (int i = 0; i < map.length; ++ i) {
      if (!map[i]) {
        continue;
      }
      appendCharacter(builder, i);
      // find the last character of the run starting at i
      int j = i + 1;
      while (j < map.length && map[j]) {
        ++ j;
      }
      -- j;
      if (j > i) {
        // a run of exactly two characters is printed without the dash
        if (j > i + 1) {
          builder.append('-');
        }
        appendCharacter(builder, j);
        i = j;
      }
    }
    builder.append("]");
    return builder.toString();
  }

  /** Appends a printable ASCII character as-is, anything else as a hex escape. */
  private static void appendCharacter(StringBuilder builder, int c) {
    builder.append(c >= ' ' && c <= 0x7f ?
      "" + (char)c : ("\\x" + Integer.toHexString(c)));
  }

  /**
   * Maps the letter of a predefined class escape to its bracket notation.
   *
   * @param c the character following the backslash
   * @return the equivalent bracket expression, or null if {@code c} does not
   *   name a predefined class
   */
  private static String specialClass(int c) {
    switch (c) {
    case 'd':
      return "[0-9]";
    case 'D':
      return "[^0-9]";
    case 's':
      return "[ \\t\\n\\x0B\\f\\r]";
    case 'S':
      return "[^ \\t\\n\\x0B\\f\\r]";
    case 'w':
      return "[a-zA-Z_0-9]";
    case 'W':
      return "[^a-zA-Z_0-9]";
    default:
      return null;
    }
  }

  private CharacterMatcher(boolean[] map, boolean inversePattern) {
    this.map = map;
    this.inversePattern = inversePattern;
  }

  /** Marks a single character as listed, growing the table if necessary. */
  private void setMatch(int c) {
    ensureCapacity(c + 1);
    map[c] = true;
  }

  /** Grows the table to at least {@code length} entries (powers of two, minimum 32). */
  private void ensureCapacity(int length) {
    if (map.length >= length) {
      return;
    }
    int size = map.length;
    if (size < 32) {
      size = 32;
    }
    while (size < length) {
      size <<= 1;
    }
    map = java.util.Arrays.copyOf(map, size);
  }

  /** Turns this matcher into the union of itself and {@code other}. */
  private void merge(CharacterMatcher other) {
    combine(other, true);
  }

  /** Turns this matcher into the intersection of itself and {@code other}. */
  private void intersect(CharacterMatcher other) {
    combine(other, false);
  }

  /**
   * Replaces this matcher by its union or intersection with {@code other}.
   * <p>
   * The result table always spans the longer of the two input tables. Beyond
   * that length both inputs answer uniformly (listed nowhere), so the result
   * is fully described by the new inversion flag there. Sizing the table by
   * the shorter input, as was done before, dropped characters whenever only
   * one of the operands was inverted.
   * </p>
   *
   * @param other the matcher to combine with
   * @param union true for a union, false for an intersection
   */
  private void combine(CharacterMatcher other, boolean union) {
    // a union is co-finite if either side is; an intersection only if both are
    boolean inverse = union
      ? inversePattern || other.inversePattern
      : inversePattern && other.inversePattern;
    boolean[] combined = new boolean[Math.max(map.length, other.map.length)];
    for (int i = 0; i < combined.length; ++ i) {
      char c = (char)i;
      boolean matched = union
        ? matches(c) || other.matches(c)
        : matches(c) && other.matches(c);
      combined[i] = matched ^ inverse;
    }
    map = combined;
    inversePattern = inverse;
  }

  /**
   * Recursive-descent parser for character classes and escaped characters.
   * The parser keeps a cursor into the description which callers can query
   * via {@link #getEndOffset()} after each parse call.
   */
  static class Parser {
    private final char[] description;
    private int offset;

    public Parser(char[] description) {
      this.description = description;
    }

    /** Returns the offset just after the most recently parsed construct. */
    public int getEndOffset() {
      return offset;
    }

    /**
     * Parses an escaped character.
     *
     * @param start the offset <u>after</u> the backslash
     * @return the escaped character, or -1 if no character was recognized
     */
    public int parseEscapedCharacter(int start) {
      offset = start;
      return parseEscapedCharacter();
    }

    /**
     * Parses an escaped character at the cursor, which must point just after
     * the backslash.
     *
     * @return the character, or -1 if the escape does not denote a single
     *   character (the cursor has then still advanced by one)
     * @throws IllegalArgumentException if the description ends at the cursor
     */
    private int parseEscapedCharacter() {
      if (offset == description.length) {
        throw new IllegalArgumentException("Short escaped character");
      }
      char c = description[offset++];
      if (c == '0') {
        int len = digits(offset, 3, 8);
        // three octal digits only fit into 8 bits if the first is at most 3
        if (len == 3 && description[offset] > '3') {
          --len;
        }
        c = (char)Integer.parseInt(new String(description, offset, len), 8);
        offset += len;
        return c;
      }
      if (c == 'x' || c == 'u') {
        int len = digits(offset, 4, 16);
        c = (char)Integer.parseInt(new String(description, offset, len), 16);
        offset += len;
        return c;
      }
      switch (c) {
      case 'a':
        return 0x0007;
      case 'e':
        return 0x001B;
      case 'f':
        return 0x000C;
      case 'n':
        return 0x000A;
      case 'r':
        return 0x000D;
      case 't':
        return 0x0009;
      case '\\':
      case '.':
      case '*':
      case '+':
      case '?':
      case '|':
      case '[':
      case ']':
      case '{':
      case '}':
      case '(':
      case ')':
      case '^':
      case '$':
        return c;
      }
      return -1;
    }

    /**
     * Counts the digits of the given base starting at {@code offset}.
     *
     * @param offset where to start looking
     * @param maxLength the maximal number of digits to accept
     * @param base the numeric base (letters are accepted as digits for
     *   bases above 10)
     * @return the number of consecutive digits found, at most {@code maxLength}
     */
    public int digits(int offset, int maxLength, int base) {
      int count = 0;
      while (count < maxLength && offset + count < description.length
          && digitValue(description[offset + count], base) >= 0) {
        ++ count;
      }
      return count;
    }

    /**
     * Returns the value of {@code c} as a digit of the given base, or -1.
     * Only ASCII digits and letters are considered; in particular the
     * punctuation between '9' and 'A' is not a digit in any base.
     */
    private static int digitValue(char c, int base) {
      int value;
      if (c >= '0' && c <= '9') {
        value = c - '0';
      } else if (base > 10 && c >= 'a' && c <= 'z') {
        value = c - 'a' + 10;
      } else if (base > 10 && c >= 'A' && c <= 'Z') {
        value = c - 'A' + 10;
      } else {
        return -1;
      }
      return value < base ? value : -1;
    }

    /**
     * Parses a character class starting at the given offset.
     *
     * @param start the offset of the opening bracket or backslash
     * @return the matcher, or null if no class starts there
     */
    public CharacterMatcher parseClass(int start) {
      offset = start;
      return parseClass();
    }

    /**
     * Parses a character class at the cursor: either a bracket expression or
     * one of the predefined classes ({@code \d}, {@code \s}, {@code \w} and
     * their negations).
     *
     * @return the matcher, or null if the cursor is not at a character class
     * @throws UnsupportedOperationException for truncated or unsupported
     *   bracket expressions
     */
    public CharacterMatcher parseClass() {
      if (description[offset] != '[') {
        if (description[offset] == '\\') {
          ++ offset;
          if (offset >= description.length) {
            return null;
          }
          String range = specialClass(description[offset]);
          if (range != null) {
            ++ offset;
            return CharacterMatcher.parse(range);
          }
        }
        return null;
      }
      ++ offset;
      CharacterMatcher matcher = new CharacterMatcher(new boolean[0],
        offset < description.length && description[offset] == '^');
      if (matcher.inversePattern) {
        ++ offset;
      }

      // previous: the last single character seen (a possible range start)
      int previous = -1;
      boolean firstCharacter = true;
      for (;;) {
        if (offset >= description.length) {
          unsupported("short regex");
        }
        char c = description[offset++];
        if (c == '-' && !firstCharacter && offset < description.length
            && description[offset] != ']') {
          if (previous < 0) {
            unsupported("invalid range");
          }
          int rangeEnd = description[offset];
          if ('\\' == rangeEnd) {
            // skip the backslash itself before decoding the escape
            ++ offset;
            rangeEnd = parseEscapedCharacter();
            if (rangeEnd < 0) {
              unsupported("invalid range");
            }
          }
          matcher.ensureCapacity(rangeEnd + 1);
          for (int j = previous + 1; j <= rangeEnd; j++) {
            matcher.map[j] = true;
          }
          // An unescaped range end is still unconsumed at this point and is
          // re-read as a literal by the next iteration, which also makes it
          // the new "previous"; do the same for the consumed escaped form.
          if (offset > 0 && description[offset - 1] != '-') {
            previous = rangeEnd;
          }
        } else if (c == '\\') {
          int saved = offset;
          previous = parseEscapedCharacter();
          if (previous < 0) {
            // not a single character; retry as predefined class
            offset = saved - 1;
            CharacterMatcher clazz = parseClass();
            if (clazz == null) {
              unsupported("escape");
            }
            matcher.merge(clazz);
          } else {
            matcher.setMatch(previous);
          }
        } else if (c == '[') {
          Parser parser = new Parser(description);
          CharacterMatcher other = parser.parseClass(offset - 1);
          if (other == null) {
            unsupported("invalid merge");
          }
          matcher.merge(other);
          offset = parser.getEndOffset();
          previous = -1;
        } else if (c == '&') {
          if (offset + 2 > description.length || description[offset] != '&'
              || description[offset + 1] != '[') {
            unsupported("operation");
          }
          Parser parser = new Parser(description);
          CharacterMatcher other = parser.parseClass(offset + 1);
          if (other == null) {
            unsupported("invalid intersection");
          }
          matcher.intersect(other);
          offset = parser.getEndOffset();
          previous = -1;
        } else if (c == ']') {
          break;
        } else {
          previous = c;
          matcher.setMatch(previous);
        }
        firstCharacter = false;
      }

      return matcher;
    }

    /** Always throws; reports an unsupported construct at the cursor. */
    private void unsupported(String msg) throws UnsupportedOperationException {
      throw new UnsupportedOperationException("Unsupported " + msg + " @"
        + offset + ": "
        + new String(description, 0, description.length));
    }
  }
}
|
533
classpath/java/util/regex/Compiler.java
Normal file
533
classpath/java/util/regex/Compiler.java
Normal file
@ -0,0 +1,533 @@
|
|||||||
|
/* Copyright (c) 2008-2013, Avian Contributors
|
||||||
|
|
||||||
|
Permission to use, copy, modify, and/or distribute this software
|
||||||
|
for any purpose with or without fee is hereby granted, provided
|
||||||
|
that the above copyright notice and this permission notice appear
|
||||||
|
in all copies.
|
||||||
|
|
||||||
|
There is NO WARRANTY for this software. See license.txt for
|
||||||
|
details. */
|
||||||
|
|
||||||
|
package java.util.regex;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Stack;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compiles regular expressions into {@link PikeVM}s.
|
||||||
|
*
|
||||||
|
* @author Johannes Schindelin
|
||||||
|
*/
|
||||||
|
class Compiler implements PikeVMOpcodes {
|
||||||
|
private final static CharacterMatcher regularCharacter =
|
||||||
|
CharacterMatcher.parse("[^\\\\.*+?|\\[\\]{}()^$]");
|
||||||
|
|
||||||
|
private static class Output {
|
||||||
|
private int[] program;
|
||||||
|
private int offset;
|
||||||
|
private int groupCount = -1;
|
||||||
|
private int findPreambleSize;
|
||||||
|
private ArrayList<CharacterMatcher> classes;
|
||||||
|
private ArrayList<PikeVM> lookarounds;
|
||||||
|
|
||||||
|
public Output(Expression expr) {
|
||||||
|
// try-run to determine the code size
|
||||||
|
expr.writeCode(this);
|
||||||
|
program = new int[offset];
|
||||||
|
offset = 0;
|
||||||
|
groupCount = -1;
|
||||||
|
classes = new ArrayList<CharacterMatcher>();
|
||||||
|
lookarounds = new ArrayList<PikeVM>();
|
||||||
|
// write it out!
|
||||||
|
expr.writeCode(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void add(int opcode) {
|
||||||
|
if (program != null) {
|
||||||
|
program[offset] = opcode;
|
||||||
|
}
|
||||||
|
offset++;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int markJump() {
|
||||||
|
return offset++;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setJump(int mark) {
|
||||||
|
if (program != null) {
|
||||||
|
program[mark] = offset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void markFindPreambleEnd() {
|
||||||
|
findPreambleSize = offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
public PikeVM toVM() {
|
||||||
|
CharacterMatcher[] classes = new CharacterMatcher[this.classes.size()];
|
||||||
|
this.classes.toArray(classes);
|
||||||
|
PikeVM[] lookarounds = new PikeVM[this.lookarounds.size()];
|
||||||
|
this.lookarounds.toArray(lookarounds);
|
||||||
|
return new PikeVM(program, findPreambleSize, groupCount, classes,
|
||||||
|
lookarounds);
|
||||||
|
}
|
||||||
|
|
||||||
|
public int addClass(CharacterMatcher characterClass) {
|
||||||
|
if (program == null) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
int result = classes.size();
|
||||||
|
classes.add(characterClass);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int addLookaround(PikeVM lookaround) {
|
||||||
|
if (program == null) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
int result = lookarounds.size();
|
||||||
|
lookarounds.add(lookaround);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private abstract class Expression {
|
||||||
|
protected abstract void writeCode(Output output);
|
||||||
|
}
|
||||||
|
|
||||||
|
private class CharacterRange extends Expression {
|
||||||
|
private final CharacterMatcher characterClass;
|
||||||
|
|
||||||
|
public CharacterRange(CharacterMatcher characterClass) {
|
||||||
|
this.characterClass = characterClass;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void writeCode(Output output) {
|
||||||
|
output.add(CHARACTER_CLASS);
|
||||||
|
output.add(output.addClass(characterClass));
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return characterClass.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class Repeat extends Expression {
|
||||||
|
private Expression expr;
|
||||||
|
private int minCount, maxCount;
|
||||||
|
private boolean greedy;
|
||||||
|
|
||||||
|
public Repeat(Expression expr, int minCount, int maxCount, boolean greedy) {
|
||||||
|
if (minCount < 0) {
|
||||||
|
throw new RuntimeException("Unexpected min count: " + minCount);
|
||||||
|
}
|
||||||
|
if (maxCount != -1) {
|
||||||
|
if (maxCount == 0) {
|
||||||
|
throw new RuntimeException("Unexpected max count: " + maxCount);
|
||||||
|
}
|
||||||
|
if (minCount > maxCount) {
|
||||||
|
throw new RuntimeException("Unexpected range: " + minCount + ", " + maxCount);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
this.expr = expr;
|
||||||
|
this.minCount = minCount;
|
||||||
|
this.maxCount = maxCount;
|
||||||
|
this.greedy = greedy;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void writeCode(Output output) {
|
||||||
|
int start = output.offset;
|
||||||
|
int splitJmp = greedy ? SPLIT_JMP : SPLIT;
|
||||||
|
int split = greedy ? SPLIT : SPLIT_JMP;
|
||||||
|
for (int i = 1; i < minCount; ++ i) {
|
||||||
|
expr.writeCode(output);
|
||||||
|
}
|
||||||
|
if (maxCount == -1) {
|
||||||
|
if (minCount > 0) {
|
||||||
|
int jump = output.offset;
|
||||||
|
expr.writeCode(output);
|
||||||
|
output.add(splitJmp);
|
||||||
|
output.add(jump);
|
||||||
|
} else {
|
||||||
|
output.add(split);
|
||||||
|
int jump = output.markJump();
|
||||||
|
expr.writeCode(output);
|
||||||
|
output.add(splitJmp);
|
||||||
|
output.add(start + 2);
|
||||||
|
output.setJump(jump);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (minCount > 0) {
|
||||||
|
expr.writeCode(output);
|
||||||
|
}
|
||||||
|
if (maxCount > minCount) {
|
||||||
|
int[] jumps = new int[maxCount - minCount];
|
||||||
|
for (int i = 0; i < jumps.length; ++ i) {
|
||||||
|
output.add(split);
|
||||||
|
jumps[i] = output.markJump();
|
||||||
|
expr.writeCode(output);
|
||||||
|
}
|
||||||
|
for (int jump : jumps) {
|
||||||
|
output.setJump(jump);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
String qualifier = greedy ? "" : "?";
|
||||||
|
if (minCount == 0 && maxCount < 2) {
|
||||||
|
return expr.toString() + (minCount < 0 ? "*" : "?") + qualifier;
|
||||||
|
}
|
||||||
|
if (minCount == 1 && maxCount < 0) {
|
||||||
|
return expr.toString() + "+" + qualifier;
|
||||||
|
}
|
||||||
|
return expr.toString() + "{" + minCount + ","
|
||||||
|
+ (maxCount < 0 ? "" : "" + maxCount) + "}" + qualifier;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class Group extends Expression {
|
||||||
|
private final boolean capturing;
|
||||||
|
|
||||||
|
private ArrayList<Expression> list = new ArrayList<Expression>();
|
||||||
|
private ArrayList<Group> alternatives;
|
||||||
|
|
||||||
|
public Group(boolean capturing, ArrayList<Expression> initialList) {
|
||||||
|
this.capturing = capturing;
|
||||||
|
if (initialList != null) {
|
||||||
|
list.addAll(initialList);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void push(Expression expr) {
|
||||||
|
list.add(expr);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void push(final int c) {
|
||||||
|
push(new Expression() {
|
||||||
|
public void writeCode(Output output) {
|
||||||
|
output.add(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
if (c >= 0) {
|
||||||
|
return "" + (char)c;
|
||||||
|
}
|
||||||
|
switch (c) {
|
||||||
|
case DOT:
|
||||||
|
return ".";
|
||||||
|
case WORD_BOUNDARY:
|
||||||
|
return "\\b";
|
||||||
|
case NON_WORD_BOUNDARY:
|
||||||
|
return "\\B";
|
||||||
|
case LINE_START:
|
||||||
|
return "^";
|
||||||
|
case LINE_END:
|
||||||
|
return "$";
|
||||||
|
default:
|
||||||
|
throw new RuntimeException("Unhandled opcode: " + c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void startAlternative() {
|
||||||
|
if (alternatives == null) {
|
||||||
|
alternatives = new ArrayList<Group>();
|
||||||
|
}
|
||||||
|
alternatives.add(new Group(false, list));
|
||||||
|
list.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Expression pop() {
|
||||||
|
Expression result = list.remove(list.size() - 1);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void writeCode(Output output) {
|
||||||
|
int groupIndex = -1;
|
||||||
|
if (capturing) {
|
||||||
|
groupIndex = ++ output.groupCount;
|
||||||
|
output.add(SAVE_OFFSET);
|
||||||
|
output.add(2 * groupIndex);
|
||||||
|
}
|
||||||
|
int[] jumps = null;
|
||||||
|
if (alternatives != null) {
|
||||||
|
jumps = new int[alternatives.size()];
|
||||||
|
int i = 0;
|
||||||
|
for (Group alternative : alternatives) {
|
||||||
|
output.add(SPLIT);
|
||||||
|
int jump = output.markJump();
|
||||||
|
alternative.writeCode(output);
|
||||||
|
output.add(JMP);
|
||||||
|
jumps[i++] = output.markJump();
|
||||||
|
output.setJump(jump);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (Expression expr : list) {
|
||||||
|
expr.writeCode(output);
|
||||||
|
}
|
||||||
|
if (jumps != null) {
|
||||||
|
for (int jump : jumps) {
|
||||||
|
output.setJump(jump);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (capturing) {
|
||||||
|
output.add(SAVE_OFFSET);
|
||||||
|
output.add(2 * groupIndex + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
if (alternatives != null || list.size() > 1) {
|
||||||
|
builder.append('(');
|
||||||
|
if (!capturing) {
|
||||||
|
builder.append("?:");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (alternatives != null) {
|
||||||
|
for (Group alternative : alternatives) {
|
||||||
|
builder.append(alternative).append('|');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (Expression expr : list) {
|
||||||
|
builder.append(expr);
|
||||||
|
}
|
||||||
|
if (alternatives != null || list.size() > 1) {
|
||||||
|
builder.append(')');
|
||||||
|
}
|
||||||
|
return builder.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class Lookaround extends Expression {
|
||||||
|
private final Group group = new Group(false, null);
|
||||||
|
private final boolean forward, negative;
|
||||||
|
|
||||||
|
public Lookaround(boolean forward, boolean negative) {
|
||||||
|
this.forward = forward;
|
||||||
|
this.negative = negative;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void writeCode(Output output) {
|
||||||
|
PikeVM vm = new Output(group).toVM();
|
||||||
|
if (!forward) {
|
||||||
|
vm.reverse();
|
||||||
|
}
|
||||||
|
output.add(forward ?
|
||||||
|
(negative ? NEGATIVE_LOOKAHEAD : LOOKAHEAD) :
|
||||||
|
(negative ? NEGATIVE_LOOKAHEAD : LOOKBEHIND));
|
||||||
|
output.add(output.addLookaround(vm));
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
String inner = group.toString();
|
||||||
|
if (inner.startsWith("(?:")) {
|
||||||
|
inner = inner.substring(3);
|
||||||
|
} else {
|
||||||
|
inner += ")";
|
||||||
|
}
|
||||||
|
return "(?=" + inner;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class Group0 extends Expression {
|
||||||
|
private final Group group;
|
||||||
|
|
||||||
|
public Group0() {
|
||||||
|
group = new Group(true, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void writeCode(Output output) {
|
||||||
|
// find() preamble
|
||||||
|
int start = output.offset;
|
||||||
|
output.add(SPLIT_JMP);
|
||||||
|
output.add(start + 5);
|
||||||
|
output.add(DOTALL);
|
||||||
|
output.add(SPLIT);
|
||||||
|
output.add(start + 2);
|
||||||
|
output.markFindPreambleEnd();
|
||||||
|
group.writeCode(output);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
String inner = group.toString();
|
||||||
|
return inner.startsWith("(?:") && inner.endsWith(")") ?
|
||||||
|
inner.substring(1, inner.length() - 1) : inner;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** The root of the expression tree being built. */
private Group0 root = new Group0();
/** The groups currently open, innermost on top; the root's group is at the bottom. */
private Stack<Group> groups = new Stack<Group>();

/**
 * Creates a compiler whose group stack contains just the root group.
 */
public Compiler() {
  groups.push(root.group);
}
|
||||||
|
|
||||||
|
public Pattern compile(String regex) {
|
||||||
|
char[] array = regex.toCharArray();
|
||||||
|
CharacterMatcher.Parser characterClassParser =
|
||||||
|
new CharacterMatcher.Parser(array);
|
||||||
|
for (int index = 0; index < array.length; ++ index) {
|
||||||
|
char c = array[index];
|
||||||
|
Group current = groups.peek();
|
||||||
|
if (regularCharacter.matches(c)) {
|
||||||
|
current.push(c);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
switch (c) {
|
||||||
|
case '.':
|
||||||
|
current.push(DOT);
|
||||||
|
continue;
|
||||||
|
case '\\':
|
||||||
|
int unescaped = characterClassParser.parseEscapedCharacter(index + 1);
|
||||||
|
if (unescaped >= 0) {
|
||||||
|
index = characterClassParser.getEndOffset() - 1;
|
||||||
|
current.push((char)unescaped);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
CharacterMatcher characterClass = characterClassParser.parseClass(index);
|
||||||
|
if (characterClass != null) {
|
||||||
|
index = characterClassParser.getEndOffset() - 1;
|
||||||
|
current.push(new CharacterRange(characterClass));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
switch (array[index + 1]) {
|
||||||
|
case 'b':
|
||||||
|
index++;
|
||||||
|
current.push(WORD_BOUNDARY);
|
||||||
|
continue;
|
||||||
|
case 'B':
|
||||||
|
index++;
|
||||||
|
current.push(NON_WORD_BOUNDARY);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
throw new RuntimeException("Parse error @" + index + ": " + regex);
|
||||||
|
case '?':
|
||||||
|
case '*':
|
||||||
|
case '+': {
|
||||||
|
boolean greedy = true;
|
||||||
|
if (index + 1 < array.length && array[index + 1] == '?') {
|
||||||
|
greedy = false;
|
||||||
|
++ index;
|
||||||
|
}
|
||||||
|
current.push(new Repeat(current.pop(),
|
||||||
|
c == '+' ? 1 : 0, c == '?' ? 1 : -1, greedy));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
case '{': {
|
||||||
|
++ index;
|
||||||
|
int length = characterClassParser.digits(index, 8, 10);
|
||||||
|
int min = Integer.parseInt(regex.substring(index, index + length));
|
||||||
|
int max = min;
|
||||||
|
index += length - 1;
|
||||||
|
c = index + 1 < array.length ? array[index + 1] : 0;
|
||||||
|
if (c == ',') {
|
||||||
|
++ index;
|
||||||
|
length = characterClassParser.digits(index + 1, 8, 10);
|
||||||
|
max = length == 0 ? -1 :
|
||||||
|
Integer.parseInt(regex.substring(index + 1, index + 1 + length));
|
||||||
|
index += length;
|
||||||
|
c = index + 1< array.length ? array[index + 1] : 0;
|
||||||
|
}
|
||||||
|
if (c != '}') {
|
||||||
|
throw new RuntimeException("Invalid quantifier @" + index + ": "
|
||||||
|
+ regex);
|
||||||
|
}
|
||||||
|
++ index;
|
||||||
|
boolean greedy = true;
|
||||||
|
if (index + 1 < array.length && array[index + 1] == '?') {
|
||||||
|
++ index;
|
||||||
|
greedy = false;
|
||||||
|
}
|
||||||
|
current.push(new Repeat(current.pop(), min, max, greedy));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
case '(': {
|
||||||
|
boolean capturing = true;
|
||||||
|
if (index + 1 < array.length && array[index + 1] == '?') {
|
||||||
|
index += 2;
|
||||||
|
if (index >= array.length) {
|
||||||
|
throw new RuntimeException("Short pattern @" + index + ": "
|
||||||
|
+ regex);
|
||||||
|
}
|
||||||
|
c = array[index];
|
||||||
|
boolean lookAhead = true;
|
||||||
|
if (c == '<') {
|
||||||
|
if (++ index >= array.length) {
|
||||||
|
throw new RuntimeException("Short pattern @" + index + ": "
|
||||||
|
+ regex);
|
||||||
|
}
|
||||||
|
lookAhead = false;
|
||||||
|
c = array[index];
|
||||||
|
if (c != '=' && c != '!') {
|
||||||
|
throw new IllegalArgumentException("Named groups not supported @"
|
||||||
|
+ index + ": " + regex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
switch (c) {
|
||||||
|
case ':':
|
||||||
|
capturing = false;
|
||||||
|
break;
|
||||||
|
case '!':
|
||||||
|
case '=': {
|
||||||
|
capturing = false;
|
||||||
|
Lookaround lookaround = new Lookaround(lookAhead, c == '!');
|
||||||
|
current.push(lookaround);
|
||||||
|
groups.push(lookaround.group);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
throw new UnsupportedOperationException("Not yet supported: "
|
||||||
|
+ regex.substring(index));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
current.push(groups.push(new Group(capturing, null)));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
case ')':
|
||||||
|
if (groups.size() < 2) {
|
||||||
|
throw new RuntimeException("Invalid group close @" + index + ": "
|
||||||
|
+ regex);
|
||||||
|
}
|
||||||
|
groups.pop();
|
||||||
|
continue;
|
||||||
|
case '[': {
|
||||||
|
CharacterMatcher matcher = characterClassParser.parseClass(index);
|
||||||
|
if (matcher == null) {
|
||||||
|
throw new RuntimeException("Invalid range @" + index + ": " + regex);
|
||||||
|
}
|
||||||
|
current.push(new CharacterRange(matcher));
|
||||||
|
index = characterClassParser.getEndOffset() - 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
case '|':
|
||||||
|
current.startAlternative();
|
||||||
|
continue;
|
||||||
|
case '^':
|
||||||
|
current.push(LINE_START);
|
||||||
|
continue;
|
||||||
|
case '$':
|
||||||
|
current.push(LINE_END);
|
||||||
|
continue;
|
||||||
|
default:
|
||||||
|
throw new RuntimeException("Parse error @" + index + ": " + regex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (groups.size() != 1) {
|
||||||
|
throw new IllegalArgumentException("Unclosed groups: ("
|
||||||
|
+ (groups.size() - 1) + "): " + regex);
|
||||||
|
}
|
||||||
|
PikeVM vm = new Output(root).toVM();
|
||||||
|
String plain = vm.isPlainString();
|
||||||
|
if (plain != null) {
|
||||||
|
return new TrivialPattern(regex, plain, 0);
|
||||||
|
}
|
||||||
|
return new RegexPattern(regex, 0, vm);
|
||||||
|
}
|
||||||
|
}
|
@ -15,27 +15,23 @@ package java.util.regex;
|
|||||||
*
|
*
|
||||||
* @author zsombor and others
|
* @author zsombor and others
|
||||||
*/
|
*/
|
||||||
public class Matcher {
|
public abstract class Matcher {
|
||||||
private final Pattern pattern;
|
protected CharSequence input;
|
||||||
private CharSequence input;
|
protected int start;
|
||||||
private int start;
|
protected int end;
|
||||||
private int end;
|
|
||||||
|
|
||||||
Matcher(Pattern pattern, CharSequence input) {
|
public Matcher(CharSequence input) {
|
||||||
this.pattern = pattern;
|
reset(input);
|
||||||
this.input = input;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean matches() {
|
public abstract boolean matches();
|
||||||
if (pattern.pattern().equals(input.toString())) {
|
|
||||||
start = 0;
|
public boolean find() {
|
||||||
end = input.length();
|
return find(end);
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public abstract boolean find(int start);
|
||||||
|
|
||||||
public Matcher reset() {
|
public Matcher reset() {
|
||||||
return reset(input);
|
return reset(input);
|
||||||
}
|
}
|
||||||
@ -47,10 +43,6 @@ public class Matcher {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int start() {
|
|
||||||
return start;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String replaceAll(String replacement) {
|
public String replaceAll(String replacement) {
|
||||||
return replace(replacement, Integer.MAX_VALUE);
|
return replace(replacement, Integer.MAX_VALUE);
|
||||||
}
|
}
|
||||||
@ -59,7 +51,7 @@ public class Matcher {
|
|||||||
return replace(replacement, 1);
|
return replace(replacement, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
private String replace(String replacement, int limit) {
|
protected String replace(String replacement, int limit) {
|
||||||
reset();
|
reset();
|
||||||
|
|
||||||
StringBuilder sb = null;
|
StringBuilder sb = null;
|
||||||
@ -88,23 +80,40 @@ public class Matcher {
|
|||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public int start() {
|
||||||
|
return start;
|
||||||
|
}
|
||||||
|
|
||||||
public int end() {
|
public int end() {
|
||||||
return end;
|
return end;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean find() {
|
public String group() {
|
||||||
return find(end);
|
return input.subSequence(start, end).toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean find(int start) {
|
public int start(int group) {
|
||||||
String p = pattern.pattern();
|
if (group == 0) {
|
||||||
int i = Pattern.indexOf(input, p, start);
|
return start();
|
||||||
if (i >= 0) {
|
|
||||||
this.start = i;
|
|
||||||
this.end = i + p.length();
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int end(int group) {
|
||||||
|
if (group == 0) {
|
||||||
|
return end();
|
||||||
|
}
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String group(int group) {
|
||||||
|
if (group == 0) {
|
||||||
|
return group();
|
||||||
|
}
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int groupCount() {
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -10,9 +10,8 @@
|
|||||||
|
|
||||||
package java.util.regex;
|
package java.util.regex;
|
||||||
|
|
||||||
import java.util.Iterator;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.LinkedList;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is a work in progress.
|
* This is a work in progress.
|
||||||
@ -20,7 +19,7 @@ import java.util.LinkedList;
|
|||||||
* @author zsombor and others
|
* @author zsombor and others
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class Pattern {
|
public abstract class Pattern implements PikeVMOpcodes {
|
||||||
|
|
||||||
public static final int UNIX_LINES = 1;
|
public static final int UNIX_LINES = 1;
|
||||||
public static final int CASE_INSENSITIVE = 2;
|
public static final int CASE_INSENSITIVE = 2;
|
||||||
@ -35,112 +34,26 @@ public class Pattern {
|
|||||||
private final String pattern;
|
private final String pattern;
|
||||||
|
|
||||||
protected Pattern(String pattern, int flags) {
|
protected Pattern(String pattern, int flags) {
|
||||||
this.pattern = trivial(pattern);
|
this.pattern = pattern;
|
||||||
this.patternFlags = flags;
|
this.patternFlags = flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String trivial(String pattern) {
|
|
||||||
StringBuffer buffer = new StringBuffer();
|
|
||||||
for (int i = 0; i < pattern.length(); ++i) {
|
|
||||||
char c = pattern.charAt(i);
|
|
||||||
switch (c) {
|
|
||||||
case '\\':
|
|
||||||
if (++i == pattern.length()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
c = pattern.charAt(i);
|
|
||||||
if (c == '0') {
|
|
||||||
int len = digits(pattern, ++i, 3, 8);
|
|
||||||
if (len == 3 && pattern.charAt(i) > '3') {
|
|
||||||
--len;
|
|
||||||
}
|
|
||||||
c = (char)Integer.parseInt(pattern.substring(i, i + len), 8);
|
|
||||||
i += len - 1;
|
|
||||||
} else if (c == 'x' || c == 'u') {
|
|
||||||
int len = digits(pattern, ++i, 4, 16);
|
|
||||||
c = (char)Integer.parseInt(pattern.substring(i, i + len), 16);
|
|
||||||
i += len - 1;
|
|
||||||
} else {
|
|
||||||
c = unescape(pattern.charAt(i));
|
|
||||||
}
|
|
||||||
if (c != -1) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
// fallthru
|
|
||||||
case '.':
|
|
||||||
case '*':
|
|
||||||
case '+':
|
|
||||||
case '?':
|
|
||||||
case '|':
|
|
||||||
case '[':
|
|
||||||
case ']':
|
|
||||||
case '{':
|
|
||||||
case '}':
|
|
||||||
case '(':
|
|
||||||
case ')':
|
|
||||||
case '^':
|
|
||||||
case '$':
|
|
||||||
throw new UnsupportedOperationException
|
|
||||||
("only trivial regular expressions are supported so far (" + pattern + ")");
|
|
||||||
}
|
|
||||||
buffer.append(c);
|
|
||||||
}
|
|
||||||
return buffer.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static int digits(String s, int offset, int maxLength, int base) {
|
|
||||||
for (int i = 0; ; ++i) {
|
|
||||||
if (i == maxLength || offset + i >= s.length()) {
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
int value = s.charAt(offset + i) - '0';
|
|
||||||
if (value < 0) {
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
if (base > 10 && value >= 10) {
|
|
||||||
value += 10 - (value >= 'a' - '0' ? 'a' - '0' : 'A' - '0');
|
|
||||||
}
|
|
||||||
if (value >= base) {
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static char unescape(char c) {
|
|
||||||
switch (c) {
|
|
||||||
case '\\':
|
|
||||||
return c;
|
|
||||||
case 'a':
|
|
||||||
return 0x0007;
|
|
||||||
case 'e':
|
|
||||||
return 0x001B;
|
|
||||||
case 'f':
|
|
||||||
return 0x000C;
|
|
||||||
case 'n':
|
|
||||||
return 0x000A;
|
|
||||||
case 'r':
|
|
||||||
return 0x000D;
|
|
||||||
case 't':
|
|
||||||
return 0x0009;
|
|
||||||
}
|
|
||||||
return (char)-1;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Pattern compile(String regex) {
|
public static Pattern compile(String regex) {
|
||||||
return new Pattern(regex, 0);
|
return compile(regex, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Pattern compile(String regex, int flags) {
|
public static Pattern compile(String regex, int flags) {
|
||||||
return new Pattern(regex, flags);
|
if (flags != 0) {
|
||||||
|
throw new UnsupportedOperationException("TODO");
|
||||||
|
}
|
||||||
|
return new Compiler().compile(regex);
|
||||||
}
|
}
|
||||||
|
|
||||||
public int flags() {
|
public int flags() {
|
||||||
return patternFlags;
|
return patternFlags;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Matcher matcher(CharSequence input) {
|
public abstract Matcher matcher(CharSequence input);
|
||||||
return new Matcher(this, input);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static boolean matches(String regex, CharSequence input) {
|
public static boolean matches(String regex, CharSequence input) {
|
||||||
return Pattern.compile(regex).matcher(input).matches();
|
return Pattern.compile(regex).matcher(input).matches();
|
||||||
@ -155,79 +68,22 @@ public class Pattern {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public String[] split(CharSequence input, int limit) {
|
public String[] split(CharSequence input, int limit) {
|
||||||
boolean strip;
|
if (limit <= 0) {
|
||||||
if (limit < 0) {
|
|
||||||
strip = false;
|
|
||||||
limit = Integer.MAX_VALUE;
|
limit = Integer.MAX_VALUE;
|
||||||
} else if (limit == 0) {
|
|
||||||
strip = true;
|
|
||||||
limit = Integer.MAX_VALUE;
|
|
||||||
} else {
|
|
||||||
strip = false;
|
|
||||||
}
|
}
|
||||||
|
Matcher matcher = matcher(input);
|
||||||
List<CharSequence> list = new LinkedList();
|
List<String> result = new ArrayList<String>();
|
||||||
int index = 0;
|
int offset = 0;
|
||||||
int trailing = 0;
|
for (;;) {
|
||||||
int patternLength = pattern.length();
|
if (result.size() >= limit || !matcher.find()) {
|
||||||
while (index < input.length() && list.size() < limit - 1) {
|
|
||||||
int i;
|
|
||||||
if (patternLength == 0) {
|
|
||||||
if (list.size() == 0) {
|
|
||||||
i = 0;
|
|
||||||
} else {
|
|
||||||
i = index + 1;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
i = indexOf(input, pattern, index);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (i >= 0) {
|
|
||||||
if (patternLength != 0 && i == index) {
|
|
||||||
++ trailing;
|
|
||||||
} else {
|
|
||||||
trailing = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
list.add(input.subSequence(index, i));
|
|
||||||
index = i + patternLength;
|
|
||||||
} else {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
result.add(input.subSequence(offset, matcher.start()).toString());
|
||||||
|
offset = matcher.end();
|
||||||
}
|
}
|
||||||
|
if (offset == 0 || offset < input.length()) {
|
||||||
if (strip && index > 0 && index == input.length()) {
|
result.add(input.subSequence(offset, input.length()).toString());
|
||||||
++ trailing;
|
|
||||||
} else {
|
|
||||||
trailing = 0;
|
|
||||||
}
|
}
|
||||||
list.add(input.subSequence(index, input.length()));
|
return result.toArray(new String[result.size()]);
|
||||||
|
|
||||||
String[] result = new String[list.size() - trailing];
|
|
||||||
int i = 0;
|
|
||||||
for (Iterator<CharSequence> it = list.iterator();
|
|
||||||
it.hasNext() && i < result.length; ++ i)
|
|
||||||
{
|
|
||||||
result[i] = it.next().toString();
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int indexOf(CharSequence haystack, CharSequence needle, int start) {
|
|
||||||
if (needle.length() == 0) return start;
|
|
||||||
|
|
||||||
for (int i = start; i < haystack.length() - needle.length() + 1; ++i) {
|
|
||||||
int j = 0;
|
|
||||||
for (; j < needle.length(); ++j) {
|
|
||||||
if (haystack.charAt(i + j) != needle.charAt(j)) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (j == needle.length()) {
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
629
classpath/java/util/regex/PikeVM.java
Normal file
629
classpath/java/util/regex/PikeVM.java
Normal file
@ -0,0 +1,629 @@
|
|||||||
|
/* Copyright (c) 2008-2013, Avian Contributors
|
||||||
|
|
||||||
|
Permission to use, copy, modify, and/or distribute this software
|
||||||
|
for any purpose with or without fee is hereby granted, provided
|
||||||
|
that the above copyright notice and this permission notice appear
|
||||||
|
in all copies.
|
||||||
|
|
||||||
|
There is NO WARRANTY for this software. See license.txt for
|
||||||
|
details. */
|
||||||
|
|
||||||
|
package java.util.regex;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A minimal implementation of a regular expression engine.
|
||||||
|
*
|
||||||
|
* @author Johannes Schindelin
|
||||||
|
*/
|
||||||
|
class PikeVM implements PikeVMOpcodes {
|
||||||
|
private final int[] program;
|
||||||
|
private final int groupCount;
|
||||||
|
private final int offsetsCount;
|
||||||
|
/*
|
||||||
|
* For find(), we do not want to anchor the match at the start offset. Our
|
||||||
|
* compiler allows this by prefixing the code with an implicit '(?:.*?)'. For
|
||||||
|
* regular matches() calls, we want to skip that code and start at {@code
|
||||||
|
* findPrefixLength} instead.
|
||||||
|
*/
|
||||||
|
private final int findPrefixLength;
|
||||||
|
private final CharacterMatcher[] classes;
|
||||||
|
private final PikeVM[] lookarounds;
|
||||||
|
private final static CharacterMatcher wordCharacter =
|
||||||
|
CharacterMatcher.parse("\\w");
|
||||||
|
private final static CharacterMatcher lineTerminator =
|
||||||
|
CharacterMatcher.parse("[\n\r\u0085\u2028\u2029]");
|
||||||
|
private boolean multiLine;
|
||||||
|
|
||||||
|
public interface Result {
|
||||||
|
void set(int[] start, int[] end);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected PikeVM(int[] program, int findPrefixLength, int groupCount,
|
||||||
|
CharacterMatcher[] classes, PikeVM[] lookarounds)
|
||||||
|
{
|
||||||
|
this.program = program;
|
||||||
|
this.findPrefixLength = findPrefixLength;
|
||||||
|
this.groupCount = groupCount;
|
||||||
|
offsetsCount = 2 * groupCount + 2;
|
||||||
|
this.classes = classes;
|
||||||
|
this.lookarounds = lookarounds;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The current thread states.
|
||||||
|
* <p>
|
||||||
|
* The threads are identified by their program counter. The rationale: as all
|
||||||
|
* threads are executed in lock-step, i.e. for the same character in the
|
||||||
|
* string to be matched, it does not make sense for two threads to be at the
|
||||||
|
* same program counter -- they would both do exactly the same for the rest of
|
||||||
|
* the execution.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* For efficiency, the threads are kept in a linked list that actually lives
|
||||||
|
* in an array indexed by the program counter, pointing to the next thread's
|
||||||
|
* program counter, in the order of high to low priority.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* Program counters which have no associated thread have a {@code next} entry
|
||||||
|
* of 0 (i.e. -1 once the offset of 1 is subtracted). The least-priority
|
||||||
|
* thread (the last one in the linked list) is remembered in {@code tail} to
|
||||||
|
* be able to tell it apart from unscheduled threads.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* We actually never need to have an explicit value for the priority, the
|
||||||
|
* ordering is sufficient: whenever a new thread is to be scheduled and it is
|
||||||
|
* found to be scheduled already, it was already scheduled by a
|
||||||
|
* higher-priority thread.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
private class ThreadQueue {
|
||||||
|
private int head, tail;
|
||||||
|
// next[pc] is 1 + the next thread's pc
|
||||||
|
private int[] next;
|
||||||
|
// offsets[pc][2 * group] is 1 + start offset
|
||||||
|
private int[][] offsets;
|
||||||
|
|
||||||
|
public ThreadQueue() {
|
||||||
|
head = tail = -1;
|
||||||
|
next = new int[program.length + 1];
|
||||||
|
offsets = new int[program.length + 1][];
|
||||||
|
}
|
||||||
|
|
||||||
|
public ThreadQueue(int startPC) {
|
||||||
|
head = tail = startPC;
|
||||||
|
next = new int[program.length + 1];
|
||||||
|
offsets = new int[program.length + 1][];
|
||||||
|
offsets[head] = new int[offsetsCount];
|
||||||
|
}
|
||||||
|
|
||||||
|
public int queueOneImmediately(ThreadQueue into) {
|
||||||
|
for (;;) {
|
||||||
|
if (head < 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
boolean wasQueued = queueNext(head, head, into);
|
||||||
|
int pc = head;
|
||||||
|
if (head == tail) {
|
||||||
|
head = tail = -1;
|
||||||
|
} else {
|
||||||
|
head = next[pc] - 1;
|
||||||
|
next[pc] = 0;
|
||||||
|
}
|
||||||
|
offsets[pc] = null;
|
||||||
|
if (wasQueued) {
|
||||||
|
into.tail = pc;
|
||||||
|
return pc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Schedules the instruction at {@code nextPC} to be executed immediately.
|
||||||
|
* <p>
|
||||||
|
* For non-matching steps (SPLIT, SAVE_STATE, etc) we need to schedule the
|
||||||
|
* corresponding program counter(s) to be handled right after this opcode,
|
||||||
|
* before advancing to the next character.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* To achieve this, we insert the program counter to-be-scheduled in the
|
||||||
|
* linked thread list at the current position, but only if it has not been
|
||||||
|
* scheduled yet: if it has, a higher-priority thread already reached that
|
||||||
|
* state.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* In contrast to {@link #queueNext(int, int, ThreadQueue)}, this method
|
||||||
|
* works on the current step's thread list.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param currentPC
|
||||||
|
* the current program counter
|
||||||
|
* @param nextPC
|
||||||
|
* the program counter to schedule
|
||||||
|
* @param copyThreadState
|
||||||
|
* whether to spawn off a new thread
|
||||||
|
* @return whether the step was queued (i.e. no thread was queued for the
|
||||||
|
* same {@code nextPC} already)
|
||||||
|
*/
|
||||||
|
public boolean queueImmediately(int currentPC, int nextPC,
|
||||||
|
boolean copyThreadState) {
|
||||||
|
if (isScheduled(nextPC)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
int[] offsets = this.offsets[currentPC];
|
||||||
|
if (copyThreadState) {
|
||||||
|
offsets = java.util.Arrays.copyOf(offsets, offsetsCount);
|
||||||
|
}
|
||||||
|
if (currentPC == tail) {
|
||||||
|
tail = nextPC;
|
||||||
|
} else {
|
||||||
|
next[nextPC] = next[currentPC];
|
||||||
|
}
|
||||||
|
this.offsets[nextPC] = offsets;
|
||||||
|
next[currentPC] = nextPC + 1;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Schedules the instruction at {@code nextPC} to be executed in the next
|
||||||
|
* step.
|
||||||
|
* <p>
|
||||||
|
* This method advances the current thread to the next program counter, to
|
||||||
|
* be executed after reading the next character.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param currentPC
|
||||||
|
* the current program counter
|
||||||
|
* @param nextPC
|
||||||
|
* the program counter to schedule
|
||||||
|
* @param next
|
||||||
|
* the thread state of the next step
|
||||||
|
* @return whether the step was queued (i.e. no thread was queued for the
|
||||||
|
* same {@code nextPC} already)
|
||||||
|
*/
|
||||||
|
private boolean queueNext(int currentPC, int nextPC, ThreadQueue next) {
|
||||||
|
if (next.tail < 0) {
|
||||||
|
next.head = nextPC;
|
||||||
|
} else if (next.isScheduled(nextPC)) {
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
next.next[next.tail] = nextPC + 1;
|
||||||
|
}
|
||||||
|
next.offsets[nextPC] = offsets[currentPC];
|
||||||
|
next.tail = nextPC;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void saveOffset(int pc, int index, int offset) {
|
||||||
|
offsets[pc][index] = offset + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setResult(Result result) {
|
||||||
|
// copy offsets
|
||||||
|
int[] offsets = this.offsets[program.length];
|
||||||
|
int[] groupStart = new int[groupCount + 1];
|
||||||
|
int[] groupEnd = new int[groupCount + 1];
|
||||||
|
for (int j = 0; j <= groupCount; ++j) {
|
||||||
|
groupStart[j] = offsets[2 * j] - 1;
|
||||||
|
groupEnd[j] = offsets[2 * j + 1] - 1;
|
||||||
|
}
|
||||||
|
result.set(groupStart, groupEnd);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void mustStartMatchAt(int start) {
|
||||||
|
int previous = -1;
|
||||||
|
for (int pc = head; pc >= 0; ) {
|
||||||
|
int nextPC = next[pc] - 1;
|
||||||
|
if (start + 1 == offsets[pc][0]) {
|
||||||
|
previous = pc;
|
||||||
|
} else {
|
||||||
|
next[pc] = 0;
|
||||||
|
offsets[pc] = null;
|
||||||
|
if (pc == tail) {
|
||||||
|
head = tail = -1;
|
||||||
|
} else if (previous < 0) {
|
||||||
|
head = nextPC;
|
||||||
|
} else {
|
||||||
|
next[previous] = 1 + nextPC;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pc = nextPC;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int startOffset(int pc) {
|
||||||
|
return offsets[pc][0] - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isEmpty() {
|
||||||
|
return head < 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isScheduled(int pc) {
|
||||||
|
return pc == tail || next[pc] > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int next(int pc) {
|
||||||
|
return pc < 0 ? head : next[pc] - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clean() {
|
||||||
|
for (int pc = head; pc >= 0; ) {
|
||||||
|
int nextPC = next[pc] - 1;
|
||||||
|
next[pc] = 0;
|
||||||
|
offsets[pc] = null;
|
||||||
|
pc = nextPC;
|
||||||
|
}
|
||||||
|
head = tail = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Executes the Pike VM defined by the program.
|
||||||
|
* <p>
|
||||||
|
* The idea is to execute threads in parallel, at each step executing them
|
||||||
|
* from the highest priority thread to the lowest one. In contrast to most
|
||||||
|
* regular expression engines, the Thompson/Pike one gets away with linear
|
||||||
|
* complexity because the string is matched from left to right, at each step
|
||||||
|
* executing a number of threads bounded by the length of the program: if two
|
||||||
|
* threads would execute at the same instruction pointer of the program, we
|
||||||
|
* need only consider the higher-priority one.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* This implementation is based on the description of <a
|
||||||
|
* href="http://swtch.com/%7Ersc/regexp/regexp2.html">Russ Cox</a>.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param characters
|
||||||
|
* the characters to match
|
||||||
|
* @param start
|
||||||
|
* the start offset where to match
|
||||||
|
* @param end
|
||||||
|
* the end offset
|
||||||
|
* @param anchorStart
|
||||||
|
* whether the match must start at {@code start}
|
||||||
|
* @param anchorEnd
|
||||||
|
* whether the match must end at {@code end}
|
||||||
|
* @param result
|
||||||
|
* the {@link Result} to store the groups' offsets in, if successful
|
||||||
|
* @return whether a match was found
|
||||||
|
*/
|
||||||
|
public boolean matches(char[] characters, int start, int end,
|
||||||
|
boolean anchorStart, boolean anchorEnd, Result result)
|
||||||
|
{
|
||||||
|
ThreadQueue current = new ThreadQueue();
|
||||||
|
ThreadQueue next = new ThreadQueue();
|
||||||
|
|
||||||
|
// initialize the first thread
|
||||||
|
int startPC = anchorStart ? findPrefixLength : 0;
|
||||||
|
ThreadQueue queued = new ThreadQueue(startPC);
|
||||||
|
|
||||||
|
boolean foundMatch = false;
|
||||||
|
int step = end > start ? +1 : -1;
|
||||||
|
for (int i = start; i != end + step; i += step) {
|
||||||
|
if (queued.isEmpty()) {
|
||||||
|
// no threads left
|
||||||
|
return foundMatch;
|
||||||
|
}
|
||||||
|
|
||||||
|
char c = i != end ? characters[i] : 0;
|
||||||
|
int pc = -1;
|
||||||
|
for (;;) {
|
||||||
|
pc = current.next(pc);
|
||||||
|
if (pc < 0) {
|
||||||
|
pc = queued.queueOneImmediately(current);
|
||||||
|
}
|
||||||
|
if (pc < 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// pc == program.length is a match!
|
||||||
|
if (pc == program.length) {
|
||||||
|
if (anchorEnd && i != end) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (result == null) {
|
||||||
|
// only interested in a match, no need to go on
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
current.setResult(result);
|
||||||
|
|
||||||
|
// now that we found a match, even higher-priority matches must match
|
||||||
|
// at the same start offset
|
||||||
|
if (!anchorStart) {
|
||||||
|
next.mustStartMatchAt(current.startOffset(pc));
|
||||||
|
}
|
||||||
|
foundMatch = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
int opcode = program[pc];
|
||||||
|
switch (opcode) {
|
||||||
|
case DOT:
|
||||||
|
if (c != '\0' && c != '\r' && c != '\n') {
|
||||||
|
current.queueNext(pc, pc + 1, next);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case DOTALL:
|
||||||
|
current.queueNext(pc, pc + 1, next);
|
||||||
|
break;
|
||||||
|
case WORD_BOUNDARY:
|
||||||
|
case NON_WORD_BOUNDARY: {
|
||||||
|
int i2 = i - step;
|
||||||
|
int c2 = i2 < 0 || i2 >= characters.length ? -1 : characters[i2];
|
||||||
|
switch (opcode) {
|
||||||
|
case WORD_BOUNDARY:
|
||||||
|
if ((c2 < 0 || !wordCharacter.matches((char)c2))) {
|
||||||
|
if (wordCharacter.matches(c)) {
|
||||||
|
current.queueImmediately(pc, pc + 1, false);
|
||||||
|
}
|
||||||
|
} else if (i >= 0 && i < characters.length &&
|
||||||
|
!wordCharacter.matches(c)) {
|
||||||
|
current.queueImmediately(pc, pc + 1, false);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case NON_WORD_BOUNDARY:
|
||||||
|
if ((c2 < 0 || !wordCharacter.matches((char)c2))) {
|
||||||
|
if (i >= 0 && i < characters.length &&
|
||||||
|
!wordCharacter.matches(c)) {
|
||||||
|
current.queueImmediately(pc, pc + 1, false);
|
||||||
|
}
|
||||||
|
} else if (wordCharacter.matches(c)) {
|
||||||
|
current.queueImmediately(pc, pc + 1, false);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case LINE_START:
|
||||||
|
if (i == 0 || (multiLine &&
|
||||||
|
lineTerminator.matches(characters[i - 1]))) {
|
||||||
|
current.queueImmediately(pc, pc + 1, false);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case LINE_END:
|
||||||
|
if (i == characters.length || (multiLine &&
|
||||||
|
lineTerminator.matches(c))) {
|
||||||
|
current.queueImmediately(pc, pc + 1, false);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case CHARACTER_CLASS:
|
||||||
|
if (classes[program[pc + 1]].matches(c)) {
|
||||||
|
current.queueNext(pc, pc + 2, next);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case LOOKAHEAD:
|
||||||
|
if (lookarounds[program[pc + 1]].matches(characters,
|
||||||
|
i, characters.length, true, false, null)) {
|
||||||
|
current.queueImmediately(pc, pc + 2, false);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case LOOKBEHIND:
|
||||||
|
if (lookarounds[program[pc + 1]].matches(characters,
|
||||||
|
i - 1, -1, true, false, null)) {
|
||||||
|
current.queueImmediately(pc, pc + 2, false);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case NEGATIVE_LOOKAHEAD:
|
||||||
|
if (!lookarounds[program[pc + 1]].matches(characters,
|
||||||
|
i, characters.length, true, false, null)) {
|
||||||
|
current.queueImmediately(pc, pc + 2, false);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case NEGATIVE_LOOKBEHIND:
|
||||||
|
if (!lookarounds[program[pc + 1]].matches(characters,
|
||||||
|
i - 1, -1, true, false, null)) {
|
||||||
|
current.queueImmediately(pc, pc + 2, false);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
/* immediate opcodes, i.e. thread continues within the same step */
|
||||||
|
case SAVE_OFFSET:
|
||||||
|
if (result != null) {
|
||||||
|
int index = program[pc + 1];
|
||||||
|
current.saveOffset(pc, index, i);
|
||||||
|
}
|
||||||
|
current.queueImmediately(pc, pc + 2, false);
|
||||||
|
break;
|
||||||
|
case SPLIT:
|
||||||
|
current.queueImmediately(pc, program[pc + 1], true);
|
||||||
|
current.queueImmediately(pc, pc + 2, false);
|
||||||
|
break;
|
||||||
|
case SPLIT_JMP:
|
||||||
|
current.queueImmediately(pc, pc + 2, true);
|
||||||
|
current.queueImmediately(pc, program[pc + 1], false);
|
||||||
|
break;
|
||||||
|
case JMP:
|
||||||
|
current.queueImmediately(pc, program[pc + 1], false);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
if (program[pc] >= 0 && program[pc] <= 0xffff) {
|
||||||
|
if (c == (char)program[pc]) {
|
||||||
|
current.queueNext(pc, pc + 1, next);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
throw new RuntimeException("Invalid opcode: " + opcode
|
||||||
|
+ " at pc " + pc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// clean linked thread list (and states)
|
||||||
|
current.clean();
|
||||||
|
|
||||||
|
// prepare for next step
|
||||||
|
ThreadQueue swap = queued;
|
||||||
|
queued = next;
|
||||||
|
next = swap;
|
||||||
|
}
|
||||||
|
return foundMatch;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines whether this machine recognizes a pattern without special
|
||||||
|
* operators.
|
||||||
|
* <p>
|
||||||
|
* In case that the regular expression is actually a plain string without any
|
||||||
|
* special operators, we can avoid using a full-blown Pike VM and instead fall
|
||||||
|
* back to using the much faster {@link TrivialPattern}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @return the string to match, or null if the machine recognizes a
|
||||||
|
* non-trivial pattern
|
||||||
|
*/
|
||||||
|
public String isPlainString() {
|
||||||
|
// we expect the machine to start with the find preamble and SAVE_OFFSET 0
|
||||||
|
// end with SAVE_OFFSET 1
|
||||||
|
int start = findPrefixLength;
|
||||||
|
if (start + 1 < program.length &&
|
||||||
|
program[start] == SAVE_OFFSET && program[start + 1] == 0) {
|
||||||
|
start += 2;
|
||||||
|
}
|
||||||
|
int end = program.length;
|
||||||
|
if (end > start + 1 &&
|
||||||
|
program[end - 2] == SAVE_OFFSET && program[end - 1] == 1) {
|
||||||
|
end -= 2;
|
||||||
|
}
|
||||||
|
for (int i = start; i < end; ++ i) {
|
||||||
|
if (program[i] < 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
char[] array = new char[end - start];
|
||||||
|
for (int i = start; i < end; ++ i) {
|
||||||
|
array[i - start] = (char)program[i];
|
||||||
|
}
|
||||||
|
return new String(array);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int length(int opcode) {
|
||||||
|
return opcode <= SINGLE_ARG_START && opcode >= SINGLE_ARG_END ? 2 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean isJump(int opcode) {
|
||||||
|
return opcode <= SPLIT && opcode >= JMP;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reverses the program (effectively matching the reverse pattern).
|
||||||
|
* <p>
|
||||||
|
* It is a well-known fact that any regular expression can be reordered
|
||||||
|
* trivially into an equivalent regular expression to be applied in backward
|
||||||
|
* direction (coming in real handy for look-behind expressions).
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* Example: instead of matching the sequence "aaaabb" with the pattern "a+b+",
|
||||||
|
* we can match the reverse sequence "bbaaaa" with the pattern "b+a+".
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* One caveat: while the reverse pattern is equivalent in the sense that it
|
||||||
|
* matches if, and only if, the original pattern matches the forward
|
||||||
|
* direction, the same is not true for submatches. Consider the input "a" and
|
||||||
|
* the pattern "(a?)a?": when matching in forward direction the captured group
|
||||||
|
* is "a", while the backward direction will yield the empty string. For that
|
||||||
|
* reason, Java dictates that capturing groups in look-behind patterns are
|
||||||
|
* ignored.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public void reverse() {
|
||||||
|
reverse(findPrefixLength, program.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reverses a specific part of the program (to match in reverse direction).
|
||||||
|
* <p>
|
||||||
|
* This is the work-horse of {@link #reverse()}.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* To visualize the process of reversing a program, let's look at it as a
|
||||||
|
* directed graph (each jump is represented by an "<tt>X</tt>",
|
||||||
|
* non-jumping steps are represented by "<tt>o</tt>"s, arrows show the
|
||||||
|
* direction of the flow, <code>SPLIT</code>s spawn two arrows):
|
||||||
|
*
|
||||||
|
* <pre>
|
||||||
|
* o -> X -> X -> o -> X o -> o
|
||||||
|
* ^ | \ \___^____^
|
||||||
|
* \__/ \____________|
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* The concept of reversing the program is most easily explained as follows: if
|
||||||
|
* we insert auxiliary nodes "<tt>Y</tt>" for jump targets, the graph looks
|
||||||
|
* like this instead:
|
||||||
|
*
|
||||||
|
* <pre>
|
||||||
|
* Y -> o -> X -> X -> o -> X Y -> o -> Y -> o
|
||||||
|
* ^ | \ \___^_________^
|
||||||
|
* \_______/ \____________|
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* It is now obvious that reversing the program is equivalent to reversing all
|
||||||
|
* arrows, simply deleting all <tt>X</tt>s and substituting each <tt>Y</tt>
|
||||||
|
* with a jump. Note that the reverse program will have the same number of
|
||||||
|
* <tt>JMP</tt>s, but they will not be associated with the same arrows:
|
||||||
|
*
|
||||||
|
* <pre>
|
||||||
|
* X <- o <- o X <- o <- X <- o
|
||||||
|
* | ^ ^____|________/
|
||||||
|
* \__/ \_______/
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* </p>
|
||||||
|
* @param start
|
||||||
|
* start reversing the program with this instruction
|
||||||
|
* @param end
|
||||||
|
* stop reversing at this instruction (this must be either an index
|
||||||
|
* aligned exactly with an instruction, or exactly
|
||||||
|
* {@code program.length}.
|
||||||
|
*/
|
||||||
|
private void reverse(int start, int end) {
|
||||||
|
// Pass 1: build the list of jump targets
|
||||||
|
int[] newJumps = new int[end + 1];
|
||||||
|
boolean[] brokenArrows = new boolean[end + 1];
|
||||||
|
for (int pc = start; pc < end; pc += length(program[pc])) {
|
||||||
|
if (isJump(program[pc])) {
|
||||||
|
int target = program[pc + 1];
|
||||||
|
newJumps[pc + 1] = newJumps[target];
|
||||||
|
newJumps[target] = pc + 1;
|
||||||
|
if (program[pc] == JMP) {
|
||||||
|
brokenArrows[pc + 2] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 2: determine mapped program counters
|
||||||
|
int[] mapping = new int[end];
|
||||||
|
for (int pc = start, mappedPC = end; mappedPC > 0
|
||||||
|
&& pc < end; pc += length(program[pc])) {
|
||||||
|
for (int jump = newJumps[pc]; jump > 0; jump = newJumps[jump]) {
|
||||||
|
mappedPC -= 2;
|
||||||
|
}
|
||||||
|
if (!isJump(program[pc])) {
|
||||||
|
mappedPC -= length(program[pc]);
|
||||||
|
}
|
||||||
|
mapping[pc] = mappedPC;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 3: write the new program
|
||||||
|
int[] reverse = new int[end];
|
||||||
|
for (int pc = start, mappedPC = end; mappedPC > 0;
|
||||||
|
pc += length(program[pc])) {
|
||||||
|
boolean brokenArrow = brokenArrows[pc];
|
||||||
|
for (int jump = newJumps[pc]; jump > 0; jump = newJumps[jump]) {
|
||||||
|
reverse[--mappedPC] = mapping[jump - 1];
|
||||||
|
if (brokenArrow) {
|
||||||
|
reverse[--mappedPC] = JMP;
|
||||||
|
brokenArrow = false;
|
||||||
|
} else {
|
||||||
|
reverse[--mappedPC] =
|
||||||
|
program[jump - 1] == SPLIT_JMP ? SPLIT_JMP : SPLIT;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (pc == end) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!isJump(program[pc])) {
|
||||||
|
for (int i = length(program[pc]); i-- > 0; ) {
|
||||||
|
reverse[--mappedPC] = program[pc + i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
System.arraycopy(reverse, start, program, start, end - start);
|
||||||
|
}
|
||||||
|
}
|
45
classpath/java/util/regex/PikeVMOpcodes.java
Normal file
45
classpath/java/util/regex/PikeVMOpcodes.java
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
/* Copyright (c) 2008-2013, Avian Contributors
|
||||||
|
|
||||||
|
Permission to use, copy, modify, and/or distribute this software
|
||||||
|
for any purpose with or without fee is hereby granted, provided
|
||||||
|
that the above copyright notice and this permission notice appear
|
||||||
|
in all copies.
|
||||||
|
|
||||||
|
There is NO WARRANTY for this software. See license.txt for
|
||||||
|
details. */
|
||||||
|
|
||||||
|
package java.util.regex;
|
||||||
|
|
||||||
|
/**
 * Opcodes for the Pike VM.
 * <p>
 * See {@link PikeVM}.
 * </p>
 * <p>
 * All opcodes are negative and grouped into numeric bands by purpose, which
 * lets a whole family be recognized with a range check (see
 * {@link #SINGLE_ARG_START} and {@link #SINGLE_ARG_END}).
 * </p>
 *
 * @author Johannes Schindelin
 */
interface PikeVMOpcodes {
  // Interface fields are implicitly public, static and final, so the
  // modifiers are not repeated on each constant.

  // Wildcards.
  int DOT = -1;
  int DOTALL = -2;

  // Zero-width boundary assertions.
  int WORD_BOUNDARY = -10;
  int NON_WORD_BOUNDARY = -11;
  int LINE_START = -12;
  int LINE_END = -13;

  // Character classes such as [a-z].
  int CHARACTER_CLASS = -20;

  // Look-around assertions.
  int LOOKAHEAD = -30;
  int LOOKBEHIND = -31;
  int NEGATIVE_LOOKAHEAD = -32;
  int NEGATIVE_LOOKBEHIND = -33;

  // Capture bookkeeping.
  int SAVE_OFFSET = -40;

  // Control flow.
  int SPLIT = -50;
  int SPLIT_JMP = -51; // this split prefers to jump
  int JMP = -52;

  // Bounds of the opcode band from CHARACTER_CLASS down to JMP.
  // NOTE(review): presumably these are the opcodes that are followed by
  // exactly one operand word in the program -- confirm against PikeVM.
  int SINGLE_ARG_START = CHARACTER_CLASS;
  int SINGLE_ARG_END = JMP;
}
|
80
classpath/java/util/regex/RegexMatcher.java
Normal file
80
classpath/java/util/regex/RegexMatcher.java
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
/* Copyright (c) 2008-2013, Avian Contributors
|
||||||
|
|
||||||
|
Permission to use, copy, modify, and/or distribute this software
|
||||||
|
for any purpose with or without fee is hereby granted, provided
|
||||||
|
that the above copyright notice and this permission notice appear
|
||||||
|
in all copies.
|
||||||
|
|
||||||
|
There is NO WARRANTY for this software. See license.txt for
|
||||||
|
details. */
|
||||||
|
|
||||||
|
package java.util.regex;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A minimal implementation of a regular expression matcher.
|
||||||
|
*
|
||||||
|
* @author Johannes Schindelin
|
||||||
|
*/
|
||||||
|
public class RegexMatcher extends Matcher {
|
||||||
|
private final PikeVM vm;
|
||||||
|
private char[] array;
|
||||||
|
int[] groupStart, groupEnd;
|
||||||
|
|
||||||
|
RegexMatcher(PikeVM vm, CharSequence string) {
|
||||||
|
super(string);
|
||||||
|
this.vm = vm;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final PikeVM.Result adapter = new PikeVM.Result() {
|
||||||
|
public void set(int[] start, int[] end) {
|
||||||
|
RegexMatcher.this.start = start[0];
|
||||||
|
RegexMatcher.this.end = end[0];
|
||||||
|
RegexMatcher.this.groupStart = start;
|
||||||
|
RegexMatcher.this.groupEnd = end;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
public Matcher reset() {
|
||||||
|
start = end = -1;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Matcher reset(CharSequence input) {
|
||||||
|
this.input = input;
|
||||||
|
array = input.toString().toCharArray();
|
||||||
|
return reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean matches() {
|
||||||
|
return vm.matches(array, 0, array.length, true, true, adapter);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean find() {
|
||||||
|
return find(end + (start == end ? 1 : 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean find(int offset) {
|
||||||
|
return vm.matches(array, offset, array.length, false, false, adapter);
|
||||||
|
}
|
||||||
|
|
||||||
|
public int start(int group) {
|
||||||
|
return groupStart[group];
|
||||||
|
}
|
||||||
|
|
||||||
|
public int end(int group) {
|
||||||
|
return groupEnd[group];
|
||||||
|
}
|
||||||
|
|
||||||
|
public String group(int group) {
|
||||||
|
int offset = start(group);
|
||||||
|
if (offset < 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
int length = end(group) - offset;
|
||||||
|
return new String(array, offset, length);
|
||||||
|
}
|
||||||
|
|
||||||
|
public int groupCount() {
|
||||||
|
return groupStart.length - 1;
|
||||||
|
}
|
||||||
|
}
|
57
classpath/java/util/regex/RegexPattern.java
Normal file
57
classpath/java/util/regex/RegexPattern.java
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
/* Copyright (c) 2008-2013, Avian Contributors
|
||||||
|
|
||||||
|
Permission to use, copy, modify, and/or distribute this software
|
||||||
|
for any purpose with or without fee is hereby granted, provided
|
||||||
|
that the above copyright notice and this permission notice appear
|
||||||
|
in all copies.
|
||||||
|
|
||||||
|
There is NO WARRANTY for this software. See license.txt for
|
||||||
|
details. */
|
||||||
|
|
||||||
|
package java.util.regex;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A minimal implementation of a regular expression engine.
|
||||||
|
* <p>
|
||||||
|
* Intended as a permissively-licensed drop-in replacement for Oracle JDK's
|
||||||
|
* regular expression engine, this class uses the Pike VM implemented in
|
||||||
|
* {@link PikeVM} to match regular expressions.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* The Pike VM not only has a nicer runtime performance than Oracle JDK's
|
||||||
|
* backtracking approach -- <i>O(n*m)</i> instead of <i>O(2^m)</i> where
|
||||||
|
* <i>n</i> is the length of the regular expression pattern (after normalizing
|
||||||
|
* {<n>} quantifiers) and <i>m</i> the length of the text to match against
|
||||||
|
* the pattern -- but also supports arbitrary-sized look-behinds.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* The current implementation supports all regular expression constructs
|
||||||
|
* supported by Oracle JDK's regular expression engine except for the following
|
||||||
|
* ones:
|
||||||
|
* <ul>
|
||||||
|
* <li>control characters: \cX</li>
|
||||||
|
* <li>extended character classes: \p{...}</li>
|
||||||
|
* <li>extended boundary matchers: \A,\G,\Z,\z</li>
|
||||||
|
* <li>possessive quantifiers: X?+</li>
|
||||||
|
* <li>back references: \<n>, \k<name></li>
|
||||||
|
* <li>long escape: \Q, \E</li>
|
||||||
|
* <li>named groups: (?<name>X)</li>
|
||||||
|
* <li>flags: (?idmsuxU)</li>
|
||||||
|
* <li>independent, non-capturing group: (?>X)</li>
|
||||||
|
* </ul>
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @author Johannes Schindelin
|
||||||
|
*/
|
||||||
|
public class RegexPattern extends Pattern {
|
||||||
|
private PikeVM vm;
|
||||||
|
|
||||||
|
public RegexMatcher matcher(CharSequence string) {
|
||||||
|
return new RegexMatcher(vm, string);
|
||||||
|
}
|
||||||
|
|
||||||
|
RegexPattern(String regex, int flags, PikeVM vm) {
|
||||||
|
super(regex, flags);
|
||||||
|
this.vm = vm;
|
||||||
|
}
|
||||||
|
}
|
48
classpath/java/util/regex/TrivialMatcher.java
Normal file
48
classpath/java/util/regex/TrivialMatcher.java
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
/* Copyright (c) 2008-2013, Avian Contributors
|
||||||
|
|
||||||
|
Permission to use, copy, modify, and/or distribute this software
|
||||||
|
for any purpose with or without fee is hereby granted, provided
|
||||||
|
that the above copyright notice and this permission notice appear
|
||||||
|
in all copies.
|
||||||
|
|
||||||
|
There is NO WARRANTY for this software. See license.txt for
|
||||||
|
details. */
|
||||||
|
|
||||||
|
package java.util.regex;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is a work in progress.
|
||||||
|
*
|
||||||
|
* @author zsombor and others
|
||||||
|
*/
|
||||||
|
class TrivialMatcher extends Matcher {
|
||||||
|
private final String pattern;
|
||||||
|
|
||||||
|
TrivialMatcher(String pattern, CharSequence input) {
|
||||||
|
super(input);
|
||||||
|
this.pattern = pattern;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean matches() {
|
||||||
|
if (pattern.equals(input.toString())) {
|
||||||
|
start = 0;
|
||||||
|
end = input.length();
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean find(int start) {
|
||||||
|
String p = pattern;
|
||||||
|
int i = TrivialPattern.indexOf(input, p, start);
|
||||||
|
if (i >= 0) {
|
||||||
|
this.start = i;
|
||||||
|
this.end = i + p.length();
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
112
classpath/java/util/regex/TrivialPattern.java
Normal file
112
classpath/java/util/regex/TrivialPattern.java
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
/* Copyright (c) 2008-2013, Avian Contributors
|
||||||
|
|
||||||
|
Permission to use, copy, modify, and/or distribute this software
|
||||||
|
for any purpose with or without fee is hereby granted, provided
|
||||||
|
that the above copyright notice and this permission notice appear
|
||||||
|
in all copies.
|
||||||
|
|
||||||
|
There is NO WARRANTY for this software. See license.txt for
|
||||||
|
details. */
|
||||||
|
|
||||||
|
package java.util.regex;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is a work in progress.
|
||||||
|
*
|
||||||
|
* @author zsombor and others
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class TrivialPattern extends Pattern {
|
||||||
|
|
||||||
|
private final String unescaped;
|
||||||
|
|
||||||
|
TrivialPattern(String pattern, String unescaped, int flags) {
|
||||||
|
super(pattern, flags);
|
||||||
|
this.unescaped = unescaped;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Matcher matcher(CharSequence input) {
|
||||||
|
return new TrivialMatcher(unescaped, input);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String[] split(CharSequence input, int limit) {
|
||||||
|
boolean strip;
|
||||||
|
if (limit < 0) {
|
||||||
|
strip = false;
|
||||||
|
limit = Integer.MAX_VALUE;
|
||||||
|
} else if (limit == 0) {
|
||||||
|
strip = true;
|
||||||
|
limit = Integer.MAX_VALUE;
|
||||||
|
} else {
|
||||||
|
strip = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<CharSequence> list = new LinkedList<CharSequence>();
|
||||||
|
int index = 0;
|
||||||
|
int trailing = 0;
|
||||||
|
int patternLength = unescaped.length();
|
||||||
|
while (index < input.length() && list.size() < limit - 1) {
|
||||||
|
int i;
|
||||||
|
if (patternLength == 0) {
|
||||||
|
if (list.size() == 0) {
|
||||||
|
i = 0;
|
||||||
|
} else {
|
||||||
|
i = index + 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
i = indexOf(input, unescaped, index);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i >= 0) {
|
||||||
|
if (patternLength != 0 && i == index) {
|
||||||
|
++ trailing;
|
||||||
|
} else {
|
||||||
|
trailing = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
list.add(input.subSequence(index, i));
|
||||||
|
index = i + patternLength;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (strip && index > 0 && index == input.length()) {
|
||||||
|
++ trailing;
|
||||||
|
} else {
|
||||||
|
trailing = 0;
|
||||||
|
}
|
||||||
|
list.add(input.subSequence(index, input.length()));
|
||||||
|
|
||||||
|
String[] result = new String[list.size() - trailing];
|
||||||
|
int i = 0;
|
||||||
|
for (Iterator<CharSequence> it = list.iterator();
|
||||||
|
it.hasNext() && i < result.length; ++ i)
|
||||||
|
{
|
||||||
|
result[i] = it.next().toString();
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int indexOf(CharSequence haystack, CharSequence needle, int start) {
|
||||||
|
if (needle.length() == 0) return start;
|
||||||
|
|
||||||
|
for (int i = start; i < haystack.length() - needle.length() + 1; ++i) {
|
||||||
|
int j = 0;
|
||||||
|
for (; j < needle.length(); ++j) {
|
||||||
|
if (haystack.charAt(i + j) != needle.charAt(j)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (j == needle.length()) {
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
96
test/Regex.java
Normal file
96
test/Regex.java
Normal file
@ -0,0 +1,96 @@
|
|||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
/**
 * Regression tests for the regular expression engine.
 * <p>
 * Every helper throws a {@link RuntimeException} naming the regular
 * expression and the input as soon as an expectation is not met.
 * </p>
 */
public class Regex {
  // Throws with the given message unless the condition holds.
  private static void expect(boolean v, String message) {
    if (! v) throw new RuntimeException(message);
  }

  // Common prefix of all failure messages.
  private static String describe(String regex, String string) {
    return "/" + regex + "/ on \"" + string + "\"";
  }

  private static Matcher getMatcher(String regex, String string) {
    return Pattern.compile(regex).matcher(string);
  }

  // The regex must match the whole string.
  private static void expectMatch(String regex, String string) {
    expect(getMatcher(regex, string).matches(),
           describe(regex, string) + ": expected a full match");
  }

  // The regex must not match the whole string.
  private static void expectNoMatch(String regex, String string) {
    expect(!getMatcher(regex, string).matches(),
           describe(regex, string) + ": expected no full match");
  }

  // The regex must match the whole string and capture exactly the given
  // groups; a null entry means the group must not have participated.
  private static void expectGroups(String regex, String string,
                                   String... groups) {
    String context = describe(regex, string);
    Matcher matcher = getMatcher(regex, string);
    expect(matcher.matches(), context + ": expected a full match");

    int count = matcher.groupCount();
    expect(count == groups.length,
           context + ": expected " + groups.length + " groups, found "
           + count);

    for (int i = 1; i <= groups.length; ++i) {
      String expected = groups[i - 1];
      String actual = matcher.group(i);
      boolean same = expected == null
        ? actual == null
        : expected.equals(actual);
      expect(same, context + ": group " + i + " expected " + expected
             + ", found " + actual);
    }
  }

  // Successive find() calls must yield exactly the given matches.
  private static void expectFind(String regex, String string,
                                 String... matches)
  {
    String context = describe(regex, string);
    Matcher matcher = getMatcher(regex, string);
    for (int i = 0; i < matches.length; ++i) {
      expect(matcher.find(), context + ": missing match #" + i);
      String found = matcher.group();
      expect(matches[i].equals(found),
             context + ": match #" + i + " expected \"" + matches[i]
             + "\", found \"" + found + "\"");
    }
    expect(!matcher.find(), context + ": unexpected additional match");
  }

  // Splitting the string around the regex must yield exactly the given list.
  private static void expectSplit(String regex, String string,
                                  String... list)
  {
    String context = describe(regex, string);
    String[] array = Pattern.compile(regex).split(string);
    expect(array.length == list.length,
           context + ": expected " + list.length + " pieces, found "
           + array.length);
    for (int i = 0; i < list.length; ++ i) {
      expect(list[i].equals(array[i]),
             context + ": piece " + i + " expected \"" + list[i]
             + "\", found \"" + array[i] + "\"");
    }
  }

  public static void main(String[] args) {
    expectMatch("a(bb)?a", "abba");
    expectNoMatch("a(bb)?a", "abbba");
    expectNoMatch("a(bb)?a", "abbaa");
    expectGroups("a(a*?)(a?)(a??)(a+)(a*)a", "aaaaaa", "", "a", "", "aaa", "");
    expectMatch("...", "abc");
    expectNoMatch(".", "\n");
    expectGroups("a(bb)*a", "abbbba", "bb");
    expectGroups("a(bb)?(bb)+a", "abba", null, "bb");
    expectFind(" +", "Hello , world! ", " ", " ", " ");
    expectMatch("[0-9A-Fa-f]+", "08ef");
    expectNoMatch("[0-9A-Fa-f]+", "08@ef");
    expectGroups("(?:a)", "a");
    expectGroups("a|(b|c)", "a", (String)null);
    expectGroups("a|(b|c)", "c", "c");
    expectGroups("(?=a)a", "a");
    expectGroups(".*(o)(?<=[A-Z][a-z]*)", "Hello", "o");
    expectNoMatch("(?!a).", "a");
    expectMatch("[\\d]", "0");
    expectMatch("\\0777", "?7");
    expectMatch("\\a", "\007");
    expectMatch("\\\\", "\\");
    expectMatch("\\x4A", "J");
    expectMatch("\\x61", "a");
    expectMatch("\\078", "\0078");
    expectSplit("(?<=\\w)(?=\\W)|(?<=\\W)(?=\\w)", "a + b * x",
                "a", " + ", "b", " * ", "x");
    expectMatch("[0-9[def]]", "f");
    expectNoMatch("[a-z&&[^d-f]]", "f");
    expectSplit("^H", "Hello\nHobbes!", "", "ello\nHobbes!");
    expectSplit("o.*?$", "Hello\r\nHobbes!", "Hello\r\nH");
    expectSplit("\\b", "a+ b + c\nd", "", "a", "+ ", "b", " + ", "c", "\n", "d");
    expectSplit("\\B", "Hi Cal!", "H", "i C", "a", "l!");
    expectMatch("a{2,5}", "aaaa");
    expectGroups("a??(a{2,5}?)", "aaaa", "aaaa");
    expectGroups("a??(a{3}?)", "aaaa", "aaa");
    expectNoMatch("a(a{3}?)", "aaaaa");
    expectMatch("a(a{3,}?)", "aaaaa");
  }
}
|
Loading…
Reference in New Issue
Block a user