mirror of
https://github.com/corda/corda.git
synced 2025-01-08 14:03:06 +00:00
Regex compiler: fall back to TrivialPattern when possible
While at it, let's get rid of the unescaping in TrivialPattern, which was buggy anyway: special operators such as \b were misinterpreted as trivial patterns.
Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
This commit is contained in:
parent
04d8955f98
commit
e2105670a0
@ -166,6 +166,11 @@ class Compiler implements PikeVMOpcodes {
|
|||||||
throw new IllegalArgumentException("Unclosed groups: ("
|
throw new IllegalArgumentException("Unclosed groups: ("
|
||||||
+ (groups.size() - 1) + "): " + regex);
|
+ (groups.size() - 1) + "): " + regex);
|
||||||
}
|
}
|
||||||
return new RegexPattern(regex, 0, new Output(root).toVM());
|
PikeVM vm = new Output(root).toVM();
|
||||||
|
String plain = vm.isPlainString();
|
||||||
|
if (plain != null) {
|
||||||
|
return new TrivialPattern(regex, plain, 0);
|
||||||
|
}
|
||||||
|
return new RegexPattern(regex, 0, vm);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -43,9 +43,6 @@ public abstract class Pattern implements PikeVMOpcodes {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static Pattern compile(String regex, int flags) {
|
public static Pattern compile(String regex, int flags) {
|
||||||
try {
|
|
||||||
return new TrivialPattern(regex, flags);
|
|
||||||
} catch (UnsupportedOperationException handledBelow) { }
|
|
||||||
if (flags != 0) {
|
if (flags != 0) {
|
||||||
throw new UnsupportedOperationException("TODO");
|
throw new UnsupportedOperationException("TODO");
|
||||||
}
|
}
|
||||||
|
@ -332,4 +332,41 @@ class PikeVM implements PikeVMOpcodes {
|
|||||||
}
|
}
|
||||||
return foundMatch;
|
return foundMatch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines whether this machine recognizes a pattern without special
|
||||||
|
* operators.
|
||||||
|
* <p>
|
||||||
|
* In case that the regular expression is actually a plain string without any
|
||||||
|
* special operators, we can avoid using a full-blown Pike VM and instead fall
|
||||||
|
* back to using the much faster {@link TrivialPattern}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @return the string to match, or null if the machine recognizes a
|
||||||
|
* non-trivial pattern
|
||||||
|
*/
|
||||||
|
public String isPlainString() {
|
||||||
|
// we expect the machine to start with SAVE_OFFSET 0 and
|
||||||
|
// end with SAVE_OFFSET 1
|
||||||
|
int start = 0;
|
||||||
|
if (start + 1 < program.length &&
|
||||||
|
program[start] == SAVE_OFFSET && program[start + 1] == 0) {
|
||||||
|
start += 2;
|
||||||
|
}
|
||||||
|
int end = program.length;
|
||||||
|
if (end > start + 1 &&
|
||||||
|
program[end - 2] == SAVE_OFFSET && program[end - 1] == 1) {
|
||||||
|
end -= 2;
|
||||||
|
}
|
||||||
|
for (int i = start; i < end; ++ i) {
|
||||||
|
if (program[i] < 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
char[] array = new char[end - start];
|
||||||
|
for (int i = start; i < end; ++ i) {
|
||||||
|
array[i - start] = (char)program[i];
|
||||||
|
}
|
||||||
|
return new String(array);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -22,102 +22,15 @@ import java.util.LinkedList;
|
|||||||
*/
|
*/
|
||||||
public class TrivialPattern extends Pattern {
|
public class TrivialPattern extends Pattern {
|
||||||
|
|
||||||
private final String trivialPattern;
|
private final String unescaped;
|
||||||
|
|
||||||
TrivialPattern(String pattern, int flags) {
|
TrivialPattern(String pattern, String unescaped, int flags) {
|
||||||
super(pattern, flags);
|
super(pattern, flags);
|
||||||
this.trivialPattern = trivial(pattern);
|
this.unescaped = unescaped;
|
||||||
}
|
|
||||||
|
|
||||||
private static String trivial(String pattern) {
|
|
||||||
StringBuffer buffer = new StringBuffer();
|
|
||||||
for (int i = 0; i < pattern.length(); ++i) {
|
|
||||||
char c = pattern.charAt(i);
|
|
||||||
switch (c) {
|
|
||||||
case '\\':
|
|
||||||
if (++i == pattern.length()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
c = pattern.charAt(i);
|
|
||||||
if (c == '0') {
|
|
||||||
int len = digits(pattern, ++i, 3, 8);
|
|
||||||
if (len == 3 && pattern.charAt(i) > '3') {
|
|
||||||
--len;
|
|
||||||
}
|
|
||||||
c = (char)Integer.parseInt(pattern.substring(i, i + len), 8);
|
|
||||||
i += len - 1;
|
|
||||||
} else if (c == 'x' || c == 'u') {
|
|
||||||
int len = digits(pattern, ++i, 4, 16);
|
|
||||||
c = (char)Integer.parseInt(pattern.substring(i, i + len), 16);
|
|
||||||
i += len - 1;
|
|
||||||
} else {
|
|
||||||
c = unescape(pattern.charAt(i));
|
|
||||||
}
|
|
||||||
if (c != -1) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
// fallthru
|
|
||||||
case '.':
|
|
||||||
case '*':
|
|
||||||
case '+':
|
|
||||||
case '?':
|
|
||||||
case '|':
|
|
||||||
case '[':
|
|
||||||
case ']':
|
|
||||||
case '{':
|
|
||||||
case '}':
|
|
||||||
case '(':
|
|
||||||
case ')':
|
|
||||||
case '^':
|
|
||||||
case '$':
|
|
||||||
throw new UnsupportedOperationException
|
|
||||||
("only trivial regular expressions are supported so far (" + pattern + ")");
|
|
||||||
}
|
|
||||||
buffer.append(c);
|
|
||||||
}
|
|
||||||
return buffer.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static int digits(String s, int offset, int maxLength, int base) {
|
|
||||||
for (int i = 0; ; ++i) {
|
|
||||||
if (i == maxLength || offset + i >= s.length()) {
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
int value = s.charAt(offset + i) - '0';
|
|
||||||
if (value < 0) {
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
if (base > 10 && value >= 10) {
|
|
||||||
value += 10 - (value >= 'a' - '0' ? 'a' - '0' : 'A' - '0');
|
|
||||||
}
|
|
||||||
if (value >= base) {
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static char unescape(char c) {
|
|
||||||
switch (c) {
|
|
||||||
case '\\':
|
|
||||||
return c;
|
|
||||||
case 'a':
|
|
||||||
return 0x0007;
|
|
||||||
case 'e':
|
|
||||||
return 0x001B;
|
|
||||||
case 'f':
|
|
||||||
return 0x000C;
|
|
||||||
case 'n':
|
|
||||||
return 0x000A;
|
|
||||||
case 'r':
|
|
||||||
return 0x000D;
|
|
||||||
case 't':
|
|
||||||
return 0x0009;
|
|
||||||
}
|
|
||||||
return (char)-1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Matcher matcher(CharSequence input) {
|
public Matcher matcher(CharSequence input) {
|
||||||
return new TrivialMatcher(trivialPattern, input);
|
return new TrivialMatcher(unescaped, input);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String[] split(CharSequence input, int limit) {
|
public String[] split(CharSequence input, int limit) {
|
||||||
@ -135,7 +48,7 @@ public class TrivialPattern extends Pattern {
|
|||||||
List<CharSequence> list = new LinkedList<CharSequence>();
|
List<CharSequence> list = new LinkedList<CharSequence>();
|
||||||
int index = 0;
|
int index = 0;
|
||||||
int trailing = 0;
|
int trailing = 0;
|
||||||
int patternLength = trivialPattern.length();
|
int patternLength = unescaped.length();
|
||||||
while (index < input.length() && list.size() < limit - 1) {
|
while (index < input.length() && list.size() < limit - 1) {
|
||||||
int i;
|
int i;
|
||||||
if (patternLength == 0) {
|
if (patternLength == 0) {
|
||||||
@ -145,7 +58,7 @@ public class TrivialPattern extends Pattern {
|
|||||||
i = index + 1;
|
i = index + 1;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
i = indexOf(input, trivialPattern, index);
|
i = indexOf(input, unescaped, index);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i >= 0) {
|
if (i >= 0) {
|
||||||
|
Loading…
Reference in New Issue
Block a user