From e2105670a0a88ff3b038910e79b8f765401d6b8f Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Fri, 22 Nov 2013 17:30:06 -0600 Subject: [PATCH] Regex compiler: fall back to TrivialPattern when possible While at it, let's get rid of the unescaping in TrivialPattern which was buggy anyway: special operators such as \b were misinterpreted as trivial patterns. Signed-off-by: Johannes Schindelin --- test/regex/Compiler.java | 7 ++- test/regex/Pattern.java | 3 -- test/regex/PikeVM.java | 37 +++++++++++++ test/regex/TrivialPattern.java | 99 +++------------------------------- 4 files changed, 49 insertions(+), 97 deletions(-) diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 7213489d9c..70ebf13391 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -166,6 +166,11 @@ class Compiler implements PikeVMOpcodes { throw new IllegalArgumentException("Unclosed groups: (" + (groups.size() - 1) + "): " + regex); } - return new RegexPattern(regex, 0, new Output(root).toVM()); + PikeVM vm = new Output(root).toVM(); + String plain = vm.isPlainString(); + if (plain != null) { + return new TrivialPattern(regex, plain, 0); + } + return new RegexPattern(regex, 0, vm); } } diff --git a/test/regex/Pattern.java b/test/regex/Pattern.java index bd0405ed4c..49ac289ac3 100644 --- a/test/regex/Pattern.java +++ b/test/regex/Pattern.java @@ -43,9 +43,6 @@ public abstract class Pattern implements PikeVMOpcodes { } public static Pattern compile(String regex, int flags) { - try { - return new TrivialPattern(regex, flags); - } catch (UnsupportedOperationException handledBelow) { } if (flags != 0) { throw new UnsupportedOperationException("TODO"); } diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java index db8ad44ef7..d1d6ce890b 100644 --- a/test/regex/PikeVM.java +++ b/test/regex/PikeVM.java @@ -332,4 +332,41 @@ class PikeVM implements PikeVMOpcodes { } return foundMatch; } + + /** + * Determines whether this machine recognizes a pattern without 
special + * operators. + *

+ * If the regular expression is actually a plain string without any + * special operators, we can avoid using a full-blown Pike VM and instead fall + * back to using the much faster {@link TrivialPattern}. + *

+ * + * @return the string to match, or null if the machine recognizes a + * non-trivial pattern + */ + public String isPlainString() { + // we expect the machine to start with SAVE_OFFSET 0 and + // end with SAVE_OFFSET 1 + int start = 0; + if (start + 1 < program.length && + program[start] == SAVE_OFFSET && program[start + 1] == 0) { + start += 2; + } + int end = program.length; + if (end > start + 1 && + program[end - 2] == SAVE_OFFSET && program[end - 1] == 1) { + end -= 2; + } + for (int i = start; i < end; ++ i) { + if (program[i] < 0) { + return null; + } + } + char[] array = new char[end - start]; + for (int i = start; i < end; ++ i) { + array[i - start] = (char)program[i]; + } + return new String(array); + } } diff --git a/test/regex/TrivialPattern.java b/test/regex/TrivialPattern.java index 40ea88fa3a..6289edc676 100644 --- a/test/regex/TrivialPattern.java +++ b/test/regex/TrivialPattern.java @@ -22,102 +22,15 @@ import java.util.LinkedList; */ public class TrivialPattern extends Pattern { - private final String trivialPattern; + private final String unescaped; - TrivialPattern(String pattern, int flags) { + TrivialPattern(String pattern, String unescaped, int flags) { super(pattern, flags); - this.trivialPattern = trivial(pattern); - } - - private static String trivial(String pattern) { - StringBuffer buffer = new StringBuffer(); - for (int i = 0; i < pattern.length(); ++i) { - char c = pattern.charAt(i); - switch (c) { - case '\\': - if (++i == pattern.length()) { - break; - } - c = pattern.charAt(i); - if (c == '0') { - int len = digits(pattern, ++i, 3, 8); - if (len == 3 && pattern.charAt(i) > '3') { - --len; - } - c = (char)Integer.parseInt(pattern.substring(i, i + len), 8); - i += len - 1; - } else if (c == 'x' || c == 'u') { - int len = digits(pattern, ++i, 4, 16); - c = (char)Integer.parseInt(pattern.substring(i, i + len), 16); - i += len - 1; - } else { - c = unescape(pattern.charAt(i)); - } - if (c != -1) { - break; - } - // fallthru - case '.': 
- case '*': - case '+': - case '?': - case '|': - case '[': - case ']': - case '{': - case '}': - case '(': - case ')': - case '^': - case '$': - throw new UnsupportedOperationException - ("only trivial regular expressions are supported so far (" + pattern + ")"); - } - buffer.append(c); - } - return buffer.toString(); - } - - private static int digits(String s, int offset, int maxLength, int base) { - for (int i = 0; ; ++i) { - if (i == maxLength || offset + i >= s.length()) { - return i; - } - int value = s.charAt(offset + i) - '0'; - if (value < 0) { - return i; - } - if (base > 10 && value >= 10) { - value += 10 - (value >= 'a' - '0' ? 'a' - '0' : 'A' - '0'); - } - if (value >= base) { - return i; - } - } - } - - private static char unescape(char c) { - switch (c) { - case '\\': - return c; - case 'a': - return 0x0007; - case 'e': - return 0x001B; - case 'f': - return 0x000C; - case 'n': - return 0x000A; - case 'r': - return 0x000D; - case 't': - return 0x0009; - } - return (char)-1; + this.unescaped = unescaped; } public Matcher matcher(CharSequence input) { - return new TrivialMatcher(trivialPattern, input); + return new TrivialMatcher(unescaped, input); } public String[] split(CharSequence input, int limit) { @@ -135,7 +48,7 @@ public class TrivialPattern extends Pattern { List list = new LinkedList(); int index = 0; int trailing = 0; - int patternLength = trivialPattern.length(); + int patternLength = unescaped.length(); while (index < input.length() && list.size() < limit - 1) { int i; if (patternLength == 0) { @@ -145,7 +58,7 @@ public class TrivialPattern extends Pattern { i = index + 1; } } else { - i = indexOf(input, trivialPattern, index); + i = indexOf(input, unescaped, index); } if (i >= 0) {