From ca428c406c70c4f37858df465bafad25da0bee49 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Wed, 13 Nov 2013 17:54:47 -0600 Subject: [PATCH] Regex: implement find() Now that we have non-greedy repeats, we can implement find() (which essentially prefixes the regular expression pattern with '.*?'). Signed-off-by: Johannes Schindelin --- test/Regex.java | 13 +++++++++ test/regex/Compiler.java | 18 +++++++++++-- test/regex/PikeVM.java | 51 ++++++++++++++++++++++++++++++------ test/regex/RegexMatcher.java | 6 ++++- 4 files changed, 77 insertions(+), 11 deletions(-) diff --git a/test/Regex.java b/test/Regex.java index 39f3dfe29a..db2cf86892 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -32,6 +32,18 @@ public class Regex { } } + private static void expectFind(String regex, String string, + String... matches) + { + Matcher matcher = getMatcher(regex, string); + int i = 0; + while (i < matches.length) { + expect(matcher.find()); + expect(matches[i++].equals(matcher.group())); + } + expect(!matcher.find()); + } + public static void main(String[] args) { expectMatch("a(bb)?a", "abba"); expectNoMatch("a(bb)?a", "abbba"); @@ -41,5 +53,6 @@ public class Regex { expectNoMatch(".", "\n"); expectGroups("a(bb)*a", "abbbba", "bb"); expectGroups("a(bb)?(bb)+a", "abba", null, "bb"); + expectFind(" +", "Hello , world! 
", " ", " ", " "); } } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 6f65e6b4d9..2b6b83d23f 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -26,6 +26,7 @@ class Compiler implements PikeVMOpcodes { private int[] program; private int offset; private int groupCount = -1; + private int findPreambleSize; public Output(Expression expr) { // try-run to determine the code size @@ -54,9 +55,14 @@ class Compiler implements PikeVMOpcodes { } } - public PikeVM toVM() { - return new PikeVM(program, groupCount); + public void markFindPreambleEnd() { + findPreambleSize = offset; } + + public PikeVM toVM() { + return new PikeVM(program, findPreambleSize, groupCount); + } + } private abstract class Expression { @@ -148,6 +154,14 @@ class Compiler implements PikeVMOpcodes { } public void writeCode(Output output) { + // find() preamble + int start = output.offset; + output.add(SPLIT_JMP); + output.add(start + 5); + output.add(DOTALL); + output.add(SPLIT); + output.add(start + 2); + output.markFindPreambleEnd(); group.writeCode(output); } } diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java index d1d6ce890b..7b3e55467d 100644 --- a/test/regex/PikeVM.java +++ b/test/regex/PikeVM.java @@ -19,13 +19,21 @@ class PikeVM implements PikeVMOpcodes { private final int[] program; private final int groupCount; private final int offsetsCount; + /* + * For find(), we do not want to anchor the match at the start offset. Our + * compiler allows this by prefixing the code with an implicit '(?:.*?)'. For + * regular matches() calls, we want to skip that code and start at {@code + * findPrefixLength} instead. 
+ */ + private final int findPrefixLength; public interface Result { void set(int[] start, int[] end); } - protected PikeVM(int[] program, int groupCount) { + protected PikeVM(int[] program, int findPrefixLength, int groupCount) { this.program = program; + this.findPrefixLength = findPrefixLength; this.groupCount = groupCount; offsetsCount = 2 * groupCount + 2; } @@ -190,6 +198,31 @@ class PikeVM implements PikeVMOpcodes { result.set(groupStart, groupEnd); } + private void mustStartMatchAt(int start) { + int previous = -1; + for (int pc = head; pc >= 0; ) { + int nextPC = next[pc] - 1; + if (start + 1 == offsets[pc][0]) { + previous = pc; + } else { + next[pc] = 0; + offsets[pc] = null; + if (pc == tail) { + head = tail = -1; + } else if (previous < 0) { + head = nextPC; + } else { + next[previous] = 1 + nextPC; + } + } + pc = nextPC; + } + } + + private int startOffset(int pc) { + return offsets[pc][0] - 1; + } + public boolean isEmpty() { return head < 0; } @@ -250,11 +283,8 @@ class PikeVM implements PikeVMOpcodes { ThreadQueue next = new ThreadQueue(); // initialize the first thread - ThreadQueue queued = new ThreadQueue(0); - if (!anchorStart) { - // this requires non-greedy matching - throw new UnsupportedOperationException(); - } + int startPC = anchorStart ? 
findPrefixLength : 0; + ThreadQueue queued = new ThreadQueue(startPC); boolean foundMatch = false; for (int i = start; i <= end; ++i) { @@ -280,6 +310,11 @@ class PikeVM implements PikeVMOpcodes { continue; } current.setResult(result); + // now that we found a match, even higher-priority matches must match + // at the same start offset + if (!anchorStart) { + next.mustStartMatchAt(current.startOffset(pc)); + } foundMatch = true; break; } @@ -346,9 +381,9 @@ class PikeVM implements PikeVMOpcodes { * non-trivial pattern */ public String isPlainString() { - // we expect the machine to start with SAVE_OFFSET 0 and + // we expect the machine to start with the find preamble and SAVE_OFFSET 0 // end with SAVE_OFFSET 1 - int start = 0; + int start = findPrefixLength; if (start + 1 < program.length && program[start] == SAVE_OFFSET && program[start + 1] == 0) { start += 2; diff --git a/test/regex/RegexMatcher.java b/test/regex/RegexMatcher.java index 5ea2eea82c..78bc7c77ca 100644 --- a/test/regex/RegexMatcher.java +++ b/test/regex/RegexMatcher.java @@ -49,8 +49,12 @@ public class RegexMatcher extends Matcher { return vm.matches(array, 0, array.length, true, true, adapter); } + public boolean find() { + return find(end + (start == end ? 1 : 0)); + } + public boolean find(int offset) { - throw new UnsupportedOperationException("TODO"); + return vm.matches(array, offset, array.length, false, false, adapter); } public int start(int group) {