Regex: implement find()

Now that we have non-greedy repeats, we can implement find() (which
essentially prefixes the regular expression pattern with '.*?').

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
This commit is contained in:
Johannes Schindelin 2013-11-13 17:54:47 -06:00
parent 7da03b0f19
commit ca428c406c
4 changed files with 77 additions and 11 deletions

View File

@ -32,6 +32,18 @@ public class Regex {
} }
} }
/**
 * Checks that successive {@code find()} calls on {@code string} yield exactly
 * the given matches, in order, and that no further match follows.
 *
 * @param regex the pattern handed to {@code getMatcher}
 * @param string the input handed to {@code getMatcher}
 * @param matches the expected value of {@code group()} after each successful
 *        {@code find()}, in the order the matches must be found
 */
private static void expectFind(String regex, String string,
    String... matches)
{
  Matcher matcher = getMatcher(regex, string);
  for (String expected : matches) {
    // each expected match must be found before its text is compared
    expect(matcher.find());
    expect(expected.equals(matcher.group()));
  }
  // once all expected matches are consumed, nothing else may match
  expect(!matcher.find());
}
public static void main(String[] args) { public static void main(String[] args) {
expectMatch("a(bb)?a", "abba"); expectMatch("a(bb)?a", "abba");
expectNoMatch("a(bb)?a", "abbba"); expectNoMatch("a(bb)?a", "abbba");
@ -41,5 +53,6 @@ public class Regex {
expectNoMatch(".", "\n"); expectNoMatch(".", "\n");
expectGroups("a(bb)*a", "abbbba", "bb"); expectGroups("a(bb)*a", "abbbba", "bb");
expectGroups("a(bb)?(bb)+a", "abba", null, "bb"); expectGroups("a(bb)?(bb)+a", "abba", null, "bb");
expectFind(" +", "Hello , world! ", " ", " ", " ");
} }
} }

View File

@ -26,6 +26,7 @@ class Compiler implements PikeVMOpcodes {
private int[] program; private int[] program;
private int offset; private int offset;
private int groupCount = -1; private int groupCount = -1;
private int findPreambleSize;
public Output(Expression expr) { public Output(Expression expr) {
// try-run to determine the code size // try-run to determine the code size
@ -54,9 +55,14 @@ class Compiler implements PikeVMOpcodes {
} }
} }
public PikeVM toVM() { public void markFindPreambleEnd() {
return new PikeVM(program, groupCount); findPreambleSize = offset;
} }
public PikeVM toVM() {
return new PikeVM(program, findPreambleSize, groupCount);
}
} }
private abstract class Expression { private abstract class Expression {
@ -148,6 +154,14 @@ class Compiler implements PikeVMOpcodes {
} }
public void writeCode(Output output) { public void writeCode(Output output) {
// find() preamble
int start = output.offset;
output.add(SPLIT_JMP);
output.add(start + 5);
output.add(DOTALL);
output.add(SPLIT);
output.add(start + 2);
output.markFindPreambleEnd();
group.writeCode(output); group.writeCode(output);
} }
} }

View File

@ -19,13 +19,21 @@ class PikeVM implements PikeVMOpcodes {
private final int[] program; private final int[] program;
private final int groupCount; private final int groupCount;
private final int offsetsCount; private final int offsetsCount;
/*
* For find(), we do not want to anchor the match at the start offset. Our
* compiler allows this by prefixing the code with an implicit '(?:.*?)'. For
* regular matches() calls, we want to skip that code and start at {@code
* findPrefixLength} instead.
*/
private final int findPrefixLength;
public interface Result { public interface Result {
void set(int[] start, int[] end); void set(int[] start, int[] end);
} }
protected PikeVM(int[] program, int groupCount) { protected PikeVM(int[] program, int findPrefixLength, int groupCount) {
this.program = program; this.program = program;
this.findPrefixLength = findPrefixLength;
this.groupCount = groupCount; this.groupCount = groupCount;
offsetsCount = 2 * groupCount + 2; offsetsCount = 2 * groupCount + 2;
} }
@ -190,6 +198,31 @@ class PikeVM implements PikeVMOpcodes {
result.set(groupStart, groupEnd); result.set(groupStart, groupEnd);
} }
/*
 * Drops from this queue every thread whose recorded match start differs
 * from {@code start}; the threads that remain keep their relative order.
 *
 * The test is start + 1 == offsets[pc][0], i.e. the stored start offset is
 * biased by one. Likewise next[pc] holds the successor's pc plus one (it is
 * read back as next[pc] - 1 and written as 1 + nextPC).
 */
private void mustStartMatchAt(int start) {
// most recently visited thread that was kept; -1 while none was kept yet
int previous = -1;
for (int pc = head; pc >= 0; ) {
// fetch the successor first, since next[pc] may be cleared below
int nextPC = next[pc] - 1;
if (start + 1 == offsets[pc][0]) {
previous = pc;
} else {
// unlink this thread and discard its saved offsets
next[pc] = 0;
offsets[pc] = null;
if (pc == tail) {
// NOTE(review): this empties the whole queue even when earlier threads
// were kept (previous >= 0); it looks like tail should become previous
// in that case -- confirm against the queue invariants.
head = tail = -1;
} else if (previous < 0) {
// nothing kept so far: the successor becomes the new head
head = nextPC;
} else {
// bypass the removed thread in the linked list
next[previous] = 1 + nextPC;
}
}
pc = nextPC;
}
}
/*
 * Returns offsets[pc][0] minus one for the thread queued at {@code pc},
 * undoing the bias of one with which that slot is read here.
 */
private int startOffset(int pc) {
  final int biasedStart = offsets[pc][0];
  return biasedStart - 1;
}
public boolean isEmpty() { public boolean isEmpty() {
return head < 0; return head < 0;
} }
@ -250,11 +283,8 @@ class PikeVM implements PikeVMOpcodes {
ThreadQueue next = new ThreadQueue(); ThreadQueue next = new ThreadQueue();
// initialize the first thread // initialize the first thread
ThreadQueue queued = new ThreadQueue(0); int startPC = anchorStart ? findPrefixLength : 0;
if (!anchorStart) { ThreadQueue queued = new ThreadQueue(startPC);
// this requires non-greedy matching
throw new UnsupportedOperationException();
}
boolean foundMatch = false; boolean foundMatch = false;
for (int i = start; i <= end; ++i) { for (int i = start; i <= end; ++i) {
@ -280,6 +310,11 @@ class PikeVM implements PikeVMOpcodes {
continue; continue;
} }
current.setResult(result); current.setResult(result);
// now that we found a match, even higher-priority matches must match
// at the same start offset
if (!anchorStart) {
next.mustStartMatchAt(current.startOffset(pc));
}
foundMatch = true; foundMatch = true;
break; break;
} }
@ -346,9 +381,9 @@ class PikeVM implements PikeVMOpcodes {
* non-trivial pattern * non-trivial pattern
*/ */
public String isPlainString() { public String isPlainString() {
// we expect the machine to start with SAVE_OFFSET 0 and // we expect the machine to start with the find preamble and SAVE_OFFSET 0
// end with SAVE_OFFSET 1 // end with SAVE_OFFSET 1
int start = 0; int start = findPrefixLength;
if (start + 1 < program.length && if (start + 1 < program.length &&
program[start] == SAVE_OFFSET && program[start + 1] == 0) { program[start] == SAVE_OFFSET && program[start + 1] == 0) {
start += 2; start += 2;

View File

@ -49,8 +49,12 @@ public class RegexMatcher extends Matcher {
return vm.matches(array, 0, array.length, true, true, adapter); return vm.matches(array, 0, array.length, true, true, adapter);
} }
/**
 * Searches for the next match, continuing from {@code end}.
 *
 * When {@code start == end} (presumably: the previous match was empty) the
 * search begins one position further on, so it does not restart at the same
 * offset.
 *
 * NOTE(review): if {@code start} and {@code end} are both 0 before the first
 * call, this begins the very first search at offset 1 -- confirm the initial
 * values of those fields.
 *
 * @return the result of {@code find(int)} for the computed offset
 */
public boolean find() {
  int from = end;
  if (start == end) {
    ++from;
  }
  return find(from);
}
public boolean find(int offset) { public boolean find(int offset) {
throw new UnsupportedOperationException("TODO"); return vm.matches(array, offset, array.length, false, false, adapter);
} }
public int start(int group) { public int start(int group) {