Regex: support lookaheads

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
This commit is contained in:
Johannes Schindelin
2013-11-12 09:33:45 -06:00
parent d4a2f58eb5
commit 85af36ef90
4 changed files with 64 additions and 7 deletions

View File

@ -59,5 +59,6 @@ public class Regex {
expectGroups("(?:a)", "a"); expectGroups("(?:a)", "a");
expectGroups("a|(b|c)", "a", (String)null); expectGroups("a|(b|c)", "a", (String)null);
expectGroups("a|(b|c)", "c", "c"); expectGroups("a|(b|c)", "c", "c");
expectGroups("(?=a)a", "a");
} }
} }

View File

@ -28,6 +28,7 @@ class Compiler implements PikeVMOpcodes {
private int groupCount = -1; private int groupCount = -1;
private int findPreambleSize; private int findPreambleSize;
private ArrayList<CharacterMatcher> classes; private ArrayList<CharacterMatcher> classes;
private ArrayList<PikeVM> lookaheads;
public Output(Expression expr) { public Output(Expression expr) {
// try-run to determine the code size // try-run to determine the code size
@ -36,6 +37,7 @@ class Compiler implements PikeVMOpcodes {
offset = 0; offset = 0;
groupCount = -1; groupCount = -1;
classes = new ArrayList<CharacterMatcher>(); classes = new ArrayList<CharacterMatcher>();
lookaheads = new ArrayList<PikeVM>();
// write it out! // write it out!
expr.writeCode(this); expr.writeCode(this);
} }
@ -64,7 +66,10 @@ class Compiler implements PikeVMOpcodes {
public PikeVM toVM() { public PikeVM toVM() {
CharacterMatcher[] classes = new CharacterMatcher[this.classes.size()]; CharacterMatcher[] classes = new CharacterMatcher[this.classes.size()];
this.classes.toArray(classes); this.classes.toArray(classes);
return new PikeVM(program, findPreambleSize, groupCount, classes); PikeVM[] lookaheads = new PikeVM[this.lookaheads.size()];
this.lookaheads.toArray(lookaheads);
return new PikeVM(program, findPreambleSize, groupCount, classes,
lookaheads);
} }
public int addClass(CharacterMatcher characterClass) { public int addClass(CharacterMatcher characterClass) {
@ -75,6 +80,15 @@ class Compiler implements PikeVMOpcodes {
classes.add(characterClass); classes.add(characterClass);
return result; return result;
} }
/**
 * Records a compiled lookahead sub-program and reports where it was stored.
 *
 * @param lookahead the VM that evaluates the lookahead's sub-expression
 * @return the position of {@code lookahead} in the lookahead list, or
 *         {@code -1} when {@code program} is still {@code null}; in that
 *         case nothing is recorded (presumably this is the code-size
 *         try-run -- TODO confirm against the constructor)
 */
public int addLookahead(PikeVM lookahead) {
    if (program != null) {
        // the new entry lands at the current end of the list
        final int index = lookaheads.size();
        lookaheads.add(lookahead);
        return index;
    }
    return -1;
}
} }
private abstract class Expression { private abstract class Expression {
@ -212,6 +226,17 @@ class Compiler implements PikeVMOpcodes {
} }
} }
/**
 * Expression node the parser creates for a {@code (?=...)} construct.
 * The sub-expression is collected in {@link #group} and compiled into a
 * separate {@link PikeVM} rather than into the enclosing program.
 */
private class Lookahead extends Expression {
    /** Container for the lookahead's sub-expression. */
    private final Group group = new Group(false, null);

    /**
     * Compiles {@link #group} into its own VM, then emits the
     * {@code LOOKAHEAD} opcode followed by the index under which that VM
     * was registered with {@code output}.
     */
    @Override
    protected void writeCode(Output output) {
        // The sub-program is built first, exactly as before, so a failure
        // while compiling it leaves the enclosing output untouched.
        final PikeVM subProgram = new Output(group).toVM();
        output.add(LOOKAHEAD);
        final int tableIndex = output.addLookahead(subProgram);
        output.add(tableIndex);
    }
}
private class Group0 extends Expression { private class Group0 extends Expression {
private final Group group; private final Group group;
@ -271,10 +296,24 @@ class Compiler implements PikeVMOpcodes {
case '(': { case '(': {
boolean capturing = true; boolean capturing = true;
if (index + 1 < array.length && array[index + 1] == '?') { if (index + 1 < array.length && array[index + 1] == '?') {
if (index + 2 < array.length && array[index + 2] == ':') { index += 2;
index += 2; if (index >= array.length) {
throw new RuntimeException("Short pattern @" + index + ": "
+ regex);
}
c = array[index];
switch (c) {
case ':':
capturing = false; capturing = false;
} else { break;
case '=': {
capturing = false;
Lookahead lookahead = new Lookahead();
current.push(lookahead);
groups.push(lookahead.group);
continue;
}
default:
throw new UnsupportedOperationException("Not yet supported: " throw new UnsupportedOperationException("Not yet supported: "
+ regex.substring(index)); + regex.substring(index));
} }

View File

@ -27,19 +27,21 @@ class PikeVM implements PikeVMOpcodes {
*/ */
private final int findPrefixLength; private final int findPrefixLength;
private final CharacterMatcher[] classes; private final CharacterMatcher[] classes;
private final PikeVM[] lookaheads;
public interface Result { public interface Result {
void set(int[] start, int[] end); void set(int[] start, int[] end);
} }
protected PikeVM(int[] program, int findPrefixLength, int groupCount, protected PikeVM(int[] program, int findPrefixLength, int groupCount,
CharacterMatcher[] classes) CharacterMatcher[] classes, PikeVM[] lookaheads)
{ {
this.program = program; this.program = program;
this.findPrefixLength = findPrefixLength; this.findPrefixLength = findPrefixLength;
this.groupCount = groupCount; this.groupCount = groupCount;
offsetsCount = 2 * groupCount + 2; offsetsCount = 2 * groupCount + 2;
this.classes = classes; this.classes = classes;
this.lookaheads = lookaheads;
} }
/** /**
@ -313,7 +315,12 @@ class PikeVM implements PikeVMOpcodes {
if (anchorEnd && i < end) { if (anchorEnd && i < end) {
continue; continue;
} }
if (result == null) {
// only interested in a match, no need to go on
return true;
}
current.setResult(result); current.setResult(result);
// now that we found a match, even higher-priority matches must match // now that we found a match, even higher-priority matches must match
// at the same start offset // at the same start offset
if (!anchorStart) { if (!anchorStart) {
@ -338,10 +345,18 @@ class PikeVM implements PikeVMOpcodes {
current.queueNext(pc, pc + 2, next); current.queueNext(pc, pc + 2, next);
} }
break; break;
case LOOKAHEAD:
if (lookaheads[program[pc + 1]].matches(characters,
i, characters.length, true, false, null)) {
current.queueImmediately(pc, pc + 2, false);
}
break;
/* immediate opcodes, i.e. thread continues within the same step */ /* immediate opcodes, i.e. thread continues within the same step */
case SAVE_OFFSET: case SAVE_OFFSET:
int index = program[pc + 1]; if (result != null) {
current.saveOffset(pc, index, i); int index = program[pc + 1];
current.saveOffset(pc, index, i);
}
current.queueImmediately(pc, pc + 2, false); current.queueImmediately(pc, pc + 2, false);
break; break;
case SPLIT: case SPLIT:

View File

@ -24,6 +24,8 @@ interface PikeVMOpcodes {
final static int CHARACTER_CLASS = -20; final static int CHARACTER_CLASS = -20;
final static int LOOKAHEAD = -30;
final static int SAVE_OFFSET = -40; final static int SAVE_OFFSET = -40;
final static int SPLIT = -50; final static int SPLIT = -50;