Regex: support look-behind patterns

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
This commit is contained in:
Johannes Schindelin 2013-11-14 11:10:18 -06:00
parent 62d1964779
commit 8b611c8075
4 changed files with 51 additions and 21 deletions

View File

@@ -60,5 +60,6 @@ public class Regex {
expectGroups("a|(b|c)", "a", (String)null); expectGroups("a|(b|c)", "a", (String)null);
expectGroups("a|(b|c)", "c", "c"); expectGroups("a|(b|c)", "c", "c");
expectGroups("(?=a)a", "a"); expectGroups("(?=a)a", "a");
expectGroups(".*(o)(?<=[A-Z][a-z]*)", "Hello", "o");
} }
} }

View File

@@ -28,7 +28,7 @@ class Compiler implements PikeVMOpcodes {
private int groupCount = -1; private int groupCount = -1;
private int findPreambleSize; private int findPreambleSize;
private ArrayList<CharacterMatcher> classes; private ArrayList<CharacterMatcher> classes;
private ArrayList<PikeVM> lookaheads; private ArrayList<PikeVM> lookarounds;
public Output(Expression expr) { public Output(Expression expr) {
// try-run to determine the code size // try-run to determine the code size
@@ -37,7 +37,7 @@ class Compiler implements PikeVMOpcodes {
offset = 0; offset = 0;
groupCount = -1; groupCount = -1;
classes = new ArrayList<CharacterMatcher>(); classes = new ArrayList<CharacterMatcher>();
lookaheads = new ArrayList<PikeVM>(); lookarounds = new ArrayList<PikeVM>();
// write it out! // write it out!
expr.writeCode(this); expr.writeCode(this);
} }
@@ -66,10 +66,10 @@ class Compiler implements PikeVMOpcodes {
public PikeVM toVM() { public PikeVM toVM() {
CharacterMatcher[] classes = new CharacterMatcher[this.classes.size()]; CharacterMatcher[] classes = new CharacterMatcher[this.classes.size()];
this.classes.toArray(classes); this.classes.toArray(classes);
PikeVM[] lookaheads = new PikeVM[this.lookaheads.size()]; PikeVM[] lookarounds = new PikeVM[this.lookarounds.size()];
this.lookaheads.toArray(lookaheads); this.lookarounds.toArray(lookarounds);
return new PikeVM(program, findPreambleSize, groupCount, classes, return new PikeVM(program, findPreambleSize, groupCount, classes,
lookaheads); lookarounds);
} }
public int addClass(CharacterMatcher characterClass) { public int addClass(CharacterMatcher characterClass) {
@@ -81,12 +81,12 @@ class Compiler implements PikeVMOpcodes {
return result; return result;
} }
public int addLookahead(PikeVM lookahead) { public int addLookaround(PikeVM lookaround) {
if (program == null) { if (program == null) {
return -1; return -1;
} }
int result = lookaheads.size(); int result = lookarounds.size();
lookaheads.add(lookahead); lookarounds.add(lookaround);
return result; return result;
} }
} }
@@ -226,14 +226,22 @@ class Compiler implements PikeVMOpcodes {
} }
} }
private class Lookahead extends Expression { private class Lookaround extends Expression {
private final Group group = new Group(false, null); private final Group group = new Group(false, null);
private final boolean forward;
public Lookaround(boolean forward) {
this.forward = forward;
}
@Override @Override
protected void writeCode(Output output) { protected void writeCode(Output output) {
PikeVM vm = new Output(group).toVM(); PikeVM vm = new Output(group).toVM();
output.add(LOOKAHEAD); if (!forward) {
output.add(output.addLookahead(vm)); vm.reverse();
}
output.add(forward ? LOOKAHEAD : LOOKBEHIND);
output.add(output.addLookaround(vm));
} }
} }
@@ -302,15 +310,28 @@ class Compiler implements PikeVMOpcodes {
+ regex); + regex);
} }
c = array[index]; c = array[index];
boolean lookAhead = true;
if (c == '<') {
if (++ index >= array.length) {
throw new RuntimeException("Short pattern @" + index + ": "
+ regex);
}
lookAhead = false;
c = array[index];
if (c != '=' && c != '!') {
throw new IllegalArgumentException("Named groups not supported @"
+ index + ": " + regex);
}
}
switch (c) { switch (c) {
case ':': case ':':
capturing = false; capturing = false;
break; break;
case '=': { case '=': {
capturing = false; capturing = false;
Lookahead lookahead = new Lookahead(); Lookaround lookaround = new Lookaround(lookAhead);
current.push(lookahead); current.push(lookaround);
groups.push(lookahead.group); groups.push(lookaround.group);
continue; continue;
} }
default: default:

View File

@@ -27,21 +27,21 @@ class PikeVM implements PikeVMOpcodes {
*/ */
private final int findPrefixLength; private final int findPrefixLength;
private final CharacterMatcher[] classes; private final CharacterMatcher[] classes;
private final PikeVM[] lookaheads; private final PikeVM[] lookarounds;
public interface Result { public interface Result {
void set(int[] start, int[] end); void set(int[] start, int[] end);
} }
protected PikeVM(int[] program, int findPrefixLength, int groupCount, protected PikeVM(int[] program, int findPrefixLength, int groupCount,
CharacterMatcher[] classes, PikeVM[] lookaheads) CharacterMatcher[] classes, PikeVM[] lookarounds)
{ {
this.program = program; this.program = program;
this.findPrefixLength = findPrefixLength; this.findPrefixLength = findPrefixLength;
this.groupCount = groupCount; this.groupCount = groupCount;
offsetsCount = 2 * groupCount + 2; offsetsCount = 2 * groupCount + 2;
this.classes = classes; this.classes = classes;
this.lookaheads = lookaheads; this.lookarounds = lookarounds;
} }
/** /**
@@ -293,13 +293,14 @@ class PikeVM implements PikeVMOpcodes {
ThreadQueue queued = new ThreadQueue(startPC); ThreadQueue queued = new ThreadQueue(startPC);
boolean foundMatch = false; boolean foundMatch = false;
for (int i = start; i <= end; ++i) { int step = end > start ? +1 : -1;
for (int i = start; i != end + step; i += step) {
if (queued.isEmpty()) { if (queued.isEmpty()) {
// no threads left // no threads left
return foundMatch; return foundMatch;
} }
char c = i < end ? characters[i] : 0; char c = i != end ? characters[i] : 0;
int pc = -1; int pc = -1;
for (;;) { for (;;) {
pc = current.next(pc); pc = current.next(pc);
@@ -312,7 +313,7 @@ class PikeVM implements PikeVMOpcodes {
// pc == program.length is a match! // pc == program.length is a match!
if (pc == program.length) { if (pc == program.length) {
if (anchorEnd && i < end) { if (anchorEnd && i != end) {
continue; continue;
} }
if (result == null) { if (result == null) {
@ -346,11 +347,17 @@ class PikeVM implements PikeVMOpcodes {
} }
break; break;
case LOOKAHEAD: case LOOKAHEAD:
if (lookaheads[program[pc + 1]].matches(characters, if (lookarounds[program[pc + 1]].matches(characters,
i, characters.length, true, false, null)) { i, characters.length, true, false, null)) {
current.queueImmediately(pc, pc + 2, false); current.queueImmediately(pc, pc + 2, false);
} }
break; break;
case LOOKBEHIND:
if (lookarounds[program[pc + 1]].matches(characters,
i - 1, -1, true, false, null)) {
current.queueImmediately(pc, pc + 2, false);
}
break;
/* immediate opcodes, i.e. thread continues within the same step */ /* immediate opcodes, i.e. thread continues within the same step */
case SAVE_OFFSET: case SAVE_OFFSET:
if (result != null) { if (result != null) {

View File

@@ -25,6 +25,7 @@ interface PikeVMOpcodes {
final static int CHARACTER_CLASS = -20; final static int CHARACTER_CLASS = -20;
final static int LOOKAHEAD = -30; final static int LOOKAHEAD = -30;
final static int LOOKBEHIND = -31;
final static int SAVE_OFFSET = -40; final static int SAVE_OFFSET = -40;