From 8b611c807530cc9dc4fccc09429b77579a348ac7 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Thu, 14 Nov 2013 11:10:18 -0600 Subject: [PATCH] Regex: support look-behind patterns Signed-off-by: Johannes Schindelin --- test/Regex.java | 1 + test/regex/Compiler.java | 49 +++++++++++++++++++++++++---------- test/regex/PikeVM.java | 21 ++++++++++----- test/regex/PikeVMOpcodes.java | 1 + 4 files changed, 51 insertions(+), 21 deletions(-) diff --git a/test/Regex.java b/test/Regex.java index 375d41a704..44157d369e 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -60,5 +60,6 @@ public class Regex { expectGroups("a|(b|c)", "a", (String)null); expectGroups("a|(b|c)", "c", "c"); expectGroups("(?=a)a", "a"); + expectGroups(".*(o)(?<=[A-Z][a-z]*)", "Hello", "o"); } } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 5922f104f7..6967e542ba 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -28,7 +28,7 @@ class Compiler implements PikeVMOpcodes { private int groupCount = -1; private int findPreambleSize; private ArrayList classes; - private ArrayList lookaheads; + private ArrayList lookarounds; public Output(Expression expr) { // try-run to determine the code size @@ -37,7 +37,7 @@ class Compiler implements PikeVMOpcodes { offset = 0; groupCount = -1; classes = new ArrayList(); - lookaheads = new ArrayList(); + lookarounds = new ArrayList(); // write it out! expr.writeCode(this); } @@ -66,10 +66,10 @@ class Compiler implements PikeVMOpcodes { public PikeVM toVM() { CharacterMatcher[] classes = new CharacterMatcher[this.classes.size()]; this.classes.toArray(classes); - PikeVM[] lookaheads = new PikeVM[this.lookaheads.size()]; - this.lookaheads.toArray(lookaheads); + PikeVM[] lookarounds = new PikeVM[this.lookarounds.size()]; + this.lookarounds.toArray(lookarounds); return new PikeVM(program, findPreambleSize, groupCount, classes, - lookaheads); + lookarounds); } public int addClass(CharacterMatcher characterClass) { @@ -81,12 +81,12 @@ class Compiler implements PikeVMOpcodes { return result; } - public int addLookahead(PikeVM lookahead) { + public int addLookaround(PikeVM lookaround) { if (program == null) { return -1; } - int result = lookaheads.size(); - lookaheads.add(lookahead); + int result = lookarounds.size(); + lookarounds.add(lookaround); return result; } } @@ -226,14 +226,22 @@ class Compiler implements PikeVMOpcodes { } } - private class Lookahead extends Expression { + private class Lookaround extends Expression { private final Group group = new Group(false, null); + private final boolean forward; + + public Lookaround(boolean forward) { + this.forward = forward; + } @Override protected void writeCode(Output output) { PikeVM vm = new Output(group).toVM(); - output.add(LOOKAHEAD); - output.add(output.addLookahead(vm)); + if (!forward) { + vm.reverse(); + } + output.add(forward ? LOOKAHEAD : LOOKBEHIND); + output.add(output.addLookaround(vm)); } } @@ -302,15 +310,28 @@ class Compiler implements PikeVMOpcodes { + regex); } c = array[index]; + boolean lookAhead = true; + if (c == '<') { + if (++ index >= array.length) { + throw new RuntimeException("Short pattern @" + index + ": " + + regex); + } + lookAhead = false; + c = array[index]; + if (c != '=' && c != '!') { + throw new IllegalArgumentException("Named groups not supported @" + + index + ": " + regex); + } + } switch (c) { case ':': capturing = false; break; case '=': { capturing = false; - Lookahead lookahead = new Lookahead(); - current.push(lookahead); - groups.push(lookahead.group); + Lookaround lookaround = new Lookaround(lookAhead); + current.push(lookaround); + groups.push(lookaround.group); continue; } default: diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java index 1221ee91c5..d0bd453d27 100644 --- a/test/regex/PikeVM.java +++ b/test/regex/PikeVM.java @@ -27,21 +27,21 @@ class PikeVM implements PikeVMOpcodes { */ private final int findPrefixLength; private final CharacterMatcher[] classes; - private final PikeVM[] lookaheads; + private final PikeVM[] lookarounds; public interface Result { void set(int[] start, int[] end); } protected PikeVM(int[] program, int findPrefixLength, int groupCount, - CharacterMatcher[] classes, PikeVM[] lookaheads) + CharacterMatcher[] classes, PikeVM[] lookarounds) { this.program = program; this.findPrefixLength = findPrefixLength; this.groupCount = groupCount; offsetsCount = 2 * groupCount + 2; this.classes = classes; - this.lookaheads = lookaheads; + this.lookarounds = lookarounds; } /** @@ -293,13 +293,14 @@ class PikeVM implements PikeVMOpcodes { ThreadQueue queued = new ThreadQueue(startPC); boolean foundMatch = false; - for (int i = start; i <= end; ++i) { + int step = end > start ? +1 : -1; + for (int i = start; i != end + step; i += step) { if (queued.isEmpty()) { // no threads left return foundMatch; } - char c = i < end ? characters[i] : 0; + char c = i != end ? characters[i] : 0; int pc = -1; for (;;) { pc = current.next(pc); @@ -312,7 +313,7 @@ class PikeVM implements PikeVMOpcodes { // pc == program.length is a match! if (pc == program.length) { - if (anchorEnd && i < end) { + if (anchorEnd && i != end) { continue; } if (result == null) { @@ -346,11 +347,17 @@ class PikeVM implements PikeVMOpcodes { } break; case LOOKAHEAD: - if (lookaheads[program[pc + 1]].matches(characters, + if (lookarounds[program[pc + 1]].matches(characters, i, characters.length, true, false, null)) { current.queueImmediately(pc, pc + 2, false); } break; + case LOOKBEHIND: + if (lookarounds[program[pc + 1]].matches(characters, + i - 1, -1, true, false, null)) { + current.queueImmediately(pc, pc + 2, false); + } + break; /* immediate opcodes, i.e. thread continues within the same step */ case SAVE_OFFSET: if (result != null) { diff --git a/test/regex/PikeVMOpcodes.java b/test/regex/PikeVMOpcodes.java index 3f86f34adf..acd67cc2fd 100644 --- a/test/regex/PikeVMOpcodes.java +++ b/test/regex/PikeVMOpcodes.java @@ -25,6 +25,7 @@ interface PikeVMOpcodes { final static int CHARACTER_CLASS = -20; final static int LOOKAHEAD = -30; + final static int LOOKBEHIND = -31; final static int SAVE_OFFSET = -40;