mirror of
https://github.com/corda/corda.git
synced 2025-01-21 03:55:00 +00:00
Regex: implement find()
Now that we have non-greedy repeats, we can implement the find() (which essentially prefixes the regular expression pattern with '.*?'. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
This commit is contained in:
parent
7da03b0f19
commit
ca428c406c
@ -32,6 +32,18 @@ public class Regex {
|
||||
}
|
||||
}
|
||||
|
||||
private static void expectFind(String regex, String string,
|
||||
String... matches)
|
||||
{
|
||||
Matcher matcher = getMatcher(regex, string);
|
||||
int i = 0;
|
||||
while (i < matches.length) {
|
||||
expect(matcher.find());
|
||||
expect(matches[i++].equals(matcher.group()));
|
||||
}
|
||||
expect(!matcher.find());
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
expectMatch("a(bb)?a", "abba");
|
||||
expectNoMatch("a(bb)?a", "abbba");
|
||||
@ -41,5 +53,6 @@ public class Regex {
|
||||
expectNoMatch(".", "\n");
|
||||
expectGroups("a(bb)*a", "abbbba", "bb");
|
||||
expectGroups("a(bb)?(bb)+a", "abba", null, "bb");
|
||||
expectFind(" +", "Hello , world! ", " ", " ", " ");
|
||||
}
|
||||
}
|
||||
|
@ -26,6 +26,7 @@ class Compiler implements PikeVMOpcodes {
|
||||
private int[] program;
|
||||
private int offset;
|
||||
private int groupCount = -1;
|
||||
private int findPreambleSize;
|
||||
|
||||
public Output(Expression expr) {
|
||||
// try-run to determine the code size
|
||||
@ -54,9 +55,14 @@ class Compiler implements PikeVMOpcodes {
|
||||
}
|
||||
}
|
||||
|
||||
public PikeVM toVM() {
|
||||
return new PikeVM(program, groupCount);
|
||||
public void markFindPreambleEnd() {
|
||||
findPreambleSize = offset;
|
||||
}
|
||||
|
||||
public PikeVM toVM() {
|
||||
return new PikeVM(program, findPreambleSize, groupCount);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private abstract class Expression {
|
||||
@ -148,6 +154,14 @@ class Compiler implements PikeVMOpcodes {
|
||||
}
|
||||
|
||||
public void writeCode(Output output) {
|
||||
// find() preamble
|
||||
int start = output.offset;
|
||||
output.add(SPLIT_JMP);
|
||||
output.add(start + 5);
|
||||
output.add(DOTALL);
|
||||
output.add(SPLIT);
|
||||
output.add(start + 2);
|
||||
output.markFindPreambleEnd();
|
||||
group.writeCode(output);
|
||||
}
|
||||
}
|
||||
|
@ -19,13 +19,21 @@ class PikeVM implements PikeVMOpcodes {
|
||||
private final int[] program;
|
||||
private final int groupCount;
|
||||
private final int offsetsCount;
|
||||
/*
|
||||
* For find(), we do not want to anchor the match at the start offset. Our
|
||||
* compiler allows this by prefixing the code with an implicit '(?:.*?)'. For
|
||||
* regular matches() calls, we want to skip that code and start at {@code
|
||||
* findPrefixLength} instead.
|
||||
*/
|
||||
private final int findPrefixLength;
|
||||
|
||||
public interface Result {
|
||||
void set(int[] start, int[] end);
|
||||
}
|
||||
|
||||
protected PikeVM(int[] program, int groupCount) {
|
||||
protected PikeVM(int[] program, int findPrefixLength, int groupCount) {
|
||||
this.program = program;
|
||||
this.findPrefixLength = findPrefixLength;
|
||||
this.groupCount = groupCount;
|
||||
offsetsCount = 2 * groupCount + 2;
|
||||
}
|
||||
@ -190,6 +198,31 @@ class PikeVM implements PikeVMOpcodes {
|
||||
result.set(groupStart, groupEnd);
|
||||
}
|
||||
|
||||
private void mustStartMatchAt(int start) {
|
||||
int previous = -1;
|
||||
for (int pc = head; pc >= 0; ) {
|
||||
int nextPC = next[pc] - 1;
|
||||
if (start + 1 == offsets[pc][0]) {
|
||||
previous = pc;
|
||||
} else {
|
||||
next[pc] = 0;
|
||||
offsets[pc] = null;
|
||||
if (pc == tail) {
|
||||
head = tail = -1;
|
||||
} else if (previous < 0) {
|
||||
head = nextPC;
|
||||
} else {
|
||||
next[previous] = 1 + nextPC;
|
||||
}
|
||||
}
|
||||
pc = nextPC;
|
||||
}
|
||||
}
|
||||
|
||||
private int startOffset(int pc) {
|
||||
return offsets[pc][0] - 1;
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return head < 0;
|
||||
}
|
||||
@ -250,11 +283,8 @@ class PikeVM implements PikeVMOpcodes {
|
||||
ThreadQueue next = new ThreadQueue();
|
||||
|
||||
// initialize the first thread
|
||||
ThreadQueue queued = new ThreadQueue(0);
|
||||
if (!anchorStart) {
|
||||
// this requires non-greedy matching
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
int startPC = anchorStart ? findPrefixLength : 0;
|
||||
ThreadQueue queued = new ThreadQueue(startPC);
|
||||
|
||||
boolean foundMatch = false;
|
||||
for (int i = start; i <= end; ++i) {
|
||||
@ -280,6 +310,11 @@ class PikeVM implements PikeVMOpcodes {
|
||||
continue;
|
||||
}
|
||||
current.setResult(result);
|
||||
// now that we found a match, even higher-priority matches must match
|
||||
// at the same start offset
|
||||
if (!anchorStart) {
|
||||
next.mustStartMatchAt(current.startOffset(pc));
|
||||
}
|
||||
foundMatch = true;
|
||||
break;
|
||||
}
|
||||
@ -346,9 +381,9 @@ class PikeVM implements PikeVMOpcodes {
|
||||
* non-trivial pattern
|
||||
*/
|
||||
public String isPlainString() {
|
||||
// we expect the machine to start with SAVE_OFFSET 0 and
|
||||
// we expect the machine to start with the find preamble and SAVE_OFFSET 0
|
||||
// end with SAVE_OFFSET 1
|
||||
int start = 0;
|
||||
int start = findPrefixLength;
|
||||
if (start + 1 < program.length &&
|
||||
program[start] == SAVE_OFFSET && program[start + 1] == 0) {
|
||||
start += 2;
|
||||
|
@ -49,8 +49,12 @@ public class RegexMatcher extends Matcher {
|
||||
return vm.matches(array, 0, array.length, true, true, adapter);
|
||||
}
|
||||
|
||||
public boolean find() {
|
||||
return find(end + (start == end ? 1 : 0));
|
||||
}
|
||||
|
||||
public boolean find(int offset) {
|
||||
throw new UnsupportedOperationException("TODO");
|
||||
return vm.matches(array, offset, array.length, false, false, adapter);
|
||||
}
|
||||
|
||||
public int start(int group) {
|
||||
|
Loading…
Reference in New Issue
Block a user