2013-11-09 15:18:11 -06:00
|
|
|
/* Copyright (c) 2008-2013, Avian Contributors
|
|
|
|
|
|
|
|
Permission to use, copy, modify, and/or distribute this software
|
|
|
|
for any purpose with or without fee is hereby granted, provided
|
|
|
|
that the above copyright notice and this permission notice appear
|
|
|
|
in all copies.
|
|
|
|
|
|
|
|
There is NO WARRANTY for this software. See license.txt for
|
|
|
|
details. */
|
|
|
|
|
|
|
|
package regex;
|
|
|
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.Stack;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Compiles regular expressions into {@link PikeVM}s.
|
|
|
|
*
|
|
|
|
* @author Johannes Schindelin
|
|
|
|
*/
|
|
|
|
class Compiler implements PikeVMOpcodes {
|
|
|
|
private final static CharacterMatcher regularCharacter =
|
|
|
|
CharacterMatcher.parse("[^\\\\.*+?|\\[\\]{}()^$]");
|
|
|
|
|
|
|
|
private static class Output {
|
|
|
|
private int[] program;
|
|
|
|
private int offset;
|
|
|
|
private int groupCount = -1;
|
2013-11-13 17:54:47 -06:00
|
|
|
private int findPreambleSize;
|
2013-11-09 15:43:26 -06:00
|
|
|
private ArrayList<CharacterMatcher> classes;
|
2013-11-14 11:10:18 -06:00
|
|
|
private ArrayList<PikeVM> lookarounds;
|
2013-11-09 15:18:11 -06:00
|
|
|
|
|
|
|
public Output(Expression expr) {
|
|
|
|
// try-run to determine the code size
|
|
|
|
expr.writeCode(this);
|
|
|
|
program = new int[offset];
|
|
|
|
offset = 0;
|
|
|
|
groupCount = -1;
|
2013-11-09 15:43:26 -06:00
|
|
|
classes = new ArrayList<CharacterMatcher>();
|
2013-11-14 11:10:18 -06:00
|
|
|
lookarounds = new ArrayList<PikeVM>();
|
2013-11-09 15:18:11 -06:00
|
|
|
// write it out!
|
|
|
|
expr.writeCode(this);
|
|
|
|
}
|
|
|
|
|
|
|
|
public void add(int opcode) {
|
|
|
|
if (program != null) {
|
|
|
|
program[offset] = opcode;
|
|
|
|
}
|
|
|
|
offset++;
|
|
|
|
}
|
|
|
|
|
|
|
|
public int markJump() {
|
|
|
|
return offset++;
|
|
|
|
}
|
|
|
|
|
|
|
|
public void setJump(int mark) {
|
|
|
|
if (program != null) {
|
|
|
|
program[mark] = offset;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-11-13 17:54:47 -06:00
|
|
|
public void markFindPreambleEnd() {
|
|
|
|
findPreambleSize = offset;
|
|
|
|
}
|
|
|
|
|
2013-11-09 15:18:11 -06:00
|
|
|
public PikeVM toVM() {
|
2013-11-09 15:43:26 -06:00
|
|
|
CharacterMatcher[] classes = new CharacterMatcher[this.classes.size()];
|
|
|
|
this.classes.toArray(classes);
|
2013-11-14 11:10:18 -06:00
|
|
|
PikeVM[] lookarounds = new PikeVM[this.lookarounds.size()];
|
|
|
|
this.lookarounds.toArray(lookarounds);
|
2013-11-12 09:33:45 -06:00
|
|
|
return new PikeVM(program, findPreambleSize, groupCount, classes,
|
2013-11-14 11:10:18 -06:00
|
|
|
lookarounds);
|
2013-11-09 15:18:11 -06:00
|
|
|
}
|
2013-11-13 17:54:47 -06:00
|
|
|
|
2013-11-09 15:43:26 -06:00
|
|
|
public int addClass(CharacterMatcher characterClass) {
|
|
|
|
if (program == null) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
int result = classes.size();
|
|
|
|
classes.add(characterClass);
|
|
|
|
return result;
|
|
|
|
}
|
2013-11-12 09:33:45 -06:00
|
|
|
|
2013-11-14 11:10:18 -06:00
|
|
|
public int addLookaround(PikeVM lookaround) {
|
2013-11-12 09:33:45 -06:00
|
|
|
if (program == null) {
|
|
|
|
return -1;
|
|
|
|
}
|
2013-11-14 11:10:18 -06:00
|
|
|
int result = lookarounds.size();
|
|
|
|
lookarounds.add(lookaround);
|
2013-11-12 09:33:45 -06:00
|
|
|
return result;
|
|
|
|
}
|
2013-11-09 15:18:11 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
private abstract class Expression {
|
|
|
|
protected abstract void writeCode(Output output);
|
|
|
|
}
|
|
|
|
|
2013-11-09 15:43:26 -06:00
|
|
|
private class CharacterRange extends Expression {
|
|
|
|
private final CharacterMatcher characterClass;
|
|
|
|
|
|
|
|
public CharacterRange(CharacterMatcher characterClass) {
|
|
|
|
this.characterClass = characterClass;
|
|
|
|
}
|
|
|
|
|
|
|
|
protected void writeCode(Output output) {
|
|
|
|
output.add(CHARACTER_CLASS);
|
|
|
|
output.add(output.addClass(characterClass));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-11-10 10:02:18 -06:00
|
|
|
private class Repeat extends Expression {
|
2013-11-09 15:18:11 -06:00
|
|
|
private Expression expr;
|
2013-11-10 10:02:18 -06:00
|
|
|
private int minCount, maxCount;
|
2013-11-10 10:23:01 -06:00
|
|
|
private boolean greedy;
|
2013-11-09 15:18:11 -06:00
|
|
|
|
2013-11-10 10:23:01 -06:00
|
|
|
public Repeat(Expression expr, int minCount, int maxCount, boolean greedy) {
|
2013-11-10 10:02:18 -06:00
|
|
|
if (minCount != 0 && minCount != 1) {
|
|
|
|
throw new RuntimeException("Unexpected min count: " + minCount);
|
|
|
|
}
|
|
|
|
if (maxCount != 1 && maxCount != -1) {
|
|
|
|
throw new RuntimeException("Unexpected max count: " + maxCount);
|
|
|
|
}
|
2013-11-09 15:18:11 -06:00
|
|
|
this.expr = expr;
|
2013-11-10 10:02:18 -06:00
|
|
|
this.minCount = minCount;
|
|
|
|
this.maxCount = maxCount;
|
2013-11-10 10:23:01 -06:00
|
|
|
this.greedy = greedy;
|
2013-11-09 15:18:11 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
protected void writeCode(Output output) {
|
2013-11-10 10:02:18 -06:00
|
|
|
int start = output.offset;
|
2013-11-10 10:23:01 -06:00
|
|
|
int splitJmp = greedy ? SPLIT_JMP : SPLIT;
|
|
|
|
int split = greedy ? SPLIT : SPLIT_JMP;
|
2013-11-10 10:02:18 -06:00
|
|
|
if (minCount == 1 && maxCount == -1) {
|
|
|
|
expr.writeCode(output);
|
2013-11-10 10:23:01 -06:00
|
|
|
output.add(splitJmp);
|
2013-11-10 10:02:18 -06:00
|
|
|
output.add(start);
|
|
|
|
} else if (minCount == 0 && maxCount == -1) {
|
2013-11-10 10:23:01 -06:00
|
|
|
output.add(split);
|
2013-11-10 10:02:18 -06:00
|
|
|
int jump = output.markJump();
|
|
|
|
expr.writeCode(output);
|
2013-11-10 10:23:01 -06:00
|
|
|
output.add(splitJmp);
|
2013-11-10 10:02:18 -06:00
|
|
|
output.add(start + 2);
|
|
|
|
output.setJump(jump);
|
|
|
|
} else if (minCount == 0 && maxCount == 1) {
|
2013-11-10 10:23:01 -06:00
|
|
|
output.add(split);
|
2013-11-10 10:02:18 -06:00
|
|
|
int jump = output.markJump();
|
|
|
|
expr.writeCode(output);
|
|
|
|
output.setJump(jump);
|
|
|
|
} else {
|
|
|
|
throw new RuntimeException("Unexpected range: "
|
|
|
|
+ minCount + ", " + maxCount);
|
|
|
|
}
|
2013-11-09 15:18:11 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private class Group extends Expression {
|
2013-11-12 11:34:30 -06:00
|
|
|
private final boolean capturing;
|
|
|
|
|
2013-11-09 15:18:11 -06:00
|
|
|
private ArrayList<Expression> list = new ArrayList<Expression>();
|
2013-11-11 23:09:25 -06:00
|
|
|
private ArrayList<Group> alternatives;
|
2013-11-09 15:18:11 -06:00
|
|
|
|
2013-11-11 23:09:25 -06:00
|
|
|
public Group(boolean capturing, ArrayList<Expression> initialList) {
|
2013-11-12 11:34:30 -06:00
|
|
|
this.capturing = capturing;
|
2013-11-11 23:09:25 -06:00
|
|
|
if (initialList != null) {
|
|
|
|
list.addAll(initialList);
|
|
|
|
}
|
2013-11-12 11:34:30 -06:00
|
|
|
}
|
|
|
|
|
2013-11-09 15:18:11 -06:00
|
|
|
public void push(Expression expr) {
|
|
|
|
list.add(expr);
|
|
|
|
}
|
|
|
|
|
|
|
|
public void push(final int c) {
|
|
|
|
push(new Expression() {
|
|
|
|
public void writeCode(Output output) {
|
|
|
|
output.add(c);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
2013-11-11 23:09:25 -06:00
|
|
|
public void startAlternative() {
|
|
|
|
if (alternatives == null) {
|
|
|
|
alternatives = new ArrayList<Group>();
|
|
|
|
}
|
|
|
|
alternatives.add(new Group(false, list));
|
|
|
|
list.clear();
|
|
|
|
}
|
|
|
|
|
2013-11-09 15:18:11 -06:00
|
|
|
public Expression pop() {
|
|
|
|
Expression result = list.remove(list.size() - 1);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
protected void writeCode(Output output) {
|
2013-11-12 11:34:30 -06:00
|
|
|
int groupIndex = -1;
|
|
|
|
if (capturing) {
|
|
|
|
groupIndex = ++ output.groupCount;
|
|
|
|
output.add(SAVE_OFFSET);
|
|
|
|
output.add(2 * groupIndex);
|
|
|
|
}
|
2013-11-11 23:09:25 -06:00
|
|
|
int[] jumps = null;
|
|
|
|
if (alternatives != null) {
|
|
|
|
jumps = new int[alternatives.size()];
|
|
|
|
int i = 0;
|
|
|
|
for (Group alternative : alternatives) {
|
|
|
|
output.add(SPLIT);
|
|
|
|
int jump = output.markJump();
|
|
|
|
alternative.writeCode(output);
|
|
|
|
output.add(JMP);
|
|
|
|
jumps[i++] = output.markJump();
|
|
|
|
output.setJump(jump);
|
|
|
|
}
|
|
|
|
}
|
2013-11-09 15:18:11 -06:00
|
|
|
for (Expression expr : list) {
|
|
|
|
expr.writeCode(output);
|
|
|
|
}
|
2013-11-11 23:09:25 -06:00
|
|
|
if (jumps != null) {
|
|
|
|
for (int jump : jumps) {
|
|
|
|
output.setJump(jump);
|
|
|
|
}
|
|
|
|
}
|
2013-11-12 11:34:30 -06:00
|
|
|
if (capturing) {
|
|
|
|
output.add(SAVE_OFFSET);
|
|
|
|
output.add(2 * groupIndex + 1);
|
|
|
|
}
|
2013-11-09 15:18:11 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-11-14 11:10:18 -06:00
|
|
|
private class Lookaround extends Expression {
|
2013-11-12 09:33:45 -06:00
|
|
|
private final Group group = new Group(false, null);
|
2013-11-14 11:10:18 -06:00
|
|
|
private final boolean forward;
|
|
|
|
|
|
|
|
public Lookaround(boolean forward) {
|
|
|
|
this.forward = forward;
|
|
|
|
}
|
2013-11-12 09:33:45 -06:00
|
|
|
|
|
|
|
@Override
|
|
|
|
protected void writeCode(Output output) {
|
|
|
|
PikeVM vm = new Output(group).toVM();
|
2013-11-14 11:10:18 -06:00
|
|
|
if (!forward) {
|
|
|
|
vm.reverse();
|
|
|
|
}
|
|
|
|
output.add(forward ? LOOKAHEAD : LOOKBEHIND);
|
|
|
|
output.add(output.addLookaround(vm));
|
2013-11-12 09:33:45 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-11-09 15:18:11 -06:00
|
|
|
private class Group0 extends Expression {
|
|
|
|
private final Group group;
|
|
|
|
|
|
|
|
public Group0() {
|
2013-11-11 23:09:25 -06:00
|
|
|
group = new Group(true, null);
|
2013-11-09 15:18:11 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
public void writeCode(Output output) {
|
2013-11-13 17:54:47 -06:00
|
|
|
// find() preamble
|
|
|
|
int start = output.offset;
|
|
|
|
output.add(SPLIT_JMP);
|
|
|
|
output.add(start + 5);
|
|
|
|
output.add(DOTALL);
|
|
|
|
output.add(SPLIT);
|
|
|
|
output.add(start + 2);
|
|
|
|
output.markFindPreambleEnd();
|
2013-11-09 15:18:11 -06:00
|
|
|
group.writeCode(output);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private Group0 root;
|
|
|
|
private Stack<Group> groups;
|
|
|
|
|
|
|
|
/**
 * Creates a compiler whose stack of open groups holds just the group
 * wrapped by the root expression.
 */
public Compiler() {
  this.root = new Group0();
  this.groups = new Stack<Group>();
  this.groups.push(this.root.group);
}
|
|
|
|
|
|
|
|
public Pattern compile(String regex) {
|
|
|
|
char[] array = regex.toCharArray();
|
2013-11-09 15:43:26 -06:00
|
|
|
CharacterMatcher.Parser characterClassParser =
|
|
|
|
new CharacterMatcher.Parser(array);
|
2013-11-09 15:18:11 -06:00
|
|
|
for (int index = 0; index < array.length; ++ index) {
|
|
|
|
char c = array[index];
|
|
|
|
Group current = groups.peek();
|
|
|
|
if (regularCharacter.matches(c)) {
|
|
|
|
current.push(c);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
switch (c) {
|
2013-11-14 11:13:12 -06:00
|
|
|
case '.':
|
|
|
|
current.push(DOT);
|
|
|
|
continue;
|
2013-11-09 15:18:11 -06:00
|
|
|
case '?':
|
2013-11-10 10:02:18 -06:00
|
|
|
case '*':
|
2013-11-10 10:23:01 -06:00
|
|
|
case '+': {
|
|
|
|
boolean greedy = true;
|
|
|
|
if (index + 1 < array.length && array[index + 1] == '?') {
|
|
|
|
greedy = false;
|
|
|
|
++ index;
|
|
|
|
}
|
2013-11-10 10:02:18 -06:00
|
|
|
current.push(new Repeat(current.pop(),
|
2013-11-10 10:23:01 -06:00
|
|
|
c == '+' ? 1 : 0, c == '?' ? 1 : -1, greedy));
|
2013-11-10 10:02:18 -06:00
|
|
|
continue;
|
2013-11-10 10:23:01 -06:00
|
|
|
}
|
2013-11-12 11:34:30 -06:00
|
|
|
case '(': {
|
|
|
|
boolean capturing = true;
|
2013-11-09 15:18:11 -06:00
|
|
|
if (index + 1 < array.length && array[index + 1] == '?') {
|
2013-11-12 09:33:45 -06:00
|
|
|
index += 2;
|
|
|
|
if (index >= array.length) {
|
|
|
|
throw new RuntimeException("Short pattern @" + index + ": "
|
|
|
|
+ regex);
|
|
|
|
}
|
|
|
|
c = array[index];
|
2013-11-14 11:10:18 -06:00
|
|
|
boolean lookAhead = true;
|
|
|
|
if (c == '<') {
|
|
|
|
if (++ index >= array.length) {
|
|
|
|
throw new RuntimeException("Short pattern @" + index + ": "
|
|
|
|
+ regex);
|
|
|
|
}
|
|
|
|
lookAhead = false;
|
|
|
|
c = array[index];
|
|
|
|
if (c != '=' && c != '!') {
|
|
|
|
throw new IllegalArgumentException("Named groups not supported @"
|
|
|
|
+ index + ": " + regex);
|
|
|
|
}
|
|
|
|
}
|
2013-11-12 09:33:45 -06:00
|
|
|
switch (c) {
|
|
|
|
case ':':
|
|
|
|
capturing = false;
|
|
|
|
break;
|
|
|
|
case '=': {
|
2013-11-12 11:34:30 -06:00
|
|
|
capturing = false;
|
2013-11-14 11:10:18 -06:00
|
|
|
Lookaround lookaround = new Lookaround(lookAhead);
|
|
|
|
current.push(lookaround);
|
|
|
|
groups.push(lookaround.group);
|
2013-11-12 09:33:45 -06:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
default:
|
2013-11-12 11:34:30 -06:00
|
|
|
throw new UnsupportedOperationException("Not yet supported: "
|
|
|
|
+ regex.substring(index));
|
|
|
|
}
|
2013-11-09 15:18:11 -06:00
|
|
|
}
|
2013-11-11 23:09:25 -06:00
|
|
|
current.push(groups.push(new Group(capturing, null)));
|
2013-11-09 15:18:11 -06:00
|
|
|
continue;
|
2013-11-12 11:34:30 -06:00
|
|
|
}
|
2013-11-09 15:18:11 -06:00
|
|
|
case ')':
|
|
|
|
if (groups.size() < 2) {
|
|
|
|
throw new RuntimeException("Invalid group close @" + index + ": "
|
|
|
|
+ regex);
|
|
|
|
}
|
|
|
|
groups.pop();
|
|
|
|
continue;
|
2013-11-09 15:43:26 -06:00
|
|
|
case '[': {
|
|
|
|
CharacterMatcher matcher = characterClassParser.parseClass(index);
|
|
|
|
if (matcher == null) {
|
|
|
|
throw new RuntimeException("Invalid range @" + index + ": " + regex);
|
|
|
|
}
|
|
|
|
current.push(new CharacterRange(matcher));
|
|
|
|
index = characterClassParser.getEndOffset() - 1;
|
|
|
|
continue;
|
|
|
|
}
|
2013-11-11 23:09:25 -06:00
|
|
|
case '|':
|
|
|
|
current.startAlternative();
|
|
|
|
continue;
|
2013-11-09 15:18:11 -06:00
|
|
|
default:
|
|
|
|
throw new RuntimeException("Parse error @" + index + ": " + regex);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (groups.size() != 1) {
|
|
|
|
throw new IllegalArgumentException("Unclosed groups: ("
|
|
|
|
+ (groups.size() - 1) + "): " + regex);
|
|
|
|
}
|
2013-11-22 17:30:06 -06:00
|
|
|
PikeVM vm = new Output(root).toVM();
|
|
|
|
String plain = vm.isPlainString();
|
|
|
|
if (plain != null) {
|
|
|
|
return new TrivialPattern(regex, plain, 0);
|
|
|
|
}
|
|
|
|
return new RegexPattern(regex, 0, vm);
|
2013-11-09 15:18:11 -06:00
|
|
|
}
|
|
|
|
}
|