Regex: add support for character classes

Now we support character classes in regular expression patterns, à la '[0-9]'.

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
This commit is contained in:
Johannes Schindelin 2013-11-09 15:43:26 -06:00
parent ca428c406c
commit 53563c4f8e
4 changed files with 51 additions and 2 deletions

View File

@ -54,5 +54,7 @@ public class Regex {
expectGroups("a(bb)*a", "abbbba", "bb"); expectGroups("a(bb)*a", "abbbba", "bb");
expectGroups("a(bb)?(bb)+a", "abba", null, "bb"); expectGroups("a(bb)?(bb)+a", "abba", null, "bb");
expectFind(" +", "Hello , world! ", " ", " ", " "); expectFind(" +", "Hello , world! ", " ", " ", " ");
expectMatch("[0-9A-Fa-f]+", "08ef");
expectNoMatch("[0-9A-Fa-f]+", "08@ef");
} }
} }

View File

@ -27,6 +27,7 @@ class Compiler implements PikeVMOpcodes {
private int offset; private int offset;
private int groupCount = -1; private int groupCount = -1;
private int findPreambleSize; private int findPreambleSize;
private ArrayList<CharacterMatcher> classes;
public Output(Expression expr) { public Output(Expression expr) {
// try-run to determine the code size // try-run to determine the code size
@ -34,6 +35,7 @@ class Compiler implements PikeVMOpcodes {
program = new int[offset]; program = new int[offset];
offset = 0; offset = 0;
groupCount = -1; groupCount = -1;
classes = new ArrayList<CharacterMatcher>();
// write it out! // write it out!
expr.writeCode(this); expr.writeCode(this);
} }
@ -60,15 +62,38 @@ class Compiler implements PikeVMOpcodes {
} }
public PikeVM toVM() { public PikeVM toVM() {
return new PikeVM(program, findPreambleSize, groupCount); CharacterMatcher[] classes = new CharacterMatcher[this.classes.size()];
this.classes.toArray(classes);
return new PikeVM(program, findPreambleSize, groupCount, classes);
} }
/**
 * Registers a character class with this output and reports the slot it
 * was stored in.
 *
 * While {@code program} is still {@code null} nothing is recorded and
 * {@code -1} is returned. (The constructor's "try-run to determine the
 * code size" appears to be the phase in which that is the case.)
 *
 * @param characterClass the matcher to register
 * @return the index of the matcher in {@code classes}, or {@code -1}
 *         if {@code program} has not been allocated yet
 */
public int addClass(CharacterMatcher characterClass) {
  if (program != null) {
    // The new entry lands at the end of the list, so its index is the
    // list size after the append, minus one.
    classes.add(characterClass);
    return classes.size() - 1;
  }
  return -1;
}
} }
private abstract class Expression { private abstract class Expression {
protected abstract void writeCode(Output output); protected abstract void writeCode(Output output);
} }
/**
 * Expression node for a character class such as {@code [0-9]}.
 *
 * When written out it emits two program words: the
 * {@code CHARACTER_CLASS} opcode, followed by the value returned by
 * {@link Output#addClass(CharacterMatcher)} for the wrapped matcher.
 */
private class CharacterRange extends Expression {
  /** The matcher that decides whether a character belongs to the class. */
  private final CharacterMatcher characterClass;

  /**
   * @param characterClass the matcher this node should emit
   */
  public CharacterRange(CharacterMatcher characterClass) {
    this.characterClass = characterClass;
  }

  /**
   * Emits the opcode and then its operand.
   *
   * @param output the program being assembled
   */
  protected void writeCode(Output output) {
    output.add(CHARACTER_CLASS);
    // Register the matcher only after the opcode has been emitted, so
    // the sequence of calls on the output stays opcode, addClass, operand.
    final int classIndex = output.addClass(characterClass);
    output.add(classIndex);
  }
}
private class Repeat extends Expression { private class Repeat extends Expression {
private Expression expr; private Expression expr;
private int minCount, maxCount; private int minCount, maxCount;
@ -177,6 +202,8 @@ class Compiler implements PikeVMOpcodes {
public Pattern compile(String regex) { public Pattern compile(String regex) {
char[] array = regex.toCharArray(); char[] array = regex.toCharArray();
CharacterMatcher.Parser characterClassParser =
new CharacterMatcher.Parser(array);
for (int index = 0; index < array.length; ++ index) { for (int index = 0; index < array.length; ++ index) {
char c = array[index]; char c = array[index];
Group current = groups.peek(); Group current = groups.peek();
@ -214,6 +241,15 @@ class Compiler implements PikeVMOpcodes {
} }
groups.pop(); groups.pop();
continue; continue;
case '[': {
CharacterMatcher matcher = characterClassParser.parseClass(index);
if (matcher == null) {
throw new RuntimeException("Invalid range @" + index + ": " + regex);
}
current.push(new CharacterRange(matcher));
index = characterClassParser.getEndOffset() - 1;
continue;
}
default: default:
throw new RuntimeException("Parse error @" + index + ": " + regex); throw new RuntimeException("Parse error @" + index + ": " + regex);
} }

View File

@ -26,16 +26,20 @@ class PikeVM implements PikeVMOpcodes {
* findPrefixLength} instead. * findPrefixLength} instead.
*/ */
private final int findPrefixLength; private final int findPrefixLength;
private final CharacterMatcher[] classes;
public interface Result { public interface Result {
void set(int[] start, int[] end); void set(int[] start, int[] end);
} }
protected PikeVM(int[] program, int findPrefixLength, int groupCount) { protected PikeVM(int[] program, int findPrefixLength, int groupCount,
CharacterMatcher[] classes)
{
this.program = program; this.program = program;
this.findPrefixLength = findPrefixLength; this.findPrefixLength = findPrefixLength;
this.groupCount = groupCount; this.groupCount = groupCount;
offsetsCount = 2 * groupCount + 2; offsetsCount = 2 * groupCount + 2;
this.classes = classes;
} }
/** /**
@ -329,6 +333,11 @@ class PikeVM implements PikeVMOpcodes {
case DOTALL: case DOTALL:
current.queueNext(pc, pc + 1, next); current.queueNext(pc, pc + 1, next);
break; break;
case CHARACTER_CLASS:
if (classes[program[pc + 1]].matches(c)) {
current.queueNext(pc, pc + 2, next);
}
break;
/* immediate opcodes, i.e. thread continues within the same step */ /* immediate opcodes, i.e. thread continues within the same step */
case SAVE_OFFSET: case SAVE_OFFSET:
int index = program[pc + 1]; int index = program[pc + 1];

View File

@ -22,6 +22,8 @@ interface PikeVMOpcodes {
final static int DOT = -1; final static int DOT = -1;
final static int DOTALL = -2; final static int DOTALL = -2;
final static int CHARACTER_CLASS = -20;
final static int SAVE_OFFSET = -40; final static int SAVE_OFFSET = -40;
final static int SPLIT = -50; final static int SPLIT = -50;