From 53563c4f8ebb4f82b12e5bdaac978c5f7b671bfb Mon Sep 17 00:00:00 2001
From: Johannes Schindelin
Date: Sat, 9 Nov 2013 15:43:26 -0600
Subject: [PATCH] Regex: add support for character classes

Now we support regular expression patterns a la '[0-9]'.

Signed-off-by: Johannes Schindelin
---
Notes (reviewer commentary; text between the three-dash line and the
diffstat is not part of the commit message):

  * A character class compiles to two program words: the CHARACTER_CLASS
    opcode followed by the index returned by Output.addClass().  PikeVM
    looks that index up in its CharacterMatcher[] and, on a match, queues
    pc + 2.
  * Output.addClass() returns -1 while `program` is still null.  Going by
    the constructor comments this is the size-only try-run that precedes
    `program = new int[offset]`, so matchers are only collected on the
    "write it out" pass.
  * `classes` is declared as ArrayList<CharacterMatcher>; the copy this was
    restored from showed a raw ArrayList.
  * NOTE(review): this patch was restored from a copy whose line breaks and
    leading whitespace were lost.  Hunk line counts agree with the hunk
    headers, but indentation (and any runs of spaces inside string
    literals) is a best guess; refresh the context against the target tree
    before applying.  The author e-mail address is also missing from the
    From:/Signed-off-by: lines and should be restored.

 test/Regex.java               |  2 ++
 test/regex/Compiler.java      | 38 ++++++++++++++++++++++++++++++++++-
 test/regex/PikeVM.java        | 11 +++++++++-
 test/regex/PikeVMOpcodes.java |  2 ++
 4 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/test/Regex.java b/test/Regex.java
index db2cf86892..1865163840 100644
--- a/test/Regex.java
+++ b/test/Regex.java
@@ -54,5 +54,7 @@ public class Regex {
     expectGroups("a(bb)*a", "abbbba", "bb");
     expectGroups("a(bb)?(bb)+a", "abba", null, "bb");
     expectFind(" +", "Hello , world! ", " ", " ", " ");
+    expectMatch("[0-9A-Fa-f]+", "08ef");
+    expectNoMatch("[0-9A-Fa-f]+", "08@ef");
   }
 }
diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java
index 2b6b83d23f..f3c5640225 100644
--- a/test/regex/Compiler.java
+++ b/test/regex/Compiler.java
@@ -27,6 +27,7 @@ class Compiler implements PikeVMOpcodes {
     private int offset;
     private int groupCount = -1;
     private int findPreambleSize;
+    private ArrayList<CharacterMatcher> classes;
 
     public Output(Expression expr) {
       // try-run to determine the code size
@@ -34,6 +35,7 @@ class Compiler implements PikeVMOpcodes {
       program = new int[offset];
       offset = 0;
       groupCount = -1;
+      classes = new ArrayList<CharacterMatcher>();
       // write it out!
       expr.writeCode(this);
     }
@@ -60,15 +62,38 @@ class Compiler implements PikeVMOpcodes {
     }
 
     public PikeVM toVM() {
-      return new PikeVM(program, findPreambleSize, groupCount);
+      CharacterMatcher[] classes = new CharacterMatcher[this.classes.size()];
+      this.classes.toArray(classes);
+      return new PikeVM(program, findPreambleSize, groupCount, classes);
     }
 
+    public int addClass(CharacterMatcher characterClass) {
+      if (program == null) {
+        return -1;
+      }
+      int result = classes.size();
+      classes.add(characterClass);
+      return result;
+    }
   }
 
   private abstract class Expression {
     protected abstract void writeCode(Output output);
   }
 
+  private class CharacterRange extends Expression {
+    private final CharacterMatcher characterClass;
+
+    public CharacterRange(CharacterMatcher characterClass) {
+      this.characterClass = characterClass;
+    }
+
+    protected void writeCode(Output output) {
+      output.add(CHARACTER_CLASS);
+      output.add(output.addClass(characterClass));
+    }
+  }
+
   private class Repeat extends Expression {
     private Expression expr;
     private int minCount, maxCount;
@@ -177,6 +202,8 @@ class Compiler implements PikeVMOpcodes {
 
   public Pattern compile(String regex) {
     char[] array = regex.toCharArray();
+    CharacterMatcher.Parser characterClassParser =
+      new CharacterMatcher.Parser(array);
     for (int index = 0; index < array.length; ++ index) {
       char c = array[index];
       Group current = groups.peek();
@@ -214,6 +241,15 @@ class Compiler implements PikeVMOpcodes {
         }
         groups.pop();
         continue;
+      case '[': {
+        CharacterMatcher matcher = characterClassParser.parseClass(index);
+        if (matcher == null) {
+          throw new RuntimeException("Invalid range @" + index + ": " + regex);
+        }
+        current.push(new CharacterRange(matcher));
+        index = characterClassParser.getEndOffset() - 1;
+        continue;
+      }
       default:
         throw new RuntimeException("Parse error @" + index + ": " + regex);
       }
diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java
index 7b3e55467d..fa7ff4ccb8 100644
--- a/test/regex/PikeVM.java
+++ b/test/regex/PikeVM.java
@@ -26,16 +26,20 @@ class PikeVM implements PikeVMOpcodes {
    * findPrefixLength} instead.
    */
   private final int findPrefixLength;
+  private final CharacterMatcher[] classes;
 
   public interface Result {
     void set(int[] start, int[] end);
   }
 
-  protected PikeVM(int[] program, int findPrefixLength, int groupCount) {
+  protected PikeVM(int[] program, int findPrefixLength, int groupCount,
+      CharacterMatcher[] classes)
+  {
     this.program = program;
     this.findPrefixLength = findPrefixLength;
     this.groupCount = groupCount;
     offsetsCount = 2 * groupCount + 2;
+    this.classes = classes;
   }
 
   /**
@@ -329,6 +333,11 @@ class PikeVM implements PikeVMOpcodes {
         case DOTALL:
           current.queueNext(pc, pc + 1, next);
           break;
+        case CHARACTER_CLASS:
+          if (classes[program[pc + 1]].matches(c)) {
+            current.queueNext(pc, pc + 2, next);
+          }
+          break;
         /* immediate opcodes, i.e. thread continues within the same step */
         case SAVE_OFFSET:
           int index = program[pc + 1];
diff --git a/test/regex/PikeVMOpcodes.java b/test/regex/PikeVMOpcodes.java
index e281aa25ab..0fa5619f62 100644
--- a/test/regex/PikeVMOpcodes.java
+++ b/test/regex/PikeVMOpcodes.java
@@ -22,6 +22,8 @@ interface PikeVMOpcodes {
   final static int DOT = -1;
   final static int DOTALL = -2;
 
+  final static int CHARACTER_CLASS = -20;
+
   final static int SAVE_OFFSET = -40;
 
   final static int SPLIT = -50;