Regex: support special character classes

This adds support for predefined character classes such as \d or \W, both
on their own and inside bracket expressions, leaving \p{...}-style
character classes as an exercise for later.

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
This commit is contained in:
Johannes Schindelin 2013-11-12 09:32:20 -06:00
parent 098f688cd8
commit 8ab10a6953
3 changed files with 68 additions and 1 deletions

View File

@ -62,5 +62,12 @@ public class Regex {
expectGroups("(?=a)a", "a");
expectGroups(".*(o)(?<=[A-Z][a-z]*)", "Hello", "o");
expectNoMatch("(?!a).", "a");
expectMatch("[\\d]", "0");
expectMatch("\\0777", "?7");
expectMatch("\\a", "\007");
expectMatch("\\\\", "\\");
expectMatch("\\x4A", "J");
expectMatch("\\x61", "a");
expectMatch("\\078", "\0078");
}
}

View File

@ -41,6 +41,28 @@ class CharacterMatcher {
return (map.length > index && map[index]) ^ inversePattern;
}
/**
 * Translates the letter that follows a backslash into the bracket
 * expression spelling out the corresponding predefined character class.
 *
 * @param c the character found directly after the backslash
 * @return the equivalent bracket expression for d, D, s, S, w or W;
 *         {@code null} for any other character
 */
private static String specialClass(int c) {
  switch (c) {
  case 'd':
    return "[0-9]";
  case 'D':
    return "[^0-9]";
  case 's':
    return "[ \\t\\n\\x0B\\f\\r]";
  case 'S':
    return "[^ \\t\\n\\x0B\\f\\r]";
  case 'w':
    return "[a-zA-Z_0-9]";
  case 'W':
    return "[^a-zA-Z_0-9]";
  default:
    // not one of the predefined classes
    return null;
  }
}
private CharacterMatcher(boolean[] map, boolean inversePattern) {
this.map = map;
this.inversePattern = inversePattern;
@ -65,6 +87,17 @@ class CharacterMatcher {
map = java.util.Arrays.copyOf(map, size);
}
/**
 * Turns this matcher into the union of itself and {@code other}: after the
 * call, a character matches if it matched either matcher before.
 *
 * The result is stored inverted as soon as one of the two inputs is
 * inverted. In the inverted form a {@code true} map entry marks a character
 * that does NOT match, and every index beyond the end of the map matches.
 *
 * @param other the matcher to fold into this one; it is not modified
 */
private void merge(CharacterMatcher other) {
  boolean inversePattern = this.inversePattern || other.inversePattern;

  // Pick the map length that can represent the union exactly. Indices
  // beyond the map evaluate to "no match" for a plain matcher and to
  // "match" for an inverted one, so the required length depends on which
  // of the two operands is inverted.
  int length;
  if (!inversePattern) {
    // both plain: anything either one lists may match
    length = Math.max(map.length, other.map.length);
  } else if (this.inversePattern && other.inversePattern) {
    // both inverted: only characters excluded by both stay excluded
    length = Math.min(map.length, other.map.length);
  } else if (this.inversePattern) {
    // only this one excludes characters; keep all of its exclusions
    length = map.length;
  } else {
    // only the other one excludes characters; cover all of its exclusions
    length = other.map.length;
  }
  if (length != map.length) {
    // padding entries are false, which reads the same as "beyond the map"
    // while this.inversePattern still holds its old value
    map = java.util.Arrays.copyOf(map, length);
  }

  // matches() still uses the old inversion flag here; each entry is read
  // before it is overwritten
  for (int i = 0; i < map.length; ++ i) {
    map[i] = (matches((char)i) || other.matches((char)i)) ^ inversePattern;
  }
  this.inversePattern = inversePattern;
}
static class Parser {
private final char[] description;
private int offset;
@ -165,6 +198,13 @@ class CharacterMatcher {
public CharacterMatcher parseClass() {
if (description[offset] != '[') {
if (description[offset] == '\\') {
String range = specialClass(description[++ offset]);
if (range != null) {
++ offset;
return CharacterMatcher.parse(range);
}
}
return null;
}
CharacterMatcher matcher = new CharacterMatcher(new boolean[0],
@ -196,9 +236,15 @@ class CharacterMatcher {
matcher.map[j] = true;
}
} else if (c == '\\') {
int saved = offset;
previous = parseEscapedCharacter();
if (previous < 0) {
unsupported("escape");
offset = saved - 1;
CharacterMatcher clazz = parseClass();
if (clazz == null) {
unsupported("escape");
}
matcher.merge(clazz);
} else {
matcher.setMatch(previous);
}

View File

@ -292,6 +292,20 @@ class Compiler implements PikeVMOpcodes {
case '.':
current.push(DOT);
continue;
case '\\':
int unescaped = characterClassParser.parseEscapedCharacter(index + 1);
if (unescaped >= 0) {
index = characterClassParser.getEndOffset() - 1;
current.push((char)unescaped);
continue;
}
CharacterMatcher characterClass = characterClassParser.parseClass(index);
if (characterClass != null) {
index = characterClassParser.getEndOffset() - 1;
current.push(new CharacterRange(characterClass));
continue;
}
throw new RuntimeException("Parse error @" + index + ": " + regex);
case '?':
case '*':
case '+': {