Regex: add a class for matching character classes

This will be used to match character classes (such as '[0-9a-f]'),
but it will also be used by the regular expression pattern compiler
to determine whether a character has special meaning in regular
expressions.

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
This commit is contained in:
Johannes Schindelin 2013-11-11 17:23:59 -06:00
parent d00f799d2e
commit 26c4bf8d8b

View File

@ -0,0 +1,225 @@
/* Copyright (c) 2008-2013, Avian Contributors
Permission to use, copy, modify, and/or distribute this software
for any purpose with or without fee is hereby granted, provided
that the above copyright notice and this permission notice appear
in all copies.
There is NO WARRANTY for this software. See license.txt for
details. */
package regex;
/**
* A class to match classes of characters.
* <p>
* This class is intended to be the working horse behind character classes
* such as {@code [a-z]}.
* </p>
* @author Johannes Schindelin
*/
class CharacterMatcher {
private boolean[] map;
private boolean inversePattern;
public static CharacterMatcher parse(String description) {
return parse(description.toCharArray());
}
public static CharacterMatcher parse(char[] description) {
Parser parser = new Parser(description);
CharacterMatcher result = parser.parseClass();
if (parser.getEndOffset() != description.length) {
throw new RuntimeException("Short character class @"
+ parser.getEndOffset() + ": " + new String(description));
}
return result;
}
public boolean matches(char c) {
int index = c;
return (map.length > index && map[index]) ^ inversePattern;
}
private CharacterMatcher(boolean[] map, boolean inversePattern) {
this.map = map;
this.inversePattern = inversePattern;
}
private void setMatch(int c) {
ensureCapacity(c + 1);
map[c] = true;
}
private void ensureCapacity(int length) {
if (map.length >= length) {
return;
}
int size = map.length;
if (size < 32) {
size = 32;
}
while (size < length) {
size <<= 1;
}
map = java.util.Arrays.copyOf(map, size);
}
static class Parser {
private final char[] description;
private int offset;
public Parser(char[] description) {
this.description = description;
}
public int getEndOffset() {
return offset;
}
/**
* Parses an escaped character.
*
* @param start the offset <u>after</u> the backslash
* @return the escaped character, or -1 if no character was recognized
*/
public int parseEscapedCharacter(int start) {
offset = start;
return parseEscapedCharacter();
}
private int parseEscapedCharacter() {
if (offset == description.length) {
throw new IllegalArgumentException("Short escaped character");
}
char c = description[offset++];
if (c == '0') {
int len = digits(offset, 3, 8);
if (len == 3 && description[offset] > '3') {
--len;
}
c = (char)Integer.parseInt(new String(description, offset, len), 8);
offset += len;
return c;
}
if (c == 'x' || c == 'u') {
int len = digits(offset, 4, 16);
c = (char)Integer.parseInt(new String(description, offset, len), 16);
offset += len;
return c;
}
switch (c) {
case 'a':
return 0x0007;
case 'e':
return 0x001B;
case 'f':
return 0x000C;
case 'n':
return 0x000A;
case 'r':
return 0x000D;
case 't':
return 0x0009;
case '\\':
case '.':
case '*':
case '+':
case '?':
case '|':
case '[':
case ']':
case '{':
case '}':
case '(':
case ')':
case '^':
case '$':
return c;
}
return -1;
}
public int digits(int offset, int maxLength, int base) {
for (int i = 0; ; ++i) {
if (i == maxLength || offset + i >= description.length) {
return i;
}
int value = description[offset + i] - '0';
if (value < 0) {
return i;
}
if (base > 10 && value >= 10) {
value += 10 - (value >= 'a' - '0' ? 'a' - '0' : 'A' - '0');
}
if (value >= base) {
return i;
}
}
}
public CharacterMatcher parseClass(int start) {
offset = start;
return parseClass();
}
public CharacterMatcher parseClass() {
if (description[offset] != '[') {
return null;
}
CharacterMatcher matcher = new CharacterMatcher(new boolean[0],
description[++ offset] == '^');
if (matcher.inversePattern) {
++ offset;
}
int previous = -1;
boolean firstCharacter = true;
for (;;) {
if (offset >= description.length) {
unsupported("short regex");
}
char c = description[offset++];
if (c == '-' && !firstCharacter && description[offset] != ']') {
if (previous < 0) {
unsupported("invalid range");
}
int rangeEnd = description[offset];
if ('\\' == rangeEnd) {
rangeEnd = parseEscapedCharacter();
if (rangeEnd < 0) {
unsupported("invalid range");
}
}
matcher.ensureCapacity(rangeEnd + 1);
for (int j = previous + 1; j <= rangeEnd; j++) {
matcher.map[j] = true;
}
} else if (c == '\\') {
previous = parseEscapedCharacter();
if (previous < 0) {
unsupported("escape");
} else {
matcher.setMatch(previous);
}
} else if (c == '&' || c == '[') {
unsupported("operation");
} else if (c == ']') {
break;
} else {
previous = c;
matcher.setMatch(previous);
}
firstCharacter = false;
}
return matcher;
}
private void unsupported(String msg) throws UnsupportedOperationException {
throw new UnsupportedOperationException("Unsupported " + msg + " @"
+ offset + ": "
+ new String(description, 0, description.length));
}
}
}