mirror of
https://github.com/corda/corda.git
synced 2025-01-07 13:38:47 +00:00
Regex: add a class for matching character classes
This will be used to match character classes (such as '[0-9a-f]'), but it will also be used by the regular expression pattern compiler to determine whether a character has special meaning in regular expressions. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
This commit is contained in:
parent
d00f799d2e
commit
26c4bf8d8b
225
test/regex/CharacterMatcher.java
Normal file
225
test/regex/CharacterMatcher.java
Normal file
@ -0,0 +1,225 @@
|
||||
/* Copyright (c) 2008-2013, Avian Contributors
|
||||
|
||||
Permission to use, copy, modify, and/or distribute this software
|
||||
for any purpose with or without fee is hereby granted, provided
|
||||
that the above copyright notice and this permission notice appear
|
||||
in all copies.
|
||||
|
||||
There is NO WARRANTY for this software. See license.txt for
|
||||
details. */
|
||||
|
||||
package regex;
|
||||
|
||||
/**
 * A class to match classes of characters.
 * <p>
 * This class is intended to be the workhorse behind character classes
 * such as {@code [a-z]}.
 * </p>
 * <p>
 * The set of characters is stored as a lookup table indexed by character
 * code which grows on demand, plus a flag recording whether the class was
 * negated with a leading {@code ^}.
 * </p>
 * @author Johannes Schindelin
 */
class CharacterMatcher {
  /** {@code map[c]} is true when character {@code c} was listed in the class. */
  private boolean[] map;
  /** True when the class started with {@code ^}, i.e. the lookup is negated. */
  private boolean inversePattern;

  /**
   * Parses a complete character class such as {@code [0-9a-f]}.
   *
   * @param description the character class, including the brackets
   * @return the matcher for the class
   * @throws RuntimeException if the description is not exactly one
   *         character class
   */
  public static CharacterMatcher parse(String description) {
    return parse(description.toCharArray());
  }

  /**
   * Parses a complete character class such as {@code [0-9a-f]}.
   *
   * @param description the character class, including the brackets
   * @return the matcher for the class
   * @throws RuntimeException if the description is not exactly one
   *         character class
   * @throws UnsupportedOperationException if the class is truncated or uses
   *         an unsupported construct
   * @throws IllegalArgumentException if an escape sequence is incomplete
   */
  public static CharacterMatcher parse(char[] description) {
    Parser parser = new Parser(description);
    CharacterMatcher result = parser.parseClass();
    // A null result means there was no opening bracket at all; that is
    // reported the same way as trailing garbage after the class.
    if (result == null || parser.getEndOffset() != description.length) {
      throw new RuntimeException("Short character class @"
        + parser.getEndOffset() + ": " + new String(description));
    }
    return result;
  }

  /**
   * Tests whether a character belongs to this character class.
   *
   * @param c the character to test
   * @return true if the character is matched by the class
   */
  public boolean matches(char c) {
    int index = c;
    // Characters beyond the end of the table were never listed.
    return (map.length > index && map[index]) ^ inversePattern;
  }

  private CharacterMatcher(boolean[] map, boolean inversePattern) {
    this.map = map;
    this.inversePattern = inversePattern;
  }

  /** Marks a single character code as listed in the class. */
  private void setMatch(int c) {
    ensureCapacity(c + 1);
    map[c] = true;
  }

  /**
   * Grows the lookup table so that it holds at least {@code length}
   * entries; the size starts at 32 and doubles until it is large enough.
   */
  private void ensureCapacity(int length) {
    if (map.length >= length) {
      return;
    }
    int size = map.length;
    if (size < 32) {
      size = 32;
    }
    while (size < length) {
      size <<= 1;
    }
    map = java.util.Arrays.copyOf(map, size);
  }

  /**
   * A small recursive-descent style parser over a pattern, keeping track of
   * the current offset into the pattern.
   */
  static class Parser {
    private final char[] description;
    private int offset;

    public Parser(char[] description) {
      this.description = description;
    }

    /**
     * @return the offset just after the most recently parsed construct
     */
    public int getEndOffset() {
      return offset;
    }

    /**
     * Parses an escaped character.
     *
     * @param start the offset <u>after</u> the backslash
     * @return the escaped character, or -1 if no character was recognized
     * @throws IllegalArgumentException if the pattern ends after the
     *         backslash, or a numeric escape has no digits
     */
    public int parseEscapedCharacter(int start) {
      offset = start;
      return parseEscapedCharacter();
    }

    private int parseEscapedCharacter() {
      if (offset >= description.length) {
        throw new IllegalArgumentException("Short escaped character");
      }
      char c = description[offset++];
      if (c == '0') {
        // Octal: up to three digits, but a three-digit value must fit into
        // eight bits, i.e. its first digit must not exceed 3.
        int len = digits(offset, 3, 8);
        if (len == 3 && description[offset] > '3') {
          --len;
        }
        return parseNumber(len, 8);
      }
      if (c == 'x') {
        // The x escape is followed by (at most) two hexadecimal digits.
        return parseNumber(digits(offset, 2, 16), 16);
      }
      if (c == 'u') {
        // The u escape is followed by (at most) four hexadecimal digits.
        return parseNumber(digits(offset, 4, 16), 16);
      }
      switch (c) {
      case 'a':
        return 0x0007;
      case 'e':
        return 0x001B;
      case 'f':
        return 0x000C;
      case 'n':
        return 0x000A;
      case 'r':
        return 0x000D;
      case 't':
        return 0x0009;
      case '\\':
      case '.':
      case '*':
      case '+':
      case '?':
      case '|':
      case '[':
      case ']':
      case '{':
      case '}':
      case '(':
      case ')':
      case '^':
      case '$':
        return c;
      }
      return -1;
    }

    /**
     * Converts {@code length} digits at the current offset into a number
     * and advances past them.
     */
    private int parseNumber(int length, int base) {
      if (length == 0) {
        throw new IllegalArgumentException("Missing digits in escaped"
          + " character @" + offset + ": " + new String(description));
      }
      int value = Integer.parseInt(new String(description, offset, length),
        base);
      offset += length;
      return value;
    }

    /**
     * Counts the digits of the given base found at the given offset.
     *
     * @param offset the offset of the first candidate digit
     * @param maxLength the maximal number of digits to accept
     * @param base the number base (letters are digits for bases above 10)
     * @return the number of consecutive digits, between 0 and
     *         {@code maxLength}
     */
    public int digits(int offset, int maxLength, int base) {
      int count = 0;
      while (count < maxLength && offset + count < description.length
          && digitValue(description[offset + count], base) >= 0) {
        ++count;
      }
      return count;
    }

    /**
     * Returns the value of an ASCII digit in the given base, or -1 if the
     * character is not a digit of that base.
     */
    private static int digitValue(char c, int base) {
      int value;
      if (c >= '0' && c <= '9') {
        value = c - '0';
      } else if (c >= 'a' && c <= 'z') {
        value = c - 'a' + 10;
      } else if (c >= 'A' && c <= 'Z') {
        value = c - 'A' + 10;
      } else {
        // Punctuation between the digit and letter blocks is not a digit.
        return -1;
      }
      return value < base ? value : -1;
    }

    /**
     * Parses a character class starting at the given offset.
     *
     * @param start the offset of the opening bracket
     * @return the matcher, or null if there is no opening bracket at
     *         {@code start}
     */
    public CharacterMatcher parseClass(int start) {
      offset = start;
      return parseClass();
    }

    /**
     * Parses a character class starting at the current offset.
     *
     * @return the matcher, or null if there is no opening bracket at the
     *         current offset
     * @throws UnsupportedOperationException if the class is not terminated,
     *         or uses unions, intersections or unknown escapes
     * @throws IllegalArgumentException if an escape sequence is incomplete
     */
    public CharacterMatcher parseClass() {
      if (offset >= description.length || description[offset] != '[') {
        return null;
      }
      if (++offset >= description.length) {
        throw unsupported("short regex");
      }
      CharacterMatcher matcher = new CharacterMatcher(new boolean[0],
        description[offset] == '^');
      if (matcher.inversePattern) {
        ++offset;
      }

      int previous = -1;
      boolean firstCharacter = true;
      for (;;) {
        if (offset >= description.length) {
          throw unsupported("short regex");
        }
        char c = description[offset++];
        // A dash is a range operator unless it is the first or the last
        // character of the class.
        if (c == '-' && !firstCharacter && peek() != ']') {
          if (previous < 0) {
            throw unsupported("invalid range");
          }
          int rangeStart = previous;
          int rangeEnd = description[offset];
          if (rangeEnd == '\\') {
            // Skip the backslash itself before decoding the escape.
            rangeEnd = parseEscapedCharacter(offset + 1);
            if (rangeEnd < 0) {
              throw unsupported("invalid range");
            }
            // An unescaped range end is left in place and recorded as a
            // literal by the next iteration; an escaped one has been
            // consumed already, so record it here.
            matcher.setMatch(rangeEnd);
            previous = rangeEnd;
          }
          matcher.ensureCapacity(rangeEnd + 1);
          for (int j = rangeStart + 1; j <= rangeEnd; j++) {
            matcher.map[j] = true;
          }
        } else if (c == '\\') {
          previous = parseEscapedCharacter();
          if (previous < 0) {
            throw unsupported("escape");
          }
          matcher.setMatch(previous);
        } else if (c == '&' || c == '[') {
          throw unsupported("operation");
        } else if (c == ']') {
          break;
        } else {
          previous = c;
          matcher.setMatch(previous);
        }
        firstCharacter = false;
      }

      return matcher;
    }

    /**
     * Returns the character at the current offset without consuming it.
     */
    private char peek() {
      if (offset >= description.length) {
        throw unsupported("short regex");
      }
      return description[offset];
    }

    /**
     * Builds the exception reporting an unsupported or malformed construct
     * at the current offset.
     */
    private UnsupportedOperationException unsupported(String msg) {
      return new UnsupportedOperationException("Unsupported " + msg + " @"
        + offset + ": "
        + new String(description, 0, description.length));
    }
  }
}
|
Loading…
Reference in New Issue
Block a user