From 26c4bf8d8b023c8a80a6d6dd3eeca11930383d92 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Mon, 11 Nov 2013 17:23:59 -0600 Subject: [PATCH] Regex: add a class for matching character classes This will be used to match character classes (such as '[0-9a-f]'), but it will also be used by the regular expression pattern compiler to determine whether a character has special meaning in regular expressions. Signed-off-by: Johannes Schindelin --- test/regex/CharacterMatcher.java | 225 +++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 test/regex/CharacterMatcher.java diff --git a/test/regex/CharacterMatcher.java b/test/regex/CharacterMatcher.java new file mode 100644 index 0000000000..8e5d5318b8 --- /dev/null +++ b/test/regex/CharacterMatcher.java @@ -0,0 +1,225 @@ +/* Copyright (c) 2008-2013, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all copies. + + There is NO WARRANTY for this software. See license.txt for + details. */ + +package regex; + +/** + * A class to match classes of characters. + *

+ * This class is intended to be the workhorse behind character classes + * such as {@code [a-z]}. + *

/**
 * Matches single characters against a character class such as
 * {@code [a-z]} or {@code [^0-9]}.
 *
 * Membership is kept in a boolean table indexed by character code that
 * grows on demand. A negated class ({@code [^...]}) stores the listed
 * characters as usual and flips the answer in {@link #matches(char)}.
 *
 * @author Johannes Schindelin
 */
class CharacterMatcher {
  /** Membership table indexed by character code; grown on demand. */
  private boolean[] map;

  /** Whether the class started with {@code ^} and therefore negates. */
  private final boolean inversePattern;

  /**
   * Parses a complete character class.
   *
   * @param description the class, from the opening {@code [} up to and
   *        including the closing {@code ]}
   * @return the matcher for the described class
   * @throws RuntimeException if the description is not exactly one
   *         character class
   * @throws UnsupportedOperationException if the class is truncated or uses
   *         an unsupported construct (nested classes, {@code &},
   *         unknown escapes)
   */
  public static CharacterMatcher parse(String description) {
    return parse(description.toCharArray());
  }

  /**
   * Parses a complete character class given as a character array.
   *
   * @param description the class, from the opening {@code [} up to and
   *        including the closing {@code ]}
   * @return the matcher for the described class; never {@code null}
   * @throws RuntimeException if the description does not start with
   *         {@code [} or has characters left over after the closing
   *         {@code ]}
   * @throws UnsupportedOperationException if the class is truncated or uses
   *         an unsupported construct
   */
  public static CharacterMatcher parse(char[] description) {
    Parser parser = new Parser(description);
    CharacterMatcher result = parser.parseClass();
    // A null result means the input did not start with '['; treat that the
    // same way as trailing garbage instead of handing null to the caller.
    if (result == null || parser.getEndOffset() != description.length) {
      throw new RuntimeException("Short character class @"
        + parser.getEndOffset() + ": " + new String(description));
    }
    return result;
  }

  /**
   * Tests whether a character belongs to this class.
   *
   * @param c the character to test
   * @return {@code true} if the character is matched by this class
   */
  public boolean matches(char c) {
    int index = c;
    // Characters beyond the table were never listed in the class.
    boolean listed = index < map.length && map[index];
    return listed ^ inversePattern;
  }

  private CharacterMatcher(boolean[] map, boolean inversePattern) {
    this.map = map;
    this.inversePattern = inversePattern;
  }

  /** Marks a single character code as listed in the class. */
  private void setMatch(int c) {
    ensureCapacity(c + 1);
    map[c] = true;
  }

  /**
   * Grows the table to at least {@code length} entries, using power-of-two
   * steps starting at 32.
   */
  private void ensureCapacity(int length) {
    if (map.length >= length) {
      return;
    }
    int size = Math.max(map.length, 32);
    while (size < length) {
      size <<= 1;
    }
    map = java.util.Arrays.copyOf(map, size);
  }

  /**
   * A small recursive-descent style parser over a pattern description.
   *
   * The parser keeps a cursor ({@link #getEndOffset()}) that points just
   * past the last construct it consumed.
   */
  static class Parser {
    private final char[] description;
    private int offset;

    public Parser(char[] description) {
      this.description = description;
    }

    /**
     * @return the offset just past the most recently parsed construct
     */
    public int getEndOffset() {
      return offset;
    }

    /**
     * Parses an escaped character.
     *
     * @param start the offset after the backslash
     * @return the escaped character, or -1 if no character was recognized
     * @throws IllegalArgumentException if {@code start} is at the end of the
     *         description, or (as {@link NumberFormatException}) if a
     *         numeric escape has no digits
     */
    public int parseEscapedCharacter(int start) {
      offset = start;
      return parseEscapedCharacter();
    }

    /**
     * Parses the escape whose first character (the one after the backslash)
     * is at the current offset, and advances past it.
     *
     * Recognized forms: {@code 0} followed by up to three octal digits
     * (value at most 0377), {@code x} or {@code u} followed by up to four
     * hexadecimal digits, the control escapes {@code a e f n r t}, and the
     * regex metacharacters, which stand for themselves.
     */
    private int parseEscapedCharacter() {
      if (offset == description.length) {
        throw new IllegalArgumentException("Short escaped character");
      }
      char c = description[offset++];
      if (c == '0') {
        int len = digits(offset, 3, 8);
        // Three octal digits only fit into 8 bits if the first is <= 3.
        if (len == 3 && description[offset] > '3') {
          --len;
        }
        c = (char)Integer.parseInt(new String(description, offset, len), 8);
        offset += len;
        return c;
      }
      if (c == 'x' || c == 'u') {
        int len = digits(offset, 4, 16);
        c = (char)Integer.parseInt(new String(description, offset, len), 16);
        offset += len;
        return c;
      }
      switch (c) {
      case 'a':
        return 0x0007;
      case 'e':
        return 0x001B;
      case 'f':
        return 0x000C;
      case 'n':
        return 0x000A;
      case 'r':
        return 0x000D;
      case 't':
        return 0x0009;
      case '\\':
      case '.':
      case '*':
      case '+':
      case '?':
      case '|':
      case '[':
      case ']':
      case '{':
      case '}':
      case '(':
      case ')':
      case '^':
      case '$':
        return c;
      }
      return -1;
    }

    /**
     * Counts the leading digits of the given base starting at an offset.
     *
     * Only ASCII digits and ASCII letters are considered; punctuation such
     * as {@code ':'} or {@code '@'} (which sits between {@code '9'} and
     * {@code 'A'}) always terminates the run.
     *
     * @param offset where to start looking
     * @param maxLength the maximum number of digits to accept
     * @param base the numeric base (e.g. 8 or 16)
     * @return the number of consecutive valid digits, between 0 and
     *         {@code maxLength}
     */
    public int digits(int offset, int maxLength, int base) {
      int count = 0;
      while (count < maxLength && offset + count < description.length) {
        int value = digitValue(description[offset + count]);
        if (value < 0 || value >= base) {
          break;
        }
        ++count;
      }
      return count;
    }

    /**
     * Returns the numeric value of an ASCII digit or letter (case
     * insensitive, {@code a} = 10), or -1 for any other character.
     */
    private static int digitValue(char c) {
      if (c >= '0' && c <= '9') {
        return c - '0';
      }
      if (c >= 'a' && c <= 'z') {
        return c - 'a' + 10;
      }
      if (c >= 'A' && c <= 'Z') {
        return c - 'A' + 10;
      }
      return -1;
    }

    /**
     * Parses a character class starting at the given offset.
     *
     * @param start the offset of the opening {@code [}
     * @return the matcher, or {@code null} if there is no {@code [} at
     *         {@code start}
     * @throws UnsupportedOperationException if the class is truncated or
     *         uses an unsupported construct
     */
    public CharacterMatcher parseClass(int start) {
      offset = start;
      return parseClass();
    }

    /**
     * Parses a character class starting at the current offset and leaves
     * the offset just past the closing {@code ]}.
     *
     * @return the matcher, or {@code null} if the current offset is at the
     *         end of the description or does not point at {@code [}
     * @throws UnsupportedOperationException if the class is truncated, has
     *         an invalid range, an unknown escape, or uses {@code &} or a
     *         nested {@code [}
     * @throws IllegalArgumentException if the description ends right after
     *         a backslash
     */
    public CharacterMatcher parseClass() {
      if (offset >= description.length || description[offset] != '[') {
        return null;
      }
      ++offset;
      boolean inverse = offset < description.length
        && description[offset] == '^';
      if (inverse) {
        ++offset;
      }
      CharacterMatcher matcher =
        new CharacterMatcher(new boolean[0], inverse);

      int previous = -1;
      boolean firstCharacter = true;
      for (;;) {
        if (offset >= description.length) {
          unsupported("short regex");
        }
        char c = description[offset++];
        // A '-' is a range operator unless it is the first or the last
        // member of the class (or the description ends right after it, in
        // which case the next iteration reports the truncation).
        boolean isRange = c == '-' && !firstCharacter
          && offset < description.length && description[offset] != ']';
        if (isRange) {
          if (previous < 0) {
            unsupported("invalid range");
          }
          int rangeEnd = description[offset];
          if ('\\' == rangeEnd) {
            // Step over the backslash so that the escape parser sees the
            // escaped character rather than the backslash itself.
            ++offset;
            rangeEnd = parseEscapedCharacter();
            if (rangeEnd < 0) {
              unsupported("invalid range");
            }
          }
          // A literal range end is deliberately not consumed here: the next
          // iteration re-reads it as an ordinary member, which also rejects
          // unsupported characters such as '[' or '&'.
          matcher.ensureCapacity(rangeEnd + 1);
          for (int j = previous + 1; j <= rangeEnd; j++) {
            matcher.map[j] = true;
          }
          if ('\\' == description[offset - 1] || offset > 0
              && rangeEnd != description[Math.min(offset,
                description.length - 1)]) {
            // The escaped range end was fully consumed above, so record it
            // as the most recent member just like the literal path will.
            previous = rangeEnd;
          }
        } else if (c == '\\') {
          previous = parseEscapedCharacter();
          if (previous < 0) {
            unsupported("escape");
          } else {
            matcher.setMatch(previous);
          }
        } else if (c == '&' || c == '[') {
          unsupported("operation");
        } else if (c == ']') {
          break;
        } else {
          previous = c;
          matcher.setMatch(previous);
        }
        firstCharacter = false;
      }

      return matcher;
    }

    /**
     * Always throws; used to report constructs this parser cannot handle.
     *
     * @param msg a short description of the unsupported construct
     * @throws UnsupportedOperationException always
     */
    private void unsupported(String msg) throws UnsupportedOperationException {
      throw new UnsupportedOperationException("Unsupported " + msg + " @"
        + offset + ": "
        + new String(description, 0, description.length));
    }
  }
}