Regex: add a class for matching character classes

This will be used to match character classes (such as '[0-9a-f]'), but it will also be used by the regular expression pattern compiler to determine whether a character has special meaning in regular expressions.

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
2025-03-14 00:06:45 +00:00 · 2013-11-11 17:23:59 -06:00 · 2013-11-11 17:23:59 -06:00 · 26c4bf8d8b
commit 26c4bf8d8b
parent d00f799d2e
1 changed files with 225 additions and 0 deletions
--- a/test/regex/CharacterMatcher.java
+++ b/test/regex/CharacterMatcher.java
@ -0,0 +1,225 @@
+/* Copyright (c) 2008-2013, Avian Contributors
+
+   Permission to use, copy, modify, and/or distribute this software
+   for any purpose with or without fee is hereby granted, provided
+   that the above copyright notice and this permission notice appear
+   in all copies.
+
+   There is NO WARRANTY for this software.  See license.txt for
+   details. */
+
+package regex;
+
+/**
+ * A class to match classes of characters.
+ * <p>
+ * This class is intended to be the workhorse behind character classes
+ * such as {@code [a-z]}.
+ * </p>
+ * <p>
+ * The characters listed in a class are recorded in a {@code boolean} table
+ * indexed by character value; the table is grown on demand. A class that
+ * starts with {@code ^} reports the opposite of what the table says.
+ * </p>
+ * @author Johannes Schindelin
+ */
+class CharacterMatcher {
+  /** {@code map[c]} is true when the character {@code c} was listed. */
+  private boolean[] map;
+  /** True when the class started with {@code ^}, i.e. the result is negated. */
+  private final boolean inversePattern;
+
+  /**
+   * Parses a complete character class such as {@code [a-z]}.
+   *
+   * @param description the character class, including the brackets
+   * @return the matcher for the described class
+   * @throws RuntimeException if the description is not exactly one
+   *         character class
+   */
+  public static CharacterMatcher parse(String description) {
+    return parse(description.toCharArray());
+  }
+
+  /**
+   * Parses a complete character class such as {@code [a-z]}.
+   *
+   * @param description the character class, including the brackets
+   * @return the matcher for the described class
+   * @throws RuntimeException if the description does not start with a
+   *         character class or if anything follows the closing bracket
+   */
+  public static CharacterMatcher parse(char[] description) {
+    Parser parser = new Parser(description);
+    CharacterMatcher result = parser.parseClass();
+    // A null result means that the description did not start with '['.
+    if (result == null || parser.getEndOffset() != description.length) {
+      throw new RuntimeException("Short character class @"
+        + parser.getEndOffset() + ": " + new String(description));
+    }
+    return result;
+  }
+
+  /**
+   * Tests whether a character belongs to this character class.
+   *
+   * @param c the character to test
+   * @return whether the character is matched, taking a leading {@code ^}
+   *         into account
+   */
+  public boolean matches(char c) {
+    // Characters beyond the end of the table were never listed.
+    boolean listed = c < map.length && map[c];
+    return listed != inversePattern;
+  }
+
+  private CharacterMatcher(boolean[] map, boolean inversePattern) {
+    this.map = map;
+    this.inversePattern = inversePattern;
+  }
+
+  /** Marks a single character as listed. */
+  private void setMatch(int c) {
+    ensureCapacity(c + 1);
+    map[c] = true;
+  }
+
+  /** Marks all characters from first to last (both inclusive) as listed. */
+  private void setRange(int first, int last) {
+    ensureCapacity(last + 1);
+    for (int c = first; c <= last; ++c) {
+      map[c] = true;
+    }
+  }
+
+  /**
+   * Grows the table, if needed, so that it has at least the given length.
+   * The new size is at least 32 and is doubled until it is large enough.
+   */
+  private void ensureCapacity(int length) {
+    if (map.length >= length) {
+      return;
+    }
+    int size = map.length;
+    if (size < 32) {
+      size = 32;
+    }
+    while (size < length) {
+      size <<= 1;
+    }
+    map = java.util.Arrays.copyOf(map, size);
+  }
+
+  /**
+   * A parser for character classes and escaped characters.
+   * <p>
+   * The parser keeps track of the offset just after the most recently parsed
+   * element; see {@link #getEndOffset()}.
+   * </p>
+   */
+  static class Parser {
+    private final char[] description;
+    private int offset;
+
+    public Parser(char[] description) {
+      this.description = description;
+    }
+
+    /**
+     * @return the offset after the most recently parsed element
+     */
+    public int getEndOffset() {
+      return offset;
+    }
+
+    /**
+     * Parses an escaped character.
+     *
+     * @param start the offset <u>after</u> the backslash
+     * @return the escaped character, or -1 if no character was recognized
+     * @throws IllegalArgumentException if nothing follows the backslash, or
+     *         if a numeric escape has no digits
+     */
+    public int parseEscapedCharacter(int start) {
+      offset = start;
+      return parseEscapedCharacter();
+    }
+
+    /**
+     * Parses the escaped character at the current offset, i.e. the offset
+     * must point just after the backslash.
+     */
+    private int parseEscapedCharacter() {
+      if (offset >= description.length) {
+        throw new IllegalArgumentException("Short escaped character");
+      }
+      char c = description[offset++];
+      if (c == '0') {
+        int len = digits(offset, 3, 8);
+        // Three octal digits only fit into 0..0377 if the first is <= 3.
+        if (len == 3 && description[offset] > '3') {
+          --len;
+        }
+        return parseNumber(len, 8);
+      }
+      if (c == 'x' || c == 'u') {
+        // \xhh takes at most two hex digits, \\uhhhh at most four.
+        return parseNumber(digits(offset, c == 'x' ? 2 : 4, 16), 16);
+      }
+      switch (c) {
+      case 'a':
+         return 0x0007;
+      case 'e':
+         return 0x001B;
+      case 'f':
+         return 0x000C;
+      case 'n':
+         return 0x000A;
+      case 'r':
+         return 0x000D;
+      case 't':
+         return 0x0009;
+      case '\\':
+      case '.':
+      case '*':
+      case '+':
+      case '?':
+      case '|':
+      case '[':
+      case ']':
+      case '{':
+      case '}':
+      case '(':
+      case ')':
+      case '^':
+      case '$':
+        return c;
+      }
+      return -1;
+    }
+
+    /**
+     * Converts the len digits at the current offset into a character and
+     * advances the offset past them.
+     */
+    private int parseNumber(int len, int base) {
+      if (len == 0) {
+        throw new NumberFormatException("Missing digits in escape @" + offset
+          + ": " + new String(description));
+      }
+      char c = (char)Integer.parseInt(new String(description, offset, len),
+        base);
+      offset += len;
+      return c;
+    }
+
+    /**
+     * Counts the digits starting at a given offset.
+     *
+     * @param offset the offset of the first candidate digit
+     * @param maxLength the maximal number of digits to accept
+     * @param base the base of the number (digits beyond 9 are the ASCII
+     *        letters, in either case)
+     * @return the number of consecutive valid digits, possibly 0
+     */
+    public int digits(int offset, int maxLength, int base) {
+      int count = 0;
+      while (count != maxLength && offset + count < description.length
+          && digitValue(description[offset + count], base) >= 0) {
+        ++count;
+      }
+      return count;
+    }
+
+    /**
+     * Returns the value of an ASCII digit or letter in the given base, or -1
+     * if the character is not a digit in that base.
+     */
+    private static int digitValue(char c, int base) {
+      int value;
+      if (c >= '0' && c <= '9') {
+        value = c - '0';
+      } else if (c >= 'a' && c <= 'z') {
+        value = c - 'a' + 10;
+      } else if (c >= 'A' && c <= 'Z') {
+        value = c - 'A' + 10;
+      } else {
+        return -1;
+      }
+      return value < base ? value : -1;
+    }
+
+    /**
+     * Parses a character class starting at a given offset.
+     *
+     * @param start the offset of the opening bracket
+     * @return the matcher, or null if there is no opening bracket at the
+     *         given offset
+     */
+    public CharacterMatcher parseClass(int start) {
+      offset = start;
+      return parseClass();
+    }
+
+    /**
+     * Parses a character class starting at the current offset.
+     * <p>
+     * After a successful parse, {@link #getEndOffset()} points just after
+     * the closing bracket.
+     * </p>
+     *
+     * @return the matcher, or null if there is no opening bracket at the
+     *         current offset
+     * @throws UnsupportedOperationException if the class is not terminated,
+     *         or if it uses an unsupported escape or operation
+     */
+    public CharacterMatcher parseClass() {
+      if (offset >= description.length || description[offset] != '[') {
+        return null;
+      }
+      ++offset;
+      boolean inverse = offset < description.length
+        && description[offset] == '^';
+      if (inverse) {
+        ++offset;
+      }
+      CharacterMatcher matcher = new CharacterMatcher(new boolean[0], inverse);
+
+      int previous = -1;
+      boolean firstCharacter = true;
+      for (;;) {
+        if (offset >= description.length) {
+          unsupported("short regex");
+        }
+        char c = description[offset++];
+
+        // A dash is a range operator unless it is the first or the last
+        // character of the class.
+        boolean range = false;
+        if (c == '-' && !firstCharacter) {
+          if (offset >= description.length) {
+            unsupported("short regex");
+          }
+          range = description[offset] != ']';
+        }
+
+        if (range) {
+          if (previous < 0) {
+            unsupported("invalid range");
+          }
+          int rangeEnd = description[offset];
+          if ('\\' == rangeEnd) {
+            // Skip the backslash; the escape parser expects to start after it.
+            ++offset;
+            rangeEnd = parseEscapedCharacter();
+            if (rangeEnd < 0) {
+              unsupported("invalid range");
+            }
+            matcher.setRange(previous + 1, rangeEnd);
+            previous = rangeEnd;
+          } else {
+            // The unescaped end is deliberately not consumed: the next
+            // iteration reads it again as a literal, which rejects '&' and
+            // '[' and makes it the new "previous" character.
+            matcher.setRange(previous + 1, rangeEnd);
+          }
+        } else if (c == '\\') {
+          previous = parseEscapedCharacter();
+          if (previous < 0) {
+            unsupported("escape");
+          } else {
+            matcher.setMatch(previous);
+          }
+        } else if (c == '&' || c == '[') {
+          unsupported("operation");
+        } else if (c == ']') {
+          break;
+        } else {
+          previous = c;
+          matcher.setMatch(previous);
+        }
+        firstCharacter = false;
+      }
+
+      return matcher;
+    }
+
+    /**
+     * Always throws; reports the current offset and the full description.
+     */
+    private void unsupported(String msg) throws UnsupportedOperationException {
+      throw new UnsupportedOperationException("Unsupported " + msg + " @"
+        + offset + ": "
+        + new String(description, 0, description.length));
+    }
+  }
+}