Regex: add a class for matching character classes

This will be used to match character classes (such as '[0-9a-f]'), but it will also be used by the regular expression pattern compiler to determine whether a character has special meaning in regular expressions.

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
2025-01-09 06:23:04 +00:00 · 2013-11-11 17:23:59 -06:00 · 2013-11-11 17:23:59 -06:00 · 26c4bf8d8b
commit 26c4bf8d8b
parent d00f799d2e
1 changed files with 225 additions and 0 deletions
--- a/test/regex/CharacterMatcher.java
+++ b/test/regex/CharacterMatcher.java
@ -0,0 +1,225 @@
 /* Copyright (c) 2008-2013, Avian Contributors
   Permission to use, copy, modify, and/or distribute this software
   for any purpose with or without fee is hereby granted, provided
   that the above copyright notice and this permission notice appear
   in all copies.
   There is NO WARRANTY for this software.  See license.txt for
   details. */
 package regex;
 /**
 * A class to match classes of characters.
 * <p>
 * This class is intended to be the workhorse behind character classes
 * such as {@code [a-z]}. Membership is stored in a boolean table indexed
 * by character value; the table only grows as far as the largest character
 * that was added, and characters beyond its end count as "not in the set".
 * For a negated class ({@code [^...]}) the lookup result is inverted.
 * </p>
 * @author Johannes Schindelin
 */
 class CharacterMatcher {
  /** Membership table, indexed by character value; grown on demand. */
  private boolean[] map;
  /** Whether the class was written as {@code [^...]}. */
  private final boolean inversePattern;

  /**
   * Parses a complete character class such as {@code [0-9a-f]}.
   *
   * @param description the character class, including the brackets
   * @return the matcher for the class
   * @throws RuntimeException if the description is not exactly one
   *   character class
   * @throws UnsupportedOperationException if the class is unterminated or
   *   uses an unsupported construct
   */
  public static CharacterMatcher parse(String description) {
    return parse(description.toCharArray());
  }

  /**
   * Parses a complete character class such as {@code [0-9a-f]}.
   *
   * @param description the character class, including the brackets
   * @return the matcher for the class
   * @throws RuntimeException if the description is not exactly one
   *   character class
   * @throws UnsupportedOperationException if the class is unterminated or
   *   uses an unsupported construct
   */
  public static CharacterMatcher parse(char[] description) {
    Parser parser = new Parser(description);
    CharacterMatcher result = parser.parseClass();
    // A null result means the input did not start with '['; together with
    // the length check this rejects everything that is not exactly one class
    // (including the empty description).
    if (result == null || parser.getEndOffset() != description.length) {
      throw new RuntimeException("Short character class @"
        + parser.getEndOffset() + ": " + new String(description));
    }
    return result;
  }

  /**
   * Tests whether a character belongs to this character class.
   *
   * @param c the character to test
   * @return true if the character is matched by the class
   */
  public boolean matches(char c) {
    int index = c;
    // Characters beyond the end of the table were never added.
    return (map.length > index && map[index]) ^ inversePattern;
  }

  private CharacterMatcher(boolean[] map, boolean inversePattern) {
    this.map = map;
    this.inversePattern = inversePattern;
  }

  /** Adds a single character to the set. */
  private void setMatch(int c) {
    ensureCapacity(c + 1);
    map[c] = true;
  }

  /**
   * Grows the membership table to hold at least {@code length} entries.
   * The size starts at 32 and is doubled until it is large enough.
   */
  private void ensureCapacity(int length) {
    if (map.length >= length) {
      return;
    }
    int size = map.length;
    if (size < 32) {
      size = 32;
    }
    while (size < length) {
      size <<= 1;
    }
    map = java.util.Arrays.copyOf(map, size);
  }

  /**
   * A parser for character classes and escaped characters.
   * <p>
   * The parser keeps a cursor into the description; after a successful
   * parse, {@link #getEndOffset()} reports the offset just behind the
   * parsed construct.
   * </p>
   */
  static class Parser {
    private final char[] description;
    private int offset;

    public Parser(char[] description) {
      this.description = description;
    }

    /**
     * Returns the current cursor position.
     *
     * @return the offset after the most recently parsed construct
     */
    public int getEndOffset() {
      return offset;
    }

    /**
     * Parses an escaped character.
     * 
     * @param start the offset <u>after</u> the backslash
     * @return the escaped character, or -1 if no character was recognized
     * @throws IllegalArgumentException if the description ends at
     *   {@code start}, or if a numeric escape has no digits
     */
    public int parseEscapedCharacter(int start) {
      offset = start;
      return parseEscapedCharacter();
    }

    /**
     * Parses the escape whose first character (the one after the backslash)
     * is at the cursor.
     * <p>
     * Recognized are: '0' followed by up to three octal digits, 'x' or 'u'
     * followed by up to four hexadecimal digits, the control escapes
     * a, e, f, n, r and t, and the escaped punctuation characters listed in
     * the switch below.
     * </p>
     *
     * @return the escaped character, or -1 if the escape is not recognized
     */
    private int parseEscapedCharacter() {
      if (offset >= description.length) {
        throw new IllegalArgumentException("Short escaped character");
      }
      char c = description[offset++];
      if (c == '0') {
        int len = digits(offset, 3, 8);
        // Three octal digits only fit into 8 bits if the first is at most 3;
        // otherwise only the first two digits belong to the escape.
        if (len == 3 && description[offset] > '3') {
          --len;
        }
        return parseNumericEscape(len, 8);
      }
      if (c == 'x' || c == 'u') {
        // NOTE(review): both forms accept one to four hex digits here, while
        // java.util.regex.Pattern takes exactly two after 'x' and exactly
        // four after 'u'. Kept as is to avoid changing accepted inputs.
        return parseNumericEscape(digits(offset, 4, 16), 16);
      }
      switch (c) {
      case 'a':
        return 0x0007;
      case 'e':
        return 0x001B;
      case 'f':
        return 0x000C;
      case 'n':
        return 0x000A;
      case 'r':
        return 0x000D;
      case 't':
        return 0x0009;
      case '\\':
      case '.':
      case '*':
      case '+':
      case '?':
      case '|':
      case '[':
      case ']':
      case '{':
      case '}':
      case '(':
      case ')':
      case '^':
      case '$':
        return c;
      }
      return -1;
    }

    /**
     * Converts the {@code len} digits at the cursor into a character and
     * moves the cursor behind them.
     */
    private int parseNumericEscape(int len, int base) {
      if (len == 0) {
        // Integer.parseInt("") would fail with the same exception type but
        // without saying where the problem is.
        throw new NumberFormatException("Missing digits in escape @"
          + offset + ": " + new String(description));
      }
      char result =
        (char)Integer.parseInt(new String(description, offset, len), base);
      offset += len;
      return result;
    }

    /**
     * Counts the consecutive digits of the given base.
     *
     * @param offset the offset of the first candidate digit
     * @param maxLength the maximal number of digits to accept
     * @param base the numeric base (letters count as digits 10 and up,
     *   in either case)
     * @return the number of digits found, between 0 and {@code maxLength}
     */
    public int digits(int offset, int maxLength, int base) {
      int count = 0;
      while (count < maxLength && offset + count < description.length) {
        int value = digitValue(description[offset + count]);
        if (value < 0 || value >= base) {
          break;
        }
        ++count;
      }
      return count;
    }

    /**
     * Returns the numeric value of an ASCII digit or letter, or -1 for any
     * other character (in particular for the punctuation between '9' and
     * 'A', which must not be mistaken for hex digits).
     */
    private static int digitValue(char c) {
      if (c >= '0' && c <= '9') {
        return c - '0';
      }
      if (c >= 'a' && c <= 'z') {
        return c - 'a' + 10;
      }
      if (c >= 'A' && c <= 'Z') {
        return c - 'A' + 10;
      }
      return -1;
    }

    /**
     * Parses a character class starting at the given offset.
     *
     * @param start the offset of the opening bracket
     * @return the matcher, or null if there is no opening bracket at
     *   {@code start}
     * @throws UnsupportedOperationException if the class is unterminated or
     *   uses an unsupported construct
     */
    public CharacterMatcher parseClass(int start) {
      offset = start;
      return parseClass();
    }

    /**
     * Parses a character class starting at the cursor.
     * <p>
     * Supported are literal characters, escaped characters, ranges such as
     * {@code a-z} and a leading {@code ^} for negation. A dash directly
     * after the opening bracket (or the {@code ^}) or directly before the
     * closing bracket is a literal dash. Nested classes and
     * {@code &}-operations are rejected.
     * </p>
     *
     * @return the matcher, or null (leaving the cursor untouched) if there
     *   is no opening bracket at the cursor
     * @throws UnsupportedOperationException if the class is unterminated or
     *   uses an unsupported construct
     * @throws IllegalArgumentException if the description ends in the
     *   middle of an escape
     */
    public CharacterMatcher parseClass() {
      if (offset >= description.length || description[offset] != '[') {
        return null;
      }
      ++offset;
      boolean inverse =
        offset < description.length && description[offset] == '^';
      if (inverse) {
        ++offset;
      }
      CharacterMatcher matcher =
        new CharacterMatcher(new boolean[0], inverse);

      int previous = -1;
      boolean firstCharacter = true;
      for (;;) {
        if (offset >= description.length) {
          unsupported("short regex");
        }
        char c = description[offset++];
        if (c == '-' && !firstCharacter && offset < description.length
            && description[offset] != ']') {
          if (previous < 0) {
            unsupported("invalid range");
          }
          boolean escapedEnd = description[offset] == '\\';
          int rangeEnd;
          if (escapedEnd) {
            // Skip the backslash; the escape parser expects the cursor on
            // the character after it and consumes the whole escape.
            ++offset;
            rangeEnd = parseEscapedCharacter();
            if (rangeEnd < 0) {
              unsupported("invalid range");
            }
          } else {
            // A plain end character is deliberately not consumed: the next
            // iteration handles it as a literal (or rejects it).
            rangeEnd = description[offset];
          }
          matcher.ensureCapacity(rangeEnd + 1);
          for (int j = previous + 1; j <= rangeEnd; j++) {
            matcher.map[j] = true;
          }
          if (escapedEnd) {
            // Mirror what the next iteration does for a plain end character.
            matcher.setMatch(rangeEnd);
            previous = rangeEnd;
          }
        } else if (c == '\\') {
          previous = parseEscapedCharacter();
          if (previous < 0) {
            unsupported("escape");
          } else {
            matcher.setMatch(previous);
          }
        } else if (c == '&' || c == '[') {
          unsupported("operation");
        } else if (c == ']') {
          break;
        } else {
          previous = c;
          matcher.setMatch(previous);
        }
        firstCharacter = false;
      }
      return matcher;
    }

    /**
     * Reports a construct this parser cannot handle; never returns normally.
     *
     * @param msg a short name of the offending construct
     */
    private void unsupported(String msg) throws UnsupportedOperationException {
      throw new UnsupportedOperationException("Unsupported " + msg + " @"
        + offset + ": "
        + new String(description, 0, description.length));
    }
  }
 }