Merge pull request #105 from dscho/regex

Support (the most common subset of) regular expressions
2025-04-18 08:10:47 +00:00 · 2013-12-04 11:57:26 -08:00 · 2013-12-04 11:57:26 -08:00 · fe9ac94629
commit fe9ac94629
parent a90100ee32 6626b477ad
12 changed files with 1994 additions and 196 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,3 +8,4 @@ bin
 /lib
 /distrib
 *.pdb
+*.swp
--- a/classpath/java/util/regex/CharacterMatcher.java
+++ b/classpath/java/util/regex/CharacterMatcher.java
@ -0,0 +1,332 @@
+/* Copyright (c) 2008-2013, Avian Contributors
+
+   Permission to use, copy, modify, and/or distribute this software
+   for any purpose with or without fee is hereby granted, provided
+   that the above copyright notice and this permission notice appear
+   in all copies.
+
+   There is NO WARRANTY for this software.  See license.txt for
+   details. */
+
+package java.util.regex;
+
+/**
+ * A class to match classes of characters.
+ * <p>
+ * This class is intended to be the workhorse behind character classes
+ * such as {@code [a-z]}.
+ * </p>
+ * @author Johannes Schindelin
+ */
+class CharacterMatcher {
+  private boolean[] map;
+  private boolean inversePattern;
+
+  /**
+   * Parses a character class given as a string.
+   *
+   * @param description the character class, e.g. {@code [a-z]}
+   * @return the matcher produced by {@link #parse(char[])}
+   */
+  public static CharacterMatcher parse(String description) {
+    char[] characters = description.toCharArray();
+    return parse(characters);
+  }
+
+  /**
+   * Parses a character class that must span the complete array.
+   *
+   * @param description the character class, e.g. {@code [a-z]}
+   * @return the parsed matcher
+   * @throws RuntimeException if the parser stopped before the end of
+   *   {@code description}
+   */
+  public static CharacterMatcher parse(char[] description) {
+    final Parser classParser = new Parser(description);
+    final CharacterMatcher matcher = classParser.parseClass();
+    final int end = classParser.getEndOffset();
+    if (end == description.length) {
+      return matcher;
+    }
+    throw new RuntimeException("Short character class @"
+      + end + ": " + new String(description));
+  }
+
+  /**
+   * Tests whether a character belongs to this class.
+   *
+   * @param c the character to test
+   * @return whether {@code c} is matched; characters beyond the end of the
+   *   map are matched exactly when the class is inverted
+   */
+  public boolean matches(char c) {
+    boolean listed = c < map.length && map[c];
+    return listed != inversePattern;
+  }
+
+  /**
+   * Renders the map in character class notation.
+   * <p>
+   * Runs of three or more consecutive entries are written as a range;
+   * characters outside {@code ' '..0x7f} are written as {@code \x} followed
+   * by their hexadecimal code.
+   * </p>
+   */
+  public String toString() {
+    StringBuilder text = new StringBuilder();
+    text.append("[");
+    if (inversePattern) {
+      text.append("^");
+    }
+    int i = 0;
+    while (i < map.length) {
+      if (map[i]) {
+        if (i >= ' ' && i <= 0x7f) {
+          text.append("" + (char)i);
+        } else {
+          text.append("\\x" + Integer.toHexString(i));
+        }
+        // find the last entry of the run of set entries starting at i
+        int last = i;
+        while (last + 1 < map.length && map[last + 1]) {
+          ++ last;
+        }
+        if (last > i) {
+          // two adjacent characters are listed without a dash
+          if (last - i > 1) {
+            text.append('-');
+          }
+          if (last >= ' ' && last <= 0x7f) {
+            text.append("" + (char)last);
+          } else {
+            text.append("\\x" + Integer.toHexString(last));
+          }
+          i = last;
+        }
+      }
+      ++ i;
+    }
+    text.append("]");
+    return text.toString();
+  }
+
+  /**
+   * Maps the letter of a predefined class escape to its bracket notation.
+   *
+   * @param c the character following the backslash
+   * @return the equivalent character class, or {@code null} if {@code c}
+   *   is none of {@code d D s S w W}
+   */
+  private static String specialClass(int c) {
+    switch (c) {
+    case 'd':
+      return "[0-9]";
+    case 'D':
+      return "[^0-9]";
+    case 's':
+      return "[ \\t\\n\\x0B\\f\\r]";
+    case 'S':
+      return "[^ \\t\\n\\x0B\\f\\r]";
+    case 'w':
+      return "[a-zA-Z_0-9]";
+    case 'W':
+      return "[^a-zA-Z_0-9]";
+    default:
+      return null;
+    }
+  }
+
+  /**
+   * Creates a matcher backed by the given map.
+   *
+   * @param map entries indexed by character code; the array is stored
+   *   as-is, not copied
+   * @param inversePattern whether the result of the map lookup is negated
+   */
+  private CharacterMatcher(boolean[] map, boolean inversePattern) {
+    this.map = map;
+    this.inversePattern = inversePattern;
+  }
+
+  /**
+   * Sets the map entry for a character, growing the map if necessary.
+   *
+   * @param c the character code whose entry is set
+   */
+  private void setMatch(int c) {
+    ensureCapacity(c + 1);
+    map[c] = true;
+  }
+
+  /**
+   * Grows the map so that it holds at least {@code length} entries.
+   * <p>
+   * The new size is at least 32 and is doubled until it suffices; the
+   * existing entries are kept.
+   * </p>
+   *
+   * @param length the minimum number of entries required
+   */
+  private void ensureCapacity(int length) {
+    if (length <= map.length) {
+      return;
+    }
+    int capacity = map.length < 32 ? 32 : map.length;
+    while (capacity < length) {
+      capacity *= 2;
+    }
+    map = java.util.Arrays.copyOf(map, capacity);
+  }
+
+  /**
+   * Turns this matcher into the union of itself and {@code other}.
+   * <p>
+   * The result is stored inverted as soon as either operand is inverted.
+   * </p>
+   *
+   * @param other the matcher to merge into this one; it is not modified
+   */
+  private void merge(CharacterMatcher other) {
+    boolean inversePattern = this.inversePattern || other.inversePattern;
+    // Always grow to the longer of the two maps and never shrink: every
+    // index below that length is recomputed in the loop, and beyond it both
+    // operands answer with their inversion flag, which is exactly what the
+    // combined flag yields. Truncating (as was done when one side was
+    // inverted) dropped entries, e.g. "[ [^a]]" wrongly matched 'a'.
+    if (map.length < other.map.length) {
+      map = java.util.Arrays.copyOf(map, other.map.length);
+    }
+    for (int i = 0; i < map.length; ++ i) {
+      // matches() still uses the old flag here; it is updated after the loop
+      map[i] = (matches((char)i) || other.matches((char)i)) ^ inversePattern;
+    }
+    this.inversePattern = inversePattern;
+  }
+
+  /**
+   * Turns this matcher into the intersection of itself and {@code other}.
+   * <p>
+   * The result is stored inverted only if both operands are inverted.
+   * </p>
+   *
+   * @param other the matcher to intersect with; it is not modified
+   */
+  private void intersect(CharacterMatcher other) {
+    boolean inversePattern = this.inversePattern && other.inversePattern;
+    // Always grow to the longer of the two maps and never shrink: every
+    // index below that length is recomputed in the loop, and beyond it both
+    // operands answer with their inversion flag, which is exactly what the
+    // combined flag yields. Truncating to the other map's length lost
+    // entries when that map was inverted, e.g. "[a-z&&[^0-9]]" matched
+    // nothing.
+    if (map.length < other.map.length) {
+      map = java.util.Arrays.copyOf(map, other.map.length);
+    }
+    for (int i = 0; i < map.length; ++ i) {
+      // matches() still uses the old flag here; it is updated after the loop
+      map[i] = (matches((char)i) && other.matches((char)i)) ^ inversePattern;
+    }
+    this.inversePattern = inversePattern;
+  }
+
+  static class Parser {
+    private final char[] description;
+    private int offset;
+
+    /**
+     * Creates a parser reading from the given characters.
+     *
+     * @param description the text to parse; the array is stored as-is,
+     *   not copied
+     */
+    public Parser(char[] description) {
+      this.description = description;
+    }
+
+    /**
+     * Returns the parser's current offset into the description, i.e. the
+     * position reached by the most recent parse call.
+     */
+    public int getEndOffset() {
+      return offset;
+    }
+
+    /**
+     * Parses an escaped character starting at the given position.
+     * <p>
+     * The parser's offset is set to {@code start} before parsing; use
+     * {@link #getEndOffset()} afterwards to find out how far it got.
+     * </p>
+     *
+     * @param start the offset <u>after</u> the backslash
+     * @return the escaped character, or -1 if no character was recognized
+     */
+    public int parseEscapedCharacter(int start) {
+      offset = start;
+      return parseEscapedCharacter();
+    }
+
+    /**
+     * Parses the escape whose first character (the one after the
+     * backslash) is at the current offset, and advances the offset past it.
+     * <p>
+     * Recognized are octal escapes ({@code \0} followed by up to three
+     * digits), hexadecimal escapes ({@code \x} or {@code \u} followed by up
+     * to four digits), the control escapes {@code \a \e \f \n \r \t}, and
+     * escaped regular expression metacharacters.
+     * </p>
+     *
+     * @return the escaped character, or -1 if the escape was not recognized
+     * @throws IllegalArgumentException if the description ends right here
+     */
+    private int parseEscapedCharacter() {
+      if (offset == description.length) {
+        throw new IllegalArgumentException("Short escaped character");
+      }
+      final char escaped = description[offset++];
+      switch (escaped) {
+      case '0': {
+        int count = digits(offset, 3, 8);
+        // three octal digits only fit into a byte if the first is 0..3
+        if (count == 3 && description[offset] > '3') {
+          --count;
+        }
+        char octal =
+          (char)Integer.parseInt(new String(description, offset, count), 8);
+        offset += count;
+        return octal;
+      }
+      case 'x':
+      case 'u': {
+        int count = digits(offset, 4, 16);
+        char hex =
+          (char)Integer.parseInt(new String(description, offset, count), 16);
+        offset += count;
+        return hex;
+      }
+      case 'a':
+        return 0x0007;
+      case 'e':
+        return 0x001B;
+      case 'f':
+        return 0x000C;
+      case 'n':
+        return 0x000A;
+      case 'r':
+        return 0x000D;
+      case 't':
+        return 0x0009;
+      case '\\':
+      case '.':
+      case '*':
+      case '+':
+      case '?':
+      case '|':
+      case '[':
+      case ']':
+      case '{':
+      case '}':
+      case '(':
+      case ')':
+      case '^':
+      case '$':
+        return escaped;
+      default:
+        return -1;
+      }
+    }
+
+    /**
+     * Counts the consecutive digits of the given base starting at an offset.
+     * <p>
+     * The parser's own offset is not changed.
+     * </p>
+     *
+     * @param offset the position of the first candidate digit
+     * @param maxLength the maximum number of digits to accept
+     * @param base the radix; letters count as digits 10 and up for bases
+     *   larger than 10, in either case
+     * @return the number of digits found, possibly 0
+     */
+    public int digits(int offset, int maxLength, int base) {
+      for (int i = 0; ; ++i) {
+        if (i == maxLength || offset + i >= description.length) {
+          return i;
+        }
+        char ch = description[offset + i];
+        int value;
+        // Classify explicitly: plain subtraction arithmetic mistook the
+        // characters between '9' and 'A' (':' through '@') for hex digits.
+        if (ch >= '0' && ch <= '9') {
+          value = ch - '0';
+        } else if (ch >= 'a' && ch <= 'z') {
+          value = ch - 'a' + 10;
+        } else if (ch >= 'A' && ch <= 'Z') {
+          value = ch - 'A' + 10;
+        } else {
+          return i;
+        }
+        if (value >= base) {
+          return i;
+        }
+      }
+    }
+
+    /**
+     * Parses a character class starting at the given position.
+     * <p>
+     * The parser's offset is set to {@code start} before parsing.
+     * </p>
+     *
+     * @param start the offset of the opening bracket or backslash
+     * @return the result of {@link #parseClass()}
+     */
+    public CharacterMatcher parseClass(int start) {
+      offset = start;
+      return parseClass();
+    }
+
+    /**
+     * Parses a character class at the current offset.
+     * <p>
+     * Accepted are bracketed classes such as {@code [a-z]}, including
+     * negation, ranges, escapes, nested classes (union) and
+     * {@code &&[...]} (intersection), as well as the predefined classes
+     * {@code \d \D \s \S \w \W}. On success the offset is left just behind
+     * the class.
+     * </p>
+     *
+     * @return the matcher, or {@code null} if there is neither a bracketed
+     *   nor a predefined class at the current offset
+     * @throws UnsupportedOperationException if the class is truncated or
+     *   uses unsupported syntax
+     */
+    public CharacterMatcher parseClass() {
+      if (offset >= description.length) {
+        unsupported("short regex");
+      }
+      if (description[offset] != '[') {
+        // a lone trailing backslash cannot start a predefined class
+        if (description[offset] == '\\' && offset + 1 < description.length) {
+          String range = specialClass(description[++ offset]);
+          if (range != null) {
+            ++ offset;
+            return CharacterMatcher.parse(range);
+          }
+        }
+        return null;
+      }
+      ++ offset;
+      // if the description ends here, the loop below reports "short regex"
+      CharacterMatcher matcher = new CharacterMatcher(new boolean[0],
+        offset < description.length && description[offset] == '^');
+      if (matcher.inversePattern) {
+        ++ offset;
+      }
+
+      // the most recent literal character, or -1 if a range cannot start here
+      int previous = -1;
+      boolean firstCharacter = true;
+      for (;;) {
+        if (offset >= description.length) {
+          unsupported("short regex");
+        }
+        char c = description[offset++];
+        // a dash first, last, or at the very end of the input is a literal
+        if (c == '-' && !firstCharacter && offset < description.length
+            && description[offset] != ']') {
+          if (previous < 0) {
+            unsupported("invalid range");
+          }
+          // consume the end of the range, so that it is neither read twice
+          // nor mistaken for an operator such as '&' or '['
+          int rangeEnd = description[offset++];
+          if ('\\' == rangeEnd) {
+            // the backslash has been consumed; parse what follows it
+            rangeEnd = parseEscapedCharacter();
+            if (rangeEnd < 0) {
+              unsupported("invalid range");
+            }
+          }
+          matcher.ensureCapacity(rangeEnd + 1);
+          for (int j = previous + 1; j <= rangeEnd; j++) {
+            matcher.map[j] = true;
+          }
+          previous = rangeEnd;
+        } else if (c == '\\') {
+          int saved = offset;
+          previous = parseEscapedCharacter();
+          if (previous < 0) {
+            // not a single character: retry as a predefined class, starting
+            // at the backslash
+            offset = saved - 1;
+            CharacterMatcher clazz = parseClass();
+            if (clazz == null) {
+              unsupported("escape");
+            }
+            matcher.merge(clazz);
+          } else {
+            matcher.setMatch(previous);
+          }
+        } else if (c == '[') {
+          // nested class: union
+          Parser parser = new Parser(description);
+          CharacterMatcher other = parser.parseClass(offset - 1);
+          if (other == null) {
+            unsupported("invalid merge");
+          }
+          matcher.merge(other);
+          offset = parser.getEndOffset();
+          previous = -1;
+        } else if (c == '&') {
+          // only "&&[...]" is supported: intersection
+          if (offset + 2 > description.length || description[offset] != '&'
+              || description[offset + 1] != '[') {
+            unsupported("operation");
+          }
+          Parser parser = new Parser(description);
+          CharacterMatcher other = parser.parseClass(offset + 1);
+          if (other == null) {
+            unsupported("invalid intersection");
+          }
+          matcher.intersect(other);
+          offset = parser.getEndOffset();
+          previous = -1;
+        } else if (c == ']') {
+          break;
+        } else {
+          previous = c;
+          matcher.setMatch(previous);
+        }
+        firstCharacter = false;
+      }
+
+      return matcher;
+    }
+
+    /**
+     * Reports unsupported or invalid syntax at the current offset.
+     *
+     * @param msg a short description of what is not supported
+     * @throws UnsupportedOperationException always
+     */
+    private void unsupported(String msg) throws UnsupportedOperationException {
+      String pattern = new String(description);
+      throw new UnsupportedOperationException(
+        "Unsupported " + msg + " @" + offset + ": " + pattern);
+    }
+  }
+}
--- a/classpath/java/util/regex/Compiler.java
+++ b/classpath/java/util/regex/Compiler.java
@ -0,0 +1,533 @@
+/* Copyright (c) 2008-2013, Avian Contributors
+
+   Permission to use, copy, modify, and/or distribute this software
+   for any purpose with or without fee is hereby granted, provided
+   that the above copyright notice and this permission notice appear
+   in all copies.
+
+   There is NO WARRANTY for this software.  See license.txt for
+   details. */
+
+package java.util.regex;
+
+import java.util.ArrayList;
+import java.util.Stack;
+
+/**
+ * Compiles regular expressions into {@link PikeVM}s.
+ * 
+ * @author Johannes Schindelin
+ */
+class Compiler implements PikeVMOpcodes {
+  private final static CharacterMatcher regularCharacter =
+      CharacterMatcher.parse("[^\\\\.*+?|\\[\\]{}()^$]");
+
+  /**
+   * Collects the program generated from an expression tree.
+   * <p>
+   * The code is written twice: while {@code program} is still
+   * {@code null}, only the offset is advanced to determine the program
+   * size; the second pass then fills in the actual program.
+   * </p>
+   */
+  private static class Output {
+    private int[] program;
+    private int offset;
+    private int groupCount = -1;
+    private int findPreambleSize;
+    private ArrayList<CharacterMatcher> classes;
+    private ArrayList<PikeVM> lookarounds;
+
+    /**
+     * Generates the program for the given expression.
+     *
+     * @param expr the expression whose code is written
+     */
+    public Output(Expression expr) {
+      // try-run to determine the code size
+      expr.writeCode(this);
+      program = new int[offset];
+      // reset everything the try-run modified
+      offset = 0;
+      groupCount = -1;
+      classes = new ArrayList<CharacterMatcher>();
+      lookarounds = new ArrayList<PikeVM>();
+      // write it out!
+      expr.writeCode(this);
+    }
+
+    /** Appends one value to the program (only counted during the try-run). */
+    public void add(int opcode) {
+      if (program != null) {
+        program[offset] = opcode;
+      }
+      offset++;
+    }
+
+    /**
+     * Reserves one program slot for a jump target to be set later.
+     *
+     * @return the index of the reserved slot, to be passed to
+     *   {@link #setJump(int)}
+     */
+    public int markJump() {
+      return offset++;
+    }
+
+    /** Stores the current offset into the slot reserved by markJump(). */
+    public void setJump(int mark) {
+      if (program != null) {
+        program[mark] = offset;
+      }
+    }
+
+    /** Records the current offset as the size of the find() preamble. */
+    public void markFindPreambleEnd() {
+      findPreambleSize = offset;
+    }
+
+    /** Creates a PikeVM from the program, character classes and lookarounds. */
+    public PikeVM toVM() {
+      CharacterMatcher[] classes = new CharacterMatcher[this.classes.size()];
+      this.classes.toArray(classes);
+      PikeVM[] lookarounds = new PikeVM[this.lookarounds.size()];
+      this.lookarounds.toArray(lookarounds);
+      return new PikeVM(program, findPreambleSize, groupCount, classes,
+        lookarounds);
+    }
+
+    /**
+     * Registers a character class.
+     *
+     * @return the index of the class, or -1 during the try-run
+     */
+    public int addClass(CharacterMatcher characterClass) {
+      if (program == null) {
+        return -1;
+      }
+      int result = classes.size();
+      classes.add(characterClass);
+      return result;
+    }
+
+    /**
+     * Registers the VM of a lookaround.
+     *
+     * @return the index of the lookaround, or -1 during the try-run
+     */
+    public int addLookaround(PikeVM lookaround) {
+      if (program == null) {
+        return -1;
+      }
+      int result = lookarounds.size();
+      lookarounds.add(lookaround);
+      return result;
+    }
+  }
+
+  /** A node of the parsed regular expression that can emit its own code. */
+  private abstract class Expression {
+    /**
+     * Writes this expression's code to the given output. It is called
+     * twice per Output: once as a try-run, once to write the program.
+     */
+    protected abstract void writeCode(Output output);
+  }
+
+  /** An expression matching one character against a character class. */
+  private class CharacterRange extends Expression {
+    private final CharacterMatcher characterClass;
+
+    public CharacterRange(CharacterMatcher characterClass) {
+      this.characterClass = characterClass;
+    }
+
+    /** Emits CHARACTER_CLASS followed by the index of the registered class. */
+    protected void writeCode(Output output) {
+      output.add(CHARACTER_CLASS);
+      output.add(output.addClass(characterClass));
+    }
+
+    public String toString() {
+      return characterClass.toString();
+    }
+  }
+
+  /**
+   * A quantified expression such as {@code a*}, {@code a+?} or
+   * {@code a{2,5}}.
+   */
+  private class Repeat extends Expression {
+    private Expression expr;
+    private int minCount, maxCount;
+    private boolean greedy;
+
+    /**
+     * @param expr the expression to repeat
+     * @param minCount the minimum number of repetitions, at least 0
+     * @param maxCount the maximum number of repetitions, or -1 for
+     *   unbounded; 0 is rejected
+     * @param greedy whether more repetitions are preferred over fewer
+     * @throws RuntimeException if the counts are out of range
+     */
+    public Repeat(Expression expr, int minCount, int maxCount, boolean greedy) {
+      if (minCount < 0) {
+        throw new RuntimeException("Unexpected min count: " + minCount);
+      }
+      if (maxCount != -1) {
+        if (maxCount == 0) {
+          throw new RuntimeException("Unexpected max count: " + maxCount);
+        }
+        if (minCount > maxCount) {
+          throw new RuntimeException("Unexpected range: " + minCount + ", " + maxCount);
+        }
+      }
+      this.expr = expr;
+      this.minCount = minCount;
+      this.maxCount = maxCount;
+      this.greedy = greedy;
+    }
+
+    /**
+     * Emits the repeated expression: the mandatory copies first, then
+     * either a loop (unbounded) or a chain of optional copies (bounded).
+     */
+    protected void writeCode(Output output) {
+      int start = output.offset;
+      int splitJmp = greedy ? SPLIT_JMP : SPLIT;
+      int split = greedy ? SPLIT : SPLIT_JMP;
+      // all mandatory copies but the last; the last one is written below
+      for (int i = 1; i < minCount; ++ i) {
+        expr.writeCode(output);
+      }
+      if (maxCount == -1) {
+        if (minCount > 0) {
+          // last mandatory copy, followed by a split back to its start
+          int jump = output.offset;
+          expr.writeCode(output);
+          output.add(splitJmp);
+          output.add(jump);
+        } else {
+          // split past the loop, or run the body and split back to it
+          output.add(split);
+          int jump = output.markJump();
+          expr.writeCode(output);
+          output.add(splitJmp);
+          output.add(start + 2);
+          output.setJump(jump);
+        }
+      } else {
+        if (minCount > 0) {
+          expr.writeCode(output);
+        }
+        if (maxCount > minCount) {
+          // each optional copy is preceded by a split to the common end
+          int[] jumps = new int[maxCount - minCount];
+          for (int i = 0; i < jumps.length; ++ i) {
+            output.add(split);
+            jumps[i] = output.markJump();
+            expr.writeCode(output);
+          }
+          for (int jump : jumps) {
+            output.setJump(jump);
+          }
+        }
+      }
+    }
+
+    public String toString() {
+      String qualifier = greedy ? "" : "?";
+      if (minCount == 0 && maxCount < 2) {
+        // unbounded (maxCount == -1) is "*", at most once is "?"; testing
+        // minCount here could never yield "*"
+        return expr.toString() + (maxCount < 0 ? "*" : "?") + qualifier;
+      }
+      if (minCount == 1 && maxCount < 0) {
+        return expr.toString() + "+" + qualifier;
+      }
+      return expr.toString() + "{" + minCount + ","
+        + (maxCount < 0 ? "" : "" + maxCount) + "}" + qualifier;
+    }
+  }
+
+  /**
+   * A sequence of expressions, optionally capturing, optionally preceded
+   * by alternatives (the parts before each {@code |}).
+   */
+  private class Group extends Expression {
+    private final boolean capturing;
+
+    // the expressions of the current (last) alternative
+    private ArrayList<Expression> list = new ArrayList<Expression>();
+    // the alternatives completed so far, or null if there was no '|' yet
+    private ArrayList<Group> alternatives;
+
+    /**
+     * @param capturing whether the group's offsets are saved
+     * @param initialList expressions copied into the group, or null
+     */
+    public Group(boolean capturing, ArrayList<Expression> initialList) {
+      this.capturing = capturing;
+      if (initialList != null) {
+        list.addAll(initialList);
+      }
+    }
+
+    /** Appends an expression to the current alternative. */
+    public void push(Expression expr) {
+      list.add(expr);
+    }
+
+    /**
+     * Appends an expression that emits the single value {@code c}: a
+     * literal character if non-negative, an opcode otherwise.
+     */
+    public void push(final int c) {
+      push(new Expression() {
+        public void writeCode(Output output) {
+          output.add(c);
+        }
+
+        public String toString() {
+          if (c >= 0) {
+              return "" + (char)c;
+          }
+          switch (c) {
+          case DOT:
+            return ".";
+          case WORD_BOUNDARY:
+            return "\\b";
+          case NON_WORD_BOUNDARY:
+            return "\\B";
+          case LINE_START:
+            return "^";
+          case LINE_END:
+            return "$";
+          default:
+            throw new RuntimeException("Unhandled opcode: " + c);
+          }
+        }
+      });
+    }
+
+    /**
+     * Moves the expressions collected so far into a new non-capturing
+     * alternative and starts over with an empty list.
+     */
+    public void startAlternative() {
+      if (alternatives == null) {
+        alternatives = new ArrayList<Group>();
+      }
+      // the Group constructor copies the list, so clearing it is safe
+      alternatives.add(new Group(false, list));
+      list.clear();
+    }
+
+    /** Removes and returns the most recently pushed expression. */
+    public Expression pop() {
+      Expression result = list.remove(list.size() - 1);
+      return result;
+    }
+
+    /**
+     * Emits the group: the start offset save (if capturing), each
+     * completed alternative guarded by a SPLIT and followed by a JMP to
+     * the end, the current list, and the end offset save (if capturing).
+     */
+    protected void writeCode(Output output) {
+      int groupIndex = -1;
+      if (capturing) {
+        groupIndex = ++ output.groupCount;
+        output.add(SAVE_OFFSET);
+        output.add(2 * groupIndex);
+      }
+      int[] jumps = null;
+      if (alternatives != null) {
+        jumps = new int[alternatives.size()];
+        int i = 0;
+        for (Group alternative : alternatives) {
+          output.add(SPLIT);
+          int jump = output.markJump();
+          alternative.writeCode(output);
+          output.add(JMP);
+          jumps[i++] = output.markJump();
+          // the SPLIT's target is the code following this alternative
+          output.setJump(jump);
+        }
+      }
+      for (Expression expr : list) {
+        expr.writeCode(output);
+      }
+      if (jumps != null) {
+        // all alternatives jump here once they are done
+        for (int jump : jumps) {
+          output.setJump(jump);
+        }
+      }
+      if (capturing) {
+        output.add(SAVE_OFFSET);
+        output.add(2 * groupIndex + 1);
+      }
+    }
+
+    /**
+     * Renders the group; parentheses are only written if there are
+     * alternatives or more than one expression.
+     */
+    public String toString() {
+      StringBuilder builder = new StringBuilder();
+      if (alternatives != null || list.size() > 1) {
+        builder.append('(');
+        if (!capturing) {
+          builder.append("?:");
+        }
+      }
+      if (alternatives != null) {
+        for (Group alternative : alternatives) {
+          builder.append(alternative).append('|');
+        }
+      }
+      for (Expression expr : list) {
+        builder.append(expr);
+      }
+      if (alternatives != null || list.size() > 1) {
+        builder.append(')');
+      }
+      return builder.toString();
+    }
+  }
+
+  /**
+   * A lookahead or lookbehind, positive or negative. Its content is
+   * compiled into a separate PikeVM that the main program refers to by
+   * index.
+   */
+  private class Lookaround extends Expression {
+    private final Group group = new Group(false, null);
+    private final boolean forward, negative;
+
+    /**
+     * @param forward {@code true} for lookahead, {@code false} for
+     *   lookbehind
+     * @param negative whether the content must <u>not</u> match
+     */
+    public Lookaround(boolean forward, boolean negative) {
+      this.forward = forward;
+      this.negative = negative;
+    }
+
+    @Override
+    protected void writeCode(Output output) {
+      PikeVM vm = new Output(group).toVM();
+      if (!forward) {
+        vm.reverse();
+      }
+      // NOTE(review): a negative lookbehind emits NEGATIVE_LOOKAHEAD here,
+      // which looks like a copy/paste slip; if PikeVMOpcodes declares a
+      // NEGATIVE_LOOKBEHIND opcode, it presumably belongs here -- confirm.
+      output.add(forward ?
+        (negative ? NEGATIVE_LOOKAHEAD : LOOKAHEAD) :
+        (negative ? NEGATIVE_LOOKAHEAD : LOOKBEHIND));
+      output.add(output.addLookaround(vm));
+    }
+
+    /**
+     * Renders the lookaround with the prefix matching its kind:
+     * {@code (?=}, {@code (?!}, {@code (?<=} or {@code (?<!}.
+     */
+    public String toString() {
+      String inner = group.toString();
+      if (inner.startsWith("(?:")) {
+        // reuse the group's own closing parenthesis
+        inner = inner.substring(3);
+      } else {
+        inner += ")";
+      }
+      return "(?" + (forward ? "" : "<") + (negative ? "!" : "=") + inner;
+    }
+  }
+
+  /**
+   * The root of the expression tree: a capturing group for the whole
+   * pattern, preceded by a preamble used by find().
+   */
+  private class Group0 extends Expression {
+    private final Group group;
+
+    public Group0() {
+      group = new Group(true, null);
+    }
+
+    /**
+     * Emits the preamble, i.e. a non-greedy loop over DOTALL that skips
+     * characters before the pattern, marks its end, then emits the group.
+     */
+    public void writeCode(Output output) {
+      // find() preamble
+      int start = output.offset;
+      output.add(SPLIT_JMP);
+      output.add(start + 5);
+      output.add(DOTALL);
+      output.add(SPLIT);
+      output.add(start + 2);
+      output.markFindPreambleEnd();
+      group.writeCode(output);
+    }
+
+    public String toString() {
+      String inner = group.toString();
+      // NOTE(review): the group is created as capturing, so its string
+      // starts with "(" rather than "(?:"; this condition looks like it can
+      // never hold (and substring(1, ...) would keep the "?:") -- confirm
+      // the intent.
+      return inner.startsWith("(?:") && inner.endsWith(")") ?
+          inner.substring(1, inner.length() - 1) : inner;
+    }
+  }
+
+  private Group0 root;
+  private Stack<Group> groups;
+
+  /**
+   * Creates a compiler whose group stack initially holds only the root
+   * group.
+   */
+  public Compiler() {
+    root = new Group0();
+    groups = new Stack<Group>();
+    groups.add(root.group);
+  }
+
+  /**
+   * Compiles a regular expression.
+   * <p>
+   * The expression is parsed character by character into the expression
+   * tree below the root group, which is then turned into a PikeVM.
+   * </p>
+   *
+   * @param regex the regular expression
+   * @return a TrivialPattern if the VM reports the pattern to be a plain
+   *   string, a RegexPattern otherwise
+   * @throws RuntimeException on parse errors
+   * @throws IllegalArgumentException on unclosed groups or named groups
+   * @throws UnsupportedOperationException on unsupported syntax
+   */
+  public Pattern compile(String regex) {
+    char[] array = regex.toCharArray();
+    CharacterMatcher.Parser characterClassParser =
+      new CharacterMatcher.Parser(array);
+    for (int index = 0; index < array.length; ++ index) {
+      char c = array[index];
+      // the innermost group that is still open
+      Group current = groups.peek();
+      if (regularCharacter.matches(c)) {
+        current.push(c);
+        continue;
+      }
+      switch (c) {
+      case '.':
+        current.push(DOT);
+        continue;
+      case '\\':
+        // first try a single escaped character, ...
+        int unescaped = characterClassParser.parseEscapedCharacter(index + 1);
+        if (unescaped >= 0) {
+          index = characterClassParser.getEndOffset() - 1;
+          current.push((char)unescaped);
+          continue;
+        }
+        // ... then a predefined character class, ...
+        CharacterMatcher characterClass = characterClassParser.parseClass(index);
+        if (characterClass != null) {
+          index = characterClassParser.getEndOffset() - 1;
+          current.push(new CharacterRange(characterClass));
+          continue;
+        }
+        // ... and finally the word boundary assertions
+        switch (array[index + 1]) {
+        case 'b':
+          index++;
+          current.push(WORD_BOUNDARY);
+          continue;
+        case 'B':
+          index++;
+          current.push(NON_WORD_BOUNDARY);
+          continue;
+        }
+        throw new RuntimeException("Parse error @" + index + ": " + regex);
+      case '?':
+      case '*':
+      case '+': {
+        // a trailing '?' makes the quantifier non-greedy
+        boolean greedy = true;
+        if (index + 1 < array.length && array[index + 1] == '?') {
+          greedy = false;
+          ++ index;
+        }
+        // the quantifier applies to the most recently pushed expression
+        current.push(new Repeat(current.pop(),
+          c == '+' ? 1 : 0, c == '?' ? 1 : -1, greedy));
+        continue;
+      }
+      case '{': {
+        // {min}, {min,} or {min,max}
+        ++ index;
+        int length = characterClassParser.digits(index, 8, 10);
+        int min = Integer.parseInt(regex.substring(index, index + length));
+        int max = min;
+        index += length - 1;
+        c = index + 1 < array.length ? array[index + 1] : 0;
+        if (c == ',') {
+          ++ index;
+          length = characterClassParser.digits(index + 1, 8, 10);
+          // no digits after the comma means unbounded
+          max = length == 0 ? -1 :
+            Integer.parseInt(regex.substring(index + 1, index + 1 + length));
+          index += length;
+          c = index + 1< array.length ? array[index + 1] : 0;
+        }
+        if (c != '}') {
+          throw new RuntimeException("Invalid quantifier @" + index + ": "
+              + regex);
+        }
+        ++ index;
+        boolean greedy = true;
+        if (index + 1 < array.length && array[index + 1] == '?') {
+          ++ index;
+          greedy = false;
+        }
+        current.push(new Repeat(current.pop(), min, max, greedy));
+        continue;
+      }
+      case '(': {
+        boolean capturing = true;
+        if (index + 1 < array.length && array[index + 1] == '?') {
+          // "(?" introduces a non-capturing group or a lookaround
+          index += 2;
+          if (index >= array.length) {
+            throw new RuntimeException("Short pattern @" + index + ": "
+              + regex);
+          }
+          c = array[index];
+          boolean lookAhead = true;
+          if (c == '<') {
+            // "(?<" must be followed by '=' or '!' (lookbehind)
+            if (++ index >= array.length) {
+              throw new RuntimeException("Short pattern @" + index + ": "
+                + regex);
+            }
+            lookAhead = false;
+            c = array[index];
+            if (c != '=' && c != '!') {
+              throw new IllegalArgumentException("Named groups not supported @"
+                + index + ": " + regex);
+            }
+          }
+          switch (c) {
+          case ':':
+            capturing = false;
+            break;
+          case '!':
+          case '=': {
+            capturing = false;
+            // the lookaround's inner group collects what follows until ')'
+            Lookaround lookaround = new Lookaround(lookAhead, c == '!');
+            current.push(lookaround);
+            groups.push(lookaround.group);
+            continue;
+          }
+          default:
+            throw new UnsupportedOperationException("Not yet supported: "
+              + regex.substring(index));
+          }
+        }
+        // the new group is both part of the current one and the new top
+        current.push(groups.push(new Group(capturing, null)));
+        continue;
+      }
+      case ')':
+        // the root group must never be closed
+        if (groups.size() < 2) {
+          throw new RuntimeException("Invalid group close @" + index + ": "
+            + regex);
+        }
+        groups.pop();
+        continue;
+      case '[': {
+        CharacterMatcher matcher = characterClassParser.parseClass(index);
+        if (matcher == null) {
+          throw new RuntimeException("Invalid range @" + index + ": " + regex);
+        }
+        current.push(new CharacterRange(matcher));
+        index = characterClassParser.getEndOffset() - 1;
+        continue;
+      }
+      case '|':
+        current.startAlternative();
+        continue;
+      case '^':
+        current.push(LINE_START);
+        continue;
+      case '$':
+        current.push(LINE_END);
+        continue;
+      default:
+        throw new RuntimeException("Parse error @" + index + ": " + regex);
+      }
+    }
+    if (groups.size() != 1) {
+      throw new IllegalArgumentException("Unclosed groups: ("
+        + (groups.size() - 1) + "): " + regex);
+    }
+    PikeVM vm = new Output(root).toVM();
+    String plain = vm.isPlainString();
+    if (plain != null) {
+      return new TrivialPattern(regex, plain, 0);
+    }
+    return new RegexPattern(regex, 0, vm);
+  }
+}
--- a/classpath/java/util/regex/Matcher.java
+++ b/classpath/java/util/regex/Matcher.java
@ -15,27 +15,23 @@ package java.util.regex;
 * 
 * @author zsombor and others
 */
-public class Matcher {
-  private final Pattern pattern;
-  private CharSequence input;
-  private int start;
-  private int end;
+public abstract class Matcher {
+  protected CharSequence input;
+  protected int start;
+  protected int end;

-  Matcher(Pattern pattern, CharSequence input) {
-    this.pattern = pattern;
-    this.input = input;
+  public Matcher(CharSequence input) {
+    reset(input);
  }

-  public boolean matches() {
-    if (pattern.pattern().equals(input.toString())) {
-      start = 0;
-      end = input.length();
-      return true;
-    } else {
-      return false;
-    }
+  public abstract boolean matches();
+
+  public boolean find() {
+    return find(end);
  }

+  public abstract boolean find(int start);
+
  public Matcher reset() {
    return reset(input);
  }
@ -47,10 +43,6 @@ public class Matcher {
    return this;
  }

-  public int start() {
-    return start;
-  }
-
  public String replaceAll(String replacement) {
    return replace(replacement, Integer.MAX_VALUE);
  }
@ -59,7 +51,7 @@ public class Matcher {
    return replace(replacement, 1);
  }

-  private String replace(String replacement, int limit) {
+  protected String replace(String replacement, int limit) {
    reset();

    StringBuilder sb = null;
@ -88,23 +80,40 @@ public class Matcher {
    return sb.toString();
  }

+  public int start() {
+    return start;
+  }
+
  public int end() {
    return end;
  }

-  public boolean find() {
-    return find(end);
+  public String group() {
+    return input.subSequence(start, end).toString();
  }

-  public boolean find(int start) {
-    String p = pattern.pattern();
-    int i = Pattern.indexOf(input, p, start);
-    if (i >= 0) {
-      this.start = i;
-      this.end = i + p.length();
-      return true;
-    } else {
-      return false;
+  public int start(int group) {
+    if (group == 0) {
+      return start();
    }
+    throw new UnsupportedOperationException();
+  }
+
+  public int end(int group) {
+    if (group == 0) {
+      return end();
+    }
+    throw new UnsupportedOperationException();
+  }
+
+  public String group(int group) {
+    if (group == 0) {
+      return group();
+    }
+    throw new UnsupportedOperationException();
+  }
+
+  public int groupCount() {
+    return 0;
  }
 }
--- a/classpath/java/util/regex/Pattern.java
+++ b/classpath/java/util/regex/Pattern.java
@ -10,9 +10,8 @@

 package java.util.regex;

-import java.util.Iterator;
+import java.util.ArrayList;
 import java.util.List;
-import java.util.LinkedList;

 /**
 * This is a work in progress.
@ -20,7 +19,7 @@ import java.util.LinkedList;
 * @author zsombor and others
 * 
 */
-public class Pattern {
+public abstract class Pattern implements PikeVMOpcodes {

  public static final int UNIX_LINES       = 1;
  public static final int CASE_INSENSITIVE = 2;
@ -35,112 +34,26 @@ public class Pattern {
  private final String pattern;

  protected Pattern(String pattern, int flags) {
-    this.pattern = trivial(pattern);
+    this.pattern = pattern;
    this.patternFlags = flags;
  }

-  private static String trivial(String pattern) {
-    StringBuffer buffer = new StringBuffer();
-    for (int i = 0; i < pattern.length(); ++i) {
-      char c = pattern.charAt(i);
-      switch (c) {
-      case '\\':
-        if (++i == pattern.length()) {
-          break;
-        }
-        c = pattern.charAt(i);
-        if (c == '0') {
-          int len = digits(pattern, ++i, 3, 8);
-          if (len == 3 && pattern.charAt(i) > '3') {
-            --len;
-          }
-          c = (char)Integer.parseInt(pattern.substring(i, i + len), 8);
-          i += len - 1;
-        } else if (c == 'x' || c == 'u') {
-          int len = digits(pattern, ++i, 4, 16);
-          c = (char)Integer.parseInt(pattern.substring(i, i + len), 16);
-          i += len - 1;
-        } else {
-          c = unescape(pattern.charAt(i));
-        }
-        if (c != -1) {
-          break;
-        }
-        // fallthru
-      case '.':
-      case '*':
-      case '+':
-      case '?':
-      case '|':
-      case '[':
-      case ']':
-      case '{':
-      case '}':
-      case '(':
-      case ')':
-      case '^':
-      case '$':
-        throw new UnsupportedOperationException
-          ("only trivial regular expressions are supported so far (" + pattern + ")");
-      }
-      buffer.append(c);
-    }
-    return buffer.toString();
-  }
-
-  private static int digits(String s, int offset, int maxLength, int base) {
-    for (int i = 0; ; ++i) {
-      if (i == maxLength || offset + i >= s.length()) {
-        return i;
-      }
-      int value = s.charAt(offset + i) - '0';
-      if (value < 0) {
-        return i;
-      }
-      if (base > 10 && value >= 10) {
-        value += 10 - (value >= 'a' - '0' ? 'a' - '0' : 'A' - '0');
-      }
-      if (value >= base) {
-        return i;
-      }
-    }
-  }
-
-  private static char unescape(char c) {
-    switch (c) {
-    case '\\':
-       return c;
-    case 'a':
-       return 0x0007;
-    case 'e':
-       return 0x001B;
-    case 'f':
-       return 0x000C;
-    case 'n':
-       return 0x000A;
-    case 'r':
-       return 0x000D;
-    case 't':
-       return 0x0009;
-    }
-    return (char)-1;
-  }
-
  public static Pattern compile(String regex) {
-    return new Pattern(regex, 0);
+    return compile(regex, 0);
  }

  public static Pattern compile(String regex, int flags) {
-    return new Pattern(regex, flags);
+    if (flags != 0) {
+      throw new UnsupportedOperationException("TODO");
+    }
+    return new Compiler().compile(regex);
  }

  public int flags() {
    return patternFlags;
  }

-  public Matcher matcher(CharSequence input) {
-    return new Matcher(this, input);
-  }
+  public abstract Matcher matcher(CharSequence input);

  public static boolean matches(String regex, CharSequence input) {
    return Pattern.compile(regex).matcher(input).matches();
@ -155,79 +68,22 @@ public class Pattern {
  }

  public String[] split(CharSequence input, int limit) {
-    boolean strip;
-    if (limit < 0) {
-      strip = false;
+    // Splits input around matches of this pattern. A positive limit caps the
+    // number of returned pieces, a negative limit means "no cap", and a limit
+    // of zero means "no cap, but drop trailing empty strings". Remember the
+    // latter before the limit is normalized below.
+    boolean stripTrailing = limit == 0;
+    if (limit <= 0) {
       limit = Integer.MAX_VALUE;
-    } else if (limit == 0) {
-      strip = true;
-      limit = Integer.MAX_VALUE;
-    } else {
-      strip = false;
     }
-
-    List<CharSequence> list = new LinkedList();
-    int index = 0;
-    int trailing = 0;
-    int patternLength = pattern.length();
-    while (index < input.length() && list.size() < limit - 1) {
-      int i;
-      if (patternLength == 0) {
-        if (list.size() == 0) {
-          i = 0;
-        } else {
-          i = index + 1;
-        }
-      } else {
-        i = indexOf(input, pattern, index);
-      }
-
-      if (i >= 0) {
-        if (patternLength != 0 && i == index) {
-          ++ trailing;
-        } else {
-          trailing = 0;
-        }
-
-        list.add(input.subSequence(index, i));
-        index = i + patternLength;
-      } else {
+    Matcher matcher = matcher(input);
+    List<String> result = new ArrayList<String>();
+    int offset = 0;
+    for (;;) {
+      // The pattern is applied at most limit - 1 times: the remainder of the
+      // input added after the loop is the last of the (at most) limit pieces.
+      if (result.size() >= limit - 1 || !matcher.find()) {
         break;
       }
+      result.add(input.subSequence(offset, matcher.start()).toString());
+      offset = matcher.end();
     }
-
-    if (strip && index > 0 && index == input.length()) {
-      ++ trailing;
-    } else {
-      trailing = 0;
+    if (result.size() == 0) {
+      // no delimiter was consumed: the whole input is the only piece
+      return new String[] { input.toString() };
     }
-    list.add(input.subSequence(index, input.length()));
-
-    String[] result = new String[list.size() - trailing];
-    int i = 0;
-    for (Iterator<CharSequence> it = list.iterator();
-         it.hasNext() && i < result.length; ++ i)
-    {
-      result[i] = it.next().toString();
-    }
-    return result;
-  }
-
-  static int indexOf(CharSequence haystack, CharSequence needle, int start) {
-    if (needle.length() == 0) return start;
-
-    for (int i = start; i < haystack.length() - needle.length() + 1; ++i) {
-      int j = 0;
-      for (; j < needle.length(); ++j) {
-        if (haystack.charAt(i + j) != needle.charAt(j)) {
-          break;
-        }
-      }
-      if (j == needle.length()) {
-        return i;
-      }
-    }
-
-    return -1;
+    // Whatever follows the last delimiter is a piece of its own, even if it is
+    // empty; empty trailing pieces are only discarded for a limit of zero.
+    result.add(input.subSequence(offset, input.length()).toString());
+    if (stripTrailing) {
+      while (result.size() > 0
+          && result.get(result.size() - 1).length() == 0) {
+        result.remove(result.size() - 1);
+      }
+    }
+    return result.toArray(new String[result.size()]);
   }
 }
--- a/classpath/java/util/regex/PikeVM.java
+++ b/classpath/java/util/regex/PikeVM.java
@ -0,0 +1,629 @@
+/* Copyright (c) 2008-2013, Avian Contributors
+
+   Permission to use, copy, modify, and/or distribute this software
+   for any purpose with or without fee is hereby granted, provided
+   that the above copyright notice and this permission notice appear
+   in all copies.
+
+   There is NO WARRANTY for this software.  See license.txt for
+   details. */
+
+package java.util.regex;
+
+/**
+ * A minimal implementation of a regular expression engine.
+ * 
+ * @author Johannes Schindelin
+ */
+class PikeVM implements PikeVMOpcodes {
+  private final int[] program;
+  private final int groupCount;
+  private final int offsetsCount;
+  /*
+   * For find(), we do not want to anchor the match at the start offset. Our
+   * compiler allows this by prefixing the code with an implicit '(?:.*?)'. For
+   * regular matches() calls, we want to skip that code and start at {@code
+   * findPrefixLength} instead.
+   */
+  private final int findPrefixLength;
+  private final CharacterMatcher[] classes;
+  private final PikeVM[] lookarounds;
+  private final static CharacterMatcher wordCharacter =
+    CharacterMatcher.parse("\\w");
+  private final static CharacterMatcher lineTerminator =
+    CharacterMatcher.parse("[\n\r\u0085\u2028\u2029]");
+  private boolean multiLine;
+
+  public interface Result {
+    void set(int[] start, int[] end);
+  }
+
+  protected PikeVM(int[] program, int findPrefixLength, int groupCount,
+    CharacterMatcher[] classes, PikeVM[] lookarounds)
+  {
+    this.program = program;
+    this.findPrefixLength = findPrefixLength;
+    this.groupCount = groupCount;
+    offsetsCount = 2 * groupCount + 2;
+    this.classes = classes;
+    this.lookarounds = lookarounds;
+  }
+
+  /**
+   * The current thread states.
+   * <p>
+   * The threads are identified by their program counter. The rationale: as all
+   * threads are executed in lock-step, i.e. for the same character in the
+   * string to be matched, it does not make sense for two threads to be at the
+   * same program counter -- they would both do exactly the same for the rest of
+   * the execution.
+   * </p>
+   * <p>
+   * For efficiency, the threads are kept in a linked list that actually lives
+   * in an array indexed by the program counter, pointing to the next thread's
+   * program counter, in the order of high to low priority.
+   * </p>
+   * <p>
+   * Program counters which have no associated thread have a {@code next}
+   * entry of 0 (entries store 1 + the following program counter). The
+   * least-priority thread (the last one in the linked list) has no successor
+   * either; it is told apart from unscheduled threads via {@code tail}.
+   * </p>
+   * <p>
+   * We actually never need to have an explicit value for the priority, the
+   * ordering is sufficient: whenever a new thread is to be scheduled and it is
+   * found to be scheduled already, it was already scheduled by a
+   * higher-priority thread.
+   * </p>
+   */
+  private class ThreadQueue {
+    private int head, tail;
+    // next[pc] is 1 + the next thread's pc
+    private int[] next;
+    // offsets[pc][2 * group] is 1 + start offset
+    private int[][] offsets;
+
+    public ThreadQueue() {
+      head = tail = -1;
+      next = new int[program.length + 1];
+      offsets = new int[program.length + 1][];
+    }
+
+    public ThreadQueue(int startPC) {
+      head = tail = startPC;
+      next = new int[program.length + 1];
+      offsets = new int[program.length + 1][];
+      offsets[head] = new int[offsetsCount];
+    }
+
+    public int queueOneImmediately(ThreadQueue into) {
+      for (;;) {
+        if (head < 0) {
+          return -1;
+        }
+        boolean wasQueued = queueNext(head, head, into);
+        int pc = head;
+        if (head == tail) {
+          head = tail = -1;
+        } else {
+          head = next[pc] - 1;
+          next[pc] = 0;
+        }
+        offsets[pc] = null;
+        if (wasQueued) {
+          into.tail = pc;
+          return pc;
+        }
+      }
+    }
+
+    /**
+     * Schedules the instruction at {@code nextPC} to be executed immediately.
+     * <p>
+     * For non-matching steps (SPLIT, SAVE_STATE, etc) we need to schedule the
+     * corresponding program counter(s) to be handled right after this opcode,
+     * before advancing to the next character.
+     * </p>
+     * <p>
+     * To achieve this, we insert the program counter to-be-scheduled in the
+     * linked thread list at the current position, but only if it has not been
+     * scheduled yet: if it has, a higher-priority thread already reached that
+     * state.
+     * </p>
+     * <p>
+     * In contrast to {@link #queueNext(int, int, ThreadQueue)}, this method
+     * works on the current step's thread list.
+     * </p>
+     * 
+     * @param currentPC
+     *          the current program counter
+     * @param nextPC
+     *          the program counter to schedule
+     * @param copyThreadState
+     *          whether to spawn off a new thread
+     * @return whether the step was queued (i.e. no thread was queued for the
+     *         same {@code nextPC} already)
+     */
+    public boolean queueImmediately(int currentPC, int nextPC,
+        boolean copyThreadState) {
+      if (isScheduled(nextPC)) {
+        return false;
+      }
+      int[] offsets = this.offsets[currentPC];
+      if (copyThreadState) {
+        offsets = java.util.Arrays.copyOf(offsets, offsetsCount);
+      }
+      if (currentPC == tail) {
+        tail = nextPC;
+      } else {
+        next[nextPC] = next[currentPC];
+      }
+      this.offsets[nextPC] = offsets;
+      next[currentPC] = nextPC + 1;
+      return true;
+    }
+
+    /**
+     * Schedules the instruction at {@code nextPC} to be executed in the next
+     * step.
+     * <p>
+     * This method advances the current thread to the next program counter, to
+     * be executed after reading the next character.
+     * </p>
+     * 
+     * @param currentPC
+     *          the current program counter
+     * @param nextPC
+     *          the program counter to schedule
+     * @param next
+     *          the thread state of the next step
+     * @return whether the step was queued (i.e. no thread was queued for the
+     *         same {@code nextPC} already)
+     */
+    private boolean queueNext(int currentPC, int nextPC, ThreadQueue next) {
+      if (next.tail < 0) {
+        next.head = nextPC;
+      } else if (next.isScheduled(nextPC)) {
+        return false;
+      } else {
+        next.next[next.tail] = nextPC + 1;
+      }
+      next.offsets[nextPC] = offsets[currentPC];
+      next.tail = nextPC;
+      return true;
+    }
+
+    public void saveOffset(int pc, int index, int offset) {
+      offsets[pc][index] = offset + 1;
+    }
+
+    public void setResult(Result result) {
+      // copy offsets
+      int[] offsets = this.offsets[program.length];
+      int[] groupStart = new int[groupCount + 1];
+      int[] groupEnd = new int[groupCount + 1];
+      for (int j = 0; j <= groupCount; ++j) {
+        groupStart[j] = offsets[2 * j] - 1;
+        groupEnd[j] = offsets[2 * j + 1] - 1;
+      }
+      result.set(groupStart, groupEnd);
+    }
+
+    private void mustStartMatchAt(int start) {
+      int previous = -1;
+      for (int pc = head; pc >= 0; ) {
+        int nextPC = next[pc] - 1;
+        if (start + 1 == offsets[pc][0]) {
+          previous = pc;
+        } else {
+          next[pc] = 0;
+          offsets[pc] = null;
+          if (pc == tail) {
+            head = tail = -1;
+          } else if (previous < 0) {
+            head = nextPC;
+          } else {
+            next[previous] = 1 + nextPC;
+          }
+        }
+        pc = nextPC;
+      }
+    }
+
+    private int startOffset(int pc) {
+      return offsets[pc][0] - 1;
+    }
+
+    public boolean isEmpty() {
+      return head < 0;
+    }
+
+    public boolean isScheduled(int pc) {
+      return pc == tail || next[pc] > 0;
+    }
+
+    public int next(int pc) {
+      return pc < 0 ? head : next[pc] - 1;
+    }
+
+    public void clean() {
+      for (int pc = head; pc >= 0; ) {
+        int nextPC = next[pc] - 1;
+        next[pc] = 0;
+        offsets[pc] = null;
+        pc = nextPC;
+      }
+      head = tail = -1;
+    }
+  }
+
+  /**
+   * Executes the Pike VM defined by the program.
+   * <p>
+   * The idea is to execute threads in parallel, at each step executing them
+   * from the highest priority thread to the lowest one. In contrast to most
+   * regular expression engines, the Thompson/Pike one gets away with linear
+   * complexity because the string is matched from left to right, at each step
+   * executing a number of threads bounded by the length of the program: if two
+   * threads would execute at the same instruction pointer of the program, we
+   * need only consider the higher-priority one.
+   * </p>
+   * <p>
+   * This implementation is based on the description of <a
+   * href="http://swtch.com/%7Ersc/regexp/regexp2.html">Russ Cox</a>.
+   * </p>
+   * 
+   * @param characters
+   *          the characters to match against
+   * @param start
+   *          the start offset where to match
+   * @param end
+   *          the end offset; unless it is greater than {@code start}, the
+   *          input is traversed backwards (towards lower indices)
+   * @param anchorStart
+   *          whether the match must start at {@code start}
+   * @param anchorEnd
+   *          whether the match must end at {@code end}
+   * @param result
+   *          the {@link Result} to store the groups' offsets in, if
+   *          successful; may be {@code null} if only the verdict is of interest
+   * @return whether a match was found
+   */
+  public boolean matches(char[] characters, int start, int end,
+      boolean anchorStart, boolean anchorEnd, Result result)
+  {
+    // "current" holds the threads of the step being executed, "next" collects
+    // the threads for the following character, and "queued" holds the threads
+    // that were scheduled by the previous step and are not yet running
+    ThreadQueue current = new ThreadQueue();
+    ThreadQueue next = new ThreadQueue();
+
+    // initialize the first thread
+    // (anchored matches skip the find preamble at the start of the program)
+    int startPC = anchorStart ? findPrefixLength : 0;
+    ThreadQueue queued = new ThreadQueue(startPC);
+
+    boolean foundMatch = false;
+    // the look-behind opcodes below call this method with end < start
+    int step = end > start ? +1 : -1;
+    // one extra iteration at i == end lets threads reach the match state after
+    // the last character was consumed
+    for (int i = start; i != end + step; i += step) {
+      if (queued.isEmpty()) {
+        // no threads left
+        return foundMatch;
+      }
+
+      // at i == end there is no character left; NUL is used as a stand-in
+      char c = i != end ? characters[i] : 0;
+      int pc = -1;
+      for (;;) {
+        // take the next thread of this step; once those are exhausted, pull in
+        // the next thread queued by the previous step
+        pc = current.next(pc);
+        if (pc < 0) {
+          pc = queued.queueOneImmediately(current);
+        }
+        if (pc < 0) {
+          break;
+        }
+
+        // pc == program.length is a match!
+        if (pc == program.length) {
+          if (anchorEnd && i != end) {
+            // matched too early; let the remaining threads continue
+            continue;
+          }
+          if (result == null) {
+            // only interested in a match, no need to go on
+            return true;
+          }
+          current.setResult(result);
+
+          // now that we found a match, even higher-priority matches must match
+          // at the same start offset
+          if (!anchorStart) {
+            next.mustStartMatchAt(current.startOffset(pc));
+          }
+          foundMatch = true;
+          // all remaining threads of this step have lower priority: drop them
+          break;
+        }
+
+        int opcode = program[pc];
+        switch (opcode) {
+        /* consuming opcodes: the thread continues with the next character */
+        case DOT:
+          if (c != '\0' && c != '\r' && c != '\n') {
+            current.queueNext(pc, pc + 1, next);
+          }
+          break;
+        case DOTALL:
+          current.queueNext(pc, pc + 1, next);
+          break;
+        /* zero-width assertions: the thread continues within the same step */
+        case WORD_BOUNDARY:
+        case NON_WORD_BOUNDARY: {
+          // c2 is the character visited just before the current one, or -1 if
+          // there is none
+          int i2 = i - step;
+          int c2 = i2 < 0 || i2 >= characters.length ? -1 : characters[i2];
+          // NOTE(review): because of the "i >= 0 && i < characters.length"
+          // guards below, a word character directly followed by the end of the
+          // input (or preceded by index -1 when matching backwards) is not
+          // reported as a word boundary, and two non-word "sides" at the input
+          // edge are not reported as a non-boundary. java.util.regex treats
+          // the input edges as non-word characters -- confirm whether the
+          // guards are intended.
+          switch (opcode) {
+          case WORD_BOUNDARY:
+            if ((c2 < 0 || !wordCharacter.matches((char)c2))) {
+              if (wordCharacter.matches(c)) {
+                current.queueImmediately(pc, pc + 1, false);
+              }
+            } else if (i >= 0 && i < characters.length &&
+                !wordCharacter.matches(c)) {
+              current.queueImmediately(pc, pc + 1, false);
+            }
+            break;
+          case NON_WORD_BOUNDARY:
+            if ((c2 < 0 || !wordCharacter.matches((char)c2))) {
+              if (i >= 0 && i < characters.length &&
+                  !wordCharacter.matches(c)) {
+                current.queueImmediately(pc, pc + 1, false);
+              }
+            } else if (wordCharacter.matches(c)) {
+              current.queueImmediately(pc, pc + 1, false);
+            }
+            break;
+          }
+          break;
+        }
+        // multiLine is never assigned in this class, so for now the two line
+        // anchors only match at the very start and end of the input
+        case LINE_START:
+          if (i == 0 || (multiLine &&
+              lineTerminator.matches(characters[i - 1]))) {
+            current.queueImmediately(pc, pc + 1, false);
+          }
+          break;
+        case LINE_END:
+          if (i == characters.length || (multiLine &&
+              lineTerminator.matches(c))) {
+            current.queueImmediately(pc, pc + 1, false);
+          }
+          break;
+        case CHARACTER_CLASS:
+          // the operand is an index into the classes array
+          if (classes[program[pc + 1]].matches(c)) {
+            current.queueNext(pc, pc + 2, next);
+          }
+          break;
+        // look-arounds run a nested VM (the operand indexes the lookarounds
+        // array) anchored at the current offset, without capturing anything;
+        // look-aheads scan forward to the end of the input, look-behinds scan
+        // backwards from the previous character down to index -1
+        case LOOKAHEAD:
+          if (lookarounds[program[pc + 1]].matches(characters,
+              i, characters.length, true, false, null)) {
+            current.queueImmediately(pc, pc + 2, false);
+          }
+          break;
+        case LOOKBEHIND:
+          if (lookarounds[program[pc + 1]].matches(characters,
+              i - 1, -1, true, false, null)) {
+            current.queueImmediately(pc, pc + 2, false);
+          }
+          break;
+        case NEGATIVE_LOOKAHEAD:
+          if (!lookarounds[program[pc + 1]].matches(characters,
+              i, characters.length, true, false, null)) {
+            current.queueImmediately(pc, pc + 2, false);
+          }
+          break;
+        case NEGATIVE_LOOKBEHIND:
+          if (!lookarounds[program[pc + 1]].matches(characters,
+              i - 1, -1, true, false, null)) {
+            current.queueImmediately(pc, pc + 2, false);
+          }
+          break;
+        /* immediate opcodes, i.e. thread continues within the same step */
+        case SAVE_OFFSET:
+          // offsets are only tracked when the caller wants the groups
+          if (result != null) {
+            int index = program[pc + 1];
+            current.saveOffset(pc, index, i);
+          }
+          current.queueImmediately(pc, pc + 2, false);
+          break;
+        // queueImmediately() inserts directly behind the current thread, so of
+        // the two branches the one queued last is executed first; the branch
+        // queued first gets its own copy of the thread state
+        case SPLIT:
+          current.queueImmediately(pc, program[pc + 1], true);
+          current.queueImmediately(pc, pc + 2, false);
+          break;
+        case SPLIT_JMP:
+          current.queueImmediately(pc, pc + 2, true);
+          current.queueImmediately(pc, program[pc + 1], false);
+          break;
+        case JMP:
+          current.queueImmediately(pc, program[pc + 1], false);
+          break;
+        default:
+          // non-negative values are literal characters to be matched exactly
+          if (program[pc] >= 0 && program[pc] <= 0xffff) {
+            if (c == (char)program[pc]) {
+              current.queueNext(pc, pc + 1, next);
+            }
+            break;
+          }
+          throw new RuntimeException("Invalid opcode: " + opcode
+            + " at pc " + pc);
+        }
+      }
+      // clean linked thread list (and states)
+      current.clean();
+
+      // prepare for next step
+      // (the drained "queued" list is recycled as the new "next" list)
+      ThreadQueue swap = queued;
+      queued = next;
+      next = swap;
+    }
+    return foundMatch;
+  }
+
+  /**
+   * Extracts the literal text of a program that consists of nothing but
+   * literal characters.
+   * <p>
+   * Such a pattern does not need the Pike VM at all: the caller can match the
+   * returned string with the much faster {@link TrivialPattern} instead.
+   * </p>
+   * 
+   * @return the plain string recognized by this machine, or null if the
+   *         program contains any opcode besides the surrounding bookkeeping
+   */
+  public String isPlainString() {
+    // skip the find preamble and, if present, the leading SAVE_OFFSET 0
+    int from = findPrefixLength;
+    if (from + 1 < program.length
+        && program[from] == SAVE_OFFSET && program[from + 1] == 0) {
+      from += 2;
+    }
+    // likewise ignore a trailing SAVE_OFFSET 1
+    int to = program.length;
+    if (to > from + 1
+        && program[to - 2] == SAVE_OFFSET && program[to - 1] == 1) {
+      to -= 2;
+    }
+    // negative values are opcodes; only literal characters may remain
+    char[] literal = new char[to - from];
+    for (int pc = from; pc < to; ++pc) {
+      int code = program[pc];
+      if (code < 0) {
+        return null;
+      }
+      literal[pc - from] = (char)code;
+    }
+    return new String(literal);
+  }
+
+  // Number of program slots an instruction occupies: two for the opcodes that
+  // carry an operand (the SINGLE_ARG_END..SINGLE_ARG_START range), else one.
+  private static int length(int opcode) {
+    boolean hasOperand =
+      SINGLE_ARG_END <= opcode && opcode <= SINGLE_ARG_START;
+    return hasOperand ? 2 : 1;
+  }
+
+  // Whether the opcode transfers control: SPLIT, SPLIT_JMP and JMP occupy the
+  // contiguous range JMP..SPLIT.
+  private static boolean isJump(int opcode) {
+    return JMP <= opcode && opcode <= SPLIT;
+  }
+
+  /**
+   * Reverses the program (effectively matching the reverse pattern).
+   * <p>
+   * It is a well-known fact that any regular expression can be reordered
+   * trivially into an equivalent regular expression to be applied in backward
+   * direction (coming in real handy for look-behind expressions).
+   * </p>
+   * <p>
+   * Example: instead of matching the sequence "aaaabb" with the pattern "a+b+",
+   * we can match the reverse sequence "bbaaaa" with the pattern "b+a+".
+   * </p>
+   * <p>
+   * One caveat: while the reverse pattern is equivalent in the sense that it
+   * matches if, and only if, the original pattern matches the forward
+   * direction, the same is not true for submatches. Consider the input "a" and
+   * the pattern "(a?)a?": when matching in forward direction the captured group
+   * is "a", while the backward direction will yield the empty string. For that
+   * reason, Java dictates that capturing groups in look-behind patterns are
+   * ignored.
+   * </p>
+   */
+  public void reverse() {
+    reverse(findPrefixLength, program.length);
+  }
+
+  /**
+   * Reverses a specific part of the program (to match in reverse direction).
+   * <p>
+   * This is the work-horse of {@link #reverse()}.
+   * </p>
+   * <p>
+   * To visualize the process of reversing a program, let's look at it as a
+   * directed graph (each jump is represented by an "<tt>X</tt>",
+   * non-jumping steps are represented by "<tt>o</tt>"s, arrows show the
+   * direction of the flow, <code>SPLIT</code>s spawn two arrows):
+   * 
+   * <pre>
+   * o -> X -> X -> o -> X    o -> o
+   * ^    |     \         \___^____^
+   *  \__/       \____________|
+   * </pre>
+   * 
+   * The concept of reversing the program is most easily explained as
+   * follows: if we insert auxiliary nodes "<tt>Y</tt>" for jump targets, the
+   * graph looks like this instead:
+   * 
+   * <pre>
+   * Y -> o -> X -> X -> o -> X    Y -> o -> Y -> o
+   * ^         |     \         \___^_________^
+   *  \_______/       \____________|
+   * </pre>
+   * 
+   * It is now obvious that reversing the program is equivalent to reversing all
+   * arrows, simply deleting all <tt>X</tt>s and substituting each <tt>Y</tt>
+   * with a jump. Note that the reverse program will have the same number of
+   * <tt>JMP</tt>, but they will not be associated with the same arrows!:
+   * 
+   * <pre>
+   * X <- o <- o    X <- o <- X <- o
+   * |    ^    ^____|________/
+   *  \__/ \_______/
+   * </pre>
+   * 
+   * </p>
+   * @param start
+   *          start reversing the program with this instruction
+   * @param end
+   *          stop reversing at this instruction (this must be either an index
+   *          aligned exactly with an instruction, or exactly
+   *          {@code program.length}.
+   */
+  private void reverse(int start, int end) {
+    // Pass 1: build the list of jump targets
+    int[] newJumps = new int[end + 1];
+    boolean[] brokenArrows = new boolean[end + 1];
+    for (int pc = start; pc < end; pc += length(program[pc])) {
+      if (isJump(program[pc])) {
+        int target = program[pc + 1];
+        newJumps[pc + 1] = newJumps[target];
+        newJumps[target] = pc + 1;
+        if (program[pc] == JMP) {
+          brokenArrows[pc + 2] = true;
+        }
+      }
+    }
+
+    // Pass 2: determine mapped program counters
+    int[] mapping = new int[end];
+    for (int pc = start, mappedPC = end; mappedPC > 0
+        && pc < end; pc += length(program[pc])) {
+      for (int jump = newJumps[pc]; jump > 0; jump = newJumps[jump]) {
+        mappedPC -= 2;
+      }
+      if (!isJump(program[pc])) {
+        mappedPC -= length(program[pc]);
+      }
+      mapping[pc] = mappedPC;
+    }
+
+    // Pass 3: write the new program
+    int[] reverse =  new int[end];
+    for (int pc = start, mappedPC = end; mappedPC > 0;
+        pc += length(program[pc])) {
+      boolean brokenArrow = brokenArrows[pc];
+      for (int jump = newJumps[pc]; jump > 0; jump = newJumps[jump]) {
+        reverse[--mappedPC] = mapping[jump - 1];
+        if (brokenArrow) {
+          reverse[--mappedPC] = JMP;
+          brokenArrow = false;
+        } else {
+          reverse[--mappedPC] =
+              program[jump - 1] == SPLIT_JMP ? SPLIT_JMP : SPLIT;
+        }
+      }
+      if (pc == end) {
+        break;
+      }
+      if (!isJump(program[pc])) {
+        for (int i = length(program[pc]); i-- > 0; ) {
+          reverse[--mappedPC] = program[pc + i];
+        }
+      }
+    }
+    System.arraycopy(reverse, start, program, start, end - start);
+  }
+}
--- a/classpath/java/util/regex/PikeVMOpcodes.java
+++ b/classpath/java/util/regex/PikeVMOpcodes.java
@ -0,0 +1,45 @@
+/* Copyright (c) 2008-2013, Avian Contributors
+
+   Permission to use, copy, modify, and/or distribute this software
+   for any purpose with or without fee is hereby granted, provided
+   that the above copyright notice and this permission notice appear
+   in all copies.
+
+   There is NO WARRANTY for this software.  See license.txt for
+   details. */
+
+package java.util.regex;
+
+/**
+ * Opcodes for the Pike VM.
+ * <p>
+ * See {@link PikeVM}.
+ * </p>
+ * 
+ * @author Johannes Schindelin
+ */
+interface PikeVMOpcodes {
+  // All opcodes are negative; PikeVM treats program values in the range
+  // 0..0xffff as literal characters to be matched exactly.
+
+  // DOT consumes any character except NUL, '\r' and '\n'; DOTALL consumes
+  // any character
+  final static int DOT = -1;
+  final static int DOTALL = -2;
+
+  // zero-width assertions without operand
+  final static int WORD_BOUNDARY = -10;
+  final static int NON_WORD_BOUNDARY = -11;
+  final static int LINE_START = -12;
+  final static int LINE_END = -13;
+
+  // consumes one character; the operand is an index into the VM's array of
+  // character classes
+  final static int CHARACTER_CLASS = -20;
+
+  // zero-width assertions; the operand is an index into the VM's array of
+  // look-around programs
+  final static int LOOKAHEAD = -30;
+  final static int LOOKBEHIND = -31;
+  final static int NEGATIVE_LOOKAHEAD = -32;
+  final static int NEGATIVE_LOOKBEHIND = -33;
+
+  // records the current offset; the operand is the index of the offset slot
+  final static int SAVE_OFFSET = -40;
+
+  // control flow; the operand is the program counter of the jump target.
+  // The two splits continue both at the target and at the next instruction.
+  final static int SPLIT = -50;
+  final static int SPLIT_JMP = -51; // this split prefers to jump
+  final static int JMP = -52;
+
+  // Opcodes in the (numerically descending) range SINGLE_ARG_START down to
+  // SINGLE_ARG_END are followed by exactly one operand, i.e. they occupy two
+  // program slots; every new opcode must be numbered accordingly.
+  final static int SINGLE_ARG_START = CHARACTER_CLASS;
+  final static int SINGLE_ARG_END = JMP;
+}
--- a/classpath/java/util/regex/RegexMatcher.java
+++ b/classpath/java/util/regex/RegexMatcher.java
@ -0,0 +1,80 @@
+/* Copyright (c) 2008-2013, Avian Contributors
+
+   Permission to use, copy, modify, and/or distribute this software
+   for any purpose with or without fee is hereby granted, provided
+   that the above copyright notice and this permission notice appear
+   in all copies.
+
+   There is NO WARRANTY for this software.  See license.txt for
+   details. */
+
+package java.util.regex;
+
+/**
+ * A minimal implementation of a regular expression matcher.
+ * 
+ * @author Johannes Schindelin
+ */
+public class RegexMatcher extends Matcher {
+  // the compiled program executed by this matcher
+  private final PikeVM vm;
+  // the input as a character array; assigned by reset(CharSequence)
+  private char[] array;
+  // start/end offsets of the groups of the most recent successful match
+  // (index 0 is the whole match); null until a match was found
+  int[] groupStart, groupEnd;
+
+  RegexMatcher(PikeVM vm, CharSequence string) {
+    // NOTE(review): array is only ever assigned in reset(CharSequence);
+    // presumably the Matcher constructor calls it -- confirm.
+    super(string);
+    this.vm = vm;
+  }
+
+  // receives the offsets from the VM and publishes them as matcher state
+  private final PikeVM.Result adapter = new PikeVM.Result() {
+    public void set(int[] start, int[] end) {
+      RegexMatcher.this.start = start[0];
+      RegexMatcher.this.end = end[0];
+      RegexMatcher.this.groupStart = start;
+      RegexMatcher.this.groupEnd = end;
+    }
+  };
+
+  /**
+   * Forgets the position of the previous match, so that the next
+   * {@link #find()} starts at the beginning of the input again.
+   * 
+   * @return this matcher
+   */
+  public Matcher reset() {
+    start = end = -1;
+    return this;
+  }
+
+  /**
+   * Replaces the input and resets the matcher.
+   * 
+   * @param input
+   *          the new text to match against
+   * @return this matcher
+   */
+  public Matcher reset(CharSequence input) {
+    this.input = input;
+    array = input.toString().toCharArray();
+    return reset();
+  }
+
+  /**
+   * Determines whether the complete input matches the pattern.
+   * 
+   * @return whether the pattern matches from the first to the last character
+   */
+  public boolean matches() {
+    return vm.matches(array, 0, array.length, true, true, adapter);
+  }
+
+  /**
+   * Looks for the next match after the previous one.
+   * <p>
+   * The search resumes at the end of the previous match, or one character
+   * later if that match was empty (otherwise the same empty match would be
+   * found over and over again). After a reset, {@code start} and {@code end}
+   * are both -1, which makes the search begin at offset 0.
+   * </p>
+   * 
+   * @return whether another match was found
+   */
+  public boolean find() {
+    int offset = end + (start == end ? 1 : 0);
+    // Stepping over an empty match at the very end of the input yields an
+    // offset past the array. The VM interprets a start beyond the end as a
+    // request to match backwards and would read outside of the array; there
+    // is nothing left to find in that case anyway.
+    if (offset > array.length) {
+      return false;
+    }
+    return find(offset);
+  }
+
+  /**
+   * Looks for the first match starting at or after the given offset.
+   * 
+   * @param offset
+   *          the index at which to start searching; must not exceed the
+   *          length of the input
+   * @return whether a match was found
+   */
+  public boolean find(int offset) {
+    return vm.matches(array, offset, array.length, false, false, adapter);
+  }
+
+  /**
+   * @param group
+   *          the group index; 0 refers to the whole match
+   * @return the start offset of the group in the most recent match, or a
+   *         negative value if the group did not take part in it
+   */
+  public int start(int group) {
+    return groupStart[group];
+  }
+
+  /**
+   * @param group
+   *          the group index; 0 refers to the whole match
+   * @return the end offset (exclusive) of the group in the most recent match
+   */
+  public int end(int group) {
+    return groupEnd[group];
+  }
+
+  /**
+   * @param group
+   *          the group index; 0 refers to the whole match
+   * @return the text captured by the group in the most recent match, or null
+   *         if the group did not take part in it
+   */
+  public String group(int group) {
+    int offset = start(group);
+    if (offset < 0) {
+      return null;
+    }
+    int length = end(group) - offset;
+    return new String(array, offset, length);
+  }
+
+  /**
+   * @return the number of capturing groups, not counting the whole match;
+   *         only available after a successful match
+   */
+  public int groupCount() {
+    return groupStart.length - 1;
+  }
+}
--- a/classpath/java/util/regex/RegexPattern.java
+++ b/classpath/java/util/regex/RegexPattern.java
@ -0,0 +1,57 @@
+/* Copyright (c) 2008-2013, Avian Contributors
+
+   Permission to use, copy, modify, and/or distribute this software
+   for any purpose with or without fee is hereby granted, provided
+   that the above copyright notice and this permission notice appear
+   in all copies.
+
+   There is NO WARRANTY for this software.  See license.txt for
+   details. */
+
+package java.util.regex;
+
+/**
+ * A minimal implementation of a regular expression engine.
+ * <p>
+ * Intended as a permissively-licensed drop-in replacement for Oracle JDK's
+ * regular expression engine, this class uses the Pike VM implemented in
+ * {@link PikeVM} to match regular expressions.
+ * </p>
+ * <p>
+ * The Pike VM not only has a nicer runtime performance than Oracle JDK's
+ * backtracking approach -- <i>O(n*m)</i> instead of <i>O(2^m)</i> where
+ * <i>n</i> is the length of the regular expression pattern (after normalizing
+ * {&lt;n&gt;} quantifiers) and <i>m</i> the length of the text to match against
+ * the pattern -- but also supports arbitrary-sized look-behinds.
+ * </p>
+ * <p>
+ * The current implementation supports all regular expression constructs
+ * supported by Oracle JDK's regular expression engine except for the following
+ * ones:
+ * <ul>
+ * <li>control characters: \cX</li>
+ * <li>extended character classes: \p{...}</li>
+ * <li>extended boundary matchers: \A,\G,\Z,\z</li>
+ * <li>possessive quantifiers: X?+</li>
+ * <li>back references: \&lt;n&gt;, \k&lt;name&gt;</li>
+ * <li>long escape: \Q, \E</li>
+ * <li>named groups: (?&lt;name&gt;X)</li>
+ * <li>flags: (?idmsuxU)</li>
+ * <li>independent, non-capturing group: (?>X)</li>
+ * </ul>
+ * </p>
+ * 
+ * @author Johannes Schindelin
+ */
+public class RegexPattern extends Pattern {
+  // the compiled program shared by all matchers created from this pattern
+  private PikeVM vm;
+
+  /**
+   * Wraps an already compiled program.
+   * 
+   * @param regex
+   *          the regular expression the program was compiled from
+   * @param flags
+   *          the flags the pattern was compiled with
+   * @param vm
+   *          the compiled program
+   */
+  RegexPattern(String regex, int flags, PikeVM vm) {
+    super(regex, flags);
+    this.vm = vm;
+  }
+
+  /**
+   * Creates a matcher that runs this pattern's program on the given text.
+   * 
+   * @param string
+   *          the text to match against
+   * @return a new matcher
+   */
+  public RegexMatcher matcher(CharSequence string) {
+    RegexMatcher matcher = new RegexMatcher(vm, string);
+    return matcher;
+  }
+}
--- a/classpath/java/util/regex/TrivialMatcher.java
+++ b/classpath/java/util/regex/TrivialMatcher.java
@ -0,0 +1,48 @@
+/* Copyright (c) 2008-2013, Avian Contributors
+
+   Permission to use, copy, modify, and/or distribute this software
+   for any purpose with or without fee is hereby granted, provided
+   that the above copyright notice and this permission notice appear
+   in all copies.
+
+   There is NO WARRANTY for this software.  See license.txt for
+   details. */
+
+package java.util.regex;
+
+/**
+ * This is a work in progress.
+ * 
+ * @author zsombor and others
+ */
+class TrivialMatcher extends Matcher {
+  // the literal text to look for
+  private final String pattern;
+
+  TrivialMatcher(String pattern, CharSequence input) {
+    super(input);
+    this.pattern = pattern;
+  }
+
+  /**
+   * Determines whether the input is exactly the literal pattern; on success
+   * the match spans the complete input.
+   */
+  public boolean matches() {
+    if (!pattern.equals(input.toString())) {
+      return false;
+    }
+    start = 0;
+    end = input.length();
+    return true;
+  }
+
+  /**
+   * Looks for the first occurrence of the literal pattern at or after the
+   * given offset and records its bounds if there is one.
+   */
+  public boolean find(int start) {
+    int index = TrivialPattern.indexOf(input, pattern, start);
+    if (index < 0) {
+      return false;
+    }
+    this.start = index;
+    this.end = index + pattern.length();
+    return true;
+  }
+}
+
--- a/classpath/java/util/regex/TrivialPattern.java
+++ b/classpath/java/util/regex/TrivialPattern.java
@ -0,0 +1,112 @@
+/* Copyright (c) 2008-2013, Avian Contributors
+
+   Permission to use, copy, modify, and/or distribute this software
+   for any purpose with or without fee is hereby granted, provided
+   that the above copyright notice and this permission notice appear
+   in all copies.
+
+   There is NO WARRANTY for this software.  See license.txt for
+   details. */
+
+package java.util.regex;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.LinkedList;
+
+/**
+ * This is a work in progress.
+ * 
+ * @author zsombor and others
+ * 
+ */
+public class TrivialPattern extends Pattern {
+
+  /** The literal text that matchers and {@code split} search for. */
+  private final String unescaped;
+
+  /**
+   * Creates a pattern that searches for a fixed string.
+   *
+   * @param pattern the pattern source, forwarded to the superclass
+   * @param unescaped the literal text to search for
+   * @param flags the flags, forwarded to the superclass
+   */
+  TrivialPattern(String pattern, String unescaped, int flags) {
+    super(pattern, flags);
+    this.unescaped = unescaped;
+  }
+
+  /**
+   * Creates a matcher that looks for the literal text in the given input.
+   *
+   * @param input the character sequence to search
+   * @return a new {@link TrivialMatcher} for {@code input}
+   */
+  public Matcher matcher(CharSequence input) {
+    return new TrivialMatcher(unescaped, input);
+  }
+
+  /**
+   * Splits the input around occurrences of the literal text.
+   * <ul>
+   * <li>A positive {@code limit} yields at most {@code limit} pieces; the
+   * last piece holds the unsplit remainder of the input.</li>
+   * <li>A negative {@code limit} puts no cap on the number of pieces and
+   * keeps trailing empty pieces.</li>
+   * <li>A {@code limit} of zero puts no cap on the number of pieces and
+   * drops trailing empty pieces when the input was consumed up to its
+   * end.</li>
+   * </ul>
+   * <p>
+   * With an empty literal text, the first piece is the empty string and
+   * each following piece is a single character.
+   * </p>
+   *
+   * @param input the character sequence to split
+   * @param limit the piece limit as described above
+   * @return the pieces, in the order they occur in the input
+   */
+  public String[] split(CharSequence input, int limit) {
+    final boolean strip = limit == 0;
+    final int maxPieces = limit > 0 ? limit : Integer.MAX_VALUE;
+    final int inputLength = input.length();
+    final int patternLength = unescaped.length();
+
+    List<CharSequence> pieces = new LinkedList<CharSequence>();
+    int from = 0;
+    // Number of consecutive empty pieces at the tail of the list.
+    int trailing = 0;
+
+    // Leave room for the final piece, which is always added after the loop.
+    while (from < inputLength && pieces.size() < maxPieces - 1) {
+      int hit;
+      if (patternLength != 0) {
+        hit = indexOf(input, unescaped, from);
+      } else if (pieces.size() == 0) {
+        // Empty pattern: the very first "match" sits at offset 0 ...
+        hit = 0;
+      } else {
+        // ... and every later one a single character further on.
+        hit = from + 1;
+      }
+
+      if (hit < 0) {
+        break;
+      }
+
+      // A match right at the current offset produces an empty piece.
+      trailing = (patternLength != 0 && hit == from) ? trailing + 1 : 0;
+      pieces.add(input.subSequence(from, hit));
+      from = hit + patternLength;
+    }
+
+    // The final piece is empty exactly when the input was fully consumed;
+    // it extends the run of trailing empties only when stripping is on.
+    trailing = (strip && from > 0 && from == inputLength) ? trailing + 1 : 0;
+    pieces.add(input.subSequence(from, inputLength));
+
+    String[] result = new String[pieces.size() - trailing];
+    Iterator<CharSequence> it = pieces.iterator();
+    int k = 0;
+    while (it.hasNext() && k < result.length) {
+      result[k++] = it.next().toString();
+    }
+    return result;
+  }
+
+  /**
+   * Finds the first occurrence of {@code needle} in {@code haystack} at or
+   * after {@code start}.
+   *
+   * @param haystack the character sequence to search
+   * @param needle the character sequence to look for
+   * @param start the offset at which the search begins
+   * @return the offset of the first occurrence, {@code start} itself if
+   *   {@code needle} is empty, or -1 if there is no occurrence
+   */
+  static int indexOf(CharSequence haystack, CharSequence needle, int start) {
+    final int needleLength = needle.length();
+    if (needleLength == 0) {
+      return start;
+    }
+
+    // Last offset at which the needle still fits into the haystack.
+    final int lastCandidate = haystack.length() - needleLength;
+    for (int i = start; i <= lastCandidate; ++i) {
+      int matched = 0;
+      while (matched < needleLength
+             && haystack.charAt(i + matched) == needle.charAt(matched))
+      {
+        ++matched;
+      }
+      if (matched == needleLength) {
+        return i;
+      }
+    }
+
+    return -1;
+  }
+}
--- a/test/Regex.java
+++ b/test/Regex.java
@ -0,0 +1,96 @@
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class Regex {
+  /** Fails the test run with a {@link RuntimeException} unless {@code v}. */
+  private static void expect(boolean v) {
+    if (v) return;
+    throw new RuntimeException();
+  }
+
+  /** Compiles {@code regex} and returns a matcher for {@code string}. */
+  private static Matcher getMatcher(String regex, String string) {
+    Pattern pattern = Pattern.compile(regex);
+    return pattern.matcher(string);
+  }
+
+  /** Expects {@code regex} to match all of {@code string}. */
+  private static void expectMatch(String regex, String string) {
+    Matcher matcher = getMatcher(regex, string);
+    expect(matcher.matches());
+  }
+
+  /** Expects {@code regex} not to match all of {@code string}. */
+  private static void expectNoMatch(String regex, String string) {
+    Matcher matcher = getMatcher(regex, string);
+    expect(!matcher.matches());
+  }
+
+  /**
+   * Expects {@code regex} to match all of {@code string}, to report exactly
+   * {@code groups.length} groups, and group number {@code i} to equal
+   * {@code groups[i - 1]}; a {@code null} entry means the group must be
+   * {@code null}.
+   */
+  private static void expectGroups(String regex, String string,
+      String... groups) {
+    Matcher matcher = getMatcher(regex, string);
+    expect(matcher.matches());
+    expect(matcher.groupCount() == groups.length);
+    for (int i = 0; i < groups.length; ++i) {
+      String expected = groups[i];
+      // Group numbers are 1-based.
+      String actual = matcher.group(i + 1);
+      if (expected == null) {
+        expect(actual == null);
+      } else {
+        expect(expected.equals(actual));
+      }
+    }
+  }
+
+  /**
+   * Expects successive {@code find()} calls to yield exactly the given
+   * matches, in order, and then to report no further match.
+   */
+  private static void expectFind(String regex, String string,
+      String... matches)
+  {
+    Matcher matcher = getMatcher(regex, string);
+    for (String expected : matches) {
+      expect(matcher.find());
+      expect(expected.equals(matcher.group()));
+    }
+    expect(!matcher.find());
+  }
+
+  /**
+   * Expects splitting {@code string} around {@code regex} to yield exactly
+   * the given pieces, in order.
+   */
+  private static void expectSplit(String regex, String string,
+      String... list)
+  {
+    String[] pieces = Pattern.compile(regex).split(string);
+    expect(pieces.length == list.length);
+    for (int i = 0; i < list.length; ++ i) {
+      String expected = list[i];
+      expect(expected.equals(pieces[i]));
+    }
+  }
+
+  /**
+   * Runs all expectations; the first one that fails aborts the run with a
+   * {@link RuntimeException}.
+   */
+  public static void main(String[] args) {
+    expectMatch("a(bb)?a", "abba");
+    expectNoMatch("a(bb)?a", "abbba");
+    expectNoMatch("a(bb)?a", "abbaa");
+    expectGroups("a(a*?)(a?)(a??)(a+)(a*)a", "aaaaaa", "", "a", "", "aaa", "");
+    expectMatch("...", "abc");
+    expectNoMatch(".", "\n");
+    expectGroups("a(bb)*a", "abbbba", "bb");
+    expectGroups("a(bb)?(bb)+a", "abba", null, "bb");
+    expectFind(" +", "Hello  ,   world! ", "  ", "   ", " ");
+    expectMatch("[0-9A-Fa-f]+", "08ef");
+    expectNoMatch("[0-9A-Fa-f]+", "08@ef");
+    expectGroups("(?:a)", "a");
+    expectGroups("a|(b|c)", "a", (String)null);
+    expectGroups("a|(b|c)", "c", "c");
+    expectGroups("(?=a)a", "a");
+    expectGroups(".*(o)(?<=[A-Z][a-z]*)", "Hello", "o");
+    expectNoMatch("(?!a).", "a");
+    expectMatch("[\\d]", "0");
+    expectMatch("\\0777", "?7");
+    expectMatch("\\a", "\007");
+    expectMatch("\\\\", "\\");
+    expectMatch("\\x4A", "J");
+    expectMatch("\\x61", "a");
+    expectMatch("\\078", "\0078");
+    expectSplit("(?<=\\w)(?=\\W)|(?<=\\W)(?=\\w)", "a + b * x",
+      "a", " + ", "b", " * ", "x");
+    expectMatch("[0-9[def]]", "f");
+    expectNoMatch("[a-z&&[^d-f]]", "f");
+    expectSplit("^H", "Hello\nHobbes!", "", "ello\nHobbes!");
+    expectSplit("o.*?$", "Hello\r\nHobbes!", "Hello\r\nH");
+    expectSplit("\\b", "a+ b + c\nd", "", "a", "+ ", "b", " + ", "c", "\n", "d");
+    expectSplit("\\B", "Hi Cal!", "H", "i C", "a", "l!");
+    expectMatch("a{2,5}", "aaaa");
+    expectGroups("a??(a{2,5}?)", "aaaa", "aaaa");
+    expectGroups("a??(a{3}?)", "aaaa", "aaa");
+    expectNoMatch("a(a{3}?)", "aaaaa");
+    expectMatch("a(a{3,}?)", "aaaaa");
+  }
+}