From b4e1ee97ebd3d0324c461c8c45407abb638e96fe Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Tue, 29 Oct 2013 10:15:42 -0500 Subject: [PATCH 01/31] Avoid committing temporary vi files Signed-off-by: Johannes Schindelin --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 588446f90d..8800bb0b0c 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ bin /lib /distrib *.pdb +*.swp From 84829dc390b74672003ba71908c2a0f3f2bfc97c Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Fri, 1 Nov 2013 22:12:38 -0500 Subject: [PATCH 02/31] Refactor Pattern / Matcher classes This makes both the Pattern and the Matcher class abstract so that more specialized patterns than the trivial patterns we support so far can be implemented as convenient subclasses of the respective abstract base classes. To ease development, we work on copies in test/regex/ in the 'regex' package. That way, it can be developed in Eclipse (because it does not interfere with Oracle JRE's java.util.regex.* classes). 
Signed-off-by: Johannes Schindelin --- makefile | 2 +- test/regex/Matcher.java | 90 +++++++++++++++ test/regex/Pattern.java | 89 +++++++++++++++ test/regex/TrivialMatcher.java | 48 ++++++++ test/regex/TrivialPattern.java | 199 +++++++++++++++++++++++++++++++++ 5 files changed, 427 insertions(+), 1 deletion(-) create mode 100644 test/regex/Matcher.java create mode 100644 test/regex/Pattern.java create mode 100644 test/regex/TrivialMatcher.java create mode 100644 test/regex/TrivialPattern.java diff --git a/makefile b/makefile index 0b98039593..d66cfea947 100755 --- a/makefile +++ b/makefile @@ -1344,7 +1344,7 @@ vm-classes = \ avian/*.class \ avian/resource/*.class -test-support-sources = $(shell find $(test)/avian/ -name '*.java') +test-support-sources = $(shell find $(test)/avian $(test)/regex -name '*.java') test-sources = $(wildcard $(test)/*.java) test-cpp-sources = $(wildcard $(test)/*.cpp) test-sources += $(test-support-sources) diff --git a/test/regex/Matcher.java b/test/regex/Matcher.java new file mode 100644 index 0000000000..fc99d201e3 --- /dev/null +++ b/test/regex/Matcher.java @@ -0,0 +1,90 @@ +/* Copyright (c) 2008-2013, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all copies. + + There is NO WARRANTY for this software. See license.txt for + details. */ + +package regex; + +/** + * This is a work in progress. 
+ * + * @author zsombor and others + */ +public abstract class Matcher { + protected CharSequence input; + protected int start; + protected int end; + + public Matcher(CharSequence input) { + reset(input); + } + + public abstract boolean matches(); + + public boolean find() { + return find(end); + } + + public abstract boolean find(int start); + + public Matcher reset() { + return reset(input); + } + + public Matcher reset(CharSequence input) { + this.input = input; + start = 0; + end = 0; + return this; + } + + public String replaceAll(String replacement) { + return replace(replacement, Integer.MAX_VALUE); + } + + public String replaceFirst(String replacement) { + return replace(replacement, 1); + } + + protected String replace(String replacement, int limit) { + reset(); + + StringBuilder sb = null; + int index = 0; + int count = 0; + while (count < limit && index < input.length()) { + if (find(index)) { + if (sb == null) { + sb = new StringBuilder(); + } + if (start > index) { + sb.append(input.subSequence(index, start)); + } + sb.append(replacement); + index = end; + ++ count; + } else if (index == 0) { + return input.toString(); + } else { + break; + } + } + if (index < input.length()) { + sb.append(input.subSequence(index, input.length())); + } + return sb.toString(); + } + + public int start() { + return start; + } + + public int end() { + return end; + } +} diff --git a/test/regex/Pattern.java b/test/regex/Pattern.java new file mode 100644 index 0000000000..ddcaacb7f5 --- /dev/null +++ b/test/regex/Pattern.java @@ -0,0 +1,89 @@ +/* Copyright (c) 2008-2013, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all copies. + + There is NO WARRANTY for this software. See license.txt for + details. 
*/ + +package regex; + +import java.util.ArrayList; +import java.util.List; + +/** + * This is a work in progress. + * + * @author zsombor and others + * + */ +public abstract class Pattern { + + public static final int UNIX_LINES = 1; + public static final int CASE_INSENSITIVE = 2; + public static final int COMMENTS = 4; + public static final int MULTILINE = 8; + public static final int LITERAL = 16; + public static final int DOTALL = 32; + public static final int UNICODE_CASE = 64; + public static final int CANON_EQ = 128; + + private final int patternFlags; + private final String pattern; + + protected Pattern(String pattern, int flags) { + this.pattern = pattern; + this.patternFlags = flags; + } + + public static Pattern compile(String regex) { + return compile(regex, 0); + } + + public static Pattern compile(String regex, int flags) { + try { + return new TrivialPattern(regex, flags); + } catch (UnsupportedOperationException handledBelow) { } + throw new UnsupportedOperationException("Cannot handle regex " + regex); + } + + public int flags() { + return patternFlags; + } + + public abstract Matcher matcher(CharSequence input); + + public static boolean matches(String regex, CharSequence input) { + return Pattern.compile(regex).matcher(input).matches(); + } + + public String pattern() { + return pattern; + } + + public String[] split(CharSequence input) { + return split(input, 0); + } + + public String[] split(CharSequence input, int limit) { + if (limit <= 0) { + limit = Integer.MAX_VALUE; + } + Matcher matcher = matcher(input); + List result = new ArrayList(); + int offset = 0; + for (;;) { + if (result.size() >= limit || !matcher.find()) { + break; + } + result.add(input.subSequence(offset, matcher.start()).toString()); + offset = matcher.end(); + } + if (offset == 0 || offset < input.length()) { + result.add(input.subSequence(offset, input.length()).toString()); + } + return result.toArray(new String[result.size()]); + } +} diff --git 
a/test/regex/TrivialMatcher.java b/test/regex/TrivialMatcher.java new file mode 100644 index 0000000000..9a1a7d3737 --- /dev/null +++ b/test/regex/TrivialMatcher.java @@ -0,0 +1,48 @@ +/* Copyright (c) 2008-2013, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all copies. + + There is NO WARRANTY for this software. See license.txt for + details. */ + +package regex; + +/** + * This is a work in progress. + * + * @author zsombor and others + */ +class TrivialMatcher extends Matcher { + private final String pattern; + + TrivialMatcher(String pattern, CharSequence input) { + super(input); + this.pattern = pattern; + } + + public boolean matches() { + if (pattern.equals(input.toString())) { + start = 0; + end = input.length(); + return true; + } else { + return false; + } + } + + public boolean find(int start) { + String p = pattern; + int i = TrivialPattern.indexOf(input, p, start); + if (i >= 0) { + this.start = i; + this.end = i + p.length(); + return true; + } else { + return false; + } + } +} + diff --git a/test/regex/TrivialPattern.java b/test/regex/TrivialPattern.java new file mode 100644 index 0000000000..40ea88fa3a --- /dev/null +++ b/test/regex/TrivialPattern.java @@ -0,0 +1,199 @@ +/* Copyright (c) 2008-2013, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all copies. + + There is NO WARRANTY for this software. See license.txt for + details. */ + +package regex; + +import java.util.Iterator; +import java.util.List; +import java.util.LinkedList; + +/** + * This is a work in progress. 
+ * + * @author zsombor and others + * + */ +public class TrivialPattern extends Pattern { + + private final String trivialPattern; + + TrivialPattern(String pattern, int flags) { + super(pattern, flags); + this.trivialPattern = trivial(pattern); + } + + private static String trivial(String pattern) { + StringBuffer buffer = new StringBuffer(); + for (int i = 0; i < pattern.length(); ++i) { + char c = pattern.charAt(i); + switch (c) { + case '\\': + if (++i == pattern.length()) { + break; + } + c = pattern.charAt(i); + if (c == '0') { + int len = digits(pattern, ++i, 3, 8); + if (len == 3 && pattern.charAt(i) > '3') { + --len; + } + c = (char)Integer.parseInt(pattern.substring(i, i + len), 8); + i += len - 1; + } else if (c == 'x' || c == 'u') { + int len = digits(pattern, ++i, 4, 16); + c = (char)Integer.parseInt(pattern.substring(i, i + len), 16); + i += len - 1; + } else { + c = unescape(pattern.charAt(i)); + } + if (c != -1) { + break; + } + // fallthru + case '.': + case '*': + case '+': + case '?': + case '|': + case '[': + case ']': + case '{': + case '}': + case '(': + case ')': + case '^': + case '$': + throw new UnsupportedOperationException + ("only trivial regular expressions are supported so far (" + pattern + ")"); + } + buffer.append(c); + } + return buffer.toString(); + } + + private static int digits(String s, int offset, int maxLength, int base) { + for (int i = 0; ; ++i) { + if (i == maxLength || offset + i >= s.length()) { + return i; + } + int value = s.charAt(offset + i) - '0'; + if (value < 0) { + return i; + } + if (base > 10 && value >= 10) { + value += 10 - (value >= 'a' - '0' ? 
'a' - '0' : 'A' - '0'); + } + if (value >= base) { + return i; + } + } + } + + private static char unescape(char c) { + switch (c) { + case '\\': + return c; + case 'a': + return 0x0007; + case 'e': + return 0x001B; + case 'f': + return 0x000C; + case 'n': + return 0x000A; + case 'r': + return 0x000D; + case 't': + return 0x0009; + } + return (char)-1; + } + + public Matcher matcher(CharSequence input) { + return new TrivialMatcher(trivialPattern, input); + } + + public String[] split(CharSequence input, int limit) { + boolean strip; + if (limit < 0) { + strip = false; + limit = Integer.MAX_VALUE; + } else if (limit == 0) { + strip = true; + limit = Integer.MAX_VALUE; + } else { + strip = false; + } + + List list = new LinkedList(); + int index = 0; + int trailing = 0; + int patternLength = trivialPattern.length(); + while (index < input.length() && list.size() < limit - 1) { + int i; + if (patternLength == 0) { + if (list.size() == 0) { + i = 0; + } else { + i = index + 1; + } + } else { + i = indexOf(input, trivialPattern, index); + } + + if (i >= 0) { + if (patternLength != 0 && i == index) { + ++ trailing; + } else { + trailing = 0; + } + + list.add(input.subSequence(index, i)); + index = i + patternLength; + } else { + break; + } + } + + if (strip && index > 0 && index == input.length()) { + ++ trailing; + } else { + trailing = 0; + } + list.add(input.subSequence(index, input.length())); + + String[] result = new String[list.size() - trailing]; + int i = 0; + for (Iterator it = list.iterator(); + it.hasNext() && i < result.length; ++ i) + { + result[i] = it.next().toString(); + } + return result; + } + + static int indexOf(CharSequence haystack, CharSequence needle, int start) { + if (needle.length() == 0) return start; + + for (int i = start; i < haystack.length() - needle.length() + 1; ++i) { + int j = 0; + for (; j < needle.length(); ++j) { + if (haystack.charAt(i + j) != needle.charAt(j)) { + break; + } + } + if (j == needle.length()) { + return i; + } + } 
+ + return -1; + } +} From 944f5f3567f7e070e585e4ecee5d9dc73d442271 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Fri, 8 Nov 2013 22:26:53 -0600 Subject: [PATCH 03/31] Start implementing a regular expression engine So far, these are humble beginnings indeed. Based on the descriptions of http://swtch.com/%7Ersc/regexp/regexp2.html I started implementing a Thompson NFA / Pike VM. The idea being that eventually, regular expressions are to be compiled into special-purpose bytecode for the Pike VM that executes a varying number of threads in lock-step over each character of the text to match. The thread count is bounded by the length of the program: two different threads with identical instruction pointer at the same character-to-match would yield exactly the same outcome (and therefore, we can execute just one such thread instead of possibly many). To allow for matching groups, each thread carries a state with it, saving the group offsets acquired so far. Signed-off-by: Johannes Schindelin --- test/regex/PikeVM.java | 300 ++++++++++++++++++++++++++++++++++ test/regex/PikeVMOpcodes.java | 28 ++++ 2 files changed, 328 insertions(+) create mode 100644 test/regex/PikeVM.java create mode 100644 test/regex/PikeVMOpcodes.java diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java new file mode 100644 index 0000000000..c4b14e07b7 --- /dev/null +++ b/test/regex/PikeVM.java @@ -0,0 +1,300 @@ +/* Copyright (c) 2008-2013, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all copies. + + There is NO WARRANTY for this software. See license.txt for + details. */ + +package regex; + +/** + * A minimal implementation of a regular expression engine. 
+ * + * @author Johannes Schindelin + */ +class PikeVM implements PikeVMOpcodes { + private final int[] program; + private final int groupCount; + private final int offsetsCount; + + public interface Result { + void set(int[] start, int[] end); + } + + protected PikeVM(int[] program, int groupCount) { + this.program = program; + this.groupCount = groupCount; + offsetsCount = 2 * groupCount + 2; + } + + /** + * The current thread states. + *

+ * The threads are identified by their program counter. The rationale: as all + * threads are executed in lock-step, i.e. for the same character in the + * string to be matched, it does not make sense for two threads to be at the + * same program counter -- they would both do exactly the same for the rest of + * the execution. + *

+ *

+ * For efficiency, the threads are kept in a linked list that actually lives + * in an array indexed by the program counter, pointing to the next thread's + * program counter, in the order of high to low priority. + *

+ *

+ * Program counters which have no associated thread are marked as -1. + * The program counter associated with the least-priority thread (the last one + * in the linked list) is marked as -2 to be able to tell it apart from + * unscheduled threads. + *

+ *

+ * We actually never need to have an explicit value for the priority, the + * ordering is sufficient: whenever a new thread is to be scheduled and it is + * found to be scheduled already, it was already scheduled by a + * higher-priority thread. + *

+ */ + private class ThreadQueue { + private int head, tail; + // next[pc] is 1 + the next thread's pc + private int[] next; + // offsets[pc][2 * group] is 1 + start offset + private int[][] offsets; + + public ThreadQueue() { + head = tail = -1; + next = new int[program.length + 1]; + offsets = new int[program.length + 1][]; + } + + /** + * Schedules the instruction at {@code nextPC} to be executed immediately. + *

+ * For non-matching steps (SPLIT, SAVE_STATE, etc) we need to schedule the + * corresponding program counter(s) to be handled right after this opcode, + * before advancing to the next character. + *

+ *

+ * To achieve this, we insert the program counter to-be-scheduled in the + * linked thread list at the current position, but only if it has not been + * scheduled yet: if it has, a higher-priority thread already reached that + * state. + *

+ *

+ * In contrast to {@link #queueNext(int, int, ThreadQueue)}, this method + * works on the current step's thread list. + *

+ * + * @param currentPC + * the current program counter + * @param nextPC + * the program counter to schedule + * @param copyThreadState + * whether to spawn off a new thread + * @return whether the step was queued (i.e. no thread was queued for the + * same {@code nextPC} already) + */ + public boolean queueImmediately(int currentPC, int nextPC, + boolean copyThreadState) { + if (isScheduled(nextPC)) { + return false; + } + int[] offsets = this.offsets[currentPC]; + if (copyThreadState) { + offsets = java.util.Arrays.copyOf(offsets, offsetsCount); + } + if (currentPC == tail) { + tail = nextPC; + } else { + next[nextPC] = next[currentPC]; + } + this.offsets[nextPC] = offsets; + next[currentPC] = nextPC + 1; + return true; + } + + /** + * Schedules the instruction at {@code nextPC} to be executed in the next + * step. + *

+ * This method advances the current thread to the next program counter, to + * be executed after reading the next character. + *

+ * + * @param currentPC + * the current program counter + * @param nextPC + * the program counter to schedule + * @param next + * the thread state of the next step + * @return whether the step was queued (i.e. no thread was queued for the + * same {@code nextPC} already) + */ + private boolean queueNext(int currentPC, int nextPC, ThreadQueue next) { + if (next.tail < 0) { + next.head = nextPC; + } else if (next.isScheduled(nextPC)) { + return false; + } else { + next.next[next.tail] = nextPC + 1; + } + next.offsets[nextPC] = + currentPC < 0 ? new int[offsetsCount] : offsets[currentPC]; + next.tail = nextPC; + return true; + } + + public void saveOffset(int pc, int index, int offset) { + offsets[pc][index] = offset + 1; + } + + public void setResult(Result result) { + // copy offsets + int[] offsets = this.offsets[program.length]; + int[] groupStart = new int[groupCount + 1]; + int[] groupEnd = new int[groupCount + 1]; + for (int j = 0; j <= groupCount; ++j) { + groupStart[j] = offsets[2 * j] - 1; + groupEnd[j] = offsets[2 * j + 1] - 1; + } + result.set(groupStart, groupEnd); + } + + public boolean isEmpty() { + return head < 0; + } + + public boolean isScheduled(int pc) { + return pc == tail || next[pc] > 0; + } + + public int next(int pc) { + return pc < 0 ? head : next[pc] - 1; + } + + public void clean() { + for (int pc = head; pc >= 0; ) { + int nextPC = next[pc] - 1; + next[pc] = 0; + offsets[pc] = null; + pc = nextPC; + } + head = tail = -1; + } + } + + /** + * Executes the Pike VM defined by the program. + *

+ * The idea is to execute threads in parallel, at each step executing them + * from the highest priority thread to the lowest one. In contrast to most + * regular expression engines, the Thompson/Pike one gets away with linear + * complexity because the string is matched from left to right, at each step + * executing a number of threads bounded by the length of the program: if two + * threads would execute at the same instruction pointer of the program, we + * need only consider the higher-priority one. + *

+ *

+ * This implementation is based on the description of Russ Cox. + *

+ * + * @param characters + * the {@link String} to match + * @param start + * the start offset where to match + * @param end + * the end offset + * @param anchorStart + * whether the match must start at {@code start} + * @param anchorEnd + * whether the match must end at {@code end} + * @param result + * the {@link Matcher} to store the groups' offsets in, if successful + * @return whether a match was found + */ + public boolean matches(char[] characters, int start, int end, + boolean anchorStart, boolean anchorEnd, Result result) + { + ThreadQueue current = new ThreadQueue(); + ThreadQueue next = new ThreadQueue(); + + // initialize the first thread + current.queueNext(-1, 0, current); + if (!anchorStart) { + // this requires non-greedy matching + throw new UnsupportedOperationException(); + } + + boolean foundMatch = false; + for (int i = start; i <= end; ++i) { + if (current.isEmpty()) { + // no threads left + return foundMatch; + } + + char c = i < end ? characters[i] : 0; + int pc = -1; + for (;;) { + pc = current.next(pc); + if (pc < 0) { + break; + } + + // pc == program.length is a match! 
+ if (pc == program.length) { + if (anchorEnd && i < end) { + continue; + } + current.setResult(result); + foundMatch = true; + break; + } + + int opcode = program[pc]; + switch (opcode) { + /* Possible optimization: make all opcodes <= 0xffff implicit chars */ + case CHAR: + if (c == (char)program[pc + 1]) { + current.queueNext(pc, pc + 2, next); + } + break; + case DOT: + if (c != '\0' && c != '\r' && c != '\n') { + current.queueNext(pc, pc + 1, next); + } + break; + case DOTALL: + current.queueNext(pc, pc + 1, next); + break; + case SAVE_OFFSET: + int index = program[pc + 1]; + current.saveOffset(pc, index, i); + current.queueImmediately(pc, pc + 2, false); + break; + case SPLIT: + current.queueImmediately(pc, program[pc + 1], true); + current.queueImmediately(pc, pc + 2, false); + break; + case JMP: + current.queueImmediately(pc, program[pc + 1], false); + break; + default: + throw new RuntimeException("Invalid opcode: " + opcode + + " at pc " + pc); + } + } + // clean linked thread list (and states) + current.clean(); + + // prepare for next step + ThreadQueue swap = current; + current = next; + next = swap; + } + return foundMatch; + } +} diff --git a/test/regex/PikeVMOpcodes.java b/test/regex/PikeVMOpcodes.java new file mode 100644 index 0000000000..4518130700 --- /dev/null +++ b/test/regex/PikeVMOpcodes.java @@ -0,0 +1,28 @@ +/* Copyright (c) 2008-2013, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all copies. + + There is NO WARRANTY for this software. See license.txt for + details. */ + +package regex; + +/** + * Opcodes for the Pike VM. + *

+ * See {@link PikeVM}. + *

+ * + * @author Johannes Schindelin + */ +interface PikeVMOpcodes { + final static int CHAR = 1; + final static int DOT = 2; + final static int DOTALL = 3; + final static int SAVE_OFFSET = 4; + final static int SPLIT = 5; + final static int JMP = 6; +} From e6ad10de04733206cfcdc9611b62569e1cfba69d Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Fri, 8 Nov 2013 22:26:53 -0600 Subject: [PATCH 04/31] Implement Pattern / Matcher classes based on the PikeVM Based on the just-implemented PikeVM, let's test it with a specific regular expression. At this point, no parsing is implemented but instead an explicit program executing a(bb)?a is hardcoded. Signed-off-by: Johannes Schindelin --- test/regex/Pattern.java | 19 ++++++++++++- test/regex/RegexMatcher.java | 55 ++++++++++++++++++++++++++++++++++++ test/regex/RegexPattern.java | 29 +++++++++++++++++++ 3 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 test/regex/RegexMatcher.java create mode 100644 test/regex/RegexPattern.java diff --git a/test/regex/Pattern.java b/test/regex/Pattern.java index ddcaacb7f5..0089b45564 100644 --- a/test/regex/Pattern.java +++ b/test/regex/Pattern.java @@ -19,7 +19,7 @@ import java.util.List; * @author zsombor and others * */ -public abstract class Pattern { +public abstract class Pattern implements PikeVMOpcodes { public static final int UNIX_LINES = 1; public static final int CASE_INSENSITIVE = 2; @@ -46,6 +46,23 @@ public abstract class Pattern { try { return new TrivialPattern(regex, flags); } catch (UnsupportedOperationException handledBelow) { } + if (flags != 0) { + throw new UnsupportedOperationException("TODO"); + } + if ("a(bb)?a".equals(regex)) { + int[] program = new int[] { + SAVE_OFFSET, 0, + CHAR, 'a', + SPLIT, 14, + SAVE_OFFSET, 2, + CHAR, 'b', + CHAR, 'b', + SAVE_OFFSET, 3, + /* 14 */ CHAR, 'a', + SAVE_OFFSET, 1 + }; + return new RegexPattern(regex, flags, new PikeVM(program, 1)); + } throw new UnsupportedOperationException("Cannot handle regex " + 
regex); } diff --git a/test/regex/RegexMatcher.java b/test/regex/RegexMatcher.java new file mode 100644 index 0000000000..7a266d805d --- /dev/null +++ b/test/regex/RegexMatcher.java @@ -0,0 +1,55 @@ +/* Copyright (c) 2008-2013, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all copies. + + There is NO WARRANTY for this software. See license.txt for + details. */ + +package regex; + +/** + * A minimal implementation of a regular expression matcher. + * + * @author Johannes Schindelin + */ +public class RegexMatcher extends Matcher { + private final PikeVM vm; + private char[] array; + int[] groupStart, groupEnd; + + RegexMatcher(PikeVM vm, CharSequence string) { + super(string); + this.vm = vm; + } + + private final PikeVM.Result adapter = new PikeVM.Result() { + public void set(int[] start, int[] end) { + RegexMatcher.this.start = start[0]; + RegexMatcher.this.end = end[0]; + RegexMatcher.this.groupStart = start; + RegexMatcher.this.groupEnd = end; + } + }; + + public Matcher reset() { + start = end = -1; + return this; + } + + public Matcher reset(CharSequence input) { + this.input = input; + array = input.toString().toCharArray(); + return reset(); + } + + public boolean matches() { + return vm.matches(array, 0, array.length, true, true, adapter); + } + + public boolean find(int offset) { + throw new UnsupportedOperationException("TODO"); + } +} diff --git a/test/regex/RegexPattern.java b/test/regex/RegexPattern.java new file mode 100644 index 0000000000..0e6ed488d0 --- /dev/null +++ b/test/regex/RegexPattern.java @@ -0,0 +1,29 @@ +/* Copyright (c) 2008-2013, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all 
copies. + + There is NO WARRANTY for this software. See license.txt for + details. */ + +package regex; + +/** + * A minimal implementation of a regular expression engine. + * + * @author Johannes Schindelin + */ +public class RegexPattern extends Pattern { + private PikeVM vm; + + public RegexMatcher matcher(CharSequence string) { + return new RegexMatcher(vm, string); + } + + RegexPattern(String regex, int flags, PikeVM vm) { + super(regex, flags); + this.vm = vm; + } +} From 2073d4bffb992151eb97545b9284390a3a7b5adc Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Fri, 22 Nov 2013 14:41:29 -0600 Subject: [PATCH 05/31] Prepare the Matcher class for multiple groups Signed-off-by: Johannes Schindelin --- test/regex/Matcher.java | 29 +++++++++++++++++++++++++++++ test/regex/RegexMatcher.java | 21 +++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/test/regex/Matcher.java b/test/regex/Matcher.java index fc99d201e3..13c2efba1a 100644 --- a/test/regex/Matcher.java +++ b/test/regex/Matcher.java @@ -87,4 +87,33 @@ public abstract class Matcher { public int end() { return end; } + + public String group() { + return input.subSequence(start, end).toString(); + } + + public int start(int group) { + if (group == 0) { + return start(); + } + throw new UnsupportedOperationException(); + } + + public int end(int group) { + if (group == 0) { + return end(); + } + throw new UnsupportedOperationException(); + } + + public String group(int group) { + if (group == 0) { + return group(); + } + throw new UnsupportedOperationException(); + } + + public int groupCount() { + return 0; + } } diff --git a/test/regex/RegexMatcher.java b/test/regex/RegexMatcher.java index 7a266d805d..5ea2eea82c 100644 --- a/test/regex/RegexMatcher.java +++ b/test/regex/RegexMatcher.java @@ -52,4 +52,25 @@ public class RegexMatcher extends Matcher { public boolean find(int offset) { throw new UnsupportedOperationException("TODO"); } + + public int start(int group) { + return 
+ groupStart[group]; + } + + public int end(int group) { + return groupEnd[group]; + } + + public String group(int group) { + int offset = start(group); + if (offset < 0) { + return null; + } + int length = end(group) - offset; + return new String(array, offset, length); + } + + public int groupCount() { + return groupStart.length - 1; + } } From b03283033ede70b7ccaeb4824425202f8e3d3172 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 9 Nov 2013 14:16:22 -0600 Subject: [PATCH 06/31] Add a unit test for the regular expression engine We still do not parse the regular expression patterns, but we can at least test that the hardcoded 'a(bb)?a' works as expected. This class will be extended as we support more and more features. Signed-off-by: Johannes Schindelin --- test/Regex.java | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 test/Regex.java diff --git a/test/Regex.java b/test/Regex.java new file mode 100644 index 0000000000..dba6aa6e80 --- /dev/null +++ b/test/Regex.java @@ -0,0 +1,26 @@ +import regex.Matcher; +import regex.Pattern; + +public class Regex { + private static void expect(boolean v) { + if (! 
v) throw new RuntimeException(); + } + + private static Matcher getMatcher(String regex, String string) { + return Pattern.compile(regex).matcher(string); + } + + private static void expectMatch(String regex, String string) { + expect(getMatcher(regex, string).matches()); + } + + private static void expectNoMatch(String regex, String string) { + expect(!getMatcher(regex, string).matches()); + } + + public static void main(String[] args) { + expectMatch("a(bb)?a", "abba"); + expectNoMatch("a(bb)?a", "abbba"); + expectNoMatch("a(bb)?a", "abbaa"); + } +} From 63b06ebde8b016f811b906f50d45af53516ab287 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 9 Nov 2013 14:13:19 -0600 Subject: [PATCH 07/31] Regex: optimize matching characters Instead of having an opcode 'CHAR', let's have the opcodes that fall within the range of a char *be* the opcode 'match this character'. While at it, break the ranges of the different types of opcodes apart into ranges so that related operations are clustered. 
Signed-off-by: Johannes Schindelin --- test/regex/Pattern.java | 10 +++++----- test/regex/PikeVM.java | 13 +++++++------ test/regex/PikeVMOpcodes.java | 13 +++++++------ 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/test/regex/Pattern.java b/test/regex/Pattern.java index 0089b45564..d10fe8fa3b 100644 --- a/test/regex/Pattern.java +++ b/test/regex/Pattern.java @@ -52,13 +52,13 @@ public abstract class Pattern implements PikeVMOpcodes { if ("a(bb)?a".equals(regex)) { int[] program = new int[] { SAVE_OFFSET, 0, - CHAR, 'a', - SPLIT, 14, + 'a', + SPLIT, 11, SAVE_OFFSET, 2, - CHAR, 'b', - CHAR, 'b', + 'b', + 'b', SAVE_OFFSET, 3, - /* 14 */ CHAR, 'a', + /* 11 */ 'a', SAVE_OFFSET, 1 }; return new RegexPattern(regex, flags, new PikeVM(program, 1)); diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java index c4b14e07b7..9219280c68 100644 --- a/test/regex/PikeVM.java +++ b/test/regex/PikeVM.java @@ -256,12 +256,6 @@ class PikeVM implements PikeVMOpcodes { int opcode = program[pc]; switch (opcode) { - /* Possible optimization: make all opcodes <= 0xffff implicit chars */ - case CHAR: - if (c == (char)program[pc + 1]) { - current.queueNext(pc, pc + 2, next); - } - break; case DOT: if (c != '\0' && c != '\r' && c != '\n') { current.queueNext(pc, pc + 1, next); @@ -270,6 +264,7 @@ class PikeVM implements PikeVMOpcodes { case DOTALL: current.queueNext(pc, pc + 1, next); break; + /* immediate opcodes, i.e. 
thread continues within the same step */ case SAVE_OFFSET: int index = program[pc + 1]; current.saveOffset(pc, index, i); @@ -283,6 +278,12 @@ class PikeVM implements PikeVMOpcodes { current.queueImmediately(pc, program[pc + 1], false); break; default: + if (program[pc] >= 0 && program[pc] <= 0xffff) { + if (c == (char)program[pc]) { + current.queueNext(pc, pc + 1, next); + } + break; + } throw new RuntimeException("Invalid opcode: " + opcode + " at pc " + pc); } diff --git a/test/regex/PikeVMOpcodes.java b/test/regex/PikeVMOpcodes.java index 4518130700..c12ad99427 100644 --- a/test/regex/PikeVMOpcodes.java +++ b/test/regex/PikeVMOpcodes.java @@ -19,10 +19,11 @@ package regex; * @author Johannes Schindelin */ interface PikeVMOpcodes { - final static int CHAR = 1; - final static int DOT = 2; - final static int DOTALL = 3; - final static int SAVE_OFFSET = 4; - final static int SPLIT = 5; - final static int JMP = 6; + final static int DOT = -1; + final static int DOTALL = -2; + + final static int SAVE_OFFSET = -40; + + final static int SPLIT = -50; + final static int JMP = -51; } From edb48ffec26dcd9b0fc48d50b1be7c4acf2baf8b Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Mon, 11 Nov 2013 16:36:19 -0600 Subject: [PATCH 08/31] Regex: support prioritized threads If we want to match greedy or reluctant regular expressions, we have to make sure that certain threads are split off with a higher priority than others. We will use the ThreadQueues' natural order as priority order: high to low. To support splitting into different-priority threads, let's introduce a second SPLIT opcode: SPLIT_JMP. The latter prefers to jump while the former prefers to execute the opcode directly after the SPLIT opcode. There is a subtle challenge here, though: let's assume that there are two current threads and the higher-priority one wants to jump where the lower-priority one is already. 
In the PikeVM implementation before this change, queueImmediately() would see that there is already a thread queued for that program counter and *not* queue the higher-priority one. Example: when matching the pattern '(a?)(a??)(a?)' against the string 'aa', after the first character, the first (high priority) thread will have matched the first group while the second thread matched the second group. In the following step, therefore, the first thread will want to SPLIT_JMP to match the final 'a' to the third group but the second thread already queued that program counter. The proposed solution is to introduce a third thread queue: 'queued'. When queuing threads to be executed after reading the next character from the string to match, they are not directly queued into 'next' but into 'queued'. Every thread requiring immediate execution (i.e. before reading the next character) will be queued into 'current'. Whenever 'current' is drained, the next thread from 'queued' that has not been queued to 'current' yet will be executed. That way, we can guarantee that 1) no lower-priority thread can override a higher-priority thread and 2) infinite loops are prevented. 
Signed-off-by: Johannes Schindelin --- test/regex/PikeVM.java | 46 ++++++++++++++++++++++++++++++----- test/regex/PikeVMOpcodes.java | 3 ++- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java index 9219280c68..db8ad44ef7 100644 --- a/test/regex/PikeVM.java +++ b/test/regex/PikeVM.java @@ -70,6 +70,34 @@ class PikeVM implements PikeVMOpcodes { offsets = new int[program.length + 1][]; } + public ThreadQueue(int startPC) { + head = tail = startPC; + next = new int[program.length + 1]; + offsets = new int[program.length + 1][]; + offsets[head] = new int[offsetsCount]; + } + + public int queueOneImmediately(ThreadQueue into) { + for (;;) { + if (head < 0) { + return -1; + } + boolean wasQueued = queueNext(head, head, into); + int pc = head; + if (head == tail) { + head = tail = -1; + } else { + head = next[pc] - 1; + next[pc] = 0; + } + offsets[pc] = null; + if (wasQueued) { + into.tail = pc; + return pc; + } + } + } + /** * Schedules the instruction at {@code nextPC} to be executed immediately. *

@@ -141,8 +169,7 @@ class PikeVM implements PikeVMOpcodes { } else { next.next[next.tail] = nextPC + 1; } - next.offsets[nextPC] = - currentPC < 0 ? new int[offsetsCount] : offsets[currentPC]; + next.offsets[nextPC] = offsets[currentPC]; next.tail = nextPC; return true; } @@ -223,7 +250,7 @@ class PikeVM implements PikeVMOpcodes { ThreadQueue next = new ThreadQueue(); // initialize the first thread - current.queueNext(-1, 0, current); + ThreadQueue queued = new ThreadQueue(0); if (!anchorStart) { // this requires non-greedy matching throw new UnsupportedOperationException(); @@ -231,7 +258,7 @@ class PikeVM implements PikeVMOpcodes { boolean foundMatch = false; for (int i = start; i <= end; ++i) { - if (current.isEmpty()) { + if (queued.isEmpty()) { // no threads left return foundMatch; } @@ -240,6 +267,9 @@ class PikeVM implements PikeVMOpcodes { int pc = -1; for (;;) { pc = current.next(pc); + if (pc < 0) { + pc = queued.queueOneImmediately(current); + } if (pc < 0) { break; } @@ -274,6 +304,10 @@ class PikeVM implements PikeVMOpcodes { current.queueImmediately(pc, program[pc + 1], true); current.queueImmediately(pc, pc + 2, false); break; + case SPLIT_JMP: + current.queueImmediately(pc, pc + 2, true); + current.queueImmediately(pc, program[pc + 1], false); + break; case JMP: current.queueImmediately(pc, program[pc + 1], false); break; @@ -292,8 +326,8 @@ class PikeVM implements PikeVMOpcodes { current.clean(); // prepare for next step - ThreadQueue swap = current; - current = next; + ThreadQueue swap = queued; + queued = next; next = swap; } return foundMatch; diff --git a/test/regex/PikeVMOpcodes.java b/test/regex/PikeVMOpcodes.java index c12ad99427..e281aa25ab 100644 --- a/test/regex/PikeVMOpcodes.java +++ b/test/regex/PikeVMOpcodes.java @@ -25,5 +25,6 @@ interface PikeVMOpcodes { final static int SAVE_OFFSET = -40; final static int SPLIT = -50; - final static int JMP = -51; + final static int SPLIT_JMP = -51; // this split prefers to jump + final static int 
JMP = -52; } From d00f799d2e0699fe6499e366fcaca71eac98ef23 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Mon, 11 Nov 2013 09:29:24 -0600 Subject: [PATCH 09/31] Regex: special-case a(a*?)(a?)(a??)(a+)(a*)a Among other challenges, this regular expression is designed to demonstrate that thread prioritization is finicky: Given the string 'aaaaaa' to match, the first four threads will try to grab the second 'a', the third thread (the one that matched the '(a??)' group) having scheduled the same instruction pointer to the '(a+)' group that the second -- higher-priority -- thread will try to advance to only after processing the '(a??)' group's SPLIT. The second thread must override the third thread in that case, essentially stopping the latter. Signed-off-by: Johannes Schindelin --- test/Regex.java | 11 +++++++++++ test/regex/Pattern.java | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/test/Regex.java b/test/Regex.java index dba6aa6e80..60f00dd1c3 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -18,9 +18,20 @@ public class Regex { expect(!getMatcher(regex, string).matches()); } + private static void expectGroups(String regex, String string, + String... 
groups) { + Matcher matcher = getMatcher(regex, string); + expect(matcher.matches()); + expect(matcher.groupCount() == groups.length); + for (int i = 1; i <= groups.length; ++i) { + expect(groups[i - 1].equals(matcher.group(i))); + } + } + public static void main(String[] args) { expectMatch("a(bb)?a", "abba"); expectNoMatch("a(bb)?a", "abbba"); expectNoMatch("a(bb)?a", "abbaa"); + expectGroups("a(a*?)(a?)(a??)(a+)(a*)a", "aaaaaa", "", "a", "", "aaa", ""); } } diff --git a/test/regex/Pattern.java b/test/regex/Pattern.java index d10fe8fa3b..200d558b2a 100644 --- a/test/regex/Pattern.java +++ b/test/regex/Pattern.java @@ -50,7 +50,7 @@ public abstract class Pattern implements PikeVMOpcodes { throw new UnsupportedOperationException("TODO"); } if ("a(bb)?a".equals(regex)) { - int[] program = new int[] { + return new RegexPattern(regex, flags, new PikeVM(new int[] { SAVE_OFFSET, 0, 'a', SPLIT, 11, @@ -60,8 +60,36 @@ public abstract class Pattern implements PikeVMOpcodes { SAVE_OFFSET, 3, /* 11 */ 'a', SAVE_OFFSET, 1 - }; - return new RegexPattern(regex, flags, new PikeVM(program, 1)); + }, 1)); + } else if ("a(a*?)(a?)(a??)(a+)(a*)a".equals(regex)) { + return new RegexPattern(regex, flags, new PikeVM(new int[] { + SAVE_OFFSET, 0, + 'a', + SAVE_OFFSET, 2, + SPLIT_JMP, 10, + /* 7 */ 'a', + SPLIT, 7, + /* 10 */ SAVE_OFFSET, 3, + SAVE_OFFSET, 4, + SPLIT, 17, + 'a', + /* 17 */ SAVE_OFFSET, 5, + SAVE_OFFSET, 6, + SPLIT_JMP, 24, + 'a', + /* 24 */ SAVE_OFFSET, 7, + SAVE_OFFSET, 8, + /* 28 */ 'a', + SPLIT_JMP, 28, + SAVE_OFFSET, 9, + SAVE_OFFSET, 10, + SPLIT, 40, + /* 37 */ 'a', + SPLIT_JMP, 37, + /* 40 */ SAVE_OFFSET, 11, + 'a', + SAVE_OFFSET, 1 + }, 5)); } throw new UnsupportedOperationException("Cannot handle regex " + regex); } From 26c4bf8d8b023c8a80a6d6dd3eeca11930383d92 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Mon, 11 Nov 2013 17:23:59 -0600 Subject: [PATCH 10/31] Regex: add a class for matching character classes This will be used to match character 
classes (such as '[0-9a-f]'), but it will also be used by the regular expression pattern compiler to determine whether a character has special meaning in regular expressions. Signed-off-by: Johannes Schindelin --- test/regex/CharacterMatcher.java | 225 +++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 test/regex/CharacterMatcher.java diff --git a/test/regex/CharacterMatcher.java b/test/regex/CharacterMatcher.java new file mode 100644 index 0000000000..8e5d5318b8 --- /dev/null +++ b/test/regex/CharacterMatcher.java @@ -0,0 +1,225 @@ +/* Copyright (c) 2008-2013, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all copies. + + There is NO WARRANTY for this software. See license.txt for + details. */ + +package regex; + +/** + * A class to match classes of characters. + *

+ * This class is intended to be the workhorse behind character classes + * such as {@code [a-z]}. + *

+ * @author Johannes Schindelin + */ +class CharacterMatcher { + private boolean[] map; + private boolean inversePattern; + + public static CharacterMatcher parse(String description) { + return parse(description.toCharArray()); + } + + public static CharacterMatcher parse(char[] description) { + Parser parser = new Parser(description); + CharacterMatcher result = parser.parseClass(); + if (parser.getEndOffset() != description.length) { + throw new RuntimeException("Short character class @" + + parser.getEndOffset() + ": " + new String(description)); + } + return result; + } + + public boolean matches(char c) { + int index = c; + return (map.length > index && map[index]) ^ inversePattern; + } + + private CharacterMatcher(boolean[] map, boolean inversePattern) { + this.map = map; + this.inversePattern = inversePattern; + } + + private void setMatch(int c) { + ensureCapacity(c + 1); + map[c] = true; + } + + private void ensureCapacity(int length) { + if (map.length >= length) { + return; + } + int size = map.length; + if (size < 32) { + size = 32; + } + while (size < length) { + size <<= 1; + } + map = java.util.Arrays.copyOf(map, size); + } + + static class Parser { + private final char[] description; + private int offset; + + public Parser(char[] description) { + this.description = description; + } + + public int getEndOffset() { + return offset; + } + + /** + * Parses an escaped character. 
+ * + * @param start the offset after the backslash + * @return the escaped character, or -1 if no character was recognized + */ + public int parseEscapedCharacter(int start) { + offset = start; + return parseEscapedCharacter(); + } + + private int parseEscapedCharacter() { + if (offset == description.length) { + throw new IllegalArgumentException("Short escaped character"); + } + char c = description[offset++]; + if (c == '0') { + int len = digits(offset, 3, 8); + if (len == 3 && description[offset] > '3') { + --len; + } + c = (char)Integer.parseInt(new String(description, offset, len), 8); + offset += len; + return c; + } + if (c == 'x' || c == 'u') { + int len = digits(offset, 4, 16); + c = (char)Integer.parseInt(new String(description, offset, len), 16); + offset += len; + return c; + } + switch (c) { + case 'a': + return 0x0007; + case 'e': + return 0x001B; + case 'f': + return 0x000C; + case 'n': + return 0x000A; + case 'r': + return 0x000D; + case 't': + return 0x0009; + case '\\': + case '.': + case '*': + case '+': + case '?': + case '|': + case '[': + case ']': + case '{': + case '}': + case '(': + case ')': + case '^': + case '$': + return c; + } + return -1; + } + + public int digits(int offset, int maxLength, int base) { + for (int i = 0; ; ++i) { + if (i == maxLength || offset + i >= description.length) { + return i; + } + int value = description[offset + i] - '0'; + if (value < 0) { + return i; + } + if (base > 10 && value >= 10) { + value += 10 - (value >= 'a' - '0' ? 
'a' - '0' : 'A' - '0'); + } + if (value >= base) { + return i; + } + } + } + + public CharacterMatcher parseClass(int start) { + offset = start; + return parseClass(); + } + + public CharacterMatcher parseClass() { + if (description[offset] != '[') { + return null; + } + CharacterMatcher matcher = new CharacterMatcher(new boolean[0], + description[++ offset] == '^'); + if (matcher.inversePattern) { + ++ offset; + } + + int previous = -1; + boolean firstCharacter = true; + for (;;) { + if (offset >= description.length) { + unsupported("short regex"); + } + char c = description[offset++]; + if (c == '-' && !firstCharacter && description[offset] != ']') { + if (previous < 0) { + unsupported("invalid range"); + } + int rangeEnd = description[offset]; + if ('\\' == rangeEnd) { + rangeEnd = parseEscapedCharacter(); + if (rangeEnd < 0) { + unsupported("invalid range"); + } + } + matcher.ensureCapacity(rangeEnd + 1); + for (int j = previous + 1; j <= rangeEnd; j++) { + matcher.map[j] = true; + } + } else if (c == '\\') { + previous = parseEscapedCharacter(); + if (previous < 0) { + unsupported("escape"); + } else { + matcher.setMatch(previous); + } + } else if (c == '&' || c == '[') { + unsupported("operation"); + } else if (c == ']') { + break; + } else { + previous = c; + matcher.setMatch(previous); + } + firstCharacter = false; + } + + return matcher; + } + + private void unsupported(String msg) throws UnsupportedOperationException { + throw new UnsupportedOperationException("Unsupported " + msg + " @" + + offset + ": " + + new String(description, 0, description.length)); + } + } +} From 04d8955f98a787b3ce572eef45fa9f2b50daff6f Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 9 Nov 2013 15:18:11 -0600 Subject: [PATCH 11/31] Regex: Implement compiler for regular expression patterns Originally, this developer wanted to (ab)use the PikeVM with a hand-crafted program and an added "callback" opcode to parse the regular expressions. 
However, this turned out to be completely unnecessary: there are no ambiguities in regular expression patterns, so there is no need to do anything else than parse the pattern, one character at a time, into a nested expression that then knows how to write itself into a program for the PikeVM. For the moment, we still hardcode the program for the regular expression pattern demonstrating the challenge with the prioritized threads because the compiler cannot yet parse reluctant operators. Signed-off-by: Johannes Schindelin --- test/regex/Compiler.java | 171 +++++++++++++++++++++++++++++++++++++++ test/regex/Pattern.java | 16 +--- 2 files changed, 173 insertions(+), 14 deletions(-) create mode 100644 test/regex/Compiler.java diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java new file mode 100644 index 0000000000..7213489d9c --- /dev/null +++ b/test/regex/Compiler.java @@ -0,0 +1,171 @@ +/* Copyright (c) 2008-2013, Avian Contributors + + Permission to use, copy, modify, and/or distribute this software + for any purpose with or without fee is hereby granted, provided + that the above copyright notice and this permission notice appear + in all copies. + + There is NO WARRANTY for this software. See license.txt for + details. */ + +package regex; + +import java.util.ArrayList; +import java.util.Stack; + +/** + * Compiles regular expressions into {@link PikeVM}s. + * + * @author Johannes Schindelin + */ +class Compiler implements PikeVMOpcodes { + private final static CharacterMatcher regularCharacter = + CharacterMatcher.parse("[^\\\\.*+?|\\[\\]{}()^$]"); + + private static class Output { + private int[] program; + private int offset; + private int groupCount = -1; + + public Output(Expression expr) { + // try-run to determine the code size + expr.writeCode(this); + program = new int[offset]; + offset = 0; + groupCount = -1; + // write it out! 
+ expr.writeCode(this); + } + + public void add(int opcode) { + if (program != null) { + program[offset] = opcode; + } + offset++; + } + + public int markJump() { + return offset++; + } + + public void setJump(int mark) { + if (program != null) { + program[mark] = offset; + } + } + + public PikeVM toVM() { + return new PikeVM(program, groupCount); + } + } + + private abstract class Expression { + protected abstract void writeCode(Output output); + } + + private class QuestionMark extends Expression { + private Expression expr; + + public QuestionMark(Expression expr) { + this.expr = expr; + } + + protected void writeCode(Output output) { + output.add(SPLIT); + int jump = output.markJump(); + expr.writeCode(output); + output.setJump(jump); + } + } + + private class Group extends Expression { + private ArrayList list = new ArrayList(); + + public void push(Expression expr) { + list.add(expr); + } + + public void push(final int c) { + push(new Expression() { + public void writeCode(Output output) { + output.add(c); + } + }); + } + + public Expression pop() { + Expression result = list.remove(list.size() - 1); + return result; + } + + protected void writeCode(Output output) { + int groupIndex = ++ output.groupCount; + output.add(SAVE_OFFSET); + output.add(2 * groupIndex); + for (Expression expr : list) { + expr.writeCode(output); + } + output.add(SAVE_OFFSET); + output.add(2 * groupIndex + 1); + } + } + + private class Group0 extends Expression { + private final Group group; + + public Group0() { + group = new Group(); + } + + public void writeCode(Output output) { + group.writeCode(output); + } + } + + private Group0 root; + private Stack groups; + + public Compiler() { + root = new Group0(); + groups = new Stack(); + groups.add(root.group); + } + + public Pattern compile(String regex) { + char[] array = regex.toCharArray(); + for (int index = 0; index < array.length; ++ index) { + char c = array[index]; + Group current = groups.peek(); + if 
(regularCharacter.matches(c)) { + current.push(c); + continue; + } + switch (c) { + case '?': + current.push(new QuestionMark(current.pop())); + break; + case '(': + if (index + 1 < array.length && array[index + 1] == '?') { + throw new UnsupportedOperationException("Not yet supported: " + + regex.substring(index)); + } + current.push(groups.push(new Group())); + continue; + case ')': + if (groups.size() < 2) { + throw new RuntimeException("Invalid group close @" + index + ": " + + regex); + } + groups.pop(); + continue; + default: + throw new RuntimeException("Parse error @" + index + ": " + regex); + } + } + if (groups.size() != 1) { + throw new IllegalArgumentException("Unclosed groups: (" + + (groups.size() - 1) + "): " + regex); + } + return new RegexPattern(regex, 0, new Output(root).toVM()); + } +} diff --git a/test/regex/Pattern.java b/test/regex/Pattern.java index 200d558b2a..bd0405ed4c 100644 --- a/test/regex/Pattern.java +++ b/test/regex/Pattern.java @@ -49,19 +49,7 @@ public abstract class Pattern implements PikeVMOpcodes { if (flags != 0) { throw new UnsupportedOperationException("TODO"); } - if ("a(bb)?a".equals(regex)) { - return new RegexPattern(regex, flags, new PikeVM(new int[] { - SAVE_OFFSET, 0, - 'a', - SPLIT, 11, - SAVE_OFFSET, 2, - 'b', - 'b', - SAVE_OFFSET, 3, - /* 11 */ 'a', - SAVE_OFFSET, 1 - }, 1)); - } else if ("a(a*?)(a?)(a??)(a+)(a*)a".equals(regex)) { + if ("a(a*?)(a?)(a??)(a+)(a*)a".equals(regex)) { return new RegexPattern(regex, flags, new PikeVM(new int[] { SAVE_OFFSET, 0, 'a', @@ -91,7 +79,7 @@ public abstract class Pattern implements PikeVMOpcodes { SAVE_OFFSET, 1 }, 5)); } - throw new UnsupportedOperationException("Cannot handle regex " + regex); + return new Compiler().compile(regex); } public int flags() { From e2105670a0a88ff3b038910e79b8f765401d6b8f Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Fri, 22 Nov 2013 17:30:06 -0600 Subject: [PATCH 12/31] Regex compiler: fall back to TrivialPattern when possible While at 
it, let's get rid of the unescaping in TrivialPattern which was buggy anyway: special operators such as \b were misinterpreted as trivial patterns. Signed-off-by: Johannes Schindelin --- test/regex/Compiler.java | 7 ++- test/regex/Pattern.java | 3 -- test/regex/PikeVM.java | 37 +++++++++++++ test/regex/TrivialPattern.java | 99 +++------------------------------- 4 files changed, 49 insertions(+), 97 deletions(-) diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 7213489d9c..70ebf13391 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -166,6 +166,11 @@ class Compiler implements PikeVMOpcodes { throw new IllegalArgumentException("Unclosed groups: (" + (groups.size() - 1) + "): " + regex); } - return new RegexPattern(regex, 0, new Output(root).toVM()); + PikeVM vm = new Output(root).toVM(); + String plain = vm.isPlainString(); + if (plain != null) { + return new TrivialPattern(regex, plain, 0); + } + return new RegexPattern(regex, 0, vm); } } diff --git a/test/regex/Pattern.java b/test/regex/Pattern.java index bd0405ed4c..49ac289ac3 100644 --- a/test/regex/Pattern.java +++ b/test/regex/Pattern.java @@ -43,9 +43,6 @@ public abstract class Pattern implements PikeVMOpcodes { } public static Pattern compile(String regex, int flags) { - try { - return new TrivialPattern(regex, flags); - } catch (UnsupportedOperationException handledBelow) { } if (flags != 0) { throw new UnsupportedOperationException("TODO"); } diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java index db8ad44ef7..d1d6ce890b 100644 --- a/test/regex/PikeVM.java +++ b/test/regex/PikeVM.java @@ -332,4 +332,41 @@ class PikeVM implements PikeVMOpcodes { } return foundMatch; } + + /** + * Determines whether this machine recognizes a pattern without special + * operators. + *

+ * In case that the regular expression is actually a plain string without any + * special operators, we can avoid using a full-blown Pike VM and instead fall + * back to using the much faster {@link TrivialPattern}. + *

+ * + * @return the string to match, or null if the machine recognizes a + * non-trivial pattern + */ + public String isPlainString() { + // we expect the machine to start with SAVE_OFFSET 0 and + // end with SAVE_OFFSET 1 + int start = 0; + if (start + 1 < program.length && + program[start] == SAVE_OFFSET && program[start + 1] == 0) { + start += 2; + } + int end = program.length; + if (end > start + 1 && + program[end - 2] == SAVE_OFFSET && program[end - 1] == 1) { + end -= 2; + } + for (int i = start; i < end; ++ i) { + if (program[i] < 0) { + return null; + } + } + char[] array = new char[end - start]; + for (int i = start; i < end; ++ i) { + array[i - start] = (char)program[i]; + } + return new String(array); + } } diff --git a/test/regex/TrivialPattern.java b/test/regex/TrivialPattern.java index 40ea88fa3a..6289edc676 100644 --- a/test/regex/TrivialPattern.java +++ b/test/regex/TrivialPattern.java @@ -22,102 +22,15 @@ import java.util.LinkedList; */ public class TrivialPattern extends Pattern { - private final String trivialPattern; + private final String unescaped; - TrivialPattern(String pattern, int flags) { + TrivialPattern(String pattern, String unescaped, int flags) { super(pattern, flags); - this.trivialPattern = trivial(pattern); - } - - private static String trivial(String pattern) { - StringBuffer buffer = new StringBuffer(); - for (int i = 0; i < pattern.length(); ++i) { - char c = pattern.charAt(i); - switch (c) { - case '\\': - if (++i == pattern.length()) { - break; - } - c = pattern.charAt(i); - if (c == '0') { - int len = digits(pattern, ++i, 3, 8); - if (len == 3 && pattern.charAt(i) > '3') { - --len; - } - c = (char)Integer.parseInt(pattern.substring(i, i + len), 8); - i += len - 1; - } else if (c == 'x' || c == 'u') { - int len = digits(pattern, ++i, 4, 16); - c = (char)Integer.parseInt(pattern.substring(i, i + len), 16); - i += len - 1; - } else { - c = unescape(pattern.charAt(i)); - } - if (c != -1) { - break; - } - // fallthru - case '.': 
- case '*': - case '+': - case '?': - case '|': - case '[': - case ']': - case '{': - case '}': - case '(': - case ')': - case '^': - case '$': - throw new UnsupportedOperationException - ("only trivial regular expressions are supported so far (" + pattern + ")"); - } - buffer.append(c); - } - return buffer.toString(); - } - - private static int digits(String s, int offset, int maxLength, int base) { - for (int i = 0; ; ++i) { - if (i == maxLength || offset + i >= s.length()) { - return i; - } - int value = s.charAt(offset + i) - '0'; - if (value < 0) { - return i; - } - if (base > 10 && value >= 10) { - value += 10 - (value >= 'a' - '0' ? 'a' - '0' : 'A' - '0'); - } - if (value >= base) { - return i; - } - } - } - - private static char unescape(char c) { - switch (c) { - case '\\': - return c; - case 'a': - return 0x0007; - case 'e': - return 0x001B; - case 'f': - return 0x000C; - case 'n': - return 0x000A; - case 'r': - return 0x000D; - case 't': - return 0x0009; - } - return (char)-1; + this.unescaped = unescaped; } public Matcher matcher(CharSequence input) { - return new TrivialMatcher(trivialPattern, input); + return new TrivialMatcher(unescaped, input); } public String[] split(CharSequence input, int limit) { @@ -135,7 +48,7 @@ public class TrivialPattern extends Pattern { List list = new LinkedList(); int index = 0; int trailing = 0; - int patternLength = trivialPattern.length(); + int patternLength = unescaped.length(); while (index < input.length() && list.size() < limit - 1) { int i; if (patternLength == 0) { @@ -145,7 +58,7 @@ public class TrivialPattern extends Pattern { i = index + 1; } } else { - i = indexOf(input, trivialPattern, index); + i = indexOf(input, unescaped, index); } if (i >= 0) { From d753edafcd4a5d1c55cf754d7f736e5ddb492c61 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Thu, 14 Nov 2013 11:13:12 -0600 Subject: [PATCH 13/31] Regex: support the dot Signed-off-by: Johannes Schindelin --- test/Regex.java | 2 ++ 
test/regex/Compiler.java | 3 +++ 2 files changed, 5 insertions(+) diff --git a/test/Regex.java b/test/Regex.java index 60f00dd1c3..ae8aac9b62 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -33,5 +33,7 @@ public class Regex { expectNoMatch("a(bb)?a", "abbba"); expectNoMatch("a(bb)?a", "abbaa"); expectGroups("a(a*?)(a?)(a??)(a+)(a*)a", "aaaaaa", "", "a", "", "aaa", ""); + expectMatch("...", "abc"); + expectNoMatch(".", "\n"); } } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 70ebf13391..879c3ef0b9 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -141,6 +141,9 @@ class Compiler implements PikeVMOpcodes { continue; } switch (c) { + case '.': + current.push(DOT); + continue; case '?': current.push(new QuestionMark(current.pop())); break; From f979505b3dda84b06a728d8ff7516757107553fb Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sun, 10 Nov 2013 10:02:18 -0600 Subject: [PATCH 14/31] Regex: implement * and + operators Signed-off-by: Johannes Schindelin --- test/Regex.java | 8 ++++++- test/regex/Compiler.java | 45 +++++++++++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/test/Regex.java b/test/Regex.java index ae8aac9b62..39f3dfe29a 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -24,7 +24,11 @@ public class Regex { expect(matcher.matches()); expect(matcher.groupCount() == groups.length); for (int i = 1; i <= groups.length; ++i) { - expect(groups[i - 1].equals(matcher.group(i))); + if (groups[i - 1] == null) { + expect(matcher.group(i) == null); + } else { + expect(groups[i - 1].equals(matcher.group(i))); + } } } @@ -35,5 +39,7 @@ public class Regex { expectGroups("a(a*?)(a?)(a??)(a+)(a*)a", "aaaaaa", "", "a", "", "aaa", ""); expectMatch("...", "abc"); expectNoMatch(".", "\n"); + expectGroups("a(bb)*a", "abbbba", "bb"); + expectGroups("a(bb)?(bb)+a", "abba", null, "bb"); } } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 
879c3ef0b9..75c9d2d9df 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -63,18 +63,44 @@ class Compiler implements PikeVMOpcodes { protected abstract void writeCode(Output output); } - private class QuestionMark extends Expression { + private class Repeat extends Expression { private Expression expr; + private int minCount, maxCount; - public QuestionMark(Expression expr) { + public Repeat(Expression expr, int minCount, int maxCount) { + if (minCount != 0 && minCount != 1) { + throw new RuntimeException("Unexpected min count: " + minCount); + } + if (maxCount != 1 && maxCount != -1) { + throw new RuntimeException("Unexpected max count: " + maxCount); + } this.expr = expr; + this.minCount = minCount; + this.maxCount = maxCount; } protected void writeCode(Output output) { - output.add(SPLIT); - int jump = output.markJump(); - expr.writeCode(output); - output.setJump(jump); + int start = output.offset; + if (minCount == 1 && maxCount == -1) { + expr.writeCode(output); + output.add(SPLIT_JMP); + output.add(start); + } else if (minCount == 0 && maxCount == -1) { + output.add(SPLIT); + int jump = output.markJump(); + expr.writeCode(output); + output.add(SPLIT_JMP); + output.add(start + 2); + output.setJump(jump); + } else if (minCount == 0 && maxCount == 1) { + output.add(SPLIT); + int jump = output.markJump(); + expr.writeCode(output); + output.setJump(jump); + } else { + throw new RuntimeException("Unexpected range: " + + minCount + ", " + maxCount); + } } } @@ -145,8 +171,11 @@ class Compiler implements PikeVMOpcodes { current.push(DOT); continue; case '?': - current.push(new QuestionMark(current.pop())); - break; + case '*': + case '+': + current.push(new Repeat(current.pop(), + c == '+' ? 1 : 0, c == '?' ? 
1 : -1)); + continue; case '(': if (index + 1 < array.length && array[index + 1] == '?') { throw new UnsupportedOperationException("Not yet supported: " From 7da03b0f1999235cdb382c8295e5b8f6daa4a4de Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sun, 10 Nov 2013 10:23:01 -0600 Subject: [PATCH 15/31] Regex: Implement reluctant '?', '*' and '+' Now that we have reluctant quantifiers, we can get rid of the hardcoded program for the challenging regular expression pattern. Signed-off-by: Johannes Schindelin --- test/regex/Compiler.java | 24 +++++++++++++++++------- test/regex/Pattern.java | 30 ------------------------------ 2 files changed, 17 insertions(+), 37 deletions(-) diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 75c9d2d9df..6f65e6b4d9 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -66,8 +66,9 @@ class Compiler implements PikeVMOpcodes { private class Repeat extends Expression { private Expression expr; private int minCount, maxCount; + private boolean greedy; - public Repeat(Expression expr, int minCount, int maxCount) { + public Repeat(Expression expr, int minCount, int maxCount, boolean greedy) { if (minCount != 0 && minCount != 1) { throw new RuntimeException("Unexpected min count: " + minCount); } @@ -77,23 +78,26 @@ class Compiler implements PikeVMOpcodes { this.expr = expr; this.minCount = minCount; this.maxCount = maxCount; + this.greedy = greedy; } protected void writeCode(Output output) { int start = output.offset; + int splitJmp = greedy ? SPLIT_JMP : SPLIT; + int split = greedy ? 
SPLIT : SPLIT_JMP; if (minCount == 1 && maxCount == -1) { expr.writeCode(output); - output.add(SPLIT_JMP); + output.add(splitJmp); output.add(start); } else if (minCount == 0 && maxCount == -1) { - output.add(SPLIT); + output.add(split); int jump = output.markJump(); expr.writeCode(output); - output.add(SPLIT_JMP); + output.add(splitJmp); output.add(start + 2); output.setJump(jump); } else if (minCount == 0 && maxCount == 1) { - output.add(SPLIT); + output.add(split); int jump = output.markJump(); expr.writeCode(output); output.setJump(jump); @@ -172,10 +176,16 @@ class Compiler implements PikeVMOpcodes { continue; case '?': case '*': - case '+': + case '+': { + boolean greedy = true; + if (index + 1 < array.length && array[index + 1] == '?') { + greedy = false; + ++ index; + } current.push(new Repeat(current.pop(), - c == '+' ? 1 : 0, c == '?' ? 1 : -1)); + c == '+' ? 1 : 0, c == '?' ? 1 : -1, greedy)); continue; + } case '(': if (index + 1 < array.length && array[index + 1] == '?') { throw new UnsupportedOperationException("Not yet supported: " diff --git a/test/regex/Pattern.java b/test/regex/Pattern.java index 49ac289ac3..f0d5596e2f 100644 --- a/test/regex/Pattern.java +++ b/test/regex/Pattern.java @@ -46,36 +46,6 @@ public abstract class Pattern implements PikeVMOpcodes { if (flags != 0) { throw new UnsupportedOperationException("TODO"); } - if ("a(a*?)(a?)(a??)(a+)(a*)a".equals(regex)) { - return new RegexPattern(regex, flags, new PikeVM(new int[] { - SAVE_OFFSET, 0, - 'a', - SAVE_OFFSET, 2, - SPLIT_JMP, 10, - /* 7 */ 'a', - SPLIT, 7, - /* 10 */ SAVE_OFFSET, 3, - SAVE_OFFSET, 4, - SPLIT, 17, - 'a', - /* 17 */ SAVE_OFFSET, 5, - SAVE_OFFSET, 6, - SPLIT_JMP, 24, - 'a', - /* 24 */ SAVE_OFFSET, 7, - SAVE_OFFSET, 8, - /* 28 */ 'a', - SPLIT_JMP, 28, - SAVE_OFFSET, 9, - SAVE_OFFSET, 10, - SPLIT, 40, - /* 37 */ 'a', - SPLIT_JMP, 37, - /* 40 */ SAVE_OFFSET, 11, - 'a', - SAVE_OFFSET, 1 - }, 5)); - } return new Compiler().compile(regex); } From 
ca428c406c70c4f37858df465bafad25da0bee49 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Wed, 13 Nov 2013 17:54:47 -0600 Subject: [PATCH 16/31] Regex: implement find() Now that we have non-greedy repeats, we can implement the find() (which essentially prefixes the regular expression pattern with '.*?'. Signed-off-by: Johannes Schindelin --- test/Regex.java | 13 +++++++++ test/regex/Compiler.java | 18 +++++++++++-- test/regex/PikeVM.java | 51 ++++++++++++++++++++++++++++++------ test/regex/RegexMatcher.java | 6 ++++- 4 files changed, 77 insertions(+), 11 deletions(-) diff --git a/test/Regex.java b/test/Regex.java index 39f3dfe29a..db2cf86892 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -32,6 +32,18 @@ public class Regex { } } + private static void expectFind(String regex, String string, + String... matches) + { + Matcher matcher = getMatcher(regex, string); + int i = 0; + while (i < matches.length) { + expect(matcher.find()); + expect(matches[i++].equals(matcher.group())); + } + expect(!matcher.find()); + } + public static void main(String[] args) { expectMatch("a(bb)?a", "abba"); expectNoMatch("a(bb)?a", "abbba"); @@ -41,5 +53,6 @@ public class Regex { expectNoMatch(".", "\n"); expectGroups("a(bb)*a", "abbbba", "bb"); expectGroups("a(bb)?(bb)+a", "abba", null, "bb"); + expectFind(" +", "Hello , world! 
", " ", " ", " "); } } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 6f65e6b4d9..2b6b83d23f 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -26,6 +26,7 @@ class Compiler implements PikeVMOpcodes { private int[] program; private int offset; private int groupCount = -1; + private int findPreambleSize; public Output(Expression expr) { // try-run to determine the code size @@ -54,9 +55,14 @@ class Compiler implements PikeVMOpcodes { } } - public PikeVM toVM() { - return new PikeVM(program, groupCount); + public void markFindPreambleEnd() { + findPreambleSize = offset; } + + public PikeVM toVM() { + return new PikeVM(program, findPreambleSize, groupCount); + } + } private abstract class Expression { @@ -148,6 +154,14 @@ class Compiler implements PikeVMOpcodes { } public void writeCode(Output output) { + // find() preamble + int start = output.offset; + output.add(SPLIT_JMP); + output.add(start + 5); + output.add(DOTALL); + output.add(SPLIT); + output.add(start + 2); + output.markFindPreambleEnd(); group.writeCode(output); } } diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java index d1d6ce890b..7b3e55467d 100644 --- a/test/regex/PikeVM.java +++ b/test/regex/PikeVM.java @@ -19,13 +19,21 @@ class PikeVM implements PikeVMOpcodes { private final int[] program; private final int groupCount; private final int offsetsCount; + /* + * For find(), we do not want to anchor the match at the start offset. Our + * compiler allows this by prefixing the code with an implicit '(?:.*?)'. For + * regular matches() calls, we want to skip that code and start at {@code + * findPrefixLength} instead. 
+ */ + private final int findPrefixLength; public interface Result { void set(int[] start, int[] end); } - protected PikeVM(int[] program, int groupCount) { + protected PikeVM(int[] program, int findPrefixLength, int groupCount) { this.program = program; + this.findPrefixLength = findPrefixLength; this.groupCount = groupCount; offsetsCount = 2 * groupCount + 2; } @@ -190,6 +198,31 @@ class PikeVM implements PikeVMOpcodes { result.set(groupStart, groupEnd); } + private void mustStartMatchAt(int start) { + int previous = -1; + for (int pc = head; pc >= 0; ) { + int nextPC = next[pc] - 1; + if (start + 1 == offsets[pc][0]) { + previous = pc; + } else { + next[pc] = 0; + offsets[pc] = null; + if (pc == tail) { + head = tail = -1; + } else if (previous < 0) { + head = nextPC; + } else { + next[previous] = 1 + nextPC; + } + } + pc = nextPC; + } + } + + private int startOffset(int pc) { + return offsets[pc][0] - 1; + } + public boolean isEmpty() { return head < 0; } @@ -250,11 +283,8 @@ class PikeVM implements PikeVMOpcodes { ThreadQueue next = new ThreadQueue(); // initialize the first thread - ThreadQueue queued = new ThreadQueue(0); - if (!anchorStart) { - // this requires non-greedy matching - throw new UnsupportedOperationException(); - } + int startPC = anchorStart ? 
findPrefixLength : 0; + ThreadQueue queued = new ThreadQueue(startPC); boolean foundMatch = false; for (int i = start; i <= end; ++i) { @@ -280,6 +310,11 @@ class PikeVM implements PikeVMOpcodes { continue; } current.setResult(result); + // now that we found a match, even higher-priority matches must match + // at the same start offset + if (!anchorStart) { + next.mustStartMatchAt(current.startOffset(pc)); + } foundMatch = true; break; } @@ -346,9 +381,9 @@ class PikeVM implements PikeVMOpcodes { * non-trivial pattern */ public String isPlainString() { - // we expect the machine to start with SAVE_OFFSET 0 and + // we expect the machine to start with the find preamble and SAVE_OFFSET 0 // end with SAVE_OFFSET 1 - int start = 0; + int start = findPrefixLength; if (start + 1 < program.length && program[start] == SAVE_OFFSET && program[start + 1] == 0) { start += 2; diff --git a/test/regex/RegexMatcher.java b/test/regex/RegexMatcher.java index 5ea2eea82c..78bc7c77ca 100644 --- a/test/regex/RegexMatcher.java +++ b/test/regex/RegexMatcher.java @@ -49,8 +49,12 @@ public class RegexMatcher extends Matcher { return vm.matches(array, 0, array.length, true, true, adapter); } + public boolean find() { + return find(end + (start == end ? 1 : 0)); + } + public boolean find(int offset) { - throw new UnsupportedOperationException("TODO"); + return vm.matches(array, offset, array.length, false, false, adapter); } public int start(int group) { From 53563c4f8ebb4f82b12e5bdaac978c5f7b671bfb Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 9 Nov 2013 15:43:26 -0600 Subject: [PATCH 17/31] Regex: add support for character classes Now we support regular expression patterns a la '[0-9]'. 
Signed-off-by: Johannes Schindelin --- test/Regex.java | 2 ++ test/regex/Compiler.java | 38 ++++++++++++++++++++++++++++++++++- test/regex/PikeVM.java | 11 +++++++++- test/regex/PikeVMOpcodes.java | 2 ++ 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/test/Regex.java b/test/Regex.java index db2cf86892..1865163840 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -54,5 +54,7 @@ public class Regex { expectGroups("a(bb)*a", "abbbba", "bb"); expectGroups("a(bb)?(bb)+a", "abba", null, "bb"); expectFind(" +", "Hello , world! ", " ", " ", " "); + expectMatch("[0-9A-Fa-f]+", "08ef"); + expectNoMatch("[0-9A-Fa-f]+", "08@ef"); } } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 2b6b83d23f..f3c5640225 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -27,6 +27,7 @@ class Compiler implements PikeVMOpcodes { private int offset; private int groupCount = -1; private int findPreambleSize; + private ArrayList classes; public Output(Expression expr) { // try-run to determine the code size @@ -34,6 +35,7 @@ class Compiler implements PikeVMOpcodes { program = new int[offset]; offset = 0; groupCount = -1; + classes = new ArrayList(); // write it out! 
expr.writeCode(this); } @@ -60,15 +62,38 @@ class Compiler implements PikeVMOpcodes { } public PikeVM toVM() { - return new PikeVM(program, findPreambleSize, groupCount); + CharacterMatcher[] classes = new CharacterMatcher[this.classes.size()]; + this.classes.toArray(classes); + return new PikeVM(program, findPreambleSize, groupCount, classes); } + public int addClass(CharacterMatcher characterClass) { + if (program == null) { + return -1; + } + int result = classes.size(); + classes.add(characterClass); + return result; + } } private abstract class Expression { protected abstract void writeCode(Output output); } + private class CharacterRange extends Expression { + private final CharacterMatcher characterClass; + + public CharacterRange(CharacterMatcher characterClass) { + this.characterClass = characterClass; + } + + protected void writeCode(Output output) { + output.add(CHARACTER_CLASS); + output.add(output.addClass(characterClass)); + } + } + private class Repeat extends Expression { private Expression expr; private int minCount, maxCount; @@ -177,6 +202,8 @@ class Compiler implements PikeVMOpcodes { public Pattern compile(String regex) { char[] array = regex.toCharArray(); + CharacterMatcher.Parser characterClassParser = + new CharacterMatcher.Parser(array); for (int index = 0; index < array.length; ++ index) { char c = array[index]; Group current = groups.peek(); @@ -214,6 +241,15 @@ class Compiler implements PikeVMOpcodes { } groups.pop(); continue; + case '[': { + CharacterMatcher matcher = characterClassParser.parseClass(index); + if (matcher == null) { + throw new RuntimeException("Invalid range @" + index + ": " + regex); + } + current.push(new CharacterRange(matcher)); + index = characterClassParser.getEndOffset() - 1; + continue; + } default: throw new RuntimeException("Parse error @" + index + ": " + regex); } diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java index 7b3e55467d..fa7ff4ccb8 100644 --- a/test/regex/PikeVM.java +++ 
b/test/regex/PikeVM.java @@ -26,16 +26,20 @@ class PikeVM implements PikeVMOpcodes { * findPrefixLength} instead. */ private final int findPrefixLength; + private final CharacterMatcher[] classes; public interface Result { void set(int[] start, int[] end); } - protected PikeVM(int[] program, int findPrefixLength, int groupCount) { + protected PikeVM(int[] program, int findPrefixLength, int groupCount, + CharacterMatcher[] classes) + { this.program = program; this.findPrefixLength = findPrefixLength; this.groupCount = groupCount; offsetsCount = 2 * groupCount + 2; + this.classes = classes; } /** @@ -329,6 +333,11 @@ class PikeVM implements PikeVMOpcodes { case DOTALL: current.queueNext(pc, pc + 1, next); break; + case CHARACTER_CLASS: + if (classes[program[pc + 1]].matches(c)) { + current.queueNext(pc, pc + 2, next); + } + break; /* immediate opcodes, i.e. thread continues within the same step */ case SAVE_OFFSET: int index = program[pc + 1]; diff --git a/test/regex/PikeVMOpcodes.java b/test/regex/PikeVMOpcodes.java index e281aa25ab..0fa5619f62 100644 --- a/test/regex/PikeVMOpcodes.java +++ b/test/regex/PikeVMOpcodes.java @@ -22,6 +22,8 @@ interface PikeVMOpcodes { final static int DOT = -1; final static int DOTALL = -2; + final static int CHARACTER_CLASS = -20; + final static int SAVE_OFFSET = -40; final static int SPLIT = -50; From c3a06a600aef747827735f8be3fbb4569e8bd525 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Tue, 12 Nov 2013 11:34:30 -0600 Subject: [PATCH 18/31] Regex: implement non-capturing groups Signed-off-by: Johannes Schindelin --- test/Regex.java | 1 + test/regex/Compiler.java | 38 ++++++++++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/test/Regex.java b/test/Regex.java index 1865163840..56b1356d4f 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -56,5 +56,6 @@ public class Regex { expectFind(" +", "Hello , world! 
", " ", " ", " "); expectMatch("[0-9A-Fa-f]+", "08ef"); expectNoMatch("[0-9A-Fa-f]+", "08@ef"); + expectGroups("(?:a)", "a"); } } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index f3c5640225..a27a93388f 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -140,8 +140,14 @@ class Compiler implements PikeVMOpcodes { } private class Group extends Expression { + private final boolean capturing; + private ArrayList list = new ArrayList(); + public Group(boolean capturing) { + this.capturing = capturing; + } + public void push(Expression expr) { list.add(expr); } @@ -160,14 +166,19 @@ class Compiler implements PikeVMOpcodes { } protected void writeCode(Output output) { - int groupIndex = ++ output.groupCount; - output.add(SAVE_OFFSET); - output.add(2 * groupIndex); + int groupIndex = -1; + if (capturing) { + groupIndex = ++ output.groupCount; + output.add(SAVE_OFFSET); + output.add(2 * groupIndex); + } for (Expression expr : list) { expr.writeCode(output); } - output.add(SAVE_OFFSET); - output.add(2 * groupIndex + 1); + if (capturing) { + output.add(SAVE_OFFSET); + output.add(2 * groupIndex + 1); + } } } @@ -175,7 +186,7 @@ class Compiler implements PikeVMOpcodes { private final Group group; public Group0() { - group = new Group(); + group = new Group(true); } public void writeCode(Output output) { @@ -227,13 +238,20 @@ class Compiler implements PikeVMOpcodes { c == '+' ? 1 : 0, c == '?' ? 
1 : -1, greedy)); continue; } - case '(': + case '(': { + boolean capturing = true; if (index + 1 < array.length && array[index + 1] == '?') { - throw new UnsupportedOperationException("Not yet supported: " - + regex.substring(index)); + if (index + 2 < array.length && array[index + 2] == ':') { + index += 2; + capturing = false; + } else { + throw new UnsupportedOperationException("Not yet supported: " + + regex.substring(index)); + } } - current.push(groups.push(new Group())); + current.push(groups.push(new Group(capturing))); continue; + } case ')': if (groups.size() < 2) { throw new RuntimeException("Invalid group close @" + index + ": " From d4a2f58eb57b0ea1cf2763c9b16ea99ca2867a8d Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Mon, 11 Nov 2013 23:09:25 -0600 Subject: [PATCH 19/31] Regex: implement alternatives Now we support regular expressions like 'A|B|C'. Signed-off-by: Johannes Schindelin --- test/Regex.java | 2 ++ test/regex/Compiler.java | 39 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/test/Regex.java b/test/Regex.java index 56b1356d4f..b5545564e7 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -57,5 +57,7 @@ public class Regex { expectMatch("[0-9A-Fa-f]+", "08ef"); expectNoMatch("[0-9A-Fa-f]+", "08@ef"); expectGroups("(?:a)", "a"); + expectGroups("a|(b|c)", "a", (String)null); + expectGroups("a|(b|c)", "c", "c"); } } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index a27a93388f..3b250ca640 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -143,9 +143,13 @@ class Compiler implements PikeVMOpcodes { private final boolean capturing; private ArrayList list = new ArrayList(); + private ArrayList alternatives; - public Group(boolean capturing) { + public Group(boolean capturing, ArrayList initialList) { this.capturing = capturing; + if (initialList != null) { + list.addAll(initialList); + } } public void push(Expression expr) { @@ -160,6 
+164,14 @@ class Compiler implements PikeVMOpcodes { }); } + public void startAlternative() { + if (alternatives == null) { + alternatives = new ArrayList(); + } + alternatives.add(new Group(false, list)); + list.clear(); + } + public Expression pop() { Expression result = list.remove(list.size() - 1); return result; @@ -172,9 +184,27 @@ class Compiler implements PikeVMOpcodes { output.add(SAVE_OFFSET); output.add(2 * groupIndex); } + int[] jumps = null; + if (alternatives != null) { + jumps = new int[alternatives.size()]; + int i = 0; + for (Group alternative : alternatives) { + output.add(SPLIT); + int jump = output.markJump(); + alternative.writeCode(output); + output.add(JMP); + jumps[i++] = output.markJump(); + output.setJump(jump); + } + } for (Expression expr : list) { expr.writeCode(output); } + if (jumps != null) { + for (int jump : jumps) { + output.setJump(jump); + } + } if (capturing) { output.add(SAVE_OFFSET); output.add(2 * groupIndex + 1); @@ -186,7 +216,7 @@ class Compiler implements PikeVMOpcodes { private final Group group; public Group0() { - group = new Group(true); + group = new Group(true, null); } public void writeCode(Output output) { @@ -249,7 +279,7 @@ class Compiler implements PikeVMOpcodes { + regex.substring(index)); } } - current.push(groups.push(new Group(capturing))); + current.push(groups.push(new Group(capturing, null))); continue; } case ')': @@ -268,6 +298,9 @@ class Compiler implements PikeVMOpcodes { index = characterClassParser.getEndOffset() - 1; continue; } + case '|': + current.startAlternative(); + continue; default: throw new RuntimeException("Parse error @" + index + ": " + regex); } From 85af36ef9060aa062b60dbcc096979d411688c5c Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Tue, 12 Nov 2013 09:33:45 -0600 Subject: [PATCH 20/31] Regex: support lookaheads Signed-off-by: Johannes Schindelin --- test/Regex.java | 1 + test/regex/Compiler.java | 47 ++++++++++++++++++++++++++++++++--- test/regex/PikeVM.java | 21 
+++++++++++++--- test/regex/PikeVMOpcodes.java | 2 ++ 4 files changed, 64 insertions(+), 7 deletions(-) diff --git a/test/Regex.java b/test/Regex.java index b5545564e7..375d41a704 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -59,5 +59,6 @@ public class Regex { expectGroups("(?:a)", "a"); expectGroups("a|(b|c)", "a", (String)null); expectGroups("a|(b|c)", "c", "c"); + expectGroups("(?=a)a", "a"); } } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 3b250ca640..5922f104f7 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -28,6 +28,7 @@ class Compiler implements PikeVMOpcodes { private int groupCount = -1; private int findPreambleSize; private ArrayList classes; + private ArrayList lookaheads; public Output(Expression expr) { // try-run to determine the code size @@ -36,6 +37,7 @@ class Compiler implements PikeVMOpcodes { offset = 0; groupCount = -1; classes = new ArrayList(); + lookaheads = new ArrayList(); // write it out! expr.writeCode(this); } @@ -64,7 +66,10 @@ class Compiler implements PikeVMOpcodes { public PikeVM toVM() { CharacterMatcher[] classes = new CharacterMatcher[this.classes.size()]; this.classes.toArray(classes); - return new PikeVM(program, findPreambleSize, groupCount, classes); + PikeVM[] lookaheads = new PikeVM[this.lookaheads.size()]; + this.lookaheads.toArray(lookaheads); + return new PikeVM(program, findPreambleSize, groupCount, classes, + lookaheads); } public int addClass(CharacterMatcher characterClass) { @@ -75,6 +80,15 @@ class Compiler implements PikeVMOpcodes { classes.add(characterClass); return result; } + + public int addLookahead(PikeVM lookahead) { + if (program == null) { + return -1; + } + int result = lookaheads.size(); + lookaheads.add(lookahead); + return result; + } } private abstract class Expression { @@ -212,6 +226,17 @@ class Compiler implements PikeVMOpcodes { } } + private class Lookahead extends Expression { + private final Group group = new Group(false, null); + + 
@Override + protected void writeCode(Output output) { + PikeVM vm = new Output(group).toVM(); + output.add(LOOKAHEAD); + output.add(output.addLookahead(vm)); + } + } + private class Group0 extends Expression { private final Group group; @@ -271,10 +296,24 @@ class Compiler implements PikeVMOpcodes { case '(': { boolean capturing = true; if (index + 1 < array.length && array[index + 1] == '?') { - if (index + 2 < array.length && array[index + 2] == ':') { - index += 2; + index += 2; + if (index >= array.length) { + throw new RuntimeException("Short pattern @" + index + ": " + + regex); + } + c = array[index]; + switch (c) { + case ':': capturing = false; - } else { + break; + case '=': { + capturing = false; + Lookahead lookahead = new Lookahead(); + current.push(lookahead); + groups.push(lookahead.group); + continue; + } + default: throw new UnsupportedOperationException("Not yet supported: " + regex.substring(index)); } diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java index fa7ff4ccb8..ec20323875 100644 --- a/test/regex/PikeVM.java +++ b/test/regex/PikeVM.java @@ -27,19 +27,21 @@ class PikeVM implements PikeVMOpcodes { */ private final int findPrefixLength; private final CharacterMatcher[] classes; + private final PikeVM[] lookaheads; public interface Result { void set(int[] start, int[] end); } protected PikeVM(int[] program, int findPrefixLength, int groupCount, - CharacterMatcher[] classes) + CharacterMatcher[] classes, PikeVM[] lookaheads) { this.program = program; this.findPrefixLength = findPrefixLength; this.groupCount = groupCount; offsetsCount = 2 * groupCount + 2; this.classes = classes; + this.lookaheads = lookaheads; } /** @@ -313,7 +315,12 @@ class PikeVM implements PikeVMOpcodes { if (anchorEnd && i < end) { continue; } + if (result == null) { + // only interested in a match, no need to go on + return true; + } current.setResult(result); + // now that we found a match, even higher-priority matches must match // at the same start offset if 
(!anchorStart) { @@ -338,10 +345,18 @@ class PikeVM implements PikeVMOpcodes { current.queueNext(pc, pc + 2, next); } break; + case LOOKAHEAD: + if (lookaheads[program[pc + 1]].matches(characters, + i, characters.length, true, false, null)) { + current.queueImmediately(pc, pc + 2, false); + } + break; /* immediate opcodes, i.e. thread continues within the same step */ case SAVE_OFFSET: - int index = program[pc + 1]; - current.saveOffset(pc, index, i); + if (result != null) { + int index = program[pc + 1]; + current.saveOffset(pc, index, i); + } current.queueImmediately(pc, pc + 2, false); break; case SPLIT: diff --git a/test/regex/PikeVMOpcodes.java b/test/regex/PikeVMOpcodes.java index 0fa5619f62..ed66eaf6f3 100644 --- a/test/regex/PikeVMOpcodes.java +++ b/test/regex/PikeVMOpcodes.java @@ -24,6 +24,8 @@ interface PikeVMOpcodes { final static int CHARACTER_CLASS = -20; + final static int LOOKAHEAD = -30; + final static int SAVE_OFFSET = -40; final static int SPLIT = -50; From 62d1964779b8779ac36446642df800fcfe5d9a21 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Wed, 13 Nov 2013 11:13:06 -0600 Subject: [PATCH 21/31] Regex: add a method to reverse the PikeVM program A program for the PikeVM corresponds to a regular expression pattern. The program matches the character sequence in left-to-right order. However, for look-behind expressions, we will want to match the character sequence backwards. To this end, it is nice that regular expression patterns can be reversed in a straight-forward manner. However, it would be nice if we could avoid multiple parsing passes and simply parse even look-behind expressions as if they were look-ahead ones, and then simply reverse the program for that part. Happily, it is not difficult to reverse the program so it is equivalent to matching the pattern backwards. There is one catch, though. Imagine matching the sequence "a" against the regular expression "(a?)a?". 
If we match forward, the group will match the letter "a"; when matching backwards, it will match the empty string. So the reverse pattern is equivalent to the forward pattern in terms of "does the pattern match that sequence", but not in terms of its sub-matches. For that reason, Java simply ignores capturing groups in look-behind patterns (and for consistency, the same holds for look-ahead patterns). Signed-off-by: Johannes Schindelin --- test/regex/PikeVM.java | 134 ++++++++++++++++++++++++++++++++++ test/regex/PikeVMOpcodes.java | 3 + 2 files changed, 137 insertions(+) diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java index ec20323875..1221ee91c5 100644 --- a/test/regex/PikeVM.java +++ b/test/regex/PikeVM.java @@ -428,4 +428,138 @@ class PikeVM implements PikeVMOpcodes { } return new String(array); } + + private static int length(int opcode) { + return opcode <= SINGLE_ARG_START && opcode >= SINGLE_ARG_END ? 2 : 1; + } + + private static boolean isJump(int opcode) { + return opcode <= SPLIT && opcode >= JMP; + } + + /** + * Reverses the program (effectively matching the reverse pattern). + *

+ * It is a well-known fact that any regular expression can be reordered + * trivially into an equivalent regular expression to be applied in backward + * direction (coming in real handy for look-behind expressions). + *

+ *

+ * Example: instead of matching the sequence "aaaabb" with the pattern "a+b+", + * we can match the reverse sequence "bbaaaa" with the pattern "b+a+". + *

+ *

+ * One caveat: while the reverse pattern is equivalent in the sense that it + * matches if, and only if, the original pattern matches the forward + * direction, the same is not true for submatches. Consider the input "a" and + * the pattern "(a?)a?": when matching in forward direction the captured group + * is "a", while the backward direction will yield the empty string. For that + * reason, Java dictates that capturing groups in look-behind patterns are + * ignored. + *

+ */ + public void reverse() { + reverse(findPrefixLength, program.length); + } + + /** + * Reverses a specific part of the program (to match in reverse direction). + *

+ * This is the work-horse of {@link #reverse()}. + *

+ *

+ * To visualize the process of reversing a program, let's look at it as a + * directed graph (each jump is represented by an "X", + * non-jumping steps are represented by "o"s, arrows show the + * direction of the flow, SPLITs spawn two arrows): + * + *

+   * o -> X -> X -> o -> X    o -> o
+   * ^    |     \         \___^____^
+   *  \__/       \____________|
+   * 
+ * + * The concept of reversing the program is most easily explained as follows: if + * we insert auxiliary nodes "Y" for jump targets, the graph looks + * like this instead: + * + *
+   * Y -> o -> X -> X -> o -> X    Y -> o -> Y -> o
+   * ^         |     \         \___^_________^
+   *  \_______/       \____________|
+   * 
+ * + * It is now obvious that reversing the program is equivalent to reversing all + * arrows, simply deleting all Xs and substituting each Y + * with a jump. Note that the reverse program will have the same number of + * JMP, but they will not be associated with the same arrows!: + * + *
+   * X <- o <- o    X <- o <- X <- o
+   * |    ^    ^____|________/
+   *  \__/ \_______/
+   * 
+ * + *

+ * @param start + * start reversing the program with this instruction + * @param end + * stop reversing at this instruction (this must be either an index + * aligned exactly with an instruction, or exactly + * {@code program.length}. + */ + private void reverse(int start, int end) { + // Pass 1: build the list of jump targets + int[] newJumps = new int[end + 1]; + boolean[] brokenArrows = new boolean[end + 1]; + for (int pc = start; pc < end; pc += length(program[pc])) { + if (isJump(program[pc])) { + int target = program[pc + 1]; + newJumps[pc + 1] = newJumps[target]; + newJumps[target] = pc + 1; + if (program[pc] == JMP) { + brokenArrows[pc + 2] = true; + } + } + } + + // Pass 2: determine mapped program counters + int[] mapping = new int[end]; + for (int pc = start, mappedPC = end; mappedPC > 0 + && pc < end; pc += length(program[pc])) { + for (int jump = newJumps[pc]; jump > 0; jump = newJumps[jump]) { + mappedPC -= 2; + } + if (!isJump(program[pc])) { + mappedPC -= length(program[pc]); + } + mapping[pc] = mappedPC; + } + + // Pass 3: write the new program + int[] reverse = new int[end]; + for (int pc = start, mappedPC = end; mappedPC > 0; + pc += length(program[pc])) { + boolean brokenArrow = brokenArrows[pc]; + for (int jump = newJumps[pc]; jump > 0; jump = newJumps[jump]) { + reverse[--mappedPC] = mapping[jump - 1]; + if (brokenArrow) { + reverse[--mappedPC] = JMP; + brokenArrow = false; + } else { + reverse[--mappedPC] = + program[jump - 1] == SPLIT_JMP ? 
SPLIT_JMP : SPLIT; + } + } + if (pc == end) { + break; + } + if (!isJump(program[pc])) { + for (int i = length(program[pc]); i-- > 0; ) { + reverse[--mappedPC] = program[pc + i]; + } + } + } + System.arraycopy(reverse, start, program, start, end - start); + } } diff --git a/test/regex/PikeVMOpcodes.java b/test/regex/PikeVMOpcodes.java index ed66eaf6f3..3f86f34adf 100644 --- a/test/regex/PikeVMOpcodes.java +++ b/test/regex/PikeVMOpcodes.java @@ -31,4 +31,7 @@ interface PikeVMOpcodes { final static int SPLIT = -50; final static int SPLIT_JMP = -51; // this split prefers to jump final static int JMP = -52; + + final static int SINGLE_ARG_START = CHARACTER_CLASS; + final static int SINGLE_ARG_END = JMP; } From 8b611c807530cc9dc4fccc09429b77579a348ac7 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Thu, 14 Nov 2013 11:10:18 -0600 Subject: [PATCH 22/31] Regex: support look-behind patterns Signed-off-by: Johannes Schindelin --- test/Regex.java | 1 + test/regex/Compiler.java | 49 +++++++++++++++++++++++++---------- test/regex/PikeVM.java | 21 ++++++++++----- test/regex/PikeVMOpcodes.java | 1 + 4 files changed, 51 insertions(+), 21 deletions(-) diff --git a/test/Regex.java b/test/Regex.java index 375d41a704..44157d369e 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -60,5 +60,6 @@ public class Regex { expectGroups("a|(b|c)", "a", (String)null); expectGroups("a|(b|c)", "c", "c"); expectGroups("(?=a)a", "a"); + expectGroups(".*(o)(?<=[A-Z][a-z]*)", "Hello", "o"); } } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 5922f104f7..6967e542ba 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -28,7 +28,7 @@ class Compiler implements PikeVMOpcodes { private int groupCount = -1; private int findPreambleSize; private ArrayList classes; - private ArrayList lookaheads; + private ArrayList lookarounds; public Output(Expression expr) { // try-run to determine the code size @@ -37,7 +37,7 @@ class Compiler implements PikeVMOpcodes 
{ offset = 0; groupCount = -1; classes = new ArrayList(); - lookaheads = new ArrayList(); + lookarounds = new ArrayList(); // write it out! expr.writeCode(this); } @@ -66,10 +66,10 @@ class Compiler implements PikeVMOpcodes { public PikeVM toVM() { CharacterMatcher[] classes = new CharacterMatcher[this.classes.size()]; this.classes.toArray(classes); - PikeVM[] lookaheads = new PikeVM[this.lookaheads.size()]; - this.lookaheads.toArray(lookaheads); + PikeVM[] lookarounds = new PikeVM[this.lookarounds.size()]; + this.lookarounds.toArray(lookarounds); return new PikeVM(program, findPreambleSize, groupCount, classes, - lookaheads); + lookarounds); } public int addClass(CharacterMatcher characterClass) { @@ -81,12 +81,12 @@ class Compiler implements PikeVMOpcodes { return result; } - public int addLookahead(PikeVM lookahead) { + public int addLookaround(PikeVM lookaround) { if (program == null) { return -1; } - int result = lookaheads.size(); - lookaheads.add(lookahead); + int result = lookarounds.size(); + lookarounds.add(lookaround); return result; } } @@ -226,14 +226,22 @@ class Compiler implements PikeVMOpcodes { } } - private class Lookahead extends Expression { + private class Lookaround extends Expression { private final Group group = new Group(false, null); + private final boolean forward; + + public Lookaround(boolean forward) { + this.forward = forward; + } @Override protected void writeCode(Output output) { PikeVM vm = new Output(group).toVM(); - output.add(LOOKAHEAD); - output.add(output.addLookahead(vm)); + if (!forward) { + vm.reverse(); + } + output.add(forward ? 
LOOKAHEAD : LOOKBEHIND); + output.add(output.addLookaround(vm)); } } @@ -302,15 +310,28 @@ class Compiler implements PikeVMOpcodes { + regex); } c = array[index]; + boolean lookAhead = true; + if (c == '<') { + if (++ index >= array.length) { + throw new RuntimeException("Short pattern @" + index + ": " + + regex); + } + lookAhead = false; + c = array[index]; + if (c != '=' && c != '!') { + throw new IllegalArgumentException("Named groups not supported @" + + index + ": " + regex); + } + } switch (c) { case ':': capturing = false; break; case '=': { capturing = false; - Lookahead lookahead = new Lookahead(); - current.push(lookahead); - groups.push(lookahead.group); + Lookaround lookaround = new Lookaround(lookAhead); + current.push(lookaround); + groups.push(lookaround.group); continue; } default: diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java index 1221ee91c5..d0bd453d27 100644 --- a/test/regex/PikeVM.java +++ b/test/regex/PikeVM.java @@ -27,21 +27,21 @@ class PikeVM implements PikeVMOpcodes { */ private final int findPrefixLength; private final CharacterMatcher[] classes; - private final PikeVM[] lookaheads; + private final PikeVM[] lookarounds; public interface Result { void set(int[] start, int[] end); } protected PikeVM(int[] program, int findPrefixLength, int groupCount, - CharacterMatcher[] classes, PikeVM[] lookaheads) + CharacterMatcher[] classes, PikeVM[] lookarounds) { this.program = program; this.findPrefixLength = findPrefixLength; this.groupCount = groupCount; offsetsCount = 2 * groupCount + 2; this.classes = classes; - this.lookaheads = lookaheads; + this.lookarounds = lookarounds; } /** @@ -293,13 +293,14 @@ class PikeVM implements PikeVMOpcodes { ThreadQueue queued = new ThreadQueue(startPC); boolean foundMatch = false; - for (int i = start; i <= end; ++i) { + int step = end > start ? +1 : -1; + for (int i = start; i != end + step; i += step) { if (queued.isEmpty()) { // no threads left return foundMatch; } - char c = i < end ? 
characters[i] : 0; + char c = i != end ? characters[i] : 0; int pc = -1; for (;;) { pc = current.next(pc); @@ -312,7 +313,7 @@ class PikeVM implements PikeVMOpcodes { // pc == program.length is a match! if (pc == program.length) { - if (anchorEnd && i < end) { + if (anchorEnd && i != end) { continue; } if (result == null) { @@ -346,11 +347,17 @@ class PikeVM implements PikeVMOpcodes { } break; case LOOKAHEAD: - if (lookaheads[program[pc + 1]].matches(characters, + if (lookarounds[program[pc + 1]].matches(characters, i, characters.length, true, false, null)) { current.queueImmediately(pc, pc + 2, false); } break; + case LOOKBEHIND: + if (lookarounds[program[pc + 1]].matches(characters, + i - 1, -1, true, false, null)) { + current.queueImmediately(pc, pc + 2, false); + } + break; /* immediate opcodes, i.e. thread continues within the same step */ case SAVE_OFFSET: if (result != null) { diff --git a/test/regex/PikeVMOpcodes.java b/test/regex/PikeVMOpcodes.java index 3f86f34adf..acd67cc2fd 100644 --- a/test/regex/PikeVMOpcodes.java +++ b/test/regex/PikeVMOpcodes.java @@ -25,6 +25,7 @@ interface PikeVMOpcodes { final static int CHARACTER_CLASS = -20; final static int LOOKAHEAD = -30; + final static int LOOKBEHIND = -31; final static int SAVE_OFFSET = -40; From 098f688cd8f8fb55856570b293b7999a33efd3ea Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Wed, 20 Nov 2013 09:57:04 -0600 Subject: [PATCH 23/31] Regex: implement negative look-arounds Signed-off-by: Johannes Schindelin --- test/Regex.java | 1 + test/regex/Compiler.java | 12 ++++++++---- test/regex/PikeVM.java | 12 ++++++++++++ test/regex/PikeVMOpcodes.java | 2 ++ 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/test/Regex.java b/test/Regex.java index 44157d369e..48836cfe38 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -61,5 +61,6 @@ public class Regex { expectGroups("a|(b|c)", "c", "c"); expectGroups("(?=a)a", "a"); expectGroups(".*(o)(?<=[A-Z][a-z]*)", "Hello", "o"); + 
expectNoMatch("(?!a).", "a"); } } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 6967e542ba..e109be63bf 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -228,10 +228,11 @@ class Compiler implements PikeVMOpcodes { private class Lookaround extends Expression { private final Group group = new Group(false, null); - private final boolean forward; + private final boolean forward, negative; - public Lookaround(boolean forward) { + public Lookaround(boolean forward, boolean negative) { this.forward = forward; + this.negative = negative; } @Override @@ -240,7 +241,9 @@ class Compiler implements PikeVMOpcodes { if (!forward) { vm.reverse(); } - output.add(forward ? LOOKAHEAD : LOOKBEHIND); + output.add(forward ? + (negative ? NEGATIVE_LOOKAHEAD : LOOKAHEAD) : + (negative ? NEGATIVE_LOOKAHEAD : LOOKBEHIND)); output.add(output.addLookaround(vm)); } } @@ -327,9 +330,10 @@ class Compiler implements PikeVMOpcodes { case ':': capturing = false; break; + case '!': case '=': { capturing = false; - Lookaround lookaround = new Lookaround(lookAhead); + Lookaround lookaround = new Lookaround(lookAhead, c == '!'); current.push(lookaround); groups.push(lookaround.group); continue; diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java index d0bd453d27..0888cdaa87 100644 --- a/test/regex/PikeVM.java +++ b/test/regex/PikeVM.java @@ -358,6 +358,18 @@ class PikeVM implements PikeVMOpcodes { current.queueImmediately(pc, pc + 2, false); } break; + case NEGATIVE_LOOKAHEAD: + if (!lookarounds[program[pc + 1]].matches(characters, + i, characters.length, true, false, null)) { + current.queueImmediately(pc, pc + 2, false); + } + break; + case NEGATIVE_LOOKBEHIND: + if (!lookarounds[program[pc + 1]].matches(characters, + i - 1, -1, true, false, null)) { + current.queueImmediately(pc, pc + 2, false); + } + break; /* immediate opcodes, i.e. 
thread continues within the same step */ case SAVE_OFFSET: if (result != null) { diff --git a/test/regex/PikeVMOpcodes.java b/test/regex/PikeVMOpcodes.java index acd67cc2fd..53aaa4c5ed 100644 --- a/test/regex/PikeVMOpcodes.java +++ b/test/regex/PikeVMOpcodes.java @@ -26,6 +26,8 @@ interface PikeVMOpcodes { final static int LOOKAHEAD = -30; final static int LOOKBEHIND = -31; + final static int NEGATIVE_LOOKAHEAD = -32; + final static int NEGATIVE_LOOKBEHIND = -33; final static int SAVE_OFFSET = -40; From 8ab10a695344cd05eee2c4069cbd433b90ac9ba7 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Tue, 12 Nov 2013 09:32:20 -0600 Subject: [PATCH 24/31] Regex: support special character classes This adds support for character classes such as \d or \W, leaving \p{...} style character classes as an exercise for later. Signed-off-by: Johannes Schindelin --- test/Regex.java | 7 +++++ test/regex/CharacterMatcher.java | 48 +++++++++++++++++++++++++++++++- test/regex/Compiler.java | 14 ++++++++++ 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/test/Regex.java b/test/Regex.java index 48836cfe38..6a5e0909ad 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -62,5 +62,12 @@ public class Regex { expectGroups("(?=a)a", "a"); expectGroups(".*(o)(?<=[A-Z][a-z]*)", "Hello", "o"); expectNoMatch("(?!a).", "a"); + expectMatch("[\\d]", "0"); + expectMatch("\\0777", "?7"); + expectMatch("\\a", "\007"); + expectMatch("\\\\", "\\"); + expectMatch("\\x4A", "J"); + expectMatch("\\x61", "a"); + expectMatch("\\078", "\0078"); } } diff --git a/test/regex/CharacterMatcher.java b/test/regex/CharacterMatcher.java index 8e5d5318b8..b5b99a75cb 100644 --- a/test/regex/CharacterMatcher.java +++ b/test/regex/CharacterMatcher.java @@ -41,6 +41,28 @@ class CharacterMatcher { return (map.length > index && map[index]) ^ inversePattern; } + private static String specialClass(int c) { + if ('d' == c) { + return "[0-9]"; + } + if ('D' == c) { + return "[^0-9]"; + } + if ('s' == c) { + 
return "[ \\t\\n\\x0B\\f\\r]"; + } + if ('S' == c) { + return "[^ \\t\\n\\x0B\\f\\r]"; + } + if ('w' == c) { + return "[a-zA-Z_0-9]"; + } + if ('W' == c) { + return "[^a-zA-Z_0-9]"; + } + return null; + } + private CharacterMatcher(boolean[] map, boolean inversePattern) { this.map = map; this.inversePattern = inversePattern; @@ -65,6 +87,17 @@ class CharacterMatcher { map = java.util.Arrays.copyOf(map, size); } + private void merge(CharacterMatcher other) { + boolean inversePattern = this.inversePattern || other.inversePattern; + if ((map.length < other.map.length) ^ inversePattern) { + map = java.util.Arrays.copyOf(map, other.map.length); + } + for (int i = 0; i < map.length; ++ i) { + map[i] = (matches((char)i) || other.matches((char)i)) ^ inversePattern; + } + this.inversePattern = inversePattern; + } + static class Parser { private final char[] description; private int offset; @@ -165,6 +198,13 @@ class CharacterMatcher { public CharacterMatcher parseClass() { if (description[offset] != '[') { + if (description[offset] == '\\') { + String range = specialClass(description[++ offset]); + if (range != null) { + ++ offset; + return CharacterMatcher.parse(range); + } + } return null; } CharacterMatcher matcher = new CharacterMatcher(new boolean[0], @@ -196,9 +236,15 @@ class CharacterMatcher { matcher.map[j] = true; } } else if (c == '\\') { + int saved = offset; previous = parseEscapedCharacter(); if (previous < 0) { - unsupported("escape"); + offset = saved - 1; + CharacterMatcher clazz = parseClass(); + if (clazz == null) { + unsupported("escape"); + } + matcher.merge(clazz); } else { matcher.setMatch(previous); } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index e109be63bf..05242e0d0e 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -292,6 +292,20 @@ class Compiler implements PikeVMOpcodes { case '.': current.push(DOT); continue; + case '\\': + int unescaped = characterClassParser.parseEscapedCharacter(index + 1); + if 
(unescaped >= 0) { + index = characterClassParser.getEndOffset() - 1; + current.push((char)unescaped); + continue; + } + CharacterMatcher characterClass = characterClassParser.parseClass(index); + if (characterClass != null) { + index = characterClassParser.getEndOffset() - 1; + current.push(new CharacterRange(characterClass)); + continue; + } + throw new RuntimeException("Parse error @" + index + ": " + regex); case '?': case '*': case '+': { From b4c768b1010e1ae491913dbfaaef189de5fe134f Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Thu, 14 Nov 2013 14:46:28 -0600 Subject: [PATCH 25/31] Regex: Test Pattern#split(String) The particular pattern we use to test it is used in ImgLib2, based on this answer on stackoverflow: http://stackoverflow.com/a/279337 Signed-off-by: Johannes Schindelin --- test/Regex.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/Regex.java b/test/Regex.java index 6a5e0909ad..16b50745fa 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -44,6 +44,16 @@ public class Regex { expect(!matcher.find()); } + private static void expectSplit(String regex, String string, + String... 
list) + { + String[] array = Pattern.compile(regex).split(string); + expect(array.length == list.length); + for (int i = 0; i < list.length; ++ i) { + expect(list[i].equals(array[i])); + } + } + public static void main(String[] args) { expectMatch("a(bb)?a", "abba"); expectNoMatch("a(bb)?a", "abbba"); @@ -69,5 +79,7 @@ public class Regex { expectMatch("\\x4A", "J"); expectMatch("\\x61", "a"); expectMatch("\\078", "\0078"); + expectSplit("(?<=\\w)(?=\\W)|(?<=\\W)(?=\\w)", "a + b * x", + "a", " + ", "b", " * ", "x"); } } From fe32cce2ad4e4272bda0555a48a0246017ceb842 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Tue, 12 Nov 2013 13:52:20 -0600 Subject: [PATCH 26/31] Regex: support intersection/union of character classes Signed-off-by: Johannes Schindelin --- test/Regex.java | 2 ++ test/regex/CharacterMatcher.java | 35 ++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/test/Regex.java b/test/Regex.java index 16b50745fa..b26105e1ed 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -81,5 +81,7 @@ public class Regex { expectMatch("\\078", "\0078"); expectSplit("(?<=\\w)(?=\\W)|(?<=\\W)(?=\\w)", "a + b * x", "a", " + ", "b", " * ", "x"); + expectMatch("[0-9[def]]", "f"); + expectNoMatch("[a-z&&[^d-f]]", "f"); } } diff --git a/test/regex/CharacterMatcher.java b/test/regex/CharacterMatcher.java index b5b99a75cb..fd8109622c 100644 --- a/test/regex/CharacterMatcher.java +++ b/test/regex/CharacterMatcher.java @@ -98,6 +98,17 @@ class CharacterMatcher { this.inversePattern = inversePattern; } + private void intersect(CharacterMatcher other) { + boolean inversePattern = this.inversePattern && other.inversePattern; + if ((map.length > other.map.length) ^ inversePattern) { + map = java.util.Arrays.copyOf(map, other.map.length); + } + for (int i = 0; i < map.length; ++ i) { + map[i] = (matches((char)i) && other.matches((char)i)) ^ inversePattern; + } + this.inversePattern = inversePattern; + } + static class Parser { 
private final char[] description; private int offset; @@ -248,8 +259,28 @@ class CharacterMatcher { } else { matcher.setMatch(previous); } - } else if (c == '&' || c == '[') { - unsupported("operation"); + } else if (c == '[') { + Parser parser = new Parser(description); + CharacterMatcher other = parser.parseClass(offset - 1); + if (other == null) { + unsupported("invalid merge"); + } + matcher.merge(other); + offset = parser.getEndOffset(); + previous = -1; + } else if (c == '&') { + if (offset + 2 > description.length || description[offset] != '&' + || description[offset + 1] != '[') { + unsupported("operation"); + } + Parser parser = new Parser(description); + CharacterMatcher other = parser.parseClass(offset + 1); + if (other == null) { + unsupported("invalid intersection"); + } + matcher.intersect(other); + offset = parser.getEndOffset(); + previous = -1; } else if (c == ']') { break; } else { From fb6486e276f8d815d9b4367d02192d864087c359 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Tue, 19 Nov 2013 22:58:01 -0600 Subject: [PATCH 27/31] Regex: implement ^,$,\b and \B Signed-off-by: Johannes Schindelin --- test/Regex.java | 4 ++++ test/regex/Compiler.java | 16 +++++++++++++ test/regex/PikeVM.java | 45 +++++++++++++++++++++++++++++++++++ test/regex/PikeVMOpcodes.java | 5 ++++ 4 files changed, 70 insertions(+) diff --git a/test/Regex.java b/test/Regex.java index b26105e1ed..2139837a1a 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -83,5 +83,9 @@ public class Regex { "a", " + ", "b", " * ", "x"); expectMatch("[0-9[def]]", "f"); expectNoMatch("[a-z&&[^d-f]]", "f"); + expectSplit("^H", "Hello\nHobbes!", "", "ello\nHobbes!"); + expectSplit("o.*?$", "Hello\r\nHobbes!", "Hello\r\nH"); + expectSplit("\\b", "a+ b + c\nd", "", "a", "+ ", "b", " + ", "c", "\n", "d"); + expectSplit("\\B", "Hi Cal!", "H", "i C", "a", "l!"); } } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 05242e0d0e..3a1b0b3aab 100644 --- 
a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -305,6 +305,16 @@ class Compiler implements PikeVMOpcodes { current.push(new CharacterRange(characterClass)); continue; } + switch (array[index + 1]) { + case 'b': + index++; + current.push(WORD_BOUNDARY); + continue; + case 'B': + index++; + current.push(NON_WORD_BOUNDARY); + continue; + } throw new RuntimeException("Parse error @" + index + ": " + regex); case '?': case '*': @@ -379,6 +389,12 @@ class Compiler implements PikeVMOpcodes { case '|': current.startAlternative(); continue; + case '^': + current.push(LINE_START); + continue; + case '$': + current.push(LINE_END); + continue; default: throw new RuntimeException("Parse error @" + index + ": " + regex); } diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java index 0888cdaa87..0decad95d8 100644 --- a/test/regex/PikeVM.java +++ b/test/regex/PikeVM.java @@ -28,6 +28,11 @@ class PikeVM implements PikeVMOpcodes { private final int findPrefixLength; private final CharacterMatcher[] classes; private final PikeVM[] lookarounds; + private final static CharacterMatcher wordCharacter = + CharacterMatcher.parse("\\w"); + private final static CharacterMatcher lineTerminator = + CharacterMatcher.parse("[\n\r\u0085\u2028\u2029]"); + private boolean multiLine; public interface Result { void set(int[] start, int[] end); @@ -341,6 +346,46 @@ class PikeVM implements PikeVMOpcodes { case DOTALL: current.queueNext(pc, pc + 1, next); break; + case WORD_BOUNDARY: + case NON_WORD_BOUNDARY: { + int i2 = i - step; + int c2 = i2 < 0 || i2 >= characters.length ? 
-1 : characters[i2]; + switch (opcode) { + case WORD_BOUNDARY: + if ((c2 < 0 || !wordCharacter.matches((char)c2))) { + if (wordCharacter.matches(c)) { + current.queueImmediately(pc, pc + 1, false); + } + } else if (i >= 0 && i < characters.length && + !wordCharacter.matches(c)) { + current.queueImmediately(pc, pc + 1, false); + } + break; + case NON_WORD_BOUNDARY: + if ((c2 < 0 || !wordCharacter.matches((char)c2))) { + if (i >= 0 && i < characters.length && + !wordCharacter.matches(c)) { + current.queueImmediately(pc, pc + 1, false); + } + } else if (wordCharacter.matches(c)) { + current.queueImmediately(pc, pc + 1, false); + } + break; + } + break; + } + case LINE_START: + if (i == 0 || (multiLine && + lineTerminator.matches(characters[i - 1]))) { + current.queueImmediately(pc, pc + 1, false); + } + break; + case LINE_END: + if (i == characters.length || (multiLine && + lineTerminator.matches(c))) { + current.queueImmediately(pc, pc + 1, false); + } + break; case CHARACTER_CLASS: if (classes[program[pc + 1]].matches(c)) { current.queueNext(pc, pc + 2, next); diff --git a/test/regex/PikeVMOpcodes.java b/test/regex/PikeVMOpcodes.java index 53aaa4c5ed..80ccff4bda 100644 --- a/test/regex/PikeVMOpcodes.java +++ b/test/regex/PikeVMOpcodes.java @@ -22,6 +22,11 @@ interface PikeVMOpcodes { final static int DOT = -1; final static int DOTALL = -2; + final static int WORD_BOUNDARY = -10; + final static int NON_WORD_BOUNDARY = -11; + final static int LINE_START = -12; + final static int LINE_END = -13; + final static int CHARACTER_CLASS = -20; final static int LOOKAHEAD = -30; From c975e25864a38c524a6b1234847b70fe1bd90c60 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Wed, 20 Nov 2013 10:15:43 -0600 Subject: [PATCH 28/31] Regex: implement counted quantifiers: {,} Signed-off-by: Johannes Schindelin --- test/Regex.java | 5 +++ test/regex/Compiler.java | 86 ++++++++++++++++++++++++++++++---------- 2 files changed, 71 insertions(+), 20 deletions(-) diff --git 
a/test/Regex.java b/test/Regex.java index 2139837a1a..12a93fe270 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -87,5 +87,10 @@ public class Regex { expectSplit("o.*?$", "Hello\r\nHobbes!", "Hello\r\nH"); expectSplit("\\b", "a+ b + c\nd", "", "a", "+ ", "b", " + ", "c", "\n", "d"); expectSplit("\\B", "Hi Cal!", "H", "i C", "a", "l!"); + expectMatch("a{2,5}", "aaaa"); + expectGroups("a??(a{2,5}?)", "aaaa", "aaaa"); + expectGroups("a??(a{3}?)", "aaaa", "aaa"); + expectNoMatch("a(a{3}?)", "aaaaa"); + expectMatch("a(a{3,}?)", "aaaaa"); } } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 3a1b0b3aab..1e9f4b4978 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -114,11 +114,16 @@ class Compiler implements PikeVMOpcodes { private boolean greedy; public Repeat(Expression expr, int minCount, int maxCount, boolean greedy) { - if (minCount != 0 && minCount != 1) { + if (minCount < 0) { throw new RuntimeException("Unexpected min count: " + minCount); } - if (maxCount != 1 && maxCount != -1) { - throw new RuntimeException("Unexpected max count: " + maxCount); + if (maxCount != -1) { + if (maxCount == 0) { + throw new RuntimeException("Unexpected max count: " + maxCount); + } + if (minCount > maxCount) { + throw new RuntimeException("Unexpected range: " + minCount + ", " + maxCount); + } } this.expr = expr; this.minCount = minCount; @@ -130,25 +135,38 @@ class Compiler implements PikeVMOpcodes { int start = output.offset; int splitJmp = greedy ? SPLIT_JMP : SPLIT; int split = greedy ? 
SPLIT : SPLIT_JMP; - if (minCount == 1 && maxCount == -1) { + for (int i = 1; i < minCount; ++ i) { expr.writeCode(output); - output.add(splitJmp); - output.add(start); - } else if (minCount == 0 && maxCount == -1) { - output.add(split); - int jump = output.markJump(); - expr.writeCode(output); - output.add(splitJmp); - output.add(start + 2); - output.setJump(jump); - } else if (minCount == 0 && maxCount == 1) { - output.add(split); - int jump = output.markJump(); - expr.writeCode(output); - output.setJump(jump); + } + if (maxCount == -1) { + if (minCount > 0) { + int jump = output.offset; + expr.writeCode(output); + output.add(splitJmp); + output.add(jump); + } else { + output.add(split); + int jump = output.markJump(); + expr.writeCode(output); + output.add(splitJmp); + output.add(start + 2); + output.setJump(jump); + } } else { - throw new RuntimeException("Unexpected range: " - + minCount + ", " + maxCount); + if (minCount > 0) { + expr.writeCode(output); + } + if (maxCount > minCount) { + int[] jumps = new int[maxCount - minCount]; + for (int i = 0; i < jumps.length; ++ i) { + output.add(split); + jumps[i] = output.markJump(); + expr.writeCode(output); + } + for (int jump : jumps) { + output.setJump(jump); + } + } } } } @@ -328,6 +346,34 @@ class Compiler implements PikeVMOpcodes { c == '+' ? 1 : 0, c == '?' ? 1 : -1, greedy)); continue; } + case '{': { + ++ index; + int length = characterClassParser.digits(index, 8, 10); + int min = Integer.parseInt(regex.substring(index, index + length)); + int max = min; + index += length - 1; + c = index + 1 < array.length ? array[index + 1] : 0; + if (c == ',') { + ++ index; + length = characterClassParser.digits(index + 1, 8, 10); + max = length == 0 ? -1 : + Integer.parseInt(regex.substring(index + 1, index + 1 + length)); + index += length; + c = index + 1< array.length ? 
array[index + 1] : 0; + } + if (c != '}') { + throw new RuntimeException("Invalid quantifier @" + index + ": " + + regex); + } + ++ index; + boolean greedy = true; + if (index + 1 < array.length && array[index + 1] == '?') { + ++ index; + greedy = false; + } + current.push(new Repeat(current.pop(), min, max, greedy)); + continue; + } case '(': { boolean capturing = true; if (index + 1 < array.length && array[index + 1] == '?') { From 9e7169fe34c943b76aa7ce4cc75c92fab2a40c32 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Tue, 12 Nov 2013 17:16:49 -0600 Subject: [PATCH 29/31] Regex: let toString() in the Compiler reconstruct the regex Signed-off-by: Johannes Schindelin --- test/regex/CharacterMatcher.java | 30 +++++++++++++ test/regex/Compiler.java | 74 ++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/test/regex/CharacterMatcher.java b/test/regex/CharacterMatcher.java index fd8109622c..c423a34514 100644 --- a/test/regex/CharacterMatcher.java +++ b/test/regex/CharacterMatcher.java @@ -41,6 +41,36 @@ class CharacterMatcher { return (map.length > index && map[index]) ^ inversePattern; } + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("["); + if (inversePattern) { + builder.append("^"); + } + for (int i = 0; i < map.length; ++ i) { + if (!map[i]) { + continue; + } + builder.append(i >= ' ' && i <= 0x7f ? + "" + (char)i : ("\\x" + Integer.toHexString(i))); + int j = i + 1; + while (j < map.length && map[j]) { + ++ j; + } + -- j; + if (j > i) { + if (j > i + 1) { + builder.append('-'); + } + builder.append(j >= ' ' && j <= 0x7f ? 
+ "" + (char)j : ("\\x" + Integer.toHexString(j))); + i = j; + } + } + builder.append("]"); + return builder.toString(); + } + private static String specialClass(int c) { if ('d' == c) { return "[0-9]"; diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 1e9f4b4978..dd593ac19e 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -106,6 +106,10 @@ class Compiler implements PikeVMOpcodes { output.add(CHARACTER_CLASS); output.add(output.addClass(characterClass)); } + + public String toString() { + return characterClass.toString(); + } } private class Repeat extends Expression { @@ -169,6 +173,18 @@ class Compiler implements PikeVMOpcodes { } } } + + public String toString() { + String qualifier = greedy ? "" : "?"; + if (minCount == 0 && maxCount < 2) { + return expr.toString() + (minCount < 0 ? "*" : "?") + qualifier; + } + if (minCount == 1 && maxCount < 0) { + return expr.toString() + "+" + qualifier; + } + return expr.toString() + "{" + minCount + "," + + (maxCount < 0 ? 
"" : "" + maxCount) + "}" + qualifier; + } } private class Group extends Expression { @@ -193,6 +209,26 @@ class Compiler implements PikeVMOpcodes { public void writeCode(Output output) { output.add(c); } + + public String toString() { + if (c >= 0) { + return "" + (char)c; + } + switch (c) { + case DOT: + return "."; + case WORD_BOUNDARY: + return "\\b"; + case NON_WORD_BOUNDARY: + return "\\B"; + case LINE_START: + return "^"; + case LINE_END: + return "$"; + default: + throw new RuntimeException("Unhandled opcode: " + c); + } + } }); } @@ -242,6 +278,28 @@ class Compiler implements PikeVMOpcodes { output.add(2 * groupIndex + 1); } } + + public String toString() { + StringBuilder builder = new StringBuilder(); + if (alternatives != null || list.size() > 1) { + builder.append('('); + if (!capturing) { + builder.append("?:"); + } + } + if (alternatives != null) { + for (Group alternative : alternatives) { + builder.append(alternative).append('|'); + } + } + for (Expression expr : list) { + builder.append(expr); + } + if (alternatives != null || list.size() > 1) { + builder.append(')'); + } + return builder.toString(); + } } private class Lookaround extends Expression { @@ -264,6 +322,16 @@ class Compiler implements PikeVMOpcodes { (negative ? NEGATIVE_LOOKAHEAD : LOOKBEHIND)); output.add(output.addLookaround(vm)); } + + public String toString() { + String inner = group.toString(); + if (inner.startsWith("(?:")) { + inner = inner.substring(3); + } else { + inner += ")"; + } + return "(?=" + inner; + } } private class Group0 extends Expression { @@ -284,6 +352,12 @@ class Compiler implements PikeVMOpcodes { output.markFindPreambleEnd(); group.writeCode(output); } + + public String toString() { + String inner = group.toString(); + return inner.startsWith("(?:") && inner.endsWith(")") ? 
+ inner.substring(1, inner.length() - 1) : inner; + } } private Group0 root; From e96379ee190fd7a664db02aefa8554161b1fa327 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Fri, 15 Nov 2013 13:09:26 -0600 Subject: [PATCH 30/31] Regex: document the strengths and limitations Signed-off-by: Johannes Schindelin --- test/regex/RegexPattern.java | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/test/regex/RegexPattern.java b/test/regex/RegexPattern.java index 0e6ed488d0..efa5426ce0 100644 --- a/test/regex/RegexPattern.java +++ b/test/regex/RegexPattern.java @@ -12,6 +12,34 @@ package regex; /** * A minimal implementation of a regular expression engine. + *

<p>
+ * Intended as a permissively-licensed drop-in replacement for Oracle JDK's
+ * regular expression engine, this class uses the Pike VM implemented in
+ * {@link PikeVM} to match regular expressions.
+ * </p>
+ * <p>
+ * The Pike VM not only has a nicer runtime performance than Oracle JDK's
+ * backtracking approach -- O(n*m) instead of O(2^m) where
+ * n is the length of the regular expression pattern (after normalizing
+ * {&lt;n&gt;} quantifiers) and m the length of the text to match against
+ * the pattern -- but also supports arbitrary-sized look-behinds.
+ * </p>
+ * <p>
+ * The current implementation supports all regular expression constructs
+ * supported by Oracle JDK's regular expression engine except for the following
+ * ones:
+ * <ul>
+ * <li>control characters: \cX</li>
+ * <li>extended character classes: \p{...}</li>
+ * <li>extended boundary matchers: \A,\G,\Z,\z</li>
+ * <li>possessive quantifiers: X?+</li>
+ * <li>back references: \&lt;n&gt;, \k&lt;name&gt;</li>
+ * <li>long escape: \Q, \E</li>
+ * <li>named groups: (?&lt;name&gt;X)</li>
+ * <li>flags: (?idmsuxU)</li>
+ * <li>independent, non-capturing group: (?&gt;X)</li>
+ * </ul>
+ * </p>

* * @author Johannes Schindelin */ From 6626b477ad783f17fe53f70884f9313a2e7fd926 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Fri, 22 Nov 2013 17:42:12 -0600 Subject: [PATCH 31/31] Replace java.util.regex.* with the new regular expression engine Signed-off-by: Johannes Schindelin --- .../java/util}/regex/CharacterMatcher.java | 2 +- .../java/util}/regex/Compiler.java | 2 +- classpath/java/util/regex/Matcher.java | 73 ++++--- classpath/java/util/regex/Pattern.java | 184 ++---------------- .../java/util}/regex/PikeVM.java | 2 +- .../java/util}/regex/PikeVMOpcodes.java | 2 +- .../java/util}/regex/RegexMatcher.java | 2 +- .../java/util}/regex/RegexPattern.java | 2 +- .../java/util}/regex/TrivialMatcher.java | 2 +- .../java/util}/regex/TrivialPattern.java | 2 +- makefile | 2 +- test/Regex.java | 4 +- test/regex/Matcher.java | 119 ----------- test/regex/Pattern.java | 89 --------- 14 files changed, 72 insertions(+), 415 deletions(-) rename {test => classpath/java/util}/regex/CharacterMatcher.java (99%) rename {test => classpath/java/util}/regex/Compiler.java (99%) rename {test => classpath/java/util}/regex/PikeVM.java (99%) rename {test => classpath/java/util}/regex/PikeVMOpcodes.java (97%) rename {test => classpath/java/util}/regex/RegexMatcher.java (98%) rename {test => classpath/java/util}/regex/RegexPattern.java (98%) rename {test => classpath/java/util}/regex/TrivialMatcher.java (97%) rename {test => classpath/java/util}/regex/TrivialPattern.java (99%) delete mode 100644 test/regex/Matcher.java delete mode 100644 test/regex/Pattern.java diff --git a/test/regex/CharacterMatcher.java b/classpath/java/util/regex/CharacterMatcher.java similarity index 99% rename from test/regex/CharacterMatcher.java rename to classpath/java/util/regex/CharacterMatcher.java index c423a34514..36a74fe4c4 100644 --- a/test/regex/CharacterMatcher.java +++ b/classpath/java/util/regex/CharacterMatcher.java @@ -8,7 +8,7 @@ There is NO WARRANTY for this software. 
See license.txt for details. */ -package regex; +package java.util.regex; /** * A class to match classes of characters. diff --git a/test/regex/Compiler.java b/classpath/java/util/regex/Compiler.java similarity index 99% rename from test/regex/Compiler.java rename to classpath/java/util/regex/Compiler.java index dd593ac19e..0cf50fcee4 100644 --- a/test/regex/Compiler.java +++ b/classpath/java/util/regex/Compiler.java @@ -8,7 +8,7 @@ There is NO WARRANTY for this software. See license.txt for details. */ -package regex; +package java.util.regex; import java.util.ArrayList; import java.util.Stack; diff --git a/classpath/java/util/regex/Matcher.java b/classpath/java/util/regex/Matcher.java index 4397931fdf..89f8306cc5 100644 --- a/classpath/java/util/regex/Matcher.java +++ b/classpath/java/util/regex/Matcher.java @@ -15,27 +15,23 @@ package java.util.regex; * * @author zsombor and others */ -public class Matcher { - private final Pattern pattern; - private CharSequence input; - private int start; - private int end; +public abstract class Matcher { + protected CharSequence input; + protected int start; + protected int end; - Matcher(Pattern pattern, CharSequence input) { - this.pattern = pattern; - this.input = input; + public Matcher(CharSequence input) { + reset(input); } - public boolean matches() { - if (pattern.pattern().equals(input.toString())) { - start = 0; - end = input.length(); - return true; - } else { - return false; - } + public abstract boolean matches(); + + public boolean find() { + return find(end); } + public abstract boolean find(int start); + public Matcher reset() { return reset(input); } @@ -47,10 +43,6 @@ public class Matcher { return this; } - public int start() { - return start; - } - public String replaceAll(String replacement) { return replace(replacement, Integer.MAX_VALUE); } @@ -59,7 +51,7 @@ public class Matcher { return replace(replacement, 1); } - private String replace(String replacement, int limit) { + protected String 
replace(String replacement, int limit) { reset(); StringBuilder sb = null; @@ -88,23 +80,40 @@ public class Matcher { return sb.toString(); } + public int start() { + return start; + } + public int end() { return end; } - public boolean find() { - return find(end); + public String group() { + return input.subSequence(start, end).toString(); } - public boolean find(int start) { - String p = pattern.pattern(); - int i = Pattern.indexOf(input, p, start); - if (i >= 0) { - this.start = i; - this.end = i + p.length(); - return true; - } else { - return false; + public int start(int group) { + if (group == 0) { + return start(); } + throw new UnsupportedOperationException(); + } + + public int end(int group) { + if (group == 0) { + return end(); + } + throw new UnsupportedOperationException(); + } + + public String group(int group) { + if (group == 0) { + return group(); + } + throw new UnsupportedOperationException(); + } + + public int groupCount() { + return 0; } } diff --git a/classpath/java/util/regex/Pattern.java b/classpath/java/util/regex/Pattern.java index b9c84eb6f3..be63b73e29 100644 --- a/classpath/java/util/regex/Pattern.java +++ b/classpath/java/util/regex/Pattern.java @@ -10,9 +10,8 @@ package java.util.regex; -import java.util.Iterator; +import java.util.ArrayList; import java.util.List; -import java.util.LinkedList; /** * This is a work in progress. 
@@ -20,7 +19,7 @@ import java.util.LinkedList; * @author zsombor and others * */ -public class Pattern { +public abstract class Pattern implements PikeVMOpcodes { public static final int UNIX_LINES = 1; public static final int CASE_INSENSITIVE = 2; @@ -35,112 +34,26 @@ public class Pattern { private final String pattern; protected Pattern(String pattern, int flags) { - this.pattern = trivial(pattern); + this.pattern = pattern; this.patternFlags = flags; } - private static String trivial(String pattern) { - StringBuffer buffer = new StringBuffer(); - for (int i = 0; i < pattern.length(); ++i) { - char c = pattern.charAt(i); - switch (c) { - case '\\': - if (++i == pattern.length()) { - break; - } - c = pattern.charAt(i); - if (c == '0') { - int len = digits(pattern, ++i, 3, 8); - if (len == 3 && pattern.charAt(i) > '3') { - --len; - } - c = (char)Integer.parseInt(pattern.substring(i, i + len), 8); - i += len - 1; - } else if (c == 'x' || c == 'u') { - int len = digits(pattern, ++i, 4, 16); - c = (char)Integer.parseInt(pattern.substring(i, i + len), 16); - i += len - 1; - } else { - c = unescape(pattern.charAt(i)); - } - if (c != -1) { - break; - } - // fallthru - case '.': - case '*': - case '+': - case '?': - case '|': - case '[': - case ']': - case '{': - case '}': - case '(': - case ')': - case '^': - case '$': - throw new UnsupportedOperationException - ("only trivial regular expressions are supported so far (" + pattern + ")"); - } - buffer.append(c); - } - return buffer.toString(); - } - - private static int digits(String s, int offset, int maxLength, int base) { - for (int i = 0; ; ++i) { - if (i == maxLength || offset + i >= s.length()) { - return i; - } - int value = s.charAt(offset + i) - '0'; - if (value < 0) { - return i; - } - if (base > 10 && value >= 10) { - value += 10 - (value >= 'a' - '0' ? 
'a' - '0' : 'A' - '0'); - } - if (value >= base) { - return i; - } - } - } - - private static char unescape(char c) { - switch (c) { - case '\\': - return c; - case 'a': - return 0x0007; - case 'e': - return 0x001B; - case 'f': - return 0x000C; - case 'n': - return 0x000A; - case 'r': - return 0x000D; - case 't': - return 0x0009; - } - return (char)-1; - } - public static Pattern compile(String regex) { - return new Pattern(regex, 0); + return compile(regex, 0); } public static Pattern compile(String regex, int flags) { - return new Pattern(regex, flags); + if (flags != 0) { + throw new UnsupportedOperationException("TODO"); + } + return new Compiler().compile(regex); } public int flags() { return patternFlags; } - public Matcher matcher(CharSequence input) { - return new Matcher(this, input); - } + public abstract Matcher matcher(CharSequence input); public static boolean matches(String regex, CharSequence input) { return Pattern.compile(regex).matcher(input).matches(); @@ -155,79 +68,22 @@ public class Pattern { } public String[] split(CharSequence input, int limit) { - boolean strip; - if (limit < 0) { - strip = false; + if (limit <= 0) { limit = Integer.MAX_VALUE; - } else if (limit == 0) { - strip = true; - limit = Integer.MAX_VALUE; - } else { - strip = false; } - - List list = new LinkedList(); - int index = 0; - int trailing = 0; - int patternLength = pattern.length(); - while (index < input.length() && list.size() < limit - 1) { - int i; - if (patternLength == 0) { - if (list.size() == 0) { - i = 0; - } else { - i = index + 1; - } - } else { - i = indexOf(input, pattern, index); - } - - if (i >= 0) { - if (patternLength != 0 && i == index) { - ++ trailing; - } else { - trailing = 0; - } - - list.add(input.subSequence(index, i)); - index = i + patternLength; - } else { + Matcher matcher = matcher(input); + List result = new ArrayList(); + int offset = 0; + for (;;) { + if (result.size() >= limit || !matcher.find()) { break; } + 
result.add(input.subSequence(offset, matcher.start()).toString()); + offset = matcher.end(); } - - if (strip && index > 0 && index == input.length()) { - ++ trailing; - } else { - trailing = 0; + if (offset == 0 || offset < input.length()) { + result.add(input.subSequence(offset, input.length()).toString()); } - list.add(input.subSequence(index, input.length())); - - String[] result = new String[list.size() - trailing]; - int i = 0; - for (Iterator it = list.iterator(); - it.hasNext() && i < result.length; ++ i) - { - result[i] = it.next().toString(); - } - return result; - } - - static int indexOf(CharSequence haystack, CharSequence needle, int start) { - if (needle.length() == 0) return start; - - for (int i = start; i < haystack.length() - needle.length() + 1; ++i) { - int j = 0; - for (; j < needle.length(); ++j) { - if (haystack.charAt(i + j) != needle.charAt(j)) { - break; - } - } - if (j == needle.length()) { - return i; - } - } - - return -1; + return result.toArray(new String[result.size()]); } } diff --git a/test/regex/PikeVM.java b/classpath/java/util/regex/PikeVM.java similarity index 99% rename from test/regex/PikeVM.java rename to classpath/java/util/regex/PikeVM.java index 0decad95d8..d34ef068a3 100644 --- a/test/regex/PikeVM.java +++ b/classpath/java/util/regex/PikeVM.java @@ -8,7 +8,7 @@ There is NO WARRANTY for this software. See license.txt for details. */ -package regex; +package java.util.regex; /** * A minimal implementation of a regular expression engine. diff --git a/test/regex/PikeVMOpcodes.java b/classpath/java/util/regex/PikeVMOpcodes.java similarity index 97% rename from test/regex/PikeVMOpcodes.java rename to classpath/java/util/regex/PikeVMOpcodes.java index 80ccff4bda..d932aec870 100644 --- a/test/regex/PikeVMOpcodes.java +++ b/classpath/java/util/regex/PikeVMOpcodes.java @@ -8,7 +8,7 @@ There is NO WARRANTY for this software. See license.txt for details. */ -package regex; +package java.util.regex; /** * Opcodes for the Pike VM. 
diff --git a/test/regex/RegexMatcher.java b/classpath/java/util/regex/RegexMatcher.java similarity index 98% rename from test/regex/RegexMatcher.java rename to classpath/java/util/regex/RegexMatcher.java index 78bc7c77ca..145b15a704 100644 --- a/test/regex/RegexMatcher.java +++ b/classpath/java/util/regex/RegexMatcher.java @@ -8,7 +8,7 @@ There is NO WARRANTY for this software. See license.txt for details. */ -package regex; +package java.util.regex; /** * A minimal implementation of a regular expression matcher. diff --git a/test/regex/RegexPattern.java b/classpath/java/util/regex/RegexPattern.java similarity index 98% rename from test/regex/RegexPattern.java rename to classpath/java/util/regex/RegexPattern.java index efa5426ce0..bceb90cfc4 100644 --- a/test/regex/RegexPattern.java +++ b/classpath/java/util/regex/RegexPattern.java @@ -8,7 +8,7 @@ There is NO WARRANTY for this software. See license.txt for details. */ -package regex; +package java.util.regex; /** * A minimal implementation of a regular expression engine. diff --git a/test/regex/TrivialMatcher.java b/classpath/java/util/regex/TrivialMatcher.java similarity index 97% rename from test/regex/TrivialMatcher.java rename to classpath/java/util/regex/TrivialMatcher.java index 9a1a7d3737..2b735f83e1 100644 --- a/test/regex/TrivialMatcher.java +++ b/classpath/java/util/regex/TrivialMatcher.java @@ -8,7 +8,7 @@ There is NO WARRANTY for this software. See license.txt for details. */ -package regex; +package java.util.regex; /** * This is a work in progress. diff --git a/test/regex/TrivialPattern.java b/classpath/java/util/regex/TrivialPattern.java similarity index 99% rename from test/regex/TrivialPattern.java rename to classpath/java/util/regex/TrivialPattern.java index 6289edc676..1041e1bfaa 100644 --- a/test/regex/TrivialPattern.java +++ b/classpath/java/util/regex/TrivialPattern.java @@ -8,7 +8,7 @@ There is NO WARRANTY for this software. See license.txt for details. 
*/ -package regex; +package java.util.regex; import java.util.Iterator; import java.util.List; diff --git a/makefile b/makefile index d66cfea947..0b98039593 100755 --- a/makefile +++ b/makefile @@ -1344,7 +1344,7 @@ vm-classes = \ avian/*.class \ avian/resource/*.class -test-support-sources = $(shell find $(test)/avian $(test)/regex -name '*.java') +test-support-sources = $(shell find $(test)/avian/ -name '*.java') test-sources = $(wildcard $(test)/*.java) test-cpp-sources = $(wildcard $(test)/*.cpp) test-sources += $(test-support-sources) diff --git a/test/Regex.java b/test/Regex.java index 12a93fe270..22108dde5a 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -1,5 +1,5 @@ -import regex.Matcher; -import regex.Pattern; +import java.util.regex.Matcher; +import java.util.regex.Pattern; public class Regex { private static void expect(boolean v) { diff --git a/test/regex/Matcher.java b/test/regex/Matcher.java deleted file mode 100644 index 13c2efba1a..0000000000 --- a/test/regex/Matcher.java +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (c) 2008-2013, Avian Contributors - - Permission to use, copy, modify, and/or distribute this software - for any purpose with or without fee is hereby granted, provided - that the above copyright notice and this permission notice appear - in all copies. - - There is NO WARRANTY for this software. See license.txt for - details. */ - -package regex; - -/** - * This is a work in progress. 
- * - * @author zsombor and others - */ -public abstract class Matcher { - protected CharSequence input; - protected int start; - protected int end; - - public Matcher(CharSequence input) { - reset(input); - } - - public abstract boolean matches(); - - public boolean find() { - return find(end); - } - - public abstract boolean find(int start); - - public Matcher reset() { - return reset(input); - } - - public Matcher reset(CharSequence input) { - this.input = input; - start = 0; - end = 0; - return this; - } - - public String replaceAll(String replacement) { - return replace(replacement, Integer.MAX_VALUE); - } - - public String replaceFirst(String replacement) { - return replace(replacement, 1); - } - - protected String replace(String replacement, int limit) { - reset(); - - StringBuilder sb = null; - int index = 0; - int count = 0; - while (count < limit && index < input.length()) { - if (find(index)) { - if (sb == null) { - sb = new StringBuilder(); - } - if (start > index) { - sb.append(input.subSequence(index, start)); - } - sb.append(replacement); - index = end; - ++ count; - } else if (index == 0) { - return input.toString(); - } else { - break; - } - } - if (index < input.length()) { - sb.append(input.subSequence(index, input.length())); - } - return sb.toString(); - } - - public int start() { - return start; - } - - public int end() { - return end; - } - - public String group() { - return input.subSequence(start, end).toString(); - } - - public int start(int group) { - if (group == 0) { - return start(); - } - throw new UnsupportedOperationException(); - } - - public int end(int group) { - if (group == 0) { - return end(); - } - throw new UnsupportedOperationException(); - } - - public String group(int group) { - if (group == 0) { - return group(); - } - throw new UnsupportedOperationException(); - } - - public int groupCount() { - return 0; - } -} diff --git a/test/regex/Pattern.java b/test/regex/Pattern.java deleted file mode 100644 index 
f0d5596e2f..0000000000 --- a/test/regex/Pattern.java +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2008-2013, Avian Contributors - - Permission to use, copy, modify, and/or distribute this software - for any purpose with or without fee is hereby granted, provided - that the above copyright notice and this permission notice appear - in all copies. - - There is NO WARRANTY for this software. See license.txt for - details. */ - -package regex; - -import java.util.ArrayList; -import java.util.List; - -/** - * This is a work in progress. - * - * @author zsombor and others - * - */ -public abstract class Pattern implements PikeVMOpcodes { - - public static final int UNIX_LINES = 1; - public static final int CASE_INSENSITIVE = 2; - public static final int COMMENTS = 4; - public static final int MULTILINE = 8; - public static final int LITERAL = 16; - public static final int DOTALL = 32; - public static final int UNICODE_CASE = 64; - public static final int CANON_EQ = 128; - - private final int patternFlags; - private final String pattern; - - protected Pattern(String pattern, int flags) { - this.pattern = pattern; - this.patternFlags = flags; - } - - public static Pattern compile(String regex) { - return compile(regex, 0); - } - - public static Pattern compile(String regex, int flags) { - if (flags != 0) { - throw new UnsupportedOperationException("TODO"); - } - return new Compiler().compile(regex); - } - - public int flags() { - return patternFlags; - } - - public abstract Matcher matcher(CharSequence input); - - public static boolean matches(String regex, CharSequence input) { - return Pattern.compile(regex).matcher(input).matches(); - } - - public String pattern() { - return pattern; - } - - public String[] split(CharSequence input) { - return split(input, 0); - } - - public String[] split(CharSequence input, int limit) { - if (limit <= 0) { - limit = Integer.MAX_VALUE; - } - Matcher matcher = matcher(input); - List result = new ArrayList(); - int offset = 0; - for 
(;;) { - if (result.size() >= limit || !matcher.find()) { - break; - } - result.add(input.subSequence(offset, matcher.start()).toString()); - offset = matcher.end(); - } - if (offset == 0 || offset < input.length()) { - result.add(input.subSequence(offset, input.length()).toString()); - } - return result.toArray(new String[result.size()]); - } -}