Refactor Pattern / Matcher classes

This makes both the Pattern and the Matcher class abstract so that more
specialized patterns than the trivial patterns we support so far can be
implemented as convenient subclasses of the respective abstract base
classes.

To ease development, we work on copies in test/regex/ in the 'regex'
package. That way, it can be developed in Eclipse (because it does not
interfere with Oracle JRE's java.util.regex.* classes).

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
This commit is contained in:
Johannes Schindelin 2013-11-01 22:12:38 -05:00
parent b4e1ee97eb
commit 84829dc390
5 changed files with 427 additions and 1 deletions

View File

@ -1344,7 +1344,7 @@ vm-classes = \
avian/*.class \ avian/*.class \
avian/resource/*.class avian/resource/*.class
test-support-sources = $(shell find $(test)/avian/ -name '*.java') test-support-sources = $(shell find $(test)/avian $(test)/regex -name '*.java')
test-sources = $(wildcard $(test)/*.java) test-sources = $(wildcard $(test)/*.java)
test-cpp-sources = $(wildcard $(test)/*.cpp) test-cpp-sources = $(wildcard $(test)/*.cpp)
test-sources += $(test-support-sources) test-sources += $(test-support-sources)

90
test/regex/Matcher.java Normal file
View File

@ -0,0 +1,90 @@
/* Copyright (c) 2008-2013, Avian Contributors
Permission to use, copy, modify, and/or distribute this software
for any purpose with or without fee is hereby granted, provided
that the above copyright notice and this permission notice appear
in all copies.
There is NO WARRANTY for this software. See license.txt for
details. */
package regex;
/**
 * Base class for engines that search a {@link CharSequence} for occurrences
 * of a pattern.
 *
 * <p>Subclasses implement {@link #matches()} and {@link #find(int)} and are
 * expected to store the bounds of every successful match in {@link #start}
 * and {@link #end}; the replacement helpers in this class read those fields
 * after each successful {@code find(int)}.
 *
 * <p>This is a work in progress.
 *
 * @author zsombor and others
 */
public abstract class Matcher {
    /** The character sequence being searched. */
    protected CharSequence input;
    /** Index of the first character of the most recent match. */
    protected int start;
    /** Index one past the last character of the most recent match. */
    protected int end;

    /**
     * Creates a matcher positioned at the beginning of {@code input}.
     *
     * @param input the character sequence to search
     */
    public Matcher(CharSequence input) {
        reset(input);
    }

    /**
     * Tests whether the entire input is matched by the pattern.
     *
     * @return {@code true} if the whole input matches
     */
    public abstract boolean matches();

    /**
     * Searches for the next match, starting at the end of the previous one
     * (or at index 0 right after a reset).
     *
     * <p>NOTE: the search simply restarts at {@link #end}, so a subclass that
     * can produce zero-length matches will report the same empty match on
     * every call.
     *
     * @return {@code true} if a match was found
     */
    public boolean find() {
        return find(end);
    }

    /**
     * Searches for the first match at or after the given index and, on
     * success, records its bounds in {@link #start} and {@link #end}.
     *
     * @param start index at which the search begins
     * @return {@code true} if a match was found
     */
    public abstract boolean find(int start);

    /**
     * Rewinds this matcher to the beginning of the current input.
     *
     * @return this matcher
     */
    public Matcher reset() {
        return reset(input);
    }

    /**
     * Replaces the input and rewinds this matcher to index 0.
     *
     * @param input the new character sequence to search
     * @return this matcher
     */
    public Matcher reset(CharSequence input) {
        this.input = input;
        start = 0;
        end = 0;
        return this;
    }

    /**
     * Replaces every match in the input with {@code replacement}.
     *
     * @param replacement text inserted verbatim in place of each match
     * @return the resulting string; the unchanged input if nothing matched
     */
    public String replaceAll(String replacement) {
        return replace(replacement, Integer.MAX_VALUE);
    }

    /**
     * Replaces only the first match in the input with {@code replacement}.
     *
     * @param replacement text inserted verbatim in place of the match
     * @return the resulting string; the unchanged input if nothing matched
     */
    public String replaceFirst(String replacement) {
        return replace(replacement, 1);
    }

    /**
     * Resets this matcher and replaces up to {@code limit} matches, scanning
     * the input from left to right.
     *
     * <p>The replacement text is inserted literally. A zero-length match is
     * replaced once, after which the following input character (if any) is
     * copied unchanged so that the scan always makes progress.
     *
     * @param replacement text inserted in place of each match
     * @param limit maximum number of matches to replace
     * @return the resulting string; the unchanged input if nothing was
     *         replaced
     */
    protected String replace(String replacement, int limit) {
        reset();
        final int length = input.length();
        StringBuilder sb = null;
        int index = 0;
        int count = 0;
        // index == length is still searched so that a pattern matching the
        // empty string is also replaced at the very end of the input.
        while (count < limit && index <= length && find(index)) {
            if (sb == null) {
                sb = new StringBuilder();
            }
            if (start > index) {
                sb.append(input.subSequence(index, start));
            }
            sb.append(replacement);
            ++count;
            if (end > start) {
                index = end;
            } else {
                // Zero-length match: step over one character, otherwise the
                // same empty match would be found again forever.
                if (end < length) {
                    sb.append(input.charAt(end));
                }
                index = end + 1;
            }
        }
        if (sb == null) {
            // Nothing was replaced (no match, empty input or limit <= 0).
            return input.toString();
        }
        if (index < length) {
            sb.append(input.subSequence(index, length));
        }
        return sb.toString();
    }

    /**
     * @return index of the first character of the most recent match
     */
    public int start() {
        return start;
    }

    /**
     * @return index one past the last character of the most recent match
     */
    public int end() {
        return end;
    }
}

89
test/regex/Pattern.java Normal file
View File

@ -0,0 +1,89 @@
/* Copyright (c) 2008-2013, Avian Contributors
Permission to use, copy, modify, and/or distribute this software
for any purpose with or without fee is hereby granted, provided
that the above copyright notice and this permission notice appear
in all copies.
There is NO WARRANTY for this software. See license.txt for
details. */
package regex;
import java.util.ArrayList;
import java.util.List;
/**
 * Base class of compiled regular expressions.
 *
 * <p>{@link #compile(String, int)} selects a concrete implementation; each
 * implementation supplies its own {@link Matcher} through
 * {@link #matcher(CharSequence)}.
 *
 * <p>This is a work in progress.
 *
 * @author zsombor and others
 *
 */
public abstract class Pattern {
    // Flag bits, numerically identical to the same-named constants of
    // java.util.regex.Pattern. This class only stores them; honoring them is
    // up to the concrete implementation.
    public static final int UNIX_LINES = 1;
    public static final int CASE_INSENSITIVE = 2;
    public static final int COMMENTS = 4;
    public static final int MULTILINE = 8;
    public static final int LITERAL = 16;
    public static final int DOTALL = 32;
    public static final int UNICODE_CASE = 64;
    public static final int CANON_EQ = 128;

    private final int patternFlags;
    private final String pattern;

    /**
     * @param pattern the regular expression source text
     * @param flags bit mask of the flag constants declared in this class
     */
    protected Pattern(String pattern, int flags) {
        this.pattern = pattern;
        this.patternFlags = flags;
    }

    /**
     * Compiles {@code regex} without any flags.
     *
     * @param regex the regular expression
     * @return the compiled pattern
     * @throws UnsupportedOperationException if no implementation can handle
     *         the expression
     */
    public static Pattern compile(String regex) {
        return compile(regex, 0);
    }

    /**
     * Compiles {@code regex} with the given flags.
     *
     * @param regex the regular expression
     * @param flags bit mask of the flag constants declared in this class
     * @return the compiled pattern
     * @throws UnsupportedOperationException if no implementation can handle
     *         the expression
     */
    public static Pattern compile(String regex, int flags) {
        try {
            return new TrivialPattern(regex, flags);
        } catch (UnsupportedOperationException handledBelow) {
            // Deliberately ignored: the implementation rejected the regex,
            // which is reported by the exception thrown below.
        }
        throw new UnsupportedOperationException("Cannot handle regex " + regex);
    }

    /**
     * @return the flags this pattern was compiled with
     */
    public int flags() {
        return patternFlags;
    }

    /**
     * Creates a matcher that applies this pattern to {@code input}.
     *
     * @param input the character sequence to search
     * @return a new matcher
     */
    public abstract Matcher matcher(CharSequence input);

    /**
     * Compiles {@code regex} and tests whether it matches all of
     * {@code input}.
     *
     * @param regex the regular expression
     * @param input the character sequence to test
     * @return {@code true} if the entire input matches
     */
    public static boolean matches(String regex, CharSequence input) {
        return Pattern.compile(regex).matcher(input).matches();
    }

    /**
     * @return the regular expression source text this pattern was compiled
     *         from
     */
    public String pattern() {
        return pattern;
    }

    /**
     * Splits {@code input} around matches of this pattern, dropping trailing
     * empty strings; equivalent to {@code split(input, 0)}.
     *
     * @param input the character sequence to split
     * @return the pieces between the matches
     */
    public String[] split(CharSequence input) {
        return split(input, 0);
    }

    /**
     * Splits {@code input} around matches of this pattern.
     *
     * <ul>
     * <li>{@code limit > 0}: at most {@code limit} pieces are returned; the
     *     last piece holds the unsplit rest of the input.</li>
     * <li>{@code limit < 0}: the input is split at every match and trailing
     *     empty strings are kept.</li>
     * <li>{@code limit == 0}: the input is split at every match and trailing
     *     empty strings are dropped.</li>
     * </ul>
     *
     * If nothing matches, the result is a single element holding the whole
     * input.
     *
     * @param input the character sequence to split
     * @param limit controls the number of pieces, see above
     * @return the pieces between the matches
     */
    public String[] split(CharSequence input, int limit) {
        final boolean dropTrailingEmpty = limit == 0;
        final int maxPieces = limit > 0 ? limit : Integer.MAX_VALUE;
        Matcher matcher = matcher(input);
        List<String> result = new ArrayList<String>();
        int offset = 0;
        int previousEnd = -1;
        boolean matched = false;
        // Stop one piece early: the final slot is reserved for the remainder.
        while (result.size() < maxPieces - 1 && matcher.find()) {
            int matchStart = matcher.start();
            int matchEnd = matcher.end();
            if (matchStart == matchEnd && matchEnd == previousEnd) {
                // The matcher reported the same empty match again; bail out
                // instead of looping forever.
                break;
            }
            previousEnd = matchEnd;
            matched = true;
            result.add(input.subSequence(offset, matchStart).toString());
            offset = matchEnd;
        }
        result.add(input.subSequence(offset, input.length()).toString());

        int size = result.size();
        if (dropTrailingEmpty && matched) {
            while (size > 0 && result.get(size - 1).length() == 0) {
                --size;
            }
        }
        String[] pieces = new String[size];
        for (int i = 0; i < size; ++i) {
            pieces[i] = result.get(i);
        }
        return pieces;
    }
}

View File

@ -0,0 +1,48 @@
/* Copyright (c) 2008-2013, Avian Contributors
Permission to use, copy, modify, and/or distribute this software
for any purpose with or without fee is hereby granted, provided
that the above copyright notice and this permission notice appear
in all copies.
There is NO WARRANTY for this software. See license.txt for
details. */
package regex;
/**
 * Matcher for patterns that consist of nothing but a literal string.
 *
 * <p>This is a work in progress.
 *
 * @author zsombor and others
 */
class TrivialMatcher extends Matcher {
    /** The literal text to look for. */
    private final String pattern;

    /**
     * @param pattern the literal text to look for
     * @param input the character sequence to search
     */
    TrivialMatcher(String pattern, CharSequence input) {
        super(input);
        this.pattern = pattern;
    }

    /**
     * Succeeds when the input is exactly the literal; the match bounds then
     * span the whole input.
     */
    public boolean matches() {
        boolean identical = pattern.equals(input.toString());
        if (identical) {
            start = 0;
            end = input.length();
        }
        return identical;
    }

    /**
     * Looks for the literal at or after {@code start} and records the bounds
     * of the occurrence when one is found.
     */
    public boolean find(int start) {
        int hit = TrivialPattern.indexOf(input, pattern, start);
        if (hit < 0) {
            return false;
        }
        this.start = hit;
        this.end = hit + pattern.length();
        return true;
    }
}

View File

@ -0,0 +1,199 @@
/* Copyright (c) 2008-2013, Avian Contributors
Permission to use, copy, modify, and/or distribute this software
for any purpose with or without fee is hereby granted, provided
that the above copyright notice and this permission notice appear
in all copies.
There is NO WARRANTY for this software. See license.txt for
details. */
package regex;
import java.util.Iterator;
import java.util.List;
import java.util.LinkedList;
/**
 * A pattern that is nothing but a literal string, possibly written with
 * backslash escapes. Any regular expression that uses an actual regex
 * construct is rejected with an {@link UnsupportedOperationException}.
 *
 * <p>NOTE: the flags passed to the constructor are stored by the base class
 * but are not consulted by this implementation.
 *
 * <p>This is a work in progress.
 *
 * @author zsombor and others
 *
 */
public class TrivialPattern extends Pattern {
    /** The literal text denoted by the pattern, with escapes resolved. */
    private final String trivialPattern;

    /**
     * @param pattern the regular expression source text
     * @param flags bit mask of {@link Pattern} flag constants
     * @throws UnsupportedOperationException if {@code pattern} is not a
     *         plain literal
     */
    TrivialPattern(String pattern, int flags) {
        super(pattern, flags);
        this.trivialPattern = trivial(pattern);
    }

    /**
     * Translates a regular expression that denotes a literal string into
     * that string.
     *
     * <p>Supported escapes: octal ({@code \0n}, {@code \0nn}, {@code \0mnn}),
     * hexadecimal ({@code \xhh}, {@code \}{@code uhhhh}), the control
     * characters {@code \a \e \f \n \r \t}, and a backslash in front of any
     * non-alphanumeric character, which yields that character.
     *
     * @param pattern the regular expression source text
     * @return the literal text the pattern stands for
     * @throws UnsupportedOperationException if the pattern contains an
     *         unescaped metacharacter or an escape that is not listed above
     */
    private static String trivial(String pattern) {
        StringBuilder buffer = new StringBuilder(pattern.length());
        for (int i = 0; i < pattern.length(); ++i) {
            char c = pattern.charAt(i);
            switch (c) {
            case '\\':
                if (++i == pattern.length()) {
                    // A dangling backslash escapes nothing.
                    throw unsupported(pattern);
                }
                c = pattern.charAt(i);
                if (c == '0') {
                    int len = digits(pattern, ++i, 3, 8);
                    if (len == 3 && pattern.charAt(i) > '3') {
                        // Three octal digits only fit into a byte when the
                        // first one is at most 3; otherwise use two digits.
                        --len;
                    }
                    if (len == 0) {
                        throw unsupported(pattern);
                    }
                    c = (char)Integer.parseInt(pattern.substring(i, i + len), 8);
                    i += len - 1;
                } else if (c == 'x' || c == 'u') {
                    // \x takes exactly two hex digits, the Unicode escape four.
                    int expected = c == 'x' ? 2 : 4;
                    int len = digits(pattern, ++i, expected, 16);
                    if (len != expected) {
                        throw unsupported(pattern);
                    }
                    c = (char)Integer.parseInt(pattern.substring(i, i + len), 16);
                    i += len - 1;
                } else {
                    int unescaped = unescape(c);
                    if (unescaped < 0) {
                        throw unsupported(pattern);
                    }
                    c = (char)unescaped;
                }
                break;
            case '.':
            case '*':
            case '+':
            case '?':
            case '|':
            case '[':
            case ']':
            case '{':
            case '}':
            case '(':
            case ')':
            case '^':
            case '$':
                throw unsupported(pattern);
            }
            buffer.append(c);
        }
        return buffer.toString();
    }

    /** Builds the exception used to reject a non-trivial pattern. */
    private static UnsupportedOperationException unsupported(String pattern) {
        return new UnsupportedOperationException
            ("only trivial regular expressions are supported so far (" + pattern + ")");
    }

    /**
     * Counts the leading characters of {@code s}, starting at {@code offset},
     * that are digits in the given base.
     *
     * @param s the string to inspect
     * @param offset index of the first candidate character
     * @param maxLength maximum number of characters to accept
     * @param base the numeric base, e.g. 8 or 16
     * @return the number of digits found, between 0 and {@code maxLength}
     */
    private static int digits(String s, int offset, int maxLength, int base) {
        int count = 0;
        while (count < maxLength && offset + count < s.length()) {
            int value = digitValue(s.charAt(offset + count));
            if (value < 0 || value >= base) {
                break;
            }
            ++count;
        }
        return count;
    }

    /**
     * @return the numeric value of an ASCII digit or letter
     *         ({@code 0-9}, {@code a-z}, {@code A-Z}), or -1 for any other
     *         character
     */
    private static int digitValue(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'z') {
            return c - 'a' + 10;
        }
        if (c >= 'A' && c <= 'Z') {
            return c - 'A' + 10;
        }
        return -1;
    }

    /**
     * Resolves the character following a backslash.
     *
     * @param c the character after the backslash
     * @return the character the escape stands for, or -1 if the escape is
     *         not a supported literal escape (an int is returned so that the
     *         sentinel cannot collide with a real character)
     */
    private static int unescape(char c) {
        switch (c) {
        case 'a':
            return 0x0007;
        case 'e':
            return 0x001B;
        case 'f':
            return 0x000C;
        case 'n':
            return 0x000A;
        case 'r':
            return 0x000D;
        case 't':
            return 0x0009;
        }
        // Escaped letters and digits denote regex constructs (character
        // classes, back references, ...); anything else stands for itself.
        return digitValue(c) >= 0 ? -1 : c;
    }

    /**
     * Creates a matcher that looks for the literal text in {@code input}.
     */
    public Matcher matcher(CharSequence input) {
        return new TrivialMatcher(trivialPattern, input);
    }

    /**
     * Splits {@code input} around occurrences of the literal text.
     *
     * <p>A positive {@code limit} caps the number of pieces, the last piece
     * holding the unsplit rest. A negative {@code limit} splits at every
     * occurrence and keeps trailing empty strings. A {@code limit} of zero
     * splits at every occurrence and drops trailing empty strings.
     *
     * @param input the character sequence to split
     * @param limit controls the number of pieces, see above
     * @return the pieces between the occurrences
     */
    public String[] split(CharSequence input, int limit) {
        // Only limit == 0 asks for trailing empty strings to be removed.
        final boolean strip = limit == 0;
        if (limit <= 0) {
            limit = Integer.MAX_VALUE;
        }
        List<CharSequence> list = new LinkedList<CharSequence>();
        int index = 0;
        // Number of consecutive empty pieces at the current end of the list.
        int trailing = 0;
        int patternLength = trivialPattern.length();
        while (index < input.length() && list.size() < limit - 1) {
            int i;
            if (patternLength == 0) {
                // The empty pattern matches once at index 0 and then between
                // every pair of characters.
                if (list.size() == 0) {
                    i = 0;
                } else {
                    i = index + 1;
                }
            } else {
                i = indexOf(input, trivialPattern, index);
            }
            if (i < 0) {
                break;
            }
            if (patternLength != 0 && i == index) {
                ++ trailing;
            } else {
                trailing = 0;
            }
            list.add(input.subSequence(index, i));
            index = i + patternLength;
        }
        if (strip && index > 0 && index == input.length()) {
            // The remainder added below is empty as well and gets stripped.
            ++ trailing;
        } else {
            trailing = 0;
        }
        list.add(input.subSequence(index, input.length()));
        String[] result = new String[list.size() - trailing];
        int i = 0;
        for (Iterator<CharSequence> it = list.iterator();
                it.hasNext() && i < result.length; ++ i)
        {
            result[i] = it.next().toString();
        }
        return result;
    }

    /**
     * Finds the first occurrence of {@code needle} in {@code haystack} at or
     * after {@code start}.
     *
     * @param haystack the sequence to search
     * @param needle the sequence to look for
     * @param start index at which the search begins
     * @return the index of the occurrence, or -1 if there is none; an empty
     *         needle is reported at {@code start} itself
     */
    static int indexOf(CharSequence haystack, CharSequence needle, int start) {
        if (needle.length() == 0) return start;
        int last = haystack.length() - needle.length();
        for (int i = start; i <= last; ++i) {
            int j = 0;
            for (; j < needle.length(); ++j) {
                if (haystack.charAt(i + j) != needle.charAt(j)) {
                    break;
                }
            }
            if (j == needle.length()) {
                return i;
            }
        }
        return -1;
    }
}