diff --git a/test/Regex.java b/test/Regex.java index b26105e1ed..2139837a1a 100644 --- a/test/Regex.java +++ b/test/Regex.java @@ -83,5 +83,9 @@ public class Regex { "a", " + ", "b", " * ", "x"); expectMatch("[0-9[def]]", "f"); expectNoMatch("[a-z&&[^d-f]]", "f"); + expectSplit("^H", "Hello\nHobbes!", "", "ello\nHobbes!"); + expectSplit("o.*?$", "Hello\r\nHobbes!", "Hello\r\nH"); + expectSplit("\\b", "a+ b + c\nd", "", "a", "+ ", "b", " + ", "c", "\n", "d"); + expectSplit("\\B", "Hi Cal!", "H", "i C", "a", "l!"); } } diff --git a/test/regex/Compiler.java b/test/regex/Compiler.java index 05242e0d0e..3a1b0b3aab 100644 --- a/test/regex/Compiler.java +++ b/test/regex/Compiler.java @@ -305,6 +305,16 @@ class Compiler implements PikeVMOpcodes { current.push(new CharacterRange(characterClass)); continue; } + switch (array[index + 1]) { + case 'b': + index++; + current.push(WORD_BOUNDARY); + continue; + case 'B': + index++; + current.push(NON_WORD_BOUNDARY); + continue; + } throw new RuntimeException("Parse error @" + index + ": " + regex); case '?': case '*': @@ -379,6 +389,12 @@ class Compiler implements PikeVMOpcodes { case '|': current.startAlternative(); continue; + case '^': + current.push(LINE_START); + continue; + case '$': + current.push(LINE_END); + continue; default: throw new RuntimeException("Parse error @" + index + ": " + regex); } diff --git a/test/regex/PikeVM.java b/test/regex/PikeVM.java index 0888cdaa87..0decad95d8 100644 --- a/test/regex/PikeVM.java +++ b/test/regex/PikeVM.java @@ -28,6 +28,11 @@ class PikeVM implements PikeVMOpcodes { private final int findPrefixLength; private final CharacterMatcher[] classes; private final PikeVM[] lookarounds; + private final static CharacterMatcher wordCharacter = + CharacterMatcher.parse("\\w"); + private final static CharacterMatcher lineTerminator = + CharacterMatcher.parse("[\n\r\u0085\u2028\u2029]"); + private boolean multiLine; public interface Result { void set(int[] start, int[] end); @@ -341,6 +346,46 @@ class PikeVM implements PikeVMOpcodes { case DOTALL: current.queueNext(pc, pc + 1, next); break; + case WORD_BOUNDARY: + case NON_WORD_BOUNDARY: { + int i2 = i - step; + int c2 = i2 < 0 || i2 >= characters.length ? -1 : characters[i2]; + switch (opcode) { + case WORD_BOUNDARY: + if ((c2 < 0 || !wordCharacter.matches((char)c2))) { + if (wordCharacter.matches(c)) { + current.queueImmediately(pc, pc + 1, false); + } + } else if (i >= 0 && i < characters.length && + !wordCharacter.matches(c)) { + current.queueImmediately(pc, pc + 1, false); + } + break; + case NON_WORD_BOUNDARY: + if ((c2 < 0 || !wordCharacter.matches((char)c2))) { + if (i >= 0 && i < characters.length && + !wordCharacter.matches(c)) { + current.queueImmediately(pc, pc + 1, false); + } + } else if (wordCharacter.matches(c)) { + current.queueImmediately(pc, pc + 1, false); + } + break; + } + break; + } + case LINE_START: + if (i == 0 || (multiLine && + lineTerminator.matches(characters[i - 1]))) { + current.queueImmediately(pc, pc + 1, false); + } + break; + case LINE_END: + if (i == characters.length || (multiLine && + lineTerminator.matches(c))) { + current.queueImmediately(pc, pc + 1, false); + } + break; case CHARACTER_CLASS: if (classes[program[pc + 1]].matches(c)) { current.queueNext(pc, pc + 2, next); diff --git a/test/regex/PikeVMOpcodes.java b/test/regex/PikeVMOpcodes.java index 53aaa4c5ed..80ccff4bda 100644 --- a/test/regex/PikeVMOpcodes.java +++ b/test/regex/PikeVMOpcodes.java @@ -22,6 +22,11 @@ interface PikeVMOpcodes { final static int DOT = -1; final static int DOTALL = -2; + final static int WORD_BOUNDARY = -10; + final static int NON_WORD_BOUNDARY = -11; + final static int LINE_START = -12; + final static int LINE_END = -13; + final static int CHARACTER_CLASS = -20; final static int LOOKAHEAD = -30;