// JavaScript source: 973 lines, 30 KiB
;(function(root, factory) {
    // UMD-style loader: hand the factory's result to whichever module system is present.
    // istanbul ignore next
    if (typeof define === "function" && define.amd) {
        // AMD loader detected: register an anonymous module with no dependencies.
        // istanbul ignore next
        define([], factory)
    } else if (typeof module === "object" && module.exports) {
        // `module.exports` is available: export the factory's result.
        module.exports = factory()
    } else {
        // No module system detected: attach the result to the root object as `regexpToAst`.
        // istanbul ignore next
        root.regexpToAst = factory()
    }
})(
    // Root object: `self` when it is defined, otherwise `this`.
    typeof self !== "undefined"
        ? // istanbul ignore next
          self
        : this,
    function() {
        // references
        // https://hackernoon.com/the-madness-of-parsing-real-world-javascript-regexps-d9ee336df983
        // https://www.ecma-international.org/ecma-262/8.0/index.html#prod-Pattern
// Parser that turns a regular-expression literal (e.g. "/a|b/gi") into an AST.
// The parsing state lives on the instance:
//   idx      - offset of the next character to read from `input`
//   input    - the text being parsed
//   groupIdx - number of capturing groups opened so far
// `pattern(input)` resets all three before every parse; they are initialised
// here as well so a fresh instance never exposes `undefined` state fields.
function RegExpParser() {
    this.idx = 0
    this.input = ""
    this.groupIdx = 0
}
// Returns a snapshot of the three parser state fields (`idx`, `input`,
// `groupIdx`) that can later be handed to `restoreState`.
RegExpParser.prototype.saveState = function() {
    var snapshot = {}
    snapshot.idx = this.idx
    snapshot.input = this.input
    snapshot.groupIdx = this.groupIdx
    return snapshot
}
// Copies `idx`, `input` and `groupIdx` from `newState` back onto the parser.
RegExpParser.prototype.restoreState = function(newState) {
    var stateFields = ["idx", "input", "groupIdx"]
    for (var f = 0; f < stateFields.length; f++) {
        var field = stateFields[f]
        this[field] = newState[field]
    }
}
// Entry point: parses a whole literal of the form `/body/flags`.
// Resets the parser state, parses the body as a disjunction, then reads the
// trailing flag letters (g, i, m, u, y).
// Returns { type: "Pattern", flags: <Flags node>, value: <Disjunction node> }.
// Throws when a delimiter is missing, a flag is repeated (via `addFlag`), or
// input remains after the flags.
RegExpParser.prototype.pattern = function(input) {
    // parser state
    this.idx = 0
    this.input = input
    this.groupIdx = 0

    this.consumeChar("/")
    var body = this.disjunction()
    this.consumeChar("/")

    var flags = {
        type: "Flags",
        global: false,
        ignoreCase: false,
        multiLine: false,
        unicode: false,
        sticky: false
    }
    var flagNameByLetter = {
        g: "global",
        i: "ignoreCase",
        m: "multiLine",
        u: "unicode",
        y: "sticky"
    }

    while (this.isRegExpFlag()) {
        var letter = this.popChar()
        if (Object.prototype.hasOwnProperty.call(flagNameByLetter, letter)) {
            addFlag(flags, flagNameByLetter[letter])
        }
    }

    if (this.idx === this.input.length) {
        return { type: "Pattern", flags: flags, value: body }
    }
    throw Error("Redundant input: " + this.input.substring(this.idx))
}
// Parses one or more alternatives separated by "|".
// Returns { type: "Disjunction", value: [<Alternative node>, ...] }.
RegExpParser.prototype.disjunction = function() {
    var branches = [this.alternative()]

    while (this.peekChar() === "|") {
        this.consumeChar("|")
        branches.push(this.alternative())
    }

    return { type: "Disjunction", value: branches }
}
// Parses a (possibly empty) run of terms.
// Returns { type: "Alternative", value: [<term node>, ...] }.
RegExpParser.prototype.alternative = function() {
    var sequence = []

    for (var more = this.isTerm(); more; more = this.isTerm()) {
        sequence.push(this.term())
    }

    return { type: "Alternative", value: sequence }
}
// Parses a single term: an assertion when one starts here, otherwise an atom.
RegExpParser.prototype.term = function() {
    return this.isAssertion() ? this.assertion() : this.atom()
}
// Parses an assertion: "^", "$", "\b", "\B", "(?=...)" or "(?!...)".
// Returns a node whose `type` is StartAnchor, EndAnchor, WordBoundary,
// NonWordBoundary, Lookahead or NegativeLookahead; the two lookahead nodes
// also carry the parsed disjunction in `value`.
RegExpParser.prototype.assertion = function() {
    var first = this.popChar()

    if (first === "^") {
        return { type: "StartAnchor" }
    }
    if (first === "$") {
        return { type: "EndAnchor" }
    }

    // '\b' or '\B'
    if (first === "\\") {
        var escaped = this.popChar()
        if (escaped === "b") {
            return { type: "WordBoundary" }
        }
        if (escaped === "B") {
            return { type: "NonWordBoundary" }
        }
        // istanbul ignore next
        throw Error("Invalid Assertion Escape")
    }

    // '(?=' or '(?!'
    if (first === "(") {
        this.consumeChar("?")

        var marker = this.popChar()
        var type
        if (marker === "=") {
            type = "Lookahead"
        } else if (marker === "!") {
            type = "NegativeLookahead"
        }
        ASSERT_EXISTS(type)

        var disjunction = this.disjunction()

        this.consumeChar(")")

        return { type: type, value: disjunction }
    }

    // istanbul ignore next
    ASSERT_NEVER_REACH_HERE()
}
// Parses a quantifier at the current position: "*", "+", "?", "{n}", "{n,}"
// or "{n,m}", each optionally followed by "?" (which sets `greedy` to false).
//
// isBacktracking - pass `true` for a speculative parse (as `isQuantifier`
//   does). An unrecognised quantifier then yields `undefined` rather than
//   tripping ASSERT_EXISTS, and the bounds-order check is skipped so that the
//   real (non-speculative) parse is the one that reports the error.
//
// Returns { atLeast, atMost, greedy, type: "Quantifier" } or `undefined`
// (speculative mode only).
// Throws when not backtracking and no valid quantifier is present, or when
// the upper bound of "{n,m}" is smaller than the lower bound.
RegExpParser.prototype.quantifier = function(isBacktracking) {
    var range
    switch (this.popChar()) {
        case "*":
            range = {
                atLeast: 0,
                atMost: Infinity
            }
            break
        case "+":
            range = {
                atLeast: 1,
                atMost: Infinity
            }
            break
        case "?":
            range = {
                atLeast: 0,
                atMost: 1
            }
            break
        case "{":
            var atLeast = this.integerIncludingZero()
            switch (this.popChar()) {
                case "}":
                    range = {
                        atLeast: atLeast,
                        atMost: atLeast
                    }
                    break
                case ",":
                    var atMost
                    if (this.isDigit()) {
                        atMost = this.integerIncludingZero()
                        range = {
                            atLeast: atLeast,
                            atMost: atMost
                        }
                    } else {
                        range = {
                            atLeast: atLeast,
                            atMost: Infinity
                        }
                    }
                    this.consumeChar("}")
                    break
            }
            // throwing exceptions from "ASSERT_EXISTS" during backtracking
            // causes severe performance degradations
            if (isBacktracking === true && range === undefined) {
                return undefined
            }
            ASSERT_EXISTS(range)
            break
    }

    // throwing exceptions from "ASSERT_EXISTS" during backtracking
    // causes severe performance degradations
    if (isBacktracking === true && range === undefined) {
        return undefined
    }

    ASSERT_EXISTS(range)

    // "{5,2}" is not a valid quantifier. Only report it on the real parse:
    // throwing during the speculative parse would make the caller treat the
    // text as "not a quantifier" and silently parse it as literal characters.
    if (isBacktracking !== true && range.atMost < range.atLeast) {
        throw Error("Numbers out of order in {} quantifier")
    }

    if (this.peekChar(0) === "?") {
        this.consumeChar("?")
        range.greedy = false
    } else {
        range.greedy = true
    }

    range.type = "Quantifier"
    return range
}
// Parses one atom, chosen by the next character: "." (dotAll), "\" (escape),
// "[" (character class), "(" (group), or otherwise a plain pattern character.
// When a quantifier follows, it is attached to the node as `quantifier`.
RegExpParser.prototype.atom = function() {
    var next = this.peekChar()
    var parsed

    if (next === ".") {
        parsed = this.dotAll()
    } else if (next === "\\") {
        parsed = this.atomEscape()
    } else if (next === "[") {
        parsed = this.characterClass()
    } else if (next === "(") {
        parsed = this.group()
    } else if (this.isPatternCharacter()) {
        parsed = this.patternCharacter()
    }

    ASSERT_EXISTS(parsed)

    if (this.isQuantifier()) {
        parsed.quantifier = this.quantifier()
    }

    return parsed
}
// Parses "." and returns it as a complemented Set holding the char codes of
// "\n", "\r", "\u2028" and "\u2029".
RegExpParser.prototype.dotAll = function() {
    this.consumeChar(".")

    var excludedCodes = ["\n", "\r", "\u2028", "\u2029"].map(function(ch) {
        return cc(ch)
    })

    return { type: "Set", complement: true, value: excludedCodes }
}
// Parses a backslash escape outside a character class. After consuming the
// "\", the next character selects the specialised parser; anything not
// listed falls through to `identityEscapeAtom`.
RegExpParser.prototype.atomEscape = function() {
    this.consumeChar("\\")

    var next = this.peekChar()
    var nextIsOneOf = function(chars) {
        return next !== undefined && chars.indexOf(next) !== -1
    }

    if (nextIsOneOf("123456789")) {
        return this.decimalEscapeAtom()
    }
    if (nextIsOneOf("dDsSwW")) {
        return this.characterClassEscape()
    }
    if (nextIsOneOf("fnrtv")) {
        return this.controlEscapeAtom()
    }
    if (next === "c") {
        return this.controlLetterEscapeAtom()
    }
    if (next === "0") {
        return this.nulCharacterAtom()
    }
    if (next === "x") {
        return this.hexEscapeSequenceAtom()
    }
    if (next === "u") {
        return this.regExpUnicodeEscapeSequenceAtom()
    }
    return this.identityEscapeAtom()
}
// Parses the number of a back reference such as "\1".
// Returns { type: "GroupBackReference", value: <positive integer> }.
RegExpParser.prototype.decimalEscapeAtom = function() {
    return { type: "GroupBackReference", value: this.positiveInteger() }
}
// Parses the letter of a class escape (d, D, s, S, w, W) and returns the
// matching Set node. The upper-case letters produce `complement: true`.
// `value` is the shared code table itself (not a copy).
RegExpParser.prototype.characterClassEscape = function() {
    var byLetter = {
        d: { codes: digitsCharCodes, complement: false },
        D: { codes: digitsCharCodes, complement: true },
        s: { codes: whitespaceCodes, complement: false },
        S: { codes: whitespaceCodes, complement: true },
        w: { codes: wordCharCodes, complement: false },
        W: { codes: wordCharCodes, complement: true }
    }

    var letter = this.popChar()
    var set
    var complement = false
    if (Object.prototype.hasOwnProperty.call(byLetter, letter)) {
        set = byLetter[letter].codes
        complement = byLetter[letter].complement
    }

    ASSERT_EXISTS(set)

    return { type: "Set", value: set, complement: complement }
}
// Parses the letter of a control escape (f, n, r, t, v) and returns a
// Character node holding the char code of the control character it denotes.
RegExpParser.prototype.controlEscapeAtom = function() {
    var controlCharByLetter = {
        f: "\f",
        n: "\n",
        r: "\r",
        t: "\t",
        v: "\v"
    }

    var letter = this.popChar()
    var escapeCode
    if (Object.prototype.hasOwnProperty.call(controlCharByLetter, letter)) {
        escapeCode = cc(controlCharByLetter[letter])
    }
    ASSERT_EXISTS(escapeCode)

    return { type: "Character", value: escapeCode }
}
// Parses a control-letter escape: "c" followed by one ASCII letter.
// Returns a Character node whose value is the upper-cased letter's char code
// minus 64 (e.g. "cJ" -> 10).
// Throws when the character after "c" is not in [a-zA-Z].
RegExpParser.prototype.controlLetterEscapeAtom = function() {
    this.consumeChar("c")
    var letter = this.popChar()
    if (/[a-zA-Z]/.test(letter) === false) {
        // The original message was the truncated text "Invalid ".
        throw Error(
            "Invalid control letter escape: expected [a-zA-Z] after '\\c' but found: '" +
                letter +
                "'"
        )
    }

    var letterCode = letter.toUpperCase().charCodeAt(0) - 64
    return { type: "Character", value: letterCode }
}
// Parses the "0" of a "\0" escape and returns a Character node for NUL.
RegExpParser.prototype.nulCharacterAtom = function() {
    // TODO implement '[lookahead ∉ DecimalDigit]'
    // TODO: for the deprecated octal escape sequence
    this.consumeChar("0")
    var nulCode = cc("\0")
    return { type: "Character", value: nulCode }
}
// Parses "x" followed by exactly two hex digits; returns a Character node.
RegExpParser.prototype.hexEscapeSequenceAtom = function() {
    this.consumeChar("x")
    var character = this.parseHexDigits(2)
    return character
}
// Parses "u" followed by exactly four hex digits; returns a Character node.
RegExpParser.prototype.regExpUnicodeEscapeSequenceAtom = function() {
    this.consumeChar("u")
    var character = this.parseHexDigits(4)
    return character
}
// Fallback escape: the escaped character stands for itself.
// Returns a Character node holding that character's char code.
RegExpParser.prototype.identityEscapeAtom = function() {
    // TODO: implement "SourceCharacter but not UnicodeIDContinue"
    // http://unicode.org/reports/tr31/#Specific_Character_Adjustments
    var literal = this.popChar()
    var literalCode = cc(literal)
    return { type: "Character", value: literalCode }
}
// Parses one literal character inside a character class.
// Line terminators, "\" and "]" are rejected with Error("TBD"); any other
// character is consumed and returned as a Character node.
RegExpParser.prototype.classPatternCharacterAtom = function() {
    var rejected = ["\n", "\r", "\u2028", "\u2029", "\\", "]"]

    // istanbul ignore next
    if (rejected.indexOf(this.peekChar()) !== -1) {
        throw Error("TBD")
    }

    var literal = this.popChar()
    return { type: "Character", value: cc(literal) }
}
// Parses a bracketed character class such as "[a-z0-9_]" or "[^abc]".
// Returns { type: "Set", complement, value } where `value` holds char codes
// and { from, to } range objects.
// Throws when a range's end code is lower than its start code.
RegExpParser.prototype.characterClass = function() {
    var members = []

    this.consumeChar("[")
    var negated = false
    if (this.peekChar(0) === "^") {
        this.consumeChar("^")
        negated = true
    }

    while (this.isClassAtom()) {
        var start = this.classAtom()

        // No range can start here: add the atom's value(s) as-is.
        if (start.type !== "Character" || !this.isRangeDash()) {
            insertToSet(start.value, members)
            continue
        }

        this.consumeChar("-")
        var end = this.classAtom()

        // a range can only be used when both sides are single characters
        if (end.type !== "Character") {
            // literal dash
            insertToSet(start.value, members)
            members.push(cc("-"))
            insertToSet(end.value, members)
            continue
        }

        if (end.value < start.value) {
            throw Error("Range out of order in character class")
        }
        members.push({ from: start.value, to: end.value })
    }

    this.consumeChar("]")

    return { type: "Set", complement: negated, value: members }
}
// Parses one member of a character class: an escape when the next character
// is "\", otherwise a literal character. "]" and line terminators are
// rejected with Error("TBD").
RegExpParser.prototype.classAtom = function() {
    var next = this.peekChar()

    if (next === "\\") {
        return this.classEscape()
    }

    // istanbul ignore next
    if (["]", "\n", "\r", "\u2028", "\u2029"].indexOf(next) !== -1) {
        throw Error("TBD")
    }

    return this.classPatternCharacterAtom()
}
// Parses a backslash escape inside a character class. After consuming the
// "\", the next character selects the specialised parser; anything not
// listed falls through to `identityEscapeAtom`.
RegExpParser.prototype.classEscape = function() {
    this.consumeChar("\\")

    var next = this.peekChar()
    var nextIsOneOf = function(chars) {
        return next !== undefined && chars.indexOf(next) !== -1
    }

    // Matches a backspace.
    // (Not to be confused with \b word boundary outside characterClass)
    if (next === "b") {
        this.consumeChar("b")
        return { type: "Character", value: cc("\u0008") }
    }
    if (nextIsOneOf("dDsSwW")) {
        return this.characterClassEscape()
    }
    if (nextIsOneOf("fnrtv")) {
        return this.controlEscapeAtom()
    }
    if (next === "c") {
        return this.controlLetterEscapeAtom()
    }
    if (next === "0") {
        return this.nulCharacterAtom()
    }
    if (next === "x") {
        return this.hexEscapeSequenceAtom()
    }
    if (next === "u") {
        return this.regExpUnicodeEscapeSequenceAtom()
    }
    return this.identityEscapeAtom()
}
// Parses a parenthesised group: capturing "(...)" or non-capturing "(?:...)".
// Returns { type: "Group", capturing, value: <Disjunction node> }; capturing
// groups also carry `idx`, their 1-based number in order of opening paren.
RegExpParser.prototype.group = function() {
    var capturing = true
    var ownIdx
    this.consumeChar("(")
    switch (this.peekChar(0)) {
        case "?":
            this.consumeChar("?")
            this.consumeChar(":")
            capturing = false
            break
        default:
            this.groupIdx++
            // Remember this group's number now: parsing the body below may
            // open nested capturing groups and advance `this.groupIdx`, which
            // previously leaked the last inner group's number into the outer one.
            ownIdx = this.groupIdx
            break
    }
    var value = this.disjunction()
    this.consumeChar(")")

    var groupAst = {
        type: "Group",
        capturing: capturing,
        value: value
    }

    if (capturing) {
        groupAst.idx = ownIdx
    }

    return groupAst
}
// Reads a run of decimal digits whose first digit is 1-9 and returns it as a
// base-10 number.
RegExpParser.prototype.positiveInteger = function() {
    var digits = [this.popChar()]

    // istanbul ignore next - can't ever get here due to previous lookahead checks
    // still implementing this error checking in case this ever changes.
    if (decimalPatternNoZero.test(digits[0]) === false) {
        throw Error("Expecting a positive integer")
    }

    while (decimalPattern.test(this.peekChar(0))) {
        digits.push(this.popChar())
    }

    return parseInt(digits.join(""), 10)
}
// Reads a run of decimal digits (a leading zero is allowed) and returns it as
// a base-10 number. Throws when the first character is not a digit.
RegExpParser.prototype.integerIncludingZero = function() {
    var digits = [this.popChar()]
    if (decimalPattern.test(digits[0]) === false) {
        throw Error("Expecting an integer")
    }

    while (decimalPattern.test(this.peekChar(0))) {
        digits.push(this.popChar())
    }

    return parseInt(digits.join(""), 10)
}
// Consumes one literal character outside a character class and returns it as
// a Character node. Line terminators and the syntax characters listed below
// are rejected with Error("TBD").
RegExpParser.prototype.patternCharacter = function() {
    var rejected = [
        "\n",
        "\r",
        "\u2028",
        "\u2029",
        "^",
        "$",
        "\\",
        ".",
        "*",
        "+",
        "?",
        "(",
        ")",
        "[",
        "|"
    ]

    var literal = this.popChar()

    // istanbul ignore next
    if (rejected.indexOf(literal) !== -1) {
        throw Error("TBD")
    }

    return { type: "Character", value: cc(literal) }
}
// True when the next character is one of the flag letters g, i, m, u, y.
RegExpParser.prototype.isRegExpFlag = function() {
    var next = this.peekChar(0)
    return (
        next === "g" ||
        next === "i" ||
        next === "m" ||
        next === "u" ||
        next === "y"
    )
}
// True when the next character is "-" and the character after it can start a
// class atom, i.e. the dash can act as a range operator.
RegExpParser.prototype.isRangeDash = function() {
    if (this.peekChar() !== "-") {
        return false
    }
    return this.isClassAtom(1)
}
// True when the next character matches `decimalPattern` (0-9).
RegExpParser.prototype.isDigit = function() {
    var next = this.peekChar(0)
    return decimalPattern.test(next)
}
// True when the character `howMuch` positions ahead (default 0) can start a
// class atom, i.e. it is neither "]" nor a line terminator.
RegExpParser.prototype.isClassAtom = function(howMuch) {
    var offset = howMuch === undefined ? 0 : howMuch
    var terminators = ["]", "\n", "\r", "\u2028", "\u2029"]
    return terminators.indexOf(this.peekChar(offset)) === -1
}
// True when an atom or an assertion starts at the current position.
RegExpParser.prototype.isTerm = function() {
    if (this.isAtom()) {
        return true
    }
    return this.isAssertion()
}
// True when an atom can start here: a plain pattern character, ".", "\"
// (atomEscape), "[" (characterClass) or "(" (group).
RegExpParser.prototype.isAtom = function() {
    if (this.isPatternCharacter()) {
        return true
    }

    var next = this.peekChar(0)
    // TODO: isAtom must be called before isAssertion - disambiguate
    return next === "." || next === "\\" || next === "[" || next === "("
}
// True when an assertion starts here: "^", "$", "\b", "\B", "(?=" or "(?!".
RegExpParser.prototype.isAssertion = function() {
    var first = this.peekChar(0)

    if (first === "^" || first === "$") {
        return true
    }

    // '\b' or '\B'
    if (first === "\\") {
        var escaped = this.peekChar(1)
        return escaped === "b" || escaped === "B"
    }

    // '(?=' or '(?!'
    if (first === "(") {
        if (this.peekChar(1) !== "?") {
            return false
        }
        return this.peekChar(2) === "=" || this.peekChar(2) === "!"
    }

    return false
}
// Speculatively parses a quantifier to find out whether one starts here.
// The parser state is always rolled back afterwards, and any error raised by
// the trial parse counts as "no quantifier".
RegExpParser.prototype.isQuantifier = function() {
    var snapshot = this.saveState()
    var found
    try {
        found = this.quantifier(true) !== undefined
    } catch (e) {
        found = false
    } finally {
        this.restoreState(snapshot)
    }
    return found
}
// True unless the next character is one of the excluded characters below
// (syntax characters, "/" and line terminators). Note that a missing next
// character (`undefined`) is not excluded, so this also returns true there.
RegExpParser.prototype.isPatternCharacter = function() {
    var excluded = [
        "^",
        "$",
        "\\",
        ".",
        "*",
        "+",
        "?",
        "(",
        ")",
        "[",
        "|",
        "/",
        "\n",
        "\r",
        "\u2028",
        "\u2029"
    ]
    return excluded.indexOf(this.peekChar()) === -1
}
// Consumes exactly `howMany` hex digits and returns a Character node whose
// value is the number they spell in base 16.
// Throws when one of the consumed characters is not a hex digit.
RegExpParser.prototype.parseHexDigits = function(howMany) {
    var collected = []
    while (collected.length < howMany) {
        var hexChar = this.popChar()
        if (hexDigitPattern.test(hexChar) === false) {
            throw Error("Expecting a HexDecimal digits")
        }
        collected.push(hexChar)
    }
    return { type: "Character", value: parseInt(collected.join(""), 16) }
}
// Returns the character `howMuch` positions ahead of the current offset
// (default 0) without consuming anything; `undefined` past the end of input.
RegExpParser.prototype.peekChar = function(howMuch) {
    var lookahead = howMuch === undefined ? 0 : howMuch
    return this.input[this.idx + lookahead]
}
// Consumes the next character and returns it. Throws (from `consumeChar`)
// when the input is already exhausted.
RegExpParser.prototype.popChar = function() {
    var consumed = this.peekChar(0)
    this.consumeChar()
    return consumed
}
// Advances the parser by one character.
// When `char` is given, the next character must equal it, otherwise an error
// naming both characters and the offset is thrown. Independently of that,
// advancing past the end of the input throws "Unexpected end of input".
RegExpParser.prototype.consumeChar = function(char) {
    var actual = this.input[this.idx]

    if (char !== undefined && actual !== char) {
        var message =
            "Expected: '" +
            char +
            "' but found: '" +
            actual +
            "' at offset: " +
            this.idx
        throw Error(message)
    }

    if (this.idx >= this.input.length) {
        throw Error("Unexpected end of input")
    }
    this.idx++
}
// consts and utilities
// The three patterns below are unanchored; the parser applies them to a
// single peeked or popped character at a time.
var hexDigitPattern = /[0-9a-fA-F]/ // one hexadecimal digit
var decimalPattern = /[0-9]/ // one decimal digit, zero included
var decimalPatternNoZero = /[1-9]/ // one decimal digit, zero excluded
// Returns the UTF-16 code of the first character of `char`.
function cc(char) {
    var code = char.charCodeAt(0)
    return code
}
// Appends `item` to the array `set`. An array `item` is flattened one level
// (each of its elements is pushed); anything else is pushed as a single entry.
// Arrays are detected with Array.isArray rather than by the presence of a
// `length` property, so `null` and strings no longer raise a TypeError.
function insertToSet(item, set) {
    if (Array.isArray(item)) {
        item.forEach(function(subItem) {
            set.push(subItem)
        })
    } else {
        set.push(item)
    }
}
// Marks the flag `flagKey` as set on `flagObj`.
// Throws an Error when that flag is already `true`. (This used to throw a
// bare string, which carries no stack trace and is not an Error instance.)
function addFlag(flagObj, flagKey) {
    if (flagObj[flagKey] === true) {
        throw Error("duplicate flag " + flagKey)
    }

    flagObj[flagKey] = true
}
// Internal sanity check: throws when `obj` is exactly `undefined`.
// Every other value, including `null`, `0` and "", passes.
function ASSERT_EXISTS(obj) {
    var isMissing = obj === undefined
    // istanbul ignore next
    if (isMissing) {
        throw Error("Internal Error - Should never get here!")
    }
}
// Marks a code path that must be unreachable: always throws.
// istanbul ignore next
function ASSERT_NEVER_REACH_HERE() {
    var failure = Error("Internal Error - Should never get here!")
    throw failure
}
// Char-code tables backing the class escapes: `characterClassEscape` returns
// digitsCharCodes for \d / \D, wordCharCodes for \w / \W and whitespaceCodes
// for \s / \S.
var i

// "0".."9"
var digitsCharCodes = []
for (i = cc("0"); i <= cc("9"); i++) {
    digitsCharCodes.push(i)
}

// "_", then "0".."9", "a".."z", "A".."Z"
var wordCharCodes = [cc("_")].concat(digitsCharCodes)
for (i = cc("a"); i <= cc("z"); i++) {
    wordCharCodes.push(i)
}

for (i = cc("A"); i <= cc("Z"); i++) {
    wordCharCodes.push(i)
}

// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp#character-classes
// Each code appears exactly once (the list used to contain "\t" twice).
var whitespaceCodes = [
    cc(" "),
    cc("\f"),
    cc("\n"),
    cc("\r"),
    cc("\t"),
    cc("\v"),
    cc("\u00a0"),
    cc("\u1680"),
    cc("\u2000"),
    cc("\u2001"),
    cc("\u2002"),
    cc("\u2003"),
    cc("\u2004"),
    cc("\u2005"),
    cc("\u2006"),
    cc("\u2007"),
    cc("\u2008"),
    cc("\u2009"),
    cc("\u200a"),
    cc("\u2028"),
    cc("\u2029"),
    cc("\u202f"),
    cc("\u205f"),
    cc("\u3000"),
    cc("\ufeff")
]
// Base class for walking an AST. `visit(node)` calls the `visitXXX` hook that
// matches `node.type` and then descends into the node's children. All hooks
// are no-ops here; override the ones of interest on a derived prototype.
function BaseRegExpVisitor() {}

// Visits every own property of `node` that is either a typed node (has a
// `type` property) or an array (each element is visited).
// `null` / `undefined` properties and array elements are skipped instead of
// raising a TypeError.
BaseRegExpVisitor.prototype.visitChildren = function(node) {
    for (var key in node) {
        /* istanbul ignore else */
        if (Object.prototype.hasOwnProperty.call(node, key)) {
            var child = node[key]
            if (child === null || child === undefined) {
                continue
            }
            if (child.type !== undefined) {
                this.visit(child)
            } else if (Array.isArray(child)) {
                child.forEach(function(subChild) {
                    if (subChild !== null && subChild !== undefined) {
                        this.visit(subChild)
                    }
                }, this)
            }
        }
    }
}

// Dispatches `node` to the hook for its `type` (nodes with an unknown or
// missing `type` trigger no hook), then visits its children.
BaseRegExpVisitor.prototype.visit = function(node) {
    switch (node.type) {
        case "Pattern":
            this.visitPattern(node)
            break
        case "Flags":
            this.visitFlags(node)
            break
        case "Disjunction":
            this.visitDisjunction(node)
            break
        case "Alternative":
            this.visitAlternative(node)
            break
        case "StartAnchor":
            this.visitStartAnchor(node)
            break
        case "EndAnchor":
            this.visitEndAnchor(node)
            break
        case "WordBoundary":
            this.visitWordBoundary(node)
            break
        case "NonWordBoundary":
            this.visitNonWordBoundary(node)
            break
        case "Lookahead":
            this.visitLookahead(node)
            break
        case "NegativeLookahead":
            this.visitNegativeLookahead(node)
            break
        case "Character":
            this.visitCharacter(node)
            break
        case "Set":
            this.visitSet(node)
            break
        case "Group":
            this.visitGroup(node)
            break
        case "GroupBackReference":
            this.visitGroupBackReference(node)
            break
        case "Quantifier":
            this.visitQuantifier(node)
            break
    }

    this.visitChildren(node)
}

BaseRegExpVisitor.prototype.visitPattern = function(node) {}

BaseRegExpVisitor.prototype.visitFlags = function(node) {}

BaseRegExpVisitor.prototype.visitDisjunction = function(node) {}

BaseRegExpVisitor.prototype.visitAlternative = function(node) {}

// Assertion
BaseRegExpVisitor.prototype.visitStartAnchor = function(node) {}

BaseRegExpVisitor.prototype.visitEndAnchor = function(node) {}

BaseRegExpVisitor.prototype.visitWordBoundary = function(node) {}

BaseRegExpVisitor.prototype.visitNonWordBoundary = function(node) {}

BaseRegExpVisitor.prototype.visitLookahead = function(node) {}

BaseRegExpVisitor.prototype.visitNegativeLookahead = function(node) {}

// atoms
BaseRegExpVisitor.prototype.visitCharacter = function(node) {}

BaseRegExpVisitor.prototype.visitSet = function(node) {}

BaseRegExpVisitor.prototype.visitGroup = function(node) {}

BaseRegExpVisitor.prototype.visitGroupBackReference = function(node) {}

BaseRegExpVisitor.prototype.visitQuantifier = function(node) {}
// Object returned by the factory: the parser and visitor constructors plus
// the library's version string.
return {
    RegExpParser: RegExpParser,
    BaseRegExpVisitor: BaseRegExpVisitor,
    VERSION: "0.4.0"
}
    }
)