GGD/node_modules/regexp-to-ast/lib/regexp-to-ast.js

973 lines
30 KiB
JavaScript
Raw Normal View History

;(function(root, factory) {
// istanbul ignore next
if (typeof define === "function" && define.amd) {
// istanbul ignore next
define([], factory)
} else if (typeof module === "object" && module.exports) {
module.exports = factory()
} else {
// istanbul ignore next
root.regexpToAst = factory()
}
})(
typeof self !== "undefined"
? // istanbul ignore next
self
: this,
function() {
// references
// https://hackernoon.com/the-madness-of-parsing-real-world-javascript-regexps-d9ee336df983
// https://www.ecma-international.org/ecma-262/8.0/index.html#prod-Pattern
function RegExpParser() {}
RegExpParser.prototype.saveState = function() {
return {
idx: this.idx,
input: this.input,
groupIdx: this.groupIdx
}
}
RegExpParser.prototype.restoreState = function(newState) {
this.idx = newState.idx
this.input = newState.input
this.groupIdx = newState.groupIdx
}
RegExpParser.prototype.pattern = function(input) {
// parser state
this.idx = 0
this.input = input
this.groupIdx = 0
this.consumeChar("/")
var value = this.disjunction()
this.consumeChar("/")
var flags = {
type: "Flags",
global: false,
ignoreCase: false,
multiLine: false,
unicode: false,
sticky: false
}
while (this.isRegExpFlag()) {
switch (this.popChar()) {
case "g":
addFlag(flags, "global")
break
case "i":
addFlag(flags, "ignoreCase")
break
case "m":
addFlag(flags, "multiLine")
break
case "u":
addFlag(flags, "unicode")
break
case "y":
addFlag(flags, "sticky")
break
}
}
if (this.idx !== this.input.length) {
throw Error(
"Redundant input: " + this.input.substring(this.idx)
)
}
return { type: "Pattern", flags: flags, value: value }
}
RegExpParser.prototype.disjunction = function() {
var alts = []
alts.push(this.alternative())
while (this.peekChar() === "|") {
this.consumeChar("|")
alts.push(this.alternative())
}
return { type: "Disjunction", value: alts }
}
RegExpParser.prototype.alternative = function() {
var terms = []
while (this.isTerm()) {
terms.push(this.term())
}
return { type: "Alternative", value: terms }
}
RegExpParser.prototype.term = function() {
if (this.isAssertion()) {
return this.assertion()
} else {
return this.atom()
}
}
RegExpParser.prototype.assertion = function() {
switch (this.popChar()) {
case "^":
return { type: "StartAnchor" }
case "$":
return { type: "EndAnchor" }
// '\b' or '\B'
case "\\":
switch (this.popChar()) {
case "b":
return { type: "WordBoundary" }
case "B":
return { type: "NonWordBoundary" }
}
// istanbul ignore next
throw Error("Invalid Assertion Escape")
// '(?=' or '(?!'
case "(":
this.consumeChar("?")
var type
switch (this.popChar()) {
case "=":
type = "Lookahead"
break
case "!":
type = "NegativeLookahead"
break
}
ASSERT_EXISTS(type)
var disjunction = this.disjunction()
this.consumeChar(")")
return { type: type, value: disjunction }
}
// istanbul ignore next
ASSERT_NEVER_REACH_HERE()
}
RegExpParser.prototype.quantifier = function(isBacktracking) {
var range
switch (this.popChar()) {
case "*":
range = {
atLeast: 0,
atMost: Infinity
}
break
case "+":
range = {
atLeast: 1,
atMost: Infinity
}
break
case "?":
range = {
atLeast: 0,
atMost: 1
}
break
case "{":
var atLeast = this.integerIncludingZero()
switch (this.popChar()) {
case "}":
range = {
atLeast: atLeast,
atMost: atLeast
}
break
case ",":
var atMost
if (this.isDigit()) {
atMost = this.integerIncludingZero()
range = {
atLeast: atLeast,
atMost: atMost
}
} else {
range = {
atLeast: atLeast,
atMost: Infinity
}
}
this.consumeChar("}")
break
}
// throwing exceptions from "ASSERT_EXISTS" during backtracking
// causes severe performance degradations
if (isBacktracking === true && range === undefined) {
return undefined
}
ASSERT_EXISTS(range)
break
}
// throwing exceptions from "ASSERT_EXISTS" during backtracking
// causes severe performance degradations
if (isBacktracking === true && range === undefined) {
return undefined
}
ASSERT_EXISTS(range)
if (this.peekChar(0) === "?") {
this.consumeChar("?")
range.greedy = false
} else {
range.greedy = true
}
range.type = "Quantifier"
return range
}
RegExpParser.prototype.atom = function() {
var atom
switch (this.peekChar()) {
case ".":
atom = this.dotAll()
break
case "\\":
atom = this.atomEscape()
break
case "[":
atom = this.characterClass()
break
case "(":
atom = this.group()
break
}
if (atom === undefined && this.isPatternCharacter()) {
atom = this.patternCharacter()
}
ASSERT_EXISTS(atom)
if (this.isQuantifier()) {
atom.quantifier = this.quantifier()
}
return atom
}
RegExpParser.prototype.dotAll = function() {
this.consumeChar(".")
return {
type: "Set",
complement: true,
value: [cc("\n"), cc("\r"), cc("\u2028"), cc("\u2029")]
}
}
RegExpParser.prototype.atomEscape = function() {
this.consumeChar("\\")
switch (this.peekChar()) {
case "1":
case "2":
case "3":
case "4":
case "5":
case "6":
case "7":
case "8":
case "9":
return this.decimalEscapeAtom()
case "d":
case "D":
case "s":
case "S":
case "w":
case "W":
return this.characterClassEscape()
case "f":
case "n":
case "r":
case "t":
case "v":
return this.controlEscapeAtom()
case "c":
return this.controlLetterEscapeAtom()
case "0":
return this.nulCharacterAtom()
case "x":
return this.hexEscapeSequenceAtom()
case "u":
return this.regExpUnicodeEscapeSequenceAtom()
default:
return this.identityEscapeAtom()
}
}
RegExpParser.prototype.decimalEscapeAtom = function() {
var value = this.positiveInteger()
return { type: "GroupBackReference", value: value }
}
RegExpParser.prototype.characterClassEscape = function() {
var set
var complement = false
switch (this.popChar()) {
case "d":
set = digitsCharCodes
break
case "D":
set = digitsCharCodes
complement = true
break
case "s":
set = whitespaceCodes
break
case "S":
set = whitespaceCodes
complement = true
break
case "w":
set = wordCharCodes
break
case "W":
set = wordCharCodes
complement = true
break
}
ASSERT_EXISTS(set)
return { type: "Set", value: set, complement: complement }
}
RegExpParser.prototype.controlEscapeAtom = function() {
var escapeCode
switch (this.popChar()) {
case "f":
escapeCode = cc("\f")
break
case "n":
escapeCode = cc("\n")
break
case "r":
escapeCode = cc("\r")
break
case "t":
escapeCode = cc("\t")
break
case "v":
escapeCode = cc("\v")
break
}
ASSERT_EXISTS(escapeCode)
return { type: "Character", value: escapeCode }
}
RegExpParser.prototype.controlLetterEscapeAtom = function() {
this.consumeChar("c")
var letter = this.popChar()
if (/[a-zA-Z]/.test(letter) === false) {
throw Error("Invalid ")
}
var letterCode = letter.toUpperCase().charCodeAt(0) - 64
return { type: "Character", value: letterCode }
}
RegExpParser.prototype.nulCharacterAtom = function() {
// TODO implement '[lookahead ∉ DecimalDigit]'
// TODO: for the deprecated octal escape sequence
this.consumeChar("0")
return { type: "Character", value: cc("\0") }
}
RegExpParser.prototype.hexEscapeSequenceAtom = function() {
this.consumeChar("x")
return this.parseHexDigits(2)
}
RegExpParser.prototype.regExpUnicodeEscapeSequenceAtom = function() {
this.consumeChar("u")
return this.parseHexDigits(4)
}
RegExpParser.prototype.identityEscapeAtom = function() {
// TODO: implement "SourceCharacter but not UnicodeIDContinue"
// // http://unicode.org/reports/tr31/#Specific_Character_Adjustments
var escapedChar = this.popChar()
return { type: "Character", value: cc(escapedChar) }
}
RegExpParser.prototype.classPatternCharacterAtom = function() {
switch (this.peekChar()) {
// istanbul ignore next
case "\n":
// istanbul ignore next
case "\r":
// istanbul ignore next
case "\u2028":
// istanbul ignore next
case "\u2029":
// istanbul ignore next
case "\\":
// istanbul ignore next
case "]":
throw Error("TBD")
default:
var nextChar = this.popChar()
return { type: "Character", value: cc(nextChar) }
}
}
RegExpParser.prototype.characterClass = function() {
var set = []
var complement = false
this.consumeChar("[")
if (this.peekChar(0) === "^") {
this.consumeChar("^")
complement = true
}
while (this.isClassAtom()) {
var from = this.classAtom()
var isFromSingleChar = from.type === "Character"
if (isFromSingleChar && this.isRangeDash()) {
this.consumeChar("-")
var to = this.classAtom()
var isToSingleChar = to.type === "Character"
// a range can only be used when both sides are single characters
if (isToSingleChar) {
if (to.value < from.value) {
throw Error("Range out of order in character class")
}
set.push({ from: from.value, to: to.value })
} else {
// literal dash
insertToSet(from.value, set)
set.push(cc("-"))
insertToSet(to.value, set)
}
} else {
insertToSet(from.value, set)
}
}
this.consumeChar("]")
return { type: "Set", complement: complement, value: set }
}
RegExpParser.prototype.classAtom = function() {
switch (this.peekChar()) {
// istanbul ignore next
case "]":
// istanbul ignore next
case "\n":
// istanbul ignore next
case "\r":
// istanbul ignore next
case "\u2028":
// istanbul ignore next
case "\u2029":
throw Error("TBD")
case "\\":
return this.classEscape()
default:
return this.classPatternCharacterAtom()
}
}
RegExpParser.prototype.classEscape = function() {
this.consumeChar("\\")
switch (this.peekChar()) {
// Matches a backspace.
// (Not to be confused with \b word boundary outside characterClass)
case "b":
this.consumeChar("b")
return { type: "Character", value: cc("\u0008") }
case "d":
case "D":
case "s":
case "S":
case "w":
case "W":
return this.characterClassEscape()
case "f":
case "n":
case "r":
case "t":
case "v":
return this.controlEscapeAtom()
case "c":
return this.controlLetterEscapeAtom()
case "0":
return this.nulCharacterAtom()
case "x":
return this.hexEscapeSequenceAtom()
case "u":
return this.regExpUnicodeEscapeSequenceAtom()
default:
return this.identityEscapeAtom()
}
}
RegExpParser.prototype.group = function() {
var capturing = true
this.consumeChar("(")
switch (this.peekChar(0)) {
case "?":
this.consumeChar("?")
this.consumeChar(":")
capturing = false
break
default:
this.groupIdx++
break
}
var value = this.disjunction()
this.consumeChar(")")
var groupAst = {
type: "Group",
capturing: capturing,
value: value
}
if (capturing) {
groupAst.idx = this.groupIdx
}
return groupAst
}
RegExpParser.prototype.positiveInteger = function() {
var number = this.popChar()
// istanbul ignore next - can't ever get here due to previous lookahead checks
// still implementing this error checking in case this ever changes.
if (decimalPatternNoZero.test(number) === false) {
throw Error("Expecting a positive integer")
}
while (decimalPattern.test(this.peekChar(0))) {
number += this.popChar()
}
return parseInt(number, 10)
}
RegExpParser.prototype.integerIncludingZero = function() {
var number = this.popChar()
if (decimalPattern.test(number) === false) {
throw Error("Expecting an integer")
}
while (decimalPattern.test(this.peekChar(0))) {
number += this.popChar()
}
return parseInt(number, 10)
}
RegExpParser.prototype.patternCharacter = function() {
var nextChar = this.popChar()
switch (nextChar) {
// istanbul ignore next
case "\n":
// istanbul ignore next
case "\r":
// istanbul ignore next
case "\u2028":
// istanbul ignore next
case "\u2029":
// istanbul ignore next
case "^":
// istanbul ignore next
case "$":
// istanbul ignore next
case "\\":
// istanbul ignore next
case ".":
// istanbul ignore next
case "*":
// istanbul ignore next
case "+":
// istanbul ignore next
case "?":
// istanbul ignore next
case "(":
// istanbul ignore next
case ")":
// istanbul ignore next
case "[":
// istanbul ignore next
case "|":
// istanbul ignore next
throw Error("TBD")
default:
return { type: "Character", value: cc(nextChar) }
}
}
RegExpParser.prototype.isRegExpFlag = function() {
switch (this.peekChar(0)) {
case "g":
case "i":
case "m":
case "u":
case "y":
return true
default:
return false
}
}
RegExpParser.prototype.isRangeDash = function() {
return this.peekChar() === "-" && this.isClassAtom(1)
}
RegExpParser.prototype.isDigit = function() {
return decimalPattern.test(this.peekChar(0))
}
RegExpParser.prototype.isClassAtom = function(howMuch) {
if (howMuch === undefined) {
howMuch = 0
}
switch (this.peekChar(howMuch)) {
case "]":
case "\n":
case "\r":
case "\u2028":
case "\u2029":
return false
default:
return true
}
}
RegExpParser.prototype.isTerm = function() {
return this.isAtom() || this.isAssertion()
}
RegExpParser.prototype.isAtom = function() {
if (this.isPatternCharacter()) {
return true
}
switch (this.peekChar(0)) {
case ".":
case "\\": // atomEscape
case "[": // characterClass
// TODO: isAtom must be called before isAssertion - disambiguate
case "(": // group
return true
default:
return false
}
}
RegExpParser.prototype.isAssertion = function() {
switch (this.peekChar(0)) {
case "^":
case "$":
return true
// '\b' or '\B'
case "\\":
switch (this.peekChar(1)) {
case "b":
case "B":
return true
default:
return false
}
// '(?=' or '(?!'
case "(":
return (
this.peekChar(1) === "?" &&
(this.peekChar(2) === "=" || this.peekChar(2) === "!")
)
default:
return false
}
}
RegExpParser.prototype.isQuantifier = function() {
var prevState = this.saveState()
try {
return this.quantifier(true) !== undefined
} catch (e) {
return false
} finally {
this.restoreState(prevState)
}
}
RegExpParser.prototype.isPatternCharacter = function() {
switch (this.peekChar()) {
case "^":
case "$":
case "\\":
case ".":
case "*":
case "+":
case "?":
case "(":
case ")":
case "[":
case "|":
case "/":
case "\n":
case "\r":
case "\u2028":
case "\u2029":
return false
default:
return true
}
}
RegExpParser.prototype.parseHexDigits = function(howMany) {
var hexString = ""
for (var i = 0; i < howMany; i++) {
var hexChar = this.popChar()
if (hexDigitPattern.test(hexChar) === false) {
throw Error("Expecting a HexDecimal digits")
}
hexString += hexChar
}
var charCode = parseInt(hexString, 16)
return { type: "Character", value: charCode }
}
RegExpParser.prototype.peekChar = function(howMuch) {
if (howMuch === undefined) {
howMuch = 0
}
return this.input[this.idx + howMuch]
}
RegExpParser.prototype.popChar = function() {
var nextChar = this.peekChar(0)
this.consumeChar()
return nextChar
}
RegExpParser.prototype.consumeChar = function(char) {
if (char !== undefined && this.input[this.idx] !== char) {
throw Error(
"Expected: '" +
char +
"' but found: '" +
this.input[this.idx] +
"' at offset: " +
this.idx
)
}
if (this.idx >= this.input.length) {
throw Error("Unexpected end of input")
}
this.idx++
}
// consts and utilities
var hexDigitPattern = /[0-9a-fA-F]/
var decimalPattern = /[0-9]/
var decimalPatternNoZero = /[1-9]/
function cc(char) {
return char.charCodeAt(0)
}
function insertToSet(item, set) {
if (item.length !== undefined) {
item.forEach(function(subItem) {
set.push(subItem)
})
} else {
set.push(item)
}
}
function addFlag(flagObj, flagKey) {
if (flagObj[flagKey] === true) {
throw "duplicate flag " + flagKey
}
flagObj[flagKey] = true
}
function ASSERT_EXISTS(obj) {
// istanbul ignore next
if (obj === undefined) {
throw Error("Internal Error - Should never get here!")
}
}
// istanbul ignore next
function ASSERT_NEVER_REACH_HERE() {
throw Error("Internal Error - Should never get here!")
}
var i
var digitsCharCodes = []
for (i = cc("0"); i <= cc("9"); i++) {
digitsCharCodes.push(i)
}
var wordCharCodes = [cc("_")].concat(digitsCharCodes)
for (i = cc("a"); i <= cc("z"); i++) {
wordCharCodes.push(i)
}
for (i = cc("A"); i <= cc("Z"); i++) {
wordCharCodes.push(i)
}
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp#character-classes
var whitespaceCodes = [
cc(" "),
cc("\f"),
cc("\n"),
cc("\r"),
cc("\t"),
cc("\v"),
cc("\t"),
cc("\u00a0"),
cc("\u1680"),
cc("\u2000"),
cc("\u2001"),
cc("\u2002"),
cc("\u2003"),
cc("\u2004"),
cc("\u2005"),
cc("\u2006"),
cc("\u2007"),
cc("\u2008"),
cc("\u2009"),
cc("\u200a"),
cc("\u2028"),
cc("\u2029"),
cc("\u202f"),
cc("\u205f"),
cc("\u3000"),
cc("\ufeff")
]
function BaseRegExpVisitor() {}
BaseRegExpVisitor.prototype.visitChildren = function(node) {
for (var key in node) {
var child = node[key]
/* istanbul ignore else */
if (node.hasOwnProperty(key)) {
if (child.type !== undefined) {
this.visit(child)
} else if (Array.isArray(child)) {
child.forEach(function(subChild) {
this.visit(subChild)
}, this)
}
}
}
}
BaseRegExpVisitor.prototype.visit = function(node) {
switch (node.type) {
case "Pattern":
this.visitPattern(node)
break
case "Flags":
this.visitFlags(node)
break
case "Disjunction":
this.visitDisjunction(node)
break
case "Alternative":
this.visitAlternative(node)
break
case "StartAnchor":
this.visitStartAnchor(node)
break
case "EndAnchor":
this.visitEndAnchor(node)
break
case "WordBoundary":
this.visitWordBoundary(node)
break
case "NonWordBoundary":
this.visitNonWordBoundary(node)
break
case "Lookahead":
this.visitLookahead(node)
break
case "NegativeLookahead":
this.visitNegativeLookahead(node)
break
case "Character":
this.visitCharacter(node)
break
case "Set":
this.visitSet(node)
break
case "Group":
this.visitGroup(node)
break
case "GroupBackReference":
this.visitGroupBackReference(node)
break
case "Quantifier":
this.visitQuantifier(node)
break
}
this.visitChildren(node)
}
BaseRegExpVisitor.prototype.visitPattern = function(node) {}
BaseRegExpVisitor.prototype.visitFlags = function(node) {}
BaseRegExpVisitor.prototype.visitDisjunction = function(node) {}
BaseRegExpVisitor.prototype.visitAlternative = function(node) {}
// Assertion
BaseRegExpVisitor.prototype.visitStartAnchor = function(node) {}
BaseRegExpVisitor.prototype.visitEndAnchor = function(node) {}
BaseRegExpVisitor.prototype.visitWordBoundary = function(node) {}
BaseRegExpVisitor.prototype.visitNonWordBoundary = function(node) {}
BaseRegExpVisitor.prototype.visitLookahead = function(node) {}
BaseRegExpVisitor.prototype.visitNegativeLookahead = function(node) {}
// atoms
BaseRegExpVisitor.prototype.visitCharacter = function(node) {}
BaseRegExpVisitor.prototype.visitSet = function(node) {}
BaseRegExpVisitor.prototype.visitGroup = function(node) {}
BaseRegExpVisitor.prototype.visitGroupBackReference = function(node) {}
BaseRegExpVisitor.prototype.visitQuantifier = function(node) {}
return {
RegExpParser: RegExpParser,
BaseRegExpVisitor: BaseRegExpVisitor,
VERSION: "0.4.0"
}
}
)