;(function(root, factory) { // istanbul ignore next if (typeof define === "function" && define.amd) { // istanbul ignore next define([], factory) } else if (typeof module === "object" && module.exports) { module.exports = factory() } else { // istanbul ignore next root.regexpToAst = factory() } })( typeof self !== "undefined" ? // istanbul ignore next self : this, function() { // references // https://hackernoon.com/the-madness-of-parsing-real-world-javascript-regexps-d9ee336df983 // https://www.ecma-international.org/ecma-262/8.0/index.html#prod-Pattern function RegExpParser() {} RegExpParser.prototype.saveState = function() { return { idx: this.idx, input: this.input, groupIdx: this.groupIdx } } RegExpParser.prototype.restoreState = function(newState) { this.idx = newState.idx this.input = newState.input this.groupIdx = newState.groupIdx } RegExpParser.prototype.pattern = function(input) { // parser state this.idx = 0 this.input = input this.groupIdx = 0 this.consumeChar("/") var value = this.disjunction() this.consumeChar("/") var flags = { type: "Flags", global: false, ignoreCase: false, multiLine: false, unicode: false, sticky: false } while (this.isRegExpFlag()) { switch (this.popChar()) { case "g": addFlag(flags, "global") break case "i": addFlag(flags, "ignoreCase") break case "m": addFlag(flags, "multiLine") break case "u": addFlag(flags, "unicode") break case "y": addFlag(flags, "sticky") break } } if (this.idx !== this.input.length) { throw Error( "Redundant input: " + this.input.substring(this.idx) ) } return { type: "Pattern", flags: flags, value: value } } RegExpParser.prototype.disjunction = function() { var alts = [] alts.push(this.alternative()) while (this.peekChar() === "|") { this.consumeChar("|") alts.push(this.alternative()) } return { type: "Disjunction", value: alts } } RegExpParser.prototype.alternative = function() { var terms = [] while (this.isTerm()) { terms.push(this.term()) } return { type: "Alternative", value: terms } } RegExpParser.prototype.term = function() { if (this.isAssertion()) { return this.assertion() } else { return this.atom() } } RegExpParser.prototype.assertion = function() { switch (this.popChar()) { case "^": return { type: "StartAnchor" } case "$": return { type: "EndAnchor" } // '\b' or '\B' case "\\": switch (this.popChar()) { case "b": return { type: "WordBoundary" } case "B": return { type: "NonWordBoundary" } } // istanbul ignore next throw Error("Invalid Assertion Escape") // '(?=' or '(?!' case "(": this.consumeChar("?") var type switch (this.popChar()) { case "=": type = "Lookahead" break case "!": type = "NegativeLookahead" break } ASSERT_EXISTS(type) var disjunction = this.disjunction() this.consumeChar(")") return { type: type, value: disjunction } } // istanbul ignore next ASSERT_NEVER_REACH_HERE() } RegExpParser.prototype.quantifier = function(isBacktracking) { var range switch (this.popChar()) { case "*": range = { atLeast: 0, atMost: Infinity } break case "+": range = { atLeast: 1, atMost: Infinity } break case "?": range = { atLeast: 0, atMost: 1 } break case "{": var atLeast = this.integerIncludingZero() switch (this.popChar()) { case "}": range = { atLeast: atLeast, atMost: atLeast } break case ",": var atMost if (this.isDigit()) { atMost = this.integerIncludingZero() range = { atLeast: atLeast, atMost: atMost } } else { range = { atLeast: atLeast, atMost: Infinity } } this.consumeChar("}") break } // throwing exceptions from "ASSERT_EXISTS" during backtracking // causes severe performance degradations if (isBacktracking === true && range === undefined) { return undefined } ASSERT_EXISTS(range) break } // throwing exceptions from "ASSERT_EXISTS" during backtracking // causes severe performance degradations if (isBacktracking === true && range === undefined) { return undefined } ASSERT_EXISTS(range) if (this.peekChar(0) === "?") { this.consumeChar("?") range.greedy = false } else { range.greedy = true } range.type = "Quantifier" return range } RegExpParser.prototype.atom = function() { var atom switch (this.peekChar()) { case ".": atom = this.dotAll() break case "\\": atom = this.atomEscape() break case "[": atom = this.characterClass() break case "(": atom = this.group() break } if (atom === undefined && this.isPatternCharacter()) { atom = this.patternCharacter() } ASSERT_EXISTS(atom) if (this.isQuantifier()) { atom.quantifier = this.quantifier() } return atom } RegExpParser.prototype.dotAll = function() { this.consumeChar(".") return { type: "Set", complement: true, value: [cc("\n"), cc("\r"), cc("\u2028"), cc("\u2029")] } } RegExpParser.prototype.atomEscape = function() { this.consumeChar("\\") switch (this.peekChar()) { case "1": case "2": case "3": case "4": case "5": case "6": case "7": case "8": case "9": return this.decimalEscapeAtom() case "d": case "D": case "s": case "S": case "w": case "W": return this.characterClassEscape() case "f": case "n": case "r": case "t": case "v": return this.controlEscapeAtom() case "c": return this.controlLetterEscapeAtom() case "0": return this.nulCharacterAtom() case "x": return this.hexEscapeSequenceAtom() case "u": return this.regExpUnicodeEscapeSequenceAtom() default: return this.identityEscapeAtom() } } RegExpParser.prototype.decimalEscapeAtom = function() { var value = this.positiveInteger() return { type: "GroupBackReference", value: value } } RegExpParser.prototype.characterClassEscape = function() { var set var complement = false switch (this.popChar()) { case "d": set = digitsCharCodes break case "D": set = digitsCharCodes complement = true break case "s": set = whitespaceCodes break case "S": set = whitespaceCodes complement = true break case "w": set = wordCharCodes break case "W": set = wordCharCodes complement = true break } ASSERT_EXISTS(set) return { type: "Set", value: set, complement: complement } } RegExpParser.prototype.controlEscapeAtom = function() { var escapeCode switch (this.popChar()) { case "f": escapeCode = cc("\f") break case "n": escapeCode = cc("\n") break case "r": escapeCode = cc("\r") break case "t": escapeCode = cc("\t") break case "v": escapeCode = cc("\v") break } ASSERT_EXISTS(escapeCode) return { type: "Character", value: escapeCode } } RegExpParser.prototype.controlLetterEscapeAtom = function() { this.consumeChar("c") var letter = this.popChar() if (/[a-zA-Z]/.test(letter) === false) { throw Error("Invalid ") } var letterCode = letter.toUpperCase().charCodeAt(0) - 64 return { type: "Character", value: letterCode } } RegExpParser.prototype.nulCharacterAtom = function() { // TODO implement '[lookahead ∉ DecimalDigit]' // TODO: for the deprecated octal escape sequence this.consumeChar("0") return { type: "Character", value: cc("\0") } } RegExpParser.prototype.hexEscapeSequenceAtom = function() { this.consumeChar("x") return this.parseHexDigits(2) } RegExpParser.prototype.regExpUnicodeEscapeSequenceAtom = function() { this.consumeChar("u") return this.parseHexDigits(4) } RegExpParser.prototype.identityEscapeAtom = function() { // TODO: implement "SourceCharacter but not UnicodeIDContinue" // // http://unicode.org/reports/tr31/#Specific_Character_Adjustments var escapedChar = this.popChar() return { type: "Character", value: cc(escapedChar) } } RegExpParser.prototype.classPatternCharacterAtom = function() { switch (this.peekChar()) { // istanbul ignore next case "\n": // istanbul ignore next case "\r": // istanbul ignore next case "\u2028": // istanbul ignore next case "\u2029": // istanbul ignore next case "\\": // istanbul ignore next case "]": throw Error("TBD") default: var nextChar = this.popChar() return { type: "Character", value: cc(nextChar) } } } RegExpParser.prototype.characterClass = function() { var set = [] var complement = false this.consumeChar("[") if (this.peekChar(0) === "^") { this.consumeChar("^") complement = true } while (this.isClassAtom()) { var from = this.classAtom() var isFromSingleChar = from.type === "Character" if (isFromSingleChar && this.isRangeDash()) { this.consumeChar("-") var to = this.classAtom() var isToSingleChar = to.type === "Character" // a range can only be used when both sides are single characters if (isToSingleChar) { if (to.value < from.value) { throw Error("Range out of order in character class") } set.push({ from: from.value, to: to.value }) } else { // literal dash insertToSet(from.value, set) set.push(cc("-")) insertToSet(to.value, set) } } else { insertToSet(from.value, set) } } this.consumeChar("]") return { type: "Set", complement: complement, value: set } } RegExpParser.prototype.classAtom = function() { switch (this.peekChar()) { // istanbul ignore next case "]": // istanbul ignore next case "\n": // istanbul ignore next case "\r": // istanbul ignore next case "\u2028": // istanbul ignore next case "\u2029": throw Error("TBD") case "\\": return this.classEscape() default: return this.classPatternCharacterAtom() } } RegExpParser.prototype.classEscape = function() { this.consumeChar("\\") switch (this.peekChar()) { // Matches a backspace. // (Not to be confused with \b word boundary outside characterClass) case "b": this.consumeChar("b") return { type: "Character", value: cc("\u0008") } case "d": case "D": case "s": case "S": case "w": case "W": return this.characterClassEscape() case "f": case "n": case "r": case "t": case "v": return this.controlEscapeAtom() case "c": return this.controlLetterEscapeAtom() case "0": return this.nulCharacterAtom() case "x": return this.hexEscapeSequenceAtom() case "u": return this.regExpUnicodeEscapeSequenceAtom() default: return this.identityEscapeAtom() } } RegExpParser.prototype.group = function() { var capturing = true this.consumeChar("(") switch (this.peekChar(0)) { case "?": this.consumeChar("?") this.consumeChar(":") capturing = false break default: this.groupIdx++ break } var value = this.disjunction() this.consumeChar(")") var groupAst = { type: "Group", capturing: capturing, value: value } if (capturing) { groupAst.idx = this.groupIdx } return groupAst } RegExpParser.prototype.positiveInteger = function() { var number = this.popChar() // istanbul ignore next - can't ever get here due to previous lookahead checks // still implementing this error checking in case this ever changes. if (decimalPatternNoZero.test(number) === false) { throw Error("Expecting a positive integer") } while (decimalPattern.test(this.peekChar(0))) { number += this.popChar() } return parseInt(number, 10) } RegExpParser.prototype.integerIncludingZero = function() { var number = this.popChar() if (decimalPattern.test(number) === false) { throw Error("Expecting an integer") } while (decimalPattern.test(this.peekChar(0))) { number += this.popChar() } return parseInt(number, 10) } RegExpParser.prototype.patternCharacter = function() { var nextChar = this.popChar() switch (nextChar) { // istanbul ignore next case "\n": // istanbul ignore next case "\r": // istanbul ignore next case "\u2028": // istanbul ignore next case "\u2029": // istanbul ignore next case "^": // istanbul ignore next case "$": // istanbul ignore next case "\\": // istanbul ignore next case ".": // istanbul ignore next case "*": // istanbul ignore next case "+": // istanbul ignore next case "?": // istanbul ignore next case "(": // istanbul ignore next case ")": // istanbul ignore next case "[": // istanbul ignore next case "|": // istanbul ignore next throw Error("TBD") default: return { type: "Character", value: cc(nextChar) } } } RegExpParser.prototype.isRegExpFlag = function() { switch (this.peekChar(0)) { case "g": case "i": case "m": case "u": case "y": return true default: return false } } RegExpParser.prototype.isRangeDash = function() { return this.peekChar() === "-" && this.isClassAtom(1) } RegExpParser.prototype.isDigit = function() { return decimalPattern.test(this.peekChar(0)) } RegExpParser.prototype.isClassAtom = function(howMuch) { if (howMuch === undefined) { howMuch = 0 } switch (this.peekChar(howMuch)) { case "]": case "\n": case "\r": case "\u2028": case "\u2029": return false default: return true } } RegExpParser.prototype.isTerm = function() { return this.isAtom() || this.isAssertion() } RegExpParser.prototype.isAtom = function() { if (this.isPatternCharacter()) { return true } switch (this.peekChar(0)) { case ".": case "\\": // atomEscape case "[": // characterClass // TODO: isAtom must be called before isAssertion - disambiguate case "(": // group return true default: return false } } RegExpParser.prototype.isAssertion = function() { switch (this.peekChar(0)) { case "^": case "$": return true // '\b' or '\B' case "\\": switch (this.peekChar(1)) { case "b": case "B": return true default: return false } // '(?=' or '(?!' case "(": return ( this.peekChar(1) === "?" && (this.peekChar(2) === "=" || this.peekChar(2) === "!") ) default: return false } } RegExpParser.prototype.isQuantifier = function() { var prevState = this.saveState() try { return this.quantifier(true) !== undefined } catch (e) { return false } finally { this.restoreState(prevState) } } RegExpParser.prototype.isPatternCharacter = function() { switch (this.peekChar()) { case "^": case "$": case "\\": case ".": case "*": case "+": case "?": case "(": case ")": case "[": case "|": case "/": case "\n": case "\r": case "\u2028": case "\u2029": return false default: return true } } RegExpParser.prototype.parseHexDigits = function(howMany) { var hexString = "" for (var i = 0; i < howMany; i++) { var hexChar = this.popChar() if (hexDigitPattern.test(hexChar) === false) { throw Error("Expecting a HexDecimal digits") } hexString += hexChar } var charCode = parseInt(hexString, 16) return { type: "Character", value: charCode } } RegExpParser.prototype.peekChar = function(howMuch) { if (howMuch === undefined) { howMuch = 0 } return this.input[this.idx + howMuch] } RegExpParser.prototype.popChar = function() { var nextChar = this.peekChar(0) this.consumeChar() return nextChar } RegExpParser.prototype.consumeChar = function(char) { if (char !== undefined && this.input[this.idx] !== char) { throw Error( "Expected: '" + char + "' but found: '" + this.input[this.idx] + "' at offset: " + this.idx ) } if (this.idx >= this.input.length) { throw Error("Unexpected end of input") } this.idx++ } // consts and utilities var hexDigitPattern = /[0-9a-fA-F]/ var decimalPattern = /[0-9]/ var decimalPatternNoZero = /[1-9]/ function cc(char) { return char.charCodeAt(0) } function insertToSet(item, set) { if (item.length !== undefined) { item.forEach(function(subItem) { set.push(subItem) }) } else { set.push(item) } } function addFlag(flagObj, flagKey) { if (flagObj[flagKey] === true) { throw "duplicate flag " + flagKey } flagObj[flagKey] = true } function ASSERT_EXISTS(obj) { // istanbul ignore next if (obj === undefined) { throw Error("Internal Error - Should never get here!") } } // istanbul ignore next function ASSERT_NEVER_REACH_HERE() { throw Error("Internal Error - Should never get here!") } var i var digitsCharCodes = [] for (i = cc("0"); i <= cc("9"); i++) { digitsCharCodes.push(i) } var wordCharCodes = [cc("_")].concat(digitsCharCodes) for (i = cc("a"); i <= cc("z"); i++) { wordCharCodes.push(i) } for (i = cc("A"); i <= cc("Z"); i++) { wordCharCodes.push(i) } // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp#character-classes var whitespaceCodes = [ cc(" "), cc("\f"), cc("\n"), cc("\r"), cc("\t"), cc("\v"), cc("\t"), cc("\u00a0"), cc("\u1680"), cc("\u2000"), cc("\u2001"), cc("\u2002"), cc("\u2003"), cc("\u2004"), cc("\u2005"), cc("\u2006"), cc("\u2007"), cc("\u2008"), cc("\u2009"), cc("\u200a"), cc("\u2028"), cc("\u2029"), cc("\u202f"), cc("\u205f"), cc("\u3000"), cc("\ufeff") ] function BaseRegExpVisitor() {} BaseRegExpVisitor.prototype.visitChildren = function(node) { for (var key in node) { var child = node[key] /* istanbul ignore else */ if (node.hasOwnProperty(key)) { if (child.type !== undefined) { this.visit(child) } else if (Array.isArray(child)) { child.forEach(function(subChild) { this.visit(subChild) }, this) } } } } BaseRegExpVisitor.prototype.visit = function(node) { switch (node.type) { case "Pattern": this.visitPattern(node) break case "Flags": this.visitFlags(node) break case "Disjunction": this.visitDisjunction(node) break case "Alternative": this.visitAlternative(node) break case "StartAnchor": this.visitStartAnchor(node) break case "EndAnchor": this.visitEndAnchor(node) break case "WordBoundary": this.visitWordBoundary(node) break case "NonWordBoundary": this.visitNonWordBoundary(node) break case "Lookahead": this.visitLookahead(node) break case "NegativeLookahead": this.visitNegativeLookahead(node) break case "Character": this.visitCharacter(node) break case "Set": this.visitSet(node) break case "Group": this.visitGroup(node) break case "GroupBackReference": this.visitGroupBackReference(node) break case "Quantifier": this.visitQuantifier(node) break } this.visitChildren(node) } BaseRegExpVisitor.prototype.visitPattern = function(node) {} BaseRegExpVisitor.prototype.visitFlags = function(node) {} BaseRegExpVisitor.prototype.visitDisjunction = function(node) {} BaseRegExpVisitor.prototype.visitAlternative = function(node) {} // Assertion BaseRegExpVisitor.prototype.visitStartAnchor = function(node) {} BaseRegExpVisitor.prototype.visitEndAnchor = function(node) {} BaseRegExpVisitor.prototype.visitWordBoundary = function(node) {} BaseRegExpVisitor.prototype.visitNonWordBoundary = function(node) {} BaseRegExpVisitor.prototype.visitLookahead = function(node) {} BaseRegExpVisitor.prototype.visitNegativeLookahead = function(node) {} // atoms BaseRegExpVisitor.prototype.visitCharacter = function(node) {} BaseRegExpVisitor.prototype.visitSet = function(node) {} BaseRegExpVisitor.prototype.visitGroup = function(node) {} BaseRegExpVisitor.prototype.visitGroupBackReference = function(node) {} BaseRegExpVisitor.prototype.visitQuantifier = function(node) {} return { RegExpParser: RegExpParser, BaseRegExpVisitor: BaseRegExpVisitor, VERSION: "0.4.0" } } )