"use strict"; const { createToken: createTokenOrg, Lexer } = require("chevrotain"); const camelCase = require("lodash/camelCase"); let chars; // A little mini DSL for easier lexer definition. const fragments = {}; try { chars = require("./unicodesets"); } catch (e) { throw Error( "unicodesets.js file could not be found. Did you try to run the command: yarn run build ?" ); } function inlineFragments(def) { let inlinedDef = def; Object.keys(fragments).forEach(prevFragmentName => { const prevFragmentDef = fragments[prevFragmentName]; const templateRegExp = new RegExp(`{{${prevFragmentName}}}`, "g"); inlinedDef = inlinedDef.replace(templateRegExp, prevFragmentDef); }); return inlinedDef; } function FRAGMENT(name, def) { fragments[name] = inlineFragments(def); } function MAKE_PATTERN(def, flags) { const inlinedDef = inlineFragments(def); return new RegExp(inlinedDef, flags); } // The order of fragments definitions is important FRAGMENT("Digits", "[0-9]([0-9_]*[0-9])?"); FRAGMENT("ExponentPart", "[eE][+-]?{{Digits}}"); FRAGMENT("HexDigit", "[0-9a-fA-F]"); FRAGMENT("HexDigits", "{{HexDigit}}(({{HexDigit}}|'_')*{{HexDigit}})?"); FRAGMENT("FloatTypeSuffix", "[fFdD]"); FRAGMENT("LineTerminator", "(\\x0A|(\\x0D(\\x0A)?))"); FRAGMENT("UnicodeMarker", "uu*"); FRAGMENT("UnicodeEscape", "\\\\{{UnicodeMarker}}{{HexDigit}}{4}"); FRAGMENT("RawInputCharacter", "\\\\{{UnicodeMarker}}[0-9a-fA-F]{4}"); FRAGMENT("UnicodeInputCharacter", "({{UnicodeEscape}}|{{RawInputCharacter}})"); FRAGMENT("OctalDigit", "[0-7]"); FRAGMENT("ZeroToThree", "[0-3]"); FRAGMENT( "OctalEscape", "\\\\({{OctalDigit}}|{{ZeroToThree}}?{{OctalDigit}}{2})" ); FRAGMENT("EscapeSequence", "\\\\[btnfr\"'\\\\]|{{OctalEscape}}"); // Not using InputCharacter terminology there because CR and LF are already captured in EscapeSequence FRAGMENT( "StringCharacter", "(?:(?:{{EscapeSequence}})|{{UnicodeInputCharacter}})" ); function matchJavaIdentifier(text, startOffset) { let endOffset = startOffset; let charCode = text.codePointAt(endOffset); // We verifiy if the first character is from one of these categories // Corresponds to the isJavaIdentifierStart function from Java if (chars.firstIdentChar.has(charCode)) { endOffset++; // If we encounter a surrogate pair (something that is beyond 65535/FFFF) // We skip another offset because a surrogate pair is of length 2. if (charCode > 65535) { endOffset++; } charCode = text.codePointAt(endOffset); } // We verify if the remaining characters is from one of these categories // Corresponds to the isJavaIdentifierPart function from Java while (chars.restIdentChar.has(charCode)) { endOffset++; // See above. if (charCode > 65535) { endOffset++; } charCode = text.codePointAt(endOffset); } // No match, must return null to conform with the RegExp.prototype.exec signature if (endOffset === startOffset) { return null; } const matchedString = text.substring(startOffset, endOffset); // according to the RegExp.prototype.exec API the first item in the returned array must be the whole matched string. return [matchedString]; } const Identifier = createTokenOrg({ name: "Identifier", pattern: { exec: matchJavaIdentifier }, line_breaks: false, start_chars_hint: Array.from(chars.firstIdentChar, x => String.fromCharCode(x) ) }); const allTokens = []; const tokenDictionary = {}; function createToken(options) { // TODO create a test to check all the tokenbs have a label defined if (!options.label) { // simple token (e.g operator) if (typeof options.pattern === "string") { options.label = `'${options.pattern}'`; } // Complex token (e.g literal) else if (options.pattern instanceof RegExp) { options.label = `'${options.name}'`; } } const newTokenType = createTokenOrg(options); allTokens.push(newTokenType); tokenDictionary[options.name] = newTokenType; return newTokenType; } function createKeywordLikeToken(options) { // A keyword 'like' token uses the "longer_alt" config option // to resolve ambiguities, see: http://sap.github.io/chevrotain/docs/features/token_alternative_matches.html options.longer_alt = Identifier; return createToken(options); } // Token Categories // Used a Token Category to mark all restricted keywords. // This could be used in syntax highlights implementation. const RestrictedKeyword = createToken({ name: "RestrictedKeyword", pattern: Lexer.NA }); // Used a Token Category to mark all keywords. // This could be used in syntax highlights implementation. const Keyword = createToken({ name: "Keyword", pattern: Lexer.NA }); const AssignmentOperator = createToken({ name: "AssignmentOperator", pattern: Lexer.NA }); const BinaryOperator = createToken({ name: "BinaryOperator", pattern: Lexer.NA }); const UnaryPrefixOperator = createToken({ name: "UnaryPrefixOperator", pattern: Lexer.NA }); const UnaryPrefixOperatorNotPlusMinus = createToken({ name: "UnaryPrefixOperatorNotPlusMinus", pattern: Lexer.NA }); const UnarySuffixOperator = createToken({ name: "UnarySuffixOperator", pattern: Lexer.NA }); // https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.11 const Separators = createToken({ name: "Separators", pattern: Lexer.NA }); // https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.6 // Note [\\x09\\x20\\x0C] is equivalent to [\\t\\x20\\f] and that \\x20 represents // space character createToken({ name: "WhiteSpace", pattern: MAKE_PATTERN("[\\x09\\x20\\x0C]|{{LineTerminator}}"), group: Lexer.SKIPPED }); createToken({ name: "LineComment", pattern: /\/\/[^\n\r]*/, group: "comments" }); createToken({ name: "TraditionalComment", pattern: /\/\*([^*]|\*(?!\/))*\*\//, group: "comments" }); createToken({ name: "BinaryLiteral", pattern: /0[bB][01]([01_]*[01])?[lL]?/ }); createToken({ name: "FloatLiteral", pattern: MAKE_PATTERN( "{{Digits}}\\.({{Digits}})?({{ExponentPart}})?({{FloatTypeSuffix}})?|" + "\\.{{Digits}}({{ExponentPart}})?({{FloatTypeSuffix}})?|" + "{{Digits}}{{ExponentPart}}({{FloatTypeSuffix}})?|" + "{{Digits}}({{ExponentPart}})?{{FloatTypeSuffix}}" ) }); createToken({ name: "OctalLiteral", pattern: /0_*[0-7]([0-7_]*[0-7])?[lL]?/ }); createToken({ name: "HexFloatLiteral", pattern: MAKE_PATTERN( "0[xX]({{HexDigits}}\\.?|({{HexDigits}})?\\.{{HexDigits}})[pP][+-]?{{Digits}}[fFdD]?" ) }); createToken({ name: "HexLiteral", pattern: /0[xX][0-9a-fA-F]([0-9a-fA-F_]*[0-9a-fA-F])?[lL]?/ }); createToken({ name: "DecimalLiteral", pattern: MAKE_PATTERN("(0|[1-9](_+{{Digits}}|({{Digits}})?))[lL]?") }); // https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.10.4 createToken({ name: "CharLiteral", // Not using SingleCharacter Terminology because ' and \ are captured in EscapeSequence pattern: MAKE_PATTERN( "'(?:[^\\\\']|(?:(?:{{EscapeSequence}})|{{UnicodeInputCharacter}}))'" ) }); createToken({ name: "TextBlock", pattern: /"""\s*\n(\\"|\s|.)*?"""/ }); createToken({ name: "StringLiteral", pattern: MAKE_PATTERN('"(?:[^\\\\"]|{{StringCharacter}})*"') }); // https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.9 // TODO: how to handle the special rule (see spec above) for "requires" and "transitive" const restrictedKeywords = [ "open", "module", "requires", "transitive", "exports", "opens", "to", "uses", "provides", "with", "sealed", "non-sealed", "permits" ]; // By sorting the keywords in descending order we avoid ambiguities // of common prefixes. sortDescLength(restrictedKeywords).forEach(word => { createKeywordLikeToken({ name: word[0].toUpperCase() + camelCase(word.substr(1)), pattern: word, // restricted keywords can also be used as an Identifiers according to the spec. // TODO: inspect this causes no ambiguities categories: [Identifier, RestrictedKeyword] }); }); // https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.9 const keywords = [ "abstract", "continue", "for", "new", "switch", "assert", "default", "if", "package", "synchronized", "boolean", "do", "goto", "private", "this", "break", "double", "implements", "protected", "throw", "byte", "else", "import", "public", "throws", "case", "enum", // "instanceof", // special handling for "instanceof" operator below "return", "transient", "catch", "extends", "int", "short", "try", "char", "final", "interface", "static", "void", "class", "finally", "long", "strictfp", "volatile", "const", "float", "native", "super", "while", ["_", "underscore"] ]; sortDescLength(keywords).forEach(word => { // For handling symbols keywords (underscore) const isPair = Array.isArray(word); const actualName = isPair ? word[1] : word; const actualPattern = isPair ? word[0] : word; const options = { name: actualName[0].toUpperCase() + actualName.substr(1), pattern: actualPattern, categories: Keyword }; if (isPair) { options.label = `'${actualName}'`; } createKeywordLikeToken(options); }); createKeywordLikeToken({ name: "Instanceof", pattern: "instanceof", categories: [Keyword, BinaryOperator] }); createKeywordLikeToken({ name: "Var", pattern: "var", // https://docs.oracle.com/javase/specs/jls/se16/html/jls-3.html#jls-3.9 // "var is not a keyword, but rather an identifier with special meaning as the type of a local variable declaration" categories: Identifier }); createKeywordLikeToken({ name: "Yield", pattern: "yield", // https://docs.oracle.com/javase/specs/jls/se16/html/jls-3.html#jls-3.9 // "yield is not a keyword, but rather an identifier with special meaning as the type of a local variable declaration" categories: Identifier }); createKeywordLikeToken({ name: "Record", pattern: "record", // https://docs.oracle.com/javase/specs/jls/se16/html/jls-3.html#jls-3.9 // "record is not a keyword, but rather an identifier with special meaning as the type of a local variable declaration" categories: Identifier }); createKeywordLikeToken({ name: "True", pattern: "true" }); createKeywordLikeToken({ name: "False", pattern: "false" }); createKeywordLikeToken({ name: "Null", pattern: "null" }); // punctuation and symbols createToken({ name: "At", pattern: "@", categories: [Separators] }); createToken({ name: "Arrow", pattern: "->" }); createToken({ name: "DotDotDot", pattern: "...", categories: [Separators] }); createToken({ name: "Dot", pattern: ".", categories: [Separators] }); createToken({ name: "Comma", pattern: ",", categories: [Separators] }); createToken({ name: "Semicolon", pattern: ";", categories: [Separators] }); createToken({ name: "ColonColon", pattern: "::", categories: [Separators] }); createToken({ name: "Colon", pattern: ":" }); createToken({ name: "QuestionMark", pattern: "?" }); createToken({ name: "LBrace", pattern: "(", categories: [Separators] }); createToken({ name: "RBrace", pattern: ")", categories: [Separators] }); createToken({ name: "LCurly", pattern: "{", categories: [Separators] }); createToken({ name: "RCurly", pattern: "}", categories: [Separators] }); createToken({ name: "LSquare", pattern: "[", categories: [Separators] }); createToken({ name: "RSquare", pattern: "]", categories: [Separators] }); // prefix and suffix operators // must be defined before "-" createToken({ name: "MinusMinus", pattern: "--", categories: [ UnaryPrefixOperator, UnarySuffixOperator, UnaryPrefixOperatorNotPlusMinus ] }); // must be defined before "+" createToken({ name: "PlusPlus", pattern: "++", categories: [ UnaryPrefixOperator, UnarySuffixOperator, UnaryPrefixOperatorNotPlusMinus ] }); createToken({ name: "Complement", pattern: "~", categories: [UnaryPrefixOperator, UnaryPrefixOperatorNotPlusMinus] }); createToken({ name: "LessEquals", pattern: "<=", categories: [BinaryOperator] }); createToken({ name: "LessLessEquals", pattern: "<<=", categories: [AssignmentOperator] }); createToken({ name: "Less", pattern: "<", categories: [BinaryOperator] }); createToken({ name: "GreaterEquals", pattern: ">=", categories: [BinaryOperator] }); createToken({ name: "GreaterGreaterEquals", pattern: ">>=", categories: [AssignmentOperator] }); createToken({ name: "GreaterGreaterGreaterEquals", pattern: ">>>=", categories: [AssignmentOperator] }); createToken({ name: "Greater", pattern: ">", categories: [BinaryOperator] }); createToken({ name: "EqualsEquals", pattern: "==", categories: [BinaryOperator] }); createToken({ name: "Equals", pattern: "=", categories: [BinaryOperator, AssignmentOperator] }); createToken({ name: "MinusEquals", pattern: "-=", categories: [AssignmentOperator] }); createToken({ name: "Minus", pattern: "-", categories: [BinaryOperator, UnaryPrefixOperator] }); createToken({ name: "PlusEquals", pattern: "+=", categories: [AssignmentOperator] }); createToken({ name: "Plus", pattern: "+", categories: [BinaryOperator, UnaryPrefixOperator] }); createToken({ name: "AndAnd", pattern: "&&", categories: [BinaryOperator] }); createToken({ name: "AndEquals", pattern: "&=", categories: [AssignmentOperator] }); createToken({ name: "And", pattern: "&", categories: [BinaryOperator] }); createToken({ name: "XorEquals", pattern: "^=", categories: [AssignmentOperator] }); createToken({ name: "Xor", pattern: "^", categories: [BinaryOperator] }); createToken({ name: "NotEquals", pattern: "!=", categories: [BinaryOperator] }); createToken({ name: "OrOr", pattern: "||", categories: [BinaryOperator] }); createToken({ name: "OrEquals", pattern: "|=", categories: [AssignmentOperator] }); createToken({ name: "Or", pattern: "|", categories: [BinaryOperator] }); createToken({ name: "MultiplyEquals", pattern: "*=", categories: [AssignmentOperator] }); createToken({ name: "Star", pattern: "*", categories: [BinaryOperator] }); createToken({ name: "DivideEquals", pattern: "/=", categories: [AssignmentOperator] }); createToken({ name: "Divide", pattern: "/", categories: [BinaryOperator] }); createToken({ name: "ModuloEquals", pattern: "%=", categories: [AssignmentOperator] }); createToken({ name: "Modulo", pattern: "%", categories: [BinaryOperator] }); // must be defined after "!=" createToken({ name: "Not", pattern: "!", categories: [UnaryPrefixOperator, UnaryPrefixOperatorNotPlusMinus] }); // Identifier must appear AFTER all the keywords to avoid ambiguities. // See: https://github.com/SAP/chevrotain/blob/master/examples/lexer/keywords_vs_identifiers/keywords_vs_identifiers.js allTokens.push(Identifier); tokenDictionary["Identifier"] = Identifier; function sortDescLength(arr) { // sort is not stable, but that will not affect the lexing results. return arr.sort((a, b) => { return b.length - a.length; }); } module.exports = { allTokens, tokens: tokenDictionary };