// GGD/node_modules/java-parser/src/tokens.js (537 lines, 15 KiB, JavaScript)

"use strict";
// Token definitions for the Java lexer, built on top of chevrotain.
// NOTE: chevrotain's createToken is aliased to createTokenOrg because this
// module defines its own createToken wrapper further down.
const { createToken: createTokenOrg, Lexer } = require("chevrotain");
const camelCase = require("lodash/camelCase");
let chars;
// A little mini DSL for easier lexer definition.
const fragments = {};
try {
  // unicodesets.js is generated at build time; it provides the sets of code
  // points that may start / continue a Java identifier.
  chars = require("./unicodesets");
} catch (e) {
  // Surface a friendly build hint instead of the raw module-resolution error.
  throw Error(
    "unicodesets.js file could not be found. Did you try to run the command: yarn run build ?"
  );
}
function inlineFragments(def) {
  // Expand every known "{{FragmentName}}" placeholder in `def` with that
  // fragment's (already expanded) definition. Fragments registered earlier
  // are substituted in registration order.
  let result = def;
  for (const fragmentName of Object.keys(fragments)) {
    const placeholder = new RegExp(`{{${fragmentName}}}`, "g");
    result = result.replace(placeholder, fragments[fragmentName]);
  }
  return result;
}
function FRAGMENT(name, def) {
  // Register a named regex fragment. Placeholders referring to previously
  // registered fragments are expanded eagerly, at registration time.
  const expandedDef = inlineFragments(def);
  fragments[name] = expandedDef;
}
function MAKE_PATTERN(def, flags) {
  // Build a RegExp from a fragment-template string, expanding any
  // "{{FragmentName}}" placeholders first.
  return new RegExp(inlineFragments(def), flags);
}
// The order of fragments definitions is important
FRAGMENT("Digits", "[0-9]([0-9_]*[0-9])?");
FRAGMENT("ExponentPart", "[eE][+-]?{{Digits}}");
FRAGMENT("HexDigit", "[0-9a-fA-F]");
// FIX: the underscore alternative used to be written as `'_'` (quote,
// underscore, quote). Inside a regex alternation that is the literal
// three-character sequence «'», «_», «'» — which can never occur inside a
// hex numeral — so underscore separators (e.g. 0x1_2p3, legal per
// JLS §3.10.1/§3.10.2) were rejected by every pattern built on
// {{HexDigits}} (i.e. HexFloatLiteral).
FRAGMENT("HexDigits", "{{HexDigit}}(({{HexDigit}}|_)*{{HexDigit}})?");
FRAGMENT("FloatTypeSuffix", "[fFdD]");
// \x0A = LF, \x0D = CR; a CRLF pair is consumed as a single terminator.
FRAGMENT("LineTerminator", "(\\x0A|(\\x0D(\\x0A)?))");
// JLS §3.3: a unicode escape may use one or more 'u' characters (\uu0041).
FRAGMENT("UnicodeMarker", "uu*");
FRAGMENT("UnicodeEscape", "\\\\{{UnicodeMarker}}{{HexDigit}}{4}");
FRAGMENT("RawInputCharacter", "\\\\{{UnicodeMarker}}[0-9a-fA-F]{4}");
FRAGMENT("UnicodeInputCharacter", "({{UnicodeEscape}}|{{RawInputCharacter}})");
FRAGMENT("OctalDigit", "[0-7]");
FRAGMENT("ZeroToThree", "[0-3]");
// JLS §3.10.6: \N, \NN, or \ZNN where Z is 0-3 (the optional {{ZeroToThree}}
// prefix covers both the two- and three-digit forms).
FRAGMENT(
  "OctalEscape",
  "\\\\({{OctalDigit}}|{{ZeroToThree}}?{{OctalDigit}}{2})"
);
FRAGMENT("EscapeSequence", "\\\\[btnfr\"'\\\\]|{{OctalEscape}}");
// Not using InputCharacter terminology there because CR and LF are already captured in EscapeSequence
FRAGMENT(
  "StringCharacter",
  "(?:(?:{{EscapeSequence}})|{{UnicodeInputCharacter}})"
);
function matchJavaIdentifier(text, startOffset) {
  // Custom chevrotain token pattern (RegExp.prototype.exec compatible) that
  // matches a Java identifier beginning at `startOffset`.
  let offset = startOffset;
  let codePoint = text.codePointAt(offset);

  // First code point: the equivalent of Java's isJavaIdentifierStart.
  if (chars.firstIdentChar.has(codePoint)) {
    // Supplementary code points (> 0xFFFF) occupy two UTF-16 code units
    // (a surrogate pair), so they advance the offset by 2.
    offset += codePoint > 65535 ? 2 : 1;
    codePoint = text.codePointAt(offset);
  }

  // Remaining code points: the equivalent of Java's isJavaIdentifierPart.
  while (chars.restIdentChar.has(codePoint)) {
    offset += codePoint > 65535 ? 2 : 1;
    codePoint = text.codePointAt(offset);
  }

  if (offset === startOffset) {
    // Nothing consumed: the exec() contract requires null on no-match.
    return null;
  }
  // exec() contract: element 0 of the result array is the whole match.
  return [text.substring(startOffset, offset)];
}
const Identifier = createTokenOrg({
  name: "Identifier",
  // Custom pattern: Java identifiers are defined in terms of Unicode
  // categories (see matchJavaIdentifier), not a practical RegExp.
  pattern: { exec: matchJavaIdentifier },
  line_breaks: false,
  // FIX: use String.fromCodePoint instead of String.fromCharCode.
  // fromCharCode truncates code points above 0xFFFF (ToUint16), producing a
  // wrong hint character for every supplementary identifier-start code point
  // in chars.firstIdentChar — so the lexer would never attempt this token at
  // such positions, even though matchJavaIdentifier explicitly handles
  // surrogate pairs. fromCodePoint yields the proper surrogate pair, whose
  // first code unit is what the lexer keys the hint on.
  start_chars_hint: Array.from(chars.firstIdentChar, x =>
    String.fromCodePoint(x)
  )
});
// All token types, in definition order (the lexer matches in this order).
const allTokens = [];
// Token name -> TokenType lookup; mirrors the contents of allTokens.
const tokenDictionary = {};

/**
 * Wraps chevrotain's createToken: derives a default label when none was
 * provided, then registers the new token type in allTokens/tokenDictionary.
 */
function createToken(options) {
  // TODO create a test to check all the tokens have a label defined
  if (!options.label) {
    if (typeof options.pattern === "string") {
      // Simple token (e.g operator): label is the literal pattern.
      options.label = `'${options.pattern}'`;
    } else if (options.pattern instanceof RegExp) {
      // Complex token (e.g literal): fall back to the token's name.
      options.label = `'${options.name}'`;
    }
  }
  const tokenType = createTokenOrg(options);
  allTokens.push(tokenType);
  tokenDictionary[options.name] = tokenType;
  return tokenType;
}
function createKeywordLikeToken(options) {
  // A keyword "like" token resolves keyword-vs-identifier ambiguities via
  // the "longer_alt" config option, see:
  // http://sap.github.io/chevrotain/docs/features/token_alternative_matches.html
  // Object.assign mutates `options` in place, matching the original contract.
  return createToken(Object.assign(options, { longer_alt: Identifier }));
}
// Token Categories
// Used a Token Category to mark all restricted keywords.
// This could be used in syntax highlights implementation.
const RestrictedKeyword = createToken({
  name: "RestrictedKeyword",
  // Lexer.NA: category tokens are never matched directly, only via the
  // member tokens that list them in `categories`.
  pattern: Lexer.NA
});
// Used a Token Category to mark all keywords.
// This could be used in syntax highlights implementation.
const Keyword = createToken({
  name: "Keyword",
  pattern: Lexer.NA
});
// The categories below group operator tokens by the grammar role they can
// play, so the parser can reference whole operator families at once.
const AssignmentOperator = createToken({
  name: "AssignmentOperator",
  pattern: Lexer.NA
});
const BinaryOperator = createToken({
  name: "BinaryOperator",
  pattern: Lexer.NA
});
const UnaryPrefixOperator = createToken({
  name: "UnaryPrefixOperator",
  pattern: Lexer.NA
});
const UnaryPrefixOperatorNotPlusMinus = createToken({
  name: "UnaryPrefixOperatorNotPlusMinus",
  pattern: Lexer.NA
});
const UnarySuffixOperator = createToken({
  name: "UnarySuffixOperator",
  pattern: Lexer.NA
});
// https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.11
const Separators = createToken({
  name: "Separators",
  pattern: Lexer.NA
});
// https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.6
// Note [\\x09\\x20\\x0C] is equivalent to [\\t\\x20\\f] and that \\x20 represents
// space character
createToken({
  name: "WhiteSpace",
  pattern: MAKE_PATTERN("[\\x09\\x20\\x0C]|{{LineTerminator}}"),
  // Whitespace is discarded before parsing.
  group: Lexer.SKIPPED
});
// Comments go into a dedicated "comments" token group: the parser ignores
// them, but tools (e.g. a formatter) can still re-attach them.
createToken({
  name: "LineComment",
  pattern: /\/\/[^\n\r]*/,
  group: "comments"
});
createToken({
  name: "TraditionalComment",
  // Matches /* ... */; the (?!\/) lookahead stops the body from consuming
  // the '*' of the closing "*/".
  pattern: /\/\*([^*]|\*(?!\/))*\*\//,
  group: "comments"
});
// NOTE: definition order below is significant — the lexer tries token types
// in registration order, so the more specific numeric literals
// (binary/float/hex-float/hex/octal) must come before DecimalLiteral.
createToken({ name: "BinaryLiteral", pattern: /0[bB][01]([01_]*[01])?[lL]?/ });
createToken({
  name: "FloatLiteral",
  // JLS §3.10.2: the four legal combinations of digits, dot, exponent and
  // float type suffix.
  pattern: MAKE_PATTERN(
    "{{Digits}}\\.({{Digits}})?({{ExponentPart}})?({{FloatTypeSuffix}})?|" +
      "\\.{{Digits}}({{ExponentPart}})?({{FloatTypeSuffix}})?|" +
      "{{Digits}}{{ExponentPart}}({{FloatTypeSuffix}})?|" +
      "{{Digits}}({{ExponentPart}})?{{FloatTypeSuffix}}"
  )
});
createToken({ name: "OctalLiteral", pattern: /0_*[0-7]([0-7_]*[0-7])?[lL]?/ });
createToken({
  name: "HexFloatLiteral",
  pattern: MAKE_PATTERN(
    "0[xX]({{HexDigits}}\\.?|({{HexDigits}})?\\.{{HexDigits}})[pP][+-]?{{Digits}}[fFdD]?"
  )
});
createToken({
  name: "HexLiteral",
  pattern: /0[xX][0-9a-fA-F]([0-9a-fA-F_]*[0-9a-fA-F])?[lL]?/
});
createToken({
  name: "DecimalLiteral",
  pattern: MAKE_PATTERN("(0|[1-9](_+{{Digits}}|({{Digits}})?))[lL]?")
});
// https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.10.4
createToken({
  name: "CharLiteral",
  // Not using SingleCharacter Terminology because ' and \ are captured in EscapeSequence
  pattern: MAKE_PATTERN(
    "'(?:[^\\\\']|(?:(?:{{EscapeSequence}})|{{UnicodeInputCharacter}}))'"
  )
});
// TextBlock must be registered before StringLiteral: both start with '"',
// and the lexer picks the first token type whose pattern matches.
createToken({
  name: "TextBlock",
  pattern: /"""\s*\n(\\"|\s|.)*?"""/
});
createToken({
  name: "StringLiteral",
  pattern: MAKE_PATTERN('"(?:[^\\\\"]|{{StringCharacter}})*"')
});
// https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.9
// TODO: how to handle the special rule (see spec above) for "requires" and "transitive"
const restrictedKeywords = [
  "open",
  "module",
  "requires",
  "transitive",
  "exports",
  "opens",
  "to",
  "uses",
  "provides",
  "with",
  "sealed",
  "non-sealed",
  "permits"
];
// By sorting the keywords in descending order we avoid ambiguities
// of common prefixes.
sortDescLength(restrictedKeywords).forEach(word => {
  createKeywordLikeToken({
    // Token name is the PascalCase form of the word,
    // e.g. "non-sealed" -> "NonSealed".
    name: word[0].toUpperCase() + camelCase(word.substr(1)),
    pattern: word,
    // restricted keywords can also be used as an Identifiers according to the spec.
    // TODO: inspect this causes no ambiguities
    categories: [Identifier, RestrictedKeyword]
  });
});
// https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.9
const keywords = [
  "abstract",
  "continue",
  "for",
  "new",
  "switch",
  "assert",
  "default",
  "if",
  "package",
  "synchronized",
  "boolean",
  "do",
  "goto",
  "private",
  "this",
  "break",
  "double",
  "implements",
  "protected",
  "throw",
  "byte",
  "else",
  "import",
  "public",
  "throws",
  "case",
  "enum",
  // "instanceof", // special handling for "instanceof" operator below
  "return",
  "transient",
  "catch",
  "extends",
  "int",
  "short",
  "try",
  "char",
  "final",
  "interface",
  "static",
  "void",
  "class",
  "finally",
  "long",
  "strictfp",
  "volatile",
  "const",
  "float",
  "native",
  "super",
  "while",
  // [pattern, name] pair: "_" cannot serve as a token name on its own.
  ["_", "underscore"]
];
sortDescLength(keywords).forEach(word => {
  // For handling symbols keywords (underscore)
  const isPair = Array.isArray(word);
  const actualName = isPair ? word[1] : word;
  const actualPattern = isPair ? word[0] : word;
  const options = {
    // e.g. "while" -> "While", "underscore" -> "Underscore".
    name: actualName[0].toUpperCase() + actualName.substr(1),
    pattern: actualPattern,
    categories: Keyword
  };
  if (isPair) {
    // Use the readable name ("'underscore'") instead of the default label
    // that createToken would derive from the raw pattern ("'_'").
    options.label = `'${actualName}'`;
  }
  createKeywordLikeToken(options);
});
// "instanceof" was excluded from the keywords list above because it is also
// a binary operator in expressions, hence the extra BinaryOperator category.
createKeywordLikeToken({
  name: "Instanceof",
  pattern: "instanceof",
  categories: [Keyword, BinaryOperator]
});
createKeywordLikeToken({
  name: "Var",
  pattern: "var",
  // https://docs.oracle.com/javase/specs/jls/se16/html/jls-3.html#jls-3.9
  // "var is not a keyword, but rather an identifier with special meaning as the type of a local variable declaration"
  categories: Identifier
});
createKeywordLikeToken({
  name: "Yield",
  pattern: "yield",
  // https://docs.oracle.com/javase/specs/jls/se16/html/jls-3.html#jls-3.9
  // yield is likewise not a keyword but an identifier with special meaning
  // (in yield statements of switch expressions), so it stays an Identifier.
  categories: Identifier
});
createKeywordLikeToken({
  name: "Record",
  pattern: "record",
  // https://docs.oracle.com/javase/specs/jls/se16/html/jls-3.html#jls-3.9
  // record is likewise not a keyword but an identifier with special meaning
  // (at the start of a record class declaration), so it stays an Identifier.
  categories: Identifier
});
createKeywordLikeToken({ name: "True", pattern: "true" });
createKeywordLikeToken({ name: "False", pattern: "false" });
createKeywordLikeToken({ name: "Null", pattern: "null" });
// punctuation and symbols
createToken({ name: "At", pattern: "@", categories: [Separators] });
createToken({ name: "Arrow", pattern: "->" });
// Order matters: "..." must come before "." and "::" before ":", because
// the lexer tries patterns in definition order.
createToken({ name: "DotDotDot", pattern: "...", categories: [Separators] });
createToken({ name: "Dot", pattern: ".", categories: [Separators] });
createToken({ name: "Comma", pattern: ",", categories: [Separators] });
createToken({ name: "Semicolon", pattern: ";", categories: [Separators] });
createToken({ name: "ColonColon", pattern: "::", categories: [Separators] });
createToken({ name: "Colon", pattern: ":" });
createToken({ name: "QuestionMark", pattern: "?" });
// NOTE: despite the names, LBrace/RBrace are the round parentheses here and
// LCurly/RCurly the curly braces.
createToken({ name: "LBrace", pattern: "(", categories: [Separators] });
createToken({ name: "RBrace", pattern: ")", categories: [Separators] });
createToken({ name: "LCurly", pattern: "{", categories: [Separators] });
createToken({ name: "RCurly", pattern: "}", categories: [Separators] });
createToken({ name: "LSquare", pattern: "[", categories: [Separators] });
createToken({ name: "RSquare", pattern: "]", categories: [Separators] });
// prefix and suffix operators
// Throughout this section the multi-character operator must be registered
// before any operator that is a prefix of it (e.g. "--" before "-").
// must be defined before "-"
createToken({
  name: "MinusMinus",
  pattern: "--",
  categories: [
    UnaryPrefixOperator,
    UnarySuffixOperator,
    UnaryPrefixOperatorNotPlusMinus
  ]
});
// must be defined before "+"
createToken({
  name: "PlusPlus",
  pattern: "++",
  categories: [
    UnaryPrefixOperator,
    UnarySuffixOperator,
    UnaryPrefixOperatorNotPlusMinus
  ]
});
createToken({
  name: "Complement",
  pattern: "~",
  categories: [UnaryPrefixOperator, UnaryPrefixOperatorNotPlusMinus]
});
// NOTE(review): there is deliberately no "<<" / ">>" / ">>>" token here —
// presumably shift operators are re-assembled from Less/Greater tokens in
// the parser because of the generics ambiguity (List<List<String>>);
// confirm against the parser before changing this section.
createToken({
  name: "LessEquals",
  pattern: "<=",
  categories: [BinaryOperator]
});
createToken({
  name: "LessLessEquals",
  pattern: "<<=",
  categories: [AssignmentOperator]
});
createToken({ name: "Less", pattern: "<", categories: [BinaryOperator] });
createToken({
  name: "GreaterEquals",
  pattern: ">=",
  categories: [BinaryOperator]
});
createToken({
  name: "GreaterGreaterEquals",
  pattern: ">>=",
  categories: [AssignmentOperator]
});
createToken({
  name: "GreaterGreaterGreaterEquals",
  pattern: ">>>=",
  categories: [AssignmentOperator]
});
createToken({ name: "Greater", pattern: ">", categories: [BinaryOperator] });
createToken({
  name: "EqualsEquals",
  pattern: "==",
  categories: [BinaryOperator]
});
createToken({
  name: "Equals",
  pattern: "=",
  categories: [BinaryOperator, AssignmentOperator]
});
createToken({
  name: "MinusEquals",
  pattern: "-=",
  categories: [AssignmentOperator]
});
createToken({
  name: "Minus",
  pattern: "-",
  categories: [BinaryOperator, UnaryPrefixOperator]
});
createToken({
  name: "PlusEquals",
  pattern: "+=",
  categories: [AssignmentOperator]
});
createToken({
  name: "Plus",
  pattern: "+",
  categories: [BinaryOperator, UnaryPrefixOperator]
});
// Logical / bitwise / arithmetic operators; as above, each compound form
// ("&&", "&=", ...) must be registered before its single-character prefix.
createToken({ name: "AndAnd", pattern: "&&", categories: [BinaryOperator] });
createToken({
  name: "AndEquals",
  pattern: "&=",
  categories: [AssignmentOperator]
});
createToken({ name: "And", pattern: "&", categories: [BinaryOperator] });
createToken({
  name: "XorEquals",
  pattern: "^=",
  categories: [AssignmentOperator]
});
createToken({ name: "Xor", pattern: "^", categories: [BinaryOperator] });
createToken({ name: "NotEquals", pattern: "!=", categories: [BinaryOperator] });
createToken({ name: "OrOr", pattern: "||", categories: [BinaryOperator] });
createToken({
  name: "OrEquals",
  pattern: "|=",
  categories: [AssignmentOperator]
});
createToken({ name: "Or", pattern: "|", categories: [BinaryOperator] });
createToken({
  name: "MultiplyEquals",
  pattern: "*=",
  categories: [AssignmentOperator]
});
createToken({ name: "Star", pattern: "*", categories: [BinaryOperator] });
createToken({
  name: "DivideEquals",
  pattern: "/=",
  categories: [AssignmentOperator]
});
createToken({ name: "Divide", pattern: "/", categories: [BinaryOperator] });
createToken({
  name: "ModuloEquals",
  pattern: "%=",
  categories: [AssignmentOperator]
});
createToken({ name: "Modulo", pattern: "%", categories: [BinaryOperator] });
// must be defined after "!="
createToken({
  name: "Not",
  pattern: "!",
  categories: [UnaryPrefixOperator, UnaryPrefixOperatorNotPlusMinus]
});
// Identifier must appear AFTER all the keywords to avoid ambiguities.
// See: https://github.com/SAP/chevrotain/blob/master/examples/lexer/keywords_vs_identifiers/keywords_vs_identifiers.js
// Identifier was created with createTokenOrg directly (not the createToken
// wrapper), so it has to be registered here manually.
allTokens.push(Identifier);
tokenDictionary["Identifier"] = Identifier;
function sortDescLength(arr) {
  // Sort in place, longest entries first, and return the same array.
  // The relative order of equal-length entries follows the engine's sort
  // (stable since ES2019); lexing results do not depend on it either way.
  return arr.sort((lhs, rhs) => rhs.length - lhs.length);
}
// Public API: the full ordered token list (fed to the Lexer) and a
// name -> TokenType dictionary (used by the parser).
module.exports = {
  allTokens,
  tokens: tokenDictionary
};