You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
pegjs/src/parser.pegjs

370 lines
8.2 KiB
Plaintext

/*
* PEG.js Grammar
* ==============
*
* PEG.js grammar syntax is designed to be simple, expressive, and similar to
* JavaScript where possible. This means that many rules, especially in the
* lexical part, are based on the grammar from ECMA-262, 5.1 Edition [1]. Some
* are directly taken or adapted from the JavaScript example grammar (see
* examples/javascript.pegjs).
*
* [1] http://www.ecma-international.org/publications/standards/Ecma-262.htm
*/
{
/*
 * Returns the element at |index| of the match result of an optional
 * expression, or null when that result is falsy (the optional expression did
 * not match).
 */
function extractOptional(optional, index) {
  if (!optional) {
    return null;
  }

  return optional[index];
}
/*
 * Given a list of tuples, returns a new array holding the element found at
 * position |index| of every tuple, in the original order.
 */
function extractList(list, index) {
  var picked = [],
      position = 0;

  while (position < list.length) {
    picked.push(list[position][index]);
    position++;
  }

  return picked;
}
/*
 * Builds a flat list out of a "first (separator item)*" match: |first| comes
 * first, followed by the element at |index| of every tuple in |rest|.
 */
function buildList(first, rest, index) {
  var items = extractList(rest, index);

  items.unshift(first);

  return items;
}
}
// Entry rule: an optional initializer followed by one or more rules, with
// skipped text allowed around each of them. Produces the "grammar" root node.
Grammar
  = __ initializer:(Initializer __)? rules:(Rule __)+ {
      var initializerNode = extractOptional(initializer, 0),
          ruleNodes       = extractList(rules, 0);

      return {
        type:        "grammar",
        initializer: initializerNode,
        rules:       ruleNodes
      };
    }
// A brace-delimited code block at the top of the grammar, optionally
// terminated by a semicolon. The node carries the code without outer braces.
Initializer
  = source:Action (__ ";")? {
      return { type: "initializer", code: source };
    }
// A rule definition: name, optional human-readable display name, "=" and the
// rule expression, optionally terminated by a semicolon. When a display name
// is present the expression is wrapped in a "named" node.
Rule
  = name:Identifier __
    displayName:(String __)?
    "=" __
    expression:Expression (__ ";")? {
      var body = expression;

      if (displayName !== null) {
        body = {
          type:       "named",
          name:       displayName[0],
          expression: expression
        };
      }

      return { type: "rule", name: name, expression: body };
    }
// Entry point for any parsing expression; simply delegates to Choice.
Expression
= Choice
// One or more sequences separated by "/". A single sequence is returned
// as-is; two or more are wrapped in a "choice" node.
Choice
  = head:Sequence tail:(__ "/" __ Sequence)* {
      if (tail.length === 0) {
        return head;
      }

      // Each tail tuple is [__, "/", __, Sequence]; the sequence is at index 3.
      return { type: "choice", alternatives: buildList(head, tail, 3) };
    }
// One or more labeled elements, optionally followed by an action.
//
// A single element is returned as-is; two or more are wrapped in a "sequence"
// node. When an action follows, the result is wrapped in an "action" node that
// carries the action code (without the outer braces).
//
// The element list is matched exactly once and the action is an optional
// suffix. Writing this as two alternatives sharing the same element-list
// prefix would re-parse the whole list whenever no action follows, and that
// repeated work multiplies with every level of parenthesized nesting when
// results are not memoized.
Sequence
  = first:Labeled rest:(__ Labeled)* code:(__ Action)? {
      // Each rest tuple is [__, Labeled]; the element is at index 1.
      var expression = rest.length > 0
            ? { type: "sequence", elements: buildList(first, rest, 1) }
            : first,
          // The optional match is [__, Action]; null means no action followed.
          actionCode = extractOptional(code, 1);

      // Compare against null explicitly: an empty action yields "" which must
      // still produce an "action" node.
      if (actionCode === null) {
        return expression;
      }

      return {
        type:       "action",
        expression: expression,
        code:       actionCode
      };
    }
// An expression optionally prefixed by "label:". With a label the result is a
// "labeled" node; otherwise the prefixed expression is returned unchanged.
Labeled
  = name:Identifier __ ":" __ target:Prefixed {
      return { type: "labeled", label: name, expression: target };
    }
  / Prefixed
// An expression with an optional prefix operator:
//
//   $expr    "text"
//   &action  "semantic_and"      !action  "semantic_not"
//   &expr    "simple_and"        !expr    "simple_not"
//
// For "&" and "!" the action form is tried before the expression form.
// Without a prefix the suffixed expression is returned unchanged.
Prefixed
  = "$" __ operand:Suffixed {
      return { type: "text", expression: operand };
    }
  / operator:("&" / "!") __ code:Action {
      return {
        type: operator === "&" ? "semantic_and" : "semantic_not",
        code: code
      };
    }
  / operator:("&" / "!") __ operand:Suffixed {
      return {
        type:       operator === "&" ? "simple_and" : "simple_not",
        expression: operand
      };
    }
  / Suffixed
// A primary expression with an optional suffix operator:
//
//   expr?  "optional"
//   expr*  "zero_or_more"
//   expr+  "one_or_more"
//
// Without a suffix the primary expression is returned unchanged.
//
// The primary expression is matched exactly once and the operator is an
// optional suffix. Listing one alternative per operator, each starting with
// Primary, would re-parse the primary expression for every alternative that
// fails, which multiplies with nesting depth when results are not memoized.
Suffixed
  = expression:Primary suffix:(__ ("?" / "*" / "+"))? {
      // The optional match is [__, operator]; null means no suffix followed.
      switch (extractOptional(suffix, 1)) {
        case "?":
          return { type: "optional", expression: expression };

        case "*":
          return { type: "zero_or_more", expression: expression };

        case "+":
          return { type: "one_or_more", expression: expression };

        default:
          return expression;
      }
    }
// The atoms of the expression syntax: rule reference, literal, character
// class, "." (any character) and a parenthesized expression.
//
// The negative lookahead after the identifier rejects a name that is followed
// by an optional display name and "=", i.e. the start of the next rule
// definition rather than a reference.
Primary
  = ruleName:Identifier !(__ (String __)? "=") {
      return { type: "rule_ref", name: ruleName };
    }
  / Literal
  / Class
  / "." { return { type: "any" }; }
  / "(" __ inner:Expression __ ")" { return inner; }
/* "Lexical" elements */
// Matches any single character.
SourceCharacter
= .
// A single whitespace character: tab, vertical tab, form feed, space,
// no-break space, byte order mark, or any character in the Zs class.
// The explicit " " and "\u00A0" alternatives are also members of Zs.
WhiteSpace "whitespace"
= "\t"
/ "\v"
/ "\f"
/ " "
/ "\u00A0"
/ "\uFEFF"
/ Zs
// A single line terminator character: LF, CR, U+2028 or U+2029.
LineTerminator
= [\n\r\u2028\u2029]
// One complete line break. "\r\n" is listed before "\r" so that a CR LF pair
// is consumed as a single line break rather than as a lone CR.
LineTerminatorSequence "end of line"
= "\n"
/ "\r\n"
/ "\r"
/ "\u2028"
/ "\u2029"
// Either kind of comment: block or single-line.
Comment "comment"
= MultiLineComment
/ SingleLineComment
// A block comment. It ends at the first closing delimiter, so block comments
// do not nest.
MultiLineComment
= "/*" (!"*/" SourceCharacter)* "*/"
// A "//" comment. It runs up to, but does not consume, the next line
// terminator (or to the end of input when there is none).
SingleLineComment
= "//" (!LineTerminator SourceCharacter)*
// A brace-delimited code block followed by skipped text. Returns the code
// with the outer pair of braces removed.
Action "action"
  = braced:Braced __ {
      // Braced always starts with an opening and ends with a closing brace,
      // so dropping the first and the last character leaves the code itself.
      return braced.slice(1, -1);
    }
// The text of a balanced brace-delimited block, outer braces included. Nested
// brace pairs are matched recursively. Braces are counted purely textually:
// this rule gives no special treatment to braces inside string literals or
// comments.
Braced
= $("{" (Braced / NonBraceCharacters)* "}")
// A non-empty run of characters that are not braces.
NonBraceCharacters
= NonBraceCharacter+
// Any single character other than an opening or a closing brace.
NonBraceCharacter
= [^{}]
// An identifier: an ASCII letter or underscore followed by any number of
// ASCII letters, digits and underscores. Returns the matched text.
Identifier "identifier"
= $((Letter / "_") (Letter / Digit / "_")*)
// A string literal used as a parsing expression, optionally followed by the
// "i" flag, which marks the literal as case-insensitive.
Literal "literal"
  = value:(DoubleQuotedString / SingleQuotedString) flag:"i"? {
      var caseInsensitive = (flag === "i");

      return { type: "literal", value: value, ignoreCase: caseInsensitive };
    }
// A double- or single-quoted string; yields the decoded string value produced
// by whichever alternative matched.
String "string"
  = DoubleQuotedString
  / SingleQuotedString
// A string in double quotes. Returns the decoded characters joined together,
// without the surrounding quotes.
DoubleQuotedString
= '"' chars:DoubleQuotedCharacter* '"' { return chars.join(""); }
// One element of a double-quoted string: a plain character or one of the
// escape sequences. Each alternative returns the decoded text.
DoubleQuotedCharacter
= SimpleDoubleQuotedCharacter
/ SimpleEscapeSequence
/ ZeroEscapeSequence
/ HexEscapeSequence
/ UnicodeEscapeSequence
/ EOLEscapeSequence
// Any character except a double quote, a backslash or a line terminator.
SimpleDoubleQuotedCharacter
= !('"' / "\\" / LineTerminator) char_:. { return char_; }
// A string in single quotes. Returns the decoded characters joined together,
// without the surrounding quotes.
SingleQuotedString
= "'" chars:SingleQuotedCharacter* "'" { return chars.join(""); }
// One element of a single-quoted string: a plain character or one of the
// escape sequences. Each alternative returns the decoded text.
SingleQuotedCharacter
= SimpleSingleQuotedCharacter
/ SimpleEscapeSequence
/ ZeroEscapeSequence
/ HexEscapeSequence
/ UnicodeEscapeSequence
/ EOLEscapeSequence
// Any character except a single quote, a backslash or a line terminator.
SimpleSingleQuotedCharacter
= !("'" / "\\" / LineTerminator) char_:. { return char_; }
// A character class such as [a-z0-9_], optionally inverted with a leading "^"
// and optionally followed by the "i" flag (case-insensitive).
//
// The resulting node holds:
//
//   parts      - single characters (strings) and ranges ([begin, end] pairs)
//   rawText    - the class exactly as written in the grammar
//   inverted   - true when the class starts with "^"
//   ignoreCase - true when the "i" flag is present
Class "character class"
  = "[" inverted:"^"? parts:(ClassCharacterRange / ClassCharacter)* "]" flags:"i"? {
      var nonEmptyParts = [], i;

      // A backslash followed by a line break (EOLEscapeSequence) decodes to an
      // empty string. It stands for no character at all, so it must not end up
      // as an entry in the list of parts.
      for (i = 0; i < parts.length; i++) {
        if (parts[i] !== "") {
          nonEmptyParts.push(parts[i]);
        }
      }

      return {
        type:       "class",
        parts:      nonEmptyParts,
        rawText:    text().replace(/\s+$/, ""),
        inverted:   inverted === "^",
        ignoreCase: flags === "i"
      };
    }
// A range inside a character class, e.g. a-z. Returns [begin, end] and reports
// an error when the first character sorts after the second one.
//
// NOTE(review): a ClassCharacter can decode to "" (escaped line break). In
// that case charCodeAt(0) is NaN, the comparison below is false, and the range
// is accepted with an empty endpoint. Confirm whether that is intended.
ClassCharacterRange
  = begin:ClassCharacter "-" end:ClassCharacter {
      var lowest  = begin.charCodeAt(0),
          highest = end.charCodeAt(0);

      if (lowest > highest) {
        error("Invalid character range: " + text() + ".");
      }

      return [begin, end];
    }
// A single character inside a character class; delegates to
// BracketDelimitedCharacter.
ClassCharacter
= BracketDelimitedCharacter
// One element between "[" and "]": a plain character or one of the escape
// sequences. Each alternative returns the decoded text; EOLEscapeSequence
// returns an empty string.
BracketDelimitedCharacter
= SimpleBracketDelimitedCharacter
/ SimpleEscapeSequence
/ ZeroEscapeSequence
/ HexEscapeSequence
/ UnicodeEscapeSequence
/ EOLEscapeSequence
// Any character except "]", a backslash or a line terminator.
SimpleBracketDelimitedCharacter
= !("]" / "\\" / LineTerminator) char_:. { return char_; }
// A backslash followed by a single character that is not a digit, "x", "u" or
// a line terminator. The letters b, f, n, r, t and v map to the corresponding
// control characters; any other character stands for itself.
SimpleEscapeSequence
  = "\\" !(Digit / "x" / "u" / LineTerminator) char_:. {
      switch (char_) {
        case "b": return "\b";
        case "f": return "\f";
        case "n": return "\n";
        case "r": return "\r";
        case "t": return "\t";
        case "v": return "\x0B"; // IE does not recognize "\v".
        default:  return char_;
      }
    }
// The escape "\0", only when it is not followed by another digit. Returns the
// NUL character.
ZeroEscapeSequence
= "\\0" !Digit { return "\x00"; }
// The escape "\xHH" with exactly two hexadecimal digits. Returns the character
// with that code.
HexEscapeSequence
  = "\\x" hex:$(HexDigit HexDigit) {
      var code = parseInt(hex, 16);

      return String.fromCharCode(code);
    }
// The escape "\uHHHH" with exactly four hexadecimal digits. Returns the
// character with that code.
UnicodeEscapeSequence
  = "\\u" hex:$(HexDigit HexDigit HexDigit HexDigit) {
      var code = parseInt(hex, 16);

      return String.fromCharCode(code);
    }
// A backslash immediately followed by a line break (a line continuation).
// It contributes no characters, so the result is an empty string.
EOLEscapeSequence
  = "\\" LineTerminatorSequence { return ""; }
// A single ASCII decimal digit.
Digit
= [0-9]
// A single hexadecimal digit, in either case.
HexDigit
= [0-9a-fA-F]
// A single ASCII letter, lower or upper case.
Letter
= LowerCaseLetter
/ UpperCaseLetter
// A single ASCII lower-case letter.
LowerCaseLetter
= [a-z]
// A single ASCII upper-case letter.
UpperCaseLetter
= [A-Z]
/*
* Unicode Character Categories
*
* Extracted from the following Unicode Character Database file:
*
* http://www.unicode.org/Public/6.3.0/ucd/extracted/DerivedGeneralCategory.txt
*
* Unix magic used:
*
* grep "; $CATEGORY" DerivedGeneralCategory.txt | # Filter characters
* cut -f1 -d " " | # Extract code points
* grep -v '[0-9a-fA-F]\{5\}' | # Exclude non-BMP characters
* sed -e 's/\.\./-/' | # Adjust formatting
* sed -e 's/\([0-9a-fA-F]\{4\}\)/\\u\1/g' | # Adjust formatting
* tr -d '\n' # Join lines
*
* ECMA-262 allows using Unicode 3.0 or later; version 6.3.0 was the latest one
* at the time of writing.
*
* Non-BMP characters are completely ignored to avoid surrogate pair handling
* (detecting surrogate pairs isn't possible with a simple character class and
* other methods would degrade performance). I don't consider it a big deal as
* even parsers in JavaScript engines of common browsers seem to ignore them.
*/
// Separator, Space
//
// Used by WhiteSpace. The class lists individual code points plus the range
// U+2000-U+200A.
Zs = [\u0020\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]
/* Skipped */
// Any amount (including none) of whitespace, line breaks and comments. Used
// between tokens wherever such text may appear and carries no meaning.
__
= (WhiteSpace / LineTerminatorSequence / Comment)*