PEG.js grammar: Change whitespace handling

Before this commit, whitespace was handled at the lexical level by
making tokens consume any whitespace coming after them. This was
accomplished by appending |__| to every token rule.

This commit changes whitespace handling to be more explicit. Tokens no
longer consume whitespace coming after them and syntactic rules have to
cope with it. While this slightly complicates the syntactic grammar, I
think it's a cleaner way. Moreover, it is what the JavaScript example
grammar does.

One small side effect of this change is that the grammar is now
stand-alone (it doesn't require utils.js anymore).
redux
David Majda 10 years ago
parent 4725632641
commit ae89f5e469

File diff suppressed because it is too large Load Diff

@ -1,25 +1,34 @@
describe("PEG.js grammar parser", function() { describe("PEG.js grammar parser", function() {
var trivialGrammar, var trivialGrammar,
literalAbcd = { type: "literal", value: "abcd", ignoreCase: false }, literalAbcd = { type: "literal", value: "abcd", ignoreCase: false },
literalEfgh = { type: "literal", value: "efgh", ignoreCase: false }, literalEfgh = { type: "literal", value: "efgh", ignoreCase: false },
literalIjkl = { type: "literal", value: "ijkl", ignoreCase: false }, literalIjkl = { type: "literal", value: "ijkl", ignoreCase: false },
optionalLiteral = { type: "optional", expression: literalAbcd }, optionalLiteral = { type: "optional", expression: literalAbcd },
simpleNotLiteral = { type: "simple_not", expression: literalAbcd }, zeroOrMoreLiteral = { type: "zero_or_more", expression: literalAbcd },
labeledAbcd = { type: "labeled", label: "a", expression: literalAbcd }, oneOrMoreLiteral = { type: "one_or_more", expression: literalAbcd },
labeledEfgh = { type: "labeled", label: "b", expression: literalEfgh }, simpleNotLiteral = { type: "simple_not", expression: literalAbcd },
labeledIjkl = { type: "labeled", label: "c", expression: literalIjkl }, textOptionalLiteral = { type: "text", expression: optionalLiteral },
sequenceOfLiterals = { simpleAndOptionalLiteral = { type: "simple_and", expression: optionalLiteral },
simpleNotOptionalLiteral = { type: "simple_not", expression: optionalLiteral },
semanticAnd = { type: "semantic_and", code: " code " },
semanticNot = { type: "semantic_not", code: " code " },
labeledAbcd = { type: "labeled", label: "a", expression: literalAbcd },
labeledEfgh = { type: "labeled", label: "b", expression: literalEfgh },
labeledIjkl = { type: "labeled", label: "c", expression: literalIjkl },
labeledSimpleNotLiteral = { type: "labeled", label: "label", expression: simpleNotLiteral },
sequenceOfLiterals = {
type: "sequence", type: "sequence",
elements: [literalAbcd, literalEfgh, literalIjkl] elements: [literalAbcd, literalEfgh, literalIjkl]
}, },
sequenceOfLabeleds = { sequenceOfLabeleds = {
type: "sequence", type: "sequence",
elements: [labeledAbcd, labeledEfgh, labeledIjkl] elements: [labeledAbcd, labeledEfgh, labeledIjkl]
}, },
choiceOfLiterals = { choiceOfLiterals = {
type: "choice", type: "choice",
alternatives: [literalAbcd, literalEfgh, literalIjkl] alternatives: [literalAbcd, literalEfgh, literalIjkl]
}; },
namedChoiceOfLiterals = { type: "named", name: "start rule", expression: choiceOfLiterals };
function oneRuleGrammar(expression) { function oneRuleGrammar(expression) {
var initializer = arguments.length > 1 ? arguments[1] : null; var initializer = arguments.length > 1 ? arguments[1] : null;
@ -192,7 +201,7 @@ describe("PEG.js grammar parser", function() {
}); });
expect('{ code } start = "abcd"' ).toParseAs(grammar); expect('{ code } start = "abcd"' ).toParseAs(grammar);
expect('{ code }; start = "abcd"').toParseAs(grammar); expect('{ code }\n; start = "abcd"').toParseAs(grammar);
}); });
/* Canonical Rule is "a: \"abcd\"". */ /* Canonical Rule is "a: \"abcd\"". */
@ -200,14 +209,19 @@ describe("PEG.js grammar parser", function() {
expect('start = "abcd" / "efgh" / "ijkl"').toParseAs( expect('start = "abcd" / "efgh" / "ijkl"').toParseAs(
oneRuleGrammar(choiceOfLiterals) oneRuleGrammar(choiceOfLiterals)
); );
expect('start\n= "abcd" / "efgh" / "ijkl"').toParseAs(
oneRuleGrammar(choiceOfLiterals)
);
expect('start =\n"abcd" / "efgh" / "ijkl"').toParseAs(
oneRuleGrammar(choiceOfLiterals)
);
expect('start "start rule" = "abcd" / "efgh" / "ijkl"').toParseAs( expect('start "start rule" = "abcd" / "efgh" / "ijkl"').toParseAs(
oneRuleGrammar({ oneRuleGrammar(namedChoiceOfLiterals)
type: "named", );
name: "start rule", expect('start "start rule"\n= "abcd" / "efgh" / "ijkl"').toParseAs(
expression: choiceOfLiterals oneRuleGrammar(namedChoiceOfLiterals)
})
); );
expect('start = "abcd" / "efgh" / "ijkl";').toParseAs( expect('start = "abcd" / "efgh" / "ijkl"\n;').toParseAs(
oneRuleGrammar(choiceOfLiterals) oneRuleGrammar(choiceOfLiterals)
); );
}); });
@ -230,6 +244,18 @@ describe("PEG.js grammar parser", function() {
type: "choice", type: "choice",
alternatives: [sequenceOfLiterals, sequenceOfLiterals, sequenceOfLiterals] alternatives: [sequenceOfLiterals, sequenceOfLiterals, sequenceOfLiterals]
})); }));
expect(
'start = "abcd" "efgh" "ijkl"\n/ "abcd" "efgh" "ijkl"\n/ "abcd" "efgh" "ijkl"'
).toParseAs(oneRuleGrammar({
type: "choice",
alternatives: [sequenceOfLiterals, sequenceOfLiterals, sequenceOfLiterals]
}));
expect(
'start = "abcd" "efgh" "ijkl" /\n"abcd" "efgh" "ijkl" /\n"abcd" "efgh" "ijkl"'
).toParseAs(oneRuleGrammar({
type: "choice",
alternatives: [sequenceOfLiterals, sequenceOfLiterals, sequenceOfLiterals]
}));
}); });
/* Canonical Sequence is "\"abcd\" \"efgh\" \"ijkl\"". */ /* Canonical Sequence is "\"abcd\" \"efgh\" \"ijkl\"". */
@ -237,6 +263,9 @@ describe("PEG.js grammar parser", function() {
expect('start = a:"abcd" { code }').toParseAs( expect('start = a:"abcd" { code }').toParseAs(
oneRuleGrammar({ type: "action", expression: labeledAbcd, code: " code " }) oneRuleGrammar({ type: "action", expression: labeledAbcd, code: " code " })
); );
expect('start = a:"abcd"\n{ code }').toParseAs(
oneRuleGrammar({ type: "action", expression: labeledAbcd, code: " code " })
);
expect('start = a:"abcd" b:"efgh" c:"ijkl" { code }').toParseAs( expect('start = a:"abcd" b:"efgh" c:"ijkl" { code }').toParseAs(
oneRuleGrammar({ oneRuleGrammar({
type: "action", type: "action",
@ -244,6 +273,13 @@ describe("PEG.js grammar parser", function() {
code: " code " code: " code "
}) })
); );
expect('start = a:"abcd"\nb:"efgh"\nc:"ijkl" { code }').toParseAs(
oneRuleGrammar({
type: "action",
expression: sequenceOfLabeleds,
code: " code "
})
);
expect('start = a:"abcd"').toParseAs( expect('start = a:"abcd"').toParseAs(
oneRuleGrammar(labeledAbcd) oneRuleGrammar(labeledAbcd)
@ -251,64 +287,54 @@ describe("PEG.js grammar parser", function() {
expect('start = a:"abcd" b:"efgh" c:"ijkl"').toParseAs( expect('start = a:"abcd" b:"efgh" c:"ijkl"').toParseAs(
oneRuleGrammar(sequenceOfLabeleds) oneRuleGrammar(sequenceOfLabeleds)
); );
expect('start = a:"abcd"\nb:"efgh"\nc:"ijkl"').toParseAs(
oneRuleGrammar(sequenceOfLabeleds)
);
}); });
/* Canonical Labeled is "label:\"abcd\"". */ /* Canonical Labeled is "label:\"abcd\"". */
it("parses Labeled", function() { it("parses Labeled", function() {
expect('start = label:!"abcd"').toParseAs(oneRuleGrammar({ expect('start = label:!"abcd"' ).toParseAs(oneRuleGrammar(labeledSimpleNotLiteral));
type: "labeled", expect('start = label\n:!"abcd"').toParseAs(oneRuleGrammar(labeledSimpleNotLiteral));
label: "label", expect('start = label:\n!"abcd"').toParseAs(oneRuleGrammar(labeledSimpleNotLiteral));
expression: simpleNotLiteral expect('start = !"abcd"' ).toParseAs(oneRuleGrammar(simpleNotLiteral));
}));
expect('start = !"abcd"' ).toParseAs(oneRuleGrammar(simpleNotLiteral));
}); });
/* Canonical Prefixed is "!\"abcd\"". */ /* Canonical Prefixed is "!\"abcd\"". */
it("parses Prefixed", function() { it("parses Prefixed", function() {
expect('start = $"abcd"?' ).toParseAs(oneRuleGrammar({ expect('start = $"abcd"?' ).toParseAs(oneRuleGrammar(textOptionalLiteral));
type: "text", expect('start = $\n"abcd"?' ).toParseAs(oneRuleGrammar(textOptionalLiteral));
expression: optionalLiteral expect('start = &{ code }' ).toParseAs(oneRuleGrammar(semanticAnd));
})); expect('start = &\n{ code }').toParseAs(oneRuleGrammar(semanticAnd));
expect('start = &{ code }').toParseAs(oneRuleGrammar({ expect('start = &"abcd"?' ).toParseAs(oneRuleGrammar(simpleAndOptionalLiteral));
type: "semantic_and", expect('start = &\n"abcd"?' ).toParseAs(oneRuleGrammar(simpleAndOptionalLiteral));
code: " code " expect('start = !{ code }' ).toParseAs(oneRuleGrammar(semanticNot));
})); expect('start = !\n{ code }').toParseAs(oneRuleGrammar(semanticNot));
expect('start = &"abcd"?' ).toParseAs(oneRuleGrammar({ expect('start = !"abcd"?' ).toParseAs(oneRuleGrammar(simpleNotOptionalLiteral));
type: "simple_and", expect('start = !\n"abcd"?' ).toParseAs(oneRuleGrammar(simpleNotOptionalLiteral));
expression: optionalLiteral expect('start = "abcd"?' ).toParseAs(oneRuleGrammar(optionalLiteral));
}));
expect('start = !{ code }').toParseAs(oneRuleGrammar({
type: "semantic_not",
code: " code "
}));
expect('start = !"abcd"?' ).toParseAs(oneRuleGrammar({
type: "simple_not",
expression: optionalLiteral
}));
expect('start = "abcd"?' ).toParseAs(oneRuleGrammar(optionalLiteral));
}); });
/* Canonical Suffixed is "\"abcd\"?". */ /* Canonical Suffixed is "\"abcd\"?". */
it("parses Suffixed", function() { it("parses Suffixed", function() {
expect('start = "abcd"?').toParseAs(oneRuleGrammar(optionalLiteral)); expect('start = "abcd"?' ).toParseAs(oneRuleGrammar(optionalLiteral));
expect('start = "abcd"*').toParseAs(oneRuleGrammar({ expect('start = "abcd"\n?').toParseAs(oneRuleGrammar(optionalLiteral));
type: "zero_or_more", expect('start = "abcd"*' ).toParseAs(oneRuleGrammar(zeroOrMoreLiteral));
expression: literalAbcd expect('start = "abcd"\n*').toParseAs(oneRuleGrammar(zeroOrMoreLiteral));
})); expect('start = "abcd"+' ).toParseAs(oneRuleGrammar(oneOrMoreLiteral));
expect('start = "abcd"+').toParseAs(oneRuleGrammar({ expect('start = "abcd"\n+').toParseAs(oneRuleGrammar(oneOrMoreLiteral));
type: "one_or_more", expect('start = "abcd"' ).toParseAs(literalGrammar("abcd"));
expression: literalAbcd
}));
expect('start = "abcd"' ).toParseAs(literalGrammar("abcd"));
}); });
/* Canonical Primary is "\"abcd\"". */ /* Canonical Primary is "\"abcd\"". */
it("parses Primary", function() { it("parses Primary", function() {
expect('start = a' ).toParseAs(ruleRefGrammar("a")); expect('start = a' ).toParseAs(ruleRefGrammar("a"));
expect('start = "abcd"' ).toParseAs(literalGrammar("abcd")); expect('start = "abcd"' ).toParseAs(literalGrammar("abcd"));
expect('start = [a-d]' ).toParseAs(classGrammar([["a", "d"]], "[a-d]")); expect('start = [a-d]' ).toParseAs(classGrammar([["a", "d"]], "[a-d]"));
expect('start = .' ).toParseAs(oneRuleGrammar({ type: "any" })); expect('start = .' ).toParseAs(oneRuleGrammar({ type: "any" }));
expect('start = ("abcd")').toParseAs(literalGrammar("abcd")); expect('start = ("abcd")' ).toParseAs(literalGrammar("abcd"));
expect('start = (\n"abcd")').toParseAs(literalGrammar("abcd"));
expect('start = ("abcd"\n)').toParseAs(literalGrammar("abcd"));
}); });
/* Canonical Action is "{ code }". */ /* Canonical Action is "{ code }". */
@ -350,8 +376,6 @@ describe("PEG.js grammar parser", function() {
expect('start = a0' ).toParseAs(ruleRefGrammar("a0")); expect('start = a0' ).toParseAs(ruleRefGrammar("a0"));
expect('start = a_' ).toParseAs(ruleRefGrammar("a_")); expect('start = a_' ).toParseAs(ruleRefGrammar("a_"));
expect('start = abcd').toParseAs(ruleRefGrammar("abcd")); expect('start = abcd').toParseAs(ruleRefGrammar("abcd"));
expect('start = a\n').toParseAs(ruleRefGrammar("a"));
}); });
/* Canonical Literal is "\"abcd\"". */ /* Canonical Literal is "\"abcd\"". */
@ -360,8 +384,6 @@ describe("PEG.js grammar parser", function() {
expect("start = 'abcd'" ).toParseAs(literalGrammar("abcd")); expect("start = 'abcd'" ).toParseAs(literalGrammar("abcd"));
expect('start = "abcd"i').toParseAs(literalGrammar("abcd", true)); expect('start = "abcd"i').toParseAs(literalGrammar("abcd", true));
expect('start = "abcd"\n').toParseAs(literalGrammar("abcd"));
}); });
/* Canonical String is "\"abcd\"". */ /* Canonical String is "\"abcd\"". */
@ -374,8 +396,6 @@ describe("PEG.js grammar parser", function() {
expect('start "abcd" = "abcd"' ).toParseAs(grammar); expect('start "abcd" = "abcd"' ).toParseAs(grammar);
expect('start \'abcd\' = "abcd"').toParseAs(grammar); expect('start \'abcd\' = "abcd"').toParseAs(grammar);
expect('start "abcd"\n= "abcd"').toParseAs(grammar);
}); });
/* Canonical DoubleQuotedString is "\"abcd\"". */ /* Canonical DoubleQuotedString is "\"abcd\"". */
@ -445,8 +465,6 @@ describe("PEG.js grammar parser", function() {
expect('start = [a-d]i').toParseAs( expect('start = [a-d]i').toParseAs(
classGrammar([["a", "d"]], "[a-d]i", false, true) classGrammar([["a", "d"]], "[a-d]i", false, true)
); );
expect('start = [a-d]\n').toParseAs(classGrammar([["a", "d"]], "[a-d]"));
}); });
/* Canonical ClassCharacterRange is "a-d". */ /* Canonical ClassCharacterRange is "a-d". */

@ -1,18 +1,34 @@
{ {
var utils = require("./utils"); function extractOptional(optional, index) {
return optional ? optional[index] : null;
}
/*
 * Plucks the element at position |index| out of every entry of |list| and
 * returns the plucked values, in order, as a new array of the same length.
 */
function extractList(list, index) {
  var count = list.length,
      picked = new Array(count),
      pos = 0;

  while (pos < count) {
    picked[pos] = list[pos][index];
    pos++;
  }

  return picked;
}
/*
 * Builds a new array that starts with |first| and continues with the element
 * at position |index| taken from every entry of |rest| (via |extractList|).
 */
function buildList(first, rest, index) {
  var items = extractList(rest, index);

  /* |extractList| returns a fresh array, so prepending in place is safe. */
  items.unshift(first);

  return items;
}
} }
Grammar Grammar
= __ initializer:Initializer? rules:Rule+ { = __ initializer:(Initializer __)? rules:(Rule __)+ {
return { return {
type: "grammar", type: "grammar",
initializer: initializer, initializer: extractOptional(initializer, 0),
rules: rules rules: extractList(rules, 0)
}; };
} }
Initializer Initializer
= code:Action Semicolon? { = code:Action (__ Semicolon)? {
return { return {
type: "initializer", type: "initializer",
code: code code: code
@ -20,14 +36,17 @@ Initializer
} }
Rule Rule
= name:Identifier displayName:String? Equals expression:Expression Semicolon? { = name:Identifier __
displayName:(String __)?
Equals __
expression:Expression (__ Semicolon)? {
return { return {
type: "rule", type: "rule",
name: name, name: name,
expression: displayName !== null expression: displayName !== null
? { ? {
type: "named", type: "named",
name: displayName, name: displayName[0],
expression: expression expression: expression
} }
: expression : expression
@ -38,46 +57,31 @@ Expression
= Choice = Choice
Choice Choice
= head:Sequence tail:(Slash Sequence)* { = first:Sequence rest:(__ Slash __ Sequence)* {
if (tail.length > 0) { return rest.length > 0
var alternatives = [head].concat(utils.map( ? { type: "choice", alternatives: buildList(first, rest, 3) }
tail, : first;
function(element) { return element[1]; }
));
return {
type: "choice",
alternatives: alternatives
};
} else {
return head;
}
} }
Sequence Sequence
= elements:Labeled+ code:Action { = first:Labeled rest:(__ Labeled)* __ code:Action {
var expression = elements.length !== 1 var expression = rest.length > 0
? { ? { type: "sequence", elements: buildList(first, rest, 1) }
type: "sequence", : first;
elements: elements
}
: elements[0];
return { return {
type: "action", type: "action",
expression: expression, expression: expression,
code: code code: code
}; };
} }
/ elements:Labeled+ { / first:Labeled rest:(__ Labeled)* {
return elements.length !== 1 return rest.length > 0
? { ? { type: "sequence", elements: buildList(first, rest, 1) }
type: "sequence", : first;
elements: elements
}
: elements[0];
} }
Labeled Labeled
= label:Identifier Colon expression:Prefixed { = label:Identifier __ Colon __ expression:Prefixed {
return { return {
type: "labeled", type: "labeled",
label: label, label: label,
@ -87,31 +91,31 @@ Labeled
/ Prefixed / Prefixed
Prefixed Prefixed
= Dollar expression:Suffixed { = Dollar __ expression:Suffixed {
return { return {
type: "text", type: "text",
expression: expression expression: expression
}; };
} }
/ And code:Action { / And __ code:Action {
return { return {
type: "semantic_and", type: "semantic_and",
code: code code: code
}; };
} }
/ And expression:Suffixed { / And __ expression:Suffixed {
return { return {
type: "simple_and", type: "simple_and",
expression: expression expression: expression
}; };
} }
/ Not code:Action { / Not __ code:Action {
return { return {
type: "semantic_not", type: "semantic_not",
code: code code: code
}; };
} }
/ Not expression:Suffixed { / Not __ expression:Suffixed {
return { return {
type: "simple_not", type: "simple_not",
expression: expression expression: expression
@ -120,19 +124,19 @@ Prefixed
/ Suffixed / Suffixed
Suffixed Suffixed
= expression:Primary Question { = expression:Primary __ Question {
return { return {
type: "optional", type: "optional",
expression: expression expression: expression
}; };
} }
/ expression:Primary Star { / expression:Primary __ Star {
return { return {
type: "zero_or_more", type: "zero_or_more",
expression: expression expression: expression
}; };
} }
/ expression:Primary Plus { / expression:Primary __ Plus {
return { return {
type: "one_or_more", type: "one_or_more",
expression: expression expression: expression
@ -141,7 +145,7 @@ Suffixed
/ Primary / Primary
Primary Primary
= name:Identifier !(String? Equals) { = name:Identifier !(__ (String __)? Equals) {
return { return {
type: "rule_ref", type: "rule_ref",
name: name name: name
@ -150,7 +154,7 @@ Primary
/ Literal / Literal
/ Class / Class
/ Dot { return { type: "any" }; } / Dot { return { type: "any" }; }
/ Lparen expression:Expression Rparen { return expression; } / Lparen __ expression:Expression __ Rparen { return expression; }
/* "Lexical" elements */ /* "Lexical" elements */
@ -166,19 +170,19 @@ NonBraceCharacters
NonBraceCharacter NonBraceCharacter
= [^{}] = [^{}]
Equals = "=" __ { return "="; } Equals = "="
Colon = ":" __ { return ":"; } Colon = ":"
Semicolon = ";" __ { return ";"; } Semicolon = ";"
Slash = "/" __ { return "/"; } Slash = "/"
And = "&" __ { return "&"; } And = "&"
Not = "!" __ { return "!"; } Not = "!"
Dollar = "$" __ { return "$"; } Dollar = "$"
Question = "?" __ { return "?"; } Question = "?"
Star = "*" __ { return "*"; } Star = "*"
Plus = "+" __ { return "+"; } Plus = "+"
Lparen = "(" __ { return "("; } Lparen = "("
Rparen = ")" __ { return ")"; } Rparen = ")"
Dot = "." __ { return "."; } Dot = "."
/* /*
* Modeled after ECMA-262, 5th ed., 7.6, but much simplified: * Modeled after ECMA-262, 5th ed., 7.6, but much simplified:
@ -199,14 +203,14 @@ Dot = "." __ { return "."; }
* purpose in the grammar. * purpose in the grammar.
*/ */
Identifier "identifier" Identifier "identifier"
= chars:$((Letter / "_") (Letter / Digit / "_")*) __ { return chars; } = $((Letter / "_") (Letter / Digit / "_")*)
/* /*
* Modeled after ECMA-262, 5th ed., 7.8.4. (syntax & semantics, rules only * Modeled after ECMA-262, 5th ed., 7.8.4. (syntax & semantics, rules only
* vaguely). * vaguely).
*/ */
Literal "literal" Literal "literal"
= value:(DoubleQuotedString / SingleQuotedString) flags:"i"? __ { = value:(DoubleQuotedString / SingleQuotedString) flags:"i"? {
return { return {
type: "literal", type: "literal",
value: value, value: value,
@ -215,7 +219,7 @@ Literal "literal"
} }
String "string" String "string"
= string:(DoubleQuotedString / SingleQuotedString) __ { return string; } = string:(DoubleQuotedString / SingleQuotedString) { return string; }
DoubleQuotedString DoubleQuotedString
= '"' chars:DoubleQuotedCharacter* '"' { return chars.join(""); } = '"' chars:DoubleQuotedCharacter* '"' { return chars.join(""); }
@ -246,19 +250,15 @@ SimpleSingleQuotedCharacter
= !("'" / "\\" / EOLChar) char_:. { return char_; } = !("'" / "\\" / EOLChar) char_:. { return char_; }
Class "character class" Class "character class"
= class_:( = "[" inverted:"^"? parts:(ClassCharacterRange / ClassCharacter)* "]" flags:"i"? {
"[" inverted:"^"? parts:(ClassCharacterRange / ClassCharacter)* "]" flags:"i"? { return {
return { type: "class",
type: "class", parts: parts,
parts: parts, rawText: text().replace(/\s+$/, ""),
rawText: text().replace(/\s+$/, ""), inverted: inverted === "^",
inverted: inverted === "^", ignoreCase: flags === "i"
ignoreCase: flags === "i" };
}; }
}
)
__
{ return class_; }
ClassCharacterRange ClassCharacterRange
= begin:ClassCharacter "-" end:ClassCharacter { = begin:ClassCharacter "-" end:ClassCharacter {

Loading…
Cancel
Save