PEG.js grammar: Change whitespace handling

Before this commit, whitespace was handled at the lexical level by
making tokens consume any whitespace coming after them. This was
accomplished by appending |__| to every token rule.

This commit changes whitespace handling to be more explicit. Tokens no
longer consume whitespace coming after them and syntactic rules have to
cope with it. While this slightly complicates the syntactic grammar, I
think it's a cleaner way. Moreover, it is what the JavaScript example
grammar does.

One small side-effect of this change is that the grammar is now
stand-alone (it doesn't require utils.js anymore).
redux
David Majda 10 years ago
parent 4725632641
commit ae89f5e469

File diff suppressed because it is too large Load Diff

@ -1,25 +1,34 @@
describe("PEG.js grammar parser", function() {
var trivialGrammar,
literalAbcd = { type: "literal", value: "abcd", ignoreCase: false },
literalEfgh = { type: "literal", value: "efgh", ignoreCase: false },
literalIjkl = { type: "literal", value: "ijkl", ignoreCase: false },
optionalLiteral = { type: "optional", expression: literalAbcd },
simpleNotLiteral = { type: "simple_not", expression: literalAbcd },
labeledAbcd = { type: "labeled", label: "a", expression: literalAbcd },
labeledEfgh = { type: "labeled", label: "b", expression: literalEfgh },
labeledIjkl = { type: "labeled", label: "c", expression: literalIjkl },
sequenceOfLiterals = {
literalAbcd = { type: "literal", value: "abcd", ignoreCase: false },
literalEfgh = { type: "literal", value: "efgh", ignoreCase: false },
literalIjkl = { type: "literal", value: "ijkl", ignoreCase: false },
optionalLiteral = { type: "optional", expression: literalAbcd },
zeroOrMoreLiteral = { type: "zero_or_more", expression: literalAbcd },
oneOrMoreLiteral = { type: "one_or_more", expression: literalAbcd },
simpleNotLiteral = { type: "simple_not", expression: literalAbcd },
textOptionalLiteral = { type: "text", expression: optionalLiteral },
simpleAndOptionalLiteral = { type: "simple_and", expression: optionalLiteral },
simpleNotOptionalLiteral = { type: "simple_not", expression: optionalLiteral },
semanticAnd = { type: "semantic_and", code: " code " },
semanticNot = { type: "semantic_not", code: " code " },
labeledAbcd = { type: "labeled", label: "a", expression: literalAbcd },
labeledEfgh = { type: "labeled", label: "b", expression: literalEfgh },
labeledIjkl = { type: "labeled", label: "c", expression: literalIjkl },
labeledSimpleNotLiteral = { type: "labeled", label: "label", expression: simpleNotLiteral },
sequenceOfLiterals = {
type: "sequence",
elements: [literalAbcd, literalEfgh, literalIjkl]
},
sequenceOfLabeleds = {
sequenceOfLabeleds = {
type: "sequence",
elements: [labeledAbcd, labeledEfgh, labeledIjkl]
},
choiceOfLiterals = {
choiceOfLiterals = {
type: "choice",
alternatives: [literalAbcd, literalEfgh, literalIjkl]
};
},
namedChoiceOfLiterals = { type: "named", name: "start rule", expression: choiceOfLiterals };
function oneRuleGrammar(expression) {
var initializer = arguments.length > 1 ? arguments[1] : null;
@ -192,7 +201,7 @@ describe("PEG.js grammar parser", function() {
});
expect('{ code } start = "abcd"' ).toParseAs(grammar);
expect('{ code }; start = "abcd"').toParseAs(grammar);
expect('{ code }\n; start = "abcd"').toParseAs(grammar);
});
/* Canonical Rule is "a: \"abcd\"". */
@ -200,14 +209,19 @@ describe("PEG.js grammar parser", function() {
expect('start = "abcd" / "efgh" / "ijkl"').toParseAs(
oneRuleGrammar(choiceOfLiterals)
);
expect('start\n= "abcd" / "efgh" / "ijkl"').toParseAs(
oneRuleGrammar(choiceOfLiterals)
);
expect('start =\n"abcd" / "efgh" / "ijkl"').toParseAs(
oneRuleGrammar(choiceOfLiterals)
);
expect('start "start rule" = "abcd" / "efgh" / "ijkl"').toParseAs(
oneRuleGrammar({
type: "named",
name: "start rule",
expression: choiceOfLiterals
})
oneRuleGrammar(namedChoiceOfLiterals)
);
expect('start "start rule"\n= "abcd" / "efgh" / "ijkl"').toParseAs(
oneRuleGrammar(namedChoiceOfLiterals)
);
expect('start = "abcd" / "efgh" / "ijkl";').toParseAs(
expect('start = "abcd" / "efgh" / "ijkl"\n;').toParseAs(
oneRuleGrammar(choiceOfLiterals)
);
});
@ -230,6 +244,18 @@ describe("PEG.js grammar parser", function() {
type: "choice",
alternatives: [sequenceOfLiterals, sequenceOfLiterals, sequenceOfLiterals]
}));
expect(
'start = "abcd" "efgh" "ijkl"\n/ "abcd" "efgh" "ijkl"\n/ "abcd" "efgh" "ijkl"'
).toParseAs(oneRuleGrammar({
type: "choice",
alternatives: [sequenceOfLiterals, sequenceOfLiterals, sequenceOfLiterals]
}));
expect(
'start = "abcd" "efgh" "ijkl" /\n"abcd" "efgh" "ijkl" /\n"abcd" "efgh" "ijkl"'
).toParseAs(oneRuleGrammar({
type: "choice",
alternatives: [sequenceOfLiterals, sequenceOfLiterals, sequenceOfLiterals]
}));
});
/* Canonical Sequence is "\"abcd\" \"efgh\" \"ijkl\"". */
@ -237,6 +263,9 @@ describe("PEG.js grammar parser", function() {
expect('start = a:"abcd" { code }').toParseAs(
oneRuleGrammar({ type: "action", expression: labeledAbcd, code: " code " })
);
expect('start = a:"abcd"\n{ code }').toParseAs(
oneRuleGrammar({ type: "action", expression: labeledAbcd, code: " code " })
);
expect('start = a:"abcd" b:"efgh" c:"ijkl" { code }').toParseAs(
oneRuleGrammar({
type: "action",
@ -244,6 +273,13 @@ describe("PEG.js grammar parser", function() {
code: " code "
})
);
expect('start = a:"abcd"\nb:"efgh"\nc:"ijkl" { code }').toParseAs(
oneRuleGrammar({
type: "action",
expression: sequenceOfLabeleds,
code: " code "
})
);
expect('start = a:"abcd"').toParseAs(
oneRuleGrammar(labeledAbcd)
@ -251,64 +287,54 @@ describe("PEG.js grammar parser", function() {
expect('start = a:"abcd" b:"efgh" c:"ijkl"').toParseAs(
oneRuleGrammar(sequenceOfLabeleds)
);
expect('start = a:"abcd"\nb:"efgh"\nc:"ijkl"').toParseAs(
oneRuleGrammar(sequenceOfLabeleds)
);
});
/* Canonical Labeled is "label:\"abcd\"". */
it("parses Labeled", function() {
expect('start = label:!"abcd"').toParseAs(oneRuleGrammar({
type: "labeled",
label: "label",
expression: simpleNotLiteral
}));
expect('start = !"abcd"' ).toParseAs(oneRuleGrammar(simpleNotLiteral));
expect('start = label:!"abcd"' ).toParseAs(oneRuleGrammar(labeledSimpleNotLiteral));
expect('start = label\n:!"abcd"').toParseAs(oneRuleGrammar(labeledSimpleNotLiteral));
expect('start = label:\n!"abcd"').toParseAs(oneRuleGrammar(labeledSimpleNotLiteral));
expect('start = !"abcd"' ).toParseAs(oneRuleGrammar(simpleNotLiteral));
});
/* Canonical Prefixed is "!\"abcd\"". */
it("parses Prefixed", function() {
expect('start = $"abcd"?' ).toParseAs(oneRuleGrammar({
type: "text",
expression: optionalLiteral
}));
expect('start = &{ code }').toParseAs(oneRuleGrammar({
type: "semantic_and",
code: " code "
}));
expect('start = &"abcd"?' ).toParseAs(oneRuleGrammar({
type: "simple_and",
expression: optionalLiteral
}));
expect('start = !{ code }').toParseAs(oneRuleGrammar({
type: "semantic_not",
code: " code "
}));
expect('start = !"abcd"?' ).toParseAs(oneRuleGrammar({
type: "simple_not",
expression: optionalLiteral
}));
expect('start = "abcd"?' ).toParseAs(oneRuleGrammar(optionalLiteral));
expect('start = $"abcd"?' ).toParseAs(oneRuleGrammar(textOptionalLiteral));
expect('start = $\n"abcd"?' ).toParseAs(oneRuleGrammar(textOptionalLiteral));
expect('start = &{ code }' ).toParseAs(oneRuleGrammar(semanticAnd));
expect('start = &\n{ code }').toParseAs(oneRuleGrammar(semanticAnd));
expect('start = &"abcd"?' ).toParseAs(oneRuleGrammar(simpleAndOptionalLiteral));
expect('start = &\n"abcd"?' ).toParseAs(oneRuleGrammar(simpleAndOptionalLiteral));
expect('start = !{ code }' ).toParseAs(oneRuleGrammar(semanticNot));
expect('start = !\n{ code }').toParseAs(oneRuleGrammar(semanticNot));
expect('start = !"abcd"?' ).toParseAs(oneRuleGrammar(simpleNotOptionalLiteral));
expect('start = !\n"abcd"?' ).toParseAs(oneRuleGrammar(simpleNotOptionalLiteral));
expect('start = "abcd"?' ).toParseAs(oneRuleGrammar(optionalLiteral));
});
/* Canonical Suffixed is "\"abcd\"?". */
it("parses Suffixed", function() {
expect('start = "abcd"?').toParseAs(oneRuleGrammar(optionalLiteral));
expect('start = "abcd"*').toParseAs(oneRuleGrammar({
type: "zero_or_more",
expression: literalAbcd
}));
expect('start = "abcd"+').toParseAs(oneRuleGrammar({
type: "one_or_more",
expression: literalAbcd
}));
expect('start = "abcd"' ).toParseAs(literalGrammar("abcd"));
expect('start = "abcd"?' ).toParseAs(oneRuleGrammar(optionalLiteral));
expect('start = "abcd"\n?').toParseAs(oneRuleGrammar(optionalLiteral));
expect('start = "abcd"*' ).toParseAs(oneRuleGrammar(zeroOrMoreLiteral));
expect('start = "abcd"\n*').toParseAs(oneRuleGrammar(zeroOrMoreLiteral));
expect('start = "abcd"+' ).toParseAs(oneRuleGrammar(oneOrMoreLiteral));
expect('start = "abcd"\n+').toParseAs(oneRuleGrammar(oneOrMoreLiteral));
expect('start = "abcd"' ).toParseAs(literalGrammar("abcd"));
});
/* Canonical Primary is "\"abcd\"". */
it("parses Primary", function() {
expect('start = a' ).toParseAs(ruleRefGrammar("a"));
expect('start = "abcd"' ).toParseAs(literalGrammar("abcd"));
expect('start = [a-d]' ).toParseAs(classGrammar([["a", "d"]], "[a-d]"));
expect('start = .' ).toParseAs(oneRuleGrammar({ type: "any" }));
expect('start = ("abcd")').toParseAs(literalGrammar("abcd"));
expect('start = a' ).toParseAs(ruleRefGrammar("a"));
expect('start = "abcd"' ).toParseAs(literalGrammar("abcd"));
expect('start = [a-d]' ).toParseAs(classGrammar([["a", "d"]], "[a-d]"));
expect('start = .' ).toParseAs(oneRuleGrammar({ type: "any" }));
expect('start = ("abcd")' ).toParseAs(literalGrammar("abcd"));
expect('start = (\n"abcd")').toParseAs(literalGrammar("abcd"));
expect('start = ("abcd"\n)').toParseAs(literalGrammar("abcd"));
});
/* Canonical Action is "{ code }". */
@ -350,8 +376,6 @@ describe("PEG.js grammar parser", function() {
expect('start = a0' ).toParseAs(ruleRefGrammar("a0"));
expect('start = a_' ).toParseAs(ruleRefGrammar("a_"));
expect('start = abcd').toParseAs(ruleRefGrammar("abcd"));
expect('start = a\n').toParseAs(ruleRefGrammar("a"));
});
/* Canonical Literal is "\"abcd\"". */
@ -360,8 +384,6 @@ describe("PEG.js grammar parser", function() {
expect("start = 'abcd'" ).toParseAs(literalGrammar("abcd"));
expect('start = "abcd"i').toParseAs(literalGrammar("abcd", true));
expect('start = "abcd"\n').toParseAs(literalGrammar("abcd"));
});
/* Canonical String is "\"abcd\"". */
@ -374,8 +396,6 @@ describe("PEG.js grammar parser", function() {
expect('start "abcd" = "abcd"' ).toParseAs(grammar);
expect('start \'abcd\' = "abcd"').toParseAs(grammar);
expect('start "abcd"\n= "abcd"').toParseAs(grammar);
});
/* Canonical DoubleQuotedString is "\"abcd\"". */
@ -445,8 +465,6 @@ describe("PEG.js grammar parser", function() {
expect('start = [a-d]i').toParseAs(
classGrammar([["a", "d"]], "[a-d]i", false, true)
);
expect('start = [a-d]\n').toParseAs(classGrammar([["a", "d"]], "[a-d]"));
});
/* Canonical ClassCharacterRange is "a-d". */

@ -1,18 +1,34 @@
{
var utils = require("./utils");
/*
 * Picks one element out of an optional match result.
 *
 * Returns |optional[index]| when |optional| is truthy and |null| when it is
 * falsy (e.g. the optional expression did not match).
 */
function extractOptional(optional, index) {
  if (!optional) {
    return null;
  }

  return optional[index];
}
/*
 * Projects a list of sequences onto a single position.
 *
 * For every entry of |list| (each entry is itself indexable), takes the
 * element at |index| and returns the collected elements as a new array in the
 * same order. An empty |list| yields an empty array.
 */
function extractList(list, index) {
  var picked = [],
      pos    = 0;

  while (pos < list.length) {
    picked.push(list[pos][index]);
    pos += 1;
  }

  return picked;
}
/*
 * Builds a flat list from a leading item and a list of trailing sequences.
 *
 * The result starts with |first| (inserted as a single element, even if it is
 * an array) followed by the element at |index| of every entry in |rest|, as
 * returned by |extractList|.
 */
function buildList(first, rest, index) {
  var items = extractList(rest, index);

  items.unshift(first);

  return items;
}
}
Grammar
= __ initializer:Initializer? rules:Rule+ {
= __ initializer:(Initializer __)? rules:(Rule __)+ {
return {
type: "grammar",
initializer: initializer,
rules: rules
initializer: extractOptional(initializer, 0),
rules: extractList(rules, 0)
};
}
Initializer
= code:Action Semicolon? {
= code:Action (__ Semicolon)? {
return {
type: "initializer",
code: code
@ -20,14 +36,17 @@ Initializer
}
Rule
= name:Identifier displayName:String? Equals expression:Expression Semicolon? {
= name:Identifier __
displayName:(String __)?
Equals __
expression:Expression (__ Semicolon)? {
return {
type: "rule",
name: name,
expression: displayName !== null
? {
type: "named",
name: displayName,
name: displayName[0],
expression: expression
}
: expression
@ -38,46 +57,31 @@ Expression
= Choice
Choice
= head:Sequence tail:(Slash Sequence)* {
if (tail.length > 0) {
var alternatives = [head].concat(utils.map(
tail,
function(element) { return element[1]; }
));
return {
type: "choice",
alternatives: alternatives
};
} else {
return head;
}
= first:Sequence rest:(__ Slash __ Sequence)* {
return rest.length > 0
? { type: "choice", alternatives: buildList(first, rest, 3) }
: first;
}
Sequence
= elements:Labeled+ code:Action {
var expression = elements.length !== 1
? {
type: "sequence",
elements: elements
}
: elements[0];
= first:Labeled rest:(__ Labeled)* __ code:Action {
var expression = rest.length > 0
? { type: "sequence", elements: buildList(first, rest, 1) }
: first;
return {
type: "action",
expression: expression,
code: code
};
}
/ elements:Labeled+ {
return elements.length !== 1
? {
type: "sequence",
elements: elements
}
: elements[0];
/ first:Labeled rest:(__ Labeled)* {
return rest.length > 0
? { type: "sequence", elements: buildList(first, rest, 1) }
: first;
}
Labeled
= label:Identifier Colon expression:Prefixed {
= label:Identifier __ Colon __ expression:Prefixed {
return {
type: "labeled",
label: label,
@ -87,31 +91,31 @@ Labeled
/ Prefixed
Prefixed
= Dollar expression:Suffixed {
= Dollar __ expression:Suffixed {
return {
type: "text",
expression: expression
};
}
/ And code:Action {
/ And __ code:Action {
return {
type: "semantic_and",
code: code
};
}
/ And expression:Suffixed {
/ And __ expression:Suffixed {
return {
type: "simple_and",
expression: expression
};
}
/ Not code:Action {
/ Not __ code:Action {
return {
type: "semantic_not",
code: code
};
}
/ Not expression:Suffixed {
/ Not __ expression:Suffixed {
return {
type: "simple_not",
expression: expression
@ -120,19 +124,19 @@ Prefixed
/ Suffixed
Suffixed
= expression:Primary Question {
= expression:Primary __ Question {
return {
type: "optional",
expression: expression
};
}
/ expression:Primary Star {
/ expression:Primary __ Star {
return {
type: "zero_or_more",
expression: expression
};
}
/ expression:Primary Plus {
/ expression:Primary __ Plus {
return {
type: "one_or_more",
expression: expression
@ -141,7 +145,7 @@ Suffixed
/ Primary
Primary
= name:Identifier !(String? Equals) {
= name:Identifier !(__ (String __)? Equals) {
return {
type: "rule_ref",
name: name
@ -150,7 +154,7 @@ Primary
/ Literal
/ Class
/ Dot { return { type: "any" }; }
/ Lparen expression:Expression Rparen { return expression; }
/ Lparen __ expression:Expression __ Rparen { return expression; }
/* "Lexical" elements */
@ -166,19 +170,19 @@ NonBraceCharacters
NonBraceCharacter
= [^{}]
Equals = "=" __ { return "="; }
Colon = ":" __ { return ":"; }
Semicolon = ";" __ { return ";"; }
Slash = "/" __ { return "/"; }
And = "&" __ { return "&"; }
Not = "!" __ { return "!"; }
Dollar = "$" __ { return "$"; }
Question = "?" __ { return "?"; }
Star = "*" __ { return "*"; }
Plus = "+" __ { return "+"; }
Lparen = "(" __ { return "("; }
Rparen = ")" __ { return ")"; }
Dot = "." __ { return "."; }
Equals = "="
Colon = ":"
Semicolon = ";"
Slash = "/"
And = "&"
Not = "!"
Dollar = "$"
Question = "?"
Star = "*"
Plus = "+"
Lparen = "("
Rparen = ")"
Dot = "."
/*
* Modeled after ECMA-262, 5th ed., 7.6, but much simplified:
@ -199,14 +203,14 @@ Dot = "." __ { return "."; }
* purpose in the grammar.
*/
Identifier "identifier"
= chars:$((Letter / "_") (Letter / Digit / "_")*) __ { return chars; }
= $((Letter / "_") (Letter / Digit / "_")*)
/*
* Modeled after ECMA-262, 5th ed., 7.8.4. (syntax & semantics, rules only
* vaguely).
*/
Literal "literal"
= value:(DoubleQuotedString / SingleQuotedString) flags:"i"? __ {
= value:(DoubleQuotedString / SingleQuotedString) flags:"i"? {
return {
type: "literal",
value: value,
@ -215,7 +219,7 @@ Literal "literal"
}
String "string"
= string:(DoubleQuotedString / SingleQuotedString) __ { return string; }
= string:(DoubleQuotedString / SingleQuotedString) { return string; }
DoubleQuotedString
= '"' chars:DoubleQuotedCharacter* '"' { return chars.join(""); }
@ -246,19 +250,15 @@ SimpleSingleQuotedCharacter
= !("'" / "\\" / EOLChar) char_:. { return char_; }
Class "character class"
= class_:(
"[" inverted:"^"? parts:(ClassCharacterRange / ClassCharacter)* "]" flags:"i"? {
return {
type: "class",
parts: parts,
rawText: text().replace(/\s+$/, ""),
inverted: inverted === "^",
ignoreCase: flags === "i"
};
}
)
__
{ return class_; }
= "[" inverted:"^"? parts:(ClassCharacterRange / ClassCharacter)* "]" flags:"i"? {
return {
type: "class",
parts: parts,
rawText: text().replace(/\s+$/, ""),
inverted: inverted === "^",
ignoreCase: flags === "i"
};
}
ClassCharacterRange
= begin:ClassCharacter "-" end:ClassCharacter {

Loading…
Cancel
Save