Browse Source

PEG.js grammar: More JavaScript-like rules for skipped elements

redux
David Majda 8 years ago
parent
commit
bcb5271649
  1. 1270
      lib/parser.js
  2. 124
      spec/parser.spec.js
  3. 92
      src/parser.pegjs

1270
lib/parser.js
File diff suppressed because it is too large
View File

124
spec/parser.spec.js

@@ -337,6 +337,58 @@ describe("PEG.js grammar parser", function() {
expect('start = ("abcd"\n)').toParseAs(literalGrammar("abcd"));
});
/* The SourceCharacter rule is not tested. */
/* Canonical WhiteSpace is " ". */
it("parses WhiteSpace", function() {
expect('start =\t"abcd"' ).toParseAs(trivialGrammar);
expect('start =\x0B"abcd"' ).toParseAs(trivialGrammar); // no "\v" in IE
expect('start =\f"abcd"' ).toParseAs(trivialGrammar);
expect('start = "abcd"' ).toParseAs(trivialGrammar);
expect('start =\u00A0"abcd"').toParseAs(trivialGrammar);
expect('start =\uFEFF"abcd"').toParseAs(trivialGrammar);
expect('start =\u1680"abcd"').toParseAs(trivialGrammar);
});
/* Canonical LineTerminator is "\n". */
it("parses LineTerminator", function() {
expect('start = "\n"' ).toFailToParse();
expect('start = "\r"' ).toFailToParse();
expect('start = "\u2028"').toFailToParse();
expect('start = "\u2029"').toFailToParse();
});
/* Canonical LineTerminatorSequence is "\r\n". */
it("parses LineTerminatorSequence", function() {
expect('start =\n"abcd"' ).toParseAs(trivialGrammar);
expect('start =\r\n"abcd"' ).toParseAs(trivialGrammar);
expect('start =\r"abcd"' ).toParseAs(trivialGrammar);
expect('start =\u2028"abcd"').toParseAs(trivialGrammar);
expect('start =\u2029"abcd"').toParseAs(trivialGrammar);
});
// Canonical Comment is "/* comment */".
it("parses Comment", function() {
expect('start =// comment\n"abcd"' ).toParseAs(trivialGrammar);
expect('start =/* comment */"abcd"').toParseAs(trivialGrammar);
});
// Canonical MultiLineComment is "/* comment */".
it("parses MultiLineComment", function() {
expect('start =/**/"abcd"' ).toParseAs(trivialGrammar);
expect('start =/*a*/"abcd"' ).toParseAs(trivialGrammar);
expect('start =/*aaa*/"abcd"').toParseAs(trivialGrammar);
expect('start =/**/*/"abcd"').toFailToParse();
});
/* Canonical SingleLineComment is "// comment". */
it("parses SingleLineComment", function() {
expect('start =//\n"abcd"' ).toParseAs(trivialGrammar);
expect('start =//a\n"abcd"' ).toParseAs(trivialGrammar);
expect('start =//aaa\n"abcd"').toParseAs(trivialGrammar);
});
/* Canonical Action is "{ code }". */
it("parses Action", function() {
expect('start = "abcd" { code }').toParseAs(actionGrammar(" code "));
@@ -534,83 +586,19 @@ describe("PEG.js grammar parser", function() {
/* Canonical EOLEscapeSequence is "\\\n". */
it("parses EOLEscapeSequence", function() {
expect('start = "\\\n"' ).toParseAs(literalGrammar(""));
expect('start = "\\\r\n"').toParseAs(literalGrammar(""));
});
/* Trivial character class rules are not tested. */
/* Unicode character category rules are not tested. */
/* Canonical __ is "\n". */
it("parses __", function() {
expect('start ="abcd"' ).toParseAs(trivialGrammar);
expect('start = "abcd"' ).toParseAs(trivialGrammar);
expect('start =\n"abcd"' ).toParseAs(trivialGrammar);
expect('start =\r\n"abcd"' ).toParseAs(trivialGrammar);
expect('start =/* comment */"abcd"').toParseAs(trivialGrammar);
expect('start = "abcd"' ).toParseAs(trivialGrammar);
});
// Canonical Comment is "/* comment */".
it("parses Comment", function() {
expect('start =// comment\n"abcd"' ).toParseAs(trivialGrammar);
expect('start =/* comment */"abcd"').toParseAs(trivialGrammar);
});
/* Canonical SingleLineComment is "// comment". */
it("parses SingleLineComment", function() {
expect('start =//\n"abcd"' ).toParseAs(trivialGrammar);
expect('start =//a\n"abcd"' ).toParseAs(trivialGrammar);
expect('start =//aaa\n"abcd"').toParseAs(trivialGrammar);
});
// Canonical MultiLineComment is "/* comment */".
it("parses MultiLineComment", function() {
expect('start =/**/"abcd"' ).toParseAs(trivialGrammar);
expect('start =/*a*/"abcd"' ).toParseAs(trivialGrammar);
expect('start =/*aaa*/"abcd"').toParseAs(trivialGrammar);
expect('start =/***/"abcd"' ).toParseAs(trivialGrammar);
expect('start =/**/*/"abcd"').toFailToParse();
});
/* Canonical EOL is "\n". */
it("parses EOL", function() {
expect('start =\n"abcd"' ).toParseAs(trivialGrammar);
expect('start =\r\n"abcd"' ).toParseAs(trivialGrammar);
expect('start =\r"abcd"' ).toParseAs(trivialGrammar);
expect('start =\u2028"abcd"').toParseAs(trivialGrammar);
expect('start =\u2029"abcd"').toParseAs(trivialGrammar);
});
/* Canonical EOLChar is "\n". */
it("parses EOLChar", function() {
expect('start =\n"abcd"' ).toParseAs(trivialGrammar);
expect('start =\r"abcd"' ).toParseAs(trivialGrammar);
expect('start =\u2028"abcd"').toParseAs(trivialGrammar);
expect('start =\u2029"abcd"').toParseAs(trivialGrammar);
});
/* Canonical Whitespace is " ". */
it("parses Whitespace", function() {
expect('start =\t"abcd"' ).toParseAs(trivialGrammar);
expect('start =\x0B"abcd"' ).toParseAs(trivialGrammar); // no "\v" in IE
expect('start =\f"abcd"' ).toParseAs(trivialGrammar);
expect('start = "abcd"' ).toParseAs(trivialGrammar);
expect('start =\u00A0"abcd"').toParseAs(trivialGrammar);
expect('start =\uFEFF"abcd"').toParseAs(trivialGrammar);
expect('start =\u1680"abcd"').toParseAs(trivialGrammar);
expect('start =\u180E"abcd"').toParseAs(trivialGrammar);
expect('start =\u2000"abcd"').toParseAs(trivialGrammar);
expect('start =\u2001"abcd"').toParseAs(trivialGrammar);
expect('start =\u2002"abcd"').toParseAs(trivialGrammar);
expect('start =\u2003"abcd"').toParseAs(trivialGrammar);
expect('start =\u2004"abcd"').toParseAs(trivialGrammar);
expect('start =\u2005"abcd"').toParseAs(trivialGrammar);
expect('start =\u2006"abcd"').toParseAs(trivialGrammar);
expect('start =\u2007"abcd"').toParseAs(trivialGrammar);
expect('start =\u2008"abcd"').toParseAs(trivialGrammar);
expect('start =\u2009"abcd"').toParseAs(trivialGrammar);
expect('start =\u200A"abcd"').toParseAs(trivialGrammar);
expect('start =\u202F"abcd"').toParseAs(trivialGrammar);
expect('start =\u205F"abcd"').toParseAs(trivialGrammar);
expect('start =\u3000"abcd"').toParseAs(trivialGrammar);
});
});

92
src/parser.pegjs

@@ -171,6 +171,38 @@ Primary
/* "Lexical" elements */
SourceCharacter
= .
WhiteSpace "whitespace"
= "\t"
/ "\v"
/ "\f"
/ " "
/ "\u00A0"
/ "\uFEFF"
/ Zs
LineTerminator
= [\n\r\u2028\u2029]
LineTerminatorSequence "end of line"
= "\n"
/ "\r\n"
/ "\r"
/ "\u2028"
/ "\u2029"
Comment "comment"
= MultiLineComment
/ SingleLineComment
MultiLineComment
= "/*" (!"*/" SourceCharacter)* "*/"
SingleLineComment
= "//" (!LineTerminator SourceCharacter)*
Action "action"
= braced:Braced __ { return braced.substr(1, braced.length - 2); }
@@ -210,7 +242,7 @@ DoubleQuotedCharacter
/ EOLEscapeSequence
SimpleDoubleQuotedCharacter
= !('"' / "\\" / EOLChar) char_:. { return char_; }
= !('"' / "\\" / LineTerminator) char_:. { return char_; }
SingleQuotedString
= "'" chars:SingleQuotedCharacter* "'" { return chars.join(""); }
@@ -224,7 +256,7 @@ SingleQuotedCharacter
/ EOLEscapeSequence
SimpleSingleQuotedCharacter
= !("'" / "\\" / EOLChar) char_:. { return char_; }
= !("'" / "\\" / LineTerminator) char_:. { return char_; }
Class "character class"
= "[" inverted:"^"? parts:(ClassCharacterRange / ClassCharacter)* "]" flags:"i"? {
@@ -258,10 +290,10 @@ BracketDelimitedCharacter
/ EOLEscapeSequence
SimpleBracketDelimitedCharacter
= !("]" / "\\" / EOLChar) char_:. { return char_; }
= !("]" / "\\" / LineTerminator) char_:. { return char_; }
SimpleEscapeSequence
= "\\" !(Digit / "x" / "u" / EOLChar) char_:. {
= "\\" !(Digit / "x" / "u" / LineTerminator) char_:. {
return char_
.replace("b", "\b")
.replace("f", "\f")
@@ -285,7 +317,7 @@ UnicodeEscapeSequence
}
EOLEscapeSequence
= "\\" eol:EOL { return ""; }
= "\\" eol:LineTerminatorSequence { return ""; }
Digit
= [0-9]
@@ -303,27 +335,35 @@ LowerCaseLetter
UpperCaseLetter
= [A-Z]
__ = (Whitespace / EOL / Comment)*
Comment "comment"
= SingleLineComment
/ MultiLineComment
SingleLineComment
= "//" (!EOLChar .)*
MultiLineComment
= "/*" (!"*/" .)* "*/"
/*
* Unicode Character Categories
*
* Extracted from the following Unicode Character Database file:
*
* http://www.unicode.org/Public/6.3.0/ucd/extracted/DerivedGeneralCategory.txt
*
* Unix magic used:
*
* grep "; $CATEGORY" DerivedGeneralCategory.txt | # Filter characters
* cut -f1 -d " " | # Extract code points
* grep -v '[0-9a-fA-F]\{5\}' | # Exclude non-BMP characters
* sed -e 's/\.\./-/' | # Adjust formatting
* sed -e 's/\([0-9a-fA-F]\{4\}\)/\\u\1/g' | # Adjust formatting
* tr -d '\n' # Join lines
*
* ECMA-262 allows using Unicode 3.0 or later, version 6.3.0 was the latest one
* at the time of writing.
*
* Non-BMP characters are completely ignored to avoid surrogate pair handling
* (detecting surrogate pairs isn't possible with a simple character class and
* other methods would degrade performance). I don't consider it a big deal as
* even parsers in JavaScript engines of common browsers seem to ignore them.
*/
EOL "end of line"
= "\n"
/ "\r\n"
/ "\r"
/ "\u2028"
/ "\u2029"
// Separator, Space
Zs = [\u0020\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]
EOLChar
= [\n\r\u2028\u2029]
/* Skipped */
Whitespace "whitespace"
= [ \t\v\f\u00A0\uFEFF\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
__
= (WhiteSpace / LineTerminatorSequence / Comment)*
Loading…
Cancel
Save