PEG.js grammar: More JavaScript-like rules for strings/literals/classes

11 years ago · 0d6b91cb20
parent bcb5271649
commit 0d6b91cb20
3 changed files with 658 additions and 583 deletions
--- a/lib/parser.js
+++ b/lib/parser.js
--- a/spec/parser.spec.js
+++ b/spec/parser.spec.js
@@ -428,16 +428,16 @@ describe("PEG.js grammar parser", function() {
    expect('start = abcd').toParseAs(ruleRefGrammar("abcd"));
  });

-  /* Canonical Literal is "\"abcd\"". */
-  it("parses Literal", function() {
-    expect('start = "abcd"' ).toParseAs(literalGrammar("abcd"));
-    expect("start = 'abcd'" ).toParseAs(literalGrammar("abcd"));
+  /* Trivial character class rules are not tested. */

+  /* Canonical LiteralMatcher is "\"abcd\"". */
+  it("parses LiteralMatcher", function() {
+    expect('start = "abcd"' ).toParseAs(literalGrammar("abcd"));
    expect('start = "abcd"i').toParseAs(literalGrammar("abcd", true));
  });

-  /* Canonical String is "\"abcd\"". */
-  it("parses String", function() {
+  /* Canonical StringLiteral is "\"abcd\"". */
+  it("parses StringLiteral", function() {
    var grammar = oneRuleGrammar({
      type:       "named",
      name:       "abcd",
@@ -448,67 +448,42 @@ describe("PEG.js grammar parser", function() {
    expect('start \'abcd\' = "abcd"').toParseAs(grammar);
  });

-  /* Canonical DoubleQuotedString is "\"abcd\"". */
-  it("parses DoubleQuotedString", function() {
-    expect('start = ""'   ).toParseAs(literalGrammar(""));
-    expect('start = "a"'  ).toParseAs(literalGrammar("a"));
-    expect('start = "abc"').toParseAs(literalGrammar("abc"));
-  });
-
-  /* Canonical DoubleQuotedCharacter is "a". */
-  it("parses DoubleQuotedCharacter", function() {
-    expect('start = "a"'      ).toParseAs(literalGrammar("a"));
-    expect('start = "\\n"'    ).toParseAs(literalGrammar("\n"));
-    expect('start = "\\0"'    ).toParseAs(literalGrammar("\x00"));
-    expect('start = "\\xFF"'  ).toParseAs(literalGrammar("\xFF"));
-    expect('start = "\\uFFFF"').toParseAs(literalGrammar("\uFFFF"));
-    expect('start = "\\\n"'   ).toParseAs(literalGrammar(""));
-  });
-
-  /* Canonical SimpleDoubleQuotedCharacter is "a". */
-  it("parses SimpleDoubleQuotedCharacter", function() {
-    expect('start = "a"').toParseAs(literalGrammar("a"));
+  /* Canonical DoubleStringCharacter is "a". */
+  it("parses DoubleStringCharacter", function() {
+    expect('start = "a"'   ).toParseAs(literalGrammar("a"));
+    expect('start = "\\n"' ).toParseAs(literalGrammar("\n"));
+    expect('start = "\\\n"').toParseAs(literalGrammar(""));

    expect('start = """' ).toFailToParse();
    expect('start = "\\"').toFailToParse();
    expect('start = "\n"').toFailToParse();
  });

-  /* Canonical SingleQuotedString is "'abcd'". */
-  it("parses SingleQuotedString", function() {
-    expect("start = ''"   ).toParseAs(literalGrammar(""));
-    expect("start = 'a'"  ).toParseAs(literalGrammar("a"));
-    expect("start = 'abc'").toParseAs(literalGrammar("abc"));
-  });
-
-  /* Canonical SingleQuotedCharacter is "a". */
-  it("parses SingleQuotedCharacter", function() {
-    expect("start = 'a'"      ).toParseAs(literalGrammar("a"));
-    expect("start = '\\n'"    ).toParseAs(literalGrammar("\n"));
-    expect("start = '\\0'"    ).toParseAs(literalGrammar("\x00"));
-    expect("start = '\\xFF'"  ).toParseAs(literalGrammar("\xFF"));
-    expect("start = '\\uFFFF'").toParseAs(literalGrammar("\uFFFF"));
-    expect("start = '\\\n'"   ).toParseAs(literalGrammar(""));
-  });
-
-  /* Canonical SimpleSingleQuotedCharacter is "a". */
-  it("parses SimpleSingleQuotedCharacter", function() {
-    expect("start = 'a'").toParseAs(literalGrammar("a"));
+  /* Canonical SingleStringCharacter is "a". */
+  it("parses SingleStringCharacter", function() {
+    expect("start = 'a'"   ).toParseAs(literalGrammar("a"));
+    expect("start = '\\n'" ).toParseAs(literalGrammar("\n"));
+    expect("start = '\\\n'").toParseAs(literalGrammar(""));

    expect("start = '''" ).toFailToParse();
    expect("start = '\\'").toFailToParse();
    expect("start = '\n'").toFailToParse();
  });

-  /* Canonical Class is "[a-d]". */
-  it("parses Class", function() {
-    expect('start = []'         ).toParseAs(classGrammar([],           "[]"));
-    expect('start = [a-d]'      ).toParseAs(classGrammar([["a", "d"]], "[a-d]"));
-    expect('start = [a]'        ).toParseAs(classGrammar(["a"],        "[a]"));
+  /* Canonical CharacterClassMatcher is "[a-d]". */
+  it("parses CharacterClassMatcher", function() {
+    expect('start = []').toParseAs(
+      classGrammar([], "[]")
+    );
+    expect('start = [a-d]').toParseAs(
+      classGrammar([["a", "d"]], "[a-d]")
+    );
+    expect('start = [a]').toParseAs(
+      classGrammar(["a"], "[a]")
+    );
    expect('start = [a-de-hi-l]').toParseAs(
      classGrammar([["a", "d"], ["e", "h"], ["i", "l"]], "[a-de-hi-l]")
    );
-
    expect('start = [^a-d]').toParseAs(
      classGrammar([["a", "d"]], "[^a-d]", true, false)
    );
@@ -520,8 +495,8 @@ describe("PEG.js grammar parser", function() {
  /* Canonical ClassCharacterRange is "a-d". */
  it("parses ClassCharacterRange", function() {
    expect('start = [a-d]').toParseAs(classGrammar([["a", "d"]], "[a-d]"));
-    expect('start = [a-a]').toParseAs(classGrammar([["a", "a"]], "[a-a]"));

+    expect('start = [a-a]').toParseAs(classGrammar([["a", "a"]], "[a-a]"));
    expect('start = [b-a]').toFailToParse({
      message: "Invalid character range: b-a."
    });
@@ -529,67 +504,67 @@ describe("PEG.js grammar parser", function() {

  /* Canonical ClassCharacter is "a". */
  it("parses ClassCharacter", function() {
-    expect('start = [a]').toParseAs(classGrammar(["a"], "[a]"));
+    expect('start = [a]'   ).toParseAs(classGrammar(["a"],  "[a]"));
+    expect('start = [\\n]' ).toParseAs(classGrammar(["\n"], "[\\n]"));
+    expect('start = [\\\n]').toParseAs(classGrammar([''],   "[\\\n]"));
+
+    expect('start = []]' ).toFailToParse();
+    expect('start = [\\]').toFailToParse();
+    expect('start = [\n]').toFailToParse();
  });

-  /* Canonical BracketDelimitedCharacter is "a". */
-  it("parses BracketDelimitedCharacter", function() {
-    expect('start = [a]'      ).toParseAs(classGrammar(["a"],      "[a]"));
-    expect('start = [\\n]'    ).toParseAs(classGrammar(["\n"],     "[\\n]"));
-    expect('start = [\\0]'    ).toParseAs(classGrammar(["\x00"],   "[\\0]"));
-    expect('start = [\\xFF]'  ).toParseAs(classGrammar(["\xFF"],   "[\\xFF]"));
-    expect('start = [\\uFFFF]').toParseAs(classGrammar(["\uFFFF"], "[\\uFFFF]"));
-    expect('start = [\\\n]'   ).toParseAs(classGrammar([""],       "[\\\n]"));
+  /* Canonical LineContinuation is "\\\n". */
+  it("parses LineContinuation", function() {
+    expect('start = "\\\r\n"').toParseAs(literalGrammar(""));
  });

-  /* Canonical SimpleBracketDelimiedCharacter is "a". */
-  it("parses SimpleBracketDelimitedCharacter", function() {
-    expect('start = [a]').toParseAs(classGrammar(["a"], "[a]"));
+  /* Canonical EscapeSequence is "n". */
+  it("parses EscapeSequence", function() {
+    expect('start = "\\n"'    ).toParseAs(literalGrammar("\n"));
+    expect('start = "\\0"'    ).toParseAs(literalGrammar("\x00"));
+    expect('start = "\\xFF"'  ).toParseAs(literalGrammar("\xFF"));
+    expect('start = "\\uFFFF"').toParseAs(literalGrammar("\uFFFF"));

-    expect('start = []]' ).toFailToParse();
-    expect('start = [\\]').toFailToParse();
-    expect('start = [\n]').toFailToParse();
+    expect('start = "\\09"').toFailToParse();
  });

-  /* Canonical SimpleEscapeSequence is "\\n". */
-  it("parses SimpleEscapeSequence", function() {
-    expect('start = "\\b"').toParseAs(literalGrammar("\b"));
-    expect('start = "\\f"').toParseAs(literalGrammar("\f"));
+  /* Canonical CharacterEscapeSequence is "n". */
+  it("parses CharacterEscapeSequence", function() {
    expect('start = "\\n"').toParseAs(literalGrammar("\n"));
-    expect('start = "\\r"').toParseAs(literalGrammar("\r"));
-    expect('start = "\\t"').toParseAs(literalGrammar("\t"));
-    expect('start = "\\v"').toParseAs(literalGrammar("\x0B")); // no "\v" in IE
    expect('start = "\\a"').toParseAs(literalGrammar("a"));
-
-    expect('start = "\\1"').toFailToParse();
-    expect('start = "\\x"').toFailToParse();
-    expect('start = "\\u"').toFailToParse();
  });

-  /* Canonical ZeroEscapeSequence is "\\0". */
-  it("parses ZeroEscapeSequence", function() {
-    expect('start = "\\0"').toParseAs(literalGrammar("\x00"));
+  /* Canonical SingleEscapeCharacter is "n". */
+  it("parses SingleEscapeCharacter", function() {
+    expect('start = "\\\'"').toParseAs(literalGrammar("'"));
+    expect('start = "\\""' ).toParseAs(literalGrammar('"'));
+    expect('start = "\\\\"').toParseAs(literalGrammar("\\"));
+    expect('start = "\\b"' ).toParseAs(literalGrammar("\b"));
+    expect('start = "\\f"' ).toParseAs(literalGrammar("\f"));
+    expect('start = "\\n"' ).toParseAs(literalGrammar("\n"));
+    expect('start = "\\r"' ).toParseAs(literalGrammar("\r"));
+    expect('start = "\\t"' ).toParseAs(literalGrammar("\t"));
+    expect('start = "\\v"' ).toParseAs(literalGrammar("\x0B"));   // no "\v" in IE
+  });

-    expect('start = "\\00"').toFailToParse();
-    expect('start = "\\09"').toFailToParse();
+  /* Canonical NonEscapeCharacter is "a". */
+  it("parses NonEscapeCharacter", function() {
+    expect('start = "\\a"').toParseAs(literalGrammar("a"));
  });

-  /* Canonical HexEscapeSequence is "\\xFF". */
+  /* The EscapeCharacter rule is not tested. */
+
+  /* Canonical HexEscapeSequence is "xFF". */
  it("parses HexEscapeSequence", function() {
    expect('start = "\\xFF"').toParseAs(literalGrammar("\xFF"));
  });

-  /* Canonical UnicodeEscapeSequence is "\\uFFFF". */
+  /* Canonical UnicodeEscapeSequence is "uFFFF". */
  it("parses UnicodeEscapeSequence", function() {
    expect('start = "\\uFFFF"').toParseAs(literalGrammar("\uFFFF"));
  });

-  /* Canonical EOLEscapeSequence is "\\\n". */
-  it("parses EOLEscapeSequence", function() {
-    expect('start = "\\\r\n"').toParseAs(literalGrammar(""));
-  });
-
-  /* Trivial character class rules are not tested. */
+  /* Digit rules are not tested. */

  /* Unicode character category rules are not tested. */

--- a/src/parser.pegjs
+++ b/src/parser.pegjs
@@ -50,7 +50,7 @@ Initializer

 Rule
  = name:Identifier __
-    displayName:(String __)?
+    displayName:(StringLiteral __)?
    "=" __
    expression:Expression (__ ";")? {
      return {
@@ -158,14 +158,14 @@ Suffixed
  / Primary

 Primary
-  = name:Identifier !(__ (String __)? "=") {
+  = name:Identifier !(__ (StringLiteral __)? "=") {
      return {
        type: "rule_ref",
        name: name
      };
    }
-  / Literal
-  / Class
+  / LiteralMatcher
+  / CharacterClassMatcher
  / "." { return { type: "any" }; }
  / "(" __ expression:Expression __ ")" { return expression; }

@@ -216,124 +216,117 @@ NonBraceCharacter
  = [^{}]

 Identifier "identifier"
-  = $((Letter / "_") (Letter / Digit / "_")*)
+  = $((Letter / "_") (Letter / DecimalDigit / "_")*)

-Literal "literal"
-  = value:(DoubleQuotedString / SingleQuotedString) flags:"i"? {
-      return {
-        type:       "literal",
-        value:      value,
-        ignoreCase: flags === "i"
-      };
-    }
-
-String "string"
-  = string:(DoubleQuotedString / SingleQuotedString) { return string; }
-
-DoubleQuotedString
-  = '"' chars:DoubleQuotedCharacter* '"' { return chars.join(""); }
-
-DoubleQuotedCharacter
-  = SimpleDoubleQuotedCharacter
-  / SimpleEscapeSequence
-  / ZeroEscapeSequence
-  / HexEscapeSequence
-  / UnicodeEscapeSequence
-  / EOLEscapeSequence
-
-SimpleDoubleQuotedCharacter
-  = !('"' / "\\" / LineTerminator) char_:. { return char_; }
+Letter
+  = LowerCaseLetter
+  / UpperCaseLetter

-SingleQuotedString
-  = "'" chars:SingleQuotedCharacter* "'" { return chars.join(""); }
+LowerCaseLetter
+  = [a-z]

-SingleQuotedCharacter
-  = SimpleSingleQuotedCharacter
-  / SimpleEscapeSequence
-  / ZeroEscapeSequence
-  / HexEscapeSequence
-  / UnicodeEscapeSequence
-  / EOLEscapeSequence
+UpperCaseLetter
+  = [A-Z]

-SimpleSingleQuotedCharacter
-  = !("'" / "\\" / LineTerminator) char_:. { return char_; }
+LiteralMatcher "literal"
+  = value:StringLiteral ignoreCase:"i"? {
+      return { type: "literal", value: value, ignoreCase: ignoreCase !== null };
+    }

-Class "character class"
-  = "[" inverted:"^"? parts:(ClassCharacterRange / ClassCharacter)* "]" flags:"i"? {
+StringLiteral "string"
+  = '"' chars:DoubleStringCharacter* '"' { return chars.join(""); }
+  / "'" chars:SingleStringCharacter* "'" { return chars.join(""); }
+
+DoubleStringCharacter
+  = !('"' / "\\" / LineTerminator) SourceCharacter { return text(); }
+  / "\\" sequence:EscapeSequence { return sequence; }
+  / LineContinuation
+
+SingleStringCharacter
+  = !("'" / "\\" / LineTerminator) SourceCharacter { return text(); }
+  / "\\" sequence:EscapeSequence { return sequence; }
+  / LineContinuation
+
+CharacterClassMatcher "character class"
+  = "["
+    inverted:"^"?
+    parts:(ClassCharacterRange / ClassCharacter)*
+    "]"
+    ignoreCase:"i"?
+    {
      return {
        type:       "class",
        parts:      parts,
-        rawText:    text().replace(/\s+$/, ""),
-        inverted:   inverted === "^",
-        ignoreCase: flags === "i"
+        inverted:   inverted !== null,
+        ignoreCase: ignoreCase !== null,
+        rawText:    text()
      };
    }

 ClassCharacterRange
  = begin:ClassCharacter "-" end:ClassCharacter {
      if (begin.charCodeAt(0) > end.charCodeAt(0)) {
-        error("Invalid character range: " + text() + ".");
+        error(
+          "Invalid character range: " + text() + "."
+        );
      }

      return [begin, end];
    }

 ClassCharacter
-  = BracketDelimitedCharacter
+  = !("]" / "\\" / LineTerminator) SourceCharacter { return text(); }
+  / "\\" sequence:EscapeSequence { return sequence; }
+  / LineContinuation
+
+LineContinuation
+  = "\\" LineTerminatorSequence { return ""; }

-BracketDelimitedCharacter
-  = SimpleBracketDelimitedCharacter
-  / SimpleEscapeSequence
-  / ZeroEscapeSequence
+EscapeSequence
+  = CharacterEscapeSequence
+  / "0" !DecimalDigit { return "\0"; }
  / HexEscapeSequence
  / UnicodeEscapeSequence
-  / EOLEscapeSequence
-
-SimpleBracketDelimitedCharacter
-  = !("]" / "\\" / LineTerminator) char_:. { return char_; }
-
-SimpleEscapeSequence
-  = "\\" !(Digit / "x" / "u" / LineTerminator) char_:. {
-      return char_
-        .replace("b", "\b")
-        .replace("f", "\f")
-        .replace("n", "\n")
-        .replace("r", "\r")
-        .replace("t", "\t")
-        .replace("v", "\x0B"); // IE does not recognize "\v".
-    }

-ZeroEscapeSequence
-  = "\\0" !Digit { return "\x00"; }
+CharacterEscapeSequence
+  = SingleEscapeCharacter
+  / NonEscapeCharacter
+
+SingleEscapeCharacter
+  = "'"
+  / '"'
+  / "\\"
+  / "b"  { return "\b";   }
+  / "f"  { return "\f";   }
+  / "n"  { return "\n";   }
+  / "r"  { return "\r";   }
+  / "t"  { return "\t";   }
+  / "v"  { return "\x0B"; }   // IE does not recognize "\v".
+
+NonEscapeCharacter
+  = !(EscapeCharacter / LineTerminator) SourceCharacter { return text(); }
+
+EscapeCharacter
+  = SingleEscapeCharacter
+  / DecimalDigit
+  / "x"
+  / "u"

 HexEscapeSequence
-  = "\\x" digits:$(HexDigit HexDigit) {
+  = "x" digits:$(HexDigit HexDigit) {
      return String.fromCharCode(parseInt(digits, 16));
    }

 UnicodeEscapeSequence
-  = "\\u" digits:$(HexDigit HexDigit HexDigit HexDigit) {
+  = "u" digits:$(HexDigit HexDigit HexDigit HexDigit) {
      return String.fromCharCode(parseInt(digits, 16));
    }

-EOLEscapeSequence
-  = "\\" eol:LineTerminatorSequence { return ""; }
-
-Digit
+DecimalDigit
  = [0-9]

 HexDigit
-  = [0-9a-fA-F]
-
-Letter
-  = LowerCaseLetter
-  / UpperCaseLetter
-
-LowerCaseLetter
-  = [a-z]
-
-UpperCaseLetter
-  = [A-Z]
+  = [0-9a-f]i

 /*
 * Unicode Character Categories