From b540b2d4603221674bb05068ce3c352d2cddbf74 Mon Sep 17 00:00:00 2001
From: David Majda <david@majda.cz>
Date: Fri, 30 Sep 2011 11:17:47 +0200
Subject: [PATCH] Implement case-insensitive literal matching

---
 README.md             |  2 +-
 src/emitter.js        | 24 ++++++++++++++++++----
 src/parser.js         | 46 ++++++++++++++++++++++++++++++++++++-------
 src/parser.pegjs      |  7 ++++---
 test/compiler-test.js | 38 ++++++++++++++++++++++++-----------
 test/parser-test.js   | 21 +++++++++++---------
 test/passes-test.js   | 18 ++++++++---------
 7 files changed, 112 insertions(+), 44 deletions(-)
diff --git a/README.md b/README.md
index a238bbd..5f023d1 100644
--- a/README.md
+++ b/README.md
@@ -128,7 +128,7 @@ There are several types of parsing expressions, some of them containing subexpre
 
 #### "*literal*"<br>'*literal*'
 
-Match exact literal string and return it. The string syntax is the same as in JavaScript.
+Match exact literal string and return it. The string syntax is the same as in JavaScript. Appending `i` right after the literal makes the match case-insensitive; the returned string is then the matched input text, with its original case.
 
 #### .
 
diff --git a/src/emitter.js b/src/emitter.js
index 3341780..3f7d9e9 100644
--- a/src/emitter.js
+++ b/src/emitter.js
@@ -585,12 +585,28 @@ PEG.compiler.emitter = function(ast) {
             '#if node.value.length === 0',
             '  #{resultVar} = "";',
             '#else',
-            '  #if node.value.length === 1',
-            '    if (input.charCodeAt(pos) === #{node.value.charCodeAt(0)}) {',
+            '  #if !node.ignoreCase',
+            '    #if node.value.length === 1',
+            '      if (input.charCodeAt(pos) === #{node.value.charCodeAt(0)}) {',
+            '    #else',
+            '      if (input.substr(pos, #{node.value.length}) === #{string(node.value)}) {',
+            '    #end',
             '  #else',
-            '    if (input.substr(pos, #{node.value.length}) === #{string(node.value)}) {',
+            /*
+             * One-char literals are not optimized when case-insensitive
+             * matching is enabled. This is because there is no simple way to
+             * lowercase a character code that works for characters outside
+             * ASCII letters. Moreover, |toLowerCase| can change string length,
+             * meaning the result of lowercasing one character can be more than
+             * one character.
+             */
+            '    if (input.substr(pos, #{node.value.length}).toLowerCase() === #{string(node.value.toLowerCase())}) {',
             '  #end',
-            '    #{resultVar} = #{string(node.value)};',
+            '    #if !node.ignoreCase',
+            '      #{resultVar} = #{string(node.value)};',
+            '    #else',
+            '      #{resultVar} = input.substr(pos, #{node.value.length});',
+            '    #end',
             '    pos += #{node.value.length};',
             '  } else {',
             '    #{resultVar} = null;',
diff --git a/src/parser.js b/src/parser.js
index 6b76597..cc9e68c 100644
--- a/src/parser.js
+++ b/src/parser.js
@@ -1793,19 +1793,51 @@ PEG.parser = (function(){
           return cachedResult.result;
         }
         
-        var result0;
-        var pos0, pos1;
+        var result0, result1, result2, result3;
+        var pos0, pos1, pos2;
         
         reportFailures++;
         pos0 = pos;
-        result0 = parse_string();
+        pos1 = pos;
+        result0 = parse_doubleQuotedString();
+        if (result0 === null) {
+          result0 = parse_singleQuotedString();
+        }
         if (result0 !== null) {
-          result0 = (function(value) {
+          if (input.charCodeAt(pos) === 105) {
+            result1 = "i";
+            pos += 1;
+          } else {
+            result1 = null;
+            if (reportFailures === 0) {
+              matchFailed("\"i\"");
+            }
+          }
+          result1 = result1 !== null ? result1 : "";
+          if (result1 !== null) {
+            result2 = parse___();
+            if (result2 !== null) {
+              result0 = [result0, result1, result2];
+            } else {
+              result0 = null;
+              pos = pos1;
+            }
+          } else {
+            result0 = null;
+            pos = pos1;
+          }
+        } else {
+          result0 = null;
+          pos = pos1;
+        }
+        if (result0 !== null) {
+          result0 = (function(value, flags) {
               return {
-                type:  "literal",
-                value: value
+                type:       "literal",
+                value:      value,
+                ignoreCase: flags === "i"
               };
-            })(result0);
+            })(result0[0], result0[1]);
         }
         if (result0 === null) {
           pos = pos0;
diff --git a/src/parser.pegjs b/src/parser.pegjs
index 81f4a1b..5aca67a 100644
--- a/src/parser.pegjs
+++ b/src/parser.pegjs
@@ -195,10 +195,11 @@ identifier "identifier"
  * vaguely).
  */
 literal "literal"
-  = value:string {
+  = value:(doubleQuotedString / singleQuotedString) flags:"i"? __ {
       return {
-        type:  "literal",
-        value: value
+        type:       "literal",
+        value:      value,
+        ignoreCase: flags === "i"
       };
     }
 
diff --git a/test/compiler-test.js b/test/compiler-test.js
index 4ca27e6..ec2fa39 100644
--- a/test/compiler-test.js
+++ b/test/compiler-test.js
@@ -190,17 +190,33 @@ test("literals", function() {
   parses(zeroCharParser, "", "");
   doesNotParse(zeroCharParser, "a");
 
-  var oneCharParser = PEG.buildParser('start = "a"');
-  parses(oneCharParser, "a", "a");
-  doesNotParse(oneCharParser, "");
-  doesNotParse(oneCharParser, "b");
-
-  var multiCharParser = PEG.buildParser('start = "abcd"');
-  parses(multiCharParser, "abcd", "abcd");
-  doesNotParse(multiCharParser, "");
-  doesNotParse(multiCharParser, "abc");
-  doesNotParse(multiCharParser, "abcde");
-  doesNotParse(multiCharParser, "efgh");
+  var oneCharCaseSensitiveParser = PEG.buildParser('start = "a"');
+  parses(oneCharCaseSensitiveParser, "a", "a");
+  doesNotParse(oneCharCaseSensitiveParser, "");
+  doesNotParse(oneCharCaseSensitiveParser, "A");
+  doesNotParse(oneCharCaseSensitiveParser, "b");
+
+  var multiCharCaseSensitiveParser = PEG.buildParser('start = "abcd"');
+  parses(multiCharCaseSensitiveParser, "abcd", "abcd");
+  doesNotParse(multiCharCaseSensitiveParser, "");
+  doesNotParse(multiCharCaseSensitiveParser, "abc");
+  doesNotParse(multiCharCaseSensitiveParser, "abcde");
+  doesNotParse(multiCharCaseSensitiveParser, "ABCD");
+  doesNotParse(multiCharCaseSensitiveParser, "efgh");
+
+  var oneCharCaseInsensitiveParser = PEG.buildParser('start = "a"i');
+  parses(oneCharCaseInsensitiveParser, "a", "a");
+  parses(oneCharCaseInsensitiveParser, "A", "A");
+  doesNotParse(oneCharCaseInsensitiveParser, "");
+  doesNotParse(oneCharCaseInsensitiveParser, "b");
+
+  var multiCharCaseInsensitiveParser = PEG.buildParser('start = "abcd"i');
+  parses(multiCharCaseInsensitiveParser, "abcd", "abcd");
+  parses(multiCharCaseInsensitiveParser, "ABCD", "ABCD");
+  doesNotParse(multiCharCaseInsensitiveParser, "");
+  doesNotParse(multiCharCaseInsensitiveParser, "abc");
+  doesNotParse(multiCharCaseInsensitiveParser, "abcde");
+  doesNotParse(multiCharCaseInsensitiveParser, "efgh");
 
   /*
    * Test that the parsing position moves forward after successful parsing of
diff --git a/test/parser-test.js b/test/parser-test.js
index 5f3510b..8cb1753 100644
--- a/test/parser-test.js
+++ b/test/parser-test.js
@@ -83,10 +83,11 @@ function ruleRef(name) {
   };
 }
 
-function literal(value) {
+function literal(value, ignoreCase) {
   return {
-    type:  "literal",
-    value: value
+    type:       "literal",
+    value:      value,
+    ignoreCase: ignoreCase
   };
 }
 
@@ -103,9 +104,9 @@ function klass(inverted, parts, rawText) {
   };
 }
 
-var literalAbcd  = literal("abcd");
-var literalEfgh  = literal("efgh");
-var literalIjkl  = literal("ijkl");
+var literalAbcd  = literal("abcd", false);
+var literalEfgh  = literal("efgh", false);
+var literalIjkl  = literal("ijkl", false);
 
 var optionalLiteral = optional(literalAbcd);
 
@@ -128,7 +129,7 @@ function oneRuleGrammar(expression) {
   };
 }
 
-var simpleGrammar = oneRuleGrammar(literal("abcd"));
+var simpleGrammar = oneRuleGrammar(literal("abcd", false));
 
 function identifierGrammar(identifier) {
   return oneRuleGrammar(ruleRef(identifier));
@@ -136,7 +137,7 @@ function identifierGrammar(identifier) {
 
 var literal_ = literal;
 function literalGrammar(literal) {
-  return oneRuleGrammar(literal_(literal));
+  return oneRuleGrammar(literal_(literal, false));
 }
 
 function classGrammar(inverted, parts, rawText) {
@@ -147,7 +148,7 @@ var anyGrammar = oneRuleGrammar(any());
 
 var action_ = action;
 function actionGrammar(action) {
-  return oneRuleGrammar(action_(literal("a"), action));
+  return oneRuleGrammar(action_(literal("a", false), action));
 }
 
 var initializerGrammar = {
@@ -334,6 +335,8 @@ test("parses identifier", function() {
 /* Canonical literal is "\"abcd\"". */
 test("parses literal", function() {
   parserParses('start = "abcd"', literalGrammar("abcd"));
+  parserParses("start = 'abcd'", literalGrammar("abcd"));
+  parserParses('start = "abcd"i', oneRuleGrammar(literal("abcd", true)));
 });
 
 /* Canonical string is "\"abcd\"". */
diff --git a/test/passes-test.js b/test/passes-test.js
index 5d81abf..53798b4 100644
--- a/test/passes-test.js
+++ b/test/passes-test.js
@@ -16,7 +16,7 @@ test("removes proxy rules", function() {
     type:        "rule",
     name:        "proxied",
     displayName: null,
-    expression:  { type: "literal", value: "a" }
+    expression:  { type: "literal", value: "a", ignoreCase: false }
   };
 
   var proxiedRuleRef = {
@@ -50,8 +50,8 @@ test("removes proxy rules", function() {
         type:         "choice",
         alternatives: [
           proxiedRuleRef,
-          { type: "literal", value: "a" },
-          { type: "literal", value: "b" }
+          { type: "literal", value: "a", ignoreCase: false },
+          { type: "literal", value: "b", ignoreCase: false }
         ]
       })
     },
@@ -60,8 +60,8 @@ test("removes proxy rules", function() {
       ast:     simpleGrammarWithStartAndProxied({
         type:         "choice",
         alternatives: [
-          { type: "literal", value: "a" },
-          { type: "literal", value: "b" },
+          { type: "literal", value: "a", ignoreCase: false },
+          { type: "literal", value: "b", ignoreCase: false },
           proxiedRuleRef
         ]
       })
@@ -72,8 +72,8 @@ test("removes proxy rules", function() {
         type:     "sequence",
         elements: [
           proxiedRuleRef,
-          { type: "literal", value: "a" },
-          { type: "literal", value: "b" }
+          { type: "literal", value: "a", ignoreCase: false },
+          { type: "literal", value: "b", ignoreCase: false }
         ]
       })
     },
@@ -82,8 +82,8 @@ test("removes proxy rules", function() {
       ast:     simpleGrammarWithStartAndProxied({
         type:     "sequence",
         elements: [
-          { type: "literal", value: "a" },
-          { type: "literal", value: "b" },
+          { type: "literal", value: "a", ignoreCase: false },
+          { type: "literal", value: "b", ignoreCase: false },
           proxiedRuleRef
         ]
       })