From b540b2d4603221674bb05068ce3c352d2cddbf74 Mon Sep 17 00:00:00 2001 From: David Majda Date: Fri, 30 Sep 2011 11:17:47 +0200 Subject: [PATCH] Implement case-insensitive literal matching --- README.md | 2 +- src/emitter.js | 24 ++++++++++++++++++---- src/parser.js | 46 ++++++++++++++++++++++++++++++++++++------- src/parser.pegjs | 7 ++++--- test/compiler-test.js | 38 ++++++++++++++++++++++++----------- test/parser-test.js | 21 +++++++++++--------- test/passes-test.js | 18 ++++++++--------- 7 files changed, 112 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index a238bbd..5f023d1 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,7 @@ There are several types of parsing expressions, some of them containing subexpre #### "*literal*"
'*literal*' -Match exact literal string and return it. The string syntax is the same as in JavaScript. +Match exact literal string and return it. The string syntax is the same as in JavaScript. Appending `i` right after the literal makes the match case-insensitive. #### . diff --git a/src/emitter.js b/src/emitter.js index 3341780..3f7d9e9 100644 --- a/src/emitter.js +++ b/src/emitter.js @@ -585,12 +585,28 @@ PEG.compiler.emitter = function(ast) { '#if node.value.length === 0', ' #{resultVar} = "";', '#else', - ' #if node.value.length === 1', - ' if (input.charCodeAt(pos) === #{node.value.charCodeAt(0)}) {', + ' #if !node.ignoreCase', + ' #if node.value.length === 1', + ' if (input.charCodeAt(pos) === #{node.value.charCodeAt(0)}) {', + ' #else', + ' if (input.substr(pos, #{node.value.length}) === #{string(node.value)}) {', + ' #end', ' #else', - ' if (input.substr(pos, #{node.value.length}) === #{string(node.value)}) {', + /* + * One-char literals are not optimized when case-insensitive + * matching is enabled. This is because there is no simple way to + * lowercase a character code that works for characters outside ASCII + * letters. Moreover, |toLowerCase| can change string length, + * meaning the result of lowercasing a character can be multiple + * characters. 
+ */ + ' if (input.substr(pos, #{node.value.length}).toLowerCase() === #{string(node.value.toLowerCase())}) {', ' #end', - ' #{resultVar} = #{string(node.value)};', + ' #if !node.ignoreCase', + ' #{resultVar} = #{string(node.value)};', + ' #else', + ' #{resultVar} = input.substr(pos, #{node.value.length});', + ' #end', ' pos += #{node.value.length};', ' } else {', ' #{resultVar} = null;', diff --git a/src/parser.js b/src/parser.js index 6b76597..cc9e68c 100644 --- a/src/parser.js +++ b/src/parser.js @@ -1793,19 +1793,51 @@ PEG.parser = (function(){ return cachedResult.result; } - var result0; - var pos0, pos1; + var result0, result1, result2, result3; + var pos0, pos1, pos2; reportFailures++; pos0 = pos; - result0 = parse_string(); + pos1 = pos; + result0 = parse_doubleQuotedString(); + if (result0 === null) { + result0 = parse_singleQuotedString(); + } if (result0 !== null) { - result0 = (function(value) { + if (input.charCodeAt(pos) === 105) { + result1 = "i"; + pos += 1; + } else { + result1 = null; + if (reportFailures === 0) { + matchFailed("\"i\""); + } + } + result1 = result1 !== null ? result1 : ""; + if (result1 !== null) { + result2 = parse___(); + if (result2 !== null) { + result0 = [result0, result1, result2]; + } else { + result0 = null; + pos = pos1; + } + } else { + result0 = null; + pos = pos1; + } + } else { + result0 = null; + pos = pos1; + } + if (result0 !== null) { + result0 = (function(value, flags) { return { - type: "literal", - value: value + type: "literal", + value: value, + ignoreCase: flags === "i" }; - })(result0); + })(result0[0], result0[1]); } if (result0 === null) { pos = pos0; diff --git a/src/parser.pegjs b/src/parser.pegjs index 81f4a1b..5aca67a 100644 --- a/src/parser.pegjs +++ b/src/parser.pegjs @@ -195,10 +195,11 @@ identifier "identifier" * vaguely). */ literal "literal" - = value:string { + = value:(doubleQuotedString / singleQuotedString) flags:"i"? 
__ { return { - type: "literal", - value: value + type: "literal", + value: value, + ignoreCase: flags === "i" }; } diff --git a/test/compiler-test.js b/test/compiler-test.js index 4ca27e6..ec2fa39 100644 --- a/test/compiler-test.js +++ b/test/compiler-test.js @@ -190,17 +190,33 @@ test("literals", function() { parses(zeroCharParser, "", ""); doesNotParse(zeroCharParser, "a"); - var oneCharParser = PEG.buildParser('start = "a"'); - parses(oneCharParser, "a", "a"); - doesNotParse(oneCharParser, ""); - doesNotParse(oneCharParser, "b"); - - var multiCharParser = PEG.buildParser('start = "abcd"'); - parses(multiCharParser, "abcd", "abcd"); - doesNotParse(multiCharParser, ""); - doesNotParse(multiCharParser, "abc"); - doesNotParse(multiCharParser, "abcde"); - doesNotParse(multiCharParser, "efgh"); + var oneCharCaseSensitiveParser = PEG.buildParser('start = "a"'); + parses(oneCharCaseSensitiveParser, "a", "a"); + doesNotParse(oneCharCaseSensitiveParser, ""); + doesNotParse(oneCharCaseSensitiveParser, "A"); + doesNotParse(oneCharCaseSensitiveParser, "b"); + + var multiCharCaseSensitiveParser = PEG.buildParser('start = "abcd"'); + parses(multiCharCaseSensitiveParser, "abcd", "abcd"); + doesNotParse(multiCharCaseSensitiveParser, ""); + doesNotParse(multiCharCaseSensitiveParser, "abc"); + doesNotParse(multiCharCaseSensitiveParser, "abcde"); + doesNotParse(multiCharCaseSensitiveParser, "ABCD"); + doesNotParse(multiCharCaseSensitiveParser, "efgh"); + + var oneCharCaseInsensitiveParser = PEG.buildParser('start = "a"i'); + parses(oneCharCaseInsensitiveParser, "a", "a"); + parses(oneCharCaseInsensitiveParser, "A", "A"); + doesNotParse(oneCharCaseInsensitiveParser, ""); + doesNotParse(oneCharCaseInsensitiveParser, "b"); + + var multiCharCaseInsensitiveParser = PEG.buildParser('start = "abcd"i'); + parses(multiCharCaseInsensitiveParser, "abcd", "abcd"); + parses(multiCharCaseInsensitiveParser, "ABCD", "ABCD"); + doesNotParse(multiCharCaseInsensitiveParser, ""); + 
doesNotParse(multiCharCaseInsensitiveParser, "abc"); + doesNotParse(multiCharCaseInsensitiveParser, "abcde"); + doesNotParse(multiCharCaseInsensitiveParser, "efgh"); /* * Test that the parsing position moves forward after successful parsing of diff --git a/test/parser-test.js b/test/parser-test.js index 5f3510b..8cb1753 100644 --- a/test/parser-test.js +++ b/test/parser-test.js @@ -83,10 +83,11 @@ function ruleRef(name) { }; } -function literal(value) { +function literal(value, ignoreCase) { return { - type: "literal", - value: value + type: "literal", + value: value, + ignoreCase: ignoreCase }; } @@ -103,9 +104,9 @@ function klass(inverted, parts, rawText) { }; } -var literalAbcd = literal("abcd"); -var literalEfgh = literal("efgh"); -var literalIjkl = literal("ijkl"); +var literalAbcd = literal("abcd", false); +var literalEfgh = literal("efgh", false); +var literalIjkl = literal("ijkl", false); var optionalLiteral = optional(literalAbcd); @@ -128,7 +129,7 @@ function oneRuleGrammar(expression) { }; } -var simpleGrammar = oneRuleGrammar(literal("abcd")); +var simpleGrammar = oneRuleGrammar(literal("abcd", false)); function identifierGrammar(identifier) { return oneRuleGrammar(ruleRef(identifier)); @@ -136,7 +137,7 @@ function identifierGrammar(identifier) { var literal_ = literal; function literalGrammar(literal) { - return oneRuleGrammar(literal_(literal)); + return oneRuleGrammar(literal_(literal, false)); } function classGrammar(inverted, parts, rawText) { @@ -147,7 +148,7 @@ var anyGrammar = oneRuleGrammar(any()); var action_ = action; function actionGrammar(action) { - return oneRuleGrammar(action_(literal("a"), action)); + return oneRuleGrammar(action_(literal("a", false), action)); } var initializerGrammar = { @@ -334,6 +335,8 @@ test("parses identifier", function() { /* Canonical literal is "\"abcd\"". 
*/ test("parses literal", function() { parserParses('start = "abcd"', literalGrammar("abcd")); + parserParses("start = 'abcd'", literalGrammar("abcd")); + parserParses('start = "abcd"i', oneRuleGrammar(literal("abcd", true))); }); /* Canonical string is "\"abcd\"". */ diff --git a/test/passes-test.js b/test/passes-test.js index 5d81abf..53798b4 100644 --- a/test/passes-test.js +++ b/test/passes-test.js @@ -16,7 +16,7 @@ test("removes proxy rules", function() { type: "rule", name: "proxied", displayName: null, - expression: { type: "literal", value: "a" } + expression: { type: "literal", value: "a", ignoreCase: false } }; var proxiedRuleRef = { @@ -50,8 +50,8 @@ test("removes proxy rules", function() { type: "choice", alternatives: [ proxiedRuleRef, - { type: "literal", value: "a" }, - { type: "literal", value: "b" } + { type: "literal", value: "a", ignoreCase: false }, + { type: "literal", value: "b", ignoreCase: false } ] }) }, @@ -60,8 +60,8 @@ test("removes proxy rules", function() { ast: simpleGrammarWithStartAndProxied({ type: "choice", alternatives: [ - { type: "literal", value: "a" }, - { type: "literal", value: "b" }, + { type: "literal", value: "a", ignoreCase: false }, + { type: "literal", value: "b", ignoreCase: false }, proxiedRuleRef ] }) @@ -72,8 +72,8 @@ test("removes proxy rules", function() { type: "sequence", elements: [ proxiedRuleRef, - { type: "literal", value: "a" }, - { type: "literal", value: "b" } + { type: "literal", value: "a", ignoreCase: false }, + { type: "literal", value: "b", ignoreCase: false } ] }) }, @@ -82,8 +82,8 @@ test("removes proxy rules", function() { ast: simpleGrammarWithStartAndProxied({ type: "sequence", elements: [ - { type: "literal", value: "a" }, - { type: "literal", value: "b" }, + { type: "literal", value: "a", ignoreCase: false }, + { type: "literal", value: "b", ignoreCase: false }, proxiedRuleRef ] })