Implement case-insensitive literal matching

redux
David Majda 13 years ago
parent 88c50a3e33
commit b540b2d460

@ -128,7 +128,7 @@ There are several types of parsing expressions, some of them containing subexpre
#### "*literal*"<br>'*literal*'
Match exact literal string and return it. The string syntax is the same as in JavaScript.
Match exact literal string and return it. The string syntax is the same as in JavaScript. Appending `i` right after the literal makes the match case-insensitive.
#### .

@ -585,12 +585,28 @@ PEG.compiler.emitter = function(ast) {
'#if node.value.length === 0',
' #{resultVar} = "";',
'#else',
' #if node.value.length === 1',
' if (input.charCodeAt(pos) === #{node.value.charCodeAt(0)}) {',
' #if !node.ignoreCase',
' #if node.value.length === 1',
' if (input.charCodeAt(pos) === #{node.value.charCodeAt(0)}) {',
' #else',
' if (input.substr(pos, #{node.value.length}) === #{string(node.value)}) {',
' #end',
' #else',
' if (input.substr(pos, #{node.value.length}) === #{string(node.value)}) {',
/*
* One-char literals are not optimized when case-insensitive
* matching is enabled. This is because there is no simple way to
* lowercase a character code that works for characters outside the
* ASCII letters. Moreover, |toLowerCase| can change string length,
* meaning that lowercasing a single character can produce more than
* one character.
*/
' if (input.substr(pos, #{node.value.length}).toLowerCase() === #{string(node.value.toLowerCase())}) {',
' #end',
' #{resultVar} = #{string(node.value)};',
' #if !node.ignoreCase',
' #{resultVar} = #{string(node.value)};',
' #else',
' #{resultVar} = input.substr(pos, #{node.value.length});',
' #end',
' pos += #{node.value.length};',
' } else {',
' #{resultVar} = null;',

@ -1793,19 +1793,51 @@ PEG.parser = (function(){
return cachedResult.result;
}
var result0;
var pos0, pos1;
var result0, result1, result2, result3;
var pos0, pos1, pos2;
reportFailures++;
pos0 = pos;
result0 = parse_string();
pos1 = pos;
result0 = parse_doubleQuotedString();
if (result0 === null) {
result0 = parse_singleQuotedString();
}
if (result0 !== null) {
result0 = (function(value) {
if (input.charCodeAt(pos) === 105) {
result1 = "i";
pos += 1;
} else {
result1 = null;
if (reportFailures === 0) {
matchFailed("\"i\"");
}
}
result1 = result1 !== null ? result1 : "";
if (result1 !== null) {
result2 = parse___();
if (result2 !== null) {
result0 = [result0, result1, result2];
} else {
result0 = null;
pos = pos1;
}
} else {
result0 = null;
pos = pos1;
}
} else {
result0 = null;
pos = pos1;
}
if (result0 !== null) {
result0 = (function(value, flags) {
return {
type: "literal",
value: value
type: "literal",
value: value,
ignoreCase: flags === "i"
};
})(result0);
})(result0[0], result0[1]);
}
if (result0 === null) {
pos = pos0;

@ -195,10 +195,11 @@ identifier "identifier"
* vaguely).
*/
literal "literal"
= value:string {
= value:(doubleQuotedString / singleQuotedString) flags:"i"? __ {
return {
type: "literal",
value: value
type: "literal",
value: value,
ignoreCase: flags === "i"
};
}

@ -190,17 +190,33 @@ test("literals", function() {
parses(zeroCharParser, "", "");
doesNotParse(zeroCharParser, "a");
var oneCharParser = PEG.buildParser('start = "a"');
parses(oneCharParser, "a", "a");
doesNotParse(oneCharParser, "");
doesNotParse(oneCharParser, "b");
var multiCharParser = PEG.buildParser('start = "abcd"');
parses(multiCharParser, "abcd", "abcd");
doesNotParse(multiCharParser, "");
doesNotParse(multiCharParser, "abc");
doesNotParse(multiCharParser, "abcde");
doesNotParse(multiCharParser, "efgh");
var oneCharCaseSensitiveParser = PEG.buildParser('start = "a"');
parses(oneCharCaseSensitiveParser, "a", "a");
doesNotParse(oneCharCaseSensitiveParser, "");
doesNotParse(oneCharCaseSensitiveParser, "A");
doesNotParse(oneCharCaseSensitiveParser, "b");
var multiCharCaseSensitiveParser = PEG.buildParser('start = "abcd"');
parses(multiCharCaseSensitiveParser, "abcd", "abcd");
doesNotParse(multiCharCaseSensitiveParser, "");
doesNotParse(multiCharCaseSensitiveParser, "abc");
doesNotParse(multiCharCaseSensitiveParser, "abcde");
doesNotParse(multiCharCaseSensitiveParser, "ABCD");
doesNotParse(multiCharCaseSensitiveParser, "efgh");
var oneCharCaseInsensitiveParser = PEG.buildParser('start = "a"i');
parses(oneCharCaseInsensitiveParser, "a", "a");
parses(oneCharCaseInsensitiveParser, "A", "A");
doesNotParse(oneCharCaseInsensitiveParser, "");
doesNotParse(oneCharCaseInsensitiveParser, "b");
var multiCharCaseInsensitiveParser = PEG.buildParser('start = "abcd"i');
parses(multiCharCaseInsensitiveParser, "abcd", "abcd");
parses(multiCharCaseInsensitiveParser, "ABCD", "ABCD");
doesNotParse(multiCharCaseInsensitiveParser, "");
doesNotParse(multiCharCaseInsensitiveParser, "abc");
doesNotParse(multiCharCaseInsensitiveParser, "abcde");
doesNotParse(multiCharCaseInsensitiveParser, "efgh");
/*
* Test that the parsing position moves forward after successful parsing of

@ -83,10 +83,11 @@ function ruleRef(name) {
};
}
function literal(value) {
function literal(value, ignoreCase) {
return {
type: "literal",
value: value
type: "literal",
value: value,
ignoreCase: ignoreCase
};
}
@ -103,9 +104,9 @@ function klass(inverted, parts, rawText) {
};
}
var literalAbcd = literal("abcd");
var literalEfgh = literal("efgh");
var literalIjkl = literal("ijkl");
var literalAbcd = literal("abcd", false);
var literalEfgh = literal("efgh", false);
var literalIjkl = literal("ijkl", false);
var optionalLiteral = optional(literalAbcd);
@ -128,7 +129,7 @@ function oneRuleGrammar(expression) {
};
}
var simpleGrammar = oneRuleGrammar(literal("abcd"));
var simpleGrammar = oneRuleGrammar(literal("abcd", false));
function identifierGrammar(identifier) {
return oneRuleGrammar(ruleRef(identifier));
@ -136,7 +137,7 @@ function identifierGrammar(identifier) {
var literal_ = literal;
function literalGrammar(literal) {
return oneRuleGrammar(literal_(literal));
return oneRuleGrammar(literal_(literal, false));
}
function classGrammar(inverted, parts, rawText) {
@ -147,7 +148,7 @@ var anyGrammar = oneRuleGrammar(any());
var action_ = action;
function actionGrammar(action) {
return oneRuleGrammar(action_(literal("a"), action));
return oneRuleGrammar(action_(literal("a", false), action));
}
var initializerGrammar = {
@ -334,6 +335,8 @@ test("parses identifier", function() {
/* Canonical literal is "\"abcd\"". */
test("parses literal", function() {
/*
 * Covers the three literal spellings: double-quoted, single-quoted, and
 * double-quoted with a trailing |i| flag. |parserParses| is defined elsewhere
 * in the file; presumably it checks that the grammar text parses to the given
 * AST.
 */
/* Both quote styles are paired with the same expected AST. */
parserParses('start = "abcd"', literalGrammar("abcd"));
parserParses("start = 'abcd'", literalGrammar("abcd"));
/* With the |i| suffix the expected literal node is built with ignoreCase = true. */
parserParses('start = "abcd"i', oneRuleGrammar(literal("abcd", true)));
});
/* Canonical string is "\"abcd\"". */

@ -16,7 +16,7 @@ test("removes proxy rules", function() {
type: "rule",
name: "proxied",
displayName: null,
expression: { type: "literal", value: "a" }
expression: { type: "literal", value: "a", ignoreCase: false }
};
var proxiedRuleRef = {
@ -50,8 +50,8 @@ test("removes proxy rules", function() {
type: "choice",
alternatives: [
proxiedRuleRef,
{ type: "literal", value: "a" },
{ type: "literal", value: "b" }
{ type: "literal", value: "a", ignoreCase: false },
{ type: "literal", value: "b", ignoreCase: false }
]
})
},
@ -60,8 +60,8 @@ test("removes proxy rules", function() {
ast: simpleGrammarWithStartAndProxied({
type: "choice",
alternatives: [
{ type: "literal", value: "a" },
{ type: "literal", value: "b" },
{ type: "literal", value: "a", ignoreCase: false },
{ type: "literal", value: "b", ignoreCase: false },
proxiedRuleRef
]
})
@ -72,8 +72,8 @@ test("removes proxy rules", function() {
type: "sequence",
elements: [
proxiedRuleRef,
{ type: "literal", value: "a" },
{ type: "literal", value: "b" }
{ type: "literal", value: "a", ignoreCase: false },
{ type: "literal", value: "b", ignoreCase: false }
]
})
},
@ -82,8 +82,8 @@ test("removes proxy rules", function() {
ast: simpleGrammarWithStartAndProxied({
type: "sequence",
elements: [
{ type: "literal", value: "a" },
{ type: "literal", value: "b" },
{ type: "literal", value: "a", ignoreCase: false },
{ type: "literal", value: "b", ignoreCase: false },
proxiedRuleRef
]
})

Loading…
Cancel
Save