Rewrite variable handling in generated parsers

Before this commit, variables for saving match results and parse
positions in generated parsers were not used efficiently. Each rule
basically used its own variable(s) for storing the data, with names
generated sequentially during code emitting. There was no reuse of
variables and a lot of unnecessary assignments between them.

It is easy to see that both match results and parse positions can
actually be stored on a stack that grows as the parser walks deeper in
the grammar tree and shrinks as it returns. Moreover, if one creates a
new stack for each rule the parser enters, its maximum depth can be
computed statically from the grammar. This allows us to implement the
stack not as an array, but as a set of numbered variables in each
function that handles parsing of a grammar rule, avoiding potentially
slow array accesses.

This commit implements the idea from the previous paragraph, using
separate stacks for match results and for parse positions. As a result,
defined variables are reused and unnecessary copying is avoided.

Speed implications
------------------

This change speeds up the benchmark suite execution by 2.14%.

Detailed results (benchmark suite totals as reported by "jake benchmark"
on Node.js 0.4.8):

-----------------------------------
 Test #      Before        After
-----------------------------------
      1   129.01 kB/s   131.98 kB/s
      2   129.39 kB/s   130.13 kB/s
      3   128.63 kB/s   132.57 kB/s
      4   127.53 kB/s   129.82 kB/s
      5   127.98 kB/s   131.80 kB/s
-----------------------------------
Average   128.51 kB/s   131.26 kB/s
-----------------------------------

Size implications
-----------------

This change makes a sample of generated parsers 8.60% smaller:

Before:

  $ wc -c src/parser.js examples/*.js
   110867 src/parser.js
    13886 examples/arithmetics.js
   450125 examples/css.js
   632390 examples/javascript.js
    61365 examples/json.js
  1268633 total

After:

  $ wc -c src/parser.js examples/*.js
    99597 src/parser.js
    13077 examples/arithmetics.js
   399893 examples/css.js
   592044 examples/javascript.js
    54797 examples/json.js
  1159408 total
redux
David Majda 13 years ago
parent bb83b2189a
commit d123cf0eda

@ -12,7 +12,8 @@ PEG.compiler = {
];
var PASS_NAMES = [
"proxyRules"
"proxyRules",
"stackDepths"
];
for (var i = 0; i < CHECK_NAMES.length; i++) {

@ -76,19 +76,8 @@ PEG.compiler.emitter = function(ast) {
return indentMultilineParts(interpolateVariablesInParts(args)).join("\n");
};
/*
 * Generator of unique identifiers. It keeps one independent counter per
 * prefix; identifiers are the prefix followed by the counter value.
 */
var UID = {
  /* Maps each prefix seen so far to the number the next ID will use. */
  _counters: {},

  /* Returns |prefix| with its current counter appended, then bumps the counter. */
  next: function(prefix) {
    var counters = this._counters;
    var current = Number(counters[prefix] || 0);

    counters[prefix] = current + 1;

    return prefix + current;
  },

  /* Drops all counters so that numbering starts again from zero. */
  reset: function() {
    this._counters = {};
  }
};
/* Returns the name of the numbered variable used as slot |index| of the match result stack. */
function resultVar(index) {
  return "result" + index;
}
/* Returns the name of the numbered variable used as slot |index| of the parse position stack. */
function posVar(index) {
  return "pos" + index;
}
var emit = buildNodeVisitor({
grammar: function(node) {
@ -332,16 +321,16 @@ PEG.compiler.emitter = function(ast) {
},
rule: function(node) {
/*
* We want to reset variable names at the beginning of every function so
* that a little change in the source grammar does not change variables in
* all the generated code. This is desired especially when one has the
* generated grammar stored in a VCS (this is true e.g. for our
* metagrammar).
*/
UID.reset();
var context = {
resultIndex: 0,
posIndex: 0
};
var resultVars = map(range(node.resultStackDepth), resultVar);
var posVars = map(range(node.posStackDepth), posVar);
var resultVar = UID.next("result");
var resultVarsCode = resultVars.length > 0 ? "var " + resultVars.join(", ") + ";" : "";
var posVarsCode = posVars.length > 0 ? "var " + posVars.join(", ") + ";" : "";
if (node.displayName !== null) {
var setReportFailuresCode = formatCode(
@ -356,7 +345,7 @@ PEG.compiler.emitter = function(ast) {
"}",
{
displayName: node.displayName,
resultVar: resultVar
resultVar: resultVar(context.resultIndex)
}
);
} else {
@ -374,6 +363,9 @@ PEG.compiler.emitter = function(ast) {
" return cachedResult.result;",
" }",
" ",
" ${resultVarsCode}",
" ${posVarsCode}",
" ",
" ${setReportFailuresCode}",
" ${code}",
" ${restoreReportFailuresCode}",
@ -387,51 +379,68 @@ PEG.compiler.emitter = function(ast) {
"}",
{
name: node.name,
resultVarsCode: resultVarsCode,
posVarsCode: posVarsCode,
setReportFailuresCode: setReportFailuresCode,
restoreReportFailuresCode: restoreReportFailuresCode,
reportFailureCode: reportFailureCode,
code: emit(node.expression, resultVar),
resultVar: resultVar
code: emit(node.expression, context),
resultVar: resultVar(context.resultIndex)
}
);
},
/*
* The contract for all code fragments generated by the following functions
* is as follows:
* is as follows.
*
* The code fragment tries to match a part of the input starting with the
* position indicated in |pos|. That position may point past the end of the
* input.
*
* * If the code fragment matches the input, it advances |pos| to point to
* the first character following the matched part of the input and sets a
* variable with a name computed by calling
* |resultVar(context.resultIndex)| to an appropriate value. This value is
* always non-|null|.
*
* * If the code fragment does not match the input, it returns with |pos|
* set to the original value and it sets a variable with a name computed
* by calling |resultVar(context.resultIndex)| to |null|.
*
* * The code fragment should try to match a part of the input starting with
* the position indicated in |pos|. That position may point past the end of
* the input.
* The code can use variables with names computed by calling
*
* * If the code fragment matches the input, it advances |pos| after the
* matched part of the input and sets variable with a name stored in
* |resultVar| to appropriate value, which is always non-null.
* |resultVar(context.resultIndex + i)|
*
* * If the code fragment does not match the input, it does not change |pos|
* and it sets a variable with a name stored in |resultVar| to |null|.
* and
*
* |posVar(context.posIndex + i)|
*
* where |i| >= 1 to store necessary data (return values and positions). It
* won't use any other variables.
*/
choice: function(node, resultVar) {
var code = formatCode(
"var ${resultVar} = null;",
{ resultVar: resultVar }
);
choice: function(node, context) {
var code, nextAlternativesCode;
for (var i = node.alternatives.length - 1; i >= 0; i--) {
var alternativeResultVar = UID.next("result");
nextAlternativesCode = i !== node.alternatives.length - 1
? formatCode(
"if (${resultVar} === null) {",
" ${code}",
"}",
{
code: code,
resultVar: resultVar(context.resultIndex)
}
)
: "";
code = formatCode(
"${alternativeCode}",
"if (${alternativeResultVar} !== null) {",
" var ${resultVar} = ${alternativeResultVar};",
"} else {",
" ${code};",
"}",
"${currentAlternativeCode}",
"${nextAlternativesCode}",
{
alternativeCode: emit(node.alternatives[i], alternativeResultVar),
alternativeResultVar: alternativeResultVar,
code: code,
resultVar: resultVar
currentAlternativeCode: emit(node.alternatives[i], context),
nextAlternativesCode: nextAlternativesCode
}
);
}
@ -439,177 +448,185 @@ PEG.compiler.emitter = function(ast) {
return code;
},
sequence: function(node, resultVar) {
var savedPosVar = UID.next("savedPos");
var elementResultVars = map(node.elements, function() {
return UID.next("result")
sequence: function(node, context) {
var elementResultVars = map(node.elements, function(element, i) {
return resultVar(context.resultIndex + i);
});
var code = formatCode(
"var ${resultVar} = ${elementResultVarArray};",
"${resultVar} = ${elementResultVarArray};",
{
resultVar: resultVar,
resultVar: resultVar(context.resultIndex),
elementResultVarArray: "[" + elementResultVars.join(", ") + "]"
}
);
var elementContext;
for (var i = node.elements.length - 1; i >= 0; i--) {
elementContext = {
resultIndex: context.resultIndex + i,
posIndex: context.posIndex + 1
};
code = formatCode(
"${elementCode}",
"if (${elementResultVar} !== null) {",
" ${code}",
"} else {",
" var ${resultVar} = null;",
" pos = ${savedPosVar};",
" ${resultVar} = null;",
" pos = ${posVar};",
"}",
{
elementCode: emit(node.elements[i], elementResultVars[i]),
elementCode: emit(node.elements[i], elementContext),
elementResultVar: elementResultVars[i],
code: code,
savedPosVar: savedPosVar,
resultVar: resultVar
posVar: posVar(context.posIndex),
resultVar: resultVar(context.resultIndex)
}
);
}
return formatCode(
"var ${savedPosVar} = pos;",
"${posVar} = pos;",
"${code}",
{
code: code,
savedPosVar: savedPosVar
code: code,
posVar: posVar(context.posIndex)
}
);
},
labeled: function(node, resultVar) {
return emit(node.expression, resultVar);
labeled: function(node, context) {
return emit(node.expression, context);
},
simple_and: function(node, resultVar) {
var savedPosVar = UID.next("savedPos");
var expressionResultVar = UID.next("result");
simple_and: function(node, context) {
var expressionContext = {
resultIndex: context.resultIndex,
posIndex: context.posIndex + 1
};
return formatCode(
"var ${savedPosVar} = pos;",
"${posVar} = pos;",
"reportFailures++;",
"${expressionCode}",
"reportFailures--;",
"if (${expressionResultVar} !== null) {",
" var ${resultVar} = '';",
" pos = ${savedPosVar};",
"if (${resultVar} !== null) {",
" ${resultVar} = '';",
" pos = ${posVar};",
"} else {",
" var ${resultVar} = null;",
" ${resultVar} = null;",
"}",
{
expressionCode: emit(node.expression, expressionResultVar),
expressionResultVar: expressionResultVar,
savedPosVar: savedPosVar,
resultVar: resultVar
expressionCode: emit(node.expression, expressionContext),
posVar: posVar(context.posIndex),
resultVar: resultVar(context.resultIndex)
}
);
},
simple_not: function(node, resultVar) {
var savedPosVar = UID.next("savedPos");
var expressionResultVar = UID.next("result");
simple_not: function(node, context) {
var expressionContext = {
resultIndex: context.resultIndex,
posIndex: context.posIndex + 1
};
return formatCode(
"var ${savedPosVar} = pos;",
"${posVar} = pos;",
"reportFailures++;",
"${expressionCode}",
"reportFailures--;",
"if (${expressionResultVar} === null) {",
" var ${resultVar} = '';",
"if (${resultVar} === null) {",
" ${resultVar} = '';",
"} else {",
" var ${resultVar} = null;",
" pos = ${savedPosVar};",
" ${resultVar} = null;",
" pos = ${posVar};",
"}",
{
expressionCode: emit(node.expression, expressionResultVar),
expressionResultVar: expressionResultVar,
savedPosVar: savedPosVar,
resultVar: resultVar
expressionCode: emit(node.expression, expressionContext),
posVar: posVar(context.posIndex),
resultVar: resultVar(context.resultIndex)
}
);
},
semantic_and: function(node, resultVar) {
semantic_and: function(node, context) {
return formatCode(
"var ${resultVar} = (function() {${actionCode}})() ? '' : null;",
"${resultVar} = (function() {${actionCode}})() ? '' : null;",
{
actionCode: node.code,
resultVar: resultVar
actionCode: node.code,
resultVar: resultVar(context.resultIndex)
}
);
},
semantic_not: function(node, resultVar) {
semantic_not: function(node, context) {
return formatCode(
"var ${resultVar} = (function() {${actionCode}})() ? null : '';",
"${resultVar} = (function() {${actionCode}})() ? null : '';",
{
actionCode: node.code,
resultVar: resultVar
actionCode: node.code,
resultVar: resultVar(context.resultIndex)
}
);
},
optional: function(node, resultVar) {
var expressionResultVar = UID.next("result");
optional: function(node, context) {
return formatCode(
"${expressionCode}",
"var ${resultVar} = ${expressionResultVar} !== null ? ${expressionResultVar} : '';",
"${resultVar} = ${resultVar} !== null ? ${resultVar} : '';",
{
expressionCode: emit(node.expression, expressionResultVar),
expressionResultVar: expressionResultVar,
resultVar: resultVar
expressionCode: emit(node.expression, context),
resultVar: resultVar(context.resultIndex)
}
);
},
zero_or_more: function(node, resultVar) {
var expressionResultVar = UID.next("result");
zero_or_more: function(node, context) {
var expressionContext = {
resultIndex: context.resultIndex + 1,
posIndex: context.posIndex
};
return formatCode(
"var ${resultVar} = [];",
"${resultVar} = [];",
"${expressionCode}",
"while (${expressionResultVar} !== null) {",
" ${resultVar}.push(${expressionResultVar});",
" ${expressionCode}",
"}",
{
expressionCode: emit(node.expression, expressionResultVar),
expressionResultVar: expressionResultVar,
resultVar: resultVar
expressionCode: emit(node.expression, expressionContext),
expressionResultVar: resultVar(context.resultIndex + 1),
resultVar: resultVar(context.resultIndex)
}
);
},
one_or_more: function(node, resultVar) {
var expressionResultVar = UID.next("result");
one_or_more: function(node, context) {
var expressionContext = {
resultIndex: context.resultIndex + 1,
posIndex: context.posIndex
};
return formatCode(
"${expressionCode}",
"if (${expressionResultVar} !== null) {",
" var ${resultVar} = [];",
" ${resultVar} = [];",
" while (${expressionResultVar} !== null) {",
" ${resultVar}.push(${expressionResultVar});",
" ${expressionCode}",
" }",
"} else {",
" var ${resultVar} = null;",
" ${resultVar} = null;",
"}",
{
expressionCode: emit(node.expression, expressionResultVar),
expressionResultVar: expressionResultVar,
resultVar: resultVar
expressionCode: emit(node.expression, expressionContext),
expressionResultVar: resultVar(context.resultIndex + 1),
resultVar: resultVar(context.resultIndex)
}
);
},
action: function(node, resultVar) {
action: function(node, context) {
/*
* In case of sequences, we splat their elements into function arguments
* one by one. Example:
@ -619,9 +636,10 @@ PEG.compiler.emitter = function(ast) {
* This behavior is reflected in this function.
*/
var expressionResultVar = UID.next("result");
var actionResultVar = UID.next("result");
var savedPosVar = UID.next("savedPos");
var expressionContext = {
resultIndex: context.resultIndex,
posIndex: context.posIndex + 1
};
if (node.expression.type === "sequence") {
var formalParams = [];
@ -632,59 +650,54 @@ PEG.compiler.emitter = function(ast) {
for (var i = 0; i < elementsLength; i++) {
if (elements[i].type === "labeled") {
formalParams.push(elements[i].label);
actualParams.push(expressionResultVar + "[" + i + "]");
actualParams.push(resultVar(context.resultIndex) + "[" + i + "]");
}
}
} else if (node.expression.type === "labeled") {
var formalParams = [node.expression.label];
var actualParams = [expressionResultVar];
var actualParams = [resultVar(context.resultIndex)];
} else {
var formalParams = [];
var actualParams = [];
}
return formatCode(
"var ${savedPosVar} = pos;",
"${posVar} = pos;",
"${expressionCode}",
"var ${actionResultVar} = ${expressionResultVar} !== null",
" ? (function(${formalParams}) {${actionCode}})(${actualParams})",
" : null;",
"if (${actionResultVar} !== null) {",
" var ${resultVar} = ${actionResultVar};",
"} else {",
" var ${resultVar} = null;",
" pos = ${savedPosVar};",
"if (${resultVar} !== null) {",
" ${resultVar} = (function(${formalParams}) {${actionCode}})(${actualParams});",
"}",
"if (${resultVar} === null) {",
" pos = ${posVar};",
"}",
{
expressionCode: emit(node.expression, expressionResultVar),
expressionResultVar: expressionResultVar,
actionCode: node.code,
actionResultVar: actionResultVar,
formalParams: formalParams.join(", "),
actualParams: actualParams.join(", "),
savedPosVar: savedPosVar,
resultVar: resultVar
expressionCode: emit(node.expression, expressionContext),
actionCode: node.code,
formalParams: formalParams.join(", "),
actualParams: actualParams.join(", "),
posVar: posVar(context.posIndex),
resultVar: resultVar(context.resultIndex)
}
);
},
rule_ref: function(node, resultVar) {
rule_ref: function(node, context) {
return formatCode(
"var ${resultVar} = ${ruleMethod}();",
"${resultVar} = ${ruleMethod}();",
{
ruleMethod: "parse_" + node.name,
resultVar: resultVar
resultVar: resultVar(context.resultIndex)
}
);
},
literal: function(node, resultVar) {
literal: function(node, context) {
return formatCode(
"if (input.substr(pos, ${length}) === ${value|string}) {",
" var ${resultVar} = ${value|string};",
" ${resultVar} = ${value|string};",
" pos += ${length};",
"} else {",
" var ${resultVar} = null;",
" ${resultVar} = null;",
" if (reportFailures === 0) {",
" matchFailed(${valueQuoted|string});",
" }",
@ -693,27 +706,27 @@ PEG.compiler.emitter = function(ast) {
value: node.value,
valueQuoted: quote(node.value),
length: node.value.length,
resultVar: resultVar
resultVar: resultVar(context.resultIndex)
}
);
},
any: function(node, resultVar) {
any: function(node, context) {
return formatCode(
"if (input.length > pos) {",
" var ${resultVar} = input.charAt(pos);",
" ${resultVar} = input.charAt(pos);",
" pos++;",
"} else {",
" var ${resultVar} = null;",
" ${resultVar} = null;",
" if (reportFailures === 0) {",
" matchFailed('any character');",
" }",
"}",
{ resultVar: resultVar }
{ resultVar: resultVar(context.resultIndex) }
);
},
"class": function(node, resultVar) {
"class": function(node, context) {
if (node.parts.length > 0) {
var regexp = "/^["
+ (node.inverted ? "^" : "")
@ -735,10 +748,10 @@ PEG.compiler.emitter = function(ast) {
return formatCode(
"if (input.substr(pos).match(${regexp}) !== null) {",
" var ${resultVar} = input.charAt(pos);",
" ${resultVar} = input.charAt(pos);",
" pos++;",
"} else {",
" var ${resultVar} = null;",
" ${resultVar} = null;",
" if (reportFailures === 0) {",
" matchFailed(${rawText|string});",
" }",
@ -746,7 +759,7 @@ PEG.compiler.emitter = function(ast) {
{
regexp: regexp,
rawText: node.rawText,
resultVar: resultVar
resultVar: resultVar(context.resultIndex)
}
);
}

File diff suppressed because it is too large Load Diff

@ -75,6 +75,82 @@ PEG.compiler.passes = {
}
}
return ast;
},
/*
* Adds |resultStackDepth| and |posStackDepth| properties to each AST node.
* These properties specify how many positions on the result or position stack
* code generated by the emitter for the node will use. This information is
* used to declare varibles holding the stack data in the generated code.
*/
stackDepths: function(ast) {
function computeZeroes(node) {
node.resultStackDepth = 0;
node.posStackDepth = 0;
}
function computeFromExpression(resultStackDelta, posStackDelta) {
return function(node) {
compute(node.expression);
node.resultStackDepth = node.expression.resultStackDepth + resultStackDelta;
node.posStackDepth = node.expression.posStackDepth + posStackDelta;
}
}
var compute = buildNodeVisitor({
grammar:
function(node) {
for (var name in node.rules) {
compute(node.rules[name]);
}
},
rule: computeFromExpression(1, 1),
choice:
function(node) {
each(node.alternatives, compute);
node.resultStackDepth = Math.max.apply(
null,
map(node.alternatives, function(e) { return e.resultStackDepth; })
);
node.posStackDepth = Math.max.apply(
null,
map(node.alternatives, function(e) { return e.posStackDepth; })
);
},
sequence:
function(node) {
each(node.elements, compute);
node.resultStackDepth = 1 + Math.max.apply(
null,
map(node.elements, function(e, i) { return i + e.resultStackDepth; })
);
node.posStackDepth = 1 + Math.max.apply(
null,
map(node.elements, function(e) { return e.posStackDepth; })
);
},
labeled: computeFromExpression(0, 0),
simple_and: computeFromExpression(0, 1),
simple_not: computeFromExpression(0, 1),
semantic_and: computeZeroes,
semantic_not: computeZeroes,
optional: computeFromExpression(0, 0),
zero_or_more: computeFromExpression(1, 0),
one_or_more: computeFromExpression(1, 0),
action: computeFromExpression(0, 1),
rule_ref: computeZeroes,
literal: computeZeroes,
any: computeZeroes,
"class": computeZeroes
});
compute(ast);
return ast;
}
};

@ -1,3 +1,17 @@
/*
 * Like Python's |range|, but without |step|: |range(stop)| counts from 0 up
 * to (but not including) |stop|, |range(start, stop)| counts from |start|.
 */
function range(start, stop) {
  var first = stop === undefined ? 0 : start;
  var limit = stop === undefined ? start : stop;

  /* Preallocate; an empty array results when |limit| does not exceed |first|. */
  var values = new Array(Math.max(0, limit - first));

  var value = first;
  for (var index = 0; value < limit; index++) {
    values[index] = value;
    value++;
  }

  return values;
}
function contains(array, value) {
/*
* Stupid IE does not have Array.prototype.indexOf, otherwise this function
@ -15,7 +29,7 @@ function contains(array, value) {
function each(array, callback) {
var length = array.length;
for (var i = 0; i < length; i++) {
callback(array[i]);
callback(array[i], i);
}
}
@ -23,7 +37,7 @@ function map(array, callback) {
var result = [];
var length = array.length;
for (var i = 0; i < length; i++) {
result[i] = callback(array[i]);
result[i] = callback(array[i], i);
}
return result;
}

@ -147,4 +147,90 @@ test("removes proxy rules", function() {
}
});
test("computes stack depths", function() {
  /*
   * Each row is |[grammar, resultStackDepth, posStackDepth]|, where the depths
   * are the values expected on the |start| rule after the pass has run.
   */
  var rows = [
    /* Choice */
    ['start = "a" / "b" / "c"',     1, 1],
    ['start = "a" / "b"* / "c"',    2, 1],
    ['start = "a" / &"b" / "c"',    1, 2],

    /* Sequence */
    ['start = "a" "b" "c"',         4, 2],
    ['start = "a" "b" "c"*',        5, 2],
    ['start = "a" "b"* "c"',        4, 2],
    ['start = "a" ("b"*)* "c"',     5, 2],
    ['start = "a"* "b" "c"',        4, 2],
    ['start = ("a"*)* "b" "c"',     4, 2],
    ['start = (("a"*)*)* "b" "c"',  5, 2],
    ['start = "a" &"b" "c"',        4, 3],

    /* Others */
    ['start = label:"a"',           1, 1],
    ['start = &"a"',                1, 2],
    ['start = !"a"',                1, 2],
    ['start = &{ code }',           1, 1],
    ['start = !{ code }',           1, 1],
    ['start = "a"?',                1, 1],
    ['start = "a"*',                2, 1],
    ['start = "a"+',                2, 1],
    ['start = "a" { code }',        1, 2],
    ['start = a',                   1, 1],
    ['start = "a"',                 1, 1],
    ['start = .',                   1, 1],
    ['start = [a-z]',               1, 1]
  ];

  var cases = [];
  for (var r = 0; r < rows.length; r++) {
    cases.push({
      grammar:          rows[r][0],
      resultStackDepth: rows[r][1],
      posStackDepth:    rows[r][2]
    });
  }

  var index = 0;
  while (index < cases.length) {
    var expected = cases[index];
    var ast = PEG.parser.parse(expected.grammar);

    PEG.compiler.passes.stackDepths(ast);

    deepEqual(ast.rules["start"].resultStackDepth, expected.resultStackDepth);
    deepEqual(ast.rules["start"].posStackDepth, expected.posStackDepth);

    index++;
  }
});
})();

Loading…
Cancel
Save