pegjs/lib/compiler/passes/generate-bytecode.js
David Majda dad1207c46 Improve semantics of the TEXT bytecode instruction
The TEXT instruction now replaces position at the top of the stack with
the input from that position until the current position. This is simpler
and cleaner semantics than the previous one, where TEXT also popped an
additional value from the stack and kept the position there.
2014-05-16 13:30:03 +02:00

615 lines
15 KiB
JavaScript

var arrays = require("../../utils/arrays"),
objects = require("../../utils/objects"),
asts = require("../asts"),
visitor = require("../visitor"),
op = require("../opcodes"),
js = require("../javascript");
/* Generates bytecode.
*
* Instructions
* ============
*
* Stack Manipulation
* ------------------
*
* [0] PUSH c
*
* stack.push(consts[c]);
*
* [26] PUSH_UNDEFINED
*
* stack.push(undefined);
*
* [27] PUSH_NULL
*
* stack.push(null);
*
* [28] PUSH_FAILED
*
* stack.push(FAILED);
*
* [29] PUSH_EMPTY_ARRAY
*
* stack.push([]);
*
* [1] PUSH_CURR_POS
*
* stack.push(currPos);
*
* [2] POP
*
* stack.pop();
*
* [3] POP_CURR_POS
*
* currPos = stack.pop();
*
* [4] POP_N n
*
* stack.pop(n);
*
* [5] NIP
*
* value = stack.pop();
* stack.pop();
* stack.push(value);
*
* [6] APPEND
*
* value = stack.pop();
* array = stack.pop();
* array.push(value);
* stack.push(array);
*
* [7] WRAP n
*
* stack.push(stack.pop(n));
*
* [8] TEXT
*
* stack.push(input.substring(stack.pop(), currPos));
*
* Conditions and Loops
* --------------------
*
* [9] IF t, f
*
* if (stack.top()) {
* interpret(ip + 3, ip + 3 + t);
* } else {
* interpret(ip + 3 + t, ip + 3 + t + f);
* }
*
* [10] IF_ERROR t, f
*
* if (stack.top() === FAILED) {
* interpret(ip + 3, ip + 3 + t);
* } else {
* interpret(ip + 3 + t, ip + 3 + t + f);
* }
*
* [11] IF_NOT_ERROR t, f
*
* if (stack.top() !== FAILED) {
* interpret(ip + 3, ip + 3 + t);
* } else {
* interpret(ip + 3 + t, ip + 3 + t + f);
* }
*
* [12] WHILE_NOT_ERROR b
*
* while(stack.top() !== FAILED) {
* interpret(ip + 2, ip + 2 + b);
* }
*
* Matching
* --------
*
* [13] MATCH_ANY a, f, ...
*
* if (input.length > currPos) {
* interpret(ip + 3, ip + 3 + a);
* } else {
* interpret(ip + 3 + a, ip + 3 + a + f);
* }
*
* [14] MATCH_STRING s, a, f, ...
*
* if (input.substr(currPos, consts[s].length) === consts[s]) {
* interpret(ip + 4, ip + 4 + a);
* } else {
* interpret(ip + 4 + a, ip + 4 + a + f);
* }
*
* [15] MATCH_STRING_IC s, a, f, ...
*
* if (input.substr(currPos, consts[s].length).toLowerCase() === consts[s]) {
* interpret(ip + 4, ip + 4 + a);
* } else {
* interpret(ip + 4 + a, ip + 4 + a + f);
* }
*
* [16] MATCH_REGEXP r, a, f, ...
*
* if (consts[r].test(input.charAt(currPos))) {
* interpret(ip + 4, ip + 4 + a);
* } else {
* interpret(ip + 4 + a, ip + 4 + a + f);
* }
*
* [17] ACCEPT_N n
*
* stack.push(input.substring(currPos, n));
* currPos += n;
*
* [18] ACCEPT_STRING s
*
* stack.push(consts[s]);
* currPos += consts[s].length;
*
* [19] FAIL e
*
* stack.push(FAILED);
* fail(consts[e]);
*
* Calls
* -----
*
* [20] REPORT_SAVED_POS p
*
* reportedPos = stack[p];
*
* [21] REPORT_CURR_POS
*
* reportedPos = currPos;
*
* [22] CALL f, n, pc, p1, p2, ..., pN
*
* value = consts[f](stack[p1], ..., stack[pN]);
* stack.pop(n);
* stack.push(value);
*
* Rules
* -----
*
* [23] RULE r
*
* stack.push(parseRule(r));
*
* Failure Reporting
* -----------------
*
* [24] SILENT_FAILS_ON
*
* silentFails++;
*
* [25] SILENT_FAILS_OFF
*
* silentFails--;
*/
function generateBytecode(ast) {
var consts = [];
function addConst(value) {
var index = arrays.indexOf(consts, value);
return index === -1 ? consts.push(value) - 1 : index;
}
function addFunctionConst(params, code) {
return addConst(
"function(" + params.join(", ") + ") {" + code + "}"
);
}
function buildSequence() {
return Array.prototype.concat.apply([], arguments);
}
function buildCondition(condCode, thenCode, elseCode) {
return condCode.concat(
[thenCode.length, elseCode.length],
thenCode,
elseCode
);
}
function buildLoop(condCode, bodyCode) {
return condCode.concat([bodyCode.length], bodyCode);
}
function buildCall(functionIndex, delta, env, sp) {
var params = arrays.map(objects.values(env), function(p) { return sp - p; });
return [op.CALL, functionIndex, delta, params.length].concat(params);
}
function buildSimplePredicate(expression, negative, context) {
return buildSequence(
[op.PUSH_CURR_POS],
[op.SILENT_FAILS_ON],
generate(expression, {
sp: context.sp + 1,
env: { },
action: null
}),
[op.SILENT_FAILS_OFF],
buildCondition(
[negative ? op.IF_ERROR : op.IF_NOT_ERROR],
buildSequence(
[op.POP],
[negative ? op.POP : op.POP_CURR_POS],
[op.PUSH_UNDEFINED]
),
buildSequence(
[op.POP],
[negative ? op.POP_CURR_POS : op.POP],
[op.PUSH_FAILED]
)
)
);
}
function buildSemanticPredicate(code, negative, context) {
var functionIndex = addFunctionConst(objects.keys(context.env), code);
return buildSequence(
[op.REPORT_CURR_POS],
buildCall(functionIndex, 0, context.env, context.sp),
buildCondition(
[op.IF],
buildSequence(
[op.POP],
negative ? [op.PUSH_FAILED] : [op.PUSH_UNDEFINED]
),
buildSequence(
[op.POP],
negative ? [op.PUSH_UNDEFINED] : [op.PUSH_FAILED]
)
)
);
}
function buildAppendLoop(expressionCode) {
return buildLoop(
[op.WHILE_NOT_ERROR],
buildSequence([op.APPEND], expressionCode)
);
}
var generate = visitor.build({
grammar: function(node) {
arrays.each(node.rules, generate);
node.consts = consts;
},
rule: function(node) {
node.bytecode = generate(node.expression, {
sp: -1, // stack pointer
env: { }, // mapping of label names to stack positions
action: null // action nodes pass themselves to children here
});
},
named: function(node, context) {
var nameIndex = addConst(
'{ type: "other", description: "' + js.stringEscape(node.name) + '" }'
);
/*
* The code generated below is slightly suboptimal because |FAIL| pushes
* to the stack, so we need to stick a |POP| in front of it. We lack a
* dedicated instruction that would just report the failure and not touch
* the stack.
*/
return buildSequence(
[op.SILENT_FAILS_ON],
generate(node.expression, context),
[op.SILENT_FAILS_OFF],
buildCondition([op.IF_ERROR], [op.FAIL, nameIndex], [])
);
},
choice: function(node, context) {
function buildAlternativesCode(alternatives, context) {
return buildSequence(
generate(alternatives[0], {
sp: context.sp,
env: { },
action: null
}),
alternatives.length > 1
? buildCondition(
[op.IF_ERROR],
buildSequence(
[op.POP],
buildAlternativesCode(alternatives.slice(1), context)
),
[]
)
: []
);
}
return buildAlternativesCode(node.alternatives, context);
},
action: function(node, context) {
var env = { },
emitCall = node.expression.type !== "sequence"
|| node.expression.elements.length === 0,
expressionCode = generate(node.expression, {
sp: context.sp + (emitCall ? 1 : 0),
env: env,
action: node
}),
functionIndex = addFunctionConst(objects.keys(env), node.code);
return emitCall
? buildSequence(
[op.PUSH_CURR_POS],
expressionCode,
buildCondition(
[op.IF_NOT_ERROR],
buildSequence(
[op.REPORT_SAVED_POS, 1],
buildCall(functionIndex, 1, env, context.sp + 2)
),
[]
),
[op.NIP]
)
: expressionCode;
},
sequence: function(node, context) {
function buildElementsCode(elements, context) {
var processedCount, functionIndex;
if (elements.length > 0) {
processedCount = node.elements.length - elements.slice(1).length;
return buildSequence(
generate(elements[0], {
sp: context.sp,
env: context.env,
action: null
}),
buildCondition(
[op.IF_NOT_ERROR],
buildElementsCode(elements.slice(1), {
sp: context.sp + 1,
env: context.env,
action: context.action
}),
buildSequence(
processedCount > 1 ? [op.POP_N, processedCount] : [op.POP],
[op.POP_CURR_POS],
[op.PUSH_FAILED]
)
)
);
} else {
if (context.action) {
functionIndex = addFunctionConst(
objects.keys(context.env),
context.action.code
);
return buildSequence(
[op.REPORT_SAVED_POS, node.elements.length],
buildCall(
functionIndex,
node.elements.length,
context.env,
context.sp
),
[op.NIP]
);
} else {
return buildSequence([op.WRAP, node.elements.length], [op.NIP]);
}
}
}
return buildSequence(
[op.PUSH_CURR_POS],
buildElementsCode(node.elements, {
sp: context.sp + 1,
env: context.env,
action: context.action
})
);
},
labeled: function(node, context) {
context.env[node.label] = context.sp + 1;
return generate(node.expression, {
sp: context.sp,
env: { },
action: null
});
},
text: function(node, context) {
return buildSequence(
[op.PUSH_CURR_POS],
generate(node.expression, {
sp: context.sp + 1,
env: { },
action: null
}),
buildCondition(
[op.IF_NOT_ERROR],
buildSequence([op.POP], [op.TEXT]),
[op.NIP]
)
);
},
simple_and: function(node, context) {
return buildSimplePredicate(node.expression, false, context);
},
simple_not: function(node, context) {
return buildSimplePredicate(node.expression, true, context);
},
semantic_and: function(node, context) {
return buildSemanticPredicate(node.code, false, context);
},
semantic_not: function(node, context) {
return buildSemanticPredicate(node.code, true, context);
},
optional: function(node, context) {
return buildSequence(
generate(node.expression, {
sp: context.sp,
env: { },
action: null
}),
buildCondition(
[op.IF_ERROR],
buildSequence([op.POP], [op.PUSH_NULL]),
[]
)
);
},
zero_or_more: function(node, context) {
var expressionCode = generate(node.expression, {
sp: context.sp + 1,
env: { },
action: null
});
return buildSequence(
[op.PUSH_EMPTY_ARRAY],
expressionCode,
buildAppendLoop(expressionCode),
[op.POP]
);
},
one_or_more: function(node, context) {
var expressionCode = generate(node.expression, {
sp: context.sp + 1,
env: { },
action: null
});
return buildSequence(
[op.PUSH_EMPTY_ARRAY],
expressionCode,
buildCondition(
[op.IF_NOT_ERROR],
buildSequence(buildAppendLoop(expressionCode), [op.POP]),
buildSequence([op.POP], [op.POP], [op.PUSH_FAILED])
)
);
},
rule_ref: function(node) {
return [op.RULE, asts.indexOfRule(ast, node.name)];
},
literal: function(node) {
var stringIndex, expectedIndex;
if (node.value.length > 0) {
stringIndex = addConst('"'
+ js.stringEscape(
node.ignoreCase ? node.value.toLowerCase() : node.value
)
+ '"'
);
expectedIndex = addConst([
'{',
'type: "literal",',
'value: "' + js.stringEscape(node.value) + '",',
'description: "'
+ js.stringEscape('"' + js.stringEscape(node.value) + '"')
+ '"',
'}'
].join(' '));
/*
* For case-sensitive strings the value must match the beginning of the
* remaining input exactly. As a result, we can use |ACCEPT_STRING| and
* save one |substr| call that would be needed if we used |ACCEPT_N|.
*/
return buildCondition(
node.ignoreCase
? [op.MATCH_STRING_IC, stringIndex]
: [op.MATCH_STRING, stringIndex],
node.ignoreCase
? [op.ACCEPT_N, node.value.length]
: [op.ACCEPT_STRING, stringIndex],
[op.FAIL, expectedIndex]
);
} else {
stringIndex = addConst('""');
return [op.PUSH, stringIndex];
}
},
"class": function(node) {
var regexp, regexpIndex, expectedIndex;
if (node.parts.length > 0) {
regexp = '/^['
+ (node.inverted ? '^' : '')
+ arrays.map(node.parts, function(part) {
return part instanceof Array
? js.regexpClassEscape(part[0])
+ '-'
+ js.regexpClassEscape(part[1])
: js.regexpClassEscape(part);
}).join('')
+ ']/' + (node.ignoreCase ? 'i' : '');
} else {
/*
* IE considers regexps /[]/ and /[^]/ as syntactically invalid, so we
* translate them into euqivalents it can handle.
*/
regexp = node.inverted ? '/^[\\S\\s]/' : '/^(?!)/';
}
regexpIndex = addConst(regexp);
expectedIndex = addConst([
'{',
'type: "class",',
'value: "' + js.stringEscape(node.rawText) + '",',
'description: "' + js.stringEscape(node.rawText) + '"',
'}'
].join(' '));
return buildCondition(
[op.MATCH_REGEXP, regexpIndex],
[op.ACCEPT_N, 1],
[op.FAIL, expectedIndex]
);
},
any: function() {
var expectedIndex = addConst('{ type: "any", description: "any character" }');
return buildCondition(
[op.MATCH_ANY],
[op.ACCEPT_N, 1],
[op.FAIL, expectedIndex]
);
}
});
generate(ast);
}
module.exports = generateBytecode;