You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

181 lines
7.0 KiB
JavaScript

"use strict";
/*
loop through rules
manage context
trackPosition wrapper (document order relative to context-creating operations!) - positions are automatically emitted in streaming mode
parser namespace property on parsing functions
*/
// TODO: What if a parsing rule is dependent on some context that's parsed somewhere else in the input entirely? Isn't this fundamentally incompatible with the streaming parser paradigm?
// NOTE: Use sentinel objects to denote failure, to prevent throw/catch overhead, which can get problematic especially with heavy peek/test usage
// FIXME: Consider whether NotEnoughInput can be handled on a core level rather than in individual core operations, since it seems to always need to be propagated?
// FIXME: Need a way to mark end of input, to avoid the case where a trailing optional yields a NotEnoughInput even though it *should* have parsed the input end as the actual end, and concluded that the optional is not present.
const PromiseTry = require("es6-promise-try");
const matchValue = require("match-value");
const util = require("util");
const yieldcore = require("yieldcore");
const assert = require("assert");
const chalk = require("chalk"); // FIXME: Replace with a better alternative
const { NoMatch, NotEnoughInput } = require("./symbols");
/* TO DO:
- Streaming input
- Streaming output
- binary input support
- amount of characters
- amount of bytes
- repeat {N,M} times
- return match with position metadata
- Improve organization of (debug formatting) code
FEATURES:
- streaming mode for all named matches
- grammar-defined streams for large payloads
*/
function isInternalFrame(frame) {
if (process.env.DEBUG_PARSER_INTERNAL) {
return false;
} else {
return (frame.instruction === "internal" && typeof frame.name === "string" && frame.name.startsWith("_"));
}
}
function singleLineInspect(value, options = {}) {
return util.inspect(value, { colors: true, compact: true, breakLength: Infinity, ... options });
}
function getStackSize(stack) {
return stack.filter((frame) => !isInternalFrame(frame)).length;
}
function formatFrameInstruction(frame) {
if (frame.instruction === "internal") {
if (frame.name?.__protocolKitInstruction === true) {
let { __protocolKitInstruction, ... conciseRule } = frame.name;
return chalk.blue(singleLineInspect(conciseRule));
} else {
return chalk.gray(`[internal: ${frame.name}]`);
}
} else {
return util.inspect(frame.instruction, { colors: true, compact: true, breakLength: Infinity });
}
}
function formatFrameReturnValue(returnValue) {
if (returnValue === NoMatch) {
return chalk.redBright(singleLineInspect(returnValue, { colors: false }));
} else {
return singleLineInspect(returnValue);
}
}
module.exports = {
NoMatch: NoMatch,
parse: function parse(input, rootParser) {
// NOTE: `state` is mutable from within core ops, `context` is not, but both may be updated externally (eg. for input shifting)
let state = {
currentInput: input,
currentIndex: 0,
isFullyLoaded: true // FIXME: Only set to true once the input stream has been fully consumed
};
function shiftInput(bytes) {
// This function is called to remove a certain amount of bytes from the start of the currentInput; this can be done to reduce memory usage whenever the parser is at a point where backtracking is no longer possible.
// FIXME: Also debuglog input shifts.
assert(state.currentInput.length >= bytes); // FIXME: Better check
state.currentInput = state.currentInput.slice(bytes);
state.currentIndex -= bytes;
}
function formatIndex() {
// This formats the current parsing index for display in debug messages, padded to the maximum possible width of any index, so that debug log entries remain visually aligned.
return String(state.currentIndex).padStart(Math.ceil(Math.log10(state.currentInput.length)));
}
function* applyRule(rule, frame, stack) {
// HACK: This converts yielded string literals into string literal matching rules.
if (typeof rule === "string") {
rule = { __protocolKitInstruction: true, type: "literal", string: rule };
}
assert(typeof rule === "object" && rule.__protocolKitInstruction === true); // FIXME: Better error
let context = { startIndex: state.currentIndex };
let handler = matchValue.literal(rule.type, require("./core-ops"));
let result = yield yieldcore.Internal(handler(rule, state, context), rule);
// FIXME: Restore index when retrying a match after NotEnoughInput
return result;
}
let core = yieldcore.create(rootParser, {
onYieldInstruction: function* (instruction, frame, stack) {
// Parser yielded
let result = yield* applyRule(instruction, frame, stack);
// console.log("previousFrame", (result === NoMatch), stack.at(-2));
// MARKER: Loading more input asynchronously
if (result === NotEnoughInput && state.isFullyLoaded === true) {
result = NoMatch;
}
if (result === NotEnoughInput) {
// TODO: Does this correctly come out of the parsing Promise?
throw new Error(`Ran out of input`);
} else if (result === NoMatch && stack.length > 1 && stack.at(-2).instruction !== "internal") {
// We can never return a NoMatch to a user-supplied generator! Instead, we forcibly NoMatch-fail every user-supplied generator up until the next internal instruction, to essentially emulate a `throw` that gets caught in internal instructions.
for (let i = stack.length - 2; i >= 0; i--) {
if (stack[i].instruction === "internal") {
break;
} else {
stack[i].forceValue = NoMatch;
}
}
} else {
return result;
}
},
onReturn: function* (value) {
// FIXME: Value produced, emit this at some point?, noop for now
return value; // Return the produced value unchanged
},
onBeforeStackDecrease: function (stack, returnValue) {
let frame = stack.at(-1);
if (process.env.DEBUG_PARSER && !isInternalFrame(frame)) {
console.log(`!! (${formatIndex()})` + chalk.gray("│ ".repeat(getStackSize(stack))) + formatFrameReturnValue(returnValue));
}
},
onAfterStackIncrease: function (stack) {
let frame = stack.at(-1);
if (process.env.DEBUG_PARSER && !isInternalFrame(frame)) {
console.log(`>> (${formatIndex()})` + chalk.gray("│ ".repeat(getStackSize(stack))) + formatFrameInstruction(frame));
}
}
});
return PromiseTry(() => {
return core.resume();
}).then((result) => {
if (result === NoMatch) {
throw new Error(`No match`);
} else {
return result;
}
});
// // FIXME: Detect when rules run out but end of input has not yet been reached, as this is an error (unless specified otherwise - need to figure out how to let grammar authors configure this maybe, for formats that allow trailing data, but that still need to be embeddable? or maybe that doesn't matter because when it's embedded, by definition the sub-parser will never be the root parser, and therefore there are always more higher-level rules left? maybe it's sufficient to just let the top-level parse call determine whether this is valid or not)
}
};