protocolkit/poc-generators-2/index.js

"use strict";

// function Literal(sequence) {
// 	// sequence = string or bytes
// }

// function Bytes(count) {

// }

// function ByteRange(rangeStart, rangeEnd) {

// }

// function Characters(count) {

// }

// function CharacterRange(rangeStart, rangeEnd) {

// }

// function Has(parser) {
// 	// does not consume
// 	let hasMatch = yield peek(parser);
// 	return hasMatch;
// }

// function Until(parser) {
// 	// does not consume
// 	return function Until(input, index, _speculativeContext) {
// 		let indicesRead = 0;

// 		while (true) {
// 			let has = yield Has(parser);
// 			indicesRead += 1;
// 		}
// 	}
// }

// function UntilNot(parser) {
// 	// does not consume
// }

/*
loop through rules
manage context
trackPosition wrapper (document order relative to context-creating operations!) - positions are automatically emitted in streaming mode
parser namespace property on parsing functions
*/


// TODO: What if a parsing rule is dependent on some context that's parsed somewhere else in the input entirely? Isn't this fundamentally incompatible with the streaming parser paradigm?
// NOTE: Use sentinel objects to denote failure, to prevent throw/catch overhead, which can get problematic especially with heavy peek/test usage
// FIXME: Consider whether NotEnoughInput can be handled on a core level rather than in individual core operations, since it seems to always need to be propagated?
// FIXME: Need a way to mark end of input, to avoid the case where a trailing optional yields a NotEnoughInput even though it *should* have parsed the input end as the actual end, and concluded that the optional is not present.

const isGeneratorFunction = require("is-generator-function");
const isRegex = require("is-regex");
const asExpression = require("as-expression");
const matchValue = require("match-value");
const util = require("util");
const yieldcore = require("./yieldcore");
const assert = require("assert");
const chalk = require("chalk");

const { NoMatch, NotEnoughInput } = require("./symbols");

/* TO DO:
- amount of characters
- amount of bytes
- regex
- return match with position metadata

FEATURES:
- streaming mode for all named matches
- grammar-defined streams for large payloads
*/

/**
 * Decides whether a stack frame should be hidden from debug output.
 *
 * A frame is treated as internal when its `instruction` is the string
 * "internal" and its `name` is a string starting with an underscore.
 * Setting the DEBUG_PARSER_INTERNAL environment variable to any non-empty
 * value disables hiding entirely; in that case the frame is not inspected.
 *
 * @param {object} frame - Stack frame; its `instruction` and `name` properties are read.
 * @returns {boolean} `true` when the frame is internal and should be hidden.
 */
function isInternalFrame(frame) {
	// Debug override: expose every frame, including the underscore-prefixed ones.
	if (process.env.DEBUG_PARSER_INTERNAL) {
		return false;
	}

	if (frame.instruction !== "internal") {
		return false;
	}

	const label = frame.name;
	return typeof label === "string" && label.startsWith("_");
}

/**
 * Counts the frames in `stack` that are not hidden internal frames.
 * The result is used as the indentation depth for debug log lines.
 *
 * @param {Array<object>} stack - The current frame stack.
 * @returns {number} Number of frames for which `isInternalFrame` returns false.
 */
function getStackSize(stack) {
	return stack.reduce((visibleCount, frame) => {
		return isInternalFrame(frame) ? visibleCount : visibleCount + 1;
	}, 0);
}

/**
 * Renders a stack frame's instruction as a single-line, colorized string for
 * debug output.
 *
 * Frames whose `instruction` is the string "internal" are shown as a blue
 * `[internal: <name>]` tag; any other instruction is passed through
 * `util.inspect` directly.
 *
 * @param {object} frame - Stack frame; reads `instruction` and, for internal frames, `name`.
 * @returns {string} The formatted representation.
 */
function formatFrameInstruction(frame) {
	// `breakLength: Infinity` keeps the inspected value on one line.
	const inspectOptions = { colors: true, compact: true, breakLength: Infinity };

	if (frame.instruction !== "internal") {
		return util.inspect(frame.instruction, inspectOptions);
	}

	const formattedRule = util.inspect(frame.name, inspectOptions);
	return chalk.blue(`[internal: ${formattedRule}]`);
}

module.exports = {
	NoMatch: NoMatch,
	parse: function parse(input, rootParser) {
		// NOTE: `state` is mutable from within core ops, `context` is not, but both may be updated externally (eg. for input shifting)
		let state = {
			currentInput: input,
			currentIndex: 0,
			isFullyLoaded: true // FIXME: Only set to true once the input stream has been fully consumed
		};

		function shiftInput(bytes) {
			// This function is called to remove a certain amount of bytes from the start of the currentInput; this can be done to reduce memory usage whenever the parser is at a point where backtracking is no longer possible.
			// FIXME: Also debuglog input shifts.
			assert(state.currentInput.length >= bytes); // FIXME: Better check
			state.currentInput = state.currentInput.slice(bytes);
			state.currentIndex -= bytes;
		}

		function formatIndex() {
			// This formats the current parsing index for display in debug messages, padded to the maximum possible width of any index, so that debug log entries remain visually aligned.
			return String(state.currentIndex).padStart(Math.ceil(Math.log10(state.currentInput.length)));
		}

		function* applyRule(rule, frame, stack) {
			// HACK: This converts yielded string literals into string literal matching rules.
			if (typeof rule === "string") {
				rule = { __protocolKitInstruction: true, type: "literal", string: rule };
			}

			assert(typeof rule === "object" && rule.__protocolKitInstruction === true); // FIXME: Better error

			let context = { startIndex: state.currentIndex };

			let handler = matchValue.literal(rule.type, require("./core-ops"));

			let result = yield yieldcore.Internal(handler(rule, state, context), rule);

			// FIXME: Restore index when retrying a match after NotEnoughInput

			return result;
		}

		let core = yieldcore.create(rootParser, {
			onYieldInstruction: function* (instruction, frame, stack) {
				// Parser yielded
				let result = yield* applyRule(instruction, frame, stack);

				if (result === NotEnoughInput) {
					throw new Error(`Ran out of input`);
				} else if (result === NoMatch) {
					throw new Error(`No match`);
				} else {
					return result;
				}
			},
			onReturn: function* (value) {
				// FIXME: Value produced, emit this at some point?, noop for now
				return value; // Return the produced value unchanged
			},
			onBeforeStackDecrease: function (stack, returnValue) {
				let frame = stack.at(-1);

				if (process.env.DEBUG_PARSER && !isInternalFrame(frame)) {
					console.log(`!! (${formatIndex()})` + "  ".repeat(getStackSize(stack)) + util.inspect(returnValue, { colors: true, compact: true, breakLength: Infinity }));
				}
			},
			onAfterStackIncrease: function (stack) {
				let frame = stack.at(-1);

				if (process.env.DEBUG_PARSER && !isInternalFrame(frame)) {
					console.log(`>> (${formatIndex()})` + "  ".repeat(getStackSize(stack)) + formatFrameInstruction(frame));
				}
			}
		});

		return core.resume();


		// let rootResult = applyRule(rootParser);

		// // FIXME: Detect when rules run out but end of input has not yet been reached, as this is an error (unless specified otherwise - need to figure out how to let grammar authors configure this maybe, for formats that allow trailing data, but that still need to be embeddable? or maybe that doesn't matter because when it's embedded, by definition the sub-parser will never be the root parser, and therefore there are always more higher-level rules left? maybe it's sufficient to just let the top-level parse call determine whether this is valid or not)
		// if (currentIndex < input.length) {
		// 	console.log("incomplete result:", rootResult);
		// 	throw new Error("Ran out of parsing rules before end of input");
		// }


		// if (rootResult === NoMatch) {
		// 	throw new Error(`No match`);
		// } else if (rootResult === NotEnoughInput) {
		// 	throw new Error("Not enough input");
		// } else {
		// 	return rootResult;
		// }
	}
};