You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

388 lines
14 KiB
JavaScript

"use strict";
// function Literal(sequence) {
// // sequence = string or bytes
// }
// function Bytes(count) {
// }
// function ByteRange(rangeStart, rangeEnd) {
// }
// function Characters(count) {
// }
// function CharacterRange(rangeStart, rangeEnd) {
// }
// function Has(parser) {
// // does not consume
// let hasMatch = yield peek(parser);
// return hasMatch;
// }
// function Until(parser) {
// // does not consume
// return function Until(input, index, _speculativeContext) {
// let indicesRead = 0;
// while (true) {
// let has = yield Has(parser);
// indicesRead += 1;
// }
// }
// }
// function UntilNot(parser) {
// // does not consume
// }
/*
loop through rules
manage context
trackPosition wrapper (document order relative to context-creating operations!) - positions are automatically emitted in streaming mode
parser namespace property on parsing functions
*/
// TODO: What if a parsing rule is dependent on some context that's parsed somewhere else in the input entirely? Isn't this fundamentally incompatible with the streaming parser paradigm?
// NOTE: Use sentinel objects to denote failure, to prevent throw/catch overhead, which can get problematic especially with heavy peek/test usage
// FIXME: Consider whether NotEnoughInput can be handled on a core level rather than in individual core operations, since it seems to always need to be propagated?
// FIXME: Need a way to mark end of input, to avoid the case where a trailing optional yields a NotEnoughInput even though it *should* have parsed the input end as the actual end, and concluded that the optional is not present.
const isGeneratorFunction = require("is-generator-function");
const isRegex = require("is-regex");
const asExpression = require("as-expression");
const matchValue = require("match-value");
const util = require("util");
const NoMatch = Symbol("protocolkit:NoMatch");
const NotEnoughInput = Symbol("protocolkit:NotEnoughInput");
// Propagates failure sentinels up the call stack without `throw`/`catch`, which can be slow when used heavily.
// FIXME: Use a Result type instead?
/**
 * Passes a failure sentinel through untouched, or transforms a successful result.
 *
 * @param {*} testResult - Result of a previously applied rule.
 * @param {function(*): *} produceResult - Called with `testResult` only when it is neither `NoMatch` nor `NotEnoughInput`.
 * @returns {*} `testResult` itself when it is a failure sentinel, otherwise whatever `produceResult` returns.
 */
function assertMatch(testResult, produceResult) {
	const isFailure = (testResult === NoMatch || testResult === NotEnoughInput);
	return isFailure ? testResult : produceResult(testResult);
}
/* TO DO:
- amount of characters
- amount of bytes
- regex
- return match with position metadata
FEATURES:
- streaming mode for all named matches
- grammar-defined streams for large payloads
*/
module.exports = {
NoMatch: NoMatch,
parse: function parse(input, rootParser) {
let currentInput = input; // TODO: Cut this down, switch to chunked API instead
let currentIndex = 0;
let inputLength = input.length;
let parserStack = [];
let inputIsEnded = true; // FIXME: Make this dynamic in streaming mode
function printIndex() {
return String(currentIndex).padStart(Math.ceil(Math.log10(input.length)));
}
function applyRule(rule) {
let currentFrame = {
startPosition: currentIndex,
rule: rule
};
parserStack.push(currentFrame);
if (process.env.DEBUG_PARSER) {
// console.log(parserStack);
console.log(`>> (${printIndex()})` + " ".repeat(parserStack.length) + util.inspect(rule, { colors: true, compact: true, breakLength: Infinity }));
}
// HACK
if (typeof rule === "string") {
rule = { __protocolKitInstruction: true, type: "literal", string: rule };
} else if (isRegex(rule)) {
rule = { __protocolKitInstruction: true, type: "regex", regex: rule };
}
// console.log({rule});
let result = asExpression(() => {
if (isGeneratorFunction(rule)) {
let returnValue; // FIXME: Is this correct?
let lastValue;
let done = false;
let generator = rule();
while (done === false) {
let subRule = generator.next(lastValue);
// console.log({subRule});
if (subRule.done === true) {
returnValue = subRule.value;
done = true;
} else {
lastValue = applyRule(subRule.value);
if (lastValue === NoMatch || lastValue === NotEnoughInput) {
// Don't bother parsing any further
return lastValue;
}
}
}
return returnValue;
} else if (typeof rule === "object" && rule.__protocolKitInstruction === true) {
return matchValue(rule.type, {
literal: () => {
let { string } = rule;
if (currentIndex + string.length > inputLength) {
return NotEnoughInput;
// throw new Error(`End of input reached`); // FIXME: Error type
} else if (input.slice(currentIndex, currentIndex + string.length) === string) {
currentIndex += string.length;
return string;
} else {
return NoMatch;
}
},
regex: () => {
let { regex } = rule;
// HACK: This is very much an imperfect approach. We're doing a (potentially large) string copy, and are letting it match at *any* position in the input, potentially wasting resurces if it turns out the match wasn't at index 0. This is unfortunate, but likely still the best option - the internal regex implementation is highly optimized (meaning a written-in-JS implementation is unlikely to beat it in performance), and the built-in implementation doesn't allow anchoring a match separately from the regex definition itself. We *could* transform the regex to have a start anchor, but then this would defeat optimization of repeatedly used regexes - this transformation step would be applied *every time the parsing rule is used*, instead of only once at JS parsing time. Should investigate whether there's any performant way of cacheing this work internally!
// FIXME: Disallow global-flagged regexes? As the internal starting index can throw off our logic
// FIXME: The approach we've chosen here is probably *really* unperformant when combining a regex literal with an `until` combinator!
let match = regex.exec(input.slice(currentIndex));
if (match?.index === 0) {
// Valid match, because it starts at the currentIndex
currentIndex += match[0].length;
// NOTE: We only return the groups, and not the full match, for consistency with the rest of the API - wholeMatch should be used for that (and the performance cost of that additional call should be negligible)
return {
$positional: match.slice(1),
... match.groups
};
} else {
return NoMatch;
}
},
endOfInput: () => {
// FIXME: Make this not order-sensitive in an `either`! Currently the NotEnoughInput marker *might* cause issues if this (zero-width) rule comes after nonzero-width rules? Need to investigate.
if (currentIndex === input.length) {
// FIXME: Make this NotEnoughInput-aware; there is probably a similar "exception from the core handling" problem here as in `until`
return true;
} else {
return NoMatch;
}
},
wholeMatch: () => {
let result = applyRule(rule.rule);
return assertMatch(result, () => {
return input.slice(currentFrame.startPosition, currentIndex);
});
},
either: () => {
let encounteredNotEnoughInput = false;
for (let option of rule.options) {
// FIXME: currentFrame.startPosition
let startPosition = currentIndex;
let result = applyRule(option);
if (result === NoMatch) {
// Restore index and try again with the next option
encounteredNotEnoughInput = encounteredNotEnoughInput || (result === NotEnoughInput);
currentIndex = startPosition;
continue;
} else {
// Don't restore index; the match has been consumed
// FIXME: This includes NotEnoughInput! As it warrants an immediate abort. Handling of NotEnoughInput markers should be moved to a centralized place instead. Also, we should figure out exactly how to retain the current parsing position when one is encountered, and whether eg. individual core operations need to manage cursor resets for this purpose, or whether the core can centrally handle that as well, eg. by retaining the parsing stack.
return result;
}
}
// None of the options matched
if (encounteredNotEnoughInput) {
// This means that at least one of the options returned a NotEnoughInput; which means that we couldn't actually determine whether that option *would* have matched or not, so the entire Either will be considered to need more input
return NotEnoughInput;
} else {
return NoMatch;
}
},
peek: () => {
let result = applyRule(rule.rule);
currentIndex = currentFrame.startPosition;
return result;
},
test: () => {
// FIXME: Test
// TODO: Share implementation with `peek`, maybe compose?
let result = applyRule(rule.rule);
currentIndex = currentFrame.startPosition;
if (result === NotEnoughInput) {
// Propagate this marker directly, as we will need to re-parse after receiving more input, and we cannot yet decide whether there is a match or not.
return NotEnoughInput;
} else if (result === NoMatch) {
return false;
} else {
return true;
}
},
zeroOrMore: () => {
let matches = [];
while (true) {
let result = applyRule(rule.rule);
if (result === NotEnoughInput) {
// Propagate, reparse later
return NotEnoughInput;
} else if (result === NoMatch) {
break;
} else {
matches.push(result);
}
}
return matches;
},
oneOrMore: () => {
// FIXME: Compose on zeroOrMore, but add a length assertion
let matches = applyRule({ __protocolKitInstruction: true, type: "zeroOrMore", rule: rule.rule });
if (matches === NotEnoughInput || matches.length > 0) {
return matches;
} else {
return NoMatch;
}
},
optional: () => {
let result = applyRule(rule.rule);
if (result === NotEnoughInput) {
return NotEnoughInput;
} else if (result === NoMatch) {
return undefined; // TODO: Or return `null` instead?
} else {
return result;
}
},
until: () => {
// FIXME: We're probably never actually triggering NotEnoughInput right now, due to how the loop logic works here?
// TODO: Build this on `peek` instead? Is there any actual benefit to that?
for (; currentIndex <= input.length; currentIndex++) {
let result = applyRule(rule.rule);
// FIXME: Fix the structure here, and figure out a way to deal with allowEnd without needing to special-case NotEnoughInput handling against inputIsEnded, because that should be a core concern only
if (result === NotEnoughInput) {
if (inputIsEnded && rule.allowEnd) {
// Fall through
break;
} else {
return NotEnoughInput;
}
} else if (result === NoMatch) {
continue;
} else {
// Fall through
break;
}
}
// We've consumed everything *up to* the match, but not the match itself
currentIndex -= 1;
return input.slice(currentFrame.startPosition, currentIndex);
}
// zeroOrMore: () => {
// },
// oneOrMore: () => {
// },
// either: () => {
// contextStack.push({
// index: currentIndex
// });
// // try each rule, try next on error, until success or final failure
// },
// optional: () => {
// // Also generates a context
// },
// peek: () => {
// // TODO: semantic difference between peek and either is that the either context should be thrown away after it fully completes (including nested rules)?
// // TODO: emit items (or not) option
// contextStack.push({
// index: currentIndex
// });
// // run parser as normal, but reset index afterwards -- return boolean true/false or an actual parsed item? maybe a separate test instruction for boolean result?
// },
// test: () => {
// },
// wholeMatch: () => {
// },
// trackPosition: () => {
// }
});
} else {
// FIXME: Do we need to implement anything else, or is this just a bug in the grammar?
throw new Error(`Unimplemented`);
}
});
if (process.env.DEBUG_PARSER) {
console.log(`!! (${printIndex()})` + " ".repeat(parserStack.length) + util.inspect(result, { colors: true, compact: true, breakLength: Infinity }));
}
parserStack.pop();
// HACK: Make this nicer, maybe visually represent this in the parse debug tree as well
if (inputIsEnded && result === NotEnoughInput) {
result = NoMatch;
}
return result;
}
let rootResult = applyRule(rootParser);
// FIXME: Detect when rules run out but end of input has not yet been reached, as this is an error (unless specified otherwise - need to figure out how to let grammar authors configure this maybe, for formats that allow trailing data, but that still need to be embeddable? or maybe that doesn't matter because when it's embedded, by definition the sub-parser will never be the root parser, and therefore there are always more higher-level rules left? maybe it's sufficient to just let the top-level parse call determine whether this is valid or not)
if (currentIndex < input.length) {
console.log("incomplete result:", rootResult);
throw new Error("Ran out of parsing rules before end of input");
}
if (rootResult === NoMatch) {
throw new Error(`No match`);
} else if (rootResult === NotEnoughInput) {
throw new Error("Not enough input");
} else {
return rootResult;
}
}
};