// FIXME: Unicode only! Need to check if we can assume that strings are always unicode, even when the source data was interpreted as another string encoding
// Restore index and try again with the next option
state.currentIndex=context.startIndex;
continue;
}else{
// Don't restore index; the match has been consumed
// FIXME: This includes NotEnoughInput! As it warrants an immediate abort. Handling of NotEnoughInput markers should be moved to a centralized place instead. Also, we should figure out exactly how to retain the current parsing position when one is encountered, and whether eg. individual core operations need to manage cursor resets for this purpose, or whether the core can centrally handle that as well, eg. by retaining the parsing stack.
// FIXME: Make this not order-sensitive in an `either`! Currently the NotEnoughInput marker *might* cause issues if this (zero-width) rule comes after nonzero-width rules? Need to investigate.
// NOTE: `state` is mutable from within core ops, `context` is not, but both may be updated externally (eg. for input shifting)
// Parser cursor state: the input currently held, the index within it, and whether all input is loaded.
let state = {
	currentInput: input,
	currentIndex: 0,
	// FIXME: Only set to true once the input stream has been fully consumed
	isFullyLoaded: true
};
functionshiftInput(bytes){
// This function is called to remove a certain amount of bytes from the start of the currentInput; this can be done to reduce memory usage whenever the parser is at a point where backtracking is no longer possible.
// This formats the current parsing index for display in debug messages, padded to the maximum possible width of any index, so that debug log entries remain visually aligned.
// FIXME: Unicode only! Need to check if we can assume that strings are always unicode, even when the source data was interpreted as another string encoding
letcodepoint=input.codePointAt(currentIndex)
// FIXME: Find a way to do this generically without breaking the EndOfInput operation
// TODO: Should we return the codepoint in string form here? That will be unnecessary work in most cases where `wholeMatch` will be used
currentIndex+=1;
return;
}else{
returnNoMatch;
}
},
// NOTE: Regex literals deprecated due to incompatibility with streaming/mixed-mode parsing
// regex: () => {
// let { regex } = rule;
// // HACK: This is very much an imperfect approach. We're doing a (potentially large) string copy, and are letting it match at *any* position in the input, potentially wasting resources if it turns out the match wasn't at index 0. This is unfortunate, but likely still the best option - the internal regex implementation is highly optimized (meaning a written-in-JS implementation is unlikely to beat it in performance), and the built-in implementation doesn't allow anchoring a match separately from the regex definition itself. We *could* transform the regex to have a start anchor, but then this would defeat optimization of repeatedly used regexes - this transformation step would be applied *every time the parsing rule is used*, instead of only once at JS parsing time. Should investigate whether there's any performant way of caching this work internally!
// // FIXME: Disallow global-flagged regexes? As the internal starting index can throw off our logic
// // FIXME: The approach we've chosen here is probably *really* unperformant when combining a regex literal with an `until` combinator!
// let match = regex.exec(input.slice(currentIndex));
// if (match?.index === 0) {
// // Valid match, because it starts at the currentIndex
// currentIndex += match[0].length;
// // NOTE: We only return the groups, and not the full match, for consistency with the rest of the API - wholeMatch should be used for that (and the performance cost of that additional call should be negligible)
// return {
// $positional: match.slice(1),
// ... match.groups
// };
// } else {
// return NoMatch;
// }
// },
endOfInput:()=>{
// FIXME: Make this not order-sensitive in an `either`! Currently the NotEnoughInput marker *might* cause issues if this (zero-width) rule comes after nonzero-width rules? Need to investigate.
if(currentIndex===input.length){
// FIXME: Make this NotEnoughInput-aware; there is probably a similar "exception from the core handling" problem here as in `until`
currentIndex+=1;// We have consumed the 'virtual' end-of-input marker at the end of the input
// FIXME: Verify that this doesn't break anything elsewhere
// Don't restore index; the match has been consumed
// FIXME: This includes NotEnoughInput! As it warrants an immediate abort. Handling of NotEnoughInput markers should be moved to a centralized place instead. Also, we should figure out exactly how to retain the current parsing position when one is encountered, and whether eg. individual core operations need to manage cursor resets for this purpose, or whether the core can centrally handle that as well, eg. by retaining the parsing stack.
returnresult;
}
}
// None of the options matched
if(encounteredNotEnoughInput){
// This means that at least one of the options returned a NotEnoughInput; which means that we couldn't actually determine whether that option *would* have matched or not, so the entire Either will be considered to need more input
returnNotEnoughInput;
}else{
returnNoMatch;
}
},
peek:()=>{
letresult=applyRule(rule.rule);
currentIndex=currentFrame.startPosition;
returnresult;
},
test:()=>{
// FIXME: Test
// TODO: Share implementation with `peek`, maybe compose?
letresult=applyRule(rule.rule);
currentIndex=currentFrame.startPosition;
if(result===NotEnoughInput){
// Propagate this marker directly, as we will need to re-parse after receiving more input, and we cannot yet decide whether there is a match or not.
returnNotEnoughInput;
}elseif(result===NoMatch){
returnfalse;
}else{
returntrue;
}
},
zeroOrMore:()=>{
letmatches=[];
while(true){
letresult=applyRule(rule.rule);
if(result===NotEnoughInput){
// Propagate, reparse later
returnNotEnoughInput;
}elseif(result===NoMatch){
break;
}else{
matches.push(result);
}
}
returnmatches;
},
oneOrMore:()=>{
// FIXME: Compose on zeroOrMore, but add a length assertion
returnundefined;// TODO: Or return `null` instead?
}else{
returnresult;
}
},
until:()=>{
// FIXME: We're probably never actually triggering NotEnoughInput right now, due to how the loop logic works here?
// TODO: Build this on `peek` instead? Is there any actual benefit to that?
// TODO: *Should* end of input be handled specially here, or should it be up to the parser itself to determine whether to stop there? Could be surprising behaviour for it to fail a match just because the input ended with only desired values, but it could also be surprising to expect it to match a delimiter without that delimiter actually being there (but wouldn't that be handled anyway by a subsequent rule for that delimiter?)
for(;currentIndex<=input.length;currentIndex++){
console.log({currentIndex,length:input.length});
letresult=applyRule(rule.rule);
// FIXME: Fix the structure here, and figure out a way to deal with allowEnd without needing to special-case NotEnoughInput handling against inputIsEnded, because that should be a core concern only
if(result===NotEnoughInput){
if(inputIsEnded&&rule.allowEnd){
// Fall through
break;
}else{
returnNotEnoughInput;
}
}elseif(result===NoMatch){
continue;
}else{
// Fall through
break;
}
}
// We've consumed everything *up to* the match, but not the match itself
// // try each rule, try next on error, until success or final failure
// },
// optional: () => {
// // Also generates a context
// },
// peek: () => {
// // TODO: semantic difference between peek and either is that the either context should be thrown away after it fully completes (including nested rules)?
// // TODO: emit items (or not) option
// contextStack.push({
// index: currentIndex
// });
// // run parser as normal, but reset index afterwards -- return boolean true/false or an actual parsed item? maybe a separate test instruction for boolean result?
// },
// test: () => {
// },
// wholeMatch: () => {
// },
// trackPosition: () => {
// }
});
}else{
// FIXME: Do we need to implement anything else, or is this just a bug in the grammar?
// HACK: Make this nicer, maybe visually represent this in the parse debug tree as well
if(inputIsEnded&&result===NotEnoughInput){
result=NoMatch;
}
// FIXME: Restore index when retrying a match after NotEnoughInput
returnresult;
}
letrootResult=applyRule(rootParser);
// FIXME: Detect when rules run out but end of input has not yet been reached, as this is an error (unless specified otherwise - need to figure out how to let grammar authors configure this maybe, for formats that allow trailing data, but that still need to be embeddable? or maybe that doesn't matter because when it's embedded, by definition the sub-parser will never be the root parser, and therefore there are always more higher-level rules left? maybe it's sufficient to just let the top-level parse call determine whether this is valid or not)
if(currentIndex<input.length){
console.log("incomplete result:",rootResult);
thrownewError("Ran out of parsing rules before end of input");
// // FIXME: Detect when rules run out but end of input has not yet been reached, as this is an error (unless specified otherwise - need to figure out how to let grammar authors configure this maybe, for formats that allow trailing data, but that still need to be embeddable? or maybe that doesn't matter because when it's embedded, by definition the sub-parser will never be the root parser, and therefore there are always more higher-level rules left? maybe it's sufficient to just let the top-level parse call determine whether this is valid or not)
// if (currentIndex < input.length) {
// console.log("incomplete result:", rootResult);
// throw new Error("Ran out of parsing rules before end of input");
// NOTE: This does *not* allow leading newlines, because a line template may start parsing in the middle of an existing line, and allowing leading newlines would break the intended behaviour of only parsing that same line