// HACK: This is very much an imperfect approach. We're doing a (potentially large) string copy, and are letting it match at *any* position in the input, potentially wasting resources if it turns out the match wasn't at index 0. This is unfortunate, but likely still the best option - the internal regex implementation is highly optimized (meaning a written-in-JS implementation is unlikely to beat it in performance), and the built-in implementation doesn't allow anchoring a match separately from the regex definition itself. We *could* transform the regex to have a start anchor, but then this would defeat optimization of repeatedly used regexes - this transformation step would be applied *every time the parsing rule is used*, instead of only once at JS parsing time. Should investigate whether there's any performant way of caching this work internally!
// FIXME: Disallow global-flagged regexes? As the internal starting index can throw off our logic
// FIXME: The approach we've chosen here is probably *really* unperformant when combining a regex literal with an `until` combinator!
letmatch=regex.exec(input.slice(currentIndex));
if(match?.index===0){
// Valid match, because it starts at the currentIndex
currentIndex+=match[0].length;
// NOTE: We only return the groups, and not the full match, for consistency with the rest of the API - wholeMatch should be used for that (and the performance cost of that additional call should be negligible)
return{
$positional:match.slice(1),
...match.groups
};
// characterRange: reads the code point at `currentIndex` in `input`; on the success path it advances `currentIndex` by one and returns without a value, otherwise the result is NoMatch.
characterRange:()=>{
// FIXME: Unicode only! Need to check if we can assume that strings are always unicode, even when the source data was interpreted as another string encoding
// Code point at the current cursor position; `codePointAt` yields `undefined` when `currentIndex` is at or past the end of `input`.
letcodepoint=input.codePointAt(currentIndex)
// NOTE(review): the `if` that the `}else{` below belongs to is not visible here - presumably it tests `codepoint` against the rule's range; confirm against the full file.
// FIXME: Find a way to do this generically without breaking the EndOfInput operation
// TODO: Should we return the codepoint in string form here? That will be unnecessary work in most cases where `wholeMatch` will be used
// NOTE(review): `codePointAt` can return a code point above U+FFFF, which occupies two UTF-16 code units, while the cursor advances by only one unit here - confirm whether that is intended.
currentIndex+=1;
// Success path: returns no value (undefined).
return;
}else{
// Failure path: no cursor advance happens in this branch.
returnNoMatch;
}
},
// NOTE: Regex literals deprecated due to incompatibility with streaming/mixed-mode parsing
// regex: () => {
// let { regex } = rule;
// // HACK: This is very much an imperfect approach. We're doing a (potentially large) string copy, and are letting it match at *any* position in the input, potentially wasting resources if it turns out the match wasn't at index 0. This is unfortunate, but likely still the best option - the internal regex implementation is highly optimized (meaning a written-in-JS implementation is unlikely to beat it in performance), and the built-in implementation doesn't allow anchoring a match separately from the regex definition itself. We *could* transform the regex to have a start anchor, but then this would defeat optimization of repeatedly used regexes - this transformation step would be applied *every time the parsing rule is used*, instead of only once at JS parsing time. Should investigate whether there's any performant way of caching this work internally!
// // FIXME: Disallow global-flagged regexes? As the internal starting index can throw off our logic
// // FIXME: The approach we've chosen here is probably *really* unperformant when combining a regex literal with an `until` combinator!
// let match = regex.exec(input.slice(currentIndex));
// if (match?.index === 0) {
// // Valid match, because it starts at the currentIndex
// currentIndex += match[0].length;
// // NOTE: We only return the groups, and not the full match, for consistency with the rest of the API - wholeMatch should be used for that (and the performance cost of that additional call should be negligible)
// return {
// $positional: match.slice(1),
// ... match.groups
// };
// } else {
// return NoMatch;
// }
// },
// endOfInput: matches only when the cursor sits exactly at `input.length`; on a match it moves the cursor one past the end and the result is `true`, otherwise the result is NoMatch.
endOfInput:()=>{
// FIXME: Make this not order-sensitive in an `either`! Currently the NotEnoughInput marker *might* cause issues if this (zero-width) rule comes after nonzero-width rules? Need to investigate.
if(currentIndex===input.length){
// FIXME: Make this NotEnoughInput-aware; there is probably a similar "exception from the core handling" problem here as in `until`
currentIndex+=1;// We have consumed the 'virtual' end-of-input marker at the end of the input
// After this increment `currentIndex` equals `input.length + 1`, so the equality check above no longer holds if this rule is applied again at the new position.
// FIXME: Verify that this doesn't break anything elsewhere
returntrue;
}else{
// Cursor is not at the end of the input: nothing is consumed in this branch.
returnNoMatch;
@ -287,7 +305,9 @@ module.exports = {
until:()=>{
// FIXME: We're probably never actually triggering NotEnoughInput right now, due to how the loop logic works here?
// TODO: Build this on `peek` instead? Is there any actual benefit to that?
// TODO: *Should* end of input be handled specially here, or should it be up to the parser itself to determine whether to stop there? Could be surprising behaviour for it to fail a match just because the input ended with only desired values, but it could also be surprising to expect it to match a delimiter without that delimiter actually being there (but wouldn't that be handled anyway by a subsequent rule for that delimiter?)
for(;currentIndex<=input.length;currentIndex++){
console.log({currentIndex,length:input.length});
letresult=applyRule(rule.rule);
// FIXME: Fix the structure here, and figure out a way to deal with allowEnd without needing to special-case NotEnoughInput handling against inputIsEnded, because that should be a core concern only