diff --git a/poc-generators-2/combinator/until-delimiter.js b/poc-generators-2/combinator/until-delimiter.js new file mode 100644 index 0000000..41512fc --- /dev/null +++ b/poc-generators-2/combinator/until-delimiter.js @@ -0,0 +1,11 @@ +"use strict"; + +const { until } = require("../operations"); + +module.exports = function untilDelimiter(rule) { + return function* () { + let value = yield until(rule); + yield rule; // Consume the delimiter + return value; + } +}; diff --git a/poc-generators-2/eoi.js b/poc-generators-2/eoi.js new file mode 100644 index 0000000..6595caf --- /dev/null +++ b/poc-generators-2/eoi.js @@ -0,0 +1,22 @@ +"use strict"; + +const { until, EndOfInput } = require("./operations"); +const { parse } = require("./index"); + +function* foo() { + yield "hello"; + yield EndOfInput; +} + +console.log(parse("hello", foo)); + +// console.log(parse("hello", until(EndOfInput))); + +function* bar() { + yield until("s"); + yield "s"; +} + +// console.log(parse("hellos", bar)); + + diff --git a/poc-generators-2/hls.js b/poc-generators-2/hls.js index b7384d1..c43b4fa 100644 --- a/poc-generators-2/hls.js +++ b/poc-generators-2/hls.js @@ -1,31 +1,29 @@ "use strict"; -const { either, wholeMatch, optional, oneOrMore, until, EndOfInput } = require("./operations"); - -function* Newline() { - yield "\n"; -} - -function* Digits() { - return yield /[0-9]+/; +const { either, oneOrMore, EndOfInput } = require("./operations"); +const Newline = require("./simple/lines/newline").LF; +const RestOfLine = require("./simple/lines/rest-of-line").LF; +const Integer = require("./simple/numeric/integer"); +const Decimal = require("./simple/numeric/decimal"); + +function lastItem(array) { + return array[array.length - 1]; } -function* Integer() { - return parseInt(yield wholeMatch(Digits)); -} - -function* Decimal() { - // NOTE: this gets converted to a floating point value! 
- let decimalString = yield wholeMatch(function* () { - yield Digits; - - yield optional(function* () { - yield "."; - yield Digits; - }); - }); - - return parseFloat(decimalString); +function line(strings, ... parsers) { + return function* () { + let parserResults = []; + + for (let i = 0; i < strings.length - 1; i++) { + yield strings[i]; + parserResults.push(yield parsers[i]); + } + + yield lastItem(strings); + yield Newline; + + return parserResults; + } } function* Playlist() { @@ -33,22 +31,13 @@ function* Playlist() { } function* MediaPlaylist() { - yield "#EXTM3U"; - yield Newline; - - yield "#EXT-X-TARGETDURATION:"; - let targetDuration = yield Integer; - yield Newline; + yield line`#EXTM3U`; + let [ targetDuration] = yield line`#EXT-X-TARGETDURATION:${Integer}`; yield Newline; let items = yield oneOrMore(function* () { - yield "#EXTINF:"; - let duration = yield Decimal; - yield ","; - yield Newline; - - let url = yield until(Newline); - yield either([ EndOfInput, Newline ]); + let [ duration ] = yield line`#EXTINF:${Decimal},`; + let url = yield RestOfLine; return { url, duration }; }); @@ -56,4 +45,4 @@ function* MediaPlaylist() { return { targetDuration, items }; } -module.exports = { Playlist }; \ No newline at end of file +module.exports = { Playlist }; diff --git a/poc-generators-2/index.js b/poc-generators-2/index.js index 31137ae..3177c50 100644 --- a/poc-generators-2/index.js +++ b/poc-generators-2/index.js @@ -160,31 +160,49 @@ module.exports = { return NoMatch; } }, - regex: () => { - let { regex } = rule; - - // HACK: This is very much an imperfect approach. We're doing a (potentially large) string copy, and are letting it match at *any* position in the input, potentially wasting resurces if it turns out the match wasn't at index 0. 
This is unfortunate, but likely still the best option - the internal regex implementation is highly optimized (meaning a written-in-JS implementation is unlikely to beat it in performance), and the built-in implementation doesn't allow anchoring a match separately from the regex definition itself. We *could* transform the regex to have a start anchor, but then this would defeat optimization of repeatedly used regexes - this transformation step would be applied *every time the parsing rule is used*, instead of only once at JS parsing time. Should investigate whether there's any performant way of cacheing this work internally! - // FIXME: Disallow global-flagged regexes? As the internal starting index can throw off our logic - // FIXME: The approach we've chosen here is probably *really* unperformant when combining a regex literal with an `until` combinator! - let match = regex.exec(input.slice(currentIndex)); - - if (match?.index === 0) { - // Valid match, because it starts at the currentIndex - currentIndex += match[0].length; - - // NOTE: We only return the groups, and not the full match, for consistency with the rest of the API - wholeMatch should be used for that (and the performance cost of that additional call should be negligible) - return { - $positional: match.slice(1), - ... match.groups - }; + characterRange: () => { + // FIXME: Unicode only! Need to check if we can assume that strings are always unicode, even when the source data was interpreted as another string encoding + let codepoint = input.codePointAt(currentIndex); + + // FIXME: Find a way to do this generically without breaking the EndOfInput operation + if (currentIndex === input.length) { + return NotEnoughInput; + } else if (codepoint >= rule.start && codepoint <= rule.end) { + // TODO: Should we return the codepoint in string form here? 
That will be unnecessary work in most cases where `wholeMatch` will be used + currentIndex += (codepoint > 0xFFFF) ? 2 : 1; // Astral codepoints span two UTF-16 code units, so skip the whole surrogate pair + return; } else { return NoMatch; } }, + // NOTE: Regex literals deprecated due to incompatibility with streaming/mixed-mode parsing + // regex: () => { + // let { regex } = rule; + + // // HACK: This is very much an imperfect approach. We're doing a (potentially large) string copy, and are letting it match at *any* position in the input, potentially wasting resurces if it turns out the match wasn't at index 0. This is unfortunate, but likely still the best option - the internal regex implementation is highly optimized (meaning a written-in-JS implementation is unlikely to beat it in performance), and the built-in implementation doesn't allow anchoring a match separately from the regex definition itself. We *could* transform the regex to have a start anchor, but then this would defeat optimization of repeatedly used regexes - this transformation step would be applied *every time the parsing rule is used*, instead of only once at JS parsing time. Should investigate whether there's any performant way of cacheing this work internally! + // // FIXME: Disallow global-flagged regexes? As the internal starting index can throw off our logic + // // FIXME: The approach we've chosen here is probably *really* unperformant when combining a regex literal with an `until` combinator! + // let match = regex.exec(input.slice(currentIndex)); + + // if (match?.index === 0) { + // // Valid match, because it starts at the currentIndex + // currentIndex += match[0].length; + + // // NOTE: We only return the groups, and not the full match, for consistency with the rest of the API - wholeMatch should be used for that (and the performance cost of that additional call should be negligible) + // return { + // $positional: match.slice(1), + // ... match.groups + // }; + // } else { + // return NoMatch; + // } + // }, endOfInput: () => { // FIXME: Make this not order-sensitive in an `either`! 
Currently the NotEnoughInput marker *might* cause issues if this (zero-width) rule comes after nonzero-width rules? Need to investigate. if (currentIndex === input.length) { // FIXME: Make this NotEnoughInput-aware; there is probably a similar "exception from the core handling" problem here as in `until` + currentIndex += 1; // We have consumed the 'virtual' end-of-input marker at the end of the input + // FIXME: Verify that this doesn't break anything elsewhere return true; } else { return NoMatch; @@ -287,7 +305,8 @@ module.exports = { until: () => { // FIXME: We're probably never actually triggering NotEnoughInput right now, due to how the loop logic works here? // TODO: Build this on `peek` instead? Is there any actual benefit to that? + // TODO: *Should* end of input be handled specially here, or should it be up to the parser itself to determine whether to stop there? Could be surprising behaviour for it to fail a match just because the input ended with only desired values, but it could also be surprising to expect it to match a delimiter without that delimiter actually being there (but wouldn't that be handled anyway by a subsequent rule for that delimiter?) 
for (; currentIndex <= input.length; currentIndex++) { let result = applyRule(rule.rule); // FIXME: Fix the structure here, and figure out a way to deal with allowEnd without needing to special-case NotEnoughInput handling against inputIsEnded, because that should be a core concern only diff --git a/poc-generators-2/operations.js b/poc-generators-2/operations.js index 4d6edc9..330afe0 100644 --- a/poc-generators-2/operations.js +++ b/poc-generators-2/operations.js @@ -1,5 +1,18 @@ "use strict"; +function parseCodepoint(input, nullValue) { + if (input === null) { + return nullValue; + } else if (typeof input === "number") { + return input; + } else if (typeof input === "string") { + return input.codePointAt(0); + } else { + // FIXME: Better validation here + throw new Error(`Invalid codepoint input`); + } +} + module.exports = { zeroOrMore: (rule) => { return { @@ -70,4 +83,16 @@ module.exports = { __protocolKitInstruction: true, type: "endOfInput" }, + characterRange: (start, end) => { + // NOTE: Both inclusive! 
+ let startCodepoint = parseCodepoint(start, 0); + let endCodepoint = parseCodepoint(end, Infinity); + + return { + __protocolKitInstruction: true, + type: "characterRange", + start: startCodepoint, + end: endCodepoint + }; + } }; diff --git a/poc-generators-2/simple/lines/carriage-return.js b/poc-generators-2/simple/lines/carriage-return.js new file mode 100644 index 0000000..c9f04f9 --- /dev/null +++ b/poc-generators-2/simple/lines/carriage-return.js @@ -0,0 +1,5 @@ +"use strict"; + +module.exports = function* CarriageReturn() { + yield "\r"; +}; diff --git a/poc-generators-2/simple/lines/line-feed.js b/poc-generators-2/simple/lines/line-feed.js new file mode 100644 index 0000000..a75ea11 --- /dev/null +++ b/poc-generators-2/simple/lines/line-feed.js @@ -0,0 +1,5 @@ +"use strict"; + +module.exports = function* LineFeed() { + yield "\n"; +}; diff --git a/poc-generators-2/simple/lines/newline.js b/poc-generators-2/simple/lines/newline.js new file mode 100644 index 0000000..0620d14 --- /dev/null +++ b/poc-generators-2/simple/lines/newline.js @@ -0,0 +1,13 @@ +"use strict"; + +const LineFeed = require("./line-feed"); +const CarriageReturn = require("./carriage-return"); + +module.exports = { + LF: LineFeed, + CR: CarriageReturn, + CRLF: function* () { + yield CarriageReturn; // A CRLF terminator is "\r\n": carriage return first, then line feed + yield LineFeed; + } +}; diff --git a/poc-generators-2/simple/lines/rest-of-line.js b/poc-generators-2/simple/lines/rest-of-line.js new file mode 100644 index 0000000..bfdddaa --- /dev/null +++ b/poc-generators-2/simple/lines/rest-of-line.js @@ -0,0 +1,17 @@ +"use strict"; + +const { either, EndOfInput } = require("../../operations"); +const Newline = require("./newline"); +const untilDelimiter = require("../../combinator/until-delimiter"); + +function createRestOfLine(newlineRule) { + return function* RestOfLine() { + return yield untilDelimiter(either([ newlineRule, EndOfInput ])); + } +} + +module.exports = { + LF: createRestOfLine(Newline.LF), + CR: createRestOfLine(Newline.CR), + CRLF: 
createRestOfLine(Newline.CRLF), +}; diff --git a/poc-generators-2/simple/numeric/decimal.js b/poc-generators-2/simple/numeric/decimal.js new file mode 100644 index 0000000..64c8f1c --- /dev/null +++ b/poc-generators-2/simple/numeric/decimal.js @@ -0,0 +1,18 @@ +"use strict"; + +const { wholeMatch, optional } = require("../../operations"); +const Digits = require("./digits"); + +module.exports = function* Decimal() { + // NOTE: this gets converted to a floating point value! + let decimalString = yield wholeMatch(function* () { + yield Digits; + + yield optional(function* () { + yield "."; + yield Digits; + }); + }); + + return parseFloat(decimalString); +}; diff --git a/poc-generators-2/simple/numeric/digit.js b/poc-generators-2/simple/numeric/digit.js new file mode 100644 index 0000000..6cf1cda --- /dev/null +++ b/poc-generators-2/simple/numeric/digit.js @@ -0,0 +1,7 @@ +"use strict"; + +const { characterRange } = require("../../operations"); + +module.exports = function* Digit() { + return yield characterRange("0", "9"); +}; diff --git a/poc-generators-2/simple/numeric/digits.js b/poc-generators-2/simple/numeric/digits.js new file mode 100644 index 0000000..1a881f4 --- /dev/null +++ b/poc-generators-2/simple/numeric/digits.js @@ -0,0 +1,8 @@ +"use strict"; + +const Digit = require("./digit"); +const { oneOrMore } = require("../../operations"); + +module.exports = function* Digits() { + return yield oneOrMore(Digit); +}; diff --git a/poc-generators-2/simple/numeric/integer.js b/poc-generators-2/simple/numeric/integer.js new file mode 100644 index 0000000..79b3654 --- /dev/null +++ b/poc-generators-2/simple/numeric/integer.js @@ -0,0 +1,8 @@ +"use strict"; + +const { wholeMatch } = require("../../operations"); +const Digits = require("./digits"); + +module.exports = function* Integer() { + return parseInt(yield wholeMatch(Digits), 10); +};