master
Sven Slootweg 2 years ago
parent a78ed3b5ff
commit d34156d043

@ -0,0 +1,11 @@
"use strict";
const { until } = require("../operations");
module.exports = function untilDelimiter(rule) {
return function* () {
let value = yield until(rule);
yield rule; // Consume the delimiter
return value;
}
};

@ -0,0 +1,22 @@
"use strict";
const { until, EndOfInput } = require("./operations");
const { parse } = require("./index");
// Scratch rule for manual testing: yields the literal "hello" and then the EndOfInput
// instruction, in that order. It is passed to `parse` together with the input "hello" below.
function* foo() {
yield "hello";
yield EndOfInput;
}
console.log(parse("hello", foo));
// console.log(parse("hello", until(EndOfInput)));
// Scratch rule for manual testing: yields `until("s")` and then the literal "s" itself.
// NOTE: the only call that exercises this rule is currently commented out.
function* bar() {
yield until("s");
yield "s";
}
// console.log(parse("hellos", bar));

@ -1,31 +1,29 @@
"use strict";
const { either, wholeMatch, optional, oneOrMore, until, EndOfInput } = require("./operations");
// Rule that yields the literal "\n" (a single line feed) for the parser to match.
function* Newline() {
yield "\n";
}
function* Digits() {
return yield /[0-9]+/;
const { either, oneOrMore, EndOfInput } = require("./operations");
const Newline = require("./simple/lines/newline").LF;
const RestOfLine = require("./simple/lines/rest-of-line").LF;
const Integer = require("./simple/numeric/integer");
const Decimal = require("./simple/numeric/decimal");
/**
 * Returns the final element of an indexable, length-bearing value.
 *
 * @param array - The array (or array-like value) to read from.
 * @returns The element at index `length - 1`, or `undefined` when there is none.
 */
function lastItem(array) {
	const { length } = array;
	const finalIndex = length - 1;

	return array[finalIndex];
}
/**
 * Rule that matches a run of digits and produces it as a number.
 *
 * Yields `wholeMatch(Digits)` and converts the resulting text with `parseInt`.
 *
 * @returns {number} The parsed integer value.
 */
function* Integer() {
	// Always pass the radix: the matched text is meant to be read as base 10, and an explicit
	// radix keeps `parseInt` from ever applying its own prefix-based base detection.
	return parseInt(yield wholeMatch(Digits), 10);
}
function* Decimal() {
// NOTE: this gets converted to a floating point value!
let decimalString = yield wholeMatch(function* () {
yield Digits;
yield optional(function* () {
yield ".";
yield Digits;
});
});
return parseFloat(decimalString);
/**
 * Template-literal tag that describes one complete line of input.
 *
 * The produced rule alternates between the literal string segments and the interpolated
 * parsers (segment, parser, segment, parser, ...), then yields the final segment followed by
 * `Newline`.
 *
 * @param {string[]} strings - The literal segments of the template.
 * @param {...*} parsers - The interpolated rules, one between each pair of segments.
 * @returns {GeneratorFunction} A rule whose result is an array holding the result of each
 *   interpolated parser, in order.
 */
function line(strings, ... parsers) {
	return function* () {
		const collected = [];

		// Every segment except the last one is followed by an interpolated parser
		for (const [ position, literal ] of strings.slice(0, -1).entries()) {
			yield literal;

			const parsed = yield parsers[position];
			collected.push(parsed);
		}

		yield lastItem(strings);
		yield Newline;

		return collected;
	};
}
function* Playlist() {
@ -33,22 +31,13 @@ function* Playlist() {
}
function* MediaPlaylist() {
yield "#EXTM3U";
yield Newline;
yield "#EXT-X-TARGETDURATION:";
let targetDuration = yield Integer;
yield Newline;
yield line`#EXTM3U`;
let [ targetDuration] = yield line`#EXT-X-TARGETDURATION:${Integer}`;
yield Newline;
let items = yield oneOrMore(function* () {
yield "#EXTINF:";
let duration = yield Decimal;
yield ",";
yield Newline;
let url = yield until(Newline);
yield either([ EndOfInput, Newline ]);
let [ duration ] = yield line`#EXTINF:${Decimal},`;
let url = yield RestOfLine;
return { url, duration };
});
@ -56,4 +45,4 @@ function* MediaPlaylist() {
return { targetDuration, items };
}
module.exports = { Playlist };
module.exports = { Playlist };

@ -160,31 +160,49 @@ module.exports = {
return NoMatch;
}
},
regex: () => {
let { regex } = rule;
// HACK: This is very much an imperfect approach. We're doing a (potentially large) string copy, and are letting it match at *any* position in the input, potentially wasting resources if it turns out the match wasn't at index 0. This is unfortunate, but likely still the best option - the internal regex implementation is highly optimized (meaning a written-in-JS implementation is unlikely to beat it in performance), and the built-in implementation doesn't allow anchoring a match separately from the regex definition itself. We *could* transform the regex to have a start anchor, but then this would defeat optimization of repeatedly used regexes - this transformation step would be applied *every time the parsing rule is used*, instead of only once at JS parsing time. Should investigate whether there's any performant way of caching this work internally!
// FIXME: Disallow global-flagged regexes? As the internal starting index can throw off our logic
// FIXME: The approach we've chosen here is probably *really* unperformant when combining a regex literal with an `until` combinator!
let match = regex.exec(input.slice(currentIndex));
if (match?.index === 0) {
// Valid match, because it starts at the currentIndex
currentIndex += match[0].length;
// NOTE: We only return the groups, and not the full match, for consistency with the rest of the API - wholeMatch should be used for that (and the performance cost of that additional call should be negligible)
return {
$positional: match.slice(1),
... match.groups
};
characterRange: () => {
// FIXME: Unicode only! Need to check if we can assume that strings are always unicode, even when the source data was interpreted as another string encoding
let codepoint = input.codePointAt(currentIndex)
// FIXME: Find a way to do this generically without breaking the EndOfInput operation
if (currentIndex === input.length) {
return NotEnoughInput;
} else if (codepoint >= rule.start && codepoint <= rule.end) {
// TODO: Should we return the codepoint in string form here? That will be unnecessary work in most cases where `wholeMatch` will be used
currentIndex += 1;
return;
} else {
return NoMatch;
}
},
// NOTE: Regex literals deprecated due to incompatibility with streaming/mixed-mode parsing
// regex: () => {
// let { regex } = rule;
// // HACK: This is very much an imperfect approach. We're doing a (potentially large) string copy, and are letting it match at *any* position in the input, potentially wasting resources if it turns out the match wasn't at index 0. This is unfortunate, but likely still the best option - the internal regex implementation is highly optimized (meaning a written-in-JS implementation is unlikely to beat it in performance), and the built-in implementation doesn't allow anchoring a match separately from the regex definition itself. We *could* transform the regex to have a start anchor, but then this would defeat optimization of repeatedly used regexes - this transformation step would be applied *every time the parsing rule is used*, instead of only once at JS parsing time. Should investigate whether there's any performant way of caching this work internally!
// // FIXME: Disallow global-flagged regexes? As the internal starting index can throw off our logic
// // FIXME: The approach we've chosen here is probably *really* unperformant when combining a regex literal with an `until` combinator!
// let match = regex.exec(input.slice(currentIndex));
// if (match?.index === 0) {
// // Valid match, because it starts at the currentIndex
// currentIndex += match[0].length;
// // NOTE: We only return the groups, and not the full match, for consistency with the rest of the API - wholeMatch should be used for that (and the performance cost of that additional call should be negligible)
// return {
// $positional: match.slice(1),
// ... match.groups
// };
// } else {
// return NoMatch;
// }
// },
endOfInput: () => {
// FIXME: Make this not order-sensitive in an `either`! Currently the NotEnoughInput marker *might* cause issues if this (zero-width) rule comes after nonzero-width rules? Need to investigate.
if (currentIndex === input.length) {
// FIXME: Make this NotEnoughInput-aware; there is probably a similar "exception from the core handling" problem here as in `until`
currentIndex += 1; // We have consumed the 'virtual' end-of-input marker at the end of the input
// FIXME: Verify that this doesn't break anything elsewhere
return true;
} else {
return NoMatch;
@ -287,7 +305,9 @@ module.exports = {
until: () => {
// FIXME: We're probably never actually triggering NotEnoughInput right now, due to how the loop logic works here?
// TODO: Build this on `peek` instead? Is there any actual benefit to that?
// TODO: *Should* end of input be handled specially here, or should it be up to the parser itself to determine whether to stop there? Could be surprising behaviour for it to fail a match just because the input ended with only desired values, but it could also be surprising to expect it to match a delimiter without that delimiter actually being there (but wouldn't that be handled anyway by a subsequent rule for that delimiter?)
for (; currentIndex <= input.length; currentIndex++) {
console.log({currentIndex, length: input.length});
let result = applyRule(rule.rule);
// FIXME: Fix the structure here, and figure out a way to deal with allowEnd without needing to special-case NotEnoughInput handling against inputIsEnded, because that should be a core concern only

@ -1,5 +1,18 @@
"use strict";
/**
 * Normalizes a user-supplied range bound into a numeric codepoint.
 *
 * @param {null|number|string} input - `null` to request the fallback, a number that is used
 *   as-is, or a string of which the first codepoint is taken.
 * @param {number} nullValue - The value to return when `input` is `null`.
 * @returns {number} The codepoint for this bound.
 * @throws {Error} When `input` is NaN, an empty string, or any other unsupported type
 *   (including `undefined`).
 */
function parseCodepoint(input, nullValue) {
	if (input === null) {
		return nullValue;
	} else if (typeof input === "number") {
		// NaN compares false against everything, which would yield a range that silently never matches
		if (Number.isNaN(input)) {
			throw new Error(`Invalid codepoint input: NaN is not a codepoint`);
		}

		return input;
	} else if (typeof input === "string") {
		const codepoint = input.codePointAt(0);

		// An empty string has no first codepoint; `codePointAt` returns undefined for it
		if (codepoint === undefined) {
			throw new Error(`Invalid codepoint input: string must not be empty`);
		}

		return codepoint;
	} else {
		// FIXME: Better validation here
		throw new Error(`Invalid codepoint input`);
	}
}
module.exports = {
zeroOrMore: (rule) => {
return {
@ -70,4 +83,16 @@ module.exports = {
__protocolKitInstruction: true,
type: "endOfInput"
},
// Creates a `characterRange` instruction. Both bounds are inclusive; each is normalized through
// `parseCodepoint`, with 0 as the fallback for `start` and Infinity as the fallback for `end`.
characterRange: (start, end) => {
	return {
		__protocolKitInstruction: true,
		type: "characterRange",
		start: parseCodepoint(start, 0),
		end: parseCodepoint(end, Infinity)
	};
}
};

@ -0,0 +1,5 @@
"use strict";
// Rule that yields the literal "\r" (a single carriage return) for the parser to match.
module.exports = function* CarriageReturn() {
yield "\r";
};

@ -0,0 +1,5 @@
"use strict";
// Rule that yields the literal "\n" (a single line feed) for the parser to match.
module.exports = function* LineFeed() {
yield "\n";
};

@ -0,0 +1,13 @@
"use strict";
const LineFeed = require("./line-feed");
const CarriageReturn = require("./carriage-return");
module.exports = {
LF: LineFeed,
CR: CarriageReturn,
CRLF: function* () {
yield LineFeed;
yield CarriageReturn;
}
};

@ -0,0 +1,17 @@
"use strict";
const { either, EndOfInput } = require("../../operations");
const Newline = require("./newline");
const untilDelimiter = require("../../combinator/until-delimiter");
/**
 * Creates a "rest of the line" rule for one particular newline convention.
 *
 * @param newlineRule - The rule that recognizes the line ending.
 * @returns {GeneratorFunction} A rule that yields `untilDelimiter` with a delimiter of either
 *   `newlineRule` or `EndOfInput`, and returns the result of that step.
 */
function createRestOfLine(newlineRule) {
	return function* RestOfLine() {
		// A line stops at its newline, or at the end of the input when it is the final line
		const lineTerminator = either([ newlineRule, EndOfInput ]);
		const remainder = yield untilDelimiter(lineTerminator);

		return remainder;
	};
}
module.exports = {
LF: createRestOfLine(Newline.LF),
CR: createRestOfLine(Newline.CR),
CRLF: createRestOfLine(Newline.CRLF),
};

@ -0,0 +1,18 @@
"use strict";
const { wholeMatch, optional } = require("../../operations");
const Digits = require("./digits");
module.exports = function* Decimal() {
// NOTE: this gets converted to a floating point value!
let decimalString = yield wholeMatch(function* () {
yield Digits;
yield optional(function* () {
yield ".";
yield Digits;
});
});
return parseFloat(decimalString);
};

@ -0,0 +1,7 @@
"use strict";
const { characterRange } = require("../../operations");
module.exports = function* Digit() {
return yield characterRange("0", "9");
};

@ -0,0 +1,8 @@
"use strict";
const Digit = require("./digit");
const { oneOrMore } = require("../../operations");
module.exports = function* Digits() {
return yield oneOrMore(Digit);
};

@ -0,0 +1,8 @@
"use strict";
const { wholeMatch } = require("../../operations");
const Digits = require("./digits");
module.exports = function* Integer() {
return parseInt(yield wholeMatch(Digits));
};
Loading…
Cancel
Save