WIP

2 years ago · d34156d043
parent a78ed3b5ff
commit d34156d043
13 changed files with 203 additions and 55 deletions
--- a/poc-generators-2/combinator/until-delimiter.js
+++ b/poc-generators-2/combinator/until-delimiter.js
@ -0,0 +1,11 @@
+"use strict";
+
+const { until } = require("../operations");
+
+module.exports = function untilDelimiter(rule) {
+	return function* () {
+		let value = yield until(rule);
+		yield rule; // Consume the delimiter
+		return value;
+	}
+};
--- a/poc-generators-2/eoi.js
+++ b/poc-generators-2/eoi.js
@ -0,0 +1,22 @@
+"use strict";
+
+const { until, EndOfInput } = require("./operations");
+const { parse } = require("./index");
+
+function* foo() {
+	yield "hello";
+	yield EndOfInput;
+}
+
+console.log(parse("hello", foo));
+
+// console.log(parse("hello", until(EndOfInput)));
+
+function* bar() {
+	yield until("s");
+	yield "s";
+}
+
+// console.log(parse("hellos", bar));
+
+
--- a/poc-generators-2/hls.js
+++ b/poc-generators-2/hls.js
@ -1,31 +1,29 @@
 "use strict";

-const { either, wholeMatch, optional, oneOrMore, until, EndOfInput } = require("./operations");
-
-function* Newline() {
-	yield "\n";
+const { either, oneOrMore, EndOfInput } = require("./operations");
+const Newline = require("./simple/lines/newline").LF;
+const RestOfLine = require("./simple/lines/rest-of-line").LF;
+const Integer = require("./simple/numeric/integer");
+const Decimal = require("./simple/numeric/decimal");
+
+function lastItem(array) {
+	return array[array.length - 1];
 }

-function* Digits() {
-	return yield /[0-9]+/;
-}
+function line(strings, ... parsers) {
+	return function* () {
+		let parserResults = [];
 	
-function* Integer() {
-	return parseInt(yield wholeMatch(Digits));
+		for (let i = 0; i < strings.length - 1; i++) {
+			yield strings[i];
+			parserResults.push(yield parsers[i]);
 		}
 	
-function* Decimal() {
-	// NOTE: this gets converted to a floating point value!
-	let decimalString = yield wholeMatch(function* () {
-		yield Digits;
-
-		yield optional(function* () {
-			yield ".";
-			yield Digits;
-		});
-	});
+		yield lastItem(strings);
+		yield Newline;
 	
-	return parseFloat(decimalString);
+		return parserResults;
+	}
 }

 function* Playlist() {
@ -33,22 +31,13 @@ function* Playlist() {
 }

 function* MediaPlaylist() {
-	yield "#EXTM3U";
-	yield Newline;
-
-	yield "#EXT-X-TARGETDURATION:";
-	let targetDuration = yield Integer;
-	yield Newline;
+	yield line`#EXTM3U`;
+	let [ targetDuration] = yield line`#EXT-X-TARGETDURATION:${Integer}`;
 	yield Newline;

 	let items = yield oneOrMore(function* () {
-		yield "#EXTINF:";
-		let duration = yield Decimal;
-		yield ",";
-		yield Newline;
-
-		let url = yield until(Newline);
-		yield either([ EndOfInput, Newline ]);
+		let [ duration ] = yield line`#EXTINF:${Decimal},`;
+		let url = yield RestOfLine;

 		return { url, duration };
 	});
--- a/poc-generators-2/index.js
+++ b/poc-generators-2/index.js
@ -160,31 +160,49 @@ module.exports = {
 								return NoMatch;
 							}
 						},
-						regex: () => {
-							let { regex } = rule;
-
-							// HACK: This is very much an imperfect approach. We're doing a (potentially large) string copy, and are letting it match at *any* position in the input, potentially wasting resurces if it turns out the match wasn't at index 0. This is unfortunate, but likely still the best option - the internal regex implementation is highly optimized (meaning a written-in-JS implementation is unlikely to beat it in performance), and the built-in implementation doesn't allow anchoring a match separately from the regex definition itself. We *could* transform the regex to have a start anchor, but then this would defeat optimization of repeatedly used regexes - this transformation step would be applied *every time the parsing rule is used*, instead of only once at JS parsing time. Should investigate whether there's any performant way of cacheing this work internally!
-							// FIXME: Disallow global-flagged regexes? As the internal starting index can throw off our logic
-							// FIXME: The approach we've chosen here is probably *really* unperformant when combining a regex literal with an `until` combinator!
-							let match = regex.exec(input.slice(currentIndex));
-
-							if (match?.index === 0) {
-								// Valid match, because it starts at the currentIndex
-								currentIndex += match[0].length;
-
-								// NOTE: We only return the groups, and not the full match, for consistency with the rest of the API - wholeMatch should be used for that (and the performance cost of that additional call should be negligible)
-								return {
-									$positional: match.slice(1),
-									... match.groups
-								};
+						characterRange: () => {
+							// Matches a single codepoint within the inclusive range [rule.start, rule.end]; produces no value on a match.
+							// FIXME: Unicode only! Need to check if we can assume that strings are always unicode, even when the source data was interpreted as another string encoding
+							let codepoint = input.codePointAt(currentIndex);
+							// FIXME: Find a way to do this generically without breaking the EndOfInput operation
+							if (currentIndex === input.length) {
+								return NotEnoughInput;
+							} else if (codepoint >= rule.start && codepoint <= rule.end) {
+								// TODO: Should we return the codepoint in string form here? That will be unnecessary work in most cases where `wholeMatch` will be used
+								currentIndex += (codepoint > 0xFFFF) ? 2 : 1; // Astral codepoints occupy two UTF-16 code units (a surrogate pair) in the string
+								return;
 							} else {
 								return NoMatch;
 							}
 						},
+						// NOTE: Regex literals deprecated due to incompatibility with streaming/mixed-mode parsing
+						// regex: () => {
+						// 	let { regex } = rule;
+
+						// 	// HACK: This is very much an imperfect approach. We're doing a (potentially large) string copy, and are letting it match at *any* position in the input, potentially wasting resources if it turns out the match wasn't at index 0. This is unfortunate, but likely still the best option - the internal regex implementation is highly optimized (meaning a written-in-JS implementation is unlikely to beat it in performance), and the built-in implementation doesn't allow anchoring a match separately from the regex definition itself. We *could* transform the regex to have a start anchor, but then this would defeat optimization of repeatedly used regexes - this transformation step would be applied *every time the parsing rule is used*, instead of only once at JS parsing time. Should investigate whether there's any performant way of caching this work internally!
+						// 	// FIXME: Disallow global-flagged regexes? As the internal starting index can throw off our logic
+						// 	// FIXME: The approach we've chosen here is probably *really* unperformant when combining a regex literal with an `until` combinator!
+						// 	let match = regex.exec(input.slice(currentIndex));
+
+						// 	if (match?.index === 0) {
+						// 		// Valid match, because it starts at the currentIndex
+						// 		currentIndex += match[0].length;
+
+						// 		// NOTE: We only return the groups, and not the full match, for consistency with the rest of the API - wholeMatch should be used for that (and the performance cost of that additional call should be negligible)
+						// 		return {
+						// 			$positional: match.slice(1),
+						// 			... match.groups
+						// 		};
+						// 	} else {
+						// 		return NoMatch;
+						// 	}
+						// },
 						endOfInput: () => {
 							// FIXME: Make this not order-sensitive in an `either`! Currently the NotEnoughInput marker *might* cause issues if this (zero-width) rule comes after nonzero-width rules? Need to investigate.
 							if (currentIndex === input.length) {
 								// FIXME: Make this NotEnoughInput-aware; there is probably a similar "exception from the core handling" problem here as in `until`
+								currentIndex += 1; // We have consumed the 'virtual' end-of-input marker at the end of the input
+								// FIXME: Verify that this doesn't break anything elsewhere
 								return true;
 							} else {
 								return NoMatch;
@ -287,7 +305,9 @@ module.exports = {
 						until: () => {
 							// FIXME: We're probably never actually triggering NotEnoughInput right now, due to how the loop logic works here?
 							// TODO: Build this on `peek` instead? Is there any actual benefit to that?
+							// TODO: *Should* end of input be handled specially here, or should it be up to the parser itself to determine whether to stop there? Could be surprising behaviour for it to fail a match just because the input ended with only desired values, but it could also be surprising to expect it to match a delimiter without that delimiter actually being there (but wouldn't that be handled anyway by a subsequent rule for that delimiter?)
 							for (; currentIndex <= input.length; currentIndex++) {
+								console.log({currentIndex, length: input.length});
 								let result = applyRule(rule.rule);

 								// FIXME: Fix the structure here, and figure out a way to deal with allowEnd without needing to special-case NotEnoughInput handling against inputIsEnded, because that should be a core concern only
--- a/poc-generators-2/operations.js
+++ b/poc-generators-2/operations.js
@ -1,5 +1,18 @@
 "use strict";

+function parseCodepoint(input, nullValue) { // Normalizes a range bound (number, string, or null/undefined for an open bound) to a numeric codepoint
+	if (input == null) { // Loose equality on purpose: an omitted (undefined) bound is treated like an explicit null
+		return nullValue;
+	} else if (typeof input === "number") {
+		return input;
+	} else if (typeof input === "string") {
+		return input.codePointAt(0); // Only the first codepoint of the string is used
+	} else {
+		// FIXME: Better validation here
+		throw new Error(`Invalid codepoint input`);
+	}
+}
+
 module.exports = {
 	zeroOrMore: (rule) => {
 		return {
@ -70,4 +83,16 @@ module.exports = {
 		__protocolKitInstruction: true,
 		type: "endOfInput"
 	},
+	characterRange: (start, end) => {
+		// NOTE: Both inclusive!
+		let startCodepoint = parseCodepoint(start, 0);
+		let endCodepoint = parseCodepoint(end, Infinity);
+
+		return {
+			__protocolKitInstruction: true,
+			type: "characterRange",
+			start: startCodepoint,
+			end: endCodepoint
+		};
+	}
 };
--- a/poc-generators-2/simple/lines/carriage-return.js
+++ b/poc-generators-2/simple/lines/carriage-return.js
@ -0,0 +1,5 @@
+"use strict";
+
+module.exports = function* CarriageReturn() {
+	yield "\r";
+};
--- a/poc-generators-2/simple/lines/line-feed.js
+++ b/poc-generators-2/simple/lines/line-feed.js
@ -0,0 +1,5 @@
+"use strict";
+
+module.exports = function* LineFeed() {
+	yield "\n";
+};
--- a/poc-generators-2/simple/lines/newline.js
+++ b/poc-generators-2/simple/lines/newline.js
@ -0,0 +1,13 @@
+"use strict";
+
+const LineFeed = require("./line-feed");
+const CarriageReturn = require("./carriage-return");
+
+module.exports = {
+	LF: LineFeed,
+	CR: CarriageReturn,
+	CRLF: function* () { // Matches the two-character "\r\n" line ending
+		yield CarriageReturn; // "\r" comes first; the previous LF-then-CR order matched "\n\r" instead
+		yield LineFeed;
+	}
+};
--- a/poc-generators-2/simple/lines/rest-of-line.js
+++ b/poc-generators-2/simple/lines/rest-of-line.js
@ -0,0 +1,17 @@
+"use strict";
+
+const { either, EndOfInput } = require("../../operations");
+const Newline = require("./newline");
+const untilDelimiter = require("../../combinator/until-delimiter");
+
+function createRestOfLine(newlineRule) {
+	return function* RestOfLine() {
+		return yield untilDelimiter(either([ newlineRule, EndOfInput ]));
+	}
+}
+
+module.exports = {
+	LF: createRestOfLine(Newline.LF),
+	CR: createRestOfLine(Newline.CR),
+	CRLF: createRestOfLine(Newline.CRLF),
+};
--- a/poc-generators-2/simple/numeric/decimal.js
+++ b/poc-generators-2/simple/numeric/decimal.js
@ -0,0 +1,18 @@
+"use strict";
+
+const { wholeMatch, optional } = require("../../operations");
+const Digits = require("./digits");
+
+module.exports = function* Decimal() {
+	// NOTE: this gets converted to a floating point value!
+	let decimalString = yield wholeMatch(function* () {
+		yield Digits;
+
+		yield optional(function* () {
+			yield ".";
+			yield Digits;
+		});
+	});
+
+	return parseFloat(decimalString);
+};
--- a/poc-generators-2/simple/numeric/digit.js
+++ b/poc-generators-2/simple/numeric/digit.js
@ -0,0 +1,7 @@
+"use strict";
+
+const { characterRange } = require("../../operations");
+
+module.exports = function* Digit() {
+	return yield characterRange("0", "9");
+};
--- a/poc-generators-2/simple/numeric/digits.js
+++ b/poc-generators-2/simple/numeric/digits.js
@ -0,0 +1,8 @@
+"use strict";
+
+const Digit = require("./digit");
+const { oneOrMore } = require("../../operations");
+
+module.exports = function* Digits() {
+	return yield oneOrMore(Digit);
+};
--- a/poc-generators-2/simple/numeric/integer.js
+++ b/poc-generators-2/simple/numeric/integer.js
@ -0,0 +1,8 @@
+"use strict";
+
+const { wholeMatch } = require("../../operations");
+const Digits = require("./digits");
+
+module.exports = function* Integer() { // Parses one or more digits and returns them as a number
+	return parseInt(yield wholeMatch(Digits), 10); // Explicit radix: the matched digits are always decimal
+};