"use strict"; const surgeon = require("surgeon"); const pianola = require("pianola"); const url = require("url"); const flipArray = require("flip-array"); const unreachable = require("@joepie91/unreachable")("seekseek:scrape-documentation"); function stripLines(input) { return input .split("\n") .map((line) => line.trim()) .filter((line) => line.length > 0) .join("\n"); } function deduplicateSpaces(input) { return input.replace(/\s{2,}/g, " "); } function cleanText(input) { return deduplicateSpaces(stripLines(input)); } function extractCell({ optional }) { return function (input, [index], _options) { let numericIndex = (input.labels != null) // FIXME: Auto-detect number usage? // TODO: Improve performance by avoiding loops, via eg. a lookup Map? ? input.labels.indexOf(index) // FIXME: index parsing as number? : index; // HACK, refactor so that this conditional only actually occurs for label-based lookups if (numericIndex === -1) { if (optional) { return new pianola.FinalResultSentinel(undefined); } else { throw new Error(`Specified label '${index}' does not exist in table`); } } else if (numericIndex >= input.cells.length) { if (optional) { return new pianola.FinalResultSentinel(undefined); } else { throw new Error(`Tried to access cell ${numericIndex}, but record only has cells 0-${input.cells.length - 1}`); } } else { return input.cells[numericIndex]; } }; } module.exports = surgeon.default({ subroutines: { trace: function (input) { console.log("TRACE:", input); return input; }, strip: function (input) { return input.trim(); }, stripLines: stripLines, removeLines: function (input) { return deduplicateSpaces(input.replace(/\n/g, " ")); }, text: function (input, _values, options) { return cleanText(surgeon.readSubroutine(input, ["property", "textContent"], options)); }, readProp: function (input, [property], options) { return surgeon.readSubroutine(input, ["property", property], options); }, readAttr: function (input, [property], options) { return surgeon.readSubroutine(input, ["attribute", property], options); }, selectN: function (input, [selector, n], options) { return surgeon.selectSubroutine(input, [selector, `{${parseInt(n)+1},}[${n}]`], options); }, selectOne: function (input, [...selectors], options) { return surgeon.selectSubroutine(input, [selectors.join(" "), `{1}`], options); }, selectAny: function (input, [...selectors], options) { return surgeon.selectSubroutine(input, [selectors.join(" "), `{0,}`], options); }, selectMany: function (input, [ ... selectors ], options) { return surgeon.selectSubroutine(input, [selectors.join(" "), `{1,}`], options); }, pickOne: function (input) { if (!Array.isArray(input)) { throw new Error("Input must be an array"); } else if (input.length > 0) { throw new Error(`Input array must only contain one element, got ${input.length} elements instead`); } else { return input[0]; } }, result: function (input, [result], _options) { return result; }, index: function (input, [index]) { return input.eq(index); }, extractTable: function (input, [direction], options) { let {evaluator} = options; let cells = evaluator.querySelectorAll(input, "tr").map((row) => { return evaluator.querySelectorAll(row, "td, th"); }); let records; if (direction === "vertical") { records = cells; } else if (direction === "horizontal") { records = flipArray(cells); } else { throw new Error("Invalid table record direction specified. Must be one of: horizontal, vertical"); } return records.map((record) => { return {cells: record}; }); }, labelRow: function (input, [ index ], options) { let parsedIndex = parseInt(index); let normalizedIndex = (parsedIndex >= 0) ? parsedIndex : input.length - index + 1; if (normalizedIndex > input.length - 1) { throw new Error(`Invalid index ${index} specified for label row`); } else { let labelRow = input[normalizedIndex]; let labels = labelRow.cells.map((cell) => { // FIXME: Abstract this out in some way return cleanText(surgeon.readSubroutine(cell, [ "property", "textContent" ], options)); }); let dataRows = input .slice(0, normalizedIndex) .concat(input.slice(normalizedIndex + 1)); return dataRows.map((row) => { return { ... row, labels: labels }; }); } }, // TODO: `labelRow 0` to label stuff extractDefinitionList: function (input, _, options) { let { evaluator } = options; let definitions = []; let currentTerm; evaluator .querySelectorAll(input, "dt, dd") .forEach((node) => { // TODO: This appears to be a cheerio oddity? Need to double-check that this won't break in the future. Maybe should use a surgeon-specific API? let firstNode = node[0]; if (firstNode.name === "dt") { // Term currentTerm = node; } else if (firstNode.name === "dd") { // Definition if (currentTerm != null) { definitions.push({ // TODO: Support custom filters for terms (eg. to remove superscript text) term: cleanText(surgeon.readSubroutine(currentTerm, ["property", "textContent"], options)), definition: node }); currentTerm = undefined; } else { // Ignore any superfluous definitions } } else { throw unreachable(`Unknown node type ${firstNode.name}`); } }); // We conceptually represent a definition list as a single-row table with pre-labeled columns/cells return { labels: definitions.map((item) => item.term), cells: definitions.map((item) => item.definition) }; }, cell: extractCell({ optional: false }), maybeCell: extractCell({ optional: true }), isMatch: function (input, [regex], _options) { let matcher = new RegExp(regex); return matcher.test(input); }, binaryBytes: function (input) { let matcher = new RegExp(`^(${regex.number})\s?([kKmMgGtTpP])i?[bB]$`); let match = matcher.exec(input); if (match == null) { throw new Error(`Not recognized as a binary byte amount: ${input}`); } else { let number = parseInt(match[1]); let unit = match[2].toLowerCase(); if (unit === "k") { return number * 1024; } else if (unit === "m") { return number * 1024 * 1024; } else if (unit === "g") { return number * 1024 * 1024 * 1024; } else if (unit === "t") { return number * 1024 * 1024 * 1024 * 1024; } else if (unit === "p") { return number * 1024 * 1024 * 1024 * 1024 * 1024; } } }, KiB: function (input) { return parseInt(input) * 1024; }, MiB: function (input) { return parseInt(input) * 1024 * 1024; }, GiB: function (input) { return parseInt(input) * 1024 * 1024 * 1024; }, TiB: function (input) { return parseInt(input) * 1024 * 1024 * 1024 * 1024; }, integer: function (input) { return parseInt(input); }, number: function (input) { return parseFloat(input); }, kbps: function (input) { return parseInt(input) * 1000; }, mbps: function (input) { return parseInt(input) * 1000 * 1000; }, gbps: function (input) { return parseInt(input) * 1000 * 1000 * 1000; }, absoluteUrl: function (input, [base], _options) { return url.resolve(base, input); } } });;