You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

271 lines
7.8 KiB

"use strict";
const surgeon = require("surgeon");
const pianola = require("pianola");
const url = require("url");
const flipArray = require("flip-array");
const unreachable = require("@joepie91/unreachable")("seekseek:scrape-documentation");
/**
 * Trims every line of a multi-line string and drops the lines that end up empty.
 *
 * @param {string} input - Text that may span several lines.
 * @returns {string} The remaining trimmed lines, joined with "\n".
 */
function stripLines(input) {
	return input
		// The split/join steps were missing; without them `.map` is called on a string
		// and `cleanText` would not get a string back.
		.split("\n")
		.map((line) => line.trim())
		.filter((line) => line.length > 0)
		.join("\n");
}
/**
 * Collapses every run of two or more whitespace characters into a single space.
 * A lone whitespace character (including a single newline) is left as-is.
 *
 * @param {string} input - Text to normalize.
 * @returns {string} The text with whitespace runs collapsed.
 */
function deduplicateSpaces(input) {
	const whitespaceRun = /\s{2,}/g;

	return input.replace(whitespaceRun, " ");
}
/**
 * Normalizes scraped text: first trims/drops lines via `stripLines`, then collapses
 * whitespace runs via `deduplicateSpaces`.
 *
 * @param {string} input - Raw text.
 * @returns {string} The cleaned text.
 */
function cleanText(input) {
	const withoutBlankLines = stripLines(input);

	return deduplicateSpaces(withoutBlankLines);
}
/**
 * Builds a subroutine that picks one cell out of a record of the shape
 * `{ cells, labels? }`.
 *
 * When the record carries `labels`, the subroutine argument is looked up in that
 * list to find the cell position; otherwise the argument itself is used as the position.
 *
 * @param {{ optional: boolean }} settings - With `optional: true` a missing cell yields
 *   `new pianola.FinalResultSentinel(undefined)`; with `optional: false` it throws.
 * @returns {Function} A subroutine `(input, [index], options)` returning the selected cell.
 */
function extractCell({ optional }) {
	// Single place that decides how a lookup miss is reported.
	function handleMissing(message) {
		if (optional) {
			return new pianola.FinalResultSentinel(undefined);
		}

		throw new Error(message);
	}

	return function (input, [index], _options) {
		const numericIndex = (input.labels != null)
			// FIXME: Auto-detect number usage?
			// TODO: Improve performance by avoiding loops, via eg. a lookup Map?
			? input.labels.indexOf(index)
			// FIXME: index parsing as number?
			: index;

		// HACK, refactor so that this conditional only actually occurs for label-based lookups
		if (numericIndex === -1) {
			return handleMissing(`Specified label '${index}' does not exist in table`);
		}

		if (numericIndex >= input.cells.length) {
			return handleMissing(`Tried to access cell ${numericIndex}, but record only has cells 0-${input.cells.length - 1}`);
		}

		return input.cells[numericIndex];
	};
}
module.exports = surgeon.default({
subroutines: {
trace: function (input) {
console.log("TRACE:", input);
return input;
strip: function (input) {
return input.trim();
stripLines: stripLines,
removeLines: function (input) {
return deduplicateSpaces(input.replace(/\n/g, " "));
ignoreEmptyString: function (input) {
if (input === "") {
return null;
} else {
return input;
text: function (input, _values, options) {
return cleanText(surgeon.readSubroutine(input, ["property", "textContent"], options));
readProp: function (input, [property], options) {
return surgeon.readSubroutine(input, ["property", property], options);
readAttr: function (input, [property], options) {
return surgeon.readSubroutine(input, ["attribute", property], options);
selectN: function (input, [selector, n], options) {
return surgeon.selectSubroutine(input, [selector, `{${parseInt(n)+1},}[${n}]`], options);
selectMaybeN: function (input, [selector, n], options) {
let matches = surgeon.selectSubroutine(input, [selector, `{0,}`], options);
if (matches.length === 0) {
return new pianola.FinalResultSentinel(undefined);
} else {
return matches[n];
selectMaybeOne: function (input, [...selectors], options) {
return surgeon.selectSubroutine(input, [selectors.join(" "), `{0,1}`], options);
selectOne: function (input, [...selectors], options) {
return surgeon.selectSubroutine(input, [selectors.join(" "), `{1}`], options);
selectAny: function (input, [...selectors], options) {
return surgeon.selectSubroutine(input, [selectors.join(" "), `{0,}`], options);
selectMany: function (input, [ ... selectors ], options) {
return surgeon.selectSubroutine(input, [selectors.join(" "), `{1,}`], options);
pickOne: function (input) {
if (!Array.isArray(input)) {
throw new Error("Input must be an array");
} else if (input.length > 0) {
throw new Error(`Input array must only contain one element, got ${input.length} elements instead`);
} else {
return input[0];
result: function (input, [result], _options) {
return result;
index: function (input, [index]) {
return input.eq(index);
extractTable: function (input, [direction], options) {
let {evaluator} = options;
let cells = evaluator.querySelectorAll(input, "tr").map((row) => {
return evaluator.querySelectorAll(row, "td, th");
let records;
if (direction === "vertical") {
records = cells;
} else if (direction === "horizontal") {
records = flipArray(cells);
} else {
throw new Error("Invalid table record direction specified. Must be one of: horizontal, vertical");
return => {
return {cells: record};
labelRow: function (input, [ index ], options) {
let parsedIndex = parseInt(index);
let normalizedIndex = (parsedIndex >= 0)
? parsedIndex
: input.length - index + 1;
if (normalizedIndex > input.length - 1) {
throw new Error(`Invalid index ${index} specified for label row`);
} else {
let labelRow = input[normalizedIndex];
let labels = => {
// FIXME: Abstract this out in some way
return cleanText(surgeon.readSubroutine(cell, [ "property", "textContent" ], options));
let dataRows = input
.slice(0, normalizedIndex)
.concat(input.slice(normalizedIndex + 1));
return => {
return {
... row,
labels: labels
// TODO: `labelRow 0` to label stuff
extractDefinitionList: function (input, _, options) {
let { evaluator } = options;
let definitions = [];
let currentTerm;
.querySelectorAll(input, "dt, dd")
.forEach((node) => {
// TODO: This appears to be a cheerio oddity? Need to double-check that this won't break in the future. Maybe should use a surgeon-specific API?
let firstNode = node[0];
if ( === "dt") {
// Term
currentTerm = node;
} else if ( === "dd") {
// Definition
if (currentTerm != null) {
// TODO: Support custom filters for terms (eg. to remove superscript text)
term: cleanText(surgeon.readSubroutine(currentTerm, ["property", "textContent"], options)),
definition: node
currentTerm = undefined;
} else {
// Ignore any superfluous definitions
} else {
throw unreachable(`Unknown node type ${}`);
// We conceptually represent a definition list as a single-row table with pre-labeled columns/cells
return {
labels: => item.term),
cells: => item.definition)
// Strict cell accessor: built with `optional: false`, so a missing label or out-of-range cell throws.
cell: extractCell({ optional: false }),
// Lenient cell accessor: built with `optional: true`, so a missing label or out-of-range cell
// yields `new pianola.FinalResultSentinel(undefined)` instead of throwing.
maybeCell: extractCell({ optional: true }),
isMatch: function (input, [regex], _options) {
let matcher = new RegExp(regex);
return matcher.test(input);
binaryBytes: function (input) {
let matcher = new RegExp(`^(${regex.number})\s?([kKmMgGtTpP])i?[bB]$`);
let match = matcher.exec(input);
if (match == null) {
throw new Error(`Not recognized as a binary byte amount: ${input}`);
} else {
let number = parseInt(match[1]);
let unit = match[2].toLowerCase();
if (unit === "k") {
return number * 1024;
} else if (unit === "m") {
return number * 1024 * 1024;
} else if (unit === "g") {
return number * 1024 * 1024 * 1024;
} else if (unit === "t") {
return number * 1024 * 1024 * 1024 * 1024;
} else if (unit === "p") {
return number * 1024 * 1024 * 1024 * 1024 * 1024;
KiB: function (input) {
return parseInt(input) * 1024;
MiB: function (input) {
return parseInt(input) * 1024 * 1024;
GiB: function (input) {
return parseInt(input) * 1024 * 1024 * 1024;
TiB: function (input) {
return parseInt(input) * 1024 * 1024 * 1024 * 1024;
integer: function (input) {
return parseInt(input);
number: function (input) {
return parseFloat(input);
kbps: function (input) {
return parseInt(input) * 1000;
mbps: function (input) {
return parseInt(input) * 1000 * 1000;
gbps: function (input) {
return parseInt(input) * 1000 * 1000 * 1000;
absoluteUrl: function (input, [base], _options) {
return url.resolve(base, input);