|
|
|
"use strict";
|
|
|
|
|
|
|
|
const surgeon = require("surgeon");
|
|
|
|
const pianola = require("pianola");
|
|
|
|
const url = require("url");
|
|
|
|
const flipArray = require("flip-array");
|
|
|
|
const unreachable = require("@joepie91/unreachable")("seekseek:scrape-documentation");
|
|
|
|
|
|
|
|
// Trims every line of `input` and drops the lines that end up empty.
// Returns the surviving lines re-joined with "\n".
function stripLines(input) {
	const keptLines = [];

	for (const rawLine of input.split("\n")) {
		const trimmedLine = rawLine.trim();

		if (trimmedLine.length > 0) {
			keptLines.push(trimmedLine);
		}
	}

	return keptLines.join("\n");
}
|
|
|
|
|
|
|
|
// Collapses every run of two or more whitespace characters into a single
// space. A lone whitespace character (including a lone newline) is kept as-is.
function deduplicateSpaces(input) {
	const whitespaceRun = /\s{2,}/;
	const segments = input.split(whitespaceRun);

	return segments.join(" ");
}
|
|
|
|
|
|
|
|
// Normalizes text in two steps: first trims each line and drops empty lines
// (stripLines), then collapses remaining whitespace runs (deduplicateSpaces).
function cleanText(input) {
	const withoutBlankLines = stripLines(input);

	return deduplicateSpaces(withoutBlankLines);
}
|
|
|
|
|
|
|
|
// Builds a subroutine that picks a single cell out of a record of the shape
// `{ cells, labels }` (where `labels` may be absent).
//
// - When the record has `labels`, the subroutine argument is looked up in
//   `labels` and the cell at the same position is returned.
// - Otherwise the argument is treated as a zero-based position into `cells`.
//
// `optional` controls what happens when the requested label does not exist or
// the position is past the end of `cells`: when true, a
// `pianola.FinalResultSentinel(undefined)` is returned; when false, an Error
// is thrown. A positional argument that is not a non-negative integer always
// throws an Error.
function extractCell({ optional }) {
	// Shared handling for a cell that is absent from the record.
	function handleMissingCell(message) {
		if (optional) {
			return new pianola.FinalResultSentinel(undefined);
		} else {
			throw new Error(message);
		}
	}

	return function (input, [index], _options) {
		let numericIndex;

		if (input.labels != null) {
			// FIXME: Auto-detect number usage?
			// TODO: Improve performance by avoiding loops, via eg. a lookup Map?
			numericIndex = input.labels.indexOf(index);

			// Only a label-based lookup can legitimately produce -1 here
			if (numericIndex === -1) {
				return handleMissingCell(`Specified label '${index}' does not exist in table`);
			}
		} else {
			// Accept both an actual number and its string form
			numericIndex = (typeof index === "number")
				? index
				: Number.parseInt(index, 10);

			// Without this check, a negative or non-numeric position would silently yield `undefined`
			if (!Number.isInteger(numericIndex) || numericIndex < 0) {
				throw new Error(`Invalid cell index '${index}' specified`);
			}
		}

		if (numericIndex >= input.cells.length) {
			return handleMissingCell(`Tried to access cell ${numericIndex}, but record only has cells 0-${input.cells.length - 1}`);
		}

		return input.cells[numericIndex];
	};
}
|
|
|
|
|
|
|
|
module.exports = surgeon.default({
|
|
|
|
subroutines: {
|
|
|
|
trace: function (input) {
|
|
|
|
console.log("TRACE:", input);
|
|
|
|
return input;
|
|
|
|
},
|
|
|
|
strip: function (input) {
|
|
|
|
return input.trim();
|
|
|
|
},
|
|
|
|
stripLines: stripLines,
|
|
|
|
removeLines: function (input) {
|
|
|
|
return deduplicateSpaces(input.replace(/\n/g, " "));
|
|
|
|
},
|
|
|
|
ignoreEmptyString: function (input) {
|
|
|
|
if (input === "") {
|
|
|
|
return null;
|
|
|
|
} else {
|
|
|
|
return input;
|
|
|
|
}
|
|
|
|
},
|
|
|
|
text: function (input, _values, options) {
|
|
|
|
return cleanText(surgeon.readSubroutine(input, ["property", "textContent"], options));
|
|
|
|
},
|
|
|
|
readProp: function (input, [property], options) {
|
|
|
|
return surgeon.readSubroutine(input, ["property", property], options);
|
|
|
|
},
|
|
|
|
readAttr: function (input, [property], options) {
|
|
|
|
return surgeon.readSubroutine(input, ["attribute", property], options);
|
|
|
|
},
|
|
|
|
selectN: function (input, [selector, n], options) {
|
|
|
|
return surgeon.selectSubroutine(input, [selector, `{${parseInt(n)+1},}[${n}]`], options);
|
|
|
|
},
|
|
|
|
selectMaybeN: function (input, [selector, n], options) {
|
|
|
|
let matches = surgeon.selectSubroutine(input, [selector, `{0,}`], options);
|
|
|
|
|
|
|
|
if (matches.length === 0) {
|
|
|
|
return new pianola.FinalResultSentinel(undefined);
|
|
|
|
} else {
|
|
|
|
return matches[n];
|
|
|
|
}
|
|
|
|
},
|
|
|
|
selectMaybeOne: function (input, [...selectors], options) {
|
|
|
|
return surgeon.selectSubroutine(input, [selectors.join(" "), `{0,1}`], options);
|
|
|
|
},
|
|
|
|
selectOne: function (input, [...selectors], options) {
|
|
|
|
return surgeon.selectSubroutine(input, [selectors.join(" "), `{1}`], options);
|
|
|
|
},
|
|
|
|
selectAny: function (input, [...selectors], options) {
|
|
|
|
return surgeon.selectSubroutine(input, [selectors.join(" "), `{0,}`], options);
|
|
|
|
},
|
|
|
|
selectMany: function (input, [ ... selectors ], options) {
|
|
|
|
return surgeon.selectSubroutine(input, [selectors.join(" "), `{1,}`], options);
|
|
|
|
},
|
|
|
|
pickOne: function (input) {
|
|
|
|
if (!Array.isArray(input)) {
|
|
|
|
throw new Error("Input must be an array");
|
|
|
|
} else if (input.length > 0) {
|
|
|
|
throw new Error(`Input array must only contain one element, got ${input.length} elements instead`);
|
|
|
|
} else {
|
|
|
|
return input[0];
|
|
|
|
}
|
|
|
|
},
|
|
|
|
result: function (input, [result], _options) {
|
|
|
|
return result;
|
|
|
|
},
|
|
|
|
index: function (input, [index]) {
|
|
|
|
return input.eq(index);
|
|
|
|
},
|
|
|
|
extractTable: function (input, [direction], options) {
|
|
|
|
let {evaluator} = options;
|
|
|
|
|
|
|
|
let cells = evaluator.querySelectorAll(input, "tr").map((row) => {
|
|
|
|
return evaluator.querySelectorAll(row, "td, th");
|
|
|
|
});
|
|
|
|
|
|
|
|
let records;
|
|
|
|
|
|
|
|
if (direction === "vertical") {
|
|
|
|
records = cells;
|
|
|
|
} else if (direction === "horizontal") {
|
|
|
|
records = flipArray(cells);
|
|
|
|
} else {
|
|
|
|
throw new Error("Invalid table record direction specified. Must be one of: horizontal, vertical");
|
|
|
|
}
|
|
|
|
|
|
|
|
return records.map((record) => {
|
|
|
|
return {cells: record};
|
|
|
|
});
|
|
|
|
},
|
|
|
|
labelRow: function (input, [ index ], options) {
|
|
|
|
let parsedIndex = parseInt(index);
|
|
|
|
let normalizedIndex = (parsedIndex >= 0)
|
|
|
|
? parsedIndex
|
|
|
|
: input.length - index + 1;
|
|
|
|
|
|
|
|
if (normalizedIndex > input.length - 1) {
|
|
|
|
throw new Error(`Invalid index ${index} specified for label row`);
|
|
|
|
} else {
|
|
|
|
let labelRow = input[normalizedIndex];
|
|
|
|
|
|
|
|
let labels = labelRow.cells.map((cell) => {
|
|
|
|
// FIXME: Abstract this out in some way
|
|
|
|
return cleanText(surgeon.readSubroutine(cell, [ "property", "textContent" ], options));
|
|
|
|
});
|
|
|
|
|
|
|
|
let dataRows = input
|
|
|
|
.slice(0, normalizedIndex)
|
|
|
|
.concat(input.slice(normalizedIndex + 1));
|
|
|
|
|
|
|
|
return dataRows.map((row) => {
|
|
|
|
return {
|
|
|
|
... row,
|
|
|
|
labels: labels
|
|
|
|
};
|
|
|
|
});
|
|
|
|
}
|
|
|
|
},
|
|
|
|
// TODO: `labelRow 0` to label stuff
|
|
|
|
extractDefinitionList: function (input, _, options) {
|
|
|
|
let { evaluator } = options;
|
|
|
|
|
|
|
|
let definitions = [];
|
|
|
|
let currentTerm;
|
|
|
|
|
|
|
|
evaluator
|
|
|
|
.querySelectorAll(input, "dt, dd")
|
|
|
|
.forEach((node) => {
|
|
|
|
// TODO: This appears to be a cheerio oddity? Need to double-check that this won't break in the future. Maybe should use a surgeon-specific API?
|
|
|
|
let firstNode = node[0];
|
|
|
|
|
|
|
|
if (firstNode.name === "dt") {
|
|
|
|
// Term
|
|
|
|
currentTerm = node;
|
|
|
|
} else if (firstNode.name === "dd") {
|
|
|
|
// Definition
|
|
|
|
if (currentTerm != null) {
|
|
|
|
definitions.push({
|
|
|
|
// TODO: Support custom filters for terms (eg. to remove superscript text)
|
|
|
|
term: cleanText(surgeon.readSubroutine(currentTerm, ["property", "textContent"], options)),
|
|
|
|
definition: node
|
|
|
|
});
|
|
|
|
|
|
|
|
currentTerm = undefined;
|
|
|
|
} else {
|
|
|
|
// Ignore any superfluous definitions
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
throw unreachable(`Unknown node type ${firstNode.name}`);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
// We conceptually represent a definition list as a single-row table with pre-labeled columns/cells
|
|
|
|
return {
|
|
|
|
labels: definitions.map((item) => item.term),
|
|
|
|
cells: definitions.map((item) => item.definition)
|
|
|
|
};
|
|
|
|
},
|
|
|
|
cell: extractCell({ optional: false }),
|
|
|
|
maybeCell: extractCell({ optional: true }),
|
|
|
|
isMatch: function (input, [regex], _options) {
|
|
|
|
let matcher = new RegExp(regex);
|
|
|
|
return matcher.test(input);
|
|
|
|
},
|
|
|
|
binaryBytes: function (input) {
|
|
|
|
let matcher = new RegExp(`^(${regex.number})\s?([kKmMgGtTpP])i?[bB]$`);
|
|
|
|
let match = matcher.exec(input);
|
|
|
|
|
|
|
|
if (match == null) {
|
|
|
|
throw new Error(`Not recognized as a binary byte amount: ${input}`);
|
|
|
|
} else {
|
|
|
|
let number = parseInt(match[1]);
|
|
|
|
let unit = match[2].toLowerCase();
|
|
|
|
|
|
|
|
if (unit === "k") {
|
|
|
|
return number * 1024;
|
|
|
|
} else if (unit === "m") {
|
|
|
|
return number * 1024 * 1024;
|
|
|
|
} else if (unit === "g") {
|
|
|
|
return number * 1024 * 1024 * 1024;
|
|
|
|
} else if (unit === "t") {
|
|
|
|
return number * 1024 * 1024 * 1024 * 1024;
|
|
|
|
} else if (unit === "p") {
|
|
|
|
return number * 1024 * 1024 * 1024 * 1024 * 1024;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
},
|
|
|
|
KiB: function (input) {
|
|
|
|
return parseInt(input) * 1024;
|
|
|
|
},
|
|
|
|
MiB: function (input) {
|
|
|
|
return parseInt(input) * 1024 * 1024;
|
|
|
|
},
|
|
|
|
GiB: function (input) {
|
|
|
|
return parseInt(input) * 1024 * 1024 * 1024;
|
|
|
|
},
|
|
|
|
TiB: function (input) {
|
|
|
|
return parseInt(input) * 1024 * 1024 * 1024 * 1024;
|
|
|
|
},
|
|
|
|
integer: function (input) {
|
|
|
|
return parseInt(input);
|
|
|
|
},
|
|
|
|
number: function (input) {
|
|
|
|
return parseFloat(input);
|
|
|
|
},
|
|
|
|
kbps: function (input) {
|
|
|
|
return parseInt(input) * 1000;
|
|
|
|
},
|
|
|
|
mbps: function (input) {
|
|
|
|
return parseInt(input) * 1000 * 1000;
|
|
|
|
},
|
|
|
|
gbps: function (input) {
|
|
|
|
return parseInt(input) * 1000 * 1000 * 1000;
|
|
|
|
},
|
|
|
|
absoluteUrl: function (input, [base], _options) {
|
|
|
|
return url.resolve(base, input);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
});;
|