You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

252 lines
7.3 KiB
JavaScript

"use strict";
const surgeon = require("surgeon");
const pianola = require("pianola");
const url = require("url");
const flipArray = require("flip-array");
const unreachable = require("@joepie91/unreachable")("seekseek:scrape-documentation");
/**
 * Trims every line of a multi-line string and drops the lines that end up empty.
 *
 * @param {string} input - Text whose lines are separated by "\n".
 * @returns {string} The non-empty, trimmed lines re-joined with "\n".
 */
function stripLines(input) {
	const keptLines = [];

	for (const rawLine of input.split("\n")) {
		const trimmedLine = rawLine.trim();

		// Lines that consisted of whitespace only are discarded entirely
		if (trimmedLine.length > 0) {
			keptLines.push(trimmedLine);
		}
	}

	return keptLines.join("\n");
}
/**
 * Collapses every run of two or more whitespace characters into a single space.
 * A lone whitespace character (including a single newline) is left as it is.
 *
 * @param {string} input - Text to normalize.
 * @returns {string} The text with whitespace runs collapsed.
 */
function deduplicateSpaces(input) {
	const whitespaceRun = /\s{2,}/g;

	return input.replace(whitespaceRun, " ");
}
/**
 * Normalizes scraped text: first trims each line and removes empty lines,
 * then collapses any remaining whitespace runs into single spaces.
 *
 * @param {string} input - Raw text.
 * @returns {string} The normalized text.
 */
function cleanText(input) {
	const withoutBlankLines = stripLines(input);
	const normalized = deduplicateSpaces(withoutBlankLines);

	return normalized;
}
/**
 * Builds a subroutine that picks one cell out of a table record.
 *
 * A record is an object with a `cells` array and, optionally, a `labels`
 * array. When `labels` is present the subroutine argument is treated as a
 * label and looked up in it; otherwise it is treated as a zero-based cell
 * index (given either as a number or as a numeric string).
 *
 * @param {object} settings
 * @param {boolean} settings.optional - When true, a missing cell yields a
 *   `pianola.FinalResultSentinel(undefined)` instead of throwing.
 * @returns {function(object, Array, object=): *} The cell subroutine.
 * @throws {Error} (from the returned subroutine, non-optional mode only) when
 *   the label does not exist, the index is not a non-negative integer, or the
 *   record has no cell at that position.
 */
function extractCell({ optional }) {
	// Single place that decides between "soft" and "hard" failure
	function cellMissing(message) {
		if (optional) {
			return new pianola.FinalResultSentinel(undefined);
		} else {
			throw new Error(message);
		}
	}

	// Subroutine arguments may arrive as strings; only accept real integers
	// so that values like "length" or "" can never be used as a property name
	function parseCellIndex(index) {
		if (typeof index === "number") {
			return index;
		} else if (typeof index === "string" && index.trim() !== "") {
			return Number(index);
		} else {
			return NaN;
		}
	}

	return function (input, [index], _options) {
		// FIXME: Auto-detect number usage?
		// TODO: Improve performance by avoiding loops, via eg. a lookup Map?
		const hasLabels = (input.labels != null);

		const numericIndex = (hasLabels)
			? input.labels.indexOf(index)
			: parseCellIndex(index);

		if (hasLabels && numericIndex === -1) {
			return cellMissing(`Specified label '${index}' does not exist in table`);
		} else if (!Number.isInteger(numericIndex) || numericIndex < 0) {
			return cellMissing(`Invalid cell index '${index}' specified; must be a non-negative integer`);
		} else if (numericIndex >= input.cells.length) {
			return cellMissing(`Tried to access cell ${numericIndex}, but record only has cells 0-${input.cells.length - 1}`);
		} else {
			return input.cells[numericIndex];
		}
	};
}
module.exports = surgeon.default({
subroutines: {
trace: function (input) {
console.log("TRACE:", input);
return input;
},
strip: function (input) {
return input.trim();
},
stripLines: stripLines,
removeLines: function (input) {
return deduplicateSpaces(input.replace(/\n/g, " "));
},
text: function (input, _values, options) {
return cleanText(surgeon.readSubroutine(input, ["property", "textContent"], options));
},
readProp: function (input, [property], options) {
return surgeon.readSubroutine(input, ["property", property], options);
},
readAttr: function (input, [property], options) {
return surgeon.readSubroutine(input, ["attribute", property], options);
},
selectN: function (input, [selector, n], options) {
return surgeon.selectSubroutine(input, [selector, `{${parseInt(n)+1},}[${n}]`], options);
},
selectOne: function (input, [...selectors], options) {
return surgeon.selectSubroutine(input, [selectors.join(" "), `{1}`], options);
},
selectAny: function (input, [...selectors], options) {
return surgeon.selectSubroutine(input, [selectors.join(" "), `{0,}`], options);
},
selectMany: function (input, [ ... selectors ], options) {
return surgeon.selectSubroutine(input, [selectors.join(" "), `{1,}`], options);
},
pickOne: function (input) {
if (!Array.isArray(input)) {
throw new Error("Input must be an array");
} else if (input.length > 0) {
throw new Error(`Input array must only contain one element, got ${input.length} elements instead`);
} else {
return input[0];
}
},
result: function (input, [result], _options) {
return result;
},
index: function (input, [index]) {
return input.eq(index);
},
extractTable: function (input, [direction], options) {
let {evaluator} = options;
let cells = evaluator.querySelectorAll(input, "tr").map((row) => {
return evaluator.querySelectorAll(row, "td, th");
});
let records;
if (direction === "vertical") {
records = cells;
} else if (direction === "horizontal") {
records = flipArray(cells);
} else {
throw new Error("Invalid table record direction specified. Must be one of: horizontal, vertical");
}
return records.map((record) => {
return {cells: record};
});
},
labelRow: function (input, [ index ], options) {
let parsedIndex = parseInt(index);
let normalizedIndex = (parsedIndex >= 0)
? parsedIndex
: input.length - index + 1;
if (normalizedIndex > input.length - 1) {
throw new Error(`Invalid index ${index} specified for label row`);
} else {
let labelRow = input[normalizedIndex];
let labels = labelRow.cells.map((cell) => {
// FIXME: Abstract this out in some way
return cleanText(surgeon.readSubroutine(cell, [ "property", "textContent" ], options));
});
let dataRows = input
.slice(0, normalizedIndex)
.concat(input.slice(normalizedIndex + 1));
return dataRows.map((row) => {
return {
... row,
labels: labels
};
});
}
},
// TODO: `labelRow 0` to label stuff
extractDefinitionList: function (input, _, options) {
let { evaluator } = options;
let definitions = [];
let currentTerm;
evaluator
.querySelectorAll(input, "dt, dd")
.forEach((node) => {
// TODO: This appears to be a cheerio oddity? Need to double-check that this won't break in the future. Maybe should use a surgeon-specific API?
let firstNode = node[0];
if (firstNode.name === "dt") {
// Term
currentTerm = node;
} else if (firstNode.name === "dd") {
// Definition
if (currentTerm != null) {
definitions.push({
// TODO: Support custom filters for terms (eg. to remove superscript text)
term: cleanText(surgeon.readSubroutine(currentTerm, ["property", "textContent"], options)),
definition: node
});
currentTerm = undefined;
} else {
// Ignore any superfluous definitions
}
} else {
throw unreachable(`Unknown node type ${firstNode.name}`);
}
});
// We conceptually represent a definition list as a single-row table with pre-labeled columns/cells
return {
labels: definitions.map((item) => item.term),
cells: definitions.map((item) => item.definition)
};
},
cell: extractCell({ optional: false }),
maybeCell: extractCell({ optional: true }),
isMatch: function (input, [regex], _options) {
let matcher = new RegExp(regex);
return matcher.test(input);
},
binaryBytes: function (input) {
let matcher = new RegExp(`^(${regex.number})\s?([kKmMgGtTpP])i?[bB]$`);
let match = matcher.exec(input);
if (match == null) {
throw new Error(`Not recognized as a binary byte amount: ${input}`);
} else {
let number = parseInt(match[1]);
let unit = match[2].toLowerCase();
if (unit === "k") {
return number * 1024;
} else if (unit === "m") {
return number * 1024 * 1024;
} else if (unit === "g") {
return number * 1024 * 1024 * 1024;
} else if (unit === "t") {
return number * 1024 * 1024 * 1024 * 1024;
} else if (unit === "p") {
return number * 1024 * 1024 * 1024 * 1024 * 1024;
}
}
},
KiB: function (input) {
return parseInt(input) * 1024;
},
MiB: function (input) {
return parseInt(input) * 1024 * 1024;
},
GiB: function (input) {
return parseInt(input) * 1024 * 1024 * 1024;
},
TiB: function (input) {
return parseInt(input) * 1024 * 1024 * 1024 * 1024;
},
integer: function (input) {
return parseInt(input);
},
number: function (input) {
return parseFloat(input);
},
kbps: function (input) {
return parseInt(input) * 1000;
},
mbps: function (input) {
return parseInt(input) * 1000 * 1000;
},
gbps: function (input) {
return parseInt(input) * 1000 * 1000 * 1000;
},
absoluteUrl: function (input, [base], _options) {
return url.resolve(base, input);
}
}
});;