Add Farnell support, support auto-casemapping, WIP Surgeon support
parent
249573a968
commit
795ed9c760
@ -0,0 +1,21 @@
|
||||
"use strict";
|
||||
|
||||
const createDatasheet = require("../../shared/create-datasheet");
|
||||
|
||||
module.exports = function farnellNormalizeProduct() {
|
||||
return async function (api) {
|
||||
let { data } = api;
|
||||
|
||||
createDatasheet(api, {
|
||||
priority: 0.65,
|
||||
source: "farnell",
|
||||
manufacturer: data.itemData.manufacturer,
|
||||
productID: data.itemData.productID,
|
||||
name: data.itemData.model,
|
||||
description: data.itemData.description,
|
||||
url: data.itemData.datasheetURL,
|
||||
// Farnell also shows manufacturer names in ALL-CAPS
|
||||
fixCasing: true
|
||||
});
|
||||
};
|
||||
};
|
@ -0,0 +1,31 @@
|
||||
"use strict";
|
||||
|
||||
const surgeon = require("../../shared/surgeon-utils");
|
||||
|
||||
// https://uk.farnell.com/molex/51281-2694/fpc-connector-rcpt-26pos-0-3mm/dp/3051223
|
||||
|
||||
module.exports = function farnellScrapeProduct({ gotSession }) {
|
||||
return async function ({ data, updateData, expireDependents }) {
|
||||
let response = await gotSession(data.url, { timeout: 30000 });
|
||||
|
||||
let detailsTable = surgeon(`selectOne ".productDescription dl" | extractDefinitionList`, response.body);
|
||||
|
||||
let itemData = surgeon({
|
||||
datasheetURL: [ () => detailsTable, `maybeCell "Technical Datasheet:" | selectOne a | readAttr href` ],
|
||||
manufacturer: `selectOne .schemaOrg | text`,
|
||||
model: `selectOne .ManufacturerPartNumber | text`,
|
||||
productID: `selectOne .ManufacturerOrderCode | text`,
|
||||
description: `selectOne .pdpAttributesName | text`,
|
||||
alsoKnownAs: [ () => detailsTable, `maybeCell "Also Known As:" | text` ]
|
||||
}, response.body);
|
||||
|
||||
// TODO: Merge self into productID-normalized item to deal gracefully with changing URLs
|
||||
|
||||
updateData((oldData) => ({
|
||||
... oldData,
|
||||
itemData: itemData
|
||||
}));
|
||||
|
||||
expireDependents();
|
||||
};
|
||||
};
|
@ -0,0 +1,51 @@
|
||||
"use strict";
|
||||
|
||||
const assert = require("assert");
|
||||
|
||||
const pipe = require("@promistream/pipe");
|
||||
const simpleSink = require("@promistream/simple-sink");
|
||||
const fromNodeStream = require("@promistream/from-node-stream");
|
||||
const decodeString = require("@promistream/decode-string");
|
||||
const parseSitemap = require("@promistream/parse-sitemap");
|
||||
|
||||
module.exports = function farnellScrapeSitemap({ gotSession }) {
|
||||
return async function ({ data, createItem }) {
|
||||
let resultCount = 0;
|
||||
|
||||
await pipe([
|
||||
fromNodeStream.fromReadable(gotSession.stream(data.url)),
|
||||
// NOTE: The URL lies, Farnell's sitemaps are not gzipped
|
||||
decodeString("utf8"),
|
||||
parseSitemap(),
|
||||
simpleSink((item) => {
|
||||
assert(item.url);
|
||||
|
||||
if (item.type === "sitemap") {
|
||||
// NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc.
|
||||
if (/products_[0-9]+\.xml(\.gz)?/.test(item.url)) {
|
||||
createItem({
|
||||
id: `farnell:sitemap:${item.url}`,
|
||||
tags: [ "farnell:sitemap" ],
|
||||
data: { url: item.url }
|
||||
});
|
||||
|
||||
resultCount += 1;
|
||||
}
|
||||
} else if (item.type === "url") {
|
||||
if (/\/dp\/[0-9]+$/.test(item.url)) {
|
||||
createItem({
|
||||
id: `farnell:product:${item.url}`,
|
||||
tags: [ "farnell:product" ],
|
||||
data: { url: item.url }
|
||||
});
|
||||
|
||||
resultCount += 1;
|
||||
}
|
||||
}
|
||||
})
|
||||
]).read();
|
||||
|
||||
// If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything.
|
||||
assert(resultCount > 0);
|
||||
};
|
||||
};
|
@ -0,0 +1,251 @@
|
||||
"use strict";
|
||||
|
||||
const surgeon = require("surgeon");
|
||||
const pianola = require("pianola");
|
||||
const url = require("url");
|
||||
const flipArray = require("flip-array");
|
||||
const unreachable = require("@joepie91/unreachable")("seekseek:scrape-documentation");
|
||||
|
||||
/**
 * Trims every line of a string and drops the lines that end up empty.
 *
 * @param {string} input - Text with lines separated by "\n".
 * @returns {string} The remaining trimmed lines, joined with "\n".
 */
function stripLines(input) {
	const keptLines = [];

	for (const rawLine of input.split("\n")) {
		const trimmedLine = rawLine.trim();

		if (trimmedLine.length > 0) {
			keptLines.push(trimmedLine);
		}
	}

	return keptLines.join("\n");
}
|
||||
|
||||
/**
 * Collapses every run of two or more whitespace characters into one space.
 * A lone whitespace character (including a single newline or tab) is kept as-is.
 *
 * @param {string} input
 * @returns {string}
 */
function deduplicateSpaces(input) {
	const whitespaceRun = /\s{2,}/g;
	const collapsed = input.replace(whitespaceRun, " ");

	return collapsed;
}
|
||||
|
||||
/**
 * Normalizes scraped text: first trims lines and removes empty ones
 * (`stripLines`), then collapses whitespace runs (`deduplicateSpaces`).
 *
 * @param {string} input
 * @returns {string}
 */
function cleanText(input) {
	const withoutEmptyLines = stripLines(input);

	return deduplicateSpaces(withoutEmptyLines);
}
|
||||
|
||||
/**
 * Builds a subroutine that picks one cell out of a record of the shape
 * `{ cells, labels? }`.
 *
 * When the record has `labels`, the requested index is looked up as a label;
 * otherwise it is used directly as the position within `cells`.
 *
 * @param {object} settings
 * @param {boolean} settings.optional - When true, a missing cell produces a
 *   `pianola.FinalResultSentinel` wrapping `undefined` instead of an error.
 * @returns {Function} Subroutine with the signature `(input, [index], options)`.
 */
function extractCell({ optional }) {
	// Shared handling for a failed lookup; the message is built lazily, only when it is actually thrown
	function handleMissing(buildMessage) {
		if (optional) {
			return new pianola.FinalResultSentinel(undefined);
		}

		throw new Error(buildMessage());
	}

	return function (input, [index], _options) {
		// FIXME: Auto-detect number usage?
		// FIXME: index parsing as number?
		let position = index;

		if (input.labels != null) {
			// TODO: Improve performance by avoiding loops, via eg. a lookup Map?
			position = input.labels.indexOf(index);
		}

		// HACK, refactor so that this conditional only actually occurs for label-based lookups
		if (position === -1) {
			return handleMissing(() => `Specified label '${index}' does not exist in table`);
		}

		if (position >= input.cells.length) {
			return handleMissing(() => `Tried to access cell ${position}, but record only has cells 0-${input.cells.length - 1}`);
		}

		return input.cells[position];
	};
}
|
||||
|
||||
module.exports = surgeon.default({
|
||||
subroutines: {
|
||||
trace: function (input) {
|
||||
console.log("TRACE:", input);
|
||||
return input;
|
||||
},
|
||||
strip: function (input) {
|
||||
return input.trim();
|
||||
},
|
||||
stripLines: stripLines,
|
||||
removeLines: function (input) {
|
||||
return deduplicateSpaces(input.replace(/\n/g, " "));
|
||||
},
|
||||
text: function (input, _values, options) {
|
||||
return cleanText(surgeon.readSubroutine(input, ["property", "textContent"], options));
|
||||
},
|
||||
readProp: function (input, [property], options) {
|
||||
return surgeon.readSubroutine(input, ["property", property], options);
|
||||
},
|
||||
readAttr: function (input, [property], options) {
|
||||
return surgeon.readSubroutine(input, ["attribute", property], options);
|
||||
},
|
||||
selectN: function (input, [selector, n], options) {
|
||||
return surgeon.selectSubroutine(input, [selector, `{${parseInt(n)+1},}[${n}]`], options);
|
||||
},
|
||||
selectOne: function (input, [...selectors], options) {
|
||||
return surgeon.selectSubroutine(input, [selectors.join(" "), `{1}`], options);
|
||||
},
|
||||
selectAny: function (input, [...selectors], options) {
|
||||
return surgeon.selectSubroutine(input, [selectors.join(" "), `{0,}`], options);
|
||||
},
|
||||
selectMany: function (input, [ ... selectors ], options) {
|
||||
return surgeon.selectSubroutine(input, [selectors.join(" "), `{1,}`], options);
|
||||
},
|
||||
pickOne: function (input) {
|
||||
if (!Array.isArray(input)) {
|
||||
throw new Error("Input must be an array");
|
||||
} else if (input.length > 0) {
|
||||
throw new Error(`Input array must only contain one element, got ${input.length} elements instead`);
|
||||
} else {
|
||||
return input[0];
|
||||
}
|
||||
},
|
||||
result: function (input, [result], _options) {
|
||||
return result;
|
||||
},
|
||||
index: function (input, [index]) {
|
||||
return input.eq(index);
|
||||
},
|
||||
extractTable: function (input, [direction], options) {
|
||||
let {evaluator} = options;
|
||||
|
||||
let cells = evaluator.querySelectorAll(input, "tr").map((row) => {
|
||||
return evaluator.querySelectorAll(row, "td, th");
|
||||
});
|
||||
|
||||
let records;
|
||||
|
||||
if (direction === "vertical") {
|
||||
records = cells;
|
||||
} else if (direction === "horizontal") {
|
||||
records = flipArray(cells);
|
||||
} else {
|
||||
throw new Error("Invalid table record direction specified. Must be one of: horizontal, vertical");
|
||||
}
|
||||
|
||||
return records.map((record) => {
|
||||
return {cells: record};
|
||||
});
|
||||
},
|
||||
labelRow: function (input, [ index ], options) {
|
||||
let parsedIndex = parseInt(index);
|
||||
let normalizedIndex = (parsedIndex >= 0)
|
||||
? parsedIndex
|
||||
: input.length - index + 1;
|
||||
|
||||
if (normalizedIndex > input.length - 1) {
|
||||
throw new Error(`Invalid index ${index} specified for label row`);
|
||||
} else {
|
||||
let labelRow = input[normalizedIndex];
|
||||
|
||||
let labels = labelRow.cells.map((cell) => {
|
||||
// FIXME: Abstract this out in some way
|
||||
return cleanText(surgeon.readSubroutine(cell, [ "property", "textContent" ], options));
|
||||
});
|
||||
|
||||
let dataRows = input
|
||||
.slice(0, normalizedIndex)
|
||||
.concat(input.slice(normalizedIndex + 1));
|
||||
|
||||
return dataRows.map((row) => {
|
||||
return {
|
||||
... row,
|
||||
labels: labels
|
||||
};
|
||||
});
|
||||
}
|
||||
},
|
||||
// TODO: `labelRow 0` to label stuff
|
||||
extractDefinitionList: function (input, _, options) {
|
||||
let { evaluator } = options;
|
||||
|
||||
let definitions = [];
|
||||
let currentTerm;
|
||||
|
||||
evaluator
|
||||
.querySelectorAll(input, "dt, dd")
|
||||
.forEach((node) => {
|
||||
// TODO: This appears to be a cheerio oddity? Need to double-check that this won't break in the future. Maybe should use a surgeon-specific API?
|
||||
let firstNode = node[0];
|
||||
|
||||
if (firstNode.name === "dt") {
|
||||
// Term
|
||||
currentTerm = node;
|
||||
} else if (firstNode.name === "dd") {
|
||||
// Definition
|
||||
if (currentTerm != null) {
|
||||
definitions.push({
|
||||
// TODO: Support custom filters for terms (eg. to remove superscript text)
|
||||
term: cleanText(surgeon.readSubroutine(currentTerm, ["property", "textContent"], options)),
|
||||
definition: node
|
||||
});
|
||||
|
||||
currentTerm = undefined;
|
||||
} else {
|
||||
// Ignore any superfluous definitions
|
||||
}
|
||||
} else {
|
||||
throw unreachable(`Unknown node type ${firstNode.name}`);
|
||||
}
|
||||
});
|
||||
|
||||
// We conceptually represent a definition list as a single-row table with pre-labeled columns/cells
|
||||
return {
|
||||
labels: definitions.map((item) => item.term),
|
||||
cells: definitions.map((item) => item.definition)
|
||||
};
|
||||
},
|
||||
cell: extractCell({ optional: false }),
|
||||
maybeCell: extractCell({ optional: true }),
|
||||
isMatch: function (input, [regex], _options) {
|
||||
let matcher = new RegExp(regex);
|
||||
return matcher.test(input);
|
||||
},
|
||||
binaryBytes: function (input) {
|
||||
let matcher = new RegExp(`^(${regex.number})\s?([kKmMgGtTpP])i?[bB]$`);
|
||||
let match = matcher.exec(input);
|
||||
|
||||
if (match == null) {
|
||||
throw new Error(`Not recognized as a binary byte amount: ${input}`);
|
||||
} else {
|
||||
let number = parseInt(match[1]);
|
||||
let unit = match[2].toLowerCase();
|
||||
|
||||
if (unit === "k") {
|
||||
return number * 1024;
|
||||
} else if (unit === "m") {
|
||||
return number * 1024 * 1024;
|
||||
} else if (unit === "g") {
|
||||
return number * 1024 * 1024 * 1024;
|
||||
} else if (unit === "t") {
|
||||
return number * 1024 * 1024 * 1024 * 1024;
|
||||
} else if (unit === "p") {
|
||||
return number * 1024 * 1024 * 1024 * 1024 * 1024;
|
||||
}
|
||||
}
|
||||
},
|
||||
KiB: function (input) {
|
||||
return parseInt(input) * 1024;
|
||||
},
|
||||
MiB: function (input) {
|
||||
return parseInt(input) * 1024 * 1024;
|
||||
},
|
||||
GiB: function (input) {
|
||||
return parseInt(input) * 1024 * 1024 * 1024;
|
||||
},
|
||||
TiB: function (input) {
|
||||
return parseInt(input) * 1024 * 1024 * 1024 * 1024;
|
||||
},
|
||||
integer: function (input) {
|
||||
return parseInt(input);
|
||||
},
|
||||
number: function (input) {
|
||||
return parseFloat(input);
|
||||
},
|
||||
kbps: function (input) {
|
||||
return parseInt(input) * 1000;
|
||||
},
|
||||
mbps: function (input) {
|
||||
return parseInt(input) * 1000 * 1000;
|
||||
},
|
||||
gbps: function (input) {
|
||||
return parseInt(input) * 1000 * 1000 * 1000;
|
||||
},
|
||||
absoluteUrl: function (input, [base], _options) {
|
||||
return url.resolve(base, input);
|
||||
}
|
||||
}
|
||||
});;
|
Loading…
Reference in New Issue