You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
52 lines
1.5 KiB
JavaScript
52 lines
1.5 KiB
JavaScript
3 years ago
|
"use strict";
|
||
|
|
||
|
const assert = require("assert");
|
||
|
|
||
|
const pipe = require("@promistream/pipe");
|
||
|
const simpleSink = require("@promistream/simple-sink");
|
||
|
const fromNodeStream = require("@promistream/from-node-stream");
|
||
|
const decodeString = require("@promistream/decode-string");
|
||
|
const parseSitemap = require("@promistream/parse-sitemap");
|
||
|
|
||
|
module.exports = function farnellScrapeSitemap({ gotSession }) {
|
||
|
return async function ({ data, createItem }) {
|
||
|
let resultCount = 0;
|
||
|
|
||
|
await pipe([
|
||
|
fromNodeStream.fromReadable(gotSession.stream(data.url)),
|
||
|
// NOTE: The URL lies, Farnell's sitemaps are not gzipped
|
||
|
decodeString("utf8"),
|
||
|
parseSitemap(),
|
||
|
simpleSink((item) => {
|
||
|
assert(item.url);
|
||
|
|
||
|
if (item.type === "sitemap") {
|
||
|
// NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc.
|
||
|
if (/products_[0-9]+\.xml(\.gz)?/.test(item.url)) {
|
||
|
createItem({
|
||
|
id: `farnell:sitemap:${item.url}`,
|
||
|
tags: [ "farnell:sitemap" ],
|
||
|
data: { url: item.url }
|
||
|
});
|
||
|
|
||
|
resultCount += 1;
|
||
|
}
|
||
|
} else if (item.type === "url") {
|
||
|
if (/\/dp\/[0-9]+$/.test(item.url)) {
|
||
|
createItem({
|
||
|
id: `farnell:product:${item.url}`,
|
||
|
tags: [ "farnell:product" ],
|
||
|
data: { url: item.url }
|
||
|
});
|
||
|
|
||
|
resultCount += 1;
|
||
|
}
|
||
|
}
|
||
|
})
|
||
|
]).read();
|
||
|
|
||
|
// If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything.
|
||
|
assert(resultCount > 0);
|
||
|
};
|
||
|
};
|