You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

52 lines
1.5 KiB
JavaScript

"use strict";
const assert = require("assert");
const pipe = require("@promistream/pipe");
const simpleSink = require("@promistream/simple-sink");
const fromNodeStream = require("@promistream/from-node-stream");
const decodeString = require("@promistream/decode-string");
const parseSitemap = require("@promistream/parse-sitemap");
module.exports = function farnellScrapeSitemap({ gotSession }) {
return async function ({ data, createItem }) {
let resultCount = 0;
await pipe([
fromNodeStream.fromReadable(gotSession.stream(data.url)),
// NOTE: The URL lies, Farnell's sitemaps are not gzipped
decodeString("utf8"),
parseSitemap(),
simpleSink((item) => {
assert(item.url);
if (item.type === "sitemap") {
// NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc.
if (/products_[0-9]+\.xml(\.gz)?/.test(item.url)) {
createItem({
id: `farnell:sitemap:${item.url}`,
tags: [ "farnell:sitemap" ],
data: { url: item.url }
});
resultCount += 1;
}
} else if (item.type === "url") {
if (/\/dp\/[0-9]+$/.test(item.url)) {
createItem({
id: `farnell:product:${item.url}`,
tags: [ "farnell:product" ],
data: { url: item.url }
});
resultCount += 1;
}
}
})
]).read();
// If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything.
assert(resultCount > 0);
};
};