"use strict"; const assert = require("assert"); const pipe = require("@promistream/pipe"); const simpleSink = require("@promistream/simple-sink"); const fromNodeStream = require("@promistream/from-node-stream"); const decodeString = require("@promistream/decode-string"); const parseSitemap = require("@promistream/parse-sitemap"); module.exports = function farnellScrapeSitemap({ gotSession }) { return async function ({ data, createItem }) { let resultCount = 0; await pipe([ fromNodeStream.fromReadable(gotSession.stream(data.url)), // NOTE: The URL lies, Farnell's sitemaps are not gzipped decodeString("utf8"), parseSitemap(), simpleSink((item) => { assert(item.url); if (item.type === "sitemap") { // NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc. if (/products_[0-9]+\.xml(\.gz)?/.test(item.url)) { createItem({ id: `farnell:sitemap:${item.url}`, tags: [ "farnell:sitemap" ], data: { url: item.url } }); resultCount += 1; } } else if (item.type === "url") { if (/\/dp\/[0-9]+$/.test(item.url)) { createItem({ id: `farnell:product:${item.url}`, tags: [ "farnell:product" ], data: { url: item.url } }); resultCount += 1; } } }) ]).read(); // If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything. assert(resultCount > 0); }; };