"use strict"; const assert = require("assert"); const pipe = require("@promistream/pipe"); const simpleSink = require("@promistream/simple-sink"); const assureResponse = require("../../shared/assure-response"); const parseSitemapResponse = require("../../shared/parse-sitemap-response"); module.exports = function tmeScrapeSitemap({ session }) { return async function ({ data, createItem }) { let response = await session.get(data.url, { stream: true }); assureResponse(response); let resultCount = 0; await pipe([ parseSitemapResponse(response), simpleSink((item) => { assert(item.url); if (item.type === "sitemap") { // NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc. if (/pip_part[0-9]+\.xml(\.gz)?/.test(item.url)) { createItem({ id: `tme:sitemap:${item.url}`, tags: [ "tme:sitemap" ], data: { url: item.url } }); resultCount += 1; } } else if (item.type === "url") { if (item.url.startsWith("https://www.tme.eu/en/details/")) { createItem({ id: `tme:product:${item.url}`, tags: [ "tme:product" ], data: { url: item.url } }); resultCount += 1; } } }) ]).read(); // If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything. assert(resultCount > 0); }; };