You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

51 lines
1.4 KiB
JavaScript

"use strict";
const assert = require("assert");
const pipe = require("@promistream/pipe");
const simpleSink = require("@promistream/simple-sink");
const assureResponse = require("../../shared/assure-response");
const parseSitemapResponse = require("../../shared/parse-sitemap-response");
module.exports = function tmeScrapeSitemap({ session }) {
return async function ({ data, createItem }) {
let response = await session.get(data.url, { stream: true });
assureResponse(response);
let resultCount = 0;
await pipe([
parseSitemapResponse(response),
simpleSink((item) => {
assert(item.url);
if (item.type === "sitemap") {
// NOTE: We are only interested in the sitemaps that enumerate components, not those that list categories etc.
if (/pip_part[0-9]+\.xml(\.gz)?/.test(item.url)) {
createItem({
id: `tme:sitemap:${item.url}`,
tags: [ "tme:sitemap" ],
data: { url: item.url }
});
resultCount += 1;
}
} else if (item.type === "url") {
if (item.url.startsWith("https://www.tme.eu/en/details/")) {
createItem({
id: `tme:product:${item.url}`,
tags: [ "tme:product" ],
data: { url: item.url }
});
resultCount += 1;
}
}
})
]).read();
// If we don't get at least *some* items out of a sitemap, something is wrong - eg. the URL format changed and we are no longer matching anything.
assert(resultCount > 0);
};
};