"use strict"; const consumable = require("@joepie91/consumable"); const mapFilter = require("@promistream/map-filter"); const parseXML = require("@promistream/parse-xml"); const pipe = require("@promistream/pipe"); // NOTE: This is not a 100% strict parser. It assumes that the sitemap XML is well-formed, and tags appearing in strange places are ignored. module.exports = function createSitemapParsingStream() { // The consumable is always filled with an object, so that mutating operations never fail. However, if a tag appears outside of an expected item context, its value will be stored on a placeholder object that gets thrown away as soon as an item context is encountered, effectively throwing away the tag's data itself. let currentItem = consumable({}); let currentTag; return pipe([ parseXML({ events: [ "opentag", "closetag", "text" ] }), mapFilter((event) => { if (event.type === "opentag") { let tag = event.value; if (tag.name === "sitemap") { currentItem.replace({ type: "sitemap", url: undefined, lastModified: undefined }); } else if (tag.name === "url") { currentItem.replace({ type: "url", url: undefined, priority: undefined }); } currentTag = tag.name; return mapFilter.NoValue; } else if (event.type === "closetag") { let tag = event.value; // We need to ensure that we unset the currentTag once we're done with it, otherwise text elements *between* tags might erroneously end up in our items. Technically this is not correct as we've just moved to the parent tag, but since we're only interested in text which *directly* exists within a tag uninterrupted, we can cut some corners here. currentTag = undefined; if (tag.name === "sitemap" || tag.name === "url") { return currentItem.replace({}); } else { return mapFilter.NoValue; } } else { let text = event.value; if (currentTag === "loc") { currentItem.peek().url = text; } else if (currentTag === "lastmod") { currentItem.peek().lastModified = text; } else if (currentTag === "priority") { currentItem.peek().currentTag = text; } return mapFilter.NoValue; } }) ]); };