|
|
|
@ -0,0 +1,65 @@
|
|
|
|
|
"use strict";
|
|
|
|
|
|
|
|
|
|
const consumable = require("@joepie91/consumable");
|
|
|
|
|
|
|
|
|
|
const mapFilter = require("@promistream/map-filter");
|
|
|
|
|
const parseXML = require("@promistream/parse-xml");
|
|
|
|
|
const pipe = require("@promistream/pipe");
|
|
|
|
|
|
|
|
|
|
// NOTE: This is not a 100% strict parser. It assumes that the sitemap XML is well-formed, and tags appearing in strange places are ignored.
|
|
|
|
|
|
|
|
|
|
module.exports = function createSitemapParsingStream() {
|
|
|
|
|
// The consumable is always filled with an object, so that mutating operations never fail. However, if a tag appears outside of an expected item context, its value will be stored on a placeholder object that gets thrown away as soon as an item context is encountered, effectively throwing away the tag's data itself.
|
|
|
|
|
let currentItem = consumable({});
|
|
|
|
|
let currentTag;
|
|
|
|
|
|
|
|
|
|
return pipe([
|
|
|
|
|
parseXML({ events: [ "opentag", "closetag", "text" ] }),
|
|
|
|
|
mapFilter((event) => {
|
|
|
|
|
if (event.type === "opentag") {
|
|
|
|
|
let tag = event.value;
|
|
|
|
|
|
|
|
|
|
if (tag.name === "sitemap") {
|
|
|
|
|
currentItem.replace({
|
|
|
|
|
type: "sitemap",
|
|
|
|
|
url: undefined,
|
|
|
|
|
lastModified: undefined
|
|
|
|
|
});
|
|
|
|
|
} else if (tag.name === "url") {
|
|
|
|
|
currentItem.replace({
|
|
|
|
|
type: "url",
|
|
|
|
|
url: undefined,
|
|
|
|
|
priority: undefined
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
currentTag = tag.name;
|
|
|
|
|
|
|
|
|
|
return mapFilter.NoValue;
|
|
|
|
|
} else if (event.type === "closetag") {
|
|
|
|
|
let tag = event.value;
|
|
|
|
|
|
|
|
|
|
// We need to ensure that we unset the currentTag once we're done with it, otherwise text elements *between* tags might erroneously end up in our items. Technically this is not correct as we've just moved to the parent tag, but since we're only interested in text which *directly* exists within a tag uninterrupted, we can cut some corners here.
|
|
|
|
|
currentTag = undefined;
|
|
|
|
|
|
|
|
|
|
if (tag.name === "sitemap" || tag.name === "url") {
|
|
|
|
|
return currentItem.replace({});
|
|
|
|
|
} else {
|
|
|
|
|
return mapFilter.NoValue;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
let text = event.value;
|
|
|
|
|
|
|
|
|
|
if (currentTag === "loc") {
|
|
|
|
|
currentItem.peek().url = text;
|
|
|
|
|
} else if (currentTag === "lastmod") {
|
|
|
|
|
currentItem.peek().lastModified = text;
|
|
|
|
|
} else if (currentTag === "priority") {
|
|
|
|
|
currentItem.peek().currentTag = text;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return mapFilter.NoValue;
|
|
|
|
|
}
|
|
|
|
|
})
|
|
|
|
|
]);
|
|
|
|
|
};
|