Initial commit
commit
1a418b73bb
@ -0,0 +1 @@
|
|||||||
|
node_modules
|
@ -0,0 +1,65 @@
|
|||||||
|
"use strict";
|
||||||
|
|
||||||
|
const consumable = require("@joepie91/consumable");
|
||||||
|
|
||||||
|
const mapFilter = require("@promistream/map-filter");
|
||||||
|
const parseXML = require("@promistream/parse-xml");
|
||||||
|
const pipe = require("@promistream/pipe");
|
||||||
|
|
||||||
|
// NOTE: This is not a 100% strict parser. It assumes that the sitemap XML is well-formed, and tags appearing in strange places are ignored.
|
||||||
|
|
||||||
|
module.exports = function createSitemapParsingStream() {
|
||||||
|
// The consumable is always filled with an object, so that mutating operations never fail. However, if a tag appears outside of an expected item context, its value will be stored on a placeholder object that gets thrown away as soon as an item context is encountered, effectively throwing away the tag's data itself.
|
||||||
|
let currentItem = consumable({});
|
||||||
|
let currentTag;
|
||||||
|
|
||||||
|
return pipe([
|
||||||
|
parseXML({ events: [ "opentag", "closetag", "text" ] }),
|
||||||
|
mapFilter((event) => {
|
||||||
|
if (event.type === "opentag") {
|
||||||
|
let tag = event.value;
|
||||||
|
|
||||||
|
if (tag.name === "sitemap") {
|
||||||
|
currentItem.replace({
|
||||||
|
type: "sitemap",
|
||||||
|
url: undefined,
|
||||||
|
lastModified: undefined
|
||||||
|
});
|
||||||
|
} else if (tag.name === "url") {
|
||||||
|
currentItem.replace({
|
||||||
|
type: "url",
|
||||||
|
url: undefined,
|
||||||
|
priority: undefined
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
currentTag = tag.name;
|
||||||
|
|
||||||
|
return mapFilter.NoValue;
|
||||||
|
} else if (event.type === "closetag") {
|
||||||
|
let tag = event.value;
|
||||||
|
|
||||||
|
// We need to ensure that we unset the currentTag once we're done with it, otherwise text elements *between* tags might erroneously end up in our items. Technically this is not correct as we've just moved to the parent tag, but since we're only interested in text which *directly* exists within a tag uninterrupted, we can cut some corners here.
|
||||||
|
currentTag = undefined;
|
||||||
|
|
||||||
|
if (tag.name === "sitemap" || tag.name === "url") {
|
||||||
|
return currentItem.replace({});
|
||||||
|
} else {
|
||||||
|
return mapFilter.NoValue;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let text = event.value;
|
||||||
|
|
||||||
|
if (currentTag === "loc") {
|
||||||
|
currentItem.peek().url = text;
|
||||||
|
} else if (currentTag === "lastmod") {
|
||||||
|
currentItem.peek().lastModified = text;
|
||||||
|
} else if (currentTag === "priority") {
|
||||||
|
currentItem.peek().currentTag = text;
|
||||||
|
}
|
||||||
|
|
||||||
|
return mapFilter.NoValue;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
]);
|
||||||
|
};
|
@ -0,0 +1 @@
|
|||||||
|
TODO: Support for image sitemaps (https://developers.google.com/search/docs/advanced/sitemaps/image-sitemaps)
|
@ -0,0 +1,21 @@
|
|||||||
|
{
|
||||||
|
"name": "@promistream/parse-sitemap",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"main": "index.js",
|
||||||
|
"keywords": [
|
||||||
|
"promistream"
|
||||||
|
],
|
||||||
|
"repository": "http://git.cryto.net/promistream/parse-sitemap.git",
|
||||||
|
"author": "Sven Slootweg <admin@cryto.net>",
|
||||||
|
"license": "WTFPL OR CC0-1.0",
|
||||||
|
"devDependencies": {
|
||||||
|
"@joepie91/eslint-config": "^1.1.0",
|
||||||
|
"eslint": "^6.8.0"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"@joepie91/consumable": "^1.0.1",
|
||||||
|
"@promistream/map-filter": "^0.1.0",
|
||||||
|
"@promistream/parse-xml": "^0.1.0",
|
||||||
|
"@promistream/pipe": "^0.1.4"
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue