Browse Source

Initial commit

master
Sven Slootweg 5 months ago
commit
1a418b73bb
  1. 1
      .gitignore
  2. 65
      index.js
  3. 1
      notes.txt
  4. 21
      package.json
  5. 1484
      yarn.lock

1
.gitignore

@ -0,0 +1 @@
node_modules

65
index.js

@ -0,0 +1,65 @@
"use strict";
const consumable = require("@joepie91/consumable");
const mapFilter = require("@promistream/map-filter");
const parseXML = require("@promistream/parse-xml");
const pipe = require("@promistream/pipe");
// NOTE: This is not a 100% strict parser. It assumes that the sitemap XML is well-formed, and tags appearing in strange places are ignored.
module.exports = function createSitemapParsingStream() {
// The consumable is always filled with an object, so that mutating operations never fail. However, if a tag appears outside of an expected item context, its value will be stored on a placeholder object that gets thrown away as soon as an item context is encountered, effectively throwing away the tag's data itself.
let currentItem = consumable({});
let currentTag;
return pipe([
parseXML({ events: [ "opentag", "closetag", "text" ] }),
mapFilter((event) => {
if (event.type === "opentag") {
let tag = event.value;
if (tag.name === "sitemap") {
currentItem.replace({
type: "sitemap",
url: undefined,
lastModified: undefined
});
} else if (tag.name === "url") {
currentItem.replace({
type: "url",
url: undefined,
priority: undefined
});
}
currentTag = tag.name;
return mapFilter.NoValue;
} else if (event.type === "closetag") {
let tag = event.value;
// We need to ensure that we unset the currentTag once we're done with it, otherwise text elements *between* tags might erroneously end up in our items. Technically this is not correct as we've just moved to the parent tag, but since we're only interested in text which *directly* exists within a tag uninterrupted, we can cut some corners here.
currentTag = undefined;
if (tag.name === "sitemap" || tag.name === "url") {
return currentItem.replace({});
} else {
return mapFilter.NoValue;
}
} else {
let text = event.value;
if (currentTag === "loc") {
currentItem.peek().url = text;
} else if (currentTag === "lastmod") {
currentItem.peek().lastModified = text;
} else if (currentTag === "priority") {
currentItem.peek().currentTag = text;
}
return mapFilter.NoValue;
}
})
]);
};

1
notes.txt

@ -0,0 +1 @@
TODO: Support for image sitemaps (https://developers.google.com/search/docs/advanced/sitemaps/image-sitemaps)

21
package.json

@ -0,0 +1,21 @@
{
"name": "@promistream/parse-sitemap",
"version": "0.1.0",
"main": "index.js",
"keywords": [
"promistream"
],
"repository": "http://git.cryto.net/promistream/parse-sitemap.git",
"author": "Sven Slootweg <admin@cryto.net>",
"license": "WTFPL OR CC0-1.0",
"devDependencies": {
"@joepie91/eslint-config": "^1.1.0",
"eslint": "^6.8.0"
},
"dependencies": {
"@joepie91/consumable": "^1.0.1",
"@promistream/map-filter": "^0.1.0",
"@promistream/parse-xml": "^0.1.0",
"@promistream/pipe": "^0.1.4"
}
}

1484
yarn.lock

File diff suppressed because it is too large
Loading…
Cancel
Save