You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
58 lines
1.6 KiB
JavaScript
58 lines
1.6 KiB
JavaScript
3 years ago
|
"use strict";
|
||
|
|
||
|
const bhttp = require("bhttp");
|
||
|
const zlib = require("zlib");
|
||
|
const assert = require("assert");
|
||
|
|
||
|
const pipe = require("@promistream/pipe");
|
||
|
const fromNodeStream = require("@promistream/from-node-stream");
|
||
|
const simpleSink = require("@promistream/simple-sink");
|
||
|
const parseSitemap = require("@promistream/parse-sitemap");
|
||
|
const decodeString = require("@promistream/decode-string");
|
||
|
|
||
|
const assureResponse = require("../../shared/assure-response");
|
||
|
|
||
|
// Browser-like headers passed as the `headers` option of the shared session below.
const defaultHeaders = {
	"user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0",
	"accept": "*/*"
};

// Single bhttp session created at module load; the scraper task issues its requests through it.
let session = bhttp.session({ headers: defaultHeaders });
|
||
|
|
||
|
// NOTE: WIP, currently running into PerimeterX issues, to be continued later
|
||
|
|
||
|
module.exports = function mouserScrapeSitemap({ }) {
|
||
|
return async function ({ data, createItem }) {
|
||
|
// console.log((await session.get("https://www.mouser.com/")).statusCode);
|
||
|
// console.log({data});
|
||
|
console.log("Making request....");
|
||
|
let response = await session.get(data.url, { stream: true });
|
||
|
// let response = await session.get("http://localhost:4567", { stream: true });
|
||
|
console.log("Got response");
|
||
|
assureResponse(response);
|
||
|
console.log("Valid");
|
||
|
|
||
|
await pipe([
|
||
|
fromNodeStream(response),
|
||
|
(data.url.endsWith(".gz"))
|
||
|
? fromNodeStream(zlib.createGunzip())
|
||
|
: null,
|
||
|
decodeString("utf8"),
|
||
|
parseSitemap(),
|
||
|
simpleSink((item) => {
|
||
|
assert(item.url);
|
||
|
|
||
|
if (item.type === "sitemap") {
|
||
|
createItem({
|
||
|
id: `mouser:sitemap:${item.url}`,
|
||
|
tags: [ "mouser:sitemap" ],
|
||
|
data: { url: item.url }
|
||
|
});
|
||
|
} else {
|
||
|
console.log(item);
|
||
|
}
|
||
|
})
|
||
|
]).read();
|
||
|
};
|
||
|
};
|